diff --git "a/checkpoint-10945/trainer_state.json" "b/checkpoint-10945/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-10945/trainer_state.json" @@ -0,0 +1,163781 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 10945, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 9.136592051164915e-05, + "grad_norm": 37.25, + "kl": 0.0, + "learning_rate": 0.0, + "logits/chosen": 981163724.8, + "logits/rejected": 584397738.6666666, + "logps/chosen": -349.39609375, + "logps/rejected": -404.4066569010417, + "loss": 0.5, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0001827318410232983, + "grad_norm": 34.5, + "kl": 0.0, + "learning_rate": 5.000000000000001e-07, + "logits/chosen": 1517737344.0, + "logits/rejected": 792758698.6666666, + "logps/chosen": -598.5811767578125, + "logps/rejected": -349.8846028645833, + "loss": 0.5, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2 + }, + { + "epoch": 0.00027409776153494746, + "grad_norm": 50.0, + "kl": 0.6288528442382812, + "learning_rate": 1.0000000000000002e-06, + "logits/chosen": 642897536.0, + "logits/rejected": 734878592.0, + "logps/chosen": -330.6077880859375, + "logps/rejected": -573.5179443359375, + "loss": 0.4908, + "rewards/chosen": 0.030656814575195312, + "rewards/margins": 0.07338829338550568, + "rewards/rejected": -0.042731478810310364, + "step": 3 + }, + { + "epoch": 0.0003654636820465966, + "grad_norm": 29.75, + "kl": 0.0, + "learning_rate": 1.5e-06, + "logits/chosen": 457633792.0, + "logits/rejected": 576672051.2, + "logps/chosen": -341.28590901692706, + "logps/rejected": -332.9212890625, + "loss": 0.5104, + "rewards/chosen": -0.07924270629882812, + "rewards/margins": -0.09842635989189148, + "rewards/rejected": 0.019183653593063354, + "step": 4 + }, + { + "epoch": 0.00045682960255824577, + "grad_norm": 50.25, + "kl": 0.0, + "learning_rate": 2.0000000000000003e-06, + "logits/chosen": 689293013.3333334, + "logits/rejected": 582818304.0, + "logps/chosen": -353.6021321614583, + "logps/rejected": -446.69248046875, + "loss": 0.4837, + "rewards/chosen": 0.023518117765585583, + "rewards/margins": 0.11424809942642848, + "rewards/rejected": -0.0907299816608429, + "step": 5 + }, + { + "epoch": 0.0005481955230698949, + "grad_norm": 40.5, + "kl": 0.0, + "learning_rate": 2.5e-06, + "logits/chosen": 615606442.6666666, + "logits/rejected": 470078720.0, + "logps/chosen": -308.94598388671875, + "logps/rejected": -514.92587890625, + "loss": 0.4943, + "rewards/chosen": -0.0019007374842961629, + "rewards/margins": 0.035703633228937784, + "rewards/rejected": -0.03760437071323395, + "step": 6 + }, + { + "epoch": 0.0006395614435815441, + "grad_norm": 39.5, + "kl": 0.0, + "learning_rate": 3e-06, + "logits/chosen": 751301973.3333334, + "logits/rejected": 778279014.4, + "logps/chosen": -336.8445638020833, + "logps/rejected": -253.6494384765625, + "loss": 0.4761, + "rewards/chosen": 0.06360473732153575, + "rewards/margins": 0.17849549154440564, + "rewards/rejected": -0.11489075422286987, + "step": 7 + }, + { + "epoch": 0.0007309273640931932, + "grad_norm": 29.0, + "kl": 0.0, + "learning_rate": 3.5e-06, + "logits/chosen": 445824256.0, + "logits/rejected": 464757184.0, + "logps/chosen": -249.4971923828125, + "logps/rejected": -307.68017578125, + "loss": 0.4723, + "rewards/chosen": 0.020929334685206413, + "rewards/margins": 0.22298773936927319, + "rewards/rejected": -0.20205840468406677, + "step": 8 + }, + { + "epoch": 0.0008222932846048424, + "grad_norm": 43.0, + "kl": 0.0, + "learning_rate": 4.000000000000001e-06, + "logits/chosen": 711453952.0, + "logits/rejected": 554997760.0, + "logps/chosen": -388.4285583496094, + "logps/rejected": -462.6708170572917, + "loss": 0.46, + "rewards/chosen": -0.06703033298254013, + "rewards/margins": 0.17242977768182755, + "rewards/rejected": -0.23946011066436768, + "step": 9 + }, + { + "epoch": 0.0009136592051164915, + "grad_norm": 33.75, + "kl": 0.0, + "learning_rate": 4.5e-06, + "logits/chosen": 604856704.0, + "logits/rejected": 691866240.0, + "logps/chosen": -426.9454040527344, + "logps/rejected": -266.69189453125, + "loss": 0.4687, + "rewards/chosen": -0.03875274956226349, + "rewards/margins": 0.2529834657907486, + "rewards/rejected": -0.2917362153530121, + "step": 10 + }, + { + "epoch": 0.0010050251256281408, + "grad_norm": 31.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 545593770.6666666, + "logits/rejected": 695319654.4, + "logps/chosen": -295.1769205729167, + "logps/rejected": -292.628466796875, + "loss": 0.4461, + "rewards/chosen": 0.0883013407389323, + "rewards/margins": 0.38332025210062665, + "rewards/rejected": -0.29501891136169434, + "step": 11 + }, + { + "epoch": 0.0010963910461397899, + "grad_norm": 29.0, + "kl": 0.0, + "learning_rate": 5.500000000000001e-06, + "logits/chosen": 546261930.6666666, + "logits/rejected": 659439616.0, + "logps/chosen": -277.3191324869792, + "logps/rejected": -328.818115234375, + "loss": 0.4343, + "rewards/chosen": -0.01151479035615921, + "rewards/margins": 0.42999412268400194, + "rewards/rejected": -0.44150891304016116, + "step": 12 + }, + { + "epoch": 0.0011877569666514391, + "grad_norm": 37.25, + "kl": 0.0, + "learning_rate": 6e-06, + "logits/chosen": 696533065.1428572, + "logits/rejected": 215391872.0, + "logps/chosen": -384.0857631138393, + "logps/rejected": -162.12643432617188, + "loss": 0.4569, + "rewards/chosen": 0.02737862297466823, + "rewards/margins": 1.3970441264765603, + "rewards/rejected": -1.369665503501892, + "step": 13 + }, + { + "epoch": 0.0012791228871630882, + "grad_norm": 34.5, + "kl": 0.0, + "learning_rate": 6.5000000000000004e-06, + "logits/chosen": 1474736128.0, + "logits/rejected": 930639564.8, + "logps/chosen": -230.5843505859375, + "logps/rejected": -530.39951171875, + "loss": 0.3823, + "rewards/chosen": 0.08029313882191975, + "rewards/margins": 0.8410719474156698, + "rewards/rejected": -0.76077880859375, + "step": 14 + }, + { + "epoch": 0.0013704888076747374, + "grad_norm": 27.5, + "kl": 0.0, + "learning_rate": 7e-06, + "logits/chosen": 415240512.0, + "logits/rejected": 392989504.0, + "logps/chosen": -359.478759765625, + "logps/rejected": -336.8031311035156, + "loss": 0.3732, + "rewards/chosen": 0.050609588623046875, + "rewards/margins": 1.2184991836547852, + "rewards/rejected": -1.1678895950317383, + "step": 15 + }, + { + "epoch": 0.0014618547281863865, + "grad_norm": 30.25, + "kl": 0.0, + "learning_rate": 7.500000000000001e-06, + "logits/chosen": 390358688.0, + "logits/rejected": 620025536.0, + "logps/chosen": -353.20648193359375, + "logps/rejected": -535.4990234375, + "loss": 0.3764, + "rewards/chosen": -0.09081001579761505, + "rewards/margins": 1.124785229563713, + "rewards/rejected": -1.2155952453613281, + "step": 16 + }, + { + "epoch": 0.0015532206486980357, + "grad_norm": 34.0, + "kl": 0.0, + "learning_rate": 8.000000000000001e-06, + "logits/chosen": 648111616.0, + "logits/rejected": 459035306.6666667, + "logps/chosen": -430.381982421875, + "logps/rejected": -492.8487141927083, + "loss": 0.3687, + "rewards/chosen": -0.10066864490509034, + "rewards/margins": 2.3571377833684286, + "rewards/rejected": -2.457806428273519, + "step": 17 + }, + { + "epoch": 0.0016445865692096848, + "grad_norm": 20.25, + "kl": 0.0, + "learning_rate": 8.5e-06, + "logits/chosen": 426926080.0, + "logits/rejected": 423966924.8, + "logps/chosen": -335.34804280598956, + "logps/rejected": -479.871826171875, + "loss": 0.2436, + "rewards/chosen": -0.07163696487744649, + "rewards/margins": 2.5684543589750923, + "rewards/rejected": -2.640091323852539, + "step": 18 + }, + { + "epoch": 0.001735952489721334, + "grad_norm": 29.375, + "kl": 0.0, + "learning_rate": 9e-06, + "logits/chosen": 537552256.0, + "logits/rejected": 549725312.0, + "logps/chosen": -397.0262451171875, + "logps/rejected": -375.8096923828125, + "loss": 0.3518, + "rewards/chosen": -0.05130748823285103, + "rewards/margins": 1.6866888515651226, + "rewards/rejected": -1.7379963397979736, + "step": 19 + }, + { + "epoch": 0.001827318410232983, + "grad_norm": 21.875, + "kl": 0.0, + "learning_rate": 9.5e-06, + "logits/chosen": 455270432.0, + "logits/rejected": 342499456.0, + "logps/chosen": -232.4154815673828, + "logps/rejected": -310.9127197265625, + "loss": 0.3195, + "rewards/chosen": 0.15977707505226135, + "rewards/margins": 2.921502500772476, + "rewards/rejected": -2.761725425720215, + "step": 20 + }, + { + "epoch": 0.0019186843307446323, + "grad_norm": 28.875, + "kl": 0.0, + "learning_rate": 1e-05, + "logits/chosen": 671181824.0, + "logits/rejected": 284123392.0, + "logps/chosen": -233.7552286783854, + "logps/rejected": -181.03321838378906, + "loss": 0.4267, + "rewards/chosen": 0.0623507152001063, + "rewards/margins": 1.348945106069247, + "rewards/rejected": -1.2865943908691406, + "step": 21 + }, + { + "epoch": 0.0020100502512562816, + "grad_norm": 26.625, + "kl": 0.0, + "learning_rate": 9.999999793273163e-06, + "logits/chosen": 428229376.0, + "logits/rejected": 519169450.6666667, + "logps/chosen": -233.436865234375, + "logps/rejected": -559.1494954427084, + "loss": 0.3121, + "rewards/chosen": 0.08242599964141846, + "rewards/margins": 4.209520904223124, + "rewards/rejected": -4.127094904581706, + "step": 22 + }, + { + "epoch": 0.0021014161717679307, + "grad_norm": 19.625, + "kl": 0.0, + "learning_rate": 9.99999917309267e-06, + "logits/chosen": 589381248.0, + "logits/rejected": 568530730.6666666, + "logps/chosen": -268.30889892578125, + "logps/rejected": -485.2750651041667, + "loss": 0.2283, + "rewards/chosen": 0.1339370757341385, + "rewards/margins": 2.508205766479174, + "rewards/rejected": -2.3742686907450357, + "step": 23 + }, + { + "epoch": 0.0021927820922795797, + "grad_norm": 18.875, + "kl": 0.0, + "learning_rate": 9.999998139458571e-06, + "logits/chosen": 673094016.0, + "logits/rejected": 404725930.6666667, + "logps/chosen": -431.46075439453125, + "logps/rejected": -420.947021484375, + "loss": 0.17, + "rewards/chosen": -0.04740601032972336, + "rewards/margins": 3.890135315557321, + "rewards/rejected": -3.9375413258870444, + "step": 24 + }, + { + "epoch": 0.0022841480127912287, + "grad_norm": 20.875, + "kl": 0.0, + "learning_rate": 9.99999669237095e-06, + "logits/chosen": 1442760448.0, + "logits/rejected": 570391552.0, + "logps/chosen": -492.32891845703125, + "logps/rejected": -323.60670979817706, + "loss": 0.2162, + "rewards/chosen": 1.0278137922286987, + "rewards/margins": 3.280263304710388, + "rewards/rejected": -2.2524495124816895, + "step": 25 + }, + { + "epoch": 0.0023755139333028782, + "grad_norm": 33.5, + "kl": 0.0, + "learning_rate": 9.999994831829932e-06, + "logits/chosen": 1196251136.0, + "logits/rejected": 495961856.0, + "logps/chosen": -408.0096842447917, + "logps/rejected": -229.8767578125, + "loss": 0.2899, + "rewards/chosen": 0.035558074712753296, + "rewards/margins": 1.8779256284236907, + "rewards/rejected": -1.8423675537109374, + "step": 26 + }, + { + "epoch": 0.0024668798538145273, + "grad_norm": 21.625, + "kl": 0.0, + "learning_rate": 9.999992557835666e-06, + "logits/chosen": 1320213248.0, + "logits/rejected": 1087849344.0, + "logps/chosen": -506.71063232421875, + "logps/rejected": -466.09356689453125, + "loss": 0.2953, + "rewards/chosen": 0.24570122361183167, + "rewards/margins": 2.8272701799869537, + "rewards/rejected": -2.581568956375122, + "step": 27 + }, + { + "epoch": 0.0025582457743261763, + "grad_norm": 36.25, + "kl": 0.0, + "learning_rate": 9.99998987038834e-06, + "logits/chosen": 483857261.71428573, + "logits/rejected": 293263168.0, + "logps/chosen": -299.450439453125, + "logps/rejected": -316.9371643066406, + "loss": 0.3379, + "rewards/chosen": 0.6085285459245954, + "rewards/margins": 2.214860967227391, + "rewards/rejected": -1.6063324213027954, + "step": 28 + }, + { + "epoch": 0.0026496116948378254, + "grad_norm": 20.625, + "kl": 0.0, + "learning_rate": 9.999986769488176e-06, + "logits/chosen": 402043392.0, + "logits/rejected": 584645568.0, + "logps/chosen": -347.7413635253906, + "logps/rejected": -574.6985473632812, + "loss": 0.2117, + "rewards/chosen": 0.4169071912765503, + "rewards/margins": 4.800082087516785, + "rewards/rejected": -4.383174896240234, + "step": 29 + }, + { + "epoch": 0.002740977615349475, + "grad_norm": 26.75, + "kl": 0.0, + "learning_rate": 9.999983255135435e-06, + "logits/chosen": 681597354.6666666, + "logits/rejected": 603607552.0, + "logps/chosen": -382.3561197916667, + "logps/rejected": -408.9732971191406, + "loss": 0.3036, + "rewards/chosen": 0.7372148831685384, + "rewards/margins": 3.878155787785848, + "rewards/rejected": -3.1409409046173096, + "step": 30 + }, + { + "epoch": 0.002832343535861124, + "grad_norm": 17.875, + "kl": 0.0, + "learning_rate": 9.999979327330404e-06, + "logits/chosen": 438476373.3333333, + "logits/rejected": 1001212416.0, + "logps/chosen": -203.13370768229166, + "logps/rejected": -535.046630859375, + "loss": 0.2162, + "rewards/chosen": 0.6975606282552084, + "rewards/margins": 2.9881957372029624, + "rewards/rejected": -2.290635108947754, + "step": 31 + }, + { + "epoch": 0.002923709456372773, + "grad_norm": 17.375, + "kl": 0.0, + "learning_rate": 9.99997498607341e-06, + "logits/chosen": 471696736.0, + "logits/rejected": 524157824.0, + "logps/chosen": -268.15716552734375, + "logps/rejected": -612.0850219726562, + "loss": 0.1438, + "rewards/chosen": 1.2204220294952393, + "rewards/margins": 5.017296075820923, + "rewards/rejected": -3.7968740463256836, + "step": 32 + }, + { + "epoch": 0.003015075376884422, + "grad_norm": 20.5, + "kl": 0.0, + "learning_rate": 9.999970231364809e-06, + "logits/chosen": 555248230.4, + "logits/rejected": 682777386.6666666, + "logps/chosen": -391.5080322265625, + "logps/rejected": -324.7770182291667, + "loss": 0.2471, + "rewards/chosen": 1.9828302383422851, + "rewards/margins": 2.461264483133952, + "rewards/rejected": -0.4784342447916667, + "step": 33 + }, + { + "epoch": 0.0031064412973960715, + "grad_norm": 23.75, + "kl": 0.0, + "learning_rate": 9.999965063204996e-06, + "logits/chosen": 463717504.0, + "logits/rejected": 593143424.0, + "logps/chosen": -319.12815348307294, + "logps/rejected": -296.0106506347656, + "loss": 0.1777, + "rewards/chosen": 1.8310778935750325, + "rewards/margins": 3.093377312024434, + "rewards/rejected": -1.2622994184494019, + "step": 34 + }, + { + "epoch": 0.0031978072179077205, + "grad_norm": 15.375, + "kl": 0.0, + "learning_rate": 9.999959481594398e-06, + "logits/chosen": 298880000.0, + "logits/rejected": 559928768.0, + "logps/chosen": -251.0867919921875, + "logps/rejected": -478.085693359375, + "loss": 0.1387, + "rewards/chosen": 1.7860512733459473, + "rewards/margins": 4.446882009506226, + "rewards/rejected": -2.6608307361602783, + "step": 35 + }, + { + "epoch": 0.0032891731384193696, + "grad_norm": 19.875, + "kl": 0.0, + "learning_rate": 9.999953486533476e-06, + "logits/chosen": 505700693.3333333, + "logits/rejected": 587782707.2, + "logps/chosen": -207.0274861653646, + "logps/rejected": -370.2146484375, + "loss": 0.1522, + "rewards/chosen": 2.0110317866007485, + "rewards/margins": 4.147255388895671, + "rewards/rejected": -2.1362236022949217, + "step": 36 + }, + { + "epoch": 0.0033805390589310186, + "grad_norm": 23.125, + "kl": 0.0, + "learning_rate": 9.999947078022726e-06, + "logits/chosen": 1289183104.0, + "logits/rejected": 803621824.0, + "logps/chosen": -515.231201171875, + "logps/rejected": -285.18560791015625, + "loss": 0.1752, + "rewards/chosen": 1.6247841119766235, + "rewards/margins": 3.728514790534973, + "rewards/rejected": -2.1037306785583496, + "step": 37 + }, + { + "epoch": 0.003471904979442668, + "grad_norm": 19.375, + "kl": 0.0, + "learning_rate": 9.99994025606268e-06, + "logits/chosen": 436931104.0, + "logits/rejected": 401583424.0, + "logps/chosen": -520.9030151367188, + "logps/rejected": -332.5743408203125, + "loss": 0.1591, + "rewards/chosen": 1.835371732711792, + "rewards/margins": 4.398111343383789, + "rewards/rejected": -2.562739610671997, + "step": 38 + }, + { + "epoch": 0.003563270899954317, + "grad_norm": 33.5, + "kl": 0.0, + "learning_rate": 9.999933020653898e-06, + "logits/chosen": 554038016.0, + "logits/rejected": 1140671897.6, + "logps/chosen": -426.7984212239583, + "logps/rejected": -389.522119140625, + "loss": 0.2088, + "rewards/chosen": 1.3200043042500813, + "rewards/margins": 5.036685212453206, + "rewards/rejected": -3.716680908203125, + "step": 39 + }, + { + "epoch": 0.003654636820465966, + "grad_norm": 14.5625, + "kl": 0.0, + "learning_rate": 9.999925371796981e-06, + "logits/chosen": 365939200.0, + "logits/rejected": 434596300.8, + "logps/chosen": -310.6006673177083, + "logps/rejected": -388.967138671875, + "loss": 0.0841, + "rewards/chosen": 2.707329750061035, + "rewards/margins": 6.275337409973145, + "rewards/rejected": -3.5680076599121096, + "step": 40 + }, + { + "epoch": 0.003746002740977615, + "grad_norm": 20.375, + "kl": 0.0, + "learning_rate": 9.999917309492561e-06, + "logits/chosen": 444407961.6, + "logits/rejected": 681566250.6666666, + "logps/chosen": -251.619384765625, + "logps/rejected": -566.0886637369791, + "loss": 0.2004, + "rewards/chosen": 1.0545282363891602, + "rewards/margins": 4.762812296549479, + "rewards/rejected": -3.708284060160319, + "step": 41 + }, + { + "epoch": 0.0038373686614892647, + "grad_norm": 18.125, + "kl": 0.0, + "learning_rate": 9.999908833741307e-06, + "logits/chosen": 521467946.6666667, + "logits/rejected": 436853376.0, + "logps/chosen": -480.2768147786458, + "logps/rejected": -317.75, + "loss": 0.1615, + "rewards/chosen": 2.3790931701660156, + "rewards/margins": 3.020178973674774, + "rewards/rejected": -0.6410858035087585, + "step": 42 + }, + { + "epoch": 0.003928734582000913, + "grad_norm": 20.25, + "kl": 0.0, + "learning_rate": 9.999899944543917e-06, + "logits/chosen": 340645120.0, + "logits/rejected": 390442752.0, + "logps/chosen": -361.0386962890625, + "logps/rejected": -306.352783203125, + "loss": 0.1894, + "rewards/chosen": 2.65283203125, + "rewards/margins": 4.526552200317383, + "rewards/rejected": -1.8737201690673828, + "step": 43 + }, + { + "epoch": 0.004020100502512563, + "grad_norm": 16.25, + "kl": 0.0, + "learning_rate": 9.999890641901124e-06, + "logits/chosen": 551446848.0, + "logits/rejected": 346815584.0, + "logps/chosen": -355.0975036621094, + "logps/rejected": -349.72808837890625, + "loss": 0.0958, + "rewards/chosen": 2.2526707649230957, + "rewards/margins": 5.174274921417236, + "rewards/rejected": -2.9216041564941406, + "step": 44 + }, + { + "epoch": 0.004111466423024212, + "grad_norm": 15.875, + "kl": 0.0, + "learning_rate": 9.9998809258137e-06, + "logits/chosen": 570520883.2, + "logits/rejected": 850861141.3333334, + "logps/chosen": -413.144775390625, + "logps/rejected": -296.9521077473958, + "loss": 0.1109, + "rewards/chosen": 2.2615119934082033, + "rewards/margins": 4.66077569325765, + "rewards/rejected": -2.3992636998494468, + "step": 45 + }, + { + "epoch": 0.004202832343535861, + "grad_norm": 15.875, + "kl": 0.0, + "learning_rate": 9.999870796282452e-06, + "logits/chosen": 478474272.0, + "logits/rejected": 852864341.3333334, + "logps/chosen": -235.56350708007812, + "logps/rejected": -464.0865478515625, + "loss": 0.0903, + "rewards/chosen": 2.1333107948303223, + "rewards/margins": 5.6135071118672695, + "rewards/rejected": -3.4801963170369468, + "step": 46 + }, + { + "epoch": 0.00429419826404751, + "grad_norm": 19.5, + "kl": 0.0, + "learning_rate": 9.999860253308211e-06, + "logits/chosen": 1009198899.2, + "logits/rejected": 535837098.6666667, + "logps/chosen": -423.1767578125, + "logps/rejected": -477.708251953125, + "loss": 0.1468, + "rewards/chosen": 1.471944808959961, + "rewards/margins": 6.172437349955241, + "rewards/rejected": -4.70049254099528, + "step": 47 + }, + { + "epoch": 0.004385564184559159, + "grad_norm": 10.3125, + "kl": 0.0, + "learning_rate": 9.999849296891854e-06, + "logits/chosen": 651788885.3333334, + "logits/rejected": 490678425.6, + "logps/chosen": -477.6344401041667, + "logps/rejected": -309.082568359375, + "loss": 0.0754, + "rewards/chosen": 2.013704299926758, + "rewards/margins": 5.690828323364258, + "rewards/rejected": -3.6771240234375, + "step": 48 + }, + { + "epoch": 0.0044769301050708084, + "grad_norm": 15.5625, + "kl": 0.0, + "learning_rate": 9.999837927034284e-06, + "logits/chosen": 452963104.0, + "logits/rejected": 749722368.0, + "logps/chosen": -300.3709716796875, + "logps/rejected": -368.5940856933594, + "loss": 0.1675, + "rewards/chosen": 1.3432636260986328, + "rewards/margins": 5.07336950302124, + "rewards/rejected": -3.7301058769226074, + "step": 49 + }, + { + "epoch": 0.0045682960255824575, + "grad_norm": 14.5, + "kl": 0.0, + "learning_rate": 9.999826143736441e-06, + "logits/chosen": 551608012.8, + "logits/rejected": 350196800.0, + "logps/chosen": -505.86474609375, + "logps/rejected": -236.8348388671875, + "loss": 0.1372, + "rewards/chosen": 2.0717029571533203, + "rewards/margins": 4.503201484680176, + "rewards/rejected": -2.4314985275268555, + "step": 50 + }, + { + "epoch": 0.0046596619460941065, + "grad_norm": 15.25, + "kl": 0.0, + "learning_rate": 9.9998139469993e-06, + "logits/chosen": 848630186.6666666, + "logits/rejected": 854665318.4, + "logps/chosen": -566.044677734375, + "logps/rejected": -434.58828125, + "loss": 0.1108, + "rewards/chosen": 2.3380066553751626, + "rewards/margins": 6.0741882960001625, + "rewards/rejected": -3.736181640625, + "step": 51 + }, + { + "epoch": 0.0047510278666057565, + "grad_norm": 14.5625, + "kl": 0.0, + "learning_rate": 9.999801336823871e-06, + "logits/chosen": 754775808.0, + "logits/rejected": 1076567381.3333333, + "logps/chosen": -278.720751953125, + "logps/rejected": -493.1020100911458, + "loss": 0.1097, + "rewards/chosen": 1.758782196044922, + "rewards/margins": 6.349667676289876, + "rewards/rejected": -4.590885480244954, + "step": 52 + }, + { + "epoch": 0.0048423937871174055, + "grad_norm": 8.75, + "kl": 0.0, + "learning_rate": 9.999788313211196e-06, + "logits/chosen": 789083328.0, + "logits/rejected": 631280128.0, + "logps/chosen": -312.7073974609375, + "logps/rejected": -486.7452087402344, + "loss": 0.0729, + "rewards/chosen": 2.1830005645751953, + "rewards/margins": 7.194818019866943, + "rewards/rejected": -5.011817455291748, + "step": 53 + }, + { + "epoch": 0.0049337597076290545, + "grad_norm": 12.3125, + "kl": 0.0, + "learning_rate": 9.99977487616235e-06, + "logits/chosen": 440175200.0, + "logits/rejected": 657491712.0, + "logps/chosen": -337.6492614746094, + "logps/rejected": -842.5576782226562, + "loss": 0.071, + "rewards/chosen": 1.9724400043487549, + "rewards/margins": 10.62178921699524, + "rewards/rejected": -8.649349212646484, + "step": 54 + }, + { + "epoch": 0.005025125628140704, + "grad_norm": 16.25, + "kl": 0.0, + "learning_rate": 9.999761025678448e-06, + "logits/chosen": 769861888.0, + "logits/rejected": 716852544.0, + "logps/chosen": -295.38295491536456, + "logps/rejected": -509.1496887207031, + "loss": 0.1225, + "rewards/chosen": 1.8952773412068684, + "rewards/margins": 6.841698010762532, + "rewards/rejected": -4.946420669555664, + "step": 55 + }, + { + "epoch": 0.005116491548652353, + "grad_norm": 13.625, + "kl": 0.0, + "learning_rate": 9.999746761760632e-06, + "logits/chosen": 459658976.0, + "logits/rejected": 503131712.0, + "logps/chosen": -285.3960876464844, + "logps/rejected": -388.3927001953125, + "loss": 0.0901, + "rewards/chosen": 1.972360372543335, + "rewards/margins": 7.272141695022583, + "rewards/rejected": -5.299781322479248, + "step": 56 + }, + { + "epoch": 0.005207857469164002, + "grad_norm": 11.1875, + "kl": 0.0, + "learning_rate": 9.999732084410082e-06, + "logits/chosen": 240044000.0, + "logits/rejected": 513084635.4285714, + "logps/chosen": -195.21902465820312, + "logps/rejected": -575.3398786272321, + "loss": 0.0572, + "rewards/chosen": 0.5988388061523438, + "rewards/margins": 8.22679683140346, + "rewards/rejected": -7.627958025251116, + "step": 57 + }, + { + "epoch": 0.005299223389675651, + "grad_norm": 17.0, + "kl": 0.0, + "learning_rate": 9.99971699362801e-06, + "logits/chosen": 630427136.0, + "logits/rejected": 490157888.0, + "logps/chosen": -352.4774576822917, + "logps/rejected": -344.29486083984375, + "loss": 0.1113, + "rewards/chosen": 2.2012507120768228, + "rewards/margins": 5.445312182108561, + "rewards/rejected": -3.2440614700317383, + "step": 58 + }, + { + "epoch": 0.0053905893101873, + "grad_norm": 19.375, + "kl": 0.0, + "learning_rate": 9.99970148941567e-06, + "logits/chosen": 686695744.0, + "logits/rejected": 903971712.0, + "logps/chosen": -440.14794921875, + "logps/rejected": -447.47039794921875, + "loss": 0.1405, + "rewards/chosen": 1.4599071741104126, + "rewards/margins": 7.519340634346008, + "rewards/rejected": -6.059433460235596, + "step": 59 + }, + { + "epoch": 0.00548195523069895, + "grad_norm": 8.125, + "kl": 0.0, + "learning_rate": 9.999685571774338e-06, + "logits/chosen": 402567648.0, + "logits/rejected": 392069461.3333333, + "logps/chosen": -350.4725341796875, + "logps/rejected": -428.272216796875, + "loss": 0.0453, + "rewards/chosen": 2.5612688064575195, + "rewards/margins": 7.007386207580566, + "rewards/rejected": -4.446117401123047, + "step": 60 + }, + { + "epoch": 0.005573321151210599, + "grad_norm": 19.375, + "kl": 0.0, + "learning_rate": 9.999669240705332e-06, + "logits/chosen": 504294698.6666667, + "logits/rejected": 705165056.0, + "logps/chosen": -402.9945068359375, + "logps/rejected": -655.8289794921875, + "loss": 0.1133, + "rewards/chosen": 2.26107390721639, + "rewards/margins": 10.980092843373617, + "rewards/rejected": -8.719018936157227, + "step": 61 + }, + { + "epoch": 0.005664687071722248, + "grad_norm": 15.125, + "kl": 0.0, + "learning_rate": 9.999652496210004e-06, + "logits/chosen": 563712085.3333334, + "logits/rejected": 447832960.0, + "logps/chosen": -329.8593343098958, + "logps/rejected": -480.57757568359375, + "loss": 0.1366, + "rewards/chosen": 1.8120975494384766, + "rewards/margins": 7.419693946838379, + "rewards/rejected": -5.607596397399902, + "step": 62 + }, + { + "epoch": 0.005756052992233897, + "grad_norm": 15.6875, + "kl": 0.0, + "learning_rate": 9.999635338289737e-06, + "logits/chosen": 824662976.0, + "logits/rejected": 506150112.0, + "logps/chosen": -339.1749572753906, + "logps/rejected": -347.61309814453125, + "loss": 0.1247, + "rewards/chosen": 2.224782705307007, + "rewards/margins": 6.1130475997924805, + "rewards/rejected": -3.8882648944854736, + "step": 63 + }, + { + "epoch": 0.005847418912745546, + "grad_norm": 13.8125, + "kl": 0.0, + "learning_rate": 9.99961776694595e-06, + "logits/chosen": 656741171.2, + "logits/rejected": 718118314.6666666, + "logps/chosen": -357.8573486328125, + "logps/rejected": -435.949462890625, + "loss": 0.0924, + "rewards/chosen": 2.550014686584473, + "rewards/margins": 5.979188791910808, + "rewards/rejected": -3.4291741053263345, + "step": 64 + }, + { + "epoch": 0.005938784833257195, + "grad_norm": 13.0, + "kl": 0.0, + "learning_rate": 9.999599782180096e-06, + "logits/chosen": 1562524330.6666667, + "logits/rejected": 620488089.6, + "logps/chosen": -443.2604166666667, + "logps/rejected": -545.054296875, + "loss": 0.0955, + "rewards/chosen": 1.3891909917195637, + "rewards/margins": 6.616550000508626, + "rewards/rejected": -5.227359008789063, + "step": 65 + }, + { + "epoch": 0.006030150753768844, + "grad_norm": 12.6875, + "kl": 0.0, + "learning_rate": 9.999581383993664e-06, + "logits/chosen": 1200145152.0, + "logits/rejected": 1196677760.0, + "logps/chosen": -508.48687744140625, + "logps/rejected": -595.623046875, + "loss": 0.0882, + "rewards/chosen": 1.7553963661193848, + "rewards/margins": 6.560874938964844, + "rewards/rejected": -4.805478572845459, + "step": 66 + }, + { + "epoch": 0.006121516674280493, + "grad_norm": 15.0625, + "kl": 0.0, + "learning_rate": 9.99956257238817e-06, + "logits/chosen": 542358656.0, + "logits/rejected": 268580832.0, + "logps/chosen": -343.068603515625, + "logps/rejected": -373.43109130859375, + "loss": 0.0895, + "rewards/chosen": 1.6975092887878418, + "rewards/margins": 6.4940361976623535, + "rewards/rejected": -4.796526908874512, + "step": 67 + }, + { + "epoch": 0.006212882594792143, + "grad_norm": 8.3125, + "kl": 0.0, + "learning_rate": 9.999543347365177e-06, + "logits/chosen": 658812928.0, + "logits/rejected": 594626048.0, + "logps/chosen": -345.14117431640625, + "logps/rejected": -506.5757242838542, + "loss": 0.0377, + "rewards/chosen": 2.0009355545043945, + "rewards/margins": 7.471582730611165, + "rewards/rejected": -5.4706471761067705, + "step": 68 + }, + { + "epoch": 0.006304248515303792, + "grad_norm": 13.8125, + "kl": 0.0, + "learning_rate": 9.99952370892627e-06, + "logits/chosen": 755485866.6666666, + "logits/rejected": 461738905.6, + "logps/chosen": -271.9269612630208, + "logps/rejected": -346.8568359375, + "loss": 0.1421, + "rewards/chosen": 1.3530228932698567, + "rewards/margins": 5.561391576131185, + "rewards/rejected": -4.2083686828613285, + "step": 69 + }, + { + "epoch": 0.006395614435815441, + "grad_norm": 11.1875, + "kl": 0.0, + "learning_rate": 9.999503657073072e-06, + "logits/chosen": 607040064.0, + "logits/rejected": 843865002.6666666, + "logps/chosen": -210.44244384765625, + "logps/rejected": -452.3583984375, + "loss": 0.0858, + "rewards/chosen": 2.0541203022003174, + "rewards/margins": 7.652382453282674, + "rewards/rejected": -5.5982621510823565, + "step": 70 + }, + { + "epoch": 0.00648698035632709, + "grad_norm": 14.125, + "kl": 0.0, + "learning_rate": 9.999483191807245e-06, + "logits/chosen": 588574515.2, + "logits/rejected": 650703701.3333334, + "logps/chosen": -395.45537109375, + "logps/rejected": -383.0423177083333, + "loss": 0.0773, + "rewards/chosen": 2.4763607025146483, + "rewards/margins": 6.48607546488444, + "rewards/rejected": -4.009714762369792, + "step": 71 + }, + { + "epoch": 0.006578346276838739, + "grad_norm": 13.8125, + "kl": 0.0, + "learning_rate": 9.999462313130478e-06, + "logits/chosen": 379879338.6666667, + "logits/rejected": 378753216.0, + "logps/chosen": -435.6193033854167, + "logps/rejected": -324.29022216796875, + "loss": 0.0946, + "rewards/chosen": 2.3181234995524087, + "rewards/margins": 6.091519991556803, + "rewards/rejected": -3.7733964920043945, + "step": 72 + }, + { + "epoch": 0.006669712197350388, + "grad_norm": 8.5, + "kl": 0.0, + "learning_rate": 9.9994410210445e-06, + "logits/chosen": 455300224.0, + "logits/rejected": 524283840.0, + "logps/chosen": -327.77520751953125, + "logps/rejected": -464.78033447265625, + "loss": 0.061, + "rewards/chosen": 3.439657211303711, + "rewards/margins": 7.334168195724487, + "rewards/rejected": -3.8945109844207764, + "step": 73 + }, + { + "epoch": 0.006761078117862037, + "grad_norm": 16.375, + "kl": 0.0, + "learning_rate": 9.999419315551069e-06, + "logits/chosen": 371856640.0, + "logits/rejected": 506393702.4, + "logps/chosen": -286.5380452473958, + "logps/rejected": -507.34775390625, + "loss": 0.0951, + "rewards/chosen": 2.5518178939819336, + "rewards/margins": 7.32028865814209, + "rewards/rejected": -4.768470764160156, + "step": 74 + }, + { + "epoch": 0.006852444038373686, + "grad_norm": 20.75, + "kl": 0.0, + "learning_rate": 9.999397196651982e-06, + "logits/chosen": 411864934.4, + "logits/rejected": 378972501.3333333, + "logps/chosen": -341.4308837890625, + "logps/rejected": -431.8441569010417, + "loss": 0.1073, + "rewards/chosen": 2.100117492675781, + "rewards/margins": 5.612150637308757, + "rewards/rejected": -3.512033144632975, + "step": 75 + }, + { + "epoch": 0.006943809958885336, + "grad_norm": 17.625, + "kl": 0.0, + "learning_rate": 9.999374664349067e-06, + "logits/chosen": 398485964.8, + "logits/rejected": 556746880.0, + "logps/chosen": -312.954296875, + "logps/rejected": -557.2219645182291, + "loss": 0.1251, + "rewards/chosen": 2.1293014526367187, + "rewards/margins": 4.965991719563802, + "rewards/rejected": -2.8366902669270835, + "step": 76 + }, + { + "epoch": 0.007035175879396985, + "grad_norm": 14.3125, + "kl": 0.0, + "learning_rate": 9.999351718644188e-06, + "logits/chosen": 915518259.2, + "logits/rejected": 542793386.6666666, + "logps/chosen": -399.313232421875, + "logps/rejected": -433.8512369791667, + "loss": 0.0892, + "rewards/chosen": 2.3771217346191404, + "rewards/margins": 8.093326059977214, + "rewards/rejected": -5.716204325358073, + "step": 77 + }, + { + "epoch": 0.007126541799908634, + "grad_norm": 19.625, + "kl": 0.0, + "learning_rate": 9.999328359539243e-06, + "logits/chosen": 719800320.0, + "logits/rejected": 942849536.0, + "logps/chosen": -335.088623046875, + "logps/rejected": -549.689697265625, + "loss": 0.1681, + "rewards/chosen": 1.7237802233014787, + "rewards/margins": 6.50653178351266, + "rewards/rejected": -4.782751560211182, + "step": 78 + }, + { + "epoch": 0.007217907720420283, + "grad_norm": 10.5625, + "kl": 0.0, + "learning_rate": 9.99930458703616e-06, + "logits/chosen": 635299174.4, + "logits/rejected": 484221994.6666667, + "logps/chosen": -508.17705078125, + "logps/rejected": -392.7220052083333, + "loss": 0.0662, + "rewards/chosen": 2.4004161834716795, + "rewards/margins": 7.132271575927734, + "rewards/rejected": -4.731855392456055, + "step": 79 + }, + { + "epoch": 0.007309273640931932, + "grad_norm": 14.0, + "kl": 0.0, + "learning_rate": 9.999280401136909e-06, + "logits/chosen": 390983765.3333333, + "logits/rejected": 563390336.0, + "logps/chosen": -454.0709635416667, + "logps/rejected": -661.2803344726562, + "loss": 0.0994, + "rewards/chosen": 2.3733463287353516, + "rewards/margins": 7.1019062995910645, + "rewards/rejected": -4.728559970855713, + "step": 80 + }, + { + "epoch": 0.007400639561443581, + "grad_norm": 8.8125, + "kl": 0.0, + "learning_rate": 9.999255801843487e-06, + "logits/chosen": 1436225877.3333333, + "logits/rejected": 700833484.8, + "logps/chosen": -196.6618448893229, + "logps/rejected": -512.858203125, + "loss": 0.0696, + "rewards/chosen": 2.088996410369873, + "rewards/margins": 8.717257976531982, + "rewards/rejected": -6.628261566162109, + "step": 81 + }, + { + "epoch": 0.00749200548195523, + "grad_norm": 14.9375, + "kl": 0.0, + "learning_rate": 9.99923078915793e-06, + "logits/chosen": 617324224.0, + "logits/rejected": 537092928.0, + "logps/chosen": -167.12062072753906, + "logps/rejected": -389.23486328125, + "loss": 0.1296, + "rewards/chosen": 2.065999984741211, + "rewards/margins": 6.345474720001221, + "rewards/rejected": -4.27947473526001, + "step": 82 + }, + { + "epoch": 0.0075833714024668795, + "grad_norm": 20.5, + "kl": 0.0, + "learning_rate": 9.999205363082305e-06, + "logits/chosen": 774964224.0, + "logits/rejected": 853113088.0, + "logps/chosen": -363.3079427083333, + "logps/rejected": -446.780517578125, + "loss": 0.1468, + "rewards/chosen": 2.4599167505900064, + "rewards/margins": 4.912426869074503, + "rewards/rejected": -2.452510118484497, + "step": 83 + }, + { + "epoch": 0.007674737322978529, + "grad_norm": 19.75, + "kl": 0.0, + "learning_rate": 9.999179523618715e-06, + "logits/chosen": 423204128.0, + "logits/rejected": 783750336.0, + "logps/chosen": -282.6246337890625, + "logps/rejected": -425.5149230957031, + "loss": 0.1313, + "rewards/chosen": 2.823007822036743, + "rewards/margins": 5.823093414306641, + "rewards/rejected": -3.0000855922698975, + "step": 84 + }, + { + "epoch": 0.0077661032434901784, + "grad_norm": 14.75, + "kl": 0.0, + "learning_rate": 9.999153270769298e-06, + "logits/chosen": 436267008.0, + "logits/rejected": 459310592.0, + "logps/chosen": -167.16461181640625, + "logps/rejected": -400.7852376302083, + "loss": 0.1047, + "rewards/chosen": 1.7776031494140625, + "rewards/margins": 5.33476448059082, + "rewards/rejected": -3.557161331176758, + "step": 85 + }, + { + "epoch": 0.007857469164001827, + "grad_norm": 11.8125, + "kl": 0.0, + "learning_rate": 9.999126604536224e-06, + "logits/chosen": 492013088.0, + "logits/rejected": 451838250.6666667, + "logps/chosen": -355.6337890625, + "logps/rejected": -452.3369547526042, + "loss": 0.1049, + "rewards/chosen": 2.5389404296875, + "rewards/margins": 6.184647878011068, + "rewards/rejected": -3.645707448323568, + "step": 86 + }, + { + "epoch": 0.007948835084513476, + "grad_norm": 15.0, + "kl": 0.0, + "learning_rate": 9.999099524921696e-06, + "logits/chosen": 635807616.0, + "logits/rejected": 335328819.2, + "logps/chosen": -333.88623046875, + "logps/rejected": -348.6157470703125, + "loss": 0.0944, + "rewards/chosen": 2.088528633117676, + "rewards/margins": 5.925960731506348, + "rewards/rejected": -3.837432098388672, + "step": 87 + }, + { + "epoch": 0.008040201005025126, + "grad_norm": 18.0, + "kl": 0.0, + "learning_rate": 9.999072031927957e-06, + "logits/chosen": 630380748.8, + "logits/rejected": 643897856.0, + "logps/chosen": -362.128662109375, + "logps/rejected": -408.7527669270833, + "loss": 0.1415, + "rewards/chosen": 1.9611045837402343, + "rewards/margins": 3.8548201243082683, + "rewards/rejected": -1.893715540568034, + "step": 88 + }, + { + "epoch": 0.008131566925536775, + "grad_norm": 15.0625, + "kl": 0.0, + "learning_rate": 9.999044125557277e-06, + "logits/chosen": 617453714.2857143, + "logits/rejected": 37267732.0, + "logps/chosen": -440.01429966517856, + "logps/rejected": -1596.66845703125, + "loss": 0.1091, + "rewards/chosen": 2.2263171332223073, + "rewards/margins": 12.139781679425921, + "rewards/rejected": -9.913464546203613, + "step": 89 + }, + { + "epoch": 0.008222932846048425, + "grad_norm": 6.6875, + "kl": 0.0, + "learning_rate": 9.999015805811965e-06, + "logits/chosen": 386442325.3333333, + "logits/rejected": 576837632.0, + "logps/chosen": -259.36008707682294, + "logps/rejected": -462.6103515625, + "loss": 0.0364, + "rewards/chosen": 3.1739670435587564, + "rewards/margins": 9.377681414286295, + "rewards/rejected": -6.203714370727539, + "step": 90 + }, + { + "epoch": 0.008314298766560074, + "grad_norm": 8.5625, + "kl": 0.0, + "learning_rate": 9.998987072694363e-06, + "logits/chosen": 168197312.0, + "logits/rejected": 503189162.6666667, + "logps/chosen": -416.7679748535156, + "logps/rejected": -397.5300699869792, + "loss": 0.049, + "rewards/chosen": 2.572958469390869, + "rewards/margins": 8.863752841949463, + "rewards/rejected": -6.290794372558594, + "step": 91 + }, + { + "epoch": 0.008405664687071723, + "grad_norm": 19.625, + "kl": 0.0, + "learning_rate": 9.998957926206848e-06, + "logits/chosen": 455411404.8, + "logits/rejected": 314681514.6666667, + "logps/chosen": -415.914892578125, + "logps/rejected": -261.4064534505208, + "loss": 0.1447, + "rewards/chosen": 1.6469842910766601, + "rewards/margins": 5.018102200826009, + "rewards/rejected": -3.371117909749349, + "step": 92 + }, + { + "epoch": 0.008497030607583372, + "grad_norm": 7.84375, + "kl": 0.0, + "learning_rate": 9.998928366351826e-06, + "logits/chosen": 531622186.6666667, + "logits/rejected": 442105241.6, + "logps/chosen": -469.8058675130208, + "logps/rejected": -560.32265625, + "loss": 0.0426, + "rewards/chosen": 2.3435230255126953, + "rewards/margins": 9.864054489135743, + "rewards/rejected": -7.520531463623047, + "step": 93 + }, + { + "epoch": 0.00858839652809502, + "grad_norm": 12.3125, + "kl": 0.0, + "learning_rate": 9.998898393131746e-06, + "logits/chosen": 334721088.0, + "logits/rejected": 260435984.0, + "logps/chosen": -245.5447540283203, + "logps/rejected": -492.866943359375, + "loss": 0.058, + "rewards/chosen": 2.181605815887451, + "rewards/margins": 8.959604263305664, + "rewards/rejected": -6.777998447418213, + "step": 94 + }, + { + "epoch": 0.00867976244860667, + "grad_norm": 11.3125, + "kl": 0.0, + "learning_rate": 9.998868006549085e-06, + "logits/chosen": 771122048.0, + "logits/rejected": 510957184.0, + "logps/chosen": -351.5924377441406, + "logps/rejected": -353.7041015625, + "loss": 0.0511, + "rewards/chosen": 1.5520660877227783, + "rewards/margins": 7.634522040685018, + "rewards/rejected": -6.082455952962239, + "step": 95 + }, + { + "epoch": 0.008771128369118319, + "grad_norm": 14.6875, + "kl": 0.0, + "learning_rate": 9.998837206606355e-06, + "logits/chosen": 497211084.8, + "logits/rejected": 541847722.6666666, + "logps/chosen": -294.7218994140625, + "logps/rejected": -473.6376139322917, + "loss": 0.1095, + "rewards/chosen": 2.4349828720092774, + "rewards/margins": 5.586231168111166, + "rewards/rejected": -3.151248296101888, + "step": 96 + }, + { + "epoch": 0.008862494289629968, + "grad_norm": 23.0, + "kl": 0.0, + "learning_rate": 9.998805993306102e-06, + "logits/chosen": 581252864.0, + "logits/rejected": 536360576.0, + "logps/chosen": -405.89520263671875, + "logps/rejected": -389.2760009765625, + "loss": 0.1757, + "rewards/chosen": 1.4143342971801758, + "rewards/margins": 5.196229934692383, + "rewards/rejected": -3.781895637512207, + "step": 97 + }, + { + "epoch": 0.008953860210141617, + "grad_norm": 17.5, + "kl": 0.0, + "learning_rate": 9.99877436665091e-06, + "logits/chosen": 367604684.8, + "logits/rejected": 399436629.3333333, + "logps/chosen": -265.0141845703125, + "logps/rejected": -379.7289225260417, + "loss": 0.1554, + "rewards/chosen": 2.14964599609375, + "rewards/margins": 4.828665924072266, + "rewards/rejected": -2.6790199279785156, + "step": 98 + }, + { + "epoch": 0.009045226130653266, + "grad_norm": 25.625, + "kl": 0.0, + "learning_rate": 9.998742326643392e-06, + "logits/chosen": 578828544.0, + "logits/rejected": 567489664.0, + "logps/chosen": -430.3673909505208, + "logps/rejected": -598.36181640625, + "loss": 0.205, + "rewards/chosen": 1.412208080291748, + "rewards/margins": 6.237311840057373, + "rewards/rejected": -4.825103759765625, + "step": 99 + }, + { + "epoch": 0.009136592051164915, + "grad_norm": 9.875, + "kl": 0.0, + "learning_rate": 9.998709873286199e-06, + "logits/chosen": 536498901.3333333, + "logits/rejected": 547572428.8, + "logps/chosen": -358.1209716796875, + "logps/rejected": -603.391064453125, + "loss": 0.067, + "rewards/chosen": 2.087646484375, + "rewards/margins": 9.45535888671875, + "rewards/rejected": -7.36771240234375, + "step": 100 + }, + { + "epoch": 0.009227957971676564, + "grad_norm": 15.3125, + "kl": 0.0, + "learning_rate": 9.998677006582012e-06, + "logits/chosen": 603282880.0, + "logits/rejected": 494785152.0, + "logps/chosen": -420.2431335449219, + "logps/rejected": -552.7396240234375, + "loss": 0.112, + "rewards/chosen": 1.3893921375274658, + "rewards/margins": 8.557427644729614, + "rewards/rejected": -7.168035507202148, + "step": 101 + }, + { + "epoch": 0.009319323892188213, + "grad_norm": 17.625, + "kl": 0.0, + "learning_rate": 9.998643726533551e-06, + "logits/chosen": 757392810.6666666, + "logits/rejected": 431585177.6, + "logps/chosen": -331.8006998697917, + "logps/rejected": -433.146875, + "loss": 0.1228, + "rewards/chosen": 1.327025334040324, + "rewards/margins": 7.435052982966106, + "rewards/rejected": -6.108027648925781, + "step": 102 + }, + { + "epoch": 0.009410689812699862, + "grad_norm": 10.6875, + "kl": 0.0, + "learning_rate": 9.998610033143568e-06, + "logits/chosen": 436461653.3333333, + "logits/rejected": 519460198.4, + "logps/chosen": -297.19691975911456, + "logps/rejected": -631.1525390625, + "loss": 0.055, + "rewards/chosen": 2.914208730061849, + "rewards/margins": 10.436263783772787, + "rewards/rejected": -7.522055053710938, + "step": 103 + }, + { + "epoch": 0.009502055733211513, + "grad_norm": 10.1875, + "kl": 0.0, + "learning_rate": 9.998575926414847e-06, + "logits/chosen": 429880217.6, + "logits/rejected": 603875285.3333334, + "logps/chosen": -334.54765625, + "logps/rejected": -678.1293131510416, + "loss": 0.055, + "rewards/chosen": 2.7323556900024415, + "rewards/margins": 10.127629534403484, + "rewards/rejected": -7.395273844401042, + "step": 104 + }, + { + "epoch": 0.009593421653723162, + "grad_norm": 7.625, + "kl": 0.0, + "learning_rate": 9.99854140635021e-06, + "logits/chosen": 926764885.3333334, + "logits/rejected": 424427238.4, + "logps/chosen": -299.6827392578125, + "logps/rejected": -439.05, + "loss": 0.0561, + "rewards/chosen": 2.1458589235941568, + "rewards/margins": 9.230252234141032, + "rewards/rejected": -7.084393310546875, + "step": 105 + }, + { + "epoch": 0.009684787574234811, + "grad_norm": 16.125, + "kl": 0.0, + "learning_rate": 9.998506472952512e-06, + "logits/chosen": 585051392.0, + "logits/rejected": 845345984.0, + "logps/chosen": -302.7089538574219, + "logps/rejected": -705.5555419921875, + "loss": 0.1038, + "rewards/chosen": 1.7717636823654175, + "rewards/margins": 9.608830332756042, + "rewards/rejected": -7.837066650390625, + "step": 106 + }, + { + "epoch": 0.00977615349474646, + "grad_norm": 14.375, + "kl": 0.0, + "learning_rate": 9.99847112622464e-06, + "logits/chosen": 650496832.0, + "logits/rejected": 654096640.0, + "logps/chosen": -441.0171203613281, + "logps/rejected": -574.965576171875, + "loss": 0.1462, + "rewards/chosen": 1.3091751337051392, + "rewards/margins": 9.774156212806702, + "rewards/rejected": -8.464981079101562, + "step": 107 + }, + { + "epoch": 0.009867519415258109, + "grad_norm": 10.5625, + "kl": 0.0, + "learning_rate": 9.998435366169519e-06, + "logits/chosen": 480529493.3333333, + "logits/rejected": 595978905.6, + "logps/chosen": -211.96358235677084, + "logps/rejected": -500.1755859375, + "loss": 0.116, + "rewards/chosen": 2.824381192525228, + "rewards/margins": 7.6594039281209305, + "rewards/rejected": -4.835022735595703, + "step": 108 + }, + { + "epoch": 0.009958885335769758, + "grad_norm": 12.75, + "kl": 0.0, + "learning_rate": 9.998399192790102e-06, + "logits/chosen": 574872917.3333334, + "logits/rejected": 691649843.2, + "logps/chosen": -302.526611328125, + "logps/rejected": -370.0342041015625, + "loss": 0.0706, + "rewards/chosen": 1.9301055272420247, + "rewards/margins": 7.591490872701009, + "rewards/rejected": -5.661385345458984, + "step": 109 + }, + { + "epoch": 0.010050251256281407, + "grad_norm": 7.34375, + "kl": 0.0, + "learning_rate": 9.998362606089387e-06, + "logits/chosen": 366416128.0, + "logits/rejected": 749045350.4, + "logps/chosen": -202.84505208333334, + "logps/rejected": -464.48447265625, + "loss": 0.1025, + "rewards/chosen": 1.6295277277628581, + "rewards/margins": 8.830904070536295, + "rewards/rejected": -7.201376342773438, + "step": 110 + }, + { + "epoch": 0.010141617176793056, + "grad_norm": 27.625, + "kl": 0.0, + "learning_rate": 9.998325606070392e-06, + "logits/chosen": 656592896.0, + "logps/chosen": -423.50872802734375, + "loss": 0.1865, + "rewards/chosen": 1.8025710582733154, + "step": 111 + }, + { + "epoch": 0.010232983097304705, + "grad_norm": 18.375, + "kl": 0.0, + "learning_rate": 9.99828819273618e-06, + "logits/chosen": 675901732.5714285, + "logits/rejected": 746611520.0, + "logps/chosen": -425.18819754464283, + "logps/rejected": -658.3265991210938, + "loss": 0.1637, + "rewards/chosen": 1.931525639125279, + "rewards/margins": 7.147297314235143, + "rewards/rejected": -5.215771675109863, + "step": 112 + }, + { + "epoch": 0.010324349017816354, + "grad_norm": 17.75, + "kl": 0.0, + "learning_rate": 9.998250366089848e-06, + "logits/chosen": 333187968.0, + "logits/rejected": 426496896.0, + "logps/chosen": -224.2114054361979, + "logps/rejected": -456.89239501953125, + "loss": 0.1081, + "rewards/chosen": 2.3618291219075522, + "rewards/margins": 7.345755894978842, + "rewards/rejected": -4.983926773071289, + "step": 113 + }, + { + "epoch": 0.010415714938328003, + "grad_norm": 11.75, + "kl": 0.0, + "learning_rate": 9.998212126134517e-06, + "logits/chosen": 277573440.0, + "logits/rejected": 391987507.2, + "logps/chosen": -257.1490478515625, + "logps/rejected": -458.9974609375, + "loss": 0.0861, + "rewards/chosen": 1.947444756825765, + "rewards/margins": 8.11323226292928, + "rewards/rejected": -6.1657875061035154, + "step": 114 + }, + { + "epoch": 0.010507080858839652, + "grad_norm": 13.0, + "kl": 0.0, + "learning_rate": 9.998173472873354e-06, + "logits/chosen": 616676608.0, + "logits/rejected": 441191744.0, + "logps/chosen": -273.8734130859375, + "logps/rejected": -364.1439208984375, + "loss": 0.1223, + "rewards/chosen": 1.6410858631134033, + "rewards/margins": 5.629139423370361, + "rewards/rejected": -3.988053560256958, + "step": 115 + }, + { + "epoch": 0.010598446779351301, + "grad_norm": 10.125, + "kl": 0.0, + "learning_rate": 9.998134406309555e-06, + "logits/chosen": 194817008.0, + "logits/rejected": 494642834.28571427, + "logps/chosen": -173.742919921875, + "logps/rejected": -434.72732979910717, + "loss": 0.0659, + "rewards/chosen": 2.9549591541290283, + "rewards/margins": 8.208632707595825, + "rewards/rejected": -5.253673553466797, + "step": 116 + }, + { + "epoch": 0.01068981269986295, + "grad_norm": 9.6875, + "kl": 0.0, + "learning_rate": 9.998094926446347e-06, + "logits/chosen": 628480554.6666666, + "logits/rejected": 580144128.0, + "logps/chosen": -300.1923421223958, + "logps/rejected": -276.86981201171875, + "loss": 0.0859, + "rewards/chosen": 2.481581528981527, + "rewards/margins": 5.892811377843222, + "rewards/rejected": -3.4112298488616943, + "step": 117 + }, + { + "epoch": 0.0107811786203746, + "grad_norm": 14.0, + "kl": 0.0, + "learning_rate": 9.998055033286998e-06, + "logits/chosen": 512748851.2, + "logits/rejected": 382642602.6666667, + "logps/chosen": -301.2209716796875, + "logps/rejected": -305.5750732421875, + "loss": 0.0839, + "rewards/chosen": 2.232250785827637, + "rewards/margins": 7.199589347839355, + "rewards/rejected": -4.967338562011719, + "step": 118 + }, + { + "epoch": 0.010872544540886249, + "grad_norm": 11.25, + "kl": 0.0, + "learning_rate": 9.998014726834806e-06, + "logits/chosen": 418093696.0, + "logits/rejected": 518452192.0, + "logps/chosen": -180.65777587890625, + "logps/rejected": -344.9411926269531, + "loss": 0.0844, + "rewards/chosen": 2.3260550498962402, + "rewards/margins": 6.824210166931152, + "rewards/rejected": -4.498155117034912, + "step": 119 + }, + { + "epoch": 0.0109639104613979, + "grad_norm": 17.875, + "kl": 0.0, + "learning_rate": 9.997974007093103e-06, + "logits/chosen": 576276309.3333334, + "logits/rejected": 749371392.0, + "logps/chosen": -362.6407877604167, + "logps/rejected": -286.05059814453125, + "loss": 0.1469, + "rewards/chosen": 2.0136946042378745, + "rewards/margins": 5.494123776753744, + "rewards/rejected": -3.480429172515869, + "step": 120 + }, + { + "epoch": 0.011055276381909548, + "grad_norm": 11.875, + "kl": 0.0, + "learning_rate": 9.997932874065259e-06, + "logits/chosen": 545624985.6, + "logits/rejected": 355180757.3333333, + "logps/chosen": -336.0628662109375, + "logps/rejected": -398.8329671223958, + "loss": 0.08, + "rewards/chosen": 2.5349498748779298, + "rewards/margins": 6.714760716756185, + "rewards/rejected": -4.179810841878255, + "step": 121 + }, + { + "epoch": 0.011146642302421197, + "grad_norm": 13.8125, + "kl": 0.0, + "learning_rate": 9.99789132775467e-06, + "logits/chosen": 556710912.0, + "logits/rejected": 285492736.0, + "logps/chosen": -399.4795735677083, + "logps/rejected": -311.6238037109375, + "loss": 0.0795, + "rewards/chosen": 2.175806681315104, + "rewards/margins": 5.647691599527995, + "rewards/rejected": -3.471884918212891, + "step": 122 + }, + { + "epoch": 0.011238008222932847, + "grad_norm": 7.8125, + "kl": 0.0, + "learning_rate": 9.997849368164776e-06, + "logits/chosen": 544239744.0, + "logits/rejected": 422122837.3333333, + "logps/chosen": -491.3172607421875, + "logps/rejected": -382.6382649739583, + "loss": 0.0329, + "rewards/chosen": 3.344834804534912, + "rewards/margins": 7.909556229909261, + "rewards/rejected": -4.564721425374349, + "step": 123 + }, + { + "epoch": 0.011329374143444496, + "grad_norm": 17.125, + "kl": 0.0, + "learning_rate": 9.997806995299047e-06, + "logits/chosen": 750819669.3333334, + "logits/rejected": 584386662.4, + "logps/chosen": -277.4761962890625, + "logps/rejected": -377.47822265625, + "loss": 0.1778, + "rewards/chosen": 1.897523562113444, + "rewards/margins": 4.631594340006511, + "rewards/rejected": -2.7340707778930664, + "step": 124 + }, + { + "epoch": 0.011420740063956145, + "grad_norm": 17.75, + "kl": 0.0, + "learning_rate": 9.997764209160982e-06, + "logits/chosen": 800498176.0, + "logits/rejected": 1177612032.0, + "logps/chosen": -253.8199951171875, + "logps/rejected": -421.0630289713542, + "loss": 0.155, + "rewards/chosen": 2.2074653625488283, + "rewards/margins": 5.9738250096639, + "rewards/rejected": -3.7663596471150718, + "step": 125 + }, + { + "epoch": 0.011512105984467794, + "grad_norm": 6.15625, + "kl": 0.0, + "learning_rate": 9.997721009754125e-06, + "logits/chosen": 525252768.0, + "logits/rejected": 555374784.0, + "logps/chosen": -136.9763946533203, + "logps/rejected": -382.18194580078125, + "loss": 0.047, + "rewards/chosen": 2.6727654933929443, + "rewards/margins": 7.996193170547485, + "rewards/rejected": -5.323427677154541, + "step": 126 + }, + { + "epoch": 0.011603471904979443, + "grad_norm": 13.3125, + "kl": 0.0, + "learning_rate": 9.997677397082045e-06, + "logits/chosen": 505449173.3333333, + "logits/rejected": 579113792.0, + "logps/chosen": -446.2256673177083, + "logps/rejected": -578.076171875, + "loss": 0.0677, + "rewards/chosen": 2.704678217569987, + "rewards/margins": 7.761611620585123, + "rewards/rejected": -5.056933403015137, + "step": 127 + }, + { + "epoch": 0.011694837825491092, + "grad_norm": 22.875, + "kl": 0.0, + "learning_rate": 9.99763337114835e-06, + "logits/chosen": 535276032.0, + "logits/rejected": 454287040.0, + "logps/chosen": -369.0766906738281, + "logps/rejected": -455.6640625, + "loss": 0.1499, + "rewards/chosen": 2.6543426513671875, + "rewards/margins": 5.893114805221558, + "rewards/rejected": -3.23877215385437, + "step": 128 + }, + { + "epoch": 0.01178620374600274, + "grad_norm": 7.125, + "kl": 0.0, + "learning_rate": 9.997588931956677e-06, + "logits/chosen": 317080576.0, + "logits/rejected": 395551445.3333333, + "logps/chosen": -321.9363037109375, + "logps/rejected": -295.64642333984375, + "loss": 0.0615, + "rewards/chosen": 3.3505294799804686, + "rewards/margins": 5.943486499786377, + "rewards/rejected": -2.592957019805908, + "step": 129 + }, + { + "epoch": 0.01187756966651439, + "grad_norm": 13.5, + "kl": 0.0, + "learning_rate": 9.997544079510702e-06, + "logits/chosen": 704366848.0, + "logits/rejected": 459788416.0, + "logps/chosen": -492.18487548828125, + "logps/rejected": -558.1162516276041, + "loss": 0.0654, + "rewards/chosen": 1.721449375152588, + "rewards/margins": 6.556329568227132, + "rewards/rejected": -4.834880193074544, + "step": 130 + }, + { + "epoch": 0.011968935587026039, + "grad_norm": 9.8125, + "kl": 0.0, + "learning_rate": 9.997498813814138e-06, + "logits/chosen": 384555050.6666667, + "logits/rejected": 414834176.0, + "logps/chosen": -289.60333251953125, + "logps/rejected": -409.5342712402344, + "loss": 0.0553, + "rewards/chosen": 3.0129543940226235, + "rewards/margins": 7.695235888163248, + "rewards/rejected": -4.682281494140625, + "step": 131 + }, + { + "epoch": 0.012060301507537688, + "grad_norm": 19.125, + "kl": 0.0, + "learning_rate": 9.997453134870722e-06, + "logits/chosen": 420280661.3333333, + "logits/rejected": 595392409.6, + "logps/chosen": -235.54638671875, + "logps/rejected": -422.9021484375, + "loss": 0.0987, + "rewards/chosen": 1.407989501953125, + "rewards/margins": 6.459561538696289, + "rewards/rejected": -5.051572036743164, + "step": 132 + }, + { + "epoch": 0.012151667428049337, + "grad_norm": 21.5, + "kl": 0.0, + "learning_rate": 9.997407042684236e-06, + "logits/chosen": 523830374.4, + "logits/rejected": 775414357.3333334, + "logps/chosen": -333.3016357421875, + "logps/rejected": -495.978271484375, + "loss": 0.1237, + "rewards/chosen": 2.0455432891845704, + "rewards/margins": 5.168631807963053, + "rewards/rejected": -3.123088518778483, + "step": 133 + }, + { + "epoch": 0.012243033348560986, + "grad_norm": 10.875, + "kl": 0.0, + "learning_rate": 9.99736053725849e-06, + "logits/chosen": 394983782.4, + "logits/rejected": 346860885.3333333, + "logps/chosen": -229.002001953125, + "logps/rejected": -507.629150390625, + "loss": 0.0897, + "rewards/chosen": 2.0539518356323243, + "rewards/margins": 9.927346483866375, + "rewards/rejected": -7.87339464823405, + "step": 134 + }, + { + "epoch": 0.012334399269072635, + "grad_norm": 7.21875, + "kl": 0.0, + "learning_rate": 9.997313618597327e-06, + "logits/chosen": 439837536.0, + "logits/rejected": 303226144.0, + "logps/chosen": -206.1785125732422, + "logps/rejected": -509.5521240234375, + "loss": 0.0381, + "rewards/chosen": 2.888655185699463, + "rewards/margins": 12.288561344146729, + "rewards/rejected": -9.399906158447266, + "step": 135 + }, + { + "epoch": 0.012425765189584286, + "grad_norm": 16.125, + "kl": 0.0, + "learning_rate": 9.99726628670463e-06, + "logits/chosen": 622575872.0, + "logits/rejected": 359077056.0, + "logps/chosen": -385.26165771484375, + "logps/rejected": -319.69866943359375, + "loss": 0.1557, + "rewards/chosen": 1.699887990951538, + "rewards/margins": 5.411389112472534, + "rewards/rejected": -3.711501121520996, + "step": 136 + }, + { + "epoch": 0.012517131110095935, + "grad_norm": 13.8125, + "kl": 0.0, + "learning_rate": 9.997218541584312e-06, + "logits/chosen": 611165184.0, + "logits/rejected": 688338636.8, + "logps/chosen": -419.6193440755208, + "logps/rejected": -365.3631591796875, + "loss": 0.0893, + "rewards/chosen": 1.3316656748453777, + "rewards/margins": 8.297810236612955, + "rewards/rejected": -6.966144561767578, + "step": 137 + }, + { + "epoch": 0.012608497030607584, + "grad_norm": 21.25, + "kl": 0.0, + "learning_rate": 9.997170383240322e-06, + "logits/chosen": 521366688.0, + "logits/rejected": 637164224.0, + "logps/chosen": -284.03851318359375, + "logps/rejected": -443.23974609375, + "loss": 0.1077, + "rewards/chosen": 2.6046571731567383, + "rewards/margins": 5.47566819190979, + "rewards/rejected": -2.8710110187530518, + "step": 138 + }, + { + "epoch": 0.012699862951119233, + "grad_norm": 21.5, + "kl": 0.0, + "learning_rate": 9.99712181167664e-06, + "logits/chosen": 592319040.0, + "logits/rejected": 411905024.0, + "logps/chosen": -503.58880615234375, + "logps/rejected": -441.1768493652344, + "loss": 0.1654, + "rewards/chosen": 1.7028193473815918, + "rewards/margins": 6.271139144897461, + "rewards/rejected": -4.568319797515869, + "step": 139 + }, + { + "epoch": 0.012791228871630882, + "grad_norm": 10.5625, + "kl": 0.0, + "learning_rate": 9.997072826897283e-06, + "logits/chosen": 245744746.66666666, + "logits/rejected": 361045683.2, + "logps/chosen": -226.8411865234375, + "logps/rejected": -452.1841796875, + "loss": 0.0673, + "rewards/chosen": 2.102822462717692, + "rewards/margins": 7.69202610651652, + "rewards/rejected": -5.589203643798828, + "step": 140 + }, + { + "epoch": 0.012882594792142531, + "grad_norm": 14.375, + "kl": 0.0, + "learning_rate": 9.997023428906302e-06, + "logits/chosen": 374804659.2, + "logits/rejected": 564352896.0, + "logps/chosen": -296.038232421875, + "logps/rejected": -639.9710286458334, + "loss": 0.0889, + "rewards/chosen": 2.0729511260986326, + "rewards/margins": 7.396220016479492, + "rewards/rejected": -5.323268890380859, + "step": 141 + }, + { + "epoch": 0.01297396071265418, + "grad_norm": 8.625, + "kl": 0.0, + "learning_rate": 9.996973617707783e-06, + "logits/chosen": 947912256.0, + "logits/rejected": 877312256.0, + "logps/chosen": -325.4892578125, + "logps/rejected": -903.66845703125, + "loss": 0.0637, + "rewards/chosen": 2.1270406246185303, + "rewards/margins": 8.379358053207397, + "rewards/rejected": -6.252317428588867, + "step": 142 + }, + { + "epoch": 0.01306532663316583, + "grad_norm": 10.4375, + "kl": 0.0, + "learning_rate": 9.996923393305845e-06, + "logits/chosen": 465833600.0, + "logits/rejected": 335064960.0, + "logps/chosen": -387.5977783203125, + "logps/rejected": -325.74053955078125, + "loss": 0.0659, + "rewards/chosen": 2.7178871631622314, + "rewards/margins": 6.853147268295288, + "rewards/rejected": -4.135260105133057, + "step": 143 + }, + { + "epoch": 0.013156692553677478, + "grad_norm": 7.21875, + "kl": 0.0, + "learning_rate": 9.996872755704636e-06, + "logits/chosen": 388287008.0, + "logits/rejected": 403198549.3333333, + "logps/chosen": -306.3246765136719, + "logps/rejected": -494.9126790364583, + "loss": 0.0368, + "rewards/chosen": 1.9783616065979004, + "rewards/margins": 8.104091485341389, + "rewards/rejected": -6.125729878743489, + "step": 144 + }, + { + "epoch": 0.013248058474189127, + "grad_norm": 15.6875, + "kl": 0.0, + "learning_rate": 9.99682170490835e-06, + "logits/chosen": 558971904.0, + "logits/rejected": 776920405.3333334, + "logps/chosen": -314.280224609375, + "logps/rejected": -582.7445882161459, + "loss": 0.1229, + "rewards/chosen": 2.623088073730469, + "rewards/margins": 7.231156857808431, + "rewards/rejected": -4.608068784077962, + "step": 145 + }, + { + "epoch": 0.013339424394700776, + "grad_norm": 20.125, + "kl": 0.0, + "learning_rate": 9.996770240921205e-06, + "logits/chosen": 489751338.6666667, + "logits/rejected": 948525056.0, + "logps/chosen": -406.609375, + "logps/rejected": -369.149462890625, + "loss": 0.1451, + "rewards/chosen": 1.3879577318827312, + "rewards/margins": 4.299528090159098, + "rewards/rejected": -2.911570358276367, + "step": 146 + }, + { + "epoch": 0.013430790315212425, + "grad_norm": 16.0, + "kl": 0.0, + "learning_rate": 9.996718363747457e-06, + "logits/chosen": 587829043.2, + "logits/rejected": 544729685.3333334, + "logps/chosen": -360.54306640625, + "logps/rejected": -875.6702473958334, + "loss": 0.0911, + "rewards/chosen": 2.855602264404297, + "rewards/margins": 15.707785288492838, + "rewards/rejected": -12.852183024088541, + "step": 147 + }, + { + "epoch": 0.013522156235724074, + "grad_norm": 14.5, + "kl": 0.0, + "learning_rate": 9.996666073391397e-06, + "logits/chosen": 446477363.2, + "logits/rejected": 399582122.6666667, + "logps/chosen": -294.755908203125, + "logps/rejected": -484.7743733723958, + "loss": 0.0783, + "rewards/chosen": 2.5268486022949217, + "rewards/margins": 7.910677083333333, + "rewards/rejected": -5.383828481038411, + "step": 148 + }, + { + "epoch": 0.013613522156235723, + "grad_norm": 13.6875, + "kl": 0.0, + "learning_rate": 9.996613369857347e-06, + "logits/chosen": 700850483.2, + "logits/rejected": 558520832.0, + "logps/chosen": -271.6248779296875, + "logps/rejected": -448.9497477213542, + "loss": 0.0989, + "rewards/chosen": 2.2066677093505858, + "rewards/margins": 8.747174072265626, + "rewards/rejected": -6.540506362915039, + "step": 149 + }, + { + "epoch": 0.013704888076747372, + "grad_norm": 4.875, + "kl": 0.0, + "learning_rate": 9.996560253149665e-06, + "logits/chosen": 227275360.0, + "logits/rejected": 540741046.8571428, + "logps/chosen": -227.490966796875, + "logps/rejected": -580.0009765625, + "loss": 0.0198, + "rewards/chosen": 3.689526319503784, + "rewards/margins": 9.662736245564052, + "rewards/rejected": -5.973209926060268, + "step": 150 + }, + { + "epoch": 0.013796253997259022, + "grad_norm": 10.3125, + "kl": 0.0, + "learning_rate": 9.996506723272744e-06, + "logits/chosen": 340374336.0, + "logits/rejected": 486803882.6666667, + "logps/chosen": -274.8784484863281, + "logps/rejected": -475.5374348958333, + "loss": 0.1433, + "rewards/chosen": 2.1704330444335938, + "rewards/margins": 6.582754770914714, + "rewards/rejected": -4.41232172648112, + "step": 151 + }, + { + "epoch": 0.013887619917770672, + "grad_norm": 6.1875, + "kl": 0.0, + "learning_rate": 9.996452780231015e-06, + "logits/chosen": 544801152.0, + "logits/rejected": 407504896.0, + "logps/chosen": -380.34637451171875, + "logps/rejected": -464.88934326171875, + "loss": 0.0421, + "rewards/chosen": 2.5227131843566895, + "rewards/margins": 8.82504653930664, + "rewards/rejected": -6.302333354949951, + "step": 152 + }, + { + "epoch": 0.013978985838282321, + "grad_norm": 20.625, + "kl": 0.0, + "learning_rate": 9.99639842402893e-06, + "logits/chosen": 1025167155.2, + "logits/rejected": 989414997.3333334, + "logps/chosen": -293.00927734375, + "logps/rejected": -284.81072998046875, + "loss": 0.1747, + "rewards/chosen": 2.5982421875, + "rewards/margins": 3.287592077255249, + "rewards/rejected": -0.689349889755249, + "step": 153 + }, + { + "epoch": 0.01407035175879397, + "grad_norm": 12.5625, + "kl": 0.0, + "learning_rate": 9.996343654670988e-06, + "logits/chosen": 511533568.0, + "logits/rejected": 374112384.0, + "logps/chosen": -389.0658365885417, + "logps/rejected": -429.603662109375, + "loss": 0.0659, + "rewards/chosen": 1.7214070955912273, + "rewards/margins": 8.173486868540445, + "rewards/rejected": -6.452079772949219, + "step": 154 + }, + { + "epoch": 0.01416171767930562, + "grad_norm": 4.90625, + "kl": 0.0, + "learning_rate": 9.99628847216172e-06, + "logits/chosen": 431605504.0, + "logits/rejected": 413866048.0, + "logps/chosen": -358.1993103027344, + "logps/rejected": -580.7735595703125, + "loss": 0.0248, + "rewards/chosen": 3.1161866188049316, + "rewards/margins": 11.29806661605835, + "rewards/rejected": -8.181879997253418, + "step": 155 + }, + { + "epoch": 0.014253083599817268, + "grad_norm": 7.40625, + "kl": 0.0, + "learning_rate": 9.996232876505684e-06, + "logits/chosen": 439285145.6, + "logits/rejected": 685792682.6666666, + "logps/chosen": -360.7053466796875, + "logps/rejected": -733.0807291666666, + "loss": 0.0442, + "rewards/chosen": 2.9171186447143556, + "rewards/margins": 10.229528363545736, + "rewards/rejected": -7.31240971883138, + "step": 156 + }, + { + "epoch": 0.014344449520328918, + "grad_norm": 14.125, + "kl": 0.0, + "learning_rate": 9.996176867707484e-06, + "logits/chosen": 449734451.2, + "logits/rejected": 1068474709.3333334, + "logps/chosen": -203.23326416015624, + "logps/rejected": -779.4746907552084, + "loss": 0.107, + "rewards/chosen": 1.9133092880249023, + "rewards/margins": 8.900195630391439, + "rewards/rejected": -6.986886342366536, + "step": 157 + }, + { + "epoch": 0.014435815440840567, + "grad_norm": 9.625, + "kl": 0.0, + "learning_rate": 9.996120445771745e-06, + "logits/chosen": 550699072.0, + "logits/rejected": 653822720.0, + "logps/chosen": -355.1217041015625, + "logps/rejected": -603.9375, + "loss": 0.057, + "rewards/chosen": 2.366633176803589, + "rewards/margins": 7.877247095108032, + "rewards/rejected": -5.510613918304443, + "step": 158 + }, + { + "epoch": 0.014527181361352216, + "grad_norm": 17.375, + "kl": 0.0, + "learning_rate": 9.996063610703138e-06, + "logits/chosen": 564233984.0, + "logps/chosen": -275.875244140625, + "loss": 0.1505, + "rewards/chosen": 1.9724054336547852, + "step": 159 + }, + { + "epoch": 0.014618547281863865, + "grad_norm": 21.625, + "kl": 0.0, + "learning_rate": 9.996006362506356e-06, + "logits/chosen": 1050510165.3333334, + "logits/rejected": 545938022.4, + "logps/chosen": -199.01959228515625, + "logps/rejected": -429.261962890625, + "loss": 0.1724, + "rewards/chosen": 0.6758882204691569, + "rewards/margins": 4.622569433848064, + "rewards/rejected": -3.9466812133789064, + "step": 160 + }, + { + "epoch": 0.014709913202375514, + "grad_norm": 12.8125, + "kl": 0.0, + "learning_rate": 9.99594870118614e-06, + "logits/chosen": 529419724.8, + "logits/rejected": 449026048.0, + "logps/chosen": -256.77470703125, + "logps/rejected": -497.7988688151042, + "loss": 0.1019, + "rewards/chosen": 1.7667591094970703, + "rewards/margins": 6.750644556681316, + "rewards/rejected": -4.983885447184245, + "step": 161 + }, + { + "epoch": 0.014801279122887163, + "grad_norm": 14.875, + "kl": 0.0, + "learning_rate": 9.995890626747253e-06, + "logits/chosen": 752314709.3333334, + "logits/rejected": 763398016.0, + "logps/chosen": -363.3944498697917, + "logps/rejected": -451.4078369140625, + "loss": 0.1306, + "rewards/chosen": 1.9934929211934407, + "rewards/margins": 7.7702406247456866, + "rewards/rejected": -5.776747703552246, + "step": 162 + }, + { + "epoch": 0.014892645043398812, + "grad_norm": 11.5625, + "kl": 0.0, + "learning_rate": 9.9958321391945e-06, + "logits/chosen": 653388714.6666666, + "logits/rejected": 319127961.6, + "logps/chosen": -404.7450764973958, + "logps/rejected": -353.368017578125, + "loss": 0.0583, + "rewards/chosen": 1.8147328694661458, + "rewards/margins": 8.671791585286458, + "rewards/rejected": -6.857058715820313, + "step": 163 + }, + { + "epoch": 0.01498401096391046, + "grad_norm": 10.0625, + "kl": 0.0, + "learning_rate": 9.995773238532716e-06, + "logits/chosen": 508256320.0, + "logits/rejected": 260766992.0, + "logps/chosen": -315.9140319824219, + "logps/rejected": -396.430419921875, + "loss": 0.066, + "rewards/chosen": 2.032170534133911, + "rewards/margins": 8.600802659988403, + "rewards/rejected": -6.568632125854492, + "step": 164 + }, + { + "epoch": 0.01507537688442211, + "grad_norm": 14.625, + "kl": 0.0, + "learning_rate": 9.995713924766772e-06, + "logits/chosen": 792372992.0, + "logits/rejected": 448964352.0, + "logps/chosen": -387.0123779296875, + "logps/rejected": -284.1710611979167, + "loss": 0.1061, + "rewards/chosen": 2.4012466430664063, + "rewards/margins": 6.225024159749349, + "rewards/rejected": -3.823777516682943, + "step": 165 + }, + { + "epoch": 0.015166742804933759, + "grad_norm": 5.375, + "kl": 0.0, + "learning_rate": 9.995654197901575e-06, + "logits/chosen": 986078037.3333334, + "logits/rejected": 517496473.6, + "logps/chosen": -239.84004720052084, + "logps/rejected": -510.80908203125, + "loss": 0.0338, + "rewards/chosen": 2.419210433959961, + "rewards/margins": 9.637997817993163, + "rewards/rejected": -7.218787384033203, + "step": 166 + }, + { + "epoch": 0.015258108725445408, + "grad_norm": 8.9375, + "kl": 0.0, + "learning_rate": 9.995594057942058e-06, + "logits/chosen": 758889152.0, + "logits/rejected": 1074168960.0, + "logps/chosen": -365.06268310546875, + "logps/rejected": -493.5641174316406, + "loss": 0.0574, + "rewards/chosen": 2.207286834716797, + "rewards/margins": 7.796268463134766, + "rewards/rejected": -5.588981628417969, + "step": 167 + }, + { + "epoch": 0.015349474645957059, + "grad_norm": 10.125, + "kl": 0.0, + "learning_rate": 9.9955335048932e-06, + "logits/chosen": 880269568.0, + "logits/rejected": 554792256.0, + "logps/chosen": -308.03558349609375, + "logps/rejected": -334.45513916015625, + "loss": 0.0866, + "rewards/chosen": 2.689962863922119, + "rewards/margins": 5.351337671279907, + "rewards/rejected": -2.661374807357788, + "step": 168 + }, + { + "epoch": 0.015440840566468708, + "grad_norm": 9.5625, + "kl": 0.0, + "learning_rate": 9.995472538760005e-06, + "logits/chosen": 445330329.6, + "logits/rejected": 749482325.3333334, + "logps/chosen": -315.40341796875, + "logps/rejected": -663.9922688802084, + "loss": 0.0691, + "rewards/chosen": 2.6659175872802736, + "rewards/margins": 5.728657913208008, + "rewards/rejected": -3.0627403259277344, + "step": 169 + }, + { + "epoch": 0.015532206486980357, + "grad_norm": 18.0, + "kl": 0.0, + "learning_rate": 9.995411159547515e-06, + "logits/chosen": 587177984.0, + "logits/rejected": 404585830.4, + "logps/chosen": -440.8013509114583, + "logps/rejected": -463.479736328125, + "loss": 0.1681, + "rewards/chosen": 2.005125045776367, + "rewards/margins": 6.3908344268798825, + "rewards/rejected": -4.385709381103515, + "step": 170 + }, + { + "epoch": 0.015623572407492006, + "grad_norm": 11.5625, + "kl": 0.0, + "learning_rate": 9.995349367260807e-06, + "logits/chosen": 407773760.0, + "logits/rejected": 448988245.3333333, + "logps/chosen": -102.16820526123047, + "logps/rejected": -324.89898681640625, + "loss": 0.1013, + "rewards/chosen": 2.5251107215881348, + "rewards/margins": 6.504878520965576, + "rewards/rejected": -3.9797677993774414, + "step": 171 + }, + { + "epoch": 0.015714938328003653, + "grad_norm": 12.0625, + "kl": 0.0, + "learning_rate": 9.995287161904988e-06, + "logits/chosen": 722483136.0, + "logits/rejected": 1150804992.0, + "logps/chosen": -311.7030334472656, + "logps/rejected": -407.6500244140625, + "loss": 0.0996, + "rewards/chosen": 2.608365535736084, + "rewards/margins": 6.228809118270874, + "rewards/rejected": -3.62044358253479, + "step": 172 + }, + { + "epoch": 0.015806304248515302, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 9.995224543485202e-06, + "logits/chosen": 462811584.0, + "logits/rejected": 818075221.3333334, + "logps/chosen": -334.01727294921875, + "logps/rejected": -476.5711263020833, + "loss": 0.0771, + "rewards/chosen": 3.119502305984497, + "rewards/margins": 7.257491985956828, + "rewards/rejected": -4.137989679972331, + "step": 173 + }, + { + "epoch": 0.01589767016902695, + "grad_norm": 7.28125, + "kl": 0.0, + "learning_rate": 9.99516151200663e-06, + "logits/chosen": 445550336.0, + "logits/rejected": 550333568.0, + "logps/chosen": -330.88113839285717, + "logps/rejected": -566.404541015625, + "loss": 0.0521, + "rewards/chosen": 3.0110789707728793, + "rewards/margins": 11.211683409554617, + "rewards/rejected": -8.200604438781738, + "step": 174 + }, + { + "epoch": 0.015989036089538604, + "grad_norm": 16.0, + "kl": 0.0, + "learning_rate": 9.995098067474482e-06, + "logits/chosen": 610984857.6, + "logits/rejected": 1104443648.0, + "logps/chosen": -322.0853759765625, + "logps/rejected": -560.43408203125, + "loss": 0.091, + "rewards/chosen": 2.0893936157226562, + "rewards/margins": 9.916609446207683, + "rewards/rejected": -7.827215830485026, + "step": 175 + }, + { + "epoch": 0.016080402010050253, + "grad_norm": 9.5625, + "kl": 0.0, + "learning_rate": 9.995034209894003e-06, + "logits/chosen": 408425830.4, + "logits/rejected": 437180416.0, + "logps/chosen": -212.7073486328125, + "logps/rejected": -456.3935139973958, + "loss": 0.059, + "rewards/chosen": 2.3621219635009765, + "rewards/margins": 11.496542994181315, + "rewards/rejected": -9.134421030680338, + "step": 176 + }, + { + "epoch": 0.016171767930561902, + "grad_norm": 14.75, + "kl": 0.0, + "learning_rate": 9.994969939270477e-06, + "logits/chosen": 951456938.6666666, + "logits/rejected": 1093444505.6, + "logps/chosen": -685.4672037760416, + "logps/rejected": -395.170361328125, + "loss": 0.1669, + "rewards/chosen": 1.7218424479166667, + "rewards/margins": 5.862511698404949, + "rewards/rejected": -4.140669250488282, + "step": 177 + }, + { + "epoch": 0.01626313385107355, + "grad_norm": 20.75, + "kl": 0.0, + "learning_rate": 9.994905255609214e-06, + "logits/chosen": 721950646.8571428, + "logits/rejected": 679650368.0, + "logps/chosen": -345.5985630580357, + "logps/rejected": -568.7574462890625, + "loss": 0.1794, + "rewards/chosen": 1.5453344072614397, + "rewards/margins": 4.467990704945155, + "rewards/rejected": -2.922656297683716, + "step": 178 + }, + { + "epoch": 0.0163544997715852, + "grad_norm": 5.6875, + "kl": 0.0, + "learning_rate": 9.994840158915567e-06, + "logits/chosen": 412391424.0, + "logits/rejected": 457744469.3333333, + "logps/chosen": -356.58599853515625, + "logps/rejected": -534.5172526041666, + "loss": 0.0219, + "rewards/chosen": 2.8404552936553955, + "rewards/margins": 10.620422283808391, + "rewards/rejected": -7.779966990152995, + "step": 179 + }, + { + "epoch": 0.01644586569209685, + "grad_norm": 12.3125, + "kl": 0.0, + "learning_rate": 9.994774649194916e-06, + "logits/chosen": 496976844.8, + "logits/rejected": 492204245.3333333, + "logps/chosen": -444.153076171875, + "logps/rejected": -604.1487223307291, + "loss": 0.074, + "rewards/chosen": 2.39072265625, + "rewards/margins": 9.443296178181965, + "rewards/rejected": -7.052573521931966, + "step": 180 + }, + { + "epoch": 0.016537231612608498, + "grad_norm": 10.0625, + "kl": 0.0, + "learning_rate": 9.99470872645268e-06, + "logits/chosen": 721597248.0, + "logits/rejected": 948086464.0, + "logps/chosen": -462.1420593261719, + "logps/rejected": -508.7821044921875, + "loss": 0.0469, + "rewards/chosen": 2.5194199085235596, + "rewards/margins": 11.790830850601196, + "rewards/rejected": -9.271410942077637, + "step": 181 + }, + { + "epoch": 0.016628597533120147, + "grad_norm": 10.9375, + "kl": 0.0, + "learning_rate": 9.994642390694308e-06, + "logits/chosen": 502836326.4, + "logits/rejected": 391955029.3333333, + "logps/chosen": -380.184033203125, + "logps/rejected": -468.5186360677083, + "loss": 0.0809, + "rewards/chosen": 2.037322425842285, + "rewards/margins": 8.645354652404786, + "rewards/rejected": -6.6080322265625, + "step": 182 + }, + { + "epoch": 0.016719963453631796, + "grad_norm": 5.09375, + "kl": 0.0, + "learning_rate": 9.994575641925289e-06, + "logits/chosen": 539493760.0, + "logits/rejected": 443056224.0, + "logps/chosen": -383.2914733886719, + "logps/rejected": -429.11309814453125, + "loss": 0.038, + "rewards/chosen": 3.153201103210449, + "rewards/margins": 8.091046333312988, + "rewards/rejected": -4.937845230102539, + "step": 183 + }, + { + "epoch": 0.016811329374143445, + "grad_norm": 11.75, + "kl": 0.0, + "learning_rate": 9.994508480151137e-06, + "logits/chosen": 827270784.0, + "logits/rejected": 503460736.0, + "logps/chosen": -511.4700927734375, + "logps/rejected": -574.6007690429688, + "loss": 0.0669, + "rewards/chosen": 2.1524744033813477, + "rewards/margins": 10.21186351776123, + "rewards/rejected": -8.059389114379883, + "step": 184 + }, + { + "epoch": 0.016902695294655094, + "grad_norm": 2.5, + "kl": 0.0, + "learning_rate": 9.994440905377412e-06, + "logits/rejected": 1101714432.0, + "logps/rejected": -495.81256103515625, + "loss": 0.0076, + "rewards/rejected": -5.707149505615234, + "step": 185 + }, + { + "epoch": 0.016994061215166743, + "grad_norm": 13.8125, + "kl": 0.0, + "learning_rate": 9.994372917609696e-06, + "logits/chosen": 489654144.0, + "logits/rejected": 376002880.0, + "logps/chosen": -329.180908203125, + "logps/rejected": -341.57183837890625, + "loss": 0.1003, + "rewards/chosen": 2.4308207035064697, + "rewards/margins": 5.622649908065796, + "rewards/rejected": -3.191829204559326, + "step": 186 + }, + { + "epoch": 0.017085427135678392, + "grad_norm": 4.96875, + "kl": 0.0, + "learning_rate": 9.994304516853615e-06, + "logits/chosen": 901234240.0, + "logits/rejected": 702204416.0, + "logps/chosen": -549.9593505859375, + "logps/rejected": -495.9773864746094, + "loss": 0.0267, + "rewards/chosen": 3.693507671356201, + "rewards/margins": 9.385332584381104, + "rewards/rejected": -5.691824913024902, + "step": 187 + }, + { + "epoch": 0.01717679305619004, + "grad_norm": 15.4375, + "kl": 0.0, + "learning_rate": 9.994235703114823e-06, + "logits/chosen": 369070677.3333333, + "logits/rejected": 545068544.0, + "logps/chosen": -308.8835856119792, + "logps/rejected": -485.6312255859375, + "loss": 0.1153, + "rewards/chosen": 2.7325995763142905, + "rewards/margins": 6.250614007314047, + "rewards/rejected": -3.518014430999756, + "step": 188 + }, + { + "epoch": 0.01726815897670169, + "grad_norm": 8.1875, + "kl": 0.0, + "learning_rate": 9.994166476399013e-06, + "logits/chosen": 590282496.0, + "logits/rejected": 603108288.0, + "logps/chosen": -501.0787658691406, + "logps/rejected": -632.2008056640625, + "loss": 0.0414, + "rewards/chosen": 2.6728603839874268, + "rewards/margins": 11.146185159683228, + "rewards/rejected": -8.4733247756958, + "step": 189 + }, + { + "epoch": 0.01735952489721334, + "grad_norm": 10.125, + "kl": 0.0, + "learning_rate": 9.994096836711905e-06, + "logits/chosen": 886037674.6666666, + "logits/rejected": 589576192.0, + "logps/chosen": -452.4869791666667, + "logps/rejected": -419.018798828125, + "loss": 0.0675, + "rewards/chosen": 2.061737060546875, + "rewards/margins": 6.6552574157714846, + "rewards/rejected": -4.5935203552246096, + "step": 190 + }, + { + "epoch": 0.01745089081772499, + "grad_norm": 15.875, + "kl": 0.0, + "learning_rate": 9.99402678405926e-06, + "logits/chosen": 517691477.3333333, + "logits/rejected": 378074496.0, + "logps/chosen": -204.7192586263021, + "logps/rejected": -317.79107666015625, + "loss": 0.1076, + "rewards/chosen": 2.2613770167032876, + "rewards/margins": 8.266544024149576, + "rewards/rejected": -6.005167007446289, + "step": 191 + }, + { + "epoch": 0.017542256738236638, + "grad_norm": 6.5, + "kl": 0.0, + "learning_rate": 9.993956318446872e-06, + "logits/chosen": 635619797.3333334, + "logits/rejected": 650694809.6, + "logps/chosen": -217.23905436197916, + "logps/rejected": -699.53173828125, + "loss": 0.1282, + "rewards/chosen": 1.8130922317504883, + "rewards/margins": 10.955374336242675, + "rewards/rejected": -9.142282104492187, + "step": 192 + }, + { + "epoch": 0.017633622658748287, + "grad_norm": 10.6875, + "kl": 0.0, + "learning_rate": 9.993885439880566e-06, + "logits/chosen": 562866432.0, + "logits/rejected": 448355648.0, + "logps/chosen": -298.3255208333333, + "logps/rejected": -511.3240966796875, + "loss": 0.0874, + "rewards/chosen": 2.130950450897217, + "rewards/margins": 6.848798751831055, + "rewards/rejected": -4.717848300933838, + "step": 193 + }, + { + "epoch": 0.017724988579259936, + "grad_norm": 15.625, + "kl": 0.0, + "learning_rate": 9.993814148366204e-06, + "logits/chosen": 587683225.6, + "logits/rejected": 556196864.0, + "logps/chosen": -467.61455078125, + "logps/rejected": -351.0515950520833, + "loss": 0.1051, + "rewards/chosen": 2.1025516510009767, + "rewards/margins": 5.568403752644857, + "rewards/rejected": -3.4658521016438804, + "step": 194 + }, + { + "epoch": 0.017816354499771585, + "grad_norm": 24.75, + "kl": 0.0, + "learning_rate": 9.99374244390968e-06, + "logits/chosen": 855700309.3333334, + "logits/rejected": 1112010752.0, + "logps/chosen": -456.3519694010417, + "logps/rejected": -793.3237915039062, + "loss": 0.1721, + "rewards/chosen": 1.5496541659037273, + "rewards/margins": 10.057440916697184, + "rewards/rejected": -8.507786750793457, + "step": 195 + }, + { + "epoch": 0.017907720420283234, + "grad_norm": 15.4375, + "kl": 0.0, + "learning_rate": 9.993670326516924e-06, + "logits/chosen": 726617216.0, + "logits/rejected": 667348544.0, + "logps/chosen": -381.25286865234375, + "logps/rejected": -343.9889831542969, + "loss": 0.116, + "rewards/chosen": 2.0662989616394043, + "rewards/margins": 6.273783206939697, + "rewards/rejected": -4.207484245300293, + "step": 196 + }, + { + "epoch": 0.017999086340794883, + "grad_norm": 9.6875, + "kl": 0.0, + "learning_rate": 9.9935977961939e-06, + "logits/chosen": 601246890.6666666, + "logits/rejected": 246237184.0, + "logps/chosen": -320.616455078125, + "logps/rejected": -397.26796875, + "loss": 0.1514, + "rewards/chosen": 2.5290025075276694, + "rewards/margins": 6.81605733235677, + "rewards/rejected": -4.287054824829101, + "step": 197 + }, + { + "epoch": 0.018090452261306532, + "grad_norm": 14.3125, + "kl": 0.0, + "learning_rate": 9.993524852946605e-06, + "logits/chosen": 484597760.0, + "logits/rejected": 541747507.2, + "logps/chosen": -165.2218017578125, + "logps/rejected": -513.203271484375, + "loss": 0.0887, + "rewards/chosen": 2.0102291107177734, + "rewards/margins": 7.400114822387695, + "rewards/rejected": -5.389885711669922, + "step": 198 + }, + { + "epoch": 0.01818181818181818, + "grad_norm": 7.03125, + "kl": 0.0, + "learning_rate": 9.993451496781069e-06, + "logits/chosen": 699433045.3333334, + "logits/rejected": 568239206.4, + "logps/chosen": -462.5459798177083, + "logps/rejected": -338.7053955078125, + "loss": 0.0462, + "rewards/chosen": 2.729755719502767, + "rewards/margins": 6.140801175435384, + "rewards/rejected": -3.411045455932617, + "step": 199 + }, + { + "epoch": 0.01827318410232983, + "grad_norm": 4.4375, + "kl": 0.0, + "learning_rate": 9.993377727703361e-06, + "logits/chosen": 612605952.0, + "logits/rejected": 526077824.0, + "logps/chosen": -233.60781860351562, + "logps/rejected": -426.59490966796875, + "loss": 0.0243, + "rewards/chosen": 3.6286654472351074, + "rewards/margins": 8.917576313018799, + "rewards/rejected": -5.288910865783691, + "step": 200 + }, + { + "epoch": 0.01836455002284148, + "grad_norm": 9.4375, + "kl": 0.0, + "learning_rate": 9.99330354571958e-06, + "logits/chosen": 259142688.0, + "logits/rejected": 438279168.0, + "logps/chosen": -143.20977783203125, + "logps/rejected": -392.03250558035717, + "loss": 0.058, + "rewards/chosen": 0.8405655026435852, + "rewards/margins": 5.621083080768585, + "rewards/rejected": -4.780517578125, + "step": 201 + }, + { + "epoch": 0.018455915943353128, + "grad_norm": 7.5, + "kl": 0.0, + "learning_rate": 9.993228950835859e-06, + "logits/chosen": 258461290.66666666, + "logits/rejected": 344833024.0, + "logps/chosen": -246.44771321614584, + "logps/rejected": -506.04608154296875, + "loss": 0.0387, + "rewards/chosen": 3.1363983154296875, + "rewards/margins": 11.349413871765137, + "rewards/rejected": -8.21301555633545, + "step": 202 + }, + { + "epoch": 0.018547281863864777, + "grad_norm": 12.5, + "kl": 0.0, + "learning_rate": 9.993153943058366e-06, + "logits/chosen": 791820970.6666666, + "logits/rejected": 782821632.0, + "logps/chosen": -216.7513427734375, + "logps/rejected": -189.47573852539062, + "loss": 0.1319, + "rewards/chosen": 2.278069019317627, + "rewards/margins": 6.641904354095459, + "rewards/rejected": -4.363835334777832, + "step": 203 + }, + { + "epoch": 0.018638647784376426, + "grad_norm": 11.1875, + "kl": 0.0, + "learning_rate": 9.993078522393305e-06, + "logits/chosen": 400019968.0, + "logits/rejected": 455241088.0, + "logps/chosen": -345.49774169921875, + "logps/rejected": -521.021728515625, + "loss": 0.046, + "rewards/chosen": 1.9883239269256592, + "rewards/margins": 7.374150037765503, + "rewards/rejected": -5.385826110839844, + "step": 204 + }, + { + "epoch": 0.018730013704888075, + "grad_norm": 15.25, + "kl": 0.0, + "learning_rate": 9.993002688846913e-06, + "logits/chosen": 253713056.0, + "logits/rejected": 598079530.6666666, + "logps/chosen": -100.69898223876953, + "logps/rejected": -313.53415934244794, + "loss": 0.1804, + "rewards/chosen": 1.9950653314590454, + "rewards/margins": 5.3517086903254185, + "rewards/rejected": -3.3566433588663735, + "step": 205 + }, + { + "epoch": 0.018821379625399724, + "grad_norm": 4.625, + "kl": 0.0, + "learning_rate": 9.992926442425461e-06, + "logits/chosen": 189050032.0, + "logits/rejected": 423621717.3333333, + "logps/chosen": -363.59625244140625, + "logps/rejected": -466.5875651041667, + "loss": 0.0215, + "rewards/chosen": 2.7692229747772217, + "rewards/margins": 8.68074615796407, + "rewards/rejected": -5.911523183186849, + "step": 206 + }, + { + "epoch": 0.018912745545911377, + "grad_norm": 9.625, + "kl": 0.0, + "learning_rate": 9.99284978313525e-06, + "logits/chosen": 569597952.0, + "logits/rejected": 525191456.0, + "logps/chosen": -406.554931640625, + "logps/rejected": -689.8336181640625, + "loss": 0.0529, + "rewards/chosen": 3.102741241455078, + "rewards/margins": 11.642938613891602, + "rewards/rejected": -8.540197372436523, + "step": 207 + }, + { + "epoch": 0.019004111466423026, + "grad_norm": 7.28125, + "kl": 0.0, + "learning_rate": 9.992772710982624e-06, + "logits/chosen": 452954976.0, + "logits/rejected": 511992896.0, + "logps/chosen": -292.3658142089844, + "logps/rejected": -486.78936767578125, + "loss": 0.0381, + "rewards/chosen": 3.1423633098602295, + "rewards/margins": 8.387789487838745, + "rewards/rejected": -5.245426177978516, + "step": 208 + }, + { + "epoch": 0.019095477386934675, + "grad_norm": 9.25, + "kl": 0.0, + "learning_rate": 9.992695225973952e-06, + "logits/chosen": 373837408.0, + "logits/rejected": 455309312.0, + "logps/chosen": -441.6529541015625, + "logps/rejected": -472.74532645089283, + "loss": 0.0759, + "rewards/chosen": 3.4313600063323975, + "rewards/margins": 8.396832636424474, + "rewards/rejected": -4.9654726300920755, + "step": 209 + }, + { + "epoch": 0.019186843307446324, + "grad_norm": 11.25, + "kl": 0.0, + "learning_rate": 9.992617328115644e-06, + "logits/chosen": 713773482.6666666, + "logits/rejected": 321190860.8, + "logps/chosen": -521.9673665364584, + "logps/rejected": -293.15869140625, + "loss": 0.0709, + "rewards/chosen": 1.8876465161641438, + "rewards/margins": 5.82180331548055, + "rewards/rejected": -3.934156799316406, + "step": 210 + }, + { + "epoch": 0.019278209227957973, + "grad_norm": 20.125, + "kl": 0.0, + "learning_rate": 9.99253901741414e-06, + "logits/chosen": 559584768.0, + "logits/rejected": 482015232.0, + "logps/chosen": -196.07705688476562, + "logps/rejected": -259.2427978515625, + "loss": 0.1144, + "rewards/chosen": 1.4614330530166626, + "rewards/margins": 6.697579026222229, + "rewards/rejected": -5.236145973205566, + "step": 211 + }, + { + "epoch": 0.019369575148469622, + "grad_norm": 21.875, + "kl": 0.0, + "learning_rate": 9.992460293875918e-06, + "logits/chosen": 520890572.8, + "logits/rejected": 532131840.0, + "logps/chosen": -286.086962890625, + "logps/rejected": -647.6072184244791, + "loss": 0.1468, + "rewards/chosen": 1.5559676170349122, + "rewards/margins": 10.450979518890382, + "rewards/rejected": -8.895011901855469, + "step": 212 + }, + { + "epoch": 0.01946094106898127, + "grad_norm": 8.0, + "kl": 0.0, + "learning_rate": 9.992381157507485e-06, + "logits/chosen": 867402880.0, + "logits/rejected": 669821312.0, + "logps/chosen": -335.33868408203125, + "logps/rejected": -267.9691467285156, + "loss": 0.0517, + "rewards/chosen": 2.4526329040527344, + "rewards/margins": 7.912874698638916, + "rewards/rejected": -5.460241794586182, + "step": 213 + }, + { + "epoch": 0.01955230698949292, + "grad_norm": 14.8125, + "kl": 0.0, + "learning_rate": 9.992301608315385e-06, + "logits/chosen": 721185792.0, + "logits/rejected": 705744640.0, + "logps/chosen": -386.6332194010417, + "logps/rejected": -590.1075439453125, + "loss": 0.1134, + "rewards/chosen": 1.979005495707194, + "rewards/margins": 6.221110502878825, + "rewards/rejected": -4.242105007171631, + "step": 214 + }, + { + "epoch": 0.01964367291000457, + "grad_norm": 14.1875, + "kl": 0.0, + "learning_rate": 9.992221646306196e-06, + "logits/chosen": 791870656.0, + "logits/rejected": 1250553216.0, + "logps/chosen": -412.9703369140625, + "logps/rejected": -512.6621704101562, + "loss": 0.0936, + "rewards/chosen": 3.00164794921875, + "rewards/margins": 6.8084681034088135, + "rewards/rejected": -3.8068201541900635, + "step": 215 + }, + { + "epoch": 0.019735038830516218, + "grad_norm": 7.0, + "kl": 0.0, + "learning_rate": 9.992141271486532e-06, + "logits/chosen": 405536426.6666667, + "logits/rejected": 285236480.0, + "logps/chosen": -241.4912109375, + "logps/rejected": -417.189501953125, + "loss": 0.0461, + "rewards/chosen": 2.736846923828125, + "rewards/margins": 8.291629791259766, + "rewards/rejected": -5.554782867431641, + "step": 216 + }, + { + "epoch": 0.019826404751027867, + "grad_norm": 5.625, + "kl": 0.0, + "learning_rate": 9.992060483863037e-06, + "logits/chosen": 385560704.0, + "logits/rejected": 508933546.6666667, + "logps/chosen": -313.79473876953125, + "logps/rejected": -617.4539794921875, + "loss": 0.0219, + "rewards/chosen": 2.8816933631896973, + "rewards/margins": 10.95763635635376, + "rewards/rejected": -8.075942993164062, + "step": 217 + }, + { + "epoch": 0.019917770671539516, + "grad_norm": 17.125, + "kl": 0.0, + "learning_rate": 9.991979283442394e-06, + "logits/chosen": 494762752.0, + "logits/rejected": 1079610112.0, + "logps/chosen": -366.464599609375, + "logps/rejected": -708.318359375, + "loss": 0.0895, + "rewards/chosen": 2.4257097244262695, + "rewards/margins": 8.130548477172852, + "rewards/rejected": -5.704838752746582, + "step": 218 + }, + { + "epoch": 0.020009136592051165, + "grad_norm": 10.8125, + "kl": 0.0, + "learning_rate": 9.991897670231316e-06, + "logits/chosen": 730894336.0, + "logits/rejected": 774100480.0, + "logps/chosen": -283.0043131510417, + "logps/rejected": -495.898046875, + "loss": 0.0607, + "rewards/chosen": 2.450815995534261, + "rewards/margins": 7.633569367726643, + "rewards/rejected": -5.182753372192383, + "step": 219 + }, + { + "epoch": 0.020100502512562814, + "grad_norm": 15.4375, + "kl": 0.0, + "learning_rate": 9.99181564423655e-06, + "logits/chosen": 613337856.0, + "logits/rejected": 526440618.6666667, + "logps/chosen": -441.427734375, + "logps/rejected": -369.800048828125, + "loss": 0.0819, + "rewards/chosen": 2.496769332885742, + "rewards/margins": 8.618272527058918, + "rewards/rejected": -6.121503194173177, + "step": 220 + }, + { + "epoch": 0.020191868433074463, + "grad_norm": 6.46875, + "kl": 0.0, + "learning_rate": 9.991733205464882e-06, + "logits/chosen": 390727744.0, + "logits/rejected": 506253994.6666667, + "logps/chosen": -178.97923278808594, + "logps/rejected": -619.471923828125, + "loss": 0.026, + "rewards/chosen": 2.369915008544922, + "rewards/margins": 10.865432739257812, + "rewards/rejected": -8.49551773071289, + "step": 221 + }, + { + "epoch": 0.020283234353586112, + "grad_norm": 12.6875, + "kl": 0.0, + "learning_rate": 9.991650353923126e-06, + "logits/chosen": 643042432.0, + "logits/rejected": 368497216.0, + "logps/chosen": -407.9096374511719, + "logps/rejected": -457.67840576171875, + "loss": 0.0624, + "rewards/chosen": 2.3330368995666504, + "rewards/margins": 9.534937381744385, + "rewards/rejected": -7.201900482177734, + "step": 222 + }, + { + "epoch": 0.02037460027409776, + "grad_norm": 12.375, + "kl": 0.0, + "learning_rate": 9.991567089618135e-06, + "logits/chosen": 516885344.0, + "logits/rejected": 471334688.0, + "logps/chosen": -331.48748779296875, + "logps/rejected": -258.1556396484375, + "loss": 0.1738, + "rewards/chosen": 1.3753951787948608, + "rewards/margins": 4.836753487586975, + "rewards/rejected": -3.4613583087921143, + "step": 223 + }, + { + "epoch": 0.02046596619460941, + "grad_norm": 15.0, + "kl": 0.0, + "learning_rate": 9.991483412556795e-06, + "logits/chosen": 472103328.0, + "logits/rejected": 486893098.6666667, + "logps/chosen": -385.6112976074219, + "logps/rejected": -334.4883626302083, + "loss": 0.1002, + "rewards/chosen": 2.056553840637207, + "rewards/margins": 6.479163805643718, + "rewards/rejected": -4.422609965006511, + "step": 224 + }, + { + "epoch": 0.02055733211512106, + "grad_norm": 10.4375, + "kl": 0.0, + "learning_rate": 9.991399322746022e-06, + "logits/chosen": 424451993.6, + "logits/rejected": 460253440.0, + "logps/chosen": -354.3839111328125, + "logps/rejected": -506.1589762369792, + "loss": 0.0626, + "rewards/chosen": 2.8144866943359377, + "rewards/margins": 8.80920778910319, + "rewards/rejected": -5.994721094767253, + "step": 225 + }, + { + "epoch": 0.02064869803563271, + "grad_norm": 10.0625, + "kl": 0.0, + "learning_rate": 9.991314820192772e-06, + "logits/chosen": 928075712.0, + "logits/rejected": 800902144.0, + "logps/chosen": -311.6332092285156, + "logps/rejected": -491.39306640625, + "loss": 0.068, + "rewards/chosen": 2.2808830738067627, + "rewards/margins": 6.860722780227661, + "rewards/rejected": -4.579839706420898, + "step": 226 + }, + { + "epoch": 0.020740063956144358, + "grad_norm": 11.6875, + "kl": 0.0, + "learning_rate": 9.991229904904032e-06, + "logits/chosen": 560097536.0, + "logits/rejected": 367549269.3333333, + "logps/chosen": -517.3539428710938, + "logps/rejected": -434.3548990885417, + "loss": 0.1119, + "rewards/chosen": 1.0562858581542969, + "rewards/margins": 7.115818023681641, + "rewards/rejected": -6.059532165527344, + "step": 227 + }, + { + "epoch": 0.020831429876656007, + "grad_norm": 7.8125, + "kl": 0.0, + "learning_rate": 9.991144576886824e-06, + "logits/chosen": 561397376.0, + "logits/rejected": 1025839232.0, + "logps/chosen": -320.1685791015625, + "logps/rejected": -1038.7132568359375, + "loss": 0.0446, + "rewards/chosen": 2.437636375427246, + "rewards/margins": 14.321452140808105, + "rewards/rejected": -11.88381576538086, + "step": 228 + }, + { + "epoch": 0.020922795797167656, + "grad_norm": 6.09375, + "kl": 0.0, + "learning_rate": 9.991058836148201e-06, + "logits/chosen": 524920000.0, + "logits/rejected": 477004416.0, + "logps/chosen": -402.447265625, + "logps/rejected": -473.9984130859375, + "loss": 0.0312, + "rewards/chosen": 3.0068111419677734, + "rewards/margins": 9.786557674407959, + "rewards/rejected": -6.7797465324401855, + "step": 229 + }, + { + "epoch": 0.021014161717679305, + "grad_norm": 11.8125, + "kl": 0.0, + "learning_rate": 9.990972682695259e-06, + "logits/chosen": 393076672.0, + "logits/rejected": 456662400.0, + "logps/chosen": -383.7826232910156, + "logps/rejected": -529.94873046875, + "loss": 0.0435, + "rewards/chosen": 2.6859169006347656, + "rewards/margins": 11.657693862915039, + "rewards/rejected": -8.971776962280273, + "step": 230 + }, + { + "epoch": 0.021105527638190954, + "grad_norm": 20.0, + "kl": 0.0, + "learning_rate": 9.990886116535116e-06, + "logits/chosen": 307888554.6666667, + "logits/rejected": 754918400.0, + "logps/chosen": -191.57625325520834, + "logps/rejected": -679.628369140625, + "loss": 0.1173, + "rewards/chosen": 2.5147980054219565, + "rewards/margins": 9.025129731496175, + "rewards/rejected": -6.510331726074218, + "step": 231 + }, + { + "epoch": 0.021196893558702603, + "grad_norm": 12.625, + "kl": 0.0, + "learning_rate": 9.990799137674934e-06, + "logits/chosen": 619116134.4, + "logits/rejected": 674339584.0, + "logps/chosen": -301.6049072265625, + "logps/rejected": -538.984130859375, + "loss": 0.0924, + "rewards/chosen": 2.104789924621582, + "rewards/margins": 9.920995903015136, + "rewards/rejected": -7.816205978393555, + "step": 232 + }, + { + "epoch": 0.021288259479214252, + "grad_norm": 14.125, + "kl": 0.0, + "learning_rate": 9.990711746121903e-06, + "logits/chosen": 997014357.3333334, + "logits/rejected": 798524518.4, + "logps/chosen": -377.8448079427083, + "logps/rejected": -611.037353515625, + "loss": 0.0753, + "rewards/chosen": 1.5215312639872234, + "rewards/margins": 9.987049452463785, + "rewards/rejected": -8.465518188476562, + "step": 233 + }, + { + "epoch": 0.0213796253997259, + "grad_norm": 10.5, + "kl": 0.0, + "learning_rate": 9.990623941883253e-06, + "logits/chosen": 667447808.0, + "logits/rejected": 636789162.6666666, + "logps/chosen": -283.6394348144531, + "logps/rejected": -357.3273111979167, + "loss": 0.0712, + "rewards/chosen": 1.4993469715118408, + "rewards/margins": 6.611297845840454, + "rewards/rejected": -5.111950874328613, + "step": 234 + }, + { + "epoch": 0.02147099132023755, + "grad_norm": 17.125, + "kl": 0.0, + "learning_rate": 9.99053572496624e-06, + "logits/chosen": 649762944.0, + "logits/rejected": 470589632.0, + "logps/chosen": -257.09307861328125, + "logps/rejected": -243.55462646484375, + "loss": 0.2122, + "rewards/chosen": 1.7351592381795247, + "rewards/margins": 3.458025296529134, + "rewards/rejected": -1.7228660583496094, + "step": 235 + }, + { + "epoch": 0.0215623572407492, + "grad_norm": 20.625, + "kl": 0.0, + "learning_rate": 9.990447095378162e-06, + "logits/chosen": 668599961.6, + "logits/rejected": 784396458.6666666, + "logps/chosen": -432.29912109375, + "logps/rejected": -341.594482421875, + "loss": 0.2161, + "rewards/chosen": 1.5370655059814453, + "rewards/margins": 4.545072237650553, + "rewards/rejected": -3.008006731669108, + "step": 236 + }, + { + "epoch": 0.021653723161260848, + "grad_norm": 15.375, + "kl": 0.0, + "learning_rate": 9.990358053126345e-06, + "logits/chosen": 772552704.0, + "logits/rejected": 959486080.0, + "logps/chosen": -239.80526733398438, + "logps/rejected": -519.5701904296875, + "loss": 0.1581, + "rewards/chosen": 1.387071967124939, + "rewards/margins": 6.539050459861755, + "rewards/rejected": -5.151978492736816, + "step": 237 + }, + { + "epoch": 0.021745089081772497, + "grad_norm": 15.0625, + "kl": 4.623010635375977, + "learning_rate": 9.990268598218156e-06, + "logits/chosen": 498940635.4285714, + "logits/rejected": 469591232.0, + "logps/chosen": -368.02657645089283, + "logps/rejected": -377.1593322753906, + "loss": 0.1264, + "rewards/chosen": 2.3535472324916293, + "rewards/margins": 6.0322491100856235, + "rewards/rejected": -3.678701877593994, + "step": 238 + }, + { + "epoch": 0.02183645500228415, + "grad_norm": 16.375, + "kl": 0.0, + "learning_rate": 9.990178730660987e-06, + "logits/chosen": 843796906.6666666, + "logits/rejected": 780107072.0, + "logps/chosen": -542.3501383463541, + "logps/rejected": -383.02166748046875, + "loss": 0.1736, + "rewards/chosen": 2.040956179300944, + "rewards/margins": 4.127301136652628, + "rewards/rejected": -2.0863449573516846, + "step": 239 + }, + { + "epoch": 0.0219278209227958, + "grad_norm": 13.125, + "kl": 0.0, + "learning_rate": 9.990088450462274e-06, + "logits/chosen": 1017579392.0, + "logits/rejected": 582856320.0, + "logps/chosen": -423.0716857910156, + "logps/rejected": -520.9855346679688, + "loss": 0.1046, + "rewards/chosen": 1.8513832092285156, + "rewards/margins": 6.241135597229004, + "rewards/rejected": -4.389752388000488, + "step": 240 + }, + { + "epoch": 0.022019186843307448, + "grad_norm": 17.125, + "kl": 0.0, + "learning_rate": 9.98999775762948e-06, + "logits/chosen": 615788646.4, + "logits/rejected": 474290005.3333333, + "logps/chosen": -345.02724609375, + "logps/rejected": -326.828369140625, + "loss": 0.1144, + "rewards/chosen": 2.4438629150390625, + "rewards/margins": 5.385625203450521, + "rewards/rejected": -2.9417622884114585, + "step": 241 + }, + { + "epoch": 0.022110552763819097, + "grad_norm": 10.9375, + "kl": 0.0, + "learning_rate": 9.989906652170104e-06, + "logits/chosen": 575647232.0, + "logits/rejected": 450280923.4285714, + "logps/chosen": -642.45556640625, + "logps/rejected": -485.2383510044643, + "loss": 0.0415, + "rewards/chosen": 2.6427979469299316, + "rewards/margins": 7.649003914424351, + "rewards/rejected": -5.006205967494419, + "step": 242 + }, + { + "epoch": 0.022201918684330746, + "grad_norm": 6.5625, + "kl": 0.0, + "learning_rate": 9.989815134091683e-06, + "logits/chosen": 829959680.0, + "logits/rejected": 605742933.3333334, + "logps/chosen": -241.97100830078125, + "logps/rejected": -502.7779134114583, + "loss": 0.0388, + "rewards/chosen": 2.8780531883239746, + "rewards/margins": 8.455138047536213, + "rewards/rejected": -5.577084859212239, + "step": 243 + }, + { + "epoch": 0.022293284604842395, + "grad_norm": 6.0625, + "kl": 0.0, + "learning_rate": 9.98972320340178e-06, + "logits/chosen": 738683562.6666666, + "logits/rejected": 712295321.6, + "logps/chosen": -245.82023111979166, + "logps/rejected": -564.7763671875, + "loss": 0.0366, + "rewards/chosen": 2.378445625305176, + "rewards/margins": 8.193150520324707, + "rewards/rejected": -5.814704895019531, + "step": 244 + }, + { + "epoch": 0.022384650525354044, + "grad_norm": 8.75, + "kl": 0.0, + "learning_rate": 9.989630860107999e-06, + "logits/chosen": 585158080.0, + "logits/rejected": 379297984.0, + "logps/chosen": -379.69805908203125, + "logps/rejected": -474.100341796875, + "loss": 0.0621, + "rewards/chosen": 2.3775277137756348, + "rewards/margins": 6.937644004821777, + "rewards/rejected": -4.560116291046143, + "step": 245 + }, + { + "epoch": 0.022476016445865693, + "grad_norm": 5.1875, + "kl": 0.0, + "learning_rate": 9.989538104217975e-06, + "logits/chosen": 729687680.0, + "logits/rejected": 702358016.0, + "logps/chosen": -310.5385437011719, + "logps/rejected": -554.0506591796875, + "loss": 0.0367, + "rewards/chosen": 3.1906845569610596, + "rewards/margins": 9.163178205490112, + "rewards/rejected": -5.972493648529053, + "step": 246 + }, + { + "epoch": 0.022567382366377342, + "grad_norm": 11.5625, + "kl": 0.0, + "learning_rate": 9.98944493573938e-06, + "logits/chosen": 584123520.0, + "logits/rejected": 410693171.2, + "logps/chosen": -368.8150227864583, + "logps/rejected": -271.70498046875, + "loss": 0.1359, + "rewards/chosen": 2.2310709953308105, + "rewards/margins": 5.364109706878662, + "rewards/rejected": -3.1330387115478517, + "step": 247 + }, + { + "epoch": 0.02265874828688899, + "grad_norm": 8.0625, + "kl": 0.0, + "learning_rate": 9.989351354679918e-06, + "logits/chosen": 523421184.0, + "logits/rejected": 1011584064.0, + "logps/chosen": -342.4018961588542, + "logps/rejected": -431.1961975097656, + "loss": 0.0559, + "rewards/chosen": 2.96101442972819, + "rewards/margins": 6.48642341295878, + "rewards/rejected": -3.525408983230591, + "step": 248 + }, + { + "epoch": 0.02275011420740064, + "grad_norm": 12.75, + "kl": 0.0, + "learning_rate": 9.989257361047325e-06, + "logits/chosen": 681795584.0, + "logits/rejected": 472290048.0, + "logps/chosen": -241.39036560058594, + "logps/rejected": -360.5315856933594, + "loss": 0.1538, + "rewards/chosen": 2.5047736167907715, + "rewards/margins": 6.70467472076416, + "rewards/rejected": -4.199901103973389, + "step": 249 + }, + { + "epoch": 0.02284148012791229, + "grad_norm": 8.375, + "kl": 0.0, + "learning_rate": 9.989162954849374e-06, + "logits/chosen": 398036800.0, + "logits/rejected": 540276096.0, + "logps/chosen": -198.89122009277344, + "logps/rejected": -679.644775390625, + "loss": 0.0712, + "rewards/chosen": 2.4619810581207275, + "rewards/margins": 8.245923280715942, + "rewards/rejected": -5.783942222595215, + "step": 250 + }, + { + "epoch": 0.022932846048423938, + "grad_norm": 12.875, + "kl": 0.0, + "learning_rate": 9.989068136093873e-06, + "logits/chosen": 556608426.6666666, + "logits/rejected": 457645152.0, + "logps/chosen": -200.8885498046875, + "logps/rejected": -698.65673828125, + "loss": 0.1094, + "rewards/chosen": 2.536529064178467, + "rewards/margins": 8.427919864654541, + "rewards/rejected": -5.891390800476074, + "step": 251 + }, + { + "epoch": 0.023024211968935587, + "grad_norm": 10.125, + "kl": 0.0, + "learning_rate": 9.988972904788663e-06, + "logits/chosen": 293493920.0, + "logits/rejected": 576636562.2857143, + "logps/chosen": -180.12123107910156, + "logps/rejected": -524.4627859933036, + "loss": 0.0378, + "rewards/chosen": 3.6441268920898438, + "rewards/margins": 8.625415257045201, + "rewards/rejected": -4.981288364955357, + "step": 252 + }, + { + "epoch": 0.023115577889447236, + "grad_norm": 11.8125, + "kl": 0.0, + "learning_rate": 9.988877260941615e-06, + "logits/chosen": 414124006.4, + "logits/rejected": 540029098.6666666, + "logps/chosen": -434.002197265625, + "logps/rejected": -401.838623046875, + "loss": 0.0711, + "rewards/chosen": 2.716554641723633, + "rewards/margins": 6.980688095092773, + "rewards/rejected": -4.264133453369141, + "step": 253 + }, + { + "epoch": 0.023206943809958885, + "grad_norm": 11.5625, + "kl": 0.0, + "learning_rate": 9.988781204560643e-06, + "logits/chosen": 1159373653.3333333, + "logits/rejected": 438005920.0, + "logps/chosen": -343.22021484375, + "logps/rejected": -305.3616638183594, + "loss": 0.11, + "rewards/chosen": 2.267154057820638, + "rewards/margins": 7.541133244832357, + "rewards/rejected": -5.273979187011719, + "step": 254 + }, + { + "epoch": 0.023298309730470534, + "grad_norm": 17.5, + "kl": 0.0, + "learning_rate": 9.988684735653687e-06, + "logits/chosen": 634842560.0, + "logits/rejected": 593949696.0, + "logps/chosen": -550.70361328125, + "logps/rejected": -536.9923095703125, + "loss": 0.0732, + "rewards/chosen": 2.020458221435547, + "rewards/margins": 9.448333740234375, + "rewards/rejected": -7.427875518798828, + "step": 255 + }, + { + "epoch": 0.023389675650982183, + "grad_norm": 13.125, + "kl": 0.0, + "learning_rate": 9.988587854228723e-06, + "logits/chosen": 684047001.6, + "logits/rejected": 692339712.0, + "logps/chosen": -363.8359375, + "logps/rejected": -392.0063069661458, + "loss": 0.0992, + "rewards/chosen": 1.8380046844482423, + "rewards/margins": 7.362256622314453, + "rewards/rejected": -5.524251937866211, + "step": 256 + }, + { + "epoch": 0.023481041571493833, + "grad_norm": 8.1875, + "kl": 0.0, + "learning_rate": 9.988490560293767e-06, + "logits/chosen": 668464469.3333334, + "logits/rejected": 736326860.8, + "logps/chosen": -416.5837809244792, + "logps/rejected": -451.9833984375, + "loss": 0.0449, + "rewards/chosen": 2.9122467041015625, + "rewards/margins": 9.791700744628907, + "rewards/rejected": -6.879454040527344, + "step": 257 + }, + { + "epoch": 0.02357240749200548, + "grad_norm": 6.125, + "kl": 0.0, + "learning_rate": 9.988392853856859e-06, + "logits/chosen": 1229457237.3333333, + "logits/rejected": 674200985.6, + "logps/chosen": -285.33827718098956, + "logps/rejected": -496.91259765625, + "loss": 0.0373, + "rewards/chosen": 2.857041676839193, + "rewards/margins": 9.062544759114584, + "rewards/rejected": -6.20550308227539, + "step": 258 + }, + { + "epoch": 0.02366377341251713, + "grad_norm": 16.0, + "kl": 0.0, + "learning_rate": 9.988294734926081e-06, + "logits/chosen": 738401484.8, + "logits/rejected": 890562816.0, + "logps/chosen": -446.925048828125, + "logps/rejected": -407.2517496744792, + "loss": 0.1263, + "rewards/chosen": 2.1874767303466798, + "rewards/margins": 6.125370597839355, + "rewards/rejected": -3.937893867492676, + "step": 259 + }, + { + "epoch": 0.02375513933302878, + "grad_norm": 5.40625, + "kl": 0.0, + "learning_rate": 9.988196203509545e-06, + "logits/chosen": 1017687552.0, + "logits/rejected": 644696832.0, + "logps/chosen": -217.8238728841146, + "logps/rejected": -413.10087890625, + "loss": 0.0311, + "rewards/chosen": 2.553213437398275, + "rewards/margins": 9.041290219624837, + "rewards/rejected": -6.488076782226562, + "step": 260 + }, + { + "epoch": 0.02384650525354043, + "grad_norm": 3.328125, + "kl": 0.0, + "learning_rate": 9.988097259615399e-06, + "logits/chosen": 441083296.0, + "logits/rejected": 320297325.71428573, + "logps/chosen": -278.88385009765625, + "logps/rejected": -542.1674107142857, + "loss": 0.0152, + "rewards/chosen": 2.3192505836486816, + "rewards/margins": 11.365313598087855, + "rewards/rejected": -9.046063014439174, + "step": 261 + }, + { + "epoch": 0.023937871174052078, + "grad_norm": 18.375, + "kl": 0.0, + "learning_rate": 9.987997903251827e-06, + "logits/chosen": 490905536.0, + "logits/rejected": 100385608.0, + "logps/chosen": -286.9778137207031, + "logps/rejected": -216.06536865234375, + "loss": 0.2017, + "rewards/chosen": 2.2967586517333984, + "rewards/margins": 3.8592480421066284, + "rewards/rejected": -1.56248939037323, + "step": 262 + }, + { + "epoch": 0.024029237094563727, + "grad_norm": 13.0625, + "kl": 0.0, + "learning_rate": 9.987898134427043e-06, + "logits/chosen": 628413866.6666666, + "logits/rejected": 217324569.6, + "logps/chosen": -461.9263509114583, + "logps/rejected": -290.39970703125, + "loss": 0.0762, + "rewards/chosen": 1.561635971069336, + "rewards/margins": 8.549973678588866, + "rewards/rejected": -6.988337707519531, + "step": 263 + }, + { + "epoch": 0.024120603015075376, + "grad_norm": 15.8125, + "kl": 0.0, + "learning_rate": 9.987797953149297e-06, + "logits/chosen": 512246880.0, + "logits/rejected": 522851776.0, + "logps/chosen": -359.45245361328125, + "logps/rejected": -435.06182861328125, + "loss": 0.106, + "rewards/chosen": 1.5887579917907715, + "rewards/margins": 6.916870594024658, + "rewards/rejected": -5.328112602233887, + "step": 264 + }, + { + "epoch": 0.024211968935587025, + "grad_norm": 3.640625, + "kl": 0.0, + "learning_rate": 9.987697359426874e-06, + "logits/chosen": 201081504.0, + "logits/rejected": 286452297.14285713, + "logps/chosen": -347.87420654296875, + "logps/rejected": -477.5137416294643, + "loss": 0.0154, + "rewards/chosen": 3.569751024246216, + "rewards/margins": 10.459118264062063, + "rewards/rejected": -6.889367239815848, + "step": 265 + }, + { + "epoch": 0.024303334856098674, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 9.987596353268089e-06, + "logits/chosen": 353994137.6, + "logits/rejected": 467177813.3333333, + "logps/chosen": -196.9825439453125, + "logps/rejected": -293.0401204427083, + "loss": 0.0935, + "rewards/chosen": 2.150922393798828, + "rewards/margins": 8.5954408009847, + "rewards/rejected": -6.444518407185872, + "step": 266 + }, + { + "epoch": 0.024394700776610323, + "grad_norm": 12.4375, + "kl": 0.0, + "learning_rate": 9.987494934681298e-06, + "logits/chosen": 416044544.0, + "logits/rejected": 515874112.0, + "logps/chosen": -217.06683349609375, + "logps/rejected": -588.5084228515625, + "loss": 0.0911, + "rewards/chosen": 2.052809715270996, + "rewards/margins": 9.057718753814697, + "rewards/rejected": -7.004909038543701, + "step": 267 + }, + { + "epoch": 0.024486066697121972, + "grad_norm": 11.625, + "kl": 0.0, + "learning_rate": 9.987393103674887e-06, + "logits/chosen": 594375552.0, + "logits/rejected": 722627456.0, + "logps/chosen": -136.43722534179688, + "logps/rejected": -502.7633056640625, + "loss": 0.0925, + "rewards/chosen": 1.554053544998169, + "rewards/margins": 11.596458673477173, + "rewards/rejected": -10.042405128479004, + "step": 268 + }, + { + "epoch": 0.02457743261763362, + "grad_norm": 15.5, + "kl": 0.0, + "learning_rate": 9.987290860257276e-06, + "logits/chosen": 572684074.6666666, + "logits/rejected": 511578521.6, + "logps/chosen": -449.4049886067708, + "logps/rejected": -268.7827880859375, + "loss": 0.0981, + "rewards/chosen": 1.3653828303019206, + "rewards/margins": 6.201099077860515, + "rewards/rejected": -4.835716247558594, + "step": 269 + }, + { + "epoch": 0.02466879853814527, + "grad_norm": 10.625, + "kl": 0.0, + "learning_rate": 9.987188204436918e-06, + "logits/chosen": 632062336.0, + "logits/rejected": 557839744.0, + "logps/chosen": -476.67437744140625, + "logps/rejected": -605.0831298828125, + "loss": 0.0421, + "rewards/chosen": 2.906850814819336, + "rewards/margins": 10.995805740356445, + "rewards/rejected": -8.08895492553711, + "step": 270 + }, + { + "epoch": 0.024760164458656923, + "grad_norm": 7.0, + "kl": 0.0, + "learning_rate": 9.987085136222302e-06, + "logits/chosen": 457063722.6666667, + "logits/rejected": 397451264.0, + "logps/chosen": -292.12392171223956, + "logps/rejected": -629.55224609375, + "loss": 0.048, + "rewards/chosen": 2.068217913309733, + "rewards/margins": 10.69573713938395, + "rewards/rejected": -8.627519226074218, + "step": 271 + }, + { + "epoch": 0.02485153037916857, + "grad_norm": 11.4375, + "kl": 0.0, + "learning_rate": 9.986981655621952e-06, + "logits/chosen": 622550954.6666666, + "logits/rejected": 719693184.0, + "logps/chosen": -172.77388509114584, + "logps/rejected": -360.0885009765625, + "loss": 0.1074, + "rewards/chosen": 1.935920238494873, + "rewards/margins": 8.678826808929443, + "rewards/rejected": -6.74290657043457, + "step": 272 + }, + { + "epoch": 0.02494289629968022, + "grad_norm": 4.5625, + "kl": 0.0, + "learning_rate": 9.986877762644425e-06, + "logits/chosen": 536628800.0, + "logits/rejected": 346718537.14285713, + "logps/chosen": -626.943115234375, + "logps/rejected": -396.4697265625, + "loss": 0.0116, + "rewards/chosen": 2.5442750453948975, + "rewards/margins": 9.607895476477488, + "rewards/rejected": -7.063620431082589, + "step": 273 + }, + { + "epoch": 0.02503426222019187, + "grad_norm": 12.5625, + "kl": 0.0, + "learning_rate": 9.986773457298311e-06, + "logits/chosen": 797779520.0, + "logits/rejected": 886114368.0, + "logps/chosen": -254.0465087890625, + "logps/rejected": -521.3494873046875, + "loss": 0.0724, + "rewards/chosen": 1.915778398513794, + "rewards/margins": 10.186691522598267, + "rewards/rejected": -8.270913124084473, + "step": 274 + }, + { + "epoch": 0.02512562814070352, + "grad_norm": 6.84375, + "kl": 0.0, + "learning_rate": 9.986668739592237e-06, + "logits/chosen": 589702809.6, + "logits/rejected": 280753322.6666667, + "logps/chosen": -369.3801025390625, + "logps/rejected": -157.59300740559897, + "loss": 0.1388, + "rewards/chosen": 3.0742877960205077, + "rewards/margins": 5.9712416966756185, + "rewards/rejected": -2.896953900655111, + "step": 275 + }, + { + "epoch": 0.025216994061215168, + "grad_norm": 14.625, + "kl": 0.0, + "learning_rate": 9.986563609534859e-06, + "logits/chosen": 653996352.0, + "logits/rejected": 575198208.0, + "logps/chosen": -316.46075439453125, + "logps/rejected": -599.5432739257812, + "loss": 0.0883, + "rewards/chosen": 2.409043550491333, + "rewards/margins": 8.242481470108032, + "rewards/rejected": -5.833437919616699, + "step": 276 + }, + { + "epoch": 0.025308359981726817, + "grad_norm": 11.0625, + "kl": 0.0, + "learning_rate": 9.986458067134873e-06, + "logits/chosen": 635380224.0, + "logits/rejected": 332578752.0, + "logps/chosen": -472.43194580078125, + "logps/rejected": -315.6485595703125, + "loss": 0.0575, + "rewards/chosen": 2.4830429553985596, + "rewards/margins": 8.328497171401978, + "rewards/rejected": -5.845454216003418, + "step": 277 + }, + { + "epoch": 0.025399725902238466, + "grad_norm": 15.5, + "kl": 0.0, + "learning_rate": 9.986352112401005e-06, + "logits/chosen": 748797696.0, + "logits/rejected": 829461248.0, + "logps/chosen": -323.26556396484375, + "logps/rejected": -983.1655883789062, + "loss": 0.1133, + "rewards/chosen": 1.9634594917297363, + "rewards/margins": 11.652757167816162, + "rewards/rejected": -9.689297676086426, + "step": 278 + }, + { + "epoch": 0.025491091822750115, + "grad_norm": 8.625, + "kl": 0.0, + "learning_rate": 9.986245745342018e-06, + "logits/chosen": 849290598.4, + "logits/rejected": 833031338.6666666, + "logps/chosen": -276.768505859375, + "logps/rejected": -705.2237955729166, + "loss": 0.0666, + "rewards/chosen": 2.3225311279296874, + "rewards/margins": 9.391856892903645, + "rewards/rejected": -7.069325764973958, + "step": 279 + }, + { + "epoch": 0.025582457743261764, + "grad_norm": 10.875, + "kl": 0.0, + "learning_rate": 9.986138965966705e-06, + "logits/chosen": 655470745.6, + "logits/rejected": 349099370.6666667, + "logps/chosen": -394.142724609375, + "logps/rejected": -650.1853434244791, + "loss": 0.0573, + "rewards/chosen": 3.1402849197387694, + "rewards/margins": 8.454597536722819, + "rewards/rejected": -5.31431261698405, + "step": 280 + }, + { + "epoch": 0.025673823663773413, + "grad_norm": 11.4375, + "kl": 0.0, + "learning_rate": 9.986031774283897e-06, + "logits/chosen": 537057587.2, + "logits/rejected": 641768576.0, + "logps/chosen": -303.9196533203125, + "logps/rejected": -267.1143798828125, + "loss": 0.0814, + "rewards/chosen": 2.5940818786621094, + "rewards/margins": 6.165536244710287, + "rewards/rejected": -3.5714543660481772, + "step": 281 + }, + { + "epoch": 0.025765189584285062, + "grad_norm": 10.3125, + "kl": 0.0, + "learning_rate": 9.985924170302458e-06, + "logits/chosen": 460310912.0, + "logits/rejected": 365199744.0, + "logps/chosen": -289.17828369140625, + "logps/rejected": -329.152587890625, + "loss": 0.0642, + "rewards/chosen": 2.2562050819396973, + "rewards/margins": 7.858558654785156, + "rewards/rejected": -5.602353572845459, + "step": 282 + }, + { + "epoch": 0.02585655550479671, + "grad_norm": 14.4375, + "kl": 0.0, + "learning_rate": 9.985816154031287e-06, + "logits/chosen": 596965171.2, + "logits/rejected": 459869013.3333333, + "logps/chosen": -414.118994140625, + "logps/rejected": -458.4837239583333, + "loss": 0.1188, + "rewards/chosen": 2.950896072387695, + "rewards/margins": 8.85978266398112, + "rewards/rejected": -5.908886591593425, + "step": 283 + }, + { + "epoch": 0.02594792142530836, + "grad_norm": 5.375, + "kl": 0.0, + "learning_rate": 9.985707725479314e-06, + "logits/chosen": 274682538.6666667, + "logits/rejected": 544448000.0, + "logps/chosen": -166.31924438476562, + "logps/rejected": -464.28427734375, + "loss": 0.0404, + "rewards/chosen": 3.004824956258138, + "rewards/margins": 7.979032262166342, + "rewards/rejected": -4.9742073059082035, + "step": 284 + }, + { + "epoch": 0.02603928734582001, + "grad_norm": 4.4375, + "kl": 0.0, + "learning_rate": 9.985598884655506e-06, + "logits/chosen": 526947754.6666667, + "logits/rejected": 1041604300.8, + "logps/chosen": -238.08243815104166, + "logps/rejected": -622.97607421875, + "loss": 0.0257, + "rewards/chosen": 3.1282265981038413, + "rewards/margins": 8.854956181844075, + "rewards/rejected": -5.7267295837402346, + "step": 285 + }, + { + "epoch": 0.02613065326633166, + "grad_norm": 9.0625, + "kl": 0.0, + "learning_rate": 9.985489631568863e-06, + "logits/chosen": 710836275.2, + "logits/rejected": 626326485.3333334, + "logps/chosen": -234.5092529296875, + "logps/rejected": -588.0397542317709, + "loss": 0.0474, + "rewards/chosen": 2.8405231475830077, + "rewards/margins": 9.47548713684082, + "rewards/rejected": -6.6349639892578125, + "step": 286 + }, + { + "epoch": 0.026222019186843307, + "grad_norm": 11.4375, + "kl": 0.0, + "learning_rate": 9.985379966228418e-06, + "logits/chosen": 571709269.3333334, + "logits/rejected": 488597824.0, + "logps/chosen": -513.4179280598959, + "logps/rejected": -385.88751220703125, + "loss": 0.0744, + "rewards/chosen": 2.533079465230306, + "rewards/margins": 6.073582490285238, + "rewards/rejected": -3.5405030250549316, + "step": 287 + }, + { + "epoch": 0.026313385107354956, + "grad_norm": 10.0625, + "kl": 0.0, + "learning_rate": 9.98526988864324e-06, + "logits/chosen": 611823146.6666666, + "logits/rejected": 802960691.2, + "logps/chosen": -301.0284423828125, + "logps/rejected": -202.617919921875, + "loss": 0.1746, + "rewards/chosen": 2.729168256123861, + "rewards/margins": 5.600874265034994, + "rewards/rejected": -2.871706008911133, + "step": 288 + }, + { + "epoch": 0.026404751027866605, + "grad_norm": 14.25, + "kl": 0.0, + "learning_rate": 9.985159398822433e-06, + "logits/chosen": 1036148906.6666666, + "logits/rejected": 2284708352.0, + "logps/chosen": -361.1427408854167, + "logps/rejected": -1050.85986328125, + "loss": 0.104, + "rewards/chosen": 1.9273770650227864, + "rewards/margins": 11.722207387288412, + "rewards/rejected": -9.794830322265625, + "step": 289 + }, + { + "epoch": 0.026496116948378255, + "grad_norm": 16.125, + "kl": 0.0, + "learning_rate": 9.985048496775132e-06, + "logits/chosen": 450318080.0, + "logits/rejected": 303757516.8, + "logps/chosen": -140.1624959309896, + "logps/rejected": -280.8400390625, + "loss": 0.161, + "rewards/chosen": 2.348213036855062, + "rewards/margins": 6.494868310292562, + "rewards/rejected": -4.1466552734375, + "step": 290 + }, + { + "epoch": 0.026587482868889904, + "grad_norm": 12.5625, + "kl": 0.0, + "learning_rate": 9.984937182510509e-06, + "logits/chosen": 421677824.0, + "logits/rejected": 505835712.0, + "logps/chosen": -314.26706949869794, + "logps/rejected": -670.84619140625, + "loss": 0.0932, + "rewards/chosen": 3.133789380391439, + "rewards/margins": 5.2795322736104335, + "rewards/rejected": -2.145742893218994, + "step": 291 + }, + { + "epoch": 0.026678848789401553, + "grad_norm": 10.0625, + "kl": 0.0, + "learning_rate": 9.984825456037766e-06, + "logits/chosen": 425946112.0, + "logits/rejected": 437151957.3333333, + "logps/chosen": -390.8543395996094, + "logps/rejected": -492.1344401041667, + "loss": 0.0498, + "rewards/chosen": 2.1621673107147217, + "rewards/margins": 8.070186694463093, + "rewards/rejected": -5.908019383748372, + "step": 292 + }, + { + "epoch": 0.0267702147099132, + "grad_norm": 11.0, + "kl": 0.0, + "learning_rate": 9.984713317366142e-06, + "logits/chosen": 565324873.1428572, + "logits/rejected": 207117216.0, + "logps/chosen": -279.06668526785717, + "logps/rejected": -198.53329467773438, + "loss": 0.1329, + "rewards/chosen": 2.377504621233259, + "rewards/margins": 6.081107173647199, + "rewards/rejected": -3.7036025524139404, + "step": 293 + }, + { + "epoch": 0.02686158063042485, + "grad_norm": 11.5625, + "kl": 0.0, + "learning_rate": 9.984600766504913e-06, + "logits/chosen": 837660672.0, + "logits/rejected": 439827541.3333333, + "logps/chosen": -487.838671875, + "logps/rejected": -463.2299397786458, + "loss": 0.0688, + "rewards/chosen": 2.261138153076172, + "rewards/margins": 8.312990697224935, + "rewards/rejected": -6.051852544148763, + "step": 294 + }, + { + "epoch": 0.0269529465509365, + "grad_norm": 7.5, + "kl": 0.0, + "learning_rate": 9.984487803463384e-06, + "logits/chosen": 625745334.8571428, + "logits/rejected": 902225536.0, + "logps/chosen": -433.32882254464283, + "logps/rejected": -1177.4007568359375, + "loss": 0.0576, + "rewards/chosen": 2.7555337633405412, + "rewards/margins": 13.23337800162179, + "rewards/rejected": -10.47784423828125, + "step": 295 + }, + { + "epoch": 0.02704431247144815, + "grad_norm": 10.25, + "kl": 0.0, + "learning_rate": 9.984374428250894e-06, + "logits/chosen": 767068330.6666666, + "logits/rejected": 538634304.0, + "logps/chosen": -290.005126953125, + "logps/rejected": -165.48020935058594, + "loss": 0.0702, + "rewards/chosen": 2.987309137980143, + "rewards/margins": 6.759462038675943, + "rewards/rejected": -3.772152900695801, + "step": 296 + }, + { + "epoch": 0.027135678391959798, + "grad_norm": 7.15625, + "kl": 0.0, + "learning_rate": 9.984260640876821e-06, + "logits/chosen": 715487872.0, + "logits/rejected": 703208960.0, + "logps/chosen": -480.5042724609375, + "logps/rejected": -502.49814453125, + "loss": 0.0405, + "rewards/chosen": 2.7745300928751626, + "rewards/margins": 8.7164337793986, + "rewards/rejected": -5.941903686523437, + "step": 297 + }, + { + "epoch": 0.027227044312471447, + "grad_norm": 8.3125, + "kl": 0.0, + "learning_rate": 9.984146441350575e-06, + "logits/chosen": 721147712.0, + "logits/rejected": 606250368.0, + "logps/chosen": -451.5003662109375, + "logps/rejected": -403.51995849609375, + "loss": 0.0447, + "rewards/chosen": 2.981252670288086, + "rewards/margins": 8.619046211242676, + "rewards/rejected": -5.63779354095459, + "step": 298 + }, + { + "epoch": 0.027318410232983096, + "grad_norm": 7.3125, + "kl": 0.0, + "learning_rate": 9.984031829681595e-06, + "logits/chosen": 1027433472.0, + "logits/rejected": 641161472.0, + "logps/chosen": -324.3051452636719, + "logps/rejected": -519.5325927734375, + "loss": 0.0384, + "rewards/chosen": 2.8811092376708984, + "rewards/margins": 9.389934539794922, + "rewards/rejected": -6.508825302124023, + "step": 299 + }, + { + "epoch": 0.027409776153494745, + "grad_norm": 4.65625, + "kl": 0.0, + "learning_rate": 9.983916805879362e-06, + "logits/chosen": 825000576.0, + "logits/rejected": 642267904.0, + "logps/chosen": -401.35052490234375, + "logps/rejected": -535.4710693359375, + "loss": 0.0192, + "rewards/chosen": 2.876736640930176, + "rewards/margins": 9.871064186096191, + "rewards/rejected": -6.994327545166016, + "step": 300 + }, + { + "epoch": 0.027501142074006394, + "grad_norm": 10.0, + "kl": 0.0, + "learning_rate": 9.983801369953387e-06, + "logits/chosen": 234970784.0, + "logits/rejected": 681250730.6666666, + "logps/chosen": -82.14852905273438, + "logps/rejected": -664.4145914713541, + "loss": 0.0693, + "rewards/chosen": 1.0413053035736084, + "rewards/margins": 8.75641353925069, + "rewards/rejected": -7.715108235677083, + "step": 301 + }, + { + "epoch": 0.027592507994518043, + "grad_norm": 8.75, + "kl": 0.0, + "learning_rate": 9.983685521913213e-06, + "logits/chosen": 685192960.0, + "logits/rejected": 875522730.6666666, + "logps/chosen": -464.98623046875, + "logps/rejected": -414.4026692708333, + "loss": 0.0475, + "rewards/chosen": 2.779584503173828, + "rewards/margins": 9.201479848225912, + "rewards/rejected": -6.421895345052083, + "step": 302 + }, + { + "epoch": 0.027683873915029696, + "grad_norm": 9.25, + "kl": 0.0, + "learning_rate": 9.983569261768421e-06, + "logits/chosen": 1087256149.3333333, + "logits/rejected": 511864832.0, + "logps/chosen": -333.9617919921875, + "logps/rejected": -406.8006896972656, + "loss": 0.0688, + "rewards/chosen": 2.359522819519043, + "rewards/margins": 8.322301387786865, + "rewards/rejected": -5.962778568267822, + "step": 303 + }, + { + "epoch": 0.027775239835541345, + "grad_norm": 5.09375, + "kl": 0.0, + "learning_rate": 9.983452589528627e-06, + "logits/chosen": 292904362.6666667, + "logits/rejected": 291400038.4, + "logps/chosen": -183.86112467447916, + "logps/rejected": -334.920361328125, + "loss": 0.0366, + "rewards/chosen": 3.0393060048421225, + "rewards/margins": 7.672141393025717, + "rewards/rejected": -4.632835388183594, + "step": 304 + }, + { + "epoch": 0.027866605756052994, + "grad_norm": 9.0625, + "kl": 0.0, + "learning_rate": 9.983335505203476e-06, + "logits/chosen": 475460915.2, + "logits/rejected": 393060864.0, + "logps/chosen": -318.9941650390625, + "logps/rejected": -363.5411783854167, + "loss": 0.0671, + "rewards/chosen": 2.514352798461914, + "rewards/margins": 6.832815170288086, + "rewards/rejected": -4.318462371826172, + "step": 305 + }, + { + "epoch": 0.027957971676564643, + "grad_norm": 13.0625, + "kl": 0.0, + "learning_rate": 9.983218008802648e-06, + "logits/chosen": 426466986.6666667, + "logits/rejected": 536905600.0, + "logps/chosen": -290.4379475911458, + "logps/rejected": -385.3030700683594, + "loss": 0.1034, + "rewards/chosen": 2.0808229446411133, + "rewards/margins": 6.503900527954102, + "rewards/rejected": -4.423077583312988, + "step": 306 + }, + { + "epoch": 0.028049337597076292, + "grad_norm": 11.8125, + "kl": 0.0, + "learning_rate": 9.983100100335863e-06, + "logits/chosen": 585713536.0, + "logits/rejected": 499660373.3333333, + "logps/chosen": -216.72901916503906, + "logps/rejected": -513.4378255208334, + "loss": 0.1149, + "rewards/chosen": 1.8209481239318848, + "rewards/margins": 8.868797779083252, + "rewards/rejected": -7.047849655151367, + "step": 307 + }, + { + "epoch": 0.02814070351758794, + "grad_norm": 5.96875, + "kl": 0.0, + "learning_rate": 9.982981779812869e-06, + "logits/chosen": 752588160.0, + "logits/rejected": 583110400.0, + "logps/chosen": -232.67471313476562, + "logps/rejected": -538.7149047851562, + "loss": 0.0381, + "rewards/chosen": 2.7922146320343018, + "rewards/margins": 10.335373163223267, + "rewards/rejected": -7.543158531188965, + "step": 308 + }, + { + "epoch": 0.02823206943809959, + "grad_norm": 1.625, + "kl": 0.0, + "learning_rate": 9.98286304724345e-06, + "logits/chosen": 299735957.3333333, + "logits/rejected": 428614144.0, + "logps/chosen": -184.57234700520834, + "logps/rejected": -509.930712890625, + "loss": 0.0089, + "rewards/chosen": 3.9748996098836265, + "rewards/margins": 11.223903973897299, + "rewards/rejected": -7.249004364013672, + "step": 309 + }, + { + "epoch": 0.02832343535861124, + "grad_norm": 12.625, + "kl": 0.0, + "learning_rate": 9.982743902637424e-06, + "logits/chosen": 714822997.3333334, + "logits/rejected": 527273779.2, + "logps/chosen": -377.61865234375, + "logps/rejected": -474.688330078125, + "loss": 0.0718, + "rewards/chosen": 1.8433574040730794, + "rewards/margins": 6.495581754048665, + "rewards/rejected": -4.652224349975586, + "step": 310 + }, + { + "epoch": 0.028414801279122888, + "grad_norm": 10.9375, + "kl": 0.0, + "learning_rate": 9.982624346004645e-06, + "logits/chosen": 698496192.0, + "logits/rejected": 517136192.0, + "logps/chosen": -240.8699188232422, + "logps/rejected": -216.7144775390625, + "loss": 0.1278, + "rewards/chosen": 2.4109888076782227, + "rewards/margins": 5.149702787399292, + "rewards/rejected": -2.7387139797210693, + "step": 311 + }, + { + "epoch": 0.028506167199634537, + "grad_norm": 5.3125, + "kl": 0.0, + "learning_rate": 9.982504377354996e-06, + "logits/chosen": 558670182.4, + "logits/rejected": 922140330.6666666, + "logps/chosen": -284.057568359375, + "logps/rejected": -506.9239908854167, + "loss": 0.0329, + "rewards/chosen": 3.3002349853515627, + "rewards/margins": 10.344741058349609, + "rewards/rejected": -7.044506072998047, + "step": 312 + }, + { + "epoch": 0.028597533120146186, + "grad_norm": 9.1875, + "kl": 0.0, + "learning_rate": 9.982383996698398e-06, + "logits/chosen": 699008128.0, + "logits/rejected": 474482944.0, + "logps/chosen": -327.11798095703125, + "logps/rejected": -480.07147216796875, + "loss": 0.0537, + "rewards/chosen": 2.1813125610351562, + "rewards/margins": 9.907434463500977, + "rewards/rejected": -7.72612190246582, + "step": 313 + }, + { + "epoch": 0.028688899040657835, + "grad_norm": 9.0625, + "kl": 0.0, + "learning_rate": 9.982263204044809e-06, + "logits/chosen": 537429299.2, + "logits/rejected": 479428864.0, + "logps/chosen": -320.77529296875, + "logps/rejected": -712.5884602864584, + "loss": 0.0684, + "rewards/chosen": 2.2398956298828123, + "rewards/margins": 13.975341288248696, + "rewards/rejected": -11.735445658365885, + "step": 314 + }, + { + "epoch": 0.028780264961169484, + "grad_norm": 15.5, + "kl": 0.0, + "learning_rate": 9.982141999404213e-06, + "logits/chosen": 397440170.6666667, + "logits/rejected": 349616384.0, + "logps/chosen": -348.5042317708333, + "logps/rejected": -215.26202392578125, + "loss": 0.1204, + "rewards/chosen": 2.7189712524414062, + "rewards/margins": 5.58466649055481, + "rewards/rejected": -2.8656952381134033, + "step": 315 + }, + { + "epoch": 0.028871630881681133, + "grad_norm": 6.0625, + "kl": 0.0, + "learning_rate": 9.982020382786635e-06, + "logits/chosen": 676027904.0, + "logits/rejected": 706548480.0, + "logps/chosen": -437.8844299316406, + "logps/rejected": -548.7899169921875, + "loss": 0.0315, + "rewards/chosen": 2.824848175048828, + "rewards/margins": 10.22135305404663, + "rewards/rejected": -7.396504878997803, + "step": 316 + }, + { + "epoch": 0.028962996802192782, + "grad_norm": 5.34375, + "kl": 0.0, + "learning_rate": 9.981898354202129e-06, + "logits/chosen": 378385088.0, + "logits/rejected": 346346464.0, + "logps/chosen": -279.9366455078125, + "logps/rejected": -245.13839721679688, + "loss": 0.0336, + "rewards/chosen": 2.977651596069336, + "rewards/margins": 8.48690128326416, + "rewards/rejected": -5.509249687194824, + "step": 317 + }, + { + "epoch": 0.02905436272270443, + "grad_norm": 11.4375, + "kl": 0.0, + "learning_rate": 9.981775913660789e-06, + "logits/chosen": 323859456.0, + "logits/rejected": 566645205.3333334, + "logps/chosen": -310.22216796875, + "logps/rejected": -353.5431722005208, + "loss": 0.0636, + "rewards/chosen": 3.0173394680023193, + "rewards/margins": 6.770660161972046, + "rewards/rejected": -3.7533206939697266, + "step": 318 + }, + { + "epoch": 0.02914572864321608, + "grad_norm": 4.96875, + "kl": 0.0, + "learning_rate": 9.981653061172738e-06, + "logits/chosen": 576265045.3333334, + "logits/rejected": 472480512.0, + "logps/chosen": -422.9313151041667, + "logps/rejected": -545.809375, + "loss": 0.0289, + "rewards/chosen": 2.9157918294270835, + "rewards/margins": 10.121392567952475, + "rewards/rejected": -7.205600738525391, + "step": 319 + }, + { + "epoch": 0.02923709456372773, + "grad_norm": 8.0, + "kl": 0.0, + "learning_rate": 9.981529796748135e-06, + "logits/chosen": 632372949.3333334, + "logits/rejected": 708252569.6, + "logps/chosen": -447.2373453776042, + "logps/rejected": -467.06474609375, + "loss": 0.0483, + "rewards/chosen": 2.514637311299642, + "rewards/margins": 7.865859158833821, + "rewards/rejected": -5.3512218475341795, + "step": 320 + }, + { + "epoch": 0.02932846048423938, + "grad_norm": 11.75, + "kl": 0.0, + "learning_rate": 9.981406120397172e-06, + "logits/chosen": 365861862.4, + "logits/rejected": 407317845.3333333, + "logps/chosen": -326.8753662109375, + "logps/rejected": -261.13596598307294, + "loss": 0.0631, + "rewards/chosen": 2.77404670715332, + "rewards/margins": 8.994629669189454, + "rewards/rejected": -6.220582962036133, + "step": 321 + }, + { + "epoch": 0.029419826404751027, + "grad_norm": 13.6875, + "kl": 0.0, + "learning_rate": 9.981282032130077e-06, + "logits/chosen": 711414314.6666666, + "logits/rejected": 463566336.0, + "logps/chosen": -478.7364501953125, + "logps/rejected": -174.58232421875, + "loss": 0.1533, + "rewards/chosen": 2.349972724914551, + "rewards/margins": 5.425691795349121, + "rewards/rejected": -3.0757190704345705, + "step": 322 + }, + { + "epoch": 0.029511192325262676, + "grad_norm": 14.4375, + "kl": 0.0, + "learning_rate": 9.98115753195711e-06, + "logits/chosen": 962838016.0, + "logits/rejected": 531610752.0, + "logps/chosen": -389.659423828125, + "logps/rejected": -404.6059875488281, + "loss": 0.0918, + "rewards/chosen": 2.2556864420572915, + "rewards/margins": 7.505021254221598, + "rewards/rejected": -5.249334812164307, + "step": 323 + }, + { + "epoch": 0.029602558245774326, + "grad_norm": 15.5625, + "kl": 0.0, + "learning_rate": 9.98103261988857e-06, + "logits/chosen": 437138688.0, + "logits/rejected": 542588096.0, + "logps/chosen": -371.4336853027344, + "logps/rejected": -350.14971923828125, + "loss": 0.1013, + "rewards/chosen": 2.5269107818603516, + "rewards/margins": 7.7815728187561035, + "rewards/rejected": -5.254662036895752, + "step": 324 + }, + { + "epoch": 0.029693924166285975, + "grad_norm": 5.78125, + "kl": 0.0, + "learning_rate": 9.980907295934779e-06, + "logits/chosen": 666916505.6, + "logits/rejected": 503553706.6666667, + "logps/chosen": -285.077685546875, + "logps/rejected": -471.6160481770833, + "loss": 0.0347, + "rewards/chosen": 3.179273986816406, + "rewards/margins": 9.928528849283854, + "rewards/rejected": -6.749254862467448, + "step": 325 + }, + { + "epoch": 0.029785290086797624, + "grad_norm": 3.796875, + "kl": 0.0, + "learning_rate": 9.980781560106104e-06, + "logits/chosen": 560167082.6666666, + "logits/rejected": 552989388.8, + "logps/chosen": -405.6617024739583, + "logps/rejected": -480.51474609375, + "loss": 0.0195, + "rewards/chosen": 3.152851422627767, + "rewards/margins": 10.075638898213704, + "rewards/rejected": -6.922787475585937, + "step": 326 + }, + { + "epoch": 0.029876656007309273, + "grad_norm": 16.0, + "kl": 0.0, + "learning_rate": 9.980655412412943e-06, + "logits/chosen": 725785280.0, + "logits/rejected": 472774784.0, + "logps/chosen": -169.39254760742188, + "logps/rejected": -368.5707600911458, + "loss": 0.1271, + "rewards/chosen": 2.2062225341796875, + "rewards/margins": 8.833550771077473, + "rewards/rejected": -6.627328236897786, + "step": 327 + }, + { + "epoch": 0.02996802192782092, + "grad_norm": 24.375, + "kl": 0.0, + "learning_rate": 9.980528852865725e-06, + "logits/chosen": 701406208.0, + "logits/rejected": 432862912.0, + "logps/chosen": -484.6132405598958, + "logps/rejected": -511.229736328125, + "loss": 0.1317, + "rewards/chosen": 1.8969186147054036, + "rewards/margins": 10.749218304951986, + "rewards/rejected": -8.852299690246582, + "step": 328 + }, + { + "epoch": 0.03005938784833257, + "grad_norm": 9.9375, + "kl": 0.0, + "learning_rate": 9.980401881474918e-06, + "logits/chosen": 672630592.0, + "logits/rejected": 939229184.0, + "logps/chosen": -333.78515625, + "logps/rejected": -640.0730794270834, + "loss": 0.0511, + "rewards/chosen": 1.3808090686798096, + "rewards/margins": 10.277815421422323, + "rewards/rejected": -8.897006352742514, + "step": 329 + }, + { + "epoch": 0.03015075376884422, + "grad_norm": 10.3125, + "kl": 0.0, + "learning_rate": 9.980274498251018e-06, + "logits/chosen": 308156160.0, + "logits/rejected": 687610240.0, + "logps/chosen": -134.281005859375, + "logps/rejected": -385.2908528645833, + "loss": 0.0835, + "rewards/chosen": 2.1359333992004395, + "rewards/margins": 8.404493172963459, + "rewards/rejected": -6.2685597737630205, + "step": 330 + }, + { + "epoch": 0.03024211968935587, + "grad_norm": 13.0625, + "kl": 0.0, + "learning_rate": 9.980146703204561e-06, + "logits/chosen": 622038869.3333334, + "logits/rejected": 424520550.4, + "logps/chosen": -181.70182291666666, + "logps/rejected": -325.94296875, + "loss": 0.0866, + "rewards/chosen": 2.3420745531717935, + "rewards/margins": 7.203853575388591, + "rewards/rejected": -4.861779022216797, + "step": 331 + }, + { + "epoch": 0.030333485609867518, + "grad_norm": 17.0, + "kl": 0.0, + "learning_rate": 9.980018496346115e-06, + "logits/chosen": 725934592.0, + "logits/rejected": 576217380.5714285, + "logps/chosen": -529.1868286132812, + "logps/rejected": -542.7723214285714, + "loss": 0.0512, + "rewards/chosen": 1.055206298828125, + "rewards/margins": 9.180609566824776, + "rewards/rejected": -8.125403267996651, + "step": 332 + }, + { + "epoch": 0.030424851530379167, + "grad_norm": 10.5625, + "kl": 0.0, + "learning_rate": 9.97988987768628e-06, + "logits/chosen": 713258393.6, + "logits/rejected": 1250473216.0, + "logps/chosen": -374.252685546875, + "logps/rejected": -435.530517578125, + "loss": 0.0653, + "rewards/chosen": 2.304951858520508, + "rewards/margins": 9.14552993774414, + "rewards/rejected": -6.840578079223633, + "step": 333 + }, + { + "epoch": 0.030516217450890816, + "grad_norm": 5.4375, + "kl": 0.0, + "learning_rate": 9.979760847235689e-06, + "logits/chosen": 518756800.0, + "logits/rejected": 873486464.0, + "logps/chosen": -296.53192138671875, + "logps/rejected": -637.6754760742188, + "loss": 0.027, + "rewards/chosen": 3.267029285430908, + "rewards/margins": 15.19675874710083, + "rewards/rejected": -11.929729461669922, + "step": 334 + }, + { + "epoch": 0.03060758337140247, + "grad_norm": 12.1875, + "kl": 0.0, + "learning_rate": 9.979631405005017e-06, + "logits/chosen": 372316480.0, + "logits/rejected": 320172181.3333333, + "logps/chosen": -178.30679321289062, + "logps/rejected": -449.06640625, + "loss": 0.0479, + "rewards/chosen": 1.5772767066955566, + "rewards/margins": 8.929960091908772, + "rewards/rejected": -7.352683385213216, + "step": 335 + }, + { + "epoch": 0.030698949291914118, + "grad_norm": 16.25, + "kl": 0.0, + "learning_rate": 9.979501551004964e-06, + "logits/chosen": 696101418.6666666, + "logits/rejected": 1256332544.0, + "logps/chosen": -549.2565104166666, + "logps/rejected": -470.2998046875, + "loss": 0.0807, + "rewards/chosen": 2.326303482055664, + "rewards/margins": 9.753650665283203, + "rewards/rejected": -7.427347183227539, + "step": 336 + }, + { + "epoch": 0.030790315212425767, + "grad_norm": 13.0, + "kl": 0.0, + "learning_rate": 9.97937128524627e-06, + "logits/chosen": 836550997.3333334, + "logits/rejected": 520306483.2, + "logps/chosen": -258.1940104166667, + "logps/rejected": -412.84248046875, + "loss": 0.0729, + "rewards/chosen": 2.1008790334065757, + "rewards/margins": 8.124287541707357, + "rewards/rejected": -6.023408508300781, + "step": 337 + }, + { + "epoch": 0.030881681132937416, + "grad_norm": 7.03125, + "kl": 0.0, + "learning_rate": 9.979240607739706e-06, + "logits/chosen": 348273408.0, + "logits/rejected": 812606122.6666666, + "logps/chosen": -243.8782958984375, + "logps/rejected": -422.6798095703125, + "loss": 0.0445, + "rewards/chosen": 3.110158157348633, + "rewards/margins": 9.563499323527019, + "rewards/rejected": -6.453341166178386, + "step": 338 + }, + { + "epoch": 0.030973047053449065, + "grad_norm": 2.59375, + "kl": 0.0, + "learning_rate": 9.979109518496076e-06, + "logits/chosen": 291936384.0, + "logits/rejected": 425986688.0, + "logps/chosen": -210.5006103515625, + "logps/rejected": -563.0192464192709, + "loss": 0.0166, + "rewards/chosen": 2.8665199279785156, + "rewards/margins": 10.060499827067058, + "rewards/rejected": -7.193979899088542, + "step": 339 + }, + { + "epoch": 0.031064412973960714, + "grad_norm": 18.875, + "kl": 0.0, + "learning_rate": 9.978978017526222e-06, + "logits/chosen": 355587808.0, + "logits/rejected": 499488256.0, + "logps/chosen": -261.1611328125, + "logps/rejected": -428.6769205729167, + "loss": 0.0678, + "rewards/chosen": 2.5324509143829346, + "rewards/margins": 8.110827048619587, + "rewards/rejected": -5.578376134236653, + "step": 340 + }, + { + "epoch": 0.031155778894472363, + "grad_norm": 16.75, + "kl": 0.0, + "learning_rate": 9.978846104841016e-06, + "logits/chosen": 489479594.6666667, + "logits/rejected": 613245030.4, + "logps/chosen": -276.2311197916667, + "logps/rejected": -622.48505859375, + "loss": 0.1073, + "rewards/chosen": 2.2708420753479004, + "rewards/margins": 7.959740543365479, + "rewards/rejected": -5.6888984680175785, + "step": 341 + }, + { + "epoch": 0.031247144814984012, + "grad_norm": 8.9375, + "kl": 0.0, + "learning_rate": 9.978713780451367e-06, + "logits/chosen": 716744704.0, + "logits/rejected": 953787289.6, + "logps/chosen": -300.80153401692706, + "logps/rejected": -474.73671875, + "loss": 0.0463, + "rewards/chosen": 2.8950573603312173, + "rewards/margins": 7.967599932352702, + "rewards/rejected": -5.072542572021485, + "step": 342 + }, + { + "epoch": 0.03133851073549566, + "grad_norm": 8.75, + "kl": 0.0, + "learning_rate": 9.97858104436822e-06, + "logits/chosen": 619984042.6666666, + "logits/rejected": 490631782.4, + "logps/chosen": -467.5723470052083, + "logps/rejected": -474.862109375, + "loss": 0.0456, + "rewards/chosen": 2.339531580607096, + "rewards/margins": 8.384200159708659, + "rewards/rejected": -6.044668579101563, + "step": 343 + }, + { + "epoch": 0.031429876656007306, + "grad_norm": 14.375, + "kl": 0.0, + "learning_rate": 9.978447896602546e-06, + "logits/chosen": 856268851.2, + "logits/rejected": 442668544.0, + "logps/chosen": -285.386962890625, + "logps/rejected": -298.7592366536458, + "loss": 0.118, + "rewards/chosen": 2.515532684326172, + "rewards/margins": 7.9404753367106125, + "rewards/rejected": -5.42494265238444, + "step": 344 + }, + { + "epoch": 0.031521242576518956, + "grad_norm": 14.75, + "kl": 0.0, + "learning_rate": 9.978314337165358e-06, + "logits/chosen": 467514848.0, + "logits/rejected": 406990720.0, + "logps/chosen": -333.2935791015625, + "logps/rejected": -391.1026611328125, + "loss": 0.0815, + "rewards/chosen": 2.272404432296753, + "rewards/margins": 6.93091082572937, + "rewards/rejected": -4.658506393432617, + "step": 345 + }, + { + "epoch": 0.031612608497030605, + "grad_norm": 5.03125, + "kl": 0.0, + "learning_rate": 9.9781803660677e-06, + "logits/chosen": 398800512.0, + "logits/rejected": 399124138.6666667, + "logps/chosen": -244.1231689453125, + "logps/rejected": -439.794677734375, + "loss": 0.0388, + "rewards/chosen": 3.0575233459472657, + "rewards/margins": 9.246883010864257, + "rewards/rejected": -6.189359664916992, + "step": 346 + }, + { + "epoch": 0.031703974417542254, + "grad_norm": 14.6875, + "kl": 0.0, + "learning_rate": 9.978045983320648e-06, + "logits/chosen": 795341653.3333334, + "logits/rejected": 565216665.6, + "logps/chosen": -385.472900390625, + "logps/rejected": -428.5998046875, + "loss": 0.1042, + "rewards/chosen": 1.8635738690694172, + "rewards/margins": 7.385157426198323, + "rewards/rejected": -5.521583557128906, + "step": 347 + }, + { + "epoch": 0.0317953403380539, + "grad_norm": 14.625, + "kl": 0.0, + "learning_rate": 9.977911188935317e-06, + "logits/chosen": 647577130.6666666, + "logits/rejected": 724762496.0, + "logps/chosen": -478.5809326171875, + "logps/rejected": -354.83441162109375, + "loss": 0.0799, + "rewards/chosen": 3.0716965993245444, + "rewards/margins": 6.022425969441732, + "rewards/rejected": -2.9507293701171875, + "step": 348 + }, + { + "epoch": 0.03188670625856555, + "grad_norm": 13.0, + "kl": 0.0, + "learning_rate": 9.97777598292285e-06, + "logits/chosen": 491043413.3333333, + "logits/rejected": 769443840.0, + "logps/chosen": -323.1643880208333, + "logps/rejected": -631.5232421875, + "loss": 0.0749, + "rewards/chosen": 1.9032440185546875, + "rewards/margins": 7.5322216033935545, + "rewards/rejected": -5.628977584838867, + "step": 349 + }, + { + "epoch": 0.03197807217907721, + "grad_norm": 9.0625, + "kl": 0.0, + "learning_rate": 9.977640365294433e-06, + "logits/chosen": 701885269.3333334, + "logits/rejected": 608988160.0, + "logps/chosen": -328.19150797526044, + "logps/rejected": -483.1638671875, + "loss": 0.0699, + "rewards/chosen": 1.9005896250406902, + "rewards/margins": 7.444814173380534, + "rewards/rejected": -5.544224548339844, + "step": 350 + }, + { + "epoch": 0.03206943809958886, + "grad_norm": 9.9375, + "kl": 0.0, + "learning_rate": 9.977504336061274e-06, + "logits/chosen": 744870604.8, + "logits/rejected": 631157973.3333334, + "logps/chosen": -468.825537109375, + "logps/rejected": -468.5934651692708, + "loss": 0.0606, + "rewards/chosen": 2.7272403717041014, + "rewards/margins": 9.014574813842774, + "rewards/rejected": -6.287334442138672, + "step": 351 + }, + { + "epoch": 0.032160804020100506, + "grad_norm": 8.0, + "kl": 0.0, + "learning_rate": 9.977367895234624e-06, + "logits/chosen": 478045920.0, + "logits/rejected": 567177792.0, + "logps/chosen": -353.05877685546875, + "logps/rejected": -406.60333251953125, + "loss": 0.0443, + "rewards/chosen": 2.73563289642334, + "rewards/margins": 9.411049842834473, + "rewards/rejected": -6.675416946411133, + "step": 352 + }, + { + "epoch": 0.032252169940612155, + "grad_norm": 9.5625, + "kl": 0.0, + "learning_rate": 9.977231042825767e-06, + "logits/chosen": 470640844.8, + "logits/rejected": 382839637.3333333, + "logps/chosen": -378.0087646484375, + "logps/rejected": -591.5567626953125, + "loss": 0.063, + "rewards/chosen": 2.3938201904296874, + "rewards/margins": 11.8460875193278, + "rewards/rejected": -9.452267328898111, + "step": 353 + }, + { + "epoch": 0.032343535861123804, + "grad_norm": 14.5625, + "kl": 0.0, + "learning_rate": 9.977093778846016e-06, + "logits/chosen": 959733930.6666666, + "logits/rejected": 451001472.0, + "logps/chosen": -404.9049886067708, + "logps/rejected": -455.1192932128906, + "loss": 0.0712, + "rewards/chosen": 3.0195385615030923, + "rewards/margins": 10.73602549235026, + "rewards/rejected": -7.716486930847168, + "step": 354 + }, + { + "epoch": 0.03243490178163545, + "grad_norm": 7.625, + "kl": 0.0, + "learning_rate": 9.976956103306726e-06, + "logits/chosen": 429783008.0, + "logits/rejected": 544887296.0, + "logps/chosen": -307.9815673828125, + "logps/rejected": -413.2242431640625, + "loss": 0.0408, + "rewards/chosen": 3.4651899337768555, + "rewards/margins": 7.92677116394043, + "rewards/rejected": -4.461581230163574, + "step": 355 + }, + { + "epoch": 0.0325262677021471, + "grad_norm": 8.0625, + "kl": 0.0, + "learning_rate": 9.976818016219277e-06, + "logits/chosen": 745662592.0, + "logits/rejected": 535374272.0, + "logps/chosen": -371.5352783203125, + "logps/rejected": -599.8822631835938, + "loss": 0.0458, + "rewards/chosen": 2.64682936668396, + "rewards/margins": 8.980257749557495, + "rewards/rejected": -6.333428382873535, + "step": 356 + }, + { + "epoch": 0.03261763362265875, + "grad_norm": 12.875, + "kl": 0.0, + "learning_rate": 9.976679517595088e-06, + "logits/chosen": 223452336.0, + "logits/rejected": 568286464.0, + "logps/chosen": -86.49948120117188, + "logps/rejected": -252.05121721540178, + "loss": 0.1187, + "rewards/chosen": 4.0535125732421875, + "rewards/margins": 7.828948429652623, + "rewards/rejected": -3.7754358564104353, + "step": 357 + }, + { + "epoch": 0.0327089995431704, + "grad_norm": 15.625, + "kl": 0.0, + "learning_rate": 9.976540607445615e-06, + "logits/chosen": 1203862272.0, + "logits/rejected": 624055193.6, + "logps/chosen": -184.13924153645834, + "logps/rejected": -412.6126953125, + "loss": 0.1446, + "rewards/chosen": 2.0102017720540366, + "rewards/margins": 4.904900105794271, + "rewards/rejected": -2.8946983337402346, + "step": 358 + }, + { + "epoch": 0.03280036546368205, + "grad_norm": 12.4375, + "kl": 0.0, + "learning_rate": 9.976401285782341e-06, + "logits/chosen": 1370219008.0, + "logits/rejected": 424570272.0, + "logps/chosen": -412.6087646484375, + "logps/rejected": -256.78900146484375, + "loss": 0.0618, + "rewards/chosen": 2.2802248001098633, + "rewards/margins": 7.38528299331665, + "rewards/rejected": -5.105058193206787, + "step": 359 + }, + { + "epoch": 0.0328917313841937, + "grad_norm": 8.25, + "kl": 0.0, + "learning_rate": 9.976261552616788e-06, + "logits/chosen": 1105983692.8, + "logits/rejected": 532086400.0, + "logps/chosen": -517.4849609375, + "logps/rejected": -431.0495198567708, + "loss": 0.0491, + "rewards/chosen": 2.8260406494140624, + "rewards/margins": 9.41664098103841, + "rewards/rejected": -6.590600331624349, + "step": 360 + }, + { + "epoch": 0.03298309730470535, + "grad_norm": 8.4375, + "kl": 0.0, + "learning_rate": 9.976121407960512e-06, + "logits/chosen": 568316620.8, + "logits/rejected": 475402965.3333333, + "logps/chosen": -400.392236328125, + "logps/rejected": -487.589599609375, + "loss": 0.0625, + "rewards/chosen": 2.75501708984375, + "rewards/margins": 10.05255521138509, + "rewards/rejected": -7.297538121541341, + "step": 361 + }, + { + "epoch": 0.033074463225216996, + "grad_norm": 6.90625, + "kl": 0.0, + "learning_rate": 9.9759808518251e-06, + "logits/chosen": 463151744.0, + "logits/rejected": 382268672.0, + "logps/chosen": -268.7980651855469, + "logps/rejected": -577.259521484375, + "loss": 0.0462, + "rewards/chosen": 2.4462673664093018, + "rewards/margins": 8.313706636428833, + "rewards/rejected": -5.867439270019531, + "step": 362 + }, + { + "epoch": 0.033165829145728645, + "grad_norm": 8.1875, + "kl": 0.0, + "learning_rate": 9.975839884222173e-06, + "logits/chosen": 694338713.6, + "logits/rejected": 520435968.0, + "logps/chosen": -267.935693359375, + "logps/rejected": -521.8617350260416, + "loss": 0.0513, + "rewards/chosen": 2.7578155517578127, + "rewards/margins": 12.027991485595702, + "rewards/rejected": -9.27017593383789, + "step": 363 + }, + { + "epoch": 0.033257195066240294, + "grad_norm": 7.875, + "kl": 0.0, + "learning_rate": 9.975698505163392e-06, + "logits/chosen": 836165888.0, + "logits/rejected": 1096098816.0, + "logps/chosen": -95.85292053222656, + "logps/rejected": -606.4890950520834, + "loss": 0.0571, + "rewards/chosen": 1.3318564891815186, + "rewards/margins": 9.722391843795776, + "rewards/rejected": -8.390535354614258, + "step": 364 + }, + { + "epoch": 0.03334856098675194, + "grad_norm": 18.25, + "kl": 0.0, + "learning_rate": 9.975556714660445e-06, + "logits/chosen": 1152081612.8, + "logits/rejected": 618434816.0, + "logps/chosen": -347.66552734375, + "logps/rejected": -513.3892822265625, + "loss": 0.1226, + "rewards/chosen": 1.587946891784668, + "rewards/margins": 8.503086407979328, + "rewards/rejected": -6.915139516194661, + "step": 365 + }, + { + "epoch": 0.03343992690726359, + "grad_norm": 15.0, + "kl": 0.0, + "learning_rate": 9.975414512725058e-06, + "logits/chosen": 449425280.0, + "logits/rejected": 428306048.0, + "logps/chosen": -507.3070068359375, + "logps/rejected": -497.3306884765625, + "loss": 0.0842, + "rewards/chosen": 2.835742950439453, + "rewards/margins": 7.587071418762207, + "rewards/rejected": -4.751328468322754, + "step": 366 + }, + { + "epoch": 0.03353129282777524, + "grad_norm": 22.125, + "kl": 0.0, + "learning_rate": 9.975271899368989e-06, + "logits/chosen": 610207616.0, + "logits/rejected": 369736096.0, + "logps/chosen": -280.9571838378906, + "logps/rejected": -361.1112060546875, + "loss": 0.1704, + "rewards/chosen": 1.205095648765564, + "rewards/margins": 7.184152960777283, + "rewards/rejected": -5.979057312011719, + "step": 367 + }, + { + "epoch": 0.03362265874828689, + "grad_norm": 14.8125, + "kl": 0.0, + "learning_rate": 9.97512887460403e-06, + "logits/chosen": 635072384.0, + "logits/rejected": 303429408.0, + "logps/chosen": -455.1417236328125, + "logps/rejected": -418.3476257324219, + "loss": 0.0816, + "rewards/chosen": 2.5544567108154297, + "rewards/margins": 7.90317964553833, + "rewards/rejected": -5.3487229347229, + "step": 368 + }, + { + "epoch": 0.03371402466879854, + "grad_norm": 9.875, + "kl": 0.0, + "learning_rate": 9.974985438442008e-06, + "logits/chosen": 338570112.0, + "logits/rejected": 551625472.0, + "logps/chosen": -533.9149169921875, + "logps/rejected": -463.9668375651042, + "loss": 0.0456, + "rewards/chosen": 2.0180587768554688, + "rewards/margins": 7.417198181152344, + "rewards/rejected": -5.399139404296875, + "step": 369 + }, + { + "epoch": 0.03380539058931019, + "grad_norm": 5.40625, + "kl": 0.0, + "learning_rate": 9.974841590894785e-06, + "logits/chosen": 749716889.6, + "logits/rejected": 529189248.0, + "logps/chosen": -444.85634765625, + "logps/rejected": -431.4267578125, + "loss": 0.0351, + "rewards/chosen": 2.9853864669799806, + "rewards/margins": 8.958288129170736, + "rewards/rejected": -5.972901662190755, + "step": 370 + }, + { + "epoch": 0.03389675650982184, + "grad_norm": 9.1875, + "kl": 0.0, + "learning_rate": 9.974697331974255e-06, + "logits/chosen": 622715840.0, + "logits/rejected": 569798272.0, + "logps/chosen": -409.67279052734375, + "logps/rejected": -346.5406494140625, + "loss": 0.0657, + "rewards/chosen": 2.407459259033203, + "rewards/margins": 6.236487865447998, + "rewards/rejected": -3.829028606414795, + "step": 371 + }, + { + "epoch": 0.03398812243033349, + "grad_norm": 13.9375, + "kl": 0.0, + "learning_rate": 9.97455266169235e-06, + "logits/chosen": 636592588.8, + "logits/rejected": 990693034.6666666, + "logps/chosen": -358.804833984375, + "logps/rejected": -635.9763590494791, + "loss": 0.0961, + "rewards/chosen": 2.217915344238281, + "rewards/margins": 8.468027242024739, + "rewards/rejected": -6.250111897786458, + "step": 372 + }, + { + "epoch": 0.034079488350845136, + "grad_norm": 16.5, + "kl": 0.0, + "learning_rate": 9.974407580061027e-06, + "logits/chosen": 689475635.2, + "logits/rejected": 1281958058.6666667, + "logps/chosen": -405.7575439453125, + "logps/rejected": -527.7412516276041, + "loss": 0.0837, + "rewards/chosen": 2.534380340576172, + "rewards/margins": 10.561851247151694, + "rewards/rejected": -8.027470906575521, + "step": 373 + }, + { + "epoch": 0.034170854271356785, + "grad_norm": 10.125, + "kl": 0.0, + "learning_rate": 9.974262087092287e-06, + "logits/chosen": 325326643.2, + "logits/rejected": 371906304.0, + "logps/chosen": -286.633056640625, + "logps/rejected": -326.06585693359375, + "loss": 0.0573, + "rewards/chosen": 2.680057144165039, + "rewards/margins": 9.04880485534668, + "rewards/rejected": -6.368747711181641, + "step": 374 + }, + { + "epoch": 0.034262220191868434, + "grad_norm": 9.8125, + "kl": 0.0, + "learning_rate": 9.974116182798159e-06, + "logits/chosen": 513455552.0, + "logits/rejected": 580203306.6666666, + "logps/chosen": -138.5105743408203, + "logps/rejected": -365.5184733072917, + "loss": 0.0517, + "rewards/chosen": 2.476553440093994, + "rewards/margins": 8.407736937204998, + "rewards/rejected": -5.931183497111003, + "step": 375 + }, + { + "epoch": 0.03435358611238008, + "grad_norm": 5.5, + "kl": 0.0, + "learning_rate": 9.97396986719071e-06, + "logits/chosen": 390355754.6666667, + "logits/rejected": 331171584.0, + "logps/chosen": -356.7649739583333, + "logps/rejected": -392.586279296875, + "loss": 0.0271, + "rewards/chosen": 2.961998621622721, + "rewards/margins": 9.770779291788736, + "rewards/rejected": -6.808780670166016, + "step": 376 + }, + { + "epoch": 0.03444495203289173, + "grad_norm": 4.1875, + "kl": 0.0, + "learning_rate": 9.973823140282038e-06, + "logits/chosen": 432954538.6666667, + "logits/rejected": 918746009.6, + "logps/chosen": -249.71624755859375, + "logps/rejected": -380.150244140625, + "loss": 0.0215, + "rewards/chosen": 3.457839330037435, + "rewards/margins": 8.50064557393392, + "rewards/rejected": -5.042806243896484, + "step": 377 + }, + { + "epoch": 0.03453631795340338, + "grad_norm": 14.5, + "kl": 0.0, + "learning_rate": 9.973676002084275e-06, + "logits/chosen": 525725593.6, + "logits/rejected": 839466325.3333334, + "logps/chosen": -387.9227294921875, + "logps/rejected": -346.7799072265625, + "loss": 0.1437, + "rewards/chosen": 3.0475032806396483, + "rewards/margins": 5.049232769012451, + "rewards/rejected": -2.0017294883728027, + "step": 378 + }, + { + "epoch": 0.03462768387391503, + "grad_norm": 7.4375, + "kl": 0.0, + "learning_rate": 9.97352845260959e-06, + "logits/chosen": 687013580.8, + "logits/rejected": 518557397.3333333, + "logps/chosen": -316.61572265625, + "logps/rejected": -489.7386067708333, + "loss": 0.0398, + "rewards/chosen": 3.031917762756348, + "rewards/margins": 12.074876340230308, + "rewards/rejected": -9.042958577473959, + "step": 379 + }, + { + "epoch": 0.03471904979442668, + "grad_norm": 8.75, + "kl": 0.0, + "learning_rate": 9.973380491870181e-06, + "logits/chosen": 324474496.0, + "logits/rejected": 526070336.0, + "logps/chosen": -407.78131103515625, + "logps/rejected": -680.958984375, + "loss": 0.0498, + "rewards/chosen": 2.3937675952911377, + "rewards/margins": 9.055458307266235, + "rewards/rejected": -6.661690711975098, + "step": 380 + }, + { + "epoch": 0.03481041571493833, + "grad_norm": 14.3125, + "kl": 0.0, + "learning_rate": 9.973232119878287e-06, + "logits/chosen": 443618976.0, + "logits/rejected": 525218112.0, + "logps/chosen": -230.15032958984375, + "logps/rejected": -363.347412109375, + "loss": 0.1083, + "rewards/chosen": 2.4856152534484863, + "rewards/margins": 6.041311264038086, + "rewards/rejected": -3.5556960105895996, + "step": 381 + }, + { + "epoch": 0.03490178163544998, + "grad_norm": 11.125, + "kl": 0.0, + "learning_rate": 9.973083336646172e-06, + "logits/chosen": 455138816.0, + "logits/rejected": 391794176.0, + "logps/chosen": -294.7173258463542, + "logps/rejected": -456.3181640625, + "loss": 0.081, + "rewards/chosen": 3.3022705713907876, + "rewards/margins": 8.93784262339274, + "rewards/rejected": -5.635572052001953, + "step": 382 + }, + { + "epoch": 0.034993147555961626, + "grad_norm": 12.875, + "kl": 0.0, + "learning_rate": 9.972934142186144e-06, + "logits/chosen": 637336960.0, + "logits/rejected": 715013120.0, + "logps/chosen": -344.5589294433594, + "logps/rejected": -319.9447937011719, + "loss": 0.0876, + "rewards/chosen": 2.2281081676483154, + "rewards/margins": 7.020530462265015, + "rewards/rejected": -4.792422294616699, + "step": 383 + }, + { + "epoch": 0.035084513476473275, + "grad_norm": 4.875, + "kl": 0.0, + "learning_rate": 9.972784536510537e-06, + "logits/chosen": 288892352.0, + "logits/rejected": 481831082.6666667, + "logps/chosen": -242.40243530273438, + "logps/rejected": -398.9506429036458, + "loss": 0.0232, + "rewards/chosen": 3.2873482704162598, + "rewards/margins": 8.788291136423748, + "rewards/rejected": -5.500942866007487, + "step": 384 + }, + { + "epoch": 0.035175879396984924, + "grad_norm": 7.875, + "kl": 0.0, + "learning_rate": 9.972634519631723e-06, + "logits/chosen": 634343872.0, + "logits/rejected": 1258819328.0, + "logps/chosen": -219.79563903808594, + "logps/rejected": -450.3194580078125, + "loss": 0.0598, + "rewards/chosen": 2.8693065643310547, + "rewards/margins": 7.670329570770264, + "rewards/rejected": -4.801023006439209, + "step": 385 + }, + { + "epoch": 0.03526724531749657, + "grad_norm": 10.5625, + "kl": 0.0, + "learning_rate": 9.972484091562106e-06, + "logits/chosen": 720490035.2, + "logits/rejected": 1840242688.0, + "logps/chosen": -250.2340087890625, + "logps/rejected": -488.4994303385417, + "loss": 0.0635, + "rewards/chosen": 3.1745677947998048, + "rewards/margins": 7.198652521769207, + "rewards/rejected": -4.024084726969401, + "step": 386 + }, + { + "epoch": 0.03535861123800822, + "grad_norm": 17.75, + "kl": 0.0, + "learning_rate": 9.972333252314127e-06, + "logits/chosen": 631369472.0, + "logits/rejected": 625256294.4, + "logps/chosen": -391.1392822265625, + "logps/rejected": -524.83662109375, + "loss": 0.1176, + "rewards/chosen": 1.9255504608154297, + "rewards/margins": 7.011430740356445, + "rewards/rejected": -5.085880279541016, + "step": 387 + }, + { + "epoch": 0.03544997715851987, + "grad_norm": 13.9375, + "kl": 0.0, + "learning_rate": 9.972182001900254e-06, + "logits/chosen": 656062080.0, + "logits/rejected": 373689664.0, + "logps/chosen": -381.647705078125, + "logps/rejected": -317.77740478515625, + "loss": 0.0926, + "rewards/chosen": 1.6089156866073608, + "rewards/margins": 7.036187529563904, + "rewards/rejected": -5.427271842956543, + "step": 388 + }, + { + "epoch": 0.03554134307903152, + "grad_norm": 4.28125, + "kl": 0.0, + "learning_rate": 9.972030340333e-06, + "logits/chosen": 324174080.0, + "logits/rejected": 741119573.3333334, + "logps/chosen": -167.86965942382812, + "logps/rejected": -442.2347005208333, + "loss": 0.0325, + "rewards/chosen": 2.6300783157348633, + "rewards/margins": 7.81614335378011, + "rewards/rejected": -5.186065038045247, + "step": 389 + }, + { + "epoch": 0.03563270899954317, + "grad_norm": 18.375, + "kl": 0.0, + "learning_rate": 9.971878267624906e-06, + "logits/chosen": 367400320.0, + "logits/rejected": 749542528.0, + "logps/chosen": -451.625732421875, + "logps/rejected": -394.5749816894531, + "loss": 0.1838, + "rewards/chosen": 2.293008327484131, + "rewards/margins": 6.732961177825928, + "rewards/rejected": -4.439952850341797, + "step": 390 + }, + { + "epoch": 0.03572407492005482, + "grad_norm": 9.375, + "kl": 0.0, + "learning_rate": 9.971725783788543e-06, + "logits/chosen": 763159500.8, + "logits/rejected": 462412117.3333333, + "logps/chosen": -204.4743408203125, + "logps/rejected": -340.12158203125, + "loss": 0.0603, + "rewards/chosen": 2.3799915313720703, + "rewards/margins": 8.734861373901367, + "rewards/rejected": -6.354869842529297, + "step": 391 + }, + { + "epoch": 0.03581544084056647, + "grad_norm": 7.84375, + "kl": 0.0, + "learning_rate": 9.97157288883652e-06, + "logits/chosen": 664649280.0, + "logits/rejected": 492516736.0, + "logps/chosen": -304.7976379394531, + "logps/rejected": -390.6095275878906, + "loss": 0.0533, + "rewards/chosen": 2.5241661071777344, + "rewards/margins": 7.259763240814209, + "rewards/rejected": -4.735597133636475, + "step": 392 + }, + { + "epoch": 0.03590680676107812, + "grad_norm": 2.125, + "kl": 0.0, + "learning_rate": 9.971419582781484e-06, + "logits/chosen": 546178432.0, + "logits/rejected": 570452437.3333334, + "logps/chosen": -369.99029541015625, + "logps/rejected": -491.5631510416667, + "loss": 0.0113, + "rewards/chosen": 3.4142470359802246, + "rewards/margins": 10.765471935272217, + "rewards/rejected": -7.351224899291992, + "step": 393 + }, + { + "epoch": 0.035998172681589766, + "grad_norm": 12.8125, + "kl": 0.0, + "learning_rate": 9.97126586563611e-06, + "logits/chosen": 505584537.6, + "logits/rejected": 541095424.0, + "logps/chosen": -298.78515625, + "logps/rejected": -374.69677734375, + "loss": 0.0812, + "rewards/chosen": 2.270049476623535, + "rewards/margins": 6.351903978983561, + "rewards/rejected": -4.081854502360026, + "step": 394 + }, + { + "epoch": 0.036089538602101415, + "grad_norm": 3.5625, + "kl": 0.0, + "learning_rate": 9.971111737413108e-06, + "logits/chosen": 505236416.0, + "logits/rejected": 611940544.0, + "logps/chosen": -348.66864013671875, + "logps/rejected": -342.91412353515625, + "loss": 0.0178, + "rewards/chosen": 3.6788058280944824, + "rewards/margins": 9.242686748504639, + "rewards/rejected": -5.563880920410156, + "step": 395 + }, + { + "epoch": 0.036180904522613064, + "grad_norm": 8.9375, + "kl": 0.0, + "learning_rate": 9.970957198125224e-06, + "logits/chosen": 583677098.6666666, + "logits/rejected": 425766016.0, + "logps/chosen": -365.5794270833333, + "logps/rejected": -578.0845947265625, + "loss": 0.0654, + "rewards/chosen": 2.49199644724528, + "rewards/margins": 15.70037810007731, + "rewards/rejected": -13.208381652832031, + "step": 396 + }, + { + "epoch": 0.03627227044312471, + "grad_norm": 11.0, + "kl": 0.0, + "learning_rate": 9.970802247785235e-06, + "logits/chosen": 313403477.3333333, + "logits/rejected": 443045376.0, + "logps/chosen": -210.96529134114584, + "logps/rejected": -395.8905029296875, + "loss": 0.0631, + "rewards/chosen": 2.4516967137654624, + "rewards/margins": 10.637824185689292, + "rewards/rejected": -8.186127471923829, + "step": 397 + }, + { + "epoch": 0.03636363636363636, + "grad_norm": 7.71875, + "kl": 0.0, + "learning_rate": 9.970646886405957e-06, + "logits/chosen": 554931419.4285715, + "logits/rejected": 436097280.0, + "logps/chosen": -407.6992885044643, + "logps/rejected": -923.4595947265625, + "loss": 0.0569, + "rewards/chosen": 2.8172264099121094, + "rewards/margins": 22.31942367553711, + "rewards/rejected": -19.502197265625, + "step": 398 + }, + { + "epoch": 0.03645500228414801, + "grad_norm": 14.125, + "kl": 0.0, + "learning_rate": 9.970491114000234e-06, + "logits/chosen": 511271526.4, + "logits/rejected": 220514858.66666666, + "logps/chosen": -317.25029296875, + "logps/rejected": -361.0024007161458, + "loss": 0.1772, + "rewards/chosen": 1.4476099014282227, + "rewards/margins": 8.57375939687093, + "rewards/rejected": -7.126149495442708, + "step": 399 + }, + { + "epoch": 0.03654636820465966, + "grad_norm": 15.875, + "kl": 0.0, + "learning_rate": 9.970334930580951e-06, + "logits/chosen": 456875622.4, + "logits/rejected": 535948586.6666667, + "logps/chosen": -258.0383056640625, + "logps/rejected": -283.91444905598956, + "loss": 0.1731, + "rewards/chosen": 1.6106082916259765, + "rewards/margins": 5.87650826772054, + "rewards/rejected": -4.2658999760945635, + "step": 400 + }, + { + "epoch": 0.03663773412517131, + "grad_norm": 14.9375, + "kl": 0.0, + "learning_rate": 9.970178336161018e-06, + "logits/chosen": 488413088.0, + "logits/rejected": 575884672.0, + "logps/chosen": -405.3372497558594, + "logps/rejected": -623.9171142578125, + "loss": 0.0822, + "rewards/chosen": 2.4787399768829346, + "rewards/margins": 10.51103138923645, + "rewards/rejected": -8.032291412353516, + "step": 401 + }, + { + "epoch": 0.03672910004568296, + "grad_norm": 11.125, + "kl": 0.0, + "learning_rate": 9.970021330753388e-06, + "logits/chosen": 584177536.0, + "logits/rejected": 877066112.0, + "logps/chosen": -202.2061767578125, + "logps/rejected": -436.71112060546875, + "loss": 0.0446, + "rewards/chosen": 2.74041748046875, + "rewards/margins": 11.960323333740234, + "rewards/rejected": -9.219905853271484, + "step": 402 + }, + { + "epoch": 0.03682046596619461, + "grad_norm": 10.0, + "kl": 0.0, + "learning_rate": 9.969863914371043e-06, + "logits/chosen": 532137301.3333333, + "logits/rejected": 472394649.6, + "logps/chosen": -256.7783203125, + "logps/rejected": -484.3236328125, + "loss": 0.0595, + "rewards/chosen": 2.608478228251139, + "rewards/margins": 9.392038408915202, + "rewards/rejected": -6.783560180664063, + "step": 403 + }, + { + "epoch": 0.036911831886706256, + "grad_norm": 4.53125, + "kl": 0.0, + "learning_rate": 9.969706087026998e-06, + "logits/chosen": 305012160.0, + "logits/rejected": 368063018.6666667, + "logps/chosen": -269.9728088378906, + "logps/rejected": -386.0425618489583, + "loss": 0.0294, + "rewards/chosen": 2.222878932952881, + "rewards/margins": 9.365106105804443, + "rewards/rejected": -7.1422271728515625, + "step": 404 + }, + { + "epoch": 0.037003197807217905, + "grad_norm": 9.8125, + "kl": 0.0, + "learning_rate": 9.969547848734305e-06, + "logits/chosen": 710235093.3333334, + "logits/rejected": 659109056.0, + "logps/chosen": -385.06298828125, + "logps/rejected": -598.135009765625, + "loss": 0.0603, + "rewards/chosen": 2.7929347356160483, + "rewards/margins": 7.276488622029623, + "rewards/rejected": -4.483553886413574, + "step": 405 + }, + { + "epoch": 0.037094563727729554, + "grad_norm": 5.0, + "kl": 0.0, + "learning_rate": 9.969389199506047e-06, + "logits/chosen": 696804096.0, + "logits/rejected": 910424768.0, + "logps/chosen": -239.078125, + "logps/rejected": -617.18115234375, + "loss": 0.0378, + "rewards/chosen": 2.8536183834075928, + "rewards/margins": 10.577636957168579, + "rewards/rejected": -7.724018573760986, + "step": 406 + }, + { + "epoch": 0.0371859296482412, + "grad_norm": 6.90625, + "kl": 0.0, + "learning_rate": 9.969230139355348e-06, + "logits/chosen": 335656243.2, + "logits/rejected": 585520384.0, + "logps/chosen": -419.65439453125, + "logps/rejected": -336.15236409505206, + "loss": 0.0439, + "rewards/chosen": 3.2204174041748046, + "rewards/margins": 6.775129699707032, + "rewards/rejected": -3.5547122955322266, + "step": 407 + }, + { + "epoch": 0.03727729556875285, + "grad_norm": 10.0625, + "kl": 0.0, + "learning_rate": 9.969070668295353e-06, + "logits/chosen": 501530112.0, + "logits/rejected": 889342156.8, + "logps/chosen": -335.1486002604167, + "logps/rejected": -525.795654296875, + "loss": 0.0417, + "rewards/chosen": 3.2826639811197915, + "rewards/margins": 12.093256632486979, + "rewards/rejected": -8.810592651367188, + "step": 408 + }, + { + "epoch": 0.0373686614892645, + "grad_norm": 15.8125, + "kl": 0.0, + "learning_rate": 9.968910786339257e-06, + "logits/chosen": 806985728.0, + "logits/rejected": 359588416.0, + "logps/chosen": -391.4116617838542, + "logps/rejected": -231.9819793701172, + "loss": 0.1147, + "rewards/chosen": 2.306669553120931, + "rewards/margins": 4.918378432591757, + "rewards/rejected": -2.611708879470825, + "step": 409 + }, + { + "epoch": 0.03746002740977615, + "grad_norm": 17.125, + "kl": 0.0, + "learning_rate": 9.968750493500277e-06, + "logits/chosen": 379692864.0, + "logits/rejected": 480578944.0, + "logps/chosen": -215.5712432861328, + "logps/rejected": -391.48870849609375, + "loss": 0.1986, + "rewards/chosen": 0.8424301743507385, + "rewards/margins": 5.060128271579742, + "rewards/rejected": -4.217698097229004, + "step": 410 + }, + { + "epoch": 0.0375513933302878, + "grad_norm": 18.875, + "kl": 0.0, + "learning_rate": 9.968589789791665e-06, + "logits/chosen": 776701286.4, + "logits/rejected": 366522112.0, + "logps/chosen": -318.2818359375, + "logps/rejected": -263.48964436848956, + "loss": 0.1229, + "rewards/chosen": 1.9031244277954102, + "rewards/margins": 5.478665606180827, + "rewards/rejected": -3.5755411783854165, + "step": 411 + }, + { + "epoch": 0.03764275925079945, + "grad_norm": 11.4375, + "kl": 0.0, + "learning_rate": 9.968428675226714e-06, + "logits/chosen": 663810611.2, + "logits/rejected": 351010346.6666667, + "logps/chosen": -332.41123046875, + "logps/rejected": -426.689697265625, + "loss": 0.0746, + "rewards/chosen": 2.020363998413086, + "rewards/margins": 10.525888315836589, + "rewards/rejected": -8.505524317423502, + "step": 412 + }, + { + "epoch": 0.0377341251713111, + "grad_norm": 9.0, + "kl": 0.0, + "learning_rate": 9.968267149818745e-06, + "logits/chosen": 544717397.3333334, + "logits/rejected": 409206016.0, + "logps/chosen": -214.5260009765625, + "logps/rejected": -426.344677734375, + "loss": 0.0508, + "rewards/chosen": 2.600026766459147, + "rewards/margins": 8.438297335306803, + "rewards/rejected": -5.838270568847657, + "step": 413 + }, + { + "epoch": 0.037825491091822754, + "grad_norm": 6.6875, + "kl": 0.0, + "learning_rate": 9.968105213581115e-06, + "logits/chosen": 558561450.6666666, + "logits/rejected": 710805888.0, + "logps/chosen": -411.6864013671875, + "logps/rejected": -790.3692626953125, + "loss": 0.0443, + "rewards/chosen": 3.368810017903646, + "rewards/margins": 11.265937169392904, + "rewards/rejected": -7.897127151489258, + "step": 414 + }, + { + "epoch": 0.0379168570123344, + "grad_norm": 3.234375, + "kl": 0.0, + "learning_rate": 9.967942866527214e-06, + "logits/chosen": 473914112.0, + "logits/rejected": 445622579.2, + "logps/chosen": -262.068603515625, + "logps/rejected": -545.9302734375, + "loss": 0.0178, + "rewards/chosen": 3.7789624532063804, + "rewards/margins": 11.680670674641927, + "rewards/rejected": -7.9017082214355465, + "step": 415 + }, + { + "epoch": 0.03800822293284605, + "grad_norm": 14.1875, + "kl": 0.0, + "learning_rate": 9.967780108670466e-06, + "logits/chosen": 842648490.6666666, + "logits/rejected": 2827494400.0, + "logps/chosen": -257.40944417317706, + "logps/rejected": -613.9473876953125, + "loss": 0.0886, + "rewards/chosen": 2.6940294901529946, + "rewards/margins": 9.833782831827799, + "rewards/rejected": -7.139753341674805, + "step": 416 + }, + { + "epoch": 0.0380995888533577, + "grad_norm": 10.5625, + "kl": 0.0, + "learning_rate": 9.967616940024332e-06, + "logits/chosen": 636994304.0, + "logits/rejected": 555924736.0, + "logps/chosen": -314.44020589192706, + "logps/rejected": -354.41241455078125, + "loss": 0.1435, + "rewards/chosen": 2.086979866027832, + "rewards/margins": 6.154526710510254, + "rewards/rejected": -4.067546844482422, + "step": 417 + }, + { + "epoch": 0.03819095477386935, + "grad_norm": 22.5, + "kl": 0.0, + "learning_rate": 9.967453360602302e-06, + "logits/chosen": 813562560.0, + "logits/rejected": 718956800.0, + "logps/chosen": -429.1457214355469, + "logps/rejected": -333.14792887369794, + "loss": 0.1614, + "rewards/chosen": 2.4130401611328125, + "rewards/margins": 5.78346029917399, + "rewards/rejected": -3.3704201380411782, + "step": 418 + }, + { + "epoch": 0.038282320694381, + "grad_norm": 9.8125, + "kl": 0.0, + "learning_rate": 9.967289370417904e-06, + "logits/chosen": 393800789.3333333, + "logits/rejected": 576914227.2, + "logps/chosen": -171.3343505859375, + "logps/rejected": -466.77890625, + "loss": 0.0591, + "rewards/chosen": 2.0493222872416177, + "rewards/margins": 11.012917868296304, + "rewards/rejected": -8.963595581054687, + "step": 419 + }, + { + "epoch": 0.03837368661489265, + "grad_norm": 12.625, + "kl": 0.0, + "learning_rate": 9.967124969484698e-06, + "logits/chosen": 968306278.4, + "logits/rejected": 627831509.3333334, + "logps/chosen": -476.637548828125, + "logps/rejected": -460.30126953125, + "loss": 0.0763, + "rewards/chosen": 2.382659912109375, + "rewards/margins": 7.744330724080403, + "rewards/rejected": -5.361670811971028, + "step": 420 + }, + { + "epoch": 0.0384650525354043, + "grad_norm": 8.5625, + "kl": 0.0, + "learning_rate": 9.966960157816279e-06, + "logits/chosen": 405305600.0, + "logits/rejected": 375137664.0, + "logps/chosen": -247.59255981445312, + "logps/rejected": -365.63134765625, + "loss": 0.0686, + "rewards/chosen": 2.26609468460083, + "rewards/margins": 7.813014030456543, + "rewards/rejected": -5.546919345855713, + "step": 421 + }, + { + "epoch": 0.038556418455915946, + "grad_norm": 11.25, + "kl": 0.0, + "learning_rate": 9.966794935426274e-06, + "logits/chosen": 424698197.3333333, + "logits/rejected": 130287528.0, + "logps/chosen": -228.92012532552084, + "logps/rejected": -92.16937255859375, + "loss": 0.1795, + "rewards/chosen": 2.265850385030111, + "rewards/margins": 3.5170008738835654, + "rewards/rejected": -1.2511504888534546, + "step": 422 + }, + { + "epoch": 0.038647784376427595, + "grad_norm": 4.59375, + "kl": 0.0, + "learning_rate": 9.966629302328347e-06, + "logits/chosen": 618905536.0, + "logits/rejected": 962092970.6666666, + "logps/chosen": -193.16258239746094, + "logps/rejected": -683.7861328125, + "loss": 0.0217, + "rewards/chosen": 2.8108839988708496, + "rewards/margins": 9.690160910288494, + "rewards/rejected": -6.8792769114176435, + "step": 423 + }, + { + "epoch": 0.038739150296939244, + "grad_norm": 14.875, + "kl": 0.0, + "learning_rate": 9.966463258536194e-06, + "logits/chosen": 383680704.0, + "logits/rejected": 504017728.0, + "logps/chosen": -535.4757080078125, + "logps/rejected": -383.740966796875, + "loss": 0.0942, + "rewards/chosen": 2.3796486854553223, + "rewards/margins": 5.329187870025635, + "rewards/rejected": -2.9495391845703125, + "step": 424 + }, + { + "epoch": 0.03883051621745089, + "grad_norm": 9.875, + "kl": 0.0, + "learning_rate": 9.966296804063544e-06, + "logits/chosen": 406383908.5714286, + "logits/rejected": 162542368.0, + "logps/chosen": -260.52146693638394, + "logps/rejected": -163.83056640625, + "loss": 0.0531, + "rewards/chosen": 3.1433849334716797, + "rewards/margins": 7.312851428985596, + "rewards/rejected": -4.169466495513916, + "step": 425 + }, + { + "epoch": 0.03892188213796254, + "grad_norm": 16.125, + "kl": 0.0, + "learning_rate": 9.966129938924162e-06, + "logits/chosen": 975500288.0, + "logits/rejected": 671510101.3333334, + "logps/chosen": -460.919921875, + "logps/rejected": -201.6129150390625, + "loss": 0.1451, + "rewards/chosen": 2.58934326171875, + "rewards/margins": 5.2595115025838215, + "rewards/rejected": -2.6701682408650718, + "step": 426 + }, + { + "epoch": 0.03901324805847419, + "grad_norm": 8.3125, + "kl": 0.0, + "learning_rate": 9.965962663131845e-06, + "logits/chosen": 330204117.3333333, + "logits/rejected": 339480268.8, + "logps/chosen": -147.55119832356772, + "logps/rejected": -376.58173828125, + "loss": 0.0532, + "rewards/chosen": 2.8983227411905923, + "rewards/margins": 8.316902224222819, + "rewards/rejected": -5.418579483032227, + "step": 427 + }, + { + "epoch": 0.03910461397898584, + "grad_norm": 5.28125, + "kl": 0.0, + "learning_rate": 9.96579497670043e-06, + "logits/chosen": 574802773.3333334, + "logits/rejected": 523288576.0, + "logps/chosen": -262.9764811197917, + "logps/rejected": -387.935693359375, + "loss": 0.0315, + "rewards/chosen": 3.148090680440267, + "rewards/margins": 8.761368878682454, + "rewards/rejected": -5.613278198242187, + "step": 428 + }, + { + "epoch": 0.03919597989949749, + "grad_norm": 6.71875, + "kl": 0.0, + "learning_rate": 9.965626879643776e-06, + "logits/chosen": 186489856.0, + "logits/rejected": 485281645.71428573, + "logps/chosen": -212.2392120361328, + "logps/rejected": -376.4882114955357, + "loss": 0.0354, + "rewards/chosen": 2.6790847778320312, + "rewards/margins": 7.738887241908482, + "rewards/rejected": -5.0598024640764505, + "step": 429 + }, + { + "epoch": 0.03928734582000914, + "grad_norm": 10.0625, + "kl": 0.0, + "learning_rate": 9.965458371975787e-06, + "logits/chosen": 589819861.3333334, + "logits/rejected": 1037747328.0, + "logps/chosen": -360.1795654296875, + "logps/rejected": -723.6500244140625, + "loss": 0.0653, + "rewards/chosen": 2.748180707295736, + "rewards/margins": 13.571342786153158, + "rewards/rejected": -10.823162078857422, + "step": 430 + }, + { + "epoch": 0.03937871174052079, + "grad_norm": 7.53125, + "kl": 0.0, + "learning_rate": 9.965289453710399e-06, + "logits/chosen": 560174933.3333334, + "logits/rejected": 777784115.2, + "logps/chosen": -182.4400431315104, + "logps/rejected": -435.39970703125, + "loss": 0.06, + "rewards/chosen": 2.2867377599080405, + "rewards/margins": 7.237476285298666, + "rewards/rejected": -4.950738525390625, + "step": 431 + }, + { + "epoch": 0.039470077661032436, + "grad_norm": 6.03125, + "kl": 0.0, + "learning_rate": 9.965120124861576e-06, + "logits/chosen": 365997465.6, + "logits/rejected": 566071466.6666666, + "logps/chosen": -199.24998779296874, + "logps/rejected": -493.5657145182292, + "loss": 0.0333, + "rewards/chosen": 3.3323204040527346, + "rewards/margins": 9.895077896118163, + "rewards/rejected": -6.56275749206543, + "step": 432 + }, + { + "epoch": 0.039561443581544085, + "grad_norm": 3.15625, + "kl": 0.0, + "learning_rate": 9.964950385443322e-06, + "logits/chosen": 1041546956.8, + "logits/rejected": 614752426.6666666, + "logps/chosen": -428.989111328125, + "logps/rejected": -468.0006103515625, + "loss": 0.0178, + "rewards/chosen": 3.9963607788085938, + "rewards/margins": 9.6002623240153, + "rewards/rejected": -5.603901545206706, + "step": 433 + }, + { + "epoch": 0.039652809502055734, + "grad_norm": 19.0, + "kl": 0.7364425659179688, + "learning_rate": 9.96478023546967e-06, + "logits/chosen": 492523178.6666667, + "logits/rejected": 564542080.0, + "logps/chosen": -305.77927652994794, + "logps/rejected": -590.5647583007812, + "loss": 0.1552, + "rewards/chosen": 1.6766220728556316, + "rewards/margins": 8.539475599924723, + "rewards/rejected": -6.862853527069092, + "step": 434 + }, + { + "epoch": 0.039744175422567384, + "grad_norm": 10.375, + "kl": 0.0, + "learning_rate": 9.964609674954696e-06, + "logits/chosen": 538751360.0, + "logits/rejected": 659885696.0, + "logps/chosen": -270.71234130859375, + "logps/rejected": -486.192138671875, + "loss": 0.059, + "rewards/chosen": 2.500123977661133, + "rewards/margins": 7.1707658767700195, + "rewards/rejected": -4.670641899108887, + "step": 435 + }, + { + "epoch": 0.03983554134307903, + "grad_norm": 11.375, + "kl": 0.0, + "learning_rate": 9.964438703912499e-06, + "logits/chosen": 365865429.3333333, + "logits/rejected": 455606976.0, + "logps/chosen": -262.326171875, + "logps/rejected": -616.6845092773438, + "loss": 0.078, + "rewards/chosen": 2.772298812866211, + "rewards/margins": 12.355525016784668, + "rewards/rejected": -9.583226203918457, + "step": 436 + }, + { + "epoch": 0.03992690726359068, + "grad_norm": 4.3125, + "kl": 0.0, + "learning_rate": 9.964267322357216e-06, + "logits/chosen": 598449536.0, + "logits/rejected": 279803776.0, + "logps/chosen": -386.5675455729167, + "logps/rejected": -383.1189453125, + "loss": 0.0235, + "rewards/chosen": 3.365642229715983, + "rewards/margins": 9.117941729227702, + "rewards/rejected": -5.752299499511719, + "step": 437 + }, + { + "epoch": 0.04001827318410233, + "grad_norm": 10.0625, + "kl": 0.0, + "learning_rate": 9.964095530303021e-06, + "logits/chosen": 400836736.0, + "logits/rejected": 383938912.0, + "logps/chosen": -298.9263916015625, + "logps/rejected": -499.7379150390625, + "loss": 0.0752, + "rewards/chosen": 2.7064194679260254, + "rewards/margins": 7.826530933380127, + "rewards/rejected": -5.120111465454102, + "step": 438 + }, + { + "epoch": 0.04010963910461398, + "grad_norm": 22.625, + "kl": 0.0, + "learning_rate": 9.96392332776412e-06, + "logits/chosen": 319029888.0, + "logits/rejected": 411734816.0, + "logps/chosen": -276.97283935546875, + "logps/rejected": -370.23687744140625, + "loss": 0.2063, + "rewards/chosen": 1.8589584827423096, + "rewards/margins": 5.405968904495239, + "rewards/rejected": -3.5470104217529297, + "step": 439 + }, + { + "epoch": 0.04020100502512563, + "grad_norm": 13.9375, + "kl": 0.0, + "learning_rate": 9.963750714754752e-06, + "logits/chosen": 735158144.0, + "logits/rejected": 671379584.0, + "logps/chosen": -362.61651611328125, + "logps/rejected": -376.7789306640625, + "loss": 0.0762, + "rewards/chosen": 2.742453098297119, + "rewards/margins": 7.688799858093262, + "rewards/rejected": -4.946346759796143, + "step": 440 + }, + { + "epoch": 0.04029237094563728, + "grad_norm": 7.71875, + "kl": 0.0, + "learning_rate": 9.963577691289188e-06, + "logits/chosen": 682779136.0, + "logits/rejected": 706068480.0, + "logps/chosen": -294.40565999348956, + "logps/rejected": -519.320458984375, + "loss": 0.0334, + "rewards/chosen": 4.492364247639974, + "rewards/margins": 10.027663167317709, + "rewards/rejected": -5.535298919677734, + "step": 441 + }, + { + "epoch": 0.04038373686614893, + "grad_norm": 6.8125, + "kl": 0.0, + "learning_rate": 9.96340425738174e-06, + "logits/chosen": 482228309.3333333, + "logits/rejected": 485672512.0, + "logps/chosen": -288.80674235026044, + "logps/rejected": -408.5319519042969, + "loss": 0.0445, + "rewards/chosen": 3.0997740427652993, + "rewards/margins": 9.317440668741861, + "rewards/rejected": -6.2176666259765625, + "step": 442 + }, + { + "epoch": 0.040475102786660576, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 9.963230413046745e-06, + "logits/chosen": 704129322.6666666, + "logits/rejected": 612430387.2, + "logps/chosen": -238.75740559895834, + "logps/rejected": -477.269677734375, + "loss": 0.0789, + "rewards/chosen": 1.9765226046244304, + "rewards/margins": 8.322679297129312, + "rewards/rejected": -6.3461566925048825, + "step": 443 + }, + { + "epoch": 0.040566468707172225, + "grad_norm": 10.75, + "kl": 0.0, + "learning_rate": 9.96305615829858e-06, + "logits/chosen": 673263829.3333334, + "logits/rejected": 803841638.4, + "logps/chosen": -335.98781331380206, + "logps/rejected": -470.00888671875, + "loss": 0.0921, + "rewards/chosen": 1.9429284731547039, + "rewards/margins": 7.864547125498454, + "rewards/rejected": -5.92161865234375, + "step": 444 + }, + { + "epoch": 0.040657834627683874, + "grad_norm": 9.6875, + "kl": 0.0, + "learning_rate": 9.962881493151654e-06, + "logits/chosen": 553649088.0, + "logits/rejected": 455175808.0, + "logps/chosen": -248.7276153564453, + "logps/rejected": -387.7765706380208, + "loss": 0.0511, + "rewards/chosen": 3.815446138381958, + "rewards/margins": 9.949640830357868, + "rewards/rejected": -6.134194691975911, + "step": 445 + }, + { + "epoch": 0.04074920054819552, + "grad_norm": 8.5, + "kl": 0.0, + "learning_rate": 9.962706417620413e-06, + "logits/chosen": 421652480.0, + "logits/rejected": 741863424.0, + "logps/chosen": -306.072314453125, + "logps/rejected": -501.6575113932292, + "loss": 0.0568, + "rewards/chosen": 2.9426557540893556, + "rewards/margins": 7.83377316792806, + "rewards/rejected": -4.891117413838704, + "step": 446 + }, + { + "epoch": 0.04084056646870717, + "grad_norm": 8.25, + "kl": 0.0, + "learning_rate": 9.96253093171933e-06, + "logits/chosen": 363739264.0, + "logits/rejected": 321467136.0, + "logps/chosen": -219.6589813232422, + "logps/rejected": -289.6287841796875, + "loss": 0.0621, + "rewards/chosen": 2.273138999938965, + "rewards/margins": 7.094203948974609, + "rewards/rejected": -4.8210649490356445, + "step": 447 + }, + { + "epoch": 0.04093193238921882, + "grad_norm": 13.875, + "kl": 0.0, + "learning_rate": 9.962355035462918e-06, + "logits/chosen": 447132864.0, + "logits/rejected": 620694464.0, + "logps/chosen": -177.53355407714844, + "logps/rejected": -659.1767578125, + "loss": 0.0969, + "rewards/chosen": 1.9726756811141968, + "rewards/margins": 9.492874503135681, + "rewards/rejected": -7.520198822021484, + "step": 448 + }, + { + "epoch": 0.04102329830973047, + "grad_norm": 5.0625, + "kl": 0.0, + "learning_rate": 9.962178728865723e-06, + "logits/chosen": 1100851456.0, + "logits/rejected": 593612224.0, + "logps/chosen": -503.36004638671875, + "logps/rejected": -446.78558349609375, + "loss": 0.0241, + "rewards/chosen": 3.7517902851104736, + "rewards/margins": 11.390796899795532, + "rewards/rejected": -7.639006614685059, + "step": 449 + }, + { + "epoch": 0.04111466423024212, + "grad_norm": 16.5, + "kl": 0.0, + "learning_rate": 9.962002011942322e-06, + "logits/chosen": 454844620.8, + "logits/rejected": 427056810.6666667, + "logps/chosen": -401.083056640625, + "logps/rejected": -429.4147135416667, + "loss": 0.0926, + "rewards/chosen": 2.9130836486816407, + "rewards/margins": 9.489551798502605, + "rewards/rejected": -6.576468149820964, + "step": 450 + }, + { + "epoch": 0.04120603015075377, + "grad_norm": 12.3125, + "kl": 0.0, + "learning_rate": 9.961824884707329e-06, + "logits/chosen": 481169578.6666667, + "logits/rejected": 270678220.8, + "logps/chosen": -318.80674235026044, + "logps/rejected": -327.77734375, + "loss": 0.0752, + "rewards/chosen": 2.2744455337524414, + "rewards/margins": 6.570707893371582, + "rewards/rejected": -4.29626235961914, + "step": 451 + }, + { + "epoch": 0.04129739607126542, + "grad_norm": 7.8125, + "kl": 0.0, + "learning_rate": 9.96164734717539e-06, + "logits/chosen": 760585728.0, + "logits/rejected": 600138560.0, + "logps/chosen": -441.20692661830356, + "logps/rejected": -704.218505859375, + "loss": 0.0541, + "rewards/chosen": 2.9167837415422713, + "rewards/margins": 9.507719925471715, + "rewards/rejected": -6.590936183929443, + "step": 452 + }, + { + "epoch": 0.041388761991777066, + "grad_norm": 4.78125, + "kl": 0.0, + "learning_rate": 9.961469399361186e-06, + "logits/chosen": 278535552.0, + "logits/rejected": 443887564.8, + "logps/chosen": -343.98828125, + "logps/rejected": -549.912646484375, + "loss": 0.0243, + "rewards/chosen": 3.5921427408854165, + "rewards/margins": 12.035271708170573, + "rewards/rejected": -8.443128967285157, + "step": 453 + }, + { + "epoch": 0.041480127912288715, + "grad_norm": 7.96875, + "kl": 0.0, + "learning_rate": 9.96129104127943e-06, + "logits/chosen": 478925004.8, + "logits/rejected": 287541354.6666667, + "logps/chosen": -287.3760009765625, + "logps/rejected": -338.181396484375, + "loss": 0.0481, + "rewards/chosen": 2.816733551025391, + "rewards/margins": 8.808192443847656, + "rewards/rejected": -5.991458892822266, + "step": 454 + }, + { + "epoch": 0.041571493832800364, + "grad_norm": 13.25, + "kl": 0.0, + "learning_rate": 9.961112272944874e-06, + "logits/chosen": 677714112.0, + "logits/rejected": 444441563.4285714, + "logps/chosen": -437.69976806640625, + "logps/rejected": -548.81103515625, + "loss": 0.0449, + "rewards/chosen": 2.5388429164886475, + "rewards/margins": 11.018979242869786, + "rewards/rejected": -8.480136326381139, + "step": 455 + }, + { + "epoch": 0.04166285975331201, + "grad_norm": 9.8125, + "kl": 0.0, + "learning_rate": 9.960933094372298e-06, + "logits/chosen": 807907776.0, + "logits/rejected": 318226560.0, + "logps/chosen": -642.8145751953125, + "logps/rejected": -368.3934631347656, + "loss": 0.0664, + "rewards/chosen": 2.4204134941101074, + "rewards/margins": 7.846241474151611, + "rewards/rejected": -5.425827980041504, + "step": 456 + }, + { + "epoch": 0.04175422567382366, + "grad_norm": 17.875, + "kl": 0.0, + "learning_rate": 9.960753505576522e-06, + "logits/chosen": 378825600.0, + "logits/rejected": 499016320.0, + "logps/chosen": -292.47711181640625, + "logps/rejected": -459.6995849609375, + "loss": 0.0743, + "rewards/chosen": 2.4630277156829834, + "rewards/margins": 7.9936840534210205, + "rewards/rejected": -5.530656337738037, + "step": 457 + }, + { + "epoch": 0.04184559159433531, + "grad_norm": 10.6875, + "kl": 0.0, + "learning_rate": 9.960573506572391e-06, + "logits/chosen": 829776469.3333334, + "logits/rejected": 810391552.0, + "logps/chosen": -435.0302327473958, + "logps/rejected": -664.6912109375, + "loss": 0.0563, + "rewards/chosen": 1.7652816772460938, + "rewards/margins": 12.075033569335938, + "rewards/rejected": -10.309751892089844, + "step": 458 + }, + { + "epoch": 0.04193695751484696, + "grad_norm": 15.0, + "kl": 0.0, + "learning_rate": 9.96039309737479e-06, + "logits/chosen": 376441173.3333333, + "logits/rejected": 355738496.0, + "logps/chosen": -266.9126790364583, + "logps/rejected": -390.98271484375, + "loss": 0.0878, + "rewards/chosen": 3.1274534861246743, + "rewards/margins": 8.902973047892253, + "rewards/rejected": -5.775519561767578, + "step": 459 + }, + { + "epoch": 0.04202832343535861, + "grad_norm": 5.34375, + "kl": 0.0, + "learning_rate": 9.960212277998642e-06, + "logits/chosen": 544915660.8, + "logits/rejected": 326412992.0, + "logps/chosen": -256.0255859375, + "logps/rejected": -322.7209879557292, + "loss": 0.0313, + "rewards/chosen": 3.512289810180664, + "rewards/margins": 10.184216944376628, + "rewards/rejected": -6.671927134195964, + "step": 460 + }, + { + "epoch": 0.04211968935587026, + "grad_norm": 15.75, + "kl": 0.0, + "learning_rate": 9.960031048458895e-06, + "logits/chosen": 440619648.0, + "logits/rejected": 696053418.6666666, + "logps/chosen": -298.78082275390625, + "logps/rejected": -400.3189290364583, + "loss": 0.1469, + "rewards/chosen": 1.4050788879394531, + "rewards/margins": 6.482152938842773, + "rewards/rejected": -5.07707405090332, + "step": 461 + }, + { + "epoch": 0.04221105527638191, + "grad_norm": 14.4375, + "kl": 0.0, + "learning_rate": 9.959849408770536e-06, + "logits/chosen": 293508437.3333333, + "logits/rejected": 260800928.0, + "logps/chosen": -198.97102864583334, + "logps/rejected": -429.10406494140625, + "loss": 0.1446, + "rewards/chosen": 2.115100542704264, + "rewards/margins": 7.46792189280192, + "rewards/rejected": -5.352821350097656, + "step": 462 + }, + { + "epoch": 0.04230242119689356, + "grad_norm": 7.84375, + "kl": 0.0, + "learning_rate": 9.959667358948584e-06, + "logits/chosen": 699914624.0, + "logits/rejected": 516472896.0, + "logps/chosen": -416.44952392578125, + "logps/rejected": -618.4439697265625, + "loss": 0.0448, + "rewards/chosen": 2.4197444915771484, + "rewards/margins": 10.030881404876709, + "rewards/rejected": -7.6111369132995605, + "step": 463 + }, + { + "epoch": 0.042393787117405206, + "grad_norm": 1.515625, + "kl": 0.0, + "learning_rate": 9.959484899008095e-06, + "logits/chosen": 223590304.0, + "logits/rejected": 387052842.6666667, + "logps/chosen": -236.02920532226562, + "logps/rejected": -433.8243815104167, + "loss": 0.0066, + "rewards/chosen": 3.9336061477661133, + "rewards/margins": 11.746715863545734, + "rewards/rejected": -7.813109715779622, + "step": 464 + }, + { + "epoch": 0.042485153037916855, + "grad_norm": 15.0625, + "kl": 0.0, + "learning_rate": 9.959302028964155e-06, + "logits/chosen": 494453760.0, + "logits/rejected": 641850624.0, + "logps/chosen": -272.92588588169644, + "logps/rejected": -558.3563232421875, + "loss": 0.1229, + "rewards/chosen": 2.6651333400181363, + "rewards/margins": 11.418820244925364, + "rewards/rejected": -8.753686904907227, + "step": 465 + }, + { + "epoch": 0.042576518958428504, + "grad_norm": 14.6875, + "kl": 0.0, + "learning_rate": 9.959118748831886e-06, + "logits/chosen": 426348501.3333333, + "logits/rejected": 308676736.0, + "logps/chosen": -397.244384765625, + "logps/rejected": -310.43463134765625, + "loss": 0.0698, + "rewards/chosen": 2.741734504699707, + "rewards/margins": 9.317000389099121, + "rewards/rejected": -6.575265884399414, + "step": 466 + }, + { + "epoch": 0.04266788487894015, + "grad_norm": 7.9375, + "kl": 0.0, + "learning_rate": 9.958935058626445e-06, + "logits/chosen": 787653222.4, + "logits/rejected": 624582912.0, + "logps/chosen": -496.43056640625, + "logps/rejected": -350.8994140625, + "loss": 0.0424, + "rewards/chosen": 3.0334625244140625, + "rewards/margins": 9.97634442647298, + "rewards/rejected": -6.942881902058919, + "step": 467 + }, + { + "epoch": 0.0427592507994518, + "grad_norm": 15.875, + "kl": 0.0, + "learning_rate": 9.95875095836302e-06, + "logits/chosen": 574066560.0, + "logits/rejected": 664794816.0, + "logps/chosen": -308.32073974609375, + "logps/rejected": -657.321044921875, + "loss": 0.1216, + "rewards/chosen": 1.2656548023223877, + "rewards/margins": 10.501914262771606, + "rewards/rejected": -9.236259460449219, + "step": 468 + }, + { + "epoch": 0.04285061671996345, + "grad_norm": 10.75, + "kl": 0.0, + "learning_rate": 9.958566448056833e-06, + "logits/chosen": 383995808.0, + "logits/rejected": 667580928.0, + "logps/chosen": -372.1829833984375, + "logps/rejected": -232.80172729492188, + "loss": 0.0806, + "rewards/chosen": 2.3532676696777344, + "rewards/margins": 7.535993576049805, + "rewards/rejected": -5.18272590637207, + "step": 469 + }, + { + "epoch": 0.0429419826404751, + "grad_norm": 10.5625, + "kl": 0.0, + "learning_rate": 9.958381527723141e-06, + "logits/chosen": 767835328.0, + "logits/rejected": 725277952.0, + "logps/chosen": -448.1646728515625, + "logps/rejected": -608.7476806640625, + "loss": 0.0566, + "rewards/chosen": 2.6820075511932373, + "rewards/margins": 8.584937810897827, + "rewards/rejected": -5.90293025970459, + "step": 470 + }, + { + "epoch": 0.04303334856098675, + "grad_norm": 6.3125, + "kl": 0.0, + "learning_rate": 9.958196197377242e-06, + "logits/chosen": 538027712.0, + "logits/rejected": 418823082.6666667, + "logps/chosen": -302.69976806640625, + "logps/rejected": -441.5118815104167, + "loss": 0.0356, + "rewards/chosen": 2.169598340988159, + "rewards/margins": 6.936396996180217, + "rewards/rejected": -4.766798655192058, + "step": 471 + }, + { + "epoch": 0.0431247144814984, + "grad_norm": 11.3125, + "kl": 0.0, + "learning_rate": 9.958010457034454e-06, + "logits/chosen": 457289856.0, + "logits/rejected": 674330304.0, + "logps/chosen": -293.077392578125, + "logps/rejected": -767.6080322265625, + "loss": 0.0794, + "rewards/chosen": 1.8737332820892334, + "rewards/margins": 10.621721029281616, + "rewards/rejected": -8.747987747192383, + "step": 472 + }, + { + "epoch": 0.04321608040201005, + "grad_norm": 10.9375, + "kl": 0.0, + "learning_rate": 9.957824306710138e-06, + "logits/chosen": 285332352.0, + "logits/rejected": 534573653.3333333, + "logps/chosen": -269.6838134765625, + "logps/rejected": -463.3739420572917, + "loss": 0.0877, + "rewards/chosen": 2.661249351501465, + "rewards/margins": 7.868107032775879, + "rewards/rejected": -5.206857681274414, + "step": 473 + }, + { + "epoch": 0.043307446322521696, + "grad_norm": 9.125, + "kl": 0.0, + "learning_rate": 9.957637746419686e-06, + "logits/chosen": 485553024.0, + "logits/rejected": 242121952.0, + "logps/chosen": -272.66986083984375, + "logps/rejected": -280.09173583984375, + "loss": 0.056, + "rewards/chosen": 2.5029282569885254, + "rewards/margins": 7.441773891448975, + "rewards/rejected": -4.938845634460449, + "step": 474 + }, + { + "epoch": 0.043398812243033345, + "grad_norm": 10.4375, + "kl": 0.0, + "learning_rate": 9.957450776178528e-06, + "logits/chosen": 671790336.0, + "logits/rejected": 794785408.0, + "logps/chosen": -297.91668701171875, + "logps/rejected": -384.800048828125, + "loss": 0.0703, + "rewards/chosen": 2.8692104816436768, + "rewards/margins": 7.620692491531372, + "rewards/rejected": -4.751482009887695, + "step": 475 + }, + { + "epoch": 0.043490178163544994, + "grad_norm": 6.84375, + "kl": 0.0, + "learning_rate": 9.957263396002122e-06, + "logits/chosen": 425431808.0, + "logits/rejected": 347159168.0, + "logps/chosen": -268.9737955729167, + "logps/rejected": -456.60626220703125, + "loss": 0.0527, + "rewards/chosen": 3.000825564066569, + "rewards/margins": 8.40148655573527, + "rewards/rejected": -5.400660991668701, + "step": 476 + }, + { + "epoch": 0.04358154408405664, + "grad_norm": 4.625, + "kl": 0.0, + "learning_rate": 9.957075605905962e-06, + "logits/chosen": 462915136.0, + "logits/rejected": 351713536.0, + "logps/chosen": -289.2113037109375, + "logps/rejected": -276.4743957519531, + "loss": 0.0334, + "rewards/chosen": 2.759871244430542, + "rewards/margins": 9.299938440322876, + "rewards/rejected": -6.540067195892334, + "step": 477 + }, + { + "epoch": 0.0436729100045683, + "grad_norm": 5.96875, + "kl": 0.0, + "learning_rate": 9.95688740590558e-06, + "logits/chosen": 538615859.2, + "logits/rejected": 1147001856.0, + "logps/chosen": -375.9861328125, + "logps/rejected": -875.6643880208334, + "loss": 0.0308, + "rewards/chosen": 3.353385162353516, + "rewards/margins": 17.049226633707683, + "rewards/rejected": -13.695841471354166, + "step": 478 + }, + { + "epoch": 0.04376427592507995, + "grad_norm": 6.25, + "kl": 0.0, + "learning_rate": 9.956698796016534e-06, + "logits/chosen": 474777292.8, + "logits/rejected": 555852288.0, + "logps/chosen": -423.58876953125, + "logps/rejected": -424.0211995442708, + "loss": 0.0373, + "rewards/chosen": 3.126592445373535, + "rewards/margins": 7.527373123168945, + "rewards/rejected": -4.40078067779541, + "step": 479 + }, + { + "epoch": 0.0438556418455916, + "grad_norm": 1.5859375, + "kl": 0.0, + "learning_rate": 9.956509776254423e-06, + "logits/chosen": 824336320.0, + "logits/rejected": 642059264.0, + "logps/chosen": -272.22637939453125, + "logps/rejected": -470.13668387276783, + "loss": 0.0089, + "rewards/chosen": 3.9360382556915283, + "rewards/margins": 10.805856806891306, + "rewards/rejected": -6.869818551199777, + "step": 480 + }, + { + "epoch": 0.04394700776610325, + "grad_norm": 8.25, + "kl": 0.0, + "learning_rate": 9.956320346634877e-06, + "logits/chosen": 531395686.4, + "logits/rejected": 596272042.6666666, + "logps/chosen": -301.456201171875, + "logps/rejected": -791.3513997395834, + "loss": 0.0408, + "rewards/chosen": 3.167926025390625, + "rewards/margins": 10.97296765645345, + "rewards/rejected": -7.805041631062825, + "step": 481 + }, + { + "epoch": 0.044038373686614896, + "grad_norm": 5.625, + "kl": 0.0, + "learning_rate": 9.956130507173559e-06, + "logits/chosen": 556958208.0, + "logits/rejected": 532717994.6666667, + "logps/chosen": -463.73907470703125, + "logps/rejected": -331.87021891276044, + "loss": 0.0309, + "rewards/chosen": 2.476632595062256, + "rewards/margins": 8.198990027109783, + "rewards/rejected": -5.722357432047526, + "step": 482 + }, + { + "epoch": 0.044129739607126545, + "grad_norm": 7.28125, + "kl": 0.0, + "learning_rate": 9.955940257886169e-06, + "logits/chosen": 772684800.0, + "logits/rejected": 933340586.6666666, + "logps/chosen": -470.7032165527344, + "logps/rejected": -389.2923990885417, + "loss": 0.0364, + "rewards/chosen": 2.9628419876098633, + "rewards/margins": 9.24402077992757, + "rewards/rejected": -6.281178792317708, + "step": 483 + }, + { + "epoch": 0.044221105527638194, + "grad_norm": 8.8125, + "kl": 0.0, + "learning_rate": 9.955749598788435e-06, + "logits/chosen": 593037824.0, + "logits/rejected": 766244224.0, + "logps/chosen": -331.7952880859375, + "logps/rejected": -368.2113037109375, + "loss": 0.06, + "rewards/chosen": 2.796879768371582, + "rewards/margins": 8.011760234832764, + "rewards/rejected": -5.214880466461182, + "step": 484 + }, + { + "epoch": 0.04431247144814984, + "grad_norm": 10.3125, + "kl": 0.0, + "learning_rate": 9.955558529896128e-06, + "logits/chosen": 644486144.0, + "logits/rejected": 411705888.0, + "logps/chosen": -231.30806477864584, + "logps/rejected": -492.02490234375, + "loss": 0.0525, + "rewards/chosen": 3.1022752126057944, + "rewards/margins": 13.924760182698568, + "rewards/rejected": -10.822484970092773, + "step": 485 + }, + { + "epoch": 0.04440383736866149, + "grad_norm": 12.1875, + "kl": 0.0, + "learning_rate": 9.955367051225044e-06, + "logits/chosen": 406990336.0, + "logits/rejected": 412725862.4, + "logps/chosen": -208.12837727864584, + "logps/rejected": -265.0514892578125, + "loss": 0.0796, + "rewards/chosen": 2.025950272878011, + "rewards/margins": 8.836964066823324, + "rewards/rejected": -6.811013793945312, + "step": 486 + }, + { + "epoch": 0.04449520328917314, + "grad_norm": 2.46875, + "kl": 0.0, + "learning_rate": 9.955175162791016e-06, + "logits/chosen": 348971861.3333333, + "logits/rejected": 469166387.2, + "logps/chosen": -309.90622965494794, + "logps/rejected": -457.519970703125, + "loss": 0.012, + "rewards/chosen": 3.606424649556478, + "rewards/margins": 10.128904660542807, + "rewards/rejected": -6.522480010986328, + "step": 487 + }, + { + "epoch": 0.04458656920968479, + "grad_norm": 7.6875, + "kl": 0.0, + "learning_rate": 9.954982864609914e-06, + "logits/chosen": 760806912.0, + "logits/rejected": 520967232.0, + "logps/chosen": -360.4314270019531, + "logps/rejected": -430.07049560546875, + "loss": 0.0439, + "rewards/chosen": 2.9192628860473633, + "rewards/margins": 11.603602409362793, + "rewards/rejected": -8.68433952331543, + "step": 488 + }, + { + "epoch": 0.04467793513019644, + "grad_norm": 10.25, + "kl": 0.0, + "learning_rate": 9.954790156697639e-06, + "logits/chosen": 511633024.0, + "logits/rejected": 505100256.0, + "logps/chosen": -414.31158447265625, + "logps/rejected": -412.88800048828125, + "loss": 0.0698, + "rewards/chosen": 1.9077975749969482, + "rewards/margins": 8.450485467910767, + "rewards/rejected": -6.542687892913818, + "step": 489 + }, + { + "epoch": 0.04476930105070809, + "grad_norm": 10.75, + "kl": 0.0, + "learning_rate": 9.954597039070121e-06, + "logits/chosen": 503133513.14285713, + "logits/rejected": 711640448.0, + "logps/chosen": -281.40359933035717, + "logps/rejected": -822.1566162109375, + "loss": 0.0795, + "rewards/chosen": 3.2411507197788785, + "rewards/margins": 12.20041642870222, + "rewards/rejected": -8.95926570892334, + "step": 490 + }, + { + "epoch": 0.04486066697121974, + "grad_norm": 18.25, + "kl": 0.0, + "learning_rate": 9.954403511743338e-06, + "logits/chosen": 372321766.4, + "logits/rejected": 761713152.0, + "logps/chosen": -269.558251953125, + "logps/rejected": -520.503662109375, + "loss": 0.1228, + "rewards/chosen": 3.069605827331543, + "rewards/margins": 5.56490151087443, + "rewards/rejected": -2.495295683542887, + "step": 491 + }, + { + "epoch": 0.044952032891731386, + "grad_norm": 14.5625, + "kl": 0.0, + "learning_rate": 9.954209574733286e-06, + "logits/chosen": 592741888.0, + "logits/rejected": 894066816.0, + "logps/chosen": -326.2287292480469, + "logps/rejected": -553.9278564453125, + "loss": 0.1193, + "rewards/chosen": 2.0634193420410156, + "rewards/margins": 10.20921802520752, + "rewards/rejected": -8.145798683166504, + "step": 492 + }, + { + "epoch": 0.045043398812243035, + "grad_norm": 6.34375, + "kl": 0.0, + "learning_rate": 9.954015228056004e-06, + "logits/chosen": 445784661.3333333, + "logits/rejected": 327408768.0, + "logps/chosen": -266.31211344401044, + "logps/rejected": -313.82333984375, + "loss": 0.1213, + "rewards/chosen": 2.895075480143229, + "rewards/margins": 7.2390688578287765, + "rewards/rejected": -4.343993377685547, + "step": 493 + }, + { + "epoch": 0.045134764732754684, + "grad_norm": 5.25, + "kl": 0.0, + "learning_rate": 9.953820471727564e-06, + "logits/chosen": 588096554.6666666, + "logits/rejected": 690538803.2, + "logps/chosen": -239.12076822916666, + "logps/rejected": -367.2919921875, + "loss": 0.1134, + "rewards/chosen": 3.4944235483805337, + "rewards/margins": 8.359754435221355, + "rewards/rejected": -4.8653308868408205, + "step": 494 + }, + { + "epoch": 0.04522613065326633, + "grad_norm": 8.625, + "kl": 0.0, + "learning_rate": 9.953625305764068e-06, + "logits/chosen": 568122214.4, + "logits/rejected": 721397589.3333334, + "logps/chosen": -310.4679931640625, + "logps/rejected": -432.6158447265625, + "loss": 0.0529, + "rewards/chosen": 2.585715675354004, + "rewards/margins": 8.19171593983968, + "rewards/rejected": -5.606000264485677, + "step": 495 + }, + { + "epoch": 0.04531749657377798, + "grad_norm": 8.0625, + "kl": 0.0, + "learning_rate": 9.953429730181653e-06, + "logits/chosen": 334437013.3333333, + "logits/rejected": 428747059.2, + "logps/chosen": -168.3781941731771, + "logps/rejected": -607.9896484375, + "loss": 0.0338, + "rewards/chosen": 3.3930622736612954, + "rewards/margins": 10.982468096415202, + "rewards/rejected": -7.589405822753906, + "step": 496 + }, + { + "epoch": 0.04540886249428963, + "grad_norm": 13.625, + "kl": 0.0, + "learning_rate": 9.953233744996497e-06, + "logits/chosen": 399582500.5714286, + "logits/rejected": 472332480.0, + "logps/chosen": -257.0991734095982, + "logps/rejected": -534.724365234375, + "loss": 0.1105, + "rewards/chosen": 2.465202604021345, + "rewards/margins": 7.943589959825788, + "rewards/rejected": -5.478387355804443, + "step": 497 + }, + { + "epoch": 0.04550022841480128, + "grad_norm": 7.5, + "kl": 0.0, + "learning_rate": 9.953037350224804e-06, + "logits/chosen": 703260006.4, + "logits/rejected": 576728106.6666666, + "logps/chosen": -287.70234375, + "logps/rejected": -572.1658121744791, + "loss": 0.0379, + "rewards/chosen": 3.393415832519531, + "rewards/margins": 8.016722106933594, + "rewards/rejected": -4.6233062744140625, + "step": 498 + }, + { + "epoch": 0.04559159433531293, + "grad_norm": 15.0, + "kl": 0.0, + "learning_rate": 9.952840545882812e-06, + "logits/chosen": 288496213.3333333, + "logits/rejected": 803060224.0, + "logps/chosen": -164.15109252929688, + "logps/rejected": -494.672119140625, + "loss": 0.1482, + "rewards/chosen": 3.003107706705729, + "rewards/margins": 7.643323008219401, + "rewards/rejected": -4.640215301513672, + "step": 499 + }, + { + "epoch": 0.04568296025582458, + "grad_norm": 5.40625, + "kl": 0.0, + "learning_rate": 9.952643331986794e-06, + "logits/chosen": 537673813.3333334, + "logits/rejected": 665484800.0, + "logps/chosen": -256.75046793619794, + "logps/rejected": -288.33359375, + "loss": 0.0307, + "rewards/chosen": 2.7080370585123696, + "rewards/margins": 9.103922526041666, + "rewards/rejected": -6.395885467529297, + "step": 500 + }, + { + "epoch": 0.04577432617633623, + "grad_norm": 27.25, + "kl": 0.0, + "learning_rate": 9.952445708553062e-06, + "logits/chosen": 357337941.3333333, + "logits/rejected": 841555251.2, + "logps/chosen": -407.2791748046875, + "logps/rejected": -546.1306640625, + "loss": 0.0802, + "rewards/chosen": 1.7885931332906086, + "rewards/margins": 8.83680861790975, + "rewards/rejected": -7.04821548461914, + "step": 501 + }, + { + "epoch": 0.045865692096847877, + "grad_norm": 7.90625, + "kl": 0.0, + "learning_rate": 9.952247675597954e-06, + "logits/chosen": 369003136.0, + "logits/rejected": 388837717.3333333, + "logps/chosen": -238.657470703125, + "logps/rejected": -290.39796956380206, + "loss": 0.0558, + "rewards/chosen": 2.931431198120117, + "rewards/margins": 7.742444737752278, + "rewards/rejected": -4.811013539632161, + "step": 502 + }, + { + "epoch": 0.045957058017359526, + "grad_norm": 11.875, + "kl": 0.0, + "learning_rate": 9.952049233137848e-06, + "logits/chosen": 572349525.3333334, + "logits/rejected": 320357344.0, + "logps/chosen": -330.0891927083333, + "logps/rejected": -277.71697998046875, + "loss": 0.0772, + "rewards/chosen": 2.97281805674235, + "rewards/margins": 6.836643377939859, + "rewards/rejected": -3.8638253211975098, + "step": 503 + }, + { + "epoch": 0.046048423937871175, + "grad_norm": 4.28125, + "kl": 0.0, + "learning_rate": 9.951850381189152e-06, + "logits/chosen": 516788326.4, + "logits/rejected": 736038229.3333334, + "logps/chosen": -315.927294921875, + "logps/rejected": -366.8690999348958, + "loss": 0.0307, + "rewards/chosen": 3.4348140716552735, + "rewards/margins": 8.608654594421386, + "rewards/rejected": -5.173840522766113, + "step": 504 + }, + { + "epoch": 0.046139789858382824, + "grad_norm": 7.6875, + "kl": 0.0, + "learning_rate": 9.951651119768308e-06, + "logits/chosen": 276573994.6666667, + "logits/rejected": 365936998.4, + "logps/chosen": -256.3284505208333, + "logps/rejected": -477.472412109375, + "loss": 0.0441, + "rewards/chosen": 2.9486424128214517, + "rewards/margins": 8.962324206034342, + "rewards/rejected": -6.013681793212891, + "step": 505 + }, + { + "epoch": 0.04623115577889447, + "grad_norm": 19.75, + "kl": 0.0, + "learning_rate": 9.951451448891796e-06, + "logits/chosen": 529765504.0, + "logits/rejected": 264651616.0, + "logps/chosen": -447.9591979980469, + "logps/rejected": -210.20840454101562, + "loss": 0.1728, + "rewards/chosen": 1.0910769701004028, + "rewards/margins": 5.552078366279602, + "rewards/rejected": -4.461001396179199, + "step": 506 + }, + { + "epoch": 0.04632252169940612, + "grad_norm": 16.5, + "kl": 0.0, + "learning_rate": 9.951251368576124e-06, + "logits/chosen": 590872064.0, + "logits/rejected": 628992384.0, + "logps/chosen": -491.8018391927083, + "logps/rejected": -408.07232666015625, + "loss": 0.1367, + "rewards/chosen": 2.4723377227783203, + "rewards/margins": 4.938079357147217, + "rewards/rejected": -2.4657416343688965, + "step": 507 + }, + { + "epoch": 0.04641388761991777, + "grad_norm": 5.84375, + "kl": 0.0, + "learning_rate": 9.951050878837839e-06, + "logits/chosen": 1325987413.3333333, + "logits/rejected": 773337804.8, + "logps/chosen": -457.3900553385417, + "logps/rejected": -497.8763671875, + "loss": 0.0344, + "rewards/chosen": 2.9758199055989585, + "rewards/margins": 8.961803181966147, + "rewards/rejected": -5.985983276367188, + "step": 508 + }, + { + "epoch": 0.04650525354042942, + "grad_norm": 6.875, + "kl": 0.0, + "learning_rate": 9.950849979693518e-06, + "logits/chosen": 434473676.8, + "logits/rejected": 415464362.6666667, + "logps/chosen": -281.848583984375, + "logps/rejected": -428.0063883463542, + "loss": 0.0423, + "rewards/chosen": 3.032710647583008, + "rewards/margins": 7.45271708170573, + "rewards/rejected": -4.420006434122722, + "step": 509 + }, + { + "epoch": 0.04659661946094107, + "grad_norm": 8.8125, + "kl": 0.0, + "learning_rate": 9.950648671159775e-06, + "logits/chosen": 661595904.0, + "logits/rejected": 659282112.0, + "logps/chosen": -331.50840250651044, + "logps/rejected": -529.7432861328125, + "loss": 0.0469, + "rewards/chosen": 2.939255396525065, + "rewards/margins": 10.378252665201822, + "rewards/rejected": -7.438997268676758, + "step": 510 + }, + { + "epoch": 0.04668798538145272, + "grad_norm": 6.9375, + "kl": 0.0, + "learning_rate": 9.950446953253257e-06, + "logits/chosen": 399795200.0, + "logits/rejected": 944247091.2, + "logps/chosen": -178.5517781575521, + "logps/rejected": -528.05205078125, + "loss": 0.0503, + "rewards/chosen": 2.863534927368164, + "rewards/margins": 7.8122600555419925, + "rewards/rejected": -4.9487251281738285, + "step": 511 + }, + { + "epoch": 0.04677935130196437, + "grad_norm": 1.5390625, + "kl": 0.0, + "learning_rate": 9.95024482599064e-06, + "logits/chosen": 733470080.0, + "logits/rejected": 603759488.0, + "logps/chosen": -461.16741943359375, + "logps/rejected": -509.7930094401042, + "loss": 0.0068, + "rewards/chosen": 3.927548408508301, + "rewards/margins": 12.51147429148356, + "rewards/rejected": -8.58392588297526, + "step": 512 + }, + { + "epoch": 0.046870717222476016, + "grad_norm": 7.375, + "kl": 0.0, + "learning_rate": 9.950042289388643e-06, + "logits/chosen": 516560832.0, + "logits/rejected": 603822592.0, + "logps/chosen": -320.8670349121094, + "logps/rejected": -397.4326171875, + "loss": 0.0433, + "rewards/chosen": 1.7687393426895142, + "rewards/margins": 7.9461443821589155, + "rewards/rejected": -6.177405039469401, + "step": 513 + }, + { + "epoch": 0.046962083142987665, + "grad_norm": 7.4375, + "kl": 0.0, + "learning_rate": 9.949839343464012e-06, + "logits/chosen": 1175075584.0, + "logits/rejected": 617864704.0, + "logps/chosen": -338.13348388671875, + "logps/rejected": -329.5546875, + "loss": 0.084, + "rewards/chosen": 2.6653451919555664, + "rewards/margins": 9.060026168823242, + "rewards/rejected": -6.394680976867676, + "step": 514 + }, + { + "epoch": 0.047053449063499314, + "grad_norm": 9.875, + "kl": 0.0, + "learning_rate": 9.949635988233529e-06, + "logits/chosen": 679794432.0, + "logits/rejected": 341461792.0, + "logps/chosen": -408.2730712890625, + "logps/rejected": -292.6974792480469, + "loss": 0.0648, + "rewards/chosen": 2.02689266204834, + "rewards/margins": 8.73025131225586, + "rewards/rejected": -6.7033586502075195, + "step": 515 + }, + { + "epoch": 0.04714481498401096, + "grad_norm": 10.3125, + "kl": 0.0, + "learning_rate": 9.949432223714006e-06, + "logits/chosen": 610027300.5714285, + "logits/rejected": 419997760.0, + "logps/chosen": -385.161376953125, + "logps/rejected": -226.81045532226562, + "loss": 0.0594, + "rewards/chosen": 3.171271187918527, + "rewards/margins": 8.790064198630198, + "rewards/rejected": -5.61879301071167, + "step": 516 + }, + { + "epoch": 0.04723618090452261, + "grad_norm": 10.6875, + "kl": 0.0, + "learning_rate": 9.949228049922299e-06, + "logits/chosen": 427639253.3333333, + "logits/rejected": 392863641.6, + "logps/chosen": -340.5476481119792, + "logps/rejected": -304.8538818359375, + "loss": 0.0615, + "rewards/chosen": 3.6235669453938804, + "rewards/margins": 7.169983418782552, + "rewards/rejected": -3.546416473388672, + "step": 517 + }, + { + "epoch": 0.04732754682503426, + "grad_norm": 16.5, + "kl": 0.0, + "learning_rate": 9.949023466875284e-06, + "logits/chosen": 494497792.0, + "logits/rejected": 384168768.0, + "logps/chosen": -259.96307373046875, + "logps/rejected": -375.45819091796875, + "loss": 0.1319, + "rewards/chosen": 1.8205868403116863, + "rewards/margins": 9.446496645609537, + "rewards/rejected": -7.625909805297852, + "step": 518 + }, + { + "epoch": 0.04741891274554591, + "grad_norm": 4.625, + "kl": 0.0, + "learning_rate": 9.948818474589887e-06, + "logits/chosen": 404137642.6666667, + "logits/rejected": 893304832.0, + "logps/chosen": -199.17875162760416, + "logps/rejected": -676.151171875, + "loss": 0.0228, + "rewards/chosen": 3.647256851196289, + "rewards/margins": 11.631537246704102, + "rewards/rejected": -7.984280395507812, + "step": 519 + }, + { + "epoch": 0.04751027866605756, + "grad_norm": 13.9375, + "kl": 0.0, + "learning_rate": 9.948613073083052e-06, + "logits/chosen": 632786346.6666666, + "logits/rejected": 293532697.6, + "logps/chosen": -280.3258056640625, + "logps/rejected": -315.10625, + "loss": 0.1187, + "rewards/chosen": 1.6836541493733723, + "rewards/margins": 8.97506955464681, + "rewards/rejected": -7.291415405273438, + "step": 520 + }, + { + "epoch": 0.04760164458656921, + "grad_norm": 2.578125, + "kl": 0.0, + "learning_rate": 9.948407262371764e-06, + "logits/rejected": 546961536.0, + "logps/rejected": -540.5796508789062, + "loss": 0.0086, + "rewards/rejected": -6.095879554748535, + "step": 521 + }, + { + "epoch": 0.04769301050708086, + "grad_norm": 10.4375, + "kl": 0.0, + "learning_rate": 9.948201042473044e-06, + "logits/chosen": 458073440.0, + "logits/rejected": 411448000.0, + "logps/chosen": -379.16351318359375, + "logps/rejected": -331.87823486328125, + "loss": 0.0706, + "rewards/chosen": 2.15440034866333, + "rewards/margins": 5.66847562789917, + "rewards/rejected": -3.51407527923584, + "step": 522 + }, + { + "epoch": 0.047784376427592506, + "grad_norm": 4.34375, + "kl": 0.0, + "learning_rate": 9.947994413403946e-06, + "logits/chosen": 749662912.0, + "logits/rejected": 445787946.6666667, + "logps/chosen": -521.22216796875, + "logps/rejected": -367.93115234375, + "loss": 0.0211, + "rewards/chosen": 2.687844753265381, + "rewards/margins": 9.837687969207764, + "rewards/rejected": -7.149843215942383, + "step": 523 + }, + { + "epoch": 0.047875742348104156, + "grad_norm": 13.6875, + "kl": 0.0, + "learning_rate": 9.947787375181553e-06, + "logits/chosen": 557497856.0, + "logits/rejected": 771550617.6, + "logps/chosen": -412.4674886067708, + "logps/rejected": -523.8486328125, + "loss": 0.0868, + "rewards/chosen": 2.8454392751057944, + "rewards/margins": 8.23777936299642, + "rewards/rejected": -5.392340087890625, + "step": 524 + }, + { + "epoch": 0.047967108268615805, + "grad_norm": 7.65625, + "kl": 0.0, + "learning_rate": 9.947579927822986e-06, + "logits/chosen": 622897766.4, + "logits/rejected": 763622570.6666666, + "logps/chosen": -487.059716796875, + "logps/rejected": -437.1436360677083, + "loss": 0.0453, + "rewards/chosen": 2.9370323181152345, + "rewards/margins": 9.852932230631511, + "rewards/rejected": -6.915899912516276, + "step": 525 + }, + { + "epoch": 0.048058474189127454, + "grad_norm": 3.5625, + "kl": 0.0, + "learning_rate": 9.9473720713454e-06, + "logits/chosen": 514511616.0, + "logits/rejected": 523236864.0, + "logps/chosen": -286.20135498046875, + "logps/rejected": -458.05723353794644, + "loss": 0.0144, + "rewards/chosen": 3.101898193359375, + "rewards/margins": 8.608911786760602, + "rewards/rejected": -5.507013593401227, + "step": 526 + }, + { + "epoch": 0.0481498401096391, + "grad_norm": 10.0625, + "kl": 0.0, + "learning_rate": 9.94716380576598e-06, + "logits/chosen": 463803904.0, + "logits/rejected": 397716070.4, + "logps/chosen": -213.10526529947916, + "logps/rejected": -262.25224609375, + "loss": 0.0735, + "rewards/chosen": 3.3960755666097007, + "rewards/margins": 8.85375607808431, + "rewards/rejected": -5.457680511474609, + "step": 527 + }, + { + "epoch": 0.04824120603015075, + "grad_norm": 10.1875, + "kl": 0.0, + "learning_rate": 9.946955131101952e-06, + "logits/chosen": 524393002.6666667, + "logits/rejected": 837766912.0, + "logps/chosen": -204.16473388671875, + "logps/rejected": -657.957275390625, + "loss": 0.0441, + "rewards/chosen": 3.1822500228881836, + "rewards/margins": 14.87500820159912, + "rewards/rejected": -11.692758178710937, + "step": 528 + }, + { + "epoch": 0.0483325719506624, + "grad_norm": 8.375, + "kl": 0.0, + "learning_rate": 9.946746047370568e-06, + "logits/chosen": 547422336.0, + "logits/rejected": 473094176.0, + "logps/chosen": -198.84521484375, + "logps/rejected": -432.6638488769531, + "loss": 0.0546, + "rewards/chosen": 2.865406036376953, + "rewards/margins": 10.712053298950195, + "rewards/rejected": -7.846647262573242, + "step": 529 + }, + { + "epoch": 0.04842393787117405, + "grad_norm": 11.4375, + "kl": 0.0, + "learning_rate": 9.946536554589119e-06, + "logits/chosen": 515251520.0, + "logits/rejected": 397874261.3333333, + "logps/chosen": -456.40093994140625, + "logps/rejected": -463.6695963541667, + "loss": 0.053, + "rewards/chosen": 1.7612946033477783, + "rewards/margins": 9.572136481602985, + "rewards/rejected": -7.810841878255208, + "step": 530 + }, + { + "epoch": 0.0485153037916857, + "grad_norm": 14.5625, + "kl": 0.0, + "learning_rate": 9.946326652774929e-06, + "logits/chosen": 1473681920.0, + "logits/rejected": 546558208.0, + "logps/chosen": -473.552734375, + "logps/rejected": -561.225537109375, + "loss": 0.0571, + "rewards/chosen": 1.9799922307332356, + "rewards/margins": 11.45218537648519, + "rewards/rejected": -9.472193145751953, + "step": 531 + }, + { + "epoch": 0.04860666971219735, + "grad_norm": 12.4375, + "kl": 0.0, + "learning_rate": 9.946116341945351e-06, + "logits/chosen": 625927040.0, + "logits/rejected": 380062976.0, + "logps/chosen": -300.34552001953125, + "logps/rejected": -484.7450764973958, + "loss": 0.0988, + "rewards/chosen": 2.1196181774139404, + "rewards/margins": 9.774438937505085, + "rewards/rejected": -7.6548207600911455, + "step": 532 + }, + { + "epoch": 0.048698035632709, + "grad_norm": 12.625, + "kl": 0.0, + "learning_rate": 9.94590562211778e-06, + "logits/chosen": 652102860.8, + "logits/rejected": 406477098.6666667, + "logps/chosen": -291.44521484375, + "logps/rejected": -433.8701171875, + "loss": 0.0764, + "rewards/chosen": 2.4729869842529295, + "rewards/margins": 12.774298477172852, + "rewards/rejected": -10.301311492919922, + "step": 533 + }, + { + "epoch": 0.048789401553220646, + "grad_norm": 17.125, + "kl": 0.0, + "learning_rate": 9.94569449330964e-06, + "logits/chosen": 806818730.6666666, + "logits/rejected": 857759180.8, + "logps/chosen": -316.0816650390625, + "logps/rejected": -477.7521484375, + "loss": 0.0834, + "rewards/chosen": 2.0308257738749185, + "rewards/margins": 9.296333281199137, + "rewards/rejected": -7.265507507324219, + "step": 534 + }, + { + "epoch": 0.048880767473732295, + "grad_norm": 10.75, + "kl": 0.0, + "learning_rate": 9.945482955538385e-06, + "logits/chosen": 653191040.0, + "logits/rejected": 485441843.2, + "logps/chosen": -417.0482584635417, + "logps/rejected": -499.610693359375, + "loss": 0.0557, + "rewards/chosen": 2.104131062825521, + "rewards/margins": 8.839492543538412, + "rewards/rejected": -6.735361480712891, + "step": 535 + }, + { + "epoch": 0.048972133394243944, + "grad_norm": 8.75, + "kl": 0.0, + "learning_rate": 9.945271008821513e-06, + "logits/chosen": 472569344.0, + "logits/rejected": 464763008.0, + "logps/chosen": -372.9163513183594, + "logps/rejected": -432.953369140625, + "loss": 0.0381, + "rewards/chosen": 2.668997287750244, + "rewards/margins": 8.965349038441975, + "rewards/rejected": -6.2963517506917315, + "step": 536 + }, + { + "epoch": 0.04906349931475559, + "grad_norm": 5.8125, + "kl": 0.0, + "learning_rate": 9.945058653176548e-06, + "logits/chosen": 1059489600.0, + "logits/rejected": 584885760.0, + "logps/chosen": -249.50729370117188, + "logps/rejected": -396.0429992675781, + "loss": 0.039, + "rewards/chosen": 2.5572829246520996, + "rewards/margins": 9.32886552810669, + "rewards/rejected": -6.77158260345459, + "step": 537 + }, + { + "epoch": 0.04915486523526724, + "grad_norm": 8.5625, + "kl": 0.0, + "learning_rate": 9.944845888621047e-06, + "logits/chosen": 531492736.0, + "logits/rejected": 564581696.0, + "logps/chosen": -324.395263671875, + "logps/rejected": -321.5052490234375, + "loss": 0.0573, + "rewards/chosen": 2.2574143409729004, + "rewards/margins": 8.504410743713379, + "rewards/rejected": -6.2469964027404785, + "step": 538 + }, + { + "epoch": 0.04924623115577889, + "grad_norm": 6.15625, + "kl": 0.0, + "learning_rate": 9.944632715172606e-06, + "logits/chosen": 444115968.0, + "logits/rejected": 592375552.0, + "logps/chosen": -329.07073974609375, + "logps/rejected": -574.61083984375, + "loss": 0.0409, + "rewards/chosen": 2.485933780670166, + "rewards/margins": 11.1018385887146, + "rewards/rejected": -8.615904808044434, + "step": 539 + }, + { + "epoch": 0.04933759707629054, + "grad_norm": 8.875, + "kl": 0.0, + "learning_rate": 9.944419132848855e-06, + "logits/chosen": 534498176.0, + "logits/rejected": 609886873.6, + "logps/chosen": -339.02891031901044, + "logps/rejected": -267.9729248046875, + "loss": 0.0425, + "rewards/chosen": 3.1376425425211587, + "rewards/margins": 8.347770563761394, + "rewards/rejected": -5.2101280212402346, + "step": 540 + }, + { + "epoch": 0.049428962996802196, + "grad_norm": 7.25, + "kl": 0.0, + "learning_rate": 9.94420514166745e-06, + "logits/chosen": 424964531.2, + "logits/rejected": 345621312.0, + "logps/chosen": -395.95478515625, + "logps/rejected": -365.8430582682292, + "loss": 0.0415, + "rewards/chosen": 3.0139894485473633, + "rewards/margins": 8.007105191548664, + "rewards/rejected": -4.993115743001302, + "step": 541 + }, + { + "epoch": 0.049520328917313845, + "grad_norm": 1.125, + "kl": 0.0, + "learning_rate": 9.94399074164609e-06, + "logits/chosen": 1142153984.0, + "logits/rejected": 1110934613.3333333, + "logps/chosen": -254.47244262695312, + "logps/rejected": -534.5941975911459, + "loss": 0.0055, + "rewards/chosen": 4.183614253997803, + "rewards/margins": 12.043324311574299, + "rewards/rejected": -7.859710057576497, + "step": 542 + }, + { + "epoch": 0.049611694837825494, + "grad_norm": 18.25, + "kl": 0.0, + "learning_rate": 9.943775932802503e-06, + "logits/chosen": 503339968.0, + "logits/rejected": 535270944.0, + "logps/chosen": -563.787353515625, + "logps/rejected": -704.9530639648438, + "loss": 0.0595, + "rewards/chosen": 2.37727427482605, + "rewards/margins": 14.933391332626343, + "rewards/rejected": -12.556117057800293, + "step": 543 + }, + { + "epoch": 0.04970306075833714, + "grad_norm": 7.0, + "kl": 0.0, + "learning_rate": 9.943560715154452e-06, + "logits/chosen": 623123669.3333334, + "logits/rejected": 853789235.2, + "logps/chosen": -499.57421875, + "logps/rejected": -380.735791015625, + "loss": 0.0406, + "rewards/chosen": 3.1009848912556968, + "rewards/margins": 9.01187432607015, + "rewards/rejected": -5.910889434814453, + "step": 544 + }, + { + "epoch": 0.04979442667884879, + "grad_norm": 15.0, + "kl": 0.0, + "learning_rate": 9.943345088719733e-06, + "logits/chosen": 772559232.0, + "logits/rejected": 997517696.0, + "logps/chosen": -389.2762451171875, + "logps/rejected": -613.8396606445312, + "loss": 0.0699, + "rewards/chosen": 2.5690696239471436, + "rewards/margins": 10.416871309280396, + "rewards/rejected": -7.847801685333252, + "step": 545 + }, + { + "epoch": 0.04988579259936044, + "grad_norm": 6.375, + "kl": 0.0, + "learning_rate": 9.943129053516176e-06, + "logits/chosen": 439897429.3333333, + "logits/rejected": 620420300.8, + "logps/chosen": -398.62060546875, + "logps/rejected": -525.8603515625, + "loss": 0.0323, + "rewards/chosen": 2.6253573099772134, + "rewards/margins": 10.29664789835612, + "rewards/rejected": -7.671290588378906, + "step": 546 + }, + { + "epoch": 0.04997715851987209, + "grad_norm": 4.71875, + "kl": 0.0, + "learning_rate": 9.942912609561645e-06, + "logits/chosen": 766824064.0, + "logits/rejected": 904548992.0, + "logps/chosen": -432.8030700683594, + "logps/rejected": -606.4090576171875, + "loss": 0.0242, + "rewards/chosen": 3.1183242797851562, + "rewards/margins": 12.566186904907227, + "rewards/rejected": -9.44786262512207, + "step": 547 + }, + { + "epoch": 0.05006852444038374, + "grad_norm": 10.5, + "kl": 0.0, + "learning_rate": 9.942695756874037e-06, + "logits/chosen": 606654873.6, + "logits/rejected": 543873792.0, + "logps/chosen": -331.514404296875, + "logps/rejected": -426.1781005859375, + "loss": 0.0741, + "rewards/chosen": 2.816172218322754, + "rewards/margins": 6.485329246520996, + "rewards/rejected": -3.669157028198242, + "step": 548 + }, + { + "epoch": 0.05015989036089539, + "grad_norm": 18.75, + "kl": 0.0, + "learning_rate": 9.942478495471287e-06, + "logits/chosen": 569370048.0, + "logits/rejected": 799760896.0, + "logps/chosen": -303.7358703613281, + "logps/rejected": -562.8180745442709, + "loss": 0.0949, + "rewards/chosen": 1.262994408607483, + "rewards/margins": 8.518303195635479, + "rewards/rejected": -7.255308787027995, + "step": 549 + }, + { + "epoch": 0.05025125628140704, + "grad_norm": 13.6875, + "kl": 0.0, + "learning_rate": 9.942260825371359e-06, + "logits/chosen": 431763232.0, + "logits/rejected": 498725024.0, + "logps/chosen": -306.55389404296875, + "logps/rejected": -232.50894165039062, + "loss": 0.0759, + "rewards/chosen": 2.677633285522461, + "rewards/margins": 7.554461479187012, + "rewards/rejected": -4.876828193664551, + "step": 550 + }, + { + "epoch": 0.05034262220191869, + "grad_norm": 13.4375, + "kl": 0.0, + "learning_rate": 9.94204274659225e-06, + "logits/chosen": 516418144.0, + "logits/rejected": 536102368.0, + "logps/chosen": -201.65811157226562, + "logps/rejected": -457.4442138671875, + "loss": 0.0698, + "rewards/chosen": 2.811689853668213, + "rewards/margins": 9.275840759277344, + "rewards/rejected": -6.464150905609131, + "step": 551 + }, + { + "epoch": 0.050433988122430336, + "grad_norm": 6.625, + "kl": 0.0, + "learning_rate": 9.941824259151996e-06, + "logits/chosen": 776445568.0, + "logits/rejected": 380178304.0, + "logps/chosen": -231.29190063476562, + "logps/rejected": -317.20703125, + "loss": 0.0465, + "rewards/chosen": 2.8480236530303955, + "rewards/margins": 9.102943658828735, + "rewards/rejected": -6.25492000579834, + "step": 552 + }, + { + "epoch": 0.050525354042941985, + "grad_norm": 4.125, + "kl": 0.0, + "learning_rate": 9.941605363068662e-06, + "logits/chosen": 391583027.2, + "logits/rejected": 496506922.6666667, + "logps/chosen": -348.278125, + "logps/rejected": -424.5496826171875, + "loss": 0.0277, + "rewards/chosen": 3.578693389892578, + "rewards/margins": 8.805691401163738, + "rewards/rejected": -5.226998011271159, + "step": 553 + }, + { + "epoch": 0.050616719963453634, + "grad_norm": 9.5625, + "kl": 0.0, + "learning_rate": 9.94138605836035e-06, + "logits/chosen": 451542118.4, + "logits/rejected": 197470613.33333334, + "logps/chosen": -287.11787109375, + "logps/rejected": -221.819580078125, + "loss": 0.1394, + "rewards/chosen": 1.7700414657592773, + "rewards/margins": 7.8851213455200195, + "rewards/rejected": -6.115079879760742, + "step": 554 + }, + { + "epoch": 0.05070808588396528, + "grad_norm": 13.1875, + "kl": 0.0, + "learning_rate": 9.941166345045193e-06, + "logits/chosen": 688377728.0, + "logits/rejected": 666600192.0, + "logps/chosen": -420.1392415364583, + "logps/rejected": -722.9005737304688, + "loss": 0.0932, + "rewards/chosen": 2.208003362019857, + "rewards/margins": 10.742949803670248, + "rewards/rejected": -8.53494644165039, + "step": 555 + }, + { + "epoch": 0.05079945180447693, + "grad_norm": 2.34375, + "kl": 0.0, + "learning_rate": 9.940946223141361e-06, + "logits/chosen": 200792746.66666666, + "logits/rejected": 595686553.6, + "logps/chosen": -151.76595052083334, + "logps/rejected": -459.014111328125, + "loss": 0.0184, + "rewards/chosen": 4.232211430867513, + "rewards/margins": 8.617590077718098, + "rewards/rejected": -4.385378646850586, + "step": 556 + }, + { + "epoch": 0.05089081772498858, + "grad_norm": 3.515625, + "kl": 0.0, + "learning_rate": 9.940725692667055e-06, + "logits/chosen": 769525760.0, + "logits/rejected": 370224018.28571427, + "logps/chosen": -597.9099731445312, + "logps/rejected": -430.65248325892856, + "loss": 0.0141, + "rewards/chosen": 2.383898973464966, + "rewards/margins": 9.669987099511282, + "rewards/rejected": -7.286088126046317, + "step": 557 + }, + { + "epoch": 0.05098218364550023, + "grad_norm": 9.25, + "kl": 0.0, + "learning_rate": 9.940504753640512e-06, + "logits/chosen": 588998336.0, + "logits/rejected": 522017600.0, + "logps/chosen": -308.67266845703125, + "logps/rejected": -568.4342651367188, + "loss": 0.0522, + "rewards/chosen": 2.3774266242980957, + "rewards/margins": 12.148130893707275, + "rewards/rejected": -9.77070426940918, + "step": 558 + }, + { + "epoch": 0.05107354956601188, + "grad_norm": 12.3125, + "kl": 0.0, + "learning_rate": 9.94028340608e-06, + "logits/chosen": 534980096.0, + "logits/rejected": 153757584.0, + "logps/chosen": -329.4677734375, + "logps/rejected": -518.1055297851562, + "loss": 0.1214, + "rewards/chosen": 2.7692958286830356, + "rewards/margins": 11.838692801339285, + "rewards/rejected": -9.06939697265625, + "step": 559 + }, + { + "epoch": 0.05116491548652353, + "grad_norm": 11.0625, + "kl": 0.0, + "learning_rate": 9.940061650003822e-06, + "logits/chosen": 372432384.0, + "logits/rejected": 458276249.6, + "logps/chosen": -209.72965494791666, + "logps/rejected": -544.153564453125, + "loss": 0.0751, + "rewards/chosen": 1.4868459701538086, + "rewards/margins": 10.089172172546387, + "rewards/rejected": -8.602326202392579, + "step": 560 + }, + { + "epoch": 0.05125628140703518, + "grad_norm": 14.0, + "kl": 0.0, + "learning_rate": 9.939839485430317e-06, + "logits/chosen": 372317030.4, + "logits/rejected": 427723861.3333333, + "logps/chosen": -229.7554931640625, + "logps/rejected": -445.4662679036458, + "loss": 0.0723, + "rewards/chosen": 3.493482971191406, + "rewards/margins": 7.458405558268229, + "rewards/rejected": -3.9649225870768228, + "step": 561 + }, + { + "epoch": 0.051347647327546826, + "grad_norm": 13.375, + "kl": 0.0, + "learning_rate": 9.939616912377855e-06, + "logits/chosen": 455016288.0, + "logits/rejected": 691330240.0, + "logps/chosen": -257.07159423828125, + "logps/rejected": -402.1934814453125, + "loss": 0.0729, + "rewards/chosen": 2.429377794265747, + "rewards/margins": 7.945145845413208, + "rewards/rejected": -5.515768051147461, + "step": 562 + }, + { + "epoch": 0.051439013248058475, + "grad_norm": 11.875, + "kl": 0.0, + "learning_rate": 9.939393930864843e-06, + "logits/chosen": 1122678272.0, + "logits/rejected": 644122828.8, + "logps/chosen": -430.5715738932292, + "logps/rejected": -385.7927734375, + "loss": 0.1517, + "rewards/chosen": 1.5885318120320637, + "rewards/margins": 4.815900739034017, + "rewards/rejected": -3.227368927001953, + "step": 563 + }, + { + "epoch": 0.051530379168570124, + "grad_norm": 11.875, + "kl": 0.0, + "learning_rate": 9.939170540909715e-06, + "logits/chosen": 554172586.6666666, + "logits/rejected": 885871308.8, + "logps/chosen": -261.9931233723958, + "logps/rejected": -555.96630859375, + "loss": 0.0749, + "rewards/chosen": 2.4329471588134766, + "rewards/margins": 7.08764762878418, + "rewards/rejected": -4.654700469970703, + "step": 564 + }, + { + "epoch": 0.05162174508908177, + "grad_norm": 8.0625, + "kl": 0.0, + "learning_rate": 9.938946742530947e-06, + "logits/chosen": 587407323.4285715, + "logits/rejected": 581418624.0, + "logps/chosen": -345.54164341517856, + "logps/rejected": -433.142822265625, + "loss": 0.0628, + "rewards/chosen": 2.801086970738002, + "rewards/margins": 6.532471248081752, + "rewards/rejected": -3.73138427734375, + "step": 565 + }, + { + "epoch": 0.05171311100959342, + "grad_norm": 11.1875, + "kl": 0.0, + "learning_rate": 9.938722535747044e-06, + "logits/chosen": 788829952.0, + "logits/rejected": 393832128.0, + "logps/chosen": -388.9601643880208, + "logps/rejected": -543.7454223632812, + "loss": 0.0604, + "rewards/chosen": 2.824568748474121, + "rewards/margins": 14.813846588134766, + "rewards/rejected": -11.989277839660645, + "step": 566 + }, + { + "epoch": 0.05180447693010507, + "grad_norm": 10.1875, + "kl": 0.0, + "learning_rate": 9.938497920576544e-06, + "logits/chosen": 543850624.0, + "logits/rejected": 373261696.0, + "logps/chosen": -480.7633361816406, + "logps/rejected": -311.4798583984375, + "loss": 0.0597, + "rewards/chosen": 2.642219305038452, + "rewards/margins": 7.911183595657349, + "rewards/rejected": -5.2689642906188965, + "step": 567 + }, + { + "epoch": 0.05189584285061672, + "grad_norm": 14.0625, + "kl": 0.0, + "learning_rate": 9.938272897038023e-06, + "logits/chosen": 603589034.6666666, + "logits/rejected": 380980992.0, + "logps/chosen": -379.9322916666667, + "logps/rejected": -448.71650390625, + "loss": 0.0879, + "rewards/chosen": 1.6401163736979167, + "rewards/margins": 6.900077692667644, + "rewards/rejected": -5.259961318969727, + "step": 568 + }, + { + "epoch": 0.05198720877112837, + "grad_norm": 8.5, + "kl": 0.0, + "learning_rate": 9.938047465150087e-06, + "logits/chosen": 501776896.0, + "logits/rejected": 305462880.0, + "logps/chosen": -295.17037527901783, + "logps/rejected": -396.55902099609375, + "loss": 0.0565, + "rewards/chosen": 3.031212397984096, + "rewards/margins": 7.906063147953578, + "rewards/rejected": -4.874850749969482, + "step": 569 + }, + { + "epoch": 0.05207857469164002, + "grad_norm": 5.0625, + "kl": 0.0, + "learning_rate": 9.937821624931378e-06, + "logits/chosen": 415244629.3333333, + "logits/rejected": 478064332.8, + "logps/chosen": -413.6844889322917, + "logps/rejected": -429.78828125, + "loss": 0.026, + "rewards/chosen": 3.280973434448242, + "rewards/margins": 8.429648971557617, + "rewards/rejected": -5.148675537109375, + "step": 570 + }, + { + "epoch": 0.05216994061215167, + "grad_norm": 15.875, + "kl": 0.0, + "learning_rate": 9.93759537640057e-06, + "logits/chosen": 723773952.0, + "logits/rejected": 415599296.0, + "logps/chosen": -311.27105712890625, + "logps/rejected": -373.6888427734375, + "loss": 0.1146, + "rewards/chosen": 1.762797196706136, + "rewards/margins": 8.531826814015707, + "rewards/rejected": -6.76902961730957, + "step": 571 + }, + { + "epoch": 0.05226130653266332, + "grad_norm": 7.28125, + "kl": 0.0, + "learning_rate": 9.937368719576374e-06, + "logits/chosen": 481115264.0, + "logits/rejected": 406085205.3333333, + "logps/chosen": -284.57171630859375, + "logps/rejected": -482.5922444661458, + "loss": 0.0409, + "rewards/chosen": 1.8003966808319092, + "rewards/margins": 8.432198286056519, + "rewards/rejected": -6.631801605224609, + "step": 572 + }, + { + "epoch": 0.052352672453174966, + "grad_norm": 10.0625, + "kl": 0.0, + "learning_rate": 9.937141654477529e-06, + "logits/chosen": 789895296.0, + "logits/rejected": 778687616.0, + "logps/chosen": -474.0384216308594, + "logps/rejected": -336.8320617675781, + "loss": 0.0666, + "rewards/chosen": 2.3931374549865723, + "rewards/margins": 6.965983867645264, + "rewards/rejected": -4.572846412658691, + "step": 573 + }, + { + "epoch": 0.052444038373686615, + "grad_norm": 8.5, + "kl": 0.0, + "learning_rate": 9.936914181122815e-06, + "logits/chosen": 568396441.6, + "logits/rejected": 394638421.3333333, + "logps/chosen": -409.2862548828125, + "logps/rejected": -519.1792805989584, + "loss": 0.0504, + "rewards/chosen": 2.539236831665039, + "rewards/margins": 8.263681538899739, + "rewards/rejected": -5.7244447072347, + "step": 574 + }, + { + "epoch": 0.052535404294198264, + "grad_norm": 17.25, + "kl": 0.0, + "learning_rate": 9.936686299531038e-06, + "logits/chosen": 361628757.3333333, + "logits/rejected": 744359577.6, + "logps/chosen": -198.69805908203125, + "logps/rejected": -411.6814453125, + "loss": 0.096, + "rewards/chosen": 2.8530356089274087, + "rewards/margins": 7.20354601542155, + "rewards/rejected": -4.3505104064941404, + "step": 575 + }, + { + "epoch": 0.05262677021470991, + "grad_norm": 5.8125, + "kl": 0.0, + "learning_rate": 9.936458009721042e-06, + "logits/chosen": 622393408.0, + "logits/rejected": 814508672.0, + "logps/chosen": -436.762939453125, + "logps/rejected": -869.9549560546875, + "loss": 0.0357, + "rewards/chosen": 2.846292018890381, + "rewards/margins": 18.315724849700928, + "rewards/rejected": -15.469432830810547, + "step": 576 + }, + { + "epoch": 0.05271813613522156, + "grad_norm": 5.875, + "kl": 0.0, + "learning_rate": 9.936229311711707e-06, + "logits/chosen": 323451861.3333333, + "logits/rejected": 334410016.0, + "logps/chosen": -252.81331380208334, + "logps/rejected": -473.58746337890625, + "loss": 0.044, + "rewards/chosen": 3.2511984507242837, + "rewards/margins": 9.636984984079996, + "rewards/rejected": -6.385786533355713, + "step": 577 + }, + { + "epoch": 0.05280950205573321, + "grad_norm": 8.8125, + "kl": 0.0, + "learning_rate": 9.936000205521944e-06, + "logits/chosen": 386911424.0, + "logits/rejected": 612521252.5714285, + "logps/chosen": -93.24019622802734, + "logps/rejected": -542.62451171875, + "loss": 0.0451, + "rewards/chosen": 2.1993768215179443, + "rewards/margins": 7.704170261110578, + "rewards/rejected": -5.504793439592634, + "step": 578 + }, + { + "epoch": 0.05290086797624486, + "grad_norm": 8.375, + "kl": 0.0, + "learning_rate": 9.935770691170696e-06, + "logits/chosen": 418146867.2, + "logits/rejected": 367564672.0, + "logps/chosen": -340.280810546875, + "logps/rejected": -471.1500244140625, + "loss": 0.0429, + "rewards/chosen": 3.698381042480469, + "rewards/margins": 9.490167872111003, + "rewards/rejected": -5.791786829630534, + "step": 579 + }, + { + "epoch": 0.05299223389675651, + "grad_norm": 9.0625, + "kl": 0.0, + "learning_rate": 9.935540768676944e-06, + "logits/chosen": 634686515.2, + "logits/rejected": 528608682.6666667, + "logps/chosen": -461.67919921875, + "logps/rejected": -412.7539876302083, + "loss": 0.0381, + "rewards/chosen": 3.0409223556518556, + "rewards/margins": 9.52405948638916, + "rewards/rejected": -6.483137130737305, + "step": 580 + }, + { + "epoch": 0.05308359981726816, + "grad_norm": 6.25, + "kl": 0.0, + "learning_rate": 9.9353104380597e-06, + "logits/chosen": 593523029.3333334, + "logits/rejected": 905204224.0, + "logps/chosen": -305.5103759765625, + "logps/rejected": -295.869287109375, + "loss": 0.041, + "rewards/chosen": 2.9244308471679688, + "rewards/margins": 7.625985717773437, + "rewards/rejected": -4.701554870605468, + "step": 581 + }, + { + "epoch": 0.05317496573777981, + "grad_norm": 11.8125, + "kl": 0.0, + "learning_rate": 9.935079699338006e-06, + "logits/chosen": 592631338.6666666, + "logits/rejected": 636879206.4, + "logps/chosen": -314.63047281901044, + "logps/rejected": -204.998388671875, + "loss": 0.0598, + "rewards/chosen": 3.2223167419433594, + "rewards/margins": 8.30914077758789, + "rewards/rejected": -5.086824035644531, + "step": 582 + }, + { + "epoch": 0.053266331658291456, + "grad_norm": 19.25, + "kl": 0.0, + "learning_rate": 9.934848552530948e-06, + "logits/chosen": 1037999718.4, + "logits/rejected": 653922218.6666666, + "logps/chosen": -353.452783203125, + "logps/rejected": -706.075439453125, + "loss": 0.1193, + "rewards/chosen": 2.3461538314819337, + "rewards/margins": 10.8162446975708, + "rewards/rejected": -8.470090866088867, + "step": 583 + }, + { + "epoch": 0.053357697578803105, + "grad_norm": 5.875, + "kl": 0.0, + "learning_rate": 9.934616997657635e-06, + "logits/chosen": 597733273.6, + "logits/rejected": 267201408.0, + "logps/chosen": -216.8820556640625, + "logps/rejected": -325.65944417317706, + "loss": 0.0333, + "rewards/chosen": 3.5247817993164063, + "rewards/margins": 7.223199462890625, + "rewards/rejected": -3.6984176635742188, + "step": 584 + }, + { + "epoch": 0.053449063499314754, + "grad_norm": 10.3125, + "kl": 0.0, + "learning_rate": 9.93438503473722e-06, + "logits/chosen": 354314218.6666667, + "logits/rejected": 511215104.0, + "logps/chosen": -252.3336385091146, + "logps/rejected": -499.485791015625, + "loss": 0.0503, + "rewards/chosen": 2.878448804219564, + "rewards/margins": 8.048985608418782, + "rewards/rejected": -5.170536804199219, + "step": 585 + }, + { + "epoch": 0.0535404294198264, + "grad_norm": 5.03125, + "kl": 0.0, + "learning_rate": 9.934152663788878e-06, + "logits/chosen": 810426944.0, + "logits/rejected": 540301888.0, + "logps/chosen": -508.51165771484375, + "logps/rejected": -431.5880432128906, + "loss": 0.0282, + "rewards/chosen": 2.964129686355591, + "rewards/margins": 8.783080339431763, + "rewards/rejected": -5.818950653076172, + "step": 586 + }, + { + "epoch": 0.05363179534033805, + "grad_norm": 7.09375, + "kl": 0.0, + "learning_rate": 9.933919884831828e-06, + "logits/chosen": 683021888.0, + "logits/rejected": 473283776.0, + "logps/chosen": -593.7889404296875, + "logps/rejected": -411.69976806640625, + "loss": 0.0384, + "rewards/chosen": 2.676865577697754, + "rewards/margins": 8.954186916351318, + "rewards/rejected": -6.2773213386535645, + "step": 587 + }, + { + "epoch": 0.0537231612608497, + "grad_norm": 19.75, + "kl": 0.0, + "learning_rate": 9.933686697885317e-06, + "logits/chosen": 405900928.0, + "logits/rejected": 715678720.0, + "logps/chosen": -100.60124206542969, + "logps/rejected": -354.8037109375, + "loss": 0.0996, + "rewards/chosen": 1.6804141998291016, + "rewards/margins": 6.548401832580566, + "rewards/rejected": -4.867987632751465, + "step": 588 + }, + { + "epoch": 0.05381452718136135, + "grad_norm": 10.5, + "kl": 0.0, + "learning_rate": 9.933453102968627e-06, + "logits/chosen": 468242227.2, + "logits/rejected": 269424213.3333333, + "logps/chosen": -207.8153564453125, + "logps/rejected": -483.62939453125, + "loss": 0.0689, + "rewards/chosen": 2.4296207427978516, + "rewards/margins": 11.767582575480143, + "rewards/rejected": -9.337961832682291, + "step": 589 + }, + { + "epoch": 0.053905893101873, + "grad_norm": 15.5, + "kl": 0.0, + "learning_rate": 9.933219100101075e-06, + "logits/chosen": 1318848256.0, + "logits/rejected": 555485235.2, + "logps/chosen": -377.1516927083333, + "logps/rejected": -246.225341796875, + "loss": 0.0881, + "rewards/chosen": 3.03511110941569, + "rewards/margins": 6.227436701456705, + "rewards/rejected": -3.1923255920410156, + "step": 590 + }, + { + "epoch": 0.05399725902238465, + "grad_norm": 9.1875, + "kl": 0.0, + "learning_rate": 9.932984689302012e-06, + "logits/chosen": 479359296.0, + "logits/rejected": 466468278.85714287, + "logps/chosen": -297.0140380859375, + "logps/rejected": -360.107421875, + "loss": 0.0561, + "rewards/chosen": 2.044506788253784, + "rewards/margins": 10.084048850195748, + "rewards/rejected": -8.039542061941964, + "step": 591 + }, + { + "epoch": 0.0540886249428963, + "grad_norm": 8.1875, + "kl": 0.0, + "learning_rate": 9.932749870590819e-06, + "logits/chosen": 774799018.6666666, + "logits/rejected": 525353676.8, + "logps/chosen": -375.9131673177083, + "logps/rejected": -340.5093505859375, + "loss": 0.1403, + "rewards/chosen": 2.388212203979492, + "rewards/margins": 5.94684066772461, + "rewards/rejected": -3.558628463745117, + "step": 592 + }, + { + "epoch": 0.05417999086340795, + "grad_norm": 7.78125, + "kl": 0.0, + "learning_rate": 9.932514643986915e-06, + "logits/chosen": 680734464.0, + "logits/rejected": 467545804.8, + "logps/chosen": -456.155029296875, + "logps/rejected": -442.366162109375, + "loss": 0.0461, + "rewards/chosen": 2.5898825327555337, + "rewards/margins": 7.884660212198893, + "rewards/rejected": -5.294777679443359, + "step": 593 + }, + { + "epoch": 0.054271356783919596, + "grad_norm": 6.6875, + "kl": 0.0, + "learning_rate": 9.932279009509748e-06, + "logits/chosen": 469798336.0, + "logits/rejected": 528101056.0, + "logps/chosen": -373.10589599609375, + "logps/rejected": -651.728515625, + "loss": 0.0406, + "rewards/chosen": 2.91513729095459, + "rewards/margins": 17.91959857940674, + "rewards/rejected": -15.004461288452148, + "step": 594 + }, + { + "epoch": 0.054362722704431245, + "grad_norm": 14.375, + "kl": 0.0, + "learning_rate": 9.93204296717881e-06, + "logits/chosen": 408078688.0, + "logits/rejected": 1051960576.0, + "logps/chosen": -233.04791259765625, + "logps/rejected": -427.2386779785156, + "loss": 0.0961, + "rewards/chosen": 2.009744882583618, + "rewards/margins": 9.537045240402222, + "rewards/rejected": -7.5273003578186035, + "step": 595 + }, + { + "epoch": 0.054454088624942894, + "grad_norm": 20.875, + "kl": 0.0, + "learning_rate": 9.931806517013612e-06, + "logits/chosen": 416687286.85714287, + "logits/rejected": 620095232.0, + "logps/chosen": -342.481201171875, + "logps/rejected": -163.96170043945312, + "loss": 0.1954, + "rewards/chosen": 2.0973740986415317, + "rewards/margins": 1.8813798299857547, + "rewards/rejected": 0.21599426865577698, + "step": 596 + }, + { + "epoch": 0.05454545454545454, + "grad_norm": 11.6875, + "kl": 0.0, + "learning_rate": 9.93156965903371e-06, + "logits/chosen": 718679040.0, + "logits/rejected": 366786432.0, + "logps/chosen": -473.17303466796875, + "logps/rejected": -413.6409505208333, + "loss": 0.0515, + "rewards/chosen": 3.4470763206481934, + "rewards/margins": 10.30131419499715, + "rewards/rejected": -6.854237874348958, + "step": 597 + }, + { + "epoch": 0.05463682046596619, + "grad_norm": 9.125, + "kl": 0.0, + "learning_rate": 9.93133239325869e-06, + "logits/chosen": 595494997.3333334, + "logits/rejected": 718413260.8, + "logps/chosen": -347.017822265625, + "logps/rejected": -318.5345703125, + "loss": 0.0891, + "rewards/chosen": 2.6260010401407876, + "rewards/margins": 9.83235632578532, + "rewards/rejected": -7.206355285644531, + "step": 598 + }, + { + "epoch": 0.05472818638647784, + "grad_norm": 16.125, + "kl": 0.0, + "learning_rate": 9.93109471970817e-06, + "logits/chosen": 553339392.0, + "logits/rejected": 483523200.0, + "logps/chosen": -200.96842447916666, + "logps/rejected": -453.9389953613281, + "loss": 0.1129, + "rewards/chosen": 1.9875092506408691, + "rewards/margins": 10.568737506866455, + "rewards/rejected": -8.581228256225586, + "step": 599 + }, + { + "epoch": 0.05481955230698949, + "grad_norm": 6.71875, + "kl": 0.0, + "learning_rate": 9.930856638401805e-06, + "logits/chosen": 469849856.0, + "logits/rejected": 358864032.0, + "logps/chosen": -325.547607421875, + "logps/rejected": -498.3650817871094, + "loss": 0.0512, + "rewards/chosen": 2.97481632232666, + "rewards/margins": 11.831646919250488, + "rewards/rejected": -8.856830596923828, + "step": 600 + }, + { + "epoch": 0.05491091822750114, + "grad_norm": 5.78125, + "kl": 0.0, + "learning_rate": 9.930618149359282e-06, + "logits/chosen": 598606080.0, + "logits/rejected": 953592917.3333334, + "logps/chosen": -339.32745361328125, + "logps/rejected": -569.651611328125, + "loss": 0.0342, + "rewards/chosen": 2.536059617996216, + "rewards/margins": 9.70378549893697, + "rewards/rejected": -7.167725880940755, + "step": 601 + }, + { + "epoch": 0.05500228414801279, + "grad_norm": 6.5625, + "kl": 0.0, + "learning_rate": 9.930379252600319e-06, + "logits/chosen": 309266944.0, + "logits/rejected": 741613421.7142857, + "logps/chosen": -309.9733581542969, + "logps/rejected": -394.11021205357144, + "loss": 0.0245, + "rewards/chosen": 3.1932830810546875, + "rewards/margins": 9.919425419398717, + "rewards/rejected": -6.726142338344029, + "step": 602 + }, + { + "epoch": 0.05509365006852444, + "grad_norm": 14.1875, + "kl": 0.0, + "learning_rate": 9.930139948144675e-06, + "logits/chosen": 556916565.3333334, + "logits/rejected": 422620774.4, + "logps/chosen": -256.9803466796875, + "logps/rejected": -415.3634765625, + "loss": 0.0934, + "rewards/chosen": 3.23876953125, + "rewards/margins": 10.251080322265626, + "rewards/rejected": -7.012310791015625, + "step": 603 + }, + { + "epoch": 0.055185015989036086, + "grad_norm": 8.1875, + "kl": 0.0, + "learning_rate": 9.929900236012135e-06, + "logits/chosen": 621498581.3333334, + "logits/rejected": 1181006592.0, + "logps/chosen": -312.70416259765625, + "logps/rejected": -945.499267578125, + "loss": 0.0556, + "rewards/chosen": 2.7543509801228843, + "rewards/margins": 17.083570798238117, + "rewards/rejected": -14.329219818115234, + "step": 604 + }, + { + "epoch": 0.05527638190954774, + "grad_norm": 15.75, + "kl": 0.0, + "learning_rate": 9.929660116222524e-06, + "logits/chosen": 567753984.0, + "logits/rejected": 503899520.0, + "logps/chosen": -349.3451334635417, + "logps/rejected": -353.54486083984375, + "loss": 0.0981, + "rewards/chosen": 2.281419277191162, + "rewards/margins": 8.435656070709229, + "rewards/rejected": -6.154236793518066, + "step": 605 + }, + { + "epoch": 0.05536774783005939, + "grad_norm": 2.625, + "kl": 0.0, + "learning_rate": 9.929419588795692e-06, + "logits/chosen": 547308970.6666666, + "logits/rejected": 835417548.8, + "logps/chosen": -291.7486979166667, + "logps/rejected": -317.18740234375, + "loss": 0.0185, + "rewards/chosen": 4.0228525797526045, + "rewards/margins": 9.529883829752604, + "rewards/rejected": -5.50703125, + "step": 606 + }, + { + "epoch": 0.05545911375057104, + "grad_norm": 6.90625, + "kl": 0.0, + "learning_rate": 9.929178653751534e-06, + "logits/chosen": 430352256.0, + "logits/rejected": 453175392.0, + "logps/chosen": -336.5187683105469, + "logps/rejected": -605.32763671875, + "loss": 0.0357, + "rewards/chosen": 3.3786144256591797, + "rewards/margins": 11.214422225952148, + "rewards/rejected": -7.835807800292969, + "step": 607 + }, + { + "epoch": 0.05555047967108269, + "grad_norm": 5.9375, + "kl": 0.0, + "learning_rate": 9.928937311109972e-06, + "logits/chosen": 541877589.3333334, + "logits/rejected": 487759206.4, + "logps/chosen": -343.087158203125, + "logps/rejected": -429.22607421875, + "loss": 0.0356, + "rewards/chosen": 2.490901311238607, + "rewards/margins": 8.9979554494222, + "rewards/rejected": -6.507054138183594, + "step": 608 + }, + { + "epoch": 0.05564184559159434, + "grad_norm": 3.625, + "kl": 0.0, + "learning_rate": 9.928695560890962e-06, + "logits/chosen": 448207872.0, + "logits/rejected": 489319731.2, + "logps/chosen": -279.4010823567708, + "logps/rejected": -741.786669921875, + "loss": 0.0178, + "rewards/chosen": 3.1876516342163086, + "rewards/margins": 13.339254570007324, + "rewards/rejected": -10.151602935791015, + "step": 609 + }, + { + "epoch": 0.05573321151210599, + "grad_norm": 10.0625, + "kl": 0.0, + "learning_rate": 9.928453403114494e-06, + "logits/chosen": 548277077.3333334, + "logits/rejected": 1164808960.0, + "logps/chosen": -364.4711100260417, + "logps/rejected": -754.0225830078125, + "loss": 0.0626, + "rewards/chosen": 2.7034988403320312, + "rewards/margins": 12.719226837158203, + "rewards/rejected": -10.015727996826172, + "step": 610 + }, + { + "epoch": 0.055824577432617636, + "grad_norm": 5.875, + "kl": 0.0, + "learning_rate": 9.928210837800591e-06, + "logits/chosen": 356724736.0, + "logits/rejected": 329279513.6, + "logps/chosen": -284.0992431640625, + "logps/rejected": -403.7134765625, + "loss": 0.0367, + "rewards/chosen": 2.357197125752767, + "rewards/margins": 10.486385663350424, + "rewards/rejected": -8.129188537597656, + "step": 611 + }, + { + "epoch": 0.055915943353129285, + "grad_norm": 11.8125, + "kl": 0.0, + "learning_rate": 9.927967864969314e-06, + "logits/chosen": 642327360.0, + "logits/rejected": 424446304.0, + "logps/chosen": -144.22369384765625, + "logps/rejected": -372.27642822265625, + "loss": 0.08, + "rewards/chosen": 1.8288788795471191, + "rewards/margins": 8.732994079589844, + "rewards/rejected": -6.904115200042725, + "step": 612 + }, + { + "epoch": 0.056007309273640934, + "grad_norm": 3.25, + "kl": 0.0, + "learning_rate": 9.927724484640753e-06, + "logits/chosen": 510899136.0, + "logits/rejected": 743436864.0, + "logps/chosen": -308.1838684082031, + "logps/rejected": -469.33380126953125, + "loss": 0.0161, + "rewards/chosen": 3.6595306396484375, + "rewards/margins": 11.798930168151855, + "rewards/rejected": -8.139399528503418, + "step": 613 + }, + { + "epoch": 0.056098675194152584, + "grad_norm": 5.84375, + "kl": 0.0, + "learning_rate": 9.927480696835035e-06, + "logits/chosen": 1048600405.3333334, + "logits/rejected": 576543488.0, + "logps/chosen": -308.0454508463542, + "logps/rejected": -374.3391357421875, + "loss": 0.0252, + "rewards/chosen": 3.4143454233805337, + "rewards/margins": 8.991277186075846, + "rewards/rejected": -5.576931762695312, + "step": 614 + }, + { + "epoch": 0.05619004111466423, + "grad_norm": 5.53125, + "kl": 0.0, + "learning_rate": 9.927236501572315e-06, + "logits/chosen": 291634918.4, + "logits/rejected": 450887082.6666667, + "logps/chosen": -236.8050048828125, + "logps/rejected": -530.4671223958334, + "loss": 0.0313, + "rewards/chosen": 3.3539642333984374, + "rewards/margins": 10.292002105712891, + "rewards/rejected": -6.938037872314453, + "step": 615 + }, + { + "epoch": 0.05628140703517588, + "grad_norm": 6.78125, + "kl": 0.0, + "learning_rate": 9.92699189887279e-06, + "logits/chosen": 402291200.0, + "logits/rejected": 607186636.8, + "logps/chosen": -346.2959798177083, + "logps/rejected": -482.679296875, + "loss": 0.0318, + "rewards/chosen": 2.6404927571614585, + "rewards/margins": 10.970777638753257, + "rewards/rejected": -8.330284881591798, + "step": 616 + }, + { + "epoch": 0.05637277295568753, + "grad_norm": 2.5625, + "kl": 0.0, + "learning_rate": 9.926746888756684e-06, + "logits/chosen": 345319296.0, + "logits/rejected": 748281600.0, + "logps/chosen": -246.23673502604166, + "logps/rejected": -461.6109375, + "loss": 0.0142, + "rewards/chosen": 3.7318700154622397, + "rewards/margins": 10.062063344319661, + "rewards/rejected": -6.330193328857422, + "step": 617 + }, + { + "epoch": 0.05646413887619918, + "grad_norm": 11.0625, + "kl": 0.0, + "learning_rate": 9.926501471244258e-06, + "logits/chosen": 533613920.0, + "logits/rejected": 596146368.0, + "logps/chosen": -232.4270782470703, + "logps/rejected": -448.94207763671875, + "loss": 0.0939, + "rewards/chosen": 3.3475308418273926, + "rewards/margins": 9.942127227783203, + "rewards/rejected": -6.5945963859558105, + "step": 618 + }, + { + "epoch": 0.05655550479671083, + "grad_norm": 4.75, + "kl": 0.0, + "learning_rate": 9.926255646355804e-06, + "logits/chosen": 558345898.6666666, + "logits/rejected": 531626598.4, + "logps/chosen": -275.2449137369792, + "logps/rejected": -534.32626953125, + "loss": 0.0241, + "rewards/chosen": 3.3082777659098306, + "rewards/margins": 10.792321650187175, + "rewards/rejected": -7.484043884277344, + "step": 619 + }, + { + "epoch": 0.05664687071722248, + "grad_norm": 5.78125, + "kl": 0.0, + "learning_rate": 9.926009414111652e-06, + "logits/chosen": 333836074.6666667, + "logits/rejected": 345044172.8, + "logps/chosen": -168.2794392903646, + "logps/rejected": -252.20322265625, + "loss": 0.0441, + "rewards/chosen": 2.156172275543213, + "rewards/margins": 9.918438816070557, + "rewards/rejected": -7.762266540527344, + "step": 620 + }, + { + "epoch": 0.05673823663773413, + "grad_norm": 2.203125, + "kl": 0.0, + "learning_rate": 9.925762774532162e-06, + "logits/chosen": 889093568.0, + "logits/rejected": 475168213.3333333, + "logps/chosen": -334.7366638183594, + "logps/rejected": -536.51123046875, + "loss": 0.0067, + "rewards/chosen": 3.6594223976135254, + "rewards/margins": 13.353973229726156, + "rewards/rejected": -9.69455083211263, + "step": 621 + }, + { + "epoch": 0.056829602558245776, + "grad_norm": 14.875, + "kl": 0.0, + "learning_rate": 9.925515727637729e-06, + "logits/chosen": 908927104.0, + "logits/rejected": 540311338.6666666, + "logps/chosen": -701.0166015625, + "logps/rejected": -680.640625, + "loss": 0.0471, + "rewards/chosen": 1.5380432605743408, + "rewards/margins": 15.646149237950643, + "rewards/rejected": -14.108105977376303, + "step": 622 + }, + { + "epoch": 0.056920968478757425, + "grad_norm": 7.65625, + "kl": 0.0, + "learning_rate": 9.92526827344878e-06, + "logits/chosen": 537348821.3333334, + "logits/rejected": 671013683.2, + "logps/chosen": -360.879150390625, + "logps/rejected": -774.87255859375, + "loss": 0.0277, + "rewards/chosen": 2.7760051091512046, + "rewards/margins": 14.080536969502768, + "rewards/rejected": -11.304531860351563, + "step": 623 + }, + { + "epoch": 0.057012334399269074, + "grad_norm": 8.0625, + "kl": 0.0, + "learning_rate": 9.925020411985779e-06, + "logits/chosen": 439059626.6666667, + "logits/rejected": 346603980.8, + "logps/chosen": -362.3885498046875, + "logps/rejected": -302.247265625, + "loss": 0.0394, + "rewards/chosen": 3.487706184387207, + "rewards/margins": 10.759684944152832, + "rewards/rejected": -7.271978759765625, + "step": 624 + }, + { + "epoch": 0.05710370031978072, + "grad_norm": 15.4375, + "kl": 0.0, + "learning_rate": 9.924772143269221e-06, + "logits/chosen": 388853162.6666667, + "logits/rejected": 444819814.4, + "logps/chosen": -568.6934000651041, + "logps/rejected": -460.211962890625, + "loss": 0.0536, + "rewards/chosen": 3.4762516021728516, + "rewards/margins": 9.639316940307618, + "rewards/rejected": -6.163065338134766, + "step": 625 + }, + { + "epoch": 0.05719506624029237, + "grad_norm": 10.0625, + "kl": 0.0, + "learning_rate": 9.924523467319636e-06, + "logits/chosen": 407241984.0, + "logits/rejected": 383851093.3333333, + "logps/chosen": -226.284765625, + "logps/rejected": -396.7528076171875, + "loss": 0.0569, + "rewards/chosen": 3.3513206481933593, + "rewards/margins": 13.18231430053711, + "rewards/rejected": -9.83099365234375, + "step": 626 + }, + { + "epoch": 0.05728643216080402, + "grad_norm": 12.9375, + "kl": 0.0, + "learning_rate": 9.924274384157588e-06, + "logits/chosen": 335194137.6, + "logits/rejected": 545455616.0, + "logps/chosen": -262.1041015625, + "logps/rejected": -478.36083984375, + "loss": 0.1507, + "rewards/chosen": 1.7896015167236328, + "rewards/margins": 10.190165074666341, + "rewards/rejected": -8.400563557942709, + "step": 627 + }, + { + "epoch": 0.05737779808131567, + "grad_norm": 6.75, + "kl": 0.0, + "learning_rate": 9.924024893803671e-06, + "logits/chosen": 498601152.0, + "logits/rejected": 334011456.0, + "logps/chosen": -362.4577941894531, + "logps/rejected": -400.2040100097656, + "loss": 0.033, + "rewards/chosen": 3.085383415222168, + "rewards/margins": 10.178041934967041, + "rewards/rejected": -7.092658519744873, + "step": 628 + }, + { + "epoch": 0.05746916400182732, + "grad_norm": 20.0, + "kl": 0.0, + "learning_rate": 9.92377499627852e-06, + "logits/chosen": 607958208.0, + "logits/rejected": 317430784.0, + "logps/chosen": -351.1138916015625, + "logps/rejected": -496.0704345703125, + "loss": 0.177, + "rewards/chosen": 0.8784845471382141, + "rewards/margins": 10.009080708026886, + "rewards/rejected": -9.130596160888672, + "step": 629 + }, + { + "epoch": 0.05756052992233897, + "grad_norm": 6.4375, + "kl": 0.0, + "learning_rate": 9.923524691602794e-06, + "logits/chosen": 374502688.0, + "logits/rejected": 693396906.6666666, + "logps/chosen": -188.87750244140625, + "logps/rejected": -503.961669921875, + "loss": 0.0293, + "rewards/chosen": 2.973249912261963, + "rewards/margins": 10.031983534495037, + "rewards/rejected": -7.058733622233073, + "step": 630 + }, + { + "epoch": 0.05765189584285062, + "grad_norm": 14.875, + "kl": 0.0, + "learning_rate": 9.923273979797194e-06, + "logits/chosen": 531898240.0, + "logits/rejected": 477890201.6, + "logps/chosen": -314.2298583984375, + "logps/rejected": -398.1397705078125, + "loss": 0.0789, + "rewards/chosen": 1.571804364522298, + "rewards/margins": 9.552053387959798, + "rewards/rejected": -7.9802490234375, + "step": 631 + }, + { + "epoch": 0.057743261763362266, + "grad_norm": 13.375, + "kl": 0.0, + "learning_rate": 9.923022860882453e-06, + "logits/chosen": 1701421184.0, + "logits/rejected": 556376234.6666666, + "logps/chosen": -353.5050964355469, + "logps/rejected": -340.4750569661458, + "loss": 0.0616, + "rewards/chosen": 1.9463379383087158, + "rewards/margins": 8.222219387690227, + "rewards/rejected": -6.275881449381511, + "step": 632 + }, + { + "epoch": 0.057834627683873915, + "grad_norm": 15.5, + "kl": 0.0, + "learning_rate": 9.92277133487933e-06, + "logits/chosen": 479245414.4, + "logits/rejected": 350548352.0, + "logps/chosen": -443.669921875, + "logps/rejected": -390.42724609375, + "loss": 0.1118, + "rewards/chosen": 1.728118896484375, + "rewards/margins": 8.472792943318684, + "rewards/rejected": -6.74467404683431, + "step": 633 + }, + { + "epoch": 0.057925993604385564, + "grad_norm": 8.6875, + "kl": 0.0, + "learning_rate": 9.922519401808633e-06, + "logits/chosen": 372051968.0, + "logits/rejected": 423459008.0, + "logps/chosen": -385.6274719238281, + "logps/rejected": -469.9671630859375, + "loss": 0.0528, + "rewards/chosen": 2.272304058074951, + "rewards/margins": 9.313315391540527, + "rewards/rejected": -7.041011333465576, + "step": 634 + }, + { + "epoch": 0.058017359524897213, + "grad_norm": 16.75, + "kl": 0.0, + "learning_rate": 9.922267061691186e-06, + "logits/chosen": 651167424.0, + "logits/rejected": 1022839680.0, + "logps/chosen": -420.96209716796875, + "logps/rejected": -490.447021484375, + "loss": 0.0763, + "rewards/chosen": 2.3702611923217773, + "rewards/margins": 9.391172409057617, + "rewards/rejected": -7.02091121673584, + "step": 635 + }, + { + "epoch": 0.05810872544540886, + "grad_norm": 12.5625, + "kl": 0.0, + "learning_rate": 9.922014314547861e-06, + "logits/chosen": 538534464.0, + "logits/rejected": 409013674.6666667, + "logps/chosen": -489.21246337890625, + "logps/rejected": -390.6998697916667, + "loss": 0.0635, + "rewards/chosen": 3.7678070068359375, + "rewards/margins": 9.08554967244466, + "rewards/rejected": -5.317742665608724, + "step": 636 + }, + { + "epoch": 0.05820009136592051, + "grad_norm": 5.96875, + "kl": 0.0, + "learning_rate": 9.921761160399555e-06, + "logits/chosen": 326325536.0, + "logits/rejected": 372624896.0, + "logps/chosen": -174.7781982421875, + "logps/rejected": -469.3368733723958, + "loss": 0.0262, + "rewards/chosen": 3.387279510498047, + "rewards/margins": 11.593949635823568, + "rewards/rejected": -8.206670125325521, + "step": 637 + }, + { + "epoch": 0.05829145728643216, + "grad_norm": 9.3125, + "kl": 0.0, + "learning_rate": 9.921507599267202e-06, + "logits/chosen": 711246634.6666666, + "logits/rejected": 1124225740.8, + "logps/chosen": -311.9006754557292, + "logps/rejected": -603.27919921875, + "loss": 0.0596, + "rewards/chosen": 3.341750462849935, + "rewards/margins": 11.216871770222982, + "rewards/rejected": -7.875121307373047, + "step": 638 + }, + { + "epoch": 0.05838282320694381, + "grad_norm": 2.359375, + "kl": 0.0, + "learning_rate": 9.921253631171768e-06, + "logits/chosen": 215603600.0, + "logits/rejected": 351504896.0, + "logps/chosen": -197.88992309570312, + "logps/rejected": -450.0750325520833, + "loss": 0.0088, + "rewards/chosen": 4.352782249450684, + "rewards/margins": 12.166414578755695, + "rewards/rejected": -7.813632329305013, + "step": 639 + }, + { + "epoch": 0.05847418912745546, + "grad_norm": 14.625, + "kl": 0.0, + "learning_rate": 9.920999256134257e-06, + "logits/chosen": 660971605.3333334, + "logits/rejected": 450182860.8, + "logps/chosen": -236.001953125, + "logps/rejected": -385.1146240234375, + "loss": 0.2049, + "rewards/chosen": 1.3847427368164062, + "rewards/margins": 5.587279891967773, + "rewards/rejected": -4.202537155151367, + "step": 640 + }, + { + "epoch": 0.05856555504796711, + "grad_norm": 9.5, + "kl": 0.0, + "learning_rate": 9.920744474175702e-06, + "logits/chosen": 386364896.0, + "logits/rejected": 291574720.0, + "logps/chosen": -245.7909698486328, + "logps/rejected": -327.03955078125, + "loss": 0.0728, + "rewards/chosen": 2.7930071353912354, + "rewards/margins": 8.840576887130737, + "rewards/rejected": -6.047569751739502, + "step": 641 + }, + { + "epoch": 0.05865692096847876, + "grad_norm": 9.9375, + "kl": 0.0, + "learning_rate": 9.920489285317169e-06, + "logits/chosen": 578157248.0, + "logits/rejected": 568145344.0, + "logps/chosen": -291.0279541015625, + "logps/rejected": -496.92633056640625, + "loss": 0.0583, + "rewards/chosen": 2.4064295291900635, + "rewards/margins": 9.57895302772522, + "rewards/rejected": -7.172523498535156, + "step": 642 + }, + { + "epoch": 0.058748286888990406, + "grad_norm": 9.5, + "kl": 0.0, + "learning_rate": 9.920233689579762e-06, + "logits/chosen": 566589525.3333334, + "logits/rejected": 652069504.0, + "logps/chosen": -297.74853515625, + "logps/rejected": -589.3532104492188, + "loss": 0.0614, + "rewards/chosen": 3.1267484029134116, + "rewards/margins": 11.14174779256185, + "rewards/rejected": -8.014999389648438, + "step": 643 + }, + { + "epoch": 0.058839652809502055, + "grad_norm": 10.125, + "kl": 0.0, + "learning_rate": 9.919977686984617e-06, + "logits/chosen": 316603648.0, + "logits/rejected": 291512064.0, + "logps/chosen": -247.8268025716146, + "logps/rejected": -294.41953125, + "loss": 0.073, + "rewards/chosen": 2.6261582374572754, + "rewards/margins": 8.593344974517823, + "rewards/rejected": -5.967186737060547, + "step": 644 + }, + { + "epoch": 0.058931018730013704, + "grad_norm": 1.3828125, + "kl": 0.0, + "learning_rate": 9.9197212775529e-06, + "logits/chosen": 264463616.0, + "logits/rejected": 382476117.3333333, + "logps/chosen": -375.8983154296875, + "logps/rejected": -599.4410807291666, + "loss": 0.0047, + "rewards/chosen": 4.083224296569824, + "rewards/margins": 13.112274487813314, + "rewards/rejected": -9.02905019124349, + "step": 645 + }, + { + "epoch": 0.05902238465052535, + "grad_norm": 19.0, + "kl": 0.0, + "learning_rate": 9.919464461305817e-06, + "logits/chosen": 637988352.0, + "logits/rejected": 673653930.6666666, + "logps/chosen": -291.313232421875, + "logps/rejected": -869.2264811197916, + "loss": 0.1379, + "rewards/chosen": 1.6107858657836913, + "rewards/margins": 10.339717292785645, + "rewards/rejected": -8.728931427001953, + "step": 646 + }, + { + "epoch": 0.059113750571037, + "grad_norm": 6.0625, + "kl": 0.0, + "learning_rate": 9.919207238264602e-06, + "logits/chosen": 364082380.8, + "logits/rejected": 402621866.6666667, + "logps/chosen": -263.9790283203125, + "logps/rejected": -298.8336995442708, + "loss": 0.0335, + "rewards/chosen": 3.3963272094726564, + "rewards/margins": 8.660767428080241, + "rewards/rejected": -5.264440218607585, + "step": 647 + }, + { + "epoch": 0.05920511649154865, + "grad_norm": 9.8125, + "kl": 0.0, + "learning_rate": 9.918949608450527e-06, + "logits/chosen": 1067263283.2, + "logits/rejected": 595142400.0, + "logps/chosen": -340.0867431640625, + "logps/rejected": -545.9640299479166, + "loss": 0.0635, + "rewards/chosen": 2.986616516113281, + "rewards/margins": 8.684949620564778, + "rewards/rejected": -5.698333104451497, + "step": 648 + }, + { + "epoch": 0.0592964824120603, + "grad_norm": 17.125, + "kl": 0.0, + "learning_rate": 9.918691571884893e-06, + "logits/chosen": 683369216.0, + "logits/rejected": 465795797.3333333, + "logps/chosen": -461.47685546875, + "logps/rejected": -347.6113688151042, + "loss": 0.2056, + "rewards/chosen": 1.2001531600952149, + "rewards/margins": 5.906753985087077, + "rewards/rejected": -4.706600824991862, + "step": 649 + }, + { + "epoch": 0.05938784833257195, + "grad_norm": 12.0, + "kl": 0.0, + "learning_rate": 9.91843312858904e-06, + "logits/chosen": 551054745.6, + "logits/rejected": 777785002.6666666, + "logps/chosen": -230.32265625, + "logps/rejected": -546.112060546875, + "loss": 0.0888, + "rewards/chosen": 2.0897804260253907, + "rewards/margins": 6.878504053751628, + "rewards/rejected": -4.788723627726237, + "step": 650 + }, + { + "epoch": 0.0594792142530836, + "grad_norm": 13.625, + "kl": 0.0, + "learning_rate": 9.918174278584335e-06, + "logits/chosen": 786338816.0, + "logits/rejected": 835661824.0, + "logps/chosen": -438.69488525390625, + "logps/rejected": -457.8231608072917, + "loss": 0.0953, + "rewards/chosen": 2.3091249465942383, + "rewards/margins": 8.215252876281738, + "rewards/rejected": -5.9061279296875, + "step": 651 + }, + { + "epoch": 0.05957058017359525, + "grad_norm": 8.4375, + "kl": 0.0, + "learning_rate": 9.917915021892188e-06, + "logits/chosen": 720675968.0, + "logits/rejected": 369490304.0, + "logps/chosen": -544.6458129882812, + "logps/rejected": -319.58917236328125, + "loss": 0.0421, + "rewards/chosen": 2.8097596168518066, + "rewards/margins": 8.42663288116455, + "rewards/rejected": -5.616873264312744, + "step": 652 + }, + { + "epoch": 0.059661946094106896, + "grad_norm": 12.0625, + "kl": 0.0, + "learning_rate": 9.917655358534034e-06, + "logits/chosen": 543526860.8, + "logits/rejected": 240567850.66666666, + "logps/chosen": -421.89716796875, + "logps/rejected": -310.4496256510417, + "loss": 0.0866, + "rewards/chosen": 1.9192453384399415, + "rewards/margins": 7.573432223002117, + "rewards/rejected": -5.654186884562175, + "step": 653 + }, + { + "epoch": 0.059753312014618545, + "grad_norm": 0.74609375, + "kl": 0.0, + "learning_rate": 9.917395288531344e-06, + "logits/rejected": 575321088.0, + "logps/rejected": -507.6059875488281, + "loss": 0.0028, + "rewards/rejected": -7.061880111694336, + "step": 654 + }, + { + "epoch": 0.059844677935130194, + "grad_norm": 17.625, + "kl": 0.0, + "learning_rate": 9.917134811905624e-06, + "logits/chosen": 524940544.0, + "logits/rejected": 283314560.0, + "logps/chosen": -297.31785074869794, + "logps/rejected": -392.2487487792969, + "loss": 0.1527, + "rewards/chosen": 1.6973870595296223, + "rewards/margins": 9.812939961751303, + "rewards/rejected": -8.11555290222168, + "step": 655 + }, + { + "epoch": 0.05993604385564184, + "grad_norm": 14.4375, + "kl": 0.0, + "learning_rate": 9.916873928678415e-06, + "logits/chosen": 1004489920.0, + "logits/rejected": 1056194112.0, + "logps/chosen": -413.37628173828125, + "logps/rejected": -899.6900024414062, + "loss": 0.0863, + "rewards/chosen": 2.636582612991333, + "rewards/margins": 11.000723123550415, + "rewards/rejected": -8.364140510559082, + "step": 656 + }, + { + "epoch": 0.06002740977615349, + "grad_norm": 11.875, + "kl": 0.0, + "learning_rate": 9.916612638871286e-06, + "logits/chosen": 592992256.0, + "logits/rejected": 1099284992.0, + "logps/chosen": -394.7965576171875, + "logps/rejected": -686.1143391927084, + "loss": 0.0589, + "rewards/chosen": 2.6122453689575194, + "rewards/margins": 9.811354128519694, + "rewards/rejected": -7.199108759562175, + "step": 657 + }, + { + "epoch": 0.06011877569666514, + "grad_norm": 5.875, + "kl": 0.0, + "learning_rate": 9.916350942505846e-06, + "logits/chosen": 517938380.8, + "logits/rejected": 411410773.3333333, + "logps/chosen": -431.2052734375, + "logps/rejected": -476.5565592447917, + "loss": 0.0383, + "rewards/chosen": 3.0993169784545898, + "rewards/margins": 11.098260307312012, + "rewards/rejected": -7.998943328857422, + "step": 658 + }, + { + "epoch": 0.06021014161717679, + "grad_norm": 9.625, + "kl": 0.0, + "learning_rate": 9.916088839603735e-06, + "logits/chosen": 417927765.3333333, + "logits/rejected": 437505568.0, + "logps/chosen": -301.0039469401042, + "logps/rejected": -520.909912109375, + "loss": 0.1677, + "rewards/chosen": 2.155307133992513, + "rewards/margins": 8.341960271199545, + "rewards/rejected": -6.186653137207031, + "step": 659 + }, + { + "epoch": 0.06030150753768844, + "grad_norm": 5.3125, + "kl": 0.0, + "learning_rate": 9.915826330186625e-06, + "logits/chosen": 879925043.2, + "logits/rejected": 412916693.3333333, + "logps/chosen": -427.44091796875, + "logps/rejected": -315.1320393880208, + "loss": 0.0331, + "rewards/chosen": 3.155182647705078, + "rewards/margins": 8.799083201090495, + "rewards/rejected": -5.643900553385417, + "step": 660 + }, + { + "epoch": 0.06039287345820009, + "grad_norm": 12.75, + "kl": 0.0, + "learning_rate": 9.915563414276221e-06, + "logits/chosen": 797603157.3333334, + "logits/rejected": 611977011.2, + "logps/chosen": -454.3738606770833, + "logps/rejected": -624.42216796875, + "loss": 0.065, + "rewards/chosen": 2.2649313608805337, + "rewards/margins": 8.005051294962565, + "rewards/rejected": -5.740119934082031, + "step": 661 + }, + { + "epoch": 0.06048423937871174, + "grad_norm": 4.125, + "kl": 0.0, + "learning_rate": 9.915300091894269e-06, + "logits/chosen": 456432213.3333333, + "logits/rejected": 531028377.6, + "logps/chosen": -358.5531005859375, + "logps/rejected": -517.61318359375, + "loss": 0.0181, + "rewards/chosen": 3.4063294728597007, + "rewards/margins": 12.14134610493978, + "rewards/rejected": -8.735016632080079, + "step": 662 + }, + { + "epoch": 0.06057560529922339, + "grad_norm": 4.59375, + "kl": 0.0, + "learning_rate": 9.91503636306254e-06, + "logits/chosen": 1028403840.0, + "logits/rejected": 437311584.0, + "logps/chosen": -401.49005126953125, + "logps/rejected": -337.4927978515625, + "loss": 0.0255, + "rewards/chosen": 3.393754482269287, + "rewards/margins": 9.13268518447876, + "rewards/rejected": -5.738930702209473, + "step": 663 + }, + { + "epoch": 0.060666971219735036, + "grad_norm": 12.1875, + "kl": 0.0, + "learning_rate": 9.914772227802844e-06, + "logits/chosen": 421178965.3333333, + "logits/rejected": 858362163.2, + "logps/chosen": -250.59248860677084, + "logps/rejected": -518.656787109375, + "loss": 0.0359, + "rewards/chosen": 3.280961354573568, + "rewards/margins": 11.941155751546225, + "rewards/rejected": -8.660194396972656, + "step": 664 + }, + { + "epoch": 0.060758337140246685, + "grad_norm": 13.0, + "kl": 0.0, + "learning_rate": 9.91450768613702e-06, + "logits/chosen": 1266830540.8, + "logits/rejected": 634003882.6666666, + "logps/chosen": -666.23427734375, + "logps/rejected": -264.6920979817708, + "loss": 0.0834, + "rewards/chosen": 1.947645378112793, + "rewards/margins": 7.950511487325032, + "rewards/rejected": -6.002866109212239, + "step": 665 + }, + { + "epoch": 0.060849703060758334, + "grad_norm": 18.375, + "kl": 0.0, + "learning_rate": 9.914242738086943e-06, + "logits/chosen": 647309440.0, + "logits/rejected": 456141013.3333333, + "logps/chosen": -420.33050537109375, + "logps/rejected": -420.9062093098958, + "loss": 0.0992, + "rewards/chosen": 2.311692714691162, + "rewards/margins": 8.519830862681072, + "rewards/rejected": -6.208138147989909, + "step": 666 + }, + { + "epoch": 0.06094106898126998, + "grad_norm": 9.0625, + "kl": 0.0, + "learning_rate": 9.913977383674525e-06, + "logits/chosen": 684653772.8, + "logits/rejected": 330562581.3333333, + "logps/chosen": -346.822216796875, + "logps/rejected": -328.45713297526044, + "loss": 0.056, + "rewards/chosen": 2.946837043762207, + "rewards/margins": 9.772454516092937, + "rewards/rejected": -6.8256174723307295, + "step": 667 + }, + { + "epoch": 0.06103243490178163, + "grad_norm": 5.34375, + "kl": 0.0, + "learning_rate": 9.913711622921705e-06, + "logits/chosen": 140337072.0, + "logits/rejected": 426182290.28571427, + "logps/chosen": -73.6806411743164, + "logps/rejected": -449.60616629464283, + "loss": 0.0249, + "rewards/chosen": 2.550649404525757, + "rewards/margins": 8.444139787128993, + "rewards/rejected": -5.893490382603237, + "step": 668 + }, + { + "epoch": 0.06112380082229329, + "grad_norm": 8.125, + "kl": 2.138683319091797, + "learning_rate": 9.913445455850461e-06, + "logits/chosen": 399436774.4, + "logits/rejected": 485699669.3333333, + "logps/chosen": -272.891357421875, + "logps/rejected": -412.34814453125, + "loss": 0.0593, + "rewards/chosen": 2.6329414367675783, + "rewards/margins": 10.145214589436849, + "rewards/rejected": -7.5122731526692705, + "step": 669 + }, + { + "epoch": 0.06121516674280494, + "grad_norm": 1.4296875, + "kl": 0.0, + "learning_rate": 9.913178882482802e-06, + "logits/chosen": 308613504.0, + "logits/rejected": 434135244.8, + "logps/chosen": -330.424560546875, + "logps/rejected": -672.6755859375, + "loss": 0.0063, + "rewards/chosen": 4.585867563883464, + "rewards/margins": 13.26461359659831, + "rewards/rejected": -8.678746032714844, + "step": 670 + }, + { + "epoch": 0.061306532663316586, + "grad_norm": 4.59375, + "kl": 0.0, + "learning_rate": 9.912911902840771e-06, + "logits/chosen": 1290089472.0, + "logits/rejected": 509055712.0, + "logps/chosen": -593.0220336914062, + "logps/rejected": -371.26708984375, + "loss": 0.0254, + "rewards/chosen": 3.216135025024414, + "rewards/margins": 11.091098308563232, + "rewards/rejected": -7.874963283538818, + "step": 671 + }, + { + "epoch": 0.061397898583828235, + "grad_norm": 13.9375, + "kl": 0.0, + "learning_rate": 9.912644516946445e-06, + "logits/chosen": 559366758.4, + "logits/rejected": 593462272.0, + "logps/chosen": -363.696337890625, + "logps/rejected": -434.3091634114583, + "loss": 0.0935, + "rewards/chosen": 2.540480613708496, + "rewards/margins": 9.641177813212078, + "rewards/rejected": -7.100697199503581, + "step": 672 + }, + { + "epoch": 0.061489264504339884, + "grad_norm": 8.5, + "kl": 0.0, + "learning_rate": 9.912376724821934e-06, + "logits/chosen": 891685888.0, + "logits/rejected": 503195104.0, + "logps/chosen": -431.4404296875, + "logps/rejected": -665.9742431640625, + "loss": 0.0636, + "rewards/chosen": 2.0443618297576904, + "rewards/margins": 13.944090604782104, + "rewards/rejected": -11.899728775024414, + "step": 673 + }, + { + "epoch": 0.06158063042485153, + "grad_norm": 10.25, + "kl": 0.0, + "learning_rate": 9.912108526489381e-06, + "logits/chosen": 768004096.0, + "logits/rejected": 1043657728.0, + "logps/chosen": -238.6099650065104, + "logps/rejected": -718.38603515625, + "loss": 0.0461, + "rewards/chosen": 2.126192251841227, + "rewards/margins": 12.721907583872477, + "rewards/rejected": -10.59571533203125, + "step": 674 + }, + { + "epoch": 0.06167199634536318, + "grad_norm": 7.6875, + "kl": 0.0, + "learning_rate": 9.911839921970966e-06, + "logits/chosen": 546175232.0, + "logits/rejected": 250256448.0, + "logps/chosen": -252.20638602120536, + "logps/rejected": -273.977294921875, + "loss": 0.0573, + "rewards/chosen": 2.9747633252825056, + "rewards/margins": 8.408268860408239, + "rewards/rejected": -5.433505535125732, + "step": 675 + }, + { + "epoch": 0.06176336226587483, + "grad_norm": 8.6875, + "kl": 0.0, + "learning_rate": 9.911570911288897e-06, + "logits/chosen": 492509408.0, + "logits/rejected": 435783296.0, + "logps/chosen": -381.74163818359375, + "logps/rejected": -490.310302734375, + "loss": 0.044, + "rewards/chosen": 3.025404453277588, + "rewards/margins": 11.571374416351318, + "rewards/rejected": -8.54596996307373, + "step": 676 + }, + { + "epoch": 0.06185472818638648, + "grad_norm": 3.03125, + "kl": 0.0, + "learning_rate": 9.911301494465423e-06, + "logits/chosen": 538662442.6666666, + "logits/rejected": 616025446.4, + "logps/chosen": -384.1588134765625, + "logps/rejected": -457.731689453125, + "loss": 0.0142, + "rewards/chosen": 3.7617082595825195, + "rewards/margins": 10.206958198547364, + "rewards/rejected": -6.445249938964844, + "step": 677 + }, + { + "epoch": 0.06194609410689813, + "grad_norm": 4.65625, + "kl": 0.0, + "learning_rate": 9.911031671522817e-06, + "logits/chosen": 333200469.3333333, + "logits/rejected": 277988966.4, + "logps/chosen": -277.2932942708333, + "logps/rejected": -333.2484375, + "loss": 0.0226, + "rewards/chosen": 3.3029276529947915, + "rewards/margins": 10.099887339274089, + "rewards/rejected": -6.796959686279297, + "step": 678 + }, + { + "epoch": 0.06203746002740978, + "grad_norm": 3.921875, + "kl": 0.0, + "learning_rate": 9.910761442483395e-06, + "logits/chosen": 442161152.0, + "logits/rejected": 571052629.3333334, + "logps/chosen": -365.073681640625, + "logps/rejected": -384.7154947916667, + "loss": 0.0258, + "rewards/chosen": 3.583001708984375, + "rewards/margins": 10.562503433227539, + "rewards/rejected": -6.979501724243164, + "step": 679 + }, + { + "epoch": 0.06212882594792143, + "grad_norm": 7.15625, + "kl": 0.0, + "learning_rate": 9.9104908073695e-06, + "logits/chosen": 363604384.0, + "logits/rejected": 429561600.0, + "logps/chosen": -175.10186767578125, + "logps/rejected": -661.9259643554688, + "loss": 0.0374, + "rewards/chosen": 2.7743401527404785, + "rewards/margins": 14.087660312652588, + "rewards/rejected": -11.31332015991211, + "step": 680 + }, + { + "epoch": 0.06222019186843308, + "grad_norm": 19.0, + "kl": 0.0, + "learning_rate": 9.910219766203513e-06, + "logits/chosen": 849451648.0, + "logits/rejected": 1123576320.0, + "logps/chosen": -349.13116455078125, + "logps/rejected": -735.05126953125, + "loss": 0.0752, + "rewards/chosen": 1.4711045026779175, + "rewards/margins": 12.48280926545461, + "rewards/rejected": -11.011704762776693, + "step": 681 + }, + { + "epoch": 0.062311557788944726, + "grad_norm": 14.1875, + "kl": 0.0, + "learning_rate": 9.909948319007845e-06, + "logits/chosen": 410552661.3333333, + "logits/rejected": 342794956.8, + "logps/chosen": -235.5733642578125, + "logps/rejected": -445.213671875, + "loss": 0.0673, + "rewards/chosen": 2.1216891606648765, + "rewards/margins": 9.769113477071127, + "rewards/rejected": -7.64742431640625, + "step": 682 + }, + { + "epoch": 0.062402923709456375, + "grad_norm": 5.4375, + "kl": 0.0, + "learning_rate": 9.909676465804941e-06, + "logits/chosen": 479000832.0, + "logits/rejected": 450928384.0, + "logps/chosen": -258.10992431640625, + "logps/rejected": -338.57763671875, + "loss": 0.0326, + "rewards/chosen": 3.2913644313812256, + "rewards/margins": 10.255972146987915, + "rewards/rejected": -6.9646077156066895, + "step": 683 + }, + { + "epoch": 0.062494289629968024, + "grad_norm": 6.875, + "kl": 0.0, + "learning_rate": 9.909404206617285e-06, + "logits/chosen": 425071317.3333333, + "logits/rejected": 747542323.2, + "logps/chosen": -321.27357991536456, + "logps/rejected": -545.808984375, + "loss": 0.0328, + "rewards/chosen": 2.4878819783528647, + "rewards/margins": 10.910321553548178, + "rewards/rejected": -8.422439575195312, + "step": 684 + }, + { + "epoch": 0.06258565555047967, + "grad_norm": 7.15625, + "kl": 0.0, + "learning_rate": 9.909131541467387e-06, + "logits/chosen": 450185664.0, + "logits/rejected": 438284000.0, + "logps/chosen": -326.12677001953125, + "logps/rejected": -360.30029296875, + "loss": 0.043, + "rewards/chosen": 3.3579511642456055, + "rewards/margins": 6.86195969581604, + "rewards/rejected": -3.5040085315704346, + "step": 685 + }, + { + "epoch": 0.06267702147099131, + "grad_norm": 4.375, + "kl": 0.0, + "learning_rate": 9.908858470377793e-06, + "logits/chosen": 449545728.0, + "logits/rejected": 466391381.3333333, + "logps/chosen": -273.800537109375, + "logps/rejected": -496.2377115885417, + "loss": 0.0273, + "rewards/chosen": 3.569538879394531, + "rewards/margins": 8.165515391031901, + "rewards/rejected": -4.59597651163737, + "step": 686 + }, + { + "epoch": 0.06276838739150296, + "grad_norm": 9.0625, + "kl": 0.0, + "learning_rate": 9.908584993371087e-06, + "logits/chosen": 751046997.3333334, + "logits/rejected": 585309952.0, + "logps/chosen": -329.6938069661458, + "logps/rejected": -293.0544738769531, + "loss": 0.0663, + "rewards/chosen": 2.530463218688965, + "rewards/margins": 6.4133124351501465, + "rewards/rejected": -3.8828492164611816, + "step": 687 + }, + { + "epoch": 0.06285975331201461, + "grad_norm": 4.375, + "kl": 0.0, + "learning_rate": 9.908311110469881e-06, + "logits/chosen": 522443520.0, + "logits/rejected": 544056524.8, + "logps/chosen": -292.1033528645833, + "logps/rejected": -767.21611328125, + "loss": 0.0179, + "rewards/chosen": 3.769901911417643, + "rewards/margins": 11.982654444376626, + "rewards/rejected": -8.212752532958984, + "step": 688 + }, + { + "epoch": 0.06295111923252626, + "grad_norm": 5.40625, + "kl": 0.0, + "learning_rate": 9.908036821696822e-06, + "logits/chosen": 518997708.8, + "logits/rejected": 616749738.6666666, + "logps/chosen": -366.95966796875, + "logps/rejected": -451.0194498697917, + "loss": 0.0264, + "rewards/chosen": 3.2597938537597657, + "rewards/margins": 9.857848103841146, + "rewards/rejected": -6.59805425008138, + "step": 689 + }, + { + "epoch": 0.06304248515303791, + "grad_norm": 8.25, + "kl": 0.0, + "learning_rate": 9.907762127074591e-06, + "logits/chosen": 451286272.0, + "logits/rejected": 704104490.6666666, + "logps/chosen": -381.0294677734375, + "logps/rejected": -575.6988118489584, + "loss": 0.0602, + "rewards/chosen": 2.7085575103759765, + "rewards/margins": 8.232826614379883, + "rewards/rejected": -5.524269104003906, + "step": 690 + }, + { + "epoch": 0.06313385107354956, + "grad_norm": 16.75, + "kl": 0.0, + "learning_rate": 9.907487026625905e-06, + "logits/chosen": 439374912.0, + "logits/rejected": 630532352.0, + "logps/chosen": -293.0465087890625, + "logps/rejected": -384.300537109375, + "loss": 0.0727, + "rewards/chosen": 2.6353909969329834, + "rewards/margins": 8.793423891067505, + "rewards/rejected": -6.1580328941345215, + "step": 691 + }, + { + "epoch": 0.06322521699406121, + "grad_norm": 6.53125, + "kl": 0.0, + "learning_rate": 9.907211520373513e-06, + "logits/chosen": 389444224.0, + "logits/rejected": 519647744.0, + "logps/chosen": -322.27337646484375, + "logps/rejected": -392.8593444824219, + "loss": 0.0355, + "rewards/chosen": 3.105424404144287, + "rewards/margins": 8.95010757446289, + "rewards/rejected": -5.8446831703186035, + "step": 692 + }, + { + "epoch": 0.06331658291457286, + "grad_norm": 4.875, + "kl": 0.0, + "learning_rate": 9.90693560834019e-06, + "logits/chosen": 711140480.0, + "logits/rejected": 410440550.4, + "logps/chosen": -506.5743408203125, + "logps/rejected": -389.91201171875, + "loss": 0.0239, + "rewards/chosen": 3.1158469518025718, + "rewards/margins": 10.192903836568197, + "rewards/rejected": -7.077056884765625, + "step": 693 + }, + { + "epoch": 0.06340794883508451, + "grad_norm": 11.8125, + "kl": 0.0, + "learning_rate": 9.906659290548758e-06, + "logits/chosen": 423690598.4, + "logits/rejected": 327923178.6666667, + "logps/chosen": -350.9244873046875, + "logps/rejected": -342.3679606119792, + "loss": 0.0785, + "rewards/chosen": 2.9057697296142577, + "rewards/margins": 7.771768951416016, + "rewards/rejected": -4.865999221801758, + "step": 694 + }, + { + "epoch": 0.06349931475559616, + "grad_norm": 8.25, + "kl": 0.0, + "learning_rate": 9.906382567022064e-06, + "logits/chosen": 578574528.0, + "logits/rejected": 668257536.0, + "logps/chosen": -289.53125, + "logps/rejected": -388.5215759277344, + "loss": 0.1026, + "rewards/chosen": 2.8790884017944336, + "rewards/margins": 7.927082538604736, + "rewards/rejected": -5.047994136810303, + "step": 695 + }, + { + "epoch": 0.0635906806761078, + "grad_norm": 4.3125, + "kl": 0.0, + "learning_rate": 9.90610543778299e-06, + "logits/chosen": 1476275968.0, + "logits/rejected": 629907602.2857143, + "logps/chosen": -156.3846893310547, + "logps/rejected": -524.021240234375, + "loss": 0.0185, + "rewards/chosen": 2.9248063564300537, + "rewards/margins": 10.317080395562307, + "rewards/rejected": -7.392274039132254, + "step": 696 + }, + { + "epoch": 0.06368204659661945, + "grad_norm": 5.625, + "kl": 0.0, + "learning_rate": 9.905827902854453e-06, + "logits/chosen": 1119559065.6, + "logits/rejected": 663965994.6666666, + "logps/chosen": -201.88916015625, + "logps/rejected": -480.3154296875, + "loss": 0.0421, + "rewards/chosen": 3.1338623046875, + "rewards/margins": 9.709777323404948, + "rewards/rejected": -6.575915018717448, + "step": 697 + }, + { + "epoch": 0.0637734125171311, + "grad_norm": 10.6875, + "kl": 0.0, + "learning_rate": 9.9055499622594e-06, + "logits/chosen": 718443520.0, + "logits/rejected": 768429312.0, + "logps/chosen": -640.6828206380209, + "logps/rejected": -464.815625, + "loss": 0.0382, + "rewards/chosen": 2.8518190383911133, + "rewards/margins": 9.475715827941894, + "rewards/rejected": -6.623896789550781, + "step": 698 + }, + { + "epoch": 0.06386477843764275, + "grad_norm": 4.6875, + "kl": 0.0, + "learning_rate": 9.905271616020819e-06, + "logits/chosen": 226732108.8, + "logits/rejected": 452561408.0, + "logps/chosen": -194.4846435546875, + "logps/rejected": -491.5786946614583, + "loss": 0.0297, + "rewards/chosen": 3.626689910888672, + "rewards/margins": 11.335273742675781, + "rewards/rejected": -7.708583831787109, + "step": 699 + }, + { + "epoch": 0.06395614435815442, + "grad_norm": 5.03125, + "kl": 0.0, + "learning_rate": 9.904992864161723e-06, + "logits/chosen": 543865941.3333334, + "logits/rejected": 469792972.8, + "logps/chosen": -507.1199951171875, + "logps/rejected": -461.04189453125, + "loss": 0.0234, + "rewards/chosen": 2.837026913960775, + "rewards/margins": 11.322158749898275, + "rewards/rejected": -8.4851318359375, + "step": 700 + }, + { + "epoch": 0.06404751027866606, + "grad_norm": 6.28125, + "kl": 0.0, + "learning_rate": 9.90471370670516e-06, + "logits/chosen": 443428906.6666667, + "logits/rejected": 601934464.0, + "logps/chosen": -293.8730061848958, + "logps/rejected": -406.03912353515625, + "loss": 0.0456, + "rewards/chosen": 2.961076100667318, + "rewards/margins": 10.59720547993978, + "rewards/rejected": -7.636129379272461, + "step": 701 + }, + { + "epoch": 0.06413887619917771, + "grad_norm": 7.4375, + "kl": 0.0, + "learning_rate": 9.904434143674219e-06, + "logits/chosen": 639753792.0, + "logits/rejected": 884486912.0, + "logps/chosen": -238.56312561035156, + "logps/rejected": -494.5633138020833, + "loss": 0.0448, + "rewards/chosen": 2.2283830642700195, + "rewards/margins": 10.068008740743, + "rewards/rejected": -7.8396256764729815, + "step": 702 + }, + { + "epoch": 0.06423024211968936, + "grad_norm": 8.9375, + "kl": 0.0, + "learning_rate": 9.904154175092015e-06, + "logits/chosen": 403200000.0, + "logits/rejected": 283982805.3333333, + "logps/chosen": -272.071728515625, + "logps/rejected": -222.53409830729166, + "loss": 0.0978, + "rewards/chosen": 2.7496610641479493, + "rewards/margins": 6.388222312927246, + "rewards/rejected": -3.638561248779297, + "step": 703 + }, + { + "epoch": 0.06432160804020101, + "grad_norm": 11.5625, + "kl": 0.0, + "learning_rate": 9.903873800981698e-06, + "logits/chosen": 360295910.4, + "logits/rejected": 339408362.6666667, + "logps/chosen": -298.354833984375, + "logps/rejected": -546.3574625651041, + "loss": 0.0765, + "rewards/chosen": 2.8392749786376954, + "rewards/margins": 10.010676956176757, + "rewards/rejected": -7.1714019775390625, + "step": 704 + }, + { + "epoch": 0.06441297396071266, + "grad_norm": 9.8125, + "kl": 0.0, + "learning_rate": 9.903593021366454e-06, + "logits/chosen": 557726528.0, + "logits/rejected": 753748650.6666666, + "logps/chosen": -288.2623291015625, + "logps/rejected": -422.07861328125, + "loss": 0.1263, + "rewards/chosen": 2.953260898590088, + "rewards/margins": 7.399154504140218, + "rewards/rejected": -4.44589360555013, + "step": 705 + }, + { + "epoch": 0.06450433988122431, + "grad_norm": 11.875, + "kl": 0.0, + "learning_rate": 9.903311836269499e-06, + "logits/chosen": 450790912.0, + "logits/rejected": 656766848.0, + "logps/chosen": -297.4107666015625, + "logps/rejected": -602.279541015625, + "loss": 0.1288, + "rewards/chosen": 2.092452621459961, + "rewards/margins": 7.594710922241211, + "rewards/rejected": -5.50225830078125, + "step": 706 + }, + { + "epoch": 0.06459570580173596, + "grad_norm": 8.6875, + "kl": 0.0, + "learning_rate": 9.903030245714085e-06, + "logits/chosen": 438214848.0, + "logits/rejected": 530959232.0, + "logps/chosen": -326.40673828125, + "logps/rejected": -309.4158020019531, + "loss": 0.041, + "rewards/chosen": 2.948089599609375, + "rewards/margins": 9.590768814086914, + "rewards/rejected": -6.642679214477539, + "step": 707 + }, + { + "epoch": 0.06468707172224761, + "grad_norm": 5.90625, + "kl": 0.0, + "learning_rate": 9.9027482497235e-06, + "logits/chosen": 622006080.0, + "logits/rejected": 712736182.8571428, + "logps/chosen": -302.78564453125, + "logps/rejected": -447.5529087611607, + "loss": 0.027, + "rewards/chosen": 1.8088501691818237, + "rewards/margins": 7.407030667577471, + "rewards/rejected": -5.598180498395648, + "step": 708 + }, + { + "epoch": 0.06477843764275926, + "grad_norm": 14.25, + "kl": 0.0, + "learning_rate": 9.902465848321057e-06, + "logits/chosen": 1101084928.0, + "logits/rejected": 411020160.0, + "logps/chosen": -345.04864501953125, + "logps/rejected": -410.83837890625, + "loss": 0.0961, + "rewards/chosen": 1.7811856269836426, + "rewards/margins": 9.589942455291748, + "rewards/rejected": -7.8087568283081055, + "step": 709 + }, + { + "epoch": 0.0648698035632709, + "grad_norm": 19.75, + "kl": 0.0, + "learning_rate": 9.902183041530112e-06, + "logits/chosen": 468638003.2, + "logits/rejected": 457141077.3333333, + "logps/chosen": -236.312890625, + "logps/rejected": -444.0064697265625, + "loss": 0.1129, + "rewards/chosen": 1.8831796646118164, + "rewards/margins": 10.056807518005371, + "rewards/rejected": -8.173627853393555, + "step": 710 + }, + { + "epoch": 0.06496116948378255, + "grad_norm": 4.5, + "kl": 0.0, + "learning_rate": 9.901899829374048e-06, + "logits/chosen": 360720358.4, + "logits/rejected": 562512725.3333334, + "logps/chosen": -414.17587890625, + "logps/rejected": -642.8921712239584, + "loss": 0.0245, + "rewards/chosen": 3.453733444213867, + "rewards/margins": 10.459395726521809, + "rewards/rejected": -7.005662282307942, + "step": 711 + }, + { + "epoch": 0.0650525354042942, + "grad_norm": 6.3125, + "kl": 0.0, + "learning_rate": 9.901616211876287e-06, + "logits/chosen": 689257216.0, + "logits/rejected": 314491161.6, + "logps/chosen": -267.5033772786458, + "logps/rejected": -264.9767333984375, + "loss": 0.0436, + "rewards/chosen": 2.236098289489746, + "rewards/margins": 7.104121208190918, + "rewards/rejected": -4.868022918701172, + "step": 712 + }, + { + "epoch": 0.06514390132480585, + "grad_norm": 2.65625, + "kl": 0.0, + "learning_rate": 9.901332189060278e-06, + "logits/chosen": 297749589.3333333, + "logits/rejected": 691511347.2, + "logps/chosen": -166.88721720377603, + "logps/rejected": -680.65068359375, + "loss": 0.0237, + "rewards/chosen": 3.7651824951171875, + "rewards/margins": 10.469619750976562, + "rewards/rejected": -6.704437255859375, + "step": 713 + }, + { + "epoch": 0.0652352672453175, + "grad_norm": 11.625, + "kl": 0.0, + "learning_rate": 9.901047760949508e-06, + "logits/chosen": 460155392.0, + "logits/rejected": 363164518.4, + "logps/chosen": -364.4930013020833, + "logps/rejected": -491.934912109375, + "loss": 0.0934, + "rewards/chosen": 2.4475797017415366, + "rewards/margins": 9.505174763997395, + "rewards/rejected": -7.057595062255859, + "step": 714 + }, + { + "epoch": 0.06532663316582915, + "grad_norm": 5.59375, + "kl": 0.0, + "learning_rate": 9.9007629275675e-06, + "logits/chosen": 458548906.6666667, + "logits/rejected": 398859878.4, + "logps/chosen": -287.1630859375, + "logps/rejected": -591.972119140625, + "loss": 0.1225, + "rewards/chosen": 1.2884803613026936, + "rewards/margins": 9.277062209447225, + "rewards/rejected": -7.988581848144531, + "step": 715 + }, + { + "epoch": 0.0654179990863408, + "grad_norm": 7.6875, + "kl": 0.0, + "learning_rate": 9.900477688937802e-06, + "logits/chosen": 510890624.0, + "logits/rejected": 403570560.0, + "logps/chosen": -244.114990234375, + "logps/rejected": -471.6015319824219, + "loss": 0.0408, + "rewards/chosen": 3.4023070335388184, + "rewards/margins": 7.765657901763916, + "rewards/rejected": -4.363350868225098, + "step": 716 + }, + { + "epoch": 0.06550936500685245, + "grad_norm": 7.84375, + "kl": 0.0, + "learning_rate": 9.900192045084005e-06, + "logits/chosen": 585763072.0, + "logits/rejected": 455296544.0, + "logps/chosen": -500.19293212890625, + "logps/rejected": -520.157470703125, + "loss": 0.0417, + "rewards/chosen": 2.544184923171997, + "rewards/margins": 11.019959688186646, + "rewards/rejected": -8.475774765014648, + "step": 717 + }, + { + "epoch": 0.0656007309273641, + "grad_norm": 3.9375, + "kl": 0.0, + "learning_rate": 9.899905996029726e-06, + "logits/chosen": 766588928.0, + "logits/rejected": 927179605.3333334, + "logps/chosen": -603.9453125, + "logps/rejected": -553.918701171875, + "loss": 0.0156, + "rewards/chosen": 2.7456984519958496, + "rewards/margins": 12.120262940724691, + "rewards/rejected": -9.374564488728842, + "step": 718 + }, + { + "epoch": 0.06569209684787575, + "grad_norm": 10.8125, + "kl": 0.0, + "learning_rate": 9.89961954179862e-06, + "logits/chosen": 556699776.0, + "logits/rejected": 768373696.0, + "logps/chosen": -462.3430480957031, + "logps/rejected": -433.53955078125, + "loss": 0.0652, + "rewards/chosen": 3.4438021183013916, + "rewards/margins": 7.642492055892944, + "rewards/rejected": -4.198689937591553, + "step": 719 + }, + { + "epoch": 0.0657834627683874, + "grad_norm": 6.125, + "kl": 0.0, + "learning_rate": 9.899332682414374e-06, + "logits/chosen": 555814912.0, + "logits/rejected": 1144106240.0, + "logps/chosen": -389.2329915364583, + "logps/rejected": -634.2692260742188, + "loss": 0.0371, + "rewards/chosen": 3.3913774490356445, + "rewards/margins": 9.736922264099121, + "rewards/rejected": -6.345544815063477, + "step": 720 + }, + { + "epoch": 0.06587482868889905, + "grad_norm": 8.6875, + "kl": 0.0, + "learning_rate": 9.899045417900709e-06, + "logits/chosen": 437613504.0, + "logits/rejected": 842882474.6666666, + "logps/chosen": -206.0293426513672, + "logps/rejected": -426.9059651692708, + "loss": 0.0429, + "rewards/chosen": 3.905728816986084, + "rewards/margins": 10.669213771820068, + "rewards/rejected": -6.763484954833984, + "step": 721 + }, + { + "epoch": 0.0659661946094107, + "grad_norm": 6.90625, + "kl": 0.0, + "learning_rate": 9.898757748281377e-06, + "logits/chosen": 792462848.0, + "logits/rejected": 669125376.0, + "logps/chosen": -313.0124206542969, + "logps/rejected": -537.411865234375, + "loss": 0.0465, + "rewards/chosen": 2.6142239570617676, + "rewards/margins": 9.527637958526611, + "rewards/rejected": -6.913414001464844, + "step": 722 + }, + { + "epoch": 0.06605756052992234, + "grad_norm": 6.75, + "kl": 0.0, + "learning_rate": 9.898469673580168e-06, + "logits/chosen": 702681702.4, + "logits/rejected": 498109525.3333333, + "logps/chosen": -302.3854248046875, + "logps/rejected": -486.17626953125, + "loss": 0.0378, + "rewards/chosen": 3.2781776428222655, + "rewards/margins": 11.636788431803385, + "rewards/rejected": -8.35861078898112, + "step": 723 + }, + { + "epoch": 0.06614892645043399, + "grad_norm": 7.03125, + "kl": 0.0, + "learning_rate": 9.898181193820904e-06, + "logits/chosen": 736217907.2, + "logits/rejected": 723787264.0, + "logps/chosen": -220.1770263671875, + "logps/rejected": -280.74082438151044, + "loss": 0.1305, + "rewards/chosen": 3.1242130279541014, + "rewards/margins": 7.789767710367839, + "rewards/rejected": -4.665554682413737, + "step": 724 + }, + { + "epoch": 0.06624029237094564, + "grad_norm": 12.5, + "kl": 0.0, + "learning_rate": 9.897892309027437e-06, + "logits/chosen": 638861414.4, + "logits/rejected": 460772266.6666667, + "logps/chosen": -354.1634765625, + "logps/rejected": -339.9974365234375, + "loss": 0.1396, + "rewards/chosen": 2.774479293823242, + "rewards/margins": 6.022535705566407, + "rewards/rejected": -3.248056411743164, + "step": 725 + }, + { + "epoch": 0.06633165829145729, + "grad_norm": 8.6875, + "kl": 0.0, + "learning_rate": 9.897603019223656e-06, + "logits/chosen": 728188330.6666666, + "logits/rejected": 770813696.0, + "logps/chosen": -341.2825520833333, + "logps/rejected": -352.56097412109375, + "loss": 0.0688, + "rewards/chosen": 2.4181536038716636, + "rewards/margins": 9.130562623341879, + "rewards/rejected": -6.712409019470215, + "step": 726 + }, + { + "epoch": 0.06642302421196894, + "grad_norm": 8.4375, + "kl": 0.0, + "learning_rate": 9.897313324433482e-06, + "logits/chosen": 954221141.3333334, + "logits/rejected": 521569280.0, + "logps/chosen": -327.2532958984375, + "logps/rejected": -397.12841796875, + "loss": 0.0791, + "rewards/chosen": 2.5161757469177246, + "rewards/margins": 7.229750633239746, + "rewards/rejected": -4.7135748863220215, + "step": 727 + }, + { + "epoch": 0.06651439013248059, + "grad_norm": 5.59375, + "kl": 0.0, + "learning_rate": 9.897023224680871e-06, + "logits/chosen": 598492313.6, + "logits/rejected": 665757610.6666666, + "logps/chosen": -411.432080078125, + "logps/rejected": -524.5870768229166, + "loss": 0.0362, + "rewards/chosen": 3.199798011779785, + "rewards/margins": 10.107534472147623, + "rewards/rejected": -6.907736460367839, + "step": 728 + }, + { + "epoch": 0.06660575605299224, + "grad_norm": 15.0, + "kl": 0.0, + "learning_rate": 9.896732719989812e-06, + "logits/chosen": 368779520.0, + "logits/rejected": 233929574.4, + "logps/chosen": -263.04587809244794, + "logps/rejected": -372.36181640625, + "loss": 0.0593, + "rewards/chosen": 2.5539878209431968, + "rewards/margins": 9.003224881490071, + "rewards/rejected": -6.449237060546875, + "step": 729 + }, + { + "epoch": 0.06669712197350389, + "grad_norm": 11.1875, + "kl": 0.0, + "learning_rate": 9.896441810384325e-06, + "logits/chosen": 825339520.0, + "logits/rejected": 989569462.8571428, + "logps/chosen": -442.35137939453125, + "logps/rejected": -394.3663853236607, + "loss": 0.0513, + "rewards/chosen": 1.3846619129180908, + "rewards/margins": 6.8006541047777445, + "rewards/rejected": -5.415992191859654, + "step": 730 + }, + { + "epoch": 0.06678848789401554, + "grad_norm": 9.9375, + "kl": 0.0, + "learning_rate": 9.896150495888466e-06, + "logits/chosen": 251676848.0, + "logits/rejected": 585147209.1428572, + "logps/chosen": -172.14273071289062, + "logps/rejected": -612.4903738839286, + "loss": 0.0291, + "rewards/chosen": 4.491513252258301, + "rewards/margins": 10.67716748373849, + "rewards/rejected": -6.18565423148019, + "step": 731 + }, + { + "epoch": 0.06687985381452718, + "grad_norm": 5.1875, + "kl": 0.0, + "learning_rate": 9.895858776526328e-06, + "logits/chosen": 539494912.0, + "logits/rejected": 672102144.0, + "logps/chosen": -422.0216064453125, + "logps/rejected": -644.4384765625, + "loss": 0.0256, + "rewards/chosen": 3.4598402976989746, + "rewards/margins": 10.067398071289062, + "rewards/rejected": -6.607557773590088, + "step": 732 + }, + { + "epoch": 0.06697121973503883, + "grad_norm": 4.375, + "kl": 0.0, + "learning_rate": 9.895566652322027e-06, + "logits/chosen": 453276364.8, + "logits/rejected": 449699754.6666667, + "logps/chosen": -285.4283935546875, + "logps/rejected": -493.5532633463542, + "loss": 0.031, + "rewards/chosen": 3.127616119384766, + "rewards/margins": 12.124193064371745, + "rewards/rejected": -8.996576944986979, + "step": 733 + }, + { + "epoch": 0.06706258565555048, + "grad_norm": 4.0, + "kl": 0.0, + "learning_rate": 9.895274123299724e-06, + "logits/chosen": 602988970.6666666, + "logits/rejected": 1544236800.0, + "logps/chosen": -225.8892822265625, + "logps/rejected": -458.98138427734375, + "loss": 0.0236, + "rewards/chosen": 3.935858408610026, + "rewards/margins": 9.710977713267008, + "rewards/rejected": -5.775119304656982, + "step": 734 + }, + { + "epoch": 0.06715395157606213, + "grad_norm": 6.71875, + "kl": 0.0, + "learning_rate": 9.894981189483607e-06, + "logits/chosen": 416995200.0, + "logits/rejected": 406625344.0, + "logps/chosen": -341.32692464192706, + "logps/rejected": -436.6096496582031, + "loss": 0.0443, + "rewards/chosen": 3.4328079223632812, + "rewards/margins": 10.449232578277588, + "rewards/rejected": -7.016424655914307, + "step": 735 + }, + { + "epoch": 0.06724531749657378, + "grad_norm": 7.4375, + "kl": 0.0, + "learning_rate": 9.894687850897897e-06, + "logits/chosen": 455493856.0, + "logits/rejected": 393284832.0, + "logps/chosen": -402.0542907714844, + "logps/rejected": -515.2948608398438, + "loss": 0.0468, + "rewards/chosen": 2.4222307205200195, + "rewards/margins": 8.679269790649414, + "rewards/rejected": -6.2570390701293945, + "step": 736 + }, + { + "epoch": 0.06733668341708543, + "grad_norm": 6.21875, + "kl": 0.0, + "learning_rate": 9.894394107566853e-06, + "logits/chosen": 372571776.0, + "logits/rejected": 377916864.0, + "logps/chosen": -267.757080078125, + "logps/rejected": -526.6731567382812, + "loss": 0.0271, + "rewards/chosen": 3.318284034729004, + "rewards/margins": 11.799046516418457, + "rewards/rejected": -8.480762481689453, + "step": 737 + }, + { + "epoch": 0.06742804933759708, + "grad_norm": 4.09375, + "kl": 0.0, + "learning_rate": 9.894099959514765e-06, + "logits/chosen": 735848618.6666666, + "logits/rejected": 479598182.4, + "logps/chosen": -388.859130859375, + "logps/rejected": -468.2818359375, + "loss": 0.0243, + "rewards/chosen": 2.8059444427490234, + "rewards/margins": 10.761847305297852, + "rewards/rejected": -7.955902862548828, + "step": 738 + }, + { + "epoch": 0.06751941525810873, + "grad_norm": 14.1875, + "kl": 0.0, + "learning_rate": 9.893805406765954e-06, + "logits/chosen": 757103488.0, + "logits/rejected": 428448128.0, + "logps/chosen": -298.3480224609375, + "logps/rejected": -413.48077392578125, + "loss": 0.0973, + "rewards/chosen": 2.3421339988708496, + "rewards/margins": 7.828538417816162, + "rewards/rejected": -5.4864044189453125, + "step": 739 + }, + { + "epoch": 0.06761078117862038, + "grad_norm": 7.3125, + "kl": 0.0, + "learning_rate": 9.893510449344778e-06, + "logits/chosen": 431527488.0, + "logits/rejected": 419724416.0, + "logps/chosen": -362.5666198730469, + "logps/rejected": -264.420654296875, + "loss": 0.0477, + "rewards/chosen": 3.0562360286712646, + "rewards/margins": 8.521219968795776, + "rewards/rejected": -5.464983940124512, + "step": 740 + }, + { + "epoch": 0.06770214709913203, + "grad_norm": 16.875, + "kl": 0.0, + "learning_rate": 9.893215087275627e-06, + "logits/chosen": 599423692.8, + "logits/rejected": 496444928.0, + "logps/chosen": -369.1510009765625, + "logps/rejected": -436.9305013020833, + "loss": 0.1395, + "rewards/chosen": 2.0975311279296873, + "rewards/margins": 8.53003069559733, + "rewards/rejected": -6.4324995676676435, + "step": 741 + }, + { + "epoch": 0.06779351301964368, + "grad_norm": 6.96875, + "kl": 0.0, + "learning_rate": 9.892919320582926e-06, + "logits/chosen": 529921120.0, + "logits/rejected": 473951712.0, + "logps/chosen": -286.2122802734375, + "logps/rejected": -399.1312255859375, + "loss": 0.0419, + "rewards/chosen": 2.8522963523864746, + "rewards/margins": 9.545673847198486, + "rewards/rejected": -6.693377494812012, + "step": 742 + }, + { + "epoch": 0.06788487894015532, + "grad_norm": 8.625, + "kl": 0.0, + "learning_rate": 9.892623149291131e-06, + "logits/chosen": 509909376.0, + "logits/rejected": 1151288448.0, + "logps/chosen": -281.1636047363281, + "logps/rejected": -600.398681640625, + "loss": 0.0626, + "rewards/chosen": 2.4949378967285156, + "rewards/margins": 9.533715724945068, + "rewards/rejected": -7.038777828216553, + "step": 743 + }, + { + "epoch": 0.06797624486066697, + "grad_norm": 10.8125, + "kl": 0.0, + "learning_rate": 9.89232657342473e-06, + "logits/chosen": 621321318.4, + "logits/rejected": 417393920.0, + "logps/chosen": -445.91806640625, + "logps/rejected": -261.1090087890625, + "loss": 0.0622, + "rewards/chosen": 3.3577449798583983, + "rewards/margins": 6.079514948527018, + "rewards/rejected": -2.7217699686686196, + "step": 744 + }, + { + "epoch": 0.06806761078117862, + "grad_norm": 9.5, + "kl": 0.0, + "learning_rate": 9.892029593008253e-06, + "logits/chosen": 412630963.2, + "logits/rejected": 634825813.3333334, + "logps/chosen": -246.62294921875, + "logps/rejected": -482.6090087890625, + "loss": 0.0712, + "rewards/chosen": 3.0749549865722656, + "rewards/margins": 10.344370524088543, + "rewards/rejected": -7.269415537516276, + "step": 745 + }, + { + "epoch": 0.06815897670169027, + "grad_norm": 7.84375, + "kl": 0.0, + "learning_rate": 9.891732208066254e-06, + "logits/chosen": 645196714.6666666, + "logits/rejected": 559000883.2, + "logps/chosen": -404.8090413411458, + "logps/rejected": -705.73837890625, + "loss": 0.027, + "rewards/chosen": 3.372267723083496, + "rewards/margins": 14.086534690856933, + "rewards/rejected": -10.714266967773437, + "step": 746 + }, + { + "epoch": 0.06825034262220192, + "grad_norm": 4.75, + "kl": 0.0, + "learning_rate": 9.891434418623323e-06, + "logits/chosen": 287438762.6666667, + "logits/rejected": 526591744.0, + "logps/chosen": -188.42073567708334, + "logps/rejected": -586.62890625, + "loss": 0.0169, + "rewards/chosen": 3.2625770568847656, + "rewards/margins": 12.73480224609375, + "rewards/rejected": -9.472225189208984, + "step": 747 + }, + { + "epoch": 0.06834170854271357, + "grad_norm": 19.125, + "kl": 0.0, + "learning_rate": 9.891136224704086e-06, + "logits/chosen": 867160746.6666666, + "logits/rejected": 386268313.6, + "logps/chosen": -523.8641764322916, + "logps/rejected": -366.9033935546875, + "loss": 0.2096, + "rewards/chosen": 2.685128847757975, + "rewards/margins": 6.789632097880045, + "rewards/rejected": -4.10450325012207, + "step": 748 + }, + { + "epoch": 0.06843307446322522, + "grad_norm": 9.5625, + "kl": 0.0, + "learning_rate": 9.890837626333202e-06, + "logits/chosen": 473189120.0, + "logits/rejected": 554481280.0, + "logps/chosen": -332.81512451171875, + "logps/rejected": -612.7828369140625, + "loss": 0.0547, + "rewards/chosen": 2.3706178665161133, + "rewards/margins": 12.125030517578125, + "rewards/rejected": -9.754412651062012, + "step": 749 + }, + { + "epoch": 0.06852444038373687, + "grad_norm": 9.1875, + "kl": 0.0, + "learning_rate": 9.89053862353536e-06, + "logits/chosen": 657781845.3333334, + "logits/rejected": 635875840.0, + "logps/chosen": -438.6027018229167, + "logps/rejected": -440.76580810546875, + "loss": 0.0497, + "rewards/chosen": 2.942188580830892, + "rewards/margins": 12.44095261891683, + "rewards/rejected": -9.498764038085938, + "step": 750 + }, + { + "epoch": 0.06861580630424852, + "grad_norm": 1.4140625, + "kl": 0.0, + "learning_rate": 9.890239216335285e-06, + "logits/chosen": 698913088.0, + "logits/rejected": 363658240.0, + "logps/chosen": -325.796142578125, + "logps/rejected": -324.6498616536458, + "loss": 0.0059, + "rewards/chosen": 4.062511444091797, + "rewards/margins": 10.756269454956055, + "rewards/rejected": -6.693758010864258, + "step": 751 + }, + { + "epoch": 0.06870717222476017, + "grad_norm": 5.46875, + "kl": 0.0, + "learning_rate": 9.889939404757736e-06, + "logits/chosen": 350936640.0, + "logits/rejected": 579472457.1428572, + "logps/chosen": -206.00502014160156, + "logps/rejected": -487.6442173549107, + "loss": 0.0134, + "rewards/chosen": 4.967076301574707, + "rewards/margins": 14.396306037902832, + "rewards/rejected": -9.429229736328125, + "step": 752 + }, + { + "epoch": 0.06879853814527181, + "grad_norm": 8.5, + "kl": 0.0, + "learning_rate": 9.889639188827504e-06, + "logits/chosen": 557896294.4, + "logits/rejected": 407517098.6666667, + "logps/chosen": -409.94150390625, + "logps/rejected": -490.0198974609375, + "loss": 0.0443, + "rewards/chosen": 2.73378849029541, + "rewards/margins": 10.348882993062336, + "rewards/rejected": -7.615094502766927, + "step": 753 + }, + { + "epoch": 0.06888990406578346, + "grad_norm": 10.875, + "kl": 0.0, + "learning_rate": 9.889338568569417e-06, + "logits/chosen": 303370922.6666667, + "logits/rejected": 489267609.6, + "logps/chosen": -197.37548828125, + "logps/rejected": -522.6787109375, + "loss": 0.0372, + "rewards/chosen": 3.7357603708902993, + "rewards/margins": 11.143453089396159, + "rewards/rejected": -7.407692718505859, + "step": 754 + }, + { + "epoch": 0.06898126998629511, + "grad_norm": 1.9609375, + "kl": 0.0, + "learning_rate": 9.88903754400833e-06, + "logits/chosen": 421564544.0, + "logits/rejected": 287179861.3333333, + "logps/chosen": -166.0033416748047, + "logps/rejected": -357.3831787109375, + "loss": 0.008, + "rewards/chosen": 3.8241090774536133, + "rewards/margins": 11.563181241353352, + "rewards/rejected": -7.739072163899739, + "step": 755 + }, + { + "epoch": 0.06907263590680676, + "grad_norm": 6.21875, + "kl": 0.0, + "learning_rate": 9.888736115169136e-06, + "logits/chosen": 753459136.0, + "logits/rejected": 679253632.0, + "logps/chosen": -640.7064819335938, + "logps/rejected": -549.2568969726562, + "loss": 0.0294, + "rewards/chosen": 3.247732639312744, + "rewards/margins": 9.180572986602783, + "rewards/rejected": -5.932840347290039, + "step": 756 + }, + { + "epoch": 0.06916400182731841, + "grad_norm": 5.5, + "kl": 0.0, + "learning_rate": 9.888434282076759e-06, + "logits/chosen": 442920832.0, + "logits/rejected": 477849728.0, + "logps/chosen": -317.03741455078125, + "logps/rejected": -435.17572021484375, + "loss": 0.038, + "rewards/chosen": 2.5266904830932617, + "rewards/margins": 10.228556156158447, + "rewards/rejected": -7.7018656730651855, + "step": 757 + }, + { + "epoch": 0.06925536774783006, + "grad_norm": 19.125, + "kl": 0.0, + "learning_rate": 9.88813204475616e-06, + "logits/chosen": 350686549.3333333, + "logits/rejected": 483426560.0, + "logps/chosen": -358.4442952473958, + "logps/rejected": -493.2009765625, + "loss": 0.0794, + "rewards/chosen": 2.4917014439900718, + "rewards/margins": 11.069075711568198, + "rewards/rejected": -8.577374267578126, + "step": 758 + }, + { + "epoch": 0.06934673366834171, + "grad_norm": 7.65625, + "kl": 0.0, + "learning_rate": 9.887829403232331e-06, + "logits/chosen": 378334272.0, + "logits/rejected": 573587584.0, + "logps/chosen": -306.747802734375, + "logps/rejected": -648.048828125, + "loss": 0.0496, + "rewards/chosen": 2.533357620239258, + "rewards/margins": 13.102804183959961, + "rewards/rejected": -10.569446563720703, + "step": 759 + }, + { + "epoch": 0.06943809958885336, + "grad_norm": 4.4375, + "kl": 0.0, + "learning_rate": 9.887526357530297e-06, + "logits/chosen": 546730944.0, + "logits/rejected": 511171904.0, + "logps/chosen": -282.73968505859375, + "logps/rejected": -410.97607421875, + "loss": 0.0364, + "rewards/chosen": 2.705491542816162, + "rewards/margins": 9.600529193878174, + "rewards/rejected": -6.895037651062012, + "step": 760 + }, + { + "epoch": 0.06952946550936501, + "grad_norm": 12.25, + "kl": 0.0, + "learning_rate": 9.887222907675115e-06, + "logits/chosen": 985646933.3333334, + "logits/rejected": 608850688.0, + "logps/chosen": -349.1103515625, + "logps/rejected": -551.9906005859375, + "loss": 0.1057, + "rewards/chosen": 2.332712173461914, + "rewards/margins": 10.469600677490234, + "rewards/rejected": -8.13688850402832, + "step": 761 + }, + { + "epoch": 0.06962083142987666, + "grad_norm": 7.90625, + "kl": 0.0, + "learning_rate": 9.886919053691884e-06, + "logits/chosen": 507258816.0, + "logits/rejected": 716729920.0, + "logps/chosen": -286.16265869140625, + "logps/rejected": -495.2222900390625, + "loss": 0.044, + "rewards/chosen": 2.864866256713867, + "rewards/margins": 10.105836391448975, + "rewards/rejected": -7.240970134735107, + "step": 762 + }, + { + "epoch": 0.0697121973503883, + "grad_norm": 6.6875, + "kl": 0.0, + "learning_rate": 9.886614795605722e-06, + "logits/chosen": 585715328.0, + "logits/rejected": 519755456.0, + "logps/chosen": -375.2727457682292, + "logps/rejected": -587.5267944335938, + "loss": 0.0501, + "rewards/chosen": 2.700455983479818, + "rewards/margins": 14.09726079305013, + "rewards/rejected": -11.396804809570312, + "step": 763 + }, + { + "epoch": 0.06980356327089995, + "grad_norm": 13.4375, + "kl": 0.0, + "learning_rate": 9.886310133441794e-06, + "logits/chosen": 564764096.0, + "logits/rejected": 572917504.0, + "logps/chosen": -336.1973876953125, + "logps/rejected": -542.7113037109375, + "loss": 0.0645, + "rewards/chosen": 3.3417160511016846, + "rewards/margins": 11.269416729609173, + "rewards/rejected": -7.927700678507487, + "step": 764 + }, + { + "epoch": 0.0698949291914116, + "grad_norm": 4.53125, + "kl": 0.0, + "learning_rate": 9.88600506722529e-06, + "logits/chosen": 462393792.0, + "logits/rejected": 531800896.0, + "logps/chosen": -269.4916076660156, + "logps/rejected": -501.8401794433594, + "loss": 0.034, + "rewards/chosen": 2.8301286697387695, + "rewards/margins": 10.188925743103027, + "rewards/rejected": -7.358797073364258, + "step": 765 + }, + { + "epoch": 0.06998629511192325, + "grad_norm": 2.796875, + "kl": 0.0, + "learning_rate": 9.885699596981435e-06, + "logits/chosen": 440421952.0, + "logits/rejected": 690734506.6666666, + "logps/chosen": -294.3540954589844, + "logps/rejected": -498.160888671875, + "loss": 0.0109, + "rewards/chosen": 3.585836887359619, + "rewards/margins": 12.467702070871988, + "rewards/rejected": -8.88186518351237, + "step": 766 + }, + { + "epoch": 0.0700776610324349, + "grad_norm": 11.6875, + "kl": 0.0, + "learning_rate": 9.885393722735492e-06, + "logits/chosen": 880742784.0, + "logits/rejected": 924658614.8571428, + "logps/chosen": -440.1181640625, + "logps/rejected": -682.0945172991071, + "loss": 0.037, + "rewards/chosen": 2.7952239513397217, + "rewards/margins": 11.850117444992065, + "rewards/rejected": -9.054893493652344, + "step": 767 + }, + { + "epoch": 0.07016902695294655, + "grad_norm": 5.96875, + "kl": 0.0, + "learning_rate": 9.885087444512752e-06, + "logits/chosen": 711244842.6666666, + "logits/rejected": 450760224.0, + "logps/chosen": -347.4621988932292, + "logps/rejected": -229.60269165039062, + "loss": 0.0442, + "rewards/chosen": 2.9991381963094077, + "rewards/margins": 8.946154912312826, + "rewards/rejected": -5.947016716003418, + "step": 768 + }, + { + "epoch": 0.0702603928734582, + "grad_norm": 6.9375, + "kl": 0.0, + "learning_rate": 9.884780762338542e-06, + "logits/chosen": 601798229.3333334, + "logits/rejected": 330569632.0, + "logps/chosen": -468.6111246744792, + "logps/rejected": -291.3839111328125, + "loss": 0.0369, + "rewards/chosen": 3.0252548853556314, + "rewards/margins": 10.061720530192057, + "rewards/rejected": -7.036465644836426, + "step": 769 + }, + { + "epoch": 0.07035175879396985, + "grad_norm": 10.375, + "kl": 0.0, + "learning_rate": 9.884473676238223e-06, + "logits/chosen": 485553883.4285714, + "logits/rejected": 499623168.0, + "logps/chosen": -343.49100167410717, + "logps/rejected": -262.41070556640625, + "loss": 0.0796, + "rewards/chosen": 2.790451322283064, + "rewards/margins": 9.649658475603376, + "rewards/rejected": -6.8592071533203125, + "step": 770 + }, + { + "epoch": 0.0704431247144815, + "grad_norm": 3.46875, + "kl": 0.0, + "learning_rate": 9.884166186237185e-06, + "logits/chosen": 420509024.0, + "logits/rejected": 398369877.3333333, + "logps/chosen": -314.20465087890625, + "logps/rejected": -386.9031575520833, + "loss": 0.0169, + "rewards/chosen": 2.9116783142089844, + "rewards/margins": 10.024178822835285, + "rewards/rejected": -7.112500508626302, + "step": 771 + }, + { + "epoch": 0.07053449063499315, + "grad_norm": 1.6484375, + "kl": 0.0, + "learning_rate": 9.883858292360856e-06, + "logits/chosen": 671566144.0, + "logits/rejected": 229152874.66666666, + "logps/chosen": -524.023681640625, + "logps/rejected": -299.1867268880208, + "loss": 0.1039, + "rewards/chosen": 4.067273139953613, + "rewards/margins": 9.337454477945965, + "rewards/rejected": -5.270181337992351, + "step": 772 + }, + { + "epoch": 0.0706258565555048, + "grad_norm": 9.8125, + "kl": 0.0, + "learning_rate": 9.883549994634696e-06, + "logits/chosen": 582158528.0, + "logits/rejected": 529236650.6666667, + "logps/chosen": -535.339111328125, + "logps/rejected": -576.4738362630209, + "loss": 0.0223, + "rewards/chosen": 2.6446151733398438, + "rewards/margins": 10.66943359375, + "rewards/rejected": -8.024818420410156, + "step": 773 + }, + { + "epoch": 0.07071722247601644, + "grad_norm": 14.125, + "kl": 0.0, + "learning_rate": 9.883241293084199e-06, + "logits/chosen": 567687753.1428572, + "logits/rejected": 438603744.0, + "logps/chosen": -331.79990931919644, + "logps/rejected": -431.30364990234375, + "loss": 0.0937, + "rewards/chosen": 2.6387904030936107, + "rewards/margins": 8.64349011012486, + "rewards/rejected": -6.00469970703125, + "step": 774 + }, + { + "epoch": 0.0708085883965281, + "grad_norm": 11.4375, + "kl": 0.0, + "learning_rate": 9.88293218773489e-06, + "logits/chosen": 474678681.6, + "logits/rejected": 251318997.33333334, + "logps/chosen": -297.321630859375, + "logps/rejected": -214.93636067708334, + "loss": 0.142, + "rewards/chosen": 2.3036746978759766, + "rewards/margins": 5.717310269673666, + "rewards/rejected": -3.413635571797689, + "step": 775 + }, + { + "epoch": 0.07089995431703974, + "grad_norm": 11.75, + "kl": 0.0, + "learning_rate": 9.882622678612332e-06, + "logits/chosen": 477625958.4, + "logits/rejected": 411351808.0, + "logps/chosen": -283.29951171875, + "logps/rejected": -277.1339111328125, + "loss": 0.0802, + "rewards/chosen": 3.07089786529541, + "rewards/margins": 6.798884518941243, + "rewards/rejected": -3.7279866536458335, + "step": 776 + }, + { + "epoch": 0.07099132023755139, + "grad_norm": 2.484375, + "kl": 0.0, + "learning_rate": 9.882312765742118e-06, + "logits/chosen": 441925536.0, + "logits/rejected": 496640000.0, + "logps/chosen": -100.33665466308594, + "logps/rejected": -479.4885660807292, + "loss": 0.0138, + "rewards/chosen": 3.508336067199707, + "rewards/margins": 10.320408185323078, + "rewards/rejected": -6.812072118123372, + "step": 777 + }, + { + "epoch": 0.07108268615806304, + "grad_norm": 15.1875, + "kl": 0.0, + "learning_rate": 9.88200244914987e-06, + "logits/chosen": 617957376.0, + "logits/rejected": 538786901.3333334, + "logps/chosen": -329.0020263671875, + "logps/rejected": -576.7047526041666, + "loss": 0.0819, + "rewards/chosen": 2.976019287109375, + "rewards/margins": 11.81364860534668, + "rewards/rejected": -8.837629318237305, + "step": 778 + }, + { + "epoch": 0.07117405207857469, + "grad_norm": 12.3125, + "kl": 0.0, + "learning_rate": 9.881691728861256e-06, + "logits/chosen": 856762112.0, + "logits/rejected": 694746368.0, + "logps/chosen": -432.3664957682292, + "logps/rejected": -482.732177734375, + "loss": 0.092, + "rewards/chosen": 3.0183709462483725, + "rewards/margins": 8.36902411778768, + "rewards/rejected": -5.350653171539307, + "step": 779 + }, + { + "epoch": 0.07126541799908634, + "grad_norm": 8.4375, + "kl": 0.0, + "learning_rate": 9.881380604901964e-06, + "logits/chosen": 576301312.0, + "logits/rejected": 735170560.0, + "logps/chosen": -475.1421203613281, + "logps/rejected": -443.5643310546875, + "loss": 0.0471, + "rewards/chosen": 2.341212511062622, + "rewards/margins": 10.391318559646606, + "rewards/rejected": -8.050106048583984, + "step": 780 + }, + { + "epoch": 0.07135678391959799, + "grad_norm": 5.65625, + "kl": 0.0, + "learning_rate": 9.881069077297724e-06, + "logits/chosen": 712887296.0, + "logits/rejected": 476536928.0, + "logps/chosen": -322.5700378417969, + "logps/rejected": -388.2596130371094, + "loss": 0.0487, + "rewards/chosen": 2.6778173446655273, + "rewards/margins": 9.032030582427979, + "rewards/rejected": -6.354213237762451, + "step": 781 + }, + { + "epoch": 0.07144814984010964, + "grad_norm": 4.03125, + "kl": 0.0, + "learning_rate": 9.880757146074294e-06, + "logits/chosen": 468814592.0, + "logits/rejected": 716761753.6, + "logps/chosen": -331.2073974609375, + "logps/rejected": -352.037548828125, + "loss": 0.0207, + "rewards/chosen": 4.063785235087077, + "rewards/margins": 10.734572283426921, + "rewards/rejected": -6.670787048339844, + "step": 782 + }, + { + "epoch": 0.07153951576062129, + "grad_norm": 7.375, + "kl": 0.0, + "learning_rate": 9.88044481125747e-06, + "logits/chosen": 454796032.0, + "logits/rejected": 508014694.4, + "logps/chosen": -296.9976806640625, + "logps/rejected": -546.8955078125, + "loss": 0.0512, + "rewards/chosen": 2.283527692159017, + "rewards/margins": 8.444165356953938, + "rewards/rejected": -6.160637664794922, + "step": 783 + }, + { + "epoch": 0.07163088168113294, + "grad_norm": 6.96875, + "kl": 0.0, + "learning_rate": 9.88013207287308e-06, + "logits/chosen": 429667072.0, + "logits/rejected": 483727872.0, + "logps/chosen": -331.7002766927083, + "logps/rejected": -393.0438720703125, + "loss": 0.0371, + "rewards/chosen": 3.072900136311849, + "rewards/margins": 8.950582631429036, + "rewards/rejected": -5.877682495117187, + "step": 784 + }, + { + "epoch": 0.07172224760164458, + "grad_norm": 2.234375, + "kl": 0.0, + "learning_rate": 9.87981893094698e-06, + "logits/chosen": 395372544.0, + "logits/rejected": 405692074.6666667, + "logps/chosen": -222.28271484375, + "logps/rejected": -500.0955810546875, + "loss": 0.0144, + "rewards/chosen": 3.740635633468628, + "rewards/margins": 11.705110629399616, + "rewards/rejected": -7.964474995930989, + "step": 785 + }, + { + "epoch": 0.07181361352215623, + "grad_norm": 2.671875, + "kl": 0.0, + "learning_rate": 9.87950538550507e-06, + "logits/chosen": 592052838.4, + "logits/rejected": 455819008.0, + "logps/chosen": -337.63857421875, + "logps/rejected": -383.368896484375, + "loss": 0.0168, + "rewards/chosen": 3.8893058776855467, + "rewards/margins": 10.893936284383138, + "rewards/rejected": -7.004630406697591, + "step": 786 + }, + { + "epoch": 0.07190497944266788, + "grad_norm": 10.875, + "kl": 0.0, + "learning_rate": 9.879191436573272e-06, + "logits/chosen": 439978240.0, + "logits/rejected": 508723046.4, + "logps/chosen": -185.32425944010416, + "logps/rejected": -463.1294921875, + "loss": 0.1069, + "rewards/chosen": 1.6245791117350261, + "rewards/margins": 10.924209849039713, + "rewards/rejected": -9.299630737304687, + "step": 787 + }, + { + "epoch": 0.07199634536317953, + "grad_norm": 11.875, + "kl": 0.0, + "learning_rate": 9.87887708417755e-06, + "logits/chosen": 600899584.0, + "logits/rejected": 224933232.0, + "logps/chosen": -283.44342041015625, + "logps/rejected": -289.99969482421875, + "loss": 0.0531, + "rewards/chosen": 2.67643404006958, + "rewards/margins": 8.612587928771973, + "rewards/rejected": -5.936153888702393, + "step": 788 + }, + { + "epoch": 0.07208771128369118, + "grad_norm": 7.0, + "kl": 0.0, + "learning_rate": 9.878562328343897e-06, + "logits/chosen": 776258560.0, + "logits/rejected": 446523840.0, + "logps/chosen": -696.2181396484375, + "logps/rejected": -521.1425170898438, + "loss": 0.0368, + "rewards/chosen": 2.702693462371826, + "rewards/margins": 11.29224157333374, + "rewards/rejected": -8.589548110961914, + "step": 789 + }, + { + "epoch": 0.07217907720420283, + "grad_norm": 14.625, + "kl": 0.0, + "learning_rate": 9.87824716909834e-06, + "logits/chosen": 547685741.7142857, + "logits/rejected": 717769984.0, + "logps/chosen": -324.677978515625, + "logps/rejected": -572.618408203125, + "loss": 0.0903, + "rewards/chosen": 2.9175938197544644, + "rewards/margins": 4.408041817801339, + "rewards/rejected": -1.490447998046875, + "step": 790 + }, + { + "epoch": 0.07227044312471448, + "grad_norm": 11.6875, + "kl": 0.0, + "learning_rate": 9.87793160646694e-06, + "logits/chosen": 583673920.0, + "logits/rejected": 613568640.0, + "logps/chosen": -370.2924499511719, + "logps/rejected": -334.90924072265625, + "loss": 0.0702, + "rewards/chosen": 2.427544116973877, + "rewards/margins": 7.031086444854736, + "rewards/rejected": -4.603542327880859, + "step": 791 + }, + { + "epoch": 0.07236180904522613, + "grad_norm": 7.34375, + "kl": 0.0, + "learning_rate": 9.877615640475792e-06, + "logits/chosen": 320570624.0, + "logits/rejected": 708569673.1428572, + "logps/chosen": -258.1634216308594, + "logps/rejected": -660.1354631696429, + "loss": 0.0303, + "rewards/chosen": 3.2925140857696533, + "rewards/margins": 13.186655146735054, + "rewards/rejected": -9.894141060965401, + "step": 792 + }, + { + "epoch": 0.07245317496573778, + "grad_norm": 17.625, + "kl": 0.0, + "learning_rate": 9.877299271151022e-06, + "logits/chosen": 427307946.6666667, + "logits/rejected": 543843481.6, + "logps/chosen": -290.1531168619792, + "logps/rejected": -640.08173828125, + "loss": 0.124, + "rewards/chosen": 2.655248006184896, + "rewards/margins": 7.351490910847982, + "rewards/rejected": -4.696242904663086, + "step": 793 + }, + { + "epoch": 0.07254454088624943, + "grad_norm": 5.875, + "kl": 0.0, + "learning_rate": 9.87698249851879e-06, + "logits/chosen": 664173952.0, + "logits/rejected": 761470208.0, + "logps/chosen": -392.47216796875, + "logps/rejected": -425.54571533203125, + "loss": 0.0332, + "rewards/chosen": 2.815720796585083, + "rewards/margins": 9.749526262283325, + "rewards/rejected": -6.933805465698242, + "step": 794 + }, + { + "epoch": 0.07263590680676107, + "grad_norm": 4.40625, + "kl": 0.0, + "learning_rate": 9.876665322605294e-06, + "logits/chosen": 543269802.6666666, + "logits/rejected": 765928499.2, + "logps/chosen": -336.7333577473958, + "logps/rejected": -695.41552734375, + "loss": 0.0228, + "rewards/chosen": 3.0106512705485025, + "rewards/margins": 13.79640032450358, + "rewards/rejected": -10.785749053955078, + "step": 795 + }, + { + "epoch": 0.07272727272727272, + "grad_norm": 11.75, + "kl": 0.0, + "learning_rate": 9.876347743436758e-06, + "logits/chosen": 575007808.0, + "logits/rejected": 522771648.0, + "logps/chosen": -253.24057006835938, + "logps/rejected": -384.293212890625, + "loss": 0.0479, + "rewards/chosen": 2.7124111652374268, + "rewards/margins": 9.309810400009155, + "rewards/rejected": -6.5973992347717285, + "step": 796 + }, + { + "epoch": 0.07281863864778437, + "grad_norm": 14.5, + "kl": 0.0, + "learning_rate": 9.876029761039444e-06, + "logits/chosen": 600943530.6666666, + "logits/rejected": 582128384.0, + "logps/chosen": -226.52482096354166, + "logps/rejected": -503.804052734375, + "loss": 0.1086, + "rewards/chosen": 1.0893972714742024, + "rewards/margins": 8.128211053212484, + "rewards/rejected": -7.038813781738281, + "step": 797 + }, + { + "epoch": 0.07291000456829602, + "grad_norm": 14.3125, + "kl": 0.0, + "learning_rate": 9.875711375439645e-06, + "logits/chosen": 292581088.0, + "logits/rejected": 488607104.0, + "logps/chosen": -234.40133666992188, + "logps/rejected": -370.5102132161458, + "loss": 0.0504, + "rewards/chosen": 1.9722718000411987, + "rewards/margins": 7.524840871493022, + "rewards/rejected": -5.552569071451823, + "step": 798 + }, + { + "epoch": 0.07300137048880767, + "grad_norm": 6.53125, + "kl": 0.0, + "learning_rate": 9.875392586663693e-06, + "logits/chosen": 483875072.0, + "logits/rejected": 572238400.0, + "logps/chosen": -323.2093505859375, + "logps/rejected": -502.0401611328125, + "loss": 0.0414, + "rewards/chosen": 2.917599678039551, + "rewards/margins": 11.602875709533691, + "rewards/rejected": -8.68527603149414, + "step": 799 + }, + { + "epoch": 0.07309273640931932, + "grad_norm": 13.125, + "kl": 0.0, + "learning_rate": 9.875073394737942e-06, + "logits/chosen": 699048601.6, + "logits/rejected": 776026624.0, + "logps/chosen": -621.3578125, + "logps/rejected": -300.2747802734375, + "loss": 0.1116, + "rewards/chosen": 2.6343284606933595, + "rewards/margins": 8.189377721150716, + "rewards/rejected": -5.5550492604573565, + "step": 800 + }, + { + "epoch": 0.07318410232983097, + "grad_norm": 12.0625, + "kl": 0.0, + "learning_rate": 9.874753799688792e-06, + "logits/chosen": 407667648.0, + "logits/rejected": 594737578.6666666, + "logps/chosen": -249.2785186767578, + "logps/rejected": -513.1670735677084, + "loss": 0.0599, + "rewards/chosen": 1.9659180641174316, + "rewards/margins": 9.665047804514568, + "rewards/rejected": -7.699129740397136, + "step": 801 + }, + { + "epoch": 0.07327546825034262, + "grad_norm": 7.5625, + "kl": 0.0, + "learning_rate": 9.874433801542669e-06, + "logits/chosen": 290691285.3333333, + "logits/rejected": 488369049.6, + "logps/chosen": -152.44866943359375, + "logps/rejected": -487.622509765625, + "loss": 0.0336, + "rewards/chosen": 3.2098633448282876, + "rewards/margins": 10.396153322855632, + "rewards/rejected": -7.186289978027344, + "step": 802 + }, + { + "epoch": 0.07336683417085427, + "grad_norm": 2.78125, + "kl": 0.0, + "learning_rate": 9.874113400326031e-06, + "logits/chosen": 1220561920.0, + "logits/rejected": 678479725.7142857, + "logps/chosen": -998.4151611328125, + "logps/rejected": -349.48789760044644, + "loss": 0.0114, + "rewards/chosen": 4.303174018859863, + "rewards/margins": 9.680769647870745, + "rewards/rejected": -5.377595629010882, + "step": 803 + }, + { + "epoch": 0.07345820009136592, + "grad_norm": 10.375, + "kl": 0.0, + "learning_rate": 9.873792596065376e-06, + "logits/chosen": 769623603.2, + "logits/rejected": 571567616.0, + "logps/chosen": -328.037255859375, + "logps/rejected": -491.6673990885417, + "loss": 0.0519, + "rewards/chosen": 2.885251998901367, + "rewards/margins": 10.51111666361491, + "rewards/rejected": -7.625864664713542, + "step": 804 + }, + { + "epoch": 0.07354956601187757, + "grad_norm": 8.4375, + "kl": 0.0, + "learning_rate": 9.873471388787229e-06, + "logits/chosen": 627627776.0, + "logits/rejected": 612460160.0, + "logps/chosen": -428.753466796875, + "logps/rejected": -541.3052164713541, + "loss": 0.0514, + "rewards/chosen": 2.5908248901367186, + "rewards/margins": 11.805640665690103, + "rewards/rejected": -9.214815775553385, + "step": 805 + }, + { + "epoch": 0.07364093193238921, + "grad_norm": 21.75, + "kl": 0.0, + "learning_rate": 9.873149778518152e-06, + "logits/chosen": 527276032.0, + "logits/rejected": 352425600.0, + "logps/chosen": -388.0423990885417, + "logps/rejected": -232.50833129882812, + "loss": 0.1369, + "rewards/chosen": 2.311401685078939, + "rewards/margins": 7.0349853833516445, + "rewards/rejected": -4.723583698272705, + "step": 806 + }, + { + "epoch": 0.07373229785290086, + "grad_norm": 4.0625, + "kl": 0.0, + "learning_rate": 9.87282776528474e-06, + "logits/rejected": 359518272.0, + "logps/rejected": -447.35888671875, + "loss": 0.0106, + "rewards/rejected": -7.143719673156738, + "step": 807 + }, + { + "epoch": 0.07382366377341251, + "grad_norm": 7.59375, + "kl": 0.0, + "learning_rate": 9.872505349113619e-06, + "logits/chosen": 288390860.8, + "logits/rejected": 235084202.66666666, + "logps/chosen": -166.4163330078125, + "logps/rejected": -234.981689453125, + "loss": 0.063, + "rewards/chosen": 2.654756546020508, + "rewards/margins": 7.605599721272786, + "rewards/rejected": -4.950843175252278, + "step": 808 + }, + { + "epoch": 0.07391502969392416, + "grad_norm": 2.546875, + "kl": 0.0, + "learning_rate": 9.87218253003145e-06, + "logits/chosen": 559592640.0, + "logits/rejected": 942157994.6666666, + "logps/chosen": -330.27679443359375, + "logps/rejected": -703.6988118489584, + "loss": 0.0087, + "rewards/chosen": 4.064993381500244, + "rewards/margins": 14.013797601064047, + "rewards/rejected": -9.948804219563803, + "step": 809 + }, + { + "epoch": 0.07400639561443581, + "grad_norm": 4.15625, + "kl": 0.0, + "learning_rate": 9.871859308064928e-06, + "logits/chosen": 414685994.6666667, + "logits/rejected": 413588633.6, + "logps/chosen": -176.44938151041666, + "logps/rejected": -479.8072265625, + "loss": 0.0304, + "rewards/chosen": 2.4401283264160156, + "rewards/margins": 11.244937896728516, + "rewards/rejected": -8.8048095703125, + "step": 810 + }, + { + "epoch": 0.07409776153494746, + "grad_norm": 6.5625, + "kl": 0.0, + "learning_rate": 9.871535683240781e-06, + "logits/chosen": 390828544.0, + "logits/rejected": 444995737.6, + "logps/chosen": -363.3340657552083, + "logps/rejected": -443.91875, + "loss": 0.037, + "rewards/chosen": 2.8142083485921225, + "rewards/margins": 9.170844395955404, + "rewards/rejected": -6.356636047363281, + "step": 811 + }, + { + "epoch": 0.07418912745545911, + "grad_norm": 15.9375, + "kl": 0.0, + "learning_rate": 9.871211655585766e-06, + "logits/rejected": 520897792.0, + "logps/rejected": -325.5259094238281, + "loss": 0.1207, + "rewards/rejected": -4.00175142288208, + "step": 812 + }, + { + "epoch": 0.07428049337597076, + "grad_norm": 7.46875, + "kl": 0.0, + "learning_rate": 9.870887225126682e-06, + "logits/chosen": 506830950.4, + "logits/rejected": 514620458.6666667, + "logps/chosen": -200.4341064453125, + "logps/rejected": -550.1652425130209, + "loss": 0.0548, + "rewards/chosen": 2.7528545379638674, + "rewards/margins": 8.267077000935872, + "rewards/rejected": -5.514222462972005, + "step": 813 + }, + { + "epoch": 0.0743718592964824, + "grad_norm": 7.71875, + "kl": 0.0, + "learning_rate": 9.870562391890353e-06, + "logits/chosen": 1036633856.0, + "logits/rejected": 730747520.0, + "logps/chosen": -219.03575134277344, + "logps/rejected": -394.34967041015625, + "loss": 0.0444, + "rewards/chosen": 2.4598522186279297, + "rewards/margins": 9.657719135284424, + "rewards/rejected": -7.197866916656494, + "step": 814 + }, + { + "epoch": 0.07446322521699406, + "grad_norm": 14.6875, + "kl": 0.0, + "learning_rate": 9.870237155903643e-06, + "logits/chosen": 415908053.3333333, + "logits/rejected": 466522316.8, + "logps/chosen": -391.2202962239583, + "logps/rejected": -349.419970703125, + "loss": 0.0643, + "rewards/chosen": 2.262704531351725, + "rewards/margins": 10.399272219340006, + "rewards/rejected": -8.136567687988281, + "step": 815 + }, + { + "epoch": 0.0745545911375057, + "grad_norm": 7.125, + "kl": 0.0, + "learning_rate": 9.869911517193442e-06, + "logits/chosen": 988123584.0, + "logits/rejected": 841062107.4285715, + "logps/chosen": -190.03749084472656, + "logps/rejected": -469.77608816964283, + "loss": 0.0265, + "rewards/chosen": 1.3457428216934204, + "rewards/margins": 9.285707661083766, + "rewards/rejected": -7.939964839390346, + "step": 816 + }, + { + "epoch": 0.07464595705801735, + "grad_norm": 2.46875, + "kl": 0.0, + "learning_rate": 9.86958547578668e-06, + "logits/chosen": 639029568.0, + "logits/rejected": 375455744.0, + "logps/chosen": -481.54156494140625, + "logps/rejected": -295.59320068359375, + "loss": 0.0137, + "rewards/chosen": 3.5116171836853027, + "rewards/margins": 9.003493150075276, + "rewards/rejected": -5.491875966389974, + "step": 817 + }, + { + "epoch": 0.074737322978529, + "grad_norm": 4.4375, + "kl": 0.0, + "learning_rate": 9.869259031710317e-06, + "logits/chosen": 684206976.0, + "logits/rejected": 518010806.85714287, + "logps/chosen": -286.60284423828125, + "logps/rejected": -668.8716517857143, + "loss": 0.0178, + "rewards/chosen": 2.5264892578125, + "rewards/margins": 11.235167367117745, + "rewards/rejected": -8.708678109305245, + "step": 818 + }, + { + "epoch": 0.07482868889904065, + "grad_norm": 9.75, + "kl": 0.0, + "learning_rate": 9.868932184991347e-06, + "logits/chosen": 515586112.0, + "logits/rejected": 625055296.0, + "logps/chosen": -231.34793090820312, + "logps/rejected": -517.6802978515625, + "loss": 0.0497, + "rewards/chosen": 3.1851067543029785, + "rewards/margins": 10.308318614959717, + "rewards/rejected": -7.123211860656738, + "step": 819 + }, + { + "epoch": 0.0749200548195523, + "grad_norm": 15.3125, + "kl": 0.0, + "learning_rate": 9.868604935656797e-06, + "logits/chosen": 1347481216.0, + "logits/rejected": 683197376.0, + "logps/chosen": -403.4476013183594, + "logps/rejected": -312.99169921875, + "loss": 0.0826, + "rewards/chosen": 2.779209613800049, + "rewards/margins": 8.585534572601318, + "rewards/rejected": -5.8063249588012695, + "step": 820 + }, + { + "epoch": 0.07501142074006395, + "grad_norm": 5.5, + "kl": 0.0, + "learning_rate": 9.868277283733725e-06, + "logits/chosen": 461582848.0, + "logits/rejected": 915466649.6, + "logps/chosen": -217.37263997395834, + "logps/rejected": -410.582421875, + "loss": 0.021, + "rewards/chosen": 3.5267957051595054, + "rewards/margins": 11.9013547261556, + "rewards/rejected": -8.374559020996093, + "step": 821 + }, + { + "epoch": 0.0751027866605756, + "grad_norm": 5.5625, + "kl": 0.0, + "learning_rate": 9.86794922924923e-06, + "logits/chosen": 629914752.0, + "logits/rejected": 863890227.2, + "logps/chosen": -412.911376953125, + "logps/rejected": -312.272314453125, + "loss": 0.0262, + "rewards/chosen": 3.0289173126220703, + "rewards/margins": 9.039886856079102, + "rewards/rejected": -6.010969543457032, + "step": 822 + }, + { + "epoch": 0.07519415258108725, + "grad_norm": 20.125, + "kl": 0.0, + "learning_rate": 9.867620772230436e-06, + "logits/chosen": 968503588.5714285, + "logits/rejected": 365502976.0, + "logps/chosen": -366.48291015625, + "logps/rejected": -120.376953125, + "loss": 0.1154, + "rewards/chosen": 2.5095307486397878, + "rewards/margins": 7.520051683698382, + "rewards/rejected": -5.010520935058594, + "step": 823 + }, + { + "epoch": 0.0752855185015989, + "grad_norm": 1.1875, + "kl": 0.0, + "learning_rate": 9.867291912704503e-06, + "logits/chosen": 440261760.0, + "logits/rejected": 558160896.0, + "logps/chosen": -345.7689615885417, + "logps/rejected": -447.7998046875, + "loss": 0.006, + "rewards/chosen": 4.302771886189778, + "rewards/margins": 12.202604039510092, + "rewards/rejected": -7.899832153320313, + "step": 824 + }, + { + "epoch": 0.07537688442211055, + "grad_norm": 7.28125, + "kl": 0.0, + "learning_rate": 9.866962650698626e-06, + "logits/chosen": 621327462.4, + "logits/rejected": 454092970.6666667, + "logps/chosen": -362.3448486328125, + "logps/rejected": -524.4442545572916, + "loss": 0.051, + "rewards/chosen": 2.5229333877563476, + "rewards/margins": 11.184323692321778, + "rewards/rejected": -8.66139030456543, + "step": 825 + }, + { + "epoch": 0.0754682503426222, + "grad_norm": 15.0625, + "kl": 0.0, + "learning_rate": 9.86663298624003e-06, + "logits/chosen": 555432405.3333334, + "logits/rejected": 444688896.0, + "logps/chosen": -297.3517659505208, + "logps/rejected": -458.58857421875, + "loss": 0.0708, + "rewards/chosen": 2.8156960805257163, + "rewards/margins": 9.298186620076498, + "rewards/rejected": -6.482490539550781, + "step": 826 + }, + { + "epoch": 0.07555961626313384, + "grad_norm": 3.40625, + "kl": 0.0, + "learning_rate": 9.866302919355977e-06, + "logits/chosen": 748334250.6666666, + "logits/rejected": 465627955.2, + "logps/chosen": -610.0884602864584, + "logps/rejected": -438.71357421875, + "loss": 0.0164, + "rewards/chosen": 3.537978172302246, + "rewards/margins": 11.007997703552245, + "rewards/rejected": -7.47001953125, + "step": 827 + }, + { + "epoch": 0.07565098218364551, + "grad_norm": 18.125, + "kl": 0.0, + "learning_rate": 9.86597245007376e-06, + "logits/chosen": 491800000.0, + "logits/rejected": 421031936.0, + "logps/chosen": -333.36993408203125, + "logps/rejected": -551.8056640625, + "loss": 0.0856, + "rewards/chosen": 2.791424036026001, + "rewards/margins": 11.544296503067017, + "rewards/rejected": -8.752872467041016, + "step": 828 + }, + { + "epoch": 0.07574234810415716, + "grad_norm": 8.5, + "kl": 0.0, + "learning_rate": 9.865641578420707e-06, + "logits/chosen": 569264640.0, + "logits/rejected": 1062388019.2, + "logps/chosen": -249.9077351888021, + "logps/rejected": -655.625439453125, + "loss": 0.0515, + "rewards/chosen": 1.9054808616638184, + "rewards/margins": 11.006364345550537, + "rewards/rejected": -9.100883483886719, + "step": 829 + }, + { + "epoch": 0.0758337140246688, + "grad_norm": 11.375, + "kl": 0.0, + "learning_rate": 9.865310304424174e-06, + "logits/chosen": 688602197.3333334, + "logits/rejected": 540357324.8, + "logps/chosen": -512.1086832682291, + "logps/rejected": -507.420849609375, + "loss": 0.0419, + "rewards/chosen": 2.2439778645833335, + "rewards/margins": 12.347008260091146, + "rewards/rejected": -10.103030395507812, + "step": 830 + }, + { + "epoch": 0.07592507994518045, + "grad_norm": 11.3125, + "kl": 0.0, + "learning_rate": 9.864978628111557e-06, + "logits/chosen": 926271872.0, + "logits/rejected": 1007278976.0, + "logps/chosen": -340.8953857421875, + "logps/rejected": -649.3511962890625, + "loss": 0.0599, + "rewards/chosen": 2.421220302581787, + "rewards/margins": 11.335721492767334, + "rewards/rejected": -8.914501190185547, + "step": 831 + }, + { + "epoch": 0.0760164458656921, + "grad_norm": 1.0078125, + "kl": 0.0, + "learning_rate": 9.864646549510285e-06, + "logits/chosen": 453443456.0, + "logits/rejected": 433842329.6, + "logps/chosen": -244.71134440104166, + "logps/rejected": -504.9953125, + "loss": 0.0043, + "rewards/chosen": 4.712444305419922, + "rewards/margins": 13.661147308349609, + "rewards/rejected": -8.948703002929687, + "step": 832 + }, + { + "epoch": 0.07610781178620375, + "grad_norm": 12.5, + "kl": 0.0, + "learning_rate": 9.864314068647814e-06, + "logits/chosen": 605561958.4, + "logits/rejected": 478549845.3333333, + "logps/chosen": -379.9242919921875, + "logps/rejected": -349.0028483072917, + "loss": 0.1641, + "rewards/chosen": 2.038022041320801, + "rewards/margins": 7.636466026306152, + "rewards/rejected": -5.598443984985352, + "step": 833 + }, + { + "epoch": 0.0761991777067154, + "grad_norm": 11.8125, + "kl": 0.0, + "learning_rate": 9.863981185551636e-06, + "logits/chosen": 412354852.5714286, + "logits/rejected": 133063840.0, + "logps/chosen": -281.0719517299107, + "logps/rejected": -7.83367919921875, + "loss": 0.1793, + "rewards/chosen": 2.8068438938685825, + "rewards/margins": 1.7933638266154697, + "rewards/rejected": 1.0134800672531128, + "step": 834 + }, + { + "epoch": 0.07629054362722705, + "grad_norm": 17.125, + "kl": 0.0, + "learning_rate": 9.863647900249283e-06, + "logits/chosen": 267335200.0, + "logits/rejected": 739097002.6666666, + "logps/chosen": -93.00220489501953, + "logps/rejected": -621.9078776041666, + "loss": 0.1321, + "rewards/chosen": 0.5643165707588196, + "rewards/margins": 6.950871924559276, + "rewards/rejected": -6.386555353800456, + "step": 835 + }, + { + "epoch": 0.0763819095477387, + "grad_norm": 11.0, + "kl": 0.0, + "learning_rate": 9.863314212768311e-06, + "logits/chosen": 1302047573.3333333, + "logits/rejected": 688836096.0, + "logps/chosen": -382.919921875, + "logps/rejected": -489.79599609375, + "loss": 0.0539, + "rewards/chosen": 2.437657674153646, + "rewards/margins": 10.488635762532553, + "rewards/rejected": -8.050978088378907, + "step": 836 + }, + { + "epoch": 0.07647327546825035, + "grad_norm": 12.0625, + "kl": 0.0, + "learning_rate": 9.862980123136312e-06, + "logits/chosen": 512859040.0, + "logits/rejected": 455005184.0, + "logps/chosen": -389.449951171875, + "logps/rejected": -541.0520833333334, + "loss": 0.0533, + "rewards/chosen": 1.4386048316955566, + "rewards/margins": 8.323427677154541, + "rewards/rejected": -6.884822845458984, + "step": 837 + }, + { + "epoch": 0.076564641388762, + "grad_norm": 8.0625, + "kl": 0.0, + "learning_rate": 9.862645631380914e-06, + "logits/chosen": 594377557.3333334, + "logits/rejected": 391316121.6, + "logps/chosen": -405.8164876302083, + "logps/rejected": -382.545849609375, + "loss": 0.0348, + "rewards/chosen": 3.050490379333496, + "rewards/margins": 9.734902000427246, + "rewards/rejected": -6.68441162109375, + "step": 838 + }, + { + "epoch": 0.07665600730927365, + "grad_norm": 2.453125, + "kl": 0.0, + "learning_rate": 9.862310737529776e-06, + "logits/chosen": 372077408.0, + "logits/rejected": 485996714.6666667, + "logps/chosen": -294.5018615722656, + "logps/rejected": -548.892333984375, + "loss": 0.0118, + "rewards/chosen": 3.232774019241333, + "rewards/margins": 9.47187288602193, + "rewards/rejected": -6.239098866780599, + "step": 839 + }, + { + "epoch": 0.0767473732297853, + "grad_norm": 4.96875, + "kl": 0.0, + "learning_rate": 9.86197544161059e-06, + "logits/chosen": 320001472.0, + "logits/rejected": 304316202.6666667, + "logps/chosen": -305.08856201171875, + "logps/rejected": -362.1236165364583, + "loss": 0.0278, + "rewards/chosen": 2.779585361480713, + "rewards/margins": 8.69832468032837, + "rewards/rejected": -5.918739318847656, + "step": 840 + }, + { + "epoch": 0.07683873915029694, + "grad_norm": 18.625, + "kl": 0.0, + "learning_rate": 9.86163974365108e-06, + "logits/chosen": 383245158.4, + "logits/rejected": 729863253.3333334, + "logps/chosen": -182.72607421875, + "logps/rejected": -598.7689208984375, + "loss": 0.1116, + "rewards/chosen": 1.965003204345703, + "rewards/margins": 12.58536008199056, + "rewards/rejected": -10.620356877644857, + "step": 841 + }, + { + "epoch": 0.0769301050708086, + "grad_norm": 12.25, + "kl": 0.0, + "learning_rate": 9.86130364367901e-06, + "logits/chosen": 525881248.0, + "logits/rejected": 484119552.0, + "logps/chosen": -388.8036193847656, + "logps/rejected": -187.6140899658203, + "loss": 0.0757, + "rewards/chosen": 2.5404677391052246, + "rewards/margins": 6.602259635925293, + "rewards/rejected": -4.061791896820068, + "step": 842 + }, + { + "epoch": 0.07702147099132024, + "grad_norm": 5.15625, + "kl": 0.0, + "learning_rate": 9.86096714172217e-06, + "logits/chosen": 363983104.0, + "logits/rejected": 397845248.0, + "logps/chosen": -312.0616861979167, + "logps/rejected": -688.101220703125, + "loss": 0.0256, + "rewards/chosen": 2.998263676961263, + "rewards/margins": 10.744772466023763, + "rewards/rejected": -7.7465087890625, + "step": 843 + }, + { + "epoch": 0.07711283691183189, + "grad_norm": 4.15625, + "kl": 0.0, + "learning_rate": 9.860630237808383e-06, + "logits/chosen": 396145440.0, + "logps/chosen": -296.1687316894531, + "loss": 0.0292, + "rewards/chosen": 3.8439502716064453, + "step": 844 + }, + { + "epoch": 0.07720420283234354, + "grad_norm": 4.59375, + "kl": 0.0, + "learning_rate": 9.860292931965511e-06, + "logits/chosen": 592832682.6666666, + "logits/rejected": 283299648.0, + "logps/chosen": -338.00132242838544, + "logps/rejected": -349.8469543457031, + "loss": 0.026, + "rewards/chosen": 3.615924835205078, + "rewards/margins": 9.890639305114746, + "rewards/rejected": -6.274714469909668, + "step": 845 + }, + { + "epoch": 0.07729556875285519, + "grad_norm": 6.0625, + "kl": 0.0, + "learning_rate": 9.859955224221446e-06, + "logits/chosen": 390082892.8, + "logits/rejected": 567037653.3333334, + "logps/chosen": -327.3990234375, + "logps/rejected": -602.8226725260416, + "loss": 0.037, + "rewards/chosen": 3.2356536865234373, + "rewards/margins": 11.054492441813151, + "rewards/rejected": -7.818838755289714, + "step": 846 + }, + { + "epoch": 0.07738693467336684, + "grad_norm": 3.09375, + "kl": 0.0, + "learning_rate": 9.859617114604111e-06, + "logits/chosen": 431997184.0, + "logits/rejected": 393971200.0, + "logps/chosen": -310.3161926269531, + "logps/rejected": -543.4178059895834, + "loss": 0.0137, + "rewards/chosen": 3.861874580383301, + "rewards/margins": 12.349971453348795, + "rewards/rejected": -8.488096872965494, + "step": 847 + }, + { + "epoch": 0.07747830059387849, + "grad_norm": 9.6875, + "kl": 0.0, + "learning_rate": 9.859278603141469e-06, + "logits/chosen": 634213120.0, + "logits/rejected": 490272938.6666667, + "logps/chosen": -562.04599609375, + "logps/rejected": -456.4030354817708, + "loss": 0.058, + "rewards/chosen": 3.308892822265625, + "rewards/margins": 8.66103515625, + "rewards/rejected": -5.352142333984375, + "step": 848 + }, + { + "epoch": 0.07756966651439014, + "grad_norm": 3.046875, + "kl": 0.0, + "learning_rate": 9.858939689861506e-06, + "logits/chosen": 562377664.0, + "logits/rejected": 488206560.0, + "logps/chosen": -485.99285888671875, + "logps/rejected": -452.9732666015625, + "loss": 0.0168, + "rewards/chosen": 4.1304121017456055, + "rewards/margins": 10.107172966003418, + "rewards/rejected": -5.9767608642578125, + "step": 849 + }, + { + "epoch": 0.07766103243490179, + "grad_norm": 0.953125, + "kl": 0.0, + "learning_rate": 9.858600374792251e-06, + "logits/chosen": 394067232.0, + "logits/rejected": 403460010.6666667, + "logps/chosen": -346.31085205078125, + "logps/rejected": -319.8201904296875, + "loss": 0.0041, + "rewards/chosen": 4.358250617980957, + "rewards/margins": 12.224688529968262, + "rewards/rejected": -7.866437911987305, + "step": 850 + }, + { + "epoch": 0.07775239835541344, + "grad_norm": 3.296875, + "kl": 0.0, + "learning_rate": 9.85826065796176e-06, + "logits/chosen": 506411264.0, + "logits/rejected": 1066781888.0, + "logps/chosen": -326.39605712890625, + "logps/rejected": -753.07958984375, + "loss": 0.0115, + "rewards/chosen": 4.447136878967285, + "rewards/margins": 14.953072547912598, + "rewards/rejected": -10.505935668945312, + "step": 851 + }, + { + "epoch": 0.07784376427592508, + "grad_norm": 15.1875, + "kl": 0.0, + "learning_rate": 9.857920539398125e-06, + "logits/chosen": 866025344.0, + "logits/rejected": 768230272.0, + "logps/chosen": -211.85952758789062, + "logps/rejected": -558.0496826171875, + "loss": 0.1244, + "rewards/chosen": 1.6344547271728516, + "rewards/margins": 9.46333122253418, + "rewards/rejected": -7.828876495361328, + "step": 852 + }, + { + "epoch": 0.07793513019643673, + "grad_norm": 5.25, + "kl": 0.0, + "learning_rate": 9.85758001912947e-06, + "logits/chosen": 244501680.0, + "logits/rejected": 435218688.0, + "logps/chosen": -261.60675048828125, + "logps/rejected": -438.3638102213542, + "loss": 0.0277, + "rewards/chosen": 3.1598448753356934, + "rewards/margins": 11.670625845591227, + "rewards/rejected": -8.510780970255533, + "step": 853 + }, + { + "epoch": 0.07802649611694838, + "grad_norm": 4.5, + "kl": 0.0, + "learning_rate": 9.857239097183957e-06, + "logits/chosen": 419003801.6, + "logits/rejected": 652076032.0, + "logps/chosen": -274.382763671875, + "logps/rejected": -542.1344401041666, + "loss": 0.028, + "rewards/chosen": 3.28389892578125, + "rewards/margins": 12.237508392333984, + "rewards/rejected": -8.953609466552734, + "step": 854 + }, + { + "epoch": 0.07811786203746003, + "grad_norm": 6.8125, + "kl": 0.0, + "learning_rate": 9.856897773589774e-06, + "logits/chosen": 371663168.0, + "logits/rejected": 399878485.3333333, + "logps/chosen": -354.9049987792969, + "logps/rejected": -460.6004638671875, + "loss": 0.0283, + "rewards/chosen": 3.635903835296631, + "rewards/margins": 11.803812185923258, + "rewards/rejected": -8.167908350626627, + "step": 855 + }, + { + "epoch": 0.07820922795797168, + "grad_norm": 12.5625, + "kl": 0.0, + "learning_rate": 9.856556048375144e-06, + "logits/chosen": 508725973.3333333, + "logits/rejected": 595841638.4, + "logps/chosen": -407.1956787109375, + "logps/rejected": -350.8640869140625, + "loss": 0.04, + "rewards/chosen": 3.8218262990315757, + "rewards/margins": 9.772162755330404, + "rewards/rejected": -5.950336456298828, + "step": 856 + }, + { + "epoch": 0.07830059387848333, + "grad_norm": 2.734375, + "kl": 0.0, + "learning_rate": 9.856213921568326e-06, + "logits/chosen": 578821120.0, + "logits/rejected": 357331648.0, + "logps/chosen": -376.38995361328125, + "logps/rejected": -474.86993408203125, + "loss": 0.0132, + "rewards/chosen": 3.862875461578369, + "rewards/margins": 15.068378925323486, + "rewards/rejected": -11.205503463745117, + "step": 857 + }, + { + "epoch": 0.07839195979899498, + "grad_norm": 3.625, + "kl": 0.0, + "learning_rate": 9.85587139319761e-06, + "logits/chosen": 586581504.0, + "logits/rejected": 691816768.0, + "logps/chosen": -343.15478515625, + "logps/rejected": -550.4361572265625, + "loss": 0.0204, + "rewards/chosen": 3.242194414138794, + "rewards/margins": 10.229196310043335, + "rewards/rejected": -6.987001895904541, + "step": 858 + }, + { + "epoch": 0.07848332571950663, + "grad_norm": 6.875, + "kl": 0.0, + "learning_rate": 9.855528463291319e-06, + "logits/chosen": 611279667.2, + "logits/rejected": 447951701.3333333, + "logps/chosen": -455.907080078125, + "logps/rejected": -456.4017740885417, + "loss": 0.0441, + "rewards/chosen": 2.6775672912597654, + "rewards/margins": 11.618439483642579, + "rewards/rejected": -8.940872192382812, + "step": 859 + }, + { + "epoch": 0.07857469164001828, + "grad_norm": 2.875, + "kl": 0.0, + "learning_rate": 9.855185131877813e-06, + "logits/chosen": 475520768.0, + "logits/rejected": 436938020.5714286, + "logps/chosen": -293.3998718261719, + "logps/rejected": -531.3379603794643, + "loss": 0.0103, + "rewards/chosen": 2.564645528793335, + "rewards/margins": 10.517917667116437, + "rewards/rejected": -7.953272138323102, + "step": 860 + }, + { + "epoch": 0.07866605756052993, + "grad_norm": 7.03125, + "kl": 0.0, + "learning_rate": 9.854841398985483e-06, + "logits/chosen": 505762304.0, + "logits/rejected": 256920426.66666666, + "logps/chosen": -385.474755859375, + "logps/rejected": -201.76131184895834, + "loss": 0.0444, + "rewards/chosen": 3.7319087982177734, + "rewards/margins": 7.032958348592123, + "rewards/rejected": -3.301049550374349, + "step": 861 + }, + { + "epoch": 0.07875742348104157, + "grad_norm": 2.671875, + "kl": 0.0, + "learning_rate": 9.854497264642746e-06, + "logits/chosen": 1130576256.0, + "logits/rejected": 729201728.0, + "logps/chosen": -358.63323974609375, + "logps/rejected": -474.68548583984375, + "loss": 0.0162, + "rewards/chosen": 3.821213483810425, + "rewards/margins": 11.101911783218384, + "rewards/rejected": -7.280698299407959, + "step": 862 + }, + { + "epoch": 0.07884878940155322, + "grad_norm": 4.96875, + "kl": 0.0, + "learning_rate": 9.854152728878067e-06, + "logits/chosen": 379102720.0, + "logits/rejected": 636326528.0, + "logps/chosen": -324.63970947265625, + "logps/rejected": -333.0959777832031, + "loss": 0.0328, + "rewards/chosen": 3.520770311355591, + "rewards/margins": 8.912267923355103, + "rewards/rejected": -5.391497611999512, + "step": 863 + }, + { + "epoch": 0.07894015532206487, + "grad_norm": 16.125, + "kl": 0.0, + "learning_rate": 9.85380779171993e-06, + "logits/chosen": 268223216.0, + "logits/rejected": 624607488.0, + "logps/chosen": -138.75128173828125, + "logps/rejected": -611.5595296223959, + "loss": 0.0699, + "rewards/chosen": 1.5158047676086426, + "rewards/margins": 9.462459405263264, + "rewards/rejected": -7.946654637654622, + "step": 864 + }, + { + "epoch": 0.07903152124257652, + "grad_norm": 10.5, + "kl": 0.0, + "learning_rate": 9.85346245319686e-06, + "logits/chosen": 639637376.0, + "logits/rejected": 728188108.8, + "logps/chosen": -456.5653483072917, + "logps/rejected": -467.073828125, + "loss": 0.0569, + "rewards/chosen": 2.895838419596354, + "rewards/margins": 10.242788950602213, + "rewards/rejected": -7.346950531005859, + "step": 865 + }, + { + "epoch": 0.07912288716308817, + "grad_norm": 8.3125, + "kl": 0.0, + "learning_rate": 9.853116713337414e-06, + "logits/chosen": 513233536.0, + "logits/rejected": 370355968.0, + "logps/chosen": -377.34454345703125, + "logps/rejected": -379.7408854166667, + "loss": 0.027, + "rewards/chosen": 2.9054811000823975, + "rewards/margins": 10.146372238794964, + "rewards/rejected": -7.240891138712565, + "step": 866 + }, + { + "epoch": 0.07921425308359982, + "grad_norm": 5.375, + "kl": 0.0, + "learning_rate": 9.852770572170183e-06, + "logits/chosen": 475856341.3333333, + "logits/rejected": 319357260.8, + "logps/chosen": -438.9954833984375, + "logps/rejected": -385.3329833984375, + "loss": 0.0221, + "rewards/chosen": 3.085113525390625, + "rewards/margins": 9.925590515136719, + "rewards/rejected": -6.840476989746094, + "step": 867 + }, + { + "epoch": 0.07930561900411147, + "grad_norm": 1.1015625, + "kl": 0.0, + "learning_rate": 9.852424029723783e-06, + "logits/chosen": 722490112.0, + "logits/rejected": 562906806.8571428, + "logps/chosen": -399.5059509277344, + "logps/rejected": -516.7940499441964, + "loss": 0.0041, + "rewards/chosen": 3.8548614978790283, + "rewards/margins": 12.707765136446271, + "rewards/rejected": -8.852903638567243, + "step": 868 + }, + { + "epoch": 0.07939698492462312, + "grad_norm": 20.875, + "kl": 0.0, + "learning_rate": 9.852077086026879e-06, + "logits/chosen": 443572394.6666667, + "logits/rejected": 137470192.0, + "logps/chosen": -344.7961832682292, + "logps/rejected": -179.9152374267578, + "loss": 0.1176, + "rewards/chosen": 2.6573769251505532, + "rewards/margins": 8.908734480539957, + "rewards/rejected": -6.251357555389404, + "step": 869 + }, + { + "epoch": 0.07948835084513477, + "grad_norm": 17.5, + "kl": 0.0, + "learning_rate": 9.851729741108154e-06, + "logits/chosen": 713697024.0, + "logits/rejected": 515930521.6, + "logps/chosen": -364.031982421875, + "logps/rejected": -591.00751953125, + "loss": 0.0691, + "rewards/chosen": 1.7004003524780273, + "rewards/margins": 11.131117820739746, + "rewards/rejected": -9.430717468261719, + "step": 870 + }, + { + "epoch": 0.07957971676564642, + "grad_norm": 12.875, + "kl": 0.0, + "learning_rate": 9.85138199499633e-06, + "logits/chosen": 1274372266.6666667, + "logits/rejected": 639242137.6, + "logps/chosen": -449.6319986979167, + "logps/rejected": -537.91884765625, + "loss": 0.0506, + "rewards/chosen": 2.191603660583496, + "rewards/margins": 10.514243125915527, + "rewards/rejected": -8.322639465332031, + "step": 871 + }, + { + "epoch": 0.07967108268615807, + "grad_norm": 9.8125, + "kl": 0.0, + "learning_rate": 9.851033847720167e-06, + "logits/chosen": 357864755.2, + "logits/rejected": 488119040.0, + "logps/chosen": -422.48486328125, + "logps/rejected": -533.9914143880209, + "loss": 0.0453, + "rewards/chosen": 3.032606315612793, + "rewards/margins": 10.622194480895995, + "rewards/rejected": -7.589588165283203, + "step": 872 + }, + { + "epoch": 0.07976244860666971, + "grad_norm": 8.5625, + "kl": 0.0, + "learning_rate": 9.850685299308448e-06, + "logits/chosen": 478994048.0, + "logits/rejected": 618322048.0, + "logps/chosen": -365.8642578125, + "logps/rejected": -635.8369750976562, + "loss": 0.0561, + "rewards/chosen": 2.3513526916503906, + "rewards/margins": 11.962577819824219, + "rewards/rejected": -9.611225128173828, + "step": 873 + }, + { + "epoch": 0.07985381452718136, + "grad_norm": 5.8125, + "kl": 0.0, + "learning_rate": 9.850336349789999e-06, + "logits/chosen": 494455392.0, + "logits/rejected": 470337632.0, + "logps/chosen": -276.763671875, + "logps/rejected": -423.9932861328125, + "loss": 0.0312, + "rewards/chosen": 3.1586570739746094, + "rewards/margins": 10.618850708007812, + "rewards/rejected": -7.460193634033203, + "step": 874 + }, + { + "epoch": 0.07994518044769301, + "grad_norm": 13.25, + "kl": 0.0, + "learning_rate": 9.849986999193672e-06, + "logits/chosen": 614081126.4, + "logits/rejected": 577978752.0, + "logps/chosen": -282.44404296875, + "logps/rejected": -502.3086344401042, + "loss": 0.0884, + "rewards/chosen": 2.03417854309082, + "rewards/margins": 10.19498202006022, + "rewards/rejected": -8.1608034769694, + "step": 875 + }, + { + "epoch": 0.08003654636820466, + "grad_norm": 10.875, + "kl": 0.0, + "learning_rate": 9.849637247548356e-06, + "logits/chosen": 405432789.3333333, + "logits/rejected": 916441804.8, + "logps/chosen": -153.32757568359375, + "logps/rejected": -502.14619140625, + "loss": 0.0701, + "rewards/chosen": 2.8972352345784507, + "rewards/margins": 10.859796269734701, + "rewards/rejected": -7.96256103515625, + "step": 876 + }, + { + "epoch": 0.08012791228871631, + "grad_norm": 13.5, + "kl": 0.0, + "learning_rate": 9.849287094882974e-06, + "logits/chosen": 671457638.4, + "logits/rejected": 636300074.6666666, + "logps/chosen": -301.8183349609375, + "logps/rejected": -530.4553629557291, + "loss": 0.0729, + "rewards/chosen": 2.483479309082031, + "rewards/margins": 9.292991129557292, + "rewards/rejected": -6.809511820475261, + "step": 877 + }, + { + "epoch": 0.08021927820922796, + "grad_norm": 6.0625, + "kl": 0.0, + "learning_rate": 9.848936541226479e-06, + "logits/chosen": 339234099.2, + "logits/rejected": 262624789.33333334, + "logps/chosen": -305.8340087890625, + "logps/rejected": -390.6905110677083, + "loss": 0.032, + "rewards/chosen": 3.4181102752685546, + "rewards/margins": 11.674651209513346, + "rewards/rejected": -8.256540934244791, + "step": 878 + }, + { + "epoch": 0.08031064412973961, + "grad_norm": 12.25, + "kl": 0.0, + "learning_rate": 9.848585586607858e-06, + "logits/chosen": 879162368.0, + "logits/rejected": 597758272.0, + "logps/chosen": -346.4555257161458, + "logps/rejected": -588.4866943359375, + "loss": 0.0826, + "rewards/chosen": 3.0314610799153647, + "rewards/margins": 11.997607549031576, + "rewards/rejected": -8.966146469116211, + "step": 879 + }, + { + "epoch": 0.08040201005025126, + "grad_norm": 16.875, + "kl": 0.0, + "learning_rate": 9.84823423105613e-06, + "logits/chosen": 707606080.0, + "logits/rejected": 804722816.0, + "logps/chosen": -275.7142333984375, + "logps/rejected": -708.82568359375, + "loss": 0.071, + "rewards/chosen": 2.4652938842773438, + "rewards/margins": 10.968857765197754, + "rewards/rejected": -8.50356388092041, + "step": 880 + }, + { + "epoch": 0.0804933759707629, + "grad_norm": 14.5, + "kl": 0.0, + "learning_rate": 9.847882474600355e-06, + "logits/chosen": 741227520.0, + "logits/rejected": 392684480.0, + "logps/chosen": -223.68792724609375, + "logps/rejected": -303.38287353515625, + "loss": 0.2421, + "rewards/chosen": 1.10690176486969, + "rewards/margins": 7.8768497705459595, + "rewards/rejected": -6.7699480056762695, + "step": 881 + }, + { + "epoch": 0.08058474189127456, + "grad_norm": 14.0625, + "kl": 0.0, + "learning_rate": 9.847530317269613e-06, + "logits/chosen": 883854165.3333334, + "logits/rejected": 453460633.6, + "logps/chosen": -436.162109375, + "logps/rejected": -367.3330078125, + "loss": 0.064, + "rewards/chosen": 3.1649659474690757, + "rewards/margins": 7.242706807454427, + "rewards/rejected": -4.077740859985352, + "step": 882 + }, + { + "epoch": 0.0806761078117862, + "grad_norm": 8.1875, + "kl": 0.0, + "learning_rate": 9.847177759093028e-06, + "logits/chosen": 581961088.0, + "logits/rejected": 1152923136.0, + "logps/chosen": -289.498779296875, + "logps/rejected": -449.34033203125, + "loss": 0.0557, + "rewards/chosen": 2.6053857803344727, + "rewards/margins": 8.226166248321533, + "rewards/rejected": -5.6207804679870605, + "step": 883 + }, + { + "epoch": 0.08076747373229785, + "grad_norm": 12.8125, + "kl": 0.0, + "learning_rate": 9.846824800099752e-06, + "logits/chosen": 501508778.6666667, + "logits/rejected": 856830566.4, + "logps/chosen": -211.32828776041666, + "logps/rejected": -443.5927734375, + "loss": 0.0793, + "rewards/chosen": 1.620057741800944, + "rewards/margins": 8.206077639261881, + "rewards/rejected": -6.586019897460938, + "step": 884 + }, + { + "epoch": 0.0808588396528095, + "grad_norm": 5.09375, + "kl": 0.0, + "learning_rate": 9.846471440318972e-06, + "logits/chosen": 408758656.0, + "logits/rejected": 530166112.0, + "logps/chosen": -305.1141357421875, + "logps/rejected": -247.98281860351562, + "loss": 0.0273, + "rewards/chosen": 3.9253344535827637, + "rewards/margins": 8.458780288696289, + "rewards/rejected": -4.533445835113525, + "step": 885 + }, + { + "epoch": 0.08095020557332115, + "grad_norm": 3.015625, + "kl": 0.0, + "learning_rate": 9.846117679779907e-06, + "logits/chosen": 450684723.2, + "logits/rejected": 378767445.3333333, + "logps/chosen": -385.6818359375, + "logps/rejected": -449.9532877604167, + "loss": 0.0157, + "rewards/chosen": 4.393140411376953, + "rewards/margins": 11.071932093302408, + "rewards/rejected": -6.678791681925456, + "step": 886 + }, + { + "epoch": 0.0810415714938328, + "grad_norm": 8.6875, + "kl": 0.0, + "learning_rate": 9.845763518511812e-06, + "logits/chosen": 481349120.0, + "logits/rejected": 1020669644.8, + "logps/chosen": -269.4798990885417, + "logps/rejected": -530.6244140625, + "loss": 0.0335, + "rewards/chosen": 3.372655232747396, + "rewards/margins": 9.676626332600911, + "rewards/rejected": -6.3039710998535154, + "step": 887 + }, + { + "epoch": 0.08113293741434445, + "grad_norm": 5.25, + "kl": 0.0, + "learning_rate": 9.84540895654397e-06, + "logits/chosen": 701933696.0, + "logits/rejected": 314309888.0, + "logps/chosen": -473.4053039550781, + "logps/rejected": -431.9639892578125, + "loss": 0.033, + "rewards/chosen": 2.7072839736938477, + "rewards/margins": 9.394261360168457, + "rewards/rejected": -6.686977386474609, + "step": 888 + }, + { + "epoch": 0.0812243033348561, + "grad_norm": 5.1875, + "kl": 0.0, + "learning_rate": 9.8450539939057e-06, + "logits/chosen": 714692915.2, + "logits/rejected": 337036928.0, + "logps/chosen": -232.41376953125, + "logps/rejected": -288.2277018229167, + "loss": 0.0411, + "rewards/chosen": 2.8264646530151367, + "rewards/margins": 8.205086072285969, + "rewards/rejected": -5.378621419270833, + "step": 889 + }, + { + "epoch": 0.08131566925536775, + "grad_norm": 16.25, + "kl": 0.0, + "learning_rate": 9.844698630626357e-06, + "logits/chosen": 1050881280.0, + "logits/rejected": 751855786.6666666, + "logps/chosen": -216.55535888671875, + "logps/rejected": -486.7192789713542, + "loss": 0.0611, + "rewards/chosen": 4.373621463775635, + "rewards/margins": 8.970712502797443, + "rewards/rejected": -4.59709103902181, + "step": 890 + }, + { + "epoch": 0.0814070351758794, + "grad_norm": 6.125, + "kl": 0.0, + "learning_rate": 9.844342866735323e-06, + "logits/chosen": 549417792.0, + "logits/rejected": 567813120.0, + "logps/chosen": -321.7340087890625, + "logps/rejected": -435.8883056640625, + "loss": 0.0287, + "rewards/chosen": 2.431307315826416, + "rewards/margins": 9.986441135406494, + "rewards/rejected": -7.555133819580078, + "step": 891 + }, + { + "epoch": 0.08149840109639105, + "grad_norm": 20.5, + "kl": 0.0, + "learning_rate": 9.843986702262019e-06, + "logits/chosen": 579892352.0, + "logits/rejected": 397302176.0, + "logps/chosen": -434.6622314453125, + "logps/rejected": -596.941162109375, + "loss": 0.0672, + "rewards/chosen": 2.9124226570129395, + "rewards/margins": 6.85904598236084, + "rewards/rejected": -3.9466233253479004, + "step": 892 + }, + { + "epoch": 0.0815897670169027, + "grad_norm": 10.5, + "kl": 0.0, + "learning_rate": 9.843630137235894e-06, + "logits/chosen": 338211712.0, + "logits/rejected": 799631155.2, + "logps/chosen": -127.65081787109375, + "logps/rejected": -363.6611572265625, + "loss": 0.0871, + "rewards/chosen": 2.591132481892904, + "rewards/margins": 9.502517064412435, + "rewards/rejected": -6.911384582519531, + "step": 893 + }, + { + "epoch": 0.08168113293741434, + "grad_norm": 6.8125, + "kl": 0.0, + "learning_rate": 9.843273171686433e-06, + "logits/chosen": 404212224.0, + "logits/rejected": 519197792.0, + "logps/chosen": -316.6748046875, + "logps/rejected": -483.09478759765625, + "loss": 0.0397, + "rewards/chosen": 2.602832794189453, + "rewards/margins": 8.643999576568604, + "rewards/rejected": -6.04116678237915, + "step": 894 + }, + { + "epoch": 0.081772498857926, + "grad_norm": 13.5, + "kl": 0.0, + "learning_rate": 9.842915805643156e-06, + "logits/chosen": 927890329.6, + "logits/rejected": 1441806677.3333333, + "logps/chosen": -305.024267578125, + "logps/rejected": -456.760009765625, + "loss": 0.1093, + "rewards/chosen": 2.890830421447754, + "rewards/margins": 9.021434211730957, + "rewards/rejected": -6.130603790283203, + "step": 895 + }, + { + "epoch": 0.08186386477843764, + "grad_norm": 7.1875, + "kl": 0.0, + "learning_rate": 9.842558039135612e-06, + "logits/chosen": 595647634.2857143, + "logits/rejected": 588278400.0, + "logps/chosen": -424.2426060267857, + "logps/rejected": -462.8019104003906, + "loss": 0.0493, + "rewards/chosen": 3.071575437273298, + "rewards/margins": 7.2566616875784735, + "rewards/rejected": -4.185086250305176, + "step": 896 + }, + { + "epoch": 0.08195523069894929, + "grad_norm": 3.921875, + "kl": 0.0, + "learning_rate": 9.842199872193386e-06, + "logits/chosen": 608961322.6666666, + "logits/rejected": 652129075.2, + "logps/chosen": -408.60205078125, + "logps/rejected": -691.113916015625, + "loss": 0.0171, + "rewards/chosen": 3.1905539830525718, + "rewards/margins": 14.188039334615071, + "rewards/rejected": -10.9974853515625, + "step": 897 + }, + { + "epoch": 0.08204659661946094, + "grad_norm": 8.125, + "kl": 0.0, + "learning_rate": 9.841841304846096e-06, + "logits/chosen": 489420416.0, + "logits/rejected": 601549056.0, + "logps/chosen": -353.19744873046875, + "logps/rejected": -506.09161376953125, + "loss": 0.043, + "rewards/chosen": 2.6038150787353516, + "rewards/margins": 10.440775394439697, + "rewards/rejected": -7.836960315704346, + "step": 898 + }, + { + "epoch": 0.08213796253997259, + "grad_norm": 1.140625, + "kl": 0.0, + "learning_rate": 9.841482337123388e-06, + "logits/chosen": 606755413.3333334, + "logits/rejected": 576710144.0, + "logps/chosen": -246.52716064453125, + "logps/rejected": -572.3224609375, + "loss": 0.0052, + "rewards/chosen": 4.3272043863932295, + "rewards/margins": 14.402822367350261, + "rewards/rejected": -10.07561798095703, + "step": 899 + }, + { + "epoch": 0.08222932846048424, + "grad_norm": 16.875, + "kl": 0.0, + "learning_rate": 9.84112296905495e-06, + "logits/chosen": 459141546.6666667, + "logits/rejected": 373895232.0, + "logps/chosen": -403.8658040364583, + "logps/rejected": -344.5965576171875, + "loss": 0.086, + "rewards/chosen": 2.639477094014486, + "rewards/margins": 10.09300978978475, + "rewards/rejected": -7.453532695770264, + "step": 900 + }, + { + "epoch": 0.08232069438099589, + "grad_norm": 3.453125, + "kl": 0.0, + "learning_rate": 9.840763200670495e-06, + "logits/chosen": 289914560.0, + "logits/rejected": 451451200.0, + "logps/chosen": -224.0843505859375, + "logps/rejected": -431.79022216796875, + "loss": 0.018, + "rewards/chosen": 3.6181163787841797, + "rewards/margins": 13.098888397216797, + "rewards/rejected": -9.480772018432617, + "step": 901 + }, + { + "epoch": 0.08241206030150754, + "grad_norm": 4.78125, + "kl": 0.0, + "learning_rate": 9.840403031999775e-06, + "logits/chosen": 480497728.0, + "logits/rejected": 505047552.0, + "logps/chosen": -319.0423889160156, + "logps/rejected": -501.798095703125, + "loss": 0.0235, + "rewards/chosen": 3.6492156982421875, + "rewards/margins": 11.036492347717285, + "rewards/rejected": -7.387276649475098, + "step": 902 + }, + { + "epoch": 0.08250342622201919, + "grad_norm": 7.1875, + "kl": 0.0, + "learning_rate": 9.84004246307257e-06, + "logits/chosen": 807949824.0, + "logits/rejected": 1348703232.0, + "logps/chosen": -329.5458984375, + "logps/rejected": -764.4849853515625, + "loss": 0.0485, + "rewards/chosen": 2.801300366719564, + "rewards/margins": 14.089714368184408, + "rewards/rejected": -11.288414001464844, + "step": 903 + }, + { + "epoch": 0.08259479214253083, + "grad_norm": 11.9375, + "kl": 0.0, + "learning_rate": 9.8396814939187e-06, + "logits/chosen": 1204993126.4, + "logits/rejected": 654626730.6666666, + "logps/chosen": -348.366650390625, + "logps/rejected": -378.200439453125, + "loss": 0.0916, + "rewards/chosen": 2.16286563873291, + "rewards/margins": 10.955029487609863, + "rewards/rejected": -8.792163848876953, + "step": 904 + }, + { + "epoch": 0.08268615806304248, + "grad_norm": 7.6875, + "kl": 0.0, + "learning_rate": 9.839320124568008e-06, + "logits/chosen": 792897877.3333334, + "logits/rejected": 1854958592.0, + "logps/chosen": -482.3379720052083, + "logps/rejected": -625.2650146484375, + "loss": 0.0562, + "rewards/chosen": 2.968378702799479, + "rewards/margins": 15.117247263590494, + "rewards/rejected": -12.148868560791016, + "step": 905 + }, + { + "epoch": 0.08277752398355413, + "grad_norm": 16.5, + "kl": 0.0, + "learning_rate": 9.83895835505038e-06, + "logits/chosen": 547275922.2857143, + "logits/rejected": 2544422656.0, + "logps/chosen": -342.18143136160717, + "logps/rejected": -338.54913330078125, + "loss": 0.1097, + "rewards/chosen": 2.459438051496233, + "rewards/margins": 10.161873544965472, + "rewards/rejected": -7.702435493469238, + "step": 906 + }, + { + "epoch": 0.08286888990406578, + "grad_norm": 9.6875, + "kl": 0.0, + "learning_rate": 9.838596185395732e-06, + "logits/chosen": 429010602.6666667, + "logits/rejected": 534404147.2, + "logps/chosen": -197.35294596354166, + "logps/rejected": -313.5906982421875, + "loss": 0.156, + "rewards/chosen": 2.5536392529805503, + "rewards/margins": 8.393460877736409, + "rewards/rejected": -5.839821624755859, + "step": 907 + }, + { + "epoch": 0.08296025582457743, + "grad_norm": 14.4375, + "kl": 0.0, + "learning_rate": 9.838233615634007e-06, + "logits/chosen": 686251392.0, + "logits/rejected": 437590304.0, + "logps/chosen": -285.69775390625, + "logps/rejected": -545.8499755859375, + "loss": 0.1157, + "rewards/chosen": 2.417543411254883, + "rewards/margins": 10.363914489746094, + "rewards/rejected": -7.946371078491211, + "step": 908 + }, + { + "epoch": 0.08305162174508908, + "grad_norm": 7.4375, + "kl": 0.0, + "learning_rate": 9.837870645795192e-06, + "logits/chosen": 808821824.0, + "logits/rejected": 1077092096.0, + "logps/chosen": -327.8791809082031, + "logps/rejected": -690.6046752929688, + "loss": 0.0343, + "rewards/chosen": 3.4259839057922363, + "rewards/margins": 14.30658769607544, + "rewards/rejected": -10.880603790283203, + "step": 909 + }, + { + "epoch": 0.08314298766560073, + "grad_norm": 5.96875, + "kl": 0.0, + "learning_rate": 9.837507275909296e-06, + "logits/chosen": 685702656.0, + "logits/rejected": 552279040.0, + "logps/chosen": -326.2044372558594, + "logps/rejected": -539.3518676757812, + "loss": 0.0331, + "rewards/chosen": 2.794076919555664, + "rewards/margins": 9.256338596343994, + "rewards/rejected": -6.46226167678833, + "step": 910 + }, + { + "epoch": 0.08323435358611238, + "grad_norm": 9.1875, + "kl": 0.0, + "learning_rate": 9.837143506006372e-06, + "logits/chosen": 432515712.0, + "logits/rejected": 349574976.0, + "logps/chosen": -505.98858642578125, + "logps/rejected": -442.1077880859375, + "loss": 0.0414, + "rewards/chosen": 3.1245040893554688, + "rewards/margins": 11.612892150878906, + "rewards/rejected": -8.488388061523438, + "step": 911 + }, + { + "epoch": 0.08332571950662403, + "grad_norm": 11.4375, + "kl": 0.0, + "learning_rate": 9.836779336116495e-06, + "logits/chosen": 539799705.6, + "logits/rejected": 823465557.3333334, + "logps/chosen": -281.978466796875, + "logps/rejected": -306.69276936848956, + "loss": 0.0684, + "rewards/chosen": 2.7152097702026365, + "rewards/margins": 7.428470738728841, + "rewards/rejected": -4.713260968526204, + "step": 912 + }, + { + "epoch": 0.08341708542713568, + "grad_norm": 4.1875, + "kl": 0.0, + "learning_rate": 9.83641476626978e-06, + "logits/chosen": 372482730.6666667, + "logits/rejected": 379102208.0, + "logps/chosen": -334.3306884765625, + "logps/rejected": -356.184619140625, + "loss": 0.0222, + "rewards/chosen": 3.129518508911133, + "rewards/margins": 10.204468154907227, + "rewards/rejected": -7.074949645996094, + "step": 913 + }, + { + "epoch": 0.08350845134764733, + "grad_norm": 6.28125, + "kl": 0.0, + "learning_rate": 9.836049796496376e-06, + "logits/chosen": 360239872.0, + "logits/rejected": 288060083.2, + "logps/chosen": -203.99462890625, + "logps/rejected": -456.82900390625, + "loss": 0.2356, + "rewards/chosen": 0.0740049680074056, + "rewards/margins": 7.559249718983968, + "rewards/rejected": -7.4852447509765625, + "step": 914 + }, + { + "epoch": 0.08359981726815897, + "grad_norm": 3.8125, + "kl": 0.0, + "learning_rate": 9.835684426826459e-06, + "logits/chosen": 423454080.0, + "logps/chosen": -294.7293701171875, + "loss": 0.022, + "rewards/chosen": 3.8718643188476562, + "step": 915 + }, + { + "epoch": 0.08369118318867062, + "grad_norm": 4.28125, + "kl": 0.0, + "learning_rate": 9.835318657290245e-06, + "logits/chosen": 579468288.0, + "logits/rejected": 720136832.0, + "logps/chosen": -286.87646484375, + "logps/rejected": -783.816162109375, + "loss": 0.0223, + "rewards/chosen": 3.456399917602539, + "rewards/margins": 11.77485179901123, + "rewards/rejected": -8.318451881408691, + "step": 916 + }, + { + "epoch": 0.08378254910918227, + "grad_norm": 8.1875, + "kl": 0.0, + "learning_rate": 9.834952487917976e-06, + "logits/chosen": 521466953.14285713, + "logits/rejected": 309977504.0, + "logps/chosen": -305.704833984375, + "logps/rejected": -388.9895935058594, + "loss": 0.0617, + "rewards/chosen": 2.811994825090681, + "rewards/margins": 11.226048742021833, + "rewards/rejected": -8.414053916931152, + "step": 917 + }, + { + "epoch": 0.08387391502969392, + "grad_norm": 1.0234375, + "kl": 0.0, + "learning_rate": 9.834585918739936e-06, + "logits/chosen": 442556896.0, + "logits/rejected": 928800621.7142857, + "logps/chosen": -198.34515380859375, + "logps/rejected": -382.22935267857144, + "loss": 0.0048, + "rewards/chosen": 4.328405857086182, + "rewards/margins": 10.959793840135847, + "rewards/rejected": -6.631387983049665, + "step": 918 + }, + { + "epoch": 0.08396528095020557, + "grad_norm": 7.75, + "kl": 0.0, + "learning_rate": 9.834218949786433e-06, + "logits/chosen": 405567846.4, + "logits/rejected": 472809258.6666667, + "logps/chosen": -338.80576171875, + "logps/rejected": -588.5767415364584, + "loss": 0.0603, + "rewards/chosen": 2.278251075744629, + "rewards/margins": 11.194430732727051, + "rewards/rejected": -8.916179656982422, + "step": 919 + }, + { + "epoch": 0.08405664687071722, + "grad_norm": 5.53125, + "kl": 0.0, + "learning_rate": 9.833851581087812e-06, + "logits/chosen": 756909738.6666666, + "logits/rejected": 1151619072.0, + "logps/chosen": -314.432861328125, + "logps/rejected": -566.3150390625, + "loss": 0.0335, + "rewards/chosen": 2.583073616027832, + "rewards/margins": 11.889653205871582, + "rewards/rejected": -9.30657958984375, + "step": 920 + }, + { + "epoch": 0.08414801279122887, + "grad_norm": 6.0625, + "kl": 0.0, + "learning_rate": 9.833483812674453e-06, + "logits/chosen": 480402976.0, + "logits/rejected": 535693376.0, + "logps/chosen": -314.9220886230469, + "logps/rejected": -568.623291015625, + "loss": 0.0401, + "rewards/chosen": 2.7880849838256836, + "rewards/margins": 9.916111946105957, + "rewards/rejected": -7.128026962280273, + "step": 921 + }, + { + "epoch": 0.08423937871174052, + "grad_norm": 4.625, + "kl": 0.0, + "learning_rate": 9.833115644576765e-06, + "logits/chosen": 372659456.0, + "logits/rejected": 277490240.0, + "logps/chosen": -335.244677734375, + "logps/rejected": -335.1678059895833, + "loss": 0.0256, + "rewards/chosen": 3.411955642700195, + "rewards/margins": 12.168318557739259, + "rewards/rejected": -8.756362915039062, + "step": 922 + }, + { + "epoch": 0.08433074463225217, + "grad_norm": 12.625, + "kl": 0.0, + "learning_rate": 9.832747076825194e-06, + "logits/chosen": 734572160.0, + "logits/rejected": 1241604480.0, + "logps/chosen": -331.78179931640625, + "logps/rejected": -516.0267333984375, + "loss": 0.059, + "rewards/chosen": 3.0237903594970703, + "rewards/margins": 10.585889339447021, + "rewards/rejected": -7.562098979949951, + "step": 923 + }, + { + "epoch": 0.08442211055276382, + "grad_norm": 15.8125, + "kl": 0.0, + "learning_rate": 9.832378109450215e-06, + "logits/chosen": 603946944.0, + "logits/rejected": 800434858.6666666, + "logps/chosen": -307.4272766113281, + "logps/rejected": -516.3587239583334, + "loss": 0.1653, + "rewards/chosen": 2.1555709838867188, + "rewards/margins": 8.732133229573567, + "rewards/rejected": -6.576562245686849, + "step": 924 + }, + { + "epoch": 0.08451347647327546, + "grad_norm": 4.40625, + "kl": 0.0, + "learning_rate": 9.832008742482341e-06, + "logits/chosen": 348889600.0, + "logits/rejected": 720086016.0, + "logps/chosen": -294.07421875, + "logps/rejected": -451.01348876953125, + "loss": 0.022, + "rewards/chosen": 3.43782114982605, + "rewards/margins": 10.984430074691772, + "rewards/rejected": -7.546608924865723, + "step": 925 + }, + { + "epoch": 0.08460484239378711, + "grad_norm": 17.75, + "kl": 0.0, + "learning_rate": 9.831638975952112e-06, + "logits/chosen": 440336896.0, + "logits/rejected": 411672149.3333333, + "logps/chosen": -450.974169921875, + "logps/rejected": -351.1243082682292, + "loss": 0.1035, + "rewards/chosen": 3.051258659362793, + "rewards/margins": 6.537962849934896, + "rewards/rejected": -3.486704190572103, + "step": 926 + }, + { + "epoch": 0.08469620831429876, + "grad_norm": 4.375, + "kl": 0.0, + "learning_rate": 9.831268809890106e-06, + "logits/chosen": 554507673.6, + "logits/rejected": 629033045.3333334, + "logps/chosen": -370.614697265625, + "logps/rejected": -670.7145182291666, + "loss": 0.0303, + "rewards/chosen": 3.164307403564453, + "rewards/margins": 9.402017084757487, + "rewards/rejected": -6.237709681193034, + "step": 927 + }, + { + "epoch": 0.08478757423481041, + "grad_norm": 0.58203125, + "kl": 0.0, + "learning_rate": 9.830898244326932e-06, + "logits/chosen": 206385184.0, + "logits/rejected": 414056832.0, + "logps/chosen": -172.99969482421875, + "logps/rejected": -522.5264485677084, + "loss": 0.0036, + "rewards/chosen": 4.8137736320495605, + "rewards/margins": 13.64286184310913, + "rewards/rejected": -8.82908821105957, + "step": 928 + }, + { + "epoch": 0.08487894015532206, + "grad_norm": 16.625, + "kl": 0.0, + "learning_rate": 9.830527279293233e-06, + "logits/chosen": 463931776.0, + "logits/rejected": 791813248.0, + "logps/chosen": -263.0253601074219, + "logps/rejected": -391.0342102050781, + "loss": 0.098, + "rewards/chosen": 3.1302051544189453, + "rewards/margins": 7.327080249786377, + "rewards/rejected": -4.196875095367432, + "step": 929 + }, + { + "epoch": 0.08497030607583371, + "grad_norm": 1.3125, + "kl": 0.0, + "learning_rate": 9.830155914819684e-06, + "logits/chosen": 340812064.0, + "logits/rejected": 677730624.0, + "logps/chosen": -207.1702880859375, + "logps/rejected": -451.4100341796875, + "loss": 0.0107, + "rewards/chosen": 4.192658424377441, + "rewards/margins": 13.76969051361084, + "rewards/rejected": -9.577032089233398, + "step": 930 + }, + { + "epoch": 0.08506167199634536, + "grad_norm": 7.625, + "kl": 0.0, + "learning_rate": 9.829784150936994e-06, + "logits/chosen": 501019733.3333333, + "logits/rejected": 396162496.0, + "logps/chosen": -387.609619140625, + "logps/rejected": -373.0204162597656, + "loss": 0.0547, + "rewards/chosen": 2.8864358266194663, + "rewards/margins": 6.7767785390218105, + "rewards/rejected": -3.8903427124023438, + "step": 931 + }, + { + "epoch": 0.08515303791685701, + "grad_norm": 7.75, + "kl": 0.0, + "learning_rate": 9.829411987675902e-06, + "logits/chosen": 571366656.0, + "logits/rejected": 716413056.0, + "logps/chosen": -389.9838562011719, + "logps/rejected": -421.4088439941406, + "loss": 0.0261, + "rewards/chosen": 3.7194161415100098, + "rewards/margins": 10.742266178131104, + "rewards/rejected": -7.022850036621094, + "step": 932 + }, + { + "epoch": 0.08524440383736866, + "grad_norm": 7.46875, + "kl": 0.0, + "learning_rate": 9.829039425067185e-06, + "logits/chosen": 930743936.0, + "logits/rejected": 411365856.0, + "logps/chosen": -346.1593017578125, + "logps/rejected": -317.84368896484375, + "loss": 0.0384, + "rewards/chosen": 2.69584584236145, + "rewards/margins": 10.708574533462524, + "rewards/rejected": -8.012728691101074, + "step": 933 + }, + { + "epoch": 0.0853357697578803, + "grad_norm": 8.375, + "kl": 0.0, + "learning_rate": 9.828666463141649e-06, + "logits/chosen": 649813094.4, + "logits/rejected": 510910208.0, + "logps/chosen": -235.987353515625, + "logps/rejected": -395.0543619791667, + "loss": 0.1055, + "rewards/chosen": 2.321617317199707, + "rewards/margins": 9.62379093170166, + "rewards/rejected": -7.302173614501953, + "step": 934 + }, + { + "epoch": 0.08542713567839195, + "grad_norm": 7.09375, + "kl": 0.0, + "learning_rate": 9.828293101930135e-06, + "logits/chosen": 207623584.0, + "logits/rejected": 478882742.85714287, + "logps/chosen": -152.9741668701172, + "logps/rejected": -342.0668247767857, + "loss": 0.0993, + "rewards/chosen": 4.680726528167725, + "rewards/margins": 10.10669960294451, + "rewards/rejected": -5.425973074776786, + "step": 935 + }, + { + "epoch": 0.0855185015989036, + "grad_norm": 24.625, + "kl": 0.0, + "learning_rate": 9.827919341463516e-06, + "logits/chosen": 484261760.0, + "logps/chosen": -202.13221740722656, + "loss": 0.1475, + "rewards/chosen": 2.5386414527893066, + "step": 936 + }, + { + "epoch": 0.08560986751941525, + "grad_norm": 14.3125, + "kl": 0.0, + "learning_rate": 9.827545181772701e-06, + "logits/chosen": 897422540.8, + "logits/rejected": 358661845.3333333, + "logps/chosen": -331.755859375, + "logps/rejected": -401.3495686848958, + "loss": 0.0937, + "rewards/chosen": 2.104297065734863, + "rewards/margins": 11.00503355662028, + "rewards/rejected": -8.900736490885416, + "step": 937 + }, + { + "epoch": 0.0857012334399269, + "grad_norm": 4.65625, + "kl": 0.0, + "learning_rate": 9.827170622888627e-06, + "logits/chosen": 771653248.0, + "logits/rejected": 495990304.0, + "logps/chosen": -256.9658508300781, + "logps/rejected": -514.4462890625, + "loss": 0.0365, + "rewards/chosen": 3.0520474910736084, + "rewards/margins": 11.248615026473999, + "rewards/rejected": -8.19656753540039, + "step": 938 + }, + { + "epoch": 0.08579259936043855, + "grad_norm": 6.875, + "kl": 0.0, + "learning_rate": 9.826795664842266e-06, + "logits/chosen": 652163993.6, + "logits/rejected": 868999680.0, + "logps/chosen": -398.0089599609375, + "logps/rejected": -589.3224283854166, + "loss": 0.0456, + "rewards/chosen": 2.9702388763427736, + "rewards/margins": 11.245890172322593, + "rewards/rejected": -8.275651295979818, + "step": 939 + }, + { + "epoch": 0.0858839652809502, + "grad_norm": 8.1875, + "kl": 0.0, + "learning_rate": 9.826420307664626e-06, + "logits/chosen": 704853589.3333334, + "logits/rejected": 288472800.0, + "logps/chosen": -374.1205240885417, + "logps/rejected": -439.689453125, + "loss": 0.0467, + "rewards/chosen": 2.892204920450846, + "rewards/margins": 11.81495730082194, + "rewards/rejected": -8.922752380371094, + "step": 940 + }, + { + "epoch": 0.08597533120146185, + "grad_norm": 11.625, + "kl": 0.0, + "learning_rate": 9.826044551386743e-06, + "logits/chosen": 354514368.0, + "logits/rejected": 299468748.8, + "logps/chosen": -247.6666463216146, + "logps/rejected": -348.76376953125, + "loss": 0.0655, + "rewards/chosen": 2.3784221013387046, + "rewards/margins": 7.286971600850423, + "rewards/rejected": -4.908549499511719, + "step": 941 + }, + { + "epoch": 0.0860666971219735, + "grad_norm": 12.8125, + "kl": 0.0, + "learning_rate": 9.825668396039691e-06, + "logits/chosen": 282057173.3333333, + "logits/rejected": 443715532.8, + "logps/chosen": -143.82100423177084, + "logps/rejected": -456.96591796875, + "loss": 0.0691, + "rewards/chosen": 1.6393283208211262, + "rewards/margins": 9.109667523701987, + "rewards/rejected": -7.47033920288086, + "step": 942 + }, + { + "epoch": 0.08615806304248515, + "grad_norm": 5.03125, + "kl": 0.0, + "learning_rate": 9.825291841654574e-06, + "logits/chosen": 401227443.2, + "logits/rejected": 603458005.3333334, + "logps/chosen": -352.238330078125, + "logps/rejected": -594.1014811197916, + "loss": 0.0299, + "rewards/chosen": 3.216664505004883, + "rewards/margins": 11.17700055440267, + "rewards/rejected": -7.960336049397786, + "step": 943 + }, + { + "epoch": 0.0862494289629968, + "grad_norm": 12.1875, + "kl": 0.0, + "learning_rate": 9.82491488826253e-06, + "logits/chosen": 682487168.0, + "logits/rejected": 469643072.0, + "logps/chosen": -334.786376953125, + "logps/rejected": -467.7234802246094, + "loss": 0.0414, + "rewards/chosen": 3.3409059047698975, + "rewards/margins": 11.236187219619751, + "rewards/rejected": -7.8952813148498535, + "step": 944 + }, + { + "epoch": 0.08634079488350845, + "grad_norm": 4.3125, + "kl": 0.0, + "learning_rate": 9.824537535894725e-06, + "logits/chosen": 1049876096.0, + "logits/rejected": 691606144.0, + "logps/chosen": -360.224609375, + "logps/rejected": -435.4784851074219, + "loss": 0.0308, + "rewards/chosen": 3.1262524127960205, + "rewards/margins": 9.085870027542114, + "rewards/rejected": -5.959617614746094, + "step": 945 + }, + { + "epoch": 0.0864321608040201, + "grad_norm": 4.75, + "kl": 0.0, + "learning_rate": 9.82415978458237e-06, + "logits/chosen": 599779532.8, + "logits/rejected": 551869610.6666666, + "logps/chosen": -408.0913818359375, + "logps/rejected": -406.9370930989583, + "loss": 0.0274, + "rewards/chosen": 3.5557594299316406, + "rewards/margins": 10.628111521402996, + "rewards/rejected": -7.0723520914713545, + "step": 946 + }, + { + "epoch": 0.08652352672453174, + "grad_norm": 3.546875, + "kl": 0.0, + "learning_rate": 9.823781634356695e-06, + "logits/chosen": 423604309.3333333, + "logits/rejected": 671483699.2, + "logps/chosen": -315.48061116536456, + "logps/rejected": -626.303125, + "loss": 0.0229, + "rewards/chosen": 3.5800374348958335, + "rewards/margins": 9.869065602620443, + "rewards/rejected": -6.289028167724609, + "step": 947 + }, + { + "epoch": 0.08661489264504339, + "grad_norm": 20.375, + "kl": 0.0, + "learning_rate": 9.823403085248973e-06, + "logits/chosen": 999968768.0, + "logits/rejected": 311308672.0, + "logps/chosen": -433.83795166015625, + "logps/rejected": -317.3890380859375, + "loss": 0.1266, + "rewards/chosen": 1.6527526378631592, + "rewards/margins": 8.999078035354614, + "rewards/rejected": -7.346325397491455, + "step": 948 + }, + { + "epoch": 0.08670625856555504, + "grad_norm": 7.5625, + "kl": 0.0, + "learning_rate": 9.823024137290505e-06, + "logits/chosen": 436397098.6666667, + "logits/rejected": 310017766.4, + "logps/chosen": -348.125, + "logps/rejected": -384.1813232421875, + "loss": 0.041, + "rewards/chosen": 2.9669011433919272, + "rewards/margins": 10.30278294881185, + "rewards/rejected": -7.3358818054199215, + "step": 949 + }, + { + "epoch": 0.08679762448606669, + "grad_norm": 4.75, + "kl": 0.0, + "learning_rate": 9.822644790512628e-06, + "logits/chosen": 735271082.6666666, + "logits/rejected": 1078353817.6, + "logps/chosen": -252.7931925455729, + "logps/rejected": -755.595849609375, + "loss": 0.0249, + "rewards/chosen": 3.4677133560180664, + "rewards/margins": 13.178658485412598, + "rewards/rejected": -9.710945129394531, + "step": 950 + }, + { + "epoch": 0.08688899040657834, + "grad_norm": 15.4375, + "kl": 0.0, + "learning_rate": 9.822265044946709e-06, + "logits/chosen": 399697408.0, + "logits/rejected": 380847317.3333333, + "logps/chosen": -259.557373046875, + "logps/rejected": -375.8085123697917, + "loss": 0.0824, + "rewards/chosen": 2.5342613220214845, + "rewards/margins": 9.609646097819011, + "rewards/rejected": -7.075384775797526, + "step": 951 + }, + { + "epoch": 0.08698035632708999, + "grad_norm": 9.4375, + "kl": 0.0, + "learning_rate": 9.82188490062415e-06, + "logits/chosen": 1210824601.6, + "logits/rejected": 432554325.3333333, + "logps/chosen": -256.5626220703125, + "logps/rejected": -445.7336832682292, + "loss": 0.0585, + "rewards/chosen": 2.8015560150146483, + "rewards/margins": 10.262216313680012, + "rewards/rejected": -7.460660298665364, + "step": 952 + }, + { + "epoch": 0.08707172224760164, + "grad_norm": 4.5, + "kl": 0.0, + "learning_rate": 9.821504357576387e-06, + "logits/chosen": 504006400.0, + "logits/rejected": 861433856.0, + "logps/chosen": -372.8973388671875, + "logps/rejected": -731.0354614257812, + "loss": 0.0269, + "rewards/chosen": 3.0239338874816895, + "rewards/margins": 13.540745258331299, + "rewards/rejected": -10.51681137084961, + "step": 953 + }, + { + "epoch": 0.08716308816811329, + "grad_norm": 10.0625, + "kl": 0.0, + "learning_rate": 9.821123415834884e-06, + "logits/chosen": 621708544.0, + "logits/rejected": 803247104.0, + "logps/chosen": -364.41400146484375, + "logps/rejected": -711.8907063802084, + "loss": 0.1006, + "rewards/chosen": 0.7998206615447998, + "rewards/margins": 11.477847496668497, + "rewards/rejected": -10.678026835123697, + "step": 954 + }, + { + "epoch": 0.08725445408862495, + "grad_norm": 9.125, + "kl": 0.0, + "learning_rate": 9.820742075431144e-06, + "logits/chosen": 472409856.0, + "logits/rejected": 710903168.0, + "logps/chosen": -294.6214599609375, + "logps/rejected": -451.6024169921875, + "loss": 0.0649, + "rewards/chosen": 2.698500156402588, + "rewards/margins": 7.325303077697754, + "rewards/rejected": -4.626802921295166, + "step": 955 + }, + { + "epoch": 0.0873458200091366, + "grad_norm": 9.0625, + "kl": 0.0, + "learning_rate": 9.8203603363967e-06, + "logits/chosen": 581306538.6666666, + "logits/rejected": 713085542.4, + "logps/chosen": -380.6145833333333, + "logps/rejected": -364.1906982421875, + "loss": 0.0544, + "rewards/chosen": 2.712474505106608, + "rewards/margins": 8.11810614267985, + "rewards/rejected": -5.405631637573242, + "step": 956 + }, + { + "epoch": 0.08743718592964825, + "grad_norm": 4.625, + "kl": 0.0, + "learning_rate": 9.819978198763119e-06, + "logits/chosen": 448270848.0, + "logits/rejected": 385702016.0, + "logps/chosen": -276.3990478515625, + "logps/rejected": -427.84375, + "loss": 0.033, + "rewards/chosen": 2.4306971232096353, + "rewards/margins": 9.920298258463541, + "rewards/rejected": -7.489601135253906, + "step": 957 + }, + { + "epoch": 0.0875285518501599, + "grad_norm": 10.3125, + "kl": 0.0, + "learning_rate": 9.819595662561997e-06, + "logits/chosen": 845259434.6666666, + "logits/rejected": 629669056.0, + "logps/chosen": -464.0011800130208, + "logps/rejected": -369.71795654296875, + "loss": 0.06, + "rewards/chosen": 2.9652156829833984, + "rewards/margins": 7.140307426452637, + "rewards/rejected": -4.175091743469238, + "step": 958 + }, + { + "epoch": 0.08761991777067155, + "grad_norm": 7.40625, + "kl": 0.0, + "learning_rate": 9.819212727824968e-06, + "logits/chosen": 418595968.0, + "logits/rejected": 460557888.0, + "logps/chosen": -261.38079833984375, + "logps/rejected": -359.8656005859375, + "loss": 0.0573, + "rewards/chosen": 2.3512744903564453, + "rewards/margins": 8.319645404815674, + "rewards/rejected": -5.9683709144592285, + "step": 959 + }, + { + "epoch": 0.0877112836911832, + "grad_norm": 18.625, + "kl": 0.0, + "learning_rate": 9.818829394583698e-06, + "logits/chosen": 821606912.0, + "logits/rejected": 697772032.0, + "logps/chosen": -308.5888977050781, + "logps/rejected": -561.0836181640625, + "loss": 0.1014, + "rewards/chosen": 2.2346367835998535, + "rewards/margins": 7.772556304931641, + "rewards/rejected": -5.537919521331787, + "step": 960 + }, + { + "epoch": 0.08780264961169484, + "grad_norm": 10.3125, + "kl": 0.0, + "learning_rate": 9.818445662869884e-06, + "logits/chosen": 1106059264.0, + "logits/rejected": 1070864384.0, + "logps/chosen": -494.5952453613281, + "logps/rejected": -772.2515258789062, + "loss": 0.0414, + "rewards/chosen": 3.0108461380004883, + "rewards/margins": 12.730438232421875, + "rewards/rejected": -9.719592094421387, + "step": 961 + }, + { + "epoch": 0.0878940155322065, + "grad_norm": 5.6875, + "kl": 0.0, + "learning_rate": 9.818061532715258e-06, + "logits/chosen": 542018150.4, + "logits/rejected": 545329194.6666666, + "logps/chosen": -432.0365234375, + "logps/rejected": -513.3639322916666, + "loss": 0.0339, + "rewards/chosen": 3.0000354766845705, + "rewards/margins": 10.729648462931316, + "rewards/rejected": -7.729612986246745, + "step": 962 + }, + { + "epoch": 0.08798538145271814, + "grad_norm": 6.9375, + "kl": 0.0, + "learning_rate": 9.817677004151584e-06, + "logits/chosen": 513323616.0, + "logits/rejected": 374952320.0, + "logps/chosen": -326.47259521484375, + "logps/rejected": -388.33953857421875, + "loss": 0.1045, + "rewards/chosen": 3.078829765319824, + "rewards/margins": 8.691162586212158, + "rewards/rejected": -5.612332820892334, + "step": 963 + }, + { + "epoch": 0.08807674737322979, + "grad_norm": 4.09375, + "kl": 0.0, + "learning_rate": 9.817292077210658e-06, + "logits/chosen": 460897433.6, + "logits/rejected": 327753600.0, + "logps/chosen": -327.113525390625, + "logps/rejected": -428.1464029947917, + "loss": 0.0296, + "rewards/chosen": 3.462393951416016, + "rewards/margins": 7.799558512369792, + "rewards/rejected": -4.337164560953776, + "step": 964 + }, + { + "epoch": 0.08816811329374144, + "grad_norm": 5.21875, + "kl": 0.0, + "learning_rate": 9.81690675192431e-06, + "logits/chosen": 805095936.0, + "logits/rejected": 1134458794.6666667, + "logps/chosen": -314.7527587890625, + "logps/rejected": -912.1321614583334, + "loss": 0.0352, + "rewards/chosen": 3.1854480743408202, + "rewards/margins": 14.223148727416993, + "rewards/rejected": -11.037700653076172, + "step": 965 + }, + { + "epoch": 0.08825947921425309, + "grad_norm": 5.53125, + "kl": 0.0, + "learning_rate": 9.816521028324405e-06, + "logits/chosen": 252808000.0, + "logits/rejected": 518105429.3333333, + "logps/chosen": -259.2652282714844, + "logps/rejected": -614.4250895182291, + "loss": 0.0186, + "rewards/chosen": 5.011127471923828, + "rewards/margins": 13.214921315511068, + "rewards/rejected": -8.20379384358724, + "step": 966 + }, + { + "epoch": 0.08835084513476474, + "grad_norm": 8.8125, + "kl": 0.0, + "learning_rate": 9.816134906442836e-06, + "logits/chosen": 381158400.0, + "logits/rejected": 518919168.0, + "logps/chosen": -208.007568359375, + "logps/rejected": -469.8013671875, + "loss": 0.0983, + "rewards/chosen": 3.5690600077311196, + "rewards/margins": 9.435898844401041, + "rewards/rejected": -5.866838836669922, + "step": 967 + }, + { + "epoch": 0.08844221105527639, + "grad_norm": 2.34375, + "kl": 0.0, + "learning_rate": 9.815748386311532e-06, + "logits/chosen": 659951040.0, + "logits/rejected": 674769334.8571428, + "logps/chosen": -324.9962463378906, + "logps/rejected": -524.262451171875, + "loss": 0.0087, + "rewards/chosen": 2.82794189453125, + "rewards/margins": 11.535425458635602, + "rewards/rejected": -8.707483564104352, + "step": 968 + }, + { + "epoch": 0.08853357697578804, + "grad_norm": 3.734375, + "kl": 0.0, + "learning_rate": 9.815361467962457e-06, + "logits/chosen": 540419456.0, + "logits/rejected": 1036917248.0, + "logps/chosen": -137.89300537109375, + "logps/rejected": -685.425048828125, + "loss": 0.0267, + "rewards/chosen": 3.556917190551758, + "rewards/margins": 14.308121681213379, + "rewards/rejected": -10.751204490661621, + "step": 969 + }, + { + "epoch": 0.08862494289629969, + "grad_norm": 11.25, + "kl": 0.0, + "learning_rate": 9.814974151427603e-06, + "logits/chosen": 352825856.0, + "logits/rejected": 493725781.3333333, + "logps/chosen": -250.032373046875, + "logps/rejected": -377.7568359375, + "loss": 0.0378, + "rewards/chosen": 3.127718925476074, + "rewards/margins": 10.4339017868042, + "rewards/rejected": -7.306182861328125, + "step": 970 + }, + { + "epoch": 0.08871630881681133, + "grad_norm": 2.15625, + "kl": 0.0, + "learning_rate": 9.814586436738998e-06, + "logits/chosen": 370481237.3333333, + "logits/rejected": 488206131.2, + "logps/chosen": -280.6895751953125, + "logps/rejected": -510.31318359375, + "loss": 0.0124, + "rewards/chosen": 3.597771326700846, + "rewards/margins": 10.059508387247721, + "rewards/rejected": -6.461737060546875, + "step": 971 + }, + { + "epoch": 0.08880767473732298, + "grad_norm": 5.4375, + "kl": 0.0, + "learning_rate": 9.814198323928704e-06, + "logits/chosen": 746355370.6666666, + "logits/rejected": 499166208.0, + "logps/chosen": -530.791259765625, + "logps/rejected": -546.302294921875, + "loss": 0.0258, + "rewards/chosen": 2.825119972229004, + "rewards/margins": 9.955374336242675, + "rewards/rejected": -7.1302543640136715, + "step": 972 + }, + { + "epoch": 0.08889904065783463, + "grad_norm": 18.0, + "kl": 0.0, + "learning_rate": 9.81380981302881e-06, + "logits/chosen": 776897945.6, + "logits/rejected": 568215637.3333334, + "logps/chosen": -251.93603515625, + "logps/rejected": -542.1894938151041, + "loss": 0.1187, + "rewards/chosen": 2.050113296508789, + "rewards/margins": 10.14936408996582, + "rewards/rejected": -8.099250793457031, + "step": 973 + }, + { + "epoch": 0.08899040657834628, + "grad_norm": 13.4375, + "kl": 0.0, + "learning_rate": 9.81342090407145e-06, + "logits/chosen": 639907328.0, + "logits/rejected": 503174560.0, + "logps/chosen": -393.87713623046875, + "logps/rejected": -594.1035766601562, + "loss": 0.0625, + "rewards/chosen": 2.2150394916534424, + "rewards/margins": 11.85882306098938, + "rewards/rejected": -9.643783569335938, + "step": 974 + }, + { + "epoch": 0.08908177249885793, + "grad_norm": 14.375, + "kl": 0.0, + "learning_rate": 9.813031597088775e-06, + "logits/chosen": 824645888.0, + "logits/rejected": 618839844.5714285, + "logps/chosen": -639.3727416992188, + "logps/rejected": -439.2377232142857, + "loss": 0.1289, + "rewards/chosen": 2.318145751953125, + "rewards/margins": 7.997053963797433, + "rewards/rejected": -5.678908211844308, + "step": 975 + }, + { + "epoch": 0.08917313841936958, + "grad_norm": 8.6875, + "kl": 0.0, + "learning_rate": 9.812641892112981e-06, + "logits/chosen": 539326156.8, + "logits/rejected": 385278378.6666667, + "logps/chosen": -360.050634765625, + "logps/rejected": -361.1805013020833, + "loss": 0.0453, + "rewards/chosen": 3.1683326721191407, + "rewards/margins": 8.0182341893514, + "rewards/rejected": -4.849901517232259, + "step": 976 + }, + { + "epoch": 0.08926450433988123, + "grad_norm": 5.75, + "kl": 0.0, + "learning_rate": 9.812251789176294e-06, + "logits/chosen": 350383948.8, + "logits/rejected": 524439594.6666667, + "logps/chosen": -220.4900146484375, + "logps/rejected": -572.5970052083334, + "loss": 0.0356, + "rewards/chosen": 3.331646728515625, + "rewards/margins": 11.485994466145833, + "rewards/rejected": -8.154347737630209, + "step": 977 + }, + { + "epoch": 0.08935587026039288, + "grad_norm": 16.75, + "kl": 0.0, + "learning_rate": 9.81186128831097e-06, + "logits/chosen": 812707328.0, + "logits/rejected": 370110912.0, + "logps/chosen": -467.534423828125, + "logps/rejected": -401.8216857910156, + "loss": 0.0952, + "rewards/chosen": 2.7922167096819197, + "rewards/margins": 7.080104282924108, + "rewards/rejected": -4.2878875732421875, + "step": 978 + }, + { + "epoch": 0.08944723618090453, + "grad_norm": 9.375, + "kl": 0.0, + "learning_rate": 9.811470389549301e-06, + "logits/chosen": 496692821.3333333, + "logits/rejected": 509541120.0, + "logps/chosen": -294.057861328125, + "logps/rejected": -463.48162841796875, + "loss": 0.0575, + "rewards/chosen": 3.231351852416992, + "rewards/margins": 12.170671463012695, + "rewards/rejected": -8.939319610595703, + "step": 979 + }, + { + "epoch": 0.08953860210141618, + "grad_norm": 7.15625, + "kl": 0.0, + "learning_rate": 9.811079092923608e-06, + "logits/chosen": 871728512.0, + "logits/rejected": 533780096.0, + "logps/chosen": -256.1539001464844, + "logps/rejected": -323.455078125, + "loss": 0.0529, + "rewards/chosen": 2.6126511096954346, + "rewards/margins": 8.293200731277466, + "rewards/rejected": -5.680549621582031, + "step": 980 + }, + { + "epoch": 0.08962996802192782, + "grad_norm": 8.75, + "kl": 0.0, + "learning_rate": 9.810687398466252e-06, + "logits/chosen": 487810252.8, + "logits/rejected": 497237504.0, + "logps/chosen": -361.7346923828125, + "logps/rejected": -494.3986002604167, + "loss": 0.0727, + "rewards/chosen": 2.7836278915405273, + "rewards/margins": 9.168128395080567, + "rewards/rejected": -6.384500503540039, + "step": 981 + }, + { + "epoch": 0.08972133394243947, + "grad_norm": 10.75, + "kl": 0.0, + "learning_rate": 9.810295306209618e-06, + "logits/chosen": 492014336.0, + "logits/rejected": 535521472.0, + "logps/chosen": -257.989501953125, + "logps/rejected": -495.37628173828125, + "loss": 0.0794, + "rewards/chosen": 2.26247501373291, + "rewards/margins": 8.632720947265625, + "rewards/rejected": -6.370245933532715, + "step": 982 + }, + { + "epoch": 0.08981269986295112, + "grad_norm": 11.6875, + "kl": 0.0, + "learning_rate": 9.809902816186133e-06, + "logits/chosen": 737922218.6666666, + "logits/rejected": 529229414.4, + "logps/chosen": -616.7097981770834, + "logps/rejected": -584.05234375, + "loss": 0.0409, + "rewards/chosen": 2.6874237060546875, + "rewards/margins": 10.03347396850586, + "rewards/rejected": -7.346050262451172, + "step": 983 + }, + { + "epoch": 0.08990406578346277, + "grad_norm": 7.25, + "kl": 0.0, + "learning_rate": 9.809509928428249e-06, + "logits/chosen": 596428928.0, + "logits/rejected": 287673664.0, + "logps/chosen": -282.42193603515625, + "logps/rejected": -334.20318603515625, + "loss": 0.0572, + "rewards/chosen": 2.3327033519744873, + "rewards/margins": 8.658906698226929, + "rewards/rejected": -6.326203346252441, + "step": 984 + }, + { + "epoch": 0.08999543170397442, + "grad_norm": 10.5625, + "kl": 0.0, + "learning_rate": 9.809116642968454e-06, + "logits/chosen": 573930410.6666666, + "logits/rejected": 425540761.6, + "logps/chosen": -245.77030436197916, + "logps/rejected": -434.8931640625, + "loss": 0.0375, + "rewards/chosen": 4.500191688537598, + "rewards/margins": 9.789532279968261, + "rewards/rejected": -5.289340591430664, + "step": 985 + }, + { + "epoch": 0.09008679762448607, + "grad_norm": 8.875, + "kl": 0.0, + "learning_rate": 9.80872295983927e-06, + "logits/chosen": 724866048.0, + "logits/rejected": 505819221.3333333, + "logps/chosen": -408.670654296875, + "logps/rejected": -340.4097086588542, + "loss": 0.0439, + "rewards/chosen": 2.795948600769043, + "rewards/margins": 10.451507504781087, + "rewards/rejected": -7.655558904012044, + "step": 986 + }, + { + "epoch": 0.09017816354499772, + "grad_norm": 6.375, + "kl": 0.0, + "learning_rate": 9.808328879073251e-06, + "logits/chosen": 442847436.8, + "logits/rejected": 544565845.3333334, + "logps/chosen": -292.03359375, + "logps/rejected": -788.9874674479166, + "loss": 0.0495, + "rewards/chosen": 2.6420740127563476, + "rewards/margins": 11.607442919413248, + "rewards/rejected": -8.9653689066569, + "step": 987 + }, + { + "epoch": 0.09026952946550937, + "grad_norm": 6.53125, + "kl": 0.0, + "learning_rate": 9.807934400702986e-06, + "logits/chosen": 679753514.6666666, + "logits/rejected": 528307814.4, + "logps/chosen": -318.3458658854167, + "logps/rejected": -386.11396484375, + "loss": 0.0706, + "rewards/chosen": 2.0949522654215493, + "rewards/margins": 8.597545496622722, + "rewards/rejected": -6.502593231201172, + "step": 988 + }, + { + "epoch": 0.09036089538602102, + "grad_norm": 9.25, + "kl": 0.0, + "learning_rate": 9.80753952476109e-06, + "logits/chosen": 520801365.3333333, + "logits/rejected": 346853913.6, + "logps/chosen": -313.2458902994792, + "logps/rejected": -485.058984375, + "loss": 0.1308, + "rewards/chosen": 1.7125658988952637, + "rewards/margins": 9.339527416229249, + "rewards/rejected": -7.626961517333984, + "step": 989 + }, + { + "epoch": 0.09045226130653267, + "grad_norm": 5.625, + "kl": 0.0, + "learning_rate": 9.80714425128022e-06, + "logits/chosen": 634213162.6666666, + "logits/rejected": 735693440.0, + "logps/chosen": -259.06789143880206, + "logps/rejected": -485.26226806640625, + "loss": 0.0467, + "rewards/chosen": 2.8079331715901694, + "rewards/margins": 10.823006947835287, + "rewards/rejected": -8.015073776245117, + "step": 990 + }, + { + "epoch": 0.09054362722704432, + "grad_norm": 2.15625, + "kl": 0.0, + "learning_rate": 9.80674858029306e-06, + "logits/chosen": 415611989.3333333, + "logits/rejected": 546060646.4, + "logps/chosen": -257.607421875, + "logps/rejected": -386.4604248046875, + "loss": 0.0117, + "rewards/chosen": 3.7516282399495444, + "rewards/margins": 11.803288396199545, + "rewards/rejected": -8.05166015625, + "step": 991 + }, + { + "epoch": 0.09063499314755596, + "grad_norm": 10.125, + "kl": 0.0, + "learning_rate": 9.806352511832327e-06, + "logits/chosen": 525096128.0, + "logits/rejected": 844123456.0, + "logps/chosen": -259.49468994140625, + "logps/rejected": -451.0899658203125, + "loss": 0.068, + "rewards/chosen": 2.6539433002471924, + "rewards/margins": 8.060773611068726, + "rewards/rejected": -5.406830310821533, + "step": 992 + }, + { + "epoch": 0.09072635906806761, + "grad_norm": 6.34375, + "kl": 0.0, + "learning_rate": 9.805956045930772e-06, + "logits/chosen": 616419072.0, + "logits/rejected": 477812224.0, + "logps/chosen": -302.0861511230469, + "logps/rejected": -411.3113708496094, + "loss": 0.0362, + "rewards/chosen": 2.8109946250915527, + "rewards/margins": 9.025721073150635, + "rewards/rejected": -6.214726448059082, + "step": 993 + }, + { + "epoch": 0.09081772498857926, + "grad_norm": 13.6875, + "kl": 0.0, + "learning_rate": 9.805559182621182e-06, + "logits/chosen": 411614880.0, + "logits/rejected": 212327440.0, + "logps/chosen": -287.00677490234375, + "logps/rejected": -312.18365478515625, + "loss": 0.0569, + "rewards/chosen": 2.961996555328369, + "rewards/margins": 7.299153804779053, + "rewards/rejected": -4.337157249450684, + "step": 994 + }, + { + "epoch": 0.09090909090909091, + "grad_norm": 10.375, + "kl": 0.0, + "learning_rate": 9.80516192193637e-06, + "logits/chosen": 521206101.3333333, + "logits/rejected": 481251225.6, + "logps/chosen": -343.351806640625, + "logps/rejected": -389.268896484375, + "loss": 0.0668, + "rewards/chosen": 2.615203857421875, + "rewards/margins": 7.583739471435547, + "rewards/rejected": -4.968535614013672, + "step": 995 + }, + { + "epoch": 0.09100045682960256, + "grad_norm": 3.4375, + "kl": 0.0, + "learning_rate": 9.80476426390919e-06, + "logits/chosen": 516541984.0, + "logits/rejected": 377679530.6666667, + "logps/chosen": -310.379150390625, + "logps/rejected": -540.8168131510416, + "loss": 0.0175, + "rewards/chosen": 2.8372178077697754, + "rewards/margins": 9.717592716217041, + "rewards/rejected": -6.880374908447266, + "step": 996 + }, + { + "epoch": 0.09109182275011421, + "grad_norm": 13.9375, + "kl": 0.0, + "learning_rate": 9.804366208572523e-06, + "logits/chosen": 802879283.2, + "logits/rejected": 455638442.6666667, + "logps/chosen": -366.908984375, + "logps/rejected": -425.4545084635417, + "loss": 0.0922, + "rewards/chosen": 2.2944931030273437, + "rewards/margins": 11.946154022216797, + "rewards/rejected": -9.651660919189453, + "step": 997 + }, + { + "epoch": 0.09118318867062586, + "grad_norm": 15.1875, + "kl": 0.0, + "learning_rate": 9.80396775595928e-06, + "logits/chosen": 379334604.8, + "logits/rejected": 364723797.3333333, + "logps/chosen": -339.10224609375, + "logps/rejected": -349.997802734375, + "loss": 0.1085, + "rewards/chosen": 2.23803596496582, + "rewards/margins": 8.161874262491862, + "rewards/rejected": -5.923838297526042, + "step": 998 + }, + { + "epoch": 0.09127455459113751, + "grad_norm": 4.03125, + "kl": 0.0, + "learning_rate": 9.803568906102416e-06, + "logits/chosen": 536583552.0, + "logits/rejected": 773078528.0, + "logps/chosen": -306.5232238769531, + "logps/rejected": -592.2530517578125, + "loss": 0.02, + "rewards/chosen": 3.9582886695861816, + "rewards/margins": 9.483181953430176, + "rewards/rejected": -5.524893283843994, + "step": 999 + }, + { + "epoch": 0.09136592051164916, + "grad_norm": 13.25, + "kl": 0.0, + "learning_rate": 9.80316965903491e-06, + "logits/chosen": 1792244224.0, + "logits/rejected": 764180565.3333334, + "logps/chosen": -150.77003479003906, + "logps/rejected": -666.1781412760416, + "loss": 0.079, + "rewards/chosen": 2.5108957290649414, + "rewards/margins": 8.564685503641766, + "rewards/rejected": -6.053789774576823, + "step": 1000 + }, + { + "epoch": 0.0914572864321608, + "grad_norm": 16.875, + "kl": 0.0, + "learning_rate": 9.802770014789775e-06, + "logits/chosen": 685415168.0, + "logits/rejected": 996995993.6, + "logps/chosen": -295.9672037760417, + "logps/rejected": -459.32685546875, + "loss": 0.0913, + "rewards/chosen": 1.7613681157430012, + "rewards/margins": 8.216124280293783, + "rewards/rejected": -6.454756164550782, + "step": 1001 + }, + { + "epoch": 0.09154865235267245, + "grad_norm": 18.25, + "kl": 0.0, + "learning_rate": 9.80236997340006e-06, + "logits/chosen": 647523174.4, + "logits/rejected": 698358357.3333334, + "logps/chosen": -405.0458740234375, + "logps/rejected": -541.9109700520834, + "loss": 0.0887, + "rewards/chosen": 2.739790344238281, + "rewards/margins": 8.034653091430664, + "rewards/rejected": -5.294862747192383, + "step": 1002 + }, + { + "epoch": 0.0916400182731841, + "grad_norm": 2.15625, + "kl": 0.0, + "learning_rate": 9.80196953489884e-06, + "logits/chosen": 798592000.0, + "logits/rejected": 382987922.28571427, + "logps/chosen": -388.5340576171875, + "logps/rejected": -411.5664760044643, + "loss": 0.0104, + "rewards/chosen": 3.3995301723480225, + "rewards/margins": 10.27019122668675, + "rewards/rejected": -6.870661054338727, + "step": 1003 + }, + { + "epoch": 0.09173138419369575, + "grad_norm": 6.15625, + "kl": 0.0, + "learning_rate": 9.801568699319233e-06, + "logits/chosen": 480013397.3333333, + "logits/rejected": 625569894.4, + "logps/chosen": -312.38885498046875, + "logps/rejected": -687.737890625, + "loss": 0.0227, + "rewards/chosen": 3.373650868733724, + "rewards/margins": 10.707158406575521, + "rewards/rejected": -7.333507537841797, + "step": 1004 + }, + { + "epoch": 0.0918227501142074, + "grad_norm": 4.09375, + "kl": 0.0, + "learning_rate": 9.801167466694381e-06, + "logits/chosen": 468981657.6, + "logits/rejected": 432435072.0, + "logps/chosen": -239.061669921875, + "logps/rejected": -479.9564615885417, + "loss": 0.0307, + "rewards/chosen": 3.320679473876953, + "rewards/margins": 10.791758346557618, + "rewards/rejected": -7.471078872680664, + "step": 1005 + }, + { + "epoch": 0.09191411603471905, + "grad_norm": 5.625, + "kl": 0.0, + "learning_rate": 9.800765837057464e-06, + "logits/chosen": 876677760.0, + "logits/rejected": 629746432.0, + "logps/chosen": -720.7627563476562, + "logps/rejected": -537.8566284179688, + "loss": 0.0287, + "rewards/chosen": 2.9586119651794434, + "rewards/margins": 10.27159833908081, + "rewards/rejected": -7.312986373901367, + "step": 1006 + }, + { + "epoch": 0.0920054819552307, + "grad_norm": 2.5625, + "kl": 0.0, + "learning_rate": 9.800363810441692e-06, + "logits/chosen": 452416832.0, + "logits/rejected": 395409280.0, + "logps/chosen": -227.63787841796875, + "logps/rejected": -417.6497802734375, + "loss": 0.0186, + "rewards/chosen": 3.5754947662353516, + "rewards/margins": 12.541034698486328, + "rewards/rejected": -8.965539932250977, + "step": 1007 + }, + { + "epoch": 0.09209684787574235, + "grad_norm": 5.75, + "kl": 0.0, + "learning_rate": 9.79996138688031e-06, + "logits/chosen": 1012274432.0, + "logits/rejected": 588035456.0, + "logps/chosen": -156.5745849609375, + "logps/rejected": -456.7948404947917, + "loss": 0.0343, + "rewards/chosen": 2.650557041168213, + "rewards/margins": 7.878798643747966, + "rewards/rejected": -5.228241602579753, + "step": 1008 + }, + { + "epoch": 0.092188213796254, + "grad_norm": 14.1875, + "kl": 0.0, + "learning_rate": 9.799558566406594e-06, + "logits/chosen": 494976682.6666667, + "logits/rejected": 774165606.4, + "logps/chosen": -315.1125895182292, + "logps/rejected": -323.964404296875, + "loss": 0.067, + "rewards/chosen": 1.6970938046773274, + "rewards/margins": 8.416818841298422, + "rewards/rejected": -6.719725036621094, + "step": 1009 + }, + { + "epoch": 0.09227957971676565, + "grad_norm": 7.21875, + "kl": 0.0, + "learning_rate": 9.79915534905385e-06, + "logits/chosen": 555918720.0, + "logits/rejected": 594415488.0, + "logps/chosen": -363.8469543457031, + "logps/rejected": -751.0362548828125, + "loss": 0.0379, + "rewards/chosen": 2.8010525703430176, + "rewards/margins": 13.224441051483154, + "rewards/rejected": -10.423388481140137, + "step": 1010 + }, + { + "epoch": 0.0923709456372773, + "grad_norm": 3.140625, + "kl": 0.0, + "learning_rate": 9.798751734855427e-06, + "logits/chosen": 400202197.3333333, + "logits/rejected": 500941926.4, + "logps/chosen": -239.75850423177084, + "logps/rejected": -545.5345703125, + "loss": 0.0193, + "rewards/chosen": 3.1129515965779624, + "rewards/margins": 12.288216336568198, + "rewards/rejected": -9.175264739990235, + "step": 1011 + }, + { + "epoch": 0.09246231155778895, + "grad_norm": 6.75, + "kl": 0.0, + "learning_rate": 9.798347723844696e-06, + "logits/chosen": 524703402.6666667, + "logits/rejected": 360190924.8, + "logps/chosen": -441.7627360026042, + "logps/rejected": -430.440771484375, + "loss": 0.0475, + "rewards/chosen": 2.659642537434896, + "rewards/margins": 9.205820210774739, + "rewards/rejected": -6.546177673339844, + "step": 1012 + }, + { + "epoch": 0.0925536774783006, + "grad_norm": 5.53125, + "kl": 0.0, + "learning_rate": 9.797943316055068e-06, + "logits/chosen": 862777258.6666666, + "logits/rejected": 666606643.2, + "logps/chosen": -370.83251953125, + "logps/rejected": -449.36044921875, + "loss": 0.0935, + "rewards/chosen": 4.16660722096761, + "rewards/margins": 12.28284715016683, + "rewards/rejected": -8.116239929199219, + "step": 1013 + }, + { + "epoch": 0.09264504339881224, + "grad_norm": 3.421875, + "kl": 0.0, + "learning_rate": 9.79753851151998e-06, + "logits/chosen": 377970560.0, + "logits/rejected": 327436096.0, + "logps/chosen": -192.42218017578125, + "logps/rejected": -305.93227132161456, + "loss": 0.0289, + "rewards/chosen": 3.7388824462890624, + "rewards/margins": 8.98310604095459, + "rewards/rejected": -5.244223594665527, + "step": 1014 + }, + { + "epoch": 0.09273640931932389, + "grad_norm": 9.5625, + "kl": 0.0, + "learning_rate": 9.797133310272905e-06, + "logits/chosen": 851270451.2, + "logits/rejected": 814160725.3333334, + "logps/chosen": -358.572119140625, + "logps/rejected": -767.392578125, + "loss": 0.0513, + "rewards/chosen": 2.7183582305908205, + "rewards/margins": 10.334246190388997, + "rewards/rejected": -7.615887959798177, + "step": 1015 + }, + { + "epoch": 0.09282777523983554, + "grad_norm": 7.90625, + "kl": 0.0, + "learning_rate": 9.796727712347356e-06, + "logits/chosen": 763508800.0, + "logits/rejected": 770996992.0, + "logps/chosen": -169.92300415039062, + "logps/rejected": -497.56005859375, + "loss": 0.0653, + "rewards/chosen": 3.2113938331604004, + "rewards/margins": 8.793895562489826, + "rewards/rejected": -5.582501729329427, + "step": 1016 + }, + { + "epoch": 0.09291914116034719, + "grad_norm": 13.625, + "kl": 0.0, + "learning_rate": 9.796321717776865e-06, + "logits/chosen": 773333577.1428572, + "logits/rejected": 2782709760.0, + "logps/chosen": -335.33705357142856, + "logps/rejected": -854.4779052734375, + "loss": 0.1278, + "rewards/chosen": 2.240475518362863, + "rewards/margins": 9.110360963003977, + "rewards/rejected": -6.869885444641113, + "step": 1017 + }, + { + "epoch": 0.09301050708085884, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 9.795915326595008e-06, + "logits/chosen": 509618880.0, + "logits/rejected": 586616704.0, + "logps/chosen": -336.06146240234375, + "logps/rejected": -396.1065979003906, + "loss": 0.064, + "rewards/chosen": 2.6757652759552, + "rewards/margins": 8.510192632675171, + "rewards/rejected": -5.834427356719971, + "step": 1018 + }, + { + "epoch": 0.09310187300137049, + "grad_norm": 2.328125, + "kl": 0.0, + "learning_rate": 9.795508538835388e-06, + "logits/chosen": 631667541.3333334, + "logits/rejected": 300986726.4, + "logps/chosen": -370.0546875, + "logps/rejected": -366.0822998046875, + "loss": 0.0141, + "rewards/chosen": 3.4747610092163086, + "rewards/margins": 9.352910804748536, + "rewards/rejected": -5.878149795532226, + "step": 1019 + }, + { + "epoch": 0.09319323892188214, + "grad_norm": 16.625, + "kl": 0.0, + "learning_rate": 9.795101354531644e-06, + "logits/chosen": 363307904.0, + "logits/rejected": 1063434410.6666666, + "logps/chosen": -189.51507568359375, + "logps/rejected": -425.5734049479167, + "loss": 0.1195, + "rewards/chosen": 2.5115363597869873, + "rewards/margins": 9.101407448450725, + "rewards/rejected": -6.589871088663737, + "step": 1020 + }, + { + "epoch": 0.09328460484239379, + "grad_norm": 5.65625, + "kl": 0.0, + "learning_rate": 9.794693773717445e-06, + "logits/chosen": 481441600.0, + "logits/rejected": 511762240.0, + "logps/chosen": -510.67724609375, + "logps/rejected": -262.61871337890625, + "loss": 0.0259, + "rewards/chosen": 3.2316131591796875, + "rewards/margins": 9.895434379577637, + "rewards/rejected": -6.663821220397949, + "step": 1021 + }, + { + "epoch": 0.09337597076290544, + "grad_norm": 4.1875, + "kl": 0.0, + "learning_rate": 9.794285796426496e-06, + "logits/chosen": 464259648.0, + "logits/rejected": 481842048.0, + "logps/chosen": -187.4703826904297, + "logps/rejected": -370.58892822265625, + "loss": 0.0254, + "rewards/chosen": 3.1160404682159424, + "rewards/margins": 11.476467370986938, + "rewards/rejected": -8.360426902770996, + "step": 1022 + }, + { + "epoch": 0.09346733668341708, + "grad_norm": 14.125, + "kl": 0.0, + "learning_rate": 9.793877422692529e-06, + "logits/chosen": 377625024.0, + "logits/rejected": 699687936.0, + "logps/chosen": -253.38427734375, + "logps/rejected": -460.4332275390625, + "loss": 0.1016, + "rewards/chosen": 2.71330189704895, + "rewards/margins": 8.691189527511597, + "rewards/rejected": -5.9778876304626465, + "step": 1023 + }, + { + "epoch": 0.09355870260392873, + "grad_norm": 11.9375, + "kl": 0.0, + "learning_rate": 9.793468652549317e-06, + "logits/chosen": 943966634.6666666, + "logits/rejected": 800303040.0, + "logps/chosen": -219.762451171875, + "logps/rejected": -833.1612548828125, + "loss": 0.1044, + "rewards/chosen": 2.021135171254476, + "rewards/margins": 9.417817910512289, + "rewards/rejected": -7.3966827392578125, + "step": 1024 + }, + { + "epoch": 0.09365006852444038, + "grad_norm": 7.59375, + "kl": 0.0, + "learning_rate": 9.79305948603066e-06, + "logits/chosen": 445247456.0, + "logits/rejected": 285375360.0, + "logps/chosen": -389.9387512207031, + "logps/rejected": -377.2845458984375, + "loss": 0.036, + "rewards/chosen": 3.2138962745666504, + "rewards/margins": 11.317324161529541, + "rewards/rejected": -8.10342788696289, + "step": 1025 + }, + { + "epoch": 0.09374143444495203, + "grad_norm": 7.46875, + "kl": 0.0, + "learning_rate": 9.792649923170392e-06, + "logits/chosen": 391694131.2, + "logits/rejected": 278825493.3333333, + "logps/chosen": -155.1685546875, + "logps/rejected": -225.6165771484375, + "loss": 0.06, + "rewards/chosen": 2.842957878112793, + "rewards/margins": 8.694160525004069, + "rewards/rejected": -5.851202646891276, + "step": 1026 + }, + { + "epoch": 0.09383280036546368, + "grad_norm": 11.875, + "kl": 0.0, + "learning_rate": 9.79223996400238e-06, + "logits/chosen": 581603157.3333334, + "logits/rejected": 314703488.0, + "logps/chosen": -405.0481363932292, + "logps/rejected": -501.4722900390625, + "loss": 0.0582, + "rewards/chosen": 2.8259188334147134, + "rewards/margins": 10.340744654337565, + "rewards/rejected": -7.514825820922852, + "step": 1027 + }, + { + "epoch": 0.09392416628597533, + "grad_norm": 5.71875, + "kl": 0.0, + "learning_rate": 9.791829608560524e-06, + "logits/chosen": 253041962.66666666, + "logits/rejected": 443856076.8, + "logps/chosen": -148.38050333658853, + "logps/rejected": -448.392578125, + "loss": 0.052, + "rewards/chosen": 2.1094182332356772, + "rewards/margins": 7.7499951680501304, + "rewards/rejected": -5.640576934814453, + "step": 1028 + }, + { + "epoch": 0.09401553220648698, + "grad_norm": 10.1875, + "kl": 0.0, + "learning_rate": 9.791418856878758e-06, + "logits/chosen": 492584448.0, + "logits/rejected": 504191584.0, + "logps/chosen": -297.10902622767856, + "logps/rejected": -586.6494140625, + "loss": 0.0959, + "rewards/chosen": 2.3786604745047435, + "rewards/margins": 8.562467847551618, + "rewards/rejected": -6.183807373046875, + "step": 1029 + }, + { + "epoch": 0.09410689812699863, + "grad_norm": 3.375, + "kl": 0.0, + "learning_rate": 9.791007708991045e-06, + "logits/chosen": 338834346.6666667, + "logits/rejected": 460818432.0, + "logps/chosen": -190.76432291666666, + "logps/rejected": -440.058544921875, + "loss": 0.0238, + "rewards/chosen": 3.0748414993286133, + "rewards/margins": 11.182115364074708, + "rewards/rejected": -8.107273864746094, + "step": 1030 + }, + { + "epoch": 0.09419826404751028, + "grad_norm": 2.8125, + "kl": 0.0, + "learning_rate": 9.790596164931382e-06, + "logits/chosen": 649591082.6666666, + "logits/rejected": 444457779.2, + "logps/chosen": -329.88999430338544, + "logps/rejected": -471.10986328125, + "loss": 0.0207, + "rewards/chosen": 3.459609031677246, + "rewards/margins": 9.339754676818847, + "rewards/rejected": -5.880145645141601, + "step": 1031 + }, + { + "epoch": 0.09428962996802193, + "grad_norm": 9.75, + "kl": 0.0, + "learning_rate": 9.790184224733805e-06, + "logits/chosen": 393253162.6666667, + "logits/rejected": 504626432.0, + "logps/chosen": -214.4981892903646, + "logps/rejected": -497.37490234375, + "loss": 0.0421, + "rewards/chosen": 3.2413527170817056, + "rewards/margins": 10.669759241739909, + "rewards/rejected": -7.428406524658203, + "step": 1032 + }, + { + "epoch": 0.09438099588853358, + "grad_norm": 4.4375, + "kl": 0.0, + "learning_rate": 9.789771888432375e-06, + "logits/chosen": 331575424.0, + "logits/rejected": 428452633.6, + "logps/chosen": -221.04168701171875, + "logps/rejected": -267.745361328125, + "loss": 0.0225, + "rewards/chosen": 3.822469393412272, + "rewards/margins": 9.83705374399821, + "rewards/rejected": -6.014584350585937, + "step": 1033 + }, + { + "epoch": 0.09447236180904522, + "grad_norm": 7.125, + "kl": 0.0, + "learning_rate": 9.789359156061187e-06, + "logits/chosen": 413598668.8, + "logits/rejected": 472291797.3333333, + "logps/chosen": -205.9837890625, + "logps/rejected": -393.7604573567708, + "loss": 0.057, + "rewards/chosen": 2.5123544692993165, + "rewards/margins": 8.82568162282308, + "rewards/rejected": -6.313327153523763, + "step": 1034 + }, + { + "epoch": 0.09456372772955687, + "grad_norm": 10.8125, + "kl": 0.0, + "learning_rate": 9.788946027654372e-06, + "logits/chosen": 498888320.0, + "logits/rejected": 288056384.0, + "logps/chosen": -370.6990152994792, + "logps/rejected": -578.2369384765625, + "loss": 0.1097, + "rewards/chosen": 2.0769006411234536, + "rewards/margins": 13.95672051111857, + "rewards/rejected": -11.879819869995117, + "step": 1035 + }, + { + "epoch": 0.09465509365006852, + "grad_norm": 16.625, + "kl": 0.0, + "learning_rate": 9.78853250324609e-06, + "logits/chosen": 437602901.3333333, + "logits/rejected": 455482816.0, + "logps/chosen": -249.52799479166666, + "logps/rejected": -575.20703125, + "loss": 0.139, + "rewards/chosen": 2.1964357693990073, + "rewards/margins": 9.959494431813559, + "rewards/rejected": -7.763058662414551, + "step": 1036 + }, + { + "epoch": 0.09474645957058017, + "grad_norm": 20.5, + "kl": 0.0, + "learning_rate": 9.788118582870537e-06, + "logits/chosen": 1176605952.0, + "logits/rejected": 717333077.3333334, + "logps/chosen": -278.59356689453125, + "logps/rejected": -360.2674967447917, + "loss": 0.077, + "rewards/chosen": 3.643177032470703, + "rewards/margins": 9.016705830891926, + "rewards/rejected": -5.373528798421224, + "step": 1037 + }, + { + "epoch": 0.09483782549109182, + "grad_norm": 5.15625, + "kl": 0.0, + "learning_rate": 9.78770426656194e-06, + "logits/chosen": 557969152.0, + "logits/rejected": 528892128.0, + "logps/chosen": -227.7950439453125, + "logps/rejected": -347.504638671875, + "loss": 0.0402, + "rewards/chosen": 2.751415729522705, + "rewards/margins": 8.391717910766602, + "rewards/rejected": -5.6403021812438965, + "step": 1038 + }, + { + "epoch": 0.09492919141160347, + "grad_norm": 4.625, + "kl": 0.0, + "learning_rate": 9.787289554354558e-06, + "logits/chosen": 620930730.6666666, + "logits/rejected": 1294090112.0, + "logps/chosen": -336.4001057942708, + "logps/rejected": -445.76171875, + "loss": 0.0417, + "rewards/chosen": 2.9875208536783853, + "rewards/margins": 10.959086100260416, + "rewards/rejected": -7.971565246582031, + "step": 1039 + }, + { + "epoch": 0.09502055733211512, + "grad_norm": 5.0, + "kl": 0.0, + "learning_rate": 9.786874446282686e-06, + "logits/chosen": 669393984.0, + "logits/rejected": 999109312.0, + "logps/chosen": -402.40673828125, + "logps/rejected": -527.7041625976562, + "loss": 0.0307, + "rewards/chosen": 3.034407377243042, + "rewards/margins": 11.91992449760437, + "rewards/rejected": -8.885517120361328, + "step": 1040 + }, + { + "epoch": 0.09511192325262677, + "grad_norm": 5.71875, + "kl": 0.0, + "learning_rate": 9.78645894238065e-06, + "logits/chosen": 410763904.0, + "logits/rejected": 406275520.0, + "logps/chosen": -272.6139221191406, + "logps/rejected": -407.12396240234375, + "loss": 0.0436, + "rewards/chosen": 2.620133399963379, + "rewards/margins": 11.643416404724121, + "rewards/rejected": -9.023283004760742, + "step": 1041 + }, + { + "epoch": 0.09520328917313842, + "grad_norm": 5.46875, + "kl": 0.0, + "learning_rate": 9.786043042682805e-06, + "logits/chosen": 452088490.6666667, + "logits/rejected": 164907289.6, + "logps/chosen": -309.0600992838542, + "logps/rejected": -212.9389404296875, + "loss": 0.031, + "rewards/chosen": 2.6296475728352866, + "rewards/margins": 8.781174977620443, + "rewards/rejected": -6.151527404785156, + "step": 1042 + }, + { + "epoch": 0.09529465509365007, + "grad_norm": 6.40625, + "kl": 0.0, + "learning_rate": 9.785626747223544e-06, + "logits/chosen": 477519648.0, + "logits/rejected": 498733184.0, + "logps/chosen": -236.2457275390625, + "logps/rejected": -636.345947265625, + "loss": 0.0519, + "rewards/chosen": 2.3108654022216797, + "rewards/margins": 11.24028205871582, + "rewards/rejected": -8.92941665649414, + "step": 1043 + }, + { + "epoch": 0.09538602101416171, + "grad_norm": 13.0625, + "kl": 0.0, + "learning_rate": 9.785210056037292e-06, + "logits/chosen": 344119552.0, + "logits/rejected": 406562688.0, + "logps/chosen": -359.121337890625, + "logps/rejected": -270.59222412109375, + "loss": 0.0709, + "rewards/chosen": 3.229630947113037, + "rewards/margins": 8.333634853363037, + "rewards/rejected": -5.10400390625, + "step": 1044 + }, + { + "epoch": 0.09547738693467336, + "grad_norm": 6.4375, + "kl": 0.0, + "learning_rate": 9.784792969158505e-06, + "logits/chosen": 712995392.0, + "logits/rejected": 414879146.6666667, + "logps/chosen": -252.3393096923828, + "logps/rejected": -377.8170979817708, + "loss": 0.0308, + "rewards/chosen": 3.0510361194610596, + "rewards/margins": 8.428803046544392, + "rewards/rejected": -5.377766927083333, + "step": 1045 + }, + { + "epoch": 0.09556875285518501, + "grad_norm": 6.53125, + "kl": 0.0, + "learning_rate": 9.78437548662167e-06, + "logits/chosen": 717428565.3333334, + "logits/rejected": 583802675.2, + "logps/chosen": -390.8369954427083, + "logps/rejected": -507.474609375, + "loss": 0.0323, + "rewards/chosen": 3.30364990234375, + "rewards/margins": 9.93944854736328, + "rewards/rejected": -6.635798645019531, + "step": 1046 + }, + { + "epoch": 0.09566011877569666, + "grad_norm": 9.8125, + "kl": 0.0, + "learning_rate": 9.783957608461312e-06, + "logits/chosen": 587029504.0, + "logits/rejected": 569317184.0, + "logps/chosen": -402.51129150390625, + "logps/rejected": -252.77764892578125, + "loss": 0.0864, + "rewards/chosen": 2.755667209625244, + "rewards/margins": 7.694580078125, + "rewards/rejected": -4.938912868499756, + "step": 1047 + }, + { + "epoch": 0.09575148469620831, + "grad_norm": 10.8125, + "kl": 0.0, + "learning_rate": 9.783539334711983e-06, + "logits/chosen": 726545024.0, + "logits/rejected": 334028928.0, + "logps/chosen": -389.6051025390625, + "logps/rejected": -216.72097778320312, + "loss": 0.0667, + "rewards/chosen": 3.4719746112823486, + "rewards/margins": 6.708760738372803, + "rewards/rejected": -3.236786127090454, + "step": 1048 + }, + { + "epoch": 0.09584285061671996, + "grad_norm": 7.625, + "kl": 0.0, + "learning_rate": 9.783120665408271e-06, + "logits/chosen": 1492892672.0, + "logits/rejected": 706745753.6, + "logps/chosen": -368.7005208333333, + "logps/rejected": -459.940869140625, + "loss": 0.0515, + "rewards/chosen": 2.2701539993286133, + "rewards/margins": 7.87376766204834, + "rewards/rejected": -5.603613662719726, + "step": 1049 + }, + { + "epoch": 0.09593421653723161, + "grad_norm": 5.75, + "kl": 0.0, + "learning_rate": 9.782701600584798e-06, + "logits/chosen": 707837098.6666666, + "logits/rejected": 263673312.0, + "logps/chosen": -407.0354410807292, + "logps/rejected": -233.84451293945312, + "loss": 0.0479, + "rewards/chosen": 2.895295778910319, + "rewards/margins": 9.45852247873942, + "rewards/rejected": -6.563226699829102, + "step": 1050 + }, + { + "epoch": 0.09602558245774326, + "grad_norm": 6.9375, + "kl": 0.0, + "learning_rate": 9.782282140276214e-06, + "logits/chosen": 505210784.0, + "logits/rejected": 570431360.0, + "logps/chosen": -276.9458312988281, + "logps/rejected": -601.7736206054688, + "loss": 0.0368, + "rewards/chosen": 2.762056827545166, + "rewards/margins": 10.756713390350342, + "rewards/rejected": -7.994656562805176, + "step": 1051 + }, + { + "epoch": 0.09611694837825491, + "grad_norm": 6.25, + "kl": 0.0, + "learning_rate": 9.781862284517208e-06, + "logits/chosen": 388054400.0, + "logits/rejected": 407712716.8, + "logps/chosen": -321.83774820963544, + "logps/rejected": -644.374365234375, + "loss": 0.0312, + "rewards/chosen": 3.191751797993978, + "rewards/margins": 15.201821072896323, + "rewards/rejected": -12.010069274902344, + "step": 1052 + }, + { + "epoch": 0.09620831429876656, + "grad_norm": 7.875, + "kl": 0.0, + "learning_rate": 9.781442033342494e-06, + "logits/chosen": 491923648.0, + "logits/rejected": 691824493.7142857, + "logps/chosen": -414.96636962890625, + "logps/rejected": -514.7269810267857, + "loss": 0.0799, + "rewards/chosen": 2.3591859340667725, + "rewards/margins": 8.682397603988647, + "rewards/rejected": -6.323211669921875, + "step": 1053 + }, + { + "epoch": 0.0962996802192782, + "grad_norm": 12.1875, + "kl": 0.0, + "learning_rate": 9.781021386786824e-06, + "logits/chosen": 678381376.0, + "logits/rejected": 281482496.0, + "logps/chosen": -464.9836730957031, + "logps/rejected": -408.5146484375, + "loss": 0.062, + "rewards/chosen": 2.3538637161254883, + "rewards/margins": 9.830529689788818, + "rewards/rejected": -7.47666597366333, + "step": 1054 + }, + { + "epoch": 0.09639104613978985, + "grad_norm": 4.875, + "kl": 0.0, + "learning_rate": 9.780600344884985e-06, + "logits/chosen": 746851584.0, + "logits/rejected": 424255385.6, + "logps/chosen": -341.2726236979167, + "logps/rejected": -592.91025390625, + "loss": 0.0235, + "rewards/chosen": 2.8346691131591797, + "rewards/margins": 14.060139083862305, + "rewards/rejected": -11.225469970703125, + "step": 1055 + }, + { + "epoch": 0.0964824120603015, + "grad_norm": 9.75, + "kl": 0.0, + "learning_rate": 9.780178907671788e-06, + "logits/chosen": 911677781.3333334, + "logits/rejected": 646185344.0, + "logps/chosen": -336.443603515625, + "logps/rejected": -356.4053955078125, + "loss": 0.097, + "rewards/chosen": 2.085301717122396, + "rewards/margins": 7.437556107838949, + "rewards/rejected": -5.352254390716553, + "step": 1056 + }, + { + "epoch": 0.09657377798081315, + "grad_norm": 6.1875, + "kl": 0.0, + "learning_rate": 9.779757075182089e-06, + "logits/chosen": 644325683.2, + "logits/rejected": 405312810.6666667, + "logps/chosen": -358.2152099609375, + "logps/rejected": -442.1715494791667, + "loss": 0.0404, + "rewards/chosen": 2.8618036270141602, + "rewards/margins": 12.580080604553222, + "rewards/rejected": -9.718276977539062, + "step": 1057 + }, + { + "epoch": 0.0966651439013248, + "grad_norm": 3.9375, + "kl": 0.0, + "learning_rate": 9.779334847450761e-06, + "logits/chosen": 741510080.0, + "logits/rejected": 747308480.0, + "logps/chosen": -655.3638916015625, + "logps/rejected": -675.7437744140625, + "loss": 0.0181, + "rewards/chosen": 3.4390993118286133, + "rewards/margins": 13.578614234924316, + "rewards/rejected": -10.139514923095703, + "step": 1058 + }, + { + "epoch": 0.09675650982183645, + "grad_norm": 7.625, + "kl": 0.0, + "learning_rate": 9.778912224512725e-06, + "logits/chosen": 382741094.4, + "logits/rejected": 407139072.0, + "logps/chosen": -322.104248046875, + "logps/rejected": -624.5737711588541, + "loss": 0.0412, + "rewards/chosen": 3.034686279296875, + "rewards/margins": 14.66611531575521, + "rewards/rejected": -11.631429036458334, + "step": 1059 + }, + { + "epoch": 0.0968478757423481, + "grad_norm": 8.6875, + "kl": 0.0, + "learning_rate": 9.778489206402923e-06, + "logits/chosen": 717163178.6666666, + "logits/rejected": 251215072.0, + "logps/chosen": -389.06640625, + "logps/rejected": -169.20213317871094, + "loss": 0.0675, + "rewards/chosen": 3.5909992853800454, + "rewards/margins": 8.7346560160319, + "rewards/rejected": -5.1436567306518555, + "step": 1060 + }, + { + "epoch": 0.09693924166285975, + "grad_norm": 5.21875, + "kl": 0.0, + "learning_rate": 9.778065793156339e-06, + "logits/chosen": 425944362.6666667, + "logits/rejected": 284943776.0, + "logps/chosen": -347.396728515625, + "logps/rejected": -504.7676696777344, + "loss": 0.0228, + "rewards/chosen": 4.255521456400554, + "rewards/margins": 12.389482180277508, + "rewards/rejected": -8.133960723876953, + "step": 1061 + }, + { + "epoch": 0.0970306075833714, + "grad_norm": 3.703125, + "kl": 0.0, + "learning_rate": 9.777641984807986e-06, + "logits/chosen": 523137984.0, + "logits/rejected": 531635584.0, + "logps/chosen": -369.5172119140625, + "logps/rejected": -383.6942138671875, + "loss": 0.0219, + "rewards/chosen": 3.3264386653900146, + "rewards/margins": 9.832672834396362, + "rewards/rejected": -6.506234169006348, + "step": 1062 + }, + { + "epoch": 0.09712197350388305, + "grad_norm": 11.375, + "kl": 0.0, + "learning_rate": 9.777217781392903e-06, + "logits/chosen": 472955562.6666667, + "logits/rejected": 292018432.0, + "logps/chosen": -411.3144124348958, + "logps/rejected": -494.35931396484375, + "loss": 0.0615, + "rewards/chosen": 2.5862936973571777, + "rewards/margins": 13.208731174468994, + "rewards/rejected": -10.622437477111816, + "step": 1063 + }, + { + "epoch": 0.0972133394243947, + "grad_norm": 16.0, + "kl": 0.0, + "learning_rate": 9.776793182946174e-06, + "logits/chosen": 826948544.0, + "logits/rejected": 332605632.0, + "logps/chosen": -325.06915283203125, + "logps/rejected": -433.3110656738281, + "loss": 0.1001, + "rewards/chosen": 1.6957279443740845, + "rewards/margins": 9.618894219398499, + "rewards/rejected": -7.923166275024414, + "step": 1064 + }, + { + "epoch": 0.09730470534490634, + "grad_norm": 3.359375, + "kl": 0.0, + "learning_rate": 9.776368189502904e-06, + "logits/chosen": 706626432.0, + "logits/rejected": 362038656.0, + "logps/chosen": -492.0206604003906, + "logps/rejected": -559.1505737304688, + "loss": 0.0159, + "rewards/chosen": 3.658606767654419, + "rewards/margins": 10.458070039749146, + "rewards/rejected": -6.799463272094727, + "step": 1065 + }, + { + "epoch": 0.097396071265418, + "grad_norm": 22.875, + "kl": 0.0, + "learning_rate": 9.775942801098241e-06, + "logits/chosen": 996192841.1428572, + "logits/rejected": 627976512.0, + "logps/chosen": -527.7154366629464, + "logps/rejected": -295.54364013671875, + "loss": 0.1583, + "rewards/chosen": 2.0551222392490933, + "rewards/margins": 9.456388814108713, + "rewards/rejected": -7.401266574859619, + "step": 1066 + }, + { + "epoch": 0.09748743718592964, + "grad_norm": 17.125, + "kl": 0.0, + "learning_rate": 9.775517017767358e-06, + "logits/chosen": 501386432.0, + "logits/rejected": 964325824.0, + "logps/chosen": -261.6441650390625, + "logps/rejected": -408.0848693847656, + "loss": 0.1427, + "rewards/chosen": 1.8621037006378174, + "rewards/margins": 8.072133302688599, + "rewards/rejected": -6.210029602050781, + "step": 1067 + }, + { + "epoch": 0.09757880310644129, + "grad_norm": 21.125, + "kl": 0.0, + "learning_rate": 9.775090839545464e-06, + "logits/chosen": 737161792.0, + "logits/rejected": 506752512.0, + "logps/chosen": -337.6142578125, + "logps/rejected": -313.8699951171875, + "loss": 0.1778, + "rewards/chosen": 1.526822566986084, + "rewards/margins": 4.235302448272705, + "rewards/rejected": -2.708479881286621, + "step": 1068 + }, + { + "epoch": 0.09767016902695294, + "grad_norm": 20.0, + "kl": 0.0, + "learning_rate": 9.7746642664678e-06, + "logits/chosen": 418760960.0, + "logits/rejected": 817126784.0, + "logps/chosen": -196.73280334472656, + "logps/rejected": -723.6002197265625, + "loss": 0.1201, + "rewards/chosen": 1.6811461448669434, + "rewards/margins": 10.507400035858154, + "rewards/rejected": -8.826253890991211, + "step": 1069 + }, + { + "epoch": 0.09776153494746459, + "grad_norm": 16.25, + "kl": 0.0, + "learning_rate": 9.774237298569637e-06, + "logits/chosen": 580540525.7142857, + "logits/rejected": 658977024.0, + "logps/chosen": -390.31821986607144, + "logps/rejected": -561.1347045898438, + "loss": 0.0734, + "rewards/chosen": 2.887157440185547, + "rewards/margins": 5.587432146072388, + "rewards/rejected": -2.700274705886841, + "step": 1070 + }, + { + "epoch": 0.09785290086797624, + "grad_norm": 3.0, + "kl": 0.0, + "learning_rate": 9.773809935886287e-06, + "logits/chosen": 662255680.0, + "logits/rejected": 470409386.6666667, + "logps/chosen": -282.6756591796875, + "logps/rejected": -608.1231689453125, + "loss": 0.0098, + "rewards/chosen": 3.798976421356201, + "rewards/margins": 11.899660269419352, + "rewards/rejected": -8.10068384806315, + "step": 1071 + }, + { + "epoch": 0.09794426678848789, + "grad_norm": 5.4375, + "kl": 0.0, + "learning_rate": 9.773382178453084e-06, + "logits/chosen": 742769152.0, + "logits/rejected": 668927232.0, + "logps/chosen": -179.83575439453125, + "logps/rejected": -307.3782958984375, + "loss": 0.0378, + "rewards/chosen": 3.4367380142211914, + "rewards/margins": 9.189948081970215, + "rewards/rejected": -5.753210067749023, + "step": 1072 + }, + { + "epoch": 0.09803563270899954, + "grad_norm": 5.875, + "kl": 0.0, + "learning_rate": 9.772954026305401e-06, + "logits/chosen": 473693568.0, + "logits/rejected": 963263872.0, + "logps/chosen": -317.4881286621094, + "logps/rejected": -492.3555908203125, + "loss": 0.0333, + "rewards/chosen": 3.063990831375122, + "rewards/margins": 10.438324213027954, + "rewards/rejected": -7.374333381652832, + "step": 1073 + }, + { + "epoch": 0.09812699862951119, + "grad_norm": 13.0625, + "kl": 0.0, + "learning_rate": 9.772525479478643e-06, + "logits/chosen": 1338918912.0, + "logits/rejected": 593742131.2, + "logps/chosen": -409.7802734375, + "logps/rejected": -395.766943359375, + "loss": 0.0575, + "rewards/chosen": 3.6219730377197266, + "rewards/margins": 9.1888916015625, + "rewards/rejected": -5.566918563842774, + "step": 1074 + }, + { + "epoch": 0.09821836455002284, + "grad_norm": 7.4375, + "kl": 0.0, + "learning_rate": 9.772096538008245e-06, + "logits/chosen": 447312298.6666667, + "logits/rejected": 271371366.4, + "logps/chosen": -282.6544596354167, + "logps/rejected": -173.89783935546876, + "loss": 0.0544, + "rewards/chosen": 3.139052708943685, + "rewards/margins": 7.754042180379232, + "rewards/rejected": -4.614989471435547, + "step": 1075 + }, + { + "epoch": 0.09830973047053448, + "grad_norm": 8.0625, + "kl": 0.0, + "learning_rate": 9.77166720192968e-06, + "logits/chosen": 604565248.0, + "logps/chosen": -430.3805847167969, + "loss": 0.0573, + "rewards/chosen": 3.104785203933716, + "step": 1076 + }, + { + "epoch": 0.09840109639104613, + "grad_norm": 5.65625, + "kl": 0.0, + "learning_rate": 9.771237471278446e-06, + "logits/chosen": 595291328.0, + "logits/rejected": 381968800.0, + "logps/chosen": -396.28814697265625, + "logps/rejected": -386.534423828125, + "loss": 0.028, + "rewards/chosen": 3.2824692726135254, + "rewards/margins": 11.734791278839111, + "rewards/rejected": -8.452322006225586, + "step": 1077 + }, + { + "epoch": 0.09849246231155778, + "grad_norm": 6.65625, + "kl": 0.0, + "learning_rate": 9.77080734609008e-06, + "logits/chosen": 526688256.0, + "logits/rejected": 373145420.8, + "logps/chosen": -418.4545084635417, + "logps/rejected": -489.301123046875, + "loss": 0.029, + "rewards/chosen": 3.2557897567749023, + "rewards/margins": 9.167976188659669, + "rewards/rejected": -5.9121864318847654, + "step": 1078 + }, + { + "epoch": 0.09858382823206943, + "grad_norm": 7.0, + "kl": 0.0, + "learning_rate": 9.77037682640015e-06, + "logits/chosen": 609755904.0, + "logits/rejected": 936771392.0, + "logps/chosen": -201.1800537109375, + "logps/rejected": -549.1862182617188, + "loss": 0.048, + "rewards/chosen": 2.5986876487731934, + "rewards/margins": 7.793448448181152, + "rewards/rejected": -5.194760799407959, + "step": 1079 + }, + { + "epoch": 0.09867519415258108, + "grad_norm": 11.125, + "kl": 0.0, + "learning_rate": 9.769945912244255e-06, + "logits/chosen": 730791936.0, + "logits/rejected": 433277141.3333333, + "logps/chosen": -303.4661560058594, + "logps/rejected": -474.2211100260417, + "loss": 0.0485, + "rewards/chosen": 2.7410225868225098, + "rewards/margins": 7.8303596178690595, + "rewards/rejected": -5.08933703104655, + "step": 1080 + }, + { + "epoch": 0.09876656007309273, + "grad_norm": 4.65625, + "kl": 0.0, + "learning_rate": 9.769514603658025e-06, + "logits/chosen": 746696896.0, + "logits/rejected": 566001322.6666666, + "logps/chosen": -281.5102844238281, + "logps/rejected": -472.1954752604167, + "loss": 0.0294, + "rewards/chosen": 2.7995293140411377, + "rewards/margins": 8.971399863560993, + "rewards/rejected": -6.1718705495198565, + "step": 1081 + }, + { + "epoch": 0.09885792599360439, + "grad_norm": 5.125, + "kl": 0.0, + "learning_rate": 9.76908290067713e-06, + "logits/chosen": 570626688.0, + "logits/rejected": 527051093.3333333, + "logps/chosen": -454.5008544921875, + "logps/rejected": -418.4402669270833, + "loss": 0.0266, + "rewards/chosen": 3.495871067047119, + "rewards/margins": 9.817723433176678, + "rewards/rejected": -6.321852366129558, + "step": 1082 + }, + { + "epoch": 0.09894929191411604, + "grad_norm": 19.125, + "kl": 0.0, + "learning_rate": 9.768650803337265e-06, + "logits/chosen": 318074026.6666667, + "logits/rejected": 463295692.8, + "logps/chosen": -339.3994954427083, + "logps/rejected": -357.03515625, + "loss": 0.0839, + "rewards/chosen": 4.377325693766276, + "rewards/margins": 9.955920664469401, + "rewards/rejected": -5.578594970703125, + "step": 1083 + }, + { + "epoch": 0.09904065783462769, + "grad_norm": 11.0, + "kl": 0.0, + "learning_rate": 9.768218311674162e-06, + "logits/chosen": 396680864.0, + "logits/rejected": 657083584.0, + "logps/chosen": -305.6790466308594, + "logps/rejected": -512.7495727539062, + "loss": 0.1098, + "rewards/chosen": 2.0947113037109375, + "rewards/margins": 9.13989543914795, + "rewards/rejected": -7.045184135437012, + "step": 1084 + }, + { + "epoch": 0.09913202375513934, + "grad_norm": 5.6875, + "kl": 0.0, + "learning_rate": 9.767785425723583e-06, + "logits/chosen": 270871488.0, + "logits/rejected": 509674069.3333333, + "logps/chosen": -303.7098388671875, + "logps/rejected": -534.3470865885416, + "loss": 0.0242, + "rewards/chosen": 4.818572998046875, + "rewards/margins": 9.729876200358074, + "rewards/rejected": -4.911303202311198, + "step": 1085 + }, + { + "epoch": 0.09922338967565099, + "grad_norm": 20.0, + "kl": 0.0, + "learning_rate": 9.767352145521322e-06, + "logits/chosen": 595006784.0, + "logits/rejected": 660460544.0, + "logps/chosen": -298.6661376953125, + "logps/rejected": -404.83447265625, + "loss": 0.0899, + "rewards/chosen": 2.624340057373047, + "rewards/margins": 7.854648113250732, + "rewards/rejected": -5.2303080558776855, + "step": 1086 + }, + { + "epoch": 0.09931475559616264, + "grad_norm": 9.875, + "kl": 0.0, + "learning_rate": 9.766918471103212e-06, + "logits/chosen": 603991893.3333334, + "logits/rejected": 505700198.4, + "logps/chosen": -402.7052408854167, + "logps/rejected": -481.676953125, + "loss": 0.0458, + "rewards/chosen": 3.0130844116210938, + "rewards/margins": 12.018248748779296, + "rewards/rejected": -9.005164337158202, + "step": 1087 + }, + { + "epoch": 0.09940612151667429, + "grad_norm": 5.03125, + "kl": 0.0, + "learning_rate": 9.76648440250511e-06, + "logits/chosen": 564119936.0, + "logits/rejected": 597895808.0, + "logps/chosen": -312.69403076171875, + "logps/rejected": -589.00146484375, + "loss": 0.026, + "rewards/chosen": 3.1032447814941406, + "rewards/margins": 12.512195587158203, + "rewards/rejected": -9.408950805664062, + "step": 1088 + }, + { + "epoch": 0.09949748743718594, + "grad_norm": 8.4375, + "kl": 0.0, + "learning_rate": 9.76604993976291e-06, + "logits/chosen": 577203392.0, + "logits/rejected": 276390528.0, + "logps/chosen": -419.13726806640625, + "logps/rejected": -329.4480387369792, + "loss": 0.0359, + "rewards/chosen": 2.5354721546173096, + "rewards/margins": 10.019108374913532, + "rewards/rejected": -7.483636220296224, + "step": 1089 + }, + { + "epoch": 0.09958885335769758, + "grad_norm": 5.34375, + "kl": 0.0, + "learning_rate": 9.765615082912536e-06, + "logits/chosen": 334780672.0, + "logits/rejected": 362665241.6, + "logps/chosen": -220.33561197916666, + "logps/rejected": -532.561083984375, + "loss": 0.0198, + "rewards/chosen": 2.9588851928710938, + "rewards/margins": 14.465523529052735, + "rewards/rejected": -11.506638336181641, + "step": 1090 + }, + { + "epoch": 0.09968021927820923, + "grad_norm": 8.75, + "kl": 0.0, + "learning_rate": 9.765179831989952e-06, + "logits/chosen": 387321386.6666667, + "logits/rejected": 423988428.8, + "logps/chosen": -147.67709350585938, + "logps/rejected": -476.26494140625, + "loss": 0.0612, + "rewards/chosen": 1.670507589975993, + "rewards/margins": 9.89983345667521, + "rewards/rejected": -8.229325866699218, + "step": 1091 + }, + { + "epoch": 0.09977158519872088, + "grad_norm": 10.25, + "kl": 0.0, + "learning_rate": 9.764744187031145e-06, + "logits/chosen": 649250496.0, + "logits/rejected": 682102592.0, + "logps/chosen": -415.1207580566406, + "logps/rejected": -327.55926513671875, + "loss": 0.0469, + "rewards/chosen": 2.7471213340759277, + "rewards/margins": 9.457348346710205, + "rewards/rejected": -6.710227012634277, + "step": 1092 + }, + { + "epoch": 0.09986295111923253, + "grad_norm": 2.453125, + "kl": 0.0, + "learning_rate": 9.764308148072141e-06, + "logits/chosen": 744129152.0, + "logits/rejected": 583149141.3333334, + "logps/chosen": -303.8807067871094, + "logps/rejected": -490.61669921875, + "loss": 0.0095, + "rewards/chosen": 3.472266435623169, + "rewards/margins": 12.219659725824991, + "rewards/rejected": -8.747393290201822, + "step": 1093 + }, + { + "epoch": 0.09995431703974418, + "grad_norm": 5.40625, + "kl": 0.0, + "learning_rate": 9.763871715148995e-06, + "logits/chosen": 412816486.4, + "logits/rejected": 274956117.3333333, + "logps/chosen": -495.15673828125, + "logps/rejected": -583.5095621744791, + "loss": 0.0298, + "rewards/chosen": 3.4050033569335936, + "rewards/margins": 16.179538218180337, + "rewards/rejected": -12.774534861246744, + "step": 1094 + }, + { + "epoch": 0.10004568296025583, + "grad_norm": 2.765625, + "kl": 0.0, + "learning_rate": 9.763434888297796e-06, + "logits/chosen": 452388032.0, + "logits/rejected": 595113280.0, + "logps/chosen": -260.8658752441406, + "logps/rejected": -668.3629150390625, + "loss": 0.0172, + "rewards/chosen": 3.613426685333252, + "rewards/margins": 14.539092540740967, + "rewards/rejected": -10.925665855407715, + "step": 1095 + }, + { + "epoch": 0.10013704888076748, + "grad_norm": 14.9375, + "kl": 0.0, + "learning_rate": 9.762997667554666e-06, + "logits/chosen": 737450816.0, + "logits/rejected": 628844672.0, + "logps/chosen": -282.0447082519531, + "logps/rejected": -419.9131164550781, + "loss": 0.0396, + "rewards/chosen": 2.7248687744140625, + "rewards/margins": 10.869800567626953, + "rewards/rejected": -8.14493179321289, + "step": 1096 + }, + { + "epoch": 0.10022841480127913, + "grad_norm": 17.0, + "kl": 0.0, + "learning_rate": 9.76256005295576e-06, + "logits/chosen": 658074777.6, + "logits/rejected": 431193173.3333333, + "logps/chosen": -375.0765625, + "logps/rejected": -631.6846923828125, + "loss": 0.1164, + "rewards/chosen": 1.8580966949462892, + "rewards/margins": 11.451568857828775, + "rewards/rejected": -9.593472162882486, + "step": 1097 + }, + { + "epoch": 0.10031978072179078, + "grad_norm": 10.8125, + "kl": 0.0, + "learning_rate": 9.762122044537263e-06, + "logits/chosen": 617992021.3333334, + "logits/rejected": 485116160.0, + "logps/chosen": -321.2744954427083, + "logps/rejected": -425.1321105957031, + "loss": 0.0441, + "rewards/chosen": 3.116466522216797, + "rewards/margins": 11.207372665405273, + "rewards/rejected": -8.090906143188477, + "step": 1098 + }, + { + "epoch": 0.10041114664230243, + "grad_norm": 4.75, + "kl": 0.0, + "learning_rate": 9.761683642335396e-06, + "logits/chosen": 886173248.0, + "logits/rejected": 474086784.0, + "logps/chosen": -713.322998046875, + "logps/rejected": -408.3943684895833, + "loss": 0.0166, + "rewards/chosen": 2.6800568103790283, + "rewards/margins": 11.557816108067831, + "rewards/rejected": -8.877759297688803, + "step": 1099 + }, + { + "epoch": 0.10050251256281408, + "grad_norm": 5.71875, + "kl": 0.0, + "learning_rate": 9.761244846386408e-06, + "logits/chosen": 750994304.0, + "logits/rejected": 465355861.3333333, + "logps/chosen": -341.0248718261719, + "logps/rejected": -468.4816080729167, + "loss": 0.0285, + "rewards/chosen": 2.0996298789978027, + "rewards/margins": 10.376341025034586, + "rewards/rejected": -8.276711146036783, + "step": 1100 + }, + { + "epoch": 0.10059387848332572, + "grad_norm": 7.40625, + "kl": 0.0, + "learning_rate": 9.760805656726586e-06, + "logits/chosen": 498854400.0, + "logits/rejected": 454175776.0, + "logps/chosen": -381.5357259114583, + "logps/rejected": -584.6099853515625, + "loss": 0.0443, + "rewards/chosen": 2.9443861643473306, + "rewards/margins": 14.217195192972818, + "rewards/rejected": -11.272809028625488, + "step": 1101 + }, + { + "epoch": 0.10068524440383737, + "grad_norm": 10.1875, + "kl": 0.0, + "learning_rate": 9.760366073392246e-06, + "logits/chosen": 597289289.1428572, + "logits/rejected": 236865888.0, + "logps/chosen": -231.75441196986608, + "logps/rejected": -199.59925842285156, + "loss": 0.0911, + "rewards/chosen": 3.153406960623605, + "rewards/margins": 9.422646863119944, + "rewards/rejected": -6.269239902496338, + "step": 1102 + }, + { + "epoch": 0.10077661032434902, + "grad_norm": 9.0, + "kl": 0.0, + "learning_rate": 9.759926096419738e-06, + "logits/chosen": 457528480.0, + "logits/rejected": 359532064.0, + "logps/chosen": -270.14544677734375, + "logps/rejected": -467.3085021972656, + "loss": 0.0446, + "rewards/chosen": 2.6350784301757812, + "rewards/margins": 11.160774230957031, + "rewards/rejected": -8.52569580078125, + "step": 1103 + }, + { + "epoch": 0.10086797624486067, + "grad_norm": 17.125, + "kl": 0.0, + "learning_rate": 9.759485725845442e-06, + "logits/chosen": 487146837.3333333, + "logits/rejected": 727950336.0, + "logps/chosen": -455.120361328125, + "logps/rejected": -421.0912109375, + "loss": 0.1359, + "rewards/chosen": 0.9092254638671875, + "rewards/margins": 8.300482177734375, + "rewards/rejected": -7.391256713867188, + "step": 1104 + }, + { + "epoch": 0.10095934216537232, + "grad_norm": 11.5625, + "kl": 0.0, + "learning_rate": 9.759044961705775e-06, + "logits/chosen": 461596202.6666667, + "logits/rejected": 513502912.0, + "logps/chosen": -341.4691162109375, + "logps/rejected": -571.6687622070312, + "loss": 0.0638, + "rewards/chosen": 2.624436060587565, + "rewards/margins": 10.754048029581705, + "rewards/rejected": -8.12961196899414, + "step": 1105 + }, + { + "epoch": 0.10105070808588397, + "grad_norm": 23.625, + "kl": 0.0, + "learning_rate": 9.758603804037184e-06, + "logits/chosen": 490617770.6666667, + "logits/rejected": 324675891.2, + "logps/chosen": -254.98490397135416, + "logps/rejected": -446.475, + "loss": 0.112, + "rewards/chosen": 1.0760173002878826, + "rewards/margins": 10.680819241205851, + "rewards/rejected": -9.60480194091797, + "step": 1106 + }, + { + "epoch": 0.10114207400639562, + "grad_norm": 16.5, + "kl": 0.0, + "learning_rate": 9.758162252876147e-06, + "logits/chosen": 766233002.6666666, + "logits/rejected": 530189209.6, + "logps/chosen": -193.8149210611979, + "logps/rejected": -315.1135986328125, + "loss": 0.0793, + "rewards/chosen": 2.2619447708129883, + "rewards/margins": 7.704940986633301, + "rewards/rejected": -5.442996215820313, + "step": 1107 + }, + { + "epoch": 0.10123343992690727, + "grad_norm": 8.875, + "kl": 0.0, + "learning_rate": 9.757720308259177e-06, + "logits/chosen": 358113792.0, + "logits/rejected": 391144371.2, + "logps/chosen": -227.6949666341146, + "logps/rejected": -396.589453125, + "loss": 0.0278, + "rewards/chosen": 2.9516321818033853, + "rewards/margins": 9.96635233561198, + "rewards/rejected": -7.014720153808594, + "step": 1108 + }, + { + "epoch": 0.10132480584741892, + "grad_norm": 6.03125, + "kl": 0.0, + "learning_rate": 9.757277970222818e-06, + "logits/chosen": 440572330.6666667, + "logits/rejected": 115450568.0, + "logps/chosen": -335.74330647786456, + "logps/rejected": -501.3531494140625, + "loss": 0.1224, + "rewards/chosen": 2.8735663096110025, + "rewards/margins": 8.24419085184733, + "rewards/rejected": -5.370624542236328, + "step": 1109 + }, + { + "epoch": 0.10141617176793057, + "grad_norm": 13.25, + "kl": 0.0, + "learning_rate": 9.75683523880365e-06, + "logits/chosen": 651144874.6666666, + "logits/rejected": 708468992.0, + "logps/chosen": -634.292724609375, + "logps/rejected": -465.893017578125, + "loss": 0.0509, + "rewards/chosen": 2.659522533416748, + "rewards/margins": 9.22143201828003, + "rewards/rejected": -6.561909484863281, + "step": 1110 + }, + { + "epoch": 0.10150753768844221, + "grad_norm": 6.40625, + "kl": 0.0, + "learning_rate": 9.75639211403828e-06, + "logits/chosen": 574621184.0, + "logits/rejected": 850617728.0, + "logps/chosen": -270.6024169921875, + "logps/rejected": -298.8818664550781, + "loss": 0.0339, + "rewards/chosen": 3.149566173553467, + "rewards/margins": 8.642225742340088, + "rewards/rejected": -5.492659568786621, + "step": 1111 + }, + { + "epoch": 0.10159890360895386, + "grad_norm": 14.4375, + "kl": 0.0, + "learning_rate": 9.755948595963351e-06, + "logits/chosen": 532777696.0, + "logits/rejected": 295628096.0, + "logps/chosen": -333.61822509765625, + "logps/rejected": -323.25115966796875, + "loss": 0.1386, + "rewards/chosen": 2.075462818145752, + "rewards/margins": 7.432382583618164, + "rewards/rejected": -5.356919765472412, + "step": 1112 + }, + { + "epoch": 0.10169026952946551, + "grad_norm": 9.1875, + "kl": 0.0, + "learning_rate": 9.755504684615538e-06, + "logits/chosen": 547040960.0, + "logits/rejected": 351790464.0, + "logps/chosen": -324.1651916503906, + "logps/rejected": -430.46771240234375, + "loss": 0.0519, + "rewards/chosen": 2.5107860565185547, + "rewards/margins": 8.05691909790039, + "rewards/rejected": -5.546133041381836, + "step": 1113 + }, + { + "epoch": 0.10178163544997716, + "grad_norm": 7.40625, + "kl": 0.0, + "learning_rate": 9.755060380031547e-06, + "logits/chosen": 857257344.0, + "logits/rejected": 227731744.0, + "logps/chosen": -428.6658630371094, + "logps/rejected": -267.4676513671875, + "loss": 0.0344, + "rewards/chosen": 3.053333282470703, + "rewards/margins": 8.020322322845459, + "rewards/rejected": -4.966989040374756, + "step": 1114 + }, + { + "epoch": 0.10187300137048881, + "grad_norm": 6.3125, + "kl": 0.0, + "learning_rate": 9.754615682248121e-06, + "logits/chosen": 810374528.0, + "logits/rejected": 898189632.0, + "logps/chosen": -279.09405517578125, + "logps/rejected": -988.9417724609375, + "loss": 0.0375, + "rewards/chosen": 2.6275875568389893, + "rewards/margins": 12.664038896560669, + "rewards/rejected": -10.03645133972168, + "step": 1115 + }, + { + "epoch": 0.10196436729100046, + "grad_norm": 8.4375, + "kl": 0.0, + "learning_rate": 9.75417059130203e-06, + "logits/chosen": 1089357909.3333333, + "logits/rejected": 577433292.8, + "logps/chosen": -404.7552490234375, + "logps/rejected": -273.5675537109375, + "loss": 0.1135, + "rewards/chosen": 2.9690815607706704, + "rewards/margins": 6.9522804896036785, + "rewards/rejected": -3.9831989288330076, + "step": 1116 + }, + { + "epoch": 0.10205573321151211, + "grad_norm": 8.125, + "kl": 0.0, + "learning_rate": 9.753725107230078e-06, + "logits/chosen": 594408960.0, + "logits/rejected": 339375872.0, + "logps/chosen": -360.16839599609375, + "logps/rejected": -438.76593017578125, + "loss": 0.0505, + "rewards/chosen": 2.2415390014648438, + "rewards/margins": 10.46461296081543, + "rewards/rejected": -8.223073959350586, + "step": 1117 + }, + { + "epoch": 0.10214709913202376, + "grad_norm": 10.75, + "kl": 0.0, + "learning_rate": 9.753279230069107e-06, + "logits/chosen": 1442103936.0, + "logits/rejected": 593248000.0, + "logps/chosen": -330.8989562988281, + "logps/rejected": -367.48370361328125, + "loss": 0.108, + "rewards/chosen": 2.284723997116089, + "rewards/margins": 7.634294271469116, + "rewards/rejected": -5.349570274353027, + "step": 1118 + }, + { + "epoch": 0.10223846505253541, + "grad_norm": 7.9375, + "kl": 0.0, + "learning_rate": 9.75283295985598e-06, + "logits/chosen": 1064817728.0, + "logits/rejected": 797390592.0, + "logps/chosen": -346.99908447265625, + "logps/rejected": -718.42431640625, + "loss": 0.0532, + "rewards/chosen": 2.3248353004455566, + "rewards/margins": 10.832815647125244, + "rewards/rejected": -8.507980346679688, + "step": 1119 + }, + { + "epoch": 0.10232983097304706, + "grad_norm": 17.625, + "kl": 0.0, + "learning_rate": 9.752386296627606e-06, + "logits/chosen": 916712448.0, + "logits/rejected": 555530069.3333334, + "logps/chosen": -506.0830078125, + "logps/rejected": -511.0281575520833, + "loss": 0.089, + "rewards/chosen": 2.426018714904785, + "rewards/margins": 10.876341183980307, + "rewards/rejected": -8.450322469075521, + "step": 1120 + }, + { + "epoch": 0.1024211968935587, + "grad_norm": 15.4375, + "kl": 0.0, + "learning_rate": 9.751939240420916e-06, + "logits/chosen": 477655398.4, + "logits/rejected": 411012266.6666667, + "logps/chosen": -277.5781494140625, + "logps/rejected": -432.0636393229167, + "loss": 0.0576, + "rewards/chosen": 3.370177459716797, + "rewards/margins": 10.038606516520183, + "rewards/rejected": -6.668429056803386, + "step": 1121 + }, + { + "epoch": 0.10251256281407035, + "grad_norm": 11.0, + "kl": 0.0, + "learning_rate": 9.751491791272879e-06, + "logits/chosen": 1006108330.6666666, + "logits/rejected": 715437120.0, + "logps/chosen": -318.5185953776042, + "logps/rejected": -506.7901611328125, + "loss": 0.088, + "rewards/chosen": 2.293069044748942, + "rewards/margins": 9.403052965799967, + "rewards/rejected": -7.109983921051025, + "step": 1122 + }, + { + "epoch": 0.102603928734582, + "grad_norm": 5.1875, + "kl": 0.0, + "learning_rate": 9.751043949220494e-06, + "logits/chosen": 605305002.6666666, + "logits/rejected": 554534451.2, + "logps/chosen": -472.1299641927083, + "logps/rejected": -345.494580078125, + "loss": 0.031, + "rewards/chosen": 3.0792083740234375, + "rewards/margins": 7.7251544952392575, + "rewards/rejected": -4.64594612121582, + "step": 1123 + }, + { + "epoch": 0.10269529465509365, + "grad_norm": 5.5625, + "kl": 0.0, + "learning_rate": 9.750595714300795e-06, + "logits/chosen": 560726835.2, + "logits/rejected": 520150314.6666667, + "logps/chosen": -314.126611328125, + "logps/rejected": -523.4945882161459, + "loss": 0.0415, + "rewards/chosen": 3.2809547424316405, + "rewards/margins": 8.239160919189453, + "rewards/rejected": -4.9582061767578125, + "step": 1124 + }, + { + "epoch": 0.1027866605756053, + "grad_norm": 3.75, + "kl": 0.0, + "learning_rate": 9.750147086550843e-06, + "logits/chosen": 238848896.0, + "logits/rejected": 357732522.6666667, + "logps/chosen": -173.3463592529297, + "logps/rejected": -319.54144287109375, + "loss": 0.0226, + "rewards/chosen": 2.97416615486145, + "rewards/margins": 9.375247875849407, + "rewards/rejected": -6.401081720987956, + "step": 1125 + }, + { + "epoch": 0.10287802649611695, + "grad_norm": 5.375, + "kl": 0.0, + "learning_rate": 9.749698066007741e-06, + "logits/chosen": 397388580.5714286, + "logits/rejected": 1024675712.0, + "logps/chosen": -206.10410853794642, + "logps/rejected": -1011.9732666015625, + "loss": 0.0488, + "rewards/chosen": 3.355846949986049, + "rewards/margins": 12.172277995518275, + "rewards/rejected": -8.816431045532227, + "step": 1126 + }, + { + "epoch": 0.1029693924166286, + "grad_norm": 4.4375, + "kl": 0.0, + "learning_rate": 9.749248652708615e-06, + "logits/chosen": 395048448.0, + "logits/rejected": 841073493.3333334, + "logps/chosen": -171.45657348632812, + "logps/rejected": -705.6437174479166, + "loss": 0.0175, + "rewards/chosen": 3.0539183616638184, + "rewards/margins": 10.041159470876057, + "rewards/rejected": -6.987241109212239, + "step": 1127 + }, + { + "epoch": 0.10306075833714025, + "grad_norm": 6.71875, + "kl": 0.0, + "learning_rate": 9.748798846690627e-06, + "logits/chosen": 696559001.6, + "logits/rejected": 868518144.0, + "logps/chosen": -325.6597412109375, + "logps/rejected": -521.6851399739584, + "loss": 0.0466, + "rewards/chosen": 3.1415678024291993, + "rewards/margins": 7.157613690694173, + "rewards/rejected": -4.016045888264974, + "step": 1128 + }, + { + "epoch": 0.1031521242576519, + "grad_norm": 3.828125, + "kl": 0.0, + "learning_rate": 9.748348647990975e-06, + "logits/chosen": 451672576.0, + "logits/rejected": 526578892.8, + "logps/chosen": -202.811279296875, + "logps/rejected": -519.310400390625, + "loss": 0.0267, + "rewards/chosen": 2.6553619702657065, + "rewards/margins": 9.00577319463094, + "rewards/rejected": -6.350411224365234, + "step": 1129 + }, + { + "epoch": 0.10324349017816355, + "grad_norm": 6.96875, + "kl": 0.0, + "learning_rate": 9.747898056646883e-06, + "logits/chosen": 630237440.0, + "logits/rejected": 506551381.3333333, + "logps/chosen": -400.76373291015625, + "logps/rejected": -472.0165608723958, + "loss": 0.0355, + "rewards/chosen": 2.622384548187256, + "rewards/margins": 7.341928641001384, + "rewards/rejected": -4.719544092814128, + "step": 1130 + }, + { + "epoch": 0.1033348560986752, + "grad_norm": 18.375, + "kl": 0.0, + "learning_rate": 9.747447072695611e-06, + "logits/chosen": 1761433216.0, + "logits/rejected": 635055232.0, + "logps/chosen": -586.626953125, + "logps/rejected": -416.9795735677083, + "loss": 0.0832, + "rewards/chosen": 1.9614226818084717, + "rewards/margins": 7.572226285934448, + "rewards/rejected": -5.610803604125977, + "step": 1131 + }, + { + "epoch": 0.10342622201918684, + "grad_norm": 13.625, + "kl": 0.0, + "learning_rate": 9.746995696174453e-06, + "logits/chosen": 705961685.3333334, + "logits/rejected": 468283545.6, + "logps/chosen": -457.199951171875, + "logps/rejected": -297.87734375, + "loss": 0.0982, + "rewards/chosen": 1.8772347768147786, + "rewards/margins": 5.606739934285482, + "rewards/rejected": -3.729505157470703, + "step": 1132 + }, + { + "epoch": 0.1035175879396985, + "grad_norm": 4.4375, + "kl": 0.0, + "learning_rate": 9.746543927120732e-06, + "logits/chosen": 1074370662.4, + "logits/rejected": 812378112.0, + "logps/chosen": -148.42752685546876, + "logps/rejected": -722.1666666666666, + "loss": 0.0428, + "rewards/chosen": 2.9113916397094726, + "rewards/margins": 11.98933162689209, + "rewards/rejected": -9.077939987182617, + "step": 1133 + }, + { + "epoch": 0.10360895386021014, + "grad_norm": 5.15625, + "kl": 0.0, + "learning_rate": 9.746091765571808e-06, + "logits/chosen": 528380364.8, + "logits/rejected": 617216682.6666666, + "logps/chosen": -411.85830078125, + "logps/rejected": -379.5177408854167, + "loss": 0.0261, + "rewards/chosen": 3.71253662109375, + "rewards/margins": 9.64720802307129, + "rewards/rejected": -5.934671401977539, + "step": 1134 + }, + { + "epoch": 0.10370031978072179, + "grad_norm": 8.0625, + "kl": 0.0, + "learning_rate": 9.745639211565068e-06, + "logits/chosen": 733154645.3333334, + "logits/rejected": 637447884.8, + "logps/chosen": -533.380859375, + "logps/rejected": -310.577392578125, + "loss": 0.0532, + "rewards/chosen": 3.0977115631103516, + "rewards/margins": 8.050391006469727, + "rewards/rejected": -4.952679443359375, + "step": 1135 + }, + { + "epoch": 0.10379168570123344, + "grad_norm": 4.34375, + "kl": 0.0, + "learning_rate": 9.745186265137934e-06, + "logits/chosen": 480759893.3333333, + "logits/rejected": 417557312.0, + "logps/chosen": -561.4908854166666, + "logps/rejected": -445.4042053222656, + "loss": 0.024, + "rewards/chosen": 3.7727762858072915, + "rewards/margins": 10.654114882151285, + "rewards/rejected": -6.881338596343994, + "step": 1136 + }, + { + "epoch": 0.10388305162174509, + "grad_norm": 6.75, + "kl": 0.0, + "learning_rate": 9.74473292632786e-06, + "logits/chosen": 461497600.0, + "logits/rejected": 322413772.8, + "logps/chosen": -357.9340006510417, + "logps/rejected": -319.16669921875, + "loss": 0.0431, + "rewards/chosen": 3.0508607228597007, + "rewards/margins": 9.12120869954427, + "rewards/rejected": -6.0703479766845705, + "step": 1137 + }, + { + "epoch": 0.10397441754225674, + "grad_norm": 10.6875, + "kl": 0.0, + "learning_rate": 9.744279195172334e-06, + "logits/chosen": 1163352166.4, + "logits/rejected": 789638997.3333334, + "logps/chosen": -459.86728515625, + "logps/rejected": -744.7135416666666, + "loss": 0.0619, + "rewards/chosen": 2.3978126525878904, + "rewards/margins": 14.055086263020833, + "rewards/rejected": -11.657273610432943, + "step": 1138 + }, + { + "epoch": 0.10406578346276839, + "grad_norm": 4.59375, + "kl": 0.0, + "learning_rate": 9.743825071708877e-06, + "logits/chosen": 791031381.3333334, + "logits/rejected": 1063403417.6, + "logps/chosen": -593.2963053385416, + "logps/rejected": -620.41328125, + "loss": 0.0222, + "rewards/chosen": 2.953550338745117, + "rewards/margins": 10.113156509399413, + "rewards/rejected": -7.159606170654297, + "step": 1139 + }, + { + "epoch": 0.10415714938328004, + "grad_norm": 8.875, + "kl": 0.0, + "learning_rate": 9.743370555975037e-06, + "logits/chosen": 677638144.0, + "logits/rejected": 544888192.0, + "logps/chosen": -387.0820617675781, + "logps/rejected": -357.1920166015625, + "loss": 0.0437, + "rewards/chosen": 2.9574966430664062, + "rewards/margins": 8.494236469268799, + "rewards/rejected": -5.536739826202393, + "step": 1140 + }, + { + "epoch": 0.10424851530379169, + "grad_norm": 9.5625, + "kl": 0.0, + "learning_rate": 9.742915648008403e-06, + "logits/chosen": 583174553.6, + "logits/rejected": 406745770.6666667, + "logps/chosen": -483.753857421875, + "logps/rejected": -362.0794270833333, + "loss": 0.1328, + "rewards/chosen": 1.908823585510254, + "rewards/margins": 8.420229530334472, + "rewards/rejected": -6.511405944824219, + "step": 1141 + }, + { + "epoch": 0.10433988122430334, + "grad_norm": 14.1875, + "kl": 0.0, + "learning_rate": 9.742460347846588e-06, + "logits/chosen": 479412778.6666667, + "logits/rejected": 705252928.0, + "logps/chosen": -288.07663981119794, + "logps/rejected": -313.43878173828125, + "loss": 0.0846, + "rewards/chosen": 3.2039432525634766, + "rewards/margins": 10.387199878692627, + "rewards/rejected": -7.18325662612915, + "step": 1142 + }, + { + "epoch": 0.10443124714481498, + "grad_norm": 9.25, + "kl": 0.0, + "learning_rate": 9.742004655527241e-06, + "logits/chosen": 1096376448.0, + "logits/rejected": 859091264.0, + "logps/chosen": -290.7041931152344, + "logps/rejected": -744.36767578125, + "loss": 0.0846, + "rewards/chosen": 1.871778964996338, + "rewards/margins": 10.107425212860107, + "rewards/rejected": -8.23564624786377, + "step": 1143 + }, + { + "epoch": 0.10452261306532663, + "grad_norm": 5.3125, + "kl": 0.0, + "learning_rate": 9.741548571088046e-06, + "logits/chosen": 509299370.6666667, + "logits/rejected": 555630400.0, + "logps/chosen": -500.8505859375, + "logps/rejected": -654.787841796875, + "loss": 0.0346, + "rewards/chosen": 3.369279225667318, + "rewards/margins": 13.56837018330892, + "rewards/rejected": -10.199090957641602, + "step": 1144 + }, + { + "epoch": 0.10461397898583828, + "grad_norm": 6.625, + "kl": 0.0, + "learning_rate": 9.741092094566714e-06, + "logits/chosen": 1004797760.0, + "logits/rejected": 624346496.0, + "logps/chosen": -373.7278137207031, + "logps/rejected": -329.86639404296875, + "loss": 0.0662, + "rewards/chosen": 2.7627785205841064, + "rewards/margins": 9.756582975387573, + "rewards/rejected": -6.993804454803467, + "step": 1145 + }, + { + "epoch": 0.10470534490634993, + "grad_norm": 8.1875, + "kl": 0.0, + "learning_rate": 9.740635226000994e-06, + "logits/chosen": 527384928.0, + "logits/rejected": 548073472.0, + "logps/chosen": -274.9749755859375, + "logps/rejected": -483.5373942057292, + "loss": 0.0266, + "rewards/chosen": 3.927914619445801, + "rewards/margins": 12.694429715474447, + "rewards/rejected": -8.766515096028646, + "step": 1146 + }, + { + "epoch": 0.10479671082686158, + "grad_norm": 7.6875, + "kl": 0.0, + "learning_rate": 9.740177965428664e-06, + "logits/chosen": 786177536.0, + "logits/rejected": 538407104.0, + "logps/chosen": -309.38254801432294, + "logps/rejected": -658.314453125, + "loss": 0.0448, + "rewards/chosen": 2.9127283096313477, + "rewards/margins": 13.344338417053223, + "rewards/rejected": -10.431610107421875, + "step": 1147 + }, + { + "epoch": 0.10488807674737323, + "grad_norm": 10.1875, + "kl": 0.0, + "learning_rate": 9.739720312887536e-06, + "logits/chosen": 621769664.0, + "logits/rejected": 428537312.0, + "logps/chosen": -270.16754150390625, + "logps/rejected": -468.88818359375, + "loss": 0.0337, + "rewards/chosen": 3.047891855239868, + "rewards/margins": 11.52284550666809, + "rewards/rejected": -8.474953651428223, + "step": 1148 + }, + { + "epoch": 0.10497944266788488, + "grad_norm": 15.875, + "kl": 0.0, + "learning_rate": 9.73926226841545e-06, + "logits/chosen": 780301696.0, + "logits/rejected": 536441173.3333333, + "logps/chosen": -450.890625, + "logps/rejected": -666.33447265625, + "loss": 0.0732, + "rewards/chosen": 1.1451218128204346, + "rewards/margins": 11.42817489306132, + "rewards/rejected": -10.283053080240885, + "step": 1149 + }, + { + "epoch": 0.10507080858839653, + "grad_norm": 6.59375, + "kl": 0.0, + "learning_rate": 9.738803832050285e-06, + "logits/chosen": 511987916.8, + "logits/rejected": 1379956053.3333333, + "logps/chosen": -181.37015380859376, + "logps/rejected": -538.75341796875, + "loss": 0.1435, + "rewards/chosen": 2.1525091171264648, + "rewards/margins": 9.846508725484211, + "rewards/rejected": -7.693999608357747, + "step": 1150 + }, + { + "epoch": 0.10516217450890818, + "grad_norm": 1.78125, + "kl": 0.0, + "learning_rate": 9.73834500382995e-06, + "logits/chosen": 495766229.3333333, + "logits/rejected": 572505190.4, + "logps/chosen": -357.2786458333333, + "logps/rejected": -477.475390625, + "loss": 0.0092, + "rewards/chosen": 3.864241600036621, + "rewards/margins": 12.539832878112794, + "rewards/rejected": -8.675591278076173, + "step": 1151 + }, + { + "epoch": 0.10525354042941983, + "grad_norm": 4.8125, + "kl": 0.0, + "learning_rate": 9.737885783792384e-06, + "logits/chosen": 500776064.0, + "logits/rejected": 589529804.8, + "logps/chosen": -442.4441731770833, + "logps/rejected": -229.526611328125, + "loss": 0.0225, + "rewards/chosen": 2.9141600926717124, + "rewards/margins": 9.822009976704916, + "rewards/rejected": -6.907849884033203, + "step": 1152 + }, + { + "epoch": 0.10534490634993147, + "grad_norm": 6.6875, + "kl": 0.0, + "learning_rate": 9.737426171975563e-06, + "logits/chosen": 547269734.4, + "logits/rejected": 1019979178.6666666, + "logps/chosen": -248.0677490234375, + "logps/rejected": -496.1127115885417, + "loss": 0.0528, + "rewards/chosen": 2.7735012054443358, + "rewards/margins": 9.3486634572347, + "rewards/rejected": -6.575162251790364, + "step": 1153 + }, + { + "epoch": 0.10543627227044312, + "grad_norm": 5.25, + "kl": 0.0, + "learning_rate": 9.73696616841749e-06, + "logits/chosen": 912067925.3333334, + "logits/rejected": 876932915.2, + "logps/chosen": -603.8129069010416, + "logps/rejected": -391.904052734375, + "loss": 0.0237, + "rewards/chosen": 2.9205423990885415, + "rewards/margins": 9.700941721598307, + "rewards/rejected": -6.780399322509766, + "step": 1154 + }, + { + "epoch": 0.10552763819095477, + "grad_norm": 3.265625, + "kl": 0.0, + "learning_rate": 9.736505773156203e-06, + "logits/chosen": 474599328.0, + "logits/rejected": 906903210.6666666, + "logps/chosen": -210.54696655273438, + "logps/rejected": -593.1189371744791, + "loss": 0.0132, + "rewards/chosen": 3.5632271766662598, + "rewards/margins": 12.677828311920166, + "rewards/rejected": -9.114601135253906, + "step": 1155 + }, + { + "epoch": 0.10561900411146642, + "grad_norm": 2.078125, + "kl": 0.0, + "learning_rate": 9.736044986229775e-06, + "logits/chosen": 635828224.0, + "logits/rejected": 301496928.0, + "logps/chosen": -340.45086669921875, + "logps/rejected": -416.734619140625, + "loss": 0.0128, + "rewards/chosen": 3.9141223430633545, + "rewards/margins": 11.87598729133606, + "rewards/rejected": -7.961864948272705, + "step": 1156 + }, + { + "epoch": 0.10571037003197807, + "grad_norm": 2.46875, + "kl": 0.0, + "learning_rate": 9.735583807676306e-06, + "logits/chosen": 569460480.0, + "logits/rejected": 404297898.6666667, + "logps/chosen": -430.78302001953125, + "logps/rejected": -555.8611653645834, + "loss": 0.0124, + "rewards/chosen": 3.0545883178710938, + "rewards/margins": 11.220559438069662, + "rewards/rejected": -8.165971120198568, + "step": 1157 + }, + { + "epoch": 0.10580173595248972, + "grad_norm": 9.5, + "kl": 0.0, + "learning_rate": 9.735122237533932e-06, + "logits/chosen": 498936960.0, + "logits/rejected": 518318890.6666667, + "logps/chosen": -234.13453674316406, + "logps/rejected": -491.97802734375, + "loss": 0.0523, + "rewards/chosen": 3.2642555236816406, + "rewards/margins": 8.779123942057293, + "rewards/rejected": -5.514868418375651, + "step": 1158 + }, + { + "epoch": 0.10589310187300137, + "grad_norm": 7.40625, + "kl": 0.0, + "learning_rate": 9.734660275840822e-06, + "logits/chosen": 741369344.0, + "logits/rejected": 622723136.0, + "logps/chosen": -433.091796875, + "logps/rejected": -575.390380859375, + "loss": 0.0371, + "rewards/chosen": 2.918767213821411, + "rewards/margins": 12.841643571853638, + "rewards/rejected": -9.922876358032227, + "step": 1159 + }, + { + "epoch": 0.10598446779351302, + "grad_norm": 3.796875, + "kl": 0.0, + "learning_rate": 9.734197922635174e-06, + "logits/chosen": 206704554.66666666, + "logits/rejected": 359758028.8, + "logps/chosen": -222.8541056315104, + "logps/rejected": -508.6265625, + "loss": 0.0129, + "rewards/chosen": 3.7258790334065757, + "rewards/margins": 12.912240727742514, + "rewards/rejected": -9.186361694335938, + "step": 1160 + }, + { + "epoch": 0.10607583371402467, + "grad_norm": 8.3125, + "kl": 0.0, + "learning_rate": 9.733735177955219e-06, + "logits/chosen": 469899776.0, + "logits/rejected": 470133312.0, + "logps/chosen": -296.24916294642856, + "logps/rejected": -472.59942626953125, + "loss": 0.0619, + "rewards/chosen": 2.9737913949148997, + "rewards/margins": 11.566796575273786, + "rewards/rejected": -8.593005180358887, + "step": 1161 + }, + { + "epoch": 0.10616719963453632, + "grad_norm": 15.5625, + "kl": 0.0, + "learning_rate": 9.733272041839225e-06, + "logits/chosen": 560805683.2, + "logits/rejected": 378480554.6666667, + "logps/chosen": -166.3669677734375, + "logps/rejected": -417.6139322916667, + "loss": 0.1417, + "rewards/chosen": 2.305330276489258, + "rewards/margins": 9.167683283487957, + "rewards/rejected": -6.862353006998698, + "step": 1162 + }, + { + "epoch": 0.10625856555504797, + "grad_norm": 9.125, + "kl": 0.0, + "learning_rate": 9.73280851432549e-06, + "logits/chosen": 299132192.0, + "logits/rejected": 444046720.0, + "logps/chosen": -189.03866577148438, + "logps/rejected": -592.0411987304688, + "loss": 0.0906, + "rewards/chosen": 2.395047664642334, + "rewards/margins": 11.397874355316162, + "rewards/rejected": -9.002826690673828, + "step": 1163 + }, + { + "epoch": 0.10634993147555961, + "grad_norm": 18.125, + "kl": 0.0, + "learning_rate": 9.732344595452341e-06, + "logits/chosen": 521228492.8, + "logits/rejected": 777520981.3333334, + "logps/chosen": -166.1069580078125, + "logps/rejected": -648.9193929036459, + "loss": 0.1084, + "rewards/chosen": 2.735774803161621, + "rewards/margins": 5.983850924173991, + "rewards/rejected": -3.2480761210123696, + "step": 1164 + }, + { + "epoch": 0.10644129739607126, + "grad_norm": 1.09375, + "kl": 0.0, + "learning_rate": 9.731880285258138e-06, + "logits/chosen": 307872768.0, + "logits/rejected": 562651428.5714285, + "logps/chosen": -412.42236328125, + "logps/rejected": -547.5943080357143, + "loss": 0.0042, + "rewards/chosen": 4.660910129547119, + "rewards/margins": 10.978401388440814, + "rewards/rejected": -6.3174912588936945, + "step": 1165 + }, + { + "epoch": 0.10653266331658291, + "grad_norm": 11.6875, + "kl": 0.0, + "learning_rate": 9.731415583781278e-06, + "logits/chosen": 619257536.0, + "logits/rejected": 241462608.0, + "logps/chosen": -400.2239990234375, + "logps/rejected": -378.92333984375, + "loss": 0.103, + "rewards/chosen": 2.0033836364746094, + "rewards/margins": 8.584566116333008, + "rewards/rejected": -6.581182479858398, + "step": 1166 + }, + { + "epoch": 0.10662402923709456, + "grad_norm": 11.0625, + "kl": 0.0, + "learning_rate": 9.730950491060187e-06, + "logits/chosen": 543210057.1428572, + "logits/rejected": 652277120.0, + "logps/chosen": -203.27305385044642, + "logps/rejected": -791.6688842773438, + "loss": 0.0842, + "rewards/chosen": 2.7314932686941966, + "rewards/margins": 12.210381780351911, + "rewards/rejected": -9.478888511657715, + "step": 1167 + }, + { + "epoch": 0.10671539515760621, + "grad_norm": 10.0625, + "kl": 0.0, + "learning_rate": 9.730485007133325e-06, + "logits/chosen": 813277269.3333334, + "logits/rejected": 515322777.6, + "logps/chosen": -551.3179931640625, + "logps/rejected": -520.280859375, + "loss": 0.0458, + "rewards/chosen": 2.928800900777181, + "rewards/margins": 9.41223824818929, + "rewards/rejected": -6.483437347412109, + "step": 1168 + }, + { + "epoch": 0.10680676107811786, + "grad_norm": 5.90625, + "kl": 0.0, + "learning_rate": 9.73001913203918e-06, + "logits/chosen": 401211622.4, + "logits/rejected": 469722794.6666667, + "logps/chosen": -254.92333984375, + "logps/rejected": -521.53125, + "loss": 0.0346, + "rewards/chosen": 3.3588302612304686, + "rewards/margins": 10.961779403686524, + "rewards/rejected": -7.602949142456055, + "step": 1169 + }, + { + "epoch": 0.10689812699862951, + "grad_norm": 4.65625, + "kl": 0.0, + "learning_rate": 9.729552865816278e-06, + "logits/chosen": 461733632.0, + "logits/rejected": 839627622.4, + "logps/chosen": -318.647216796875, + "logps/rejected": -478.3818359375, + "loss": 0.0195, + "rewards/chosen": 3.038748105367025, + "rewards/margins": 11.345906003316244, + "rewards/rejected": -8.307157897949219, + "step": 1170 + }, + { + "epoch": 0.10698949291914116, + "grad_norm": 3.640625, + "kl": 0.0, + "learning_rate": 9.729086208503174e-06, + "logits/chosen": 739932544.0, + "logits/rejected": 523515538.28571427, + "logps/chosen": -174.70040893554688, + "logps/rejected": -376.6595982142857, + "loss": 0.0149, + "rewards/chosen": 2.341146945953369, + "rewards/margins": 9.47951364517212, + "rewards/rejected": -7.13836669921875, + "step": 1171 + }, + { + "epoch": 0.1070808588396528, + "grad_norm": 3.15625, + "kl": 0.0, + "learning_rate": 9.728619160138457e-06, + "logits/chosen": 489114688.0, + "logits/rejected": 412648672.0, + "logps/chosen": -287.9810791015625, + "logps/rejected": -518.5776977539062, + "loss": 0.0179, + "rewards/chosen": 3.6319198608398438, + "rewards/margins": 10.745212078094482, + "rewards/rejected": -7.113292217254639, + "step": 1172 + }, + { + "epoch": 0.10717222476016446, + "grad_norm": 9.0625, + "kl": 0.0, + "learning_rate": 9.728151720760747e-06, + "logits/chosen": 711805248.0, + "logits/rejected": 605620224.0, + "logps/chosen": -224.6444091796875, + "logps/rejected": -413.192138671875, + "loss": 0.0436, + "rewards/chosen": 2.9433960914611816, + "rewards/margins": 9.005952040354412, + "rewards/rejected": -6.0625559488932295, + "step": 1173 + }, + { + "epoch": 0.1072635906806761, + "grad_norm": 8.6875, + "kl": 0.0, + "learning_rate": 9.727683890408697e-06, + "logits/chosen": 961416618.6666666, + "logits/rejected": 1101952512.0, + "logps/chosen": -457.5029703776042, + "logps/rejected": -625.5431640625, + "loss": 0.0425, + "rewards/chosen": 2.5665915807088218, + "rewards/margins": 10.786473019917807, + "rewards/rejected": -8.219881439208985, + "step": 1174 + }, + { + "epoch": 0.10735495660118775, + "grad_norm": 6.21875, + "kl": 0.0, + "learning_rate": 9.727215669120992e-06, + "logits/chosen": 337349205.3333333, + "logits/rejected": 301292851.2, + "logps/chosen": -164.80293782552084, + "logps/rejected": -672.0171875, + "loss": 0.0351, + "rewards/chosen": 2.5895891189575195, + "rewards/margins": 11.961961555480958, + "rewards/rejected": -9.372372436523438, + "step": 1175 + }, + { + "epoch": 0.1074463225216994, + "grad_norm": 4.0625, + "kl": 0.0, + "learning_rate": 9.72674705693635e-06, + "logits/chosen": 428407603.2, + "logits/rejected": 329525162.6666667, + "logps/chosen": -339.5560546875, + "logps/rejected": -418.74853515625, + "loss": 0.0234, + "rewards/chosen": 3.517115020751953, + "rewards/margins": 10.384930038452149, + "rewards/rejected": -6.867815017700195, + "step": 1176 + }, + { + "epoch": 0.10753768844221105, + "grad_norm": 8.9375, + "kl": 0.0, + "learning_rate": 9.72627805389352e-06, + "logits/chosen": 613339136.0, + "logits/rejected": 327390912.0, + "logps/chosen": -395.10369873046875, + "logps/rejected": -327.021240234375, + "loss": 0.0529, + "rewards/chosen": 2.4787135124206543, + "rewards/margins": 8.963552474975586, + "rewards/rejected": -6.484838962554932, + "step": 1177 + }, + { + "epoch": 0.1076290543627227, + "grad_norm": 6.0625, + "kl": 0.0, + "learning_rate": 9.725808660031286e-06, + "logits/chosen": 615130112.0, + "logits/rejected": 362914048.0, + "logps/chosen": -344.86468505859375, + "logps/rejected": -411.4871826171875, + "loss": 0.0815, + "rewards/chosen": 2.4425363540649414, + "rewards/margins": 9.9105863571167, + "rewards/rejected": -7.468050003051758, + "step": 1178 + }, + { + "epoch": 0.10772042028323435, + "grad_norm": 7.8125, + "kl": 0.0, + "learning_rate": 9.72533887538846e-06, + "logits/chosen": 642688320.0, + "logits/rejected": 326160000.0, + "logps/chosen": -350.8626708984375, + "logps/rejected": -342.415283203125, + "loss": 0.0487, + "rewards/chosen": 2.45534610748291, + "rewards/margins": 9.872183322906494, + "rewards/rejected": -7.416837215423584, + "step": 1179 + }, + { + "epoch": 0.107811786203746, + "grad_norm": 11.9375, + "kl": 0.0, + "learning_rate": 9.72486870000389e-06, + "logits/chosen": 724576256.0, + "logits/rejected": 683830656.0, + "logps/chosen": -396.7650669642857, + "logps/rejected": -642.4552001953125, + "loss": 0.0916, + "rewards/chosen": 2.4551756722586497, + "rewards/margins": 10.383117471422468, + "rewards/rejected": -7.927941799163818, + "step": 1180 + }, + { + "epoch": 0.10790315212425765, + "grad_norm": 6.125, + "kl": 0.0, + "learning_rate": 9.724398133916457e-06, + "logits/chosen": 437446720.0, + "logits/rejected": 656078912.0, + "logps/chosen": -380.02783203125, + "logps/rejected": -501.09942626953125, + "loss": 0.0285, + "rewards/chosen": 3.736682176589966, + "rewards/margins": 10.030592203140259, + "rewards/rejected": -6.293910026550293, + "step": 1181 + }, + { + "epoch": 0.1079945180447693, + "grad_norm": 3.890625, + "kl": 0.0, + "learning_rate": 9.723927177165071e-06, + "logits/chosen": 625756288.0, + "logits/rejected": 494328768.0, + "logps/chosen": -371.18927001953125, + "logps/rejected": -680.159423828125, + "loss": 0.0285, + "rewards/chosen": 2.831791400909424, + "rewards/margins": 12.855968952178955, + "rewards/rejected": -10.024177551269531, + "step": 1182 + }, + { + "epoch": 0.10808588396528095, + "grad_norm": 4.90625, + "kl": 0.0, + "learning_rate": 9.723455829788676e-06, + "logits/chosen": 788326997.3333334, + "logits/rejected": 529531392.0, + "logps/chosen": -585.7892659505209, + "logps/rejected": -313.4176940917969, + "loss": 0.0344, + "rewards/chosen": 3.4412047068277993, + "rewards/margins": 9.975857416788736, + "rewards/rejected": -6.5346527099609375, + "step": 1183 + }, + { + "epoch": 0.1081772498857926, + "grad_norm": 22.375, + "kl": 0.0, + "learning_rate": 9.722984091826245e-06, + "logits/chosen": 417847488.0, + "logits/rejected": 474562944.0, + "logps/chosen": -383.59942626953125, + "logps/rejected": -458.61822509765625, + "loss": 0.0758, + "rewards/chosen": 2.6948580741882324, + "rewards/margins": 11.572186946868896, + "rewards/rejected": -8.877328872680664, + "step": 1184 + }, + { + "epoch": 0.10826861580630424, + "grad_norm": 20.75, + "kl": 0.0, + "learning_rate": 9.72251196331679e-06, + "logits/chosen": 497823590.4, + "logits/rejected": 286628288.0, + "logps/chosen": -225.997705078125, + "logps/rejected": -427.7752278645833, + "loss": 0.1397, + "rewards/chosen": 2.608821678161621, + "rewards/margins": 9.589212862650554, + "rewards/rejected": -6.980391184488933, + "step": 1185 + }, + { + "epoch": 0.1083599817268159, + "grad_norm": 3.234375, + "kl": 0.0, + "learning_rate": 9.722039444299351e-06, + "logits/chosen": 464337888.0, + "logits/rejected": 661243712.0, + "logps/chosen": -328.91558837890625, + "logps/rejected": -705.265625, + "loss": 0.0232, + "rewards/chosen": 3.346224308013916, + "rewards/margins": 10.712319374084473, + "rewards/rejected": -7.366095066070557, + "step": 1186 + }, + { + "epoch": 0.10845134764732754, + "grad_norm": 12.375, + "kl": 0.0, + "learning_rate": 9.721566534813002e-06, + "logits/chosen": 308925482.6666667, + "logits/rejected": 488062310.4, + "logps/chosen": -255.07735188802084, + "logps/rejected": -768.886376953125, + "loss": 0.0529, + "rewards/chosen": 3.018705368041992, + "rewards/margins": 16.433076095581054, + "rewards/rejected": -13.414370727539062, + "step": 1187 + }, + { + "epoch": 0.10854271356783919, + "grad_norm": 7.4375, + "kl": 0.0, + "learning_rate": 9.721093234896846e-06, + "logits/chosen": 693058858.6666666, + "logits/rejected": 652100416.0, + "logps/chosen": -490.3080647786458, + "logps/rejected": -223.7926788330078, + "loss": 0.0575, + "rewards/chosen": 2.7266464233398438, + "rewards/margins": 7.987635612487793, + "rewards/rejected": -5.260989189147949, + "step": 1188 + }, + { + "epoch": 0.10863407948835084, + "grad_norm": 8.9375, + "kl": 0.0, + "learning_rate": 9.720619544590021e-06, + "logits/chosen": 811535616.0, + "logits/rejected": 808288384.0, + "logps/chosen": -428.7817077636719, + "logps/rejected": -613.6603393554688, + "loss": 0.0511, + "rewards/chosen": 3.4124932289123535, + "rewards/margins": 8.071181774139404, + "rewards/rejected": -4.658688545227051, + "step": 1189 + }, + { + "epoch": 0.10872544540886249, + "grad_norm": 10.375, + "kl": 0.0, + "learning_rate": 9.720145463931697e-06, + "logits/chosen": 700292480.0, + "logits/rejected": 720882240.0, + "logps/chosen": -393.24462890625, + "logps/rejected": -212.1184844970703, + "loss": 0.0841, + "rewards/chosen": 2.453776995340983, + "rewards/margins": 7.821761767069498, + "rewards/rejected": -5.367984771728516, + "step": 1190 + }, + { + "epoch": 0.10881681132937414, + "grad_norm": 9.75, + "kl": 0.054668426513671875, + "learning_rate": 9.719670992961077e-06, + "logits/chosen": 479065685.3333333, + "logits/rejected": 518508192.0, + "logps/chosen": -313.20603434244794, + "logps/rejected": -451.0679626464844, + "loss": 0.0722, + "rewards/chosen": 2.5353004137674966, + "rewards/margins": 9.606128851572672, + "rewards/rejected": -7.070828437805176, + "step": 1191 + }, + { + "epoch": 0.10890817724988579, + "grad_norm": 4.875, + "kl": 0.0, + "learning_rate": 9.719196131717395e-06, + "logits/chosen": 452261034.6666667, + "logits/rejected": 629358028.8, + "logps/chosen": -321.61480712890625, + "logps/rejected": -512.9822265625, + "loss": 0.0198, + "rewards/chosen": 3.722910245259603, + "rewards/margins": 9.736063702901205, + "rewards/rejected": -6.013153457641602, + "step": 1192 + }, + { + "epoch": 0.10899954317039744, + "grad_norm": 17.375, + "kl": 0.0, + "learning_rate": 9.718720880239915e-06, + "logits/chosen": 621822336.0, + "logits/rejected": 650274752.0, + "logps/chosen": -336.26678466796875, + "logps/rejected": -291.2235107421875, + "loss": 0.1028, + "rewards/chosen": 2.619206428527832, + "rewards/margins": 6.318992614746094, + "rewards/rejected": -3.6997861862182617, + "step": 1193 + }, + { + "epoch": 0.10909090909090909, + "grad_norm": 3.515625, + "kl": 0.0, + "learning_rate": 9.718245238567939e-06, + "logits/chosen": 1609583616.0, + "logits/rejected": 770906624.0, + "logps/chosen": -469.174560546875, + "logps/rejected": -400.297998046875, + "loss": 0.0211, + "rewards/chosen": 3.444289207458496, + "rewards/margins": 11.959638023376465, + "rewards/rejected": -8.515348815917969, + "step": 1194 + }, + { + "epoch": 0.10918227501142073, + "grad_norm": 7.1875, + "kl": 0.0, + "learning_rate": 9.717769206740799e-06, + "logits/chosen": 781851200.0, + "logits/rejected": 442017952.0, + "logps/chosen": -219.50070190429688, + "logps/rejected": -367.54766845703125, + "loss": 0.0764, + "rewards/chosen": 2.337038040161133, + "rewards/margins": 8.712696075439453, + "rewards/rejected": -6.37565803527832, + "step": 1195 + }, + { + "epoch": 0.10927364093193238, + "grad_norm": 3.078125, + "kl": 0.0, + "learning_rate": 9.717292784797854e-06, + "logits/chosen": 557845546.6666666, + "logits/rejected": 465278310.4, + "logps/chosen": -218.4091796875, + "logps/rejected": -537.44912109375, + "loss": 0.0192, + "rewards/chosen": 3.0739965438842773, + "rewards/margins": 11.0624418258667, + "rewards/rejected": -7.988445281982422, + "step": 1196 + }, + { + "epoch": 0.10936500685244403, + "grad_norm": 8.5625, + "kl": 0.0, + "learning_rate": 9.716815972778504e-06, + "logits/chosen": 487038890.6666667, + "logits/rejected": 430647091.2, + "logps/chosen": -288.17299397786456, + "logps/rejected": -398.2756103515625, + "loss": 0.054, + "rewards/chosen": 2.4300966262817383, + "rewards/margins": 10.081823921203613, + "rewards/rejected": -7.651727294921875, + "step": 1197 + }, + { + "epoch": 0.10945637277295568, + "grad_norm": 2.53125, + "kl": 0.0, + "learning_rate": 9.716338770722176e-06, + "logits/chosen": 896777920.0, + "logits/rejected": 427611520.0, + "logps/chosen": -287.3741149902344, + "logps/rejected": -287.3655090332031, + "loss": 0.0174, + "rewards/chosen": 3.4516899585723877, + "rewards/margins": 10.590842485427856, + "rewards/rejected": -7.139152526855469, + "step": 1198 + }, + { + "epoch": 0.10954773869346733, + "grad_norm": 4.75, + "kl": 0.0, + "learning_rate": 9.715861178668328e-06, + "logits/chosen": 596251392.0, + "logits/rejected": 323868608.0, + "logps/chosen": -324.55743408203125, + "logps/rejected": -472.2451985677083, + "loss": 0.0237, + "rewards/chosen": 2.730799436569214, + "rewards/margins": 10.703699191411335, + "rewards/rejected": -7.972899754842122, + "step": 1199 + }, + { + "epoch": 0.10963910461397898, + "grad_norm": 6.53125, + "kl": 0.0, + "learning_rate": 9.715383196656455e-06, + "logits/chosen": 482991923.2, + "logits/rejected": 612315605.3333334, + "logps/chosen": -299.2643310546875, + "logps/rejected": -517.2919921875, + "loss": 0.0367, + "rewards/chosen": 3.4387592315673827, + "rewards/margins": 10.239951833089192, + "rewards/rejected": -6.80119260152181, + "step": 1200 + }, + { + "epoch": 0.10973047053449063, + "grad_norm": 6.6875, + "kl": 0.0, + "learning_rate": 9.71490482472608e-06, + "logits/chosen": 642456064.0, + "logits/rejected": 545856512.0, + "logps/chosen": -360.71661376953125, + "logps/rejected": -528.4590454101562, + "loss": 0.0382, + "rewards/chosen": 3.3870644569396973, + "rewards/margins": 10.427548885345459, + "rewards/rejected": -7.040484428405762, + "step": 1201 + }, + { + "epoch": 0.10982183645500228, + "grad_norm": 3.390625, + "kl": 0.0, + "learning_rate": 9.714426062916761e-06, + "logits/chosen": 817595955.2, + "logits/rejected": 879727616.0, + "logps/chosen": -399.310986328125, + "logps/rejected": -556.2825520833334, + "loss": 0.0271, + "rewards/chosen": 3.4642425537109376, + "rewards/margins": 11.662066396077474, + "rewards/rejected": -8.197823842366537, + "step": 1202 + }, + { + "epoch": 0.10991320237551393, + "grad_norm": 2.671875, + "kl": 0.0, + "learning_rate": 9.713946911268087e-06, + "logits/chosen": 408940416.0, + "logits/rejected": 500446549.3333333, + "logps/chosen": -325.80487060546875, + "logps/rejected": -753.9512532552084, + "loss": 0.0124, + "rewards/chosen": 3.0356264114379883, + "rewards/margins": 13.018098513285318, + "rewards/rejected": -9.98247210184733, + "step": 1203 + }, + { + "epoch": 0.11000456829602558, + "grad_norm": 6.9375, + "kl": 0.0, + "learning_rate": 9.713467369819678e-06, + "logits/chosen": 590278656.0, + "logits/rejected": 530892885.3333333, + "logps/chosen": -497.349072265625, + "logps/rejected": -325.1663411458333, + "loss": 0.0311, + "rewards/chosen": 3.0914178848266602, + "rewards/margins": 9.776576296488445, + "rewards/rejected": -6.685158411661784, + "step": 1204 + }, + { + "epoch": 0.11009593421653723, + "grad_norm": 2.546875, + "kl": 0.0, + "learning_rate": 9.71298743861119e-06, + "logits/chosen": 498141696.0, + "logits/rejected": 797645824.0, + "logps/chosen": -331.4078369140625, + "logps/rejected": -346.859130859375, + "loss": 0.0118, + "rewards/chosen": 4.760660552978516, + "rewards/margins": 11.470850372314453, + "rewards/rejected": -6.7101898193359375, + "step": 1205 + }, + { + "epoch": 0.11018730013704887, + "grad_norm": 7.65625, + "kl": 0.0, + "learning_rate": 9.712507117682305e-06, + "logits/chosen": 477291733.3333333, + "logits/rejected": 601631334.4, + "logps/chosen": -292.79697672526044, + "logps/rejected": -664.43662109375, + "loss": 0.0486, + "rewards/chosen": 2.1357784271240234, + "rewards/margins": 10.944723892211915, + "rewards/rejected": -8.808945465087891, + "step": 1206 + }, + { + "epoch": 0.11027866605756052, + "grad_norm": 10.3125, + "kl": 0.0, + "learning_rate": 9.712026407072748e-06, + "logits/chosen": 333756053.3333333, + "logits/rejected": 520462284.8, + "logps/chosen": -342.555419921875, + "logps/rejected": -421.371044921875, + "loss": 0.0695, + "rewards/chosen": 1.8647255897521973, + "rewards/margins": 10.394478702545166, + "rewards/rejected": -8.529753112792969, + "step": 1207 + }, + { + "epoch": 0.11037003197807217, + "grad_norm": 8.625, + "kl": 0.0, + "learning_rate": 9.711545306822264e-06, + "logits/chosen": 419813024.0, + "logits/rejected": 580277930.6666666, + "logps/chosen": -359.88543701171875, + "logps/rejected": -599.1403401692709, + "loss": 0.0326, + "rewards/chosen": 2.8247880935668945, + "rewards/margins": 12.872597694396973, + "rewards/rejected": -10.047809600830078, + "step": 1208 + }, + { + "epoch": 0.11046139789858382, + "grad_norm": 0.6953125, + "kl": 0.0, + "learning_rate": 9.711063816970635e-06, + "logits/chosen": 236857568.0, + "logits/rejected": 489532854.85714287, + "logps/chosen": -228.89015197753906, + "logps/rejected": -529.6017020089286, + "loss": 0.0033, + "rewards/chosen": 4.4684953689575195, + "rewards/margins": 14.329076903206962, + "rewards/rejected": -9.860581534249443, + "step": 1209 + }, + { + "epoch": 0.11055276381909548, + "grad_norm": 5.8125, + "kl": 0.0, + "learning_rate": 9.710581937557677e-06, + "logits/chosen": 772752384.0, + "logits/rejected": 400900896.0, + "logps/chosen": -387.3625793457031, + "logps/rejected": -459.4207458496094, + "loss": 0.0344, + "rewards/chosen": 3.002690553665161, + "rewards/margins": 11.716191530227661, + "rewards/rejected": -8.7135009765625, + "step": 1210 + }, + { + "epoch": 0.11064412973960713, + "grad_norm": 9.5625, + "kl": 0.0, + "learning_rate": 9.710099668623237e-06, + "logits/chosen": 767364812.8, + "logits/rejected": 189043285.33333334, + "logps/chosen": -413.731787109375, + "logps/rejected": -226.161376953125, + "loss": 0.0515, + "rewards/chosen": 2.6788589477539064, + "rewards/margins": 10.409398905436198, + "rewards/rejected": -7.730539957682292, + "step": 1211 + }, + { + "epoch": 0.11073549566011878, + "grad_norm": 13.25, + "kl": 0.0, + "learning_rate": 9.709617010207195e-06, + "logits/chosen": 596920320.0, + "logits/rejected": 766458432.0, + "logps/chosen": -268.03692626953125, + "logps/rejected": -506.63372802734375, + "loss": 0.071, + "rewards/chosen": 2.498223066329956, + "rewards/margins": 11.211442708969116, + "rewards/rejected": -8.71321964263916, + "step": 1212 + }, + { + "epoch": 0.11082686158063043, + "grad_norm": 8.0625, + "kl": 0.0, + "learning_rate": 9.709133962349463e-06, + "logits/chosen": 479579520.0, + "logits/rejected": 528907232.0, + "logps/chosen": -466.0777587890625, + "logps/rejected": -341.5649719238281, + "loss": 0.0454, + "rewards/chosen": 2.4053421020507812, + "rewards/margins": 10.99936294555664, + "rewards/rejected": -8.59402084350586, + "step": 1213 + }, + { + "epoch": 0.11091822750114208, + "grad_norm": 1.296875, + "kl": 0.0, + "learning_rate": 9.708650525089982e-06, + "logits/chosen": 591564586.6666666, + "logits/rejected": 1076049305.6, + "logps/chosen": -332.4250895182292, + "logps/rejected": -575.185546875, + "loss": 0.0079, + "rewards/chosen": 4.121432940165202, + "rewards/margins": 13.07000929514567, + "rewards/rejected": -8.948576354980469, + "step": 1214 + }, + { + "epoch": 0.11100959342165373, + "grad_norm": 2.3125, + "kl": 0.0, + "learning_rate": 9.70816669846873e-06, + "logits/chosen": 681989568.0, + "logits/rejected": 349922389.3333333, + "logps/chosen": -420.50885009765625, + "logps/rejected": -533.36083984375, + "loss": 0.0123, + "rewards/chosen": 3.0153274536132812, + "rewards/margins": 14.259966532389322, + "rewards/rejected": -11.244639078776041, + "step": 1215 + }, + { + "epoch": 0.11110095934216538, + "grad_norm": 5.15625, + "kl": 0.0, + "learning_rate": 9.707682482525714e-06, + "logits/chosen": 519210528.0, + "logits/rejected": 474243680.0, + "logps/chosen": -336.6400146484375, + "logps/rejected": -470.6695251464844, + "loss": 0.0338, + "rewards/chosen": 3.170631170272827, + "rewards/margins": 10.778963327407837, + "rewards/rejected": -7.60833215713501, + "step": 1216 + }, + { + "epoch": 0.11119232526267703, + "grad_norm": 21.875, + "kl": 0.0, + "learning_rate": 9.707197877300974e-06, + "logits/chosen": 555771264.0, + "logits/rejected": 389712224.0, + "logps/chosen": -215.0843963623047, + "logps/rejected": -414.3562316894531, + "loss": 0.0837, + "rewards/chosen": 3.156306505203247, + "rewards/margins": 9.343340635299683, + "rewards/rejected": -6.1870341300964355, + "step": 1217 + }, + { + "epoch": 0.11128369118318868, + "grad_norm": 7.78125, + "kl": 0.0, + "learning_rate": 9.706712882834583e-06, + "logits/chosen": 656877568.0, + "logits/rejected": 695331712.0, + "logps/chosen": -265.2143249511719, + "logps/rejected": -346.088134765625, + "loss": 0.0482, + "rewards/chosen": 3.7654221057891846, + "rewards/margins": 10.557748556137085, + "rewards/rejected": -6.7923264503479, + "step": 1218 + }, + { + "epoch": 0.11137505710370033, + "grad_norm": 1.8828125, + "kl": 0.0, + "learning_rate": 9.706227499166646e-06, + "logits/chosen": 397508800.0, + "logits/rejected": 503368917.3333333, + "logps/chosen": -306.8597412109375, + "logps/rejected": -342.109619140625, + "loss": 0.0129, + "rewards/chosen": 3.613194465637207, + "rewards/margins": 11.674153327941895, + "rewards/rejected": -8.060958862304688, + "step": 1219 + }, + { + "epoch": 0.11146642302421197, + "grad_norm": 4.84375, + "kl": 0.0, + "learning_rate": 9.7057417263373e-06, + "logits/chosen": 585245132.8, + "logits/rejected": 461862997.3333333, + "logps/chosen": -564.22255859375, + "logps/rejected": -317.5665690104167, + "loss": 0.0202, + "rewards/chosen": 3.903092956542969, + "rewards/margins": 10.918992614746093, + "rewards/rejected": -7.015899658203125, + "step": 1220 + }, + { + "epoch": 0.11155778894472362, + "grad_norm": 4.4375, + "kl": 0.0, + "learning_rate": 9.70525556438671e-06, + "logits/chosen": 631956684.8, + "logits/rejected": 399084672.0, + "logps/chosen": -433.97705078125, + "logps/rejected": -563.553955078125, + "loss": 0.0265, + "rewards/chosen": 3.294126510620117, + "rewards/margins": 11.723251978556315, + "rewards/rejected": -8.429125467936197, + "step": 1221 + }, + { + "epoch": 0.11164915486523527, + "grad_norm": 1.5625, + "kl": 0.0, + "learning_rate": 9.704769013355081e-06, + "logits/chosen": 510306976.0, + "logits/rejected": 540711594.6666666, + "logps/chosen": -373.8538513183594, + "logps/rejected": -551.6001790364584, + "loss": 0.0051, + "rewards/chosen": 4.080435752868652, + "rewards/margins": 14.888634045918783, + "rewards/rejected": -10.80819829305013, + "step": 1222 + }, + { + "epoch": 0.11174052078574692, + "grad_norm": 7.6875, + "kl": 0.0, + "learning_rate": 9.704282073282645e-06, + "logits/chosen": 846958165.3333334, + "logits/rejected": 906527360.0, + "logps/chosen": -384.38232421875, + "logps/rejected": -605.2852172851562, + "loss": 0.0788, + "rewards/chosen": 2.529871622721354, + "rewards/margins": 11.066880861918131, + "rewards/rejected": -8.537009239196777, + "step": 1223 + }, + { + "epoch": 0.11183188670625857, + "grad_norm": 10.0625, + "kl": 0.0, + "learning_rate": 9.703794744209668e-06, + "logits/chosen": 383649237.3333333, + "logits/rejected": 262675488.0, + "logps/chosen": -233.7120157877604, + "logps/rejected": -195.77545166015625, + "loss": 0.0521, + "rewards/chosen": 2.9562673568725586, + "rewards/margins": 9.023423194885254, + "rewards/rejected": -6.067155838012695, + "step": 1224 + }, + { + "epoch": 0.11192325262677022, + "grad_norm": 5.6875, + "kl": 0.0, + "learning_rate": 9.703307026176447e-06, + "logits/chosen": 444243148.8, + "logits/rejected": 535317845.3333333, + "logps/chosen": -352.137548828125, + "logps/rejected": -534.3915201822916, + "loss": 0.0296, + "rewards/chosen": 3.344816970825195, + "rewards/margins": 12.105929183959962, + "rewards/rejected": -8.761112213134766, + "step": 1225 + }, + { + "epoch": 0.11201461854728187, + "grad_norm": 10.3125, + "kl": 0.0, + "learning_rate": 9.702818919223313e-06, + "logits/chosen": 628790101.3333334, + "logits/rejected": 1288028160.0, + "logps/chosen": -318.39369710286456, + "logps/rejected": -478.7939453125, + "loss": 0.0825, + "rewards/chosen": 3.104656537373861, + "rewards/margins": 7.534748013814291, + "rewards/rejected": -4.430091476440429, + "step": 1226 + }, + { + "epoch": 0.11210598446779352, + "grad_norm": 4.75, + "kl": 0.0, + "learning_rate": 9.702330423390627e-06, + "logits/chosen": 543352064.0, + "logits/rejected": 424726101.3333333, + "logps/chosen": -225.6080810546875, + "logps/rejected": -372.7867431640625, + "loss": 0.0556, + "rewards/chosen": 2.871958351135254, + "rewards/margins": 9.238020515441894, + "rewards/rejected": -6.366062164306641, + "step": 1227 + }, + { + "epoch": 0.11219735038830517, + "grad_norm": 1.9375, + "kl": 0.0, + "learning_rate": 9.701841538718782e-06, + "logits/chosen": 285266176.0, + "logits/rejected": 549560704.0, + "logps/chosen": -176.71188354492188, + "logps/rejected": -610.36474609375, + "loss": 0.0122, + "rewards/chosen": 4.213116645812988, + "rewards/margins": 13.004733085632324, + "rewards/rejected": -8.791616439819336, + "step": 1228 + }, + { + "epoch": 0.11228871630881682, + "grad_norm": 8.0, + "kl": 0.0, + "learning_rate": 9.701352265248205e-06, + "logits/chosen": 735696320.0, + "logits/rejected": 567966720.0, + "logps/chosen": -404.4276428222656, + "logps/rejected": -771.6691284179688, + "loss": 0.0458, + "rewards/chosen": 2.5910048484802246, + "rewards/margins": 9.416880130767822, + "rewards/rejected": -6.825875282287598, + "step": 1229 + }, + { + "epoch": 0.11238008222932847, + "grad_norm": 3.015625, + "kl": 0.0, + "learning_rate": 9.700862603019357e-06, + "logits/chosen": 749371136.0, + "logits/rejected": 634336554.6666666, + "logps/chosen": -428.3961181640625, + "logps/rejected": -490.0984700520833, + "loss": 0.0132, + "rewards/chosen": 3.4792587757110596, + "rewards/margins": 9.46192479133606, + "rewards/rejected": -5.982666015625, + "step": 1230 + }, + { + "epoch": 0.11247144814984011, + "grad_norm": 3.53125, + "kl": 0.0, + "learning_rate": 9.700372552072724e-06, + "logits/chosen": 436082880.0, + "logits/rejected": 412425696.0, + "logps/chosen": -255.49957275390625, + "logps/rejected": -543.4600830078125, + "loss": 0.02, + "rewards/chosen": 3.601818561553955, + "rewards/margins": 12.863733768463135, + "rewards/rejected": -9.26191520690918, + "step": 1231 + }, + { + "epoch": 0.11256281407035176, + "grad_norm": 9.6875, + "kl": 0.0, + "learning_rate": 9.69988211244883e-06, + "logits/chosen": 637371349.3333334, + "logits/rejected": 460181568.0, + "logps/chosen": -412.2412923177083, + "logps/rejected": -523.9225463867188, + "loss": 0.0622, + "rewards/chosen": 2.609377066294352, + "rewards/margins": 12.762638250986734, + "rewards/rejected": -10.153261184692383, + "step": 1232 + }, + { + "epoch": 0.11265417999086341, + "grad_norm": 13.9375, + "kl": 0.0, + "learning_rate": 9.699391284188234e-06, + "logits/chosen": 620546150.4, + "logits/rejected": 749037141.3333334, + "logps/chosen": -420.19912109375, + "logps/rejected": -253.1360880533854, + "loss": 0.1054, + "rewards/chosen": 2.5699920654296875, + "rewards/margins": 6.776013056437175, + "rewards/rejected": -4.206020991007487, + "step": 1233 + }, + { + "epoch": 0.11274554591137506, + "grad_norm": 4.625, + "kl": 0.0, + "learning_rate": 9.698900067331516e-06, + "logits/chosen": 324447552.0, + "logits/rejected": 696580352.0, + "logps/chosen": -240.59066772460938, + "logps/rejected": -379.6151428222656, + "loss": 0.0234, + "rewards/chosen": 4.453287601470947, + "rewards/margins": 11.863281726837158, + "rewards/rejected": -7.409994125366211, + "step": 1234 + }, + { + "epoch": 0.11283691183188671, + "grad_norm": 11.4375, + "kl": 0.0, + "learning_rate": 9.698408461919301e-06, + "logits/chosen": 893483008.0, + "logits/rejected": 440494080.0, + "logps/chosen": -436.20037841796875, + "logps/rejected": -297.99009195963544, + "loss": 0.0686, + "rewards/chosen": 4.2751007080078125, + "rewards/margins": 9.998985290527344, + "rewards/rejected": -5.723884582519531, + "step": 1235 + }, + { + "epoch": 0.11292827775239836, + "grad_norm": 6.96875, + "kl": 0.0, + "learning_rate": 9.697916467992237e-06, + "logits/chosen": 709612501.3333334, + "logits/rejected": 1145216716.8, + "logps/chosen": -487.97216796875, + "logps/rejected": -515.92578125, + "loss": 0.0261, + "rewards/chosen": 3.631778081258138, + "rewards/margins": 9.538500340779622, + "rewards/rejected": -5.9067222595214846, + "step": 1236 + }, + { + "epoch": 0.11301964367291001, + "grad_norm": 9.75, + "kl": 0.0, + "learning_rate": 9.69742408559101e-06, + "logits/chosen": 471268966.4, + "logits/rejected": 210910208.0, + "logps/chosen": -239.651513671875, + "logps/rejected": -389.9884033203125, + "loss": 0.1705, + "rewards/chosen": 1.3994962692260742, + "rewards/margins": 7.445229403177898, + "rewards/rejected": -6.045733133951823, + "step": 1237 + }, + { + "epoch": 0.11311100959342166, + "grad_norm": 14.8125, + "kl": 0.0, + "learning_rate": 9.696931314756332e-06, + "logits/chosen": 224676501.33333334, + "logits/rejected": 770996889.6, + "logps/chosen": -466.5097249348958, + "logps/rejected": -451.729541015625, + "loss": 0.0408, + "rewards/chosen": 4.077086766560872, + "rewards/margins": 11.485289891560871, + "rewards/rejected": -7.408203125, + "step": 1238 + }, + { + "epoch": 0.1132023755139333, + "grad_norm": 3.40625, + "kl": 0.0, + "learning_rate": 9.69643815552895e-06, + "logits/chosen": 272403616.0, + "logits/rejected": 526191872.0, + "logps/chosen": -210.53448486328125, + "logps/rejected": -434.5023600260417, + "loss": 0.0171, + "rewards/chosen": 3.346968173980713, + "rewards/margins": 11.649878025054932, + "rewards/rejected": -8.302909851074219, + "step": 1239 + }, + { + "epoch": 0.11329374143444496, + "grad_norm": 1.5703125, + "kl": 0.0, + "learning_rate": 9.69594460794965e-06, + "logits/chosen": 611840768.0, + "logits/rejected": 532869461.3333333, + "logps/chosen": -319.45458984375, + "logps/rejected": -433.244873046875, + "loss": 0.0075, + "rewards/chosen": 3.6352813243865967, + "rewards/margins": 11.96268916130066, + "rewards/rejected": -8.327407836914062, + "step": 1240 + }, + { + "epoch": 0.1133851073549566, + "grad_norm": 20.875, + "kl": 0.0, + "learning_rate": 9.69545067205924e-06, + "logits/chosen": 139033664.0, + "logits/rejected": 384200277.3333333, + "logps/chosen": -195.87428283691406, + "logps/rejected": -469.450927734375, + "loss": 0.0427, + "rewards/chosen": 2.552290678024292, + "rewards/margins": 12.431151151657104, + "rewards/rejected": -9.878860473632812, + "step": 1241 + }, + { + "epoch": 0.11347647327546825, + "grad_norm": 4.28125, + "kl": 0.0, + "learning_rate": 9.69495634789856e-06, + "logits/chosen": 683228800.0, + "logits/rejected": 380604032.0, + "logps/chosen": -292.63006591796875, + "logps/rejected": -384.7559509277344, + "loss": 0.0236, + "rewards/chosen": 3.034670352935791, + "rewards/margins": 12.922674655914307, + "rewards/rejected": -9.888004302978516, + "step": 1242 + }, + { + "epoch": 0.1135678391959799, + "grad_norm": 8.875, + "kl": 0.0, + "learning_rate": 9.694461635508493e-06, + "logits/chosen": 512380736.0, + "logits/rejected": 540121173.3333334, + "logps/chosen": -353.76129150390625, + "logps/rejected": -563.5946451822916, + "loss": 0.031, + "rewards/chosen": 3.0533111095428467, + "rewards/margins": 11.772236585617065, + "rewards/rejected": -8.718925476074219, + "step": 1243 + }, + { + "epoch": 0.11365920511649155, + "grad_norm": 2.34375, + "kl": 0.0, + "learning_rate": 9.693966534929942e-06, + "logits/chosen": 631611136.0, + "logits/rejected": 410455398.4, + "logps/chosen": -319.0659586588542, + "logps/rejected": -503.45986328125, + "loss": 0.0129, + "rewards/chosen": 4.175467491149902, + "rewards/margins": 11.621776390075684, + "rewards/rejected": -7.446308898925781, + "step": 1244 + }, + { + "epoch": 0.1137505710370032, + "grad_norm": 6.0, + "kl": 0.0, + "learning_rate": 9.69347104620385e-06, + "logits/rejected": 677932416.0, + "logps/rejected": -387.774658203125, + "loss": 0.0095, + "rewards/rejected": -6.420156478881836, + "step": 1245 + }, + { + "epoch": 0.11384193695751485, + "grad_norm": 10.625, + "kl": 0.0, + "learning_rate": 9.692975169371189e-06, + "logits/chosen": 561467818.6666666, + "logits/rejected": 704137152.0, + "logps/chosen": -349.9595540364583, + "logps/rejected": -680.962158203125, + "loss": 0.1192, + "rewards/chosen": 2.9240875244140625, + "rewards/margins": 13.078191757202148, + "rewards/rejected": -10.154104232788086, + "step": 1246 + }, + { + "epoch": 0.1139333028780265, + "grad_norm": 5.0625, + "kl": 0.0, + "learning_rate": 9.692478904472963e-06, + "logits/chosen": 452960256.0, + "logits/rejected": 618699178.6666666, + "logps/chosen": -240.3518524169922, + "logps/rejected": -421.3401692708333, + "loss": 0.0253, + "rewards/chosen": 2.7029428482055664, + "rewards/margins": 11.129805564880371, + "rewards/rejected": -8.426862716674805, + "step": 1247 + }, + { + "epoch": 0.11402466879853815, + "grad_norm": 4.875, + "kl": 0.0, + "learning_rate": 9.691982251550207e-06, + "logits/chosen": 748230592.0, + "logits/rejected": 392864320.0, + "logps/chosen": -453.462158203125, + "logps/rejected": -555.6019287109375, + "loss": 0.027, + "rewards/chosen": 2.960745334625244, + "rewards/margins": 12.218332767486572, + "rewards/rejected": -9.257587432861328, + "step": 1248 + }, + { + "epoch": 0.1141160347190498, + "grad_norm": 4.34375, + "kl": 0.0, + "learning_rate": 9.691485210643991e-06, + "logits/chosen": 462449356.8, + "logits/rejected": 692733525.3333334, + "logps/chosen": -398.812744140625, + "logps/rejected": -505.5987955729167, + "loss": 0.0279, + "rewards/chosen": 3.2704193115234377, + "rewards/margins": 10.891412607828777, + "rewards/rejected": -7.620993296305339, + "step": 1249 + }, + { + "epoch": 0.11420740063956145, + "grad_norm": 10.25, + "kl": 0.0, + "learning_rate": 9.690987781795419e-06, + "logits/chosen": 441425365.3333333, + "logits/rejected": 379706521.6, + "logps/chosen": -330.1365152994792, + "logps/rejected": -422.387939453125, + "loss": 0.0443, + "rewards/chosen": 2.707365036010742, + "rewards/margins": 11.92818717956543, + "rewards/rejected": -9.220822143554688, + "step": 1250 + }, + { + "epoch": 0.1142987665600731, + "grad_norm": 3.53125, + "kl": 0.0, + "learning_rate": 9.690489965045616e-06, + "logits/chosen": 894872746.6666666, + "logits/rejected": 973878579.2, + "logps/chosen": -438.241943359375, + "logps/rejected": -327.6922119140625, + "loss": 0.0191, + "rewards/chosen": 3.198350270589193, + "rewards/margins": 10.471818288167318, + "rewards/rejected": -7.273468017578125, + "step": 1251 + }, + { + "epoch": 0.11439013248058474, + "grad_norm": 9.625, + "kl": 0.0, + "learning_rate": 9.689991760435754e-06, + "logits/chosen": 892076544.0, + "logits/rejected": 854240563.2, + "logps/chosen": -413.52392578125, + "logps/rejected": -413.113623046875, + "loss": 0.0423, + "rewards/chosen": 2.385251998901367, + "rewards/margins": 9.768368911743163, + "rewards/rejected": -7.383116912841797, + "step": 1252 + }, + { + "epoch": 0.1144814984010964, + "grad_norm": 6.28125, + "kl": 0.0, + "learning_rate": 9.689493168007025e-06, + "logits/chosen": 657169749.3333334, + "logits/rejected": 491631820.8, + "logps/chosen": -227.97074381510416, + "logps/rejected": -611.63369140625, + "loss": 0.0268, + "rewards/chosen": 2.884741465250651, + "rewards/margins": 14.627848307291666, + "rewards/rejected": -11.743106842041016, + "step": 1253 + }, + { + "epoch": 0.11457286432160804, + "grad_norm": 2.765625, + "kl": 0.0, + "learning_rate": 9.688994187800662e-06, + "logits/chosen": 404833728.0, + "logits/rejected": 308238016.0, + "logps/chosen": -328.6892395019531, + "logps/rejected": -422.7361145019531, + "loss": 0.0173, + "rewards/chosen": 3.413174629211426, + "rewards/margins": 14.380268096923828, + "rewards/rejected": -10.967093467712402, + "step": 1254 + }, + { + "epoch": 0.11466423024211969, + "grad_norm": 11.8125, + "kl": 0.0, + "learning_rate": 9.688494819857924e-06, + "logits/chosen": 351420864.0, + "logits/rejected": 369844416.0, + "logps/chosen": -413.80975341796875, + "logps/rejected": -413.49786376953125, + "loss": 0.0848, + "rewards/chosen": 2.702775478363037, + "rewards/margins": 8.462075233459473, + "rewards/rejected": -5.7592997550964355, + "step": 1255 + }, + { + "epoch": 0.11475559616263134, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 9.687995064220102e-06, + "logits/chosen": 580353572.5714285, + "logits/rejected": 307770112.0, + "logps/chosen": -326.36049107142856, + "logps/rejected": -378.95025634765625, + "loss": 0.0743, + "rewards/chosen": 2.6222664969308034, + "rewards/margins": 10.113917078290667, + "rewards/rejected": -7.491650581359863, + "step": 1256 + }, + { + "epoch": 0.11484696208314299, + "grad_norm": 4.875, + "kl": 0.0, + "learning_rate": 9.687494920928527e-06, + "logits/chosen": 369366080.0, + "logits/rejected": 809451264.0, + "logps/chosen": -164.44744873046875, + "logps/rejected": -534.1207682291666, + "loss": 0.1341, + "rewards/chosen": -0.4017888903617859, + "rewards/margins": 9.768850147724152, + "rewards/rejected": -10.170639038085938, + "step": 1257 + }, + { + "epoch": 0.11493832800365464, + "grad_norm": 15.3125, + "kl": 0.0, + "learning_rate": 9.68699439002455e-06, + "logits/chosen": 1460824234.6666667, + "logits/rejected": 557233766.4, + "logps/chosen": -536.152587890625, + "logps/rejected": -369.7216552734375, + "loss": 0.0499, + "rewards/chosen": 2.397370974222819, + "rewards/margins": 10.044800631205241, + "rewards/rejected": -7.647429656982422, + "step": 1258 + }, + { + "epoch": 0.11502969392416629, + "grad_norm": 5.46875, + "kl": 0.0, + "learning_rate": 9.686493471549563e-06, + "logits/chosen": 363002272.0, + "logits/rejected": 342022912.0, + "logps/chosen": -345.02642822265625, + "logps/rejected": -439.1791585286458, + "loss": 0.0289, + "rewards/chosen": 2.066149950027466, + "rewards/margins": 10.35516095161438, + "rewards/rejected": -8.289011001586914, + "step": 1259 + }, + { + "epoch": 0.11512105984467794, + "grad_norm": 6.65625, + "kl": 0.0, + "learning_rate": 9.685992165544988e-06, + "logits/chosen": 1176382037.3333333, + "logits/rejected": 922463488.0, + "logps/chosen": -343.55712890625, + "logps/rejected": -792.0224609375, + "loss": 0.058, + "rewards/chosen": 3.1357297897338867, + "rewards/margins": 12.044998168945312, + "rewards/rejected": -8.909268379211426, + "step": 1260 + }, + { + "epoch": 0.11521242576518959, + "grad_norm": 3.640625, + "kl": 0.0, + "learning_rate": 9.685490472052276e-06, + "logits/chosen": 620350912.0, + "logits/rejected": 404541024.0, + "logps/chosen": -239.34024047851562, + "logps/rejected": -311.95013427734375, + "loss": 0.0168, + "rewards/chosen": 3.759446620941162, + "rewards/margins": 11.26390790939331, + "rewards/rejected": -7.504461288452148, + "step": 1261 + }, + { + "epoch": 0.11530379168570123, + "grad_norm": 5.65625, + "kl": 0.0, + "learning_rate": 9.684988391112917e-06, + "logits/chosen": 485120768.0, + "logits/rejected": 459535744.0, + "logps/chosen": -473.3326110839844, + "logps/rejected": -625.1486409505209, + "loss": 0.0265, + "rewards/chosen": 2.688188076019287, + "rewards/margins": 12.00774081548055, + "rewards/rejected": -9.319552739461264, + "step": 1262 + }, + { + "epoch": 0.11539515760621288, + "grad_norm": 10.375, + "kl": 0.0, + "learning_rate": 9.684485922768422e-06, + "logits/chosen": 345407078.4, + "logits/rejected": 297522389.3333333, + "logps/chosen": -276.17421875, + "logps/rejected": -440.4827473958333, + "loss": 0.0913, + "rewards/chosen": 2.250051498413086, + "rewards/margins": 13.064601262410482, + "rewards/rejected": -10.814549763997396, + "step": 1263 + }, + { + "epoch": 0.11548652352672453, + "grad_norm": 9.6875, + "kl": 0.0, + "learning_rate": 9.683983067060348e-06, + "logits/chosen": 480452096.0, + "logits/rejected": 521815961.6, + "logps/chosen": -200.177978515625, + "logps/rejected": -570.64560546875, + "loss": 0.0546, + "rewards/chosen": 2.4991561571756997, + "rewards/margins": 12.31275021235148, + "rewards/rejected": -9.81359405517578, + "step": 1264 + }, + { + "epoch": 0.11557788944723618, + "grad_norm": 4.875, + "kl": 0.0, + "learning_rate": 9.683479824030269e-06, + "logits/chosen": 516611891.2, + "logits/rejected": 301990741.3333333, + "logps/chosen": -337.103955078125, + "logps/rejected": -319.7999267578125, + "loss": 0.032, + "rewards/chosen": 3.3400314331054686, + "rewards/margins": 8.553682454427083, + "rewards/rejected": -5.213651021321614, + "step": 1265 + }, + { + "epoch": 0.11566925536774783, + "grad_norm": 11.75, + "kl": 0.0, + "learning_rate": 9.682976193719804e-06, + "logits/chosen": 849255253.3333334, + "logits/rejected": 422074688.0, + "logps/chosen": -274.6400960286458, + "logps/rejected": -246.57240295410156, + "loss": 0.0934, + "rewards/chosen": 3.0863030751546225, + "rewards/margins": 7.741403420766195, + "rewards/rejected": -4.655100345611572, + "step": 1266 + }, + { + "epoch": 0.11576062128825948, + "grad_norm": 10.75, + "kl": 0.0, + "learning_rate": 9.682472176170596e-06, + "logits/chosen": 560537536.0, + "logits/rejected": 340043616.0, + "logps/chosen": -258.3126220703125, + "logps/rejected": -392.134521484375, + "loss": 0.0483, + "rewards/chosen": 3.352809190750122, + "rewards/margins": 10.143684148788452, + "rewards/rejected": -6.79087495803833, + "step": 1267 + }, + { + "epoch": 0.11585198720877113, + "grad_norm": 6.1875, + "kl": 0.0, + "learning_rate": 9.681967771424323e-06, + "logits/chosen": 650810624.0, + "logits/rejected": 245668000.0, + "logps/chosen": -399.2595621744792, + "logps/rejected": -393.73614501953125, + "loss": 0.0415, + "rewards/chosen": 3.139939626057943, + "rewards/margins": 13.585498174031576, + "rewards/rejected": -10.445558547973633, + "step": 1268 + }, + { + "epoch": 0.11594335312928278, + "grad_norm": 4.28125, + "kl": 0.0, + "learning_rate": 9.681462979522693e-06, + "logits/chosen": 449280960.0, + "logits/rejected": 375976416.0, + "logps/chosen": -315.94873046875, + "logps/rejected": -315.23394775390625, + "loss": 0.0309, + "rewards/chosen": 3.5280518531799316, + "rewards/margins": 11.40363073348999, + "rewards/rejected": -7.875578880310059, + "step": 1269 + }, + { + "epoch": 0.11603471904979443, + "grad_norm": 3.15625, + "kl": 0.0, + "learning_rate": 9.680957800507452e-06, + "logits/chosen": 727763797.3333334, + "logits/rejected": 491721062.4, + "logps/chosen": -441.9875081380208, + "logps/rejected": -463.29375, + "loss": 0.0158, + "rewards/chosen": 3.25975767771403, + "rewards/margins": 12.708002916971841, + "rewards/rejected": -9.448245239257812, + "step": 1270 + }, + { + "epoch": 0.11612608497030608, + "grad_norm": 4.0, + "kl": 0.0, + "learning_rate": 9.68045223442037e-06, + "logits/chosen": 661520128.0, + "logits/rejected": 410504320.0, + "logps/chosen": -311.2933654785156, + "logps/rejected": -398.257080078125, + "loss": 0.0226, + "rewards/chosen": 3.300107955932617, + "rewards/margins": 12.27424144744873, + "rewards/rejected": -8.974133491516113, + "step": 1271 + }, + { + "epoch": 0.11621745089081773, + "grad_norm": 9.75, + "kl": 0.0, + "learning_rate": 9.679946281303253e-06, + "logits/chosen": 698094848.0, + "logits/rejected": 371806272.0, + "logps/chosen": -439.6420084635417, + "logps/rejected": -293.55450439453125, + "loss": 0.0688, + "rewards/chosen": 2.470865567525228, + "rewards/margins": 8.169086774190268, + "rewards/rejected": -5.698221206665039, + "step": 1272 + }, + { + "epoch": 0.11630881681132937, + "grad_norm": 4.9375, + "kl": 0.0, + "learning_rate": 9.67943994119794e-06, + "logits/chosen": 1037801045.3333334, + "logits/rejected": 1163499392.0, + "logps/chosen": -370.887939453125, + "logps/rejected": -581.545654296875, + "loss": 0.0435, + "rewards/chosen": 3.2414976755777993, + "rewards/margins": 11.1767365137736, + "rewards/rejected": -7.935238838195801, + "step": 1273 + }, + { + "epoch": 0.11640018273184102, + "grad_norm": 3.59375, + "kl": 0.0, + "learning_rate": 9.678933214146299e-06, + "logits/chosen": 635140864.0, + "logits/rejected": 624209817.6, + "logps/chosen": -374.4651692708333, + "logps/rejected": -418.81865234375, + "loss": 0.0189, + "rewards/chosen": 3.0562610626220703, + "rewards/margins": 10.564342880249024, + "rewards/rejected": -7.5080818176269535, + "step": 1274 + }, + { + "epoch": 0.11649154865235267, + "grad_norm": 9.25, + "kl": 0.0, + "learning_rate": 9.678426100190233e-06, + "logits/chosen": 843291750.4, + "logits/rejected": 419137578.6666667, + "logps/chosen": -454.076171875, + "logps/rejected": -247.85498046875, + "loss": 0.0694, + "rewards/chosen": 3.135059928894043, + "rewards/margins": 6.7430527369181315, + "rewards/rejected": -3.6079928080240884, + "step": 1275 + }, + { + "epoch": 0.11658291457286432, + "grad_norm": 5.125, + "kl": 0.0, + "learning_rate": 9.677918599371674e-06, + "logits/chosen": 433963648.0, + "logits/rejected": 366158976.0, + "logps/chosen": -259.599609375, + "logps/rejected": -363.2578125, + "loss": 0.0325, + "rewards/chosen": 2.880596399307251, + "rewards/margins": 9.769556283950806, + "rewards/rejected": -6.888959884643555, + "step": 1276 + }, + { + "epoch": 0.11667428049337597, + "grad_norm": 11.875, + "kl": 0.0, + "learning_rate": 9.67741071173259e-06, + "logits/chosen": 316546261.3333333, + "logits/rejected": 436041856.0, + "logps/chosen": -247.22904459635416, + "logps/rejected": -298.8641052246094, + "loss": 0.0848, + "rewards/chosen": 2.9653501510620117, + "rewards/margins": 9.732794761657715, + "rewards/rejected": -6.767444610595703, + "step": 1277 + }, + { + "epoch": 0.11676564641388762, + "grad_norm": 6.96875, + "kl": 0.0, + "learning_rate": 9.676902437314977e-06, + "logits/chosen": 608590438.4, + "logits/rejected": 368847616.0, + "logps/chosen": -303.38984375, + "logps/rejected": -446.7471516927083, + "loss": 0.0457, + "rewards/chosen": 2.8829729080200197, + "rewards/margins": 12.69698861440023, + "rewards/rejected": -9.814015706380209, + "step": 1278 + }, + { + "epoch": 0.11685701233439927, + "grad_norm": 8.0, + "kl": 0.0, + "learning_rate": 9.676393776160866e-06, + "logits/chosen": 1076259328.0, + "logits/rejected": 659620608.0, + "logps/chosen": -304.8488362630208, + "logps/rejected": -434.36370849609375, + "loss": 0.0924, + "rewards/chosen": 2.9473114013671875, + "rewards/margins": 11.310930252075195, + "rewards/rejected": -8.363618850708008, + "step": 1279 + }, + { + "epoch": 0.11694837825491092, + "grad_norm": 6.59375, + "kl": 0.0, + "learning_rate": 9.675884728312314e-06, + "logits/chosen": 735037568.0, + "logits/rejected": 665409536.0, + "logps/chosen": -385.92755126953125, + "logps/rejected": -376.0312805175781, + "loss": 0.0328, + "rewards/chosen": 3.3260657787323, + "rewards/margins": 10.738475561141968, + "rewards/rejected": -7.412409782409668, + "step": 1280 + }, + { + "epoch": 0.11703974417542257, + "grad_norm": 9.75, + "kl": 0.0, + "learning_rate": 9.675375293811422e-06, + "logits/chosen": 568418389.3333334, + "logits/rejected": 405108684.8, + "logps/chosen": -499.2181803385417, + "logps/rejected": -331.317333984375, + "loss": 0.0589, + "rewards/chosen": 2.4298046429951987, + "rewards/margins": 9.035861619313557, + "rewards/rejected": -6.606056976318359, + "step": 1281 + }, + { + "epoch": 0.11713111009593422, + "grad_norm": 3.59375, + "kl": 0.0, + "learning_rate": 9.67486547270031e-06, + "logits/chosen": 573209088.0, + "logits/rejected": 342753600.0, + "logps/chosen": -316.4574279785156, + "logps/rejected": -263.36541748046875, + "loss": 0.0258, + "rewards/chosen": 3.1476593017578125, + "rewards/margins": 8.268815040588379, + "rewards/rejected": -5.121155738830566, + "step": 1282 + }, + { + "epoch": 0.11722247601644586, + "grad_norm": 6.4375, + "kl": 0.0, + "learning_rate": 9.674355265021136e-06, + "logits/chosen": 528850880.0, + "logits/rejected": 400920512.0, + "logps/chosen": -432.4943542480469, + "logps/rejected": -475.7469482421875, + "loss": 0.0338, + "rewards/chosen": 2.919015884399414, + "rewards/margins": 10.138535499572754, + "rewards/rejected": -7.21951961517334, + "step": 1283 + }, + { + "epoch": 0.11731384193695751, + "grad_norm": 1.671875, + "kl": 0.0, + "learning_rate": 9.673844670816092e-06, + "logits/chosen": 646865024.0, + "logits/rejected": 663660480.0, + "logps/chosen": -268.11505126953125, + "logps/rejected": -643.895751953125, + "loss": 0.0084, + "rewards/chosen": 4.481039047241211, + "rewards/margins": 11.106973648071289, + "rewards/rejected": -6.625934600830078, + "step": 1284 + }, + { + "epoch": 0.11740520785746916, + "grad_norm": 6.84375, + "kl": 0.0, + "learning_rate": 9.673333690127397e-06, + "logits/chosen": 544774912.0, + "logits/rejected": 602155349.3333334, + "logps/chosen": -218.542138671875, + "logps/rejected": -519.2976481119791, + "loss": 0.0409, + "rewards/chosen": 3.734522247314453, + "rewards/margins": 10.742517217000325, + "rewards/rejected": -7.007994969685872, + "step": 1285 + }, + { + "epoch": 0.11749657377798081, + "grad_norm": 3.484375, + "kl": 0.0, + "learning_rate": 9.672822322997305e-06, + "logits/chosen": 435068096.0, + "logits/rejected": 278289920.0, + "logps/chosen": -242.56875610351562, + "logps/rejected": -380.7816569010417, + "loss": 0.0229, + "rewards/chosen": 2.5188498497009277, + "rewards/margins": 9.822490215301514, + "rewards/rejected": -7.303640365600586, + "step": 1286 + }, + { + "epoch": 0.11758793969849246, + "grad_norm": 9.75, + "kl": 0.0, + "learning_rate": 9.672310569468102e-06, + "logits/chosen": 527916842.6666667, + "logits/rejected": 476322528.0, + "logps/chosen": -369.4144287109375, + "logps/rejected": -472.5811462402344, + "loss": 0.0586, + "rewards/chosen": 2.590146064758301, + "rewards/margins": 10.029998779296875, + "rewards/rejected": -7.439852714538574, + "step": 1287 + }, + { + "epoch": 0.11767930561900411, + "grad_norm": 5.15625, + "kl": 0.0, + "learning_rate": 9.671798429582104e-06, + "logits/chosen": 523091776.0, + "logits/rejected": 439565472.0, + "logps/chosen": -281.92962646484375, + "logps/rejected": -521.3406982421875, + "loss": 0.0263, + "rewards/chosen": 3.191514492034912, + "rewards/margins": 13.576664447784424, + "rewards/rejected": -10.385149955749512, + "step": 1288 + }, + { + "epoch": 0.11777067153951576, + "grad_norm": 3.21875, + "kl": 0.0, + "learning_rate": 9.671285903381662e-06, + "logits/chosen": 382716352.0, + "logits/rejected": 497330346.6666667, + "logps/chosen": -334.82958984375, + "logps/rejected": -466.2432861328125, + "loss": 0.0166, + "rewards/chosen": 2.7011795043945312, + "rewards/margins": 10.388699213663738, + "rewards/rejected": -7.687519709269206, + "step": 1289 + }, + { + "epoch": 0.11786203746002741, + "grad_norm": 9.8125, + "kl": 0.0, + "learning_rate": 9.670772990909156e-06, + "logits/chosen": 608432384.0, + "logits/rejected": 425305139.2, + "logps/chosen": -380.4265950520833, + "logps/rejected": -505.545849609375, + "loss": 0.081, + "rewards/chosen": 3.095069249471029, + "rewards/margins": 10.265781529744466, + "rewards/rejected": -7.170712280273437, + "step": 1290 + }, + { + "epoch": 0.11795340338053906, + "grad_norm": 17.25, + "kl": 0.0, + "learning_rate": 9.670259692207e-06, + "logits/chosen": 630020608.0, + "logits/rejected": 329280793.6, + "logps/chosen": -200.5339152018229, + "logps/rejected": -452.34462890625, + "loss": 0.0586, + "rewards/chosen": 2.041419347127279, + "rewards/margins": 9.76133893330892, + "rewards/rejected": -7.719919586181641, + "step": 1291 + }, + { + "epoch": 0.1180447693010507, + "grad_norm": 13.3125, + "kl": 0.0, + "learning_rate": 9.669746007317637e-06, + "logits/chosen": 298861088.0, + "logits/rejected": 686946261.3333334, + "logps/chosen": -262.862548828125, + "logps/rejected": -539.0875651041666, + "loss": 0.066, + "rewards/chosen": 3.283052921295166, + "rewards/margins": 11.246649265289307, + "rewards/rejected": -7.963596343994141, + "step": 1292 + }, + { + "epoch": 0.11813613522156236, + "grad_norm": 4.84375, + "kl": 0.0, + "learning_rate": 9.669231936283545e-06, + "logits/chosen": 703571251.2, + "logits/rejected": 454913322.6666667, + "logps/chosen": -313.634130859375, + "logps/rejected": -592.996826171875, + "loss": 0.0294, + "rewards/chosen": 3.4123401641845703, + "rewards/margins": 11.30756441752116, + "rewards/rejected": -7.895224253336589, + "step": 1293 + }, + { + "epoch": 0.118227501142074, + "grad_norm": 8.5, + "kl": 0.0, + "learning_rate": 9.668717479147235e-06, + "logits/chosen": 297671168.0, + "logits/rejected": 361533696.0, + "logps/chosen": -560.9742024739584, + "logps/rejected": -339.3663330078125, + "loss": 0.0386, + "rewards/chosen": 3.176586151123047, + "rewards/margins": 11.009469604492187, + "rewards/rejected": -7.83288345336914, + "step": 1294 + }, + { + "epoch": 0.11831886706258565, + "grad_norm": 5.90625, + "kl": 0.0, + "learning_rate": 9.668202635951246e-06, + "logits/chosen": 573013862.4, + "logits/rejected": 665294464.0, + "logps/chosen": -367.902734375, + "logps/rejected": -603.2242024739584, + "loss": 0.0416, + "rewards/chosen": 2.77249755859375, + "rewards/margins": 12.647409820556641, + "rewards/rejected": -9.87491226196289, + "step": 1295 + }, + { + "epoch": 0.1184102329830973, + "grad_norm": 7.1875, + "kl": 0.0, + "learning_rate": 9.66768740673815e-06, + "logits/chosen": 341128725.3333333, + "logits/rejected": 735821926.4, + "logps/chosen": -235.33319091796875, + "logps/rejected": -557.95849609375, + "loss": 0.067, + "rewards/chosen": 1.9979896545410156, + "rewards/margins": 11.211537933349609, + "rewards/rejected": -9.213548278808593, + "step": 1296 + }, + { + "epoch": 0.11850159890360895, + "grad_norm": 9.25, + "kl": 0.0, + "learning_rate": 9.667171791550553e-06, + "logits/chosen": 828278144.0, + "logits/rejected": 662357312.0, + "logps/chosen": -339.07086181640625, + "logps/rejected": -512.7963256835938, + "loss": 0.093, + "rewards/chosen": 2.8740615844726562, + "rewards/margins": 10.042057514190674, + "rewards/rejected": -7.167995929718018, + "step": 1297 + }, + { + "epoch": 0.1185929648241206, + "grad_norm": 9.9375, + "kl": 0.0, + "learning_rate": 9.666655790431092e-06, + "logits/chosen": 803640768.0, + "logits/rejected": 828247296.0, + "logps/chosen": -220.1478271484375, + "logps/rejected": -423.849609375, + "loss": 0.0401, + "rewards/chosen": 2.176013231277466, + "rewards/margins": 9.610188086827595, + "rewards/rejected": -7.43417485555013, + "step": 1298 + }, + { + "epoch": 0.11868433074463225, + "grad_norm": 6.90625, + "kl": 0.0, + "learning_rate": 9.666139403422434e-06, + "logits/chosen": 456920320.0, + "logits/rejected": 336643360.0, + "logps/chosen": -336.82041422526044, + "logps/rejected": -389.49993896484375, + "loss": 0.0492, + "rewards/chosen": 3.279350916544596, + "rewards/margins": 9.312383333841959, + "rewards/rejected": -6.033032417297363, + "step": 1299 + }, + { + "epoch": 0.1187756966651439, + "grad_norm": 4.0, + "kl": 0.0, + "learning_rate": 9.665622630567279e-06, + "logits/chosen": 1161833216.0, + "logits/rejected": 1144670080.0, + "logps/chosen": -306.6529846191406, + "logps/rejected": -576.8458862304688, + "loss": 0.0318, + "rewards/chosen": 3.0136818885803223, + "rewards/margins": 12.488834857940674, + "rewards/rejected": -9.475152969360352, + "step": 1300 + }, + { + "epoch": 0.11886706258565555, + "grad_norm": 7.71875, + "kl": 0.0, + "learning_rate": 9.66510547190836e-06, + "logits/chosen": 341678634.6666667, + "logits/rejected": 530167705.6, + "logps/chosen": -330.03173828125, + "logps/rejected": -799.632177734375, + "loss": 0.0387, + "rewards/chosen": 3.853384017944336, + "rewards/margins": 12.864767074584961, + "rewards/rejected": -9.011383056640625, + "step": 1301 + }, + { + "epoch": 0.1189584285061672, + "grad_norm": 7.65625, + "kl": 0.0, + "learning_rate": 9.664587927488443e-06, + "logits/chosen": 582731980.8, + "logits/rejected": 459071402.6666667, + "logps/chosen": -345.186083984375, + "logps/rejected": -291.7665201822917, + "loss": 0.04, + "rewards/chosen": 2.8321582794189455, + "rewards/margins": 8.47611796061198, + "rewards/rejected": -5.643959681193034, + "step": 1302 + }, + { + "epoch": 0.11904979442667885, + "grad_norm": 8.4375, + "kl": 0.0, + "learning_rate": 9.664069997350322e-06, + "logits/chosen": 626408448.0, + "logits/rejected": 485058218.6666667, + "logps/chosen": -371.477294921875, + "logps/rejected": -395.047119140625, + "loss": 0.0446, + "rewards/chosen": 3.530491256713867, + "rewards/margins": 11.112193806966145, + "rewards/rejected": -7.581702550252278, + "step": 1303 + }, + { + "epoch": 0.1191411603471905, + "grad_norm": 9.3125, + "kl": 0.0, + "learning_rate": 9.663551681536827e-06, + "logits/chosen": 424155072.0, + "logits/rejected": 233032688.0, + "logps/chosen": -245.3571014404297, + "logps/rejected": -288.26702880859375, + "loss": 0.0516, + "rewards/chosen": 2.7886898517608643, + "rewards/margins": 10.028962850570679, + "rewards/rejected": -7.2402729988098145, + "step": 1304 + }, + { + "epoch": 0.11923252626770214, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 9.663032980090817e-06, + "logits/chosen": 543951786.6666666, + "logits/rejected": 861203148.8, + "logps/chosen": -324.4564615885417, + "logps/rejected": -378.1560791015625, + "loss": 0.081, + "rewards/chosen": 3.3591092427571616, + "rewards/margins": 10.130776723225912, + "rewards/rejected": -6.77166748046875, + "step": 1305 + }, + { + "epoch": 0.11932389218821379, + "grad_norm": 23.75, + "kl": 0.0, + "learning_rate": 9.662513893055181e-06, + "logits/chosen": 446282069.3333333, + "logits/rejected": 529569382.4, + "logps/chosen": -226.6340535481771, + "logps/rejected": -475.967333984375, + "loss": 0.0958, + "rewards/chosen": 3.121957461039225, + "rewards/margins": 8.699889437357584, + "rewards/rejected": -5.577931976318359, + "step": 1306 + }, + { + "epoch": 0.11941525810872544, + "grad_norm": 4.125, + "kl": 0.0, + "learning_rate": 9.661994420472847e-06, + "logits/chosen": 652558720.0, + "logits/rejected": 556181376.0, + "logps/chosen": -296.52789306640625, + "logps/rejected": -473.87371826171875, + "loss": 0.0296, + "rewards/chosen": 3.0267019271850586, + "rewards/margins": 12.767306327819824, + "rewards/rejected": -9.740604400634766, + "step": 1307 + }, + { + "epoch": 0.11950662402923709, + "grad_norm": 0.984375, + "kl": 0.0, + "learning_rate": 9.66147456238677e-06, + "logits/chosen": 1018369792.0, + "logits/rejected": 948613046.8571428, + "logps/chosen": -581.4309692382812, + "logps/rejected": -748.9654017857143, + "loss": 0.0031, + "rewards/chosen": 3.755169630050659, + "rewards/margins": 16.561439684459142, + "rewards/rejected": -12.806270054408483, + "step": 1308 + }, + { + "epoch": 0.11959798994974874, + "grad_norm": 7.0, + "kl": 0.0, + "learning_rate": 9.660954318839934e-06, + "logits/chosen": 535613184.0, + "logits/rejected": 330393446.4, + "logps/chosen": -318.37839762369794, + "logps/rejected": -285.75302734375, + "loss": 0.0393, + "rewards/chosen": 3.108839988708496, + "rewards/margins": 9.444614219665528, + "rewards/rejected": -6.335774230957031, + "step": 1309 + }, + { + "epoch": 0.11968935587026039, + "grad_norm": 4.3125, + "kl": 0.0, + "learning_rate": 9.66043368987536e-06, + "logits/chosen": 382407833.6, + "logits/rejected": 421657728.0, + "logps/chosen": -248.14140625, + "logps/rejected": -799.5406901041666, + "loss": 0.0296, + "rewards/chosen": 3.3377037048339844, + "rewards/margins": 14.67800776163737, + "rewards/rejected": -11.340304056803385, + "step": 1310 + }, + { + "epoch": 0.11978072179077204, + "grad_norm": 10.0625, + "kl": 0.0, + "learning_rate": 9.659912675536102e-06, + "logits/chosen": 539116672.0, + "logits/rejected": 460960614.4, + "logps/chosen": -381.2564290364583, + "logps/rejected": -347.180517578125, + "loss": 0.0501, + "rewards/chosen": 1.9334980646769206, + "rewards/margins": 9.478498522440592, + "rewards/rejected": -7.545000457763672, + "step": 1311 + }, + { + "epoch": 0.11987208771128369, + "grad_norm": 22.375, + "kl": 0.0, + "learning_rate": 9.659391275865239e-06, + "logits/chosen": 544083840.0, + "logits/rejected": 321357312.0, + "logps/chosen": -296.4236246744792, + "logps/rejected": -335.4042236328125, + "loss": 0.078, + "rewards/chosen": 2.092404365539551, + "rewards/margins": 9.883612251281738, + "rewards/rejected": -7.791207885742187, + "step": 1312 + }, + { + "epoch": 0.11996345363179534, + "grad_norm": 6.4375, + "kl": 0.0, + "learning_rate": 9.658869490905888e-06, + "logits/chosen": 366166630.4, + "logits/rejected": 368785322.6666667, + "logps/chosen": -288.950244140625, + "logps/rejected": -511.4945475260417, + "loss": 0.0433, + "rewards/chosen": 2.743800163269043, + "rewards/margins": 13.725085258483887, + "rewards/rejected": -10.981285095214844, + "step": 1313 + }, + { + "epoch": 0.12005481955230698, + "grad_norm": 7.59375, + "kl": 0.0, + "learning_rate": 9.658347320701198e-06, + "logits/chosen": 396107605.3333333, + "logits/rejected": 371276928.0, + "logps/chosen": -122.54049682617188, + "logps/rejected": -417.87734375, + "loss": 0.0477, + "rewards/chosen": 2.0326642990112305, + "rewards/margins": 10.811747550964355, + "rewards/rejected": -8.779083251953125, + "step": 1314 + }, + { + "epoch": 0.12014618547281863, + "grad_norm": 4.71875, + "kl": 0.0, + "learning_rate": 9.65782476529434e-06, + "logits/chosen": 652626688.0, + "logits/rejected": 447750656.0, + "logps/chosen": -262.45111083984375, + "logps/rejected": -521.0968017578125, + "loss": 0.0223, + "rewards/chosen": 3.6428041458129883, + "rewards/margins": 11.715086936950684, + "rewards/rejected": -8.072282791137695, + "step": 1315 + }, + { + "epoch": 0.12023755139333028, + "grad_norm": 9.75, + "kl": 0.0, + "learning_rate": 9.657301824728534e-06, + "logits/chosen": 634258176.0, + "logits/rejected": 579724361.1428572, + "logps/chosen": -383.6438293457031, + "logps/rejected": -546.3314732142857, + "loss": 0.0197, + "rewards/chosen": 2.022869825363159, + "rewards/margins": 9.446891410010203, + "rewards/rejected": -7.4240215846470425, + "step": 1316 + }, + { + "epoch": 0.12032891731384193, + "grad_norm": 3.890625, + "kl": 0.0, + "learning_rate": 9.656778499047017e-06, + "logits/chosen": 1243246464.0, + "logits/rejected": 670272512.0, + "logps/chosen": -268.6474609375, + "logps/rejected": -409.80450439453125, + "loss": 0.0239, + "rewards/chosen": 3.4099555015563965, + "rewards/margins": 10.3339524269104, + "rewards/rejected": -6.923996925354004, + "step": 1317 + }, + { + "epoch": 0.12042028323435358, + "grad_norm": 2.078125, + "kl": 0.0, + "learning_rate": 9.656254788293064e-06, + "logits/chosen": 928493670.4, + "logits/rejected": 391190272.0, + "logps/chosen": -274.19755859375, + "logps/rejected": -340.2545572916667, + "loss": 0.0169, + "rewards/chosen": 3.9234237670898438, + "rewards/margins": 12.152921676635742, + "rewards/rejected": -8.229497909545898, + "step": 1318 + }, + { + "epoch": 0.12051164915486523, + "grad_norm": 8.3125, + "kl": 0.0, + "learning_rate": 9.65573069250998e-06, + "logits/chosen": 402855584.0, + "logits/rejected": 970154304.0, + "logps/chosen": -224.8875732421875, + "logps/rejected": -487.748291015625, + "loss": 0.0476, + "rewards/chosen": 3.088104724884033, + "rewards/margins": 11.019441604614258, + "rewards/rejected": -7.931336879730225, + "step": 1319 + }, + { + "epoch": 0.12060301507537688, + "grad_norm": 8.0625, + "kl": 0.0, + "learning_rate": 9.655206211741106e-06, + "logits/chosen": 842986325.3333334, + "logits/rejected": 847386944.0, + "logps/chosen": -455.0975748697917, + "logps/rejected": -302.102294921875, + "loss": 0.052, + "rewards/chosen": 2.821854909261068, + "rewards/margins": 8.235397656758627, + "rewards/rejected": -5.413542747497559, + "step": 1320 + }, + { + "epoch": 0.12069438099588853, + "grad_norm": 5.71875, + "kl": 0.0, + "learning_rate": 9.654681346029809e-06, + "logits/chosen": 502475456.0, + "logits/rejected": 640974720.0, + "logps/chosen": -375.74249267578125, + "logps/rejected": -262.82586669921875, + "loss": 0.0311, + "rewards/chosen": 3.135826349258423, + "rewards/margins": 9.599581480026245, + "rewards/rejected": -6.463755130767822, + "step": 1321 + }, + { + "epoch": 0.12078574691640018, + "grad_norm": 8.9375, + "kl": 0.0, + "learning_rate": 9.654156095419491e-06, + "logits/chosen": 904510976.0, + "logits/rejected": 561781657.6, + "logps/chosen": -292.1138102213542, + "logps/rejected": -508.79775390625, + "loss": 0.06, + "rewards/chosen": 1.8954334259033203, + "rewards/margins": 9.621696853637696, + "rewards/rejected": -7.726263427734375, + "step": 1322 + }, + { + "epoch": 0.12087711283691183, + "grad_norm": 6.46875, + "kl": 0.0, + "learning_rate": 9.653630459953585e-06, + "logits/chosen": 895711488.0, + "logits/rejected": 816642368.0, + "logps/chosen": -393.0847574869792, + "logps/rejected": -381.9139099121094, + "loss": 0.0424, + "rewards/chosen": 3.2735595703125, + "rewards/margins": 8.891972541809082, + "rewards/rejected": -5.618412971496582, + "step": 1323 + }, + { + "epoch": 0.12096847875742348, + "grad_norm": 3.546875, + "kl": 0.0, + "learning_rate": 9.653104439675557e-06, + "logits/chosen": 451242176.0, + "logits/rejected": 571513984.0, + "logps/chosen": -332.5369873046875, + "logps/rejected": -610.33251953125, + "loss": 0.0233, + "rewards/chosen": 3.2241406440734863, + "rewards/margins": 11.562997341156006, + "rewards/rejected": -8.33885669708252, + "step": 1324 + }, + { + "epoch": 0.12105984467793512, + "grad_norm": 8.25, + "kl": 0.0, + "learning_rate": 9.652578034628904e-06, + "logits/chosen": 428886118.4, + "logits/rejected": 664619818.6666666, + "logps/chosen": -299.48671875, + "logps/rejected": -613.7650553385416, + "loss": 0.0539, + "rewards/chosen": 2.580308723449707, + "rewards/margins": 12.188999621073403, + "rewards/rejected": -9.608690897623697, + "step": 1325 + }, + { + "epoch": 0.12115121059844677, + "grad_norm": 7.21875, + "kl": 0.0, + "learning_rate": 9.652051244857155e-06, + "logits/chosen": 442438272.0, + "logits/rejected": 657748544.0, + "logps/chosen": -226.48038736979166, + "logps/rejected": -793.4771118164062, + "loss": 0.0609, + "rewards/chosen": 2.699321746826172, + "rewards/margins": 11.84398078918457, + "rewards/rejected": -9.144659042358398, + "step": 1326 + }, + { + "epoch": 0.12124257651895842, + "grad_norm": 5.125, + "kl": 0.0, + "learning_rate": 9.65152407040387e-06, + "logits/chosen": 351265952.0, + "logits/rejected": 473396565.3333333, + "logps/chosen": -321.9242858886719, + "logps/rejected": -453.9795328776042, + "loss": 0.0155, + "rewards/chosen": 5.102355003356934, + "rewards/margins": 12.714807828267414, + "rewards/rejected": -7.6124528249104815, + "step": 1327 + }, + { + "epoch": 0.12133394243947007, + "grad_norm": 4.40625, + "kl": 0.0, + "learning_rate": 9.650996511312642e-06, + "logits/chosen": 373468992.0, + "logits/rejected": 576234496.0, + "logps/chosen": -242.9832000732422, + "logps/rejected": -469.46807861328125, + "loss": 0.0261, + "rewards/chosen": 3.719865322113037, + "rewards/margins": 12.410501956939697, + "rewards/rejected": -8.69063663482666, + "step": 1328 + }, + { + "epoch": 0.12142530835998172, + "grad_norm": 6.09375, + "kl": 0.0, + "learning_rate": 9.650468567627094e-06, + "logits/chosen": 447008768.0, + "logits/rejected": 376558165.3333333, + "logps/chosen": -191.6907470703125, + "logps/rejected": -548.0006510416666, + "loss": 0.0385, + "rewards/chosen": 3.08106689453125, + "rewards/margins": 13.175279744466145, + "rewards/rejected": -10.094212849934896, + "step": 1329 + }, + { + "epoch": 0.12151667428049337, + "grad_norm": 6.40625, + "kl": 0.0, + "learning_rate": 9.649940239390884e-06, + "logits/chosen": 645863116.8, + "logits/rejected": 461768192.0, + "logps/chosen": -177.74798583984375, + "logps/rejected": -448.0319010416667, + "loss": 0.051, + "rewards/chosen": 2.9420259475708006, + "rewards/margins": 11.367611376444497, + "rewards/rejected": -8.425585428873697, + "step": 1330 + }, + { + "epoch": 0.12160804020100502, + "grad_norm": 8.125, + "kl": 0.0, + "learning_rate": 9.649411526647698e-06, + "logits/chosen": 1166336614.4, + "logits/rejected": 1115643904.0, + "logps/chosen": -389.767626953125, + "logps/rejected": -597.6689046223959, + "loss": 0.0437, + "rewards/chosen": 3.0556638717651365, + "rewards/margins": 14.280357170104981, + "rewards/rejected": -11.224693298339844, + "step": 1331 + }, + { + "epoch": 0.12169940612151667, + "grad_norm": 19.25, + "kl": 0.0, + "learning_rate": 9.648882429441258e-06, + "logits/chosen": 700542976.0, + "logits/rejected": 495552256.0, + "logps/chosen": -397.34112548828125, + "logps/rejected": -489.0736083984375, + "loss": 0.1162, + "rewards/chosen": 2.133718967437744, + "rewards/margins": 8.83615493774414, + "rewards/rejected": -6.7024359703063965, + "step": 1332 + }, + { + "epoch": 0.12179077204202832, + "grad_norm": 7.96875, + "kl": 0.0, + "learning_rate": 9.648352947815312e-06, + "logits/chosen": 1101987072.0, + "logits/rejected": 499894368.0, + "logps/chosen": -519.5927734375, + "logps/rejected": -467.4383544921875, + "loss": 0.036, + "rewards/chosen": 3.235589345296224, + "rewards/margins": 12.71297295888265, + "rewards/rejected": -9.477383613586426, + "step": 1333 + }, + { + "epoch": 0.12188213796253997, + "grad_norm": 3.015625, + "kl": 0.0, + "learning_rate": 9.647823081813648e-06, + "logits/chosen": 277538538.6666667, + "logits/rejected": 475514316.8, + "logps/chosen": -97.19978841145833, + "logps/rejected": -565.277880859375, + "loss": 0.0189, + "rewards/chosen": 3.4362157185872397, + "rewards/margins": 11.678216298421225, + "rewards/rejected": -8.242000579833984, + "step": 1334 + }, + { + "epoch": 0.12197350388305161, + "grad_norm": 1.9453125, + "kl": 0.0, + "learning_rate": 9.647292831480075e-06, + "logits/chosen": 317213525.3333333, + "logits/rejected": 727362355.2, + "logps/chosen": -177.41796875, + "logps/rejected": -533.127880859375, + "loss": 0.0111, + "rewards/chosen": 4.1892649332682295, + "rewards/margins": 13.813907114664715, + "rewards/rejected": -9.624642181396485, + "step": 1335 + }, + { + "epoch": 0.12206486980356326, + "grad_norm": 20.375, + "kl": 0.0, + "learning_rate": 9.646762196858444e-06, + "logits/chosen": 589694592.0, + "logits/rejected": 730477909.3333334, + "logps/chosen": -237.1134796142578, + "logps/rejected": -462.87255859375, + "loss": 0.1143, + "rewards/chosen": 1.5769332647323608, + "rewards/margins": 9.015062689781189, + "rewards/rejected": -7.438129425048828, + "step": 1336 + }, + { + "epoch": 0.12215623572407493, + "grad_norm": 5.46875, + "kl": 0.0, + "learning_rate": 9.646231177992633e-06, + "logits/chosen": 347136000.0, + "logits/rejected": 371965952.0, + "logps/chosen": -339.1587320963542, + "logps/rejected": -404.3628173828125, + "loss": 0.0306, + "rewards/chosen": 2.795668601989746, + "rewards/margins": 9.718389701843261, + "rewards/rejected": -6.922721099853516, + "step": 1337 + }, + { + "epoch": 0.12224760164458658, + "grad_norm": 9.5, + "kl": 0.0, + "learning_rate": 9.645699774926552e-06, + "logits/chosen": 568068992.0, + "logits/rejected": 246526704.0, + "logps/chosen": -367.95794677734375, + "logps/rejected": -277.3503112792969, + "loss": 0.064, + "rewards/chosen": 2.084031105041504, + "rewards/margins": 7.623833656311035, + "rewards/rejected": -5.539802551269531, + "step": 1338 + }, + { + "epoch": 0.12233896756509823, + "grad_norm": 4.03125, + "kl": 0.0, + "learning_rate": 9.645167987704142e-06, + "logits/chosen": 328245077.3333333, + "logits/rejected": 701398937.6, + "logps/chosen": -203.9898478190104, + "logps/rejected": -288.3475830078125, + "loss": 0.1252, + "rewards/chosen": 3.326173464457194, + "rewards/margins": 8.096884981791177, + "rewards/rejected": -4.770711517333984, + "step": 1339 + }, + { + "epoch": 0.12243033348560987, + "grad_norm": 4.90625, + "kl": 0.0, + "learning_rate": 9.644635816369379e-06, + "logits/chosen": 539779840.0, + "logits/rejected": 336026645.3333333, + "logps/chosen": -290.0982177734375, + "logps/rejected": -301.9342041015625, + "loss": 0.0211, + "rewards/chosen": 4.175774383544922, + "rewards/margins": 11.52903060913086, + "rewards/rejected": -7.3532562255859375, + "step": 1340 + }, + { + "epoch": 0.12252169940612152, + "grad_norm": 8.125, + "kl": 0.0, + "learning_rate": 9.644103260966266e-06, + "logits/chosen": 376710400.0, + "logits/rejected": 711340595.2, + "logps/chosen": -221.33622233072916, + "logps/rejected": -211.275439453125, + "loss": 0.0328, + "rewards/chosen": 4.855807622273763, + "rewards/margins": 10.048207982381186, + "rewards/rejected": -5.192400360107422, + "step": 1341 + }, + { + "epoch": 0.12261306532663317, + "grad_norm": 11.5625, + "kl": 0.0, + "learning_rate": 9.643570321538845e-06, + "logits/chosen": 673802154.6666666, + "logits/rejected": 274638336.0, + "logps/chosen": -238.60066731770834, + "logps/rejected": -358.89990234375, + "loss": 0.1167, + "rewards/chosen": 2.896280606587728, + "rewards/margins": 11.448973019917807, + "rewards/rejected": -8.552692413330078, + "step": 1342 + }, + { + "epoch": 0.12270443124714482, + "grad_norm": 5.75, + "kl": 0.0, + "learning_rate": 9.64303699813118e-06, + "logits/chosen": 665782272.0, + "logits/rejected": 973798336.0, + "logps/chosen": -280.33880615234375, + "logps/rejected": -603.888916015625, + "loss": 0.0338, + "rewards/chosen": 2.6808505058288574, + "rewards/margins": 10.97702932357788, + "rewards/rejected": -8.296178817749023, + "step": 1343 + }, + { + "epoch": 0.12279579716765647, + "grad_norm": 5.75, + "kl": 0.0, + "learning_rate": 9.642503290787372e-06, + "logits/chosen": 605530660.5714285, + "logits/rejected": 810703424.0, + "logps/chosen": -241.54833984375, + "logps/rejected": -167.23016357421875, + "loss": 0.0356, + "rewards/chosen": 3.969139644077846, + "rewards/margins": 10.518793174198695, + "rewards/rejected": -6.54965353012085, + "step": 1344 + }, + { + "epoch": 0.12288716308816812, + "grad_norm": 6.34375, + "kl": 0.0, + "learning_rate": 9.641969199551559e-06, + "logits/chosen": 405597525.3333333, + "logits/rejected": 381556896.0, + "logps/chosen": -251.87874348958334, + "logps/rejected": -481.39129638671875, + "loss": 0.0412, + "rewards/chosen": 3.199235280354818, + "rewards/margins": 8.934344132741293, + "rewards/rejected": -5.735108852386475, + "step": 1345 + }, + { + "epoch": 0.12297852900867977, + "grad_norm": 7.46875, + "kl": 0.0, + "learning_rate": 9.6414347244679e-06, + "logits/chosen": 542397312.0, + "logits/rejected": 389251648.0, + "logps/chosen": -323.6218668619792, + "logps/rejected": -454.2365417480469, + "loss": 0.0704, + "rewards/chosen": 2.670088768005371, + "rewards/margins": 12.975915908813477, + "rewards/rejected": -10.305827140808105, + "step": 1346 + }, + { + "epoch": 0.12306989492919142, + "grad_norm": 6.15625, + "kl": 0.0, + "learning_rate": 9.640899865580594e-06, + "logits/chosen": 423822262.85714287, + "logits/rejected": 378695392.0, + "logps/chosen": -260.16629464285717, + "logps/rejected": -395.6483154296875, + "loss": 0.0507, + "rewards/chosen": 3.2874352591378346, + "rewards/margins": 10.905654157911028, + "rewards/rejected": -7.618218898773193, + "step": 1347 + }, + { + "epoch": 0.12316126084970307, + "grad_norm": 24.875, + "kl": 0.0, + "learning_rate": 9.640364622933867e-06, + "logits/chosen": 455988672.0, + "logits/rejected": 312743680.0, + "logps/chosen": -388.24017333984375, + "logps/rejected": -375.1534118652344, + "loss": 0.1024, + "rewards/chosen": 2.8797192573547363, + "rewards/margins": 9.886353015899658, + "rewards/rejected": -7.006633758544922, + "step": 1348 + }, + { + "epoch": 0.12325262677021472, + "grad_norm": 3.828125, + "kl": 0.0, + "learning_rate": 9.639828996571981e-06, + "logits/chosen": 502559712.0, + "logits/rejected": 436935808.0, + "logps/chosen": -284.633544921875, + "logps/rejected": -544.6044921875, + "loss": 0.0196, + "rewards/chosen": 3.583022356033325, + "rewards/margins": 13.476262331008911, + "rewards/rejected": -9.893239974975586, + "step": 1349 + }, + { + "epoch": 0.12334399269072636, + "grad_norm": 1.6171875, + "kl": 0.0, + "learning_rate": 9.639292986539225e-06, + "logits/chosen": 617665600.0, + "logits/rejected": 537890304.0, + "logps/chosen": -379.86236572265625, + "logps/rejected": -614.6123046875, + "loss": 0.0102, + "rewards/chosen": 3.936941623687744, + "rewards/margins": 13.378559589385986, + "rewards/rejected": -9.441617965698242, + "step": 1350 + }, + { + "epoch": 0.12343535861123801, + "grad_norm": 19.75, + "kl": 0.0, + "learning_rate": 9.638756592879923e-06, + "logits/chosen": 647037824.0, + "logits/rejected": 1145702400.0, + "logps/chosen": -319.870849609375, + "logps/rejected": -859.080810546875, + "loss": 0.0918, + "rewards/chosen": 2.517289479573568, + "rewards/margins": 15.84390958150228, + "rewards/rejected": -13.326620101928711, + "step": 1351 + }, + { + "epoch": 0.12352672453174966, + "grad_norm": 16.0, + "kl": 0.0, + "learning_rate": 9.63821981563843e-06, + "logits/chosen": 409479972.5714286, + "logits/rejected": 306775648.0, + "logps/chosen": -314.94203404017856, + "logps/rejected": -456.22991943359375, + "loss": 0.0806, + "rewards/chosen": 2.7781775338309154, + "rewards/margins": 9.875400338854108, + "rewards/rejected": -7.097222805023193, + "step": 1352 + }, + { + "epoch": 0.12361809045226131, + "grad_norm": 3.78125, + "kl": 0.0, + "learning_rate": 9.637682654859134e-06, + "logits/chosen": 387234240.0, + "logits/rejected": 419908064.0, + "logps/chosen": -262.69647216796875, + "logps/rejected": -408.1934814453125, + "loss": 0.0202, + "rewards/chosen": 3.7268056869506836, + "rewards/margins": 10.758366584777832, + "rewards/rejected": -7.031560897827148, + "step": 1353 + }, + { + "epoch": 0.12370945637277296, + "grad_norm": 4.21875, + "kl": 0.0, + "learning_rate": 9.637145110586447e-06, + "logits/chosen": 1067470131.2, + "logits/rejected": 612550101.3333334, + "logps/chosen": -281.535205078125, + "logps/rejected": -611.88916015625, + "loss": 0.0318, + "rewards/chosen": 3.3204421997070312, + "rewards/margins": 10.405562082926433, + "rewards/rejected": -7.085119883219401, + "step": 1354 + }, + { + "epoch": 0.12380082229328461, + "grad_norm": 3.828125, + "kl": 0.0, + "learning_rate": 9.636607182864828e-06, + "logits/chosen": 437541973.3333333, + "logits/rejected": 426020198.4, + "logps/chosen": -291.2537434895833, + "logps/rejected": -431.38427734375, + "loss": 0.0206, + "rewards/chosen": 3.1165075302124023, + "rewards/margins": 10.427998924255371, + "rewards/rejected": -7.311491394042969, + "step": 1355 + }, + { + "epoch": 0.12389218821379626, + "grad_norm": 5.53125, + "kl": 0.0, + "learning_rate": 9.636068871738754e-06, + "logits/chosen": 1107670186.6666667, + "logits/rejected": 716914380.8, + "logps/chosen": -244.20564778645834, + "logps/rejected": -466.260009765625, + "loss": 0.024, + "rewards/chosen": 3.0931930541992188, + "rewards/margins": 11.475320434570312, + "rewards/rejected": -8.382127380371093, + "step": 1356 + }, + { + "epoch": 0.12398355413430791, + "grad_norm": 6.5625, + "kl": 0.0, + "learning_rate": 9.635530177252735e-06, + "logits/chosen": 634396586.6666666, + "logits/rejected": 1274083456.0, + "logps/chosen": -324.4285074869792, + "logps/rejected": -323.81494140625, + "loss": 0.0742, + "rewards/chosen": 3.5142154693603516, + "rewards/margins": 8.53935432434082, + "rewards/rejected": -5.025138854980469, + "step": 1357 + }, + { + "epoch": 0.12407492005481956, + "grad_norm": 5.0, + "kl": 0.0, + "learning_rate": 9.634991099451321e-06, + "logits/chosen": 829881024.0, + "logits/rejected": 1087503488.0, + "logps/chosen": -420.2355651855469, + "logps/rejected": -663.1607666015625, + "loss": 0.0252, + "rewards/chosen": 3.14339017868042, + "rewards/margins": 14.86020040512085, + "rewards/rejected": -11.71681022644043, + "step": 1358 + }, + { + "epoch": 0.1241662859753312, + "grad_norm": 2.25, + "kl": 0.0, + "learning_rate": 9.634451638379088e-06, + "logits/chosen": 888618752.0, + "logits/rejected": 599385408.0, + "logps/chosen": -554.27490234375, + "logps/rejected": -494.86920166015625, + "loss": 0.0136, + "rewards/chosen": 3.681717872619629, + "rewards/margins": 11.092463493347168, + "rewards/rejected": -7.410745620727539, + "step": 1359 + }, + { + "epoch": 0.12425765189584285, + "grad_norm": 7.8125, + "kl": 0.0, + "learning_rate": 9.633911794080643e-06, + "logits/chosen": 784464554.6666666, + "logits/rejected": 481856307.2, + "logps/chosen": -462.6123860677083, + "logps/rejected": -295.5086669921875, + "loss": 0.1046, + "rewards/chosen": 3.0059630076090493, + "rewards/margins": 7.83391024271647, + "rewards/rejected": -4.8279472351074215, + "step": 1360 + }, + { + "epoch": 0.1243490178163545, + "grad_norm": 4.4375, + "kl": 0.0, + "learning_rate": 9.633371566600627e-06, + "logits/chosen": 680959129.6, + "logits/rejected": 1028205056.0, + "logps/chosen": -308.9309814453125, + "logps/rejected": -861.58251953125, + "loss": 0.0267, + "rewards/chosen": 3.3808212280273438, + "rewards/margins": 15.961064656575521, + "rewards/rejected": -12.580243428548178, + "step": 1361 + }, + { + "epoch": 0.12444038373686615, + "grad_norm": 14.6875, + "kl": 0.0, + "learning_rate": 9.632830955983711e-06, + "logits/chosen": 776981312.0, + "logits/rejected": 918284032.0, + "logps/chosen": -305.06494140625, + "logps/rejected": -655.575927734375, + "loss": 0.0669, + "rewards/chosen": 2.453106641769409, + "rewards/margins": 7.664942502975464, + "rewards/rejected": -5.211835861206055, + "step": 1362 + }, + { + "epoch": 0.1245317496573778, + "grad_norm": 9.6875, + "kl": 6.1397552490234375, + "learning_rate": 9.6322899622746e-06, + "logits/chosen": 605963178.6666666, + "logits/rejected": 625426688.0, + "logps/chosen": -387.3714192708333, + "logps/rejected": -310.9007568359375, + "loss": 0.0785, + "rewards/chosen": 2.8970228830973306, + "rewards/margins": 7.562467734018961, + "rewards/rejected": -4.665444850921631, + "step": 1363 + }, + { + "epoch": 0.12462311557788945, + "grad_norm": 7.65625, + "kl": 0.0, + "learning_rate": 9.631748585518028e-06, + "logits/chosen": 519777706.6666667, + "logits/rejected": 473942528.0, + "logps/chosen": -370.1693522135417, + "logps/rejected": -383.7848205566406, + "loss": 0.0362, + "rewards/chosen": 3.566976229349772, + "rewards/margins": 8.171505610148111, + "rewards/rejected": -4.60452938079834, + "step": 1364 + }, + { + "epoch": 0.1247144814984011, + "grad_norm": 1.921875, + "kl": 0.0, + "learning_rate": 9.631206825758763e-06, + "logits/chosen": 502302293.3333333, + "logits/rejected": 495463577.6, + "logps/chosen": -321.1176350911458, + "logps/rejected": -514.727392578125, + "loss": 0.0106, + "rewards/chosen": 3.662470499674479, + "rewards/margins": 11.778086344401041, + "rewards/rejected": -8.115615844726562, + "step": 1365 + }, + { + "epoch": 0.12480584741891275, + "grad_norm": 2.171875, + "kl": 0.0, + "learning_rate": 9.630664683041601e-06, + "logits/chosen": 428908202.6666667, + "logits/rejected": 487412224.0, + "logps/chosen": -309.5619710286458, + "logps/rejected": -429.417529296875, + "loss": 0.0172, + "rewards/chosen": 3.3889684677124023, + "rewards/margins": 10.871717643737792, + "rewards/rejected": -7.482749176025391, + "step": 1366 + }, + { + "epoch": 0.1248972133394244, + "grad_norm": 21.5, + "kl": 0.0, + "learning_rate": 9.630122157411374e-06, + "logits/chosen": 943688832.0, + "logits/rejected": 312411946.6666667, + "logps/chosen": -675.928466796875, + "logps/rejected": -287.8398030598958, + "loss": 0.1011, + "rewards/chosen": 3.5670623779296875, + "rewards/margins": 8.161261558532715, + "rewards/rejected": -4.594199180603027, + "step": 1367 + }, + { + "epoch": 0.12498857925993605, + "grad_norm": 8.5625, + "kl": 0.0, + "learning_rate": 9.629579248912943e-06, + "logits/chosen": 532987818.6666667, + "logits/rejected": 377054464.0, + "logps/chosen": -202.46331787109375, + "logps/rejected": -371.7642822265625, + "loss": 0.0229, + "rewards/chosen": 3.956617991129557, + "rewards/margins": 10.748831431070963, + "rewards/rejected": -6.792213439941406, + "step": 1368 + }, + { + "epoch": 0.12507994518044768, + "grad_norm": 9.3125, + "kl": 0.0, + "learning_rate": 9.629035957591204e-06, + "logits/chosen": 1193366357.3333333, + "logits/rejected": 446437683.2, + "logps/chosen": -353.5336100260417, + "logps/rejected": -250.94189453125, + "loss": 0.0439, + "rewards/chosen": 3.2423248291015625, + "rewards/margins": 8.375240325927734, + "rewards/rejected": -5.132915496826172, + "step": 1369 + }, + { + "epoch": 0.12517131110095933, + "grad_norm": 14.0, + "kl": 0.0, + "learning_rate": 9.62849228349108e-06, + "logits/chosen": 685219754.6666666, + "logits/rejected": 416863232.0, + "logps/chosen": -463.6742350260417, + "logps/rejected": -272.296435546875, + "loss": 0.0919, + "rewards/chosen": 2.485189914703369, + "rewards/margins": 6.9853089332580565, + "rewards/rejected": -4.500119018554687, + "step": 1370 + }, + { + "epoch": 0.12526267702147098, + "grad_norm": 3.5, + "kl": 0.0, + "learning_rate": 9.627948226657527e-06, + "logits/chosen": 419047136.0, + "logits/rejected": 374499360.0, + "logps/chosen": -335.39825439453125, + "logps/rejected": -506.0757751464844, + "loss": 0.0243, + "rewards/chosen": 3.246490478515625, + "rewards/margins": 12.672834396362305, + "rewards/rejected": -9.42634391784668, + "step": 1371 + }, + { + "epoch": 0.12535404294198263, + "grad_norm": 8.5, + "kl": 0.0, + "learning_rate": 9.627403787135536e-06, + "logits/chosen": 367113024.0, + "logits/rejected": 354668608.0, + "logps/chosen": -240.91983032226562, + "logps/rejected": -263.5284729003906, + "loss": 0.0961, + "rewards/chosen": 2.520739793777466, + "rewards/margins": 7.9888756275177, + "rewards/rejected": -5.468135833740234, + "step": 1372 + }, + { + "epoch": 0.12544540886249428, + "grad_norm": 3.171875, + "kl": 0.0, + "learning_rate": 9.626858964970125e-06, + "logits/chosen": 376905557.3333333, + "logits/rejected": 1137152640.0, + "logps/chosen": -299.43157958984375, + "logps/rejected": -1248.162841796875, + "loss": 0.0194, + "rewards/chosen": 4.168231964111328, + "rewards/margins": 16.254578590393066, + "rewards/rejected": -12.086346626281738, + "step": 1373 + }, + { + "epoch": 0.12553677478300593, + "grad_norm": 12.0, + "kl": 0.0, + "learning_rate": 9.626313760206344e-06, + "logits/chosen": 684446105.6, + "logits/rejected": 718686037.3333334, + "logps/chosen": -387.8359375, + "logps/rejected": -687.396240234375, + "loss": 0.0435, + "rewards/chosen": 3.4035270690917967, + "rewards/margins": 11.35598258972168, + "rewards/rejected": -7.952455520629883, + "step": 1374 + }, + { + "epoch": 0.12562814070351758, + "grad_norm": 4.90625, + "kl": 0.0, + "learning_rate": 9.625768172889282e-06, + "logits/chosen": 448215722.6666667, + "logits/rejected": 421470912.0, + "logps/chosen": -257.1164957682292, + "logps/rejected": -550.0078125, + "loss": 0.1319, + "rewards/chosen": 2.739813804626465, + "rewards/margins": 13.36745834350586, + "rewards/rejected": -10.627644538879395, + "step": 1375 + }, + { + "epoch": 0.12571950662402923, + "grad_norm": 11.125, + "kl": 0.0, + "learning_rate": 9.62522220306405e-06, + "logits/chosen": 494347306.6666667, + "logits/rejected": 284120384.0, + "logps/chosen": -399.5262858072917, + "logps/rejected": -353.13677978515625, + "loss": 0.0489, + "rewards/chosen": 3.1409308115641275, + "rewards/margins": 10.575339953104654, + "rewards/rejected": -7.434409141540527, + "step": 1376 + }, + { + "epoch": 0.12581087254454087, + "grad_norm": 10.875, + "kl": 0.0, + "learning_rate": 9.624675850775794e-06, + "logits/chosen": 828903116.8, + "logits/rejected": 550915797.3333334, + "logps/chosen": -362.30361328125, + "logps/rejected": -560.147216796875, + "loss": 0.0892, + "rewards/chosen": 2.6189044952392577, + "rewards/margins": 10.58402608235677, + "rewards/rejected": -7.965121587117513, + "step": 1377 + }, + { + "epoch": 0.12590223846505252, + "grad_norm": 9.625, + "kl": 0.0, + "learning_rate": 9.624129116069695e-06, + "logits/chosen": 424277350.4, + "logits/rejected": 293062741.3333333, + "logps/chosen": -255.578662109375, + "logps/rejected": -350.7861328125, + "loss": 0.1494, + "rewards/chosen": 2.408502769470215, + "rewards/margins": 8.001903088887532, + "rewards/rejected": -5.593400319417317, + "step": 1378 + }, + { + "epoch": 0.12599360438556417, + "grad_norm": 11.4375, + "kl": 0.0, + "learning_rate": 9.623581998990961e-06, + "logits/chosen": 635831296.0, + "logits/rejected": 456372256.0, + "logps/chosen": -308.1392415364583, + "logps/rejected": -504.3356018066406, + "loss": 0.0819, + "rewards/chosen": 2.2561041514078775, + "rewards/margins": 8.49397341410319, + "rewards/rejected": -6.2378692626953125, + "step": 1379 + }, + { + "epoch": 0.12608497030607582, + "grad_norm": 2.5, + "kl": 0.0, + "learning_rate": 9.623034499584834e-06, + "logits/chosen": 1270054656.0, + "logits/rejected": 744775168.0, + "logps/chosen": -197.64703369140625, + "logps/rejected": -467.78759765625, + "loss": 0.0156, + "rewards/chosen": 2.7981057167053223, + "rewards/margins": 9.966917514801025, + "rewards/rejected": -7.168811798095703, + "step": 1380 + }, + { + "epoch": 0.12617633622658747, + "grad_norm": 4.8125, + "kl": 0.0, + "learning_rate": 9.62248661789659e-06, + "logits/chosen": 741737045.3333334, + "logits/rejected": 515104256.0, + "logps/chosen": -273.53346761067706, + "logps/rejected": -623.869189453125, + "loss": 0.0285, + "rewards/chosen": 2.6049315134684243, + "rewards/margins": 11.766339747111003, + "rewards/rejected": -9.161408233642579, + "step": 1381 + }, + { + "epoch": 0.12626770214709912, + "grad_norm": 1.0390625, + "kl": 0.0, + "learning_rate": 9.621938353971527e-06, + "logits/chosen": 1072409920.0, + "logits/rejected": 452686421.3333333, + "logps/chosen": -295.9803466796875, + "logps/rejected": -561.6116536458334, + "loss": 0.0051, + "rewards/chosen": 4.184051513671875, + "rewards/margins": 12.264520645141602, + "rewards/rejected": -8.080469131469727, + "step": 1382 + }, + { + "epoch": 0.12635906806761077, + "grad_norm": 3.078125, + "kl": 0.0, + "learning_rate": 9.621389707854987e-06, + "logits/chosen": 530579498.6666667, + "logits/rejected": 306812339.2, + "logps/chosen": -246.80546061197916, + "logps/rejected": -360.5772216796875, + "loss": 0.0151, + "rewards/chosen": 3.6021769841512046, + "rewards/margins": 10.016094525655111, + "rewards/rejected": -6.413917541503906, + "step": 1383 + }, + { + "epoch": 0.12645043398812242, + "grad_norm": 17.125, + "kl": 0.0, + "learning_rate": 9.620840679592337e-06, + "logits/chosen": 581473152.0, + "logits/rejected": 576711296.0, + "logps/chosen": -243.7812703450521, + "logps/rejected": -571.2039794921875, + "loss": 0.078, + "rewards/chosen": 3.0311403274536133, + "rewards/margins": 10.198047637939453, + "rewards/rejected": -7.16690731048584, + "step": 1384 + }, + { + "epoch": 0.12654179990863407, + "grad_norm": 6.125, + "kl": 0.0, + "learning_rate": 9.620291269228975e-06, + "logits/chosen": 697610154.6666666, + "logits/rejected": 532403712.0, + "logps/chosen": -166.03361002604166, + "logps/rejected": -440.542333984375, + "loss": 0.0553, + "rewards/chosen": 2.1557881037394204, + "rewards/margins": 8.951549975077311, + "rewards/rejected": -6.7957618713378904, + "step": 1385 + }, + { + "epoch": 0.12663316582914572, + "grad_norm": 5.21875, + "kl": 0.0, + "learning_rate": 9.619741476810334e-06, + "logits/chosen": 656336064.0, + "logits/rejected": 922448896.0, + "logps/chosen": -267.6826171875, + "logps/rejected": -564.0659790039062, + "loss": 0.0338, + "rewards/chosen": 2.8967483043670654, + "rewards/margins": 11.690274000167847, + "rewards/rejected": -8.793525695800781, + "step": 1386 + }, + { + "epoch": 0.12672453174965737, + "grad_norm": 6.65625, + "kl": 0.0, + "learning_rate": 9.619191302381873e-06, + "logits/chosen": 715265024.0, + "logits/rejected": 856540672.0, + "logps/chosen": -369.4033610026042, + "logps/rejected": -809.7929077148438, + "loss": 0.0409, + "rewards/chosen": 3.0835816065470376, + "rewards/margins": 12.908475557963053, + "rewards/rejected": -9.824893951416016, + "step": 1387 + }, + { + "epoch": 0.12681589767016901, + "grad_norm": 3.21875, + "kl": 0.0, + "learning_rate": 9.618640745989092e-06, + "logits/chosen": 625369344.0, + "logits/rejected": 959039616.0, + "logps/chosen": -268.6737060546875, + "logps/rejected": -520.86669921875, + "loss": 0.0234, + "rewards/chosen": 3.671633243560791, + "rewards/margins": 13.540771007537842, + "rewards/rejected": -9.86913776397705, + "step": 1388 + }, + { + "epoch": 0.12690726359068066, + "grad_norm": 8.625, + "kl": 0.0, + "learning_rate": 9.618089807677513e-06, + "logits/chosen": 684575040.0, + "logits/rejected": 559082496.0, + "logps/chosen": -393.02349853515625, + "logps/rejected": -401.3358154296875, + "loss": 0.0448, + "rewards/chosen": 2.6025819778442383, + "rewards/margins": 8.071639696757, + "rewards/rejected": -5.469057718912761, + "step": 1389 + }, + { + "epoch": 0.1269986295111923, + "grad_norm": 4.03125, + "kl": 0.0, + "learning_rate": 9.617538487492694e-06, + "logits/chosen": 360521045.3333333, + "logits/rejected": 466512896.0, + "logps/chosen": -297.2899983723958, + "logps/rejected": -336.7357482910156, + "loss": 0.0275, + "rewards/chosen": 3.3698641459147134, + "rewards/margins": 12.488835970560709, + "rewards/rejected": -9.118971824645996, + "step": 1390 + }, + { + "epoch": 0.12708999543170396, + "grad_norm": 15.8125, + "kl": 0.0, + "learning_rate": 9.616986785480225e-06, + "logits/chosen": 310935082.6666667, + "logits/rejected": 503687219.2, + "logps/chosen": -237.66058349609375, + "logps/rejected": -627.220361328125, + "loss": 0.1439, + "rewards/chosen": 1.3337982495625813, + "rewards/margins": 6.997252686818441, + "rewards/rejected": -5.66345443725586, + "step": 1391 + }, + { + "epoch": 0.1271813613522156, + "grad_norm": 7.4375, + "kl": 0.0, + "learning_rate": 9.616434701685727e-06, + "logits/chosen": 455981226.6666667, + "logits/rejected": 465769824.0, + "logps/chosen": -180.05985514322916, + "logps/rejected": -550.9747314453125, + "loss": 0.0447, + "rewards/chosen": 3.393169085184733, + "rewards/margins": 11.969229380289713, + "rewards/rejected": -8.57606029510498, + "step": 1392 + }, + { + "epoch": 0.12727272727272726, + "grad_norm": 17.0, + "kl": 0.0, + "learning_rate": 9.61588223615485e-06, + "logits/chosen": 494622924.8, + "logits/rejected": 1041733973.3333334, + "logps/chosen": -351.111279296875, + "logps/rejected": -611.7522786458334, + "loss": 0.0796, + "rewards/chosen": 2.425967979431152, + "rewards/margins": 11.89451643625895, + "rewards/rejected": -9.468548456827799, + "step": 1393 + }, + { + "epoch": 0.1273640931932389, + "grad_norm": 11.9375, + "kl": 0.0, + "learning_rate": 9.615329388933279e-06, + "logits/chosen": 606867669.3333334, + "logits/rejected": 365611904.0, + "logps/chosen": -279.6398111979167, + "logps/rejected": -363.4410705566406, + "loss": 0.1123, + "rewards/chosen": 3.257434844970703, + "rewards/margins": 7.180041313171387, + "rewards/rejected": -3.9226064682006836, + "step": 1394 + }, + { + "epoch": 0.12745545911375056, + "grad_norm": 21.125, + "kl": 0.0, + "learning_rate": 9.61477616006673e-06, + "logits/chosen": 621854464.0, + "logits/rejected": 522125909.3333333, + "logps/chosen": -409.733740234375, + "logps/rejected": -489.3111979166667, + "loss": 0.0746, + "rewards/chosen": 2.5737577438354493, + "rewards/margins": 9.756317710876464, + "rewards/rejected": -7.182559967041016, + "step": 1395 + }, + { + "epoch": 0.1275468250342622, + "grad_norm": 12.25, + "kl": 0.0, + "learning_rate": 9.61422254960095e-06, + "logits/chosen": 537725132.8, + "logits/rejected": 377677653.3333333, + "logps/chosen": -240.5783935546875, + "logps/rejected": -400.470947265625, + "loss": 0.0641, + "rewards/chosen": 3.615362548828125, + "rewards/margins": 10.328578694661458, + "rewards/rejected": -6.713216145833333, + "step": 1396 + }, + { + "epoch": 0.12763819095477386, + "grad_norm": 18.25, + "kl": 0.0, + "learning_rate": 9.613668557581716e-06, + "logits/chosen": 990501632.0, + "logits/rejected": 585244288.0, + "logps/chosen": -355.2638346354167, + "logps/rejected": -369.9687805175781, + "loss": 0.1068, + "rewards/chosen": 1.991859753926595, + "rewards/margins": 10.509768803914389, + "rewards/rejected": -8.517909049987793, + "step": 1397 + }, + { + "epoch": 0.1277295568752855, + "grad_norm": 4.96875, + "kl": 0.0, + "learning_rate": 9.61311418405484e-06, + "logits/chosen": 368735552.0, + "logits/rejected": 589443712.0, + "logps/chosen": -133.65521240234375, + "logps/rejected": -314.90283203125, + "loss": 0.1397, + "rewards/chosen": 3.373228073120117, + "rewards/margins": 7.525171756744385, + "rewards/rejected": -4.151943683624268, + "step": 1398 + }, + { + "epoch": 0.12782092279579715, + "grad_norm": 4.40625, + "kl": 0.0, + "learning_rate": 9.612559429066162e-06, + "logits/chosen": 408059584.0, + "logits/rejected": 425841056.0, + "logps/chosen": -240.08322143554688, + "logps/rejected": -356.5482177734375, + "loss": 0.0257, + "rewards/chosen": 3.320969581604004, + "rewards/margins": 9.781118392944336, + "rewards/rejected": -6.460148811340332, + "step": 1399 + }, + { + "epoch": 0.12791228871630883, + "grad_norm": 9.3125, + "kl": 0.0, + "learning_rate": 9.612004292661553e-06, + "logits/chosen": 503037659.4285714, + "logits/rejected": 1934544768.0, + "logps/chosen": -425.1974400111607, + "logps/rejected": -1331.125244140625, + "loss": 0.1923, + "rewards/chosen": 1.4872899736676897, + "rewards/margins": 21.888926097324916, + "rewards/rejected": -20.401636123657227, + "step": 1400 + }, + { + "epoch": 0.12800365463682048, + "grad_norm": 8.875, + "kl": 0.0, + "learning_rate": 9.611448774886925e-06, + "logits/chosen": 460094378.6666667, + "logits/rejected": 917373132.8, + "logps/chosen": -297.9839274088542, + "logps/rejected": -859.165625, + "loss": 0.0332, + "rewards/chosen": 3.1799545288085938, + "rewards/margins": 14.45994873046875, + "rewards/rejected": -11.279994201660156, + "step": 1401 + }, + { + "epoch": 0.12809502055733213, + "grad_norm": 3.96875, + "kl": 0.0, + "learning_rate": 9.610892875788206e-06, + "logits/chosen": 569715840.0, + "logits/rejected": 534725973.3333333, + "logps/chosen": -201.97537231445312, + "logps/rejected": -425.7365315755208, + "loss": 0.021, + "rewards/chosen": 2.960742235183716, + "rewards/margins": 9.060780763626099, + "rewards/rejected": -6.100038528442383, + "step": 1402 + }, + { + "epoch": 0.12818638647784378, + "grad_norm": 16.625, + "kl": 0.0, + "learning_rate": 9.610336595411368e-06, + "logits/chosen": 500959786.6666667, + "logits/rejected": 390060211.2, + "logps/chosen": -545.8497314453125, + "logps/rejected": -321.088818359375, + "loss": 0.0698, + "rewards/chosen": 1.8382892608642578, + "rewards/margins": 8.174296951293945, + "rewards/rejected": -6.336007690429687, + "step": 1403 + }, + { + "epoch": 0.12827775239835543, + "grad_norm": 8.6875, + "kl": 0.0, + "learning_rate": 9.60977993380241e-06, + "logits/chosen": 571818069.3333334, + "logits/rejected": 552103552.0, + "logps/chosen": -382.9781494140625, + "logps/rejected": -410.4862365722656, + "loss": 0.0471, + "rewards/chosen": 3.1708415349324546, + "rewards/margins": 10.459745724995932, + "rewards/rejected": -7.288904190063477, + "step": 1404 + }, + { + "epoch": 0.12836911831886708, + "grad_norm": 5.78125, + "kl": 0.0, + "learning_rate": 9.60922289100736e-06, + "logits/chosen": 633654784.0, + "logits/rejected": 357175040.0, + "logps/chosen": -354.0314025878906, + "logps/rejected": -477.4355773925781, + "loss": 0.0425, + "rewards/chosen": 2.436599016189575, + "rewards/margins": 9.356046438217163, + "rewards/rejected": -6.919447422027588, + "step": 1405 + }, + { + "epoch": 0.12846048423937872, + "grad_norm": 5.15625, + "kl": 0.0, + "learning_rate": 9.608665467072283e-06, + "logits/chosen": 751722154.6666666, + "logits/rejected": 814247424.0, + "logps/chosen": -283.14337158203125, + "logps/rejected": -399.4775390625, + "loss": 0.0397, + "rewards/chosen": 3.8004957834879556, + "rewards/margins": 11.138858954111734, + "rewards/rejected": -7.338363170623779, + "step": 1406 + }, + { + "epoch": 0.12855185015989037, + "grad_norm": 5.5625, + "kl": 0.0, + "learning_rate": 9.608107662043274e-06, + "logits/chosen": 820845860.5714285, + "logits/rejected": 417467264.0, + "logps/chosen": -370.495849609375, + "logps/rejected": -309.2186279296875, + "loss": 0.0408, + "rewards/chosen": 3.3473074776785716, + "rewards/margins": 9.009404931749616, + "rewards/rejected": -5.662097454071045, + "step": 1407 + }, + { + "epoch": 0.12864321608040202, + "grad_norm": 2.140625, + "kl": 0.0, + "learning_rate": 9.607549475966453e-06, + "logits/chosen": 466429866.6666667, + "logits/rejected": 702719539.2, + "logps/chosen": -136.75031534830728, + "logps/rejected": -476.936376953125, + "loss": 0.013, + "rewards/chosen": 4.035086949666341, + "rewards/margins": 11.66419766743978, + "rewards/rejected": -7.629110717773438, + "step": 1408 + }, + { + "epoch": 0.12873458200091367, + "grad_norm": 5.6875, + "kl": 0.0, + "learning_rate": 9.606990908887983e-06, + "logits/chosen": 927314329.6, + "logits/rejected": 584016256.0, + "logps/chosen": -409.11376953125, + "logps/rejected": -604.2401123046875, + "loss": 0.0285, + "rewards/chosen": 3.5236896514892577, + "rewards/margins": 10.118708038330078, + "rewards/rejected": -6.59501838684082, + "step": 1409 + }, + { + "epoch": 0.12882594792142532, + "grad_norm": 12.0, + "kl": 0.0, + "learning_rate": 9.60643196085405e-06, + "logits/chosen": 307811648.0, + "logits/rejected": 591869632.0, + "logps/chosen": -259.55670166015625, + "logps/rejected": -581.4935302734375, + "loss": 0.0564, + "rewards/chosen": 2.6754519939422607, + "rewards/margins": 8.651005983352661, + "rewards/rejected": -5.9755539894104, + "step": 1410 + }, + { + "epoch": 0.12891731384193697, + "grad_norm": 6.71875, + "kl": 0.0, + "learning_rate": 9.60587263191087e-06, + "logits/chosen": 394073600.0, + "logits/rejected": 321627882.6666667, + "logps/chosen": -271.103271484375, + "logps/rejected": -501.1176350911458, + "loss": 0.0484, + "rewards/chosen": 2.883663558959961, + "rewards/margins": 11.61481196085612, + "rewards/rejected": -8.731148401896158, + "step": 1411 + }, + { + "epoch": 0.12900867976244862, + "grad_norm": 3.640625, + "kl": 0.0, + "learning_rate": 9.605312922104702e-06, + "logits/chosen": 926807552.0, + "logits/rejected": 1101840640.0, + "logps/chosen": -265.32421875, + "logps/rejected": -706.1371256510416, + "loss": 0.0285, + "rewards/chosen": 3.0931705474853515, + "rewards/margins": 9.964673741658528, + "rewards/rejected": -6.871503194173177, + "step": 1412 + }, + { + "epoch": 0.12910004568296027, + "grad_norm": 4.5625, + "kl": 0.0, + "learning_rate": 9.60475283148182e-06, + "logits/chosen": 738221504.0, + "logits/rejected": 410177280.0, + "logps/chosen": -581.8577270507812, + "logps/rejected": -499.6614176432292, + "loss": 0.0218, + "rewards/chosen": 2.796832323074341, + "rewards/margins": 10.923205614089966, + "rewards/rejected": -8.126373291015625, + "step": 1413 + }, + { + "epoch": 0.12919141160347192, + "grad_norm": 16.875, + "kl": 0.0, + "learning_rate": 9.604192360088546e-06, + "logits/chosen": 1112416256.0, + "logits/rejected": 668489386.6666666, + "logps/chosen": -313.15615234375, + "logps/rejected": -505.3749186197917, + "loss": 0.0944, + "rewards/chosen": 2.751566505432129, + "rewards/margins": 6.653189595540365, + "rewards/rejected": -3.901623090108236, + "step": 1414 + }, + { + "epoch": 0.12928277752398357, + "grad_norm": 11.625, + "kl": 0.0, + "learning_rate": 9.603631507971222e-06, + "logits/chosen": 480275104.0, + "logits/rejected": 394592448.0, + "logps/chosen": -227.78244018554688, + "logps/rejected": -366.2308654785156, + "loss": 0.0617, + "rewards/chosen": 3.7083067893981934, + "rewards/margins": 7.455958127975464, + "rewards/rejected": -3.7476513385772705, + "step": 1415 + }, + { + "epoch": 0.12937414344449522, + "grad_norm": 3.890625, + "kl": 0.0, + "learning_rate": 9.603070275176225e-06, + "logits/chosen": 475055296.0, + "logits/rejected": 115840832.0, + "logps/chosen": -327.0112609863281, + "logps/rejected": -382.9075012207031, + "loss": 0.0189, + "rewards/chosen": 3.4956274032592773, + "rewards/margins": 11.09447717666626, + "rewards/rejected": -7.598849773406982, + "step": 1416 + }, + { + "epoch": 0.12946550936500686, + "grad_norm": 11.3125, + "kl": 0.0, + "learning_rate": 9.602508661749967e-06, + "logits/chosen": 452088128.0, + "logits/rejected": 592658112.0, + "logps/chosen": -602.2073974609375, + "logps/rejected": -769.9283447265625, + "loss": 0.0473, + "rewards/chosen": 2.3843164443969727, + "rewards/margins": 10.212915897369385, + "rewards/rejected": -7.828599452972412, + "step": 1417 + }, + { + "epoch": 0.1295568752855185, + "grad_norm": 14.6875, + "kl": 0.0, + "learning_rate": 9.601946667738884e-06, + "logits/chosen": 526489292.8, + "logits/rejected": 420222165.3333333, + "logps/chosen": -342.1037109375, + "logps/rejected": -911.6749674479166, + "loss": 0.0653, + "rewards/chosen": 2.839887237548828, + "rewards/margins": 12.058179473876953, + "rewards/rejected": -9.218292236328125, + "step": 1418 + }, + { + "epoch": 0.12964824120603016, + "grad_norm": 5.71875, + "kl": 0.0, + "learning_rate": 9.601384293189449e-06, + "logits/chosen": 1021653589.3333334, + "logits/rejected": 438985536.0, + "logps/chosen": -312.0187581380208, + "logps/rejected": -280.06146240234375, + "loss": 0.0404, + "rewards/chosen": 3.1635303497314453, + "rewards/margins": 8.150585651397705, + "rewards/rejected": -4.98705530166626, + "step": 1419 + }, + { + "epoch": 0.1297396071265418, + "grad_norm": 4.5, + "kl": 0.0, + "learning_rate": 9.600821538148167e-06, + "logits/chosen": 555860096.0, + "logits/rejected": 322751424.0, + "logps/chosen": -361.918701171875, + "logps/rejected": -306.9178466796875, + "loss": 0.0276, + "rewards/chosen": 2.9913997650146484, + "rewards/margins": 9.654865264892578, + "rewards/rejected": -6.66346549987793, + "step": 1420 + }, + { + "epoch": 0.12983097304705346, + "grad_norm": 6.375, + "kl": 0.0, + "learning_rate": 9.60025840266157e-06, + "logits/chosen": 1356277674.6666667, + "logits/rejected": 642658918.4, + "logps/chosen": -372.0997721354167, + "logps/rejected": -485.12060546875, + "loss": 0.0252, + "rewards/chosen": 4.131671905517578, + "rewards/margins": 11.92098159790039, + "rewards/rejected": -7.789309692382813, + "step": 1421 + }, + { + "epoch": 0.1299223389675651, + "grad_norm": 4.5, + "kl": 0.0, + "learning_rate": 9.599694886776227e-06, + "logits/chosen": 670332416.0, + "logits/rejected": 361531818.6666667, + "logps/chosen": -335.8970458984375, + "logps/rejected": -446.2346598307292, + "loss": 0.0301, + "rewards/chosen": 3.0038394927978516, + "rewards/margins": 12.945571263631185, + "rewards/rejected": -9.941731770833334, + "step": 1422 + }, + { + "epoch": 0.13001370488807676, + "grad_norm": 3.71875, + "kl": 0.0, + "learning_rate": 9.599130990538733e-06, + "logits/chosen": 538404053.3333334, + "logits/rejected": 481960755.2, + "logps/chosen": -443.914306640625, + "logps/rejected": -401.8790283203125, + "loss": 0.0221, + "rewards/chosen": 2.945982297261556, + "rewards/margins": 9.834590848286947, + "rewards/rejected": -6.88860855102539, + "step": 1423 + }, + { + "epoch": 0.1301050708085884, + "grad_norm": 5.5625, + "kl": 0.0, + "learning_rate": 9.598566713995718e-06, + "logits/chosen": 606935424.0, + "logits/rejected": 262817984.0, + "logps/chosen": -448.90594482421875, + "logps/rejected": -320.71966552734375, + "loss": 0.0322, + "rewards/chosen": 3.6334710121154785, + "rewards/margins": 9.373148441314697, + "rewards/rejected": -5.739677429199219, + "step": 1424 + }, + { + "epoch": 0.13019643672910006, + "grad_norm": 2.265625, + "kl": 0.0, + "learning_rate": 9.598002057193842e-06, + "logits/chosen": 516871104.0, + "logits/rejected": 374570922.6666667, + "logps/chosen": -426.2325744628906, + "logps/rejected": -458.4329020182292, + "loss": 0.0074, + "rewards/chosen": 4.291833877563477, + "rewards/margins": 10.696137110392254, + "rewards/rejected": -6.404303232828776, + "step": 1425 + }, + { + "epoch": 0.1302878026496117, + "grad_norm": 0.9765625, + "kl": 0.0, + "learning_rate": 9.597437020179798e-06, + "logits/chosen": 207596096.0, + "logits/rejected": 564504722.2857143, + "logps/chosen": -96.68807983398438, + "logps/rejected": -398.8389369419643, + "loss": 0.0052, + "rewards/chosen": 3.854052782058716, + "rewards/margins": 10.30923226901463, + "rewards/rejected": -6.455179486955915, + "step": 1426 + }, + { + "epoch": 0.13037916857012335, + "grad_norm": 4.78125, + "kl": 0.0, + "learning_rate": 9.596871603000309e-06, + "logits/chosen": 630318528.0, + "logits/rejected": 755095744.0, + "logps/chosen": -386.69873046875, + "logps/rejected": -582.0459594726562, + "loss": 0.0308, + "rewards/chosen": 3.162506580352783, + "rewards/margins": 11.754320621490479, + "rewards/rejected": -8.591814041137695, + "step": 1427 + }, + { + "epoch": 0.130470534490635, + "grad_norm": 6.3125, + "kl": 0.0, + "learning_rate": 9.596305805702129e-06, + "logits/chosen": 654329958.4, + "logits/rejected": 529966720.0, + "logps/chosen": -394.090185546875, + "logps/rejected": -553.2674153645834, + "loss": 0.0332, + "rewards/chosen": 3.108600616455078, + "rewards/margins": 10.94968096415202, + "rewards/rejected": -7.84108034769694, + "step": 1428 + }, + { + "epoch": 0.13056190041114665, + "grad_norm": 6.125, + "kl": 0.0, + "learning_rate": 9.595739628332044e-06, + "logits/chosen": 1412362752.0, + "logits/rejected": 665901632.0, + "logps/chosen": -280.6212463378906, + "logps/rejected": -345.40704345703125, + "loss": 0.0641, + "rewards/chosen": 2.5445899963378906, + "rewards/margins": 10.32478141784668, + "rewards/rejected": -7.780191421508789, + "step": 1429 + }, + { + "epoch": 0.1306532663316583, + "grad_norm": 4.3125, + "kl": 0.0, + "learning_rate": 9.595173070936873e-06, + "logits/chosen": 818078976.0, + "logits/rejected": 557108906.6666666, + "logps/chosen": -474.1748046875, + "logps/rejected": -548.1848551432291, + "loss": 0.0197, + "rewards/chosen": 2.648581027984619, + "rewards/margins": 11.733379205067953, + "rewards/rejected": -9.084798177083334, + "step": 1430 + }, + { + "epoch": 0.13074463225216995, + "grad_norm": 7.5625, + "kl": 0.0, + "learning_rate": 9.594606133563465e-06, + "logits/chosen": 828801792.0, + "logits/rejected": 1119048704.0, + "logps/chosen": -442.7583414713542, + "logps/rejected": -530.83642578125, + "loss": 0.0329, + "rewards/chosen": 2.8849493662516275, + "rewards/margins": 11.786027399698893, + "rewards/rejected": -8.901078033447266, + "step": 1431 + }, + { + "epoch": 0.1308359981726816, + "grad_norm": 15.125, + "kl": 0.0, + "learning_rate": 9.594038816258698e-06, + "logits/chosen": 494817472.0, + "logits/rejected": 782751744.0, + "logps/chosen": -266.86273193359375, + "logps/rejected": -474.9119466145833, + "loss": 0.0499, + "rewards/chosen": 4.001988887786865, + "rewards/margins": 10.47470744450887, + "rewards/rejected": -6.472718556722005, + "step": 1432 + }, + { + "epoch": 0.13092736409319325, + "grad_norm": 14.6875, + "kl": 0.0, + "learning_rate": 9.593471119069486e-06, + "logits/chosen": 733738496.0, + "logits/rejected": 508939968.0, + "logps/chosen": -414.6067592075893, + "logps/rejected": -590.3931274414062, + "loss": 0.0524, + "rewards/chosen": 3.0839554922921315, + "rewards/margins": 13.817262377057757, + "rewards/rejected": -10.733306884765625, + "step": 1433 + }, + { + "epoch": 0.1310187300137049, + "grad_norm": 1.7734375, + "kl": 0.0, + "learning_rate": 9.59290304204277e-06, + "logits/chosen": 873469952.0, + "logits/rejected": 660730922.6666666, + "logps/chosen": -351.52119140625, + "logps/rejected": -363.3902994791667, + "loss": 0.0205, + "rewards/chosen": 3.8397640228271483, + "rewards/margins": 11.91534907023112, + "rewards/rejected": -8.07558504740397, + "step": 1434 + }, + { + "epoch": 0.13111009593421655, + "grad_norm": 19.75, + "kl": 0.0, + "learning_rate": 9.592334585225532e-06, + "logits/chosen": 591405824.0, + "logits/rejected": 903750314.6666666, + "logps/chosen": -340.1013671875, + "logps/rejected": -665.1214599609375, + "loss": 0.1856, + "rewards/chosen": 3.0470657348632812, + "rewards/margins": 7.226085662841797, + "rewards/rejected": -4.179019927978516, + "step": 1435 + }, + { + "epoch": 0.1312014618547282, + "grad_norm": 11.5625, + "kl": 0.0, + "learning_rate": 9.59176574866477e-06, + "logits/chosen": 603973888.0, + "logits/rejected": 364377139.2, + "logps/chosen": -377.5157063802083, + "logps/rejected": -484.944482421875, + "loss": 0.0687, + "rewards/chosen": 1.6826752026875813, + "rewards/margins": 10.530371125539144, + "rewards/rejected": -8.847695922851562, + "step": 1436 + }, + { + "epoch": 0.13129282777523985, + "grad_norm": 3.390625, + "kl": 0.0, + "learning_rate": 9.591196532407522e-06, + "logits/chosen": 891415296.0, + "logits/rejected": 590543360.0, + "logps/chosen": -270.51312255859375, + "logps/rejected": -414.65240478515625, + "loss": 0.026, + "rewards/chosen": 2.9293770790100098, + "rewards/margins": 11.67506456375122, + "rewards/rejected": -8.745687484741211, + "step": 1437 + }, + { + "epoch": 0.1313841936957515, + "grad_norm": 10.375, + "kl": 0.0, + "learning_rate": 9.590626936500862e-06, + "logits/chosen": 457464729.6, + "logits/rejected": 719265962.6666666, + "logps/chosen": -297.71435546875, + "logps/rejected": -451.0159912109375, + "loss": 0.0801, + "rewards/chosen": 2.3684234619140625, + "rewards/margins": 8.243789672851562, + "rewards/rejected": -5.8753662109375, + "step": 1438 + }, + { + "epoch": 0.13147555961626314, + "grad_norm": 5.6875, + "kl": 0.0, + "learning_rate": 9.590056960991885e-06, + "logits/chosen": 491475776.0, + "logits/rejected": 277527168.0, + "logps/chosen": -328.68231201171875, + "logps/rejected": -314.2965393066406, + "loss": 0.0277, + "rewards/chosen": 3.2536368370056152, + "rewards/margins": 9.742805004119873, + "rewards/rejected": -6.489168167114258, + "step": 1439 + }, + { + "epoch": 0.1315669255367748, + "grad_norm": 7.875, + "kl": 0.0, + "learning_rate": 9.589486605927726e-06, + "logits/chosen": 600456362.6666666, + "logits/rejected": 448166502.4, + "logps/chosen": -440.962890625, + "logps/rejected": -245.819580078125, + "loss": 0.0814, + "rewards/chosen": 4.0509999593098955, + "rewards/margins": 10.499100240071614, + "rewards/rejected": -6.448100280761719, + "step": 1440 + }, + { + "epoch": 0.13165829145728644, + "grad_norm": 3.734375, + "kl": 0.0, + "learning_rate": 9.588915871355549e-06, + "logits/chosen": 751794624.0, + "logits/rejected": 333851776.0, + "logps/chosen": -431.24395751953125, + "logps/rejected": -320.4134216308594, + "loss": 0.0198, + "rewards/chosen": 3.5859811305999756, + "rewards/margins": 10.803198099136353, + "rewards/rejected": -7.217216968536377, + "step": 1441 + }, + { + "epoch": 0.1317496573777981, + "grad_norm": 5.90625, + "kl": 0.0, + "learning_rate": 9.588344757322546e-06, + "logits/chosen": 543249066.6666666, + "logits/rejected": 888919449.6, + "logps/chosen": -388.2170817057292, + "logps/rejected": -430.118896484375, + "loss": 0.0243, + "rewards/chosen": 3.518335978190104, + "rewards/margins": 10.189410654703776, + "rewards/rejected": -6.671074676513672, + "step": 1442 + }, + { + "epoch": 0.13184102329830974, + "grad_norm": 14.875, + "kl": 0.0, + "learning_rate": 9.587773263875943e-06, + "logits/chosen": 521775040.0, + "logits/rejected": 222307280.0, + "logps/chosen": -250.8109588623047, + "logps/rejected": -299.7600402832031, + "loss": 0.058, + "rewards/chosen": 3.8906068801879883, + "rewards/margins": 9.48281717300415, + "rewards/rejected": -5.592210292816162, + "step": 1443 + }, + { + "epoch": 0.1319323892188214, + "grad_norm": 4.65625, + "kl": 0.0, + "learning_rate": 9.587201391062997e-06, + "logits/chosen": 205873440.0, + "logits/rejected": 495079936.0, + "logps/chosen": -182.99114990234375, + "logps/rejected": -391.0120849609375, + "loss": 0.0301, + "rewards/chosen": 3.8387913703918457, + "rewards/margins": 10.826915264129639, + "rewards/rejected": -6.988123893737793, + "step": 1444 + }, + { + "epoch": 0.13202375513933304, + "grad_norm": 9.9375, + "kl": 0.0, + "learning_rate": 9.586629138930998e-06, + "logits/chosen": 650517888.0, + "logits/rejected": 472497356.8, + "logps/chosen": -487.4984537760417, + "logps/rejected": -433.382861328125, + "loss": 0.0457, + "rewards/chosen": 2.7733306884765625, + "rewards/margins": 9.496340942382812, + "rewards/rejected": -6.72301025390625, + "step": 1445 + }, + { + "epoch": 0.1321151210598447, + "grad_norm": 3.484375, + "kl": 0.0, + "learning_rate": 9.586056507527266e-06, + "logits/rejected": 696017536.0, + "logps/rejected": -504.1128845214844, + "loss": 0.0086, + "rewards/rejected": -10.130104064941406, + "step": 1446 + }, + { + "epoch": 0.13220648698035634, + "grad_norm": 6.15625, + "kl": 0.0, + "learning_rate": 9.585483496899151e-06, + "logits/chosen": 569620114.2857143, + "logits/rejected": 350927712.0, + "logps/chosen": -384.4368373325893, + "logps/rejected": -622.014404296875, + "loss": 0.0439, + "rewards/chosen": 3.130660738263811, + "rewards/margins": 13.663253511701312, + "rewards/rejected": -10.5325927734375, + "step": 1447 + }, + { + "epoch": 0.13229785290086798, + "grad_norm": 10.0625, + "kl": 0.0, + "learning_rate": 9.584910107094036e-06, + "logits/chosen": 492577248.0, + "logits/rejected": 539817344.0, + "logps/chosen": -332.5322265625, + "logps/rejected": -623.6400756835938, + "loss": 0.0646, + "rewards/chosen": 2.098266124725342, + "rewards/margins": 8.54408884048462, + "rewards/rejected": -6.445822715759277, + "step": 1448 + }, + { + "epoch": 0.13238921882137963, + "grad_norm": 8.8125, + "kl": 0.0, + "learning_rate": 9.584336338159336e-06, + "logits/chosen": 1517661696.0, + "logits/rejected": 599304806.4, + "logps/chosen": -278.6700439453125, + "logps/rejected": -351.58544921875, + "loss": 0.0442, + "rewards/chosen": 2.139467398325602, + "rewards/margins": 9.126071707407633, + "rewards/rejected": -6.986604309082031, + "step": 1449 + }, + { + "epoch": 0.13248058474189128, + "grad_norm": 3.015625, + "kl": 0.0, + "learning_rate": 9.583762190142494e-06, + "logits/chosen": 973046613.3333334, + "logits/rejected": 414842880.0, + "logps/chosen": -331.0968424479167, + "logps/rejected": -453.55166015625, + "loss": 0.0183, + "rewards/chosen": 3.4978240331014, + "rewards/margins": 12.942224057515462, + "rewards/rejected": -9.444400024414062, + "step": 1450 + }, + { + "epoch": 0.13257195066240293, + "grad_norm": 9.375, + "kl": 0.0, + "learning_rate": 9.58318766309099e-06, + "logits/chosen": 428407936.0, + "logits/rejected": 365406912.0, + "logps/chosen": -224.73812866210938, + "logps/rejected": -325.8991394042969, + "loss": 0.0375, + "rewards/chosen": 3.0710086822509766, + "rewards/margins": 9.626442909240723, + "rewards/rejected": -6.555434226989746, + "step": 1451 + }, + { + "epoch": 0.13266331658291458, + "grad_norm": 16.5, + "kl": 0.0, + "learning_rate": 9.58261275705233e-06, + "logits/chosen": 454036070.4, + "logits/rejected": 266456362.66666666, + "logps/chosen": -395.0309814453125, + "logps/rejected": -412.9585774739583, + "loss": 0.069, + "rewards/chosen": 3.0428306579589846, + "rewards/margins": 12.94616241455078, + "rewards/rejected": -9.903331756591797, + "step": 1452 + }, + { + "epoch": 0.13275468250342623, + "grad_norm": 9.125, + "kl": 0.0, + "learning_rate": 9.582037472074054e-06, + "logits/chosen": 332213299.2, + "logits/rejected": 406239829.3333333, + "logps/chosen": -373.6131591796875, + "logps/rejected": -385.5362141927083, + "loss": 0.0359, + "rewards/chosen": 3.492748260498047, + "rewards/margins": 10.723164240519207, + "rewards/rejected": -7.230415980021159, + "step": 1453 + }, + { + "epoch": 0.13284604842393788, + "grad_norm": 5.09375, + "kl": 0.0, + "learning_rate": 9.581461808203732e-06, + "logits/chosen": 604059648.0, + "logits/rejected": 583378944.0, + "logps/chosen": -258.3904296875, + "logps/rejected": -389.8308919270833, + "loss": 0.043, + "rewards/chosen": 3.2517425537109377, + "rewards/margins": 9.824931971232097, + "rewards/rejected": -6.573189417521159, + "step": 1454 + }, + { + "epoch": 0.13293741434444953, + "grad_norm": 6.78125, + "kl": 0.0, + "learning_rate": 9.580885765488968e-06, + "logits/chosen": 663036330.6666666, + "logits/rejected": 1132183347.2, + "logps/chosen": -281.28810628255206, + "logps/rejected": -487.22744140625, + "loss": 0.0378, + "rewards/chosen": 2.7902212142944336, + "rewards/margins": 11.120639610290528, + "rewards/rejected": -8.330418395996094, + "step": 1455 + }, + { + "epoch": 0.13302878026496118, + "grad_norm": 3.0, + "kl": 0.0, + "learning_rate": 9.580309343977391e-06, + "logits/chosen": 336009824.0, + "logits/rejected": 418928832.0, + "logps/chosen": -281.18157958984375, + "logps/rejected": -502.039306640625, + "loss": 0.0204, + "rewards/chosen": 3.581664800643921, + "rewards/margins": 12.788211584091187, + "rewards/rejected": -9.206546783447266, + "step": 1456 + }, + { + "epoch": 0.13312014618547283, + "grad_norm": 6.5625, + "kl": 0.0, + "learning_rate": 9.579732543716673e-06, + "logits/chosen": 411587136.0, + "logits/rejected": 406648256.0, + "logps/chosen": -210.97940063476562, + "logps/rejected": -510.31103515625, + "loss": 0.0494, + "rewards/chosen": 2.266929864883423, + "rewards/margins": 13.764396905899048, + "rewards/rejected": -11.497467041015625, + "step": 1457 + }, + { + "epoch": 0.13321151210598448, + "grad_norm": 2.796875, + "kl": 0.0, + "learning_rate": 9.579155364754501e-06, + "logits/chosen": 539853781.3333334, + "logits/rejected": 611888640.0, + "logps/chosen": -276.2475992838542, + "logps/rejected": -608.344140625, + "loss": 0.0153, + "rewards/chosen": 3.7755324045817056, + "rewards/margins": 11.954260126749674, + "rewards/rejected": -8.178727722167968, + "step": 1458 + }, + { + "epoch": 0.13330287802649612, + "grad_norm": 6.53125, + "kl": 0.0, + "learning_rate": 9.57857780713861e-06, + "logits/chosen": 568184985.6, + "logits/rejected": 580082304.0, + "logps/chosen": -355.347900390625, + "logps/rejected": -586.220703125, + "loss": 0.0479, + "rewards/chosen": 2.5925628662109377, + "rewards/margins": 11.995223236083984, + "rewards/rejected": -9.402660369873047, + "step": 1459 + }, + { + "epoch": 0.13339424394700777, + "grad_norm": 6.65625, + "kl": 0.0, + "learning_rate": 9.577999870916757e-06, + "logits/chosen": 688567872.0, + "logits/rejected": 681623808.0, + "logps/chosen": -387.7049560546875, + "logps/rejected": -420.748291015625, + "loss": 0.0435, + "rewards/chosen": 2.5936858654022217, + "rewards/margins": 11.60941195487976, + "rewards/rejected": -9.015726089477539, + "step": 1460 + }, + { + "epoch": 0.13348560986751942, + "grad_norm": 2.234375, + "kl": 0.0, + "learning_rate": 9.57742155613673e-06, + "logits/chosen": 980409856.0, + "logits/rejected": 706193920.0, + "logps/chosen": -270.12225341796875, + "logps/rejected": -450.19769287109375, + "loss": 0.0131, + "rewards/chosen": 3.8736391067504883, + "rewards/margins": 14.071303367614746, + "rewards/rejected": -10.197664260864258, + "step": 1461 + }, + { + "epoch": 0.13357697578803107, + "grad_norm": 13.375, + "kl": 0.0, + "learning_rate": 9.576842862846349e-06, + "logits/chosen": 682139699.2, + "logits/rejected": 587400960.0, + "logps/chosen": -331.33388671875, + "logps/rejected": -331.89809163411456, + "loss": 0.2315, + "rewards/chosen": 1.0700000762939452, + "rewards/margins": 8.534767532348633, + "rewards/rejected": -7.4647674560546875, + "step": 1462 + }, + { + "epoch": 0.13366834170854272, + "grad_norm": 7.21875, + "kl": 0.0, + "learning_rate": 9.57626379109347e-06, + "logits/chosen": 424396160.0, + "logits/rejected": 556732672.0, + "logps/chosen": -462.7802734375, + "logps/rejected": -528.5123697916666, + "loss": 0.0204, + "rewards/chosen": 3.337360382080078, + "rewards/margins": 12.213146209716797, + "rewards/rejected": -8.875785827636719, + "step": 1463 + }, + { + "epoch": 0.13375970762905437, + "grad_norm": 4.9375, + "kl": 0.0, + "learning_rate": 9.575684340925977e-06, + "logits/chosen": 932427264.0, + "logits/rejected": 547273216.0, + "logps/chosen": -462.2196044921875, + "logps/rejected": -502.62451171875, + "loss": 0.0239, + "rewards/chosen": 2.818567911783854, + "rewards/margins": 9.576597086588542, + "rewards/rejected": -6.758029174804688, + "step": 1464 + }, + { + "epoch": 0.13385107354956602, + "grad_norm": 5.03125, + "kl": 0.0, + "learning_rate": 9.575104512391782e-06, + "logits/chosen": 841609130.6666666, + "logits/rejected": 473684736.0, + "logps/chosen": -472.3855387369792, + "logps/rejected": -392.0978698730469, + "loss": 0.0399, + "rewards/chosen": 3.082475026448568, + "rewards/margins": 10.68283780415853, + "rewards/rejected": -7.600362777709961, + "step": 1465 + }, + { + "epoch": 0.13394243947007767, + "grad_norm": 1.390625, + "kl": 0.0, + "learning_rate": 9.574524305538833e-06, + "logits/chosen": 476805973.3333333, + "logits/rejected": 618985164.8, + "logps/chosen": -236.24759928385416, + "logps/rejected": -608.689453125, + "loss": 0.0095, + "rewards/chosen": 3.848705291748047, + "rewards/margins": 12.137744903564453, + "rewards/rejected": -8.289039611816406, + "step": 1466 + }, + { + "epoch": 0.13403380539058932, + "grad_norm": 3.375, + "kl": 0.0, + "learning_rate": 9.573943720415109e-06, + "logits/chosen": 495527296.0, + "logits/rejected": 521588070.4, + "logps/chosen": -482.7332763671875, + "logps/rejected": -831.4421875, + "loss": 0.0137, + "rewards/chosen": 3.8136119842529297, + "rewards/margins": 14.616254806518555, + "rewards/rejected": -10.802642822265625, + "step": 1467 + }, + { + "epoch": 0.13412517131110097, + "grad_norm": 2.953125, + "kl": 0.0, + "learning_rate": 9.573362757068616e-06, + "logits/chosen": 467305408.0, + "logits/rejected": 797049024.0, + "logps/chosen": -316.1202697753906, + "logps/rejected": -472.69696044921875, + "loss": 0.0148, + "rewards/chosen": 4.02723503112793, + "rewards/margins": 11.82332706451416, + "rewards/rejected": -7.7960920333862305, + "step": 1468 + }, + { + "epoch": 0.13421653723161261, + "grad_norm": 8.9375, + "kl": 0.0, + "learning_rate": 9.572781415547397e-06, + "logits/chosen": 433922662.4, + "logits/rejected": 645865557.3333334, + "logps/chosen": -339.433642578125, + "logps/rejected": -362.179931640625, + "loss": 0.0433, + "rewards/chosen": 3.678772735595703, + "rewards/margins": 10.264096069335938, + "rewards/rejected": -6.585323333740234, + "step": 1469 + }, + { + "epoch": 0.13430790315212426, + "grad_norm": 2.9375, + "kl": 0.0, + "learning_rate": 9.572199695899522e-06, + "logits/chosen": 703560832.0, + "logits/rejected": 458195712.0, + "logps/chosen": -446.2294921875, + "logps/rejected": -552.0042724609375, + "loss": 0.0199, + "rewards/chosen": 3.2504920959472656, + "rewards/margins": 11.200800895690918, + "rewards/rejected": -7.950308799743652, + "step": 1470 + }, + { + "epoch": 0.1343992690726359, + "grad_norm": 5.28125, + "kl": 0.0, + "learning_rate": 9.571617598173097e-06, + "logits/chosen": 580705152.0, + "logits/rejected": 395387776.0, + "logps/chosen": -404.8704427083333, + "logps/rejected": -477.6243896484375, + "loss": 0.0394, + "rewards/chosen": 3.021735509236654, + "rewards/margins": 14.886475880940756, + "rewards/rejected": -11.864740371704102, + "step": 1471 + }, + { + "epoch": 0.13449063499314756, + "grad_norm": 9.0625, + "kl": 0.0, + "learning_rate": 9.571035122416252e-06, + "logits/chosen": 337545152.0, + "logits/rejected": 601534592.0, + "logps/chosen": -209.85052490234375, + "logps/rejected": -486.0546875, + "loss": 0.0364, + "rewards/chosen": 3.333740711212158, + "rewards/margins": 11.431545734405518, + "rewards/rejected": -8.09780502319336, + "step": 1472 + }, + { + "epoch": 0.1345820009136592, + "grad_norm": 11.8125, + "kl": 0.0, + "learning_rate": 9.570452268677153e-06, + "logits/chosen": 486751268.5714286, + "logits/rejected": 693005504.0, + "logps/chosen": -352.25279017857144, + "logps/rejected": -383.5732421875, + "loss": 0.0738, + "rewards/chosen": 2.685356685093471, + "rewards/margins": 9.367543765476771, + "rewards/rejected": -6.682187080383301, + "step": 1473 + }, + { + "epoch": 0.13467336683417086, + "grad_norm": 3.40625, + "kl": 0.0, + "learning_rate": 9.569869037003998e-06, + "logits/chosen": 600338816.0, + "logits/rejected": 544679744.0, + "logps/chosen": -356.0252685546875, + "logps/rejected": -448.5143737792969, + "loss": 0.0225, + "rewards/chosen": 3.0943856239318848, + "rewards/margins": 11.058109760284424, + "rewards/rejected": -7.963724136352539, + "step": 1474 + }, + { + "epoch": 0.1347647327546825, + "grad_norm": 2.609375, + "kl": 0.0, + "learning_rate": 9.569285427445016e-06, + "logits/chosen": 1098557568.0, + "logits/rejected": 635678656.0, + "logps/chosen": -429.5206604003906, + "logps/rejected": -489.071044921875, + "loss": 0.0122, + "rewards/chosen": 4.135315895080566, + "rewards/margins": 13.900686264038086, + "rewards/rejected": -9.76537036895752, + "step": 1475 + }, + { + "epoch": 0.13485609867519416, + "grad_norm": 3.921875, + "kl": 0.0, + "learning_rate": 9.568701440048463e-06, + "logits/chosen": 404776021.3333333, + "logits/rejected": 525412454.4, + "logps/chosen": -262.3176676432292, + "logps/rejected": -472.3908203125, + "loss": 0.0221, + "rewards/chosen": 3.3455686569213867, + "rewards/margins": 10.48246364593506, + "rewards/rejected": -7.136894989013672, + "step": 1476 + }, + { + "epoch": 0.1349474645957058, + "grad_norm": 11.8125, + "kl": 0.0, + "learning_rate": 9.568117074862633e-06, + "logits/chosen": 437451980.8, + "logits/rejected": 856301312.0, + "logps/chosen": -236.69990234375, + "logps/rejected": -847.7158203125, + "loss": 0.0491, + "rewards/chosen": 2.9543107986450194, + "rewards/margins": 17.700225257873534, + "rewards/rejected": -14.745914459228516, + "step": 1477 + }, + { + "epoch": 0.13503883051621746, + "grad_norm": 1.078125, + "kl": 0.0, + "learning_rate": 9.567532331935843e-06, + "logits/chosen": 627045760.0, + "logits/rejected": 523948757.3333333, + "logps/chosen": -202.96600341796875, + "logps/rejected": -329.0877278645833, + "loss": 0.0061, + "rewards/chosen": 4.6371564865112305, + "rewards/margins": 11.402181307474773, + "rewards/rejected": -6.765024820963542, + "step": 1478 + }, + { + "epoch": 0.1351301964367291, + "grad_norm": 4.125, + "kl": 0.0, + "learning_rate": 9.566947211316448e-06, + "logits/chosen": 601313689.6, + "logits/rejected": 429009066.6666667, + "logps/chosen": -628.976171875, + "logps/rejected": -473.09326171875, + "loss": 0.0229, + "rewards/chosen": 3.4708477020263673, + "rewards/margins": 11.444802220662435, + "rewards/rejected": -7.973954518636067, + "step": 1479 + }, + { + "epoch": 0.13522156235724075, + "grad_norm": 9.8125, + "kl": 0.0, + "learning_rate": 9.566361713052834e-06, + "logits/chosen": 618914355.2, + "logits/rejected": 642867626.6666666, + "logps/chosen": -239.1349609375, + "logps/rejected": -648.6964111328125, + "loss": 0.07, + "rewards/chosen": 2.1344642639160156, + "rewards/margins": 11.10423469543457, + "rewards/rejected": -8.969770431518555, + "step": 1480 + }, + { + "epoch": 0.1353129282777524, + "grad_norm": 8.3125, + "kl": 0.0, + "learning_rate": 9.565775837193413e-06, + "logits/chosen": 559051980.8, + "logits/rejected": 909911808.0, + "logps/chosen": -402.9282470703125, + "logps/rejected": -657.416259765625, + "loss": 0.052, + "rewards/chosen": 3.069179725646973, + "rewards/margins": 9.826378059387206, + "rewards/rejected": -6.757198333740234, + "step": 1481 + }, + { + "epoch": 0.13540429419826405, + "grad_norm": 7.15625, + "kl": 0.0, + "learning_rate": 9.565189583786633e-06, + "logits/chosen": 537334720.0, + "logits/rejected": 354425824.0, + "logps/chosen": -334.26922607421875, + "logps/rejected": -396.6092529296875, + "loss": 0.0401, + "rewards/chosen": 2.9288110733032227, + "rewards/margins": 12.516058921813965, + "rewards/rejected": -9.587247848510742, + "step": 1482 + }, + { + "epoch": 0.1354956601187757, + "grad_norm": 10.4375, + "kl": 0.0, + "learning_rate": 9.564602952880975e-06, + "logits/chosen": 710527189.3333334, + "logits/rejected": 717519104.0, + "logps/chosen": -339.2795817057292, + "logps/rejected": -554.675439453125, + "loss": 0.023, + "rewards/chosen": 3.272160847981771, + "rewards/margins": 11.028668721516928, + "rewards/rejected": -7.756507873535156, + "step": 1483 + }, + { + "epoch": 0.13558702603928735, + "grad_norm": 6.21875, + "kl": 0.0, + "learning_rate": 9.564015944524943e-06, + "logits/chosen": 505874124.8, + "logits/rejected": 513738581.3333333, + "logps/chosen": -292.3391845703125, + "logps/rejected": -232.83426920572916, + "loss": 0.0345, + "rewards/chosen": 3.1514793395996095, + "rewards/margins": 8.339955902099609, + "rewards/rejected": -5.1884765625, + "step": 1484 + }, + { + "epoch": 0.135678391959799, + "grad_norm": 10.3125, + "kl": 0.0, + "learning_rate": 9.563428558767078e-06, + "logits/chosen": 852849408.0, + "logits/rejected": 967303168.0, + "logps/chosen": -236.2323486328125, + "logps/rejected": -613.8318277994791, + "loss": 0.0424, + "rewards/chosen": 3.4692703247070313, + "rewards/margins": 11.75365982055664, + "rewards/rejected": -8.28438949584961, + "step": 1485 + }, + { + "epoch": 0.13576975788031065, + "grad_norm": 10.125, + "kl": 0.0, + "learning_rate": 9.562840795655952e-06, + "logits/chosen": 579042389.3333334, + "logits/rejected": 420842304.0, + "logps/chosen": -264.2327067057292, + "logps/rejected": -274.57354736328125, + "loss": 0.1319, + "rewards/chosen": 3.0659917195638022, + "rewards/margins": 6.2358616193135585, + "rewards/rejected": -3.169869899749756, + "step": 1486 + }, + { + "epoch": 0.1358611238008223, + "grad_norm": 20.375, + "kl": 0.0, + "learning_rate": 9.56225265524017e-06, + "logits/chosen": 422841472.0, + "logits/rejected": 857228580.5714285, + "logps/chosen": -275.69891357421875, + "logps/rejected": -283.80625697544644, + "loss": 0.0737, + "rewards/chosen": 4.307580471038818, + "rewards/margins": 10.543019226619176, + "rewards/rejected": -6.235438755580357, + "step": 1487 + }, + { + "epoch": 0.13595248972133395, + "grad_norm": 9.4375, + "kl": 0.0, + "learning_rate": 9.561664137568363e-06, + "logits/chosen": 631753536.0, + "logits/rejected": 556448292.5714285, + "logps/chosen": -359.9279479980469, + "logps/rejected": -561.3692452566964, + "loss": 0.0247, + "rewards/chosen": 5.332907199859619, + "rewards/margins": 13.156167370932444, + "rewards/rejected": -7.823260171072824, + "step": 1488 + }, + { + "epoch": 0.1360438556418456, + "grad_norm": 7.25, + "kl": 0.0, + "learning_rate": 9.561075242689196e-06, + "logits/chosen": 589386432.0, + "logits/rejected": 1177780224.0, + "logps/chosen": -316.0429992675781, + "logps/rejected": -525.8660888671875, + "loss": 0.043, + "rewards/chosen": 3.0242936611175537, + "rewards/margins": 10.830750703811646, + "rewards/rejected": -7.806457042694092, + "step": 1489 + }, + { + "epoch": 0.13613522156235724, + "grad_norm": 4.4375, + "kl": 0.0, + "learning_rate": 9.560485970651365e-06, + "logits/chosen": 473259366.4, + "logits/rejected": 492457770.6666667, + "logps/chosen": -224.7498046875, + "logps/rejected": -529.8798421223959, + "loss": 0.0332, + "rewards/chosen": 3.2767127990722655, + "rewards/margins": 11.679746373494467, + "rewards/rejected": -8.403033574422201, + "step": 1490 + }, + { + "epoch": 0.1362265874828689, + "grad_norm": 4.46875, + "kl": 0.0, + "learning_rate": 9.559896321503599e-06, + "logits/chosen": 427197728.0, + "logits/rejected": 1055731456.0, + "logps/chosen": -355.0330810546875, + "logps/rejected": -507.1612548828125, + "loss": 0.0218, + "rewards/chosen": 3.350473403930664, + "rewards/margins": 11.131322860717773, + "rewards/rejected": -7.780849456787109, + "step": 1491 + }, + { + "epoch": 0.13631795340338054, + "grad_norm": 4.90625, + "kl": 0.0, + "learning_rate": 9.559306295294655e-06, + "logits/chosen": 494188096.0, + "logits/rejected": 772439552.0, + "logps/chosen": -256.34112548828125, + "logps/rejected": -487.97320556640625, + "loss": 0.0379, + "rewards/chosen": 3.190969228744507, + "rewards/margins": 11.392709493637085, + "rewards/rejected": -8.201740264892578, + "step": 1492 + }, + { + "epoch": 0.1364093193238922, + "grad_norm": 12.8125, + "kl": 0.0, + "learning_rate": 9.558715892073324e-06, + "logits/chosen": 433270720.0, + "logits/rejected": 429907136.0, + "logps/chosen": -361.22186279296875, + "logps/rejected": -449.5205383300781, + "loss": 0.0825, + "rewards/chosen": 2.068131446838379, + "rewards/margins": 10.13499927520752, + "rewards/rejected": -8.06686782836914, + "step": 1493 + }, + { + "epoch": 0.13650068524440384, + "grad_norm": 6.96875, + "kl": 0.0, + "learning_rate": 9.558125111888425e-06, + "logits/chosen": 485683404.8, + "logits/rejected": 383758506.6666667, + "logps/chosen": -372.289697265625, + "logps/rejected": -435.53173828125, + "loss": 0.051, + "rewards/chosen": 2.564027786254883, + "rewards/margins": 9.994731267293293, + "rewards/rejected": -7.430703481038411, + "step": 1494 + }, + { + "epoch": 0.1365920511649155, + "grad_norm": 6.875, + "kl": 0.0, + "learning_rate": 9.557533954788813e-06, + "logits/chosen": 244101920.0, + "logits/rejected": 527491424.0, + "logps/chosen": -290.2947692871094, + "logps/rejected": -575.2188720703125, + "loss": 0.0464, + "rewards/chosen": 2.6054139137268066, + "rewards/margins": 11.501986980438232, + "rewards/rejected": -8.896573066711426, + "step": 1495 + }, + { + "epoch": 0.13668341708542714, + "grad_norm": 9.6875, + "kl": 0.0, + "learning_rate": 9.556942420823368e-06, + "logits/chosen": 531673472.0, + "logits/rejected": 485914880.0, + "logps/chosen": -389.3626708984375, + "logps/rejected": -604.4300537109375, + "loss": 0.1245, + "rewards/chosen": 2.6144984563191733, + "rewards/margins": 12.539651234944662, + "rewards/rejected": -9.925152778625488, + "step": 1496 + }, + { + "epoch": 0.1367747830059388, + "grad_norm": 9.4375, + "kl": 0.0, + "learning_rate": 9.556350510041006e-06, + "logits/chosen": 602526720.0, + "logits/rejected": 678014912.0, + "logps/chosen": -318.5201416015625, + "logps/rejected": -485.65771484375, + "loss": 0.06, + "rewards/chosen": 2.282039165496826, + "rewards/margins": 9.47402811050415, + "rewards/rejected": -7.191988945007324, + "step": 1497 + }, + { + "epoch": 0.13686614892645044, + "grad_norm": 8.3125, + "kl": 0.0, + "learning_rate": 9.555758222490674e-06, + "logits/chosen": 836827520.0, + "logits/rejected": 728859968.0, + "logps/chosen": -319.8460693359375, + "logps/rejected": -552.791259765625, + "loss": 0.0571, + "rewards/chosen": 2.2891733646392822, + "rewards/margins": 11.376453638076782, + "rewards/rejected": -9.0872802734375, + "step": 1498 + }, + { + "epoch": 0.1369575148469621, + "grad_norm": 5.46875, + "kl": 0.0, + "learning_rate": 9.555165558221343e-06, + "logits/chosen": 385701568.0, + "logits/rejected": 402064640.0, + "logps/chosen": -257.01593017578125, + "logps/rejected": -406.69439697265625, + "loss": 0.0405, + "rewards/chosen": 2.589749574661255, + "rewards/margins": 8.109831094741821, + "rewards/rejected": -5.520081520080566, + "step": 1499 + }, + { + "epoch": 0.13704888076747374, + "grad_norm": 3.765625, + "kl": 0.0, + "learning_rate": 9.55457251728203e-06, + "logits/chosen": 226600336.0, + "logits/rejected": 547806656.0, + "logps/chosen": -162.82301330566406, + "logps/rejected": -489.8718566894531, + "loss": 0.0248, + "rewards/chosen": 3.4457507133483887, + "rewards/margins": 10.908924579620361, + "rewards/rejected": -7.463173866271973, + "step": 1500 + }, + { + "epoch": 0.13714024668798538, + "grad_norm": 2.328125, + "kl": 0.0, + "learning_rate": 9.553979099721766e-06, + "logits/chosen": 697030092.8, + "logits/rejected": 512846592.0, + "logps/chosen": -301.416259765625, + "logps/rejected": -417.5780029296875, + "loss": 0.0184, + "rewards/chosen": 3.7161460876464845, + "rewards/margins": 11.33451550801595, + "rewards/rejected": -7.618369420369466, + "step": 1501 + }, + { + "epoch": 0.13723161260849703, + "grad_norm": 9.5625, + "kl": 0.0, + "learning_rate": 9.553385305589624e-06, + "logits/chosen": 144915008.0, + "logits/rejected": 373365162.6666667, + "logps/chosen": -347.5484619140625, + "logps/rejected": -513.782958984375, + "loss": 0.0235, + "rewards/chosen": 3.6489243507385254, + "rewards/margins": 10.427756468454998, + "rewards/rejected": -6.778832117716472, + "step": 1502 + }, + { + "epoch": 0.13732297852900868, + "grad_norm": 5.21875, + "kl": 0.0, + "learning_rate": 9.552791134934706e-06, + "logits/chosen": 1284086272.0, + "logits/rejected": 554368554.6666666, + "logps/chosen": -362.0685119628906, + "logps/rejected": -476.2042236328125, + "loss": 0.0246, + "rewards/chosen": 2.685389757156372, + "rewards/margins": 10.112822771072388, + "rewards/rejected": -7.427433013916016, + "step": 1503 + }, + { + "epoch": 0.13741434444952033, + "grad_norm": 5.0, + "kl": 0.0, + "learning_rate": 9.552196587806143e-06, + "logits/chosen": 275187507.2, + "logits/rejected": 481633365.3333333, + "logps/chosen": -167.32530517578124, + "logps/rejected": -618.6764729817709, + "loss": 0.0283, + "rewards/chosen": 3.239690399169922, + "rewards/margins": 10.501264190673828, + "rewards/rejected": -7.261573791503906, + "step": 1504 + }, + { + "epoch": 0.13750571037003198, + "grad_norm": 4.59375, + "kl": 0.0, + "learning_rate": 9.5516016642531e-06, + "logits/chosen": 577151317.3333334, + "logits/rejected": 695154329.6, + "logps/chosen": -405.3710123697917, + "logps/rejected": -486.26484375, + "loss": 0.0283, + "rewards/chosen": 2.8562450408935547, + "rewards/margins": 10.614975357055664, + "rewards/rejected": -7.75873031616211, + "step": 1505 + }, + { + "epoch": 0.13759707629054363, + "grad_norm": 5.25, + "kl": 0.0, + "learning_rate": 9.551006364324771e-06, + "logits/chosen": 481525964.8, + "logits/rejected": 513906730.6666667, + "logps/chosen": -283.9938232421875, + "logps/rejected": -431.3395182291667, + "loss": 0.0414, + "rewards/chosen": 2.737144660949707, + "rewards/margins": 11.129350852966308, + "rewards/rejected": -8.392206192016602, + "step": 1506 + }, + { + "epoch": 0.13768844221105528, + "grad_norm": 4.15625, + "kl": 0.0, + "learning_rate": 9.550410688070379e-06, + "logits/chosen": 526768608.0, + "logits/rejected": 798576768.0, + "logps/chosen": -282.7135314941406, + "logps/rejected": -483.3826904296875, + "loss": 0.0216, + "rewards/chosen": 3.7734994888305664, + "rewards/margins": 12.98462963104248, + "rewards/rejected": -9.211130142211914, + "step": 1507 + }, + { + "epoch": 0.13777980813156693, + "grad_norm": 4.25, + "kl": 0.0, + "learning_rate": 9.549814635539188e-06, + "logits/chosen": 639774720.0, + "logits/rejected": 621363609.6, + "logps/chosen": -303.268310546875, + "logps/rejected": -361.3042724609375, + "loss": 0.0278, + "rewards/chosen": 2.6141870816548667, + "rewards/margins": 9.429218514760336, + "rewards/rejected": -6.815031433105469, + "step": 1508 + }, + { + "epoch": 0.13787117405207858, + "grad_norm": 2.53125, + "kl": 0.0, + "learning_rate": 9.54921820678048e-06, + "logits/chosen": 568312320.0, + "logits/rejected": 1195097685.3333333, + "logps/chosen": -258.9919677734375, + "logps/rejected": -363.4261881510417, + "loss": 0.0161, + "rewards/chosen": 3.9046417236328126, + "rewards/margins": 11.345680491129558, + "rewards/rejected": -7.441038767496745, + "step": 1509 + }, + { + "epoch": 0.13796253997259023, + "grad_norm": 15.875, + "kl": 0.0, + "learning_rate": 9.548621401843575e-06, + "logits/chosen": 674206848.0, + "logits/rejected": 305076416.0, + "logps/chosen": -397.2786865234375, + "logps/rejected": -423.38311767578125, + "loss": 0.0689, + "rewards/chosen": 3.1486024856567383, + "rewards/margins": 11.246236801147461, + "rewards/rejected": -8.097634315490723, + "step": 1510 + }, + { + "epoch": 0.13805390589310187, + "grad_norm": 7.71875, + "kl": 0.0, + "learning_rate": 9.548024220777823e-06, + "logits/chosen": 399993557.3333333, + "logits/rejected": 396226560.0, + "logps/chosen": -264.0692545572917, + "logps/rejected": -460.816796875, + "loss": 0.0353, + "rewards/chosen": 3.4657605489095054, + "rewards/margins": 10.123178609212239, + "rewards/rejected": -6.657418060302734, + "step": 1511 + }, + { + "epoch": 0.13814527181361352, + "grad_norm": 2.765625, + "kl": 0.0, + "learning_rate": 9.547426663632607e-06, + "logits/chosen": 698648512.0, + "logits/rejected": 545408853.3333334, + "logps/chosen": -282.94580078125, + "logps/rejected": -547.0330403645834, + "loss": 0.0143, + "rewards/chosen": 2.958491563796997, + "rewards/margins": 10.250816265741985, + "rewards/rejected": -7.292324701944987, + "step": 1512 + }, + { + "epoch": 0.13823663773412517, + "grad_norm": 2.6875, + "kl": 0.0, + "learning_rate": 9.546828730457339e-06, + "logits/chosen": 669640384.0, + "logits/rejected": 727387428.5714285, + "logps/chosen": -306.88568115234375, + "logps/rejected": -585.4302455357143, + "loss": 0.0121, + "rewards/chosen": 2.765887498855591, + "rewards/margins": 10.957363503319877, + "rewards/rejected": -8.191476004464286, + "step": 1513 + }, + { + "epoch": 0.13832800365463682, + "grad_norm": 24.75, + "kl": 0.0, + "learning_rate": 9.546230421301463e-06, + "logits/chosen": 299253333.3333333, + "logits/rejected": 366085683.2, + "logps/chosen": -189.4627685546875, + "logps/rejected": -488.772607421875, + "loss": 0.0921, + "rewards/chosen": 2.4356937408447266, + "rewards/margins": 11.28056526184082, + "rewards/rejected": -8.844871520996094, + "step": 1514 + }, + { + "epoch": 0.13841936957514847, + "grad_norm": 2.640625, + "kl": 0.0, + "learning_rate": 9.545631736214453e-06, + "logits/chosen": 482583584.0, + "logits/rejected": 529489856.0, + "logps/chosen": -358.5986328125, + "logps/rejected": -529.9697265625, + "loss": 0.0129, + "rewards/chosen": 3.990386962890625, + "rewards/margins": 12.522438049316406, + "rewards/rejected": -8.532051086425781, + "step": 1515 + }, + { + "epoch": 0.13851073549566012, + "grad_norm": 2.984375, + "kl": 0.0, + "learning_rate": 9.545032675245814e-06, + "logits/chosen": 513366058.6666667, + "logits/rejected": 516073600.0, + "logps/chosen": -180.17403157552084, + "logps/rejected": -496.37994384765625, + "loss": 0.0252, + "rewards/chosen": 3.817768096923828, + "rewards/margins": 14.093482971191406, + "rewards/rejected": -10.275714874267578, + "step": 1516 + }, + { + "epoch": 0.13860210141617177, + "grad_norm": 2.96875, + "kl": 0.0, + "learning_rate": 9.544433238445084e-06, + "logits/chosen": 339995648.0, + "logits/rejected": 643962752.0, + "logps/chosen": -118.01266479492188, + "logps/rejected": -495.971435546875, + "loss": 0.0299, + "rewards/chosen": 3.032273769378662, + "rewards/margins": 9.2377347946167, + "rewards/rejected": -6.205461025238037, + "step": 1517 + }, + { + "epoch": 0.13869346733668342, + "grad_norm": 3.578125, + "kl": 0.0, + "learning_rate": 9.543833425861829e-06, + "logits/chosen": 605066112.0, + "logits/rejected": 1108715392.0, + "logps/chosen": -244.93548583984375, + "logps/rejected": -554.2117919921875, + "loss": 0.0223, + "rewards/chosen": 3.50262188911438, + "rewards/margins": 12.543864488601685, + "rewards/rejected": -9.041242599487305, + "step": 1518 + }, + { + "epoch": 0.13878483325719507, + "grad_norm": 11.3125, + "kl": 0.0, + "learning_rate": 9.543233237545649e-06, + "logits/chosen": 713445427.2, + "logits/rejected": 503781376.0, + "logps/chosen": -422.50224609375, + "logps/rejected": -277.6165364583333, + "loss": 0.075, + "rewards/chosen": 3.223929214477539, + "rewards/margins": 8.837117640177409, + "rewards/rejected": -5.61318842569987, + "step": 1519 + }, + { + "epoch": 0.13887619917770672, + "grad_norm": 14.9375, + "kl": 0.0, + "learning_rate": 9.542632673546175e-06, + "logits/chosen": 711399680.0, + "logits/rejected": 394589344.0, + "logps/chosen": -391.7879943847656, + "logps/rejected": -277.31298828125, + "loss": 0.1261, + "rewards/chosen": 2.987208604812622, + "rewards/margins": 7.06228232383728, + "rewards/rejected": -4.075073719024658, + "step": 1520 + }, + { + "epoch": 0.13896756509821837, + "grad_norm": 5.5, + "kl": 0.0, + "learning_rate": 9.542031733913069e-06, + "logits/chosen": 402705305.6, + "logits/rejected": 408873642.6666667, + "logps/chosen": -408.69013671875, + "logps/rejected": -237.584228515625, + "loss": 0.0431, + "rewards/chosen": 2.9400218963623046, + "rewards/margins": 7.452343050638834, + "rewards/rejected": -4.51232115427653, + "step": 1521 + }, + { + "epoch": 0.13905893101873001, + "grad_norm": 6.375, + "kl": 0.0, + "learning_rate": 9.54143041869602e-06, + "logits/chosen": 579235766.8571428, + "logits/rejected": 444463424.0, + "logps/chosen": -242.42333984375, + "logps/rejected": -151.36341857910156, + "loss": 0.0681, + "rewards/chosen": 2.7942632947649275, + "rewards/margins": 8.694952896663121, + "rewards/rejected": -5.900689601898193, + "step": 1522 + }, + { + "epoch": 0.13915029693924166, + "grad_norm": 3.21875, + "kl": 0.0, + "learning_rate": 9.540828727944752e-06, + "logits/chosen": 1213377408.0, + "logits/rejected": 736379776.0, + "logps/chosen": -228.19154357910156, + "logps/rejected": -581.071044921875, + "loss": 0.0347, + "rewards/chosen": 2.7867374420166016, + "rewards/margins": 9.74422836303711, + "rewards/rejected": -6.957490921020508, + "step": 1523 + }, + { + "epoch": 0.1392416628597533, + "grad_norm": 8.625, + "kl": 0.0, + "learning_rate": 9.54022666170902e-06, + "logits/chosen": 864401408.0, + "logits/rejected": 992949248.0, + "logps/chosen": -436.4869791666667, + "logps/rejected": -462.32216796875, + "loss": 0.057, + "rewards/chosen": 3.0826730728149414, + "rewards/margins": 10.26864414215088, + "rewards/rejected": -7.185971069335937, + "step": 1524 + }, + { + "epoch": 0.13933302878026496, + "grad_norm": 7.375, + "kl": 0.0, + "learning_rate": 9.53962422003861e-06, + "logits/chosen": 811159466.6666666, + "logits/rejected": 599883008.0, + "logps/chosen": -338.297119140625, + "logps/rejected": -746.7763671875, + "loss": 0.1347, + "rewards/chosen": 2.5486550331115723, + "rewards/margins": 11.94654893875122, + "rewards/rejected": -9.397893905639648, + "step": 1525 + }, + { + "epoch": 0.1394243947007766, + "grad_norm": 14.8125, + "kl": 0.0, + "learning_rate": 9.539021402983337e-06, + "logits/chosen": 696882816.0, + "logits/rejected": 624703360.0, + "logps/chosen": -341.5611979166667, + "logps/rejected": -553.51806640625, + "loss": 0.0515, + "rewards/chosen": 3.6742115020751953, + "rewards/margins": 10.077211856842041, + "rewards/rejected": -6.403000354766846, + "step": 1526 + }, + { + "epoch": 0.13951576062128826, + "grad_norm": 2.609375, + "kl": 0.0, + "learning_rate": 9.53841821059305e-06, + "logits/chosen": 402634784.0, + "logits/rejected": 447376298.6666667, + "logps/chosen": -363.7357177734375, + "logps/rejected": -385.4580485026042, + "loss": 0.0103, + "rewards/chosen": 3.509516716003418, + "rewards/margins": 11.09155559539795, + "rewards/rejected": -7.582038879394531, + "step": 1527 + }, + { + "epoch": 0.1396071265417999, + "grad_norm": 5.5, + "kl": 0.0, + "learning_rate": 9.537814642917626e-06, + "logits/chosen": 745979776.0, + "logits/rejected": 617585792.0, + "logps/chosen": -444.7084045410156, + "logps/rejected": -710.5321655273438, + "loss": 0.0315, + "rewards/chosen": 2.940667152404785, + "rewards/margins": 11.143214225769043, + "rewards/rejected": -8.202547073364258, + "step": 1528 + }, + { + "epoch": 0.13969849246231156, + "grad_norm": 10.125, + "kl": 0.0, + "learning_rate": 9.537210700006974e-06, + "logits/chosen": 484000384.0, + "logits/rejected": 968309184.0, + "logps/chosen": -291.25649007161456, + "logps/rejected": -845.8045654296875, + "loss": 0.0623, + "rewards/chosen": 3.0429064432779946, + "rewards/margins": 13.749547640482584, + "rewards/rejected": -10.70664119720459, + "step": 1529 + }, + { + "epoch": 0.1397898583828232, + "grad_norm": 5.71875, + "kl": 0.0, + "learning_rate": 9.536606381911036e-06, + "logits/chosen": 572600524.8, + "logits/rejected": 802132480.0, + "logps/chosen": -377.556591796875, + "logps/rejected": -463.9745686848958, + "loss": 0.0276, + "rewards/chosen": 3.3035114288330076, + "rewards/margins": 11.400205739339192, + "rewards/rejected": -8.096694310506185, + "step": 1530 + }, + { + "epoch": 0.13988122430333486, + "grad_norm": 1.53125, + "kl": 0.0, + "learning_rate": 9.536001688679783e-06, + "logits/chosen": 672976128.0, + "logits/rejected": 432847658.6666667, + "logps/chosen": -456.2177734375, + "logps/rejected": -571.5524495442709, + "loss": 0.0065, + "rewards/chosen": 4.029109001159668, + "rewards/margins": 13.011911710103353, + "rewards/rejected": -8.982802708943685, + "step": 1531 + }, + { + "epoch": 0.1399725902238465, + "grad_norm": 6.71875, + "kl": 0.0, + "learning_rate": 9.535396620363216e-06, + "logits/chosen": 785248426.6666666, + "logits/rejected": 835468185.6, + "logps/chosen": -244.599853515625, + "logps/rejected": -479.59990234375, + "loss": 0.1131, + "rewards/chosen": 1.7017793655395508, + "rewards/margins": 8.770761299133301, + "rewards/rejected": -7.06898193359375, + "step": 1532 + }, + { + "epoch": 0.14006395614435815, + "grad_norm": 3.328125, + "kl": 0.0, + "learning_rate": 9.53479117701137e-06, + "logits/chosen": 323214933.3333333, + "logits/rejected": 406849382.4, + "logps/chosen": -211.91072591145834, + "logps/rejected": -429.941015625, + "loss": 0.0208, + "rewards/chosen": 3.6832275390625, + "rewards/margins": 10.316635131835938, + "rewards/rejected": -6.6334075927734375, + "step": 1533 + }, + { + "epoch": 0.1401553220648698, + "grad_norm": 0.8515625, + "kl": 0.0, + "learning_rate": 9.53418535867431e-06, + "logits/chosen": 1442128768.0, + "logits/rejected": 627060992.0, + "logps/chosen": -351.52520751953125, + "logps/rejected": -559.2420247395834, + "loss": 0.0042, + "rewards/chosen": 4.467836380004883, + "rewards/margins": 12.168818791707356, + "rewards/rejected": -7.700982411702474, + "step": 1534 + }, + { + "epoch": 0.14024668798538145, + "grad_norm": 5.40625, + "kl": 0.0, + "learning_rate": 9.533579165402133e-06, + "logits/chosen": 399265216.0, + "logits/rejected": 326455392.0, + "logps/chosen": -336.8851318359375, + "logps/rejected": -330.616943359375, + "loss": 0.0273, + "rewards/chosen": 3.3929526805877686, + "rewards/margins": 12.74674105644226, + "rewards/rejected": -9.353788375854492, + "step": 1535 + }, + { + "epoch": 0.1403380539058931, + "grad_norm": 24.125, + "kl": 0.0, + "learning_rate": 9.532972597244962e-06, + "logits/chosen": 461372928.0, + "logits/rejected": 505173600.0, + "logps/chosen": -250.44459533691406, + "logps/rejected": -367.5920104980469, + "loss": 0.0801, + "rewards/chosen": 2.4597630500793457, + "rewards/margins": 8.777074813842773, + "rewards/rejected": -6.317311763763428, + "step": 1536 + }, + { + "epoch": 0.14042941982640475, + "grad_norm": 4.65625, + "kl": 0.0, + "learning_rate": 9.532365654252956e-06, + "logits/chosen": 672093781.3333334, + "logits/rejected": 383936576.0, + "logps/chosen": -368.0871175130208, + "logps/rejected": -439.52886962890625, + "loss": 0.024, + "rewards/chosen": 3.8355773289998374, + "rewards/margins": 10.041776974995932, + "rewards/rejected": -6.206199645996094, + "step": 1537 + }, + { + "epoch": 0.1405207857469164, + "grad_norm": 10.125, + "kl": 0.0, + "learning_rate": 9.531758336476304e-06, + "logits/chosen": 600184512.0, + "logits/rejected": 571443648.0, + "logps/chosen": -443.71441650390625, + "logps/rejected": -503.7913513183594, + "loss": 0.0526, + "rewards/chosen": 2.6918423175811768, + "rewards/margins": 10.954144716262817, + "rewards/rejected": -8.26230239868164, + "step": 1538 + }, + { + "epoch": 0.14061215166742805, + "grad_norm": 10.75, + "kl": 0.0, + "learning_rate": 9.531150643965224e-06, + "logits/chosen": 348806656.0, + "logits/rejected": 641080320.0, + "logps/chosen": -197.70693969726562, + "logps/rejected": -340.43011474609375, + "loss": 0.0726, + "rewards/chosen": 2.949892997741699, + "rewards/margins": 9.217453002929688, + "rewards/rejected": -6.267560005187988, + "step": 1539 + }, + { + "epoch": 0.1407035175879397, + "grad_norm": 10.4375, + "kl": 0.0, + "learning_rate": 9.530542576769968e-06, + "logits/chosen": 265830592.0, + "logits/rejected": 480186560.0, + "logps/chosen": -168.5587921142578, + "logps/rejected": -473.6326904296875, + "loss": 0.115, + "rewards/chosen": 2.7052340507507324, + "rewards/margins": 9.663102626800537, + "rewards/rejected": -6.957868576049805, + "step": 1540 + }, + { + "epoch": 0.14079488350845135, + "grad_norm": 4.71875, + "kl": 0.0, + "learning_rate": 9.529934134940819e-06, + "logits/chosen": 807402240.0, + "logits/rejected": 713758464.0, + "logps/chosen": -285.12013753255206, + "logps/rejected": -743.216064453125, + "loss": 0.0365, + "rewards/chosen": 3.2800302505493164, + "rewards/margins": 13.507258415222168, + "rewards/rejected": -10.227228164672852, + "step": 1541 + }, + { + "epoch": 0.140886249428963, + "grad_norm": 10.625, + "kl": 0.0, + "learning_rate": 9.529325318528087e-06, + "logits/chosen": 570043776.0, + "logits/rejected": 818051242.6666666, + "logps/chosen": -310.4499816894531, + "logps/rejected": -411.0422770182292, + "loss": 0.0668, + "rewards/chosen": 2.236393451690674, + "rewards/margins": 8.901920159657795, + "rewards/rejected": -6.665526707967122, + "step": 1542 + }, + { + "epoch": 0.14097761534947464, + "grad_norm": 3.453125, + "kl": 0.0, + "learning_rate": 9.528716127582116e-06, + "logits/chosen": 563569920.0, + "logits/rejected": 829192192.0, + "logps/chosen": -305.5362955729167, + "logps/rejected": -629.493798828125, + "loss": 0.0127, + "rewards/chosen": 3.7136977513631186, + "rewards/margins": 11.20656655629476, + "rewards/rejected": -7.492868804931641, + "step": 1543 + }, + { + "epoch": 0.1410689812699863, + "grad_norm": 4.15625, + "kl": 0.0, + "learning_rate": 9.528106562153281e-06, + "logits/chosen": 385669324.8, + "logits/rejected": 574405802.6666666, + "logps/chosen": -387.5333984375, + "logps/rejected": -520.8336995442709, + "loss": 0.0257, + "rewards/chosen": 3.6707427978515623, + "rewards/margins": 11.752414194742837, + "rewards/rejected": -8.081671396891275, + "step": 1544 + }, + { + "epoch": 0.14116034719049794, + "grad_norm": 2.15625, + "kl": 0.0, + "learning_rate": 9.527496622291986e-06, + "logits/chosen": 313262592.0, + "logits/rejected": 677142425.6, + "logps/chosen": -257.74863688151044, + "logps/rejected": -776.382861328125, + "loss": 0.0136, + "rewards/chosen": 3.5632988611857095, + "rewards/margins": 14.376539675394694, + "rewards/rejected": -10.813240814208985, + "step": 1545 + }, + { + "epoch": 0.1412517131110096, + "grad_norm": 8.9375, + "kl": 0.0, + "learning_rate": 9.52688630804867e-06, + "logits/chosen": 470872256.0, + "logits/rejected": 405440896.0, + "logps/chosen": -336.522216796875, + "logps/rejected": -255.15306091308594, + "loss": 0.0563, + "rewards/chosen": 2.5648789405822754, + "rewards/margins": 9.475893020629883, + "rewards/rejected": -6.911014080047607, + "step": 1546 + }, + { + "epoch": 0.14134307903152124, + "grad_norm": 2.59375, + "kl": 0.0, + "learning_rate": 9.526275619473798e-06, + "logits/chosen": 891315285.3333334, + "logits/rejected": 510409574.4, + "logps/chosen": -493.2384440104167, + "logps/rejected": -575.4751953125, + "loss": 0.0132, + "rewards/chosen": 3.3981831868489585, + "rewards/margins": 13.203749593098959, + "rewards/rejected": -9.80556640625, + "step": 1547 + }, + { + "epoch": 0.1414344449520329, + "grad_norm": 7.34375, + "kl": 3.6628189086914062, + "learning_rate": 9.525664556617868e-06, + "logits/chosen": 416811008.0, + "logps/chosen": -352.18487548828125, + "loss": 0.0538, + "rewards/chosen": 3.6051526069641113, + "step": 1548 + }, + { + "epoch": 0.14152581087254454, + "grad_norm": 2.09375, + "kl": 0.0, + "learning_rate": 9.525053119531414e-06, + "logits/chosen": 235238272.0, + "logits/rejected": 408998092.8, + "logps/chosen": -211.8951212565104, + "logps/rejected": -633.679150390625, + "loss": 0.009, + "rewards/chosen": 4.273640950520833, + "rewards/margins": 14.532809193929037, + "rewards/rejected": -10.259168243408203, + "step": 1549 + }, + { + "epoch": 0.1416171767930562, + "grad_norm": 5.8125, + "kl": 0.0, + "learning_rate": 9.524441308264988e-06, + "logits/chosen": 672890112.0, + "logits/rejected": 353150028.8, + "logps/chosen": -564.0992024739584, + "logps/rejected": -314.16904296875, + "loss": 0.0281, + "rewards/chosen": 2.8641490936279297, + "rewards/margins": 9.142767333984375, + "rewards/rejected": -6.278618240356446, + "step": 1550 + }, + { + "epoch": 0.14170854271356784, + "grad_norm": 5.59375, + "kl": 0.0, + "learning_rate": 9.523829122869191e-06, + "logits/chosen": 614453589.3333334, + "logits/rejected": 655863552.0, + "logps/chosen": -409.2123209635417, + "logps/rejected": -916.6083984375, + "loss": 0.0354, + "rewards/chosen": 3.396084785461426, + "rewards/margins": 17.350479125976562, + "rewards/rejected": -13.954394340515137, + "step": 1551 + }, + { + "epoch": 0.14179990863407949, + "grad_norm": 5.5, + "kl": 0.0, + "learning_rate": 9.523216563394635e-06, + "logits/chosen": 205825456.0, + "logits/rejected": 415971285.3333333, + "logps/chosen": -154.24620056152344, + "logps/rejected": -364.2726236979167, + "loss": 0.0326, + "rewards/chosen": 2.662585735321045, + "rewards/margins": 10.014005819956463, + "rewards/rejected": -7.351420084635417, + "step": 1552 + }, + { + "epoch": 0.14189127455459113, + "grad_norm": 4.4375, + "kl": 0.0, + "learning_rate": 9.52260362989198e-06, + "logits/chosen": 493072588.8, + "logits/rejected": 367026517.3333333, + "logps/chosen": -352.9029052734375, + "logps/rejected": -501.540283203125, + "loss": 0.0242, + "rewards/chosen": 3.6633949279785156, + "rewards/margins": 15.147318522135416, + "rewards/rejected": -11.4839235941569, + "step": 1553 + }, + { + "epoch": 0.14198264047510278, + "grad_norm": 5.09375, + "kl": 0.0, + "learning_rate": 9.521990322411908e-06, + "logits/chosen": 786090432.0, + "logits/rejected": 485073024.0, + "logps/chosen": -583.4842529296875, + "logps/rejected": -529.74267578125, + "loss": 0.0287, + "rewards/chosen": 2.921553134918213, + "rewards/margins": 9.472469806671143, + "rewards/rejected": -6.55091667175293, + "step": 1554 + }, + { + "epoch": 0.14207400639561443, + "grad_norm": 6.4375, + "kl": 0.0, + "learning_rate": 9.521376641005132e-06, + "logits/chosen": 487459114.6666667, + "logits/rejected": 758923392.0, + "logps/chosen": -420.0796712239583, + "logps/rejected": -563.558349609375, + "loss": 0.0349, + "rewards/chosen": 3.4304749170939126, + "rewards/margins": 10.430001894632975, + "rewards/rejected": -6.9995269775390625, + "step": 1555 + }, + { + "epoch": 0.14216537231612608, + "grad_norm": 9.625, + "kl": 0.0, + "learning_rate": 9.5207625857224e-06, + "logits/chosen": 387102592.0, + "logits/rejected": 502974528.0, + "logps/chosen": -412.56195068359375, + "logps/rejected": -526.2891235351562, + "loss": 0.04, + "rewards/chosen": 2.93747878074646, + "rewards/margins": 11.439265966415405, + "rewards/rejected": -8.501787185668945, + "step": 1556 + }, + { + "epoch": 0.14225673823663773, + "grad_norm": 1.7421875, + "kl": 0.0, + "learning_rate": 9.520148156614488e-06, + "logits/chosen": 468745472.0, + "logits/rejected": 628863744.0, + "logps/chosen": -383.27020263671875, + "logps/rejected": -498.05450439453125, + "loss": 0.0085, + "rewards/chosen": 4.195572853088379, + "rewards/margins": 12.362218856811523, + "rewards/rejected": -8.166646003723145, + "step": 1557 + }, + { + "epoch": 0.14234810415714938, + "grad_norm": 14.6875, + "kl": 0.0, + "learning_rate": 9.519533353732203e-06, + "logits/chosen": 246790464.0, + "logits/rejected": 726421869.7142857, + "logps/chosen": -75.60169982910156, + "logps/rejected": -433.7411411830357, + "loss": 0.1287, + "rewards/chosen": 1.0425446033477783, + "rewards/margins": 7.223678418568203, + "rewards/rejected": -6.1811338152204245, + "step": 1558 + }, + { + "epoch": 0.14243947007766103, + "grad_norm": 3.0, + "kl": 0.0, + "learning_rate": 9.518918177126386e-06, + "logits/chosen": 521655381.3333333, + "logits/rejected": 580372736.0, + "logps/chosen": -299.4769694010417, + "logps/rejected": -565.1845703125, + "loss": 0.018, + "rewards/chosen": 3.4178689320882163, + "rewards/margins": 10.653207524617514, + "rewards/rejected": -7.235338592529297, + "step": 1559 + }, + { + "epoch": 0.14253083599817268, + "grad_norm": 3.234375, + "kl": 0.0, + "learning_rate": 9.518302626847902e-06, + "logits/chosen": 801833984.0, + "logits/rejected": 1002024106.6666666, + "logps/chosen": -502.04315185546875, + "logps/rejected": -968.1673177083334, + "loss": 0.0132, + "rewards/chosen": 3.0483741760253906, + "rewards/margins": 15.408058166503906, + "rewards/rejected": -12.359683990478516, + "step": 1560 + }, + { + "epoch": 0.14262220191868433, + "grad_norm": 1.4453125, + "kl": 0.0, + "learning_rate": 9.517686702947654e-06, + "logits/chosen": 597981866.6666666, + "logits/rejected": 1376523673.6, + "logps/chosen": -275.91542561848956, + "logps/rejected": -587.59267578125, + "loss": 0.0072, + "rewards/chosen": 4.025417645772298, + "rewards/margins": 13.097034009297687, + "rewards/rejected": -9.07161636352539, + "step": 1561 + }, + { + "epoch": 0.14271356783919598, + "grad_norm": 3.921875, + "kl": 0.0, + "learning_rate": 9.517070405476575e-06, + "logits/chosen": 907244714.6666666, + "logits/rejected": 955241472.0, + "logps/chosen": -377.128662109375, + "logps/rejected": -600.02177734375, + "loss": 0.0185, + "rewards/chosen": 3.5041726430257163, + "rewards/margins": 12.400058873494467, + "rewards/rejected": -8.89588623046875, + "step": 1562 + }, + { + "epoch": 0.14280493375970763, + "grad_norm": 2.921875, + "kl": 0.0, + "learning_rate": 9.516453734485624e-06, + "logits/chosen": 701310122.6666666, + "logits/rejected": 442111180.8, + "logps/chosen": -296.28708902994794, + "logps/rejected": -540.16591796875, + "loss": 0.0198, + "rewards/chosen": 2.9673760732014975, + "rewards/margins": 12.465667088826498, + "rewards/rejected": -9.498291015625, + "step": 1563 + }, + { + "epoch": 0.14289629968021927, + "grad_norm": 15.4375, + "kl": 0.0, + "learning_rate": 9.515836690025795e-06, + "logits/chosen": 652641664.0, + "logps/chosen": -350.086669921875, + "loss": 0.1163, + "rewards/chosen": 2.840456247329712, + "step": 1564 + }, + { + "epoch": 0.14298766560073092, + "grad_norm": 14.0, + "kl": 0.0, + "learning_rate": 9.51521927214811e-06, + "logits/chosen": 576332160.0, + "logits/rejected": 374423712.0, + "logps/chosen": -291.29156494140625, + "logps/rejected": -284.0941467285156, + "loss": 0.1158, + "rewards/chosen": 3.2646446228027344, + "rewards/margins": 8.804080486297607, + "rewards/rejected": -5.539435863494873, + "step": 1565 + }, + { + "epoch": 0.14307903152124257, + "grad_norm": 5.625, + "kl": 0.0, + "learning_rate": 9.51460148090363e-06, + "logits/chosen": 516932608.0, + "logits/rejected": 489642752.0, + "logps/chosen": -290.4174560546875, + "logps/rejected": -666.5253092447916, + "loss": 0.034, + "rewards/chosen": 3.1410804748535157, + "rewards/margins": 10.306997553507488, + "rewards/rejected": -7.165917078653972, + "step": 1566 + }, + { + "epoch": 0.14317039744175422, + "grad_norm": 6.34375, + "kl": 0.0, + "learning_rate": 9.513983316343432e-06, + "logits/chosen": 338076928.0, + "logits/rejected": 410203296.0, + "logps/chosen": -215.44857788085938, + "logps/rejected": -576.04638671875, + "loss": 0.0405, + "rewards/chosen": 2.5804965496063232, + "rewards/margins": 12.600638151168823, + "rewards/rejected": -10.0201416015625, + "step": 1567 + }, + { + "epoch": 0.14326176336226587, + "grad_norm": 6.59375, + "kl": 0.0, + "learning_rate": 9.513364778518638e-06, + "logits/chosen": 691264307.2, + "logits/rejected": 538525354.6666666, + "logps/chosen": -264.140380859375, + "logps/rejected": -479.5387369791667, + "loss": 0.1271, + "rewards/chosen": 3.3015110015869142, + "rewards/margins": 9.045406850179036, + "rewards/rejected": -5.743895848592122, + "step": 1568 + }, + { + "epoch": 0.14335312928277752, + "grad_norm": 5.28125, + "kl": 0.0, + "learning_rate": 9.512745867480395e-06, + "logits/chosen": 562325942.8571428, + "logits/rejected": 147527408.0, + "logps/chosen": -343.26834542410717, + "logps/rejected": -211.1583709716797, + "loss": 0.045, + "rewards/chosen": 3.1780883244105746, + "rewards/margins": 9.840321200234548, + "rewards/rejected": -6.662232875823975, + "step": 1569 + }, + { + "epoch": 0.14344449520328917, + "grad_norm": 14.75, + "kl": 0.0, + "learning_rate": 9.512126583279879e-06, + "logits/chosen": 384992844.8, + "logits/rejected": 562932266.6666666, + "logps/chosen": -225.834228515625, + "logps/rejected": -484.1569417317708, + "loss": 0.0655, + "rewards/chosen": 3.0822629928588867, + "rewards/margins": 10.108857154846191, + "rewards/rejected": -7.026594161987305, + "step": 1570 + }, + { + "epoch": 0.14353586112380082, + "grad_norm": 5.875, + "kl": 0.0, + "learning_rate": 9.511506925968302e-06, + "logits/chosen": 640410560.0, + "logits/rejected": 740199552.0, + "logps/chosen": -399.62750244140625, + "logps/rejected": -449.77545166015625, + "loss": 0.0366, + "rewards/chosen": 2.8757026195526123, + "rewards/margins": 10.406384706497192, + "rewards/rejected": -7.53068208694458, + "step": 1571 + }, + { + "epoch": 0.14362722704431247, + "grad_norm": 13.0, + "kl": 0.0, + "learning_rate": 9.510886895596901e-06, + "logits/chosen": 509807308.8, + "logits/rejected": 479681920.0, + "logps/chosen": -460.08623046875, + "logps/rejected": -521.7932942708334, + "loss": 0.0716, + "rewards/chosen": 2.3939491271972657, + "rewards/margins": 10.978257751464843, + "rewards/rejected": -8.584308624267578, + "step": 1572 + }, + { + "epoch": 0.14371859296482412, + "grad_norm": 6.25, + "kl": 0.0, + "learning_rate": 9.510266492216949e-06, + "logits/chosen": 406666035.2, + "logits/rejected": 306702890.6666667, + "logps/chosen": -261.8208984375, + "logps/rejected": -242.88749186197916, + "loss": 0.0453, + "rewards/chosen": 2.8688610076904295, + "rewards/margins": 7.375939178466797, + "rewards/rejected": -4.507078170776367, + "step": 1573 + }, + { + "epoch": 0.14380995888533576, + "grad_norm": 3.5, + "kl": 0.0, + "learning_rate": 9.509645715879746e-06, + "logits/chosen": 973397811.2, + "logits/rejected": 525094912.0, + "logps/chosen": -463.4712890625, + "logps/rejected": -450.1216634114583, + "loss": 0.0243, + "rewards/chosen": 3.4160087585449217, + "rewards/margins": 10.492025502522786, + "rewards/rejected": -7.076016743977864, + "step": 1574 + }, + { + "epoch": 0.1439013248058474, + "grad_norm": 2.5, + "kl": 0.0, + "learning_rate": 9.509024566636624e-06, + "logits/chosen": 406734592.0, + "logits/rejected": 525997152.0, + "logps/chosen": -343.09222412109375, + "logps/rejected": -580.1310424804688, + "loss": 0.0136, + "rewards/chosen": 3.738438844680786, + "rewards/margins": 13.860846757888794, + "rewards/rejected": -10.122407913208008, + "step": 1575 + }, + { + "epoch": 0.14399269072635906, + "grad_norm": 5.71875, + "kl": 0.0, + "learning_rate": 9.50840304453895e-06, + "logits/chosen": 548932659.2, + "logits/rejected": 975865344.0, + "logps/chosen": -427.91201171875, + "logps/rejected": -655.1226399739584, + "loss": 0.0316, + "rewards/chosen": 3.2669654846191407, + "rewards/margins": 13.149579620361328, + "rewards/rejected": -9.882614135742188, + "step": 1576 + }, + { + "epoch": 0.1440840566468707, + "grad_norm": 3.71875, + "kl": 0.0, + "learning_rate": 9.507781149638113e-06, + "logits/chosen": 839428864.0, + "logits/rejected": 582800981.3333334, + "logps/chosen": -311.59580078125, + "logps/rejected": -648.2615559895834, + "loss": 0.0373, + "rewards/chosen": 3.252382278442383, + "rewards/margins": 13.401684443155924, + "rewards/rejected": -10.149302164713541, + "step": 1577 + }, + { + "epoch": 0.14417542256738236, + "grad_norm": 3.34375, + "kl": 0.0, + "learning_rate": 9.507158881985541e-06, + "logits/chosen": 598508544.0, + "logits/rejected": 568668928.0, + "logps/chosen": -222.901123046875, + "logps/rejected": -371.9870910644531, + "loss": 0.0229, + "rewards/chosen": 3.1419880390167236, + "rewards/margins": 10.867008924484253, + "rewards/rejected": -7.725020885467529, + "step": 1578 + }, + { + "epoch": 0.144266788487894, + "grad_norm": 3.8125, + "kl": 0.0, + "learning_rate": 9.506536241632689e-06, + "logits/chosen": 528848691.2, + "logits/rejected": 607792128.0, + "logps/chosen": -244.714990234375, + "logps/rejected": -654.4452311197916, + "loss": 0.0405, + "rewards/chosen": 3.0413797378540037, + "rewards/margins": 9.675726381937663, + "rewards/rejected": -6.634346644083659, + "step": 1579 + }, + { + "epoch": 0.14435815440840566, + "grad_norm": 3.421875, + "kl": 0.0, + "learning_rate": 9.505913228631046e-06, + "logits/chosen": 609421440.0, + "logits/rejected": 484468138.6666667, + "logps/chosen": -529.4581909179688, + "logps/rejected": -409.6104329427083, + "loss": 0.0188, + "rewards/chosen": 2.8100509643554688, + "rewards/margins": 9.224937438964844, + "rewards/rejected": -6.414886474609375, + "step": 1580 + }, + { + "epoch": 0.1444495203289173, + "grad_norm": 11.875, + "kl": 0.0, + "learning_rate": 9.505289843032125e-06, + "logits/chosen": 466205632.0, + "logits/rejected": 927456341.3333334, + "logps/chosen": -274.1125793457031, + "logps/rejected": -513.8264973958334, + "loss": 0.0564, + "rewards/chosen": 1.9172683954238892, + "rewards/margins": 10.921126325925192, + "rewards/rejected": -9.003857930501303, + "step": 1581 + }, + { + "epoch": 0.14454088624942896, + "grad_norm": 10.0, + "kl": 0.0, + "learning_rate": 9.504666084887478e-06, + "logits/chosen": 1129734528.0, + "logits/rejected": 870241280.0, + "logps/chosen": -237.97740173339844, + "logps/rejected": -280.24114990234375, + "loss": 0.0804, + "rewards/chosen": 2.791191816329956, + "rewards/margins": 7.6920716762542725, + "rewards/rejected": -4.900879859924316, + "step": 1582 + }, + { + "epoch": 0.1446322521699406, + "grad_norm": 2.828125, + "kl": 0.0, + "learning_rate": 9.50404195424868e-06, + "logits/chosen": 623216896.0, + "logits/rejected": 558692249.6, + "logps/chosen": -346.9783121744792, + "logps/rejected": -458.227099609375, + "loss": 0.0149, + "rewards/chosen": 3.3552652994791665, + "rewards/margins": 11.655386606852213, + "rewards/rejected": -8.300121307373047, + "step": 1583 + }, + { + "epoch": 0.14472361809045226, + "grad_norm": 3.109375, + "kl": 0.0, + "learning_rate": 9.503417451167345e-06, + "logits/chosen": 721060181.3333334, + "logits/rejected": 759162777.6, + "logps/chosen": -381.1765543619792, + "logps/rejected": -499.09970703125, + "loss": 0.0181, + "rewards/chosen": 3.08084774017334, + "rewards/margins": 10.556529808044434, + "rewards/rejected": -7.475682067871094, + "step": 1584 + }, + { + "epoch": 0.1448149840109639, + "grad_norm": 6.75, + "kl": 0.0, + "learning_rate": 9.502792575695112e-06, + "logits/chosen": 358278579.2, + "logits/rejected": 426115242.6666667, + "logps/chosen": -187.36275634765624, + "logps/rejected": -513.7019856770834, + "loss": 0.0608, + "rewards/chosen": 2.5127561569213865, + "rewards/margins": 10.89861405690511, + "rewards/rejected": -8.385857899983725, + "step": 1585 + }, + { + "epoch": 0.14490634993147555, + "grad_norm": 1.5859375, + "kl": 0.0, + "learning_rate": 9.502167327883652e-06, + "logits/chosen": 623858432.0, + "logits/rejected": 833437696.0, + "logps/chosen": -277.4427490234375, + "logps/rejected": -438.91328125, + "loss": 0.0097, + "rewards/chosen": 4.017838795979817, + "rewards/margins": 11.391453297932943, + "rewards/rejected": -7.373614501953125, + "step": 1586 + }, + { + "epoch": 0.1449977158519872, + "grad_norm": 4.5625, + "kl": 0.0, + "learning_rate": 9.501541707784667e-06, + "logits/chosen": 215361984.0, + "logits/rejected": 369017760.0, + "logps/chosen": -205.44992065429688, + "logps/rejected": -522.8662109375, + "loss": 0.111, + "rewards/chosen": 3.227956533432007, + "rewards/margins": 13.231305837631226, + "rewards/rejected": -10.003349304199219, + "step": 1587 + }, + { + "epoch": 0.14508908177249885, + "grad_norm": 0.3046875, + "kl": 0.0, + "learning_rate": 9.50091571544989e-06, + "logits/rejected": 421457792.0, + "logps/rejected": -626.1495361328125, + "loss": 0.0011, + "rewards/rejected": -8.309231758117676, + "step": 1588 + }, + { + "epoch": 0.1451804476930105, + "grad_norm": 5.46875, + "kl": 0.0, + "learning_rate": 9.500289350931087e-06, + "logits/chosen": 454563328.0, + "logits/rejected": 498113472.0, + "logps/chosen": -362.8503824869792, + "logps/rejected": -296.7991943359375, + "loss": 0.042, + "rewards/chosen": 3.0078538258870444, + "rewards/margins": 10.711754163106283, + "rewards/rejected": -7.703900337219238, + "step": 1589 + }, + { + "epoch": 0.14527181361352215, + "grad_norm": 5.59375, + "kl": 0.0, + "learning_rate": 9.49966261428005e-06, + "logits/chosen": 346259821.71428573, + "logits/rejected": 353659936.0, + "logps/chosen": -278.56033761160717, + "logps/rejected": -151.85987854003906, + "loss": 0.0493, + "rewards/chosen": 3.3097776685442244, + "rewards/margins": 10.83482449395316, + "rewards/rejected": -7.5250468254089355, + "step": 1590 + }, + { + "epoch": 0.1453631795340338, + "grad_norm": 14.0, + "kl": 0.0, + "learning_rate": 9.499035505548603e-06, + "logits/chosen": 710422464.0, + "logits/rejected": 539173184.0, + "logps/chosen": -454.15826416015625, + "logps/rejected": -354.92291259765625, + "loss": 0.0421, + "rewards/chosen": 3.0140633583068848, + "rewards/margins": 9.279644012451172, + "rewards/rejected": -6.265580654144287, + "step": 1591 + }, + { + "epoch": 0.14545454545454545, + "grad_norm": 2.390625, + "kl": 0.0, + "learning_rate": 9.498408024788608e-06, + "logits/chosen": 261388096.0, + "logits/rejected": 397616896.0, + "logps/chosen": -283.2420654296875, + "logps/rejected": -469.7041829427083, + "loss": 0.0118, + "rewards/chosen": 3.2007341384887695, + "rewards/margins": 13.356094678243002, + "rewards/rejected": -10.155360539754232, + "step": 1592 + }, + { + "epoch": 0.1455459113750571, + "grad_norm": 24.375, + "kl": 0.0, + "learning_rate": 9.497780172051946e-06, + "logits/chosen": 748458666.6666666, + "logits/rejected": 598589440.0, + "logps/chosen": -412.5078125, + "logps/rejected": -491.629150390625, + "loss": 0.1113, + "rewards/chosen": 2.2165018717447915, + "rewards/margins": 6.747481981913248, + "rewards/rejected": -4.530980110168457, + "step": 1593 + }, + { + "epoch": 0.14563727729556875, + "grad_norm": 2.921875, + "kl": 0.0, + "learning_rate": 9.497151947390538e-06, + "logits/chosen": 366363840.0, + "logits/rejected": 644911104.0, + "logps/chosen": -385.13446044921875, + "logps/rejected": -536.3397623697916, + "loss": 0.0133, + "rewards/chosen": 2.9769105911254883, + "rewards/margins": 11.641493161519369, + "rewards/rejected": -8.66458257039388, + "step": 1594 + }, + { + "epoch": 0.1457286432160804, + "grad_norm": 10.875, + "kl": 0.0, + "learning_rate": 9.496523350856328e-06, + "logits/chosen": 331377856.0, + "logits/rejected": 503857888.0, + "logps/chosen": -223.08740234375, + "logps/rejected": -551.0469970703125, + "loss": 0.1002, + "rewards/chosen": 2.6464834213256836, + "rewards/margins": 12.532380104064941, + "rewards/rejected": -9.885896682739258, + "step": 1595 + }, + { + "epoch": 0.14582000913659204, + "grad_norm": 2.21875, + "kl": 0.0, + "learning_rate": 9.4958943825013e-06, + "logits/chosen": 727596970.6666666, + "logits/rejected": 495711385.6, + "logps/chosen": -321.6802164713542, + "logps/rejected": -363.471875, + "loss": 0.0101, + "rewards/chosen": 3.905005137125651, + "rewards/margins": 12.244547526041666, + "rewards/rejected": -8.339542388916016, + "step": 1596 + }, + { + "epoch": 0.1459113750571037, + "grad_norm": 0.37890625, + "kl": 0.0, + "learning_rate": 9.495265042377461e-06, + "logits/rejected": 545294592.0, + "logps/rejected": -556.64306640625, + "loss": 0.0014, + "rewards/rejected": -8.062192916870117, + "step": 1597 + }, + { + "epoch": 0.14600274097761534, + "grad_norm": 4.3125, + "kl": 0.0, + "learning_rate": 9.494635330536855e-06, + "logits/chosen": 327947349.3333333, + "logits/rejected": 578543001.6, + "logps/chosen": -377.1484375, + "logps/rejected": -367.14482421875, + "loss": 0.0234, + "rewards/chosen": 3.121615727742513, + "rewards/margins": 10.807414372762045, + "rewards/rejected": -7.685798645019531, + "step": 1598 + }, + { + "epoch": 0.146094106898127, + "grad_norm": 12.6875, + "kl": 0.0, + "learning_rate": 9.49400524703155e-06, + "logits/chosen": 727392768.0, + "logits/rejected": 539789354.6666666, + "logps/chosen": -358.3704833984375, + "logps/rejected": -643.9053141276041, + "loss": 0.0691, + "rewards/chosen": 1.9397637844085693, + "rewards/margins": 10.992021322250366, + "rewards/rejected": -9.052257537841797, + "step": 1599 + }, + { + "epoch": 0.14618547281863864, + "grad_norm": 7.4375, + "kl": 0.0, + "learning_rate": 9.493374791913647e-06, + "logits/chosen": 1276271232.0, + "logits/rejected": 545785280.0, + "logps/chosen": -363.75616455078125, + "logps/rejected": -353.1644287109375, + "loss": 0.1222, + "rewards/chosen": 3.114104747772217, + "rewards/margins": 8.622630596160889, + "rewards/rejected": -5.508525848388672, + "step": 1600 + }, + { + "epoch": 0.1462768387391503, + "grad_norm": 4.3125, + "kl": 0.0, + "learning_rate": 9.492743965235283e-06, + "logits/chosen": 963462528.0, + "logits/rejected": 323789664.0, + "logps/chosen": -434.744140625, + "logps/rejected": -375.37103271484375, + "loss": 0.0281, + "rewards/chosen": 2.880889415740967, + "rewards/margins": 10.12849235534668, + "rewards/rejected": -7.247602939605713, + "step": 1601 + }, + { + "epoch": 0.14636820465966194, + "grad_norm": 19.875, + "kl": 0.0, + "learning_rate": 9.492112767048618e-06, + "logits/chosen": 404439808.0, + "logits/rejected": 594963456.0, + "logps/chosen": -254.2025390625, + "logps/rejected": -757.0755208333334, + "loss": 0.0622, + "rewards/chosen": 2.611955261230469, + "rewards/margins": 12.971713002522787, + "rewards/rejected": -10.359757741292318, + "step": 1602 + }, + { + "epoch": 0.1464595705801736, + "grad_norm": 1.78125, + "kl": 0.0, + "learning_rate": 9.491481197405848e-06, + "logits/chosen": 496270208.0, + "logits/rejected": 1048180838.4, + "logps/chosen": -287.6225992838542, + "logps/rejected": -337.9036376953125, + "loss": 0.0113, + "rewards/chosen": 3.814777692159017, + "rewards/margins": 10.990449078877766, + "rewards/rejected": -7.17567138671875, + "step": 1603 + }, + { + "epoch": 0.14655093650068524, + "grad_norm": 5.6875, + "kl": 0.0, + "learning_rate": 9.490849256359195e-06, + "logits/chosen": 539589222.4, + "logits/rejected": 350404480.0, + "logps/chosen": -362.7463134765625, + "logps/rejected": -420.3147379557292, + "loss": 0.0372, + "rewards/chosen": 2.9865348815917967, + "rewards/margins": 11.073951212565103, + "rewards/rejected": -8.087416330973307, + "step": 1604 + }, + { + "epoch": 0.14664230242119689, + "grad_norm": 5.21875, + "kl": 0.0, + "learning_rate": 9.490216943960921e-06, + "logits/chosen": 573875029.3333334, + "logits/rejected": 426769248.0, + "logps/chosen": -451.5223388671875, + "logps/rejected": -355.2622985839844, + "loss": 0.0374, + "rewards/chosen": 3.008315086364746, + "rewards/margins": 10.025458335876465, + "rewards/rejected": -7.017143249511719, + "step": 1605 + }, + { + "epoch": 0.14673366834170853, + "grad_norm": 0.98828125, + "kl": 0.0, + "learning_rate": 9.489584260263307e-06, + "logits/chosen": 337941600.0, + "logits/rejected": 497793024.0, + "logps/chosen": -123.89884185791016, + "logps/rejected": -550.8311941964286, + "loss": 0.0049, + "rewards/chosen": 3.373293399810791, + "rewards/margins": 12.375714097704206, + "rewards/rejected": -9.002420697893415, + "step": 1606 + }, + { + "epoch": 0.14682503426222018, + "grad_norm": 10.625, + "kl": 0.0, + "learning_rate": 9.48895120531867e-06, + "logits/chosen": 746750771.2, + "logits/rejected": 450508586.6666667, + "logps/chosen": -409.31572265625, + "logps/rejected": -278.2703450520833, + "loss": 0.116, + "rewards/chosen": 2.8044197082519533, + "rewards/margins": 7.805842590332031, + "rewards/rejected": -5.001422882080078, + "step": 1607 + }, + { + "epoch": 0.14691640018273183, + "grad_norm": 3.265625, + "kl": 0.0, + "learning_rate": 9.48831777917936e-06, + "logits/chosen": 610135756.8, + "logits/rejected": 596030421.3333334, + "logps/chosen": -285.5030029296875, + "logps/rejected": -731.1905110677084, + "loss": 0.0193, + "rewards/chosen": 3.9266841888427733, + "rewards/margins": 15.186285781860352, + "rewards/rejected": -11.259601593017578, + "step": 1608 + }, + { + "epoch": 0.14700776610324348, + "grad_norm": 5.0, + "kl": 0.0, + "learning_rate": 9.487683981897758e-06, + "logits/chosen": 850664768.0, + "logits/rejected": 1058870656.0, + "logps/chosen": -389.2084655761719, + "logps/rejected": -304.8360595703125, + "loss": 0.1365, + "rewards/chosen": 3.3978285789489746, + "rewards/margins": 6.972458839416504, + "rewards/rejected": -3.5746302604675293, + "step": 1609 + }, + { + "epoch": 0.14709913202375513, + "grad_norm": 5.28125, + "kl": 0.0, + "learning_rate": 9.487049813526268e-06, + "logits/chosen": 391674931.2, + "logits/rejected": 460849962.6666667, + "logps/chosen": -393.88447265625, + "logps/rejected": -459.2958170572917, + "loss": 0.0258, + "rewards/chosen": 3.6076400756835936, + "rewards/margins": 11.319912719726563, + "rewards/rejected": -7.712272644042969, + "step": 1610 + }, + { + "epoch": 0.14719049794426678, + "grad_norm": 1.7265625, + "kl": 0.0, + "learning_rate": 9.486415274117331e-06, + "logits/chosen": 451579050.6666667, + "logits/rejected": 236451276.8, + "logps/chosen": -244.01456705729166, + "logps/rejected": -403.7129638671875, + "loss": 0.0104, + "rewards/chosen": 3.6650234858194985, + "rewards/margins": 14.054519335428873, + "rewards/rejected": -10.389495849609375, + "step": 1611 + }, + { + "epoch": 0.14728186386477843, + "grad_norm": 9.1875, + "kl": 0.0, + "learning_rate": 9.485780363723419e-06, + "logits/chosen": 772824960.0, + "logits/rejected": 364785024.0, + "logps/chosen": -252.3330535888672, + "logps/rejected": -420.7501220703125, + "loss": 0.0589, + "rewards/chosen": 2.5768847465515137, + "rewards/margins": 9.066934585571289, + "rewards/rejected": -6.490049839019775, + "step": 1612 + }, + { + "epoch": 0.14737322978529008, + "grad_norm": 3.8125, + "kl": 0.0, + "learning_rate": 9.485145082397032e-06, + "logits/chosen": 383337120.0, + "logits/rejected": 616984448.0, + "logps/chosen": -193.03851318359375, + "logps/rejected": -361.0076904296875, + "loss": 0.0272, + "rewards/chosen": 3.3409571647644043, + "rewards/margins": 10.149498462677002, + "rewards/rejected": -6.808541297912598, + "step": 1613 + }, + { + "epoch": 0.14746459570580173, + "grad_norm": 3.140625, + "kl": 0.0, + "learning_rate": 9.484509430190704e-06, + "logits/chosen": 706565952.0, + "logits/rejected": 580586752.0, + "logps/chosen": -254.4276580810547, + "logps/rejected": -465.081298828125, + "loss": 0.0277, + "rewards/chosen": 3.2742185592651367, + "rewards/margins": 11.350346565246582, + "rewards/rejected": -8.076128005981445, + "step": 1614 + }, + { + "epoch": 0.14755596162631338, + "grad_norm": 5.90625, + "kl": 0.0, + "learning_rate": 9.483873407156995e-06, + "logits/chosen": 761285034.6666666, + "logits/rejected": 320623923.2, + "logps/chosen": -425.047119140625, + "logps/rejected": -514.905517578125, + "loss": 0.0327, + "rewards/chosen": 2.5556591351826987, + "rewards/margins": 10.126597245534262, + "rewards/rejected": -7.5709381103515625, + "step": 1615 + }, + { + "epoch": 0.14764732754682502, + "grad_norm": 2.71875, + "kl": 0.0, + "learning_rate": 9.483237013348498e-06, + "logits/chosen": 897387264.0, + "logits/rejected": 685814016.0, + "logps/chosen": -262.5819091796875, + "logps/rejected": -500.286865234375, + "loss": 0.0199, + "rewards/chosen": 3.349470774332682, + "rewards/margins": 11.695556131998698, + "rewards/rejected": -8.346085357666016, + "step": 1616 + }, + { + "epoch": 0.14773869346733667, + "grad_norm": 9.125, + "kl": 0.0, + "learning_rate": 9.48260024881784e-06, + "logits/chosen": 1375201536.0, + "logits/rejected": 601931264.0, + "logps/chosen": -166.05018615722656, + "logps/rejected": -509.5341796875, + "loss": 0.0736, + "rewards/chosen": 0.9112001657485962, + "rewards/margins": 11.655038158098856, + "rewards/rejected": -10.74383799235026, + "step": 1617 + }, + { + "epoch": 0.14783005938784832, + "grad_norm": 3.4375, + "kl": 0.0, + "learning_rate": 9.481963113617675e-06, + "logits/chosen": 641242009.6, + "logits/rejected": 971385429.3333334, + "logps/chosen": -186.15670166015624, + "logps/rejected": -916.2628580729166, + "loss": 0.0281, + "rewards/chosen": 3.1871335983276365, + "rewards/margins": 12.399419593811036, + "rewards/rejected": -9.212285995483398, + "step": 1618 + }, + { + "epoch": 0.14792142530835997, + "grad_norm": 3.78125, + "kl": 0.0, + "learning_rate": 9.481325607800684e-06, + "logits/chosen": 411061600.0, + "logits/rejected": 710289536.0, + "logps/chosen": -241.3467559814453, + "logps/rejected": -680.12548828125, + "loss": 0.0214, + "rewards/chosen": 3.515028476715088, + "rewards/margins": 11.465054512023926, + "rewards/rejected": -7.950026035308838, + "step": 1619 + }, + { + "epoch": 0.14801279122887162, + "grad_norm": 4.25, + "kl": 0.0, + "learning_rate": 9.480687731419587e-06, + "logits/chosen": 332491818.6666667, + "logits/rejected": 308866124.8, + "logps/chosen": -383.0777994791667, + "logps/rejected": -424.99951171875, + "loss": 0.0148, + "rewards/chosen": 3.7934487660725913, + "rewards/margins": 12.220978673299154, + "rewards/rejected": -8.427529907226562, + "step": 1620 + }, + { + "epoch": 0.14810415714938327, + "grad_norm": 3.171875, + "kl": 0.0, + "learning_rate": 9.480049484527127e-06, + "logits/chosen": 958098534.4, + "logits/rejected": 598292309.3333334, + "logps/chosen": -247.335693359375, + "logps/rejected": -660.8016357421875, + "loss": 0.0244, + "rewards/chosen": 3.648711395263672, + "rewards/margins": 11.949342600504558, + "rewards/rejected": -8.300631205240885, + "step": 1621 + }, + { + "epoch": 0.14819552306989492, + "grad_norm": 4.75, + "kl": 0.0, + "learning_rate": 9.479410867176087e-06, + "logits/chosen": 437499136.0, + "logits/rejected": 578521088.0, + "logps/chosen": -306.1787923177083, + "logps/rejected": -411.3150939941406, + "loss": 0.0375, + "rewards/chosen": 3.4447962443033853, + "rewards/margins": 12.193547884623209, + "rewards/rejected": -8.748751640319824, + "step": 1622 + }, + { + "epoch": 0.14828688899040657, + "grad_norm": 15.25, + "kl": 0.0, + "learning_rate": 9.478771879419268e-06, + "logits/chosen": 371928356.5714286, + "logits/rejected": 708333376.0, + "logps/chosen": -203.78721400669642, + "logps/rejected": -773.1722412109375, + "loss": 0.0421, + "rewards/chosen": 3.879368373325893, + "rewards/margins": 14.928618022373744, + "rewards/rejected": -11.049249649047852, + "step": 1623 + }, + { + "epoch": 0.14837825491091822, + "grad_norm": 7.65625, + "kl": 0.0, + "learning_rate": 9.478132521309511e-06, + "logits/chosen": 678269056.0, + "logits/rejected": 1014044992.0, + "logps/chosen": -259.5174560546875, + "logps/rejected": -387.899658203125, + "loss": 0.0395, + "rewards/chosen": 3.4146032333374023, + "rewards/margins": 9.94948434829712, + "rewards/rejected": -6.534881114959717, + "step": 1624 + }, + { + "epoch": 0.14846962083142987, + "grad_norm": 2.296875, + "kl": 0.0, + "learning_rate": 9.477492792899687e-06, + "logits/chosen": 509261721.6, + "logits/rejected": 661621333.3333334, + "logps/chosen": -280.8658935546875, + "logps/rejected": -478.82080078125, + "loss": 0.028, + "rewards/chosen": 3.4141796112060545, + "rewards/margins": 10.778523890177409, + "rewards/rejected": -7.3643442789713545, + "step": 1625 + }, + { + "epoch": 0.14856098675194152, + "grad_norm": 5.90625, + "kl": 0.0, + "learning_rate": 9.476852694242692e-06, + "logits/chosen": 373158758.4, + "logits/rejected": 213544277.33333334, + "logps/chosen": -297.724169921875, + "logps/rejected": -376.8731689453125, + "loss": 0.0419, + "rewards/chosen": 3.028915786743164, + "rewards/margins": 8.798462931315104, + "rewards/rejected": -5.76954714457194, + "step": 1626 + }, + { + "epoch": 0.14865235267245316, + "grad_norm": 6.90625, + "kl": 0.0, + "learning_rate": 9.476212225391461e-06, + "logits/chosen": 407872972.8, + "logits/rejected": 430740949.3333333, + "logps/chosen": -336.5505615234375, + "logps/rejected": -401.101806640625, + "loss": 0.1231, + "rewards/chosen": 3.4147125244140626, + "rewards/margins": 10.23434664408366, + "rewards/rejected": -6.819634119669597, + "step": 1627 + }, + { + "epoch": 0.1487437185929648, + "grad_norm": 4.21875, + "kl": 0.0, + "learning_rate": 9.47557138639895e-06, + "logits/chosen": 451721369.6, + "logits/rejected": 479517141.3333333, + "logps/chosen": -210.4642578125, + "logps/rejected": -431.5458984375, + "loss": 0.0286, + "rewards/chosen": 3.2987648010253907, + "rewards/margins": 10.275156911214193, + "rewards/rejected": -6.976392110188802, + "step": 1628 + }, + { + "epoch": 0.14883508451347646, + "grad_norm": 3.171875, + "kl": 0.0, + "learning_rate": 9.474930177318153e-06, + "logits/chosen": 306785152.0, + "logits/rejected": 566443008.0, + "logps/chosen": -225.92684936523438, + "logps/rejected": -414.6501159667969, + "loss": 0.0221, + "rewards/chosen": 3.8233754634857178, + "rewards/margins": 12.589904546737671, + "rewards/rejected": -8.766529083251953, + "step": 1629 + }, + { + "epoch": 0.1489264504339881, + "grad_norm": 6.0, + "kl": 0.0, + "learning_rate": 9.474288598202091e-06, + "logits/chosen": 831697920.0, + "logits/rejected": 468448960.0, + "logps/chosen": -288.2188313802083, + "logps/rejected": -509.2901306152344, + "loss": 0.0683, + "rewards/chosen": 2.6948960622151694, + "rewards/margins": 10.007379849751791, + "rewards/rejected": -7.312483787536621, + "step": 1630 + }, + { + "epoch": 0.14901781635449976, + "grad_norm": 21.0, + "kl": 0.0, + "learning_rate": 9.473646649103819e-06, + "logits/chosen": 763645184.0, + "logits/rejected": 584799078.4, + "logps/chosen": -191.08626302083334, + "logps/rejected": -272.393017578125, + "loss": 0.0274, + "rewards/chosen": 3.7957111994425454, + "rewards/margins": 9.390664354960123, + "rewards/rejected": -5.594953155517578, + "step": 1631 + }, + { + "epoch": 0.1491091822750114, + "grad_norm": 4.59375, + "kl": 0.0, + "learning_rate": 9.473004330076417e-06, + "logits/chosen": 405789344.0, + "logits/rejected": 538296320.0, + "logps/chosen": -315.645263671875, + "logps/rejected": -481.7001139322917, + "loss": 0.0278, + "rewards/chosen": 2.6830203533172607, + "rewards/margins": 9.902612765630085, + "rewards/rejected": -7.219592412312825, + "step": 1632 + }, + { + "epoch": 0.14920054819552306, + "grad_norm": 6.5625, + "kl": 0.0, + "learning_rate": 9.472361641172998e-06, + "logits/chosen": 377079637.3333333, + "logits/rejected": 1207591424.0, + "logps/chosen": -221.4825642903646, + "logps/rejected": -574.1677734375, + "loss": 0.037, + "rewards/chosen": 2.828683535257975, + "rewards/margins": 11.510282961527507, + "rewards/rejected": -8.681599426269532, + "step": 1633 + }, + { + "epoch": 0.1492919141160347, + "grad_norm": 5.59375, + "kl": 0.0, + "learning_rate": 9.471718582446711e-06, + "logits/chosen": 356901952.0, + "logits/rejected": 802106816.0, + "logps/chosen": -204.5378875732422, + "logps/rejected": -780.6507568359375, + "loss": 0.1297, + "rewards/chosen": 2.054945945739746, + "rewards/margins": 10.761614799499512, + "rewards/rejected": -8.706668853759766, + "step": 1634 + }, + { + "epoch": 0.14938328003654636, + "grad_norm": 13.75, + "kl": 0.0, + "learning_rate": 9.47107515395073e-06, + "logits/chosen": 432321194.6666667, + "logits/rejected": 794195648.0, + "logps/chosen": -306.9957275390625, + "logps/rejected": -402.7293701171875, + "loss": 0.126, + "rewards/chosen": 2.7838398615519204, + "rewards/margins": 8.192264238993326, + "rewards/rejected": -5.408424377441406, + "step": 1635 + }, + { + "epoch": 0.149474645957058, + "grad_norm": 4.90625, + "kl": 0.0, + "learning_rate": 9.470431355738257e-06, + "logits/chosen": 451577036.8, + "logits/rejected": 290330154.6666667, + "logps/chosen": -250.656787109375, + "logps/rejected": -479.8749186197917, + "loss": 0.0467, + "rewards/chosen": 2.847162437438965, + "rewards/margins": 13.929926109313964, + "rewards/rejected": -11.082763671875, + "step": 1636 + }, + { + "epoch": 0.14956601187756965, + "grad_norm": 5.90625, + "kl": 0.0, + "learning_rate": 9.469787187862531e-06, + "logits/chosen": 815096371.2, + "logits/rejected": 512512853.3333333, + "logps/chosen": -375.7427978515625, + "logps/rejected": -256.4373779296875, + "loss": 0.0291, + "rewards/chosen": 3.6212677001953124, + "rewards/margins": 9.033501688639323, + "rewards/rejected": -5.412233988444011, + "step": 1637 + }, + { + "epoch": 0.1496573777980813, + "grad_norm": 3.0625, + "kl": 0.0, + "learning_rate": 9.469142650376819e-06, + "logits/chosen": 511901824.0, + "logits/rejected": 476633088.0, + "logps/chosen": -313.1651916503906, + "logps/rejected": -382.3037109375, + "loss": 0.0182, + "rewards/chosen": 2.891282558441162, + "rewards/margins": 11.39188019434611, + "rewards/rejected": -8.500597635904947, + "step": 1638 + }, + { + "epoch": 0.14974874371859295, + "grad_norm": 1.6875, + "kl": 0.0, + "learning_rate": 9.468497743334415e-06, + "logits/chosen": 826048128.0, + "logits/rejected": 1131076681.142857, + "logps/chosen": -162.55221557617188, + "logps/rejected": -465.68505859375, + "loss": 0.0092, + "rewards/chosen": 2.669076681137085, + "rewards/margins": 11.025411912373134, + "rewards/rejected": -8.356335231236049, + "step": 1639 + }, + { + "epoch": 0.1498401096391046, + "grad_norm": 4.34375, + "kl": 0.0, + "learning_rate": 9.467852466788651e-06, + "logits/chosen": 603319808.0, + "logits/rejected": 1121684480.0, + "logps/chosen": -461.6351318359375, + "logps/rejected": -648.0189819335938, + "loss": 0.015, + "rewards/chosen": 4.091054439544678, + "rewards/margins": 13.396095752716064, + "rewards/rejected": -9.305041313171387, + "step": 1640 + }, + { + "epoch": 0.14993147555961625, + "grad_norm": 2.90625, + "kl": 0.0, + "learning_rate": 9.467206820792882e-06, + "logits/chosen": 266575712.0, + "logits/rejected": 397877577.14285713, + "logps/chosen": -256.3732604980469, + "logps/rejected": -395.2721470424107, + "loss": 0.0122, + "rewards/chosen": 2.3553619384765625, + "rewards/margins": 10.356685093470983, + "rewards/rejected": -8.00132315499442, + "step": 1641 + }, + { + "epoch": 0.1500228414801279, + "grad_norm": 5.21875, + "kl": 0.0, + "learning_rate": 9.4665608054005e-06, + "logits/chosen": 501348266.6666667, + "logits/rejected": 466086528.0, + "logps/chosen": -252.28348795572916, + "logps/rejected": -476.04852294921875, + "loss": 0.0492, + "rewards/chosen": 2.717205047607422, + "rewards/margins": 10.616697311401367, + "rewards/rejected": -7.899492263793945, + "step": 1642 + }, + { + "epoch": 0.15011420740063955, + "grad_norm": 14.0625, + "kl": 0.0, + "learning_rate": 9.465914420664924e-06, + "logits/chosen": 299476032.0, + "logits/rejected": 253859936.0, + "logps/chosen": -256.6459655761719, + "logps/rejected": -349.9101867675781, + "loss": 0.0597, + "rewards/chosen": 3.7484054565429688, + "rewards/margins": 9.815378189086914, + "rewards/rejected": -6.066972732543945, + "step": 1643 + }, + { + "epoch": 0.1502055733211512, + "grad_norm": 6.5, + "kl": 0.0, + "learning_rate": 9.4652676666396e-06, + "logits/chosen": 667869952.0, + "logits/rejected": 418066304.0, + "logps/chosen": -315.20945231119794, + "logps/rejected": -395.8946533203125, + "loss": 0.0481, + "rewards/chosen": 2.7527716954549155, + "rewards/margins": 11.715372403462728, + "rewards/rejected": -8.962600708007812, + "step": 1644 + }, + { + "epoch": 0.15029693924166285, + "grad_norm": 17.125, + "kl": 0.0, + "learning_rate": 9.464620543378014e-06, + "logits/chosen": 304361642.6666667, + "logits/rejected": 567653939.2, + "logps/chosen": -284.7100423177083, + "logps/rejected": -527.195166015625, + "loss": 0.1392, + "rewards/chosen": 3.193561236063639, + "rewards/margins": 8.19378859202067, + "rewards/rejected": -5.000227355957032, + "step": 1645 + }, + { + "epoch": 0.1503883051621745, + "grad_norm": 7.125, + "kl": 0.0, + "learning_rate": 9.463973050933674e-06, + "logits/chosen": 676662016.0, + "logits/rejected": 695359104.0, + "logps/chosen": -301.6225891113281, + "logps/rejected": -575.095458984375, + "loss": 0.045, + "rewards/chosen": 2.5086264610290527, + "rewards/margins": 11.333483219146729, + "rewards/rejected": -8.824856758117676, + "step": 1646 + }, + { + "epoch": 0.15047967108268615, + "grad_norm": 12.75, + "kl": 0.0, + "learning_rate": 9.463325189360122e-06, + "logits/chosen": 424723328.0, + "logits/rejected": 535852672.0, + "logps/chosen": -282.5749816894531, + "logps/rejected": -582.6326904296875, + "loss": 0.0647, + "rewards/chosen": 2.4691708087921143, + "rewards/margins": 12.486879587173462, + "rewards/rejected": -10.017708778381348, + "step": 1647 + }, + { + "epoch": 0.1505710370031978, + "grad_norm": 9.5, + "kl": 0.0, + "learning_rate": 9.46267695871093e-06, + "logits/chosen": 501063488.0, + "logits/rejected": 447760512.0, + "logps/chosen": -403.72418212890625, + "logps/rejected": -457.7406005859375, + "loss": 0.0509, + "rewards/chosen": 2.4389450550079346, + "rewards/margins": 10.304561853408813, + "rewards/rejected": -7.865616798400879, + "step": 1648 + }, + { + "epoch": 0.15066240292370944, + "grad_norm": 3.921875, + "kl": 0.0, + "learning_rate": 9.462028359039702e-06, + "logits/chosen": 731231146.6666666, + "logits/rejected": 927322624.0, + "logps/chosen": -366.3868408203125, + "logps/rejected": -436.219287109375, + "loss": 0.0225, + "rewards/chosen": 2.7667338053385415, + "rewards/margins": 13.290267435709636, + "rewards/rejected": -10.523533630371094, + "step": 1649 + }, + { + "epoch": 0.1507537688442211, + "grad_norm": 6.09375, + "kl": 0.0, + "learning_rate": 9.461379390400069e-06, + "logits/chosen": 526170624.0, + "logits/rejected": 470099200.0, + "logps/chosen": -334.5498046875, + "logps/rejected": -267.27490234375, + "loss": 0.0438, + "rewards/chosen": 2.9248023986816407, + "rewards/margins": 8.967215474446615, + "rewards/rejected": -6.042413075764974, + "step": 1650 + }, + { + "epoch": 0.15084513476473274, + "grad_norm": 2.53125, + "kl": 0.0, + "learning_rate": 9.460730052845697e-06, + "logits/chosen": 383979328.0, + "logits/rejected": 355816533.3333333, + "logps/chosen": -332.9883728027344, + "logps/rejected": -441.0275472005208, + "loss": 0.0092, + "rewards/chosen": 3.662341594696045, + "rewards/margins": 12.657016277313232, + "rewards/rejected": -8.994674682617188, + "step": 1651 + }, + { + "epoch": 0.1509365006852444, + "grad_norm": 12.4375, + "kl": 0.0, + "learning_rate": 9.46008034643028e-06, + "logits/chosen": 330853888.0, + "logits/rejected": 408698304.0, + "logps/chosen": -282.66259765625, + "logps/rejected": -372.26507568359375, + "loss": 0.0384, + "rewards/chosen": 3.534372011820475, + "rewards/margins": 9.632869402567545, + "rewards/rejected": -6.09849739074707, + "step": 1652 + }, + { + "epoch": 0.15102786660575604, + "grad_norm": 7.5625, + "kl": 0.0, + "learning_rate": 9.45943027120754e-06, + "logits/chosen": 408760544.0, + "logits/rejected": 821342890.6666666, + "logps/chosen": -226.999267578125, + "logps/rejected": -428.6145833333333, + "loss": 0.1012, + "rewards/chosen": 2.018071174621582, + "rewards/margins": 8.290988604227703, + "rewards/rejected": -6.27291742960612, + "step": 1653 + }, + { + "epoch": 0.1511192325262677, + "grad_norm": 12.75, + "kl": 0.0, + "learning_rate": 9.458779827231237e-06, + "logits/chosen": 556534912.0, + "logits/rejected": 963314560.0, + "logps/chosen": -305.2226867675781, + "logps/rejected": -439.70587158203125, + "loss": 0.0303, + "rewards/chosen": 3.7432243824005127, + "rewards/margins": 13.490895509719849, + "rewards/rejected": -9.747671127319336, + "step": 1654 + }, + { + "epoch": 0.15121059844677937, + "grad_norm": 28.25, + "kl": 0.0, + "learning_rate": 9.458129014555153e-06, + "logits/chosen": 730830336.0, + "logits/rejected": 580393779.2, + "logps/chosen": -272.2621663411458, + "logps/rejected": -536.588818359375, + "loss": 0.1361, + "rewards/chosen": 1.9157772064208984, + "rewards/margins": 9.471126937866211, + "rewards/rejected": -7.555349731445313, + "step": 1655 + }, + { + "epoch": 0.15130196436729101, + "grad_norm": 3.578125, + "kl": 0.0, + "learning_rate": 9.457477833233104e-06, + "logits/chosen": 460062208.0, + "logits/rejected": 348530048.0, + "logps/chosen": -386.62587890625, + "logps/rejected": -445.6664632161458, + "loss": 0.0292, + "rewards/chosen": 3.492913818359375, + "rewards/margins": 11.900004323323568, + "rewards/rejected": -8.407090504964193, + "step": 1656 + }, + { + "epoch": 0.15139333028780266, + "grad_norm": 10.5625, + "kl": 0.0, + "learning_rate": 9.456826283318938e-06, + "logits/chosen": 480737216.0, + "logits/rejected": 487600160.0, + "logps/chosen": -315.7518615722656, + "logps/rejected": -334.2481384277344, + "loss": 0.1549, + "rewards/chosen": 3.1307570934295654, + "rewards/margins": 7.715895414352417, + "rewards/rejected": -4.585138320922852, + "step": 1657 + }, + { + "epoch": 0.1514846962083143, + "grad_norm": 2.484375, + "kl": 0.0, + "learning_rate": 9.45617436486653e-06, + "logits/chosen": 402801152.0, + "logits/rejected": 503757098.6666667, + "logps/chosen": -331.79681396484375, + "logps/rejected": -442.19873046875, + "loss": 0.0086, + "rewards/chosen": 4.178285598754883, + "rewards/margins": 12.01397705078125, + "rewards/rejected": -7.835691452026367, + "step": 1658 + }, + { + "epoch": 0.15157606212882596, + "grad_norm": 10.6875, + "kl": 0.0, + "learning_rate": 9.455522077929792e-06, + "logits/chosen": 490648576.0, + "logits/rejected": 371715264.0, + "logps/chosen": -357.323486328125, + "logps/rejected": -729.6581420898438, + "loss": 0.0562, + "rewards/chosen": 2.9947280883789062, + "rewards/margins": 15.624677658081055, + "rewards/rejected": -12.629949569702148, + "step": 1659 + }, + { + "epoch": 0.1516674280493376, + "grad_norm": 9.75, + "kl": 0.0, + "learning_rate": 9.454869422562659e-06, + "logits/chosen": 600378176.0, + "logits/rejected": 551469952.0, + "logps/chosen": -432.63568115234375, + "logps/rejected": -555.2752685546875, + "loss": 0.0368, + "rewards/chosen": 3.431306838989258, + "rewards/margins": 12.646900177001953, + "rewards/rejected": -9.215593338012695, + "step": 1660 + }, + { + "epoch": 0.15175879396984926, + "grad_norm": 2.34375, + "kl": 0.0, + "learning_rate": 9.454216398819099e-06, + "logits/chosen": 448741216.0, + "logits/rejected": 623279680.0, + "logps/chosen": -413.9065246582031, + "logps/rejected": -475.25189208984375, + "loss": 0.0113, + "rewards/chosen": 4.045226097106934, + "rewards/margins": 12.7396879196167, + "rewards/rejected": -8.694461822509766, + "step": 1661 + }, + { + "epoch": 0.1518501598903609, + "grad_norm": 2.65625, + "kl": 0.0, + "learning_rate": 9.453563006753113e-06, + "logits/chosen": 619599488.0, + "logits/rejected": 658669312.0, + "logps/chosen": -315.10174560546875, + "logps/rejected": -646.9248860677084, + "loss": 0.0152, + "rewards/chosen": 2.9707016944885254, + "rewards/margins": 13.155369917551676, + "rewards/rejected": -10.18466822306315, + "step": 1662 + }, + { + "epoch": 0.15194152581087256, + "grad_norm": 2.9375, + "kl": 0.0, + "learning_rate": 9.452909246418729e-06, + "logits/chosen": 695826124.8, + "logits/rejected": 388803968.0, + "logps/chosen": -297.0024169921875, + "logps/rejected": -454.9479166666667, + "loss": 0.0137, + "rewards/chosen": 4.3249153137207035, + "rewards/margins": 13.14972407023112, + "rewards/rejected": -8.824808756510416, + "step": 1663 + }, + { + "epoch": 0.1520328917313842, + "grad_norm": 7.09375, + "kl": 0.0, + "learning_rate": 9.452255117870006e-06, + "logits/chosen": 614964224.0, + "logits/rejected": 634684501.3333334, + "logps/chosen": -216.95883178710938, + "logps/rejected": -379.6819661458333, + "loss": 0.045, + "rewards/chosen": 2.1831138134002686, + "rewards/margins": 7.743070681889852, + "rewards/rejected": -5.559956868489583, + "step": 1664 + }, + { + "epoch": 0.15212425765189586, + "grad_norm": 3.0625, + "kl": 0.0, + "learning_rate": 9.451600621161039e-06, + "logits/chosen": 408239701.3333333, + "logits/rejected": 756757145.6, + "logps/chosen": -155.90219116210938, + "logps/rejected": -577.8126953125, + "loss": 0.0151, + "rewards/chosen": 3.941460291544596, + "rewards/margins": 12.981044642130533, + "rewards/rejected": -9.039584350585937, + "step": 1665 + }, + { + "epoch": 0.1522156235724075, + "grad_norm": 12.8125, + "kl": 0.0, + "learning_rate": 9.450945756345943e-06, + "logits/chosen": 1071439769.6, + "logits/rejected": 583511509.3333334, + "logps/chosen": -298.7970458984375, + "logps/rejected": -480.2391764322917, + "loss": 0.0476, + "rewards/chosen": 2.863165283203125, + "rewards/margins": 11.714074198404948, + "rewards/rejected": -8.850908915201822, + "step": 1666 + }, + { + "epoch": 0.15230698949291915, + "grad_norm": 3.40625, + "kl": 0.0, + "learning_rate": 9.450290523478873e-06, + "logits/chosen": 348445760.0, + "logits/rejected": 426950080.0, + "logps/chosen": -236.42544555664062, + "logps/rejected": -708.7139282226562, + "loss": 0.0182, + "rewards/chosen": 3.6723268032073975, + "rewards/margins": 14.546340703964233, + "rewards/rejected": -10.874013900756836, + "step": 1667 + }, + { + "epoch": 0.1523983554134308, + "grad_norm": 3.71875, + "kl": 0.0, + "learning_rate": 9.449634922614009e-06, + "logits/chosen": 501777280.0, + "logits/rejected": 458068070.4, + "logps/chosen": -312.9635009765625, + "logps/rejected": -627.91142578125, + "loss": 0.0209, + "rewards/chosen": 3.0494486490885415, + "rewards/margins": 11.989828745524088, + "rewards/rejected": -8.940380096435547, + "step": 1668 + }, + { + "epoch": 0.15248972133394245, + "grad_norm": 5.375, + "kl": 0.0, + "learning_rate": 9.448978953805564e-06, + "logits/chosen": 586408140.8, + "logits/rejected": 570080256.0, + "logps/chosen": -340.702783203125, + "logps/rejected": -637.2832845052084, + "loss": 0.0462, + "rewards/chosen": 2.7935659408569338, + "rewards/margins": 15.22120621999105, + "rewards/rejected": -12.427640279134115, + "step": 1669 + }, + { + "epoch": 0.1525810872544541, + "grad_norm": 2.171875, + "kl": 0.0, + "learning_rate": 9.44832261710778e-06, + "logits/chosen": 596213333.3333334, + "logits/rejected": 450691430.4, + "logps/chosen": -420.0947265625, + "logps/rejected": -454.487109375, + "loss": 0.0123, + "rewards/chosen": 3.6835714975992837, + "rewards/margins": 13.28552385965983, + "rewards/rejected": -9.601952362060548, + "step": 1670 + }, + { + "epoch": 0.15267245317496575, + "grad_norm": 7.625, + "kl": 0.0, + "learning_rate": 9.44766591257493e-06, + "logits/chosen": 526431232.0, + "logits/rejected": 446639616.0, + "logps/chosen": -233.13995361328125, + "logps/rejected": -469.3426920572917, + "loss": 0.039, + "rewards/chosen": 3.9620447158813477, + "rewards/margins": 11.215354601542156, + "rewards/rejected": -7.253309885660808, + "step": 1671 + }, + { + "epoch": 0.1527638190954774, + "grad_norm": 2.796875, + "kl": 0.0, + "learning_rate": 9.447008840261318e-06, + "logits/chosen": 455817376.0, + "logits/rejected": 499727264.0, + "logps/chosen": -312.821533203125, + "logps/rejected": -571.0440673828125, + "loss": 0.0154, + "rewards/chosen": 3.872929573059082, + "rewards/margins": 14.29269027709961, + "rewards/rejected": -10.419760704040527, + "step": 1672 + }, + { + "epoch": 0.15285518501598905, + "grad_norm": 6.65625, + "kl": 0.0, + "learning_rate": 9.446351400221278e-06, + "logits/chosen": 652857753.6, + "logits/rejected": 581705728.0, + "logps/chosen": -310.6872314453125, + "logps/rejected": -346.1726888020833, + "loss": 0.0309, + "rewards/chosen": 3.562420654296875, + "rewards/margins": 12.045913187662759, + "rewards/rejected": -8.483492533365885, + "step": 1673 + }, + { + "epoch": 0.1529465509365007, + "grad_norm": 9.3125, + "kl": 0.0, + "learning_rate": 9.445693592509173e-06, + "logits/chosen": 564208640.0, + "logits/rejected": 489331584.0, + "logps/chosen": -117.879638671875, + "logps/rejected": -644.583251953125, + "loss": 0.0454, + "rewards/chosen": 3.130652904510498, + "rewards/margins": 13.860801219940186, + "rewards/rejected": -10.730148315429688, + "step": 1674 + }, + { + "epoch": 0.15303791685701235, + "grad_norm": 4.6875, + "kl": 0.0, + "learning_rate": 9.445035417179398e-06, + "logits/chosen": 817146197.3333334, + "logits/rejected": 533521868.8, + "logps/chosen": -468.6979166666667, + "logps/rejected": -463.98671875, + "loss": 0.0236, + "rewards/chosen": 2.8226105372111, + "rewards/margins": 11.786896578470865, + "rewards/rejected": -8.964286041259765, + "step": 1675 + }, + { + "epoch": 0.153129282777524, + "grad_norm": 2.171875, + "kl": 0.0, + "learning_rate": 9.444376874286378e-06, + "logits/chosen": 635391360.0, + "logits/rejected": 882423296.0, + "logps/chosen": -434.998779296875, + "logps/rejected": -495.6083170572917, + "loss": 0.0096, + "rewards/chosen": 3.435955762863159, + "rewards/margins": 12.11831291516622, + "rewards/rejected": -8.68235715230306, + "step": 1676 + }, + { + "epoch": 0.15322064869803564, + "grad_norm": 8.0, + "kl": 0.0, + "learning_rate": 9.443717963884568e-06, + "logits/chosen": 513474252.8, + "logits/rejected": 554193578.6666666, + "logps/chosen": -204.0518310546875, + "logps/rejected": -669.189697265625, + "loss": 0.0392, + "rewards/chosen": 3.001468467712402, + "rewards/margins": 12.443304506937661, + "rewards/rejected": -9.44183603922526, + "step": 1677 + }, + { + "epoch": 0.1533120146185473, + "grad_norm": 5.4375, + "kl": 0.0, + "learning_rate": 9.443058686028457e-06, + "logits/chosen": 567702374.4, + "logits/rejected": 660842197.3333334, + "logps/chosen": -343.664111328125, + "logps/rejected": -453.3544108072917, + "loss": 0.0341, + "rewards/chosen": 3.1253793716430662, + "rewards/margins": 13.479606437683106, + "rewards/rejected": -10.354227066040039, + "step": 1678 + }, + { + "epoch": 0.15340338053905894, + "grad_norm": 3.90625, + "kl": 0.0, + "learning_rate": 9.442399040772556e-06, + "logits/chosen": 581231104.0, + "logits/rejected": 341312608.0, + "logps/chosen": -491.2034912109375, + "logps/rejected": -580.5316772460938, + "loss": 0.019, + "rewards/chosen": 3.4714577198028564, + "rewards/margins": 16.052476167678833, + "rewards/rejected": -12.581018447875977, + "step": 1679 + }, + { + "epoch": 0.1534947464595706, + "grad_norm": 17.625, + "kl": 0.0, + "learning_rate": 9.441739028171414e-06, + "logits/chosen": 353139157.3333333, + "logits/rejected": 419747635.2, + "logps/chosen": -269.8811442057292, + "logps/rejected": -430.33369140625, + "loss": 0.0523, + "rewards/chosen": 2.583660284678141, + "rewards/margins": 10.536908880869547, + "rewards/rejected": -7.953248596191406, + "step": 1680 + }, + { + "epoch": 0.15358611238008224, + "grad_norm": 17.875, + "kl": 0.0, + "learning_rate": 9.441078648279609e-06, + "logits/chosen": 950857318.4, + "logits/rejected": 616693760.0, + "logps/chosen": -206.4135986328125, + "logps/rejected": -256.404296875, + "loss": 0.065, + "rewards/chosen": 2.682090950012207, + "rewards/margins": 9.126426887512206, + "rewards/rejected": -6.4443359375, + "step": 1681 + }, + { + "epoch": 0.1536774783005939, + "grad_norm": 3.765625, + "kl": 0.0, + "learning_rate": 9.440417901151748e-06, + "logits/chosen": 721741926.4, + "logits/rejected": 645485909.3333334, + "logps/chosen": -492.11767578125, + "logps/rejected": -569.8970947265625, + "loss": 0.03, + "rewards/chosen": 3.1772241592407227, + "rewards/margins": 12.122726122538248, + "rewards/rejected": -8.945501963297525, + "step": 1682 + }, + { + "epoch": 0.15376884422110554, + "grad_norm": 3.34375, + "kl": 0.0, + "learning_rate": 9.439756786842466e-06, + "logits/chosen": 437399552.0, + "logits/rejected": 549328448.0, + "logps/chosen": -291.42299107142856, + "logps/rejected": -479.8548583984375, + "loss": 0.0344, + "rewards/chosen": 3.4711181095668246, + "rewards/margins": 9.496710436684744, + "rewards/rejected": -6.02559232711792, + "step": 1683 + }, + { + "epoch": 0.1538602101416172, + "grad_norm": 9.0, + "kl": 0.0, + "learning_rate": 9.439095305406435e-06, + "logits/chosen": 511290453.3333333, + "logits/rejected": 238629606.4, + "logps/chosen": -230.8450927734375, + "logps/rejected": -263.383349609375, + "loss": 0.0218, + "rewards/chosen": 4.157487869262695, + "rewards/margins": 11.940460586547852, + "rewards/rejected": -7.782972717285157, + "step": 1684 + }, + { + "epoch": 0.15395157606212884, + "grad_norm": 5.75, + "kl": 0.0, + "learning_rate": 9.43843345689835e-06, + "logits/chosen": 348774677.3333333, + "logits/rejected": 356246169.6, + "logps/chosen": -302.98362223307294, + "logps/rejected": -396.9384521484375, + "loss": 0.05, + "rewards/chosen": 2.2515576680501304, + "rewards/margins": 10.53212865193685, + "rewards/rejected": -8.280570983886719, + "step": 1685 + }, + { + "epoch": 0.15404294198264049, + "grad_norm": 5.53125, + "kl": 0.0, + "learning_rate": 9.437771241372942e-06, + "logits/chosen": 716177322.6666666, + "logits/rejected": 551294771.2, + "logps/chosen": -446.4306640625, + "logps/rejected": -499.030517578125, + "loss": 0.0195, + "rewards/chosen": 3.7791131337483725, + "rewards/margins": 12.169040807088217, + "rewards/rejected": -8.389927673339844, + "step": 1686 + }, + { + "epoch": 0.15413430790315213, + "grad_norm": 4.09375, + "kl": 0.0, + "learning_rate": 9.437108658884968e-06, + "logits/chosen": 354931840.0, + "logits/rejected": 292522336.0, + "logps/chosen": -323.9648742675781, + "logps/rejected": -510.1826477050781, + "loss": 0.0221, + "rewards/chosen": 3.1304283142089844, + "rewards/margins": 15.946939468383789, + "rewards/rejected": -12.816511154174805, + "step": 1687 + }, + { + "epoch": 0.15422567382366378, + "grad_norm": 4.6875, + "kl": 0.0, + "learning_rate": 9.43644570948922e-06, + "logits/chosen": 473417574.4, + "logits/rejected": 363364608.0, + "logps/chosen": -199.1033447265625, + "logps/rejected": -257.3513997395833, + "loss": 0.0409, + "rewards/chosen": 3.352509689331055, + "rewards/margins": 9.164268366495769, + "rewards/rejected": -5.811758677164714, + "step": 1688 + }, + { + "epoch": 0.15431703974417543, + "grad_norm": 1.453125, + "kl": 0.0, + "learning_rate": 9.435782393240515e-06, + "logits/chosen": 651013312.0, + "logits/rejected": 562443227.4285715, + "logps/chosen": -490.52337646484375, + "logps/rejected": -370.87325613839283, + "loss": 0.0046, + "rewards/chosen": 3.38641357421875, + "rewards/margins": 11.775474548339844, + "rewards/rejected": -8.389060974121094, + "step": 1689 + }, + { + "epoch": 0.15440840566468708, + "grad_norm": 4.96875, + "kl": 0.0, + "learning_rate": 9.435118710193705e-06, + "logits/chosen": 502068633.6, + "logits/rejected": 690047061.3333334, + "logps/chosen": -287.4569091796875, + "logps/rejected": -356.43701171875, + "loss": 0.0414, + "rewards/chosen": 2.674199104309082, + "rewards/margins": 10.610599199930828, + "rewards/rejected": -7.936400095621745, + "step": 1690 + }, + { + "epoch": 0.15449977158519873, + "grad_norm": 22.375, + "kl": 0.0, + "learning_rate": 9.43445466040367e-06, + "logits/chosen": 796873728.0, + "logits/rejected": 648813354.6666666, + "logps/chosen": -417.47373046875, + "logps/rejected": -490.921630859375, + "loss": 0.0596, + "rewards/chosen": 2.750944900512695, + "rewards/margins": 15.143599319458009, + "rewards/rejected": -12.392654418945312, + "step": 1691 + }, + { + "epoch": 0.15459113750571038, + "grad_norm": 2.828125, + "kl": 0.0, + "learning_rate": 9.433790243925323e-06, + "logits/chosen": 770560000.0, + "logits/rejected": 407074048.0, + "logps/chosen": -226.2890625, + "logps/rejected": -330.9321695963542, + "loss": 0.0216, + "rewards/chosen": 3.599896240234375, + "rewards/margins": 11.735453414916993, + "rewards/rejected": -8.135557174682617, + "step": 1692 + }, + { + "epoch": 0.15468250342622203, + "grad_norm": 12.4375, + "kl": 0.0, + "learning_rate": 9.4331254608136e-06, + "logits/chosen": 314867968.0, + "logits/rejected": 325874901.3333333, + "logps/chosen": -303.4672546386719, + "logps/rejected": -379.1609700520833, + "loss": 0.112, + "rewards/chosen": 1.1847206354141235, + "rewards/margins": 9.514950712521872, + "rewards/rejected": -8.330230077107748, + "step": 1693 + }, + { + "epoch": 0.15477386934673368, + "grad_norm": 5.75, + "kl": 0.0, + "learning_rate": 9.432460311123476e-06, + "logits/chosen": 623230464.0, + "logits/rejected": 603035221.3333334, + "logps/chosen": -431.617822265625, + "logps/rejected": -374.2684326171875, + "loss": 0.0377, + "rewards/chosen": 2.980556297302246, + "rewards/margins": 9.870675214131673, + "rewards/rejected": -6.890118916829427, + "step": 1694 + }, + { + "epoch": 0.15486523526724533, + "grad_norm": 1.0859375, + "kl": 0.0, + "learning_rate": 9.431794794909952e-06, + "logits/chosen": 725021354.6666666, + "logits/rejected": 347244492.8, + "logps/chosen": -300.5484619140625, + "logps/rejected": -505.32470703125, + "loss": 0.0062, + "rewards/chosen": 4.31903076171875, + "rewards/margins": 12.328541564941407, + "rewards/rejected": -8.009510803222657, + "step": 1695 + }, + { + "epoch": 0.15495660118775698, + "grad_norm": 4.0, + "kl": 0.0, + "learning_rate": 9.43112891222806e-06, + "logits/chosen": 622534656.0, + "logits/rejected": 675563136.0, + "logps/chosen": -347.8471984863281, + "logps/rejected": -509.3247375488281, + "loss": 0.0242, + "rewards/chosen": 2.9935710430145264, + "rewards/margins": 14.039667844772339, + "rewards/rejected": -11.046096801757812, + "step": 1696 + }, + { + "epoch": 0.15504796710826863, + "grad_norm": 7.9375, + "kl": 0.0, + "learning_rate": 9.430462663132862e-06, + "logits/chosen": 620501802.6666666, + "logits/rejected": 731967488.0, + "logps/chosen": -365.13037109375, + "logps/rejected": -583.946875, + "loss": 0.0951, + "rewards/chosen": 2.29222838083903, + "rewards/margins": 12.52435499827067, + "rewards/rejected": -10.232126617431641, + "step": 1697 + }, + { + "epoch": 0.15513933302878027, + "grad_norm": 8.0, + "kl": 0.0, + "learning_rate": 9.429796047679452e-06, + "logits/chosen": 855601100.8, + "logits/rejected": 665414570.6666666, + "logps/chosen": -384.763916015625, + "logps/rejected": -598.5056966145834, + "loss": 0.0548, + "rewards/chosen": 3.0829036712646483, + "rewards/margins": 8.767543284098307, + "rewards/rejected": -5.684639612833659, + "step": 1698 + }, + { + "epoch": 0.15523069894929192, + "grad_norm": 14.5, + "kl": 0.0, + "learning_rate": 9.42912906592295e-06, + "logits/chosen": 939335168.0, + "logits/rejected": 626067072.0, + "logps/chosen": -208.11690266927084, + "logps/rejected": -499.63177490234375, + "loss": 0.0834, + "rewards/chosen": 2.8701041539510093, + "rewards/margins": 8.644692262013754, + "rewards/rejected": -5.774588108062744, + "step": 1699 + }, + { + "epoch": 0.15532206486980357, + "grad_norm": 4.21875, + "kl": 0.0, + "learning_rate": 9.428461717918512e-06, + "logits/chosen": 400724864.0, + "logits/rejected": 879512192.0, + "logps/chosen": -246.6083984375, + "logps/rejected": -464.951171875, + "loss": 0.0177, + "rewards/chosen": 4.135159492492676, + "rewards/margins": 13.209322929382324, + "rewards/rejected": -9.074163436889648, + "step": 1700 + }, + { + "epoch": 0.15541343079031522, + "grad_norm": 22.25, + "kl": 0.0, + "learning_rate": 9.42779400372132e-06, + "logits/chosen": 440222176.0, + "logits/rejected": 449666400.0, + "logps/chosen": -153.72293090820312, + "logps/rejected": -493.5642395019531, + "loss": 0.1293, + "rewards/chosen": 1.8570828437805176, + "rewards/margins": 9.303367137908936, + "rewards/rejected": -7.446284294128418, + "step": 1701 + }, + { + "epoch": 0.15550479671082687, + "grad_norm": 3.578125, + "kl": 0.0, + "learning_rate": 9.42712592338659e-06, + "logits/chosen": 355252582.4, + "logits/rejected": 531457194.6666667, + "logps/chosen": -246.489453125, + "logps/rejected": -501.7441813151042, + "loss": 0.0192, + "rewards/chosen": 3.863643264770508, + "rewards/margins": 14.412885920206705, + "rewards/rejected": -10.549242655436197, + "step": 1702 + }, + { + "epoch": 0.15559616263133852, + "grad_norm": 0.6171875, + "kl": 0.0, + "learning_rate": 9.426457476969561e-06, + "logits/chosen": 410081664.0, + "logits/rejected": 1134728411.4285715, + "logps/chosen": -239.73782348632812, + "logps/rejected": -849.2875279017857, + "loss": 0.0021, + "rewards/chosen": 5.749432563781738, + "rewards/margins": 18.73196234021868, + "rewards/rejected": -12.982529776436943, + "step": 1703 + }, + { + "epoch": 0.15568752855185017, + "grad_norm": 9.4375, + "kl": 0.0, + "learning_rate": 9.425788664525514e-06, + "logits/chosen": 617936969.1428572, + "logits/rejected": 447235616.0, + "logps/chosen": -365.60239955357144, + "logps/rejected": -351.271240234375, + "loss": 0.0486, + "rewards/chosen": 2.9925458090645924, + "rewards/margins": 10.821769441877093, + "rewards/rejected": -7.8292236328125, + "step": 1704 + }, + { + "epoch": 0.15577889447236182, + "grad_norm": 5.96875, + "kl": 0.0, + "learning_rate": 9.425119486109748e-06, + "logits/chosen": 605026986.6666666, + "logits/rejected": 296648089.6, + "logps/chosen": -205.29341634114584, + "logps/rejected": -270.34833984375, + "loss": 0.0291, + "rewards/chosen": 3.1792545318603516, + "rewards/margins": 9.702990341186524, + "rewards/rejected": -6.523735809326172, + "step": 1705 + }, + { + "epoch": 0.15587026039287347, + "grad_norm": 12.5625, + "kl": 0.0, + "learning_rate": 9.424449941777602e-06, + "logits/chosen": 589707904.0, + "logits/rejected": 803392512.0, + "logps/chosen": -314.8881530761719, + "logps/rejected": -286.99688720703125, + "loss": 0.1673, + "rewards/chosen": 4.353868007659912, + "rewards/margins": 8.213926474253336, + "rewards/rejected": -3.8600584665934243, + "step": 1706 + }, + { + "epoch": 0.15596162631338512, + "grad_norm": 0.482421875, + "kl": 0.0, + "learning_rate": 9.423780031584439e-06, + "logits/chosen": 241458160.0, + "logits/rejected": 566639573.3333334, + "logps/chosen": -213.14480590820312, + "logps/rejected": -586.2227376302084, + "loss": 0.0021, + "rewards/chosen": 4.866520881652832, + "rewards/margins": 14.51174259185791, + "rewards/rejected": -9.645221710205078, + "step": 1707 + }, + { + "epoch": 0.15605299223389676, + "grad_norm": 4.53125, + "kl": 0.0, + "learning_rate": 9.423109755585653e-06, + "logits/chosen": 639595328.0, + "logits/rejected": 472235136.0, + "logps/chosen": -393.655517578125, + "logps/rejected": -389.5729064941406, + "loss": 0.02, + "rewards/chosen": 3.4778311252593994, + "rewards/margins": 11.109049081802368, + "rewards/rejected": -7.631217956542969, + "step": 1708 + }, + { + "epoch": 0.1561443581544084, + "grad_norm": 6.0, + "kl": 0.0, + "learning_rate": 9.422439113836673e-06, + "logits/chosen": 538193664.0, + "logits/rejected": 841768832.0, + "logps/chosen": -308.4651184082031, + "logps/rejected": -503.2394104003906, + "loss": 0.0309, + "rewards/chosen": 3.0367488861083984, + "rewards/margins": 11.945858001708984, + "rewards/rejected": -8.909109115600586, + "step": 1709 + }, + { + "epoch": 0.15623572407492006, + "grad_norm": 11.0, + "kl": 0.0, + "learning_rate": 9.421768106392952e-06, + "logits/chosen": 875659776.0, + "logits/rejected": 827113280.0, + "logps/chosen": -211.72418212890625, + "logps/rejected": -598.4153442382812, + "loss": 0.0897, + "rewards/chosen": 2.8214438756306968, + "rewards/margins": 10.402245362599691, + "rewards/rejected": -7.580801486968994, + "step": 1710 + }, + { + "epoch": 0.1563270899954317, + "grad_norm": 7.21875, + "kl": 0.0, + "learning_rate": 9.421096733309978e-06, + "logits/chosen": 479231283.2, + "logits/rejected": 384912725.3333333, + "logps/chosen": -289.9383544921875, + "logps/rejected": -594.7739664713541, + "loss": 0.0494, + "rewards/chosen": 2.509480857849121, + "rewards/margins": 10.334789339701334, + "rewards/rejected": -7.825308481852214, + "step": 1711 + }, + { + "epoch": 0.15641845591594336, + "grad_norm": 11.0, + "kl": 0.0, + "learning_rate": 9.420424994643268e-06, + "logits/chosen": 506001216.0, + "logits/rejected": 487639040.0, + "logps/chosen": -353.7301025390625, + "logps/rejected": -320.2946370442708, + "loss": 0.0645, + "rewards/chosen": 1.948506236076355, + "rewards/margins": 7.263754725456238, + "rewards/rejected": -5.315248489379883, + "step": 1712 + }, + { + "epoch": 0.156509821836455, + "grad_norm": 10.5, + "kl": 0.0, + "learning_rate": 9.419752890448364e-06, + "logits/chosen": 532412736.0, + "logits/rejected": 646746240.0, + "logps/chosen": -392.4439392089844, + "logps/rejected": -446.7354736328125, + "loss": 0.1385, + "rewards/chosen": 2.9111697673797607, + "rewards/margins": 7.056480169296265, + "rewards/rejected": -4.145310401916504, + "step": 1713 + }, + { + "epoch": 0.15660118775696666, + "grad_norm": 4.53125, + "kl": 0.0, + "learning_rate": 9.419080420780847e-06, + "logits/chosen": 438121152.0, + "logits/rejected": 625395200.0, + "logps/chosen": -341.73919677734375, + "logps/rejected": -587.3801879882812, + "loss": 0.025, + "rewards/chosen": 3.4942564964294434, + "rewards/margins": 14.284254550933838, + "rewards/rejected": -10.789998054504395, + "step": 1714 + }, + { + "epoch": 0.1566925536774783, + "grad_norm": 4.1875, + "kl": 0.0, + "learning_rate": 9.418407585696322e-06, + "logits/chosen": 523103968.0, + "logits/rejected": 662483328.0, + "logps/chosen": -362.28179931640625, + "logps/rejected": -443.8805236816406, + "loss": 0.1314, + "rewards/chosen": 2.9319040775299072, + "rewards/margins": 10.059145212173462, + "rewards/rejected": -7.127241134643555, + "step": 1715 + }, + { + "epoch": 0.15678391959798996, + "grad_norm": 5.5625, + "kl": 0.0, + "learning_rate": 9.417734385250428e-06, + "logits/chosen": 386289664.0, + "logits/rejected": 504608597.3333333, + "logps/chosen": -327.08525390625, + "logps/rejected": -594.8562418619791, + "loss": 0.0313, + "rewards/chosen": 3.6119148254394533, + "rewards/margins": 13.069247436523437, + "rewards/rejected": -9.457332611083984, + "step": 1716 + }, + { + "epoch": 0.1568752855185016, + "grad_norm": 3.921875, + "kl": 0.0, + "learning_rate": 9.417060819498832e-06, + "logits/chosen": 662539904.0, + "logits/rejected": 507505376.0, + "logps/chosen": -281.2422790527344, + "logps/rejected": -610.5595092773438, + "loss": 0.024, + "rewards/chosen": 3.295518398284912, + "rewards/margins": 13.163026332855225, + "rewards/rejected": -9.867507934570312, + "step": 1717 + }, + { + "epoch": 0.15696665143901326, + "grad_norm": 6.875, + "kl": 0.0, + "learning_rate": 9.416386888497231e-06, + "logits/chosen": 411980224.0, + "logits/rejected": 346730432.0, + "logps/chosen": -235.8605194091797, + "logps/rejected": -601.052490234375, + "loss": 0.0411, + "rewards/chosen": 3.380418062210083, + "rewards/margins": 17.036036729812622, + "rewards/rejected": -13.655618667602539, + "step": 1718 + }, + { + "epoch": 0.1570580173595249, + "grad_norm": 11.875, + "kl": 0.0, + "learning_rate": 9.415712592301354e-06, + "logits/chosen": 519581866.6666667, + "logits/rejected": 288626688.0, + "logps/chosen": -148.17940266927084, + "logps/rejected": -500.864306640625, + "loss": 0.0618, + "rewards/chosen": 2.24819548924764, + "rewards/margins": 12.986801751454673, + "rewards/rejected": -10.738606262207032, + "step": 1719 + }, + { + "epoch": 0.15714938328003655, + "grad_norm": 8.5, + "kl": 0.0, + "learning_rate": 9.415037930966957e-06, + "logits/chosen": 432974438.4, + "logits/rejected": 639505280.0, + "logps/chosen": -254.714892578125, + "logps/rejected": -397.6310628255208, + "loss": 0.0745, + "rewards/chosen": 2.5431291580200197, + "rewards/margins": 9.324051094055175, + "rewards/rejected": -6.780921936035156, + "step": 1720 + }, + { + "epoch": 0.1572407492005482, + "grad_norm": 5.96875, + "kl": 0.0, + "learning_rate": 9.414362904549829e-06, + "logits/chosen": 209053184.0, + "logits/rejected": 533817753.6, + "logps/chosen": -129.9856160481771, + "logps/rejected": -561.8509765625, + "loss": 0.0363, + "rewards/chosen": 3.232863744099935, + "rewards/margins": 9.428577550252278, + "rewards/rejected": -6.195713806152344, + "step": 1721 + }, + { + "epoch": 0.15733211512105985, + "grad_norm": 5.8125, + "kl": 0.0, + "learning_rate": 9.413687513105789e-06, + "logits/chosen": 564076501.3333334, + "logits/rejected": 262304204.8, + "logps/chosen": -205.44873046875, + "logps/rejected": -423.13818359375, + "loss": 0.0411, + "rewards/chosen": 2.1556386947631836, + "rewards/margins": 10.223793601989746, + "rewards/rejected": -8.068154907226562, + "step": 1722 + }, + { + "epoch": 0.1574234810415715, + "grad_norm": 12.125, + "kl": 0.0, + "learning_rate": 9.413011756690686e-06, + "logits/chosen": 395142826.6666667, + "logits/rejected": 448120832.0, + "logps/chosen": -316.8314615885417, + "logps/rejected": -595.02314453125, + "loss": 0.0418, + "rewards/chosen": 2.7987499237060547, + "rewards/margins": 10.43346061706543, + "rewards/rejected": -7.634710693359375, + "step": 1723 + }, + { + "epoch": 0.15751484696208315, + "grad_norm": 2.9375, + "kl": 0.0, + "learning_rate": 9.412335635360397e-06, + "logits/chosen": 318844384.0, + "logits/rejected": 652982016.0, + "logps/chosen": -97.93073272705078, + "logps/rejected": -763.5752563476562, + "loss": 0.0338, + "rewards/chosen": 3.1391777992248535, + "rewards/margins": 17.273071765899658, + "rewards/rejected": -14.133893966674805, + "step": 1724 + }, + { + "epoch": 0.1576062128825948, + "grad_norm": 5.0625, + "kl": 0.0, + "learning_rate": 9.411659149170833e-06, + "logits/chosen": 1178906214.4, + "logits/rejected": 788814848.0, + "logps/chosen": -354.8763671875, + "logps/rejected": -652.2567952473959, + "loss": 0.0321, + "rewards/chosen": 3.2686492919921877, + "rewards/margins": 12.425025177001952, + "rewards/rejected": -9.156375885009766, + "step": 1725 + }, + { + "epoch": 0.15769757880310645, + "grad_norm": 5.875, + "kl": 0.0, + "learning_rate": 9.410982298177933e-06, + "logits/chosen": 646186188.8, + "logits/rejected": 324991296.0, + "logps/chosen": -332.99541015625, + "logps/rejected": -245.64371744791666, + "loss": 0.045, + "rewards/chosen": 2.7302356719970704, + "rewards/margins": 8.857399876912435, + "rewards/rejected": -6.127164204915364, + "step": 1726 + }, + { + "epoch": 0.1577889447236181, + "grad_norm": 2.21875, + "kl": 0.0, + "learning_rate": 9.410305082437664e-06, + "logits/chosen": 517582549.3333333, + "logits/rejected": 432358860.8, + "logps/chosen": -373.5486653645833, + "logps/rejected": -400.4591796875, + "loss": 0.0088, + "rewards/chosen": 3.969492276509603, + "rewards/margins": 12.637438901265464, + "rewards/rejected": -8.66794662475586, + "step": 1727 + }, + { + "epoch": 0.15788031064412975, + "grad_norm": 7.5625, + "kl": 0.0, + "learning_rate": 9.409627502006027e-06, + "logits/chosen": 752130432.0, + "logits/rejected": 445523090.28571427, + "logps/chosen": -508.46746826171875, + "logps/rejected": -398.5904017857143, + "loss": 0.019, + "rewards/chosen": 3.8852295875549316, + "rewards/margins": 11.375652517591204, + "rewards/rejected": -7.490422930036273, + "step": 1728 + }, + { + "epoch": 0.1579716765646414, + "grad_norm": 6.40625, + "kl": 0.0, + "learning_rate": 9.408949556939053e-06, + "logits/chosen": 505346624.0, + "logits/rejected": 421696768.0, + "logps/chosen": -321.7388610839844, + "logps/rejected": -397.39227294921875, + "loss": 0.0408, + "rewards/chosen": 2.4416344165802, + "rewards/margins": 9.898072004318237, + "rewards/rejected": -7.456437587738037, + "step": 1729 + }, + { + "epoch": 0.15806304248515304, + "grad_norm": 5.84375, + "kl": 0.0, + "learning_rate": 9.4082712472928e-06, + "logits/chosen": 743642709.3333334, + "logits/rejected": 781627904.0, + "logps/chosen": -481.7939453125, + "logps/rejected": -452.207275390625, + "loss": 0.0265, + "rewards/chosen": 3.616156895955404, + "rewards/margins": 9.451094182332357, + "rewards/rejected": -5.834937286376953, + "step": 1730 + }, + { + "epoch": 0.1581544084056647, + "grad_norm": 4.40625, + "kl": 0.0, + "learning_rate": 9.407592573123359e-06, + "logits/chosen": 375172864.0, + "logits/rejected": 442933120.0, + "logps/chosen": -175.31500244140625, + "logps/rejected": -366.75128173828125, + "loss": 0.036, + "rewards/chosen": 3.065887451171875, + "rewards/margins": 10.116745948791504, + "rewards/rejected": -7.050858497619629, + "step": 1731 + }, + { + "epoch": 0.15824577432617634, + "grad_norm": 6.0, + "kl": 0.0, + "learning_rate": 9.406913534486848e-06, + "logits/chosen": 609352630.8571428, + "logits/rejected": 1070767488.0, + "logps/chosen": -286.56556919642856, + "logps/rejected": -589.9903564453125, + "loss": 0.0464, + "rewards/chosen": 3.1696989876883372, + "rewards/margins": 14.775314603533063, + "rewards/rejected": -11.605615615844727, + "step": 1732 + }, + { + "epoch": 0.158337140246688, + "grad_norm": 21.375, + "kl": 0.0, + "learning_rate": 9.40623413143942e-06, + "logits/chosen": 1073373184.0, + "logits/rejected": 620826944.0, + "logps/chosen": -379.2219645182292, + "logps/rejected": -305.4193115234375, + "loss": 0.0689, + "rewards/chosen": 2.8674338658650718, + "rewards/margins": 8.598663647969564, + "rewards/rejected": -5.731229782104492, + "step": 1733 + }, + { + "epoch": 0.15842850616719964, + "grad_norm": 1.953125, + "kl": 0.0, + "learning_rate": 9.405554364037254e-06, + "logits/chosen": 637780608.0, + "logits/rejected": 644296192.0, + "logps/chosen": -244.825927734375, + "logps/rejected": -695.0548502604166, + "loss": 0.0078, + "rewards/chosen": 3.649372100830078, + "rewards/margins": 12.988043467203775, + "rewards/rejected": -9.338671366373697, + "step": 1734 + }, + { + "epoch": 0.1585198720877113, + "grad_norm": 10.0, + "kl": 0.0, + "learning_rate": 9.404874232336561e-06, + "logits/chosen": 609020288.0, + "logits/rejected": 1401486720.0, + "logps/chosen": -440.09771728515625, + "logps/rejected": -420.13336181640625, + "loss": 0.0831, + "rewards/chosen": 3.2838869094848633, + "rewards/margins": 7.672588348388672, + "rewards/rejected": -4.388701438903809, + "step": 1735 + }, + { + "epoch": 0.15861123800822294, + "grad_norm": 12.5625, + "kl": 0.0, + "learning_rate": 9.404193736393578e-06, + "logits/chosen": 765605717.3333334, + "logits/rejected": 524524134.4, + "logps/chosen": -375.0144449869792, + "logps/rejected": -535.853125, + "loss": 0.0727, + "rewards/chosen": 3.492288589477539, + "rewards/margins": 9.85713996887207, + "rewards/rejected": -6.364851379394532, + "step": 1736 + }, + { + "epoch": 0.1587026039287346, + "grad_norm": 1.109375, + "kl": 0.0, + "learning_rate": 9.403512876264583e-06, + "logits/chosen": 736710592.0, + "logits/rejected": 871776597.3333334, + "logps/chosen": -315.61199951171875, + "logps/rejected": -455.345458984375, + "loss": 0.0075, + "rewards/chosen": 3.739370822906494, + "rewards/margins": 11.566426436106365, + "rewards/rejected": -7.82705561319987, + "step": 1737 + }, + { + "epoch": 0.15879396984924624, + "grad_norm": 5.9375, + "kl": 0.0, + "learning_rate": 9.402831652005868e-06, + "logits/chosen": 1157156864.0, + "logits/rejected": 827140992.0, + "logps/chosen": -250.09567260742188, + "logps/rejected": -533.6156616210938, + "loss": 0.0341, + "rewards/chosen": 3.400876998901367, + "rewards/margins": 12.157825469970703, + "rewards/rejected": -8.756948471069336, + "step": 1738 + }, + { + "epoch": 0.15888533576975788, + "grad_norm": 3.40625, + "kl": 0.0, + "learning_rate": 9.402150063673772e-06, + "logits/chosen": 434264864.0, + "logits/rejected": 675754752.0, + "logps/chosen": -274.7065734863281, + "logps/rejected": -411.99517822265625, + "loss": 0.0214, + "rewards/chosen": 3.3476414680480957, + "rewards/margins": 11.80961561203003, + "rewards/rejected": -8.461974143981934, + "step": 1739 + }, + { + "epoch": 0.15897670169026953, + "grad_norm": 3.390625, + "kl": 0.0, + "learning_rate": 9.401468111324651e-06, + "logits/chosen": 524611481.6, + "logits/rejected": 307243797.3333333, + "logps/chosen": -314.5421875, + "logps/rejected": -304.8664143880208, + "loss": 0.0195, + "rewards/chosen": 3.8678897857666015, + "rewards/margins": 10.601894124348958, + "rewards/rejected": -6.7340043385823565, + "step": 1740 + }, + { + "epoch": 0.15906806761078118, + "grad_norm": 4.6875, + "kl": 0.0, + "learning_rate": 9.400785795014898e-06, + "logits/chosen": 359842688.0, + "logits/rejected": 443680960.0, + "logps/chosen": -312.43939208984375, + "logps/rejected": -484.73681640625, + "loss": 0.0227, + "rewards/chosen": 3.7953107357025146, + "rewards/margins": 12.859034299850464, + "rewards/rejected": -9.06372356414795, + "step": 1741 + }, + { + "epoch": 0.15915943353129283, + "grad_norm": 7.59375, + "kl": 0.0, + "learning_rate": 9.400103114800935e-06, + "logits/chosen": 526659904.0, + "logits/rejected": 568689408.0, + "logps/chosen": -495.2521667480469, + "logps/rejected": -459.7767028808594, + "loss": 0.0407, + "rewards/chosen": 2.895458221435547, + "rewards/margins": 12.207767486572266, + "rewards/rejected": -9.312309265136719, + "step": 1742 + }, + { + "epoch": 0.15925079945180448, + "grad_norm": 3.40625, + "kl": 0.0, + "learning_rate": 9.399420070739211e-06, + "logits/chosen": 445677120.0, + "logits/rejected": 492347968.0, + "logps/chosen": -358.8575439453125, + "logps/rejected": -440.10723876953125, + "loss": 0.0183, + "rewards/chosen": 3.8332743644714355, + "rewards/margins": 10.796463012695312, + "rewards/rejected": -6.963188648223877, + "step": 1743 + }, + { + "epoch": 0.15934216537231613, + "grad_norm": 17.875, + "kl": 0.0, + "learning_rate": 9.398736662886208e-06, + "logits/chosen": 542999765.3333334, + "logits/rejected": 851941120.0, + "logps/chosen": -252.75065104166666, + "logps/rejected": -431.05048828125, + "loss": 0.0496, + "rewards/chosen": 2.3017848332722983, + "rewards/margins": 11.631547101338706, + "rewards/rejected": -9.329762268066407, + "step": 1744 + }, + { + "epoch": 0.15943353129282778, + "grad_norm": 13.9375, + "kl": 0.0, + "learning_rate": 9.39805289129844e-06, + "logits/chosen": 350534016.0, + "logits/rejected": 376454720.0, + "logps/chosen": -257.92388916015625, + "logps/rejected": -458.3299560546875, + "loss": 0.0799, + "rewards/chosen": 2.303992509841919, + "rewards/margins": 11.745423555374146, + "rewards/rejected": -9.441431045532227, + "step": 1745 + }, + { + "epoch": 0.15952489721333943, + "grad_norm": 2.59375, + "kl": 0.0, + "learning_rate": 9.397368756032445e-06, + "logits/chosen": 910896981.3333334, + "logits/rejected": 438855424.0, + "logps/chosen": -370.9325358072917, + "logps/rejected": -458.85673828125, + "loss": 0.0112, + "rewards/chosen": 4.102659225463867, + "rewards/margins": 14.12599449157715, + "rewards/rejected": -10.023335266113282, + "step": 1746 + }, + { + "epoch": 0.15961626313385108, + "grad_norm": 1.640625, + "kl": 0.0, + "learning_rate": 9.396684257144799e-06, + "logits/chosen": 485456224.0, + "logits/rejected": 395426176.0, + "logps/chosen": -392.215087890625, + "logps/rejected": -342.2953287760417, + "loss": 0.0069, + "rewards/chosen": 4.049021244049072, + "rewards/margins": 10.517238457997639, + "rewards/rejected": -6.468217213948567, + "step": 1747 + }, + { + "epoch": 0.15970762905436273, + "grad_norm": 5.3125, + "kl": 0.0, + "learning_rate": 9.395999394692098e-06, + "logits/chosen": 441939675.4285714, + "logits/rejected": 647148736.0, + "logps/chosen": -284.3706752232143, + "logps/rejected": -271.0703125, + "loss": 0.0651, + "rewards/chosen": 3.397413526262556, + "rewards/margins": 8.416423116411481, + "rewards/rejected": -5.019009590148926, + "step": 1748 + }, + { + "epoch": 0.15979899497487438, + "grad_norm": 5.09375, + "kl": 0.047046661376953125, + "learning_rate": 9.39531416873098e-06, + "logits/chosen": 791984576.0, + "logps/chosen": -316.6280212402344, + "loss": 0.0392, + "rewards/chosen": 3.450103282928467, + "step": 1749 + }, + { + "epoch": 0.15989036089538602, + "grad_norm": 7.0625, + "kl": 0.0, + "learning_rate": 9.394628579318101e-06, + "logits/chosen": 468040789.3333333, + "logits/rejected": 355140480.0, + "logps/chosen": -309.1607666015625, + "logps/rejected": -618.4906005859375, + "loss": 0.036, + "rewards/chosen": 4.078275044759114, + "rewards/margins": 14.713175137837727, + "rewards/rejected": -10.634900093078613, + "step": 1750 + }, + { + "epoch": 0.15998172681589767, + "grad_norm": 5.125, + "kl": 0.0, + "learning_rate": 9.393942626510157e-06, + "logits/chosen": 348365670.4, + "logits/rejected": 157254848.0, + "logps/chosen": -248.598193359375, + "logps/rejected": -526.6043701171875, + "loss": 0.0242, + "rewards/chosen": 3.85888671875, + "rewards/margins": 12.215059916178385, + "rewards/rejected": -8.356173197428385, + "step": 1751 + }, + { + "epoch": 0.16007309273640932, + "grad_norm": 20.625, + "kl": 0.0, + "learning_rate": 9.393256310363868e-06, + "logits/chosen": 735833258.6666666, + "logits/rejected": 862878208.0, + "logps/chosen": -253.6727498372396, + "logps/rejected": -360.3244140625, + "loss": 0.0657, + "rewards/chosen": 4.283862749735515, + "rewards/margins": 8.548335711161297, + "rewards/rejected": -4.264472961425781, + "step": 1752 + }, + { + "epoch": 0.16016445865692097, + "grad_norm": 6.375, + "kl": 0.0, + "learning_rate": 9.392569630935988e-06, + "logits/chosen": 388372992.0, + "logits/rejected": 798855040.0, + "logps/chosen": -306.9425048828125, + "logps/rejected": -575.8238525390625, + "loss": 0.0498, + "rewards/chosen": 2.7495930989583335, + "rewards/margins": 10.714157422383627, + "rewards/rejected": -7.964564323425293, + "step": 1753 + }, + { + "epoch": 0.16025582457743262, + "grad_norm": 2.359375, + "kl": 0.0, + "learning_rate": 9.391882588283296e-06, + "logits/chosen": 693372885.3333334, + "logits/rejected": 751541299.2, + "logps/chosen": -420.801513671875, + "logps/rejected": -561.08828125, + "loss": 0.0135, + "rewards/chosen": 3.615128835042318, + "rewards/margins": 12.43659413655599, + "rewards/rejected": -8.821465301513673, + "step": 1754 + }, + { + "epoch": 0.16034719049794427, + "grad_norm": 4.5, + "kl": 0.0, + "learning_rate": 9.391195182462607e-06, + "logits/chosen": 422936021.3333333, + "logits/rejected": 542556108.8, + "logps/chosen": -371.3328857421875, + "logps/rejected": -567.854736328125, + "loss": 0.0211, + "rewards/chosen": 3.2907606760660806, + "rewards/margins": 11.351220575968425, + "rewards/rejected": -8.060459899902344, + "step": 1755 + }, + { + "epoch": 0.16043855641845592, + "grad_norm": 9.6875, + "kl": 0.0, + "learning_rate": 9.39050741353076e-06, + "logits/chosen": 750508992.0, + "logits/rejected": 811893184.0, + "logps/chosen": -369.06005859375, + "logps/rejected": -596.0457763671875, + "loss": 0.0338, + "rewards/chosen": 3.230775833129883, + "rewards/margins": 11.769063949584961, + "rewards/rejected": -8.538288116455078, + "step": 1756 + }, + { + "epoch": 0.16052992233896757, + "grad_norm": 23.75, + "kl": 0.0, + "learning_rate": 9.38981928154463e-06, + "logits/chosen": 830103168.0, + "logits/rejected": 793131946.6666666, + "logps/chosen": -293.3068542480469, + "logps/rejected": -347.0888671875, + "loss": 0.2055, + "rewards/chosen": 5.239559173583984, + "rewards/margins": 9.720497131347656, + "rewards/rejected": -4.480937957763672, + "step": 1757 + }, + { + "epoch": 0.16062128825947922, + "grad_norm": 7.4375, + "kl": 0.0, + "learning_rate": 9.389130786561117e-06, + "logits/chosen": 310296064.0, + "logits/rejected": 378051264.0, + "logps/chosen": -187.2187042236328, + "logps/rejected": -358.9581298828125, + "loss": 0.0257, + "rewards/chosen": 3.7227039337158203, + "rewards/margins": 11.967656135559082, + "rewards/rejected": -8.244952201843262, + "step": 1758 + }, + { + "epoch": 0.16071265417999087, + "grad_norm": 6.0625, + "kl": 0.0, + "learning_rate": 9.388441928637155e-06, + "logits/chosen": 717686937.6, + "logits/rejected": 1299837866.6666667, + "logps/chosen": -323.77587890625, + "logps/rejected": -464.5174560546875, + "loss": 0.0442, + "rewards/chosen": 2.7588993072509767, + "rewards/margins": 10.342393493652343, + "rewards/rejected": -7.583494186401367, + "step": 1759 + }, + { + "epoch": 0.16080402010050251, + "grad_norm": 2.8125, + "kl": 0.0, + "learning_rate": 9.387752707829705e-06, + "logits/chosen": 666640000.0, + "logits/rejected": 1057295744.0, + "logps/chosen": -281.12799072265625, + "logps/rejected": -438.7423095703125, + "loss": 0.0237, + "rewards/chosen": 3.1914236545562744, + "rewards/margins": 11.53767991065979, + "rewards/rejected": -8.346256256103516, + "step": 1760 + }, + { + "epoch": 0.16089538602101416, + "grad_norm": 2.578125, + "kl": 0.0, + "learning_rate": 9.38706312419576e-06, + "logits/chosen": 1208404608.0, + "logits/rejected": 742924416.0, + "logps/chosen": -450.7539978027344, + "logps/rejected": -352.78466796875, + "loss": 0.0154, + "rewards/chosen": 3.8469295501708984, + "rewards/margins": 11.825684070587158, + "rewards/rejected": -7.97875452041626, + "step": 1761 + }, + { + "epoch": 0.1609867519415258, + "grad_norm": 6.21875, + "kl": 0.0, + "learning_rate": 9.386373177792341e-06, + "logits/chosen": 545713856.0, + "logits/rejected": 542226048.0, + "logps/chosen": -347.17559814453125, + "logps/rejected": -499.2534484863281, + "loss": 0.0202, + "rewards/chosen": 4.088326454162598, + "rewards/margins": 11.216496467590332, + "rewards/rejected": -7.128170013427734, + "step": 1762 + }, + { + "epoch": 0.16107811786203746, + "grad_norm": 3.765625, + "kl": 0.0, + "learning_rate": 9.385682868676502e-06, + "logits/chosen": 793580160.0, + "logits/rejected": 539058112.0, + "logps/chosen": -564.5068969726562, + "logps/rejected": -506.0897216796875, + "loss": 0.0221, + "rewards/chosen": 3.1001832485198975, + "rewards/margins": 11.59854769706726, + "rewards/rejected": -8.498364448547363, + "step": 1763 + }, + { + "epoch": 0.1611694837825491, + "grad_norm": 21.25, + "kl": 0.0, + "learning_rate": 9.384992196905323e-06, + "logits/chosen": 792190156.8, + "logits/rejected": 612733482.6666666, + "logps/chosen": -291.384326171875, + "logps/rejected": -396.6573079427083, + "loss": 0.1393, + "rewards/chosen": 2.4680561065673827, + "rewards/margins": 10.859622065226237, + "rewards/rejected": -8.391565958658854, + "step": 1764 + }, + { + "epoch": 0.16126084970306076, + "grad_norm": 4.84375, + "kl": 0.0, + "learning_rate": 9.384301162535918e-06, + "logits/chosen": 514824960.0, + "logits/rejected": 845752320.0, + "logps/chosen": -256.790576171875, + "logps/rejected": -600.2557779947916, + "loss": 0.0316, + "rewards/chosen": 3.452499771118164, + "rewards/margins": 13.408942921956381, + "rewards/rejected": -9.956443150838217, + "step": 1765 + }, + { + "epoch": 0.1613522156235724, + "grad_norm": 15.1875, + "kl": 0.0, + "learning_rate": 9.383609765625428e-06, + "logits/chosen": 455470438.4, + "logits/rejected": 404871509.3333333, + "logps/chosen": -181.452294921875, + "logps/rejected": -461.6383463541667, + "loss": 0.0334, + "rewards/chosen": 3.6567230224609375, + "rewards/margins": 11.848498026529947, + "rewards/rejected": -8.19177500406901, + "step": 1766 + }, + { + "epoch": 0.16144358154408406, + "grad_norm": 3.8125, + "kl": 0.0, + "learning_rate": 9.382918006231024e-06, + "logits/chosen": 435057792.0, + "logits/rejected": 480570112.0, + "logps/chosen": -266.31020100911456, + "logps/rejected": -440.733544921875, + "loss": 0.0254, + "rewards/chosen": 2.7387355168660483, + "rewards/margins": 11.318117078145345, + "rewards/rejected": -8.579381561279297, + "step": 1767 + }, + { + "epoch": 0.1615349474645957, + "grad_norm": 4.09375, + "kl": 0.0, + "learning_rate": 9.38222588440991e-06, + "logits/chosen": 656915200.0, + "logits/rejected": 413563392.0, + "logps/chosen": -259.3666178385417, + "logps/rejected": -301.8828430175781, + "loss": 0.0264, + "rewards/chosen": 3.766540209452311, + "rewards/margins": 8.289911905924479, + "rewards/rejected": -4.523371696472168, + "step": 1768 + }, + { + "epoch": 0.16162631338510736, + "grad_norm": 3.4375, + "kl": 0.0, + "learning_rate": 9.381533400219319e-06, + "logits/chosen": 800854476.8, + "logits/rejected": 646075136.0, + "logps/chosen": -436.382421875, + "logps/rejected": -556.3529052734375, + "loss": 0.0213, + "rewards/chosen": 3.6454505920410156, + "rewards/margins": 10.622646967569988, + "rewards/rejected": -6.977196375528972, + "step": 1769 + }, + { + "epoch": 0.161717679305619, + "grad_norm": 8.3125, + "kl": 0.0, + "learning_rate": 9.380840553716511e-06, + "logits/chosen": 419321600.0, + "logits/rejected": 326270310.4, + "logps/chosen": -389.2447916666667, + "logps/rejected": -353.406640625, + "loss": 0.0475, + "rewards/chosen": 2.0886648495992026, + "rewards/margins": 10.214799340566, + "rewards/rejected": -8.126134490966797, + "step": 1770 + }, + { + "epoch": 0.16180904522613065, + "grad_norm": 10.75, + "kl": 0.0, + "learning_rate": 9.380147344958778e-06, + "logits/chosen": 584284160.0, + "logits/rejected": 792009728.0, + "logps/chosen": -331.97149658203125, + "logps/rejected": -564.84619140625, + "loss": 0.0294, + "rewards/chosen": 2.521756966908773, + "rewards/margins": 11.61228125890096, + "rewards/rejected": -9.090524291992187, + "step": 1771 + }, + { + "epoch": 0.1619004111466423, + "grad_norm": 17.375, + "kl": 0.0, + "learning_rate": 9.379453774003442e-06, + "logits/chosen": 919805696.0, + "logits/rejected": 524992416.0, + "logps/chosen": -350.3631998697917, + "logps/rejected": -528.8211059570312, + "loss": 0.1011, + "rewards/chosen": 2.231725056966146, + "rewards/margins": 11.73501714070638, + "rewards/rejected": -9.503292083740234, + "step": 1772 + }, + { + "epoch": 0.16199177706715395, + "grad_norm": 11.75, + "kl": 0.0, + "learning_rate": 9.378759840907857e-06, + "logits/chosen": 412964992.0, + "logits/rejected": 362548019.2, + "logps/chosen": -278.3673502604167, + "logps/rejected": -574.16787109375, + "loss": 0.0605, + "rewards/chosen": 3.0859578450520835, + "rewards/margins": 14.251660664876303, + "rewards/rejected": -11.165702819824219, + "step": 1773 + }, + { + "epoch": 0.1620831429876656, + "grad_norm": 1.71875, + "kl": 0.0, + "learning_rate": 9.378065545729403e-06, + "logits/chosen": 1357304320.0, + "logits/rejected": 454284544.0, + "logps/chosen": -515.1945190429688, + "logps/rejected": -410.8623860677083, + "loss": 0.0051, + "rewards/chosen": 3.981350898742676, + "rewards/margins": 13.583204587300619, + "rewards/rejected": -9.601853688557943, + "step": 1774 + }, + { + "epoch": 0.16217450890817725, + "grad_norm": 7.53125, + "kl": 0.0, + "learning_rate": 9.377370888525492e-06, + "logits/chosen": 745259328.0, + "logits/rejected": 635355050.6666666, + "logps/chosen": -741.4388427734375, + "logps/rejected": -605.6947428385416, + "loss": 0.0202, + "rewards/chosen": 2.6501708030700684, + "rewards/margins": 9.950001557668049, + "rewards/rejected": -7.2998307545979815, + "step": 1775 + }, + { + "epoch": 0.1622658748286889, + "grad_norm": 10.9375, + "kl": 0.0, + "learning_rate": 9.376675869353566e-06, + "logits/chosen": 553082197.3333334, + "logits/rejected": 413373952.0, + "logps/chosen": -276.172607421875, + "logps/rejected": -540.2468872070312, + "loss": 0.0544, + "rewards/chosen": 3.0363791783650718, + "rewards/margins": 15.394703229268393, + "rewards/rejected": -12.35832405090332, + "step": 1776 + }, + { + "epoch": 0.16235724074920055, + "grad_norm": 5.53125, + "kl": 0.0, + "learning_rate": 9.375980488271095e-06, + "logits/chosen": 927972864.0, + "logits/rejected": 1541805440.0, + "logps/chosen": -336.14556884765625, + "logps/rejected": -881.6021118164062, + "loss": 0.0442, + "rewards/chosen": 3.0427449544270835, + "rewards/margins": 9.667045911153158, + "rewards/rejected": -6.624300956726074, + "step": 1777 + }, + { + "epoch": 0.1624486066697122, + "grad_norm": 2.34375, + "kl": 0.0, + "learning_rate": 9.375284745335582e-06, + "logits/chosen": 646469529.6, + "logits/rejected": 372178645.3333333, + "logps/chosen": -338.664111328125, + "logps/rejected": -377.7707112630208, + "loss": 0.0164, + "rewards/chosen": 3.7743053436279297, + "rewards/margins": 11.107196807861328, + "rewards/rejected": -7.332891464233398, + "step": 1778 + }, + { + "epoch": 0.16253997259022385, + "grad_norm": 4.53125, + "kl": 0.0, + "learning_rate": 9.37458864060456e-06, + "logits/chosen": 563662250.6666666, + "logits/rejected": 583246284.8, + "logps/chosen": -385.7200927734375, + "logps/rejected": -692.07958984375, + "loss": 0.0206, + "rewards/chosen": 3.7396392822265625, + "rewards/margins": 13.78489990234375, + "rewards/rejected": -10.045260620117187, + "step": 1779 + }, + { + "epoch": 0.1626313385107355, + "grad_norm": 7.625, + "kl": 0.0, + "learning_rate": 9.373892174135588e-06, + "logits/chosen": 551338240.0, + "logits/rejected": 699875498.6666666, + "logps/chosen": -228.58134765625, + "logps/rejected": -279.0966389973958, + "loss": 0.0821, + "rewards/chosen": 3.474483108520508, + "rewards/margins": 7.586023585001628, + "rewards/rejected": -4.11154047648112, + "step": 1780 + }, + { + "epoch": 0.16272270443124714, + "grad_norm": 4.125, + "kl": 0.0, + "learning_rate": 9.373195345986259e-06, + "logits/chosen": 589496934.4, + "logits/rejected": 885994666.6666666, + "logps/chosen": -350.5292236328125, + "logps/rejected": -643.7771402994791, + "loss": 0.0231, + "rewards/chosen": 3.7845458984375, + "rewards/margins": 13.28627840677897, + "rewards/rejected": -9.50173250834147, + "step": 1781 + }, + { + "epoch": 0.1628140703517588, + "grad_norm": 2.140625, + "kl": 0.0, + "learning_rate": 9.372498156214192e-06, + "logits/chosen": 662126336.0, + "logits/rejected": 489720320.0, + "logps/chosen": -427.20703125, + "logps/rejected": -639.32529296875, + "loss": 0.0098, + "rewards/chosen": 4.051116943359375, + "rewards/margins": 13.343759155273437, + "rewards/rejected": -9.292642211914062, + "step": 1782 + }, + { + "epoch": 0.16290543627227044, + "grad_norm": 2.921875, + "kl": 0.0, + "learning_rate": 9.371800604877041e-06, + "logits/chosen": 534582485.3333333, + "logits/rejected": 854006067.2, + "logps/chosen": -214.70174153645834, + "logps/rejected": -541.014208984375, + "loss": 0.0136, + "rewards/chosen": 4.5462646484375, + "rewards/margins": 12.767304229736329, + "rewards/rejected": -8.221039581298829, + "step": 1783 + }, + { + "epoch": 0.1629968021927821, + "grad_norm": 4.25, + "kl": 0.0, + "learning_rate": 9.371102692032485e-06, + "logits/chosen": 648265318.4, + "logits/rejected": 505861290.6666667, + "logps/chosen": -332.57783203125, + "logps/rejected": -529.8994140625, + "loss": 0.0317, + "rewards/chosen": 3.3084415435791015, + "rewards/margins": 9.585286839803059, + "rewards/rejected": -6.276845296223958, + "step": 1784 + }, + { + "epoch": 0.16308816811329374, + "grad_norm": 13.3125, + "kl": 0.0, + "learning_rate": 9.370404417738235e-06, + "logits/chosen": 506831872.0, + "logits/rejected": 388488021.3333333, + "logps/chosen": -343.69267578125, + "logps/rejected": -337.9997151692708, + "loss": 0.0887, + "rewards/chosen": 2.862723541259766, + "rewards/margins": 8.657396189371745, + "rewards/rejected": -5.7946726481119795, + "step": 1785 + }, + { + "epoch": 0.1631795340338054, + "grad_norm": 4.25, + "kl": 0.0, + "learning_rate": 9.369705782052034e-06, + "logits/chosen": 1135254118.4, + "logits/rejected": 687489450.6666666, + "logps/chosen": -357.321337890625, + "logps/rejected": -508.1766357421875, + "loss": 0.0248, + "rewards/chosen": 3.364832305908203, + "rewards/margins": 11.84724973042806, + "rewards/rejected": -8.482417424519857, + "step": 1786 + }, + { + "epoch": 0.16327089995431704, + "grad_norm": 4.125, + "kl": 0.0, + "learning_rate": 9.36900678503165e-06, + "logits/chosen": 463046048.0, + "logits/rejected": 872793920.0, + "logps/chosen": -245.77166748046875, + "logps/rejected": -555.5711059570312, + "loss": 0.0223, + "rewards/chosen": 4.145810127258301, + "rewards/margins": 14.562771797180176, + "rewards/rejected": -10.416961669921875, + "step": 1787 + }, + { + "epoch": 0.1633622658748287, + "grad_norm": 4.4375, + "kl": 0.0, + "learning_rate": 9.368307426734884e-06, + "logits/chosen": 755445930.6666666, + "logits/rejected": 288116704.0, + "logps/chosen": -255.18729654947916, + "logps/rejected": -185.91256713867188, + "loss": 0.0288, + "rewards/chosen": 3.5371615091959634, + "rewards/margins": 9.82945696512858, + "rewards/rejected": -6.292295455932617, + "step": 1788 + }, + { + "epoch": 0.16345363179534034, + "grad_norm": 5.90625, + "kl": 0.0, + "learning_rate": 9.367607707219569e-06, + "logits/chosen": 641415899.4285715, + "logits/rejected": 453237056.0, + "logps/chosen": -264.12064034598217, + "logps/rejected": -344.712890625, + "loss": 0.0598, + "rewards/chosen": 2.7041822160993303, + "rewards/margins": 10.873887607029506, + "rewards/rejected": -8.169705390930176, + "step": 1789 + }, + { + "epoch": 0.163544997715852, + "grad_norm": 3.015625, + "kl": 0.0, + "learning_rate": 9.366907626543562e-06, + "logits/chosen": 500172800.0, + "logits/rejected": 532389068.8, + "logps/chosen": -177.30499267578125, + "logps/rejected": -526.248193359375, + "loss": 0.0253, + "rewards/chosen": 2.715234120686849, + "rewards/margins": 9.857865651448568, + "rewards/rejected": -7.142631530761719, + "step": 1790 + }, + { + "epoch": 0.16363636363636364, + "grad_norm": 0.921875, + "kl": 0.0, + "learning_rate": 9.366207184764753e-06, + "logits/chosen": 1034191360.0, + "logits/rejected": 802390381.7142857, + "logps/chosen": -175.64581298828125, + "logps/rejected": -633.6939174107143, + "loss": 0.0042, + "rewards/chosen": 3.666085958480835, + "rewards/margins": 12.69081882068089, + "rewards/rejected": -9.024732862200056, + "step": 1791 + }, + { + "epoch": 0.16372772955687528, + "grad_norm": 2.71875, + "kl": 0.0, + "learning_rate": 9.365506381941066e-06, + "logits/chosen": 774985728.0, + "logits/rejected": 617470912.0, + "logps/chosen": -269.43121337890625, + "logps/rejected": -572.1192626953125, + "loss": 0.0204, + "rewards/chosen": 3.3623452186584473, + "rewards/margins": 12.40055513381958, + "rewards/rejected": -9.038209915161133, + "step": 1792 + }, + { + "epoch": 0.16381909547738693, + "grad_norm": 1.8125, + "kl": 0.0, + "learning_rate": 9.364805218130448e-06, + "logits/chosen": 320659968.0, + "logits/rejected": 373955904.0, + "logps/chosen": -334.0965576171875, + "logps/rejected": -531.45703125, + "loss": 0.0082, + "rewards/chosen": 4.52897834777832, + "rewards/margins": 12.032739162445068, + "rewards/rejected": -7.503760814666748, + "step": 1793 + }, + { + "epoch": 0.16391046139789858, + "grad_norm": 2.25, + "kl": 0.0, + "learning_rate": 9.36410369339088e-06, + "logits/chosen": 645095104.0, + "logits/rejected": 1032025429.3333334, + "logps/chosen": -196.1810760498047, + "logps/rejected": -522.305908203125, + "loss": 0.0117, + "rewards/chosen": 3.5238585472106934, + "rewards/margins": 10.911764939626057, + "rewards/rejected": -7.387906392415364, + "step": 1794 + }, + { + "epoch": 0.16400182731841023, + "grad_norm": 8.8125, + "kl": 0.0, + "learning_rate": 9.363401807780368e-06, + "logits/chosen": 444582442.6666667, + "logits/rejected": 274943008.0, + "logps/chosen": -392.2399495442708, + "logps/rejected": -180.59817504882812, + "loss": 0.137, + "rewards/chosen": 3.3225978215535483, + "rewards/margins": 6.178960879643759, + "rewards/rejected": -2.85636305809021, + "step": 1795 + }, + { + "epoch": 0.16409319323892188, + "grad_norm": 2.234375, + "kl": 0.0, + "learning_rate": 9.362699561356957e-06, + "logits/chosen": 440287744.0, + "logits/rejected": 454399180.8, + "logps/chosen": -234.42769368489584, + "logps/rejected": -556.9455078125, + "loss": 0.0116, + "rewards/chosen": 3.6778844197591147, + "rewards/margins": 12.158174641927085, + "rewards/rejected": -8.48029022216797, + "step": 1796 + }, + { + "epoch": 0.16418455915943353, + "grad_norm": 12.75, + "kl": 0.0, + "learning_rate": 9.361996954178714e-06, + "logits/rejected": 983583488.0, + "logps/rejected": -498.3131103515625, + "loss": 0.1995, + "rewards/rejected": -6.4178009033203125, + "step": 1797 + }, + { + "epoch": 0.16427592507994518, + "grad_norm": 4.28125, + "kl": 0.0, + "learning_rate": 9.361293986303736e-06, + "logits/chosen": 796368042.6666666, + "logits/rejected": 709950656.0, + "logps/chosen": -345.8309326171875, + "logps/rejected": -743.5809326171875, + "loss": 0.0274, + "rewards/chosen": 4.132221857706706, + "rewards/margins": 14.880105654398601, + "rewards/rejected": -10.747883796691895, + "step": 1798 + }, + { + "epoch": 0.16436729100045683, + "grad_norm": 2.328125, + "kl": 0.0, + "learning_rate": 9.360590657790155e-06, + "logits/chosen": 593743424.0, + "logits/rejected": 421250688.0, + "logps/chosen": -456.7941589355469, + "logps/rejected": -436.5995178222656, + "loss": 0.0104, + "rewards/chosen": 4.255145072937012, + "rewards/margins": 13.673246383666992, + "rewards/rejected": -9.41810131072998, + "step": 1799 + }, + { + "epoch": 0.16445865692096848, + "grad_norm": 3.578125, + "kl": 0.0, + "learning_rate": 9.359886968696127e-06, + "logits/chosen": 326200490.6666667, + "logits/rejected": 435133235.2, + "logps/chosen": -167.72843424479166, + "logps/rejected": -330.415869140625, + "loss": 0.0191, + "rewards/chosen": 3.309000333150228, + "rewards/margins": 10.789177640279135, + "rewards/rejected": -7.480177307128907, + "step": 1800 + }, + { + "epoch": 0.16455002284148013, + "grad_norm": 1.5859375, + "kl": 0.0, + "learning_rate": 9.359182919079841e-06, + "logits/chosen": 634606336.0, + "logits/rejected": 542563635.2, + "logps/chosen": -375.6247151692708, + "logps/rejected": -437.13974609375, + "loss": 0.0083, + "rewards/chosen": 4.487789154052734, + "rewards/margins": 11.764720916748047, + "rewards/rejected": -7.2769317626953125, + "step": 1801 + }, + { + "epoch": 0.16464138876199177, + "grad_norm": 2.546875, + "kl": 0.0, + "learning_rate": 9.358478508999518e-06, + "logits/chosen": 401145216.0, + "logits/rejected": 464733248.0, + "logps/chosen": -339.1593322753906, + "logps/rejected": -404.16265869140625, + "loss": 0.013, + "rewards/chosen": 4.188136100769043, + "rewards/margins": 11.800904273986816, + "rewards/rejected": -7.612768173217773, + "step": 1802 + }, + { + "epoch": 0.16473275468250342, + "grad_norm": 3.09375, + "kl": 0.0, + "learning_rate": 9.357773738513405e-06, + "logits/chosen": 573666432.0, + "logits/rejected": 493747353.6, + "logps/chosen": -412.8916015625, + "logps/rejected": -497.8044921875, + "loss": 0.0128, + "rewards/chosen": 3.9053525924682617, + "rewards/margins": 12.41112117767334, + "rewards/rejected": -8.505768585205079, + "step": 1803 + }, + { + "epoch": 0.16482412060301507, + "grad_norm": 4.71875, + "kl": 0.0, + "learning_rate": 9.357068607679778e-06, + "logits/chosen": 355723072.0, + "logits/rejected": 339787584.0, + "logps/chosen": -465.4372253417969, + "logps/rejected": -494.8191731770833, + "loss": 0.013, + "rewards/chosen": 3.767230749130249, + "rewards/margins": 12.769536097844442, + "rewards/rejected": -9.002305348714193, + "step": 1804 + }, + { + "epoch": 0.16491548652352672, + "grad_norm": 9.4375, + "kl": 0.0, + "learning_rate": 9.356363116556947e-06, + "logits/chosen": 719452620.8, + "logits/rejected": 1068680021.3333334, + "logps/chosen": -455.113916015625, + "logps/rejected": -364.9912516276042, + "loss": 0.0423, + "rewards/chosen": 2.904571533203125, + "rewards/margins": 12.795649846394857, + "rewards/rejected": -9.891078313191732, + "step": 1805 + }, + { + "epoch": 0.16500685244403837, + "grad_norm": 15.9375, + "kl": 0.0, + "learning_rate": 9.35565726520325e-06, + "logits/chosen": 542863513.6, + "logits/rejected": 613917184.0, + "logps/chosen": -271.850439453125, + "logps/rejected": -369.3888346354167, + "loss": 0.0969, + "rewards/chosen": 3.117472839355469, + "rewards/margins": 10.780195744832357, + "rewards/rejected": -7.662722905476888, + "step": 1806 + }, + { + "epoch": 0.16509821836455002, + "grad_norm": 25.625, + "kl": 0.0, + "learning_rate": 9.35495105367705e-06, + "logits/chosen": 708247552.0, + "logits/rejected": 349745760.0, + "logps/chosen": -253.72909545898438, + "logps/rejected": -243.96224975585938, + "loss": 0.0598, + "rewards/chosen": 3.28057599067688, + "rewards/margins": 9.123080492019653, + "rewards/rejected": -5.842504501342773, + "step": 1807 + }, + { + "epoch": 0.16518958428506167, + "grad_norm": 2.15625, + "kl": 0.0, + "learning_rate": 9.354244482036751e-06, + "logits/chosen": 408598954.6666667, + "logits/rejected": 369454745.6, + "logps/chosen": -289.2377522786458, + "logps/rejected": -455.27509765625, + "loss": 0.0124, + "rewards/chosen": 3.709087689717611, + "rewards/margins": 11.366972668965658, + "rewards/rejected": -7.657884979248047, + "step": 1808 + }, + { + "epoch": 0.16528095020557332, + "grad_norm": 17.375, + "kl": 0.0, + "learning_rate": 9.353537550340776e-06, + "logits/chosen": 587401472.0, + "logits/rejected": 653336405.3333334, + "logps/chosen": -372.83564453125, + "logps/rejected": -529.0023193359375, + "loss": 0.068, + "rewards/chosen": 3.062256622314453, + "rewards/margins": 11.679423777262368, + "rewards/rejected": -8.617167154947916, + "step": 1809 + }, + { + "epoch": 0.16537231612608497, + "grad_norm": 7.6875, + "kl": 0.0, + "learning_rate": 9.35283025864758e-06, + "logits/chosen": 451949909.3333333, + "logits/rejected": 399996620.8, + "logps/chosen": -317.3588053385417, + "logps/rejected": -513.55087890625, + "loss": 0.044, + "rewards/chosen": 3.1232401529947915, + "rewards/margins": 13.15064951578776, + "rewards/rejected": -10.02740936279297, + "step": 1810 + }, + { + "epoch": 0.16546368204659662, + "grad_norm": 5.9375, + "kl": 0.0, + "learning_rate": 9.352122607015652e-06, + "logits/chosen": 365614336.0, + "logits/rejected": 261726224.0, + "logps/chosen": -364.8478597005208, + "logps/rejected": -557.1783447265625, + "loss": 0.0494, + "rewards/chosen": 2.9456199010213218, + "rewards/margins": 10.893092473347982, + "rewards/rejected": -7.94747257232666, + "step": 1811 + }, + { + "epoch": 0.16555504796710827, + "grad_norm": 3.578125, + "kl": 0.0, + "learning_rate": 9.351414595503508e-06, + "logits/chosen": 573805875.2, + "logits/rejected": 673566720.0, + "logps/chosen": -294.439306640625, + "logps/rejected": -353.5425211588542, + "loss": 0.0279, + "rewards/chosen": 3.2993694305419923, + "rewards/margins": 10.057396825154623, + "rewards/rejected": -6.75802739461263, + "step": 1812 + }, + { + "epoch": 0.16564641388761991, + "grad_norm": 5.3125, + "kl": 0.0, + "learning_rate": 9.350706224169695e-06, + "logits/chosen": 572896841.1428572, + "logits/rejected": 492837696.0, + "logps/chosen": -261.07285853794644, + "logps/rejected": -326.589111328125, + "loss": 0.0466, + "rewards/chosen": 3.2535476684570312, + "rewards/margins": 9.134141445159912, + "rewards/rejected": -5.880593776702881, + "step": 1813 + }, + { + "epoch": 0.16573777980813156, + "grad_norm": 3.890625, + "kl": 0.0, + "learning_rate": 9.349997493072785e-06, + "logits/chosen": 412960076.8, + "logits/rejected": 523918677.3333333, + "logps/chosen": -237.5298828125, + "logps/rejected": -622.3688151041666, + "loss": 0.0272, + "rewards/chosen": 3.405957794189453, + "rewards/margins": 10.981649017333984, + "rewards/rejected": -7.575691223144531, + "step": 1814 + }, + { + "epoch": 0.1658291457286432, + "grad_norm": 3.265625, + "kl": 0.0, + "learning_rate": 9.349288402271387e-06, + "logits/chosen": 442102826.6666667, + "logits/rejected": 407248614.4, + "logps/chosen": -247.86688232421875, + "logps/rejected": -372.147998046875, + "loss": 0.0195, + "rewards/chosen": 3.0621582667032876, + "rewards/margins": 12.402751223246256, + "rewards/rejected": -9.340592956542968, + "step": 1815 + }, + { + "epoch": 0.16592051164915486, + "grad_norm": 3.40625, + "kl": 0.0, + "learning_rate": 9.348578951824137e-06, + "logits/chosen": 529494944.0, + "logits/rejected": 284465834.6666667, + "logps/chosen": -349.04736328125, + "logps/rejected": -415.1845296223958, + "loss": 0.0185, + "rewards/chosen": 2.6985108852386475, + "rewards/margins": 11.124818563461304, + "rewards/rejected": -8.426307678222656, + "step": 1816 + }, + { + "epoch": 0.1660118775696665, + "grad_norm": 5.8125, + "kl": 0.0, + "learning_rate": 9.347869141789696e-06, + "logits/chosen": 804320512.0, + "logits/rejected": 464301107.2, + "logps/chosen": -484.2795003255208, + "logps/rejected": -625.8416015625, + "loss": 0.0256, + "rewards/chosen": 2.853349049886068, + "rewards/margins": 10.066683705647787, + "rewards/rejected": -7.213334655761718, + "step": 1817 + }, + { + "epoch": 0.16610324349017816, + "grad_norm": 2.125, + "kl": 0.0, + "learning_rate": 9.34715897222676e-06, + "logits/chosen": 656043008.0, + "logits/rejected": 516812714.6666667, + "logps/chosen": -356.40802001953125, + "logps/rejected": -423.8389892578125, + "loss": 0.0088, + "rewards/chosen": 3.8974413871765137, + "rewards/margins": 10.995813528696697, + "rewards/rejected": -7.098372141520183, + "step": 1818 + }, + { + "epoch": 0.1661946094106898, + "grad_norm": 5.96875, + "kl": 0.0, + "learning_rate": 9.346448443194056e-06, + "logits/chosen": 756633984.0, + "logits/rejected": 484645600.0, + "logps/chosen": -434.411376953125, + "logps/rejected": -646.125732421875, + "loss": 0.0297, + "rewards/chosen": 3.2647671699523926, + "rewards/margins": 13.972789287567139, + "rewards/rejected": -10.708022117614746, + "step": 1819 + }, + { + "epoch": 0.16628597533120146, + "grad_norm": 3.109375, + "kl": 0.0, + "learning_rate": 9.345737554750336e-06, + "logits/chosen": 263266816.0, + "logits/rejected": 670566144.0, + "logps/chosen": -198.08447265625, + "logps/rejected": -562.7927856445312, + "loss": 0.0205, + "rewards/chosen": 3.813871145248413, + "rewards/margins": 10.19756007194519, + "rewards/rejected": -6.383688926696777, + "step": 1820 + }, + { + "epoch": 0.1663773412517131, + "grad_norm": 5.96875, + "kl": 0.0, + "learning_rate": 9.345026306954385e-06, + "logits/chosen": 652414506.6666666, + "logits/rejected": 451264256.0, + "logps/chosen": -294.62485758463544, + "logps/rejected": -584.3563842773438, + "loss": 0.0404, + "rewards/chosen": 3.31406307220459, + "rewards/margins": 13.419648170471191, + "rewards/rejected": -10.105585098266602, + "step": 1821 + }, + { + "epoch": 0.16646870717222476, + "grad_norm": 3.34375, + "kl": 0.0, + "learning_rate": 9.344314699865016e-06, + "logits/chosen": 781136384.0, + "logits/rejected": 426064128.0, + "logps/chosen": -354.1794189453125, + "logps/rejected": -504.9260660807292, + "loss": 0.0168, + "rewards/chosen": 3.868397521972656, + "rewards/margins": 13.03991330464681, + "rewards/rejected": -9.171515782674154, + "step": 1822 + }, + { + "epoch": 0.1665600730927364, + "grad_norm": 3.0, + "kl": 0.0, + "learning_rate": 9.343602733541071e-06, + "logits/chosen": 853894976.0, + "logits/rejected": 493049088.0, + "logps/chosen": -519.5989379882812, + "logps/rejected": -298.34478759765625, + "loss": 0.0155, + "rewards/chosen": 3.6138031482696533, + "rewards/margins": 11.417500257492065, + "rewards/rejected": -7.803697109222412, + "step": 1823 + }, + { + "epoch": 0.16665143901324805, + "grad_norm": 8.125, + "kl": 0.0, + "learning_rate": 9.342890408041425e-06, + "logits/chosen": 713924650.6666666, + "logits/rejected": 710377267.2, + "logps/chosen": -207.068359375, + "logps/rejected": -632.127294921875, + "loss": 0.0143, + "rewards/chosen": 3.8472817738850913, + "rewards/margins": 12.759368260701498, + "rewards/rejected": -8.912086486816406, + "step": 1824 + }, + { + "epoch": 0.1667428049337597, + "grad_norm": 10.8125, + "kl": 0.0, + "learning_rate": 9.342177723424981e-06, + "logits/chosen": 533512294.4, + "logits/rejected": 455395114.6666667, + "logps/chosen": -336.3916015625, + "logps/rejected": -302.4345703125, + "loss": 0.15, + "rewards/chosen": 3.584187316894531, + "rewards/margins": 5.951254336039225, + "rewards/rejected": -2.367067019144694, + "step": 1825 + }, + { + "epoch": 0.16683417085427135, + "grad_norm": 4.71875, + "kl": 0.0, + "learning_rate": 9.341464679750669e-06, + "logits/chosen": 400144554.6666667, + "logits/rejected": 898198118.4, + "logps/chosen": -243.9832763671875, + "logps/rejected": -693.776904296875, + "loss": 0.0206, + "rewards/chosen": 3.4696677525838218, + "rewards/margins": 14.647408231099448, + "rewards/rejected": -11.177740478515625, + "step": 1826 + }, + { + "epoch": 0.166925536774783, + "grad_norm": 3.578125, + "kl": 0.0, + "learning_rate": 9.340751277077453e-06, + "logits/chosen": 681293482.6666666, + "logits/rejected": 473259724.8, + "logps/chosen": -399.2593180338542, + "logps/rejected": -332.506689453125, + "loss": 0.02, + "rewards/chosen": 3.223729451497396, + "rewards/margins": 10.594868977864584, + "rewards/rejected": -7.3711395263671875, + "step": 1827 + }, + { + "epoch": 0.16701690269529465, + "grad_norm": 3.34375, + "kl": 0.0, + "learning_rate": 9.340037515464326e-06, + "logits/chosen": 824615680.0, + "logits/rejected": 599644992.0, + "logps/chosen": -338.42852783203125, + "logps/rejected": -240.34329223632812, + "loss": 0.1178, + "rewards/chosen": 3.6448869705200195, + "rewards/margins": 8.629726886749268, + "rewards/rejected": -4.984839916229248, + "step": 1828 + }, + { + "epoch": 0.1671082686158063, + "grad_norm": 5.46875, + "kl": 0.0, + "learning_rate": 9.339323394970305e-06, + "logits/chosen": 375382101.3333333, + "logits/rejected": 461212704.0, + "logps/chosen": -224.4725138346354, + "logps/rejected": -154.94149780273438, + "loss": 0.0429, + "rewards/chosen": 3.345292409261068, + "rewards/margins": 8.636998017628988, + "rewards/rejected": -5.29170560836792, + "step": 1829 + }, + { + "epoch": 0.16719963453631795, + "grad_norm": 3.640625, + "kl": 0.0, + "learning_rate": 9.338608915654447e-06, + "logits/chosen": 929683558.4, + "logits/rejected": 735138304.0, + "logps/chosen": -430.42802734375, + "logps/rejected": -557.3434651692709, + "loss": 0.0279, + "rewards/chosen": 3.1444726943969727, + "rewards/margins": 11.110722796122234, + "rewards/rejected": -7.966250101725261, + "step": 1830 + }, + { + "epoch": 0.1672910004568296, + "grad_norm": 3.265625, + "kl": 0.0, + "learning_rate": 9.337894077575827e-06, + "logits/chosen": 843609344.0, + "logits/rejected": 498911840.0, + "logps/chosen": -376.6592712402344, + "logps/rejected": -597.10302734375, + "loss": 0.0187, + "rewards/chosen": 3.3208630084991455, + "rewards/margins": 11.127978086471558, + "rewards/rejected": -7.807115077972412, + "step": 1831 + }, + { + "epoch": 0.16738236637734125, + "grad_norm": 3.703125, + "kl": 0.0, + "learning_rate": 9.337178880793559e-06, + "logits/chosen": 399904682.6666667, + "logits/rejected": 327933388.8, + "logps/chosen": -306.29075113932294, + "logps/rejected": -423.18271484375, + "loss": 0.021, + "rewards/chosen": 3.5268751780192056, + "rewards/margins": 11.053744379679362, + "rewards/rejected": -7.526869201660157, + "step": 1832 + }, + { + "epoch": 0.1674737322978529, + "grad_norm": 14.125, + "kl": 0.0, + "learning_rate": 9.336463325366783e-06, + "logits/chosen": 367986474.6666667, + "logits/rejected": 338173056.0, + "logps/chosen": -293.2900797526042, + "logps/rejected": -360.92783203125, + "loss": 0.0942, + "rewards/chosen": 3.0023533503214517, + "rewards/margins": 8.336204210917154, + "rewards/rejected": -5.333850860595703, + "step": 1833 + }, + { + "epoch": 0.16756509821836454, + "grad_norm": 2.859375, + "kl": 0.0, + "learning_rate": 9.335747411354667e-06, + "logits/chosen": 1105677312.0, + "logits/rejected": 563024298.6666666, + "logps/chosen": -368.67864990234375, + "logps/rejected": -639.6068115234375, + "loss": 0.0147, + "rewards/chosen": 3.111009120941162, + "rewards/margins": 10.706956386566162, + "rewards/rejected": -7.595947265625, + "step": 1834 + }, + { + "epoch": 0.1676564641388762, + "grad_norm": 5.75, + "kl": 0.0, + "learning_rate": 9.335031138816412e-06, + "logits/chosen": 616064682.6666666, + "logits/rejected": 211303248.0, + "logps/chosen": -405.1490071614583, + "logps/rejected": -326.5382995605469, + "loss": 0.0403, + "rewards/chosen": 3.111525217692057, + "rewards/margins": 13.577194849650065, + "rewards/rejected": -10.465669631958008, + "step": 1835 + }, + { + "epoch": 0.16774783005938784, + "grad_norm": 4.5625, + "kl": 0.0, + "learning_rate": 9.334314507811249e-06, + "logits/chosen": 612036973.7142857, + "logits/rejected": 556889984.0, + "logps/chosen": -318.337890625, + "logps/rejected": -1309.3101806640625, + "loss": 0.0385, + "rewards/chosen": 3.388287135532924, + "rewards/margins": 27.194988795689174, + "rewards/rejected": -23.80670166015625, + "step": 1836 + }, + { + "epoch": 0.1678391959798995, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 9.333597518398431e-06, + "logits/chosen": 461284454.4, + "logits/rejected": 273236586.6666667, + "logps/chosen": -245.73525390625, + "logps/rejected": -338.69972737630206, + "loss": 0.0721, + "rewards/chosen": 2.79480094909668, + "rewards/margins": 8.510784022013347, + "rewards/rejected": -5.715983072916667, + "step": 1837 + }, + { + "epoch": 0.16793056190041114, + "grad_norm": 5.34375, + "kl": 0.0, + "learning_rate": 9.332880170637252e-06, + "logits/chosen": 683174604.8, + "logits/rejected": 517437184.0, + "logps/chosen": -380.54892578125, + "logps/rejected": -434.979248046875, + "loss": 0.0304, + "rewards/chosen": 3.360828399658203, + "rewards/margins": 10.850292205810547, + "rewards/rejected": -7.489463806152344, + "step": 1838 + }, + { + "epoch": 0.1680219278209228, + "grad_norm": 15.625, + "kl": 0.0, + "learning_rate": 9.332162464587029e-06, + "logits/chosen": 513315754.6666667, + "logits/rejected": 721283123.2, + "logps/chosen": -255.8510538736979, + "logps/rejected": -676.148095703125, + "loss": 0.0588, + "rewards/chosen": 2.809506098429362, + "rewards/margins": 10.501278559366861, + "rewards/rejected": -7.6917724609375, + "step": 1839 + }, + { + "epoch": 0.16811329374143444, + "grad_norm": 15.8125, + "kl": 0.0, + "learning_rate": 9.331444400307107e-06, + "logits/chosen": 702484266.6666666, + "logits/rejected": 511883878.4, + "logps/chosen": -504.2173258463542, + "logps/rejected": -374.665234375, + "loss": 0.1142, + "rewards/chosen": 1.8080851236979167, + "rewards/margins": 9.146559397379557, + "rewards/rejected": -7.338474273681641, + "step": 1840 + }, + { + "epoch": 0.1682046596619461, + "grad_norm": 3.9375, + "kl": 0.0, + "learning_rate": 9.330725977856864e-06, + "logits/chosen": 608093286.4, + "logits/rejected": 210930304.0, + "logps/chosen": -428.14990234375, + "logps/rejected": -329.09979248046875, + "loss": 0.0235, + "rewards/chosen": 3.625364303588867, + "rewards/margins": 11.741323471069336, + "rewards/rejected": -8.115959167480469, + "step": 1841 + }, + { + "epoch": 0.16829602558245774, + "grad_norm": 3.359375, + "kl": 0.0, + "learning_rate": 9.33000719729571e-06, + "logits/chosen": 460862592.0, + "logits/rejected": 565674304.0, + "logps/chosen": -290.1325988769531, + "logps/rejected": -583.4744262695312, + "loss": 0.0174, + "rewards/chosen": 4.129303455352783, + "rewards/margins": 12.626362323760986, + "rewards/rejected": -8.497058868408203, + "step": 1842 + }, + { + "epoch": 0.16838739150296939, + "grad_norm": 11.9375, + "kl": 0.0, + "learning_rate": 9.329288058683077e-06, + "logits/chosen": 636456512.0, + "logits/rejected": 758744210.2857143, + "logps/chosen": -159.13046264648438, + "logps/rejected": -482.33339146205356, + "loss": 0.0243, + "rewards/chosen": 2.8593080043792725, + "rewards/margins": 8.836054699761526, + "rewards/rejected": -5.976746695382254, + "step": 1843 + }, + { + "epoch": 0.16847875742348103, + "grad_norm": 7.03125, + "kl": 0.0, + "learning_rate": 9.328568562078434e-06, + "logits/chosen": 849164434.2857143, + "logits/rejected": 546519936.0, + "logps/chosen": -361.6626674107143, + "logps/rejected": -150.1018829345703, + "loss": 0.0472, + "rewards/chosen": 3.1774509974888394, + "rewards/margins": 7.7113631793430875, + "rewards/rejected": -4.533912181854248, + "step": 1844 + }, + { + "epoch": 0.16857012334399268, + "grad_norm": 4.5, + "kl": 0.0, + "learning_rate": 9.327848707541278e-06, + "logits/chosen": 467352832.0, + "logits/rejected": 487975040.0, + "logps/chosen": -280.9420166015625, + "logps/rejected": -578.1165161132812, + "loss": 0.0463, + "rewards/chosen": 3.0525735219319663, + "rewards/margins": 11.710453351338705, + "rewards/rejected": -8.657879829406738, + "step": 1845 + }, + { + "epoch": 0.16866148926450433, + "grad_norm": 3.078125, + "kl": 0.0, + "learning_rate": 9.32712849513113e-06, + "logits/chosen": 588325696.0, + "logits/rejected": 643038250.6666666, + "logps/chosen": -390.20001220703125, + "logps/rejected": -656.0079752604166, + "loss": 0.0148, + "rewards/chosen": 2.8166732788085938, + "rewards/margins": 11.822599411010742, + "rewards/rejected": -9.005926132202148, + "step": 1846 + }, + { + "epoch": 0.16875285518501598, + "grad_norm": 2.703125, + "kl": 0.0, + "learning_rate": 9.326407924907547e-06, + "logits/chosen": 695307520.0, + "logits/rejected": 824787584.0, + "logps/chosen": -350.8693542480469, + "logps/rejected": -385.85357666015625, + "loss": 0.0162, + "rewards/chosen": 3.6916117668151855, + "rewards/margins": 10.298025608062744, + "rewards/rejected": -6.606413841247559, + "step": 1847 + }, + { + "epoch": 0.16884422110552763, + "grad_norm": 6.3125, + "kl": 0.0, + "learning_rate": 9.325686996930116e-06, + "logits/chosen": 234061952.0, + "logits/rejected": 442112256.0, + "logps/chosen": -94.92906951904297, + "logps/rejected": -402.707763671875, + "loss": 0.0227, + "rewards/chosen": 3.5387096405029297, + "rewards/margins": 12.011983871459961, + "rewards/rejected": -8.473274230957031, + "step": 1848 + }, + { + "epoch": 0.16893558702603928, + "grad_norm": 2.625, + "kl": 0.0, + "learning_rate": 9.324965711258447e-06, + "logits/chosen": 442563584.0, + "logits/rejected": 455010080.0, + "logps/chosen": -244.31137084960938, + "logps/rejected": -334.7618408203125, + "loss": 0.017, + "rewards/chosen": 3.5705745220184326, + "rewards/margins": 11.679931402206421, + "rewards/rejected": -8.109356880187988, + "step": 1849 + }, + { + "epoch": 0.16902695294655093, + "grad_norm": 2.3125, + "kl": 0.0, + "learning_rate": 9.324244067952185e-06, + "logits/chosen": 688694016.0, + "logits/rejected": 448420571.4285714, + "logps/chosen": -215.40419006347656, + "logps/rejected": -295.74581473214283, + "loss": 0.0081, + "rewards/chosen": 2.7017288208007812, + "rewards/margins": 12.641584123883929, + "rewards/rejected": -9.939855303083148, + "step": 1850 + }, + { + "epoch": 0.16911831886706258, + "grad_norm": 4.3125, + "kl": 0.0, + "learning_rate": 9.323522067071005e-06, + "logits/chosen": 636556672.0, + "logits/rejected": 624027648.0, + "logps/chosen": -399.2300618489583, + "logps/rejected": -918.1688842773438, + "loss": 0.0481, + "rewards/chosen": 2.92584228515625, + "rewards/margins": 15.348699569702148, + "rewards/rejected": -12.422857284545898, + "step": 1851 + }, + { + "epoch": 0.16920968478757423, + "grad_norm": 11.3125, + "kl": 0.0, + "learning_rate": 9.322799708674607e-06, + "logits/chosen": 839867538.2857143, + "logits/rejected": 249044832.0, + "logps/chosen": -402.32596261160717, + "logps/rejected": -195.888916015625, + "loss": 0.0353, + "rewards/chosen": 3.506239754813058, + "rewards/margins": 10.594858510153635, + "rewards/rejected": -7.088618755340576, + "step": 1852 + }, + { + "epoch": 0.16930105070808588, + "grad_norm": 13.125, + "kl": 0.0, + "learning_rate": 9.322076992822724e-06, + "logits/chosen": 249755808.0, + "logits/rejected": 589934848.0, + "logps/chosen": -222.9307861328125, + "logps/rejected": -648.869384765625, + "loss": 0.0799, + "rewards/chosen": 3.5321593284606934, + "rewards/margins": 12.498451709747314, + "rewards/rejected": -8.966292381286621, + "step": 1853 + }, + { + "epoch": 0.16939241662859753, + "grad_norm": 3.3125, + "kl": 0.0, + "learning_rate": 9.32135391957512e-06, + "logits/chosen": 265094912.0, + "logits/rejected": 457645619.2, + "logps/chosen": -253.62618001302084, + "logps/rejected": -505.834619140625, + "loss": 0.0125, + "rewards/chosen": 4.304874420166016, + "rewards/margins": 12.770310211181641, + "rewards/rejected": -8.465435791015626, + "step": 1854 + }, + { + "epoch": 0.16948378254910917, + "grad_norm": 4.4375, + "kl": 0.0, + "learning_rate": 9.320630488991586e-06, + "logits/chosen": 427587456.0, + "logits/rejected": 458694592.0, + "logps/chosen": -272.3081359863281, + "logps/rejected": -501.8872375488281, + "loss": 0.0331, + "rewards/chosen": 2.8566055297851562, + "rewards/margins": 11.83369255065918, + "rewards/rejected": -8.977087020874023, + "step": 1855 + }, + { + "epoch": 0.16957514846962082, + "grad_norm": 3.5625, + "kl": 0.0, + "learning_rate": 9.31990670113194e-06, + "logits/chosen": 959903232.0, + "logits/rejected": 516527530.6666667, + "logps/chosen": -154.7996337890625, + "logps/rejected": -429.5159505208333, + "loss": 0.0259, + "rewards/chosen": 3.460373306274414, + "rewards/margins": 11.730465316772461, + "rewards/rejected": -8.270092010498047, + "step": 1856 + }, + { + "epoch": 0.16966651439013247, + "grad_norm": 2.4375, + "kl": 0.0, + "learning_rate": 9.319182556056037e-06, + "logits/chosen": 388585267.2, + "logits/rejected": 723976704.0, + "logps/chosen": -279.282373046875, + "logps/rejected": -733.4536946614584, + "loss": 0.0231, + "rewards/chosen": 3.692160415649414, + "rewards/margins": 16.720993169148763, + "rewards/rejected": -13.02883275349935, + "step": 1857 + }, + { + "epoch": 0.16975788031064412, + "grad_norm": 0.7109375, + "kl": 0.0, + "learning_rate": 9.318458053823754e-06, + "logits/chosen": 235344224.0, + "logits/rejected": 514708778.6666667, + "logps/chosen": -141.190185546875, + "logps/rejected": -429.1201171875, + "loss": 0.0041, + "rewards/chosen": 4.489148139953613, + "rewards/margins": 11.94273789723714, + "rewards/rejected": -7.453589757283528, + "step": 1858 + }, + { + "epoch": 0.16984924623115577, + "grad_norm": 3.078125, + "kl": 0.0, + "learning_rate": 9.317733194495e-06, + "logits/chosen": 589961600.0, + "logits/rejected": 575784533.3333334, + "logps/chosen": -334.7281494140625, + "logps/rejected": -507.7432047526042, + "loss": 0.0147, + "rewards/chosen": 2.8876495361328125, + "rewards/margins": 12.09042231241862, + "rewards/rejected": -9.202772776285807, + "step": 1859 + }, + { + "epoch": 0.16994061215166742, + "grad_norm": 2.671875, + "kl": 0.0, + "learning_rate": 9.317007978129715e-06, + "logits/chosen": 674393600.0, + "logits/rejected": 416548032.0, + "logps/chosen": -423.8736267089844, + "logps/rejected": -329.0146484375, + "loss": 0.0169, + "rewards/chosen": 3.5159597396850586, + "rewards/margins": 10.701819896697998, + "rewards/rejected": -7.1858601570129395, + "step": 1860 + }, + { + "epoch": 0.17003197807217907, + "grad_norm": 4.28125, + "kl": 0.0, + "learning_rate": 9.31628240478787e-06, + "logits/chosen": 583292800.0, + "logits/rejected": 578047914.6666666, + "logps/chosen": -550.4471435546875, + "logps/rejected": -421.648193359375, + "loss": 0.015, + "rewards/chosen": 3.168121337890625, + "rewards/margins": 11.088027318318684, + "rewards/rejected": -7.91990598042806, + "step": 1861 + }, + { + "epoch": 0.17012334399269072, + "grad_norm": 2.125, + "kl": 0.0, + "learning_rate": 9.31555647452946e-06, + "logits/chosen": 994145706.6666666, + "logits/rejected": 799787110.4, + "logps/chosen": -376.6703287760417, + "logps/rejected": -428.46708984375, + "loss": 0.0113, + "rewards/chosen": 3.7875471115112305, + "rewards/margins": 11.725822257995606, + "rewards/rejected": -7.938275146484375, + "step": 1862 + }, + { + "epoch": 0.17021470991320237, + "grad_norm": 18.0, + "kl": 0.0, + "learning_rate": 9.314830187414516e-06, + "logits/chosen": 610253824.0, + "logits/rejected": 879869120.0, + "logps/chosen": -303.3674011230469, + "logps/rejected": -493.5299072265625, + "loss": 0.0812, + "rewards/chosen": 2.3452820777893066, + "rewards/margins": 10.46338701248169, + "rewards/rejected": -8.118104934692383, + "step": 1863 + }, + { + "epoch": 0.17030607583371402, + "grad_norm": 4.5625, + "kl": 0.0, + "learning_rate": 9.314103543503093e-06, + "logits/chosen": 596605952.0, + "logits/rejected": 465591978.6666667, + "logps/chosen": -352.116259765625, + "logps/rejected": -370.9106852213542, + "loss": 0.0344, + "rewards/chosen": 3.004774475097656, + "rewards/margins": 8.580208841959635, + "rewards/rejected": -5.5754343668619795, + "step": 1864 + }, + { + "epoch": 0.17039744175422566, + "grad_norm": 30.75, + "kl": 0.0, + "learning_rate": 9.313376542855277e-06, + "logits/chosen": 432162752.0, + "logits/rejected": 534361984.0, + "logps/chosen": -337.7278747558594, + "logps/rejected": -637.660888671875, + "loss": 0.0883, + "rewards/chosen": 2.5762643814086914, + "rewards/margins": 12.74839973449707, + "rewards/rejected": -10.172135353088379, + "step": 1865 + }, + { + "epoch": 0.1704888076747373, + "grad_norm": 8.0625, + "kl": 0.0, + "learning_rate": 9.312649185531183e-06, + "logits/chosen": 305306581.3333333, + "logits/rejected": 1016836300.8, + "logps/chosen": -96.37521362304688, + "logps/rejected": -573.776953125, + "loss": 0.0749, + "rewards/chosen": 2.8439035415649414, + "rewards/margins": 15.380164527893067, + "rewards/rejected": -12.536260986328125, + "step": 1866 + }, + { + "epoch": 0.17058017359524896, + "grad_norm": 1.9296875, + "kl": 0.0, + "learning_rate": 9.311921471590964e-06, + "logits/chosen": 842150912.0, + "logits/rejected": 409427882.6666667, + "logps/chosen": -278.260498046875, + "logps/rejected": -463.65966796875, + "loss": 0.0113, + "rewards/chosen": 3.129072666168213, + "rewards/margins": 12.029086907704672, + "rewards/rejected": -8.900014241536459, + "step": 1867 + }, + { + "epoch": 0.1706715395157606, + "grad_norm": 4.3125, + "kl": 0.0, + "learning_rate": 9.311193401094787e-06, + "logits/chosen": 567684352.0, + "logits/rejected": 312180821.3333333, + "logps/chosen": -412.02353515625, + "logps/rejected": -268.54274495442706, + "loss": 0.0228, + "rewards/chosen": 3.9783355712890627, + "rewards/margins": 11.350892893473308, + "rewards/rejected": -7.372557322184245, + "step": 1868 + }, + { + "epoch": 0.17076290543627226, + "grad_norm": 9.5625, + "kl": 0.0, + "learning_rate": 9.31046497410286e-06, + "logits/chosen": 603020202.6666666, + "logits/rejected": 416891904.0, + "logps/chosen": -427.5299886067708, + "logps/rejected": -346.536474609375, + "loss": 0.0562, + "rewards/chosen": 2.3800110816955566, + "rewards/margins": 10.042151737213135, + "rewards/rejected": -7.662140655517578, + "step": 1869 + }, + { + "epoch": 0.1708542713567839, + "grad_norm": 4.1875, + "kl": 0.0, + "learning_rate": 9.309736190675417e-06, + "logits/chosen": 580166297.6, + "logits/rejected": 622142208.0, + "logps/chosen": -392.8068115234375, + "logps/rejected": -483.142578125, + "loss": 0.0319, + "rewards/chosen": 3.140277099609375, + "rewards/margins": 13.624160639444987, + "rewards/rejected": -10.483883539835611, + "step": 1870 + }, + { + "epoch": 0.17094563727729556, + "grad_norm": 4.21875, + "kl": 0.0, + "learning_rate": 9.309007050872722e-06, + "logits/chosen": 1677261824.0, + "logits/rejected": 708924364.8, + "logps/chosen": -532.9905598958334, + "logps/rejected": -509.69775390625, + "loss": 0.026, + "rewards/chosen": 2.96761163075765, + "rewards/margins": 11.722180875142415, + "rewards/rejected": -8.754569244384765, + "step": 1871 + }, + { + "epoch": 0.1710370031978072, + "grad_norm": 2.96875, + "kl": 0.0, + "learning_rate": 9.308277554755068e-06, + "logits/chosen": 630702208.0, + "logits/rejected": 865060181.3333334, + "logps/chosen": -359.1201477050781, + "logps/rejected": -439.8999430338542, + "loss": 0.0146, + "rewards/chosen": 2.874001979827881, + "rewards/margins": 11.098936875661215, + "rewards/rejected": -8.224934895833334, + "step": 1872 + }, + { + "epoch": 0.17112836911831886, + "grad_norm": 5.4375, + "kl": 0.0, + "learning_rate": 9.307547702382775e-06, + "logits/chosen": 475092121.6, + "logits/rejected": 405419178.6666667, + "logps/chosen": -234.120068359375, + "logps/rejected": -462.5439860026042, + "loss": 0.0354, + "rewards/chosen": 4.625395202636719, + "rewards/margins": 11.438662592569987, + "rewards/rejected": -6.8132673899332685, + "step": 1873 + }, + { + "epoch": 0.1712197350388305, + "grad_norm": 5.28125, + "kl": 0.0, + "learning_rate": 9.3068174938162e-06, + "logits/chosen": 540672256.0, + "logits/rejected": 503337472.0, + "logps/chosen": -299.9974670410156, + "logps/rejected": -374.9692687988281, + "loss": 0.0332, + "rewards/chosen": 3.0535595417022705, + "rewards/margins": 10.1783287525177, + "rewards/rejected": -7.12476921081543, + "step": 1874 + }, + { + "epoch": 0.17131110095934216, + "grad_norm": 3.71875, + "kl": 0.0, + "learning_rate": 9.306086929115721e-06, + "logits/chosen": 572963123.2, + "logits/rejected": 416981077.3333333, + "logps/chosen": -352.2131103515625, + "logps/rejected": -333.98777262369794, + "loss": 0.0261, + "rewards/chosen": 3.4231597900390627, + "rewards/margins": 10.736552429199218, + "rewards/rejected": -7.313392639160156, + "step": 1875 + }, + { + "epoch": 0.1714024668798538, + "grad_norm": 10.375, + "kl": 0.0, + "learning_rate": 9.30535600834175e-06, + "logits/chosen": 893297024.0, + "logits/rejected": 523741610.6666667, + "logps/chosen": -437.71173095703125, + "logps/rejected": -511.03125, + "loss": 0.0679, + "rewards/chosen": 3.1232728958129883, + "rewards/margins": 10.460667610168457, + "rewards/rejected": -7.337394714355469, + "step": 1876 + }, + { + "epoch": 0.17149383280036545, + "grad_norm": 26.25, + "kl": 0.0, + "learning_rate": 9.304624731554728e-06, + "logits/chosen": 441524224.0, + "logits/rejected": 384493397.3333333, + "logps/chosen": -358.4533935546875, + "logps/rejected": -345.4759114583333, + "loss": 0.0913, + "rewards/chosen": 3.094300651550293, + "rewards/margins": 8.974232419331868, + "rewards/rejected": -5.879931767781575, + "step": 1877 + }, + { + "epoch": 0.1715851987208771, + "grad_norm": 11.3125, + "kl": 0.0, + "learning_rate": 9.303893098815123e-06, + "logits/chosen": 734797994.6666666, + "logits/rejected": 553537945.6, + "logps/chosen": -284.256103515625, + "logps/rejected": -450.459326171875, + "loss": 0.0366, + "rewards/chosen": 3.695741653442383, + "rewards/margins": 10.9325382232666, + "rewards/rejected": -7.236796569824219, + "step": 1878 + }, + { + "epoch": 0.17167656464138875, + "grad_norm": 33.0, + "kl": 0.0, + "learning_rate": 9.303161110183434e-06, + "logits/chosen": 511277013.3333333, + "logits/rejected": 552249497.6, + "logps/chosen": -316.9219970703125, + "logps/rejected": -477.0712890625, + "loss": 0.0617, + "rewards/chosen": 2.841684023539225, + "rewards/margins": 12.252131716410318, + "rewards/rejected": -9.410447692871093, + "step": 1879 + }, + { + "epoch": 0.1717679305619004, + "grad_norm": 13.8125, + "kl": 0.0, + "learning_rate": 9.302428765720192e-06, + "logits/chosen": 362719232.0, + "logits/rejected": 441862784.0, + "logps/chosen": -317.2884216308594, + "logps/rejected": -451.3236999511719, + "loss": 0.1412, + "rewards/chosen": 1.3358879089355469, + "rewards/margins": 7.698436260223389, + "rewards/rejected": -6.362548351287842, + "step": 1880 + }, + { + "epoch": 0.17185929648241205, + "grad_norm": 6.34375, + "kl": 0.0, + "learning_rate": 9.301696065485955e-06, + "logits/chosen": 670716160.0, + "logits/rejected": 536374886.4, + "logps/chosen": -124.00592041015625, + "logps/rejected": -571.7978515625, + "loss": 0.0521, + "rewards/chosen": 2.029127279917399, + "rewards/margins": 10.628935019175211, + "rewards/rejected": -8.599807739257812, + "step": 1881 + }, + { + "epoch": 0.1719506624029237, + "grad_norm": 20.25, + "kl": 0.0, + "learning_rate": 9.300963009541308e-06, + "logits/chosen": 528918272.0, + "logits/rejected": 680142890.6666666, + "logps/chosen": -143.639892578125, + "logps/rejected": -425.3631998697917, + "loss": 0.0751, + "rewards/chosen": 2.735257625579834, + "rewards/margins": 10.331899166107178, + "rewards/rejected": -7.596641540527344, + "step": 1882 + }, + { + "epoch": 0.17204202832343535, + "grad_norm": 3.859375, + "kl": 0.0, + "learning_rate": 9.300229597946869e-06, + "logits/chosen": 600813465.6, + "logits/rejected": 881673557.3333334, + "logps/chosen": -298.7173095703125, + "logps/rejected": -356.6655680338542, + "loss": 0.031, + "rewards/chosen": 3.153506278991699, + "rewards/margins": 10.067509015401203, + "rewards/rejected": -6.914002736409505, + "step": 1883 + }, + { + "epoch": 0.172133394243947, + "grad_norm": 4.59375, + "kl": 0.0, + "learning_rate": 9.299495830763285e-06, + "logits/chosen": 470335648.0, + "logits/rejected": 686449536.0, + "logps/chosen": -398.12701416015625, + "logps/rejected": -604.5167236328125, + "loss": 0.0264, + "rewards/chosen": 3.303934097290039, + "rewards/margins": 11.590343475341797, + "rewards/rejected": -8.286409378051758, + "step": 1884 + }, + { + "epoch": 0.17222476016445865, + "grad_norm": 6.78125, + "kl": 0.0, + "learning_rate": 9.298761708051232e-06, + "logits/chosen": 494544512.0, + "logits/rejected": 407836160.0, + "logps/chosen": -288.482421875, + "logps/rejected": -388.2589416503906, + "loss": 0.0423, + "rewards/chosen": 3.3137216567993164, + "rewards/margins": 9.38309383392334, + "rewards/rejected": -6.069372177124023, + "step": 1885 + }, + { + "epoch": 0.1723161260849703, + "grad_norm": 3.265625, + "kl": 0.0, + "learning_rate": 9.298027229871417e-06, + "logits/chosen": 529927808.0, + "logits/rejected": 587929792.0, + "logps/chosen": -402.36700439453125, + "logps/rejected": -650.877685546875, + "loss": 0.0221, + "rewards/chosen": 3.2165608406066895, + "rewards/margins": 11.908775806427002, + "rewards/rejected": -8.692214965820312, + "step": 1886 + }, + { + "epoch": 0.17240749200548194, + "grad_norm": 2.078125, + "kl": 0.0, + "learning_rate": 9.297292396284569e-06, + "logits/chosen": 512878400.0, + "logits/rejected": 1142140288.0, + "logps/chosen": -234.37220764160156, + "logps/rejected": -532.5510864257812, + "loss": 0.0166, + "rewards/chosen": 3.9676642417907715, + "rewards/margins": 11.056822776794434, + "rewards/rejected": -7.089158535003662, + "step": 1887 + }, + { + "epoch": 0.1724988579259936, + "grad_norm": 3.0625, + "kl": 0.0, + "learning_rate": 9.296557207351456e-06, + "logits/chosen": 524205312.0, + "logits/rejected": 638711552.0, + "logps/chosen": -334.6617431640625, + "logps/rejected": -476.90875244140625, + "loss": 0.0283, + "rewards/chosen": 3.04103946685791, + "rewards/margins": 9.849244594573975, + "rewards/rejected": -6.8082051277160645, + "step": 1888 + }, + { + "epoch": 0.17259022384650524, + "grad_norm": 4.375, + "kl": 0.0, + "learning_rate": 9.29582166313287e-06, + "logits/chosen": 779469824.0, + "logits/rejected": 905090240.0, + "logps/chosen": -361.3773193359375, + "logps/rejected": -593.4288330078125, + "loss": 0.0419, + "rewards/chosen": 3.081686019897461, + "rewards/margins": 13.106554985046387, + "rewards/rejected": -10.024868965148926, + "step": 1889 + }, + { + "epoch": 0.1726815897670169, + "grad_norm": 0.77734375, + "kl": 0.0, + "learning_rate": 9.295085763689634e-06, + "logits/chosen": 239491696.0, + "logits/rejected": 340172873.14285713, + "logps/chosen": -225.69334411621094, + "logps/rejected": -513.1916155133929, + "loss": 0.0028, + "rewards/chosen": 6.436262607574463, + "rewards/margins": 14.329740456172399, + "rewards/rejected": -7.893477848597935, + "step": 1890 + }, + { + "epoch": 0.17277295568752854, + "grad_norm": 4.78125, + "kl": 0.0, + "learning_rate": 9.294349509082601e-06, + "logits/chosen": 477566156.8, + "logits/rejected": 378461482.6666667, + "logps/chosen": -267.16611328125, + "logps/rejected": -617.4881184895834, + "loss": 0.0351, + "rewards/chosen": 3.1812225341796876, + "rewards/margins": 12.092690404256185, + "rewards/rejected": -8.911467870076498, + "step": 1891 + }, + { + "epoch": 0.1728643216080402, + "grad_norm": 28.75, + "kl": 0.0, + "learning_rate": 9.293612899372651e-06, + "logits/chosen": 517610656.0, + "logits/rejected": 1040174400.0, + "logps/chosen": -263.08367919921875, + "logps/rejected": -527.1348266601562, + "loss": 0.071, + "rewards/chosen": 2.5335693359375, + "rewards/margins": 7.957460403442383, + "rewards/rejected": -5.423891067504883, + "step": 1892 + }, + { + "epoch": 0.17295568752855184, + "grad_norm": 9.125, + "kl": 0.0, + "learning_rate": 9.292875934620696e-06, + "logits/chosen": 742288640.0, + "logits/rejected": 436194150.4, + "logps/chosen": -364.3481852213542, + "logps/rejected": -280.911181640625, + "loss": 0.0361, + "rewards/chosen": 3.8636900583902993, + "rewards/margins": 9.757876459757487, + "rewards/rejected": -5.894186401367188, + "step": 1893 + }, + { + "epoch": 0.1730470534490635, + "grad_norm": 1.6328125, + "kl": 0.0, + "learning_rate": 9.292138614887675e-06, + "logits/chosen": 360500650.6666667, + "logits/rejected": 231464140.8, + "logps/chosen": -271.85227457682294, + "logps/rejected": -396.313671875, + "loss": 0.0078, + "rewards/chosen": 4.442700703938802, + "rewards/margins": 13.692381795247396, + "rewards/rejected": -9.249681091308593, + "step": 1894 + }, + { + "epoch": 0.17313841936957514, + "grad_norm": 7.8125, + "kl": 0.0, + "learning_rate": 9.29140094023456e-06, + "logits/chosen": 1015834931.2, + "logits/rejected": 566814293.3333334, + "logps/chosen": -368.0677001953125, + "logps/rejected": -311.9751790364583, + "loss": 0.1297, + "rewards/chosen": 2.2599361419677733, + "rewards/margins": 10.26147092183431, + "rewards/rejected": -8.001534779866537, + "step": 1895 + }, + { + "epoch": 0.17322978529008679, + "grad_norm": 5.15625, + "kl": 0.0, + "learning_rate": 9.290662910722346e-06, + "logits/chosen": 502154496.0, + "logits/rejected": 189438944.0, + "logps/chosen": -328.6575927734375, + "logps/rejected": -227.00888061523438, + "loss": 0.047, + "rewards/chosen": 2.9087864557902017, + "rewards/margins": 10.480359236399332, + "rewards/rejected": -7.571572780609131, + "step": 1896 + }, + { + "epoch": 0.17332115121059843, + "grad_norm": 4.84375, + "kl": 0.0, + "learning_rate": 9.289924526412064e-06, + "logits/chosen": 556696000.0, + "logits/rejected": 874593792.0, + "logps/chosen": -403.8948059082031, + "logps/rejected": -608.853515625, + "loss": 0.0299, + "rewards/chosen": 3.258042335510254, + "rewards/margins": 10.748358249664307, + "rewards/rejected": -7.490315914154053, + "step": 1897 + }, + { + "epoch": 0.17341251713111008, + "grad_norm": 3.421875, + "kl": 0.0, + "learning_rate": 9.28918578736477e-06, + "logits/chosen": 492685472.0, + "logits/rejected": 751042944.0, + "logps/chosen": -317.60986328125, + "logps/rejected": -567.2759399414062, + "loss": 0.0174, + "rewards/chosen": 4.0303754806518555, + "rewards/margins": 12.659979820251465, + "rewards/rejected": -8.62960433959961, + "step": 1898 + }, + { + "epoch": 0.17350388305162173, + "grad_norm": 20.25, + "kl": 0.0, + "learning_rate": 9.288446693641552e-06, + "logits/chosen": 363490432.0, + "logits/rejected": 663047680.0, + "logps/chosen": -250.66084798177084, + "logps/rejected": -581.53408203125, + "loss": 0.0625, + "rewards/chosen": 3.5381387074788413, + "rewards/margins": 9.919595082600912, + "rewards/rejected": -6.38145637512207, + "step": 1899 + }, + { + "epoch": 0.17359524897213338, + "grad_norm": 8.3125, + "kl": 0.0, + "learning_rate": 9.287707245303525e-06, + "logits/chosen": 1018644053.3333334, + "logits/rejected": 561418137.6, + "logps/chosen": -168.35073852539062, + "logps/rejected": -277.01240234375, + "loss": 0.1182, + "rewards/chosen": 2.697333017985026, + "rewards/margins": 9.702955118815105, + "rewards/rejected": -7.005622100830078, + "step": 1900 + }, + { + "epoch": 0.17368661489264503, + "grad_norm": 4.15625, + "kl": 0.0, + "learning_rate": 9.286967442411836e-06, + "logits/chosen": 784560725.3333334, + "logits/rejected": 554339264.0, + "logps/chosen": -326.0351155598958, + "logps/rejected": -589.5858764648438, + "loss": 0.0328, + "rewards/chosen": 3.2809295654296875, + "rewards/margins": 15.704547882080078, + "rewards/rejected": -12.42361831665039, + "step": 1901 + }, + { + "epoch": 0.17377798081315668, + "grad_norm": 2.53125, + "kl": 0.0, + "learning_rate": 9.28622728502766e-06, + "logits/chosen": 750240384.0, + "logits/rejected": 488963114.6666667, + "logps/chosen": -356.78021240234375, + "logps/rejected": -482.4309488932292, + "loss": 0.012, + "rewards/chosen": 3.154698371887207, + "rewards/margins": 10.953206062316895, + "rewards/rejected": -7.7985076904296875, + "step": 1902 + }, + { + "epoch": 0.17386934673366833, + "grad_norm": 15.75, + "kl": 0.0, + "learning_rate": 9.2854867732122e-06, + "logits/chosen": 405486182.4, + "logits/rejected": 476030933.3333333, + "logps/chosen": -392.5158203125, + "logps/rejected": -865.7008463541666, + "loss": 0.1106, + "rewards/chosen": 2.6365997314453127, + "rewards/margins": 14.588949076334636, + "rewards/rejected": -11.952349344889322, + "step": 1903 + }, + { + "epoch": 0.17396071265417998, + "grad_norm": 5.125, + "kl": 0.0, + "learning_rate": 9.28474590702669e-06, + "logits/chosen": 540976640.0, + "logits/rejected": 587307520.0, + "logps/chosen": -323.2883707682292, + "logps/rejected": -255.6197509765625, + "loss": 0.038, + "rewards/chosen": 3.347906748453776, + "rewards/margins": 8.770799318949381, + "rewards/rejected": -5.4228925704956055, + "step": 1904 + }, + { + "epoch": 0.17405207857469163, + "grad_norm": 4.125, + "kl": 0.0, + "learning_rate": 9.28400468653239e-06, + "logits/chosen": 483445248.0, + "logits/rejected": 539394474.6666666, + "logps/chosen": -364.2879638671875, + "logps/rejected": -615.5243326822916, + "loss": 0.109, + "rewards/chosen": 3.060135269165039, + "rewards/margins": 12.23172353108724, + "rewards/rejected": -9.171588261922201, + "step": 1905 + }, + { + "epoch": 0.17414344449520328, + "grad_norm": 4.1875, + "kl": 0.0, + "learning_rate": 9.283263111790597e-06, + "logits/chosen": 668296234.6666666, + "logits/rejected": 521505433.6, + "logps/chosen": -252.54423014322916, + "logps/rejected": -416.6634765625, + "loss": 0.0215, + "rewards/chosen": 3.281336466471354, + "rewards/margins": 10.263585917154948, + "rewards/rejected": -6.982249450683594, + "step": 1906 + }, + { + "epoch": 0.17423481041571492, + "grad_norm": 6.46875, + "kl": 0.0, + "learning_rate": 9.28252118286263e-06, + "logits/chosen": 604599808.0, + "logits/rejected": 449248597.3333333, + "logps/chosen": -250.0884765625, + "logps/rejected": -337.59161376953125, + "loss": 0.048, + "rewards/chosen": 3.3350250244140627, + "rewards/margins": 9.319221369425456, + "rewards/rejected": -5.9841963450113935, + "step": 1907 + }, + { + "epoch": 0.17432617633622657, + "grad_norm": 19.25, + "kl": 0.0, + "learning_rate": 9.281778899809837e-06, + "logits/chosen": 590131584.0, + "logits/rejected": 616331520.0, + "logps/chosen": -213.19345092773438, + "logps/rejected": -410.09637451171875, + "loss": 0.0879, + "rewards/chosen": 2.8401637077331543, + "rewards/margins": 10.768201351165771, + "rewards/rejected": -7.928037643432617, + "step": 1908 + }, + { + "epoch": 0.17441754225673822, + "grad_norm": 21.75, + "kl": 0.0, + "learning_rate": 9.281036262693604e-06, + "logits/chosen": 462859878.4, + "logits/rejected": 268796160.0, + "logps/chosen": -280.44150390625, + "logps/rejected": -303.68206787109375, + "loss": 0.0734, + "rewards/chosen": 3.5119110107421876, + "rewards/margins": 12.373520151774088, + "rewards/rejected": -8.8616091410319, + "step": 1909 + }, + { + "epoch": 0.1745089081772499, + "grad_norm": 6.25, + "kl": 0.0, + "learning_rate": 9.280293271575333e-06, + "logits/chosen": 643872704.0, + "logits/rejected": 392618581.3333333, + "logps/chosen": -568.8242797851562, + "logps/rejected": -391.539306640625, + "loss": 0.0232, + "rewards/chosen": 3.0024003982543945, + "rewards/margins": 12.263635953267416, + "rewards/rejected": -9.261235555013021, + "step": 1910 + }, + { + "epoch": 0.17460027409776155, + "grad_norm": 2.65625, + "kl": 0.0, + "learning_rate": 9.27954992651647e-06, + "logits/chosen": 657173350.4, + "logits/rejected": 315458069.3333333, + "logps/chosen": -290.181982421875, + "logps/rejected": -366.5541178385417, + "loss": 0.0208, + "rewards/chosen": 3.5871185302734374, + "rewards/margins": 12.526413981119791, + "rewards/rejected": -8.939295450846354, + "step": 1911 + }, + { + "epoch": 0.1746916400182732, + "grad_norm": 12.125, + "kl": 0.0, + "learning_rate": 9.278806227578478e-06, + "logits/chosen": 593038080.0, + "logits/rejected": 543713152.0, + "logps/chosen": -293.632080078125, + "logps/rejected": -478.00433349609375, + "loss": 0.0688, + "rewards/chosen": 2.0218915939331055, + "rewards/margins": 12.62669849395752, + "rewards/rejected": -10.604806900024414, + "step": 1912 + }, + { + "epoch": 0.17478300593878485, + "grad_norm": 7.875, + "kl": 0.0, + "learning_rate": 9.278062174822853e-06, + "logits/chosen": 783043993.6, + "logits/rejected": 449599232.0, + "logps/chosen": -387.474365234375, + "logps/rejected": -417.329345703125, + "loss": 0.0465, + "rewards/chosen": 2.893560791015625, + "rewards/margins": 9.56040547688802, + "rewards/rejected": -6.6668446858723955, + "step": 1913 + }, + { + "epoch": 0.1748743718592965, + "grad_norm": 3.859375, + "kl": 0.0, + "learning_rate": 9.277317768311125e-06, + "logits/chosen": 1190487637.3333333, + "logits/rejected": 729632716.8, + "logps/chosen": -375.3846028645833, + "logps/rejected": -547.20341796875, + "loss": 0.018, + "rewards/chosen": 3.382523854573568, + "rewards/margins": 13.68356450398763, + "rewards/rejected": -10.301040649414062, + "step": 1914 + }, + { + "epoch": 0.17496573777980814, + "grad_norm": 5.6875, + "kl": 0.0, + "learning_rate": 9.276573008104848e-06, + "logits/chosen": 457888298.6666667, + "logits/rejected": 600093056.0, + "logps/chosen": -358.4273681640625, + "logps/rejected": -458.5676574707031, + "loss": 0.0319, + "rewards/chosen": 3.957758585611979, + "rewards/margins": 11.74785296122233, + "rewards/rejected": -7.790094375610352, + "step": 1915 + }, + { + "epoch": 0.1750571037003198, + "grad_norm": 23.5, + "kl": 0.0, + "learning_rate": 9.275827894265606e-06, + "logits/chosen": 637472085.3333334, + "logits/rejected": 586265920.0, + "logps/chosen": -275.41530354817706, + "logps/rejected": -491.9309997558594, + "loss": 0.0915, + "rewards/chosen": 2.581294377644857, + "rewards/margins": 11.08316167195638, + "rewards/rejected": -8.501867294311523, + "step": 1916 + }, + { + "epoch": 0.17514846962083144, + "grad_norm": 1.265625, + "kl": 0.0, + "learning_rate": 9.275082426855013e-06, + "logits/chosen": 801982720.0, + "logits/rejected": 407116928.0, + "logps/chosen": -224.68988037109375, + "logps/rejected": -367.5555013020833, + "loss": 0.0089, + "rewards/chosen": 3.9155235290527344, + "rewards/margins": 11.358173370361328, + "rewards/rejected": -7.442649841308594, + "step": 1917 + }, + { + "epoch": 0.1752398355413431, + "grad_norm": 24.875, + "kl": 0.0, + "learning_rate": 9.274336605934714e-06, + "logits/chosen": 463147827.2, + "logits/rejected": 273783829.3333333, + "logps/chosen": -271.209912109375, + "logps/rejected": -393.94775390625, + "loss": 0.0743, + "rewards/chosen": 2.3733776092529295, + "rewards/margins": 9.873827107747395, + "rewards/rejected": -7.500449498494466, + "step": 1918 + }, + { + "epoch": 0.17533120146185474, + "grad_norm": 9.625, + "kl": 0.0, + "learning_rate": 9.27359043156638e-06, + "logits/chosen": 531650208.0, + "logits/rejected": 373783488.0, + "logps/chosen": -411.3270263671875, + "logps/rejected": -415.6524658203125, + "loss": 0.0482, + "rewards/chosen": 2.7163422107696533, + "rewards/margins": 12.721844911575317, + "rewards/rejected": -10.005502700805664, + "step": 1919 + }, + { + "epoch": 0.1754225673823664, + "grad_norm": 5.0625, + "kl": 0.0, + "learning_rate": 9.272843903811712e-06, + "logits/chosen": 519122432.0, + "logits/rejected": 403649952.0, + "logps/chosen": -388.4787902832031, + "logps/rejected": -502.7585754394531, + "loss": 0.0299, + "rewards/chosen": 2.98114013671875, + "rewards/margins": 11.810144424438477, + "rewards/rejected": -8.829004287719727, + "step": 1920 + }, + { + "epoch": 0.17551393330287804, + "grad_norm": 19.25, + "kl": 0.0, + "learning_rate": 9.272097022732444e-06, + "logits/chosen": 405931136.0, + "logits/rejected": 478234828.8, + "logps/chosen": -319.2840169270833, + "logps/rejected": -424.07177734375, + "loss": 0.072, + "rewards/chosen": 3.4970973332722983, + "rewards/margins": 7.74969285329183, + "rewards/rejected": -4.252595520019531, + "step": 1921 + }, + { + "epoch": 0.1756052992233897, + "grad_norm": 2.75, + "kl": 0.0, + "learning_rate": 9.271349788390332e-06, + "logits/chosen": 572203690.6666666, + "logits/rejected": 494945728.0, + "logps/chosen": -277.4100748697917, + "logps/rejected": -542.2294921875, + "loss": 0.0182, + "rewards/chosen": 3.8590447107950845, + "rewards/margins": 12.148015658060709, + "rewards/rejected": -8.288970947265625, + "step": 1922 + }, + { + "epoch": 0.17569666514390134, + "grad_norm": 6.15625, + "kl": 0.0, + "learning_rate": 9.270602200847168e-06, + "logits/chosen": 532700032.0, + "logits/rejected": 847262656.0, + "logps/chosen": -425.6537780761719, + "logps/rejected": -466.53997802734375, + "loss": 0.0295, + "rewards/chosen": 3.981186628341675, + "rewards/margins": 10.984274625778198, + "rewards/rejected": -7.003087997436523, + "step": 1923 + }, + { + "epoch": 0.175788031064413, + "grad_norm": 0.494140625, + "kl": 0.0, + "learning_rate": 9.26985426016477e-06, + "logits/rejected": 907607296.0, + "logps/rejected": -489.72900390625, + "loss": 0.0016, + "rewards/rejected": -8.017711639404297, + "step": 1924 + }, + { + "epoch": 0.17587939698492464, + "grad_norm": 3.0, + "kl": 0.0, + "learning_rate": 9.269105966404987e-06, + "logits/chosen": 497869312.0, + "logits/rejected": 1073351936.0, + "logps/chosen": -307.8058268229167, + "logps/rejected": -567.5669555664062, + "loss": 0.0189, + "rewards/chosen": 3.873955408732096, + "rewards/margins": 13.407687822977701, + "rewards/rejected": -9.533732414245605, + "step": 1925 + }, + { + "epoch": 0.17597076290543628, + "grad_norm": 4.65625, + "kl": 0.0, + "learning_rate": 9.268357319629694e-06, + "logits/chosen": 1218731827.2, + "logits/rejected": 2188763818.6666665, + "logps/chosen": -246.957763671875, + "logps/rejected": -738.75244140625, + "loss": 0.0399, + "rewards/chosen": 2.9947128295898438, + "rewards/margins": 12.002477645874023, + "rewards/rejected": -9.00776481628418, + "step": 1926 + }, + { + "epoch": 0.17606212882594793, + "grad_norm": 4.40625, + "kl": 0.0, + "learning_rate": 9.267608319900798e-06, + "logits/chosen": 663174582.8571428, + "logits/rejected": 506906528.0, + "logps/chosen": -317.80650111607144, + "logps/rejected": -387.4688415527344, + "loss": 0.0368, + "rewards/chosen": 3.2420637948172435, + "rewards/margins": 13.428956304277692, + "rewards/rejected": -10.18689250946045, + "step": 1927 + }, + { + "epoch": 0.17615349474645958, + "grad_norm": 3.21875, + "kl": 0.0, + "learning_rate": 9.266858967280232e-06, + "logits/chosen": 595402496.0, + "logits/rejected": 562035456.0, + "logps/chosen": -438.7962646484375, + "logps/rejected": -485.13848876953125, + "loss": 0.0193, + "rewards/chosen": 3.5639243125915527, + "rewards/margins": 12.01426649093628, + "rewards/rejected": -8.450342178344727, + "step": 1928 + }, + { + "epoch": 0.17624486066697123, + "grad_norm": 3.390625, + "kl": 0.0, + "learning_rate": 9.266109261829965e-06, + "logits/chosen": 796927948.8, + "logits/rejected": 546312277.3333334, + "logps/chosen": -301.288916015625, + "logps/rejected": -455.7247721354167, + "loss": 0.0236, + "rewards/chosen": 3.664259338378906, + "rewards/margins": 12.71814448038737, + "rewards/rejected": -9.053885142008463, + "step": 1929 + }, + { + "epoch": 0.17633622658748288, + "grad_norm": 10.625, + "kl": 0.0, + "learning_rate": 9.265359203611988e-06, + "logits/chosen": 424872448.0, + "logits/rejected": 478803456.0, + "logps/chosen": -280.73638916015625, + "logps/rejected": -421.799560546875, + "loss": 0.0547, + "rewards/chosen": 2.5788888931274414, + "rewards/margins": 8.775819778442383, + "rewards/rejected": -6.196930885314941, + "step": 1930 + }, + { + "epoch": 0.17642759250799453, + "grad_norm": 9.9375, + "kl": 0.0, + "learning_rate": 9.264608792688326e-06, + "logits/chosen": 437077350.4, + "logits/rejected": 472384981.3333333, + "logps/chosen": -399.751025390625, + "logps/rejected": -324.3732503255208, + "loss": 0.1295, + "rewards/chosen": 3.7534271240234376, + "rewards/margins": 7.924462890625, + "rewards/rejected": -4.1710357666015625, + "step": 1931 + }, + { + "epoch": 0.17651895842850618, + "grad_norm": 3.90625, + "kl": 0.0, + "learning_rate": 9.263858029121026e-06, + "logits/chosen": 567819648.0, + "logits/rejected": 599042688.0, + "logps/chosen": -212.13516235351562, + "logps/rejected": -645.7733154296875, + "loss": 0.0285, + "rewards/chosen": 3.275007724761963, + "rewards/margins": 11.229980945587158, + "rewards/rejected": -7.954973220825195, + "step": 1932 + }, + { + "epoch": 0.17661032434901783, + "grad_norm": 2.3125, + "kl": 0.0, + "learning_rate": 9.263106912972174e-06, + "logits/chosen": 1138745600.0, + "logits/rejected": 666508873.1428572, + "logps/chosen": -345.49853515625, + "logps/rejected": -626.8133370535714, + "loss": 0.0089, + "rewards/chosen": 2.87919020652771, + "rewards/margins": 11.919418369020734, + "rewards/rejected": -9.040228162493024, + "step": 1933 + }, + { + "epoch": 0.17670169026952948, + "grad_norm": 2.15625, + "kl": 0.0, + "learning_rate": 9.262355444303878e-06, + "logits/chosen": 542796748.8, + "logits/rejected": 411483264.0, + "logps/chosen": -362.069189453125, + "logps/rejected": -507.0847574869792, + "loss": 0.0151, + "rewards/chosen": 3.9748016357421876, + "rewards/margins": 12.783514912923177, + "rewards/rejected": -8.80871327718099, + "step": 1934 + }, + { + "epoch": 0.17679305619004113, + "grad_norm": 3.984375, + "kl": 0.0, + "learning_rate": 9.261603623178281e-06, + "logits/chosen": 506042208.0, + "logits/rejected": 549646592.0, + "logps/chosen": -254.3518829345703, + "logps/rejected": -750.11767578125, + "loss": 0.0145, + "rewards/chosen": 3.7080368995666504, + "rewards/margins": 13.15801477432251, + "rewards/rejected": -9.44997787475586, + "step": 1935 + }, + { + "epoch": 0.17688442211055277, + "grad_norm": 12.6875, + "kl": 0.0, + "learning_rate": 9.260851449657546e-06, + "logits/chosen": 464040768.0, + "logits/rejected": 696208704.0, + "logps/chosen": -201.8356475830078, + "logps/rejected": -335.02679443359375, + "loss": 0.0466, + "rewards/chosen": 2.932185649871826, + "rewards/margins": 11.179284572601318, + "rewards/rejected": -8.247098922729492, + "step": 1936 + }, + { + "epoch": 0.17697578803106442, + "grad_norm": 23.875, + "kl": 0.0, + "learning_rate": 9.260098923803876e-06, + "logits/chosen": 569100928.0, + "logits/rejected": 296225600.0, + "logps/chosen": -489.8409118652344, + "logps/rejected": -384.5794677734375, + "loss": 0.1071, + "rewards/chosen": 2.814666509628296, + "rewards/margins": 10.23485016822815, + "rewards/rejected": -7.4201836585998535, + "step": 1937 + }, + { + "epoch": 0.17706715395157607, + "grad_norm": 15.625, + "kl": 0.0, + "learning_rate": 9.259346045679495e-06, + "logits/chosen": 698613632.0, + "logits/rejected": 467155200.0, + "logps/chosen": -492.9380798339844, + "logps/rejected": -389.7303466796875, + "loss": 0.0777, + "rewards/chosen": 3.2320425510406494, + "rewards/margins": 10.313303232192993, + "rewards/rejected": -7.081260681152344, + "step": 1938 + }, + { + "epoch": 0.17715851987208772, + "grad_norm": 5.90625, + "kl": 0.0, + "learning_rate": 9.258592815346659e-06, + "logits/chosen": 615587072.0, + "logits/rejected": 698660608.0, + "logps/chosen": -734.7005615234375, + "logps/rejected": -876.38671875, + "loss": 0.0155, + "rewards/chosen": 2.9841294288635254, + "rewards/margins": 14.093149662017822, + "rewards/rejected": -11.109020233154297, + "step": 1939 + }, + { + "epoch": 0.17724988579259937, + "grad_norm": 3.875, + "kl": 0.0, + "learning_rate": 9.257839232867655e-06, + "logits/chosen": 335969344.0, + "logits/rejected": 285182464.0, + "logps/chosen": -227.9916534423828, + "logps/rejected": -418.22430419921875, + "loss": 0.0234, + "rewards/chosen": 3.3279519081115723, + "rewards/margins": 14.409595966339111, + "rewards/rejected": -11.081644058227539, + "step": 1940 + }, + { + "epoch": 0.17734125171311102, + "grad_norm": 2.171875, + "kl": 0.0, + "learning_rate": 9.257085298304796e-06, + "logits/chosen": 391271338.6666667, + "logits/rejected": 606937395.2, + "logps/chosen": -276.8111572265625, + "logps/rejected": -620.073046875, + "loss": 0.0125, + "rewards/chosen": 3.9704786936442056, + "rewards/margins": 13.512622706095376, + "rewards/rejected": -9.542144012451171, + "step": 1941 + }, + { + "epoch": 0.17743261763362267, + "grad_norm": 8.25, + "kl": 0.0, + "learning_rate": 9.256331011720425e-06, + "logits/chosen": 374315059.2, + "logits/rejected": 685436330.6666666, + "logps/chosen": -317.066259765625, + "logps/rejected": -393.5767415364583, + "loss": 0.1427, + "rewards/chosen": 2.4717689514160157, + "rewards/margins": 10.688872909545898, + "rewards/rejected": -8.217103958129883, + "step": 1942 + }, + { + "epoch": 0.17752398355413432, + "grad_norm": 4.40625, + "kl": 0.0, + "learning_rate": 9.255576373176915e-06, + "logits/chosen": 632350617.6, + "logits/rejected": 1233026133.3333333, + "logps/chosen": -304.1330078125, + "logps/rejected": -509.9574381510417, + "loss": 0.0293, + "rewards/chosen": 3.520611572265625, + "rewards/margins": 12.69243952433268, + "rewards/rejected": -9.171827952067057, + "step": 1943 + }, + { + "epoch": 0.17761534947464597, + "grad_norm": 9.8125, + "kl": 0.0, + "learning_rate": 9.254821382736668e-06, + "logits/chosen": 905840469.3333334, + "logits/rejected": 582882099.2, + "logps/chosen": -297.7476806640625, + "logps/rejected": -485.916650390625, + "loss": 0.0295, + "rewards/chosen": 3.4039576848347983, + "rewards/margins": 10.482250531514486, + "rewards/rejected": -7.0782928466796875, + "step": 1944 + }, + { + "epoch": 0.17770671539515762, + "grad_norm": 2.921875, + "kl": 0.0, + "learning_rate": 9.254066040462115e-06, + "logits/chosen": 553387712.0, + "logits/rejected": 712353600.0, + "logps/chosen": -384.4169921875, + "logps/rejected": -550.197265625, + "loss": 0.0184, + "rewards/chosen": 3.335653781890869, + "rewards/margins": 11.775879383087158, + "rewards/rejected": -8.440225601196289, + "step": 1945 + }, + { + "epoch": 0.17779808131566927, + "grad_norm": 4.3125, + "kl": 0.0, + "learning_rate": 9.253310346415714e-06, + "logits/chosen": 547349376.0, + "logits/rejected": 741811712.0, + "logps/chosen": -533.8009033203125, + "logps/rejected": -509.6959228515625, + "loss": 0.0176, + "rewards/chosen": 2.6300079822540283, + "rewards/margins": 10.466742118199665, + "rewards/rejected": -7.836734135945638, + "step": 1946 + }, + { + "epoch": 0.17788944723618091, + "grad_norm": 3.25, + "kl": 0.0, + "learning_rate": 9.252554300659956e-06, + "logits/chosen": 412833056.0, + "logits/rejected": 297235904.0, + "logps/chosen": -347.9714050292969, + "logps/rejected": -522.3819580078125, + "loss": 0.0138, + "rewards/chosen": 4.1974945068359375, + "rewards/margins": 13.657283782958984, + "rewards/rejected": -9.459789276123047, + "step": 1947 + }, + { + "epoch": 0.17798081315669256, + "grad_norm": 7.46875, + "kl": 0.0, + "learning_rate": 9.251797903257358e-06, + "logits/chosen": 890408320.0, + "logits/rejected": 530724448.0, + "logps/chosen": -399.50323486328125, + "logps/rejected": -435.25372314453125, + "loss": 0.1378, + "rewards/chosen": 1.5327708721160889, + "rewards/margins": 10.833319425582886, + "rewards/rejected": -9.300548553466797, + "step": 1948 + }, + { + "epoch": 0.1780721790772042, + "grad_norm": 5.3125, + "kl": 0.0, + "learning_rate": 9.251041154270469e-06, + "logits/chosen": 596179660.8, + "logits/rejected": 479776170.6666667, + "logps/chosen": -396.9725830078125, + "logps/rejected": -563.3741048177084, + "loss": 0.0344, + "rewards/chosen": 3.2700374603271483, + "rewards/margins": 13.948664474487305, + "rewards/rejected": -10.678627014160156, + "step": 1949 + }, + { + "epoch": 0.17816354499771586, + "grad_norm": 5.75, + "kl": 0.0, + "learning_rate": 9.25028405376186e-06, + "logits/chosen": 594590976.0, + "logits/rejected": 529199680.0, + "logps/chosen": -320.33026123046875, + "logps/rejected": -475.57794189453125, + "loss": 0.0417, + "rewards/chosen": 2.4444727897644043, + "rewards/margins": 10.979018688201904, + "rewards/rejected": -8.5345458984375, + "step": 1950 + }, + { + "epoch": 0.1782549109182275, + "grad_norm": 4.40625, + "kl": 0.0, + "learning_rate": 9.24952660179414e-06, + "logits/chosen": 671443520.0, + "logits/rejected": 351267200.0, + "logps/chosen": -273.5655212402344, + "logps/rejected": -346.8348388671875, + "loss": 0.0242, + "rewards/chosen": 3.410454034805298, + "rewards/margins": 11.555839776992798, + "rewards/rejected": -8.1453857421875, + "step": 1951 + }, + { + "epoch": 0.17834627683873916, + "grad_norm": 2.78125, + "kl": 0.0, + "learning_rate": 9.248768798429944e-06, + "logits/chosen": 654138880.0, + "logits/rejected": 917924249.6, + "logps/chosen": -368.3275960286458, + "logps/rejected": -723.292919921875, + "loss": 0.0125, + "rewards/chosen": 3.753774642944336, + "rewards/margins": 13.125934219360351, + "rewards/rejected": -9.372159576416015, + "step": 1952 + }, + { + "epoch": 0.1784376427592508, + "grad_norm": 9.0, + "kl": 0.0, + "learning_rate": 9.248010643731936e-06, + "logits/chosen": 513942186.6666667, + "logits/rejected": 911492224.0, + "logps/chosen": -288.625244140625, + "logps/rejected": -389.84637451171875, + "loss": 0.0594, + "rewards/chosen": 3.7296584447224936, + "rewards/margins": 7.648181279500326, + "rewards/rejected": -3.918522834777832, + "step": 1953 + }, + { + "epoch": 0.17852900867976246, + "grad_norm": 3.734375, + "kl": 0.0, + "learning_rate": 9.247252137762803e-06, + "logits/chosen": 828877107.2, + "logits/rejected": 641172480.0, + "logps/chosen": -286.83349609375, + "logps/rejected": -465.1812744140625, + "loss": 0.0278, + "rewards/chosen": 3.747294235229492, + "rewards/margins": 11.96932487487793, + "rewards/rejected": -8.222030639648438, + "step": 1954 + }, + { + "epoch": 0.1786203746002741, + "grad_norm": 12.25, + "kl": 0.0, + "learning_rate": 9.24649328058527e-06, + "logits/chosen": 568035072.0, + "logits/rejected": 466237408.0, + "logps/chosen": -385.8712158203125, + "logps/rejected": -600.0538330078125, + "loss": 0.0625, + "rewards/chosen": 2.5026679039001465, + "rewards/margins": 13.620192050933838, + "rewards/rejected": -11.117524147033691, + "step": 1955 + }, + { + "epoch": 0.17871174052078576, + "grad_norm": 3.375, + "kl": 0.0, + "learning_rate": 9.24573407226209e-06, + "logits/chosen": 684407125.3333334, + "logits/rejected": 1269634764.8, + "logps/chosen": -325.61488850911456, + "logps/rejected": -687.159033203125, + "loss": 0.0202, + "rewards/chosen": 3.528315226236979, + "rewards/margins": 13.700773111979165, + "rewards/rejected": -10.172457885742187, + "step": 1956 + }, + { + "epoch": 0.1788031064412974, + "grad_norm": 14.25, + "kl": 0.0, + "learning_rate": 9.244974512856038e-06, + "logits/chosen": 643662848.0, + "logits/rejected": 426344960.0, + "logps/chosen": -318.391015625, + "logps/rejected": -342.6014811197917, + "loss": 0.0296, + "rewards/chosen": 3.5444015502929687, + "rewards/margins": 13.740956370035807, + "rewards/rejected": -10.196554819742838, + "step": 1957 + }, + { + "epoch": 0.17889447236180905, + "grad_norm": 6.15625, + "kl": 0.0, + "learning_rate": 9.244214602429924e-06, + "logits/chosen": 540007509.3333334, + "logits/rejected": 594256512.0, + "logps/chosen": -302.9027099609375, + "logps/rejected": -505.2573547363281, + "loss": 0.0392, + "rewards/chosen": 3.2591126759847007, + "rewards/margins": 9.181021054585775, + "rewards/rejected": -5.921908378601074, + "step": 1958 + }, + { + "epoch": 0.1789858382823207, + "grad_norm": 5.5, + "kl": 0.0, + "learning_rate": 9.243454341046588e-06, + "logits/chosen": 700350098.2857143, + "logits/rejected": 663070464.0, + "logps/chosen": -309.77682059151783, + "logps/rejected": -933.7528076171875, + "loss": 0.048, + "rewards/chosen": 2.8916402544294084, + "rewards/margins": 14.538210460117885, + "rewards/rejected": -11.646570205688477, + "step": 1959 + }, + { + "epoch": 0.17907720420283235, + "grad_norm": 4.15625, + "kl": 0.0, + "learning_rate": 9.242693728768892e-06, + "logits/chosen": 354940202.6666667, + "logits/rejected": 462506803.2, + "logps/chosen": -405.4202880859375, + "logps/rejected": -410.631103515625, + "loss": 0.0223, + "rewards/chosen": 3.051537831624349, + "rewards/margins": 11.3319943745931, + "rewards/rejected": -8.28045654296875, + "step": 1960 + }, + { + "epoch": 0.179168570123344, + "grad_norm": 4.1875, + "kl": 0.0, + "learning_rate": 9.241932765659735e-06, + "logits/chosen": 414021290.6666667, + "logits/rejected": 581830912.0, + "logps/chosen": -342.7345377604167, + "logps/rejected": -520.966552734375, + "loss": 0.0396, + "rewards/chosen": 3.3967164357503257, + "rewards/margins": 11.022370179494223, + "rewards/rejected": -7.6256537437438965, + "step": 1961 + }, + { + "epoch": 0.17925993604385565, + "grad_norm": 4.34375, + "kl": 0.0, + "learning_rate": 9.241171451782042e-06, + "logits/chosen": 558752341.3333334, + "logits/rejected": 309434880.0, + "logps/chosen": -442.7840169270833, + "logps/rejected": -373.594091796875, + "loss": 0.0243, + "rewards/chosen": 2.847161293029785, + "rewards/margins": 10.335504341125489, + "rewards/rejected": -7.488343048095703, + "step": 1962 + }, + { + "epoch": 0.1793513019643673, + "grad_norm": 2.46875, + "kl": 0.0, + "learning_rate": 9.240409787198764e-06, + "logits/chosen": 517968640.0, + "logits/rejected": 436906752.0, + "logps/chosen": -155.037353515625, + "logps/rejected": -508.96630859375, + "loss": 0.0136, + "rewards/chosen": 3.7474350929260254, + "rewards/margins": 11.922781467437744, + "rewards/rejected": -8.175346374511719, + "step": 1963 + }, + { + "epoch": 0.17944266788487895, + "grad_norm": 16.25, + "kl": 0.0, + "learning_rate": 9.239647771972884e-06, + "logits/chosen": 1002560256.0, + "logits/rejected": 658253248.0, + "logps/chosen": -227.62801106770834, + "logps/rejected": -524.0736694335938, + "loss": 0.1128, + "rewards/chosen": 2.4401659965515137, + "rewards/margins": 9.875303745269775, + "rewards/rejected": -7.435137748718262, + "step": 1964 + }, + { + "epoch": 0.1795340338053906, + "grad_norm": 6.53125, + "kl": 0.0, + "learning_rate": 9.238885406167416e-06, + "logits/chosen": 837854310.4, + "logits/rejected": 556419029.3333334, + "logps/chosen": -320.18271484375, + "logps/rejected": -464.4407958984375, + "loss": 0.051, + "rewards/chosen": 3.632501983642578, + "rewards/margins": 12.082345708211264, + "rewards/rejected": -8.449843724568685, + "step": 1965 + }, + { + "epoch": 0.17962539972590225, + "grad_norm": 5.53125, + "kl": 0.0, + "learning_rate": 9.238122689845398e-06, + "logits/chosen": 467520256.0, + "logits/rejected": 220992512.0, + "logps/chosen": -349.5184733072917, + "logps/rejected": -265.2751159667969, + "loss": 0.0418, + "rewards/chosen": 3.0872227350870767, + "rewards/margins": 9.95098320643107, + "rewards/rejected": -6.863760471343994, + "step": 1966 + }, + { + "epoch": 0.1797167656464139, + "grad_norm": 8.0, + "kl": 0.0, + "learning_rate": 9.237359623069901e-06, + "logits/chosen": 780851370.6666666, + "logits/rejected": 511835648.0, + "logps/chosen": -359.4962158203125, + "logps/rejected": -412.1587890625, + "loss": 0.089, + "rewards/chosen": 1.8745023409525554, + "rewards/margins": 9.926659933725992, + "rewards/rejected": -8.052157592773437, + "step": 1967 + }, + { + "epoch": 0.17980813156692554, + "grad_norm": 7.5, + "kl": 0.0, + "learning_rate": 9.236596205904024e-06, + "logits/chosen": 440582560.0, + "logits/rejected": 799897088.0, + "logps/chosen": -284.56829833984375, + "logps/rejected": -615.6915283203125, + "loss": 0.0426, + "rewards/chosen": 2.605942964553833, + "rewards/margins": 13.852246522903442, + "rewards/rejected": -11.24630355834961, + "step": 1968 + }, + { + "epoch": 0.1798994974874372, + "grad_norm": 8.0625, + "kl": 0.0, + "learning_rate": 9.235832438410892e-06, + "logits/chosen": 459014336.0, + "logits/rejected": 396089664.0, + "logps/chosen": -271.7626953125, + "logps/rejected": -382.79315185546875, + "loss": 0.1043, + "rewards/chosen": 2.581090211868286, + "rewards/margins": 11.824973344802856, + "rewards/rejected": -9.24388313293457, + "step": 1969 + }, + { + "epoch": 0.17999086340794884, + "grad_norm": 4.0625, + "kl": 0.0, + "learning_rate": 9.235068320653665e-06, + "logits/chosen": 553315020.8, + "logits/rejected": 450248533.3333333, + "logps/chosen": -369.1595703125, + "logps/rejected": -526.7788899739584, + "loss": 0.0191, + "rewards/chosen": 3.9706531524658204, + "rewards/margins": 12.189656194051107, + "rewards/rejected": -8.219003041585287, + "step": 1970 + }, + { + "epoch": 0.1800822293284605, + "grad_norm": 2.953125, + "kl": 0.0, + "learning_rate": 9.234303852695526e-06, + "logits/chosen": 472344985.6, + "logits/rejected": 646711978.6666666, + "logps/chosen": -292.25546875, + "logps/rejected": -582.2001139322916, + "loss": 0.0206, + "rewards/chosen": 3.8407527923583986, + "rewards/margins": 14.01148338317871, + "rewards/rejected": -10.170730590820312, + "step": 1971 + }, + { + "epoch": 0.18017359524897214, + "grad_norm": 7.15625, + "kl": 0.0, + "learning_rate": 9.23353903459969e-06, + "logits/chosen": 759491584.0, + "logits/rejected": 539440768.0, + "logps/chosen": -372.8918863932292, + "logps/rejected": -602.1527099609375, + "loss": 0.049, + "rewards/chosen": 3.272411346435547, + "rewards/margins": 10.083003997802734, + "rewards/rejected": -6.8105926513671875, + "step": 1972 + }, + { + "epoch": 0.1802649611694838, + "grad_norm": 2.890625, + "kl": 0.0, + "learning_rate": 9.2327738664294e-06, + "logits/chosen": 543256640.0, + "logits/rejected": 266907232.0, + "logps/chosen": -326.6628112792969, + "logps/rejected": -242.89242553710938, + "loss": 0.0218, + "rewards/chosen": 3.539468288421631, + "rewards/margins": 10.442445755004883, + "rewards/rejected": -6.902977466583252, + "step": 1973 + }, + { + "epoch": 0.18035632708999544, + "grad_norm": 44.75, + "kl": 0.0, + "learning_rate": 9.23200834824793e-06, + "logits/chosen": 751985152.0, + "logits/rejected": 581489066.6666666, + "logps/chosen": -236.36533203125, + "logps/rejected": -292.3553059895833, + "loss": 0.1805, + "rewards/chosen": 2.4511445999145507, + "rewards/margins": 7.032243792215983, + "rewards/rejected": -4.581099192301433, + "step": 1974 + }, + { + "epoch": 0.1804476930105071, + "grad_norm": 2.359375, + "kl": 0.0, + "learning_rate": 9.231242480118578e-06, + "logits/chosen": 324324544.0, + "logits/rejected": 485208320.0, + "logps/chosen": -310.080322265625, + "logps/rejected": -312.8563639322917, + "loss": 0.0101, + "rewards/chosen": 4.544095039367676, + "rewards/margins": 10.992004712422688, + "rewards/rejected": -6.447909673055013, + "step": 1975 + }, + { + "epoch": 0.18053905893101874, + "grad_norm": 3.3125, + "kl": 0.0, + "learning_rate": 9.230476262104678e-06, + "logits/chosen": 741739008.0, + "logits/rejected": 768855424.0, + "logps/chosen": -436.821044921875, + "logps/rejected": -514.7252197265625, + "loss": 0.0201, + "rewards/chosen": 3.3055367469787598, + "rewards/margins": 10.62235975265503, + "rewards/rejected": -7.3168230056762695, + "step": 1976 + }, + { + "epoch": 0.18063042485153039, + "grad_norm": 3.125, + "kl": 0.0, + "learning_rate": 9.229709694269587e-06, + "logits/chosen": 414635520.0, + "logits/rejected": 857459370.6666666, + "logps/chosen": -249.1624298095703, + "logps/rejected": -577.2536214192709, + "loss": 0.0184, + "rewards/chosen": 2.662130832672119, + "rewards/margins": 13.577632745107016, + "rewards/rejected": -10.915501912434896, + "step": 1977 + }, + { + "epoch": 0.18072179077204203, + "grad_norm": 1.6875, + "kl": 0.0, + "learning_rate": 9.228942776676692e-06, + "logits/chosen": 396712106.6666667, + "logits/rejected": 400242585.6, + "logps/chosen": -274.4247233072917, + "logps/rejected": -674.30224609375, + "loss": 0.0153, + "rewards/chosen": 3.5303544998168945, + "rewards/margins": 12.191436195373536, + "rewards/rejected": -8.661081695556641, + "step": 1978 + }, + { + "epoch": 0.18081315669255368, + "grad_norm": 5.625, + "kl": 0.0, + "learning_rate": 9.228175509389415e-06, + "logits/chosen": 431988704.0, + "logits/rejected": 426615488.0, + "logps/chosen": -190.08270263671875, + "logps/rejected": -376.96099853515625, + "loss": 0.061, + "rewards/chosen": 2.3932347297668457, + "rewards/margins": 9.648238182067871, + "rewards/rejected": -7.255003452301025, + "step": 1979 + }, + { + "epoch": 0.18090452261306533, + "grad_norm": 4.21875, + "kl": 0.0, + "learning_rate": 9.227407892471194e-06, + "logits/chosen": 382118546.28571427, + "logits/rejected": 147620864.0, + "logps/chosen": -277.08387974330356, + "logps/rejected": -240.0787353515625, + "loss": 0.0353, + "rewards/chosen": 3.530346461704799, + "rewards/margins": 9.587762423924037, + "rewards/rejected": -6.057415962219238, + "step": 1980 + }, + { + "epoch": 0.18099588853357698, + "grad_norm": 4.3125, + "kl": 0.0, + "learning_rate": 9.22663992598551e-06, + "logits/chosen": 555104384.0, + "logits/rejected": 761341184.0, + "logps/chosen": -340.30419921875, + "logps/rejected": -806.9805908203125, + "loss": 0.0308, + "rewards/chosen": 2.874518871307373, + "rewards/margins": 15.122979640960693, + "rewards/rejected": -12.24846076965332, + "step": 1981 + }, + { + "epoch": 0.18108725445408863, + "grad_norm": 37.0, + "kl": 0.0, + "learning_rate": 9.225871609995864e-06, + "logits/chosen": 439708672.0, + "logits/rejected": 647305472.0, + "logps/chosen": -204.4566650390625, + "logps/rejected": -389.4555257161458, + "loss": 0.0723, + "rewards/chosen": 3.1910226345062256, + "rewards/margins": 10.665040731430054, + "rewards/rejected": -7.474018096923828, + "step": 1982 + }, + { + "epoch": 0.18117862037460028, + "grad_norm": 4.65625, + "kl": 0.0, + "learning_rate": 9.22510294456579e-06, + "logits/chosen": 383420160.0, + "logits/rejected": 314498304.0, + "logps/chosen": -246.13447265625, + "logps/rejected": -298.1977132161458, + "loss": 0.0299, + "rewards/chosen": 3.3022945404052733, + "rewards/margins": 7.510179964701335, + "rewards/rejected": -4.2078854242960615, + "step": 1983 + }, + { + "epoch": 0.18126998629511193, + "grad_norm": 2.703125, + "kl": 0.0, + "learning_rate": 9.224333929758851e-06, + "logits/chosen": 476816128.0, + "logits/rejected": 581404672.0, + "logps/chosen": -348.0781555175781, + "logps/rejected": -779.8338623046875, + "loss": 0.0165, + "rewards/chosen": 3.577146291732788, + "rewards/margins": 15.001062154769897, + "rewards/rejected": -11.42391586303711, + "step": 1984 + }, + { + "epoch": 0.18136135221562358, + "grad_norm": 4.15625, + "kl": 0.0, + "learning_rate": 9.223564565638633e-06, + "logits/chosen": 812274585.6, + "logits/rejected": 1395248981.3333333, + "logps/chosen": -553.11015625, + "logps/rejected": -433.3206380208333, + "loss": 0.0246, + "rewards/chosen": 3.683590316772461, + "rewards/margins": 11.213608169555664, + "rewards/rejected": -7.530017852783203, + "step": 1985 + }, + { + "epoch": 0.18145271813613523, + "grad_norm": 4.71875, + "kl": 0.0, + "learning_rate": 9.222794852268759e-06, + "logits/chosen": 538306048.0, + "logits/rejected": 342414112.0, + "logps/chosen": -309.8777669270833, + "logps/rejected": -412.873046875, + "loss": 0.036, + "rewards/chosen": 3.2639795939127603, + "rewards/margins": 9.932623068491617, + "rewards/rejected": -6.668643474578857, + "step": 1986 + }, + { + "epoch": 0.18154408405664688, + "grad_norm": 5.5, + "kl": 0.0, + "learning_rate": 9.222024789712874e-06, + "logits/chosen": 901528320.0, + "logits/rejected": 337799253.3333333, + "logps/chosen": -270.921875, + "logps/rejected": -366.61328125, + "loss": 0.0198, + "rewards/chosen": 3.330888509750366, + "rewards/margins": 11.149304469426472, + "rewards/rejected": -7.8184159596761065, + "step": 1987 + }, + { + "epoch": 0.18163544997715853, + "grad_norm": 3.53125, + "kl": 0.0, + "learning_rate": 9.221254378034657e-06, + "logits/chosen": 371176243.2, + "logits/rejected": 324937408.0, + "logps/chosen": -209.555126953125, + "logps/rejected": -524.3439534505209, + "loss": 0.037, + "rewards/chosen": 3.3239276885986326, + "rewards/margins": 12.348189163208009, + "rewards/rejected": -9.024261474609375, + "step": 1988 + }, + { + "epoch": 0.18172681589767017, + "grad_norm": 3.25, + "kl": 0.0, + "learning_rate": 9.220483617297814e-06, + "logits/chosen": 988941824.0, + "logits/rejected": 437563818.6666667, + "logps/chosen": -397.91943359375, + "logps/rejected": -277.6814371744792, + "loss": 0.0229, + "rewards/chosen": 3.792998123168945, + "rewards/margins": 8.863604990641276, + "rewards/rejected": -5.070606867472331, + "step": 1989 + }, + { + "epoch": 0.18181818181818182, + "grad_norm": 3.703125, + "kl": 0.0, + "learning_rate": 9.219712507566081e-06, + "logits/chosen": 722770858.6666666, + "logits/rejected": 389357568.0, + "logps/chosen": -315.6781819661458, + "logps/rejected": -443.313671875, + "loss": 0.0288, + "rewards/chosen": 2.9251111348470054, + "rewards/margins": 8.829911549886068, + "rewards/rejected": -5.9048004150390625, + "step": 1990 + }, + { + "epoch": 0.18190954773869347, + "grad_norm": 6.96875, + "kl": 0.0, + "learning_rate": 9.218941048903218e-06, + "logits/chosen": 768293376.0, + "logits/rejected": 805089024.0, + "logps/chosen": -465.7899169921875, + "logps/rejected": -505.44451904296875, + "loss": 0.0514, + "rewards/chosen": 2.7945518493652344, + "rewards/margins": 10.704743385314941, + "rewards/rejected": -7.910191535949707, + "step": 1991 + }, + { + "epoch": 0.18200091365920512, + "grad_norm": 3.375, + "kl": 0.0, + "learning_rate": 9.21816924137302e-06, + "logits/chosen": 382363733.3333333, + "logits/rejected": 228260147.2, + "logps/chosen": -382.685791015625, + "logps/rejected": -375.374658203125, + "loss": 0.0136, + "rewards/chosen": 3.6713473002115884, + "rewards/margins": 13.07568384806315, + "rewards/rejected": -9.404336547851562, + "step": 1992 + }, + { + "epoch": 0.18209227957971677, + "grad_norm": 2.09375, + "kl": 0.0, + "learning_rate": 9.21739708503931e-06, + "logits/chosen": 480491349.3333333, + "logits/rejected": 495148032.0, + "logps/chosen": -329.6787516276042, + "logps/rejected": -483.96328125, + "loss": 0.0126, + "rewards/chosen": 3.6089417139689126, + "rewards/margins": 11.958606020609537, + "rewards/rejected": -8.349664306640625, + "step": 1993 + }, + { + "epoch": 0.18218364550022842, + "grad_norm": 1.7578125, + "kl": 0.0, + "learning_rate": 9.216624579965936e-06, + "logits/chosen": 486799616.0, + "logits/rejected": 1021544106.6666666, + "logps/chosen": -347.8277587890625, + "logps/rejected": -938.628662109375, + "loss": 0.007, + "rewards/chosen": 4.009343147277832, + "rewards/margins": 13.430400530497232, + "rewards/rejected": -9.4210573832194, + "step": 1994 + }, + { + "epoch": 0.18227501142074007, + "grad_norm": 4.09375, + "kl": 0.0, + "learning_rate": 9.215851726216774e-06, + "logits/chosen": 491653920.0, + "logits/rejected": 412365312.0, + "logps/chosen": -311.0799560546875, + "logps/rejected": -306.69439697265625, + "loss": 0.0242, + "rewards/chosen": 3.349851131439209, + "rewards/margins": 10.555976867675781, + "rewards/rejected": -7.206125736236572, + "step": 1995 + }, + { + "epoch": 0.18236637734125172, + "grad_norm": 27.625, + "kl": 0.0, + "learning_rate": 9.215078523855736e-06, + "logits/chosen": 598012373.3333334, + "logits/rejected": 336354278.4, + "logps/chosen": -316.77256266276044, + "logps/rejected": -445.6181640625, + "loss": 0.0895, + "rewards/chosen": 3.8732665379842124, + "rewards/margins": 10.744849332173665, + "rewards/rejected": -6.871582794189453, + "step": 1996 + }, + { + "epoch": 0.18245774326176337, + "grad_norm": 27.5, + "kl": 0.0, + "learning_rate": 9.21430497294676e-06, + "logits/chosen": 545934848.0, + "logps/chosen": -357.9125061035156, + "loss": 0.0937, + "rewards/chosen": 2.924391984939575, + "step": 1997 + }, + { + "epoch": 0.18254910918227502, + "grad_norm": 2.859375, + "kl": 0.0, + "learning_rate": 9.213531073553807e-06, + "logits/chosen": 513606186.6666667, + "logits/rejected": 720570816.0, + "logps/chosen": -350.4067789713542, + "logps/rejected": -659.912841796875, + "loss": 0.0213, + "rewards/chosen": 3.9416290918986, + "rewards/margins": 11.510272661844889, + "rewards/rejected": -7.568643569946289, + "step": 1998 + }, + { + "epoch": 0.18264047510278666, + "grad_norm": 23.375, + "kl": 0.0, + "learning_rate": 9.212756825740874e-06, + "logits/chosen": 649187174.4, + "logits/rejected": 243483818.66666666, + "logps/chosen": -492.046630859375, + "logps/rejected": -332.56591796875, + "loss": 0.0586, + "rewards/chosen": 2.7666156768798826, + "rewards/margins": 10.228304036458333, + "rewards/rejected": -7.46168835957845, + "step": 1999 + }, + { + "epoch": 0.1827318410232983, + "grad_norm": 3.78125, + "kl": 0.0, + "learning_rate": 9.211982229571983e-06, + "logits/chosen": 545123712.0, + "logits/rejected": 484464810.6666667, + "logps/chosen": -288.89825439453125, + "logps/rejected": -398.2649332682292, + "loss": 0.0196, + "rewards/chosen": 2.6855173110961914, + "rewards/margins": 10.959906578063965, + "rewards/rejected": -8.274389266967773, + "step": 2000 + }, + { + "epoch": 0.18282320694380996, + "grad_norm": 3.421875, + "kl": 0.0, + "learning_rate": 9.211207285111188e-06, + "logits/chosen": 831499827.2, + "logits/rejected": 734987434.6666666, + "logps/chosen": -384.991455078125, + "logps/rejected": -604.6684163411459, + "loss": 0.0177, + "rewards/chosen": 3.944362258911133, + "rewards/margins": 13.49753532409668, + "rewards/rejected": -9.553173065185547, + "step": 2001 + }, + { + "epoch": 0.1829145728643216, + "grad_norm": 7.1875, + "kl": 0.0, + "learning_rate": 9.210431992422569e-06, + "logits/chosen": 398010048.0, + "logits/rejected": 424696320.0, + "logps/chosen": -304.4036865234375, + "logps/rejected": -604.0482177734375, + "loss": 0.0346, + "rewards/chosen": 3.746654987335205, + "rewards/margins": 11.444411754608154, + "rewards/rejected": -7.697756767272949, + "step": 2002 + }, + { + "epoch": 0.18300593878483326, + "grad_norm": 2.765625, + "kl": 0.0, + "learning_rate": 9.209656351570231e-06, + "logits/chosen": 818414080.0, + "logits/rejected": 553688934.4, + "logps/chosen": -463.01171875, + "logps/rejected": -436.181884765625, + "loss": 0.0152, + "rewards/chosen": 3.270028750101725, + "rewards/margins": 10.675913302103679, + "rewards/rejected": -7.405884552001953, + "step": 2003 + }, + { + "epoch": 0.1830973047053449, + "grad_norm": 8.1875, + "kl": 0.0, + "learning_rate": 9.20888036261832e-06, + "logits/chosen": 473290112.0, + "logits/rejected": 321339488.0, + "logps/chosen": -217.67160034179688, + "logps/rejected": -442.5694274902344, + "loss": 0.0505, + "rewards/chosen": 2.638540744781494, + "rewards/margins": 9.797255992889404, + "rewards/rejected": -7.15871524810791, + "step": 2004 + }, + { + "epoch": 0.18318867062585656, + "grad_norm": 0.8984375, + "kl": 0.0, + "learning_rate": 9.208104025630995e-06, + "logits/chosen": 232938048.0, + "logits/rejected": 286935872.0, + "logps/chosen": -233.6065673828125, + "logps/rejected": -379.2405192057292, + "loss": 0.0049, + "rewards/chosen": 4.145063400268555, + "rewards/margins": 11.88844108581543, + "rewards/rejected": -7.743377685546875, + "step": 2005 + }, + { + "epoch": 0.1832800365463682, + "grad_norm": 2.53125, + "kl": 0.0, + "learning_rate": 9.20732734067246e-06, + "logits/chosen": 639380224.0, + "logits/rejected": 499506227.2, + "logps/chosen": -294.4476725260417, + "logps/rejected": -526.2224609375, + "loss": 0.0148, + "rewards/chosen": 3.4184096654256186, + "rewards/margins": 10.422963651021321, + "rewards/rejected": -7.004553985595703, + "step": 2006 + }, + { + "epoch": 0.18337140246687986, + "grad_norm": 3.203125, + "kl": 0.0, + "learning_rate": 9.206550307806934e-06, + "logits/chosen": 830580633.6, + "logits/rejected": 572275285.3333334, + "logps/chosen": -272.277685546875, + "logps/rejected": -620.2596028645834, + "loss": 0.0186, + "rewards/chosen": 3.8799716949462892, + "rewards/margins": 13.232835006713866, + "rewards/rejected": -9.352863311767578, + "step": 2007 + }, + { + "epoch": 0.1834627683873915, + "grad_norm": 0.25390625, + "kl": 0.0, + "learning_rate": 9.205772927098671e-06, + "logits/rejected": 594644864.0, + "logps/rejected": -454.1930236816406, + "loss": 0.0009, + "rewards/rejected": -8.04198932647705, + "step": 2008 + }, + { + "epoch": 0.18355413430790316, + "grad_norm": 0.9140625, + "kl": 0.0, + "learning_rate": 9.204995198611956e-06, + "logits/chosen": 771945984.0, + "logits/rejected": 437686016.0, + "logps/chosen": -314.6833190917969, + "logps/rejected": -543.9174397786459, + "loss": 0.0061, + "rewards/chosen": 3.9456262588500977, + "rewards/margins": 12.329741477966309, + "rewards/rejected": -8.384115219116211, + "step": 2009 + }, + { + "epoch": 0.1836455002284148, + "grad_norm": 2.09375, + "kl": 0.0, + "learning_rate": 9.204217122411096e-06, + "logits/chosen": 319177536.0, + "logits/rejected": 386061354.6666667, + "logps/chosen": -166.77529907226562, + "logps/rejected": -444.1708577473958, + "loss": 0.0142, + "rewards/chosen": 2.866412401199341, + "rewards/margins": 10.609256029129028, + "rewards/rejected": -7.7428436279296875, + "step": 2010 + }, + { + "epoch": 0.18373686614892645, + "grad_norm": 3.84375, + "kl": 0.0, + "learning_rate": 9.203438698560436e-06, + "logits/chosen": 450850912.0, + "logits/rejected": 504471872.0, + "logps/chosen": -289.97418212890625, + "logps/rejected": -448.1667175292969, + "loss": 0.0204, + "rewards/chosen": 3.5451440811157227, + "rewards/margins": 11.042675971984863, + "rewards/rejected": -7.497531890869141, + "step": 2011 + }, + { + "epoch": 0.1838282320694381, + "grad_norm": 20.625, + "kl": 0.0, + "learning_rate": 9.20265992712434e-06, + "logits/chosen": 519869504.0, + "logits/rejected": 1037370624.0, + "logps/chosen": -139.56649780273438, + "logps/rejected": -470.744384765625, + "loss": 0.0877, + "rewards/chosen": 1.1825858354568481, + "rewards/margins": 10.451539794603983, + "rewards/rejected": -9.268953959147135, + "step": 2012 + }, + { + "epoch": 0.18391959798994975, + "grad_norm": 23.375, + "kl": 0.0, + "learning_rate": 9.201880808167206e-06, + "logits/chosen": 840192768.0, + "logits/rejected": 531814784.0, + "logps/chosen": -206.08493041992188, + "logps/rejected": -189.59146118164062, + "loss": 0.0687, + "rewards/chosen": 2.683985710144043, + "rewards/margins": 6.844717979431152, + "rewards/rejected": -4.160732269287109, + "step": 2013 + }, + { + "epoch": 0.1840109639104614, + "grad_norm": 2.09375, + "kl": 0.0, + "learning_rate": 9.20110134175346e-06, + "logits/chosen": 455172608.0, + "logits/rejected": 831079065.6, + "logps/chosen": -351.16455078125, + "logps/rejected": -577.210498046875, + "loss": 0.0123, + "rewards/chosen": 3.40696652730306, + "rewards/margins": 12.978458531697592, + "rewards/rejected": -9.571492004394532, + "step": 2014 + }, + { + "epoch": 0.18410232983097305, + "grad_norm": 5.125, + "kl": 0.0, + "learning_rate": 9.200321527947558e-06, + "logits/chosen": 456620544.0, + "logits/rejected": 1036594483.2, + "logps/chosen": -98.61165364583333, + "logps/rejected": -607.7546875, + "loss": 0.0323, + "rewards/chosen": 2.9974940617879233, + "rewards/margins": 9.329693158467611, + "rewards/rejected": -6.3321990966796875, + "step": 2015 + }, + { + "epoch": 0.1841936957514847, + "grad_norm": 4.875, + "kl": 0.0, + "learning_rate": 9.199541366813984e-06, + "logits/chosen": 727959232.0, + "logits/rejected": 634475200.0, + "logps/chosen": -326.22479248046875, + "logps/rejected": -494.050048828125, + "loss": 0.0327, + "rewards/chosen": 2.924010753631592, + "rewards/margins": 10.537120342254639, + "rewards/rejected": -7.613109588623047, + "step": 2016 + }, + { + "epoch": 0.18428506167199635, + "grad_norm": 11.75, + "kl": 0.0, + "learning_rate": 9.198760858417246e-06, + "logits/chosen": 593906432.0, + "logits/rejected": 374144819.2, + "logps/chosen": -370.2371012369792, + "logps/rejected": -417.115576171875, + "loss": 0.0656, + "rewards/chosen": 1.7148243586222331, + "rewards/margins": 10.922432390848796, + "rewards/rejected": -9.207608032226563, + "step": 2017 + }, + { + "epoch": 0.184376427592508, + "grad_norm": 9.5625, + "kl": 0.0, + "learning_rate": 9.19798000282189e-06, + "logits/chosen": 534132121.6, + "logits/rejected": 386005034.6666667, + "logps/chosen": -352.0815673828125, + "logps/rejected": -348.2980143229167, + "loss": 0.0379, + "rewards/chosen": 3.3679752349853516, + "rewards/margins": 12.619048436482748, + "rewards/rejected": -9.251073201497396, + "step": 2018 + }, + { + "epoch": 0.18446779351301965, + "grad_norm": 8.8125, + "kl": 0.0, + "learning_rate": 9.197198800092479e-06, + "logits/chosen": 503766048.0, + "logits/rejected": 535807040.0, + "logps/chosen": -309.1827392578125, + "logps/rejected": -680.001953125, + "loss": 0.0428, + "rewards/chosen": 2.611234188079834, + "rewards/margins": 12.38403844833374, + "rewards/rejected": -9.772804260253906, + "step": 2019 + }, + { + "epoch": 0.1845591594335313, + "grad_norm": 2.21875, + "kl": 0.0, + "learning_rate": 9.196417250293618e-06, + "logits/chosen": 321165162.6666667, + "logits/rejected": 383205452.8, + "logps/chosen": -247.693359375, + "logps/rejected": -466.528125, + "loss": 0.0146, + "rewards/chosen": 3.7601893742879233, + "rewards/margins": 11.247524579366049, + "rewards/rejected": -7.487335205078125, + "step": 2020 + }, + { + "epoch": 0.18465052535404294, + "grad_norm": 1.9765625, + "kl": 0.0, + "learning_rate": 9.195635353489932e-06, + "logits/chosen": 631368652.8, + "logits/rejected": 630938965.3333334, + "logps/chosen": -264.496435546875, + "logps/rejected": -567.7159830729166, + "loss": 0.0143, + "rewards/chosen": 3.936051940917969, + "rewards/margins": 17.611356353759767, + "rewards/rejected": -13.675304412841797, + "step": 2021 + }, + { + "epoch": 0.1847418912745546, + "grad_norm": 5.34375, + "kl": 0.0, + "learning_rate": 9.194853109746073e-06, + "logits/chosen": 658368896.0, + "logits/rejected": 296441056.0, + "logps/chosen": -463.843017578125, + "logps/rejected": -454.18035888671875, + "loss": 0.0352, + "rewards/chosen": 3.5924439430236816, + "rewards/margins": 10.328386306762695, + "rewards/rejected": -6.735942363739014, + "step": 2022 + }, + { + "epoch": 0.18483325719506624, + "grad_norm": 4.625, + "kl": 0.0, + "learning_rate": 9.19407051912673e-06, + "logits/chosen": 466530752.0, + "logits/rejected": 624327082.6666666, + "logps/chosen": -241.496337890625, + "logps/rejected": -362.2776285807292, + "loss": 0.0252, + "rewards/chosen": 2.272751569747925, + "rewards/margins": 10.335574547449747, + "rewards/rejected": -8.062822977701822, + "step": 2023 + }, + { + "epoch": 0.1849246231155779, + "grad_norm": 2.625, + "kl": 0.0, + "learning_rate": 9.193287581696612e-06, + "logits/chosen": 582305664.0, + "logits/rejected": 587624256.0, + "logps/chosen": -570.7576904296875, + "logps/rejected": -729.9015502929688, + "loss": 0.0153, + "rewards/chosen": 3.7933740615844727, + "rewards/margins": 15.465476036071777, + "rewards/rejected": -11.672101974487305, + "step": 2024 + }, + { + "epoch": 0.18501598903608954, + "grad_norm": 7.125, + "kl": 0.0, + "learning_rate": 9.192504297520463e-06, + "logits/chosen": 983273813.3333334, + "logits/rejected": 969604928.0, + "logps/chosen": -351.5958658854167, + "logps/rejected": -572.63671875, + "loss": 0.0616, + "rewards/chosen": 3.269627253214518, + "rewards/margins": 11.360617319742838, + "rewards/rejected": -8.09099006652832, + "step": 2025 + }, + { + "epoch": 0.1851073549566012, + "grad_norm": 4.53125, + "kl": 0.0, + "learning_rate": 9.191720666663055e-06, + "logits/chosen": 416043520.0, + "logits/rejected": 690492672.0, + "logps/chosen": -292.27764892578125, + "logps/rejected": -676.8302001953125, + "loss": 0.0323, + "rewards/chosen": 3.1527059078216553, + "rewards/margins": 12.273950338363647, + "rewards/rejected": -9.121244430541992, + "step": 2026 + }, + { + "epoch": 0.18519872087711284, + "grad_norm": 3.734375, + "kl": 0.0, + "learning_rate": 9.190936689189185e-06, + "logits/chosen": 618051803.4285715, + "logits/rejected": 1045672768.0, + "logps/chosen": -290.30308314732144, + "logps/rejected": -759.092529296875, + "loss": 0.0326, + "rewards/chosen": 3.436067853655134, + "rewards/margins": 14.481068883623395, + "rewards/rejected": -11.045001029968262, + "step": 2027 + }, + { + "epoch": 0.1852900867976245, + "grad_norm": 3.796875, + "kl": 0.0, + "learning_rate": 9.190152365163679e-06, + "logits/chosen": 589147520.0, + "logits/rejected": 293767264.0, + "logps/chosen": -491.223388671875, + "logps/rejected": -390.5203857421875, + "loss": 0.0242, + "rewards/chosen": 3.3249573707580566, + "rewards/margins": 11.895724773406982, + "rewards/rejected": -8.570767402648926, + "step": 2028 + }, + { + "epoch": 0.18538145271813614, + "grad_norm": 4.46875, + "kl": 0.0, + "learning_rate": 9.189367694651395e-06, + "logits/chosen": 640602521.6, + "logits/rejected": 415150165.3333333, + "logps/chosen": -468.919287109375, + "logps/rejected": -532.0508626302084, + "loss": 0.0319, + "rewards/chosen": 3.374475860595703, + "rewards/margins": 14.253834788004557, + "rewards/rejected": -10.879358927408854, + "step": 2029 + }, + { + "epoch": 0.18547281863864779, + "grad_norm": 4.78125, + "kl": 0.0, + "learning_rate": 9.188582677717219e-06, + "logits/chosen": 972677632.0, + "logits/rejected": 767392768.0, + "logps/chosen": -405.9267578125, + "logps/rejected": -547.05078125, + "loss": 0.0264, + "rewards/chosen": 3.7339851379394533, + "rewards/margins": 12.974669647216796, + "rewards/rejected": -9.240684509277344, + "step": 2030 + }, + { + "epoch": 0.18556418455915943, + "grad_norm": 8.75, + "kl": 0.0, + "learning_rate": 9.187797314426064e-06, + "logits/chosen": 491425389.71428573, + "logits/rejected": 277419712.0, + "logps/chosen": -320.859619140625, + "logps/rejected": -504.29248046875, + "loss": 0.0692, + "rewards/chosen": 2.8974636622837613, + "rewards/margins": 16.892758233206614, + "rewards/rejected": -13.995294570922852, + "step": 2031 + }, + { + "epoch": 0.18565555047967108, + "grad_norm": 8.0, + "kl": 0.0, + "learning_rate": 9.18701160484287e-06, + "logits/chosen": 585103872.0, + "logits/rejected": 851472384.0, + "logps/chosen": -396.857568359375, + "logps/rejected": -402.8935546875, + "loss": 0.0674, + "rewards/chosen": 3.2789520263671874, + "rewards/margins": 9.8089724222819, + "rewards/rejected": -6.530020395914714, + "step": 2032 + }, + { + "epoch": 0.18574691640018273, + "grad_norm": 3.953125, + "kl": 0.0, + "learning_rate": 9.186225549032613e-06, + "logits/chosen": 520985600.0, + "logits/rejected": 850989376.0, + "logps/chosen": -385.6571960449219, + "logps/rejected": -464.6756591796875, + "loss": 0.0227, + "rewards/chosen": 3.4478981494903564, + "rewards/margins": 10.224560022354126, + "rewards/rejected": -6.7766618728637695, + "step": 2033 + }, + { + "epoch": 0.18583828232069438, + "grad_norm": 5.0625, + "kl": 0.0, + "learning_rate": 9.185439147060287e-06, + "logits/chosen": 665561292.8, + "logits/rejected": 782077354.6666666, + "logps/chosen": -423.93388671875, + "logps/rejected": -738.15087890625, + "loss": 0.03, + "rewards/chosen": 3.3190471649169924, + "rewards/margins": 12.328415425618491, + "rewards/rejected": -9.009368260701498, + "step": 2034 + }, + { + "epoch": 0.18592964824120603, + "grad_norm": 1.6640625, + "kl": 0.0, + "learning_rate": 9.184652398990922e-06, + "logits/chosen": 469785728.0, + "logits/rejected": 691560345.6, + "logps/chosen": -242.50895182291666, + "logps/rejected": -534.18671875, + "loss": 0.0109, + "rewards/chosen": 3.7200024922688804, + "rewards/margins": 12.606715138753255, + "rewards/rejected": -8.886712646484375, + "step": 2035 + }, + { + "epoch": 0.18602101416171768, + "grad_norm": 3.609375, + "kl": 0.0, + "learning_rate": 9.183865304889577e-06, + "logits/chosen": 257375513.6, + "logits/rejected": 406009429.3333333, + "logps/chosen": -155.75802001953124, + "logps/rejected": -379.1033528645833, + "loss": 0.0375, + "rewards/chosen": 3.7683536529541017, + "rewards/margins": 12.169017028808593, + "rewards/rejected": -8.400663375854492, + "step": 2036 + }, + { + "epoch": 0.18611238008222933, + "grad_norm": 4.40625, + "kl": 0.0, + "learning_rate": 9.183077864821334e-06, + "logits/chosen": 473691238.4, + "logits/rejected": 408610474.6666667, + "logps/chosen": -232.552783203125, + "logps/rejected": -680.4756673177084, + "loss": 0.0296, + "rewards/chosen": 3.335166168212891, + "rewards/margins": 15.76683349609375, + "rewards/rejected": -12.43166732788086, + "step": 2037 + }, + { + "epoch": 0.18620374600274098, + "grad_norm": 3.0625, + "kl": 0.0, + "learning_rate": 9.18229007885131e-06, + "logits/chosen": 819412224.0, + "logits/rejected": 368951616.0, + "logps/chosen": -364.8270263671875, + "logps/rejected": -269.6771240234375, + "loss": 0.0224, + "rewards/chosen": 3.2438435554504395, + "rewards/margins": 8.629372596740723, + "rewards/rejected": -5.385529041290283, + "step": 2038 + }, + { + "epoch": 0.18629511192325263, + "grad_norm": 8.875, + "kl": 0.0, + "learning_rate": 9.181501947044644e-06, + "logits/chosen": 572019776.0, + "logits/rejected": 954348836.5714285, + "logps/chosen": -282.8064880371094, + "logps/rejected": -703.5514787946429, + "loss": 0.0369, + "rewards/chosen": 1.2324799299240112, + "rewards/margins": 13.554668171065194, + "rewards/rejected": -12.322188241141182, + "step": 2039 + }, + { + "epoch": 0.18638647784376428, + "grad_norm": 5.03125, + "kl": 0.0, + "learning_rate": 9.18071346946651e-06, + "logits/chosen": 373805721.6, + "logits/rejected": 257251328.0, + "logps/chosen": -344.29501953125, + "logps/rejected": -299.98044840494794, + "loss": 0.0321, + "rewards/chosen": 3.339143753051758, + "rewards/margins": 11.456531524658203, + "rewards/rejected": -8.117387771606445, + "step": 2040 + }, + { + "epoch": 0.18647784376427592, + "grad_norm": 2.09375, + "kl": 0.0, + "learning_rate": 9.17992464618211e-06, + "logits/chosen": 420708198.4, + "logits/rejected": 535694464.0, + "logps/chosen": -278.956298828125, + "logps/rejected": -611.42333984375, + "loss": 0.0113, + "rewards/chosen": 4.446928405761719, + "rewards/margins": 13.891286595662436, + "rewards/rejected": -9.444358189900717, + "step": 2041 + }, + { + "epoch": 0.18656920968478757, + "grad_norm": 10.4375, + "kl": 0.0, + "learning_rate": 9.17913547725667e-06, + "logits/chosen": 360784576.0, + "logits/rejected": 360923392.0, + "logps/chosen": -151.57901000976562, + "logps/rejected": -389.8594156901042, + "loss": 0.0454, + "rewards/chosen": 2.138317108154297, + "rewards/margins": 9.381409962972004, + "rewards/rejected": -7.243092854817708, + "step": 2042 + }, + { + "epoch": 0.18666057560529922, + "grad_norm": 21.0, + "kl": 0.0, + "learning_rate": 9.178345962755442e-06, + "logits/chosen": 929368768.0, + "logits/rejected": 713771904.0, + "logps/chosen": -324.1355895996094, + "logps/rejected": -768.434326171875, + "loss": 0.0829, + "rewards/chosen": 2.2685766220092773, + "rewards/margins": 11.529627799987793, + "rewards/rejected": -9.261051177978516, + "step": 2043 + }, + { + "epoch": 0.18675194152581087, + "grad_norm": 4.3125, + "kl": 0.0, + "learning_rate": 9.177556102743719e-06, + "logits/chosen": 784251084.8, + "logits/rejected": 650307029.3333334, + "logps/chosen": -445.74521484375, + "logps/rejected": -333.8623453776042, + "loss": 0.0226, + "rewards/chosen": 3.549976348876953, + "rewards/margins": 10.693671544392902, + "rewards/rejected": -7.14369519551595, + "step": 2044 + }, + { + "epoch": 0.18684330744632252, + "grad_norm": 3.65625, + "kl": 0.0, + "learning_rate": 9.176765897286812e-06, + "logits/chosen": 575641728.0, + "logits/rejected": 614633600.0, + "logps/chosen": -334.22119140625, + "logps/rejected": -541.590576171875, + "loss": 0.0126, + "rewards/chosen": 4.217769622802734, + "rewards/margins": 11.733103275299072, + "rewards/rejected": -7.515333652496338, + "step": 2045 + }, + { + "epoch": 0.18693467336683417, + "grad_norm": 3.296875, + "kl": 0.0, + "learning_rate": 9.175975346450063e-06, + "logits/chosen": 916075008.0, + "logits/rejected": 802648832.0, + "logps/chosen": -246.09965006510416, + "logps/rejected": -665.390185546875, + "loss": 0.0152, + "rewards/chosen": 3.52980105082194, + "rewards/margins": 11.961936060587565, + "rewards/rejected": -8.432135009765625, + "step": 2046 + }, + { + "epoch": 0.18702603928734582, + "grad_norm": 10.625, + "kl": 0.0, + "learning_rate": 9.175184450298847e-06, + "logits/chosen": 484512512.0, + "logits/rejected": 394086144.0, + "logps/chosen": -385.0954182942708, + "logps/rejected": -457.9854431152344, + "loss": 0.0367, + "rewards/chosen": 3.363987922668457, + "rewards/margins": 9.57889461517334, + "rewards/rejected": -6.214906692504883, + "step": 2047 + }, + { + "epoch": 0.18711740520785747, + "grad_norm": 2.796875, + "kl": 0.0, + "learning_rate": 9.174393208898558e-06, + "logits/chosen": 404948832.0, + "logits/rejected": 614335808.0, + "logps/chosen": -298.57000732421875, + "logps/rejected": -516.6439819335938, + "loss": 0.0163, + "rewards/chosen": 3.684727191925049, + "rewards/margins": 11.959146976470947, + "rewards/rejected": -8.274419784545898, + "step": 2048 + }, + { + "epoch": 0.18720877112836912, + "grad_norm": 26.25, + "kl": 0.0, + "learning_rate": 9.173601622314628e-06, + "logits/chosen": 492204501.3333333, + "logits/rejected": 643052416.0, + "logps/chosen": -276.2884521484375, + "logps/rejected": -533.0736083984375, + "loss": 0.0611, + "rewards/chosen": 3.010422706604004, + "rewards/margins": 9.979820251464844, + "rewards/rejected": -6.96939754486084, + "step": 2049 + }, + { + "epoch": 0.18730013704888077, + "grad_norm": 3.328125, + "kl": 0.0, + "learning_rate": 9.172809690612512e-06, + "logits/chosen": 497465696.0, + "logits/rejected": 442774101.3333333, + "logps/chosen": -230.1053009033203, + "logps/rejected": -433.5789388020833, + "loss": 0.0212, + "rewards/chosen": 2.485644578933716, + "rewards/margins": 10.322612047195435, + "rewards/rejected": -7.836967468261719, + "step": 2050 + }, + { + "epoch": 0.18739150296939242, + "grad_norm": 4.5, + "kl": 0.0, + "learning_rate": 9.172017413857699e-06, + "logits/chosen": 482213664.0, + "logits/rejected": 443453920.0, + "logps/chosen": -454.4281311035156, + "logps/rejected": -387.449462890625, + "loss": 0.0222, + "rewards/chosen": 3.4115073680877686, + "rewards/margins": 11.025226354598999, + "rewards/rejected": -7.6137189865112305, + "step": 2051 + }, + { + "epoch": 0.18748286888990406, + "grad_norm": 22.125, + "kl": 0.0, + "learning_rate": 9.171224792115698e-06, + "logits/chosen": 605791670.8571428, + "logits/rejected": 1728939776.0, + "logps/chosen": -317.58192661830356, + "logps/rejected": -746.7919311523438, + "loss": 0.1194, + "rewards/chosen": 3.1136763436453685, + "rewards/margins": 13.579966817583356, + "rewards/rejected": -10.466290473937988, + "step": 2052 + }, + { + "epoch": 0.1875742348104157, + "grad_norm": 5.71875, + "kl": 0.0, + "learning_rate": 9.170431825452056e-06, + "logits/chosen": 593411456.0, + "logits/rejected": 485107904.0, + "logps/chosen": -357.9491882324219, + "logps/rejected": -445.975341796875, + "loss": 0.0343, + "rewards/chosen": 2.8034238815307617, + "rewards/margins": 10.899259567260742, + "rewards/rejected": -8.09583568572998, + "step": 2053 + }, + { + "epoch": 0.18766560073092736, + "grad_norm": 1.9140625, + "kl": 0.0, + "learning_rate": 9.169638513932342e-06, + "logits/chosen": 643600213.3333334, + "logits/rejected": 704923289.6, + "logps/chosen": -382.6251627604167, + "logps/rejected": -544.7509765625, + "loss": 0.0098, + "rewards/chosen": 3.6369781494140625, + "rewards/margins": 13.313155364990234, + "rewards/rejected": -9.676177215576171, + "step": 2054 + }, + { + "epoch": 0.187756966651439, + "grad_norm": 5.4375, + "kl": 0.0, + "learning_rate": 9.168844857622153e-06, + "logits/chosen": 545459072.0, + "logits/rejected": 395581920.0, + "logps/chosen": -343.6529235839844, + "logps/rejected": -576.382080078125, + "loss": 0.0201, + "rewards/chosen": 3.7309882640838623, + "rewards/margins": 14.659547567367554, + "rewards/rejected": -10.928559303283691, + "step": 2055 + }, + { + "epoch": 0.18784833257195066, + "grad_norm": 7.25, + "kl": 0.0, + "learning_rate": 9.168050856587121e-06, + "logits/chosen": 452785120.0, + "logits/rejected": 281908032.0, + "logps/chosen": -253.00650024414062, + "logps/rejected": -282.5697021484375, + "loss": 0.0589, + "rewards/chosen": 2.076139450073242, + "rewards/margins": 8.771639823913574, + "rewards/rejected": -6.695500373840332, + "step": 2056 + }, + { + "epoch": 0.1879396984924623, + "grad_norm": 2.9375, + "kl": 0.0, + "learning_rate": 9.167256510892902e-06, + "logits/chosen": 457670502.4, + "logits/rejected": 581779328.0, + "logps/chosen": -252.2603515625, + "logps/rejected": -410.19091796875, + "loss": 0.0203, + "rewards/chosen": 3.7965999603271485, + "rewards/margins": 15.03721071879069, + "rewards/rejected": -11.240610758463541, + "step": 2057 + }, + { + "epoch": 0.18803106441297396, + "grad_norm": 4.09375, + "kl": 0.0, + "learning_rate": 9.16646182060518e-06, + "logits/chosen": 819462058.6666666, + "logits/rejected": 452042112.0, + "logps/chosen": -299.40696207682294, + "logps/rejected": -699.4273681640625, + "loss": 0.0307, + "rewards/chosen": 3.4125401178995767, + "rewards/margins": 16.82736365000407, + "rewards/rejected": -13.414823532104492, + "step": 2058 + }, + { + "epoch": 0.1881224303334856, + "grad_norm": 3.8125, + "kl": 0.0, + "learning_rate": 9.165666785789666e-06, + "logits/chosen": 380302250.6666667, + "logits/rejected": 262026816.0, + "logps/chosen": -226.84515380859375, + "logps/rejected": -573.9511108398438, + "loss": 0.0246, + "rewards/chosen": 3.8400837580362954, + "rewards/margins": 17.951160113016766, + "rewards/rejected": -14.111076354980469, + "step": 2059 + }, + { + "epoch": 0.18821379625399726, + "grad_norm": 4.75, + "kl": 0.0, + "learning_rate": 9.164871406512106e-06, + "logits/chosen": 880033344.0, + "logits/rejected": 2047724288.0, + "logps/chosen": -408.6029052734375, + "logps/rejected": -690.1617431640625, + "loss": 0.0284, + "rewards/chosen": 3.0766913890838623, + "rewards/margins": 15.401872396469116, + "rewards/rejected": -12.325181007385254, + "step": 2060 + }, + { + "epoch": 0.1883051621745089, + "grad_norm": 30.875, + "kl": 0.0, + "learning_rate": 9.16407568283827e-06, + "logits/chosen": 628111104.0, + "logits/rejected": 374703530.6666667, + "logps/chosen": -354.09580078125, + "logps/rejected": -314.3876953125, + "loss": 0.1137, + "rewards/chosen": 3.5663036346435546, + "rewards/margins": 7.077228291829427, + "rewards/rejected": -3.5109246571858725, + "step": 2061 + }, + { + "epoch": 0.18839652809502055, + "grad_norm": 19.875, + "kl": 0.0, + "learning_rate": 9.163279614833956e-06, + "logits/chosen": 635111488.0, + "logits/rejected": 501176512.0, + "logps/chosen": -398.0208740234375, + "logps/rejected": -443.58856201171875, + "loss": 0.2185, + "rewards/chosen": 1.8907856941223145, + "rewards/margins": 9.207175731658936, + "rewards/rejected": -7.316390037536621, + "step": 2062 + }, + { + "epoch": 0.1884878940155322, + "grad_norm": 5.8125, + "kl": 0.0, + "learning_rate": 9.16248320256499e-06, + "logits/chosen": 627217322.6666666, + "logits/rejected": 255903840.0, + "logps/chosen": -429.9908854166667, + "logps/rejected": -266.7542724609375, + "loss": 0.0391, + "rewards/chosen": 3.0455099741617837, + "rewards/margins": 11.396135012308756, + "rewards/rejected": -8.350625038146973, + "step": 2063 + }, + { + "epoch": 0.18857925993604385, + "grad_norm": 1.9921875, + "kl": 0.0, + "learning_rate": 9.16168644609723e-06, + "logits/chosen": 1048267136.0, + "logits/rejected": 1136428288.0, + "logps/chosen": -523.6329345703125, + "logps/rejected": -443.60687255859375, + "loss": 0.0108, + "rewards/chosen": 3.9696097373962402, + "rewards/margins": 12.950864315032959, + "rewards/rejected": -8.981254577636719, + "step": 2064 + }, + { + "epoch": 0.1886706258565555, + "grad_norm": 0.734375, + "kl": 0.0, + "learning_rate": 9.160889345496559e-06, + "logits/chosen": 363508437.3333333, + "logits/rejected": 756333875.2, + "logps/chosen": -327.5223795572917, + "logps/rejected": -542.176416015625, + "loss": 0.0042, + "rewards/chosen": 5.076981862386067, + "rewards/margins": 13.320702870686848, + "rewards/rejected": -8.243721008300781, + "step": 2065 + }, + { + "epoch": 0.18876199177706715, + "grad_norm": 4.3125, + "kl": 0.0, + "learning_rate": 9.160091900828892e-06, + "logits/chosen": 1260751360.0, + "logits/rejected": 847644288.0, + "logps/chosen": -346.7464599609375, + "logps/rejected": -554.9476318359375, + "loss": 0.0334, + "rewards/chosen": 3.357093572616577, + "rewards/margins": 10.001495122909546, + "rewards/rejected": -6.644401550292969, + "step": 2066 + }, + { + "epoch": 0.1888533576975788, + "grad_norm": 2.5625, + "kl": 0.0, + "learning_rate": 9.159294112160168e-06, + "logits/chosen": 647623744.0, + "logits/rejected": 573553115.4285715, + "logps/chosen": -452.1319580078125, + "logps/rejected": -493.845458984375, + "loss": 0.0097, + "rewards/chosen": 2.591400146484375, + "rewards/margins": 11.341303144182477, + "rewards/rejected": -8.749902997698102, + "step": 2067 + }, + { + "epoch": 0.18894472361809045, + "grad_norm": 2.453125, + "kl": 0.0, + "learning_rate": 9.15849597955636e-06, + "logits/chosen": 469499596.8, + "logits/rejected": 506940629.3333333, + "logps/chosen": -246.6929443359375, + "logps/rejected": -572.4776204427084, + "loss": 0.0146, + "rewards/chosen": 4.108022308349609, + "rewards/margins": 11.407236353556314, + "rewards/rejected": -7.299214045206706, + "step": 2068 + }, + { + "epoch": 0.1890360895386021, + "grad_norm": 8.3125, + "kl": 0.0, + "learning_rate": 9.157697503083461e-06, + "logits/chosen": 866247424.0, + "logits/rejected": 556176896.0, + "logps/chosen": -283.5186767578125, + "logps/rejected": -514.408447265625, + "loss": 0.0775, + "rewards/chosen": 2.3253143628438315, + "rewards/margins": 9.524562199910482, + "rewards/rejected": -7.19924783706665, + "step": 2069 + }, + { + "epoch": 0.18912745545911375, + "grad_norm": 27.25, + "kl": 0.0, + "learning_rate": 9.1568986828075e-06, + "logits/chosen": 880410624.0, + "logits/rejected": 495015372.8, + "logps/chosen": -358.4848225911458, + "logps/rejected": -426.16845703125, + "loss": 0.1003, + "rewards/chosen": 2.9249960581461587, + "rewards/margins": 9.038920084635416, + "rewards/rejected": -6.113924026489258, + "step": 2070 + }, + { + "epoch": 0.1892188213796254, + "grad_norm": 4.3125, + "kl": 0.0, + "learning_rate": 9.156099518794535e-06, + "logits/chosen": 631008256.0, + "logits/rejected": 342900416.0, + "logps/chosen": -380.15325927734375, + "logps/rejected": -395.02703857421875, + "loss": 0.025, + "rewards/chosen": 3.0876336097717285, + "rewards/margins": 11.264549732208252, + "rewards/rejected": -8.176916122436523, + "step": 2071 + }, + { + "epoch": 0.18931018730013704, + "grad_norm": 29.0, + "kl": 0.0, + "learning_rate": 9.155300011110647e-06, + "logits/chosen": 506134869.3333333, + "logits/rejected": 391078451.2, + "logps/chosen": -204.38118489583334, + "logps/rejected": -562.1193359375, + "loss": 0.1036, + "rewards/chosen": 1.1942248344421387, + "rewards/margins": 9.750680828094483, + "rewards/rejected": -8.556455993652344, + "step": 2072 + }, + { + "epoch": 0.1894015532206487, + "grad_norm": 0.6796875, + "kl": 0.0, + "learning_rate": 9.154500159821946e-06, + "logits/chosen": 345212352.0, + "logits/rejected": 950493866.6666666, + "logps/chosen": -253.66140747070312, + "logps/rejected": -722.2692057291666, + "loss": 0.0037, + "rewards/chosen": 4.463105201721191, + "rewards/margins": 12.027551968892414, + "rewards/rejected": -7.564446767171224, + "step": 2073 + }, + { + "epoch": 0.18949291914116034, + "grad_norm": 1.859375, + "kl": 0.0, + "learning_rate": 9.153699964994573e-06, + "logits/chosen": 681763200.0, + "logits/rejected": 598907136.0, + "logps/chosen": -207.33956909179688, + "logps/rejected": -731.5193481445312, + "loss": 0.0108, + "rewards/chosen": 4.135651111602783, + "rewards/margins": 15.638089656829834, + "rewards/rejected": -11.50243854522705, + "step": 2074 + }, + { + "epoch": 0.189584285061672, + "grad_norm": 3.140625, + "kl": 0.0, + "learning_rate": 9.152899426694701e-06, + "logits/chosen": 430379136.0, + "logits/rejected": 451292885.3333333, + "logps/chosen": -261.5395202636719, + "logps/rejected": -402.3624674479167, + "loss": 0.0164, + "rewards/chosen": 3.2154452800750732, + "rewards/margins": 9.126927137374878, + "rewards/rejected": -5.911481857299805, + "step": 2075 + }, + { + "epoch": 0.18967565098218364, + "grad_norm": 4.84375, + "kl": 0.0, + "learning_rate": 9.152098544988521e-06, + "logits/chosen": 320233002.6666667, + "logits/rejected": 501261004.8, + "logps/chosen": -158.09088134765625, + "logps/rejected": -561.1583984375, + "loss": 0.0288, + "rewards/chosen": 3.043455123901367, + "rewards/margins": 11.778972244262695, + "rewards/rejected": -8.735517120361328, + "step": 2076 + }, + { + "epoch": 0.1897670169026953, + "grad_norm": 4.78125, + "kl": 0.0, + "learning_rate": 9.151297319942262e-06, + "logits/chosen": 1206430080.0, + "logits/rejected": 1189856768.0, + "logps/chosen": -318.97198486328125, + "logps/rejected": -693.5149536132812, + "loss": 0.0587, + "rewards/chosen": 2.2910687923431396, + "rewards/margins": 13.525068044662476, + "rewards/rejected": -11.233999252319336, + "step": 2077 + }, + { + "epoch": 0.18985838282320694, + "grad_norm": 35.0, + "kl": 0.0, + "learning_rate": 9.15049575162218e-06, + "logits/chosen": 771634688.0, + "logits/rejected": 340288998.4, + "logps/chosen": -484.0077718098958, + "logps/rejected": -270.363525390625, + "loss": 0.0918, + "rewards/chosen": 3.346399943033854, + "rewards/margins": 8.266559092203776, + "rewards/rejected": -4.920159149169922, + "step": 2078 + }, + { + "epoch": 0.1899497487437186, + "grad_norm": 15.0625, + "kl": 0.0, + "learning_rate": 9.14969384009455e-06, + "logits/chosen": 855143808.0, + "logits/rejected": 471973568.0, + "logps/chosen": -288.6394348144531, + "logps/rejected": -280.5647277832031, + "loss": 0.0831, + "rewards/chosen": 2.3175086975097656, + "rewards/margins": 7.675675868988037, + "rewards/rejected": -5.3581671714782715, + "step": 2079 + }, + { + "epoch": 0.19004111466423024, + "grad_norm": 3.65625, + "kl": 0.0, + "learning_rate": 9.148891585425692e-06, + "logits/chosen": 414455978.6666667, + "logits/rejected": 433998899.2, + "logps/chosen": -194.71223958333334, + "logps/rejected": -411.168603515625, + "loss": 0.0226, + "rewards/chosen": 2.8438097635904946, + "rewards/margins": 11.54684092203776, + "rewards/rejected": -8.703031158447265, + "step": 2080 + }, + { + "epoch": 0.1901324805847419, + "grad_norm": 3.671875, + "kl": 0.0, + "learning_rate": 9.148088987681938e-06, + "logits/chosen": 660620492.8, + "logits/rejected": 761111296.0, + "logps/chosen": -677.40419921875, + "logps/rejected": -556.8572998046875, + "loss": 0.0148, + "rewards/chosen": 4.52061767578125, + "rewards/margins": 10.999276987711589, + "rewards/rejected": -6.478659311930339, + "step": 2081 + }, + { + "epoch": 0.19022384650525354, + "grad_norm": 18.375, + "kl": 0.0, + "learning_rate": 9.147286046929657e-06, + "logits/chosen": 486638592.0, + "logits/rejected": 419554624.0, + "logps/chosen": -198.31971740722656, + "logps/rejected": -512.0894165039062, + "loss": 0.0739, + "rewards/chosen": 2.7879998683929443, + "rewards/margins": 12.614444494247437, + "rewards/rejected": -9.826444625854492, + "step": 2082 + }, + { + "epoch": 0.19031521242576518, + "grad_norm": 10.25, + "kl": 0.0, + "learning_rate": 9.146482763235245e-06, + "logits/chosen": 543134668.8, + "logits/rejected": 658052352.0, + "logps/chosen": -399.589306640625, + "logps/rejected": -661.116943359375, + "loss": 0.0436, + "rewards/chosen": 3.216028594970703, + "rewards/margins": 10.813882573445637, + "rewards/rejected": -7.597853978474935, + "step": 2083 + }, + { + "epoch": 0.19040657834627683, + "grad_norm": 7.0, + "kl": 0.0, + "learning_rate": 9.14567913666513e-06, + "logits/chosen": 700844851.2, + "logits/rejected": 488175232.0, + "logps/chosen": -339.4497802734375, + "logps/rejected": -508.6383870442708, + "loss": 0.0586, + "rewards/chosen": 2.7202978134155273, + "rewards/margins": 10.542572339375813, + "rewards/rejected": -7.822274525960286, + "step": 2084 + }, + { + "epoch": 0.19049794426678848, + "grad_norm": 8.75, + "kl": 0.0, + "learning_rate": 9.144875167285756e-06, + "logits/chosen": 607925845.3333334, + "logits/rejected": 807681996.8, + "logps/chosen": -190.43021647135416, + "logps/rejected": -437.455078125, + "loss": 0.0697, + "rewards/chosen": 3.680500030517578, + "rewards/margins": 10.145835876464844, + "rewards/rejected": -6.465335845947266, + "step": 2085 + }, + { + "epoch": 0.19058931018730013, + "grad_norm": 2.375, + "kl": 0.0, + "learning_rate": 9.144070855163612e-06, + "logits/chosen": 664137216.0, + "logits/rejected": 516485077.3333333, + "logps/chosen": -360.97198486328125, + "logps/rejected": -469.01953125, + "loss": 0.0123, + "rewards/chosen": 3.542463779449463, + "rewards/margins": 10.887192249298096, + "rewards/rejected": -7.344728469848633, + "step": 2086 + }, + { + "epoch": 0.19068067610781178, + "grad_norm": 5.15625, + "kl": 0.0, + "learning_rate": 9.143266200365203e-06, + "logits/chosen": 672973632.0, + "logits/rejected": 553912704.0, + "logps/chosen": -443.772216796875, + "logps/rejected": -405.50042724609375, + "loss": 0.03, + "rewards/chosen": 2.8283653259277344, + "rewards/margins": 10.16676139831543, + "rewards/rejected": -7.338396072387695, + "step": 2087 + }, + { + "epoch": 0.19077204202832343, + "grad_norm": 10.125, + "kl": 0.0, + "learning_rate": 9.142461202957067e-06, + "logits/chosen": 271978016.0, + "logits/rejected": 523482688.0, + "logps/chosen": -209.12860107421875, + "logps/rejected": -320.7426452636719, + "loss": 0.068, + "rewards/chosen": 2.6609010696411133, + "rewards/margins": 8.587115287780762, + "rewards/rejected": -5.926214218139648, + "step": 2088 + }, + { + "epoch": 0.19086340794883508, + "grad_norm": 4.96875, + "kl": 0.0, + "learning_rate": 9.14165586300577e-06, + "logits/chosen": 585639360.0, + "logits/rejected": 544583488.0, + "logps/chosen": -289.85186767578125, + "logps/rejected": -306.98956298828125, + "loss": 0.0413, + "rewards/chosen": 2.8726184368133545, + "rewards/margins": 8.432150602340698, + "rewards/rejected": -5.559532165527344, + "step": 2089 + }, + { + "epoch": 0.19095477386934673, + "grad_norm": 12.6875, + "kl": 0.0, + "learning_rate": 9.140850180577908e-06, + "logits/chosen": 1203192832.0, + "logits/rejected": 606783616.0, + "logps/chosen": -344.8913879394531, + "logps/rejected": -584.1563110351562, + "loss": 0.0684, + "rewards/chosen": 3.055239677429199, + "rewards/margins": 11.126261711120605, + "rewards/rejected": -8.071022033691406, + "step": 2090 + }, + { + "epoch": 0.19104613978985838, + "grad_norm": 4.59375, + "kl": 0.0, + "learning_rate": 9.140044155740102e-06, + "logits/chosen": 291755315.2, + "logits/rejected": 429966421.3333333, + "logps/chosen": -344.5291748046875, + "logps/rejected": -427.4142252604167, + "loss": 0.0319, + "rewards/chosen": 3.632753372192383, + "rewards/margins": 12.256999333699545, + "rewards/rejected": -8.624245961507162, + "step": 2091 + }, + { + "epoch": 0.19113750571037003, + "grad_norm": 4.25, + "kl": 0.0, + "learning_rate": 9.139237788559002e-06, + "logits/chosen": 1471771392.0, + "logits/rejected": 593429184.0, + "logps/chosen": -163.543701171875, + "logps/rejected": -782.1070556640625, + "loss": 0.032, + "rewards/chosen": 2.9908928871154785, + "rewards/margins": 12.08046007156372, + "rewards/rejected": -9.089567184448242, + "step": 2092 + }, + { + "epoch": 0.19122887163088167, + "grad_norm": 5.875, + "kl": 0.0, + "learning_rate": 9.138431079101287e-06, + "logits/chosen": 647603814.4, + "logits/rejected": 661578282.6666666, + "logps/chosen": -244.6668701171875, + "logps/rejected": -681.3391927083334, + "loss": 0.0336, + "rewards/chosen": 3.449044418334961, + "rewards/margins": 12.53281135559082, + "rewards/rejected": -9.08376693725586, + "step": 2093 + }, + { + "epoch": 0.19132023755139332, + "grad_norm": 2.578125, + "kl": 0.0, + "learning_rate": 9.137624027433665e-06, + "logits/chosen": 633417813.3333334, + "logits/rejected": 528714956.8, + "logps/chosen": -240.0010782877604, + "logps/rejected": -623.9390625, + "loss": 0.014, + "rewards/chosen": 3.5806407928466797, + "rewards/margins": 12.632702255249024, + "rewards/rejected": -9.052061462402344, + "step": 2094 + }, + { + "epoch": 0.19141160347190497, + "grad_norm": 0.71484375, + "kl": 0.0, + "learning_rate": 9.136816633622873e-06, + "logits/chosen": 600663253.3333334, + "logits/rejected": 545630566.4, + "logps/chosen": -321.1111246744792, + "logps/rejected": -514.45478515625, + "loss": 0.004, + "rewards/chosen": 4.991218566894531, + "rewards/margins": 12.745106506347657, + "rewards/rejected": -7.753887939453125, + "step": 2095 + }, + { + "epoch": 0.19150296939241662, + "grad_norm": 16.375, + "kl": 0.0, + "learning_rate": 9.136008897735673e-06, + "logits/chosen": 906289536.0, + "logits/rejected": 471606112.0, + "logps/chosen": -421.2664794921875, + "logps/rejected": -334.4591979980469, + "loss": 0.0611, + "rewards/chosen": 2.8008971214294434, + "rewards/margins": 9.903564929962158, + "rewards/rejected": -7.102667808532715, + "step": 2096 + }, + { + "epoch": 0.19159433531292827, + "grad_norm": 20.5, + "kl": 0.0, + "learning_rate": 9.135200819838858e-06, + "logits/chosen": 450627424.0, + "logits/rejected": 613767296.0, + "logps/chosen": -421.4071044921875, + "logps/rejected": -558.7387084960938, + "loss": 0.0397, + "rewards/chosen": 2.9974136352539062, + "rewards/margins": 11.668585777282715, + "rewards/rejected": -8.671172142028809, + "step": 2097 + }, + { + "epoch": 0.19168570123343992, + "grad_norm": 1.53125, + "kl": 0.0, + "learning_rate": 9.134392399999251e-06, + "logits/chosen": 699077589.3333334, + "logits/rejected": 500283801.6, + "logps/chosen": -277.05820719401044, + "logps/rejected": -452.07275390625, + "loss": 0.0095, + "rewards/chosen": 3.752544403076172, + "rewards/margins": 13.122180938720703, + "rewards/rejected": -9.369636535644531, + "step": 2098 + }, + { + "epoch": 0.19177706715395157, + "grad_norm": 2.28125, + "kl": 0.0, + "learning_rate": 9.133583638283697e-06, + "logits/chosen": 475993941.3333333, + "logits/rejected": 866286720.0, + "logps/chosen": -322.70668538411456, + "logps/rejected": -584.1475219726562, + "loss": 0.0176, + "rewards/chosen": 3.955063501993815, + "rewards/margins": 11.115985552469889, + "rewards/rejected": -7.160922050476074, + "step": 2099 + }, + { + "epoch": 0.19186843307446322, + "grad_norm": 5.875, + "kl": 0.0, + "learning_rate": 9.132774534759073e-06, + "logits/chosen": 475108384.0, + "logits/rejected": 732319616.0, + "logps/chosen": -313.1535339355469, + "logps/rejected": -653.0038452148438, + "loss": 0.0369, + "rewards/chosen": 2.729404926300049, + "rewards/margins": 10.268351078033447, + "rewards/rejected": -7.538946151733398, + "step": 2100 + }, + { + "epoch": 0.19195979899497487, + "grad_norm": 5.03125, + "kl": 0.0, + "learning_rate": 9.131965089492288e-06, + "logits/chosen": 588147404.8, + "logits/rejected": 263248064.0, + "logps/chosen": -354.1001953125, + "logps/rejected": -129.71733601888022, + "loss": 0.0382, + "rewards/chosen": 3.4521648406982424, + "rewards/margins": 8.081388346354167, + "rewards/rejected": -4.629223505655925, + "step": 2101 + }, + { + "epoch": 0.19205116491548652, + "grad_norm": 7.21875, + "kl": 0.0, + "learning_rate": 9.131155302550272e-06, + "logits/chosen": 541390250.6666666, + "logits/rejected": 424172569.6, + "logps/chosen": -202.23038736979166, + "logps/rejected": -379.95830078125, + "loss": 0.0139, + "rewards/chosen": 4.681818008422852, + "rewards/margins": 11.56952781677246, + "rewards/rejected": -6.8877098083496096, + "step": 2102 + }, + { + "epoch": 0.19214253083599817, + "grad_norm": 4.90625, + "kl": 0.0, + "learning_rate": 9.130345173999991e-06, + "logits/chosen": 459944874.6666667, + "logits/rejected": 486398144.0, + "logps/chosen": -371.35498046875, + "logps/rejected": -191.93655395507812, + "loss": 0.0303, + "rewards/chosen": 3.473792394002279, + "rewards/margins": 9.713480790456137, + "rewards/rejected": -6.239688396453857, + "step": 2103 + }, + { + "epoch": 0.19223389675650981, + "grad_norm": 4.65625, + "kl": 0.0, + "learning_rate": 9.12953470390843e-06, + "logits/chosen": 491200896.0, + "logits/rejected": 455619008.0, + "logps/chosen": -374.6495056152344, + "logps/rejected": -582.0399780273438, + "loss": 0.0332, + "rewards/chosen": 2.7462093830108643, + "rewards/margins": 12.227629899978638, + "rewards/rejected": -9.481420516967773, + "step": 2104 + }, + { + "epoch": 0.19232526267702146, + "grad_norm": 3.765625, + "kl": 0.0, + "learning_rate": 9.12872389234261e-06, + "logits/chosen": 459561654.85714287, + "logits/rejected": 395657152.0, + "logps/chosen": -401.1363002232143, + "logps/rejected": -523.174560546875, + "loss": 0.0265, + "rewards/chosen": 3.95705441066197, + "rewards/margins": 11.15238550731114, + "rewards/rejected": -7.19533109664917, + "step": 2105 + }, + { + "epoch": 0.1924166285975331, + "grad_norm": 5.8125, + "kl": 0.0, + "learning_rate": 9.12791273936958e-06, + "logits/chosen": 561519923.2, + "logits/rejected": 637031509.3333334, + "logps/chosen": -308.21474609375, + "logps/rejected": -456.3693440755208, + "loss": 0.0377, + "rewards/chosen": 2.811589813232422, + "rewards/margins": 10.929854329427084, + "rewards/rejected": -8.118264516194662, + "step": 2106 + }, + { + "epoch": 0.19250799451804476, + "grad_norm": 6.8125, + "kl": 0.0, + "learning_rate": 9.127101245056412e-06, + "logits/chosen": 456399018.6666667, + "logits/rejected": 423504448.0, + "logps/chosen": -336.9099527994792, + "logps/rejected": -480.11163330078125, + "loss": 0.0391, + "rewards/chosen": 3.473799705505371, + "rewards/margins": 12.365924835205078, + "rewards/rejected": -8.892125129699707, + "step": 2107 + }, + { + "epoch": 0.1925993604385564, + "grad_norm": 5.3125, + "kl": 0.0, + "learning_rate": 9.126289409470209e-06, + "logits/chosen": 665254464.0, + "logits/rejected": 703722240.0, + "logps/chosen": -346.933837890625, + "logps/rejected": -631.3713989257812, + "loss": 0.0554, + "rewards/chosen": 2.4081664085388184, + "rewards/margins": 12.006727695465088, + "rewards/rejected": -9.59856128692627, + "step": 2108 + }, + { + "epoch": 0.19269072635906806, + "grad_norm": 3.6875, + "kl": 0.0, + "learning_rate": 9.1254772326781e-06, + "logits/chosen": 354702156.8, + "logits/rejected": 262495616.0, + "logps/chosen": -252.6786865234375, + "logps/rejected": -387.9158935546875, + "loss": 0.03, + "rewards/chosen": 3.434559631347656, + "rewards/margins": 9.487474568684895, + "rewards/rejected": -6.052914937337239, + "step": 2109 + }, + { + "epoch": 0.1927820922795797, + "grad_norm": 5.28125, + "kl": 0.0, + "learning_rate": 9.12466471474725e-06, + "logits/chosen": 792376371.2, + "logits/rejected": 524627626.6666667, + "logps/chosen": -448.041064453125, + "logps/rejected": -421.421630859375, + "loss": 0.0246, + "rewards/chosen": 3.573007583618164, + "rewards/margins": 11.61620012919108, + "rewards/rejected": -8.043192545572916, + "step": 2110 + }, + { + "epoch": 0.19287345820009136, + "grad_norm": 1.6171875, + "kl": 0.0, + "learning_rate": 9.123851855744842e-06, + "logits/chosen": 239024416.0, + "logits/rejected": 412769536.0, + "logps/chosen": -251.36138916015625, + "logps/rejected": -480.4549967447917, + "loss": 0.0084, + "rewards/chosen": 3.9291434288024902, + "rewards/margins": 12.669235388437906, + "rewards/rejected": -8.740091959635416, + "step": 2111 + }, + { + "epoch": 0.192964824120603, + "grad_norm": 2.390625, + "kl": 0.0, + "learning_rate": 9.123038655738098e-06, + "logits/chosen": 511852074.6666667, + "logits/rejected": 676375360.0, + "logps/chosen": -261.1852213541667, + "logps/rejected": -1061.107177734375, + "loss": 0.0194, + "rewards/chosen": 3.8296028772989907, + "rewards/margins": 16.124545733133953, + "rewards/rejected": -12.294942855834961, + "step": 2112 + }, + { + "epoch": 0.19305619004111466, + "grad_norm": 29.125, + "kl": 0.0, + "learning_rate": 9.122225114794254e-06, + "logits/chosen": 469188672.0, + "logits/rejected": 710631936.0, + "logps/chosen": -145.1815185546875, + "logps/rejected": -645.669677734375, + "loss": 0.0957, + "rewards/chosen": 2.599640369415283, + "rewards/margins": 12.380662441253662, + "rewards/rejected": -9.781022071838379, + "step": 2113 + }, + { + "epoch": 0.1931475559616263, + "grad_norm": 5.625, + "kl": 0.0, + "learning_rate": 9.121411232980589e-06, + "logits/chosen": 535315136.0, + "logits/rejected": 506314496.0, + "logps/chosen": -403.87738037109375, + "logps/rejected": -474.09027099609375, + "loss": 0.123, + "rewards/chosen": 3.4425060749053955, + "rewards/margins": 9.786640405654907, + "rewards/rejected": -6.344134330749512, + "step": 2114 + }, + { + "epoch": 0.19323892188213795, + "grad_norm": 6.375, + "kl": 0.0, + "learning_rate": 9.120597010364397e-06, + "logits/chosen": 549569365.3333334, + "logits/rejected": 452398976.0, + "logps/chosen": -329.84906005859375, + "logps/rejected": -578.7570190429688, + "loss": 0.0389, + "rewards/chosen": 3.144232749938965, + "rewards/margins": 10.322612762451172, + "rewards/rejected": -7.178380012512207, + "step": 2115 + }, + { + "epoch": 0.1933302878026496, + "grad_norm": 0.84765625, + "kl": 0.0, + "learning_rate": 9.119782447013013e-06, + "logits/chosen": 476897152.0, + "logits/rejected": 611890304.0, + "logps/chosen": -223.4969482421875, + "logps/rejected": -561.973388671875, + "loss": 0.0051, + "rewards/chosen": 4.762591361999512, + "rewards/margins": 13.19127368927002, + "rewards/rejected": -8.428682327270508, + "step": 2116 + }, + { + "epoch": 0.19342165372316125, + "grad_norm": 14.4375, + "kl": 0.0, + "learning_rate": 9.118967542993791e-06, + "logits/chosen": 531337728.0, + "logits/rejected": 837574758.4, + "logps/chosen": -250.2791544596354, + "logps/rejected": -477.893603515625, + "loss": 0.0449, + "rewards/chosen": 3.198753992716471, + "rewards/margins": 11.419930903116862, + "rewards/rejected": -8.221176910400391, + "step": 2117 + }, + { + "epoch": 0.1935130196436729, + "grad_norm": 1.953125, + "kl": 0.0, + "learning_rate": 9.118152298374117e-06, + "logits/chosen": 447444416.0, + "logits/rejected": 369097216.0, + "logps/chosen": -280.5719909667969, + "logps/rejected": -416.862548828125, + "loss": 0.0101, + "rewards/chosen": 4.181159019470215, + "rewards/margins": 12.705183982849121, + "rewards/rejected": -8.524024963378906, + "step": 2118 + }, + { + "epoch": 0.19360438556418455, + "grad_norm": 3.03125, + "kl": 0.0, + "learning_rate": 9.117336713221403e-06, + "logits/chosen": 503833685.3333333, + "logits/rejected": 580531814.4, + "logps/chosen": -281.17592366536456, + "logps/rejected": -550.061767578125, + "loss": 0.0226, + "rewards/chosen": 2.8755553563435874, + "rewards/margins": 12.357206662495932, + "rewards/rejected": -9.481651306152344, + "step": 2119 + }, + { + "epoch": 0.1936957514846962, + "grad_norm": 5.28125, + "kl": 0.0, + "learning_rate": 9.11652078760309e-06, + "logits/chosen": 1026641715.2, + "logits/rejected": 431941290.6666667, + "logps/chosen": -661.43544921875, + "logps/rejected": -491.2227376302083, + "loss": 0.0279, + "rewards/chosen": 3.4095619201660154, + "rewards/margins": 13.760756937662759, + "rewards/rejected": -10.351195017496744, + "step": 2120 + }, + { + "epoch": 0.19378711740520785, + "grad_norm": 28.625, + "kl": 0.0, + "learning_rate": 9.11570452158665e-06, + "logits/chosen": 483359027.2, + "logits/rejected": 259457578.66666666, + "logps/chosen": -332.464990234375, + "logps/rejected": -259.18963623046875, + "loss": 0.0942, + "rewards/chosen": 3.2960227966308593, + "rewards/margins": 8.158468945821125, + "rewards/rejected": -4.862446149190267, + "step": 2121 + }, + { + "epoch": 0.1938784833257195, + "grad_norm": 2.609375, + "kl": 0.0, + "learning_rate": 9.114887915239578e-06, + "logits/chosen": 1147192064.0, + "logits/rejected": 736538112.0, + "logps/chosen": -342.1214599609375, + "logps/rejected": -580.1716918945312, + "loss": 0.0149, + "rewards/chosen": 3.759415864944458, + "rewards/margins": 12.87752652168274, + "rewards/rejected": -9.118110656738281, + "step": 2122 + }, + { + "epoch": 0.19396984924623115, + "grad_norm": 17.25, + "kl": 0.0, + "learning_rate": 9.1140709686294e-06, + "logits/chosen": 571507029.3333334, + "logits/rejected": 406629632.0, + "logps/chosen": -197.79638671875, + "logps/rejected": -209.48863220214844, + "loss": 0.0881, + "rewards/chosen": 3.119572321573893, + "rewards/margins": 7.659246126810709, + "rewards/rejected": -4.539673805236816, + "step": 2123 + }, + { + "epoch": 0.1940612151667428, + "grad_norm": 3.734375, + "kl": 0.0, + "learning_rate": 9.11325368182367e-06, + "logits/chosen": 231919936.0, + "logits/rejected": 322600832.0, + "logps/chosen": -335.1382141113281, + "logps/rejected": -510.65716552734375, + "loss": 0.0186, + "rewards/chosen": 3.8038177490234375, + "rewards/margins": 15.204680442810059, + "rewards/rejected": -11.400862693786621, + "step": 2124 + }, + { + "epoch": 0.19415258108725444, + "grad_norm": 13.3125, + "kl": 0.0, + "learning_rate": 9.112436054889972e-06, + "logits/chosen": 445720928.0, + "logits/rejected": 541014336.0, + "logps/chosen": -284.0986633300781, + "logps/rejected": -488.14892578125, + "loss": 0.116, + "rewards/chosen": 3.335233688354492, + "rewards/margins": 9.496764659881592, + "rewards/rejected": -6.1615309715271, + "step": 2125 + }, + { + "epoch": 0.1942439470077661, + "grad_norm": 6.5625, + "kl": 0.0, + "learning_rate": 9.111618087895915e-06, + "logits/chosen": 397145557.3333333, + "logits/rejected": 275104320.0, + "logps/chosen": -286.46763102213544, + "logps/rejected": -341.5665283203125, + "loss": 0.1233, + "rewards/chosen": 3.9991442362467446, + "rewards/margins": 10.88566509882609, + "rewards/rejected": -6.886520862579346, + "step": 2126 + }, + { + "epoch": 0.19433531292827774, + "grad_norm": 7.96875, + "kl": 0.0, + "learning_rate": 9.110799780909137e-06, + "logits/chosen": 880634880.0, + "logits/rejected": 1113162112.0, + "logps/chosen": -328.53741455078125, + "logps/rejected": -788.5587158203125, + "loss": 0.1257, + "rewards/chosen": 1.8567255735397339, + "rewards/margins": 13.00942313671112, + "rewards/rejected": -11.152697563171387, + "step": 2127 + }, + { + "epoch": 0.1944266788487894, + "grad_norm": 18.125, + "kl": 0.0, + "learning_rate": 9.109981133997303e-06, + "logits/chosen": 369668437.3333333, + "logits/rejected": 624643481.6, + "logps/chosen": -344.0941975911458, + "logps/rejected": -619.9826171875, + "loss": 0.0749, + "rewards/chosen": 2.5578227043151855, + "rewards/margins": 12.086795330047607, + "rewards/rejected": -9.528972625732422, + "step": 2128 + }, + { + "epoch": 0.19451804476930104, + "grad_norm": 2.90625, + "kl": 0.0, + "learning_rate": 9.109162147228111e-06, + "logits/chosen": 414204160.0, + "logits/rejected": 367903488.0, + "logps/chosen": -232.75130208333334, + "logps/rejected": -372.1852111816406, + "loss": 0.0235, + "rewards/chosen": 3.781902313232422, + "rewards/margins": 10.48156452178955, + "rewards/rejected": -6.699662208557129, + "step": 2129 + }, + { + "epoch": 0.1946094106898127, + "grad_norm": 5.125, + "kl": 0.0, + "learning_rate": 9.108342820669281e-06, + "logits/chosen": 356568096.0, + "logits/rejected": 797399552.0, + "logps/chosen": -177.6138916015625, + "logps/rejected": -312.66558837890625, + "loss": 0.0271, + "rewards/chosen": 2.927734375, + "rewards/margins": 9.506449381510418, + "rewards/rejected": -6.578715006510417, + "step": 2130 + }, + { + "epoch": 0.19470077661032434, + "grad_norm": 19.125, + "kl": 0.0, + "learning_rate": 9.107523154388565e-06, + "logits/chosen": 542030006.8571428, + "logits/rejected": 300863744.0, + "logps/chosen": -364.69918387276783, + "logps/rejected": -623.302001953125, + "loss": 0.0536, + "rewards/chosen": 3.0008068084716797, + "rewards/margins": 16.112525939941406, + "rewards/rejected": -13.111719131469727, + "step": 2131 + }, + { + "epoch": 0.194792142530836, + "grad_norm": 1.7890625, + "kl": 0.0, + "learning_rate": 9.106703148453739e-06, + "logits/chosen": 394025568.0, + "logits/rejected": 433216853.3333333, + "logps/chosen": -246.3740692138672, + "logps/rejected": -465.8238118489583, + "loss": 0.008, + "rewards/chosen": 3.8622169494628906, + "rewards/margins": 12.464065551757812, + "rewards/rejected": -8.601848602294922, + "step": 2132 + }, + { + "epoch": 0.19488350845134764, + "grad_norm": 2.8125, + "kl": 0.0, + "learning_rate": 9.105882802932613e-06, + "logits/chosen": 348087756.8, + "logits/rejected": 440657493.3333333, + "logps/chosen": -339.135986328125, + "logps/rejected": -497.4529215494792, + "loss": 0.0203, + "rewards/chosen": 3.587328338623047, + "rewards/margins": 11.04456049601237, + "rewards/rejected": -7.457232157389323, + "step": 2133 + }, + { + "epoch": 0.19497487437185929, + "grad_norm": 2.96875, + "kl": 0.0, + "learning_rate": 9.105062117893022e-06, + "logits/chosen": 1150155648.0, + "logits/rejected": 481766272.0, + "logps/chosen": -311.97705078125, + "logps/rejected": -430.8939208984375, + "loss": 0.0145, + "rewards/chosen": 3.078810214996338, + "rewards/margins": 9.872033913930256, + "rewards/rejected": -6.793223698933919, + "step": 2134 + }, + { + "epoch": 0.19506624029237093, + "grad_norm": 6.625, + "kl": 0.0, + "learning_rate": 9.104241093402828e-06, + "logits/chosen": 514454835.2, + "logits/rejected": 300558037.3333333, + "logps/chosen": -320.3729248046875, + "logps/rejected": -234.2266642252604, + "loss": 0.1156, + "rewards/chosen": 3.385384368896484, + "rewards/margins": 7.822964477539062, + "rewards/rejected": -4.437580108642578, + "step": 2135 + }, + { + "epoch": 0.19515760621288258, + "grad_norm": 22.375, + "kl": 0.0, + "learning_rate": 9.103419729529921e-06, + "logits/chosen": 482676394.6666667, + "logits/rejected": 397918304.0, + "logps/chosen": -406.9705403645833, + "logps/rejected": -560.6158447265625, + "loss": 0.0673, + "rewards/chosen": 2.4830398559570312, + "rewards/margins": 15.478128433227539, + "rewards/rejected": -12.995088577270508, + "step": 2136 + }, + { + "epoch": 0.19524897213339423, + "grad_norm": 3.515625, + "kl": 0.0, + "learning_rate": 9.102598026342223e-06, + "logits/chosen": 454549299.2, + "logits/rejected": 433012394.6666667, + "logps/chosen": -300.231103515625, + "logps/rejected": -390.2314046223958, + "loss": 0.021, + "rewards/chosen": 3.6381175994873045, + "rewards/margins": 12.163728205362954, + "rewards/rejected": -8.52561060587565, + "step": 2137 + }, + { + "epoch": 0.19534033805390588, + "grad_norm": 0.408203125, + "kl": 0.0, + "learning_rate": 9.10177598390768e-06, + "logits/chosen": 337827232.0, + "logits/rejected": 912025856.0, + "logps/chosen": -216.09483337402344, + "logps/rejected": -527.5055338541666, + "loss": 0.0023, + "rewards/chosen": 5.1978607177734375, + "rewards/margins": 13.905670166015625, + "rewards/rejected": -8.707809448242188, + "step": 2138 + }, + { + "epoch": 0.19543170397441753, + "grad_norm": 16.25, + "kl": 0.0, + "learning_rate": 9.100953602294264e-06, + "logits/chosen": 513204800.0, + "logits/rejected": 538272426.6666666, + "logps/chosen": -356.68597412109375, + "logps/rejected": -228.9246826171875, + "loss": 0.1715, + "rewards/chosen": 2.0678298473358154, + "rewards/margins": 5.946248451868692, + "rewards/rejected": -3.8784186045328775, + "step": 2139 + }, + { + "epoch": 0.19552306989492918, + "grad_norm": 4.09375, + "kl": 0.0, + "learning_rate": 9.100130881569984e-06, + "logits/chosen": 409411520.0, + "logits/rejected": 535374464.0, + "logps/chosen": -362.04718017578125, + "logps/rejected": -571.9189453125, + "loss": 0.0207, + "rewards/chosen": 2.4647927284240723, + "rewards/margins": 10.662185509999594, + "rewards/rejected": -8.197392781575521, + "step": 2140 + }, + { + "epoch": 0.19561443581544083, + "grad_norm": 3.5, + "kl": 0.0, + "learning_rate": 9.099307821802867e-06, + "logits/chosen": 640359125.3333334, + "logits/rejected": 525583769.6, + "logps/chosen": -467.263671875, + "logps/rejected": -449.7705078125, + "loss": 0.0198, + "rewards/chosen": 3.0274445215861, + "rewards/margins": 10.899218813578287, + "rewards/rejected": -7.871774291992187, + "step": 2141 + }, + { + "epoch": 0.19570580173595248, + "grad_norm": 3.25, + "kl": 0.0, + "learning_rate": 9.098484423060976e-06, + "logits/chosen": 306848204.8, + "logits/rejected": 705575253.3333334, + "logps/chosen": -382.166455078125, + "logps/rejected": -576.3310546875, + "loss": 0.0223, + "rewards/chosen": 3.8527217864990235, + "rewards/margins": 11.123918787638347, + "rewards/rejected": -7.271197001139323, + "step": 2142 + }, + { + "epoch": 0.19579716765646413, + "grad_norm": 11.375, + "kl": 0.0, + "learning_rate": 9.097660685412395e-06, + "logits/chosen": 497223372.8, + "logits/rejected": 536934058.6666666, + "logps/chosen": -411.35126953125, + "logps/rejected": -573.68310546875, + "loss": 0.07, + "rewards/chosen": 2.547581100463867, + "rewards/margins": 9.91364262898763, + "rewards/rejected": -7.366061528523763, + "step": 2143 + }, + { + "epoch": 0.19588853357697578, + "grad_norm": 3.328125, + "kl": 0.0, + "learning_rate": 9.096836608925241e-06, + "logits/chosen": 864540245.3333334, + "logits/rejected": 513684838.4, + "logps/chosen": -360.2306315104167, + "logps/rejected": -392.171142578125, + "loss": 0.0191, + "rewards/chosen": 3.039536794026693, + "rewards/margins": 10.788585408528647, + "rewards/rejected": -7.7490486145019535, + "step": 2144 + }, + { + "epoch": 0.19597989949748743, + "grad_norm": 2.828125, + "kl": 0.0, + "learning_rate": 9.096012193667658e-06, + "logits/chosen": 437938090.6666667, + "logits/rejected": 519972192.0, + "logps/chosen": -347.7941487630208, + "logps/rejected": -298.29766845703125, + "loss": 0.0242, + "rewards/chosen": 3.6755396525065103, + "rewards/margins": 10.979741732279459, + "rewards/rejected": -7.304202079772949, + "step": 2145 + }, + { + "epoch": 0.19607126541799907, + "grad_norm": 3.71875, + "kl": 0.0, + "learning_rate": 9.095187439707817e-06, + "logits/chosen": 655583283.2, + "logits/rejected": 926428160.0, + "logps/chosen": -257.332470703125, + "logps/rejected": -714.220947265625, + "loss": 0.0254, + "rewards/chosen": 3.568848419189453, + "rewards/margins": 19.028744506835938, + "rewards/rejected": -15.459896087646484, + "step": 2146 + }, + { + "epoch": 0.19616263133851072, + "grad_norm": 6.71875, + "kl": 0.0, + "learning_rate": 9.094362347113917e-06, + "logits/chosen": 237483306.66666666, + "logits/rejected": 281796889.6, + "logps/chosen": -195.2436726888021, + "logps/rejected": -363.8276611328125, + "loss": 0.0355, + "rewards/chosen": 3.7085647583007812, + "rewards/margins": 12.460481262207031, + "rewards/rejected": -8.75191650390625, + "step": 2147 + }, + { + "epoch": 0.19625399725902237, + "grad_norm": 10.125, + "kl": 0.0, + "learning_rate": 9.093536915954186e-06, + "logits/chosen": 1051317333.3333334, + "logits/rejected": 843818432.0, + "logps/chosen": -317.68044026692706, + "logps/rejected": -847.2471313476562, + "loss": 0.043, + "rewards/chosen": 3.5697571436564126, + "rewards/margins": 13.99439779917399, + "rewards/rejected": -10.424640655517578, + "step": 2148 + }, + { + "epoch": 0.19634536317953402, + "grad_norm": 10.25, + "kl": 0.0, + "learning_rate": 9.09271114629688e-06, + "logits/chosen": 641559808.0, + "logits/rejected": 535420096.0, + "logps/chosen": -249.3128662109375, + "logps/rejected": -457.82147216796875, + "loss": 0.1172, + "rewards/chosen": 2.1580984592437744, + "rewards/margins": 11.162835359573364, + "rewards/rejected": -9.00473690032959, + "step": 2149 + }, + { + "epoch": 0.19643672910004567, + "grad_norm": 3.484375, + "kl": 0.0, + "learning_rate": 9.091885038210282e-06, + "logits/chosen": 652783786.6666666, + "logits/rejected": 462015488.0, + "logps/chosen": -192.69063313802084, + "logps/rejected": -553.918505859375, + "loss": 0.0292, + "rewards/chosen": 3.1384010314941406, + "rewards/margins": 11.780658721923828, + "rewards/rejected": -8.642257690429688, + "step": 2150 + }, + { + "epoch": 0.19652809502055732, + "grad_norm": 5.1875, + "kl": 0.0, + "learning_rate": 9.091058591762703e-06, + "logits/chosen": 366159104.0, + "logits/rejected": 395466592.0, + "logps/chosen": -317.1168619791667, + "logps/rejected": -508.90911865234375, + "loss": 0.0379, + "rewards/chosen": 3.3177480697631836, + "rewards/margins": 12.92233657836914, + "rewards/rejected": -9.604588508605957, + "step": 2151 + }, + { + "epoch": 0.19661946094106897, + "grad_norm": 3.046875, + "kl": 0.0, + "learning_rate": 9.090231807022483e-06, + "logits/chosen": 492353962.6666667, + "logits/rejected": 515097907.2, + "logps/chosen": -310.39833577473956, + "logps/rejected": -491.024609375, + "loss": 0.0153, + "rewards/chosen": 3.914981206258138, + "rewards/margins": 13.11601651509603, + "rewards/rejected": -9.201035308837891, + "step": 2152 + }, + { + "epoch": 0.19671082686158062, + "grad_norm": 2.15625, + "kl": 0.0, + "learning_rate": 9.08940468405799e-06, + "logits/chosen": 655033728.0, + "logits/rejected": 413236672.0, + "logps/chosen": -376.5281066894531, + "logps/rejected": -466.12530517578125, + "loss": 0.0091, + "rewards/chosen": 4.452432155609131, + "rewards/margins": 11.308306694030762, + "rewards/rejected": -6.855874538421631, + "step": 2153 + }, + { + "epoch": 0.19680219278209227, + "grad_norm": 3.703125, + "kl": 0.0, + "learning_rate": 9.088577222937618e-06, + "logits/chosen": 668960819.2, + "logits/rejected": 568027392.0, + "logps/chosen": -351.4818115234375, + "logps/rejected": -564.5726318359375, + "loss": 0.0262, + "rewards/chosen": 3.282358169555664, + "rewards/margins": 12.51861572265625, + "rewards/rejected": -9.236257553100586, + "step": 2154 + }, + { + "epoch": 0.19689355870260392, + "grad_norm": 5.03125, + "kl": 0.0, + "learning_rate": 9.087749423729792e-06, + "logits/chosen": 775347712.0, + "logits/rejected": 468204629.3333333, + "logps/chosen": -288.9747009277344, + "logps/rejected": -572.6911214192709, + "loss": 0.0175, + "rewards/chosen": 2.987569570541382, + "rewards/margins": 12.192660411198935, + "rewards/rejected": -9.205090840657553, + "step": 2155 + }, + { + "epoch": 0.19698492462311556, + "grad_norm": 2.921875, + "kl": 0.0, + "learning_rate": 9.086921286502962e-06, + "logits/chosen": 447369952.0, + "logits/rejected": 587871360.0, + "logps/chosen": -407.69384765625, + "logps/rejected": -512.8516845703125, + "loss": 0.0142, + "rewards/chosen": 4.641684055328369, + "rewards/margins": 13.658323764801025, + "rewards/rejected": -9.016639709472656, + "step": 2156 + }, + { + "epoch": 0.19707629054362721, + "grad_norm": 26.5, + "kl": 0.0, + "learning_rate": 9.086092811325608e-06, + "logits/chosen": 587650560.0, + "logits/rejected": 802120021.3333334, + "logps/chosen": -298.5575256347656, + "logps/rejected": -352.800048828125, + "loss": 0.1104, + "rewards/chosen": 1.98297119140625, + "rewards/margins": 8.08038838704427, + "rewards/rejected": -6.0974171956380205, + "step": 2157 + }, + { + "epoch": 0.19716765646413886, + "grad_norm": 3.71875, + "kl": 0.0, + "learning_rate": 9.085263998266236e-06, + "logits/chosen": 562524160.0, + "logits/rejected": 591044608.0, + "logps/chosen": -477.6311340332031, + "logps/rejected": -709.583251953125, + "loss": 0.013, + "rewards/chosen": 3.769681692123413, + "rewards/margins": 12.15321135520935, + "rewards/rejected": -8.383529663085938, + "step": 2158 + }, + { + "epoch": 0.1972590223846505, + "grad_norm": 2.25, + "kl": 0.0, + "learning_rate": 9.084434847393384e-06, + "logits/chosen": 523715686.4, + "logits/rejected": 579324842.6666666, + "logps/chosen": -367.144580078125, + "logps/rejected": -695.1083984375, + "loss": 0.016, + "rewards/chosen": 3.7482749938964846, + "rewards/margins": 13.048931630452476, + "rewards/rejected": -9.30065663655599, + "step": 2159 + }, + { + "epoch": 0.19735038830516216, + "grad_norm": 2.9375, + "kl": 0.0, + "learning_rate": 9.083605358775612e-06, + "logits/chosen": 515938377.14285713, + "logits/rejected": 215901856.0, + "logps/chosen": -313.93613978794644, + "logps/rejected": -189.27732849121094, + "loss": 0.025, + "rewards/chosen": 4.0863045283726285, + "rewards/margins": 8.97820840563093, + "rewards/rejected": -4.891903877258301, + "step": 2160 + }, + { + "epoch": 0.1974417542256738, + "grad_norm": 10.3125, + "kl": 2.1262054443359375, + "learning_rate": 9.082775532481513e-06, + "logits/chosen": 355939035.4285714, + "logits/rejected": 283879424.0, + "logps/chosen": -360.58642578125, + "logps/rejected": -151.30186462402344, + "loss": 0.0704, + "rewards/chosen": 3.0939434596470425, + "rewards/margins": 7.890992505209787, + "rewards/rejected": -4.797049045562744, + "step": 2161 + }, + { + "epoch": 0.19753312014618546, + "grad_norm": 8.625, + "kl": 0.0, + "learning_rate": 9.081945368579704e-06, + "logits/chosen": 765680810.6666666, + "logits/rejected": 977950105.6, + "logps/chosen": -401.4954020182292, + "logps/rejected": -492.9908203125, + "loss": 0.0514, + "rewards/chosen": 1.9621857007344563, + "rewards/margins": 8.901338990529378, + "rewards/rejected": -6.9391532897949215, + "step": 2162 + }, + { + "epoch": 0.1976244860666971, + "grad_norm": 4.59375, + "kl": 0.0, + "learning_rate": 9.081114867138835e-06, + "logits/chosen": 501701696.0, + "logits/rejected": 922194346.6666666, + "logps/chosen": -300.55047607421875, + "logps/rejected": -487.0891927083333, + "loss": 0.0215, + "rewards/chosen": 2.5992798805236816, + "rewards/margins": 10.991849104563395, + "rewards/rejected": -8.392569224039713, + "step": 2163 + }, + { + "epoch": 0.19771585198720878, + "grad_norm": 25.0, + "kl": 0.0, + "learning_rate": 9.080284028227578e-06, + "logits/chosen": 431822400.0, + "logits/rejected": 874456704.0, + "logps/chosen": -113.17354583740234, + "logps/rejected": -686.923095703125, + "loss": 0.1446, + "rewards/chosen": 1.1756592988967896, + "rewards/margins": 12.689802289009094, + "rewards/rejected": -11.514142990112305, + "step": 2164 + }, + { + "epoch": 0.19780721790772043, + "grad_norm": 15.875, + "kl": 0.0, + "learning_rate": 9.079452851914634e-06, + "logits/chosen": 447391744.0, + "logits/rejected": 635226752.0, + "logps/chosen": -240.2407470703125, + "logps/rejected": -543.8254801432291, + "loss": 0.1231, + "rewards/chosen": 2.7702545166015624, + "rewards/margins": 7.485151672363282, + "rewards/rejected": -4.714897155761719, + "step": 2165 + }, + { + "epoch": 0.19789858382823208, + "grad_norm": 3.75, + "kl": 0.0, + "learning_rate": 9.078621338268739e-06, + "logits/chosen": 659091520.0, + "logits/rejected": 527942976.0, + "logps/chosen": -347.50469970703125, + "logps/rejected": -538.644775390625, + "loss": 0.0443, + "rewards/chosen": 2.770646810531616, + "rewards/margins": 11.680424451828003, + "rewards/rejected": -8.909777641296387, + "step": 2166 + }, + { + "epoch": 0.19798994974874373, + "grad_norm": 6.34375, + "kl": 0.0, + "learning_rate": 9.077789487358648e-06, + "logits/chosen": 547343155.2, + "logits/rejected": 302403285.3333333, + "logps/chosen": -443.527294921875, + "logps/rejected": -312.4633382161458, + "loss": 0.0335, + "rewards/chosen": 3.4119617462158205, + "rewards/margins": 11.020385615030925, + "rewards/rejected": -7.6084238688151045, + "step": 2167 + }, + { + "epoch": 0.19808131566925538, + "grad_norm": 3.546875, + "kl": 0.0, + "learning_rate": 9.07695729925315e-06, + "logits/chosen": 843838976.0, + "logits/rejected": 628315200.0, + "logps/chosen": -619.2205810546875, + "logps/rejected": -382.9295959472656, + "loss": 0.0169, + "rewards/chosen": 3.608018636703491, + "rewards/margins": 10.539658308029175, + "rewards/rejected": -6.931639671325684, + "step": 2168 + }, + { + "epoch": 0.19817268158976703, + "grad_norm": 5.875, + "kl": 0.0, + "learning_rate": 9.076124774021054e-06, + "logits/chosen": 532135808.0, + "logits/rejected": 372122931.2, + "logps/chosen": -345.524658203125, + "logps/rejected": -320.208056640625, + "loss": 0.0233, + "rewards/chosen": 3.3064149220784507, + "rewards/margins": 10.114650090535482, + "rewards/rejected": -6.808235168457031, + "step": 2169 + }, + { + "epoch": 0.19826404751027868, + "grad_norm": 5.0, + "kl": 0.0, + "learning_rate": 9.075291911731206e-06, + "logits/chosen": 353245226.6666667, + "logits/rejected": 345191552.0, + "logps/chosen": -202.9131876627604, + "logps/rejected": -347.22723388671875, + "loss": 0.0379, + "rewards/chosen": 3.3430264790852866, + "rewards/margins": 10.84748109181722, + "rewards/rejected": -7.504454612731934, + "step": 2170 + }, + { + "epoch": 0.19835541343079033, + "grad_norm": 4.03125, + "kl": 0.0, + "learning_rate": 9.074458712452476e-06, + "logits/chosen": 417090208.0, + "logits/rejected": 433240277.3333333, + "logps/chosen": -232.6982421875, + "logps/rejected": -513.4713541666666, + "loss": 0.0206, + "rewards/chosen": 2.6140527725219727, + "rewards/margins": 9.735828081766766, + "rewards/rejected": -7.121775309244792, + "step": 2171 + }, + { + "epoch": 0.19844677935130198, + "grad_norm": 3.8125, + "kl": 0.0, + "learning_rate": 9.073625176253759e-06, + "logits/chosen": 494729984.0, + "logits/rejected": 303386197.3333333, + "logps/chosen": -404.93525390625, + "logps/rejected": -448.9025065104167, + "loss": 0.0242, + "rewards/chosen": 3.3936538696289062, + "rewards/margins": 11.535365422566732, + "rewards/rejected": -8.141711552937826, + "step": 2172 + }, + { + "epoch": 0.19853814527181363, + "grad_norm": 33.25, + "kl": 0.0, + "learning_rate": 9.072791303203986e-06, + "logits/chosen": 973218048.0, + "logits/rejected": 686268544.0, + "logps/chosen": -348.62353515625, + "logps/rejected": -655.812255859375, + "loss": 0.0779, + "rewards/chosen": 2.712787310282389, + "rewards/margins": 13.392759005228678, + "rewards/rejected": -10.679971694946289, + "step": 2173 + }, + { + "epoch": 0.19862951119232528, + "grad_norm": 4.84375, + "kl": 0.0, + "learning_rate": 9.071957093372107e-06, + "logits/chosen": 521849984.0, + "logits/rejected": 361524512.0, + "logps/chosen": -427.4443359375, + "logps/rejected": -484.9982604980469, + "loss": 0.0354, + "rewards/chosen": 3.2773971557617188, + "rewards/margins": 11.71725845336914, + "rewards/rejected": -8.439861297607422, + "step": 2174 + }, + { + "epoch": 0.19872087711283692, + "grad_norm": 1.71875, + "kl": 0.0, + "learning_rate": 9.071122546827102e-06, + "logits/chosen": 427711829.3333333, + "logits/rejected": 520664576.0, + "logps/chosen": -262.5182291666667, + "logps/rejected": -678.629443359375, + "loss": 0.0082, + "rewards/chosen": 4.03155517578125, + "rewards/margins": 12.362934875488282, + "rewards/rejected": -8.331379699707032, + "step": 2175 + }, + { + "epoch": 0.19881224303334857, + "grad_norm": 3.28125, + "kl": 0.0, + "learning_rate": 9.070287663637984e-06, + "logits/chosen": 382318336.0, + "logits/rejected": 1027087360.0, + "logps/chosen": -228.8629353841146, + "logps/rejected": -703.7048828125, + "loss": 0.0155, + "rewards/chosen": 3.8609434763590493, + "rewards/margins": 12.036569086710612, + "rewards/rejected": -8.175625610351563, + "step": 2176 + }, + { + "epoch": 0.19890360895386022, + "grad_norm": 6.5, + "kl": 0.0, + "learning_rate": 9.069452443873788e-06, + "logits/chosen": 539661107.2, + "logits/rejected": 429909888.0, + "logps/chosen": -369.5465576171875, + "logps/rejected": -509.8731282552083, + "loss": 0.0369, + "rewards/chosen": 2.916822052001953, + "rewards/margins": 14.13543217976888, + "rewards/rejected": -11.218610127766928, + "step": 2177 + }, + { + "epoch": 0.19899497487437187, + "grad_norm": 5.0, + "kl": 0.0, + "learning_rate": 9.068616887603578e-06, + "logits/chosen": 497212544.0, + "logits/rejected": 454122086.4, + "logps/chosen": -341.9127604166667, + "logps/rejected": -338.7964599609375, + "loss": 0.0335, + "rewards/chosen": 2.5046380360921225, + "rewards/margins": 9.193449529012044, + "rewards/rejected": -6.688811492919922, + "step": 2178 + }, + { + "epoch": 0.19908634079488352, + "grad_norm": 3.4375, + "kl": 0.0, + "learning_rate": 9.06778099489645e-06, + "logits/chosen": 785910272.0, + "logits/rejected": 530896896.0, + "logps/chosen": -359.5294189453125, + "logps/rejected": -426.23455810546875, + "loss": 0.0327, + "rewards/chosen": 3.366413116455078, + "rewards/margins": 10.767287254333496, + "rewards/rejected": -7.400874137878418, + "step": 2179 + }, + { + "epoch": 0.19917770671539517, + "grad_norm": 6.125, + "kl": 3.050382614135742, + "learning_rate": 9.066944765821522e-06, + "logits/chosen": 741367091.2, + "logits/rejected": 548775082.6666666, + "logps/chosen": -315.6773681640625, + "logps/rejected": -717.8063151041666, + "loss": 0.0604, + "rewards/chosen": 2.886402893066406, + "rewards/margins": 10.224018859863282, + "rewards/rejected": -7.337615966796875, + "step": 2180 + }, + { + "epoch": 0.19926907263590682, + "grad_norm": 12.75, + "kl": 0.0, + "learning_rate": 9.066108200447941e-06, + "logits/chosen": 795812288.0, + "logits/rejected": 425684416.0, + "logps/chosen": -372.245849609375, + "logps/rejected": -402.8757629394531, + "loss": 0.0273, + "rewards/chosen": 3.8908300399780273, + "rewards/margins": 10.467111587524414, + "rewards/rejected": -6.576281547546387, + "step": 2181 + }, + { + "epoch": 0.19936043855641847, + "grad_norm": 3.765625, + "kl": 0.0, + "learning_rate": 9.065271298844886e-06, + "logits/chosen": 505714176.0, + "logits/rejected": 526065024.0, + "logps/chosen": -432.0646565755208, + "logps/rejected": -480.17266845703125, + "loss": 0.023, + "rewards/chosen": 3.9217211405436196, + "rewards/margins": 12.98628012339274, + "rewards/rejected": -9.064558982849121, + "step": 2182 + }, + { + "epoch": 0.19945180447693012, + "grad_norm": 1.8359375, + "kl": 0.0, + "learning_rate": 9.064434061081562e-06, + "logits/chosen": 926956864.0, + "logits/rejected": 719735637.3333334, + "logps/chosen": -350.98046875, + "logps/rejected": -500.6924641927083, + "loss": 0.009, + "rewards/chosen": 3.512472629547119, + "rewards/margins": 10.56696367263794, + "rewards/rejected": -7.05449104309082, + "step": 2183 + }, + { + "epoch": 0.19954317039744177, + "grad_norm": 9.125, + "kl": 0.0, + "learning_rate": 9.063596487227197e-06, + "logits/chosen": 404359509.3333333, + "logits/rejected": 535785369.6, + "logps/chosen": -199.4324747721354, + "logps/rejected": -395.790234375, + "loss": 0.0485, + "rewards/chosen": 2.2191330591837564, + "rewards/margins": 9.4879425684611, + "rewards/rejected": -7.268809509277344, + "step": 2184 + }, + { + "epoch": 0.19963453631795341, + "grad_norm": 4.6875, + "kl": 0.0, + "learning_rate": 9.062758577351051e-06, + "logits/chosen": 758606336.0, + "logits/rejected": 861430681.6, + "logps/chosen": -375.03564453125, + "logps/rejected": -570.12490234375, + "loss": 0.0304, + "rewards/chosen": 2.887126922607422, + "rewards/margins": 11.64731216430664, + "rewards/rejected": -8.760185241699219, + "step": 2185 + }, + { + "epoch": 0.19972590223846506, + "grad_norm": 5.5, + "kl": 0.0, + "learning_rate": 9.061920331522417e-06, + "logits/chosen": 724523520.0, + "logits/rejected": 736641024.0, + "logps/chosen": -386.84189453125, + "logps/rejected": -833.8603515625, + "loss": 0.0285, + "rewards/chosen": 3.8484424591064452, + "rewards/margins": 16.180927658081053, + "rewards/rejected": -12.33248519897461, + "step": 2186 + }, + { + "epoch": 0.1998172681589767, + "grad_norm": 10.5, + "kl": 0.0, + "learning_rate": 9.061081749810604e-06, + "logits/chosen": 316525994.6666667, + "logits/rejected": 356123417.6, + "logps/chosen": -402.8846842447917, + "logps/rejected": -530.8330078125, + "loss": 0.0328, + "rewards/chosen": 3.465369542439779, + "rewards/margins": 10.004543431599934, + "rewards/rejected": -6.539173889160156, + "step": 2187 + }, + { + "epoch": 0.19990863407948836, + "grad_norm": 4.65625, + "kl": 0.0, + "learning_rate": 9.060242832284958e-06, + "logits/chosen": 748825216.0, + "logits/rejected": 1218042368.0, + "logps/chosen": -305.02337646484375, + "logps/rejected": -528.396728515625, + "loss": 0.0316, + "rewards/chosen": 2.8491644859313965, + "rewards/margins": 11.21372365951538, + "rewards/rejected": -8.364559173583984, + "step": 2188 + }, + { + "epoch": 0.2, + "grad_norm": 17.125, + "kl": 0.0, + "learning_rate": 9.059403579014846e-06, + "logits/chosen": 354573952.0, + "logits/rejected": 974819737.6, + "logps/chosen": -325.7010904947917, + "logps/rejected": -476.0466796875, + "loss": 0.0643, + "rewards/chosen": 3.303598086039225, + "rewards/margins": 9.152424303690593, + "rewards/rejected": -5.848826217651367, + "step": 2189 + }, + { + "epoch": 0.20009136592051166, + "grad_norm": 4.53125, + "kl": 0.0, + "learning_rate": 9.058563990069674e-06, + "logits/chosen": 588569408.0, + "logits/rejected": 340687530.6666667, + "logps/chosen": -191.78065490722656, + "logps/rejected": -332.6413167317708, + "loss": 0.0204, + "rewards/chosen": 3.1195950508117676, + "rewards/margins": 9.847901503245037, + "rewards/rejected": -6.7283064524332685, + "step": 2190 + }, + { + "epoch": 0.2001827318410233, + "grad_norm": 2.015625, + "kl": 0.0, + "learning_rate": 9.05772406551886e-06, + "logits/chosen": 862080000.0, + "logits/rejected": 635144192.0, + "logps/chosen": -193.61641438802084, + "logps/rejected": -531.89228515625, + "loss": 0.0098, + "rewards/chosen": 4.068603515625, + "rewards/margins": 13.127061462402343, + "rewards/rejected": -9.058457946777343, + "step": 2191 + }, + { + "epoch": 0.20027409776153496, + "grad_norm": 28.25, + "kl": 0.0, + "learning_rate": 9.056883805431862e-06, + "logits/chosen": 437923840.0, + "logits/rejected": 561999744.0, + "logps/chosen": -188.5567626953125, + "logps/rejected": -443.4388020833333, + "loss": 0.1348, + "rewards/chosen": 2.408675765991211, + "rewards/margins": 11.789673741658529, + "rewards/rejected": -9.380997975667318, + "step": 2192 + }, + { + "epoch": 0.2003654636820466, + "grad_norm": 22.125, + "kl": 0.0, + "learning_rate": 9.056043209878161e-06, + "logits/chosen": 656317909.3333334, + "logits/rejected": 1788527360.0, + "logps/chosen": -335.39719645182294, + "logps/rejected": -410.49322509765625, + "loss": 0.0624, + "rewards/chosen": 3.2912912368774414, + "rewards/margins": 11.339964866638184, + "rewards/rejected": -8.048673629760742, + "step": 2193 + }, + { + "epoch": 0.20045682960255826, + "grad_norm": 3.9375, + "kl": 0.0, + "learning_rate": 9.055202278927267e-06, + "logits/chosen": 522786816.0, + "logits/rejected": 878429900.8, + "logps/chosen": -425.6607259114583, + "logps/rejected": -390.089013671875, + "loss": 0.0156, + "rewards/chosen": 3.586432774861654, + "rewards/margins": 10.480495580037434, + "rewards/rejected": -6.894062805175781, + "step": 2194 + }, + { + "epoch": 0.2005481955230699, + "grad_norm": 3.484375, + "kl": 0.0, + "learning_rate": 9.054361012648715e-06, + "logits/chosen": 500326741.3333333, + "logits/rejected": 585208934.4, + "logps/chosen": -264.2853597005208, + "logps/rejected": -389.32626953125, + "loss": 0.0161, + "rewards/chosen": 3.6263910929361978, + "rewards/margins": 11.060811869303386, + "rewards/rejected": -7.434420776367188, + "step": 2195 + }, + { + "epoch": 0.20063956144358155, + "grad_norm": 2.9375, + "kl": 0.0, + "learning_rate": 9.053519411112075e-06, + "logits/chosen": 628448972.8, + "logits/rejected": 195290560.0, + "logps/chosen": -339.714990234375, + "logps/rejected": -242.70157877604166, + "loss": 0.0228, + "rewards/chosen": 3.948281097412109, + "rewards/margins": 10.332248814900716, + "rewards/rejected": -6.3839677174886065, + "step": 2196 + }, + { + "epoch": 0.2007309273640932, + "grad_norm": 2.84375, + "kl": 0.0, + "learning_rate": 9.052677474386936e-06, + "logits/chosen": 582009600.0, + "logits/rejected": 561803648.0, + "logps/chosen": -271.18408203125, + "logps/rejected": -414.24871826171875, + "loss": 0.0259, + "rewards/chosen": 3.9998413721720376, + "rewards/margins": 11.776917139689127, + "rewards/rejected": -7.77707576751709, + "step": 2197 + }, + { + "epoch": 0.20082229328460485, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 9.051835202542916e-06, + "logits/chosen": 843325866.6666666, + "logits/rejected": 617168896.0, + "logps/chosen": -326.8472900390625, + "logps/rejected": -406.8541259765625, + "loss": 0.1248, + "rewards/chosen": 2.8397528330485025, + "rewards/margins": 11.914686838785807, + "rewards/rejected": -9.074934005737305, + "step": 2198 + }, + { + "epoch": 0.2009136592051165, + "grad_norm": 20.75, + "kl": 0.0, + "learning_rate": 9.050992595649667e-06, + "logits/chosen": 623687680.0, + "logits/rejected": 898405785.6, + "logps/chosen": -334.5284423828125, + "logps/rejected": -350.212060546875, + "loss": 0.0895, + "rewards/chosen": 1.404971440633138, + "rewards/margins": 8.145011266072592, + "rewards/rejected": -6.740039825439453, + "step": 2199 + }, + { + "epoch": 0.20100502512562815, + "grad_norm": 1.5390625, + "kl": 0.0, + "learning_rate": 9.050149653776865e-06, + "logits/chosen": 855898560.0, + "logits/rejected": 623648320.0, + "logps/chosen": -397.29351806640625, + "logps/rejected": -523.8399047851562, + "loss": 0.0089, + "rewards/chosen": 4.449446201324463, + "rewards/margins": 11.069226741790771, + "rewards/rejected": -6.619780540466309, + "step": 2200 + }, + { + "epoch": 0.2010963910461398, + "grad_norm": 5.78125, + "kl": 0.0, + "learning_rate": 9.049306376994212e-06, + "logits/chosen": 672321228.8, + "logits/rejected": 561185834.6666666, + "logps/chosen": -384.580322265625, + "logps/rejected": -434.3458658854167, + "loss": 0.0389, + "rewards/chosen": 2.8854848861694338, + "rewards/margins": 11.6049342473348, + "rewards/rejected": -8.719449361165365, + "step": 2201 + }, + { + "epoch": 0.20118775696665145, + "grad_norm": 3.0625, + "kl": 0.0, + "learning_rate": 9.048462765371437e-06, + "logits/chosen": 518231509.3333333, + "logits/rejected": 654701721.6, + "logps/chosen": -379.09521484375, + "logps/rejected": -485.82939453125, + "loss": 0.014, + "rewards/chosen": 4.043622334798177, + "rewards/margins": 14.357717641194661, + "rewards/rejected": -10.314095306396485, + "step": 2202 + }, + { + "epoch": 0.2012791228871631, + "grad_norm": 15.0, + "kl": 0.0, + "learning_rate": 9.047618818978303e-06, + "logits/chosen": 1109867648.0, + "logits/rejected": 559706752.0, + "logps/chosen": -621.6683959960938, + "logps/rejected": -360.99951171875, + "loss": 0.0537, + "rewards/chosen": 2.9119834899902344, + "rewards/margins": 10.610058784484863, + "rewards/rejected": -7.698075294494629, + "step": 2203 + }, + { + "epoch": 0.20137048880767475, + "grad_norm": 5.40625, + "kl": 0.0, + "learning_rate": 9.046774537884594e-06, + "logits/chosen": 661493811.2, + "logits/rejected": 395167232.0, + "logps/chosen": -301.210986328125, + "logps/rejected": -437.18603515625, + "loss": 0.0398, + "rewards/chosen": 2.9642818450927733, + "rewards/margins": 12.151868311564126, + "rewards/rejected": -9.187586466471354, + "step": 2204 + }, + { + "epoch": 0.2014618547281864, + "grad_norm": 22.875, + "kl": 0.0, + "learning_rate": 9.045929922160125e-06, + "logits/chosen": 534149728.0, + "logits/rejected": 346214784.0, + "logps/chosen": -320.0118103027344, + "logps/rejected": -352.9752197265625, + "loss": 0.2129, + "rewards/chosen": 2.4476661682128906, + "rewards/margins": 6.600729306538899, + "rewards/rejected": -4.153063138326009, + "step": 2205 + }, + { + "epoch": 0.20155322064869804, + "grad_norm": 8.875, + "kl": 0.0, + "learning_rate": 9.045084971874738e-06, + "logits/chosen": 416071628.8, + "logits/rejected": 731784021.3333334, + "logps/chosen": -223.999169921875, + "logps/rejected": -969.1920572916666, + "loss": 0.0299, + "rewards/chosen": 3.684754180908203, + "rewards/margins": 17.478272247314454, + "rewards/rejected": -13.79351806640625, + "step": 2206 + }, + { + "epoch": 0.2016445865692097, + "grad_norm": 3.203125, + "kl": 0.0, + "learning_rate": 9.044239687098303e-06, + "logits/chosen": 575738368.0, + "logits/rejected": 414788778.6666667, + "logps/chosen": -316.152685546875, + "logps/rejected": -492.5152180989583, + "loss": 0.0187, + "rewards/chosen": 3.7174942016601564, + "rewards/margins": 12.170015207926433, + "rewards/rejected": -8.452521006266275, + "step": 2207 + }, + { + "epoch": 0.20173595248972134, + "grad_norm": 5.0, + "kl": 0.0, + "learning_rate": 9.043394067900715e-06, + "logits/chosen": 596210892.8, + "logits/rejected": 712013056.0, + "logps/chosen": -373.5375, + "logps/rejected": -541.5161946614584, + "loss": 0.0369, + "rewards/chosen": 2.853664779663086, + "rewards/margins": 11.66887067159017, + "rewards/rejected": -8.815205891927084, + "step": 2208 + }, + { + "epoch": 0.201827318410233, + "grad_norm": 3.578125, + "kl": 0.0, + "learning_rate": 9.0425481143519e-06, + "logits/chosen": 440810048.0, + "logits/rejected": 523379370.6666667, + "logps/chosen": -170.18157958984375, + "logps/rejected": -391.0331217447917, + "loss": 0.0184, + "rewards/chosen": 3.571258544921875, + "rewards/margins": 10.498316446940105, + "rewards/rejected": -6.9270579020182295, + "step": 2209 + }, + { + "epoch": 0.20191868433074464, + "grad_norm": 2.640625, + "kl": 0.0, + "learning_rate": 9.041701826521814e-06, + "logits/chosen": 448626261.3333333, + "logits/rejected": 561367654.4, + "logps/chosen": -331.55763753255206, + "logps/rejected": -508.3779296875, + "loss": 0.0155, + "rewards/chosen": 3.214512825012207, + "rewards/margins": 11.882732582092284, + "rewards/rejected": -8.668219757080077, + "step": 2210 + }, + { + "epoch": 0.2020100502512563, + "grad_norm": 3.5, + "kl": 0.0, + "learning_rate": 9.040855204480432e-06, + "logits/chosen": 417841056.0, + "logits/rejected": 661788800.0, + "logps/chosen": -311.7205505371094, + "logps/rejected": -708.3712158203125, + "loss": 0.0186, + "rewards/chosen": 4.0189619064331055, + "rewards/margins": 12.403244972229004, + "rewards/rejected": -8.384283065795898, + "step": 2211 + }, + { + "epoch": 0.20210141617176794, + "grad_norm": 4.625, + "kl": 0.0, + "learning_rate": 9.040008248297764e-06, + "logits/chosen": 591439360.0, + "logits/rejected": 1509933184.0, + "logps/chosen": -337.6053466796875, + "logps/rejected": -722.5712280273438, + "loss": 0.0365, + "rewards/chosen": 3.3289047876993814, + "rewards/margins": 11.9526735941569, + "rewards/rejected": -8.62376880645752, + "step": 2212 + }, + { + "epoch": 0.2021927820922796, + "grad_norm": 1.9765625, + "kl": 0.0, + "learning_rate": 9.039160958043847e-06, + "logits/chosen": 596146816.0, + "logits/rejected": 656086656.0, + "logps/chosen": -326.302734375, + "logps/rejected": -515.4429931640625, + "loss": 0.0144, + "rewards/chosen": 3.8616132736206055, + "rewards/margins": 10.400197982788086, + "rewards/rejected": -6.5385847091674805, + "step": 2213 + }, + { + "epoch": 0.20228414801279124, + "grad_norm": 7.4375, + "kl": 0.0, + "learning_rate": 9.038313333788742e-06, + "logits/chosen": 1199071488.0, + "logits/rejected": 658922432.0, + "logps/chosen": -280.1096496582031, + "logps/rejected": -417.64923095703125, + "loss": 0.0556, + "rewards/chosen": 2.6551623344421387, + "rewards/margins": 9.969295501708984, + "rewards/rejected": -7.314133167266846, + "step": 2214 + }, + { + "epoch": 0.2023755139333029, + "grad_norm": 2.640625, + "kl": 0.0, + "learning_rate": 9.03746537560254e-06, + "logits/chosen": 642825676.8, + "logits/rejected": 392527317.3333333, + "logps/chosen": -189.2282470703125, + "logps/rejected": -303.97589111328125, + "loss": 0.1418, + "rewards/chosen": 2.000371742248535, + "rewards/margins": 9.451160875956218, + "rewards/rejected": -7.450789133707683, + "step": 2215 + }, + { + "epoch": 0.20246687985381454, + "grad_norm": 31.625, + "kl": 0.0, + "learning_rate": 9.036617083555357e-06, + "logits/chosen": 670612736.0, + "logits/rejected": 425590613.3333333, + "logps/chosen": -312.054248046875, + "logps/rejected": -474.2110188802083, + "loss": 0.0701, + "rewards/chosen": 2.8980030059814452, + "rewards/margins": 12.627993647257487, + "rewards/rejected": -9.729990641276041, + "step": 2216 + }, + { + "epoch": 0.20255824577432618, + "grad_norm": 6.84375, + "kl": 0.0, + "learning_rate": 9.035768457717343e-06, + "logits/chosen": 529644714.6666667, + "logits/rejected": 412312243.2, + "logps/chosen": -277.5644938151042, + "logps/rejected": -368.445166015625, + "loss": 0.1092, + "rewards/chosen": 1.9297566413879395, + "rewards/margins": 9.30790605545044, + "rewards/rejected": -7.3781494140625, + "step": 2217 + }, + { + "epoch": 0.20264961169483783, + "grad_norm": 4.6875, + "kl": 0.0, + "learning_rate": 9.03491949815867e-06, + "logits/chosen": 755574016.0, + "logits/rejected": 411596064.0, + "logps/chosen": -453.215087890625, + "logps/rejected": -373.52215576171875, + "loss": 0.0272, + "rewards/chosen": 3.0669608116149902, + "rewards/margins": 9.43506383895874, + "rewards/rejected": -6.36810302734375, + "step": 2218 + }, + { + "epoch": 0.20274097761534948, + "grad_norm": 33.25, + "kl": 0.0, + "learning_rate": 9.034070204949539e-06, + "logits/chosen": 844486741.3333334, + "logits/rejected": 986116300.8, + "logps/chosen": -396.9256998697917, + "logps/rejected": -591.027099609375, + "loss": 0.0444, + "rewards/chosen": 2.2167771657307944, + "rewards/margins": 10.616456731160483, + "rewards/rejected": -8.399679565429688, + "step": 2219 + }, + { + "epoch": 0.20283234353586113, + "grad_norm": 33.0, + "kl": 0.0, + "learning_rate": 9.033220578160178e-06, + "logits/chosen": 475292416.0, + "logits/rejected": 432570453.3333333, + "logps/chosen": -320.779833984375, + "logps/rejected": -427.4863688151042, + "loss": 0.065, + "rewards/chosen": 2.754698944091797, + "rewards/margins": 11.593871307373046, + "rewards/rejected": -8.83917236328125, + "step": 2220 + }, + { + "epoch": 0.20292370945637278, + "grad_norm": 29.75, + "kl": 0.0, + "learning_rate": 9.032370617860844e-06, + "logits/chosen": 519781674.6666667, + "logits/rejected": 461411968.0, + "logps/chosen": -353.3313395182292, + "logps/rejected": -418.3949890136719, + "loss": 0.1772, + "rewards/chosen": 1.6322182019551594, + "rewards/margins": 8.202400048573812, + "rewards/rejected": -6.570181846618652, + "step": 2221 + }, + { + "epoch": 0.20301507537688443, + "grad_norm": 1.515625, + "kl": 0.0, + "learning_rate": 9.03152032412182e-06, + "logits/chosen": 2159131136.0, + "logits/rejected": 567947958.8571428, + "logps/chosen": -211.203369140625, + "logps/rejected": -491.65457589285717, + "loss": 0.0079, + "rewards/chosen": 3.2618865966796875, + "rewards/margins": 11.764849526541573, + "rewards/rejected": -8.502962929861885, + "step": 2222 + }, + { + "epoch": 0.20310644129739608, + "grad_norm": 1.8359375, + "kl": 0.0, + "learning_rate": 9.030669697013419e-06, + "logits/chosen": 664598272.0, + "logits/rejected": 604218794.6666666, + "logps/chosen": -517.0189208984375, + "logps/rejected": -266.7750244140625, + "loss": 0.0073, + "rewards/chosen": 4.1755218505859375, + "rewards/margins": 10.516031901041668, + "rewards/rejected": -6.3405100504557295, + "step": 2223 + }, + { + "epoch": 0.20319780721790773, + "grad_norm": 25.5, + "kl": 0.0, + "learning_rate": 9.029818736605979e-06, + "logits/chosen": 707112448.0, + "logits/rejected": 492049834.6666667, + "logps/chosen": -279.6560791015625, + "logps/rejected": -256.973388671875, + "loss": 0.0482, + "rewards/chosen": 3.25491943359375, + "rewards/margins": 10.247564697265625, + "rewards/rejected": -6.992645263671875, + "step": 2224 + }, + { + "epoch": 0.20328917313841938, + "grad_norm": 39.5, + "kl": 0.0, + "learning_rate": 9.028967442969867e-06, + "logits/chosen": 638419865.6, + "logits/rejected": 353145301.3333333, + "logps/chosen": -352.33564453125, + "logps/rejected": -334.5922037760417, + "loss": 0.0651, + "rewards/chosen": 2.730303955078125, + "rewards/margins": 9.715972137451171, + "rewards/rejected": -6.985668182373047, + "step": 2225 + }, + { + "epoch": 0.20338053905893103, + "grad_norm": 7.4375, + "kl": 0.0, + "learning_rate": 9.028115816175476e-06, + "logits/chosen": 617662976.0, + "logits/rejected": 833325482.6666666, + "logps/chosen": -272.156591796875, + "logps/rejected": -998.552001953125, + "loss": 0.0395, + "rewards/chosen": 3.0422534942626953, + "rewards/margins": 14.767943700154623, + "rewards/rejected": -11.725690205891928, + "step": 2226 + }, + { + "epoch": 0.20347190497944267, + "grad_norm": 2.15625, + "kl": 0.0, + "learning_rate": 9.027263856293228e-06, + "logits/chosen": 480571136.0, + "logits/rejected": 457764864.0, + "logps/chosen": -382.77568359375, + "logps/rejected": -561.8326416015625, + "loss": 0.0142, + "rewards/chosen": 4.15621337890625, + "rewards/margins": 14.586737569173177, + "rewards/rejected": -10.430524190266928, + "step": 2227 + }, + { + "epoch": 0.20356327089995432, + "grad_norm": 6.15625, + "kl": 0.0, + "learning_rate": 9.026411563393574e-06, + "logits/chosen": 451262976.0, + "logits/rejected": 439810176.0, + "logps/chosen": -299.23525390625, + "logps/rejected": -242.26106770833334, + "loss": 0.0397, + "rewards/chosen": 3.0735542297363283, + "rewards/margins": 9.37844009399414, + "rewards/rejected": -6.3048858642578125, + "step": 2228 + }, + { + "epoch": 0.20365463682046597, + "grad_norm": 13.0, + "kl": 0.0, + "learning_rate": 9.025558937546987e-06, + "logits/chosen": 369219925.3333333, + "logits/rejected": 553257472.0, + "logps/chosen": -310.0091959635417, + "logps/rejected": -483.9578125, + "loss": 0.049, + "rewards/chosen": 3.7057984670003257, + "rewards/margins": 9.733083470662436, + "rewards/rejected": -6.02728500366211, + "step": 2229 + }, + { + "epoch": 0.20374600274097762, + "grad_norm": 4.46875, + "kl": 0.0, + "learning_rate": 9.024705978823975e-06, + "logits/chosen": 678408746.6666666, + "logits/rejected": 1150257356.8, + "logps/chosen": -344.8812662760417, + "logps/rejected": -575.177880859375, + "loss": 0.0251, + "rewards/chosen": 2.8397254943847656, + "rewards/margins": 10.449488067626953, + "rewards/rejected": -7.609762573242188, + "step": 2230 + }, + { + "epoch": 0.20383736866148927, + "grad_norm": 24.5, + "kl": 0.0, + "learning_rate": 9.023852687295067e-06, + "logits/chosen": 413522304.0, + "logits/rejected": 647356800.0, + "logps/chosen": -193.90322875976562, + "logps/rejected": -304.7310791015625, + "loss": 0.0895, + "rewards/chosen": 2.7583749294281006, + "rewards/margins": 9.677181005477905, + "rewards/rejected": -6.918806076049805, + "step": 2231 + }, + { + "epoch": 0.20392873458200092, + "grad_norm": 8.0625, + "kl": 0.0, + "learning_rate": 9.022999063030826e-06, + "logits/chosen": 453545625.6, + "logits/rejected": 259417728.0, + "logps/chosen": -223.55615234375, + "logps/rejected": -242.85298665364584, + "loss": 0.0757, + "rewards/chosen": 2.8022823333740234, + "rewards/margins": 6.674560546875, + "rewards/rejected": -3.8722782135009766, + "step": 2232 + }, + { + "epoch": 0.20402010050251257, + "grad_norm": 13.5, + "kl": 0.0, + "learning_rate": 9.022145106101833e-06, + "logits/chosen": 456332160.0, + "logits/rejected": 349884192.0, + "logps/chosen": -296.535400390625, + "logps/rejected": -391.5511779785156, + "loss": 0.0503, + "rewards/chosen": 2.820406436920166, + "rewards/margins": 9.51181173324585, + "rewards/rejected": -6.691405296325684, + "step": 2233 + }, + { + "epoch": 0.20411146642302422, + "grad_norm": 2.1875, + "kl": 0.0, + "learning_rate": 9.021290816578708e-06, + "logits/chosen": 568220160.0, + "logits/rejected": 426952960.0, + "logps/chosen": -241.17867024739584, + "logps/rejected": -427.498291015625, + "loss": 0.0217, + "rewards/chosen": 3.925077438354492, + "rewards/margins": 13.648029327392578, + "rewards/rejected": -9.722951889038086, + "step": 2234 + }, + { + "epoch": 0.20420283234353587, + "grad_norm": 9.1875, + "kl": 0.0, + "learning_rate": 9.02043619453209e-06, + "logits/chosen": 806035584.0, + "logits/rejected": 579150043.4285715, + "logps/chosen": -338.48248291015625, + "logps/rejected": -505.64132254464283, + "loss": 0.0333, + "rewards/chosen": 3.998666524887085, + "rewards/margins": 10.266642468316213, + "rewards/rejected": -6.267975943429129, + "step": 2235 + }, + { + "epoch": 0.20429419826404752, + "grad_norm": 7.8125, + "kl": 0.0, + "learning_rate": 9.01958124003265e-06, + "logits/chosen": 390867968.0, + "logits/rejected": 373834316.8, + "logps/chosen": -294.4878743489583, + "logps/rejected": -275.005322265625, + "loss": 0.0623, + "rewards/chosen": 2.618525187174479, + "rewards/margins": 9.291723124186198, + "rewards/rejected": -6.673197937011719, + "step": 2236 + }, + { + "epoch": 0.20438556418455917, + "grad_norm": 3.25, + "kl": 0.0, + "learning_rate": 9.018725953151082e-06, + "logits/chosen": 429577728.0, + "logits/rejected": 324941363.2, + "logps/chosen": -303.07993570963544, + "logps/rejected": -431.58212890625, + "loss": 0.0186, + "rewards/chosen": 3.2212441762288413, + "rewards/margins": 11.643293126424155, + "rewards/rejected": -8.422048950195313, + "step": 2237 + }, + { + "epoch": 0.20447693010507081, + "grad_norm": 9.8125, + "kl": 0.0, + "learning_rate": 9.017870333958113e-06, + "logits/chosen": 287165248.0, + "logits/rejected": 592834560.0, + "logps/chosen": -173.45233154296875, + "logps/rejected": -494.06180245535717, + "loss": 0.0346, + "rewards/chosen": 4.407824993133545, + "rewards/margins": 9.982580525534495, + "rewards/rejected": -5.574755532400949, + "step": 2238 + }, + { + "epoch": 0.20456829602558246, + "grad_norm": 5.5625, + "kl": 0.0, + "learning_rate": 9.017014382524494e-06, + "logits/chosen": 414084992.0, + "logits/rejected": 712636544.0, + "logps/chosen": -280.839599609375, + "logps/rejected": -563.0499877929688, + "loss": 0.0194, + "rewards/chosen": 4.087873458862305, + "rewards/margins": 10.420270442962646, + "rewards/rejected": -6.332396984100342, + "step": 2239 + }, + { + "epoch": 0.2046596619460941, + "grad_norm": 4.1875, + "kl": 0.0, + "learning_rate": 9.016158098921002e-06, + "logits/chosen": 701889638.4, + "logits/rejected": 1396626176.0, + "logps/chosen": -325.7649169921875, + "logps/rejected": -508.9501953125, + "loss": 0.0203, + "rewards/chosen": 4.044583892822265, + "rewards/margins": 11.098421351114908, + "rewards/rejected": -7.0538374582926435, + "step": 2240 + }, + { + "epoch": 0.20475102786660576, + "grad_norm": 3.328125, + "kl": 0.0, + "learning_rate": 9.015301483218448e-06, + "logits/chosen": 579415210.6666666, + "logits/rejected": 542074163.2, + "logps/chosen": -267.2599690755208, + "logps/rejected": -479.11572265625, + "loss": 0.0198, + "rewards/chosen": 3.247471491495768, + "rewards/margins": 11.000607172648111, + "rewards/rejected": -7.753135681152344, + "step": 2241 + }, + { + "epoch": 0.2048423937871174, + "grad_norm": 5.53125, + "kl": 0.0, + "learning_rate": 9.014444535487663e-06, + "logits/chosen": 719087762.2857143, + "logits/rejected": 506817152.0, + "logps/chosen": -304.84835379464283, + "logps/rejected": -850.2977294921875, + "loss": 0.0408, + "rewards/chosen": 3.466814858572824, + "rewards/margins": 21.013317925589426, + "rewards/rejected": -17.5465030670166, + "step": 2242 + }, + { + "epoch": 0.20493375970762906, + "grad_norm": 4.875, + "kl": 0.0, + "learning_rate": 9.013587255799509e-06, + "logits/chosen": 700417024.0, + "logits/rejected": 1032033877.3333334, + "logps/chosen": -367.8732177734375, + "logps/rejected": -441.7622884114583, + "loss": 0.0387, + "rewards/chosen": 2.9014156341552733, + "rewards/margins": 9.617836125691731, + "rewards/rejected": -6.716420491536458, + "step": 2243 + }, + { + "epoch": 0.2050251256281407, + "grad_norm": 7.4375, + "kl": 0.0, + "learning_rate": 9.012729644224878e-06, + "logits/chosen": 689643477.3333334, + "logits/rejected": 438594496.0, + "logps/chosen": -351.3858642578125, + "logps/rejected": -487.4221496582031, + "loss": 0.0421, + "rewards/chosen": 3.3718687693277993, + "rewards/margins": 15.287251154581705, + "rewards/rejected": -11.915382385253906, + "step": 2244 + }, + { + "epoch": 0.20511649154865236, + "grad_norm": 10.75, + "kl": 0.0, + "learning_rate": 9.011871700834683e-06, + "logits/chosen": 851642880.0, + "logits/rejected": 537345472.0, + "logps/chosen": -241.61940002441406, + "logps/rejected": -422.9793701171875, + "loss": 0.1159, + "rewards/chosen": 3.030142307281494, + "rewards/margins": 8.525490760803223, + "rewards/rejected": -5.4953484535217285, + "step": 2245 + }, + { + "epoch": 0.205207857469164, + "grad_norm": 11.1875, + "kl": 0.0, + "learning_rate": 9.011013425699868e-06, + "logits/chosen": 835400499.2, + "logits/rejected": 1142825301.3333333, + "logps/chosen": -438.5337890625, + "logps/rejected": -506.751220703125, + "loss": 0.0525, + "rewards/chosen": 2.716484260559082, + "rewards/margins": 11.744582049051921, + "rewards/rejected": -9.028097788492838, + "step": 2246 + }, + { + "epoch": 0.20529922338967566, + "grad_norm": 1.765625, + "kl": 0.0, + "learning_rate": 9.010154818891407e-06, + "logits/chosen": 218896992.0, + "logits/rejected": 304705621.3333333, + "logps/chosen": -210.61312866210938, + "logps/rejected": -319.7477620442708, + "loss": 0.0076, + "rewards/chosen": 3.4855377674102783, + "rewards/margins": 12.799319505691528, + "rewards/rejected": -9.31378173828125, + "step": 2247 + }, + { + "epoch": 0.2053905893101873, + "grad_norm": 8.625, + "kl": 0.0, + "learning_rate": 9.009295880480296e-06, + "logits/chosen": 679334656.0, + "logits/rejected": 827362688.0, + "logps/chosen": -270.54254150390625, + "logps/rejected": -570.33154296875, + "loss": 0.0437, + "rewards/chosen": 2.9835140705108643, + "rewards/margins": 12.243399858474731, + "rewards/rejected": -9.259885787963867, + "step": 2248 + }, + { + "epoch": 0.20548195523069895, + "grad_norm": 9.3125, + "kl": 0.0, + "learning_rate": 9.008436610537562e-06, + "logits/chosen": 580180437.3333334, + "logits/rejected": 480234304.0, + "logps/chosen": -376.0857747395833, + "logps/rejected": -528.6236572265625, + "loss": 0.0611, + "rewards/chosen": 2.6434431076049805, + "rewards/margins": 13.369542121887207, + "rewards/rejected": -10.726099014282227, + "step": 2249 + }, + { + "epoch": 0.2055733211512106, + "grad_norm": 2.65625, + "kl": 0.0, + "learning_rate": 9.007577009134263e-06, + "logits/chosen": 313557312.0, + "logits/rejected": 371110656.0, + "logps/chosen": -160.75057983398438, + "logps/rejected": -522.0870361328125, + "loss": 0.0143, + "rewards/chosen": 3.857961654663086, + "rewards/margins": 13.746384620666504, + "rewards/rejected": -9.888422966003418, + "step": 2250 + }, + { + "epoch": 0.20566468707172225, + "grad_norm": 11.5625, + "kl": 0.0, + "learning_rate": 9.006717076341473e-06, + "logits/chosen": 700802688.0, + "logits/rejected": 407787648.0, + "logps/chosen": -221.1014862060547, + "logps/rejected": -431.1821594238281, + "loss": 0.1042, + "rewards/chosen": 2.948004722595215, + "rewards/margins": 12.226177215576172, + "rewards/rejected": -9.278172492980957, + "step": 2251 + }, + { + "epoch": 0.2057560529922339, + "grad_norm": 2.0625, + "kl": 0.0, + "learning_rate": 9.005856812230304e-06, + "logits/chosen": 756149376.0, + "logits/rejected": 404088768.0, + "logps/chosen": -265.95782470703125, + "logps/rejected": -295.048583984375, + "loss": 0.0113, + "rewards/chosen": 4.0809831619262695, + "rewards/margins": 11.556151390075684, + "rewards/rejected": -7.475168228149414, + "step": 2252 + }, + { + "epoch": 0.20584741891274555, + "grad_norm": 2.265625, + "kl": 0.0, + "learning_rate": 9.004996216871893e-06, + "logits/chosen": 667831082.6666666, + "logits/rejected": 460872806.4, + "logps/chosen": -435.4616292317708, + "logps/rejected": -551.9099609375, + "loss": 0.011, + "rewards/chosen": 3.73332150777181, + "rewards/margins": 12.498043950398763, + "rewards/rejected": -8.764722442626953, + "step": 2253 + }, + { + "epoch": 0.2059387848332572, + "grad_norm": 7.5625, + "kl": 0.0, + "learning_rate": 9.004135290337402e-06, + "logits/chosen": 1082260224.0, + "logits/rejected": 661175168.0, + "logps/chosen": -438.20721435546875, + "logps/rejected": -446.18841552734375, + "loss": 0.027, + "rewards/chosen": 3.9747085571289062, + "rewards/margins": 10.97402048110962, + "rewards/rejected": -6.999311923980713, + "step": 2254 + }, + { + "epoch": 0.20603015075376885, + "grad_norm": 11.25, + "kl": 0.0, + "learning_rate": 9.003274032698021e-06, + "logits/chosen": 523014272.0, + "logits/rejected": 472004192.0, + "logps/chosen": -239.45819091796875, + "logps/rejected": -493.2794494628906, + "loss": 0.0952, + "rewards/chosen": 2.04223903020223, + "rewards/margins": 10.270862420399984, + "rewards/rejected": -8.228623390197754, + "step": 2255 + }, + { + "epoch": 0.2061215166742805, + "grad_norm": 41.75, + "kl": 0.0, + "learning_rate": 9.002412444024969e-06, + "logits/chosen": 506825216.0, + "logits/rejected": 616088405.3333334, + "logps/chosen": -397.33759765625, + "logps/rejected": -687.2068684895834, + "loss": 0.0945, + "rewards/chosen": 3.0086395263671877, + "rewards/margins": 15.126197052001952, + "rewards/rejected": -12.117557525634766, + "step": 2256 + }, + { + "epoch": 0.20621288259479215, + "grad_norm": 4.25, + "kl": 0.0, + "learning_rate": 9.00155052438949e-06, + "logits/chosen": 391868416.0, + "logits/rejected": 454959104.0, + "logps/chosen": -294.56414794921875, + "logps/rejected": -568.94140625, + "loss": 0.02, + "rewards/chosen": 2.6626152992248535, + "rewards/margins": 11.504815260569254, + "rewards/rejected": -8.8421999613444, + "step": 2257 + }, + { + "epoch": 0.2063042485153038, + "grad_norm": 4.53125, + "kl": 0.0, + "learning_rate": 9.00068827386286e-06, + "logits/chosen": 567642624.0, + "logits/rejected": 614308800.0, + "logps/chosen": -279.31113688151044, + "logps/rejected": -481.70587158203125, + "loss": 0.0325, + "rewards/chosen": 3.3160928090413413, + "rewards/margins": 13.612537701924643, + "rewards/rejected": -10.2964448928833, + "step": 2258 + }, + { + "epoch": 0.20639561443581544, + "grad_norm": 4.03125, + "kl": 0.0, + "learning_rate": 8.999825692516375e-06, + "logits/chosen": 439597158.4, + "logits/rejected": 684006058.6666666, + "logps/chosen": -234.0209228515625, + "logps/rejected": -491.104736328125, + "loss": 0.0321, + "rewards/chosen": 3.024369239807129, + "rewards/margins": 12.685428937276205, + "rewards/rejected": -9.661059697469076, + "step": 2259 + }, + { + "epoch": 0.2064869803563271, + "grad_norm": 7.5, + "kl": 0.0, + "learning_rate": 8.998962780421366e-06, + "logits/chosen": 299947110.4, + "logits/rejected": 382486186.6666667, + "logps/chosen": -278.674462890625, + "logps/rejected": -344.9396158854167, + "loss": 0.0461, + "rewards/chosen": 3.463667297363281, + "rewards/margins": 11.021161778767903, + "rewards/rejected": -7.557494481404622, + "step": 2260 + }, + { + "epoch": 0.20657834627683874, + "grad_norm": 4.65625, + "kl": 0.0, + "learning_rate": 8.998099537649184e-06, + "logits/chosen": 1171244885.3333333, + "logits/rejected": 626102400.0, + "logps/chosen": -332.1164143880208, + "logps/rejected": -435.73358154296875, + "loss": 0.0232, + "rewards/chosen": 3.9269542694091797, + "rewards/margins": 14.08832836151123, + "rewards/rejected": -10.16137409210205, + "step": 2261 + }, + { + "epoch": 0.2066697121973504, + "grad_norm": 3.859375, + "kl": 0.0, + "learning_rate": 8.997235964271216e-06, + "logits/chosen": 527601322.6666667, + "logits/rejected": 598864064.0, + "logps/chosen": -198.5366414388021, + "logps/rejected": -676.2557983398438, + "loss": 0.0291, + "rewards/chosen": 3.321582794189453, + "rewards/margins": 14.151355743408203, + "rewards/rejected": -10.82977294921875, + "step": 2262 + }, + { + "epoch": 0.20676107811786204, + "grad_norm": 3.484375, + "kl": 0.0, + "learning_rate": 8.996372060358867e-06, + "logits/chosen": 343362633.14285713, + "logits/rejected": 399582752.0, + "logps/chosen": -363.1796875, + "logps/rejected": -262.09521484375, + "loss": 0.0297, + "rewards/chosen": 3.937188284737723, + "rewards/margins": 12.782171385628835, + "rewards/rejected": -8.844983100891113, + "step": 2263 + }, + { + "epoch": 0.2068524440383737, + "grad_norm": 5.9375, + "kl": 0.0, + "learning_rate": 8.995507825983578e-06, + "logits/chosen": 703352473.6, + "logits/rejected": 520186453.3333333, + "logps/chosen": -448.35791015625, + "logps/rejected": -563.1998697916666, + "loss": 0.0386, + "rewards/chosen": 2.9216976165771484, + "rewards/margins": 13.23897616068522, + "rewards/rejected": -10.317278544108072, + "step": 2264 + }, + { + "epoch": 0.20694380995888534, + "grad_norm": 5.96875, + "kl": 0.0, + "learning_rate": 8.994643261216809e-06, + "logits/chosen": 435343923.2, + "logits/rejected": 826669909.3333334, + "logps/chosen": -209.2845947265625, + "logps/rejected": -504.398681640625, + "loss": 0.0399, + "rewards/chosen": 3.32734375, + "rewards/margins": 13.353062438964844, + "rewards/rejected": -10.025718688964844, + "step": 2265 + }, + { + "epoch": 0.207035175879397, + "grad_norm": 6.125, + "kl": 0.0, + "learning_rate": 8.993778366130054e-06, + "logits/chosen": 538681856.0, + "logits/rejected": 142694144.0, + "logps/chosen": -334.053466796875, + "logps/rejected": -179.29263305664062, + "loss": 0.0442, + "rewards/chosen": 2.581226348876953, + "rewards/margins": 9.011218070983887, + "rewards/rejected": -6.429991722106934, + "step": 2266 + }, + { + "epoch": 0.20712654179990864, + "grad_norm": 4.9375, + "kl": 0.0, + "learning_rate": 8.992913140794832e-06, + "logits/chosen": 455734442.6666667, + "logits/rejected": 388693504.0, + "logps/chosen": -347.3512369791667, + "logps/rejected": -378.2548522949219, + "loss": 0.0463, + "rewards/chosen": 3.0078859329223633, + "rewards/margins": 11.756430625915527, + "rewards/rejected": -8.748544692993164, + "step": 2267 + }, + { + "epoch": 0.20721790772042029, + "grad_norm": 0.7578125, + "kl": 0.0, + "learning_rate": 8.992047585282689e-06, + "logits/chosen": 248858624.0, + "logits/rejected": 737441536.0, + "logps/chosen": -212.90054321289062, + "logps/rejected": -580.1949869791666, + "loss": 0.0037, + "rewards/chosen": 4.450109004974365, + "rewards/margins": 14.032113552093506, + "rewards/rejected": -9.58200454711914, + "step": 2268 + }, + { + "epoch": 0.20730927364093193, + "grad_norm": 3.203125, + "kl": 0.0, + "learning_rate": 8.991181699665194e-06, + "logits/chosen": 377795520.0, + "logits/rejected": 485430272.0, + "logps/chosen": -224.51278686523438, + "logps/rejected": -439.70489501953125, + "loss": 0.0203, + "rewards/chosen": 3.712419033050537, + "rewards/margins": 12.129008769989014, + "rewards/rejected": -8.416589736938477, + "step": 2269 + }, + { + "epoch": 0.20740063956144358, + "grad_norm": 3.78125, + "kl": 0.0, + "learning_rate": 8.990315484013956e-06, + "logits/chosen": 464407296.0, + "logits/rejected": 949596842.6666666, + "logps/chosen": -277.1935546875, + "logps/rejected": -681.2702229817709, + "loss": 0.0265, + "rewards/chosen": 3.266082000732422, + "rewards/margins": 13.336594772338866, + "rewards/rejected": -10.070512771606445, + "step": 2270 + }, + { + "epoch": 0.20749200548195523, + "grad_norm": 2.140625, + "kl": 0.0, + "learning_rate": 8.989448938400596e-06, + "logits/chosen": 852905408.0, + "logits/rejected": 1332474148.5714285, + "logps/chosen": -323.5814208984375, + "logps/rejected": -785.0530133928571, + "loss": 0.0079, + "rewards/chosen": 2.7649199962615967, + "rewards/margins": 11.893491097858973, + "rewards/rejected": -9.128571101597377, + "step": 2271 + }, + { + "epoch": 0.20758337140246688, + "grad_norm": 17.125, + "kl": 0.0, + "learning_rate": 8.988582062896773e-06, + "logits/chosen": 506853785.6, + "logits/rejected": 615554048.0, + "logps/chosen": -241.06748046875, + "logps/rejected": -451.7568766276042, + "loss": 0.0455, + "rewards/chosen": 3.1234498977661134, + "rewards/margins": 11.636754926045736, + "rewards/rejected": -8.513305028279623, + "step": 2272 + }, + { + "epoch": 0.20767473732297853, + "grad_norm": 3.109375, + "kl": 0.0, + "learning_rate": 8.987714857574168e-06, + "logits/chosen": 623183488.0, + "logits/rejected": 378282752.0, + "logps/chosen": -479.701904296875, + "logps/rejected": -496.0329284667969, + "loss": 0.0188, + "rewards/chosen": 3.260929822921753, + "rewards/margins": 15.376204252243042, + "rewards/rejected": -12.115274429321289, + "step": 2273 + }, + { + "epoch": 0.20776610324349018, + "grad_norm": 5.84375, + "kl": 0.0, + "learning_rate": 8.98684732250449e-06, + "logits/chosen": 474806674.28571427, + "logits/rejected": 1169477632.0, + "logps/chosen": -279.73256138392856, + "logps/rejected": -162.92987060546875, + "loss": 0.0474, + "rewards/chosen": 3.3226307460239957, + "rewards/margins": 11.026138646262034, + "rewards/rejected": -7.703507900238037, + "step": 2274 + }, + { + "epoch": 0.20785746916400183, + "grad_norm": 1.671875, + "kl": 0.0, + "learning_rate": 8.98597945775948e-06, + "logits/chosen": 281825344.0, + "logits/rejected": 408040405.3333333, + "logps/chosen": -222.57376098632812, + "logps/rejected": -478.2412516276042, + "loss": 0.0139, + "rewards/chosen": 3.0837528705596924, + "rewards/margins": 12.639477968215942, + "rewards/rejected": -9.55572509765625, + "step": 2275 + }, + { + "epoch": 0.20794883508451348, + "grad_norm": 4.8125, + "kl": 0.0, + "learning_rate": 8.985111263410899e-06, + "logits/chosen": 897456844.8, + "logits/rejected": 1145354581.3333333, + "logps/chosen": -369.749755859375, + "logps/rejected": -507.40478515625, + "loss": 0.0244, + "rewards/chosen": 3.8193187713623047, + "rewards/margins": 13.950745264689127, + "rewards/rejected": -10.131426493326822, + "step": 2276 + }, + { + "epoch": 0.20804020100502513, + "grad_norm": 6.21875, + "kl": 0.0, + "learning_rate": 8.984242739530539e-06, + "logits/chosen": 512359381.3333333, + "logits/rejected": 629706048.0, + "logps/chosen": -234.89654541015625, + "logps/rejected": -438.8431396484375, + "loss": 0.0407, + "rewards/chosen": 3.4072326024373374, + "rewards/margins": 11.96374543507894, + "rewards/rejected": -8.556512832641602, + "step": 2277 + }, + { + "epoch": 0.20813156692553678, + "grad_norm": 15.8125, + "kl": 0.0, + "learning_rate": 8.983373886190219e-06, + "logits/chosen": 560892757.3333334, + "logits/rejected": 228050662.4, + "logps/chosen": -348.2375081380208, + "logps/rejected": -273.430810546875, + "loss": 0.0469, + "rewards/chosen": 3.7580960591634116, + "rewards/margins": 11.398798116048177, + "rewards/rejected": -7.6407020568847654, + "step": 2278 + }, + { + "epoch": 0.20822293284604843, + "grad_norm": 2.796875, + "kl": 0.0, + "learning_rate": 8.982504703461784e-06, + "logits/chosen": 522982336.0, + "logits/rejected": 274329344.0, + "logps/chosen": -261.2867126464844, + "logps/rejected": -251.95602416992188, + "loss": 0.0189, + "rewards/chosen": 3.351532459259033, + "rewards/margins": 11.496151447296143, + "rewards/rejected": -8.14461898803711, + "step": 2279 + }, + { + "epoch": 0.20831429876656007, + "grad_norm": 5.1875, + "kl": 0.0, + "learning_rate": 8.981635191417111e-06, + "logits/chosen": 514412714.6666667, + "logits/rejected": 393634624.0, + "logps/chosen": -348.5736490885417, + "logps/rejected": -676.5078125, + "loss": 0.0379, + "rewards/chosen": 3.1217638651529946, + "rewards/margins": 15.111639658610025, + "rewards/rejected": -11.989875793457031, + "step": 2280 + }, + { + "epoch": 0.20840566468707172, + "grad_norm": 11.0, + "kl": 0.0, + "learning_rate": 8.980765350128097e-06, + "logits/chosen": 662984960.0, + "logits/rejected": 355798176.0, + "logps/chosen": -387.0944010416667, + "logps/rejected": -432.6494140625, + "loss": 0.0725, + "rewards/chosen": 2.904513676961263, + "rewards/margins": 9.638009866078695, + "rewards/rejected": -6.733496189117432, + "step": 2281 + }, + { + "epoch": 0.20849703060758337, + "grad_norm": 4.53125, + "kl": 0.0, + "learning_rate": 8.979895179666674e-06, + "logits/chosen": 627344256.0, + "logits/rejected": 692802688.0, + "logps/chosen": -425.06005859375, + "logps/rejected": -346.74395751953125, + "loss": 0.0299, + "rewards/chosen": 3.2260327339172363, + "rewards/margins": 9.444243907928467, + "rewards/rejected": -6.2182111740112305, + "step": 2282 + }, + { + "epoch": 0.20858839652809502, + "grad_norm": 8.5, + "kl": 0.0, + "learning_rate": 8.97902468010479e-06, + "logits/chosen": 1008081152.0, + "logits/rejected": 504208486.4, + "logps/chosen": -356.2035319010417, + "logps/rejected": -374.3199462890625, + "loss": 0.0907, + "rewards/chosen": 3.2273737589518228, + "rewards/margins": 9.737877909342448, + "rewards/rejected": -6.510504150390625, + "step": 2283 + }, + { + "epoch": 0.20867976244860667, + "grad_norm": 5.875, + "kl": 0.0, + "learning_rate": 8.978153851514436e-06, + "logits/chosen": 470768512.0, + "logits/rejected": 575929856.0, + "logps/chosen": -320.5282287597656, + "logps/rejected": -751.4362182617188, + "loss": 0.031, + "rewards/chosen": 2.9960551261901855, + "rewards/margins": 12.379054546356201, + "rewards/rejected": -9.382999420166016, + "step": 2284 + }, + { + "epoch": 0.20877112836911832, + "grad_norm": 8.25, + "kl": 0.0, + "learning_rate": 8.977282693967613e-06, + "logits/chosen": 855558997.3333334, + "logits/rejected": 518044672.0, + "logps/chosen": -402.4694010416667, + "logps/rejected": -281.6362609863281, + "loss": 0.0535, + "rewards/chosen": 3.030752182006836, + "rewards/margins": 10.237151145935059, + "rewards/rejected": -7.206398963928223, + "step": 2285 + }, + { + "epoch": 0.20886249428962997, + "grad_norm": 2.71875, + "kl": 0.0, + "learning_rate": 8.976411207536361e-06, + "logits/chosen": 812495445.3333334, + "logits/rejected": 470244147.2, + "logps/chosen": -295.0047607421875, + "logps/rejected": -437.2552734375, + "loss": 0.0149, + "rewards/chosen": 3.438072840372721, + "rewards/margins": 11.020854822794597, + "rewards/rejected": -7.582781982421875, + "step": 2286 + }, + { + "epoch": 0.20895386021014162, + "grad_norm": 4.34375, + "kl": 0.0, + "learning_rate": 8.975539392292748e-06, + "logits/chosen": 243791120.0, + "logits/rejected": 218301056.0, + "logps/chosen": -163.2046356201172, + "logps/rejected": -256.3411560058594, + "loss": 0.0348, + "rewards/chosen": 2.894495725631714, + "rewards/margins": 10.541497945785522, + "rewards/rejected": -7.647002220153809, + "step": 2287 + }, + { + "epoch": 0.20904522613065327, + "grad_norm": 2.03125, + "kl": 0.0, + "learning_rate": 8.974667248308859e-06, + "logits/chosen": 467596629.3333333, + "logits/rejected": 629285683.2, + "logps/chosen": -184.4576619466146, + "logps/rejected": -504.72939453125, + "loss": 0.0106, + "rewards/chosen": 4.081418991088867, + "rewards/margins": 12.558250045776367, + "rewards/rejected": -8.4768310546875, + "step": 2288 + }, + { + "epoch": 0.20913659205116492, + "grad_norm": 12.3125, + "kl": 0.0, + "learning_rate": 8.973794775656817e-06, + "logits/chosen": 865584064.0, + "logits/rejected": 1037758720.0, + "logps/chosen": -294.54888916015625, + "logps/rejected": -441.9017028808594, + "loss": 0.057, + "rewards/chosen": 3.622184991836548, + "rewards/margins": 9.44632363319397, + "rewards/rejected": -5.824138641357422, + "step": 2289 + }, + { + "epoch": 0.20922795797167656, + "grad_norm": 2.953125, + "kl": 0.0, + "learning_rate": 8.972921974408763e-06, + "logits/chosen": 952533196.8, + "logits/rejected": 720217088.0, + "logps/chosen": -235.0120361328125, + "logps/rejected": -687.4405110677084, + "loss": 0.0208, + "rewards/chosen": 4.010283660888672, + "rewards/margins": 10.246450297037761, + "rewards/rejected": -6.236166636149089, + "step": 2290 + }, + { + "epoch": 0.2093193238921882, + "grad_norm": 2.859375, + "kl": 0.0, + "learning_rate": 8.972048844636872e-06, + "logits/chosen": 635213653.3333334, + "logits/rejected": 1172623667.2, + "logps/chosen": -235.00516764322916, + "logps/rejected": -396.62041015625, + "loss": 0.0124, + "rewards/chosen": 3.8656943639119468, + "rewards/margins": 11.359905179341634, + "rewards/rejected": -7.494210815429687, + "step": 2291 + }, + { + "epoch": 0.20941068981269986, + "grad_norm": 15.4375, + "kl": 0.0, + "learning_rate": 8.971175386413344e-06, + "logits/chosen": 810514176.0, + "logits/rejected": 1084876885.3333333, + "logps/chosen": -510.9668273925781, + "logps/rejected": -626.3199869791666, + "loss": 0.0287, + "rewards/chosen": 4.477103233337402, + "rewards/margins": 11.552000363667805, + "rewards/rejected": -7.074897130330403, + "step": 2292 + }, + { + "epoch": 0.2095020557332115, + "grad_norm": 4.125, + "kl": 0.0, + "learning_rate": 8.970301599810405e-06, + "logits/chosen": 740924586.6666666, + "logits/rejected": 724993228.8, + "logps/chosen": -298.4179280598958, + "logps/rejected": -673.646875, + "loss": 0.0149, + "rewards/chosen": 4.485576311747233, + "rewards/margins": 13.737910143534343, + "rewards/rejected": -9.252333831787109, + "step": 2293 + }, + { + "epoch": 0.20959342165372316, + "grad_norm": 7.21875, + "kl": 0.0, + "learning_rate": 8.969427484900309e-06, + "logits/chosen": 650095786.6666666, + "logits/rejected": 673560268.8, + "logps/chosen": -370.211669921875, + "logps/rejected": -710.901220703125, + "loss": 0.0308, + "rewards/chosen": 3.503840128580729, + "rewards/margins": 11.910120646158854, + "rewards/rejected": -8.406280517578125, + "step": 2294 + }, + { + "epoch": 0.2096847875742348, + "grad_norm": 3.34375, + "kl": 0.0, + "learning_rate": 8.968553041755337e-06, + "logits/chosen": 904267366.4, + "logits/rejected": 543865685.3333334, + "logps/chosen": -325.8661376953125, + "logps/rejected": -657.2993977864584, + "loss": 0.0229, + "rewards/chosen": 3.5378337860107423, + "rewards/margins": 12.334289169311523, + "rewards/rejected": -8.796455383300781, + "step": 2295 + }, + { + "epoch": 0.20977615349474646, + "grad_norm": 0.8671875, + "kl": 0.0, + "learning_rate": 8.9676782704478e-06, + "logits/chosen": 311706956.8, + "logits/rejected": 280663210.6666667, + "logps/chosen": -259.94794921875, + "logps/rejected": -334.0480143229167, + "loss": 0.0063, + "rewards/chosen": 4.717219543457031, + "rewards/margins": 12.812916946411132, + "rewards/rejected": -8.095697402954102, + "step": 2296 + }, + { + "epoch": 0.2098675194152581, + "grad_norm": 7.1875, + "kl": 0.0, + "learning_rate": 8.966803171050028e-06, + "logits/chosen": 456179840.0, + "logits/rejected": 825555200.0, + "logps/chosen": -370.11627197265625, + "logps/rejected": -477.3994954427083, + "loss": 0.0214, + "rewards/chosen": 4.735250949859619, + "rewards/margins": 11.084644794464111, + "rewards/rejected": -6.349393844604492, + "step": 2297 + }, + { + "epoch": 0.20995888533576976, + "grad_norm": 0.8046875, + "kl": 0.0, + "learning_rate": 8.96592774363439e-06, + "logits/chosen": 291749312.0, + "logits/rejected": 530754517.3333333, + "logps/chosen": -248.9098663330078, + "logps/rejected": -489.220703125, + "loss": 0.0046, + "rewards/chosen": 4.516450881958008, + "rewards/margins": 12.221193313598633, + "rewards/rejected": -7.704742431640625, + "step": 2298 + }, + { + "epoch": 0.2100502512562814, + "grad_norm": 3.78125, + "kl": 0.0, + "learning_rate": 8.965051988273273e-06, + "logits/chosen": 719888000.0, + "logits/rejected": 573854208.0, + "logps/chosen": -432.05804443359375, + "logps/rejected": -539.1829833984375, + "loss": 0.0221, + "rewards/chosen": 3.296722412109375, + "rewards/margins": 12.253076553344727, + "rewards/rejected": -8.956354141235352, + "step": 2299 + }, + { + "epoch": 0.21014161717679306, + "grad_norm": 6.78125, + "kl": 0.0, + "learning_rate": 8.964175905039091e-06, + "logits/chosen": 556206293.3333334, + "logits/rejected": 190768448.0, + "logps/chosen": -302.0454915364583, + "logps/rejected": -263.51751708984375, + "loss": 0.0428, + "rewards/chosen": 3.1066201527913413, + "rewards/margins": 10.099224408467611, + "rewards/rejected": -6.9926042556762695, + "step": 2300 + }, + { + "epoch": 0.2102329830973047, + "grad_norm": 2.203125, + "kl": 0.0, + "learning_rate": 8.963299494004292e-06, + "logits/chosen": 525798912.0, + "logits/rejected": 580174438.4, + "logps/chosen": -286.107421875, + "logps/rejected": -508.7486328125, + "loss": 0.0128, + "rewards/chosen": 4.173103332519531, + "rewards/margins": 11.8392578125, + "rewards/rejected": -7.666154479980468, + "step": 2301 + }, + { + "epoch": 0.21032434901781635, + "grad_norm": 3.75, + "kl": 0.0, + "learning_rate": 8.962422755241346e-06, + "logits/chosen": 487730645.3333333, + "logits/rejected": 521339852.8, + "logps/chosen": -305.5275065104167, + "logps/rejected": -583.7, + "loss": 0.0296, + "rewards/chosen": 2.5194387435913086, + "rewards/margins": 12.489322471618653, + "rewards/rejected": -9.969883728027344, + "step": 2302 + }, + { + "epoch": 0.210415714938328, + "grad_norm": 4.75, + "kl": 0.0, + "learning_rate": 8.96154568882275e-06, + "logits/chosen": 874881621.3333334, + "logits/rejected": 671611776.0, + "logps/chosen": -448.1224365234375, + "logps/rejected": -489.8759460449219, + "loss": 0.0221, + "rewards/chosen": 4.082831700642903, + "rewards/margins": 11.811049779256184, + "rewards/rejected": -7.728218078613281, + "step": 2303 + }, + { + "epoch": 0.21050708085883965, + "grad_norm": 2.515625, + "kl": 0.0, + "learning_rate": 8.960668294821032e-06, + "logits/chosen": 537122752.0, + "logits/rejected": 624852224.0, + "logps/chosen": -364.11175537109375, + "logps/rejected": -474.39239501953125, + "loss": 0.0161, + "rewards/chosen": 3.494503974914551, + "rewards/margins": 12.26152515411377, + "rewards/rejected": -8.767021179199219, + "step": 2304 + }, + { + "epoch": 0.2105984467793513, + "grad_norm": 1.2578125, + "kl": 0.0, + "learning_rate": 8.959790573308739e-06, + "logits/chosen": 529431705.6, + "logits/rejected": 568869461.3333334, + "logps/chosen": -254.1904052734375, + "logps/rejected": -527.8880208333334, + "loss": 0.0073, + "rewards/chosen": 4.774288940429687, + "rewards/margins": 14.300307718912759, + "rewards/rejected": -9.526018778483072, + "step": 2305 + }, + { + "epoch": 0.21068981269986295, + "grad_norm": 2.515625, + "kl": 0.0, + "learning_rate": 8.958912524358456e-06, + "logits/chosen": 661708544.0, + "logits/rejected": 416640665.6, + "logps/chosen": -176.47379557291666, + "logps/rejected": -535.920263671875, + "loss": 0.0251, + "rewards/chosen": 3.091771443684896, + "rewards/margins": 11.927983601888021, + "rewards/rejected": -8.836212158203125, + "step": 2306 + }, + { + "epoch": 0.2107811786203746, + "grad_norm": 24.25, + "kl": 0.0, + "learning_rate": 8.958034148042786e-06, + "logits/chosen": 460125696.0, + "logits/rejected": 567122517.3333334, + "logps/chosen": -294.065869140625, + "logps/rejected": -540.0533854166666, + "loss": 0.0578, + "rewards/chosen": 2.548695182800293, + "rewards/margins": 12.10135072072347, + "rewards/rejected": -9.552655537923178, + "step": 2307 + }, + { + "epoch": 0.21087254454088625, + "grad_norm": 4.78125, + "kl": 0.0, + "learning_rate": 8.957155444434367e-06, + "logits/chosen": 530052288.0, + "logits/rejected": 414650048.0, + "logps/chosen": -311.8792724609375, + "logps/rejected": -430.3154296875, + "loss": 0.0267, + "rewards/chosen": 3.325812339782715, + "rewards/margins": 13.855319023132324, + "rewards/rejected": -10.52950668334961, + "step": 2308 + }, + { + "epoch": 0.2109639104613979, + "grad_norm": 4.875, + "kl": 0.0, + "learning_rate": 8.956276413605853e-06, + "logits/chosen": 536305561.6, + "logits/rejected": 487367296.0, + "logps/chosen": -223.13740234375, + "logps/rejected": -466.1466471354167, + "loss": 0.0406, + "rewards/chosen": 2.850338172912598, + "rewards/margins": 14.197839291890464, + "rewards/rejected": -11.347501118977865, + "step": 2309 + }, + { + "epoch": 0.21105527638190955, + "grad_norm": 7.03125, + "kl": 0.0, + "learning_rate": 8.955397055629938e-06, + "logits/chosen": 522080085.3333333, + "logits/rejected": 502633267.2, + "logps/chosen": -339.4221598307292, + "logps/rejected": -572.23154296875, + "loss": 0.0453, + "rewards/chosen": 2.149624824523926, + "rewards/margins": 9.670465278625489, + "rewards/rejected": -7.520840454101562, + "step": 2310 + }, + { + "epoch": 0.2111466423024212, + "grad_norm": 18.375, + "kl": 0.0, + "learning_rate": 8.954517370579334e-06, + "logits/chosen": 932726988.8, + "logits/rejected": 697767765.3333334, + "logps/chosen": -215.3564208984375, + "logps/rejected": -650.353759765625, + "loss": 0.0512, + "rewards/chosen": 2.7035669326782226, + "rewards/margins": 12.849541409810385, + "rewards/rejected": -10.145974477132162, + "step": 2311 + }, + { + "epoch": 0.21123800822293284, + "grad_norm": 1.375, + "kl": 0.0, + "learning_rate": 8.953637358526781e-06, + "logits/chosen": 694109376.0, + "logits/rejected": 358977024.0, + "logps/chosen": -412.722900390625, + "logps/rejected": -422.51507568359375, + "loss": 0.0101, + "rewards/chosen": 3.970970869064331, + "rewards/margins": 11.402033567428589, + "rewards/rejected": -7.431062698364258, + "step": 2312 + }, + { + "epoch": 0.2113293741434445, + "grad_norm": 12.625, + "kl": 0.0, + "learning_rate": 8.95275701954505e-06, + "logits/chosen": 580044492.8, + "logits/rejected": 951976874.6666666, + "logps/chosen": -372.4212158203125, + "logps/rejected": -373.0672200520833, + "loss": 0.1311, + "rewards/chosen": 2.477924346923828, + "rewards/margins": 8.402102788289387, + "rewards/rejected": -5.92417844136556, + "step": 2313 + }, + { + "epoch": 0.21142074006395614, + "grad_norm": 3.34375, + "kl": 0.0, + "learning_rate": 8.951876353706938e-06, + "logits/chosen": 643959168.0, + "logits/rejected": 283914304.0, + "logps/chosen": -410.4329833984375, + "logps/rejected": -350.8955993652344, + "loss": 0.0208, + "rewards/chosen": 4.138779958089192, + "rewards/margins": 13.20432408650716, + "rewards/rejected": -9.065544128417969, + "step": 2314 + }, + { + "epoch": 0.2115121059844678, + "grad_norm": 13.6875, + "kl": 0.0, + "learning_rate": 8.950995361085266e-06, + "logits/chosen": 492797081.6, + "logits/rejected": 571214890.6666666, + "logps/chosen": -303.0547119140625, + "logps/rejected": -631.060302734375, + "loss": 0.0635, + "rewards/chosen": 2.755191993713379, + "rewards/margins": 11.919900449117026, + "rewards/rejected": -9.164708455403646, + "step": 2315 + }, + { + "epoch": 0.21160347190497944, + "grad_norm": 3.96875, + "kl": 0.0, + "learning_rate": 8.950114041752884e-06, + "logits/chosen": 768819541.3333334, + "logits/rejected": 982014464.0, + "logps/chosen": -430.4007975260417, + "logps/rejected": -486.38079833984375, + "loss": 0.026, + "rewards/chosen": 3.5708980560302734, + "rewards/margins": 11.264890670776367, + "rewards/rejected": -7.693992614746094, + "step": 2316 + }, + { + "epoch": 0.2116948378254911, + "grad_norm": 4.0625, + "kl": 0.0, + "learning_rate": 8.949232395782668e-06, + "logits/chosen": 542591616.0, + "logits/rejected": 367259968.0, + "logps/chosen": -397.15753173828125, + "logps/rejected": -334.38739013671875, + "loss": 0.0255, + "rewards/chosen": 3.002779483795166, + "rewards/margins": 11.27319860458374, + "rewards/rejected": -8.270419120788574, + "step": 2317 + }, + { + "epoch": 0.21178620374600274, + "grad_norm": 6.375, + "kl": 0.0, + "learning_rate": 8.948350423247526e-06, + "logits/chosen": 593188096.0, + "logits/rejected": 744795776.0, + "logps/chosen": -370.144775390625, + "logps/rejected": -527.0675048828125, + "loss": 0.0315, + "rewards/chosen": 2.9222846031188965, + "rewards/margins": 12.03765058517456, + "rewards/rejected": -9.115365982055664, + "step": 2318 + }, + { + "epoch": 0.2118775696665144, + "grad_norm": 4.53125, + "kl": 0.0, + "learning_rate": 8.947468124220385e-06, + "logits/chosen": 337574058.6666667, + "logits/rejected": 534535884.8, + "logps/chosen": -249.45869954427084, + "logps/rejected": -765.0677734375, + "loss": 0.0234, + "rewards/chosen": 3.210414250691732, + "rewards/margins": 13.730349095662435, + "rewards/rejected": -10.519934844970702, + "step": 2319 + }, + { + "epoch": 0.21196893558702604, + "grad_norm": 1.0546875, + "kl": 0.0, + "learning_rate": 8.946585498774205e-06, + "logits/chosen": 1403507200.0, + "logits/rejected": 605727872.0, + "logps/chosen": -406.03802490234375, + "logps/rejected": -314.43865966796875, + "loss": 0.0072, + "rewards/chosen": 4.285853385925293, + "rewards/margins": 13.40134334564209, + "rewards/rejected": -9.115489959716797, + "step": 2320 + }, + { + "epoch": 0.21206030150753769, + "grad_norm": 5.46875, + "kl": 0.0, + "learning_rate": 8.94570254698197e-06, + "logits/chosen": 630364224.0, + "logits/rejected": 751633280.0, + "logps/chosen": -353.3173828125, + "logps/rejected": -815.7367553710938, + "loss": 0.0368, + "rewards/chosen": 2.564467668533325, + "rewards/margins": 11.838076829910278, + "rewards/rejected": -9.273609161376953, + "step": 2321 + }, + { + "epoch": 0.21215166742804933, + "grad_norm": 2.09375, + "kl": 0.0, + "learning_rate": 8.94481926891669e-06, + "logits/chosen": 179911658.66666666, + "logits/rejected": 379707212.8, + "logps/chosen": -104.66163126627605, + "logps/rejected": -317.0640625, + "loss": 0.0151, + "rewards/chosen": 4.176255544026692, + "rewards/margins": 12.679445393880208, + "rewards/rejected": -8.503189849853516, + "step": 2322 + }, + { + "epoch": 0.21224303334856098, + "grad_norm": 4.84375, + "kl": 0.0, + "learning_rate": 8.943935664651409e-06, + "logits/chosen": 916035520.0, + "logits/rejected": 537123072.0, + "logps/chosen": -372.3371887207031, + "logps/rejected": -437.3985595703125, + "loss": 0.0269, + "rewards/chosen": 2.9317519664764404, + "rewards/margins": 10.67737889289856, + "rewards/rejected": -7.745626926422119, + "step": 2323 + }, + { + "epoch": 0.21233439926907263, + "grad_norm": 3.859375, + "kl": 0.0, + "learning_rate": 8.943051734259187e-06, + "logits/chosen": 441760365.71428573, + "logits/rejected": 440425888.0, + "logps/chosen": -320.12841796875, + "logps/rejected": -232.46395874023438, + "loss": 0.027, + "rewards/chosen": 3.7831017630440846, + "rewards/margins": 9.821450437818255, + "rewards/rejected": -6.03834867477417, + "step": 2324 + }, + { + "epoch": 0.21242576518958428, + "grad_norm": 14.5625, + "kl": 0.0, + "learning_rate": 8.942167477813122e-06, + "logits/chosen": 557609536.0, + "logits/rejected": 367818410.6666667, + "logps/chosen": -369.0366516113281, + "logps/rejected": -527.1341552734375, + "loss": 0.1728, + "rewards/chosen": 3.2261459827423096, + "rewards/margins": 9.961629788080852, + "rewards/rejected": -6.735483805338542, + "step": 2325 + }, + { + "epoch": 0.21251713111009593, + "grad_norm": 6.875, + "kl": 0.0, + "learning_rate": 8.94128289538633e-06, + "logits/chosen": 490492330.6666667, + "logits/rejected": 608393523.2, + "logps/chosen": -363.446533203125, + "logps/rejected": -459.94892578125, + "loss": 0.0217, + "rewards/chosen": 3.6327330271402993, + "rewards/margins": 13.002352015177408, + "rewards/rejected": -9.369618988037109, + "step": 2326 + }, + { + "epoch": 0.21260849703060758, + "grad_norm": 8.0, + "kl": 0.0, + "learning_rate": 8.94039798705196e-06, + "logits/chosen": 486855104.0, + "logits/rejected": 578804181.3333334, + "logps/chosen": -588.6649780273438, + "logps/rejected": -519.9767659505209, + "loss": 0.0225, + "rewards/chosen": 2.9823837280273438, + "rewards/margins": 11.485016504923502, + "rewards/rejected": -8.502632776896158, + "step": 2327 + }, + { + "epoch": 0.21269986295111923, + "grad_norm": 6.59375, + "kl": 0.0, + "learning_rate": 8.939512752883186e-06, + "logits/chosen": 691966208.0, + "logits/rejected": 597392896.0, + "logps/chosen": -453.845703125, + "logps/rejected": -531.7351481119791, + "loss": 0.0384, + "rewards/chosen": 3.0431438446044923, + "rewards/margins": 10.321030553181966, + "rewards/rejected": -7.277886708577474, + "step": 2328 + }, + { + "epoch": 0.21279122887163088, + "grad_norm": 6.0, + "kl": 0.0, + "learning_rate": 8.938627192953207e-06, + "logits/chosen": 429941686.85714287, + "logits/rejected": 199131440.0, + "logps/chosen": -380.67124720982144, + "logps/rejected": -90.99678802490234, + "loss": 0.0334, + "rewards/chosen": 4.262622015816825, + "rewards/margins": 6.422323601586478, + "rewards/rejected": -2.1597015857696533, + "step": 2329 + }, + { + "epoch": 0.21288259479214253, + "grad_norm": 5.15625, + "kl": 0.0, + "learning_rate": 8.937741307335252e-06, + "logits/chosen": 394199360.0, + "logits/rejected": 378992960.0, + "logps/chosen": -299.9891357421875, + "logps/rejected": -364.43707275390625, + "loss": 0.0378, + "rewards/chosen": 3.0916051864624023, + "rewards/margins": 9.507707595825195, + "rewards/rejected": -6.416102409362793, + "step": 2330 + }, + { + "epoch": 0.21297396071265418, + "grad_norm": 0.1328125, + "kl": 0.0, + "learning_rate": 8.936855096102575e-06, + "logits/rejected": 480695168.0, + "logps/rejected": -446.42669677734375, + "loss": 0.0007, + "rewards/rejected": -8.859660148620605, + "step": 2331 + }, + { + "epoch": 0.21306532663316582, + "grad_norm": 5.9375, + "kl": 0.0, + "learning_rate": 8.935968559328457e-06, + "logits/chosen": 253046992.0, + "logits/rejected": 414878848.0, + "logps/chosen": -175.15292358398438, + "logps/rejected": -438.63568115234375, + "loss": 0.0456, + "rewards/chosen": 2.6971521377563477, + "rewards/margins": 10.745148658752441, + "rewards/rejected": -8.047996520996094, + "step": 2332 + }, + { + "epoch": 0.21315669255367747, + "grad_norm": 15.5625, + "kl": 0.0, + "learning_rate": 8.935081697086206e-06, + "logits/chosen": 385758016.0, + "logits/rejected": 516598101.3333333, + "logps/chosen": -280.0975036621094, + "logps/rejected": -551.049072265625, + "loss": 0.0716, + "rewards/chosen": 1.590359091758728, + "rewards/margins": 10.428924918174744, + "rewards/rejected": -8.838565826416016, + "step": 2333 + }, + { + "epoch": 0.21324805847418912, + "grad_norm": 2.0, + "kl": 0.0, + "learning_rate": 8.934194509449157e-06, + "logits/chosen": 446883200.0, + "logits/rejected": 397592601.6, + "logps/chosen": -240.71590169270834, + "logps/rejected": -378.393603515625, + "loss": 0.0135, + "rewards/chosen": 3.420660654703776, + "rewards/margins": 12.202150217692056, + "rewards/rejected": -8.78148956298828, + "step": 2334 + }, + { + "epoch": 0.21333942439470077, + "grad_norm": 2.140625, + "kl": 0.0, + "learning_rate": 8.933306996490675e-06, + "logits/chosen": 459424768.0, + "logits/rejected": 340881152.0, + "logps/chosen": -376.4435119628906, + "logps/rejected": -601.1858520507812, + "loss": 0.0118, + "rewards/chosen": 4.1889214515686035, + "rewards/margins": 14.551540851593018, + "rewards/rejected": -10.362619400024414, + "step": 2335 + }, + { + "epoch": 0.21343079031521242, + "grad_norm": 5.15625, + "kl": 0.0, + "learning_rate": 8.932419158284147e-06, + "logits/chosen": 477263274.6666667, + "logits/rejected": 378694720.0, + "logps/chosen": -300.31850179036456, + "logps/rejected": -399.7706298828125, + "loss": 0.0327, + "rewards/chosen": 3.540425936381022, + "rewards/margins": 12.960858027140299, + "rewards/rejected": -9.420432090759277, + "step": 2336 + }, + { + "epoch": 0.21352215623572407, + "grad_norm": 4.09375, + "kl": 0.0, + "learning_rate": 8.931530994902988e-06, + "logits/chosen": 1140746410.6666667, + "logits/rejected": 625194188.8, + "logps/chosen": -212.7490030924479, + "logps/rejected": -563.532958984375, + "loss": 0.0177, + "rewards/chosen": 4.183057149251302, + "rewards/margins": 13.419988759358723, + "rewards/rejected": -9.236931610107423, + "step": 2337 + }, + { + "epoch": 0.21361352215623572, + "grad_norm": 9.375, + "kl": 0.0, + "learning_rate": 8.930642506420644e-06, + "logits/chosen": 642104832.0, + "logits/rejected": 357124160.0, + "logps/chosen": -306.1482849121094, + "logps/rejected": -346.1029052734375, + "loss": 0.0533, + "rewards/chosen": 2.6962523460388184, + "rewards/margins": 10.871731281280518, + "rewards/rejected": -8.1754789352417, + "step": 2338 + }, + { + "epoch": 0.21370488807674737, + "grad_norm": 2.203125, + "kl": 0.0, + "learning_rate": 8.929753692910581e-06, + "logits/chosen": 512071040.0, + "logits/rejected": 648884582.4, + "logps/chosen": -303.6265869140625, + "logps/rejected": -688.26005859375, + "loss": 0.0113, + "rewards/chosen": 3.568650245666504, + "rewards/margins": 15.169792366027831, + "rewards/rejected": -11.601142120361327, + "step": 2339 + }, + { + "epoch": 0.21379625399725902, + "grad_norm": 5.75, + "kl": 0.0, + "learning_rate": 8.9288645544463e-06, + "logits/chosen": 593924505.6, + "logits/rejected": 728451242.6666666, + "logps/chosen": -340.623095703125, + "logps/rejected": -617.4265950520834, + "loss": 0.0429, + "rewards/chosen": 2.9024530410766602, + "rewards/margins": 10.495607948303222, + "rewards/rejected": -7.5931549072265625, + "step": 2340 + }, + { + "epoch": 0.21388761991777067, + "grad_norm": 1.8984375, + "kl": 0.0, + "learning_rate": 8.927975091101319e-06, + "logits/chosen": 294981216.0, + "logits/rejected": 315961088.0, + "logps/chosen": -212.34182739257812, + "logps/rejected": -396.24603271484375, + "loss": 0.0089, + "rewards/chosen": 4.562664031982422, + "rewards/margins": 14.697908401489258, + "rewards/rejected": -10.135244369506836, + "step": 2341 + }, + { + "epoch": 0.21397898583828232, + "grad_norm": 4.1875, + "kl": 0.0, + "learning_rate": 8.927085302949194e-06, + "logits/chosen": 505758310.4, + "logits/rejected": 352407424.0, + "logps/chosen": -349.75, + "logps/rejected": -526.1021321614584, + "loss": 0.0245, + "rewards/chosen": 3.4615455627441407, + "rewards/margins": 16.0581413269043, + "rewards/rejected": -12.596595764160156, + "step": 2342 + }, + { + "epoch": 0.21407035175879396, + "grad_norm": 10.4375, + "kl": 0.0, + "learning_rate": 8.9261951900635e-06, + "logits/chosen": 180979760.0, + "logits/rejected": 511694037.3333333, + "logps/chosen": -234.24542236328125, + "logps/rejected": -410.6837972005208, + "loss": 0.1081, + "rewards/chosen": 3.987391710281372, + "rewards/margins": 10.578826189041138, + "rewards/rejected": -6.591434478759766, + "step": 2343 + }, + { + "epoch": 0.2141617176793056, + "grad_norm": 1.5703125, + "kl": 0.0, + "learning_rate": 8.92530475251784e-06, + "logits/chosen": 332354688.0, + "logits/rejected": 531837269.3333333, + "logps/chosen": -214.72573852539062, + "logps/rejected": -537.9088541666666, + "loss": 0.0079, + "rewards/chosen": 3.561312198638916, + "rewards/margins": 11.557109673817951, + "rewards/rejected": -7.995797475179036, + "step": 2344 + }, + { + "epoch": 0.21425308359981726, + "grad_norm": 3.828125, + "kl": 0.0, + "learning_rate": 8.924413990385844e-06, + "logits/chosen": 535158848.0, + "logps/chosen": -338.9563293457031, + "loss": 0.0328, + "rewards/chosen": 3.718491792678833, + "step": 2345 + }, + { + "epoch": 0.2143444495203289, + "grad_norm": 14.8125, + "kl": 0.0, + "learning_rate": 8.923522903741173e-06, + "logits/chosen": 344497382.4, + "logits/rejected": 277040682.6666667, + "logps/chosen": -278.2222412109375, + "logps/rejected": -347.9959309895833, + "loss": 0.1686, + "rewards/chosen": 2.3931133270263674, + "rewards/margins": 7.137257130940755, + "rewards/rejected": -4.744143803914388, + "step": 2346 + }, + { + "epoch": 0.21443581544084056, + "grad_norm": 1.4609375, + "kl": 0.0, + "learning_rate": 8.92263149265751e-06, + "logits/chosen": 562629461.3333334, + "logits/rejected": 659093862.4, + "logps/chosen": -399.3888346354167, + "logps/rejected": -471.930517578125, + "loss": 0.0099, + "rewards/chosen": 3.8411712646484375, + "rewards/margins": 12.243258666992187, + "rewards/rejected": -8.40208740234375, + "step": 2347 + }, + { + "epoch": 0.2145271813613522, + "grad_norm": 7.1875, + "kl": 0.0, + "learning_rate": 8.921739757208567e-06, + "logits/chosen": 545290624.0, + "logits/rejected": 733118080.0, + "logps/chosen": -252.52496337890625, + "logps/rejected": -460.55810546875, + "loss": 0.0267, + "rewards/chosen": 3.532838821411133, + "rewards/margins": 14.379423141479492, + "rewards/rejected": -10.84658432006836, + "step": 2348 + }, + { + "epoch": 0.21461854728186386, + "grad_norm": 2.875, + "kl": 0.0, + "learning_rate": 8.920847697468082e-06, + "logits/chosen": 1150605312.0, + "logits/rejected": 578198835.2, + "logps/chosen": -525.4741617838541, + "logps/rejected": -633.869775390625, + "loss": 0.0103, + "rewards/chosen": 4.021442413330078, + "rewards/margins": 13.380860137939454, + "rewards/rejected": -9.359417724609376, + "step": 2349 + }, + { + "epoch": 0.2147099132023755, + "grad_norm": 1.25, + "kl": 0.0, + "learning_rate": 8.919955313509817e-06, + "logits/chosen": 526140672.0, + "logits/rejected": 859933900.8, + "logps/chosen": -263.47312418619794, + "logps/rejected": -425.945849609375, + "loss": 0.0075, + "rewards/chosen": 4.194289525349935, + "rewards/margins": 12.656930669148764, + "rewards/rejected": -8.462641143798828, + "step": 2350 + }, + { + "epoch": 0.21480127912288716, + "grad_norm": 9.8125, + "kl": 0.0, + "learning_rate": 8.919062605407572e-06, + "logits/chosen": 509474112.0, + "logits/rejected": 420192448.0, + "logps/chosen": -179.94708251953125, + "logps/rejected": -422.678955078125, + "loss": 0.0222, + "rewards/chosen": 3.2578940391540527, + "rewards/margins": 11.777463436126709, + "rewards/rejected": -8.519569396972656, + "step": 2351 + }, + { + "epoch": 0.2148926450433988, + "grad_norm": 3.578125, + "kl": 0.0, + "learning_rate": 8.918169573235155e-06, + "logits/chosen": 555334229.3333334, + "logits/rejected": 414036352.0, + "logps/chosen": -473.1544596354167, + "logps/rejected": -598.760888671875, + "loss": 0.02, + "rewards/chosen": 2.9142233530680337, + "rewards/margins": 12.73655687967936, + "rewards/rejected": -9.822333526611327, + "step": 2352 + }, + { + "epoch": 0.21498401096391045, + "grad_norm": 4.21875, + "kl": 0.0, + "learning_rate": 8.91727621706642e-06, + "logits/chosen": 602337280.0, + "logits/rejected": 337465770.6666667, + "logps/chosen": -168.4234375, + "logps/rejected": -489.9490152994792, + "loss": 0.0357, + "rewards/chosen": 3.319252777099609, + "rewards/margins": 12.285529581705728, + "rewards/rejected": -8.96627680460612, + "step": 2353 + }, + { + "epoch": 0.2150753768844221, + "grad_norm": 4.375, + "kl": 0.0, + "learning_rate": 8.916382536975237e-06, + "logits/chosen": 762091264.0, + "logits/rejected": 579850944.0, + "logps/chosen": -414.03643798828125, + "logps/rejected": -652.720458984375, + "loss": 0.0219, + "rewards/chosen": 3.76682186126709, + "rewards/margins": 15.944398880004883, + "rewards/rejected": -12.177577018737793, + "step": 2354 + }, + { + "epoch": 0.21516674280493375, + "grad_norm": 5.28125, + "kl": 0.0, + "learning_rate": 8.915488533035503e-06, + "logits/chosen": 728445952.0, + "logits/rejected": 452913856.0, + "logps/chosen": -460.4994201660156, + "logps/rejected": -453.78839111328125, + "loss": 0.0299, + "rewards/chosen": 2.9096076488494873, + "rewards/margins": 12.161038160324097, + "rewards/rejected": -9.25143051147461, + "step": 2355 + }, + { + "epoch": 0.2152581087254454, + "grad_norm": 11.625, + "kl": 0.0, + "learning_rate": 8.914594205321145e-06, + "logits/chosen": 828055552.0, + "logits/rejected": 792811776.0, + "logps/chosen": -368.16546630859375, + "logps/rejected": -331.084228515625, + "loss": 0.1078, + "rewards/chosen": 2.7716546058654785, + "rewards/margins": 9.442594051361084, + "rewards/rejected": -6.6709394454956055, + "step": 2356 + }, + { + "epoch": 0.21534947464595705, + "grad_norm": 2.34375, + "kl": 0.0, + "learning_rate": 8.913699553906116e-06, + "logits/chosen": 513232085.3333333, + "logits/rejected": 591329740.8, + "logps/chosen": -176.5408935546875, + "logps/rejected": -491.711474609375, + "loss": 0.0153, + "rewards/chosen": 3.2244691848754883, + "rewards/margins": 11.931643867492676, + "rewards/rejected": -8.707174682617188, + "step": 2357 + }, + { + "epoch": 0.2154408405664687, + "grad_norm": 3.59375, + "kl": 0.0, + "learning_rate": 8.912804578864393e-06, + "logits/chosen": 435773235.2, + "logits/rejected": 634394282.6666666, + "logps/chosen": -186.1711669921875, + "logps/rejected": -414.2330729166667, + "loss": 0.0337, + "rewards/chosen": 3.005161666870117, + "rewards/margins": 9.542642974853516, + "rewards/rejected": -6.537481307983398, + "step": 2358 + }, + { + "epoch": 0.21553220648698035, + "grad_norm": 17.75, + "kl": 0.0, + "learning_rate": 8.911909280269988e-06, + "logits/chosen": 524137792.0, + "logits/rejected": 345675648.0, + "logps/chosen": -287.3282165527344, + "logps/rejected": -352.94464111328125, + "loss": 0.034, + "rewards/chosen": 3.583465099334717, + "rewards/margins": 9.842859268188477, + "rewards/rejected": -6.25939416885376, + "step": 2359 + }, + { + "epoch": 0.215623572407492, + "grad_norm": 3.296875, + "kl": 0.0, + "learning_rate": 8.911013658196927e-06, + "logits/chosen": 1068377600.0, + "logits/rejected": 393162624.0, + "logps/chosen": -386.3004557291667, + "logps/rejected": -452.9596252441406, + "loss": 0.0277, + "rewards/chosen": 3.5501022338867188, + "rewards/margins": 13.212028503417969, + "rewards/rejected": -9.66192626953125, + "step": 2360 + }, + { + "epoch": 0.21571493832800365, + "grad_norm": 3.46875, + "kl": 0.0, + "learning_rate": 8.910117712719274e-06, + "logits/chosen": 386459093.3333333, + "logits/rejected": 622834278.4, + "logps/chosen": -316.3181966145833, + "logps/rejected": -761.6826171875, + "loss": 0.0144, + "rewards/chosen": 3.906940778096517, + "rewards/margins": 15.607487805684409, + "rewards/rejected": -11.700547027587891, + "step": 2361 + }, + { + "epoch": 0.2158063042485153, + "grad_norm": 4.78125, + "kl": 0.0, + "learning_rate": 8.909221443911116e-06, + "logits/chosen": 435793152.0, + "logits/rejected": 404585002.6666667, + "logps/chosen": -301.770068359375, + "logps/rejected": -341.6595052083333, + "loss": 0.0301, + "rewards/chosen": 3.249029541015625, + "rewards/margins": 11.300846481323243, + "rewards/rejected": -8.051816940307617, + "step": 2362 + }, + { + "epoch": 0.21589767016902695, + "grad_norm": 3.4375, + "kl": 0.0, + "learning_rate": 8.908324851846562e-06, + "logits/chosen": 473851596.8, + "logits/rejected": 243290368.0, + "logps/chosen": -389.237646484375, + "logps/rejected": -381.4069010416667, + "loss": 0.0242, + "rewards/chosen": 3.5385463714599608, + "rewards/margins": 12.992778269449868, + "rewards/rejected": -9.454231897989908, + "step": 2363 + }, + { + "epoch": 0.2159890360895386, + "grad_norm": 3.46875, + "kl": 0.0, + "learning_rate": 8.907427936599755e-06, + "logits/chosen": 648507392.0, + "logps/chosen": -388.6523742675781, + "loss": 0.0277, + "rewards/chosen": 3.8140459060668945, + "step": 2364 + }, + { + "epoch": 0.21608040201005024, + "grad_norm": 5.71875, + "kl": 0.0, + "learning_rate": 8.906530698244862e-06, + "logits/chosen": 371694560.0, + "logits/rejected": 443757568.0, + "logps/chosen": -285.5526428222656, + "logps/rejected": -497.33038330078125, + "loss": 0.0442, + "rewards/chosen": 2.467160940170288, + "rewards/margins": 9.712426900863647, + "rewards/rejected": -7.245265960693359, + "step": 2365 + }, + { + "epoch": 0.2161717679305619, + "grad_norm": 4.96875, + "kl": 0.0, + "learning_rate": 8.905633136856072e-06, + "logits/chosen": 544820288.0, + "logits/rejected": 493506592.0, + "logps/chosen": -347.7574157714844, + "logps/rejected": -413.6452331542969, + "loss": 0.0306, + "rewards/chosen": 3.2505974769592285, + "rewards/margins": 12.206698894500732, + "rewards/rejected": -8.956101417541504, + "step": 2366 + }, + { + "epoch": 0.21626313385107354, + "grad_norm": 5.875, + "kl": 0.0, + "learning_rate": 8.90473525250761e-06, + "logits/chosen": 486364736.0, + "logits/rejected": 408574016.0, + "logps/chosen": -311.1324768066406, + "logps/rejected": -482.18463134765625, + "loss": 0.0317, + "rewards/chosen": 2.855855941772461, + "rewards/margins": 10.867561340332031, + "rewards/rejected": -8.01170539855957, + "step": 2367 + }, + { + "epoch": 0.2163544997715852, + "grad_norm": 3.71875, + "kl": 0.0, + "learning_rate": 8.903837045273722e-06, + "logits/chosen": 380846592.0, + "logits/rejected": 693046613.3333334, + "logps/chosen": -251.8039794921875, + "logps/rejected": -461.769775390625, + "loss": 0.0231, + "rewards/chosen": 3.6627696990966796, + "rewards/margins": 11.235530598958333, + "rewards/rejected": -7.572760899861653, + "step": 2368 + }, + { + "epoch": 0.21644586569209684, + "grad_norm": 7.28125, + "kl": 0.0, + "learning_rate": 8.902938515228678e-06, + "logits/chosen": 606483558.4, + "logits/rejected": 400393386.6666667, + "logps/chosen": -269.00322265625, + "logps/rejected": -531.7630208333334, + "loss": 0.0497, + "rewards/chosen": 2.991257095336914, + "rewards/margins": 12.112284723917643, + "rewards/rejected": -9.121027628580729, + "step": 2369 + }, + { + "epoch": 0.2165372316126085, + "grad_norm": 5.40625, + "kl": 0.0, + "learning_rate": 8.90203966244678e-06, + "logits/chosen": 705221478.4, + "logits/rejected": 547573376.0, + "logps/chosen": -431.76484375, + "logps/rejected": -469.1105143229167, + "loss": 0.0324, + "rewards/chosen": 3.366294097900391, + "rewards/margins": 12.946376673380534, + "rewards/rejected": -9.580082575480143, + "step": 2370 + }, + { + "epoch": 0.21662859753312014, + "grad_norm": 0.64453125, + "kl": 0.0, + "learning_rate": 8.901140487002358e-06, + "logits/chosen": 265620688.0, + "logits/rejected": 618801664.0, + "logps/chosen": -221.89251708984375, + "logps/rejected": -561.0243326822916, + "loss": 0.0035, + "rewards/chosen": 4.272414207458496, + "rewards/margins": 14.712188402811686, + "rewards/rejected": -10.43977419535319, + "step": 2371 + }, + { + "epoch": 0.2167199634536318, + "grad_norm": 3.765625, + "kl": 0.0, + "learning_rate": 8.90024098896976e-06, + "logits/chosen": 603258572.8, + "logits/rejected": 371169109.3333333, + "logps/chosen": -311.824853515625, + "logps/rejected": -331.81044514973956, + "loss": 0.022, + "rewards/chosen": 3.5801124572753906, + "rewards/margins": 11.000614166259766, + "rewards/rejected": -7.420501708984375, + "step": 2372 + }, + { + "epoch": 0.21681132937414344, + "grad_norm": 2.28125, + "kl": 0.0, + "learning_rate": 8.89934116842337e-06, + "logits/chosen": 422376149.3333333, + "logits/rejected": 635480627.2, + "logps/chosen": -364.9519449869792, + "logps/rejected": -543.64921875, + "loss": 0.0088, + "rewards/chosen": 4.535492897033691, + "rewards/margins": 13.11545352935791, + "rewards/rejected": -8.57996063232422, + "step": 2373 + }, + { + "epoch": 0.21690269529465508, + "grad_norm": 1.6796875, + "kl": 0.0, + "learning_rate": 8.898441025437592e-06, + "logits/chosen": 507901098.6666667, + "logits/rejected": 447555328.0, + "logps/chosen": -364.092529296875, + "logps/rejected": -424.91904296875, + "loss": 0.0085, + "rewards/chosen": 4.0347474416097, + "rewards/margins": 13.457770665486652, + "rewards/rejected": -9.423023223876953, + "step": 2374 + }, + { + "epoch": 0.21699406121516673, + "grad_norm": 5.6875, + "kl": 0.0, + "learning_rate": 8.897540560086864e-06, + "logits/chosen": 379782229.3333333, + "logits/rejected": 332233395.2, + "logps/chosen": -341.0941975911458, + "logps/rejected": -411.3521484375, + "loss": 0.0294, + "rewards/chosen": 2.8255961736043296, + "rewards/margins": 11.316778119405111, + "rewards/rejected": -8.491181945800781, + "step": 2375 + }, + { + "epoch": 0.21708542713567838, + "grad_norm": 5.34375, + "kl": 3.177915573120117, + "learning_rate": 8.89663977244564e-06, + "logits/chosen": 1117226393.6, + "logits/rejected": 563206314.6666666, + "logps/chosen": -343.73125, + "logps/rejected": -480.0376383463542, + "loss": 0.0299, + "rewards/chosen": 3.5065586090087892, + "rewards/margins": 15.713513565063476, + "rewards/rejected": -12.206954956054688, + "step": 2376 + }, + { + "epoch": 0.21717679305619003, + "grad_norm": 3.6875, + "kl": 0.0, + "learning_rate": 8.895738662588412e-06, + "logits/chosen": 646298325.3333334, + "logits/rejected": 273667136.0, + "logps/chosen": -313.79848225911456, + "logps/rejected": -393.40789794921875, + "loss": 0.0296, + "rewards/chosen": 3.414647102355957, + "rewards/margins": 12.927939414978027, + "rewards/rejected": -9.51329231262207, + "step": 2377 + }, + { + "epoch": 0.21726815897670168, + "grad_norm": 6.0, + "kl": 0.0, + "learning_rate": 8.89483723058969e-06, + "logits/chosen": 614237013.3333334, + "logits/rejected": 623656448.0, + "logps/chosen": -430.0049641927083, + "logps/rejected": -317.1860595703125, + "loss": 0.0292, + "rewards/chosen": 2.562997500101725, + "rewards/margins": 9.5919708887736, + "rewards/rejected": -7.028973388671875, + "step": 2378 + }, + { + "epoch": 0.21735952489721333, + "grad_norm": 2.703125, + "kl": 0.0, + "learning_rate": 8.893935476524018e-06, + "logits/chosen": 477020467.2, + "logits/rejected": 264326613.33333334, + "logps/chosen": -349.4215576171875, + "logps/rejected": -236.42313639322916, + "loss": 0.0171, + "rewards/chosen": 4.006228256225586, + "rewards/margins": 9.830091603597005, + "rewards/rejected": -5.823863347371419, + "step": 2379 + }, + { + "epoch": 0.21745089081772498, + "grad_norm": 2.46875, + "kl": 0.0, + "learning_rate": 8.893033400465957e-06, + "logits/chosen": 629944064.0, + "logits/rejected": 487622912.0, + "logps/chosen": -459.4580485026042, + "logps/rejected": -474.96484375, + "loss": 0.0122, + "rewards/chosen": 3.595649083455404, + "rewards/margins": 13.358767064412435, + "rewards/rejected": -9.76311798095703, + "step": 2380 + }, + { + "epoch": 0.21754225673823663, + "grad_norm": 4.0625, + "kl": 0.0, + "learning_rate": 8.892131002490105e-06, + "logits/chosen": 490473600.0, + "logits/rejected": 311253696.0, + "logps/chosen": -347.271728515625, + "logps/rejected": -469.326416015625, + "loss": 0.0243, + "rewards/chosen": 4.107150077819824, + "rewards/margins": 14.77521800994873, + "rewards/rejected": -10.668067932128906, + "step": 2381 + }, + { + "epoch": 0.21763362265874828, + "grad_norm": 4.0625, + "kl": 0.0, + "learning_rate": 8.89122828267108e-06, + "logits/chosen": 559111552.0, + "logits/rejected": 382768832.0, + "logps/chosen": -444.3968505859375, + "logps/rejected": -339.93853759765625, + "loss": 0.0226, + "rewards/chosen": 3.217146873474121, + "rewards/margins": 10.839614868164062, + "rewards/rejected": -7.622467994689941, + "step": 2382 + }, + { + "epoch": 0.21772498857925993, + "grad_norm": 1.5859375, + "kl": 0.0, + "learning_rate": 8.89032524108353e-06, + "logits/chosen": 473802956.8, + "logits/rejected": 438432512.0, + "logps/chosen": -290.4083740234375, + "logps/rejected": -657.3345133463541, + "loss": 0.0117, + "rewards/chosen": 4.057563400268554, + "rewards/margins": 13.685941314697265, + "rewards/rejected": -9.628377914428711, + "step": 2383 + }, + { + "epoch": 0.21781635449977158, + "grad_norm": 2.625, + "kl": 0.0, + "learning_rate": 8.889421877802126e-06, + "logits/chosen": 481376064.0, + "logits/rejected": 395843328.0, + "logps/chosen": -109.23977661132812, + "logps/rejected": -428.1421305338542, + "loss": 0.0093, + "rewards/chosen": 3.8625497817993164, + "rewards/margins": 12.892910321553549, + "rewards/rejected": -9.030360539754232, + "step": 2384 + }, + { + "epoch": 0.21790772042028322, + "grad_norm": 2.546875, + "kl": 0.0, + "learning_rate": 8.888518192901571e-06, + "logits/chosen": 452971093.3333333, + "logits/rejected": 499896473.6, + "logps/chosen": -280.0353597005208, + "logps/rejected": -464.08994140625, + "loss": 0.0153, + "rewards/chosen": 3.2472591400146484, + "rewards/margins": 10.810631942749023, + "rewards/rejected": -7.563372802734375, + "step": 2385 + }, + { + "epoch": 0.21799908634079487, + "grad_norm": 5.3125, + "kl": 0.0, + "learning_rate": 8.887614186456588e-06, + "logits/chosen": 527144768.0, + "logits/rejected": 546590400.0, + "logps/chosen": -346.53582763671875, + "logps/rejected": -446.709228515625, + "loss": 0.0388, + "rewards/chosen": 2.666858673095703, + "rewards/margins": 10.330883026123047, + "rewards/rejected": -7.664024353027344, + "step": 2386 + }, + { + "epoch": 0.21809045226130652, + "grad_norm": 6.875, + "kl": 0.0, + "learning_rate": 8.886709858541932e-06, + "logits/chosen": 559487360.0, + "logits/rejected": 359560896.0, + "logps/chosen": -273.0078125, + "logps/rejected": -504.06451416015625, + "loss": 0.1503, + "rewards/chosen": 1.2255594730377197, + "rewards/margins": 11.279446840286255, + "rewards/rejected": -10.053887367248535, + "step": 2387 + }, + { + "epoch": 0.21818181818181817, + "grad_norm": 4.8125, + "kl": 0.0, + "learning_rate": 8.88580520923238e-06, + "logits/chosen": 471288832.0, + "logits/rejected": 477194336.0, + "logps/chosen": -424.3274841308594, + "logps/rejected": -541.7025146484375, + "loss": 0.0229, + "rewards/chosen": 3.643843173980713, + "rewards/margins": 12.887529850006104, + "rewards/rejected": -9.24368667602539, + "step": 2388 + }, + { + "epoch": 0.21827318410232982, + "grad_norm": 5.28125, + "kl": 0.0, + "learning_rate": 8.884900238602742e-06, + "logits/chosen": 535180842.6666667, + "logits/rejected": 828459212.8, + "logps/chosen": -253.8215535481771, + "logps/rejected": -594.01533203125, + "loss": 0.029, + "rewards/chosen": 3.718449274698893, + "rewards/margins": 17.028034845987957, + "rewards/rejected": -13.309585571289062, + "step": 2389 + }, + { + "epoch": 0.21836455002284147, + "grad_norm": 2.90625, + "kl": 0.0, + "learning_rate": 8.883994946727848e-06, + "logits/chosen": 362169036.8, + "logits/rejected": 638690304.0, + "logps/chosen": -179.64998779296874, + "logps/rejected": -681.6543782552084, + "loss": 0.0233, + "rewards/chosen": 3.371225357055664, + "rewards/margins": 16.49206988016764, + "rewards/rejected": -13.120844523111979, + "step": 2390 + }, + { + "epoch": 0.21845591594335312, + "grad_norm": 22.5, + "kl": 0.0, + "learning_rate": 8.88308933368256e-06, + "logits/chosen": 915200512.0, + "logits/rejected": 650413696.0, + "logps/chosen": -286.1650085449219, + "logps/rejected": -401.6005859375, + "loss": 0.0471, + "rewards/chosen": 3.128382682800293, + "rewards/margins": 7.881715297698975, + "rewards/rejected": -4.753332614898682, + "step": 2391 + }, + { + "epoch": 0.21854728186386477, + "grad_norm": 3.5625, + "kl": 0.0, + "learning_rate": 8.882183399541758e-06, + "logits/chosen": 704917952.0, + "logits/rejected": 497749280.0, + "logps/chosen": -357.9751281738281, + "logps/rejected": -600.8154296875, + "loss": 0.0214, + "rewards/chosen": 3.1805472373962402, + "rewards/margins": 13.220255374908447, + "rewards/rejected": -10.039708137512207, + "step": 2392 + }, + { + "epoch": 0.21863864778437642, + "grad_norm": 2.625, + "kl": 0.0, + "learning_rate": 8.88127714438036e-06, + "logits/chosen": 390937408.0, + "logits/rejected": 693419861.3333334, + "logps/chosen": -286.4373779296875, + "logps/rejected": -401.7714029947917, + "loss": 0.0107, + "rewards/chosen": 3.96286940574646, + "rewards/margins": 11.83203673362732, + "rewards/rejected": -7.869167327880859, + "step": 2393 + }, + { + "epoch": 0.21873001370488807, + "grad_norm": 13.6875, + "kl": 0.0, + "learning_rate": 8.880370568273303e-06, + "logits/chosen": 920828074.6666666, + "logits/rejected": 774229913.6, + "logps/chosen": -377.9854736328125, + "logps/rejected": -438.878857421875, + "loss": 0.1279, + "rewards/chosen": 1.825666904449463, + "rewards/margins": 9.35604944229126, + "rewards/rejected": -7.530382537841797, + "step": 2394 + }, + { + "epoch": 0.21882137962539971, + "grad_norm": 12.375, + "kl": 0.0, + "learning_rate": 8.879463671295551e-06, + "logits/chosen": 687287003.4285715, + "logits/rejected": 246150176.0, + "logps/chosen": -285.98036411830356, + "logps/rejected": -274.919189453125, + "loss": 0.1461, + "rewards/chosen": 2.5084097726004466, + "rewards/margins": 10.332979474748884, + "rewards/rejected": -7.8245697021484375, + "step": 2395 + }, + { + "epoch": 0.21891274554591136, + "grad_norm": 4.75, + "kl": 0.0, + "learning_rate": 8.8785564535221e-06, + "logits/chosen": 676102860.8, + "logits/rejected": 887765760.0, + "logps/chosen": -205.1193603515625, + "logps/rejected": -366.5185546875, + "loss": 0.0403, + "rewards/chosen": 2.9978845596313475, + "rewards/margins": 9.473872311909993, + "rewards/rejected": -6.4759877522786455, + "step": 2396 + }, + { + "epoch": 0.219004111466423, + "grad_norm": 5.1875, + "kl": 0.0, + "learning_rate": 8.877648915027964e-06, + "logits/chosen": 672373930.6666666, + "logits/rejected": 708996288.0, + "logps/chosen": -361.4814860026042, + "logps/rejected": -581.91748046875, + "loss": 0.037, + "rewards/chosen": 3.1241111755371094, + "rewards/margins": 11.741262435913086, + "rewards/rejected": -8.617151260375977, + "step": 2397 + }, + { + "epoch": 0.21909547738693466, + "grad_norm": 6.4375, + "kl": 0.0, + "learning_rate": 8.876741055888192e-06, + "logits/chosen": 603733504.0, + "logits/rejected": 1058383872.0, + "logps/chosen": -337.5838317871094, + "logps/rejected": -497.140869140625, + "loss": 0.0389, + "rewards/chosen": 2.589919328689575, + "rewards/margins": 11.523675203323364, + "rewards/rejected": -8.933755874633789, + "step": 2398 + }, + { + "epoch": 0.2191868433074463, + "grad_norm": 1.9140625, + "kl": 0.0, + "learning_rate": 8.87583287617785e-06, + "logits/chosen": 323927424.0, + "logits/rejected": 833808810.6666666, + "logps/chosen": -235.78179931640625, + "logps/rejected": -507.6295572916667, + "loss": 0.0092, + "rewards/chosen": 3.3748481273651123, + "rewards/margins": 13.534987847010294, + "rewards/rejected": -10.160139719645182, + "step": 2399 + }, + { + "epoch": 0.21927820922795796, + "grad_norm": 2.34375, + "kl": 0.0, + "learning_rate": 8.874924375972043e-06, + "logits/chosen": 593673280.0, + "logits/rejected": 526155232.0, + "logps/chosen": -234.44305419921875, + "logps/rejected": -475.7328796386719, + "loss": 0.0087, + "rewards/chosen": 4.219106674194336, + "rewards/margins": 12.869939804077148, + "rewards/rejected": -8.650833129882812, + "step": 2400 + }, + { + "epoch": 0.2193695751484696, + "grad_norm": 4.15625, + "kl": 0.0, + "learning_rate": 8.874015555345889e-06, + "logits/chosen": 451107648.0, + "logits/rejected": 708562176.0, + "logps/chosen": -239.20640563964844, + "logps/rejected": -410.51953125, + "loss": 0.0302, + "rewards/chosen": 3.2227368354797363, + "rewards/margins": 10.033836841583252, + "rewards/rejected": -6.811100006103516, + "step": 2401 + }, + { + "epoch": 0.21946094106898126, + "grad_norm": 3.375, + "kl": 0.0, + "learning_rate": 8.873106414374545e-06, + "logits/chosen": 710213632.0, + "logits/rejected": 372664768.0, + "logps/chosen": -284.76796177455356, + "logps/rejected": -342.83721923828125, + "loss": 0.0286, + "rewards/chosen": 3.6776182992117747, + "rewards/margins": 12.406985555376325, + "rewards/rejected": -8.72936725616455, + "step": 2402 + }, + { + "epoch": 0.2195523069894929, + "grad_norm": 1.609375, + "kl": 0.0, + "learning_rate": 8.872196953133184e-06, + "logits/chosen": 402727136.0, + "logits/rejected": 472969685.3333333, + "logps/chosen": -364.8427734375, + "logps/rejected": -588.983154296875, + "loss": 0.0091, + "rewards/chosen": 3.3151016235351562, + "rewards/margins": 11.931208928426107, + "rewards/rejected": -8.616107304890951, + "step": 2403 + }, + { + "epoch": 0.21964367291000456, + "grad_norm": 1.859375, + "kl": 0.0, + "learning_rate": 8.871287171697011e-06, + "logits/chosen": 675502080.0, + "logits/rejected": 868098998.8571428, + "logps/chosen": -414.8634033203125, + "logps/rejected": -399.57059151785717, + "loss": 0.008, + "rewards/chosen": 2.8951263427734375, + "rewards/margins": 10.856857299804688, + "rewards/rejected": -7.96173095703125, + "step": 2404 + }, + { + "epoch": 0.2197350388305162, + "grad_norm": 4.59375, + "kl": 0.0, + "learning_rate": 8.870377070141258e-06, + "logits/chosen": 737154944.0, + "logits/rejected": 1216740096.0, + "logps/chosen": -301.2239685058594, + "logps/rejected": -706.1331787109375, + "loss": 0.0311, + "rewards/chosen": 2.772885322570801, + "rewards/margins": 11.312018394470215, + "rewards/rejected": -8.539133071899414, + "step": 2405 + }, + { + "epoch": 0.21982640475102785, + "grad_norm": 18.625, + "kl": 0.0, + "learning_rate": 8.869466648541183e-06, + "logits/chosen": 391686809.6, + "logits/rejected": 555871402.6666666, + "logps/chosen": -381.489404296875, + "logps/rejected": -764.0620930989584, + "loss": 0.1699, + "rewards/chosen": 1.3839868545532226, + "rewards/margins": 9.655810737609864, + "rewards/rejected": -8.27182388305664, + "step": 2406 + }, + { + "epoch": 0.2199177706715395, + "grad_norm": 4.71875, + "kl": 0.0, + "learning_rate": 8.868555906972065e-06, + "logits/chosen": 747736422.4, + "logits/rejected": 361818880.0, + "logps/chosen": -561.109375, + "logps/rejected": -327.84035237630206, + "loss": 0.0279, + "rewards/chosen": 3.216164779663086, + "rewards/margins": 10.469340515136718, + "rewards/rejected": -7.253175735473633, + "step": 2407 + }, + { + "epoch": 0.22000913659205115, + "grad_norm": 2.046875, + "kl": 0.0, + "learning_rate": 8.867644845509218e-06, + "logits/chosen": 348230528.0, + "logits/rejected": 423173043.2, + "logps/chosen": -220.6128133138021, + "logps/rejected": -554.30537109375, + "loss": 0.0106, + "rewards/chosen": 3.7961864471435547, + "rewards/margins": 12.679111099243164, + "rewards/rejected": -8.882924652099609, + "step": 2408 + }, + { + "epoch": 0.2201005025125628, + "grad_norm": 18.625, + "kl": 0.0, + "learning_rate": 8.866733464227975e-06, + "logits/chosen": 528933984.0, + "logits/rejected": 428810432.0, + "logps/chosen": -390.5235290527344, + "logps/rejected": -592.8655395507812, + "loss": 0.0748, + "rewards/chosen": 2.5579922199249268, + "rewards/margins": 10.934849977493286, + "rewards/rejected": -8.37685775756836, + "step": 2409 + }, + { + "epoch": 0.22019186843307445, + "grad_norm": 2.109375, + "kl": 0.0, + "learning_rate": 8.865821763203702e-06, + "logits/chosen": 484488704.0, + "logits/rejected": 673358592.0, + "logps/chosen": -243.19686889648438, + "logps/rejected": -590.8701171875, + "loss": 0.0185, + "rewards/chosen": 3.724160671234131, + "rewards/margins": 13.733767986297607, + "rewards/rejected": -10.009607315063477, + "step": 2410 + }, + { + "epoch": 0.2202832343535861, + "grad_norm": 14.5625, + "kl": 0.0, + "learning_rate": 8.864909742511787e-06, + "logits/chosen": 474342314.6666667, + "logits/rejected": 837342156.8, + "logps/chosen": -274.62432861328125, + "logps/rejected": -298.80732421875, + "loss": 0.063, + "rewards/chosen": 3.371487299601237, + "rewards/margins": 10.309239832560221, + "rewards/rejected": -6.937752532958984, + "step": 2411 + }, + { + "epoch": 0.22037460027409775, + "grad_norm": 5.75, + "kl": 0.0, + "learning_rate": 8.863997402227646e-06, + "logits/chosen": 457226547.2, + "logits/rejected": 302088362.6666667, + "logps/chosen": -349.91611328125, + "logps/rejected": -678.949951171875, + "loss": 0.0352, + "rewards/chosen": 3.1423131942749025, + "rewards/margins": 11.929510307312011, + "rewards/rejected": -8.78719711303711, + "step": 2412 + }, + { + "epoch": 0.2204659661946094, + "grad_norm": 13.9375, + "kl": 0.0, + "learning_rate": 8.863084742426719e-06, + "logits/chosen": 391057408.0, + "logits/rejected": 388175957.3333333, + "logps/chosen": -171.0291290283203, + "logps/rejected": -418.757080078125, + "loss": 0.082, + "rewards/chosen": 1.3157970905303955, + "rewards/margins": 9.59000007311503, + "rewards/rejected": -8.274202982584635, + "step": 2413 + }, + { + "epoch": 0.22055733211512105, + "grad_norm": 6.0, + "kl": 0.0, + "learning_rate": 8.862171763184477e-06, + "logits/chosen": 475338848.0, + "logits/rejected": 466218304.0, + "logps/chosen": -282.775634765625, + "logps/rejected": -441.1136779785156, + "loss": 0.034, + "rewards/chosen": 3.382233142852783, + "rewards/margins": 12.122074604034424, + "rewards/rejected": -8.73984146118164, + "step": 2414 + }, + { + "epoch": 0.2206486980356327, + "grad_norm": 2.625, + "kl": 0.0, + "learning_rate": 8.861258464576414e-06, + "logits/chosen": 876985088.0, + "logits/rejected": 539161685.3333334, + "logps/chosen": -258.6143798828125, + "logps/rejected": -458.6620686848958, + "loss": 0.0149, + "rewards/chosen": 2.846846103668213, + "rewards/margins": 11.065145333607992, + "rewards/rejected": -8.21829922993978, + "step": 2415 + }, + { + "epoch": 0.22074006395614434, + "grad_norm": 5.28125, + "kl": 0.0, + "learning_rate": 8.86034484667805e-06, + "logits/chosen": 579762688.0, + "logits/rejected": 443078976.0, + "logps/chosen": -500.7287292480469, + "logps/rejected": -561.4857788085938, + "loss": 0.031, + "rewards/chosen": 3.0373642444610596, + "rewards/margins": 11.719524621963501, + "rewards/rejected": -8.682160377502441, + "step": 2416 + }, + { + "epoch": 0.220831429876656, + "grad_norm": 2.453125, + "kl": 0.0, + "learning_rate": 8.859430909564935e-06, + "logits/chosen": 330125056.0, + "logits/rejected": 368424729.6, + "logps/chosen": -276.6001790364583, + "logps/rejected": -391.5908447265625, + "loss": 0.0136, + "rewards/chosen": 3.6018555959065757, + "rewards/margins": 12.865903600056967, + "rewards/rejected": -9.264048004150391, + "step": 2417 + }, + { + "epoch": 0.22092279579716764, + "grad_norm": 4.5, + "kl": 0.0, + "learning_rate": 8.858516653312643e-06, + "logits/chosen": 498414899.2, + "logits/rejected": 484726869.3333333, + "logps/chosen": -295.4950439453125, + "logps/rejected": -444.9041341145833, + "loss": 0.028, + "rewards/chosen": 3.5230979919433594, + "rewards/margins": 11.177169799804688, + "rewards/rejected": -7.654071807861328, + "step": 2418 + }, + { + "epoch": 0.22101416171767932, + "grad_norm": 4.28125, + "kl": 0.0, + "learning_rate": 8.857602077996773e-06, + "logits/chosen": 663070259.2, + "logits/rejected": 529200554.6666667, + "logps/chosen": -283.8002685546875, + "logps/rejected": -373.4295247395833, + "loss": 0.0255, + "rewards/chosen": 3.6358722686767577, + "rewards/margins": 10.838979466756184, + "rewards/rejected": -7.203107198079427, + "step": 2419 + }, + { + "epoch": 0.22110552763819097, + "grad_norm": 2.375, + "kl": 0.0, + "learning_rate": 8.856687183692951e-06, + "logits/chosen": 552339882.6666666, + "logits/rejected": 333144268.8, + "logps/chosen": -352.6433512369792, + "logps/rejected": -399.594873046875, + "loss": 0.0137, + "rewards/chosen": 3.3044220606486, + "rewards/margins": 11.792121950785319, + "rewards/rejected": -8.487699890136719, + "step": 2420 + }, + { + "epoch": 0.22119689355870262, + "grad_norm": 37.5, + "kl": 0.0, + "learning_rate": 8.855771970476834e-06, + "logits/chosen": 688809369.6, + "logits/rejected": 941459456.0, + "logps/chosen": -338.1081298828125, + "logps/rejected": -441.60888671875, + "loss": 0.1263, + "rewards/chosen": 2.626433563232422, + "rewards/margins": 8.011231613159179, + "rewards/rejected": -5.384798049926758, + "step": 2421 + }, + { + "epoch": 0.22128825947921427, + "grad_norm": 1.484375, + "kl": 0.0, + "learning_rate": 8.8548564384241e-06, + "logits/chosen": 538160768.0, + "logits/rejected": 962076586.6666666, + "logps/chosen": -329.828125, + "logps/rejected": -625.8485107421875, + "loss": 0.0067, + "rewards/chosen": 4.661494255065918, + "rewards/margins": 15.178217887878418, + "rewards/rejected": -10.5167236328125, + "step": 2422 + }, + { + "epoch": 0.22137962539972592, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 8.853940587610453e-06, + "logits/chosen": 444859648.0, + "logits/rejected": 333767616.0, + "logps/chosen": -235.3063720703125, + "logps/rejected": -404.3816324869792, + "loss": 0.1095, + "rewards/chosen": 3.126903533935547, + "rewards/margins": 7.479088147481282, + "rewards/rejected": -4.352184613545735, + "step": 2423 + }, + { + "epoch": 0.22147099132023756, + "grad_norm": 1.453125, + "kl": 0.0, + "learning_rate": 8.853024418111628e-06, + "logits/chosen": 443296153.6, + "logits/rejected": 724174506.6666666, + "logps/chosen": -352.6157958984375, + "logps/rejected": -464.0023193359375, + "loss": 0.0101, + "rewards/chosen": 4.351749420166016, + "rewards/margins": 11.505868275960285, + "rewards/rejected": -7.1541188557942705, + "step": 2424 + }, + { + "epoch": 0.2215623572407492, + "grad_norm": 2.140625, + "kl": 0.0, + "learning_rate": 8.852107930003382e-06, + "logits/chosen": 780050432.0, + "logits/rejected": 815522048.0, + "logps/chosen": -340.0151062011719, + "logps/rejected": -572.26025390625, + "loss": 0.0149, + "rewards/chosen": 3.593216896057129, + "rewards/margins": 14.156170845031738, + "rewards/rejected": -10.56295394897461, + "step": 2425 + }, + { + "epoch": 0.22165372316126086, + "grad_norm": 4.3125, + "kl": 0.0, + "learning_rate": 8.851191123361502e-06, + "logits/chosen": 708368537.6, + "logits/rejected": 722394965.3333334, + "logps/chosen": -367.3396484375, + "logps/rejected": -426.8111165364583, + "loss": 0.0305, + "rewards/chosen": 3.1865047454833983, + "rewards/margins": 9.792421086629231, + "rewards/rejected": -6.605916341145833, + "step": 2426 + }, + { + "epoch": 0.2217450890817725, + "grad_norm": 3.03125, + "kl": 0.0, + "learning_rate": 8.850273998261796e-06, + "logits/chosen": 640286080.0, + "logits/rejected": 1302632448.0, + "logps/chosen": -290.16815185546875, + "logps/rejected": -486.7300109863281, + "loss": 0.0209, + "rewards/chosen": 3.423100233078003, + "rewards/margins": 13.779812574386597, + "rewards/rejected": -10.356712341308594, + "step": 2427 + }, + { + "epoch": 0.22183645500228416, + "grad_norm": 2.21875, + "kl": 0.0, + "learning_rate": 8.849356554780106e-06, + "logits/chosen": 559152256.0, + "logits/rejected": 599192362.6666666, + "logps/chosen": -476.5421142578125, + "logps/rejected": -385.6793619791667, + "loss": 0.0085, + "rewards/chosen": 3.766631603240967, + "rewards/margins": 11.487135728200276, + "rewards/rejected": -7.72050412495931, + "step": 2428 + }, + { + "epoch": 0.2219278209227958, + "grad_norm": 3.03125, + "kl": 0.0, + "learning_rate": 8.848438792992293e-06, + "logits/chosen": 597951616.0, + "logits/rejected": 861541440.0, + "logps/chosen": -329.87969970703125, + "logps/rejected": -520.4278564453125, + "loss": 0.0178, + "rewards/chosen": 3.4198622703552246, + "rewards/margins": 13.757326602935791, + "rewards/rejected": -10.337464332580566, + "step": 2429 + }, + { + "epoch": 0.22201918684330746, + "grad_norm": 3.9375, + "kl": 0.0, + "learning_rate": 8.84752071297425e-06, + "logits/chosen": 648499302.4, + "logits/rejected": 451237717.3333333, + "logps/chosen": -401.9449951171875, + "logps/rejected": -590.8496907552084, + "loss": 0.0277, + "rewards/chosen": 3.375320816040039, + "rewards/margins": 16.041518274943034, + "rewards/rejected": -12.666197458902994, + "step": 2430 + }, + { + "epoch": 0.2221105527638191, + "grad_norm": 3.8125, + "kl": 0.0, + "learning_rate": 8.84660231480189e-06, + "logits/chosen": 626155904.0, + "logits/rejected": 708007936.0, + "logps/chosen": -326.24969482421875, + "logps/rejected": -774.6368408203125, + "loss": 0.0192, + "rewards/chosen": 3.719123125076294, + "rewards/margins": 12.826039552688599, + "rewards/rejected": -9.106916427612305, + "step": 2431 + }, + { + "epoch": 0.22220191868433076, + "grad_norm": 3.6875, + "kl": 0.0, + "learning_rate": 8.845683598551157e-06, + "logits/chosen": 260075920.0, + "logits/rejected": 479260202.6666667, + "logps/chosen": -302.14892578125, + "logps/rejected": -703.2565104166666, + "loss": 0.0167, + "rewards/chosen": 3.9546937942504883, + "rewards/margins": 13.156867663065592, + "rewards/rejected": -9.202173868815104, + "step": 2432 + }, + { + "epoch": 0.2222932846048424, + "grad_norm": 14.4375, + "kl": 0.0, + "learning_rate": 8.844764564298025e-06, + "logits/chosen": 383625301.3333333, + "logits/rejected": 288133568.0, + "logps/chosen": -305.77931722005206, + "logps/rejected": -319.093017578125, + "loss": 0.0979, + "rewards/chosen": 3.312876065572103, + "rewards/margins": 13.791661580403646, + "rewards/rejected": -10.478785514831543, + "step": 2433 + }, + { + "epoch": 0.22238465052535406, + "grad_norm": 3.265625, + "kl": 0.0, + "learning_rate": 8.843845212118483e-06, + "logits/chosen": 710232320.0, + "logits/rejected": 568547225.6, + "logps/chosen": -387.8463541666667, + "logps/rejected": -458.721923828125, + "loss": 0.0197, + "rewards/chosen": 3.100128173828125, + "rewards/margins": 11.092051696777343, + "rewards/rejected": -7.991923522949219, + "step": 2434 + }, + { + "epoch": 0.2224760164458657, + "grad_norm": 1.2578125, + "kl": 0.0, + "learning_rate": 8.842925542088558e-06, + "logits/chosen": 578681856.0, + "logits/rejected": 875606323.2, + "logps/chosen": -443.2561442057292, + "logps/rejected": -638.76328125, + "loss": 0.0059, + "rewards/chosen": 4.336808204650879, + "rewards/margins": 13.978479957580566, + "rewards/rejected": -9.641671752929687, + "step": 2435 + }, + { + "epoch": 0.22256738236637735, + "grad_norm": 2.328125, + "kl": 0.0, + "learning_rate": 8.842005554284296e-06, + "logits/chosen": 438278912.0, + "logits/rejected": 602358630.4, + "logps/chosen": -241.24918619791666, + "logps/rejected": -398.06572265625, + "loss": 0.0092, + "rewards/chosen": 4.43848991394043, + "rewards/margins": 12.833002853393555, + "rewards/rejected": -8.394512939453126, + "step": 2436 + }, + { + "epoch": 0.222658748286889, + "grad_norm": 13.4375, + "kl": 0.0, + "learning_rate": 8.84108524878177e-06, + "logits/chosen": 929297728.0, + "logits/rejected": 295988053.3333333, + "logps/chosen": -591.9096069335938, + "logps/rejected": -387.2389322916667, + "loss": 0.0284, + "rewards/chosen": 4.244182109832764, + "rewards/margins": 12.310328324635824, + "rewards/rejected": -8.06614621480306, + "step": 2437 + }, + { + "epoch": 0.22275011420740065, + "grad_norm": 2.609375, + "kl": 0.0, + "learning_rate": 8.840164625657084e-06, + "logits/chosen": 712506368.0, + "logits/rejected": 755847424.0, + "logps/chosen": -323.3995361328125, + "logps/rejected": -508.4366861979167, + "loss": 0.0136, + "rewards/chosen": 3.2041025161743164, + "rewards/margins": 10.138262112935383, + "rewards/rejected": -6.934159596761067, + "step": 2438 + }, + { + "epoch": 0.2228414801279123, + "grad_norm": 1.828125, + "kl": 0.0, + "learning_rate": 8.839243684986364e-06, + "logits/chosen": 777893973.3333334, + "logits/rejected": 474768486.4, + "logps/chosen": -334.36920166015625, + "logps/rejected": -552.316455078125, + "loss": 0.0111, + "rewards/chosen": 3.565373420715332, + "rewards/margins": 12.080241584777832, + "rewards/rejected": -8.5148681640625, + "step": 2439 + }, + { + "epoch": 0.22293284604842395, + "grad_norm": 2.78125, + "kl": 0.0, + "learning_rate": 8.83832242684576e-06, + "logits/chosen": 569197696.0, + "logits/rejected": 377918144.0, + "logps/chosen": -403.936767578125, + "logps/rejected": -430.9650573730469, + "loss": 0.0181, + "rewards/chosen": 3.715477705001831, + "rewards/margins": 11.021129369735718, + "rewards/rejected": -7.305651664733887, + "step": 2440 + }, + { + "epoch": 0.2230242119689356, + "grad_norm": 3.703125, + "kl": 0.0, + "learning_rate": 8.837400851311458e-06, + "logits/chosen": 920731989.3333334, + "logits/rejected": 493234227.2, + "logps/chosen": -585.9861653645834, + "logps/rejected": -490.64873046875, + "loss": 0.0171, + "rewards/chosen": 3.1207764943440757, + "rewards/margins": 12.047675450642904, + "rewards/rejected": -8.926898956298828, + "step": 2441 + }, + { + "epoch": 0.22311557788944725, + "grad_norm": 3.15625, + "kl": 0.0, + "learning_rate": 8.836478958459656e-06, + "logits/chosen": 530722048.0, + "logits/rejected": 746587328.0, + "logps/chosen": -295.69488525390625, + "logps/rejected": -509.1202697753906, + "loss": 0.0155, + "rewards/chosen": 3.739469289779663, + "rewards/margins": 13.17328143119812, + "rewards/rejected": -9.433812141418457, + "step": 2442 + }, + { + "epoch": 0.2232069438099589, + "grad_norm": 2.5625, + "kl": 0.0, + "learning_rate": 8.835556748366592e-06, + "logits/chosen": 439243808.0, + "logits/rejected": 309509152.0, + "logps/chosen": -379.4503479003906, + "logps/rejected": -463.463134765625, + "loss": 0.0143, + "rewards/chosen": 3.6833677291870117, + "rewards/margins": 13.437859535217285, + "rewards/rejected": -9.754491806030273, + "step": 2443 + }, + { + "epoch": 0.22329830973047055, + "grad_norm": 4.46875, + "kl": 0.0, + "learning_rate": 8.83463422110852e-06, + "logits/chosen": 421760224.0, + "logits/rejected": 589894528.0, + "logps/chosen": -309.87115478515625, + "logps/rejected": -540.6256713867188, + "loss": 0.0301, + "rewards/chosen": 2.873905658721924, + "rewards/margins": 12.047570705413818, + "rewards/rejected": -9.173665046691895, + "step": 2444 + }, + { + "epoch": 0.2233896756509822, + "grad_norm": 4.375, + "kl": 0.0, + "learning_rate": 8.833711376761729e-06, + "logits/chosen": 838042931.2, + "logits/rejected": 819322368.0, + "logps/chosen": -450.1236328125, + "logps/rejected": -571.0364176432291, + "loss": 0.023, + "rewards/chosen": 3.864216613769531, + "rewards/margins": 12.114600626627603, + "rewards/rejected": -8.250384012858072, + "step": 2445 + }, + { + "epoch": 0.22348104157149384, + "grad_norm": 18.5, + "kl": 0.0, + "learning_rate": 8.832788215402527e-06, + "logits/chosen": 509842944.0, + "logits/rejected": 1098801834.6666667, + "logps/chosen": -256.5474609375, + "logps/rejected": -610.9798990885416, + "loss": 0.0884, + "rewards/chosen": 3.5325088500976562, + "rewards/margins": 13.137284596761068, + "rewards/rejected": -9.604775746663412, + "step": 2446 + }, + { + "epoch": 0.2235724074920055, + "grad_norm": 2.5625, + "kl": 0.0, + "learning_rate": 8.83186473710725e-06, + "logits/chosen": 433839168.0, + "logits/rejected": 250256768.0, + "logps/chosen": -197.02142333984375, + "logps/rejected": -360.803955078125, + "loss": 0.0134, + "rewards/chosen": 4.1406145095825195, + "rewards/margins": 12.983502388000488, + "rewards/rejected": -8.842887878417969, + "step": 2447 + }, + { + "epoch": 0.22366377341251714, + "grad_norm": 3.09375, + "kl": 0.0, + "learning_rate": 8.830940941952264e-06, + "logits/chosen": 605580864.0, + "logits/rejected": 1429076864.0, + "logps/chosen": -304.382568359375, + "logps/rejected": -650.4595947265625, + "loss": 0.0162, + "rewards/chosen": 3.6904690265655518, + "rewards/margins": 13.112248659133911, + "rewards/rejected": -9.42177963256836, + "step": 2448 + }, + { + "epoch": 0.2237551393330288, + "grad_norm": 6.90625, + "kl": 0.0, + "learning_rate": 8.830016830013954e-06, + "logits/chosen": 805136230.4, + "logits/rejected": 441293653.3333333, + "logps/chosen": -238.8230224609375, + "logps/rejected": -555.8767903645834, + "loss": 0.1565, + "rewards/chosen": 1.549580955505371, + "rewards/margins": 12.056817436218262, + "rewards/rejected": -10.50723648071289, + "step": 2449 + }, + { + "epoch": 0.22384650525354044, + "grad_norm": 4.8125, + "kl": 0.0, + "learning_rate": 8.82909240136874e-06, + "logits/chosen": 644367308.8, + "logits/rejected": 902240768.0, + "logps/chosen": -263.392431640625, + "logps/rejected": -433.3641357421875, + "loss": 0.0344, + "rewards/chosen": 3.4135787963867186, + "rewards/margins": 11.404551951090495, + "rewards/rejected": -7.990973154703776, + "step": 2450 + }, + { + "epoch": 0.2239378711740521, + "grad_norm": 3.90625, + "kl": 0.0, + "learning_rate": 8.82816765609306e-06, + "logits/chosen": 317992234.6666667, + "logits/rejected": 582666816.0, + "logps/chosen": -264.9363199869792, + "logps/rejected": -339.53924560546875, + "loss": 0.0263, + "rewards/chosen": 3.8494656880696616, + "rewards/margins": 12.881271680196127, + "rewards/rejected": -9.031805992126465, + "step": 2451 + }, + { + "epoch": 0.22402923709456374, + "grad_norm": 4.0625, + "kl": 0.0, + "learning_rate": 8.827242594263385e-06, + "logits/chosen": 813979340.8, + "logits/rejected": 532923050.6666667, + "logps/chosen": -170.506005859375, + "logps/rejected": -389.8792317708333, + "loss": 0.0255, + "rewards/chosen": 3.4425209045410154, + "rewards/margins": 11.681131490071614, + "rewards/rejected": -8.2386105855306, + "step": 2452 + }, + { + "epoch": 0.2241206030150754, + "grad_norm": 1.3203125, + "kl": 0.0, + "learning_rate": 8.826317215956209e-06, + "logits/chosen": 612413696.0, + "logits/rejected": 514513109.3333333, + "logps/chosen": -536.09580078125, + "logps/rejected": -577.1054280598959, + "loss": 0.0077, + "rewards/chosen": 4.798539733886718, + "rewards/margins": 14.194592412312826, + "rewards/rejected": -9.396052678426107, + "step": 2453 + }, + { + "epoch": 0.22421196893558704, + "grad_norm": 8.4375, + "kl": 0.0, + "learning_rate": 8.825391521248047e-06, + "logits/chosen": 403583436.8, + "logits/rejected": 307026773.3333333, + "logps/chosen": -443.28271484375, + "logps/rejected": -438.0498046875, + "loss": 0.0453, + "rewards/chosen": 3.2193439483642576, + "rewards/margins": 11.399279403686524, + "rewards/rejected": -8.179935455322266, + "step": 2454 + }, + { + "epoch": 0.22430333485609869, + "grad_norm": 0.890625, + "kl": 0.0, + "learning_rate": 8.824465510215453e-06, + "logits/chosen": 894893248.0, + "logits/rejected": 451061650.28571427, + "logps/chosen": -266.7589111328125, + "logps/rejected": -626.1834542410714, + "loss": 0.0031, + "rewards/chosen": 3.732403516769409, + "rewards/margins": 14.679078408649989, + "rewards/rejected": -10.94667489188058, + "step": 2455 + }, + { + "epoch": 0.22439470077661033, + "grad_norm": 6.6875, + "kl": 0.0, + "learning_rate": 8.823539182934994e-06, + "logits/chosen": 624081706.6666666, + "logits/rejected": 525191456.0, + "logps/chosen": -411.4071858723958, + "logps/rejected": -466.9833984375, + "loss": 0.0403, + "rewards/chosen": 3.1956049601236978, + "rewards/margins": 10.595846811930338, + "rewards/rejected": -7.400241851806641, + "step": 2456 + }, + { + "epoch": 0.22448606669712198, + "grad_norm": 16.25, + "kl": 0.0, + "learning_rate": 8.82261253948327e-06, + "logits/chosen": 491208857.6, + "logits/rejected": 591592277.3333334, + "logps/chosen": -270.224169921875, + "logps/rejected": -485.5871988932292, + "loss": 0.1069, + "rewards/chosen": 2.4249652862548827, + "rewards/margins": 10.965679550170899, + "rewards/rejected": -8.540714263916016, + "step": 2457 + }, + { + "epoch": 0.22457743261763363, + "grad_norm": 2.28125, + "kl": 0.0, + "learning_rate": 8.821685579936909e-06, + "logits/chosen": 548145024.0, + "logits/rejected": 313060704.0, + "logps/chosen": -373.1006164550781, + "logps/rejected": -473.47210693359375, + "loss": 0.0145, + "rewards/chosen": 3.5913033485412598, + "rewards/margins": 15.58266305923462, + "rewards/rejected": -11.99135971069336, + "step": 2458 + }, + { + "epoch": 0.22466879853814528, + "grad_norm": 5.875, + "kl": 0.0, + "learning_rate": 8.820758304372557e-06, + "logits/chosen": 521264384.0, + "logits/rejected": 508309760.0, + "logps/chosen": -405.2196451822917, + "logps/rejected": -434.902099609375, + "loss": 0.0415, + "rewards/chosen": 3.2660512924194336, + "rewards/margins": 10.936758041381836, + "rewards/rejected": -7.670706748962402, + "step": 2459 + }, + { + "epoch": 0.22476016445865693, + "grad_norm": 13.0, + "kl": 0.0, + "learning_rate": 8.819830712866893e-06, + "logits/chosen": 499168032.0, + "logits/rejected": 919069440.0, + "logps/chosen": -243.27359008789062, + "logps/rejected": -581.900390625, + "loss": 0.1806, + "rewards/chosen": 2.753560781478882, + "rewards/margins": 11.61868166923523, + "rewards/rejected": -8.865120887756348, + "step": 2460 + }, + { + "epoch": 0.22485153037916858, + "grad_norm": 1.265625, + "kl": 0.0, + "learning_rate": 8.818902805496622e-06, + "logits/chosen": 349118944.0, + "logits/rejected": 348636693.3333333, + "logps/chosen": -226.48500061035156, + "logps/rejected": -427.6421305338542, + "loss": 0.0074, + "rewards/chosen": 3.5494778156280518, + "rewards/margins": 12.362260103225708, + "rewards/rejected": -8.812782287597656, + "step": 2461 + }, + { + "epoch": 0.22494289629968023, + "grad_norm": 7.8125, + "kl": 0.0, + "learning_rate": 8.817974582338471e-06, + "logits/chosen": 624599552.0, + "logits/rejected": 664768819.2, + "logps/chosen": -379.9085286458333, + "logps/rejected": -557.58154296875, + "loss": 0.0308, + "rewards/chosen": 3.1306680043538413, + "rewards/margins": 12.68195889790853, + "rewards/rejected": -9.551290893554688, + "step": 2462 + }, + { + "epoch": 0.22503426222019188, + "grad_norm": 5.25, + "kl": 0.0, + "learning_rate": 8.817046043469195e-06, + "logits/chosen": 438629034.6666667, + "logits/rejected": 248907040.0, + "logps/chosen": -391.0437825520833, + "logps/rejected": -314.2481689453125, + "loss": 0.0483, + "rewards/chosen": 2.904966672261556, + "rewards/margins": 9.487555821736654, + "rewards/rejected": -6.582589149475098, + "step": 2463 + }, + { + "epoch": 0.22512562814070353, + "grad_norm": 10.1875, + "kl": 0.0, + "learning_rate": 8.81611718896558e-06, + "logits/chosen": 769385130.6666666, + "logits/rejected": 481668198.4, + "logps/chosen": -410.070556640625, + "logps/rejected": -380.7936767578125, + "loss": 0.1294, + "rewards/chosen": 2.5804821650187173, + "rewards/margins": 10.317454210917155, + "rewards/rejected": -7.736972045898438, + "step": 2464 + }, + { + "epoch": 0.22521699406121518, + "grad_norm": 3.046875, + "kl": 0.0, + "learning_rate": 8.815188018904427e-06, + "logits/chosen": 1144165120.0, + "logits/rejected": 859621312.0, + "logps/chosen": -326.51495361328125, + "logps/rejected": -539.7205810546875, + "loss": 0.0321, + "rewards/chosen": 3.1636781692504883, + "rewards/margins": 10.305413246154785, + "rewards/rejected": -7.141735076904297, + "step": 2465 + }, + { + "epoch": 0.22530835998172682, + "grad_norm": 1.5625, + "kl": 0.0, + "learning_rate": 8.814258533362576e-06, + "logits/chosen": 494369344.0, + "logits/rejected": 434259840.0, + "logps/chosen": -395.9078063964844, + "logps/rejected": -492.2933044433594, + "loss": 0.0109, + "rewards/chosen": 4.028676509857178, + "rewards/margins": 13.040204524993896, + "rewards/rejected": -9.011528015136719, + "step": 2466 + }, + { + "epoch": 0.22539972590223847, + "grad_norm": 2.859375, + "kl": 0.0, + "learning_rate": 8.813328732416883e-06, + "logits/chosen": 553076992.0, + "logits/rejected": 488391488.0, + "logps/chosen": -226.45864868164062, + "logps/rejected": -507.69207763671875, + "loss": 0.0165, + "rewards/chosen": 3.619830846786499, + "rewards/margins": 11.863900899887085, + "rewards/rejected": -8.244070053100586, + "step": 2467 + }, + { + "epoch": 0.22549109182275012, + "grad_norm": 18.125, + "kl": 0.0, + "learning_rate": 8.812398616144236e-06, + "logits/chosen": 514911456.0, + "logits/rejected": 449411840.0, + "logps/chosen": -330.2676696777344, + "logps/rejected": -314.821044921875, + "loss": 0.0684, + "rewards/chosen": 4.549568176269531, + "rewards/margins": 11.34248161315918, + "rewards/rejected": -6.792913436889648, + "step": 2468 + }, + { + "epoch": 0.22558245774326177, + "grad_norm": 5.8125, + "kl": 0.0, + "learning_rate": 8.811468184621545e-06, + "logits/chosen": 697503232.0, + "logits/rejected": 695489843.2, + "logps/chosen": -255.25712076822916, + "logps/rejected": -494.903662109375, + "loss": 0.0249, + "rewards/chosen": 3.3224767049153647, + "rewards/margins": 10.833192952473958, + "rewards/rejected": -7.510716247558594, + "step": 2469 + }, + { + "epoch": 0.22567382366377342, + "grad_norm": 16.625, + "kl": 0.0, + "learning_rate": 8.81053743792575e-06, + "logits/chosen": 659870976.0, + "logits/rejected": 451816448.0, + "logps/chosen": -291.601904296875, + "logps/rejected": -464.6453450520833, + "loss": 0.0367, + "rewards/chosen": 3.858226776123047, + "rewards/margins": 10.972979227701824, + "rewards/rejected": -7.114752451578776, + "step": 2470 + }, + { + "epoch": 0.22576518958428507, + "grad_norm": 2.640625, + "kl": 0.0, + "learning_rate": 8.809606376133814e-06, + "logits/chosen": 438753877.3333333, + "logits/rejected": 313077990.4, + "logps/chosen": -400.6415608723958, + "logps/rejected": -374.763916015625, + "loss": 0.0143, + "rewards/chosen": 3.963367780049642, + "rewards/margins": 10.955184491475423, + "rewards/rejected": -6.991816711425781, + "step": 2471 + }, + { + "epoch": 0.22585655550479672, + "grad_norm": 25.625, + "kl": 0.0, + "learning_rate": 8.808674999322729e-06, + "logits/chosen": 328001408.0, + "logits/rejected": 552917376.0, + "logps/chosen": -225.8888916015625, + "logps/rejected": -755.9808756510416, + "loss": 0.0716, + "rewards/chosen": 3.1444515228271483, + "rewards/margins": 14.129635238647461, + "rewards/rejected": -10.985183715820312, + "step": 2472 + }, + { + "epoch": 0.22594792142530837, + "grad_norm": 3.609375, + "kl": 0.0, + "learning_rate": 8.807743307569508e-06, + "logits/chosen": 558665472.0, + "logits/rejected": 432976640.0, + "logps/chosen": -160.1135986328125, + "logps/rejected": -362.7593994140625, + "loss": 0.0336, + "rewards/chosen": 3.216960144042969, + "rewards/margins": 10.255336634318034, + "rewards/rejected": -7.038376490275065, + "step": 2473 + }, + { + "epoch": 0.22603928734582002, + "grad_norm": 17.0, + "kl": 0.0, + "learning_rate": 8.806811300951194e-06, + "logits/chosen": 440972390.4, + "logits/rejected": 281168469.3333333, + "logps/chosen": -313.597265625, + "logps/rejected": -427.971435546875, + "loss": 0.1316, + "rewards/chosen": 2.4255182266235353, + "rewards/margins": 9.143159421284993, + "rewards/rejected": -6.717641194661458, + "step": 2474 + }, + { + "epoch": 0.22613065326633167, + "grad_norm": 10.4375, + "kl": 0.0, + "learning_rate": 8.80587897954486e-06, + "logits/chosen": 669106986.6666666, + "logits/rejected": 1183057920.0, + "logps/chosen": -341.6966959635417, + "logps/rejected": -760.776611328125, + "loss": 0.0621, + "rewards/chosen": 3.3210229873657227, + "rewards/margins": 12.339590072631836, + "rewards/rejected": -9.018567085266113, + "step": 2475 + }, + { + "epoch": 0.22622201918684332, + "grad_norm": 9.9375, + "kl": 2.9787940979003906, + "learning_rate": 8.804946343427594e-06, + "logits/chosen": 873622966.8571428, + "logits/rejected": 535610656.0, + "logps/chosen": -316.7525111607143, + "logps/rejected": -230.71878051757812, + "loss": 0.1437, + "rewards/chosen": 2.7796500069754466, + "rewards/margins": 8.503408227648054, + "rewards/rejected": -5.723758220672607, + "step": 2476 + }, + { + "epoch": 0.22631338510735496, + "grad_norm": 6.09375, + "kl": 0.0, + "learning_rate": 8.804013392676519e-06, + "logits/chosen": 614357717.3333334, + "logits/rejected": 442938828.8, + "logps/chosen": -308.4400634765625, + "logps/rejected": -446.44541015625, + "loss": 0.0399, + "rewards/chosen": 2.33685302734375, + "rewards/margins": 10.047706604003906, + "rewards/rejected": -7.710853576660156, + "step": 2477 + }, + { + "epoch": 0.2264047510278666, + "grad_norm": 13.1875, + "kl": 0.0, + "learning_rate": 8.803080127368782e-06, + "logits/chosen": 506951456.0, + "logps/chosen": -363.8929443359375, + "loss": 0.0762, + "rewards/chosen": 2.889577865600586, + "step": 2478 + }, + { + "epoch": 0.22649611694837826, + "grad_norm": 3.90625, + "kl": 0.0, + "learning_rate": 8.802146547581555e-06, + "logits/chosen": 627665280.0, + "logits/rejected": 462614400.0, + "logps/chosen": -252.326171875, + "logps/rejected": -532.6597900390625, + "loss": 0.0205, + "rewards/chosen": 3.4415688514709473, + "rewards/margins": 13.344013690948486, + "rewards/rejected": -9.902444839477539, + "step": 2479 + }, + { + "epoch": 0.2265874828688899, + "grad_norm": 2.46875, + "kl": 0.0, + "learning_rate": 8.801212653392035e-06, + "logits/chosen": 418739200.0, + "logits/rejected": 819630421.3333334, + "logps/chosen": -191.416259765625, + "logps/rejected": -337.2978515625, + "loss": 0.0218, + "rewards/chosen": 3.823100280761719, + "rewards/margins": 11.916932296752929, + "rewards/rejected": -8.093832015991211, + "step": 2480 + }, + { + "epoch": 0.22667884878940156, + "grad_norm": 3.15625, + "kl": 0.0, + "learning_rate": 8.800278444877448e-06, + "logits/chosen": 528730410.6666667, + "logits/rejected": 683766579.2, + "logps/chosen": -294.4866943359375, + "logps/rejected": -604.766796875, + "loss": 0.0224, + "rewards/chosen": 2.849702517191569, + "rewards/margins": 13.526993624369302, + "rewards/rejected": -10.677291107177734, + "step": 2481 + }, + { + "epoch": 0.2267702147099132, + "grad_norm": 2.5, + "kl": 0.0, + "learning_rate": 8.799343922115045e-06, + "logits/chosen": 469762944.0, + "logits/rejected": 385843648.0, + "logps/chosen": -294.2833251953125, + "logps/rejected": -364.5412902832031, + "loss": 0.0173, + "rewards/chosen": 3.581972599029541, + "rewards/margins": 11.220648765563965, + "rewards/rejected": -7.638676166534424, + "step": 2482 + }, + { + "epoch": 0.22686158063042486, + "grad_norm": 7.03125, + "kl": 0.0, + "learning_rate": 8.798409085182099e-06, + "logits/chosen": 655689932.8, + "logits/rejected": 921663488.0, + "logps/chosen": -460.606640625, + "logps/rejected": -505.0325520833333, + "loss": 0.0391, + "rewards/chosen": 3.0574764251708983, + "rewards/margins": 8.881361389160157, + "rewards/rejected": -5.823884963989258, + "step": 2483 + }, + { + "epoch": 0.2269529465509365, + "grad_norm": 2.125, + "kl": 0.0, + "learning_rate": 8.797473934155916e-06, + "logits/chosen": 408826496.0, + "logits/rejected": 430816512.0, + "logps/chosen": -383.14541015625, + "logps/rejected": -451.6692301432292, + "loss": 0.014, + "rewards/chosen": 3.932852935791016, + "rewards/margins": 12.971597544352214, + "rewards/rejected": -9.038744608561197, + "step": 2484 + }, + { + "epoch": 0.22704431247144816, + "grad_norm": 4.03125, + "kl": 0.0, + "learning_rate": 8.796538469113823e-06, + "logits/chosen": 597857216.0, + "logits/rejected": 846184000.0, + "logps/chosen": -243.02346801757812, + "logps/rejected": -532.7725219726562, + "loss": 0.0306, + "rewards/chosen": 3.2313737869262695, + "rewards/margins": 11.053823471069336, + "rewards/rejected": -7.822449684143066, + "step": 2485 + }, + { + "epoch": 0.2271356783919598, + "grad_norm": 18.375, + "kl": 0.0, + "learning_rate": 8.795602690133175e-06, + "logits/chosen": 507393696.0, + "logits/rejected": 588815445.3333334, + "logps/chosen": -304.28277587890625, + "logps/rejected": -411.5224202473958, + "loss": 0.0769, + "rewards/chosen": 3.2332534790039062, + "rewards/margins": 10.63014856974284, + "rewards/rejected": -7.396895090738933, + "step": 2486 + }, + { + "epoch": 0.22722704431247145, + "grad_norm": 7.0625, + "kl": 0.0, + "learning_rate": 8.794666597291352e-06, + "logits/chosen": 505866086.4, + "logits/rejected": 457587882.6666667, + "logps/chosen": -254.972998046875, + "logps/rejected": -560.4447428385416, + "loss": 0.0559, + "rewards/chosen": 2.604722023010254, + "rewards/margins": 10.654388109842936, + "rewards/rejected": -8.049666086832682, + "step": 2487 + }, + { + "epoch": 0.2273184102329831, + "grad_norm": 1.0390625, + "kl": 0.0, + "learning_rate": 8.793730190665758e-06, + "logits/chosen": 779787008.0, + "logits/rejected": 412581888.0, + "logps/chosen": -152.75968424479166, + "logps/rejected": -508.344091796875, + "loss": 0.0069, + "rewards/chosen": 4.944541295369466, + "rewards/margins": 14.636197026570638, + "rewards/rejected": -9.691655731201172, + "step": 2488 + }, + { + "epoch": 0.22740977615349475, + "grad_norm": 4.6875, + "kl": 0.0, + "learning_rate": 8.792793470333829e-06, + "logits/chosen": 503012181.3333333, + "logits/rejected": 518804288.0, + "logps/chosen": -297.90712483723956, + "logps/rejected": -865.358154296875, + "loss": 0.0315, + "rewards/chosen": 3.793196360270182, + "rewards/margins": 15.669789950052897, + "rewards/rejected": -11.876593589782715, + "step": 2489 + }, + { + "epoch": 0.2275011420740064, + "grad_norm": 3.5625, + "kl": 0.0, + "learning_rate": 8.791856436373018e-06, + "logits/chosen": 413811737.6, + "logits/rejected": 376611413.3333333, + "logps/chosen": -279.9821044921875, + "logps/rejected": -297.2004801432292, + "loss": 0.0183, + "rewards/chosen": 4.487185668945313, + "rewards/margins": 11.521009318033855, + "rewards/rejected": -7.033823649088542, + "step": 2490 + }, + { + "epoch": 0.22759250799451805, + "grad_norm": 5.03125, + "kl": 0.0, + "learning_rate": 8.790919088860815e-06, + "logits/chosen": 645364160.0, + "logits/rejected": 542137472.0, + "logps/chosen": -349.1308288574219, + "logps/rejected": -621.3884887695312, + "loss": 0.0308, + "rewards/chosen": 2.871575355529785, + "rewards/margins": 10.977537155151367, + "rewards/rejected": -8.105961799621582, + "step": 2491 + }, + { + "epoch": 0.2276838739150297, + "grad_norm": 2.09375, + "kl": 0.0, + "learning_rate": 8.789981427874726e-06, + "logits/chosen": 369561301.3333333, + "logits/rejected": 300702464.0, + "logps/chosen": -213.09344482421875, + "logps/rejected": -476.19501953125, + "loss": 0.0151, + "rewards/chosen": 3.4128236770629883, + "rewards/margins": 13.041942024230957, + "rewards/rejected": -9.629118347167969, + "step": 2492 + }, + { + "epoch": 0.22777523983554135, + "grad_norm": 3.984375, + "kl": 0.0, + "learning_rate": 8.789043453492288e-06, + "logits/chosen": 572058282.6666666, + "logits/rejected": 505325158.4, + "logps/chosen": -365.5469156901042, + "logps/rejected": -367.194482421875, + "loss": 0.0205, + "rewards/chosen": 2.9056946436564126, + "rewards/margins": 11.178619448343913, + "rewards/rejected": -8.2729248046875, + "step": 2493 + }, + { + "epoch": 0.227866605756053, + "grad_norm": 3.6875, + "kl": 0.0, + "learning_rate": 8.788105165791062e-06, + "logits/chosen": 598505685.3333334, + "logits/rejected": 551656396.8, + "logps/chosen": -281.21405029296875, + "logps/rejected": -504.93896484375, + "loss": 0.0269, + "rewards/chosen": 2.7339353561401367, + "rewards/margins": 11.968586540222168, + "rewards/rejected": -9.234651184082031, + "step": 2494 + }, + { + "epoch": 0.22795797167656465, + "grad_norm": 5.125, + "kl": 0.0, + "learning_rate": 8.787166564848637e-06, + "logits/chosen": 564055424.0, + "logits/rejected": 395381120.0, + "logps/chosen": -385.7093505859375, + "logps/rejected": -582.1007080078125, + "loss": 0.0294, + "rewards/chosen": 2.8867716789245605, + "rewards/margins": 15.56260633468628, + "rewards/rejected": -12.675834655761719, + "step": 2495 + }, + { + "epoch": 0.2280493375970763, + "grad_norm": 1.3671875, + "kl": 0.0, + "learning_rate": 8.786227650742624e-06, + "logits/chosen": 449558869.3333333, + "logits/rejected": 557050368.0, + "logps/chosen": -389.7670491536458, + "logps/rejected": -540.8400390625, + "loss": 0.0053, + "rewards/chosen": 4.575642903645833, + "rewards/margins": 14.046959431966144, + "rewards/rejected": -9.471316528320312, + "step": 2496 + }, + { + "epoch": 0.22814070351758794, + "grad_norm": 2.4375, + "kl": 0.0, + "learning_rate": 8.785288423550668e-06, + "logits/chosen": 601987136.0, + "logits/rejected": 498554560.0, + "logps/chosen": -180.98013305664062, + "logps/rejected": -559.5274658203125, + "loss": 0.0196, + "rewards/chosen": 3.5815155506134033, + "rewards/margins": 13.920230150222778, + "rewards/rejected": -10.338714599609375, + "step": 2497 + }, + { + "epoch": 0.2282320694380996, + "grad_norm": 2.3125, + "kl": 0.0, + "learning_rate": 8.784348883350428e-06, + "logits/chosen": 354891157.3333333, + "logits/rejected": 563850547.2, + "logps/chosen": -220.26505533854166, + "logps/rejected": -476.59365234375, + "loss": 0.0142, + "rewards/chosen": 3.502986272176107, + "rewards/margins": 11.711822636922202, + "rewards/rejected": -8.208836364746094, + "step": 2498 + }, + { + "epoch": 0.22832343535861124, + "grad_norm": 5.71875, + "kl": 0.0, + "learning_rate": 8.783409030219599e-06, + "logits/chosen": 1207108608.0, + "logits/rejected": 909718912.0, + "logps/chosen": -630.1617431640625, + "logps/rejected": -484.45684814453125, + "loss": 0.0265, + "rewards/chosen": 3.414181709289551, + "rewards/margins": 10.362512588500977, + "rewards/rejected": -6.948330879211426, + "step": 2499 + }, + { + "epoch": 0.2284148012791229, + "grad_norm": 2.328125, + "kl": 0.0, + "learning_rate": 8.782468864235899e-06, + "logits/chosen": 780739968.0, + "logits/rejected": 501374528.0, + "logps/chosen": -350.67669677734375, + "logps/rejected": -597.124267578125, + "loss": 0.0131, + "rewards/chosen": 4.01397705078125, + "rewards/margins": 14.717905044555664, + "rewards/rejected": -10.703927993774414, + "step": 2500 + }, + { + "epoch": 0.22850616719963454, + "grad_norm": 2.59375, + "kl": 0.0, + "learning_rate": 8.781528385477066e-06, + "logits/chosen": 329185280.0, + "logits/rejected": 656363477.3333334, + "logps/chosen": -373.5151062011719, + "logps/rejected": -524.6189778645834, + "loss": 0.0084, + "rewards/chosen": 3.8137786388397217, + "rewards/margins": 13.363882780075073, + "rewards/rejected": -9.550104141235352, + "step": 2501 + }, + { + "epoch": 0.2285975331201462, + "grad_norm": 2.046875, + "kl": 0.0, + "learning_rate": 8.780587594020875e-06, + "logits/chosen": 885217856.0, + "logits/rejected": 645846144.0, + "logps/chosen": -365.4848937988281, + "logps/rejected": -529.6053466796875, + "loss": 0.0154, + "rewards/chosen": 3.6852974891662598, + "rewards/margins": 11.547041893005371, + "rewards/rejected": -7.861744403839111, + "step": 2502 + }, + { + "epoch": 0.22868889904065784, + "grad_norm": 13.75, + "kl": 0.0, + "learning_rate": 8.779646489945118e-06, + "logits/chosen": 580224341.3333334, + "logits/rejected": 1022418944.0, + "logps/chosen": -325.1326904296875, + "logps/rejected": -227.97634887695312, + "loss": 0.1906, + "rewards/chosen": 2.487488587697347, + "rewards/margins": 7.101975758870443, + "rewards/rejected": -4.614487171173096, + "step": 2503 + }, + { + "epoch": 0.2287802649611695, + "grad_norm": 5.21875, + "kl": 0.0, + "learning_rate": 8.778705073327613e-06, + "logits/chosen": 1202595669.3333333, + "logits/rejected": 657057920.0, + "logps/chosen": -286.9490559895833, + "logps/rejected": -569.4960327148438, + "loss": 0.045, + "rewards/chosen": 3.1665655771891275, + "rewards/margins": 11.4778839747111, + "rewards/rejected": -8.311318397521973, + "step": 2504 + }, + { + "epoch": 0.22887163088168114, + "grad_norm": 6.0, + "kl": 0.0, + "learning_rate": 8.777763344246209e-06, + "logits/chosen": 1138139136.0, + "logits/rejected": 748711552.0, + "logps/chosen": -298.98919677734375, + "logps/rejected": -493.83184814453125, + "loss": 0.0257, + "rewards/chosen": 3.1322600841522217, + "rewards/margins": 12.383300542831421, + "rewards/rejected": -9.2510404586792, + "step": 2505 + }, + { + "epoch": 0.2289629968021928, + "grad_norm": 4.34375, + "kl": 0.0, + "learning_rate": 8.776821302778779e-06, + "logits/chosen": 790316544.0, + "logits/rejected": 543811968.0, + "logps/chosen": -245.5603790283203, + "logps/rejected": -577.9056396484375, + "loss": 0.0338, + "rewards/chosen": 3.0077221393585205, + "rewards/margins": 13.818238496780396, + "rewards/rejected": -10.810516357421875, + "step": 2506 + }, + { + "epoch": 0.22905436272270444, + "grad_norm": 3.8125, + "kl": 0.0, + "learning_rate": 8.775878949003222e-06, + "logits/chosen": 903612416.0, + "logits/rejected": 447182816.0, + "logps/chosen": -313.81671142578125, + "logps/rejected": -448.28363037109375, + "loss": 0.022, + "rewards/chosen": 3.478823184967041, + "rewards/margins": 11.999786853790283, + "rewards/rejected": -8.520963668823242, + "step": 2507 + }, + { + "epoch": 0.22914572864321608, + "grad_norm": 9.3125, + "kl": 0.0, + "learning_rate": 8.77493628299746e-06, + "logits/chosen": 551116416.0, + "logits/rejected": 277943082.6666667, + "logps/chosen": -470.9190368652344, + "logps/rejected": -282.3890787760417, + "loss": 0.0474, + "rewards/chosen": 1.5746002197265625, + "rewards/margins": 7.629738489786784, + "rewards/rejected": -6.055138270060222, + "step": 2508 + }, + { + "epoch": 0.22923709456372773, + "grad_norm": 2.84375, + "kl": 0.0, + "learning_rate": 8.773993304839441e-06, + "logits/chosen": 359872256.0, + "logits/rejected": 494135424.0, + "logps/chosen": -290.0577087402344, + "logps/rejected": -567.5, + "loss": 0.0201, + "rewards/chosen": 3.533299446105957, + "rewards/margins": 12.730294227600098, + "rewards/rejected": -9.19699478149414, + "step": 2509 + }, + { + "epoch": 0.22932846048423938, + "grad_norm": 3.671875, + "kl": 0.0, + "learning_rate": 8.773050014607143e-06, + "logits/chosen": 556078677.3333334, + "logits/rejected": 596081305.6, + "logps/chosen": -470.6608072916667, + "logps/rejected": -423.99775390625, + "loss": 0.0148, + "rewards/chosen": 3.7171703974405923, + "rewards/margins": 11.644900957743326, + "rewards/rejected": -7.927730560302734, + "step": 2510 + }, + { + "epoch": 0.22941982640475103, + "grad_norm": 9.6875, + "kl": 0.0, + "learning_rate": 8.772106412378566e-06, + "logits/chosen": 506841248.0, + "logits/rejected": 356929056.0, + "logps/chosen": -159.65435791015625, + "logps/rejected": -363.61346435546875, + "loss": 0.0329, + "rewards/chosen": 2.946803569793701, + "rewards/margins": 11.904611110687256, + "rewards/rejected": -8.957807540893555, + "step": 2511 + }, + { + "epoch": 0.22951119232526268, + "grad_norm": 2.234375, + "kl": 0.0, + "learning_rate": 8.77116249823174e-06, + "logits/chosen": 386996266.6666667, + "logits/rejected": 456880076.8, + "logps/chosen": -341.0807698567708, + "logps/rejected": -564.26337890625, + "loss": 0.0134, + "rewards/chosen": 3.6269664764404297, + "rewards/margins": 12.376094436645507, + "rewards/rejected": -8.749127960205078, + "step": 2512 + }, + { + "epoch": 0.22960255824577433, + "grad_norm": 13.9375, + "kl": 0.0, + "learning_rate": 8.770218272244714e-06, + "logits/chosen": 668218048.0, + "logits/rejected": 447941888.0, + "logps/chosen": -327.85491943359375, + "logps/rejected": -359.9872233072917, + "loss": 0.0639, + "rewards/chosen": 2.754168748855591, + "rewards/margins": 8.714270830154419, + "rewards/rejected": -5.960102081298828, + "step": 2513 + }, + { + "epoch": 0.22969392416628598, + "grad_norm": 3.671875, + "kl": 0.0, + "learning_rate": 8.769273734495571e-06, + "logits/chosen": 991107200.0, + "logits/rejected": 1054991360.0, + "logps/chosen": -247.70458984375, + "logps/rejected": -446.78179931640625, + "loss": 0.0235, + "rewards/chosen": 4.105359077453613, + "rewards/margins": 12.303057670593262, + "rewards/rejected": -8.197698593139648, + "step": 2514 + }, + { + "epoch": 0.22978529008679763, + "grad_norm": 16.375, + "kl": 0.0, + "learning_rate": 8.768328885062413e-06, + "logits/chosen": 362146645.3333333, + "logits/rejected": 325033664.0, + "logps/chosen": -306.6048990885417, + "logps/rejected": -195.46963500976562, + "loss": 0.0506, + "rewards/chosen": 2.921180089314779, + "rewards/margins": 10.847824414571127, + "rewards/rejected": -7.926644325256348, + "step": 2515 + }, + { + "epoch": 0.22987665600730928, + "grad_norm": 1.53125, + "kl": 0.0, + "learning_rate": 8.767383724023369e-06, + "logits/chosen": 604672640.0, + "logits/rejected": 445674496.0, + "logps/chosen": -451.9299621582031, + "logps/rejected": -442.8885803222656, + "loss": 0.0119, + "rewards/chosen": 4.587412357330322, + "rewards/margins": 13.781373500823975, + "rewards/rejected": -9.193961143493652, + "step": 2516 + }, + { + "epoch": 0.22996802192782093, + "grad_norm": 17.875, + "kl": 0.0, + "learning_rate": 8.766438251456598e-06, + "logits/chosen": 443110442.6666667, + "logits/rejected": 669336128.0, + "logps/chosen": -337.56622314453125, + "logps/rejected": -412.5692138671875, + "loss": 0.0457, + "rewards/chosen": 3.664241154988607, + "rewards/margins": 13.277897198994955, + "rewards/rejected": -9.613656044006348, + "step": 2517 + }, + { + "epoch": 0.23005938784833257, + "grad_norm": 12.0625, + "kl": 0.0, + "learning_rate": 8.76549246744028e-06, + "logits/chosen": 894056618.6666666, + "logits/rejected": 666065280.0, + "logps/chosen": -445.3953450520833, + "logps/rejected": -408.637939453125, + "loss": 0.0627, + "rewards/chosen": 3.026661237080892, + "rewards/margins": 12.388070424397787, + "rewards/rejected": -9.361409187316895, + "step": 2518 + }, + { + "epoch": 0.23015075376884422, + "grad_norm": 23.0, + "kl": 0.0, + "learning_rate": 8.764546372052622e-06, + "logits/chosen": 196244992.0, + "logits/rejected": 423999445.3333333, + "logps/chosen": -194.89251708984375, + "logps/rejected": -351.6798502604167, + "loss": 0.0321, + "rewards/chosen": 3.0217862129211426, + "rewards/margins": 11.064459959665934, + "rewards/rejected": -8.042673746744791, + "step": 2519 + }, + { + "epoch": 0.23024211968935587, + "grad_norm": 3.96875, + "kl": 0.0, + "learning_rate": 8.76359996537186e-06, + "logits/chosen": 578351360.0, + "logits/rejected": 584071296.0, + "logps/chosen": -478.176025390625, + "logps/rejected": -653.2520141601562, + "loss": 0.018, + "rewards/chosen": 3.7041358947753906, + "rewards/margins": 16.236214637756348, + "rewards/rejected": -12.532078742980957, + "step": 2520 + }, + { + "epoch": 0.23033348560986752, + "grad_norm": 7.5, + "kl": 0.0, + "learning_rate": 8.762653247476249e-06, + "logits/chosen": 582686105.6, + "logits/rejected": 772979541.3333334, + "logps/chosen": -316.0122802734375, + "logps/rejected": -349.7831217447917, + "loss": 0.0439, + "rewards/chosen": 3.382485198974609, + "rewards/margins": 9.481599299112956, + "rewards/rejected": -6.099114100138347, + "step": 2521 + }, + { + "epoch": 0.23042485153037917, + "grad_norm": 13.6875, + "kl": 0.0, + "learning_rate": 8.761706218444079e-06, + "logits/chosen": 361390432.0, + "logits/rejected": 425199392.0, + "logps/chosen": -260.1524658203125, + "logps/rejected": -410.7159423828125, + "loss": 0.1135, + "rewards/chosen": 2.6477103233337402, + "rewards/margins": 10.49120569229126, + "rewards/rejected": -7.8434953689575195, + "step": 2522 + }, + { + "epoch": 0.23051621745089082, + "grad_norm": 6.5625, + "kl": 0.0, + "learning_rate": 8.760758878353657e-06, + "logits/chosen": 324180288.0, + "logits/rejected": 299454304.0, + "logps/chosen": -114.02914428710938, + "logps/rejected": -265.33551025390625, + "loss": 0.0437, + "rewards/chosen": 3.0119361877441406, + "rewards/margins": 10.457173347473145, + "rewards/rejected": -7.445237159729004, + "step": 2523 + }, + { + "epoch": 0.23060758337140247, + "grad_norm": 18.375, + "kl": 0.0, + "learning_rate": 8.75981122728332e-06, + "logits/chosen": 379772057.6, + "logits/rejected": 704207872.0, + "logps/chosen": -255.048388671875, + "logps/rejected": -207.15250651041666, + "loss": 0.1588, + "rewards/chosen": 2.9754312515258787, + "rewards/margins": 6.9646149953206375, + "rewards/rejected": -3.9891837437947593, + "step": 2524 + }, + { + "epoch": 0.23069894929191412, + "grad_norm": 5.0625, + "kl": 0.0, + "learning_rate": 8.758863265311428e-06, + "logits/chosen": 572733866.6666666, + "logits/rejected": 843489126.4, + "logps/chosen": -512.406982421875, + "logps/rejected": -450.48193359375, + "loss": 0.0217, + "rewards/chosen": 3.841588338216146, + "rewards/margins": 11.538576253255208, + "rewards/rejected": -7.696987915039062, + "step": 2525 + }, + { + "epoch": 0.23079031521242577, + "grad_norm": 3.703125, + "kl": 0.0, + "learning_rate": 8.757914992516371e-06, + "logits/chosen": 729290956.8, + "logits/rejected": 483452245.3333333, + "logps/chosen": -350.25029296875, + "logps/rejected": -591.7266845703125, + "loss": 0.0361, + "rewards/chosen": 3.263469696044922, + "rewards/margins": 12.421975453694662, + "rewards/rejected": -9.15850575764974, + "step": 2526 + }, + { + "epoch": 0.23088168113293742, + "grad_norm": 6.21875, + "kl": 0.0, + "learning_rate": 8.756966408976563e-06, + "logits/chosen": 421740000.0, + "logits/rejected": 617093568.0, + "logps/chosen": -259.5379638671875, + "logps/rejected": -411.496337890625, + "loss": 0.0258, + "rewards/chosen": 3.5953168869018555, + "rewards/margins": 11.989383697509766, + "rewards/rejected": -8.39406681060791, + "step": 2527 + }, + { + "epoch": 0.23097304705344907, + "grad_norm": 8.625, + "kl": 0.0, + "learning_rate": 8.756017514770444e-06, + "logits/chosen": 418048694.85714287, + "logits/rejected": 855111488.0, + "logps/chosen": -316.83973911830356, + "logps/rejected": -213.17059326171875, + "loss": 0.0569, + "rewards/chosen": 3.407891409737723, + "rewards/margins": 8.583294527871267, + "rewards/rejected": -5.175403118133545, + "step": 2528 + }, + { + "epoch": 0.23106441297396071, + "grad_norm": 2.4375, + "kl": 0.0, + "learning_rate": 8.755068309976474e-06, + "logits/chosen": 423691008.0, + "logits/rejected": 341212288.0, + "logps/chosen": -286.71953125, + "logps/rejected": -651.9000651041666, + "loss": 0.0199, + "rewards/chosen": 3.694884490966797, + "rewards/margins": 14.618675740559897, + "rewards/rejected": -10.9237912495931, + "step": 2529 + }, + { + "epoch": 0.23115577889447236, + "grad_norm": 4.0625, + "kl": 0.0, + "learning_rate": 8.754118794673147e-06, + "logits/chosen": 558167232.0, + "logits/rejected": 220267872.0, + "logps/chosen": -220.8134765625, + "logps/rejected": -375.68359375, + "loss": 0.0397, + "rewards/chosen": 2.7152938842773438, + "rewards/margins": 10.36282205581665, + "rewards/rejected": -7.647528171539307, + "step": 2530 + }, + { + "epoch": 0.231247144814984, + "grad_norm": 5.21875, + "kl": 0.0, + "learning_rate": 8.753168968938978e-06, + "logits/chosen": 595659712.0, + "logits/rejected": 719366784.0, + "logps/chosen": -449.4920349121094, + "logps/rejected": -521.0230712890625, + "loss": 0.0227, + "rewards/chosen": 3.8694381713867188, + "rewards/margins": 13.421196937561035, + "rewards/rejected": -9.551758766174316, + "step": 2531 + }, + { + "epoch": 0.23133851073549566, + "grad_norm": 1.734375, + "kl": 0.0, + "learning_rate": 8.75221883285251e-06, + "logits/chosen": 842235750.4, + "logits/rejected": 507139072.0, + "logps/chosen": -640.92138671875, + "logps/rejected": -425.3269856770833, + "loss": 0.0096, + "rewards/chosen": 4.3379638671875, + "rewards/margins": 14.525911458333333, + "rewards/rejected": -10.187947591145834, + "step": 2532 + }, + { + "epoch": 0.2314298766560073, + "grad_norm": 4.125, + "kl": 0.0, + "learning_rate": 8.751268386492309e-06, + "logits/chosen": 709056000.0, + "logits/rejected": 328397440.0, + "logps/chosen": -328.390673828125, + "logps/rejected": -427.244873046875, + "loss": 0.0281, + "rewards/chosen": 3.565065383911133, + "rewards/margins": 12.506039301554361, + "rewards/rejected": -8.940973917643229, + "step": 2533 + }, + { + "epoch": 0.23152124257651896, + "grad_norm": 5.0, + "kl": 0.0, + "learning_rate": 8.750317629936968e-06, + "logits/chosen": 775916544.0, + "logits/rejected": 689820364.8, + "logps/chosen": -194.6093546549479, + "logps/rejected": -506.24609375, + "loss": 0.0335, + "rewards/chosen": 2.3580846786499023, + "rewards/margins": 10.354453086853027, + "rewards/rejected": -7.996368408203125, + "step": 2534 + }, + { + "epoch": 0.2316126084970306, + "grad_norm": 4.15625, + "kl": 0.0, + "learning_rate": 8.749366563265107e-06, + "logits/chosen": 500910131.2, + "logits/rejected": 485231274.6666667, + "logps/chosen": -334.343017578125, + "logps/rejected": -297.65541585286456, + "loss": 0.0221, + "rewards/chosen": 3.6305160522460938, + "rewards/margins": 9.103659947713215, + "rewards/rejected": -5.473143895467122, + "step": 2535 + }, + { + "epoch": 0.23170397441754226, + "grad_norm": 3.875, + "kl": 0.0, + "learning_rate": 8.74841518655537e-06, + "logits/chosen": 366252953.6, + "logits/rejected": 404287914.6666667, + "logps/chosen": -300.1984375, + "logps/rejected": -356.3063151041667, + "loss": 0.0313, + "rewards/chosen": 3.226723861694336, + "rewards/margins": 10.590443801879882, + "rewards/rejected": -7.363719940185547, + "step": 2536 + }, + { + "epoch": 0.2317953403380539, + "grad_norm": 2.46875, + "kl": 0.0, + "learning_rate": 8.747463499886429e-06, + "logits/chosen": 338354227.2, + "logits/rejected": 679373312.0, + "logps/chosen": -252.6533447265625, + "logps/rejected": -592.1092936197916, + "loss": 0.0185, + "rewards/chosen": 3.7614013671875, + "rewards/margins": 10.84069112141927, + "rewards/rejected": -7.0792897542317705, + "step": 2537 + }, + { + "epoch": 0.23188670625856556, + "grad_norm": 1.8828125, + "kl": 0.0, + "learning_rate": 8.746511503336974e-06, + "logits/chosen": 706788928.0, + "logits/rejected": 733267584.0, + "logps/chosen": -445.8681640625, + "logps/rejected": -587.0379638671875, + "loss": 0.0098, + "rewards/chosen": 4.283566474914551, + "rewards/margins": 13.027270317077637, + "rewards/rejected": -8.743703842163086, + "step": 2538 + }, + { + "epoch": 0.2319780721790772, + "grad_norm": 6.9375, + "kl": 0.0, + "learning_rate": 8.74555919698573e-06, + "logits/chosen": 795965098.6666666, + "logits/rejected": 504298137.6, + "logps/chosen": -206.951171875, + "logps/rejected": -359.502587890625, + "loss": 0.0529, + "rewards/chosen": 2.9180736541748047, + "rewards/margins": 8.938999557495118, + "rewards/rejected": -6.020925903320313, + "step": 2539 + }, + { + "epoch": 0.23206943809958885, + "grad_norm": 8.5, + "kl": 4.408210754394531, + "learning_rate": 8.744606580911447e-06, + "logits/chosen": 444540117.3333333, + "logits/rejected": 360804032.0, + "logps/chosen": -265.6918538411458, + "logps/rejected": -298.6202392578125, + "loss": 0.0756, + "rewards/chosen": 3.4865169525146484, + "rewards/margins": 11.208871364593506, + "rewards/rejected": -7.722354412078857, + "step": 2540 + }, + { + "epoch": 0.2321608040201005, + "grad_norm": 1.4140625, + "kl": 0.0, + "learning_rate": 8.743653655192892e-06, + "logits/chosen": 435621333.3333333, + "logits/rejected": 1308793216.0, + "logps/chosen": -285.9829915364583, + "logps/rejected": -766.058837890625, + "loss": 0.0091, + "rewards/chosen": 4.851451873779297, + "rewards/margins": 14.820701599121094, + "rewards/rejected": -9.969249725341797, + "step": 2541 + }, + { + "epoch": 0.23225216994061215, + "grad_norm": 4.1875, + "kl": 0.0, + "learning_rate": 8.742700419908867e-06, + "logits/chosen": 432093269.3333333, + "logits/rejected": 478700608.0, + "logps/chosen": -297.96824137369794, + "logps/rejected": -643.994384765625, + "loss": 0.0326, + "rewards/chosen": 3.472898483276367, + "rewards/margins": 17.102139472961426, + "rewards/rejected": -13.629240989685059, + "step": 2542 + }, + { + "epoch": 0.2323435358611238, + "grad_norm": 11.8125, + "kl": 0.0, + "learning_rate": 8.741746875138192e-06, + "logits/chosen": 550343168.0, + "logits/rejected": 632999872.0, + "logps/chosen": -438.6560974121094, + "logps/rejected": -555.3240966796875, + "loss": 0.0436, + "rewards/chosen": 2.893242835998535, + "rewards/margins": 12.059651374816895, + "rewards/rejected": -9.16640853881836, + "step": 2543 + }, + { + "epoch": 0.23243490178163545, + "grad_norm": 7.53125, + "kl": 0.0, + "learning_rate": 8.740793020959722e-06, + "logits/chosen": 695992115.2, + "logits/rejected": 374249557.3333333, + "logps/chosen": -272.1766357421875, + "logps/rejected": -453.9942626953125, + "loss": 0.0627, + "rewards/chosen": 2.9438621520996096, + "rewards/margins": 12.113829803466796, + "rewards/rejected": -9.169967651367188, + "step": 2544 + }, + { + "epoch": 0.2325262677021471, + "grad_norm": 4.375, + "kl": 0.0, + "learning_rate": 8.739838857452324e-06, + "logits/chosen": 456862105.6, + "logits/rejected": 353799658.6666667, + "logps/chosen": -287.293310546875, + "logps/rejected": -419.9317220052083, + "loss": 0.0352, + "rewards/chosen": 3.28829345703125, + "rewards/margins": 11.526017761230468, + "rewards/rejected": -8.237724304199219, + "step": 2545 + }, + { + "epoch": 0.23261763362265875, + "grad_norm": 1.515625, + "kl": 0.0, + "learning_rate": 8.738884384694905e-06, + "logits/chosen": 306038144.0, + "logits/rejected": 486632192.0, + "logps/chosen": -330.0226745605469, + "logps/rejected": -280.2980041503906, + "loss": 0.0081, + "rewards/chosen": 4.797451019287109, + "rewards/margins": 11.962416648864746, + "rewards/rejected": -7.164965629577637, + "step": 2546 + }, + { + "epoch": 0.2327089995431704, + "grad_norm": 15.9375, + "kl": 0.0, + "learning_rate": 8.73792960276639e-06, + "logits/chosen": 1088581120.0, + "logits/rejected": 820183722.6666666, + "logps/chosen": -304.21197509765625, + "logps/rejected": -413.0914713541667, + "loss": 0.037, + "rewards/chosen": 2.21272349357605, + "rewards/margins": 9.816075086593628, + "rewards/rejected": -7.603351593017578, + "step": 2547 + }, + { + "epoch": 0.23280036546368205, + "grad_norm": 8.6875, + "kl": 0.0, + "learning_rate": 8.736974511745728e-06, + "logits/chosen": 542637824.0, + "logits/rejected": 493059498.6666667, + "logps/chosen": -396.4930419921875, + "logps/rejected": -543.0430908203125, + "loss": 0.0662, + "rewards/chosen": 3.3468215465545654, + "rewards/margins": 12.813260475794474, + "rewards/rejected": -9.466438929239908, + "step": 2548 + }, + { + "epoch": 0.2328917313841937, + "grad_norm": 2.375, + "kl": 0.0, + "learning_rate": 8.736019111711897e-06, + "logits/chosen": 386095872.0, + "logits/rejected": 505909248.0, + "logps/chosen": -301.9445068359375, + "logps/rejected": -462.9139404296875, + "loss": 0.0178, + "rewards/chosen": 4.040359115600586, + "rewards/margins": 12.376886622111002, + "rewards/rejected": -8.336527506510416, + "step": 2549 + }, + { + "epoch": 0.23298309730470534, + "grad_norm": 1.0390625, + "kl": 0.0, + "learning_rate": 8.735063402743901e-06, + "logits/chosen": 271980768.0, + "logits/rejected": 504031402.6666667, + "logps/chosen": -130.00457763671875, + "logps/rejected": -600.6427815755209, + "loss": 0.0038, + "rewards/chosen": 5.231195449829102, + "rewards/margins": 13.229979197184246, + "rewards/rejected": -7.9987837473551435, + "step": 2550 + }, + { + "epoch": 0.233074463225217, + "grad_norm": 3.90625, + "kl": 0.0, + "learning_rate": 8.734107384920771e-06, + "logits/chosen": 349301708.8, + "logits/rejected": 291073536.0, + "logps/chosen": -358.030517578125, + "logps/rejected": -526.4441731770834, + "loss": 0.0229, + "rewards/chosen": 4.106365585327149, + "rewards/margins": 14.292648696899414, + "rewards/rejected": -10.186283111572266, + "step": 2551 + }, + { + "epoch": 0.23316582914572864, + "grad_norm": 2.546875, + "kl": 0.0, + "learning_rate": 8.733151058321555e-06, + "logits/chosen": 421308992.0, + "logits/rejected": 255730272.0, + "logps/chosen": -348.0407409667969, + "logps/rejected": -443.84832763671875, + "loss": 0.0176, + "rewards/chosen": 3.4611992835998535, + "rewards/margins": 13.509317874908447, + "rewards/rejected": -10.048118591308594, + "step": 2552 + }, + { + "epoch": 0.2332571950662403, + "grad_norm": 1.75, + "kl": 0.0, + "learning_rate": 8.732194423025333e-06, + "logits/chosen": 393209664.0, + "logits/rejected": 352253312.0, + "logps/chosen": -367.2740478515625, + "logps/rejected": -322.69293212890625, + "loss": 0.0109, + "rewards/chosen": 4.1937150955200195, + "rewards/margins": 10.954204559326172, + "rewards/rejected": -6.760489463806152, + "step": 2553 + }, + { + "epoch": 0.23334856098675194, + "grad_norm": 6.625, + "kl": 0.0, + "learning_rate": 8.731237479111215e-06, + "logits/chosen": 282423968.0, + "logits/rejected": 474064416.0, + "logps/chosen": -196.42074584960938, + "logps/rejected": -451.9189453125, + "loss": 0.1028, + "rewards/chosen": 3.937199592590332, + "rewards/margins": 11.359962463378906, + "rewards/rejected": -7.422762870788574, + "step": 2554 + }, + { + "epoch": 0.2334399269072636, + "grad_norm": 3.046875, + "kl": 0.0, + "learning_rate": 8.730280226658326e-06, + "logits/chosen": 462681568.0, + "logits/rejected": 504208896.0, + "logps/chosen": -358.801025390625, + "logps/rejected": -640.5301513671875, + "loss": 0.0204, + "rewards/chosen": 3.2397279739379883, + "rewards/margins": 12.72934627532959, + "rewards/rejected": -9.489618301391602, + "step": 2555 + }, + { + "epoch": 0.23353129282777524, + "grad_norm": 5.1875, + "kl": 0.0, + "learning_rate": 8.729322665745826e-06, + "logits/chosen": 555903296.0, + "logits/rejected": 554221312.0, + "logps/chosen": -360.908447265625, + "logps/rejected": -424.6031799316406, + "loss": 0.0199, + "rewards/chosen": 4.14339542388916, + "rewards/margins": 11.536116600036621, + "rewards/rejected": -7.392721176147461, + "step": 2556 + }, + { + "epoch": 0.2336226587482869, + "grad_norm": 4.40625, + "kl": 0.0, + "learning_rate": 8.728364796452892e-06, + "logits/chosen": 618517504.0, + "logits/rejected": 765959616.0, + "logps/chosen": -338.5880126953125, + "logps/rejected": -329.18804931640625, + "loss": 0.0303, + "rewards/chosen": 2.8056716918945312, + "rewards/margins": 10.782804489135742, + "rewards/rejected": -7.977132797241211, + "step": 2557 + }, + { + "epoch": 0.23371402466879854, + "grad_norm": 3.40625, + "kl": 0.0, + "learning_rate": 8.727406618858734e-06, + "logits/chosen": 452026496.0, + "logits/rejected": 445051392.0, + "logps/chosen": -309.04443359375, + "logps/rejected": -457.27117919921875, + "loss": 0.022, + "rewards/chosen": 3.1897315979003906, + "rewards/margins": 11.180251121520996, + "rewards/rejected": -7.9905195236206055, + "step": 2558 + }, + { + "epoch": 0.23380539058931019, + "grad_norm": 28.0, + "kl": 0.0, + "learning_rate": 8.726448133042584e-06, + "logits/chosen": 533371968.0, + "logits/rejected": 372293760.0, + "logps/chosen": -408.6620178222656, + "logps/rejected": -495.583984375, + "loss": 0.1146, + "rewards/chosen": 2.233060359954834, + "rewards/margins": 12.009685039520264, + "rewards/rejected": -9.77662467956543, + "step": 2559 + }, + { + "epoch": 0.23389675650982183, + "grad_norm": 2.421875, + "kl": 0.0, + "learning_rate": 8.725489339083702e-06, + "logits/chosen": 608011434.6666666, + "logits/rejected": 576440704.0, + "logps/chosen": -328.44122314453125, + "logps/rejected": -658.4171142578125, + "loss": 0.0169, + "rewards/chosen": 4.05282719930013, + "rewards/margins": 16.000562985738117, + "rewards/rejected": -11.947735786437988, + "step": 2560 + }, + { + "epoch": 0.23398812243033348, + "grad_norm": 0.349609375, + "kl": 0.0, + "learning_rate": 8.724530237061365e-06, + "logits/rejected": 382865312.0, + "logps/rejected": -511.103271484375, + "loss": 0.0007, + "rewards/rejected": -10.334261894226074, + "step": 2561 + }, + { + "epoch": 0.23407948835084513, + "grad_norm": 3.1875, + "kl": 0.0, + "learning_rate": 8.72357082705489e-06, + "logits/chosen": 1068746368.0, + "logits/rejected": 384476288.0, + "logps/chosen": -362.47796630859375, + "logps/rejected": -468.9620361328125, + "loss": 0.0201, + "rewards/chosen": 3.464677572250366, + "rewards/margins": 13.328742742538452, + "rewards/rejected": -9.864065170288086, + "step": 2562 + }, + { + "epoch": 0.23417085427135678, + "grad_norm": 3.703125, + "kl": 0.0, + "learning_rate": 8.722611109143604e-06, + "logits/chosen": 440075520.0, + "logits/rejected": 385044160.0, + "logps/chosen": -265.06524658203125, + "logps/rejected": -473.9554138183594, + "loss": 0.0235, + "rewards/chosen": 3.203085422515869, + "rewards/margins": 10.985555648803711, + "rewards/rejected": -7.782470226287842, + "step": 2563 + }, + { + "epoch": 0.23426222019186843, + "grad_norm": 6.1875, + "kl": 0.0, + "learning_rate": 8.721651083406873e-06, + "logits/chosen": 567593600.0, + "logits/rejected": 592177664.0, + "logps/chosen": -374.2904052734375, + "logps/rejected": -569.2893676757812, + "loss": 0.0346, + "rewards/chosen": 3.130479574203491, + "rewards/margins": 15.714193105697632, + "rewards/rejected": -12.58371353149414, + "step": 2564 + }, + { + "epoch": 0.23435358611238008, + "grad_norm": 0.96875, + "kl": 0.0, + "learning_rate": 8.720690749924076e-06, + "logits/chosen": 356864000.0, + "logits/rejected": 590837394.2857143, + "logps/chosen": -188.57286071777344, + "logps/rejected": -640.1305803571429, + "loss": 0.0043, + "rewards/chosen": 3.3734116554260254, + "rewards/margins": 13.01956033706665, + "rewards/rejected": -9.646148681640625, + "step": 2565 + }, + { + "epoch": 0.23444495203289173, + "grad_norm": 2.3125, + "kl": 0.0, + "learning_rate": 8.719730108774629e-06, + "logits/chosen": 834294208.0, + "logits/rejected": 980193152.0, + "logps/chosen": -519.6807250976562, + "logps/rejected": -378.296142578125, + "loss": 0.0109, + "rewards/chosen": 4.280793190002441, + "rewards/margins": 12.333572387695312, + "rewards/rejected": -8.052779197692871, + "step": 2566 + }, + { + "epoch": 0.23453631795340338, + "grad_norm": 5.65625, + "kl": 0.0, + "learning_rate": 8.718769160037965e-06, + "logits/chosen": 639649472.0, + "logps/chosen": -282.4102783203125, + "loss": 0.0541, + "rewards/chosen": 3.075965404510498, + "step": 2567 + }, + { + "epoch": 0.23462768387391503, + "grad_norm": 6.21875, + "kl": 0.0, + "learning_rate": 8.717807903793549e-06, + "logits/chosen": 1159100416.0, + "logits/rejected": 790435904.0, + "logps/chosen": -352.1922607421875, + "logps/rejected": -448.29937744140625, + "loss": 0.0385, + "rewards/chosen": 3.50040340423584, + "rewards/margins": 11.419450759887695, + "rewards/rejected": -7.9190473556518555, + "step": 2568 + }, + { + "epoch": 0.23471904979442668, + "grad_norm": 2.5, + "kl": 0.0, + "learning_rate": 8.716846340120864e-06, + "logits/chosen": 502399936.0, + "logits/rejected": 430756352.0, + "logps/chosen": -384.59625244140625, + "logps/rejected": -492.7049967447917, + "loss": 0.0077, + "rewards/chosen": 4.049816131591797, + "rewards/margins": 12.732471466064453, + "rewards/rejected": -8.682655334472656, + "step": 2569 + }, + { + "epoch": 0.23481041571493833, + "grad_norm": 1.6796875, + "kl": 0.0, + "learning_rate": 8.715884469099423e-06, + "logits/chosen": 378203264.0, + "logits/rejected": 672052800.0, + "logps/chosen": -278.99322509765625, + "logps/rejected": -608.852783203125, + "loss": 0.0098, + "rewards/chosen": 4.180178642272949, + "rewards/margins": 13.104960441589355, + "rewards/rejected": -8.924781799316406, + "step": 2570 + }, + { + "epoch": 0.23490178163544997, + "grad_norm": 3.015625, + "kl": 0.0, + "learning_rate": 8.714922290808766e-06, + "logits/chosen": 570778828.8, + "logits/rejected": 450355413.3333333, + "logps/chosen": -280.8701904296875, + "logps/rejected": -447.1558430989583, + "loss": 0.0199, + "rewards/chosen": 3.658538818359375, + "rewards/margins": 13.819259643554688, + "rewards/rejected": -10.160720825195312, + "step": 2571 + }, + { + "epoch": 0.23499314755596162, + "grad_norm": 6.875, + "kl": 0.0, + "learning_rate": 8.713959805328455e-06, + "logits/chosen": 1387682304.0, + "logits/rejected": 677406592.0, + "logps/chosen": -433.5289306640625, + "logps/rejected": -437.486328125, + "loss": 0.0397, + "rewards/chosen": 3.664313793182373, + "rewards/margins": 12.285301685333252, + "rewards/rejected": -8.620987892150879, + "step": 2572 + }, + { + "epoch": 0.23508451347647327, + "grad_norm": 4.3125, + "kl": 0.0, + "learning_rate": 8.712997012738077e-06, + "logits/chosen": 536229324.8, + "logits/rejected": 358183850.6666667, + "logps/chosen": -245.7166015625, + "logps/rejected": -442.5666097005208, + "loss": 0.0296, + "rewards/chosen": 3.6670791625976564, + "rewards/margins": 15.734271240234374, + "rewards/rejected": -12.067192077636719, + "step": 2573 + }, + { + "epoch": 0.23517587939698492, + "grad_norm": 5.09375, + "kl": 0.0, + "learning_rate": 8.71203391311725e-06, + "logits/chosen": 447469670.4, + "logits/rejected": 830363648.0, + "logps/chosen": -243.7253173828125, + "logps/rejected": -616.7542317708334, + "loss": 0.0199, + "rewards/chosen": 3.805113983154297, + "rewards/margins": 13.058697255452476, + "rewards/rejected": -9.253583272298178, + "step": 2574 + }, + { + "epoch": 0.23526724531749657, + "grad_norm": 8.1875, + "kl": 0.0, + "learning_rate": 8.711070506545611e-06, + "logits/chosen": 559917721.6, + "logits/rejected": 502146773.3333333, + "logps/chosen": -354.371240234375, + "logps/rejected": -584.8365478515625, + "loss": 0.052, + "rewards/chosen": 2.7546112060546877, + "rewards/margins": 10.687814585367839, + "rewards/rejected": -7.933203379313151, + "step": 2575 + }, + { + "epoch": 0.23535861123800822, + "grad_norm": 4.6875, + "kl": 0.0, + "learning_rate": 8.710106793102823e-06, + "logits/chosen": 755893504.0, + "logits/rejected": 539544512.0, + "logps/chosen": -519.823486328125, + "logps/rejected": -442.41497802734375, + "loss": 0.0323, + "rewards/chosen": 3.4973762035369873, + "rewards/margins": 12.281550168991089, + "rewards/rejected": -8.784173965454102, + "step": 2576 + }, + { + "epoch": 0.23544997715851987, + "grad_norm": 27.25, + "kl": 0.0, + "learning_rate": 8.709142772868581e-06, + "logits/chosen": 411928512.0, + "logits/rejected": 317990272.0, + "logps/chosen": -404.275390625, + "logps/rejected": -476.4144694010417, + "loss": 0.0663, + "rewards/chosen": 2.350154161453247, + "rewards/margins": 12.717058102289835, + "rewards/rejected": -10.366903940836588, + "step": 2577 + }, + { + "epoch": 0.23554134307903152, + "grad_norm": 3.6875, + "kl": 0.0, + "learning_rate": 8.708178445922596e-06, + "logits/chosen": 896186197.3333334, + "logits/rejected": 423057971.2, + "logps/chosen": -372.5047200520833, + "logps/rejected": -483.59345703125, + "loss": 0.0269, + "rewards/chosen": 2.7276827494303384, + "rewards/margins": 12.812361399332682, + "rewards/rejected": -10.084678649902344, + "step": 2578 + }, + { + "epoch": 0.23563270899954317, + "grad_norm": 6.75, + "kl": 0.0, + "learning_rate": 8.707213812344611e-06, + "logits/chosen": 539986944.0, + "logits/rejected": 429896832.0, + "logps/chosen": -338.2574462890625, + "logps/rejected": -765.364501953125, + "loss": 0.0326, + "rewards/chosen": 3.1417263031005858, + "rewards/margins": 17.683838017781575, + "rewards/rejected": -14.54211171468099, + "step": 2579 + }, + { + "epoch": 0.23572407492005482, + "grad_norm": 2.046875, + "kl": 0.0, + "learning_rate": 8.70624887221439e-06, + "logits/chosen": 311686688.0, + "logits/rejected": 637256874.6666666, + "logps/chosen": -235.38113403320312, + "logps/rejected": -578.0875651041666, + "loss": 0.0136, + "rewards/chosen": 3.8495750427246094, + "rewards/margins": 13.873133341471354, + "rewards/rejected": -10.023558298746744, + "step": 2580 + }, + { + "epoch": 0.23581544084056646, + "grad_norm": 37.5, + "kl": 0.0, + "learning_rate": 8.70528362561173e-06, + "logits/chosen": 918116352.0, + "logits/rejected": 936721920.0, + "logps/chosen": -401.5515380859375, + "logps/rejected": -515.7159423828125, + "loss": 0.117, + "rewards/chosen": 3.367021942138672, + "rewards/margins": 9.464757029215495, + "rewards/rejected": -6.097735087076823, + "step": 2581 + }, + { + "epoch": 0.2359068067610781, + "grad_norm": 15.5, + "kl": 0.0, + "learning_rate": 8.704318072616443e-06, + "logits/chosen": 453789696.0, + "logits/rejected": 1142247680.0, + "logps/chosen": -212.6407470703125, + "logps/rejected": -483.8826904296875, + "loss": 0.1704, + "rewards/chosen": 3.1639763514200845, + "rewards/margins": 7.952767054239908, + "rewards/rejected": -4.788790702819824, + "step": 2582 + }, + { + "epoch": 0.23599817268158976, + "grad_norm": 4.28125, + "kl": 0.0, + "learning_rate": 8.70335221330837e-06, + "logits/chosen": 487737386.6666667, + "logits/rejected": 584220800.0, + "logps/chosen": -280.85581461588544, + "logps/rejected": -539.1275634765625, + "loss": 0.0315, + "rewards/chosen": 3.6024694442749023, + "rewards/margins": 13.385659217834473, + "rewards/rejected": -9.78318977355957, + "step": 2583 + }, + { + "epoch": 0.2360895386021014, + "grad_norm": 6.09375, + "kl": 0.0, + "learning_rate": 8.702386047767385e-06, + "logits/chosen": 505961386.6666667, + "logits/rejected": 258042656.0, + "logps/chosen": -278.851806640625, + "logps/rejected": -341.58782958984375, + "loss": 0.1172, + "rewards/chosen": 3.6409295399983725, + "rewards/margins": 13.63095792134603, + "rewards/rejected": -9.990028381347656, + "step": 2584 + }, + { + "epoch": 0.23618090452261306, + "grad_norm": 5.84375, + "kl": 0.0, + "learning_rate": 8.701419576073375e-06, + "logits/chosen": 509649749.3333333, + "logits/rejected": 464203328.0, + "logps/chosen": -321.404541015625, + "logps/rejected": -584.1046752929688, + "loss": 0.0436, + "rewards/chosen": 3.0569047927856445, + "rewards/margins": 11.08207893371582, + "rewards/rejected": -8.025174140930176, + "step": 2585 + }, + { + "epoch": 0.2362722704431247, + "grad_norm": 5.25, + "kl": 0.0, + "learning_rate": 8.70045279830626e-06, + "logits/chosen": 440819242.6666667, + "logits/rejected": 514234464.0, + "logps/chosen": -271.2274983723958, + "logps/rejected": -677.89990234375, + "loss": 0.0418, + "rewards/chosen": 3.0131216049194336, + "rewards/margins": 14.843318939208984, + "rewards/rejected": -11.83019733428955, + "step": 2586 + }, + { + "epoch": 0.23636363636363636, + "grad_norm": 12.125, + "kl": 0.0, + "learning_rate": 8.699485714545986e-06, + "logits/chosen": 627193088.0, + "logits/rejected": 382607689.14285713, + "logps/chosen": -440.38824462890625, + "logps/rejected": -406.7691127232143, + "loss": 0.0187, + "rewards/chosen": 3.2411561012268066, + "rewards/margins": 10.915625640324183, + "rewards/rejected": -7.674469539097378, + "step": 2587 + }, + { + "epoch": 0.236455002284148, + "grad_norm": 8.1875, + "kl": 0.0, + "learning_rate": 8.698518324872518e-06, + "logits/chosen": 630048256.0, + "logits/rejected": 366978656.0, + "logps/chosen": -401.8580322265625, + "logps/rejected": -380.24896240234375, + "loss": 0.0292, + "rewards/chosen": 3.4951903025309243, + "rewards/margins": 13.18081537882487, + "rewards/rejected": -9.685625076293945, + "step": 2588 + }, + { + "epoch": 0.23654636820465966, + "grad_norm": 5.3125, + "kl": 0.0, + "learning_rate": 8.697550629365854e-06, + "logits/chosen": 712125952.0, + "logits/rejected": 1573790378.6666667, + "logps/chosen": -421.995703125, + "logps/rejected": -1085.5107421875, + "loss": 0.0341, + "rewards/chosen": 2.9869640350341795, + "rewards/margins": 19.67238629659017, + "rewards/rejected": -16.68542226155599, + "step": 2589 + }, + { + "epoch": 0.2366377341251713, + "grad_norm": 4.21875, + "kl": 0.0, + "learning_rate": 8.69658262810601e-06, + "logits/chosen": 481869056.0, + "logits/rejected": 495597866.6666667, + "logps/chosen": -331.117431640625, + "logps/rejected": -488.7178548177083, + "loss": 0.0272, + "rewards/chosen": 3.335069274902344, + "rewards/margins": 13.029929733276367, + "rewards/rejected": -9.694860458374023, + "step": 2590 + }, + { + "epoch": 0.23672910004568296, + "grad_norm": 2.5, + "kl": 0.0, + "learning_rate": 8.695614321173032e-06, + "logits/chosen": 648508416.0, + "logits/rejected": 392164522.6666667, + "logps/chosen": -507.163330078125, + "logps/rejected": -474.0513509114583, + "loss": 0.0108, + "rewards/chosen": 3.1192779541015625, + "rewards/margins": 14.764144897460938, + "rewards/rejected": -11.644866943359375, + "step": 2591 + }, + { + "epoch": 0.2368204659661946, + "grad_norm": 3.734375, + "kl": 0.0, + "learning_rate": 8.69464570864699e-06, + "logits/chosen": 639343573.3333334, + "logits/rejected": 579818112.0, + "logps/chosen": -324.94915771484375, + "logps/rejected": -251.3460693359375, + "loss": 0.0275, + "rewards/chosen": 3.6081110636393228, + "rewards/margins": 11.114357630411783, + "rewards/rejected": -7.506246566772461, + "step": 2592 + }, + { + "epoch": 0.23691183188670625, + "grad_norm": 17.125, + "kl": 0.0, + "learning_rate": 8.693676790607982e-06, + "logits/chosen": 701725184.0, + "logits/rejected": 713767936.0, + "logps/chosen": -320.0519714355469, + "logps/rejected": -623.0804443359375, + "loss": 0.1028, + "rewards/chosen": 2.244936466217041, + "rewards/margins": 10.087440967559814, + "rewards/rejected": -7.842504501342773, + "step": 2593 + }, + { + "epoch": 0.2370031978072179, + "grad_norm": 5.46875, + "kl": 0.0, + "learning_rate": 8.692707567136126e-06, + "logits/chosen": 650062080.0, + "logits/rejected": 232768512.0, + "logps/chosen": -430.0267578125, + "logps/rejected": -222.6973876953125, + "loss": 0.0345, + "rewards/chosen": 3.4922046661376953, + "rewards/margins": 9.383451461791992, + "rewards/rejected": -5.891246795654297, + "step": 2594 + }, + { + "epoch": 0.23709456372772955, + "grad_norm": 3.140625, + "kl": 0.0, + "learning_rate": 8.691738038311565e-06, + "logits/chosen": 343436384.0, + "logits/rejected": 543435456.0, + "logps/chosen": -256.8003234863281, + "logps/rejected": -451.40887451171875, + "loss": 0.0143, + "rewards/chosen": 4.244022846221924, + "rewards/margins": 13.878450870513916, + "rewards/rejected": -9.634428024291992, + "step": 2595 + }, + { + "epoch": 0.2371859296482412, + "grad_norm": 2.359375, + "kl": 0.0, + "learning_rate": 8.690768204214474e-06, + "logits/chosen": 391997952.0, + "logits/rejected": 602827059.2, + "logps/chosen": -328.63169352213544, + "logps/rejected": -537.282861328125, + "loss": 0.011, + "rewards/chosen": 4.4252058664957685, + "rewards/margins": 14.29764238993327, + "rewards/rejected": -9.8724365234375, + "step": 2596 + }, + { + "epoch": 0.23727729556875285, + "grad_norm": 5.65625, + "kl": 0.0, + "learning_rate": 8.689798064925049e-06, + "logits/chosen": 782576042.6666666, + "logits/rejected": 614590361.6, + "logps/chosen": -390.01708984375, + "logps/rejected": -525.21474609375, + "loss": 0.0272, + "rewards/chosen": 2.6678171157836914, + "rewards/margins": 11.367717933654784, + "rewards/rejected": -8.699900817871093, + "step": 2597 + }, + { + "epoch": 0.2373686614892645, + "grad_norm": 1.8359375, + "kl": 0.0, + "learning_rate": 8.68882762052351e-06, + "logits/chosen": 388975189.3333333, + "logits/rejected": 399087923.2, + "logps/chosen": -310.8138020833333, + "logps/rejected": -501.79208984375, + "loss": 0.0121, + "rewards/chosen": 3.5603774388631186, + "rewards/margins": 12.436381467183432, + "rewards/rejected": -8.876004028320313, + "step": 2598 + }, + { + "epoch": 0.23746002740977615, + "grad_norm": 1.1328125, + "kl": 0.0, + "learning_rate": 8.687856871090106e-06, + "logits/chosen": 283070816.0, + "logits/rejected": 431628141.71428573, + "logps/chosen": -195.50027465820312, + "logps/rejected": -506.13071986607144, + "loss": 0.0059, + "rewards/chosen": 3.037802219390869, + "rewards/margins": 11.845230851854597, + "rewards/rejected": -8.807428632463727, + "step": 2599 + }, + { + "epoch": 0.2375513933302878, + "grad_norm": 3.734375, + "kl": 0.0, + "learning_rate": 8.686885816705106e-06, + "logits/chosen": 578857920.0, + "logits/rejected": 371254400.0, + "logps/chosen": -460.4354553222656, + "logps/rejected": -461.7359619140625, + "loss": 0.0183, + "rewards/chosen": 3.6157708168029785, + "rewards/margins": 10.969469547271729, + "rewards/rejected": -7.35369873046875, + "step": 2600 + }, + { + "epoch": 0.23764275925079945, + "grad_norm": 2.640625, + "kl": 0.0, + "learning_rate": 8.68591445744881e-06, + "logits/chosen": 1031468032.0, + "logits/rejected": 489848217.6, + "logps/chosen": -589.0472005208334, + "logps/rejected": -336.168505859375, + "loss": 0.0145, + "rewards/chosen": 3.798245112101237, + "rewards/margins": 11.551833979288737, + "rewards/rejected": -7.7535888671875, + "step": 2601 + }, + { + "epoch": 0.2377341251713111, + "grad_norm": 35.75, + "kl": 0.0, + "learning_rate": 8.684942793401539e-06, + "logits/chosen": 458168064.0, + "logps/chosen": -354.2601318359375, + "loss": 0.2134, + "rewards/chosen": 2.569639205932617, + "step": 2602 + }, + { + "epoch": 0.23782549109182274, + "grad_norm": 2.15625, + "kl": 0.0, + "learning_rate": 8.683970824643639e-06, + "logits/chosen": 937377344.0, + "logits/rejected": 777862826.6666666, + "logps/chosen": -586.4014892578125, + "logps/rejected": -518.0915120442709, + "loss": 0.009, + "rewards/chosen": 3.5984129905700684, + "rewards/margins": 12.12977933883667, + "rewards/rejected": -8.531366348266602, + "step": 2603 + }, + { + "epoch": 0.2379168570123344, + "grad_norm": 4.8125, + "kl": 0.0, + "learning_rate": 8.682998551255486e-06, + "logits/chosen": 743142592.0, + "logits/rejected": 653402688.0, + "logps/chosen": -393.6602783203125, + "logps/rejected": -404.16351318359375, + "loss": 0.0282, + "rewards/chosen": 3.078981876373291, + "rewards/margins": 11.280243396759033, + "rewards/rejected": -8.201261520385742, + "step": 2604 + }, + { + "epoch": 0.23800822293284604, + "grad_norm": 19.375, + "kl": 0.0, + "learning_rate": 8.682025973317477e-06, + "logits/chosen": 351196800.0, + "logits/rejected": 378060224.0, + "logps/chosen": -246.5380859375, + "logps/rejected": -295.461181640625, + "loss": 0.092, + "rewards/chosen": 3.927191734313965, + "rewards/margins": 10.135241031646729, + "rewards/rejected": -6.208049297332764, + "step": 2605 + }, + { + "epoch": 0.2380995888533577, + "grad_norm": 5.125, + "kl": 0.0, + "learning_rate": 8.681053090910032e-06, + "logits/chosen": 766126336.0, + "logits/rejected": 365908309.3333333, + "logps/chosen": -296.1243896484375, + "logps/rejected": -406.9957682291667, + "loss": 0.0297, + "rewards/chosen": 3.4070713043212892, + "rewards/margins": 11.485506566365562, + "rewards/rejected": -8.078435262044271, + "step": 2606 + }, + { + "epoch": 0.23819095477386934, + "grad_norm": 3.25, + "kl": 0.0, + "learning_rate": 8.680079904113606e-06, + "logits/chosen": 454703018.6666667, + "logits/rejected": 274567116.8, + "logps/chosen": -213.0323282877604, + "logps/rejected": -224.204736328125, + "loss": 0.024, + "rewards/chosen": 3.163511276245117, + "rewards/margins": 8.803354263305664, + "rewards/rejected": -5.639842987060547, + "step": 2607 + }, + { + "epoch": 0.238282320694381, + "grad_norm": 3.90625, + "kl": 0.0, + "learning_rate": 8.679106413008665e-06, + "logits/chosen": 660419276.8, + "logits/rejected": 599671296.0, + "logps/chosen": -256.8627685546875, + "logps/rejected": -455.9098714192708, + "loss": 0.0385, + "rewards/chosen": 3.188009262084961, + "rewards/margins": 12.374961217244467, + "rewards/rejected": -9.186951955159506, + "step": 2608 + }, + { + "epoch": 0.23837368661489264, + "grad_norm": 4.84375, + "kl": 0.0, + "learning_rate": 8.678132617675714e-06, + "logits/chosen": 731354688.0, + "logits/rejected": 541911744.0, + "logps/chosen": -337.79339599609375, + "logps/rejected": -460.4390563964844, + "loss": 0.0244, + "rewards/chosen": 3.3655338287353516, + "rewards/margins": 11.81314468383789, + "rewards/rejected": -8.447610855102539, + "step": 2609 + }, + { + "epoch": 0.2384650525354043, + "grad_norm": 5.375, + "kl": 0.0, + "learning_rate": 8.677158518195273e-06, + "logits/chosen": 419357056.0, + "logits/rejected": 263843456.0, + "logps/chosen": -333.6524658203125, + "logps/rejected": -347.2061767578125, + "loss": 0.0351, + "rewards/chosen": 2.769496440887451, + "rewards/margins": 10.157288551330566, + "rewards/rejected": -7.387792110443115, + "step": 2610 + }, + { + "epoch": 0.23855641845591594, + "grad_norm": 1.65625, + "kl": 0.0, + "learning_rate": 8.676184114647893e-06, + "logits/chosen": 498706602.6666667, + "logits/rejected": 277485414.4, + "logps/chosen": -350.5043131510417, + "logps/rejected": -478.2447265625, + "loss": 0.0108, + "rewards/chosen": 3.7629353205362954, + "rewards/margins": 13.754804674784342, + "rewards/rejected": -9.991869354248047, + "step": 2611 + }, + { + "epoch": 0.23864778437642759, + "grad_norm": 2.34375, + "kl": 0.0, + "learning_rate": 8.675209407114146e-06, + "logits/chosen": 376277120.0, + "logits/rejected": 866101418.6666666, + "logps/chosen": -215.88314819335938, + "logps/rejected": -403.7642822265625, + "loss": 0.0071, + "rewards/chosen": 4.094245910644531, + "rewards/margins": 11.675907135009766, + "rewards/rejected": -7.581661224365234, + "step": 2612 + }, + { + "epoch": 0.23873915029693923, + "grad_norm": 3.515625, + "kl": 0.0, + "learning_rate": 8.674234395674632e-06, + "logits/chosen": 296795733.3333333, + "logits/rejected": 623232051.2, + "logps/chosen": -240.75634765625, + "logps/rejected": -580.482275390625, + "loss": 0.0244, + "rewards/chosen": 3.7121874491373696, + "rewards/margins": 13.020987192789713, + "rewards/rejected": -9.308799743652344, + "step": 2613 + }, + { + "epoch": 0.23883051621745088, + "grad_norm": 3.125, + "kl": 0.0, + "learning_rate": 8.673259080409978e-06, + "logits/chosen": 629788672.0, + "logits/rejected": 466448480.0, + "logps/chosen": -436.268310546875, + "logps/rejected": -579.3046875, + "loss": 0.0162, + "rewards/chosen": 3.7386999130249023, + "rewards/margins": 11.734301567077637, + "rewards/rejected": -7.995601654052734, + "step": 2614 + }, + { + "epoch": 0.23892188213796253, + "grad_norm": 3.1875, + "kl": 0.0, + "learning_rate": 8.67228346140083e-06, + "logits/chosen": 897086805.3333334, + "logits/rejected": 473858764.8, + "logps/chosen": -536.10595703125, + "logps/rejected": -445.634716796875, + "loss": 0.0155, + "rewards/chosen": 3.2393035888671875, + "rewards/margins": 11.824435424804687, + "rewards/rejected": -8.5851318359375, + "step": 2615 + }, + { + "epoch": 0.23901324805847418, + "grad_norm": 19.125, + "kl": 0.0, + "learning_rate": 8.671307538727866e-06, + "logits/chosen": 614060629.3333334, + "logits/rejected": 731938560.0, + "logps/chosen": -227.58036295572916, + "logps/rejected": -438.398779296875, + "loss": 0.0205, + "rewards/chosen": 3.5037094751993814, + "rewards/margins": 12.47928320566813, + "rewards/rejected": -8.97557373046875, + "step": 2616 + }, + { + "epoch": 0.23910461397898583, + "grad_norm": 3.0625, + "kl": 0.0, + "learning_rate": 8.670331312471783e-06, + "logits/chosen": 765687637.3333334, + "logits/rejected": 592803891.2, + "logps/chosen": -321.1302490234375, + "logps/rejected": -390.3549072265625, + "loss": 0.0201, + "rewards/chosen": 3.0429347356160483, + "rewards/margins": 12.441143353780111, + "rewards/rejected": -9.398208618164062, + "step": 2617 + }, + { + "epoch": 0.23919597989949748, + "grad_norm": 2.03125, + "kl": 0.0, + "learning_rate": 8.669354782713307e-06, + "logits/chosen": 578853171.2, + "logits/rejected": 546511786.6666666, + "logps/chosen": -403.7915283203125, + "logps/rejected": -500.42041015625, + "loss": 0.0095, + "rewards/chosen": 4.371286010742187, + "rewards/margins": 12.968585586547851, + "rewards/rejected": -8.597299575805664, + "step": 2618 + }, + { + "epoch": 0.23928734582000913, + "grad_norm": 1.75, + "kl": 0.0, + "learning_rate": 8.668377949533187e-06, + "logits/chosen": 315584320.0, + "logits/rejected": 348106112.0, + "logps/chosen": -327.5884094238281, + "logps/rejected": -555.3007202148438, + "loss": 0.0082, + "rewards/chosen": 4.467778205871582, + "rewards/margins": 15.885676383972168, + "rewards/rejected": -11.417898178100586, + "step": 2619 + }, + { + "epoch": 0.23937871174052078, + "grad_norm": 3.9375, + "kl": 0.0, + "learning_rate": 8.6674008130122e-06, + "logits/chosen": 743253760.0, + "logits/rejected": 975056998.4, + "logps/chosen": -513.0, + "logps/rejected": -440.15419921875, + "loss": 0.0197, + "rewards/chosen": 3.4164257049560547, + "rewards/margins": 11.629922103881835, + "rewards/rejected": -8.21349639892578, + "step": 2620 + }, + { + "epoch": 0.23947007766103243, + "grad_norm": 1.375, + "kl": 0.0, + "learning_rate": 8.666423373231145e-06, + "logits/chosen": 1080505753.6, + "logits/rejected": 833106090.6666666, + "logps/chosen": -246.5006591796875, + "logps/rejected": -729.010986328125, + "loss": 0.0104, + "rewards/chosen": 4.401939773559571, + "rewards/margins": 14.702160263061524, + "rewards/rejected": -10.300220489501953, + "step": 2621 + }, + { + "epoch": 0.23956144358154408, + "grad_norm": 18.625, + "kl": 0.0, + "learning_rate": 8.665445630270846e-06, + "logits/chosen": 598988141.7142857, + "logits/rejected": 205760864.0, + "logps/chosen": -316.7793666294643, + "logps/rejected": -327.4037170410156, + "loss": 0.1778, + "rewards/chosen": 1.8835152217320033, + "rewards/margins": 13.156308991568428, + "rewards/rejected": -11.272793769836426, + "step": 2622 + }, + { + "epoch": 0.23965280950205572, + "grad_norm": 8.0625, + "kl": 0.0, + "learning_rate": 8.664467584212155e-06, + "logits/chosen": 527041280.0, + "logits/rejected": 537194368.0, + "logps/chosen": -347.601318359375, + "logps/rejected": -375.703369140625, + "loss": 0.0918, + "rewards/chosen": 3.31522274017334, + "rewards/margins": 9.950949668884277, + "rewards/rejected": -6.6357269287109375, + "step": 2623 + }, + { + "epoch": 0.23974417542256737, + "grad_norm": 3.75, + "kl": 0.0, + "learning_rate": 8.663489235135947e-06, + "logits/chosen": 468693606.4, + "logits/rejected": 574147157.3333334, + "logps/chosen": -328.1675537109375, + "logps/rejected": -388.1335042317708, + "loss": 0.0223, + "rewards/chosen": 3.751139831542969, + "rewards/margins": 12.248891703287761, + "rewards/rejected": -8.497751871744791, + "step": 2624 + }, + { + "epoch": 0.23983554134307902, + "grad_norm": 26.5, + "kl": 0.0, + "learning_rate": 8.662510583123123e-06, + "logits/chosen": 435931596.8, + "logits/rejected": 410764714.6666667, + "logps/chosen": -337.106689453125, + "logps/rejected": -492.8968912760417, + "loss": 0.0568, + "rewards/chosen": 2.7523334503173826, + "rewards/margins": 11.319735336303712, + "rewards/rejected": -8.567401885986328, + "step": 2625 + }, + { + "epoch": 0.23992690726359067, + "grad_norm": 2.296875, + "kl": 0.0, + "learning_rate": 8.661531628254606e-06, + "logits/chosen": 507963744.0, + "logits/rejected": 670988224.0, + "logps/chosen": -245.27882385253906, + "logps/rejected": -545.5643310546875, + "loss": 0.0111, + "rewards/chosen": 4.464199066162109, + "rewards/margins": 12.643020629882812, + "rewards/rejected": -8.178821563720703, + "step": 2626 + }, + { + "epoch": 0.24001827318410232, + "grad_norm": 24.625, + "kl": 0.0, + "learning_rate": 8.66055237061135e-06, + "logits/chosen": 490672800.0, + "logits/rejected": 410285696.0, + "logps/chosen": -132.8243865966797, + "logps/rejected": -428.2404378255208, + "loss": 0.0967, + "rewards/chosen": 1.1520214080810547, + "rewards/margins": 9.90772819519043, + "rewards/rejected": -8.755706787109375, + "step": 2627 + }, + { + "epoch": 0.24010963910461397, + "grad_norm": 2.5, + "kl": 0.0, + "learning_rate": 8.659572810274327e-06, + "logits/chosen": 322249066.6666667, + "logits/rejected": 563728179.2, + "logps/chosen": -237.9794921875, + "logps/rejected": -516.25185546875, + "loss": 0.0108, + "rewards/chosen": 4.618287404378255, + "rewards/margins": 13.925006612141928, + "rewards/rejected": -9.306719207763672, + "step": 2628 + }, + { + "epoch": 0.24020100502512562, + "grad_norm": 21.75, + "kl": 0.0, + "learning_rate": 8.65859294732454e-06, + "logits/chosen": 752714649.6, + "logits/rejected": 461766229.3333333, + "logps/chosen": -420.3451171875, + "logps/rejected": -418.1103515625, + "loss": 0.1542, + "rewards/chosen": 2.052862548828125, + "rewards/margins": 8.216644032796223, + "rewards/rejected": -6.163781483968099, + "step": 2629 + }, + { + "epoch": 0.24029237094563727, + "grad_norm": 3.5, + "kl": 0.0, + "learning_rate": 8.657612781843014e-06, + "logits/chosen": 585519744.0, + "logits/rejected": 506407360.0, + "logps/chosen": -322.8861083984375, + "logps/rejected": -575.01220703125, + "loss": 0.1389, + "rewards/chosen": 1.650498390197754, + "rewards/margins": 12.061627388000488, + "rewards/rejected": -10.411128997802734, + "step": 2630 + }, + { + "epoch": 0.24038373686614892, + "grad_norm": 1.0546875, + "kl": 0.0, + "learning_rate": 8.6566323139108e-06, + "logits/chosen": 315975264.0, + "logits/rejected": 367566496.0, + "logps/chosen": -231.09197998046875, + "logps/rejected": -510.9344177246094, + "loss": 0.0082, + "rewards/chosen": 4.2931132316589355, + "rewards/margins": 14.776057720184326, + "rewards/rejected": -10.48294448852539, + "step": 2631 + }, + { + "epoch": 0.24047510278666057, + "grad_norm": 21.875, + "kl": 0.0, + "learning_rate": 8.655651543608971e-06, + "logits/chosen": 436718208.0, + "logits/rejected": 380533674.6666667, + "logps/chosen": -202.67486572265625, + "logps/rejected": -459.2591145833333, + "loss": 0.1152, + "rewards/chosen": 2.142364978790283, + "rewards/margins": 9.75264851252238, + "rewards/rejected": -7.610283533732097, + "step": 2632 + }, + { + "epoch": 0.24056646870717222, + "grad_norm": 3.734375, + "kl": 0.0, + "learning_rate": 8.654670471018632e-06, + "logits/chosen": 856745386.6666666, + "logits/rejected": 651257753.6, + "logps/chosen": -363.6629231770833, + "logps/rejected": -471.936376953125, + "loss": 0.018, + "rewards/chosen": 3.1606648763020835, + "rewards/margins": 13.479338582356771, + "rewards/rejected": -10.318673706054687, + "step": 2633 + }, + { + "epoch": 0.24065783462768386, + "grad_norm": 4.71875, + "kl": 0.0, + "learning_rate": 8.653689096220905e-06, + "logits/chosen": 592533555.2, + "logits/rejected": 1140598869.3333333, + "logps/chosen": -251.0997802734375, + "logps/rejected": -476.8501383463542, + "loss": 0.0329, + "rewards/chosen": 3.214922332763672, + "rewards/margins": 11.520542526245118, + "rewards/rejected": -8.305620193481445, + "step": 2634 + }, + { + "epoch": 0.2407492005481955, + "grad_norm": 3.40625, + "kl": 0.0, + "learning_rate": 8.652707419296944e-06, + "logits/chosen": 710343424.0, + "logits/rejected": 464114688.0, + "logps/chosen": -256.3497721354167, + "logps/rejected": -478.15986328125, + "loss": 0.0118, + "rewards/chosen": 3.8320868810017905, + "rewards/margins": 12.797902615865071, + "rewards/rejected": -8.96581573486328, + "step": 2635 + }, + { + "epoch": 0.24084056646870716, + "grad_norm": 8.625, + "kl": 0.0, + "learning_rate": 8.65172544032792e-06, + "logits/chosen": 707166500.5714285, + "logits/rejected": 730504256.0, + "logps/chosen": -282.98507254464283, + "logps/rejected": -663.4351196289062, + "loss": 0.0691, + "rewards/chosen": 2.6646366119384766, + "rewards/margins": 9.53096342086792, + "rewards/rejected": -6.866326808929443, + "step": 2636 + }, + { + "epoch": 0.2409319323892188, + "grad_norm": 4.5625, + "kl": 0.0, + "learning_rate": 8.650743159395039e-06, + "logits/chosen": 514242880.0, + "logits/rejected": 629606144.0, + "logps/chosen": -271.70062255859375, + "logps/rejected": -687.1517333984375, + "loss": 0.0242, + "rewards/chosen": 3.1438887119293213, + "rewards/margins": 12.029452562332153, + "rewards/rejected": -8.885563850402832, + "step": 2637 + }, + { + "epoch": 0.24102329830973046, + "grad_norm": 2.671875, + "kl": 0.0, + "learning_rate": 8.64976057657952e-06, + "logits/chosen": 485898496.0, + "logits/rejected": 405586901.3333333, + "logps/chosen": -689.63818359375, + "logps/rejected": -432.7142740885417, + "loss": 0.0111, + "rewards/chosen": 3.54305362701416, + "rewards/margins": 11.920016924540201, + "rewards/rejected": -8.376963297526041, + "step": 2638 + }, + { + "epoch": 0.2411146642302421, + "grad_norm": 1.71875, + "kl": 0.0, + "learning_rate": 8.64877769196262e-06, + "logits/chosen": 448828544.0, + "logits/rejected": 437787340.8, + "logps/chosen": -340.5013427734375, + "logps/rejected": -430.153515625, + "loss": 0.0103, + "rewards/chosen": 3.9092124303181968, + "rewards/margins": 12.163626035054525, + "rewards/rejected": -8.254413604736328, + "step": 2639 + }, + { + "epoch": 0.24120603015075376, + "grad_norm": 5.65625, + "kl": 0.0, + "learning_rate": 8.64779450562561e-06, + "logits/chosen": 577462528.0, + "logits/rejected": 1000356864.0, + "logps/chosen": -309.372314453125, + "logps/rejected": -323.3972981770833, + "loss": 0.0375, + "rewards/chosen": 3.8461856842041016, + "rewards/margins": 9.09761905670166, + "rewards/rejected": -5.251433372497559, + "step": 2640 + }, + { + "epoch": 0.2412973960712654, + "grad_norm": 3.859375, + "kl": 0.0, + "learning_rate": 8.646811017649792e-06, + "logits/chosen": 443447296.0, + "logits/rejected": 228244821.33333334, + "logps/chosen": -403.6298828125, + "logps/rejected": -501.1680908203125, + "loss": 0.1346, + "rewards/chosen": 2.1938240051269533, + "rewards/margins": 13.165014139811198, + "rewards/rejected": -10.971190134684244, + "step": 2641 + }, + { + "epoch": 0.24138876199177706, + "grad_norm": 3.90625, + "kl": 0.0, + "learning_rate": 8.645827228116493e-06, + "logits/chosen": 347698048.0, + "logits/rejected": 375447808.0, + "logps/chosen": -210.1877685546875, + "logps/rejected": -530.461669921875, + "loss": 0.0511, + "rewards/chosen": 2.587454986572266, + "rewards/margins": 12.569613647460937, + "rewards/rejected": -9.982158660888672, + "step": 2642 + }, + { + "epoch": 0.2414801279122887, + "grad_norm": 2.953125, + "kl": 0.0, + "learning_rate": 8.644843137107058e-06, + "logits/chosen": 436049322.6666667, + "logits/rejected": 275175168.0, + "logps/chosen": -272.62636311848956, + "logps/rejected": -485.926513671875, + "loss": 0.0133, + "rewards/chosen": 4.127813339233398, + "rewards/margins": 12.772516250610352, + "rewards/rejected": -8.644702911376953, + "step": 2643 + }, + { + "epoch": 0.24157149383280035, + "grad_norm": 16.125, + "kl": 0.0, + "learning_rate": 8.643858744702869e-06, + "logits/chosen": 893709482.6666666, + "logits/rejected": 379916800.0, + "logps/chosen": -581.2491455078125, + "logps/rejected": -464.55009765625, + "loss": 0.0837, + "rewards/chosen": 3.4086268742879233, + "rewards/margins": 8.47114060719808, + "rewards/rejected": -5.062513732910157, + "step": 2644 + }, + { + "epoch": 0.241662859753312, + "grad_norm": 6.84375, + "kl": 0.0, + "learning_rate": 8.64287405098532e-06, + "logits/chosen": 375685120.0, + "logits/rejected": 475655232.0, + "logps/chosen": -287.14747837611606, + "logps/rejected": -581.6273193359375, + "loss": 0.1647, + "rewards/chosen": 2.223219462803432, + "rewards/margins": 11.501411982945033, + "rewards/rejected": -9.278192520141602, + "step": 2645 + }, + { + "epoch": 0.24175422567382365, + "grad_norm": 5.125, + "kl": 0.0, + "learning_rate": 8.641889056035842e-06, + "logits/chosen": 459739392.0, + "logits/rejected": 496537258.6666667, + "logps/chosen": -328.86279296875, + "logps/rejected": -608.8685709635416, + "loss": 0.0307, + "rewards/chosen": 3.08082275390625, + "rewards/margins": 11.570214080810548, + "rewards/rejected": -8.489391326904297, + "step": 2646 + }, + { + "epoch": 0.2418455915943353, + "grad_norm": 4.03125, + "kl": 0.0, + "learning_rate": 8.640903759935882e-06, + "logits/chosen": 837143722.6666666, + "logits/rejected": 527581132.8, + "logps/chosen": -645.3681233723959, + "logps/rejected": -430.668896484375, + "loss": 0.0157, + "rewards/chosen": 4.136377016703288, + "rewards/margins": 12.804509035746257, + "rewards/rejected": -8.668132019042968, + "step": 2647 + }, + { + "epoch": 0.24193695751484695, + "grad_norm": 1.3984375, + "kl": 0.0, + "learning_rate": 8.639918162766911e-06, + "logits/chosen": 905586048.0, + "logits/rejected": 595831637.3333334, + "logps/chosen": -549.9613037109375, + "logps/rejected": -584.2389322916666, + "loss": 0.0063, + "rewards/chosen": 3.8385894298553467, + "rewards/margins": 12.665114164352417, + "rewards/rejected": -8.82652473449707, + "step": 2648 + }, + { + "epoch": 0.2420283234353586, + "grad_norm": 11.0625, + "kl": 0.0, + "learning_rate": 8.638932264610435e-06, + "logits/chosen": 468217376.0, + "logits/rejected": 465090986.6666667, + "logps/chosen": -331.57928466796875, + "logps/rejected": -602.2860921223959, + "loss": 0.071, + "rewards/chosen": 3.764106273651123, + "rewards/margins": 10.746629873911541, + "rewards/rejected": -6.982523600260417, + "step": 2649 + }, + { + "epoch": 0.24211968935587025, + "grad_norm": 1.1171875, + "kl": 0.0, + "learning_rate": 8.637946065547978e-06, + "logits/chosen": 416065331.2, + "logits/rejected": 411901610.6666667, + "logps/chosen": -185.301513671875, + "logps/rejected": -417.292236328125, + "loss": 0.0077, + "rewards/chosen": 4.915552139282227, + "rewards/margins": 11.086423873901367, + "rewards/rejected": -6.170871734619141, + "step": 2650 + }, + { + "epoch": 0.2422110552763819, + "grad_norm": 1.015625, + "kl": 0.0, + "learning_rate": 8.636959565661086e-06, + "logits/chosen": 439782246.4, + "logits/rejected": 300785962.6666667, + "logps/chosen": -459.39287109375, + "logps/rejected": -405.5622151692708, + "loss": 0.007, + "rewards/chosen": 4.564706039428711, + "rewards/margins": 14.704078547159831, + "rewards/rejected": -10.13937250773112, + "step": 2651 + }, + { + "epoch": 0.24230242119689355, + "grad_norm": 2.0625, + "kl": 0.0, + "learning_rate": 8.635972765031334e-06, + "logits/chosen": 493304512.0, + "logits/rejected": 538859178.6666666, + "logps/chosen": -407.7788391113281, + "logps/rejected": -386.7527669270833, + "loss": 0.0072, + "rewards/chosen": 4.3396897315979, + "rewards/margins": 11.853197574615479, + "rewards/rejected": -7.513507843017578, + "step": 2652 + }, + { + "epoch": 0.2423937871174052, + "grad_norm": 0.640625, + "kl": 0.0, + "learning_rate": 8.634985663740323e-06, + "logits/chosen": 253607488.0, + "logits/rejected": 833048917.3333334, + "logps/chosen": -97.18147277832031, + "logps/rejected": -664.7289225260416, + "loss": 0.0037, + "rewards/chosen": 4.263840198516846, + "rewards/margins": 15.174674193064371, + "rewards/rejected": -10.910833994547525, + "step": 2653 + }, + { + "epoch": 0.24248515303791685, + "grad_norm": 13.9375, + "kl": 0.0, + "learning_rate": 8.633998261869679e-06, + "logits/chosen": 488375082.6666667, + "logits/rejected": 761359513.6, + "logps/chosen": -187.58349609375, + "logps/rejected": -418.744140625, + "loss": 0.0658, + "rewards/chosen": 2.343258857727051, + "rewards/margins": 11.53581485748291, + "rewards/rejected": -9.192555999755859, + "step": 2654 + }, + { + "epoch": 0.2425765189584285, + "grad_norm": 0.796875, + "kl": 0.0, + "learning_rate": 8.633010559501045e-06, + "logits/chosen": 846538069.3333334, + "logits/rejected": 445667840.0, + "logps/chosen": -389.7484537760417, + "logps/rejected": -439.403466796875, + "loss": 0.006, + "rewards/chosen": 4.552423477172852, + "rewards/margins": 13.277515029907226, + "rewards/rejected": -8.725091552734375, + "step": 2655 + }, + { + "epoch": 0.24266788487894014, + "grad_norm": 11.375, + "kl": 0.0, + "learning_rate": 8.6320225567161e-06, + "logits/chosen": 610993356.8, + "logits/rejected": 913833984.0, + "logps/chosen": -280.641357421875, + "logps/rejected": -471.7985026041667, + "loss": 0.1094, + "rewards/chosen": 3.5228233337402344, + "rewards/margins": 10.429409662882488, + "rewards/rejected": -6.906586329142253, + "step": 2656 + }, + { + "epoch": 0.2427592507994518, + "grad_norm": 28.875, + "kl": 0.0, + "learning_rate": 8.63103425359654e-06, + "logits/chosen": 732145459.2, + "logits/rejected": 555190784.0, + "logps/chosen": -208.673291015625, + "logps/rejected": -220.1365763346354, + "loss": 0.152, + "rewards/chosen": 3.1455509185791017, + "rewards/margins": 6.974562517801921, + "rewards/rejected": -3.829011599222819, + "step": 2657 + }, + { + "epoch": 0.24285061671996344, + "grad_norm": 8.25, + "kl": 0.0, + "learning_rate": 8.63004565022409e-06, + "logits/chosen": 438318336.0, + "logits/rejected": 432856576.0, + "logps/chosen": -375.1265462239583, + "logps/rejected": -532.005078125, + "loss": 0.0848, + "rewards/chosen": 4.1303755442301435, + "rewards/margins": 13.390686670939129, + "rewards/rejected": -9.260311126708984, + "step": 2658 + }, + { + "epoch": 0.2429419826404751, + "grad_norm": 4.78125, + "kl": 0.0, + "learning_rate": 8.629056746680498e-06, + "logits/chosen": 635315029.3333334, + "logits/rejected": 1017033113.6, + "logps/chosen": -446.9781901041667, + "logps/rejected": -446.07890625, + "loss": 0.0228, + "rewards/chosen": 2.974633534749349, + "rewards/margins": 11.050240834554037, + "rewards/rejected": -8.075607299804688, + "step": 2659 + }, + { + "epoch": 0.24303334856098674, + "grad_norm": 13.375, + "kl": 0.0, + "learning_rate": 8.628067543047538e-06, + "logits/chosen": 431763541.3333333, + "logits/rejected": 537109248.0, + "logps/chosen": -224.80792236328125, + "logps/rejected": -482.840625, + "loss": 0.0814, + "rewards/chosen": 2.837217648824056, + "rewards/margins": 9.611808713277181, + "rewards/rejected": -6.774591064453125, + "step": 2660 + }, + { + "epoch": 0.2431247144814984, + "grad_norm": 6.15625, + "kl": 0.0, + "learning_rate": 8.627078039407006e-06, + "logits/chosen": 484427136.0, + "logits/rejected": 580276352.0, + "logps/chosen": -270.15777587890625, + "logps/rejected": -522.0216064453125, + "loss": 0.0452, + "rewards/chosen": 2.5312726497650146, + "rewards/margins": 10.902303457260132, + "rewards/rejected": -8.371030807495117, + "step": 2661 + }, + { + "epoch": 0.24321608040201004, + "grad_norm": 2.515625, + "kl": 0.0, + "learning_rate": 8.626088235840726e-06, + "logits/chosen": 669944960.0, + "logits/rejected": 407895360.0, + "logps/chosen": -326.52838134765625, + "logps/rejected": -488.4403076171875, + "loss": 0.0171, + "rewards/chosen": 3.452012300491333, + "rewards/margins": 11.974995851516724, + "rewards/rejected": -8.52298355102539, + "step": 2662 + }, + { + "epoch": 0.2433074463225217, + "grad_norm": 3.703125, + "kl": 0.0, + "learning_rate": 8.625098132430545e-06, + "logits/chosen": 608719104.0, + "logits/rejected": 705248960.0, + "logps/chosen": -409.4646301269531, + "logps/rejected": -800.3344116210938, + "loss": 0.0197, + "rewards/chosen": 3.3795952796936035, + "rewards/margins": 15.007904529571533, + "rewards/rejected": -11.62830924987793, + "step": 2663 + }, + { + "epoch": 0.24339881224303334, + "grad_norm": 4.125, + "kl": 0.0, + "learning_rate": 8.624107729258335e-06, + "logits/chosen": 294373120.0, + "logits/rejected": 470637738.6666667, + "logps/chosen": -359.62529296875, + "logps/rejected": -428.4529215494792, + "loss": 0.0213, + "rewards/chosen": 3.6732139587402344, + "rewards/margins": 11.74328867594401, + "rewards/rejected": -8.070074717203775, + "step": 2664 + }, + { + "epoch": 0.24349017816354498, + "grad_norm": 12.25, + "kl": 0.0, + "learning_rate": 8.623117026405995e-06, + "logits/chosen": 606991872.0, + "logits/rejected": 397395370.6666667, + "logps/chosen": -419.0379333496094, + "logps/rejected": -467.2528076171875, + "loss": 0.0312, + "rewards/chosen": 2.0511443614959717, + "rewards/margins": 13.805449883143107, + "rewards/rejected": -11.754305521647135, + "step": 2665 + }, + { + "epoch": 0.24358154408405663, + "grad_norm": 6.1875, + "kl": 0.0, + "learning_rate": 8.622126023955446e-06, + "logits/chosen": 473615360.0, + "logits/rejected": 894169984.0, + "logps/chosen": -386.21185302734375, + "logps/rejected": -946.5096435546875, + "loss": 0.0287, + "rewards/chosen": 3.4116711616516113, + "rewards/margins": 13.295373439788818, + "rewards/rejected": -9.883702278137207, + "step": 2666 + }, + { + "epoch": 0.24367291000456828, + "grad_norm": 3.25, + "kl": 0.0, + "learning_rate": 8.621134721988634e-06, + "logits/chosen": 942801536.0, + "logits/rejected": 836742592.0, + "logps/chosen": -283.3548583984375, + "logps/rejected": -465.7201843261719, + "loss": 0.0175, + "rewards/chosen": 3.8242921829223633, + "rewards/margins": 13.868292808532715, + "rewards/rejected": -10.044000625610352, + "step": 2667 + }, + { + "epoch": 0.24376427592507993, + "grad_norm": 4.40625, + "kl": 0.0, + "learning_rate": 8.62014312058753e-06, + "logits/chosen": 239153648.0, + "logits/rejected": 515847968.0, + "logps/chosen": -276.2344970703125, + "logps/rejected": -599.9903564453125, + "loss": 0.0297, + "rewards/chosen": 3.562893867492676, + "rewards/margins": 12.7985200881958, + "rewards/rejected": -9.235626220703125, + "step": 2668 + }, + { + "epoch": 0.24385564184559158, + "grad_norm": 17.0, + "kl": 0.0, + "learning_rate": 8.619151219834133e-06, + "logits/chosen": 777155200.0, + "logits/rejected": 1086715392.0, + "logps/chosen": -529.642333984375, + "logps/rejected": -495.0578918457031, + "loss": 0.0583, + "rewards/chosen": 2.776447296142578, + "rewards/margins": 11.97477912902832, + "rewards/rejected": -9.198331832885742, + "step": 2669 + }, + { + "epoch": 0.24394700776610323, + "grad_norm": 7.75, + "kl": 0.0, + "learning_rate": 8.618159019810462e-06, + "logits/chosen": 390936384.0, + "logits/rejected": 457111488.0, + "logps/chosen": -273.87493896484375, + "logps/rejected": -388.6855773925781, + "loss": 0.043, + "rewards/chosen": 3.563878297805786, + "rewards/margins": 10.791105508804321, + "rewards/rejected": -7.227227210998535, + "step": 2670 + }, + { + "epoch": 0.24403837368661488, + "grad_norm": 14.4375, + "kl": 0.0, + "learning_rate": 8.617166520598563e-06, + "logits/chosen": 912936618.6666666, + "logits/rejected": 486475417.6, + "logps/chosen": -176.92061360677084, + "logps/rejected": -512.02939453125, + "loss": 0.0971, + "rewards/chosen": 3.5661961237589517, + "rewards/margins": 12.018260637919107, + "rewards/rejected": -8.452064514160156, + "step": 2671 + }, + { + "epoch": 0.24412973960712653, + "grad_norm": 3.1875, + "kl": 0.0, + "learning_rate": 8.616173722280507e-06, + "logits/chosen": 1220597376.0, + "logits/rejected": 655862058.6666666, + "logps/chosen": -416.84307861328125, + "logps/rejected": -423.6985677083333, + "loss": 0.0094, + "rewards/chosen": 3.614438056945801, + "rewards/margins": 12.71797784169515, + "rewards/rejected": -9.10353978474935, + "step": 2672 + }, + { + "epoch": 0.2442211055276382, + "grad_norm": 14.8125, + "kl": 0.0, + "learning_rate": 8.615180624938388e-06, + "logits/chosen": 897576857.6, + "logits/rejected": 594756010.6666666, + "logps/chosen": -351.3122802734375, + "logps/rejected": -359.0257161458333, + "loss": 0.0591, + "rewards/chosen": 4.119306182861328, + "rewards/margins": 10.728159205118814, + "rewards/rejected": -6.608853022257487, + "step": 2673 + }, + { + "epoch": 0.24431247144814985, + "grad_norm": 3.359375, + "kl": 0.0, + "learning_rate": 8.614187228654327e-06, + "logits/chosen": 462831402.6666667, + "logits/rejected": 491399577.6, + "logps/chosen": -439.111083984375, + "logps/rejected": -533.84619140625, + "loss": 0.0169, + "rewards/chosen": 3.1835784912109375, + "rewards/margins": 12.640370178222657, + "rewards/rejected": -9.45679168701172, + "step": 2674 + }, + { + "epoch": 0.2444038373686615, + "grad_norm": 2.96875, + "kl": 0.0, + "learning_rate": 8.613193533510468e-06, + "logits/chosen": 551533653.3333334, + "logits/rejected": 419635251.2, + "logps/chosen": -392.8758138020833, + "logps/rejected": -456.52568359375, + "loss": 0.0162, + "rewards/chosen": 3.1737823486328125, + "rewards/margins": 13.254895782470703, + "rewards/rejected": -10.08111343383789, + "step": 2675 + }, + { + "epoch": 0.24449520328917315, + "grad_norm": 4.40625, + "kl": 0.0, + "learning_rate": 8.61219953958898e-06, + "logits/chosen": 587917209.6, + "logits/rejected": 327412330.6666667, + "logps/chosen": -423.55732421875, + "logps/rejected": -490.9668782552083, + "loss": 0.0237, + "rewards/chosen": 3.7343921661376953, + "rewards/margins": 11.744750340779623, + "rewards/rejected": -8.010358174641928, + "step": 2676 + }, + { + "epoch": 0.2445865692096848, + "grad_norm": 9.625, + "kl": 0.0, + "learning_rate": 8.61120524697206e-06, + "logits/chosen": 533168160.0, + "logits/rejected": 617792384.0, + "logps/chosen": -246.63760375976562, + "logps/rejected": -363.48687744140625, + "loss": 0.1397, + "rewards/chosen": 1.5847103595733643, + "rewards/margins": 8.519829034805298, + "rewards/rejected": -6.935118675231934, + "step": 2677 + }, + { + "epoch": 0.24467793513019645, + "grad_norm": 1.578125, + "kl": 0.0, + "learning_rate": 8.610210655741923e-06, + "logits/chosen": 300915584.0, + "logits/rejected": 594666581.3333334, + "logps/chosen": -227.64190673828125, + "logps/rejected": -447.0774739583333, + "loss": 0.0071, + "rewards/chosen": 4.0186052322387695, + "rewards/margins": 12.660372098286947, + "rewards/rejected": -8.641766866048178, + "step": 2678 + }, + { + "epoch": 0.2447693010507081, + "grad_norm": 9.1875, + "kl": 0.0, + "learning_rate": 8.609215765980813e-06, + "logits/chosen": 710370176.0, + "logits/rejected": 435749280.0, + "logps/chosen": -479.0713704427083, + "logps/rejected": -327.0756530761719, + "loss": 0.1212, + "rewards/chosen": 2.586688677469889, + "rewards/margins": 9.682970682779947, + "rewards/rejected": -7.096282005310059, + "step": 2679 + }, + { + "epoch": 0.24486066697121975, + "grad_norm": 6.40625, + "kl": 0.0, + "learning_rate": 8.608220577771002e-06, + "logits/chosen": 583772800.0, + "logits/rejected": 671091840.0, + "logps/chosen": -449.8536376953125, + "logps/rejected": -501.238037109375, + "loss": 0.0414, + "rewards/chosen": 2.548844575881958, + "rewards/margins": 12.20077633857727, + "rewards/rejected": -9.651931762695312, + "step": 2680 + }, + { + "epoch": 0.2449520328917314, + "grad_norm": 2.4375, + "kl": 0.0, + "learning_rate": 8.60722509119478e-06, + "logits/chosen": 633761382.4, + "logits/rejected": 613318613.3333334, + "logps/chosen": -238.0778076171875, + "logps/rejected": -308.53118896484375, + "loss": 0.0243, + "rewards/chosen": 3.5042755126953127, + "rewards/margins": 12.911265563964843, + "rewards/rejected": -9.406990051269531, + "step": 2681 + }, + { + "epoch": 0.24504339881224305, + "grad_norm": 7.875, + "kl": 0.0, + "learning_rate": 8.606229306334463e-06, + "logits/chosen": 727968000.0, + "logits/rejected": 709468672.0, + "logps/chosen": -467.87294921875, + "logps/rejected": -466.3358154296875, + "loss": 0.0512, + "rewards/chosen": 2.6434137344360353, + "rewards/margins": 10.95663725535075, + "rewards/rejected": -8.313223520914713, + "step": 2682 + }, + { + "epoch": 0.2451347647327547, + "grad_norm": 5.34375, + "kl": 0.0, + "learning_rate": 8.605233223272395e-06, + "logits/chosen": 917751808.0, + "logits/rejected": 1461581312.0, + "logps/chosen": -300.778515625, + "logps/rejected": -865.580078125, + "loss": 0.0431, + "rewards/chosen": 2.78917236328125, + "rewards/margins": 14.439481608072917, + "rewards/rejected": -11.650309244791666, + "step": 2683 + }, + { + "epoch": 0.24522613065326634, + "grad_norm": 3.578125, + "kl": 0.0, + "learning_rate": 8.604236842090944e-06, + "logits/chosen": 757632512.0, + "logits/rejected": 593762752.0, + "logps/chosen": -401.9662272135417, + "logps/rejected": -703.191162109375, + "loss": 0.0222, + "rewards/chosen": 3.853466033935547, + "rewards/margins": 15.377077102661133, + "rewards/rejected": -11.523611068725586, + "step": 2684 + }, + { + "epoch": 0.245317496573778, + "grad_norm": 5.65625, + "kl": 0.0, + "learning_rate": 8.6032401628725e-06, + "logits/chosen": 490744490.6666667, + "logits/rejected": 392210662.4, + "logps/chosen": -358.7612711588542, + "logps/rejected": -404.60322265625, + "loss": 0.0273, + "rewards/chosen": 2.7180045445760093, + "rewards/margins": 12.58232339223226, + "rewards/rejected": -9.86431884765625, + "step": 2685 + }, + { + "epoch": 0.24540886249428964, + "grad_norm": 7.65625, + "kl": 0.0, + "learning_rate": 8.602243185699476e-06, + "logits/chosen": 374469504.0, + "logits/rejected": 354394816.0, + "logps/chosen": -235.94178771972656, + "logps/rejected": -465.8864440917969, + "loss": 0.1292, + "rewards/chosen": 1.7912284135818481, + "rewards/margins": 9.89493191242218, + "rewards/rejected": -8.103703498840332, + "step": 2686 + }, + { + "epoch": 0.2455002284148013, + "grad_norm": 3.34375, + "kl": 0.0, + "learning_rate": 8.60124591065432e-06, + "logits/chosen": 618536832.0, + "logits/rejected": 1273907968.0, + "logps/chosen": -191.47235107421875, + "logps/rejected": -543.9822387695312, + "loss": 0.0225, + "rewards/chosen": 3.1256113052368164, + "rewards/margins": 13.3053560256958, + "rewards/rejected": -10.179744720458984, + "step": 2687 + }, + { + "epoch": 0.24559159433531294, + "grad_norm": 12.5625, + "kl": 0.0, + "learning_rate": 8.600248337819493e-06, + "logits/chosen": 392669760.0, + "logits/rejected": 628931328.0, + "logps/chosen": -372.4206848144531, + "logps/rejected": -587.423583984375, + "loss": 0.1894, + "rewards/chosen": 1.9238930940628052, + "rewards/margins": 10.11844265460968, + "rewards/rejected": -8.194549560546875, + "step": 2688 + }, + { + "epoch": 0.2456829602558246, + "grad_norm": 6.6875, + "kl": 0.0, + "learning_rate": 8.599250467277483e-06, + "logits/chosen": 845400576.0, + "logits/rejected": 966548032.0, + "logps/chosen": -436.4640808105469, + "logps/rejected": -452.69366455078125, + "loss": 0.0325, + "rewards/chosen": 3.1817238330841064, + "rewards/margins": 14.155116319656372, + "rewards/rejected": -10.973392486572266, + "step": 2689 + }, + { + "epoch": 0.24577432617633624, + "grad_norm": 30.75, + "kl": 0.0, + "learning_rate": 8.598252299110809e-06, + "logits/chosen": 563060992.0, + "logits/rejected": 446391398.4, + "logps/chosen": -139.78750610351562, + "logps/rejected": -424.121728515625, + "loss": 0.1324, + "rewards/chosen": 0.6303189595540365, + "rewards/margins": 10.138904317220053, + "rewards/rejected": -9.508585357666016, + "step": 2690 + }, + { + "epoch": 0.2458656920968479, + "grad_norm": 6.28125, + "kl": 0.0, + "learning_rate": 8.597253833402008e-06, + "logits/chosen": 602714441.1428572, + "logits/rejected": 576223104.0, + "logps/chosen": -252.15574428013392, + "logps/rejected": -706.1287841796875, + "loss": 0.0458, + "rewards/chosen": 3.169538770403181, + "rewards/margins": 12.160828862871442, + "rewards/rejected": -8.991290092468262, + "step": 2691 + }, + { + "epoch": 0.24595705801735954, + "grad_norm": 16.5, + "kl": 0.0, + "learning_rate": 8.596255070233646e-06, + "logits/chosen": 560358144.0, + "logits/rejected": 434597856.0, + "logps/chosen": -247.80569458007812, + "logps/rejected": -504.5052490234375, + "loss": 0.0718, + "rewards/chosen": 2.2045984268188477, + "rewards/margins": 11.9502592086792, + "rewards/rejected": -9.745660781860352, + "step": 2692 + }, + { + "epoch": 0.24604842393787119, + "grad_norm": 25.375, + "kl": 0.0, + "learning_rate": 8.595256009688307e-06, + "logits/chosen": 805975978.6666666, + "logits/rejected": 508364192.0, + "logps/chosen": -391.7577311197917, + "logps/rejected": -297.483154296875, + "loss": 0.0921, + "rewards/chosen": 2.8288405736287436, + "rewards/margins": 10.883375485738119, + "rewards/rejected": -8.054534912109375, + "step": 2693 + }, + { + "epoch": 0.24613978985838283, + "grad_norm": 16.0, + "kl": 0.0, + "learning_rate": 8.594256651848609e-06, + "logits/chosen": 586468394.6666666, + "logits/rejected": 278268320.0, + "logps/chosen": -463.4581705729167, + "logps/rejected": -513.4417724609375, + "loss": 0.2597, + "rewards/chosen": 1.3883692423502605, + "rewards/margins": 8.411156813303629, + "rewards/rejected": -7.022787570953369, + "step": 2694 + }, + { + "epoch": 0.24623115577889448, + "grad_norm": 1.9296875, + "kl": 1.2548809051513672, + "learning_rate": 8.593256996797186e-06, + "logits/chosen": 684210395.4285715, + "logits/rejected": 490328192.0, + "logps/chosen": -311.87430245535717, + "logps/rejected": -456.037109375, + "loss": 0.0156, + "rewards/chosen": 4.244965962001255, + "rewards/margins": 12.400966099330358, + "rewards/rejected": -8.156000137329102, + "step": 2695 + }, + { + "epoch": 0.24632252169940613, + "grad_norm": 3.375, + "kl": 0.0, + "learning_rate": 8.592257044616701e-06, + "logits/chosen": 294168729.6, + "logits/rejected": 378617813.3333333, + "logps/chosen": -355.399462890625, + "logps/rejected": -402.8438313802083, + "loss": 0.0214, + "rewards/chosen": 3.673542022705078, + "rewards/margins": 15.234910329182943, + "rewards/rejected": -11.561368306477865, + "step": 2696 + }, + { + "epoch": 0.24641388761991778, + "grad_norm": 2.5625, + "kl": 0.0, + "learning_rate": 8.591256795389843e-06, + "logits/chosen": 447205472.0, + "logits/rejected": 755799296.0, + "logps/chosen": -268.2763671875, + "logps/rejected": -497.6751708984375, + "loss": 0.0303, + "rewards/chosen": 3.2865853309631348, + "rewards/margins": 11.71717882156372, + "rewards/rejected": -8.430593490600586, + "step": 2697 + }, + { + "epoch": 0.24650525354042943, + "grad_norm": 2.671875, + "kl": 0.0, + "learning_rate": 8.590256249199322e-06, + "logits/chosen": 1076291840.0, + "logits/rejected": 648147200.0, + "logps/chosen": -227.702880859375, + "logps/rejected": -498.37432861328125, + "loss": 0.0189, + "rewards/chosen": 3.358626365661621, + "rewards/margins": 12.290017127990723, + "rewards/rejected": -8.931390762329102, + "step": 2698 + }, + { + "epoch": 0.24659661946094108, + "grad_norm": 3.15625, + "kl": 0.0, + "learning_rate": 8.589255406127873e-06, + "logits/chosen": 605032960.0, + "logits/rejected": 463371221.3333333, + "logps/chosen": -221.3092041015625, + "logps/rejected": -405.081298828125, + "loss": 0.0222, + "rewards/chosen": 3.4646842956542967, + "rewards/margins": 11.586811828613282, + "rewards/rejected": -8.122127532958984, + "step": 2699 + }, + { + "epoch": 0.24668798538145273, + "grad_norm": 4.3125, + "kl": 0.0, + "learning_rate": 8.588254266258256e-06, + "logits/chosen": 575979776.0, + "logits/rejected": 487786304.0, + "logps/chosen": -516.0465087890625, + "logps/rejected": -464.0218505859375, + "loss": 0.0174, + "rewards/chosen": 4.109579563140869, + "rewards/margins": 10.859133243560791, + "rewards/rejected": -6.749553680419922, + "step": 2700 + }, + { + "epoch": 0.24677935130196438, + "grad_norm": 3.40625, + "kl": 0.0, + "learning_rate": 8.587252829673258e-06, + "logits/chosen": 682365781.3333334, + "logits/rejected": 266201523.2, + "logps/chosen": -248.47538248697916, + "logps/rejected": -244.57568359375, + "loss": 0.0191, + "rewards/chosen": 3.507136027018229, + "rewards/margins": 9.504534022013345, + "rewards/rejected": -5.997397994995117, + "step": 2701 + }, + { + "epoch": 0.24687071722247603, + "grad_norm": 4.28125, + "kl": 0.0, + "learning_rate": 8.586251096455686e-06, + "logits/chosen": 700003584.0, + "logits/rejected": 783129600.0, + "logps/chosen": -419.31455078125, + "logps/rejected": -390.617431640625, + "loss": 0.0224, + "rewards/chosen": 3.5263648986816407, + "rewards/margins": 13.807879384358724, + "rewards/rejected": -10.281514485677084, + "step": 2702 + }, + { + "epoch": 0.24696208314298768, + "grad_norm": 3.40625, + "kl": 0.0, + "learning_rate": 8.585249066688379e-06, + "logits/chosen": 551105484.8, + "logits/rejected": 878702336.0, + "logps/chosen": -332.245068359375, + "logps/rejected": -704.7725423177084, + "loss": 0.0245, + "rewards/chosen": 3.8067062377929686, + "rewards/margins": 14.061245727539063, + "rewards/rejected": -10.254539489746094, + "step": 2703 + }, + { + "epoch": 0.24705344906349933, + "grad_norm": 2.359375, + "kl": 0.0, + "learning_rate": 8.58424674045419e-06, + "logits/chosen": 382782933.3333333, + "logits/rejected": 625854464.0, + "logps/chosen": -465.98291015625, + "logps/rejected": -447.909375, + "loss": 0.0124, + "rewards/chosen": 3.754854202270508, + "rewards/margins": 11.339510726928712, + "rewards/rejected": -7.584656524658203, + "step": 2704 + }, + { + "epoch": 0.24714481498401097, + "grad_norm": 3.15625, + "kl": 0.0, + "learning_rate": 8.583244117836005e-06, + "logits/chosen": 540533376.0, + "logits/rejected": 420300000.0, + "logps/chosen": -438.2764485677083, + "logps/rejected": -607.1539916992188, + "loss": 0.0192, + "rewards/chosen": 3.9511985778808594, + "rewards/margins": 13.345647811889648, + "rewards/rejected": -9.394449234008789, + "step": 2705 + }, + { + "epoch": 0.24723618090452262, + "grad_norm": 3.96875, + "kl": 0.0, + "learning_rate": 8.582241198916732e-06, + "logits/chosen": 516541900.8, + "logits/rejected": 609770666.6666666, + "logps/chosen": -372.7927734375, + "logps/rejected": -328.4828694661458, + "loss": 0.0235, + "rewards/chosen": 3.841143798828125, + "rewards/margins": 10.522448094685872, + "rewards/rejected": -6.681304295857747, + "step": 2706 + }, + { + "epoch": 0.24732754682503427, + "grad_norm": 6.125, + "kl": 0.0, + "learning_rate": 8.5812379837793e-06, + "logits/chosen": 522075840.0, + "logits/rejected": 380412800.0, + "logps/chosen": -291.2301940917969, + "logps/rejected": -413.77978515625, + "loss": 0.0289, + "rewards/chosen": 3.547667980194092, + "rewards/margins": 12.574899196624756, + "rewards/rejected": -9.027231216430664, + "step": 2707 + }, + { + "epoch": 0.24741891274554592, + "grad_norm": 3.171875, + "kl": 0.0, + "learning_rate": 8.580234472506668e-06, + "logits/chosen": 168792661.33333334, + "logits/rejected": 353778534.4, + "logps/chosen": -120.04726155598958, + "logps/rejected": -505.903564453125, + "loss": 0.0221, + "rewards/chosen": 3.8418521881103516, + "rewards/margins": 12.469387435913086, + "rewards/rejected": -8.627535247802735, + "step": 2708 + }, + { + "epoch": 0.24751027866605757, + "grad_norm": 7.96875, + "kl": 0.0, + "learning_rate": 8.579230665181819e-06, + "logits/chosen": 281179498.6666667, + "logits/rejected": 459130675.2, + "logps/chosen": -351.4403889973958, + "logps/rejected": -368.0114501953125, + "loss": 0.0949, + "rewards/chosen": 3.556966781616211, + "rewards/margins": 9.431435012817383, + "rewards/rejected": -5.874468231201172, + "step": 2709 + }, + { + "epoch": 0.24760164458656922, + "grad_norm": 7.75, + "kl": 0.0, + "learning_rate": 8.578226561887755e-06, + "logits/chosen": 621537177.6, + "logits/rejected": 320323114.6666667, + "logps/chosen": -288.4494140625, + "logps/rejected": -428.5242513020833, + "loss": 0.0611, + "rewards/chosen": 2.313729095458984, + "rewards/margins": 12.527730433146157, + "rewards/rejected": -10.214001337687174, + "step": 2710 + }, + { + "epoch": 0.24769301050708087, + "grad_norm": 14.375, + "kl": 0.0, + "learning_rate": 8.577222162707506e-06, + "logits/chosen": 555542613.3333334, + "logits/rejected": 275990169.6, + "logps/chosen": -279.7230631510417, + "logps/rejected": -402.13388671875, + "loss": 0.024, + "rewards/chosen": 4.054050127665202, + "rewards/margins": 12.224340502421061, + "rewards/rejected": -8.17029037475586, + "step": 2711 + }, + { + "epoch": 0.24778437642759252, + "grad_norm": 4.8125, + "kl": 0.0, + "learning_rate": 8.576217467724129e-06, + "logits/chosen": 378509482.6666667, + "logits/rejected": 603599820.8, + "logps/chosen": -279.0738525390625, + "logps/rejected": -414.415087890625, + "loss": 0.1116, + "rewards/chosen": 3.1943012873331704, + "rewards/margins": 11.468484942118327, + "rewards/rejected": -8.274183654785157, + "step": 2712 + }, + { + "epoch": 0.24787574234810417, + "grad_norm": 6.59375, + "kl": 0.0, + "learning_rate": 8.5752124770207e-06, + "logits/chosen": 507101610.6666667, + "logits/rejected": 305984588.8, + "logps/chosen": -409.3056640625, + "logps/rejected": -426.110009765625, + "loss": 0.0207, + "rewards/chosen": 3.9036388397216797, + "rewards/margins": 12.089399337768555, + "rewards/rejected": -8.185760498046875, + "step": 2713 + }, + { + "epoch": 0.24796710826861582, + "grad_norm": 2.875, + "kl": 0.0, + "learning_rate": 8.574207190680328e-06, + "logits/chosen": 424863795.2, + "logits/rejected": 320807850.6666667, + "logps/chosen": -246.8152587890625, + "logps/rejected": -391.9541422526042, + "loss": 0.0345, + "rewards/chosen": 3.140806198120117, + "rewards/margins": 12.597365697224935, + "rewards/rejected": -9.456559499104818, + "step": 2714 + }, + { + "epoch": 0.24805847418912746, + "grad_norm": 8.25, + "kl": 0.0, + "learning_rate": 8.573201608786134e-06, + "logits/chosen": 651162176.0, + "logits/rejected": 444406966.85714287, + "logps/chosen": -436.91473388671875, + "logps/rejected": -352.87904575892856, + "loss": 0.044, + "rewards/chosen": 2.398211717605591, + "rewards/margins": 9.672634499413626, + "rewards/rejected": -7.274422781808036, + "step": 2715 + }, + { + "epoch": 0.2481498401096391, + "grad_norm": 14.25, + "kl": 0.0, + "learning_rate": 8.572195731421275e-06, + "logits/chosen": 329864850.28571427, + "logits/rejected": 635533952.0, + "logps/chosen": -268.207275390625, + "logps/rejected": -565.454833984375, + "loss": 0.0738, + "rewards/chosen": 3.1172894069126675, + "rewards/margins": 5.5244549342564175, + "rewards/rejected": -2.40716552734375, + "step": 2716 + }, + { + "epoch": 0.24824120603015076, + "grad_norm": 6.40625, + "kl": 0.0, + "learning_rate": 8.571189558668924e-06, + "logits/chosen": 717493708.8, + "logits/rejected": 1293755136.0, + "logps/chosen": -345.239208984375, + "logps/rejected": -628.6351725260416, + "loss": 0.0302, + "rewards/chosen": 3.5323890686035155, + "rewards/margins": 16.4069211324056, + "rewards/rejected": -12.874532063802084, + "step": 2717 + }, + { + "epoch": 0.2483325719506624, + "grad_norm": 2.375, + "kl": 0.0, + "learning_rate": 8.570183090612286e-06, + "logits/chosen": 626061760.0, + "logits/rejected": 503623616.0, + "logps/chosen": -244.06707763671875, + "logps/rejected": -309.6048583984375, + "loss": 0.0216, + "rewards/chosen": 3.160006046295166, + "rewards/margins": 10.7774338722229, + "rewards/rejected": -7.617427825927734, + "step": 2718 + }, + { + "epoch": 0.24842393787117406, + "grad_norm": 2.796875, + "kl": 0.0, + "learning_rate": 8.569176327334585e-06, + "logits/chosen": 600048085.3333334, + "logits/rejected": 422472499.2, + "logps/chosen": -448.5077311197917, + "logps/rejected": -373.234765625, + "loss": 0.0145, + "rewards/chosen": 4.457559267679851, + "rewards/margins": 11.196623293558757, + "rewards/rejected": -6.739064025878906, + "step": 2719 + }, + { + "epoch": 0.2485153037916857, + "grad_norm": 3.265625, + "kl": 0.0, + "learning_rate": 8.568169268919072e-06, + "logits/chosen": 482692249.6, + "logits/rejected": 735415296.0, + "logps/chosen": -265.119580078125, + "logps/rejected": -501.5321858723958, + "loss": 0.0303, + "rewards/chosen": 3.7965728759765627, + "rewards/margins": 14.118806076049804, + "rewards/rejected": -10.322233200073242, + "step": 2720 + }, + { + "epoch": 0.24860666971219736, + "grad_norm": 2.171875, + "kl": 0.0, + "learning_rate": 8.567161915449018e-06, + "logits/chosen": 481446656.0, + "logits/rejected": 395514521.6, + "logps/chosen": -408.9205322265625, + "logps/rejected": -603.595361328125, + "loss": 0.0102, + "rewards/chosen": 3.7698167165120444, + "rewards/margins": 13.50599225362142, + "rewards/rejected": -9.736175537109375, + "step": 2721 + }, + { + "epoch": 0.248698035632709, + "grad_norm": 3.359375, + "kl": 0.0, + "learning_rate": 8.566154267007725e-06, + "logits/chosen": 1073187020.8, + "logits/rejected": 599899818.6666666, + "logps/chosen": -397.69453125, + "logps/rejected": -489.1720377604167, + "loss": 0.0244, + "rewards/chosen": 3.2792545318603517, + "rewards/margins": 12.366207377115884, + "rewards/rejected": -9.086952845255533, + "step": 2722 + }, + { + "epoch": 0.24878940155322066, + "grad_norm": 5.6875, + "kl": 0.0, + "learning_rate": 8.565146323678514e-06, + "logits/chosen": 985252992.0, + "logits/rejected": 815754240.0, + "logps/chosen": -393.22662353515625, + "logps/rejected": -532.7912248883929, + "loss": 0.0253, + "rewards/chosen": 3.8423614501953125, + "rewards/margins": 11.914919172014509, + "rewards/rejected": -8.072557721819196, + "step": 2723 + }, + { + "epoch": 0.2488807674737323, + "grad_norm": 6.84375, + "kl": 0.0, + "learning_rate": 8.564138085544736e-06, + "logits/chosen": 335467008.0, + "logits/rejected": 371692714.6666667, + "logps/chosen": -250.22890625, + "logps/rejected": -472.1934814453125, + "loss": 0.0276, + "rewards/chosen": 3.5629310607910156, + "rewards/margins": 12.924267450968424, + "rewards/rejected": -9.361336390177408, + "step": 2724 + }, + { + "epoch": 0.24897213339424396, + "grad_norm": 1.5625, + "kl": 0.0, + "learning_rate": 8.563129552689759e-06, + "logits/chosen": 318452949.3333333, + "logits/rejected": 664018329.6, + "logps/chosen": -265.8043619791667, + "logps/rejected": -703.689599609375, + "loss": 0.0069, + "rewards/chosen": 4.616269747416179, + "rewards/margins": 14.352224795023602, + "rewards/rejected": -9.735955047607423, + "step": 2725 + }, + { + "epoch": 0.2490634993147556, + "grad_norm": 2.03125, + "kl": 0.0, + "learning_rate": 8.562120725196984e-06, + "logits/chosen": 570672128.0, + "logits/rejected": 716545945.6, + "logps/chosen": -376.0648193359375, + "logps/rejected": -453.9173828125, + "loss": 0.0098, + "rewards/chosen": 4.20233154296875, + "rewards/margins": 12.189610290527344, + "rewards/rejected": -7.987278747558594, + "step": 2726 + }, + { + "epoch": 0.24915486523526725, + "grad_norm": 2.671875, + "kl": 0.0, + "learning_rate": 8.561111603149826e-06, + "logits/chosen": 576190464.0, + "logits/rejected": 271458048.0, + "logps/chosen": -513.3592529296875, + "logps/rejected": -285.15399169921875, + "loss": 0.0168, + "rewards/chosen": 3.905003547668457, + "rewards/margins": 11.244558811187744, + "rewards/rejected": -7.339555263519287, + "step": 2727 + }, + { + "epoch": 0.2492462311557789, + "grad_norm": 3.90625, + "kl": 0.0, + "learning_rate": 8.560102186631733e-06, + "logits/chosen": 892861610.6666666, + "logits/rejected": 524959027.2, + "logps/chosen": -414.6806233723958, + "logps/rejected": -474.7498046875, + "loss": 0.0239, + "rewards/chosen": 2.7543551127115884, + "rewards/margins": 11.98568369547526, + "rewards/rejected": -9.231328582763672, + "step": 2728 + }, + { + "epoch": 0.24933759707629055, + "grad_norm": 4.28125, + "kl": 0.0, + "learning_rate": 8.559092475726176e-06, + "logits/chosen": 710072576.0, + "logits/rejected": 386001996.8, + "logps/chosen": -408.1000162760417, + "logps/rejected": -247.7850341796875, + "loss": 0.0264, + "rewards/chosen": 3.298593521118164, + "rewards/margins": 10.01066780090332, + "rewards/rejected": -6.712074279785156, + "step": 2729 + }, + { + "epoch": 0.2494289629968022, + "grad_norm": 7.4375, + "kl": 0.0, + "learning_rate": 8.558082470516644e-06, + "logits/chosen": 490748288.0, + "logits/rejected": 1191214336.0, + "logps/chosen": -260.68983968098956, + "logps/rejected": -450.8740539550781, + "loss": 0.0634, + "rewards/chosen": 3.6321487426757812, + "rewards/margins": 11.062653064727783, + "rewards/rejected": -7.430504322052002, + "step": 2730 + }, + { + "epoch": 0.24952032891731385, + "grad_norm": 6.8125, + "kl": 0.0, + "learning_rate": 8.55707217108666e-06, + "logits/chosen": 791432652.8, + "logits/rejected": 705507925.3333334, + "logps/chosen": -196.8890625, + "logps/rejected": -325.41375732421875, + "loss": 0.0469, + "rewards/chosen": 2.840793800354004, + "rewards/margins": 10.43403263092041, + "rewards/rejected": -7.593238830566406, + "step": 2731 + }, + { + "epoch": 0.2496116948378255, + "grad_norm": 2.75, + "kl": 0.0, + "learning_rate": 8.556061577519765e-06, + "logits/chosen": 470278720.0, + "logits/rejected": 552748416.0, + "logps/chosen": -287.220458984375, + "logps/rejected": -432.60430908203125, + "loss": 0.012, + "rewards/chosen": 4.084222316741943, + "rewards/margins": 12.855812549591064, + "rewards/rejected": -8.771590232849121, + "step": 2732 + }, + { + "epoch": 0.24970306075833715, + "grad_norm": 6.375, + "kl": 0.0, + "learning_rate": 8.555050689899525e-06, + "logits/chosen": 437953638.4, + "logits/rejected": 301950336.0, + "logps/chosen": -383.080224609375, + "logps/rejected": -383.7117513020833, + "loss": 0.0471, + "rewards/chosen": 2.628114128112793, + "rewards/margins": 11.887580935160319, + "rewards/rejected": -9.259466807047525, + "step": 2733 + }, + { + "epoch": 0.2497944266788488, + "grad_norm": 3.9375, + "kl": 0.0, + "learning_rate": 8.55403950830953e-06, + "logits/chosen": 698715776.0, + "logits/rejected": 344352704.0, + "logps/chosen": -309.59259033203125, + "logps/rejected": -300.3031921386719, + "loss": 0.0273, + "rewards/chosen": 2.8956804275512695, + "rewards/margins": 11.914352416992188, + "rewards/rejected": -9.018671989440918, + "step": 2734 + }, + { + "epoch": 0.24988579259936045, + "grad_norm": 10.5625, + "kl": 0.0, + "learning_rate": 8.553028032833397e-06, + "logits/chosen": 496785322.6666667, + "logits/rejected": 826325632.0, + "logps/chosen": -384.1194661458333, + "logps/rejected": -920.2129516601562, + "loss": 0.0325, + "rewards/chosen": 3.5413195292154946, + "rewards/margins": 13.25854746500651, + "rewards/rejected": -9.717227935791016, + "step": 2735 + }, + { + "epoch": 0.2499771585198721, + "grad_norm": 6.1875, + "kl": 0.0, + "learning_rate": 8.552016263554765e-06, + "logits/chosen": 547644928.0, + "logits/rejected": 597938773.3333334, + "logps/chosen": -398.177783203125, + "logps/rejected": -447.3111165364583, + "loss": 0.0423, + "rewards/chosen": 2.7553407669067385, + "rewards/margins": 12.2883633295695, + "rewards/rejected": -9.53302256266276, + "step": 2736 + }, + { + "epoch": 0.25006852444038374, + "grad_norm": 1.7578125, + "kl": 0.0, + "learning_rate": 8.5510042005573e-06, + "logits/chosen": 909898240.0, + "logits/rejected": 486953045.3333333, + "logps/chosen": -362.7441162109375, + "logps/rejected": -596.1116536458334, + "loss": 0.0084, + "rewards/chosen": 4.676476669311524, + "rewards/margins": 14.03013916015625, + "rewards/rejected": -9.353662490844727, + "step": 2737 + }, + { + "epoch": 0.25015989036089537, + "grad_norm": 5.21875, + "kl": 0.0, + "learning_rate": 8.549991843924687e-06, + "logits/chosen": 304149354.6666667, + "logits/rejected": 462633318.4, + "logps/chosen": -291.5590413411458, + "logps/rejected": -549.89697265625, + "loss": 0.0178, + "rewards/chosen": 4.615525563557942, + "rewards/margins": 14.490672047932943, + "rewards/rejected": -9.875146484375, + "step": 2738 + }, + { + "epoch": 0.25025125628140704, + "grad_norm": 2.265625, + "kl": 0.0, + "learning_rate": 8.54897919374064e-06, + "logits/chosen": 445931622.4, + "logits/rejected": 404793856.0, + "logps/chosen": -404.6939453125, + "logps/rejected": -451.9990234375, + "loss": 0.0172, + "rewards/chosen": 3.6689544677734376, + "rewards/margins": 14.017215983072916, + "rewards/rejected": -10.348261515299479, + "step": 2739 + }, + { + "epoch": 0.25034262220191866, + "grad_norm": 1.7890625, + "kl": 0.0, + "learning_rate": 8.547966250088896e-06, + "logits/chosen": 598230656.0, + "logits/rejected": 446431744.0, + "logps/chosen": -326.2747802734375, + "logps/rejected": -529.01181640625, + "loss": 0.008, + "rewards/chosen": 4.188559850056966, + "rewards/margins": 13.593942387898764, + "rewards/rejected": -9.405382537841797, + "step": 2740 + }, + { + "epoch": 0.25043398812243034, + "grad_norm": 20.75, + "kl": 0.0, + "learning_rate": 8.546953013053214e-06, + "logits/chosen": 657255594.6666666, + "logits/rejected": 1757323776.0, + "logps/chosen": -289.03216552734375, + "logps/rejected": -499.9963073730469, + "loss": 0.0948, + "rewards/chosen": 3.5899426142374673, + "rewards/margins": 9.475075403849283, + "rewards/rejected": -5.885132789611816, + "step": 2741 + }, + { + "epoch": 0.25052535404294196, + "grad_norm": 0.98828125, + "kl": 0.0, + "learning_rate": 8.545939482717382e-06, + "logits/chosen": 342746197.3333333, + "logits/rejected": 482059724.8, + "logps/chosen": -346.1033121744792, + "logps/rejected": -417.09677734375, + "loss": 0.0049, + "rewards/chosen": 4.618945439656575, + "rewards/margins": 13.5842103322347, + "rewards/rejected": -8.965264892578125, + "step": 2742 + }, + { + "epoch": 0.25061671996345364, + "grad_norm": 2.171875, + "kl": 0.0, + "learning_rate": 8.54492565916521e-06, + "logits/chosen": 621475008.0, + "logits/rejected": 366329472.0, + "logps/chosen": -398.7506408691406, + "logps/rejected": -493.75665283203125, + "loss": 0.0113, + "rewards/chosen": 3.976105213165283, + "rewards/margins": 12.758106708526611, + "rewards/rejected": -8.782001495361328, + "step": 2743 + }, + { + "epoch": 0.25070808588396526, + "grad_norm": 17.0, + "kl": 0.0, + "learning_rate": 8.54391154248053e-06, + "logits/chosen": 402503193.6, + "logits/rejected": 1047269717.3333334, + "logps/chosen": -120.61217041015625, + "logps/rejected": -395.6534830729167, + "loss": 0.083, + "rewards/chosen": 2.820231056213379, + "rewards/margins": 10.895655886332193, + "rewards/rejected": -8.075424830118815, + "step": 2744 + }, + { + "epoch": 0.25079945180447694, + "grad_norm": 1.640625, + "kl": 0.0, + "learning_rate": 8.542897132747201e-06, + "logits/chosen": 502704170.6666667, + "logits/rejected": 765178624.0, + "logps/chosen": -308.7174072265625, + "logps/rejected": -561.22548828125, + "loss": 0.0087, + "rewards/chosen": 3.8635425567626953, + "rewards/margins": 13.604934310913086, + "rewards/rejected": -9.74139175415039, + "step": 2745 + }, + { + "epoch": 0.25089081772498856, + "grad_norm": 0.8046875, + "kl": 0.0, + "learning_rate": 8.541882430049103e-06, + "logits/chosen": 608726357.3333334, + "logits/rejected": 1062786252.8, + "logps/chosen": -243.0432332356771, + "logps/rejected": -496.33759765625, + "loss": 0.0059, + "rewards/chosen": 4.653120040893555, + "rewards/margins": 12.922906112670898, + "rewards/rejected": -8.269786071777343, + "step": 2746 + }, + { + "epoch": 0.25098218364550023, + "grad_norm": 31.625, + "kl": 0.0, + "learning_rate": 8.540867434470146e-06, + "logits/chosen": 362428748.8, + "logits/rejected": 353339797.3333333, + "logps/chosen": -205.5770263671875, + "logps/rejected": -498.0873209635417, + "loss": 0.1292, + "rewards/chosen": 2.049884796142578, + "rewards/margins": 10.885043462117514, + "rewards/rejected": -8.835158665974935, + "step": 2747 + }, + { + "epoch": 0.25107354956601186, + "grad_norm": 0.50390625, + "kl": 0.0, + "learning_rate": 8.539852146094257e-06, + "logits/chosen": 640381056.0, + "logits/rejected": 911583317.3333334, + "logps/chosen": -347.861083984375, + "logps/rejected": -460.2237141927083, + "loss": 0.0024, + "rewards/chosen": 4.927639961242676, + "rewards/margins": 12.887360572814941, + "rewards/rejected": -7.959720611572266, + "step": 2748 + }, + { + "epoch": 0.25116491548652353, + "grad_norm": 3.25, + "kl": 0.0, + "learning_rate": 8.538836565005395e-06, + "logits/chosen": 1112035942.4, + "logits/rejected": 625420970.6666666, + "logps/chosen": -122.8909912109375, + "logps/rejected": -605.9407552083334, + "loss": 0.0279, + "rewards/chosen": 3.4879932403564453, + "rewards/margins": 14.464508692423502, + "rewards/rejected": -10.976515452067057, + "step": 2749 + }, + { + "epoch": 0.25125628140703515, + "grad_norm": 3.953125, + "kl": 0.0, + "learning_rate": 8.537820691287537e-06, + "logits/chosen": 1588679424.0, + "logits/rejected": 466436224.0, + "logps/chosen": -568.016357421875, + "logps/rejected": -353.174560546875, + "loss": 0.0151, + "rewards/chosen": 2.9518065452575684, + "rewards/margins": 11.437727769215902, + "rewards/rejected": -8.485921223958334, + "step": 2750 + }, + { + "epoch": 0.25134764732754683, + "grad_norm": 12.5, + "kl": 0.0, + "learning_rate": 8.536804525024684e-06, + "logits/chosen": 503738163.2, + "logits/rejected": 722783573.3333334, + "logps/chosen": -301.2956787109375, + "logps/rejected": -528.6199951171875, + "loss": 0.0487, + "rewards/chosen": 3.07238655090332, + "rewards/margins": 9.018353271484376, + "rewards/rejected": -5.945966720581055, + "step": 2751 + }, + { + "epoch": 0.25143901324805845, + "grad_norm": 10.1875, + "kl": 14.540258407592773, + "learning_rate": 8.53578806630087e-06, + "logits/chosen": 553273536.0, + "logps/chosen": -225.8207244873047, + "loss": 0.1055, + "rewards/chosen": 3.7916789054870605, + "step": 2752 + }, + { + "epoch": 0.25153037916857013, + "grad_norm": 20.0, + "kl": 0.0, + "learning_rate": 8.534771315200139e-06, + "logits/chosen": 1249841408.0, + "logits/rejected": 706502041.6, + "logps/chosen": -387.9419352213542, + "logps/rejected": -570.578369140625, + "loss": 0.0359, + "rewards/chosen": 3.8804747263590493, + "rewards/margins": 11.855959192911783, + "rewards/rejected": -7.975484466552734, + "step": 2753 + }, + { + "epoch": 0.25162174508908175, + "grad_norm": 5.125, + "kl": 0.0, + "learning_rate": 8.533754271806573e-06, + "logits/chosen": 598322790.4, + "logits/rejected": 471504128.0, + "logps/chosen": -413.536181640625, + "logps/rejected": -570.37353515625, + "loss": 0.0324, + "rewards/chosen": 3.252583312988281, + "rewards/margins": 13.642901865641274, + "rewards/rejected": -10.390318552652994, + "step": 2754 + }, + { + "epoch": 0.2517131110095934, + "grad_norm": 1.2734375, + "kl": 0.0, + "learning_rate": 8.532736936204269e-06, + "logits/chosen": 900925184.0, + "logits/rejected": 413155413.3333333, + "logps/chosen": -392.6676025390625, + "logps/rejected": -563.0515543619791, + "loss": 0.0047, + "rewards/chosen": 4.514592170715332, + "rewards/margins": 15.576526323954264, + "rewards/rejected": -11.061934153238932, + "step": 2755 + }, + { + "epoch": 0.25180447693010505, + "grad_norm": 2.890625, + "kl": 0.0, + "learning_rate": 8.531719308477351e-06, + "logits/chosen": 493994291.2, + "logits/rejected": 332728042.6666667, + "logps/chosen": -255.5771484375, + "logps/rejected": -394.917724609375, + "loss": 0.0198, + "rewards/chosen": 3.7715293884277346, + "rewards/margins": 12.61595458984375, + "rewards/rejected": -8.844425201416016, + "step": 2756 + }, + { + "epoch": 0.2518958428506167, + "grad_norm": 3.796875, + "kl": 0.0, + "learning_rate": 8.53070138870997e-06, + "logits/chosen": 450696768.0, + "logits/rejected": 514626976.0, + "logps/chosen": -302.39154052734375, + "logps/rejected": -569.3350830078125, + "loss": 0.0194, + "rewards/chosen": 3.6006507873535156, + "rewards/margins": 13.623811721801758, + "rewards/rejected": -10.023160934448242, + "step": 2757 + }, + { + "epoch": 0.25198720877112835, + "grad_norm": 2.734375, + "kl": 0.0, + "learning_rate": 8.529683176986295e-06, + "logits/chosen": 560633139.2, + "logits/rejected": 365798314.6666667, + "logps/chosen": -268.426953125, + "logps/rejected": -460.0273030598958, + "loss": 0.0215, + "rewards/chosen": 3.5041412353515624, + "rewards/margins": 13.876786931355795, + "rewards/rejected": -10.372645696004232, + "step": 2758 + }, + { + "epoch": 0.25207857469164, + "grad_norm": 16.875, + "kl": 0.0, + "learning_rate": 8.528664673390526e-06, + "logits/chosen": 658287513.6, + "logits/rejected": 923200000.0, + "logps/chosen": -196.69718017578126, + "logps/rejected": -428.971435546875, + "loss": 0.058, + "rewards/chosen": 2.7302082061767576, + "rewards/margins": 12.358808008829751, + "rewards/rejected": -9.628599802652994, + "step": 2759 + }, + { + "epoch": 0.25216994061215164, + "grad_norm": 16.125, + "kl": 0.0, + "learning_rate": 8.527645878006881e-06, + "logits/chosen": 544047872.0, + "logits/rejected": 642411968.0, + "logps/chosen": -188.45875549316406, + "logps/rejected": -458.43780517578125, + "loss": 0.0383, + "rewards/chosen": 4.172070503234863, + "rewards/margins": 12.976090431213379, + "rewards/rejected": -8.804019927978516, + "step": 2760 + }, + { + "epoch": 0.2522613065326633, + "grad_norm": 8.75, + "kl": 0.0, + "learning_rate": 8.526626790919608e-06, + "logits/chosen": 651690240.0, + "logits/rejected": 1654830464.0, + "logps/chosen": -285.4615885416667, + "logps/rejected": -751.449951171875, + "loss": 0.0668, + "rewards/chosen": 2.508882681528727, + "rewards/margins": 15.201418081919352, + "rewards/rejected": -12.692535400390625, + "step": 2761 + }, + { + "epoch": 0.25235267245317494, + "grad_norm": 36.0, + "kl": 0.0, + "learning_rate": 8.525607412212972e-06, + "logits/chosen": 515405120.0, + "logits/rejected": 419279040.0, + "logps/chosen": -373.31658935546875, + "logps/rejected": -442.8458557128906, + "loss": 0.0714, + "rewards/chosen": 3.4349184036254883, + "rewards/margins": 11.08599042892456, + "rewards/rejected": -7.651072025299072, + "step": 2762 + }, + { + "epoch": 0.2524440383736866, + "grad_norm": 7.1875, + "kl": 0.0, + "learning_rate": 8.52458774197127e-06, + "logits/chosen": 619187157.3333334, + "logits/rejected": 333546009.6, + "logps/chosen": -360.97607421875, + "logps/rejected": -496.226953125, + "loss": 0.1268, + "rewards/chosen": 2.5126527150472007, + "rewards/margins": 12.023171361287435, + "rewards/rejected": -9.510518646240234, + "step": 2763 + }, + { + "epoch": 0.25253540429419824, + "grad_norm": 1.9296875, + "kl": 0.0, + "learning_rate": 8.523567780278818e-06, + "logits/chosen": 421735065.6, + "logits/rejected": 286309290.6666667, + "logps/chosen": -371.977490234375, + "logps/rejected": -479.2659098307292, + "loss": 0.0089, + "rewards/chosen": 4.601205444335937, + "rewards/margins": 15.382862345377603, + "rewards/rejected": -10.781656901041666, + "step": 2764 + }, + { + "epoch": 0.2526267702147099, + "grad_norm": 4.21875, + "kl": 0.0, + "learning_rate": 8.522547527219956e-06, + "logits/chosen": 521376563.2, + "logits/rejected": 329507541.3333333, + "logps/chosen": -311.63486328125, + "logps/rejected": -421.99365234375, + "loss": 0.0254, + "rewards/chosen": 3.4748291015625, + "rewards/margins": 12.36290651957194, + "rewards/rejected": -8.88807741800944, + "step": 2765 + }, + { + "epoch": 0.25271813613522154, + "grad_norm": 6.875, + "kl": 0.0, + "learning_rate": 8.521526982879052e-06, + "logits/chosen": 610847658.6666666, + "logits/rejected": 764033024.0, + "logps/chosen": -430.8921712239583, + "logps/rejected": -341.0328369140625, + "loss": 0.0616, + "rewards/chosen": 2.6993350982666016, + "rewards/margins": 10.948123931884766, + "rewards/rejected": -8.248788833618164, + "step": 2766 + }, + { + "epoch": 0.2528095020557332, + "grad_norm": 3.125, + "kl": 0.0, + "learning_rate": 8.520506147340492e-06, + "logits/chosen": 738141286.4, + "logits/rejected": 620608000.0, + "logps/chosen": -334.1482666015625, + "logps/rejected": -507.111328125, + "loss": 0.0273, + "rewards/chosen": 3.4457443237304686, + "rewards/margins": 12.303360112508138, + "rewards/rejected": -8.85761578877767, + "step": 2767 + }, + { + "epoch": 0.25290086797624484, + "grad_norm": 5.71875, + "kl": 0.0, + "learning_rate": 8.519485020688694e-06, + "logits/chosen": 672261376.0, + "logits/rejected": 1061495125.3333334, + "logps/chosen": -382.6755676269531, + "logps/rejected": -529.2728271484375, + "loss": 0.0237, + "rewards/chosen": 2.352848768234253, + "rewards/margins": 13.054943799972534, + "rewards/rejected": -10.702095031738281, + "step": 2768 + }, + { + "epoch": 0.2529922338967565, + "grad_norm": 1.125, + "kl": 0.0, + "learning_rate": 8.518463603008093e-06, + "logits/chosen": 408788992.0, + "logits/rejected": 582719658.6666666, + "logps/chosen": -312.23126220703125, + "logps/rejected": -517.8253987630209, + "loss": 0.0057, + "rewards/chosen": 4.091639518737793, + "rewards/margins": 12.656765302022299, + "rewards/rejected": -8.565125783284506, + "step": 2769 + }, + { + "epoch": 0.25308359981726813, + "grad_norm": 2.765625, + "kl": 0.0, + "learning_rate": 8.51744189438315e-06, + "logits/chosen": 573552947.2, + "logits/rejected": 344080298.6666667, + "logps/chosen": -413.7330078125, + "logps/rejected": -397.1107584635417, + "loss": 0.0168, + "rewards/chosen": 3.894841766357422, + "rewards/margins": 10.190836842854818, + "rewards/rejected": -6.2959950764973955, + "step": 2770 + }, + { + "epoch": 0.2531749657377798, + "grad_norm": 2.078125, + "kl": 0.0, + "learning_rate": 8.516419894898356e-06, + "logits/chosen": 388712768.0, + "logits/rejected": 516182272.0, + "logps/chosen": -248.1649932861328, + "logps/rejected": -472.3523254394531, + "loss": 0.0109, + "rewards/chosen": 4.202823638916016, + "rewards/margins": 12.524885177612305, + "rewards/rejected": -8.322061538696289, + "step": 2771 + }, + { + "epoch": 0.25326633165829143, + "grad_norm": 10.1875, + "kl": 0.0, + "learning_rate": 8.515397604638213e-06, + "logits/chosen": 514965344.0, + "logits/rejected": 1415967744.0, + "logps/chosen": -153.76101684570312, + "logps/rejected": -399.63580322265625, + "loss": 0.0563, + "rewards/chosen": 3.4507410526275635, + "rewards/margins": 9.075609922409058, + "rewards/rejected": -5.624868869781494, + "step": 2772 + }, + { + "epoch": 0.2533576975788031, + "grad_norm": 2.515625, + "kl": 0.0, + "learning_rate": 8.51437502368726e-06, + "logits/chosen": 310456277.3333333, + "logits/rejected": 544633753.6, + "logps/chosen": -204.23516845703125, + "logps/rejected": -486.3748046875, + "loss": 0.0162, + "rewards/chosen": 3.5656325022379556, + "rewards/margins": 10.37317034403483, + "rewards/rejected": -6.807537841796875, + "step": 2773 + }, + { + "epoch": 0.25344906349931473, + "grad_norm": 3.75, + "kl": 0.0, + "learning_rate": 8.513352152130054e-06, + "logits/chosen": 233749808.0, + "logits/rejected": 458166944.0, + "logps/chosen": -195.69351196289062, + "logps/rejected": -643.103515625, + "loss": 0.0217, + "rewards/chosen": 4.054286956787109, + "rewards/margins": 12.716903686523438, + "rewards/rejected": -8.662616729736328, + "step": 2774 + }, + { + "epoch": 0.2535404294198264, + "grad_norm": 1.3671875, + "kl": 0.0, + "learning_rate": 8.512328990051179e-06, + "logits/chosen": 765934272.0, + "logits/rejected": 416974304.0, + "logps/chosen": -265.85302734375, + "logps/rejected": -433.7312927246094, + "loss": 0.0089, + "rewards/chosen": 4.026760101318359, + "rewards/margins": 13.383747100830078, + "rewards/rejected": -9.356986999511719, + "step": 2775 + }, + { + "epoch": 0.25363179534033803, + "grad_norm": 12.375, + "kl": 0.0, + "learning_rate": 8.511305537535238e-06, + "logits/chosen": 699140864.0, + "logits/rejected": 945614720.0, + "logps/chosen": -377.85089111328125, + "logps/rejected": -584.057373046875, + "loss": 0.1142, + "rewards/chosen": 2.538869619369507, + "rewards/margins": 12.262973546981812, + "rewards/rejected": -9.724103927612305, + "step": 2776 + }, + { + "epoch": 0.2537231612608497, + "grad_norm": 1.7265625, + "kl": 0.0, + "learning_rate": 8.51028179466686e-06, + "logits/chosen": 529171680.0, + "logits/rejected": 600286080.0, + "logps/chosen": -334.216796875, + "logps/rejected": -472.8862609863281, + "loss": 0.0095, + "rewards/chosen": 4.173914909362793, + "rewards/margins": 12.557835578918457, + "rewards/rejected": -8.383920669555664, + "step": 2777 + }, + { + "epoch": 0.2538145271813613, + "grad_norm": 3.84375, + "kl": 0.0, + "learning_rate": 8.509257761530703e-06, + "logits/chosen": 655243605.3333334, + "logits/rejected": 420374816.0, + "logps/chosen": -371.2367757161458, + "logps/rejected": -399.04791259765625, + "loss": 0.0336, + "rewards/chosen": 3.2325652440389, + "rewards/margins": 12.225907643636068, + "rewards/rejected": -8.993342399597168, + "step": 2778 + }, + { + "epoch": 0.253905893101873, + "grad_norm": 2.8125, + "kl": 0.0, + "learning_rate": 8.508233438211441e-06, + "logits/chosen": 543582250.6666666, + "logits/rejected": 829972172.8, + "logps/chosen": -302.3026529947917, + "logps/rejected": -691.97568359375, + "loss": 0.0166, + "rewards/chosen": 3.713027000427246, + "rewards/margins": 14.233794212341309, + "rewards/rejected": -10.520767211914062, + "step": 2779 + }, + { + "epoch": 0.2539972590223846, + "grad_norm": 4.0625, + "kl": 0.0, + "learning_rate": 8.50720882479378e-06, + "logits/chosen": 801343829.3333334, + "logits/rejected": 543326566.4, + "logps/chosen": -415.6554361979167, + "logps/rejected": -453.8990234375, + "loss": 0.0128, + "rewards/chosen": 3.676031748453776, + "rewards/margins": 12.725970713297526, + "rewards/rejected": -9.04993896484375, + "step": 2780 + }, + { + "epoch": 0.2540886249428963, + "grad_norm": 1.34375, + "kl": 0.0, + "learning_rate": 8.506183921362443e-06, + "logits/chosen": 263765824.0, + "logits/rejected": 1156723968.0, + "logps/chosen": -360.15667724609375, + "logps/rejected": -586.1001790364584, + "loss": 0.0056, + "rewards/chosen": 3.8810105323791504, + "rewards/margins": 12.785808404286703, + "rewards/rejected": -8.904797871907553, + "step": 2781 + }, + { + "epoch": 0.2541799908634079, + "grad_norm": 5.375, + "kl": 0.0, + "learning_rate": 8.505158728002183e-06, + "logits/chosen": 369564825.6, + "logits/rejected": 339670784.0, + "logps/chosen": -263.01865234375, + "logps/rejected": -437.5574137369792, + "loss": 0.035, + "rewards/chosen": 3.5267002105712892, + "rewards/margins": 11.979896418253581, + "rewards/rejected": -8.453196207682291, + "step": 2782 + }, + { + "epoch": 0.2542713567839196, + "grad_norm": 2.671875, + "kl": 0.0, + "learning_rate": 8.50413324479777e-06, + "logits/chosen": 567694848.0, + "logits/rejected": 645481408.0, + "logps/chosen": -181.74827575683594, + "logps/rejected": -355.42578125, + "loss": 0.1194, + "rewards/chosen": 3.630323886871338, + "rewards/margins": 9.469202518463135, + "rewards/rejected": -5.838878631591797, + "step": 2783 + }, + { + "epoch": 0.2543627227044312, + "grad_norm": 14.1875, + "kl": 0.0, + "learning_rate": 8.503107471834007e-06, + "logits/chosen": 398948256.0, + "logits/rejected": 677494144.0, + "logps/chosen": -208.23727416992188, + "logps/rejected": -432.2425842285156, + "loss": 0.1051, + "rewards/chosen": 3.452144145965576, + "rewards/margins": 10.965973377227783, + "rewards/rejected": -7.513829231262207, + "step": 2784 + }, + { + "epoch": 0.2544540886249429, + "grad_norm": 11.375, + "kl": 0.0, + "learning_rate": 8.50208140919571e-06, + "logits/chosen": 492983398.4, + "logits/rejected": 602594688.0, + "logps/chosen": -360.0627197265625, + "logps/rejected": -780.5271809895834, + "loss": 0.0472, + "rewards/chosen": 2.9195058822631834, + "rewards/margins": 14.11412410736084, + "rewards/rejected": -11.194618225097656, + "step": 2785 + }, + { + "epoch": 0.2545454545454545, + "grad_norm": 10.0, + "kl": 0.0, + "learning_rate": 8.50105505696773e-06, + "logits/chosen": 1560761600.0, + "logits/rejected": 794130858.6666666, + "logps/chosen": -378.3595275878906, + "logps/rejected": -428.4232177734375, + "loss": 0.0988, + "rewards/chosen": 3.99489164352417, + "rewards/margins": 10.83248980840047, + "rewards/rejected": -6.837598164876302, + "step": 2786 + }, + { + "epoch": 0.2546368204659662, + "grad_norm": 2.625, + "kl": 0.0, + "learning_rate": 8.500028415234936e-06, + "logits/chosen": 839453312.0, + "logits/rejected": 522204928.0, + "logps/chosen": -421.2682189941406, + "logps/rejected": -468.5252162388393, + "loss": 0.0075, + "rewards/chosen": 2.841024875640869, + "rewards/margins": 11.125457968030657, + "rewards/rejected": -8.284433092389788, + "step": 2787 + }, + { + "epoch": 0.2547281863864778, + "grad_norm": 7.03125, + "kl": 0.2957725524902344, + "learning_rate": 8.499001484082218e-06, + "logits/chosen": 604345301.3333334, + "logits/rejected": 775713920.0, + "logps/chosen": -365.4840494791667, + "logps/rejected": -261.36553955078125, + "loss": 0.0412, + "rewards/chosen": 3.2624438603719077, + "rewards/margins": 9.066799481709799, + "rewards/rejected": -5.804355621337891, + "step": 2788 + }, + { + "epoch": 0.2548195523069895, + "grad_norm": 0.55078125, + "kl": 0.0, + "learning_rate": 8.497974263594498e-06, + "logits/chosen": 232419136.0, + "logits/rejected": 292633642.6666667, + "logps/chosen": -298.81793212890625, + "logps/rejected": -392.3287760416667, + "loss": 0.0025, + "rewards/chosen": 4.7682952880859375, + "rewards/margins": 13.44144058227539, + "rewards/rejected": -8.673145294189453, + "step": 2789 + }, + { + "epoch": 0.2549109182275011, + "grad_norm": 6.59375, + "kl": 0.0, + "learning_rate": 8.496946753856713e-06, + "logits/chosen": 1253533440.0, + "logits/rejected": 381668288.0, + "logps/chosen": -341.45709228515625, + "logps/rejected": -307.1790466308594, + "loss": 0.0309, + "rewards/chosen": 3.5599846839904785, + "rewards/margins": 12.476427555084229, + "rewards/rejected": -8.91644287109375, + "step": 2790 + }, + { + "epoch": 0.2550022841480128, + "grad_norm": 2.375, + "kl": 0.0, + "learning_rate": 8.495918954953834e-06, + "logits/chosen": 484440384.0, + "logits/rejected": 388822592.0, + "logps/chosen": -385.133544921875, + "logps/rejected": -464.2791748046875, + "loss": 0.0103, + "rewards/chosen": 4.5880126953125, + "rewards/margins": 11.351046562194824, + "rewards/rejected": -6.763033866882324, + "step": 2791 + }, + { + "epoch": 0.2550936500685244, + "grad_norm": 1.2265625, + "kl": 0.0, + "learning_rate": 8.494890866970846e-06, + "logits/chosen": 694848320.0, + "logits/rejected": 456217472.0, + "logps/chosen": -313.4561767578125, + "logps/rejected": -384.8164469401042, + "loss": 0.0061, + "rewards/chosen": 3.931252956390381, + "rewards/margins": 11.702848593393963, + "rewards/rejected": -7.771595637003581, + "step": 2792 + }, + { + "epoch": 0.2551850159890361, + "grad_norm": 1.8125, + "kl": 0.0, + "learning_rate": 8.493862489992766e-06, + "logits/chosen": 300194368.0, + "logits/rejected": 441549909.3333333, + "logps/chosen": -292.4241943359375, + "logps/rejected": -488.5893961588542, + "loss": 0.0097, + "rewards/chosen": 3.5697927474975586, + "rewards/margins": 11.765988985697428, + "rewards/rejected": -8.19619623819987, + "step": 2793 + }, + { + "epoch": 0.2552763819095477, + "grad_norm": 2.90625, + "kl": 0.0, + "learning_rate": 8.492833824104629e-06, + "logits/chosen": 1101270912.0, + "logits/rejected": 691488768.0, + "logps/chosen": -192.4490966796875, + "logps/rejected": -611.0841064453125, + "loss": 0.0257, + "rewards/chosen": 3.514793872833252, + "rewards/margins": 11.914380550384521, + "rewards/rejected": -8.39958667755127, + "step": 2794 + }, + { + "epoch": 0.2553677478300594, + "grad_norm": 35.0, + "kl": 0.0, + "learning_rate": 8.491804869391496e-06, + "logits/chosen": 433200032.0, + "logits/rejected": 479106730.6666667, + "logps/chosen": -208.567138671875, + "logps/rejected": -418.441162109375, + "loss": 0.1063, + "rewards/chosen": 0.9635848999023438, + "rewards/margins": 9.255820592244467, + "rewards/rejected": -8.292235692342123, + "step": 2795 + }, + { + "epoch": 0.255459113750571, + "grad_norm": 3.296875, + "kl": 0.0, + "learning_rate": 8.490775625938452e-06, + "logits/chosen": 517153728.0, + "logits/rejected": 535647488.0, + "logps/chosen": -323.1228332519531, + "logps/rejected": -588.621337890625, + "loss": 0.017, + "rewards/chosen": 3.827461004257202, + "rewards/margins": 13.31775164604187, + "rewards/rejected": -9.490290641784668, + "step": 2796 + }, + { + "epoch": 0.2555504796710827, + "grad_norm": 3.09375, + "kl": 0.0, + "learning_rate": 8.489746093830609e-06, + "logits/chosen": 440197034.6666667, + "logits/rejected": 497840025.6, + "logps/chosen": -259.7042236328125, + "logps/rejected": -514.94033203125, + "loss": 0.0193, + "rewards/chosen": 2.9731887181599936, + "rewards/margins": 12.27242120107015, + "rewards/rejected": -9.299232482910156, + "step": 2797 + }, + { + "epoch": 0.2556418455915943, + "grad_norm": 5.90625, + "kl": 0.0, + "learning_rate": 8.488716273153095e-06, + "logits/chosen": 458945877.3333333, + "logits/rejected": 656718438.4, + "logps/chosen": -238.6492716471354, + "logps/rejected": -408.047802734375, + "loss": 0.1253, + "rewards/chosen": 3.562323888142904, + "rewards/margins": 9.849353535970053, + "rewards/rejected": -6.287029647827149, + "step": 2798 + }, + { + "epoch": 0.255733211512106, + "grad_norm": 5.625, + "kl": 1.032562255859375, + "learning_rate": 8.48768616399107e-06, + "logits/chosen": 481898934.85714287, + "logits/rejected": 353259136.0, + "logps/chosen": -393.02144949776783, + "logps/rejected": -203.74319458007812, + "loss": 0.0385, + "rewards/chosen": 3.7031661442347934, + "rewards/margins": 8.735167162758962, + "rewards/rejected": -5.03200101852417, + "step": 2799 + }, + { + "epoch": 0.25582457743261766, + "grad_norm": 2.34375, + "kl": 0.0, + "learning_rate": 8.486655766429712e-06, + "logits/chosen": 632883456.0, + "logits/rejected": 696879744.0, + "logps/chosen": -397.0897216796875, + "logps/rejected": -387.23504638671875, + "loss": 0.0133, + "rewards/chosen": 3.822460889816284, + "rewards/margins": 12.651598691940308, + "rewards/rejected": -8.829137802124023, + "step": 2800 + }, + { + "epoch": 0.2559159433531293, + "grad_norm": 3.171875, + "kl": 0.0, + "learning_rate": 8.485625080554228e-06, + "logits/chosen": 509632460.8, + "logits/rejected": 311777685.3333333, + "logps/chosen": -266.0016357421875, + "logps/rejected": -467.1848958333333, + "loss": 0.0189, + "rewards/chosen": 3.993000793457031, + "rewards/margins": 14.550368118286134, + "rewards/rejected": -10.557367324829102, + "step": 2801 + }, + { + "epoch": 0.25600730927364096, + "grad_norm": 1.4921875, + "kl": 0.0, + "learning_rate": 8.484594106449844e-06, + "logits/chosen": 834807424.0, + "logits/rejected": 423033728.0, + "logps/chosen": -436.95013427734375, + "logps/rejected": -322.8454895019531, + "loss": 0.0094, + "rewards/chosen": 4.379038333892822, + "rewards/margins": 11.775768756866455, + "rewards/rejected": -7.396730422973633, + "step": 2802 + }, + { + "epoch": 0.2560986751941526, + "grad_norm": 26.875, + "kl": 0.0, + "learning_rate": 8.483562844201813e-06, + "logits/chosen": 534967917.71428573, + "logits/rejected": 507030880.0, + "logps/chosen": -280.7898646763393, + "logps/rejected": -419.9937438964844, + "loss": 0.0867, + "rewards/chosen": 2.745644978114537, + "rewards/margins": 12.4338926587786, + "rewards/rejected": -9.688247680664062, + "step": 2803 + }, + { + "epoch": 0.25619004111466426, + "grad_norm": 4.96875, + "kl": 0.0, + "learning_rate": 8.482531293895412e-06, + "logits/chosen": 635297344.0, + "logits/rejected": 623544192.0, + "logps/chosen": -298.66717529296875, + "logps/rejected": -413.18994140625, + "loss": 0.0175, + "rewards/chosen": 4.001669406890869, + "rewards/margins": 12.628444194793701, + "rewards/rejected": -8.626774787902832, + "step": 2804 + }, + { + "epoch": 0.2562814070351759, + "grad_norm": 12.0, + "kl": 0.0, + "learning_rate": 8.481499455615938e-06, + "logits/chosen": 378498816.0, + "logits/rejected": 617418854.4, + "logps/chosen": -305.42319742838544, + "logps/rejected": -338.8903564453125, + "loss": 0.027, + "rewards/chosen": 4.371830304463704, + "rewards/margins": 11.195656140645344, + "rewards/rejected": -6.823825836181641, + "step": 2805 + }, + { + "epoch": 0.25637277295568756, + "grad_norm": 2.390625, + "kl": 0.0, + "learning_rate": 8.480467329448716e-06, + "logits/chosen": 246066892.8, + "logits/rejected": 405082197.3333333, + "logps/chosen": -261.4996826171875, + "logps/rejected": -488.972900390625, + "loss": 0.0176, + "rewards/chosen": 4.176129150390625, + "rewards/margins": 12.082889048258464, + "rewards/rejected": -7.906759897867839, + "step": 2806 + }, + { + "epoch": 0.2564641388761992, + "grad_norm": 6.78125, + "kl": 0.0, + "learning_rate": 8.479434915479093e-06, + "logits/chosen": 600583296.0, + "logits/rejected": 574543744.0, + "logps/chosen": -314.4345397949219, + "logps/rejected": -449.3832702636719, + "loss": 0.0382, + "rewards/chosen": 2.9033241271972656, + "rewards/margins": 9.584190845489502, + "rewards/rejected": -6.680866718292236, + "step": 2807 + }, + { + "epoch": 0.25655550479671085, + "grad_norm": 4.28125, + "kl": 0.0, + "learning_rate": 8.478402213792441e-06, + "logits/chosen": 233898048.0, + "logits/rejected": 195304640.0, + "logps/chosen": -195.73519897460938, + "logps/rejected": -240.76986694335938, + "loss": 0.0283, + "rewards/chosen": 3.401932954788208, + "rewards/margins": 10.521793603897095, + "rewards/rejected": -7.119860649108887, + "step": 2808 + }, + { + "epoch": 0.2566468707172225, + "grad_norm": 2.046875, + "kl": 0.0, + "learning_rate": 8.477369224474153e-06, + "logits/chosen": 406496288.0, + "logits/rejected": 705959104.0, + "logps/chosen": -256.6944580078125, + "logps/rejected": -548.4384155273438, + "loss": 0.013, + "rewards/chosen": 3.8832921981811523, + "rewards/margins": 12.789084434509277, + "rewards/rejected": -8.905792236328125, + "step": 2809 + }, + { + "epoch": 0.25673823663773415, + "grad_norm": 1.3046875, + "kl": 0.0, + "learning_rate": 8.476335947609649e-06, + "logits/chosen": 769885504.0, + "logits/rejected": 354101248.0, + "logps/chosen": -605.275634765625, + "logps/rejected": -411.0281575520833, + "loss": 0.0062, + "rewards/chosen": 3.720843553543091, + "rewards/margins": 14.388513485590616, + "rewards/rejected": -10.667669932047525, + "step": 2810 + }, + { + "epoch": 0.2568296025582458, + "grad_norm": 2.5625, + "kl": 0.0, + "learning_rate": 8.475302383284373e-06, + "logits/chosen": 1135272960.0, + "logits/rejected": 558645632.0, + "logps/chosen": -227.01194763183594, + "logps/rejected": -378.4989013671875, + "loss": 0.0318, + "rewards/chosen": 3.646019697189331, + "rewards/margins": 9.635762929916382, + "rewards/rejected": -5.989743232727051, + "step": 2811 + }, + { + "epoch": 0.25692096847875745, + "grad_norm": 26.25, + "kl": 0.0, + "learning_rate": 8.474268531583787e-06, + "logits/chosen": 392202112.0, + "logits/rejected": 267063728.0, + "logps/chosen": -154.95980834960938, + "logps/rejected": -289.009521484375, + "loss": 0.0747, + "rewards/chosen": 2.7659590244293213, + "rewards/margins": 9.566800355911255, + "rewards/rejected": -6.800841331481934, + "step": 2812 + }, + { + "epoch": 0.25701233439926907, + "grad_norm": 12.1875, + "kl": 0.0, + "learning_rate": 8.473234392593383e-06, + "logits/chosen": 150831637.33333334, + "logits/rejected": 436926873.6, + "logps/chosen": -352.6474202473958, + "logps/rejected": -511.024609375, + "loss": 0.0952, + "rewards/chosen": 3.1042842864990234, + "rewards/margins": 11.428420639038086, + "rewards/rejected": -8.324136352539062, + "step": 2813 + }, + { + "epoch": 0.25710370031978075, + "grad_norm": 3.125, + "kl": 0.0, + "learning_rate": 8.472199966398677e-06, + "logits/chosen": 647585536.0, + "logits/rejected": 642546304.0, + "logps/chosen": -337.6070251464844, + "logps/rejected": -432.3981018066406, + "loss": 0.0269, + "rewards/chosen": 3.3250107765197754, + "rewards/margins": 13.410499095916748, + "rewards/rejected": -10.085488319396973, + "step": 2814 + }, + { + "epoch": 0.25719506624029237, + "grad_norm": 8.625, + "kl": 0.0, + "learning_rate": 8.471165253085203e-06, + "logits/chosen": 704320128.0, + "logits/rejected": 512967466.6666667, + "logps/chosen": -246.908447265625, + "logps/rejected": -711.8307291666666, + "loss": 0.0262, + "rewards/chosen": 2.3536086082458496, + "rewards/margins": 15.842151800791422, + "rewards/rejected": -13.488543192545572, + "step": 2815 + }, + { + "epoch": 0.25728643216080405, + "grad_norm": 5.0625, + "kl": 0.0, + "learning_rate": 8.470130252738525e-06, + "logits/chosen": 876940458.6666666, + "logits/rejected": 535565414.4, + "logps/chosen": -456.5139973958333, + "logps/rejected": -578.68837890625, + "loss": 0.0196, + "rewards/chosen": 3.753591537475586, + "rewards/margins": 13.74012107849121, + "rewards/rejected": -9.986529541015624, + "step": 2816 + }, + { + "epoch": 0.25737779808131567, + "grad_norm": 1.8671875, + "kl": 0.0, + "learning_rate": 8.469094965444225e-06, + "logits/chosen": 354900448.0, + "logits/rejected": 578559040.0, + "logps/chosen": -243.84744262695312, + "logps/rejected": -496.375, + "loss": 0.0164, + "rewards/chosen": 3.8514063358306885, + "rewards/margins": 11.701470136642456, + "rewards/rejected": -7.850063800811768, + "step": 2817 + }, + { + "epoch": 0.25746916400182734, + "grad_norm": 3.609375, + "kl": 0.0, + "learning_rate": 8.468059391287912e-06, + "logits/chosen": 530603520.0, + "logits/rejected": 731373226.6666666, + "logps/chosen": -319.958642578125, + "logps/rejected": -569.9471028645834, + "loss": 0.0219, + "rewards/chosen": 3.7476272583007812, + "rewards/margins": 13.594230651855469, + "rewards/rejected": -9.846603393554688, + "step": 2818 + }, + { + "epoch": 0.25756052992233897, + "grad_norm": 5.96875, + "kl": 0.0, + "learning_rate": 8.467023530355221e-06, + "logits/chosen": 576645836.8, + "logits/rejected": 577923413.3333334, + "logps/chosen": -274.4232177734375, + "logps/rejected": -443.5780843098958, + "loss": 0.0526, + "rewards/chosen": 2.7294116973876954, + "rewards/margins": 9.884424463907878, + "rewards/rejected": -7.155012766520183, + "step": 2819 + }, + { + "epoch": 0.25765189584285064, + "grad_norm": 3.359375, + "kl": 0.0, + "learning_rate": 8.465987382731808e-06, + "logits/chosen": 689260224.0, + "logits/rejected": 1196703488.0, + "logps/chosen": -335.47857666015625, + "logps/rejected": -626.5149536132812, + "loss": 0.028, + "rewards/chosen": 3.1854100227355957, + "rewards/margins": 14.284437656402588, + "rewards/rejected": -11.099027633666992, + "step": 2820 + }, + { + "epoch": 0.25774326176336226, + "grad_norm": 4.40625, + "kl": 0.0, + "learning_rate": 8.46495094850335e-06, + "logits/chosen": 486367104.0, + "logits/rejected": 398549440.0, + "logps/chosen": -334.725830078125, + "logps/rejected": -343.14892578125, + "loss": 0.1135, + "rewards/chosen": 3.715172529220581, + "rewards/margins": 9.098813772201538, + "rewards/rejected": -5.383641242980957, + "step": 2821 + }, + { + "epoch": 0.25783462768387394, + "grad_norm": 10.8125, + "kl": 0.0, + "learning_rate": 8.463914227755552e-06, + "logits/chosen": 505707392.0, + "logits/rejected": 437624012.8, + "logps/chosen": -475.9371337890625, + "logps/rejected": -500.776171875, + "loss": 0.0386, + "rewards/chosen": 2.450918515523275, + "rewards/margins": 10.112947018941243, + "rewards/rejected": -7.662028503417969, + "step": 2822 + }, + { + "epoch": 0.25792599360438556, + "grad_norm": 6.9375, + "kl": 0.0, + "learning_rate": 8.46287722057414e-06, + "logits/chosen": 321054176.0, + "logits/rejected": 481815744.0, + "logps/chosen": -193.76153564453125, + "logps/rejected": -418.03558349609375, + "loss": 0.1202, + "rewards/chosen": 2.445559024810791, + "rewards/margins": 9.824357032775879, + "rewards/rejected": -7.378798007965088, + "step": 2823 + }, + { + "epoch": 0.25801735952489724, + "grad_norm": 19.125, + "kl": 0.0, + "learning_rate": 8.46183992704487e-06, + "logits/chosen": 494720704.0, + "logits/rejected": 643804416.0, + "logps/chosen": -229.00424194335938, + "logps/rejected": -589.72509765625, + "loss": 0.0673, + "rewards/chosen": 3.788787364959717, + "rewards/margins": 11.449248313903809, + "rewards/rejected": -7.660460948944092, + "step": 2824 + }, + { + "epoch": 0.25810872544540886, + "grad_norm": 2.25, + "kl": 0.0, + "learning_rate": 8.460802347253512e-06, + "logits/chosen": 280051040.0, + "logits/rejected": 341740224.0, + "logps/chosen": -275.2384033203125, + "logps/rejected": -447.1422119140625, + "loss": 0.0096, + "rewards/chosen": 5.305123805999756, + "rewards/margins": 14.732263088226318, + "rewards/rejected": -9.427139282226562, + "step": 2825 + }, + { + "epoch": 0.25820009136592054, + "grad_norm": 35.0, + "kl": 0.0, + "learning_rate": 8.459764481285864e-06, + "logits/chosen": 353537813.3333333, + "logits/rejected": 463608640.0, + "logps/chosen": -235.229248046875, + "logps/rejected": -827.974609375, + "loss": 0.0518, + "rewards/chosen": 3.178914705912272, + "rewards/margins": 17.389241854349773, + "rewards/rejected": -14.2103271484375, + "step": 2826 + }, + { + "epoch": 0.25829145728643216, + "grad_norm": 35.25, + "kl": 0.0, + "learning_rate": 8.458726329227748e-06, + "logits/chosen": 425554688.0, + "logits/rejected": 327713510.4, + "logps/chosen": -222.14776611328125, + "logps/rejected": -281.6508544921875, + "loss": 0.1519, + "rewards/chosen": 3.264629364013672, + "rewards/margins": 8.66296615600586, + "rewards/rejected": -5.398336791992188, + "step": 2827 + }, + { + "epoch": 0.25838282320694383, + "grad_norm": 3.25, + "kl": 0.0, + "learning_rate": 8.45768789116501e-06, + "logits/chosen": 622524672.0, + "logits/rejected": 690833344.0, + "logps/chosen": -328.96002197265625, + "logps/rejected": -311.48443603515625, + "loss": 0.0185, + "rewards/chosen": 4.376860618591309, + "rewards/margins": 12.229784965515137, + "rewards/rejected": -7.852924346923828, + "step": 2828 + }, + { + "epoch": 0.25847418912745546, + "grad_norm": 5.46875, + "kl": 0.0, + "learning_rate": 8.456649167183521e-06, + "logits/chosen": 423993548.8, + "logits/rejected": 1085585408.0, + "logps/chosen": -166.80787353515626, + "logps/rejected": -558.9878336588541, + "loss": 0.1468, + "rewards/chosen": 1.852334976196289, + "rewards/margins": 10.151203791300455, + "rewards/rejected": -8.298868815104166, + "step": 2829 + }, + { + "epoch": 0.25856555504796713, + "grad_norm": 5.71875, + "kl": 0.0, + "learning_rate": 8.455610157369172e-06, + "logits/chosen": 834807881.1428572, + "logits/rejected": 424731712.0, + "logps/chosen": -438.3851841517857, + "logps/rejected": -392.66912841796875, + "loss": 0.0361, + "rewards/chosen": 3.336460658482143, + "rewards/margins": 9.508894034794398, + "rewards/rejected": -6.172433376312256, + "step": 2830 + }, + { + "epoch": 0.25865692096847875, + "grad_norm": 0.484375, + "kl": 0.0, + "learning_rate": 8.454570861807878e-06, + "logits/chosen": 571217024.0, + "logits/rejected": 292619520.0, + "logps/chosen": -414.60791015625, + "logps/rejected": -364.8379150390625, + "loss": 0.0023, + "rewards/chosen": 5.408458709716797, + "rewards/margins": 13.320659637451172, + "rewards/rejected": -7.912200927734375, + "step": 2831 + }, + { + "epoch": 0.25874828688899043, + "grad_norm": 2.203125, + "kl": 0.0, + "learning_rate": 8.453531280585584e-06, + "logits/chosen": 362066944.0, + "logits/rejected": 509500202.6666667, + "logps/chosen": -185.62252807617188, + "logps/rejected": -500.789794921875, + "loss": 0.0101, + "rewards/chosen": 3.541347026824951, + "rewards/margins": 14.790530681610107, + "rewards/rejected": -11.249183654785156, + "step": 2832 + }, + { + "epoch": 0.25883965280950205, + "grad_norm": 3.5, + "kl": 0.0, + "learning_rate": 8.452491413788249e-06, + "logits/chosen": 695645568.0, + "logits/rejected": 397540864.0, + "logps/chosen": -387.978759765625, + "logps/rejected": -451.2773844401042, + "loss": 0.0191, + "rewards/chosen": 3.187727451324463, + "rewards/margins": 11.264766534169516, + "rewards/rejected": -8.077039082845053, + "step": 2833 + }, + { + "epoch": 0.25893101873001373, + "grad_norm": 24.0, + "kl": 0.0, + "learning_rate": 8.451451261501862e-06, + "logits/chosen": 544989632.0, + "logits/rejected": 388044864.0, + "logps/chosen": -403.69580078125, + "logps/rejected": -686.93701171875, + "loss": 0.0546, + "rewards/chosen": 3.555616855621338, + "rewards/margins": 11.714758396148682, + "rewards/rejected": -8.159141540527344, + "step": 2834 + }, + { + "epoch": 0.25902238465052535, + "grad_norm": 23.125, + "kl": 0.0, + "learning_rate": 8.450410823812435e-06, + "logits/chosen": 1045543082.6666666, + "logits/rejected": 417228236.8, + "logps/chosen": -327.4832356770833, + "logps/rejected": -401.892529296875, + "loss": 0.0495, + "rewards/chosen": 2.394932428995768, + "rewards/margins": 11.68194719950358, + "rewards/rejected": -9.287014770507813, + "step": 2835 + }, + { + "epoch": 0.259113750571037, + "grad_norm": 3.90625, + "kl": 0.0, + "learning_rate": 8.449370100806003e-06, + "logits/chosen": 747890432.0, + "logits/rejected": 691773824.0, + "logps/chosen": -399.1266174316406, + "logps/rejected": -448.48138427734375, + "loss": 0.02, + "rewards/chosen": 3.3169145584106445, + "rewards/margins": 10.863400936126709, + "rewards/rejected": -7.5464863777160645, + "step": 2836 + }, + { + "epoch": 0.25920511649154865, + "grad_norm": 4.40625, + "kl": 0.0, + "learning_rate": 8.44832909256862e-06, + "logits/chosen": 500244010.6666667, + "logits/rejected": 431019827.2, + "logps/chosen": -357.1893717447917, + "logps/rejected": -458.8447265625, + "loss": 0.0243, + "rewards/chosen": 2.927160898844401, + "rewards/margins": 10.996624247233072, + "rewards/rejected": -8.069463348388672, + "step": 2837 + }, + { + "epoch": 0.2592964824120603, + "grad_norm": 3.59375, + "kl": 0.0, + "learning_rate": 8.447287799186373e-06, + "logits/chosen": 657139285.3333334, + "logits/rejected": 632857036.8, + "logps/chosen": -393.4408365885417, + "logps/rejected": -521.660693359375, + "loss": 0.0226, + "rewards/chosen": 2.97103214263916, + "rewards/margins": 11.074444007873534, + "rewards/rejected": -8.103411865234374, + "step": 2838 + }, + { + "epoch": 0.25938784833257195, + "grad_norm": 2.296875, + "kl": 0.0, + "learning_rate": 8.446246220745363e-06, + "logits/chosen": 535417526.85714287, + "logits/rejected": 676042944.0, + "logps/chosen": -382.35477120535717, + "logps/rejected": -211.69647216796875, + "loss": 0.0153, + "rewards/chosen": 4.334753308977399, + "rewards/margins": 14.671178136553083, + "rewards/rejected": -10.336424827575684, + "step": 2839 + }, + { + "epoch": 0.2594792142530836, + "grad_norm": 21.625, + "kl": 0.0, + "learning_rate": 8.44520435733172e-06, + "logits/chosen": 658453674.6666666, + "logits/rejected": 670304716.8, + "logps/chosen": -231.9857381184896, + "logps/rejected": -509.36650390625, + "loss": 0.0425, + "rewards/chosen": 3.096289316813151, + "rewards/margins": 13.017627970377603, + "rewards/rejected": -9.921338653564453, + "step": 2840 + }, + { + "epoch": 0.25957058017359524, + "grad_norm": 3.4375, + "kl": 0.0, + "learning_rate": 8.4441622090316e-06, + "logits/chosen": 507148083.2, + "logits/rejected": 572298922.6666666, + "logps/chosen": -336.230908203125, + "logps/rejected": -852.7205403645834, + "loss": 0.0243, + "rewards/chosen": 3.486587905883789, + "rewards/margins": 13.322834396362305, + "rewards/rejected": -9.836246490478516, + "step": 2841 + }, + { + "epoch": 0.2596619460941069, + "grad_norm": 3.90625, + "kl": 0.0, + "learning_rate": 8.443119775931175e-06, + "logits/chosen": 428209632.0, + "logits/rejected": 536995200.0, + "logps/chosen": -233.63536071777344, + "logps/rejected": -477.86285400390625, + "loss": 0.0398, + "rewards/chosen": 2.915259599685669, + "rewards/margins": 10.765263795852661, + "rewards/rejected": -7.850004196166992, + "step": 2842 + }, + { + "epoch": 0.25975331201461854, + "grad_norm": 2.953125, + "kl": 0.0, + "learning_rate": 8.442077058116644e-06, + "logits/chosen": 956116608.0, + "logits/rejected": 379819776.0, + "logps/chosen": -308.5424499511719, + "logps/rejected": -482.79241943359375, + "loss": 0.0144, + "rewards/chosen": 4.025485038757324, + "rewards/margins": 13.122052192687988, + "rewards/rejected": -9.096567153930664, + "step": 2843 + }, + { + "epoch": 0.2598446779351302, + "grad_norm": 9.5, + "kl": 0.0, + "learning_rate": 8.441034055674235e-06, + "logits/chosen": 550782464.0, + "logits/rejected": 513559808.0, + "logps/chosen": -449.72626953125, + "logps/rejected": -545.83056640625, + "loss": 0.0361, + "rewards/chosen": 3.748869705200195, + "rewards/margins": 10.829756546020509, + "rewards/rejected": -7.0808868408203125, + "step": 2844 + }, + { + "epoch": 0.25993604385564184, + "grad_norm": 3.9375, + "kl": 0.0, + "learning_rate": 8.439990768690189e-06, + "logits/chosen": 320807232.0, + "logits/rejected": 227129770.66666666, + "logps/chosen": -291.4193115234375, + "logps/rejected": -382.43359375, + "loss": 0.0234, + "rewards/chosen": 2.354295253753662, + "rewards/margins": 11.776661078135172, + "rewards/rejected": -9.42236582438151, + "step": 2845 + }, + { + "epoch": 0.2600274097761535, + "grad_norm": 0.99609375, + "kl": 0.0, + "learning_rate": 8.43894719725078e-06, + "logits/chosen": 342015552.0, + "logits/rejected": 931927424.0, + "logps/chosen": -280.6255798339844, + "logps/rejected": -626.22412109375, + "loss": 0.005, + "rewards/chosen": 4.855562210083008, + "rewards/margins": 14.48023509979248, + "rewards/rejected": -9.624672889709473, + "step": 2846 + }, + { + "epoch": 0.26011877569666514, + "grad_norm": 2.203125, + "kl": 0.0, + "learning_rate": 8.437903341442299e-06, + "logits/chosen": 467655680.0, + "logits/rejected": 368392601.6, + "logps/chosen": -385.1296793619792, + "logps/rejected": -489.33828125, + "loss": 0.01, + "rewards/chosen": 4.011071840922038, + "rewards/margins": 12.415042940775553, + "rewards/rejected": -8.403971099853516, + "step": 2847 + }, + { + "epoch": 0.2602101416171768, + "grad_norm": 4.15625, + "kl": 0.0, + "learning_rate": 8.436859201351065e-06, + "logits/chosen": 599232960.0, + "logits/rejected": 799716352.0, + "logps/chosen": -329.6871643066406, + "logps/rejected": -710.1634521484375, + "loss": 0.0165, + "rewards/chosen": 3.881092071533203, + "rewards/margins": 15.705358505249023, + "rewards/rejected": -11.82426643371582, + "step": 2848 + }, + { + "epoch": 0.26030150753768844, + "grad_norm": 6.46875, + "kl": 0.0, + "learning_rate": 8.435814777063418e-06, + "logits/chosen": 367668275.2, + "logits/rejected": 430973866.6666667, + "logps/chosen": -194.98463134765626, + "logps/rejected": -401.4098307291667, + "loss": 0.0459, + "rewards/chosen": 3.2409934997558594, + "rewards/margins": 11.237630844116211, + "rewards/rejected": -7.996637344360352, + "step": 2849 + }, + { + "epoch": 0.2603928734582001, + "grad_norm": 3.8125, + "kl": 0.0, + "learning_rate": 8.434770068665723e-06, + "logits/chosen": 607530752.0, + "logits/rejected": 342763136.0, + "logps/chosen": -562.032470703125, + "logps/rejected": -272.560400390625, + "loss": 0.0201, + "rewards/chosen": 2.8993631998697915, + "rewards/margins": 11.259207407633463, + "rewards/rejected": -8.359844207763672, + "step": 2850 + }, + { + "epoch": 0.26048423937871173, + "grad_norm": 4.40625, + "kl": 0.0, + "learning_rate": 8.433725076244368e-06, + "logits/chosen": 437095497.14285713, + "logits/rejected": 282414464.0, + "logps/chosen": -264.20408412388394, + "logps/rejected": -464.2523498535156, + "loss": 0.0298, + "rewards/chosen": 3.800385066441127, + "rewards/margins": 12.875602313450404, + "rewards/rejected": -9.075217247009277, + "step": 2851 + }, + { + "epoch": 0.2605756052992234, + "grad_norm": 5.3125, + "kl": 0.0, + "learning_rate": 8.43267979988576e-06, + "logits/chosen": 774279808.0, + "logits/rejected": 540142805.3333334, + "logps/chosen": -589.3526611328125, + "logps/rejected": -366.6201578776042, + "loss": 0.0256, + "rewards/chosen": 2.203662157058716, + "rewards/margins": 10.71708607673645, + "rewards/rejected": -8.513423919677734, + "step": 2852 + }, + { + "epoch": 0.26066697121973503, + "grad_norm": 1.84375, + "kl": 0.0, + "learning_rate": 8.431634239676342e-06, + "logits/chosen": 609739712.0, + "logits/rejected": 658203477.3333334, + "logps/chosen": -475.12506103515625, + "logps/rejected": -346.0384928385417, + "loss": 0.0072, + "rewards/chosen": 4.686007976531982, + "rewards/margins": 12.623299439748127, + "rewards/rejected": -7.9372914632161455, + "step": 2853 + }, + { + "epoch": 0.2607583371402467, + "grad_norm": 3.515625, + "kl": 0.0, + "learning_rate": 8.430588395702564e-06, + "logits/chosen": 428465792.0, + "logits/rejected": 296412096.0, + "logps/chosen": -350.30706787109375, + "logps/rejected": -409.8688659667969, + "loss": 0.0204, + "rewards/chosen": 3.722503185272217, + "rewards/margins": 13.028615474700928, + "rewards/rejected": -9.306112289428711, + "step": 2854 + }, + { + "epoch": 0.26084970306075833, + "grad_norm": 2.671875, + "kl": 0.0, + "learning_rate": 8.429542268050913e-06, + "logits/chosen": 366338918.4, + "logits/rejected": 374069376.0, + "logps/chosen": -278.436572265625, + "logps/rejected": -402.050048828125, + "loss": 0.017, + "rewards/chosen": 3.957659149169922, + "rewards/margins": 11.999203109741211, + "rewards/rejected": -8.041543960571289, + "step": 2855 + }, + { + "epoch": 0.26094106898127, + "grad_norm": 5.34375, + "kl": 0.0, + "learning_rate": 8.428495856807889e-06, + "logits/chosen": 352592025.6, + "logits/rejected": 554358997.3333334, + "logps/chosen": -213.2733154296875, + "logps/rejected": -524.513427734375, + "loss": 0.0463, + "rewards/chosen": 2.803090286254883, + "rewards/margins": 12.335272343953452, + "rewards/rejected": -9.532182057698568, + "step": 2856 + }, + { + "epoch": 0.26103243490178163, + "grad_norm": 1.3046875, + "kl": 0.0, + "learning_rate": 8.427449162060024e-06, + "logits/chosen": 492462560.0, + "logits/rejected": 348733504.0, + "logps/chosen": -310.956787109375, + "logps/rejected": -417.9844970703125, + "loss": 0.0078, + "rewards/chosen": 4.421355247497559, + "rewards/margins": 13.99048137664795, + "rewards/rejected": -9.56912612915039, + "step": 2857 + }, + { + "epoch": 0.2611238008222933, + "grad_norm": 0.99609375, + "kl": 0.0, + "learning_rate": 8.426402183893872e-06, + "logits/chosen": 455432832.0, + "logits/rejected": 583489365.3333334, + "logps/chosen": -362.75164794921875, + "logps/rejected": -524.5133463541666, + "loss": 0.0046, + "rewards/chosen": 4.169131278991699, + "rewards/margins": 12.639840761820475, + "rewards/rejected": -8.470709482828775, + "step": 2858 + }, + { + "epoch": 0.2612151667428049, + "grad_norm": 9.25, + "kl": 0.0, + "learning_rate": 8.425354922396003e-06, + "logits/chosen": 476152661.3333333, + "logits/rejected": 572412160.0, + "logps/chosen": -191.0150349934896, + "logps/rejected": -490.12109375, + "loss": 0.11, + "rewards/chosen": 2.8158063888549805, + "rewards/margins": 12.311803245544434, + "rewards/rejected": -9.495996856689453, + "step": 2859 + }, + { + "epoch": 0.2613065326633166, + "grad_norm": 19.875, + "kl": 0.0, + "learning_rate": 8.424307377653018e-06, + "logits/chosen": 278537856.0, + "logits/rejected": 446634922.6666667, + "logps/chosen": -235.20620727539062, + "logps/rejected": -508.4255777994792, + "loss": 0.0938, + "rewards/chosen": 1.775298833847046, + "rewards/margins": 12.799748500188192, + "rewards/rejected": -11.024449666341146, + "step": 2860 + }, + { + "epoch": 0.2613978985838282, + "grad_norm": 5.125, + "kl": 0.0, + "learning_rate": 8.423259549751541e-06, + "logits/chosen": 617241429.3333334, + "logits/rejected": 583603456.0, + "logps/chosen": -210.452392578125, + "logps/rejected": -775.3553466796875, + "loss": 0.0376, + "rewards/chosen": 3.5323381423950195, + "rewards/margins": 15.342828750610352, + "rewards/rejected": -11.810490608215332, + "step": 2861 + }, + { + "epoch": 0.2614892645043399, + "grad_norm": 2.4375, + "kl": 0.0, + "learning_rate": 8.422211438778217e-06, + "logits/chosen": 612551424.0, + "logits/rejected": 549213798.4, + "logps/chosen": -491.23681640625, + "logps/rejected": -466.31845703125, + "loss": 0.011, + "rewards/chosen": 3.8007771174112954, + "rewards/margins": 11.798151842753093, + "rewards/rejected": -7.997374725341797, + "step": 2862 + }, + { + "epoch": 0.2615806304248515, + "grad_norm": 0.1103515625, + "kl": 0.0, + "learning_rate": 8.421163044819714e-06, + "logits/chosen": 370847488.0, + "logits/rejected": 1042775917.7142857, + "logps/chosen": -150.79685974121094, + "logps/rejected": -478.83530970982144, + "loss": 0.0006, + "rewards/chosen": 5.626135349273682, + "rewards/margins": 15.71544613157, + "rewards/rejected": -10.089310782296318, + "step": 2863 + }, + { + "epoch": 0.2616719963453632, + "grad_norm": 4.15625, + "kl": 0.0, + "learning_rate": 8.420114367962724e-06, + "logits/chosen": 481725344.0, + "logits/rejected": 224124245.33333334, + "logps/chosen": -396.94464111328125, + "logps/rejected": -297.41457112630206, + "loss": 0.0109, + "rewards/chosen": 3.9490766525268555, + "rewards/margins": 11.044152895609539, + "rewards/rejected": -7.095076243082683, + "step": 2864 + }, + { + "epoch": 0.2617633622658748, + "grad_norm": 4.6875, + "kl": 0.0, + "learning_rate": 8.419065408293966e-06, + "logits/chosen": 490962585.6, + "logits/rejected": 718569386.6666666, + "logps/chosen": -301.8578369140625, + "logps/rejected": -666.569091796875, + "loss": 0.0303, + "rewards/chosen": 3.2145927429199217, + "rewards/margins": 12.769364166259766, + "rewards/rejected": -9.554771423339844, + "step": 2865 + }, + { + "epoch": 0.2618547281863865, + "grad_norm": 1.6796875, + "kl": 0.0, + "learning_rate": 8.418016165900176e-06, + "logits/chosen": 911064704.0, + "logits/rejected": 685374592.0, + "logps/chosen": -239.30224609375, + "logps/rejected": -367.68695068359375, + "loss": 0.0101, + "rewards/chosen": 4.148647785186768, + "rewards/margins": 13.85458517074585, + "rewards/rejected": -9.705937385559082, + "step": 2866 + }, + { + "epoch": 0.2619460941068981, + "grad_norm": 4.53125, + "kl": 0.0, + "learning_rate": 8.416966640868119e-06, + "logits/chosen": 788892672.0, + "logits/rejected": 393669632.0, + "logps/chosen": -387.8686767578125, + "logps/rejected": -539.7842203776041, + "loss": 0.0365, + "rewards/chosen": 3.2146327972412108, + "rewards/margins": 14.117163467407227, + "rewards/rejected": -10.902530670166016, + "step": 2867 + }, + { + "epoch": 0.2620374600274098, + "grad_norm": 15.6875, + "kl": 0.0, + "learning_rate": 8.415916833284577e-06, + "logits/chosen": 488745984.0, + "logits/rejected": 379087104.0, + "logps/chosen": -206.1094970703125, + "logps/rejected": -401.7134033203125, + "loss": 0.0394, + "rewards/chosen": 3.748182932535807, + "rewards/margins": 9.80460001627604, + "rewards/rejected": -6.056417083740234, + "step": 2868 + }, + { + "epoch": 0.2621288259479214, + "grad_norm": 7.15625, + "kl": 0.0, + "learning_rate": 8.414866743236363e-06, + "logits/chosen": 359184793.6, + "logits/rejected": 428493141.3333333, + "logps/chosen": -319.29208984375, + "logps/rejected": -497.7480875651042, + "loss": 0.029, + "rewards/chosen": 3.7559356689453125, + "rewards/margins": 13.99404207865397, + "rewards/rejected": -10.238106409708658, + "step": 2869 + }, + { + "epoch": 0.2622201918684331, + "grad_norm": 1.796875, + "kl": 0.0, + "learning_rate": 8.413816370810309e-06, + "logits/chosen": 516178400.0, + "logits/rejected": 645050944.0, + "logps/chosen": -261.40814208984375, + "logps/rejected": -538.482666015625, + "loss": 0.0097, + "rewards/chosen": 4.426494598388672, + "rewards/margins": 13.836804389953613, + "rewards/rejected": -9.410309791564941, + "step": 2870 + }, + { + "epoch": 0.2623115577889447, + "grad_norm": 2.46875, + "kl": 0.0, + "learning_rate": 8.412765716093273e-06, + "logits/chosen": 270272896.0, + "logits/rejected": 369107968.0, + "logps/chosen": -244.57501220703125, + "logps/rejected": -441.933740234375, + "loss": 0.0094, + "rewards/chosen": 4.233765284220378, + "rewards/margins": 13.871199671427409, + "rewards/rejected": -9.637434387207032, + "step": 2871 + }, + { + "epoch": 0.2624029237094564, + "grad_norm": 1.8671875, + "kl": 0.0, + "learning_rate": 8.41171477917213e-06, + "logits/chosen": 431201248.0, + "logits/rejected": 510664416.0, + "logps/chosen": -354.45703125, + "logps/rejected": -489.09796142578125, + "loss": 0.0093, + "rewards/chosen": 4.482560634613037, + "rewards/margins": 13.020411014556885, + "rewards/rejected": -8.537850379943848, + "step": 2872 + }, + { + "epoch": 0.262494289629968, + "grad_norm": 2.09375, + "kl": 0.0, + "learning_rate": 8.410663560133784e-06, + "logits/chosen": 386353578.6666667, + "logits/rejected": 534004736.0, + "logps/chosen": -195.79671223958334, + "logps/rejected": -608.6126953125, + "loss": 0.0117, + "rewards/chosen": 3.8710104624430337, + "rewards/margins": 13.61202303568522, + "rewards/rejected": -9.741012573242188, + "step": 2873 + }, + { + "epoch": 0.2625856555504797, + "grad_norm": 2.46875, + "kl": 0.0, + "learning_rate": 8.409612059065162e-06, + "logits/chosen": 422333536.0, + "logits/rejected": 425456512.0, + "logps/chosen": -318.20068359375, + "logps/rejected": -605.2972412109375, + "loss": 0.0152, + "rewards/chosen": 4.046143531799316, + "rewards/margins": 13.564008712768555, + "rewards/rejected": -9.517865180969238, + "step": 2874 + }, + { + "epoch": 0.2626770214709913, + "grad_norm": 2.78125, + "kl": 0.0, + "learning_rate": 8.408560276053215e-06, + "logits/chosen": 664159829.3333334, + "logits/rejected": 412049305.6, + "logps/chosen": -418.3763427734375, + "logps/rejected": -480.78193359375, + "loss": 0.0148, + "rewards/chosen": 3.642069180806478, + "rewards/margins": 11.785610135396322, + "rewards/rejected": -8.143540954589843, + "step": 2875 + }, + { + "epoch": 0.262768387391503, + "grad_norm": 4.40625, + "kl": 0.0, + "learning_rate": 8.407508211184915e-06, + "logits/chosen": 573287296.0, + "logits/rejected": 526874240.0, + "logps/chosen": -266.603515625, + "logps/rejected": -603.1253051757812, + "loss": 0.0217, + "rewards/chosen": 3.3624107837677, + "rewards/margins": 14.18956208229065, + "rewards/rejected": -10.82715129852295, + "step": 2876 + }, + { + "epoch": 0.2628597533120146, + "grad_norm": 84.0, + "kl": 0.0, + "learning_rate": 8.406455864547256e-06, + "logits/chosen": 438751616.0, + "logits/rejected": 783440213.3333334, + "logps/chosen": -236.68490600585938, + "logps/rejected": -629.4165852864584, + "loss": 0.1239, + "rewards/chosen": 2.3440628051757812, + "rewards/margins": 9.830705006917317, + "rewards/rejected": -7.486642201741536, + "step": 2877 + }, + { + "epoch": 0.2629511192325263, + "grad_norm": 3.390625, + "kl": 0.0, + "learning_rate": 8.40540323622726e-06, + "logits/chosen": 571289600.0, + "logits/rejected": 503836672.0, + "logps/chosen": -437.5263366699219, + "logps/rejected": -666.0160522460938, + "loss": 0.0185, + "rewards/chosen": 3.6704931259155273, + "rewards/margins": 10.687360286712646, + "rewards/rejected": -7.016867160797119, + "step": 2878 + }, + { + "epoch": 0.2630424851530379, + "grad_norm": 5.875, + "kl": 0.0, + "learning_rate": 8.404350326311966e-06, + "logits/chosen": 805280320.0, + "logits/rejected": 421893696.0, + "logps/chosen": -431.3507995605469, + "logps/rejected": -369.5257568359375, + "loss": 0.0366, + "rewards/chosen": 2.751960039138794, + "rewards/margins": 10.955540895462036, + "rewards/rejected": -8.203580856323242, + "step": 2879 + }, + { + "epoch": 0.2631338510735496, + "grad_norm": 3.609375, + "kl": 0.0, + "learning_rate": 8.403297134888441e-06, + "logits/chosen": 581268352.0, + "logits/rejected": 603999680.0, + "logps/chosen": -408.01983642578125, + "logps/rejected": -658.7037353515625, + "loss": 0.0199, + "rewards/chosen": 3.887788772583008, + "rewards/margins": 12.93850326538086, + "rewards/rejected": -9.050714492797852, + "step": 2880 + }, + { + "epoch": 0.2632252169940612, + "grad_norm": 2.25, + "kl": 0.0, + "learning_rate": 8.402243662043779e-06, + "logits/chosen": 466415616.0, + "logits/rejected": 551616298.6666666, + "logps/chosen": -373.11904296875, + "logps/rejected": -558.5205078125, + "loss": 0.0141, + "rewards/chosen": 4.438531494140625, + "rewards/margins": 14.122285461425781, + "rewards/rejected": -9.683753967285156, + "step": 2881 + }, + { + "epoch": 0.2633165829145729, + "grad_norm": 1.5390625, + "kl": 0.0, + "learning_rate": 8.401189907865087e-06, + "logits/chosen": 680712960.0, + "logits/rejected": 509481779.2, + "logps/chosen": -334.2716064453125, + "logps/rejected": -469.63408203125, + "loss": 0.0083, + "rewards/chosen": 3.8679517110188804, + "rewards/margins": 12.100872548421224, + "rewards/rejected": -8.232920837402343, + "step": 2882 + }, + { + "epoch": 0.2634079488350845, + "grad_norm": 5.78125, + "kl": 0.0, + "learning_rate": 8.4001358724395e-06, + "logits/chosen": 602223445.3333334, + "logits/rejected": 411706432.0, + "logps/chosen": -522.216796875, + "logps/rejected": -874.7098388671875, + "loss": 0.0374, + "rewards/chosen": 3.164926211039225, + "rewards/margins": 20.08290449778239, + "rewards/rejected": -16.917978286743164, + "step": 2883 + }, + { + "epoch": 0.2634993147555962, + "grad_norm": 2.1875, + "kl": 0.0, + "learning_rate": 8.399081555854183e-06, + "logits/chosen": 554619904.0, + "logits/rejected": 455698090.6666667, + "logps/chosen": -240.9851837158203, + "logps/rejected": -507.0570882161458, + "loss": 0.0125, + "rewards/chosen": 3.019211769104004, + "rewards/margins": 13.114688555399576, + "rewards/rejected": -10.095476786295572, + "step": 2884 + }, + { + "epoch": 0.2635906806761078, + "grad_norm": 3.640625, + "kl": 0.0, + "learning_rate": 8.398026958196312e-06, + "logits/chosen": 662366464.0, + "logits/rejected": 669104341.3333334, + "logps/chosen": -347.6533203125, + "logps/rejected": -729.4440104166666, + "loss": 0.0191, + "rewards/chosen": 4.061179351806641, + "rewards/margins": 12.643957646687827, + "rewards/rejected": -8.582778294881185, + "step": 2885 + }, + { + "epoch": 0.2636820465966195, + "grad_norm": 1.3359375, + "kl": 0.0, + "learning_rate": 8.396972079553097e-06, + "logits/chosen": 644266112.0, + "logits/rejected": 749054122.6666666, + "logps/chosen": -284.9497375488281, + "logps/rejected": -570.3347981770834, + "loss": 0.0067, + "rewards/chosen": 3.6923294067382812, + "rewards/margins": 12.049264907836914, + "rewards/rejected": -8.356935501098633, + "step": 2886 + }, + { + "epoch": 0.2637734125171311, + "grad_norm": 7.78125, + "kl": 0.0, + "learning_rate": 8.395916920011762e-06, + "logits/chosen": 292498624.0, + "logits/rejected": 477599616.0, + "logps/chosen": -159.86314392089844, + "logps/rejected": -488.07305908203125, + "loss": 0.0393, + "rewards/chosen": 3.0000481605529785, + "rewards/margins": 11.76238203048706, + "rewards/rejected": -8.762333869934082, + "step": 2887 + }, + { + "epoch": 0.2638647784376428, + "grad_norm": 4.125, + "kl": 0.0, + "learning_rate": 8.394861479659562e-06, + "logits/chosen": 451975168.0, + "logits/rejected": 448515712.0, + "logps/chosen": -333.7390380859375, + "logps/rejected": -620.3506673177084, + "loss": 0.0334, + "rewards/chosen": 2.994888114929199, + "rewards/margins": 13.883867708841958, + "rewards/rejected": -10.88897959391276, + "step": 2888 + }, + { + "epoch": 0.2639561443581544, + "grad_norm": 2.96875, + "kl": 0.0, + "learning_rate": 8.393805758583774e-06, + "logits/chosen": 447687765.3333333, + "logits/rejected": 772216320.0, + "logps/chosen": -304.94533284505206, + "logps/rejected": -509.43017578125, + "loss": 0.0178, + "rewards/chosen": 3.692653020222982, + "rewards/margins": 12.72684186299642, + "rewards/rejected": -9.034188842773437, + "step": 2889 + }, + { + "epoch": 0.2640475102786661, + "grad_norm": 12.9375, + "kl": 0.0, + "learning_rate": 8.392749756871692e-06, + "logits/chosen": 510677657.6, + "logits/rejected": 245356970.66666666, + "logps/chosen": -296.956982421875, + "logps/rejected": -299.47226969401044, + "loss": 0.0666, + "rewards/chosen": 2.4296279907226563, + "rewards/margins": 12.108118947347005, + "rewards/rejected": -9.67849095662435, + "step": 2890 + }, + { + "epoch": 0.2641388761991777, + "grad_norm": 10.0, + "kl": 0.0, + "learning_rate": 8.39169347461064e-06, + "logits/chosen": 410796134.4, + "logits/rejected": 837643434.6666666, + "logps/chosen": -206.4424560546875, + "logps/rejected": -540.3494466145834, + "loss": 0.1429, + "rewards/chosen": 1.7694646835327148, + "rewards/margins": 11.339365196228027, + "rewards/rejected": -9.569900512695312, + "step": 2891 + }, + { + "epoch": 0.2642302421196894, + "grad_norm": 1.734375, + "kl": 0.0, + "learning_rate": 8.390636911887964e-06, + "logits/chosen": 671337088.0, + "logits/rejected": 559909632.0, + "logps/chosen": -240.6512908935547, + "logps/rejected": -369.108642578125, + "loss": 0.0117, + "rewards/chosen": 4.302116394042969, + "rewards/margins": 10.632366180419922, + "rewards/rejected": -6.330249786376953, + "step": 2892 + }, + { + "epoch": 0.264321608040201, + "grad_norm": 2.03125, + "kl": 0.0, + "learning_rate": 8.38958006879103e-06, + "logits/chosen": 362556512.0, + "logits/rejected": 288576128.0, + "logps/chosen": -118.60443115234375, + "logps/rejected": -386.4572347005208, + "loss": 0.0134, + "rewards/chosen": 3.4641342163085938, + "rewards/margins": 12.229086558024088, + "rewards/rejected": -8.764952341715494, + "step": 2893 + }, + { + "epoch": 0.26441297396071267, + "grad_norm": 4.8125, + "kl": 0.0, + "learning_rate": 8.388522945407228e-06, + "logits/chosen": 1122166272.0, + "logits/rejected": 758077888.0, + "logps/chosen": -282.15447998046875, + "logps/rejected": -418.21002197265625, + "loss": 0.0457, + "rewards/chosen": 3.2313456535339355, + "rewards/margins": 11.621931552886963, + "rewards/rejected": -8.390585899353027, + "step": 2894 + }, + { + "epoch": 0.2645043398812243, + "grad_norm": 1.5, + "kl": 0.0, + "learning_rate": 8.387465541823974e-06, + "logits/chosen": 444291114.6666667, + "logits/rejected": 373587148.8, + "logps/chosen": -345.85400390625, + "logps/rejected": -423.15830078125, + "loss": 0.0059, + "rewards/chosen": 4.3941144943237305, + "rewards/margins": 12.80482006072998, + "rewards/rejected": -8.41070556640625, + "step": 2895 + }, + { + "epoch": 0.26459570580173597, + "grad_norm": 0.671875, + "kl": 0.0, + "learning_rate": 8.386407858128707e-06, + "logits/chosen": 730814272.0, + "logits/rejected": 864749312.0, + "logps/chosen": -258.2505798339844, + "logps/rejected": -502.7159830729167, + "loss": 0.0036, + "rewards/chosen": 4.477916717529297, + "rewards/margins": 12.367133458455402, + "rewards/rejected": -7.8892167409261065, + "step": 2896 + }, + { + "epoch": 0.2646870717222476, + "grad_norm": 3.71875, + "kl": 0.0, + "learning_rate": 8.385349894408884e-06, + "logits/chosen": 1004659814.4, + "logits/rejected": 633109504.0, + "logps/chosen": -587.54931640625, + "logps/rejected": -526.4456380208334, + "loss": 0.0223, + "rewards/chosen": 3.464861297607422, + "rewards/margins": 12.557395044962565, + "rewards/rejected": -9.092533747355143, + "step": 2897 + }, + { + "epoch": 0.26477843764275927, + "grad_norm": 4.125, + "kl": 0.0, + "learning_rate": 8.384291650751992e-06, + "logits/chosen": 423614540.8, + "logits/rejected": 381916714.6666667, + "logps/chosen": -276.8393310546875, + "logps/rejected": -593.1526692708334, + "loss": 0.0253, + "rewards/chosen": 3.3524036407470703, + "rewards/margins": 12.576617558797201, + "rewards/rejected": -9.22421391805013, + "step": 2898 + }, + { + "epoch": 0.2648698035632709, + "grad_norm": 4.65625, + "kl": 0.0, + "learning_rate": 8.383233127245536e-06, + "logits/chosen": 713290496.0, + "logits/rejected": 350377984.0, + "logps/chosen": -377.054443359375, + "logps/rejected": -368.14483642578125, + "loss": 0.0349, + "rewards/chosen": 3.286912282307943, + "rewards/margins": 10.18643601735433, + "rewards/rejected": -6.899523735046387, + "step": 2899 + }, + { + "epoch": 0.26496116948378257, + "grad_norm": 1.0859375, + "kl": 0.0, + "learning_rate": 8.382174323977046e-06, + "logits/chosen": 688824704.0, + "logits/rejected": 813559381.3333334, + "logps/chosen": -529.8264770507812, + "logps/rejected": -443.1314290364583, + "loss": 0.0049, + "rewards/chosen": 4.113288879394531, + "rewards/margins": 13.349418004353842, + "rewards/rejected": -9.23612912495931, + "step": 2900 + }, + { + "epoch": 0.2650525354042942, + "grad_norm": 3.234375, + "kl": 0.0, + "learning_rate": 8.381115241034077e-06, + "logits/chosen": 734934937.6, + "logits/rejected": 1026185216.0, + "logps/chosen": -438.631298828125, + "logps/rejected": -538.7283935546875, + "loss": 0.0195, + "rewards/chosen": 4.039008331298828, + "rewards/margins": 12.036212158203124, + "rewards/rejected": -7.997203826904297, + "step": 2901 + }, + { + "epoch": 0.26514390132480586, + "grad_norm": 3.90625, + "kl": 0.0, + "learning_rate": 8.380055878504202e-06, + "logits/chosen": 511094816.0, + "logits/rejected": 758008960.0, + "logps/chosen": -231.261962890625, + "logps/rejected": -524.343505859375, + "loss": 0.0313, + "rewards/chosen": 2.9174742698669434, + "rewards/margins": 10.49733304977417, + "rewards/rejected": -7.579858779907227, + "step": 2902 + }, + { + "epoch": 0.2652352672453175, + "grad_norm": 3.46875, + "kl": 0.0, + "learning_rate": 8.378996236475025e-06, + "logits/chosen": 521167360.0, + "logits/rejected": 481889728.0, + "logps/chosen": -329.56201171875, + "logps/rejected": -423.0082092285156, + "loss": 0.0178, + "rewards/chosen": 3.548431396484375, + "rewards/margins": 12.878678321838379, + "rewards/rejected": -9.330246925354004, + "step": 2903 + }, + { + "epoch": 0.26532663316582916, + "grad_norm": 4.25, + "kl": 0.0, + "learning_rate": 8.377936315034167e-06, + "logits/chosen": 821327140.5714285, + "logits/rejected": 573741248.0, + "logps/chosen": -370.3829868861607, + "logps/rejected": -547.9332885742188, + "loss": 0.0267, + "rewards/chosen": 3.8044891357421875, + "rewards/margins": 13.33518123626709, + "rewards/rejected": -9.530692100524902, + "step": 2904 + }, + { + "epoch": 0.2654179990863408, + "grad_norm": 6.65625, + "kl": 0.0, + "learning_rate": 8.376876114269272e-06, + "logits/chosen": 505917269.3333333, + "logits/rejected": 625636160.0, + "logps/chosen": -263.3161214192708, + "logps/rejected": -304.41448974609375, + "loss": 0.0545, + "rewards/chosen": 2.7398808797200522, + "rewards/margins": 8.614286740620932, + "rewards/rejected": -5.874405860900879, + "step": 2905 + }, + { + "epoch": 0.26550936500685246, + "grad_norm": 39.5, + "kl": 0.0, + "learning_rate": 8.37581563426801e-06, + "logits/chosen": 691959168.0, + "logits/rejected": 833453226.6666666, + "logps/chosen": -290.50323486328125, + "logps/rejected": -389.7923990885417, + "loss": 0.0552, + "rewards/chosen": 3.6086974143981934, + "rewards/margins": 11.147925853729248, + "rewards/rejected": -7.539228439331055, + "step": 2906 + }, + { + "epoch": 0.2656007309273641, + "grad_norm": 2.25, + "kl": 0.0, + "learning_rate": 8.374754875118072e-06, + "logits/chosen": 1523707392.0, + "logits/rejected": 433343936.0, + "logps/chosen": -581.48486328125, + "logps/rejected": -238.10513305664062, + "loss": 0.0119, + "rewards/chosen": 3.8352508544921875, + "rewards/margins": 10.598361015319824, + "rewards/rejected": -6.763110160827637, + "step": 2907 + }, + { + "epoch": 0.26569209684787576, + "grad_norm": 2.96875, + "kl": 0.0, + "learning_rate": 8.373693836907175e-06, + "logits/chosen": 506269013.3333333, + "logits/rejected": 540700825.6, + "logps/chosen": -292.0944417317708, + "logps/rejected": -362.130908203125, + "loss": 0.011, + "rewards/chosen": 3.880215326944987, + "rewards/margins": 12.83506991068522, + "rewards/rejected": -8.954854583740234, + "step": 2908 + }, + { + "epoch": 0.2657834627683874, + "grad_norm": 3.21875, + "kl": 0.0, + "learning_rate": 8.372632519723054e-06, + "logits/chosen": 343875264.0, + "logits/rejected": 373646848.0, + "logps/chosen": -111.06599934895833, + "logps/rejected": -694.816015625, + "loss": 0.0242, + "rewards/chosen": 3.1842177708943686, + "rewards/margins": 15.335890134175619, + "rewards/rejected": -12.15167236328125, + "step": 2909 + }, + { + "epoch": 0.26587482868889906, + "grad_norm": 1.90625, + "kl": 0.0, + "learning_rate": 8.371570923653473e-06, + "logits/chosen": 469884586.6666667, + "logits/rejected": 512944742.4, + "logps/chosen": -331.1973876953125, + "logps/rejected": -486.004638671875, + "loss": 0.0116, + "rewards/chosen": 4.139633814493815, + "rewards/margins": 11.416896692911784, + "rewards/rejected": -7.277262878417969, + "step": 2910 + }, + { + "epoch": 0.2659661946094107, + "grad_norm": 3.46875, + "kl": 0.0, + "learning_rate": 8.370509048786215e-06, + "logits/chosen": 382225237.3333333, + "logits/rejected": 591820492.8, + "logps/chosen": -189.5328165690104, + "logps/rejected": -457.444873046875, + "loss": 0.0297, + "rewards/chosen": 2.6317526499430337, + "rewards/margins": 9.725864283243816, + "rewards/rejected": -7.094111633300781, + "step": 2911 + }, + { + "epoch": 0.26605756052992235, + "grad_norm": 3.546875, + "kl": 0.0, + "learning_rate": 8.369446895209085e-06, + "logits/chosen": 769039872.0, + "logits/rejected": 1220402944.0, + "logps/chosen": -322.18914794921875, + "logps/rejected": -583.7118530273438, + "loss": 0.0213, + "rewards/chosen": 3.6376380920410156, + "rewards/margins": 11.293335437774658, + "rewards/rejected": -7.655697345733643, + "step": 2912 + }, + { + "epoch": 0.266148926450434, + "grad_norm": 1.4296875, + "kl": 0.0, + "learning_rate": 8.368384463009917e-06, + "logits/chosen": 407802240.0, + "logits/rejected": 461523148.8, + "logps/chosen": -341.7986246744792, + "logps/rejected": -401.62119140625, + "loss": 0.0082, + "rewards/chosen": 3.952817916870117, + "rewards/margins": 11.044563674926758, + "rewards/rejected": -7.091745758056641, + "step": 2913 + }, + { + "epoch": 0.26624029237094565, + "grad_norm": 22.875, + "kl": 0.0, + "learning_rate": 8.367321752276562e-06, + "logits/chosen": 478768576.0, + "logps/chosen": -233.1852264404297, + "loss": 0.0722, + "rewards/chosen": 3.029608964920044, + "step": 2914 + }, + { + "epoch": 0.2663316582914573, + "grad_norm": 3.375, + "kl": 0.0, + "learning_rate": 8.3662587630969e-06, + "logits/chosen": 375938969.6, + "logits/rejected": 620519936.0, + "logps/chosen": -252.5164306640625, + "logps/rejected": -622.0575358072916, + "loss": 0.0242, + "rewards/chosen": 3.773459243774414, + "rewards/margins": 14.203210322062173, + "rewards/rejected": -10.42975107828776, + "step": 2915 + }, + { + "epoch": 0.26642302421196895, + "grad_norm": 5.78125, + "kl": 0.0, + "learning_rate": 8.365195495558825e-06, + "logits/chosen": 696626944.0, + "logits/rejected": 661312704.0, + "logps/chosen": -308.9773254394531, + "logps/rejected": -392.05682373046875, + "loss": 0.029, + "rewards/chosen": 3.233996629714966, + "rewards/margins": 12.2689049243927, + "rewards/rejected": -9.034908294677734, + "step": 2916 + }, + { + "epoch": 0.26651439013248057, + "grad_norm": 21.625, + "kl": 0.0, + "learning_rate": 8.364131949750261e-06, + "logits/chosen": 553270485.3333334, + "logits/rejected": 1126783232.0, + "logps/chosen": -285.4195556640625, + "logps/rejected": -481.1250915527344, + "loss": 0.037, + "rewards/chosen": 3.566669146219889, + "rewards/margins": 11.815622011820475, + "rewards/rejected": -8.248952865600586, + "step": 2917 + }, + { + "epoch": 0.26660575605299225, + "grad_norm": 28.0, + "kl": 0.0, + "learning_rate": 8.363068125759155e-06, + "logits/chosen": 259928038.4, + "logits/rejected": 639477802.6666666, + "logps/chosen": -389.11103515625, + "logps/rejected": -348.1699625651042, + "loss": 0.0922, + "rewards/chosen": 3.406591033935547, + "rewards/margins": 10.64408899943034, + "rewards/rejected": -7.237497965494792, + "step": 2918 + }, + { + "epoch": 0.26669712197350387, + "grad_norm": 2.484375, + "kl": 0.0, + "learning_rate": 8.362004023673473e-06, + "logits/chosen": 583538358.8571428, + "logits/rejected": 513966656.0, + "logps/chosen": -252.17787388392858, + "logps/rejected": -567.1517944335938, + "loss": 0.0192, + "rewards/chosen": 4.236962454659598, + "rewards/margins": 10.856295245034353, + "rewards/rejected": -6.619332790374756, + "step": 2919 + }, + { + "epoch": 0.26678848789401555, + "grad_norm": 8.25, + "kl": 0.0, + "learning_rate": 8.36093964358121e-06, + "logits/chosen": 343913813.3333333, + "logits/rejected": 522387353.6, + "logps/chosen": -263.8203531901042, + "logps/rejected": -430.871044921875, + "loss": 0.0205, + "rewards/chosen": 3.5932610829671225, + "rewards/margins": 10.984549840291342, + "rewards/rejected": -7.391288757324219, + "step": 2920 + }, + { + "epoch": 0.26687985381452717, + "grad_norm": 0.1416015625, + "kl": 0.0, + "learning_rate": 8.359874985570378e-06, + "logits/rejected": 728473728.0, + "logps/rejected": -567.2533569335938, + "loss": 0.0005, + "rewards/rejected": -10.102304458618164, + "step": 2921 + }, + { + "epoch": 0.26697121973503884, + "grad_norm": 38.25, + "kl": 0.0, + "learning_rate": 8.358810049729014e-06, + "logits/chosen": 295648832.0, + "logits/rejected": 434298400.0, + "logps/chosen": -287.17633056640625, + "logps/rejected": -723.8218383789062, + "loss": 0.1073, + "rewards/chosen": 2.1375796794891357, + "rewards/margins": 10.727303266525269, + "rewards/rejected": -8.589723587036133, + "step": 2922 + }, + { + "epoch": 0.26706258565555047, + "grad_norm": 2.03125, + "kl": 0.2801780700683594, + "learning_rate": 8.357744836145179e-06, + "logits/chosen": 492619593.14285713, + "logits/rejected": 518060768.0, + "logps/chosen": -389.2638462611607, + "logps/rejected": -520.523681640625, + "loss": 0.0155, + "rewards/chosen": 4.208107267107282, + "rewards/margins": 13.916933332170759, + "rewards/rejected": -9.708826065063477, + "step": 2923 + }, + { + "epoch": 0.26715395157606214, + "grad_norm": 3.765625, + "kl": 0.0, + "learning_rate": 8.356679344906958e-06, + "logits/chosen": 763678720.0, + "logits/rejected": 1044073676.8, + "logps/chosen": -420.8544108072917, + "logps/rejected": -587.52509765625, + "loss": 0.0171, + "rewards/chosen": 3.3053194681803384, + "rewards/margins": 13.996296183268228, + "rewards/rejected": -10.69097671508789, + "step": 2924 + }, + { + "epoch": 0.26724531749657376, + "grad_norm": 2.75, + "kl": 0.0, + "learning_rate": 8.355613576102453e-06, + "logits/chosen": 728969420.8, + "logits/rejected": 608727978.6666666, + "logps/chosen": -319.8333984375, + "logps/rejected": -625.1543375651041, + "loss": 0.0186, + "rewards/chosen": 3.62802734375, + "rewards/margins": 13.02712605794271, + "rewards/rejected": -9.399098714192709, + "step": 2925 + }, + { + "epoch": 0.26733668341708544, + "grad_norm": 7.84375, + "kl": 0.0, + "learning_rate": 8.354547529819796e-06, + "logits/chosen": 1030399692.8, + "logits/rejected": 870502570.6666666, + "logps/chosen": -234.615966796875, + "logps/rejected": -617.9435221354166, + "loss": 0.0617, + "rewards/chosen": 2.485469627380371, + "rewards/margins": 11.741485404968262, + "rewards/rejected": -9.25601577758789, + "step": 2926 + }, + { + "epoch": 0.26742804933759706, + "grad_norm": 2.09375, + "kl": 0.0, + "learning_rate": 8.35348120614714e-06, + "logits/chosen": 873119658.6666666, + "logits/rejected": 447586457.6, + "logps/chosen": -244.39306640625, + "logps/rejected": -347.2184326171875, + "loss": 0.0159, + "rewards/chosen": 3.4955224990844727, + "rewards/margins": 12.19539737701416, + "rewards/rejected": -8.699874877929688, + "step": 2927 + }, + { + "epoch": 0.26751941525810874, + "grad_norm": 2.46875, + "kl": 0.0, + "learning_rate": 8.352414605172659e-06, + "logits/chosen": 386954188.8, + "logits/rejected": 330982229.3333333, + "logps/chosen": -236.739794921875, + "logps/rejected": -326.25986735026044, + "loss": 0.0156, + "rewards/chosen": 4.0394752502441404, + "rewards/margins": 12.320249684651692, + "rewards/rejected": -8.280774434407553, + "step": 2928 + }, + { + "epoch": 0.26761078117862036, + "grad_norm": 3.796875, + "kl": 0.0, + "learning_rate": 8.35134772698455e-06, + "logits/chosen": 830173696.0, + "logits/rejected": 453437440.0, + "logps/chosen": -522.7317301432291, + "logps/rejected": -545.8767700195312, + "loss": 0.0212, + "rewards/chosen": 3.7313931783040366, + "rewards/margins": 14.619126637776693, + "rewards/rejected": -10.887733459472656, + "step": 2929 + }, + { + "epoch": 0.26770214709913204, + "grad_norm": 11.625, + "kl": 0.0, + "learning_rate": 8.350280571671034e-06, + "logits/chosen": 622716342.8571428, + "logits/rejected": 331478752.0, + "logps/chosen": -265.54366629464283, + "logps/rejected": -610.971435546875, + "loss": 0.0998, + "rewards/chosen": 3.5170533316476003, + "rewards/margins": 16.03033515385219, + "rewards/rejected": -12.51328182220459, + "step": 2930 + }, + { + "epoch": 0.26779351301964366, + "grad_norm": 4.625, + "kl": 0.0, + "learning_rate": 8.349213139320357e-06, + "logits/chosen": 781983872.0, + "logits/rejected": 521239488.0, + "logps/chosen": -363.6766052246094, + "logps/rejected": -454.17218017578125, + "loss": 0.0239, + "rewards/chosen": 3.367330551147461, + "rewards/margins": 12.432497024536133, + "rewards/rejected": -9.065166473388672, + "step": 2931 + }, + { + "epoch": 0.26788487894015534, + "grad_norm": 6.375, + "kl": 0.0, + "learning_rate": 8.348145430020786e-06, + "logits/chosen": 549173034.6666666, + "logits/rejected": 1092267776.0, + "logps/chosen": -264.6370442708333, + "logps/rejected": -614.5056762695312, + "loss": 0.0444, + "rewards/chosen": 3.060022989908854, + "rewards/margins": 13.65478769938151, + "rewards/rejected": -10.594764709472656, + "step": 2932 + }, + { + "epoch": 0.26797624486066696, + "grad_norm": 8.6875, + "kl": 0.0, + "learning_rate": 8.347077443860608e-06, + "logits/chosen": 561256320.0, + "logits/rejected": 503699498.6666667, + "logps/chosen": -242.55433654785156, + "logps/rejected": -440.8301595052083, + "loss": 0.1188, + "rewards/chosen": 3.168633222579956, + "rewards/margins": 8.477273384730022, + "rewards/rejected": -5.308640162150065, + "step": 2933 + }, + { + "epoch": 0.26806761078117863, + "grad_norm": 2.484375, + "kl": 0.0, + "learning_rate": 8.346009180928136e-06, + "logits/chosen": 579175321.6, + "logits/rejected": 363000405.3333333, + "logps/chosen": -420.4400390625, + "logps/rejected": -555.7594401041666, + "loss": 0.0167, + "rewards/chosen": 3.6988899230957033, + "rewards/margins": 15.396075439453124, + "rewards/rejected": -11.697185516357422, + "step": 2934 + }, + { + "epoch": 0.26815897670169025, + "grad_norm": 4.03125, + "kl": 0.0, + "learning_rate": 8.344940641311707e-06, + "logits/chosen": 400352896.0, + "logits/rejected": 227020064.0, + "logps/chosen": -299.84401448567706, + "logps/rejected": -383.9544372558594, + "loss": 0.0191, + "rewards/chosen": 4.403174082438151, + "rewards/margins": 10.059106508890789, + "rewards/rejected": -5.655932426452637, + "step": 2935 + }, + { + "epoch": 0.26825034262220193, + "grad_norm": 25.375, + "kl": 0.0, + "learning_rate": 8.343871825099678e-06, + "logits/chosen": 384861738.6666667, + "logits/rejected": 377178931.2, + "logps/chosen": -271.7130940755208, + "logps/rejected": -346.571533203125, + "loss": 0.0348, + "rewards/chosen": 3.6660448710123696, + "rewards/margins": 10.515455881754557, + "rewards/rejected": -6.8494110107421875, + "step": 2936 + }, + { + "epoch": 0.26834170854271355, + "grad_norm": 2.984375, + "kl": 0.0, + "learning_rate": 8.34280273238043e-06, + "logits/chosen": 350542720.0, + "logits/rejected": 475634240.0, + "logps/chosen": -232.5342000325521, + "logps/rejected": -623.2059936523438, + "loss": 0.0206, + "rewards/chosen": 3.8920090993245444, + "rewards/margins": 12.909024556477865, + "rewards/rejected": -9.01701545715332, + "step": 2937 + }, + { + "epoch": 0.26843307446322523, + "grad_norm": 2.078125, + "kl": 0.0, + "learning_rate": 8.34173336324237e-06, + "logits/chosen": 425635302.4, + "logits/rejected": 385549397.3333333, + "logps/chosen": -297.962060546875, + "logps/rejected": -618.9121907552084, + "loss": 0.0152, + "rewards/chosen": 4.294766998291015, + "rewards/margins": 15.234073893229166, + "rewards/rejected": -10.93930689493815, + "step": 2938 + }, + { + "epoch": 0.26852444038373685, + "grad_norm": 6.21875, + "kl": 8.626445770263672, + "learning_rate": 8.340663717773922e-06, + "logits/chosen": 727587986.2857143, + "logits/rejected": 2533988352.0, + "logps/chosen": -338.80482700892856, + "logps/rejected": -558.3268432617188, + "loss": 0.0487, + "rewards/chosen": 4.129855564662388, + "rewards/margins": 14.52779906136649, + "rewards/rejected": -10.397943496704102, + "step": 2939 + }, + { + "epoch": 0.26861580630424853, + "grad_norm": 4.84375, + "kl": 0.0, + "learning_rate": 8.339593796063534e-06, + "logits/chosen": 841238016.0, + "logits/rejected": 368576928.0, + "logps/chosen": -348.33935546875, + "logps/rejected": -285.73211669921875, + "loss": 0.0289, + "rewards/chosen": 3.747530937194824, + "rewards/margins": 12.144644737243652, + "rewards/rejected": -8.397113800048828, + "step": 2940 + }, + { + "epoch": 0.26870717222476015, + "grad_norm": 32.75, + "kl": 0.0, + "learning_rate": 8.338523598199683e-06, + "logits/chosen": 434468480.0, + "logits/rejected": 436607283.2, + "logps/chosen": -247.49188232421875, + "logps/rejected": -474.045361328125, + "loss": 0.1504, + "rewards/chosen": 1.8645904858907063, + "rewards/margins": 10.50640261967977, + "rewards/rejected": -8.641812133789063, + "step": 2941 + }, + { + "epoch": 0.2687985381452718, + "grad_norm": 20.0, + "kl": 0.0, + "learning_rate": 8.337453124270864e-06, + "logits/chosen": 443587648.0, + "logits/rejected": 523906048.0, + "logps/chosen": -194.42636108398438, + "logps/rejected": -521.256591796875, + "loss": 0.0484, + "rewards/chosen": 3.705721378326416, + "rewards/margins": 10.82859182357788, + "rewards/rejected": -7.122870445251465, + "step": 2942 + }, + { + "epoch": 0.26888990406578345, + "grad_norm": 7.71875, + "kl": 0.0, + "learning_rate": 8.336382374365592e-06, + "logits/chosen": 514919321.6, + "logits/rejected": 952510634.6666666, + "logps/chosen": -306.7189453125, + "logps/rejected": -380.0325520833333, + "loss": 0.0461, + "rewards/chosen": 2.6744007110595702, + "rewards/margins": 12.440744145711264, + "rewards/rejected": -9.766343434651693, + "step": 2943 + }, + { + "epoch": 0.2689812699862951, + "grad_norm": 2.84375, + "kl": 0.0, + "learning_rate": 8.335311348572411e-06, + "logits/chosen": 570462336.0, + "logits/rejected": 384773376.0, + "logps/chosen": -371.22637939453125, + "logps/rejected": -416.4619445800781, + "loss": 0.0189, + "rewards/chosen": 3.7613046169281006, + "rewards/margins": 12.268172979354858, + "rewards/rejected": -8.506868362426758, + "step": 2944 + }, + { + "epoch": 0.26907263590680675, + "grad_norm": 4.78125, + "kl": 0.0, + "learning_rate": 8.334240046979883e-06, + "logits/chosen": 584999789.7142857, + "logits/rejected": 1938530688.0, + "logps/chosen": -340.5120326450893, + "logps/rejected": -998.5494995117188, + "loss": 0.0432, + "rewards/chosen": 3.4492105756487166, + "rewards/margins": 12.688536099025182, + "rewards/rejected": -9.239325523376465, + "step": 2945 + }, + { + "epoch": 0.2691640018273184, + "grad_norm": 3.203125, + "kl": 0.0, + "learning_rate": 8.333168469676595e-06, + "logits/chosen": 416139264.0, + "logits/rejected": 602995712.0, + "logps/chosen": -319.76318359375, + "logps/rejected": -748.62939453125, + "loss": 0.0219, + "rewards/chosen": 4.172706604003906, + "rewards/margins": 13.789753913879395, + "rewards/rejected": -9.617047309875488, + "step": 2946 + }, + { + "epoch": 0.26925536774783004, + "grad_norm": 0.19140625, + "kl": 0.0, + "learning_rate": 8.332096616751157e-06, + "logits/rejected": 303763840.0, + "logps/rejected": -381.431640625, + "loss": 0.0012, + "rewards/rejected": -8.147600173950195, + "step": 2947 + }, + { + "epoch": 0.2693467336683417, + "grad_norm": 3.15625, + "kl": 0.0, + "learning_rate": 8.331024488292203e-06, + "logits/chosen": 553324800.0, + "logits/rejected": 527085721.6, + "logps/chosen": -557.3900146484375, + "logps/rejected": -694.3166015625, + "loss": 0.0143, + "rewards/chosen": 3.490898768107096, + "rewards/margins": 12.512441889444986, + "rewards/rejected": -9.02154312133789, + "step": 2948 + }, + { + "epoch": 0.26943809958885334, + "grad_norm": 4.3125, + "kl": 0.0, + "learning_rate": 8.329952084388387e-06, + "logits/chosen": 540548352.0, + "logits/rejected": 452081376.0, + "logps/chosen": -367.7607421875, + "logps/rejected": -726.2107543945312, + "loss": 0.0351, + "rewards/chosen": 2.662358283996582, + "rewards/margins": 12.967844009399414, + "rewards/rejected": -10.305485725402832, + "step": 2949 + }, + { + "epoch": 0.269529465509365, + "grad_norm": 7.1875, + "kl": 0.0, + "learning_rate": 8.328879405128385e-06, + "logits/chosen": 286533056.0, + "logits/rejected": 877340160.0, + "logps/chosen": -195.54544067382812, + "logps/rejected": -464.9585367838542, + "loss": 0.0087, + "rewards/chosen": 5.52031135559082, + "rewards/margins": 13.461225509643555, + "rewards/rejected": -7.940914154052734, + "step": 2950 + }, + { + "epoch": 0.26962083142987664, + "grad_norm": 47.75, + "kl": 0.0, + "learning_rate": 8.327806450600897e-06, + "logits/chosen": 372013781.3333333, + "logits/rejected": 433340416.0, + "logps/chosen": -151.83955891927084, + "logps/rejected": -401.21533203125, + "loss": 0.053, + "rewards/chosen": 3.805515925089518, + "rewards/margins": 13.101289240519206, + "rewards/rejected": -9.295773315429688, + "step": 2951 + }, + { + "epoch": 0.2697121973503883, + "grad_norm": 6.1875, + "kl": 0.0, + "learning_rate": 8.32673322089465e-06, + "logits/chosen": 377597376.0, + "logits/rejected": 398574912.0, + "logps/chosen": -237.2783203125, + "logps/rejected": -559.9305419921875, + "loss": 0.1215, + "rewards/chosen": 2.5279810428619385, + "rewards/margins": 10.883179903030396, + "rewards/rejected": -8.355198860168457, + "step": 2952 + }, + { + "epoch": 0.26980356327089994, + "grad_norm": 4.1875, + "kl": 0.0, + "learning_rate": 8.32565971609839e-06, + "logits/chosen": 492894634.6666667, + "logits/rejected": 354067302.4, + "logps/chosen": -200.572265625, + "logps/rejected": -374.870556640625, + "loss": 0.0193, + "rewards/chosen": 3.289127985636393, + "rewards/margins": 10.289660517374674, + "rewards/rejected": -7.000532531738282, + "step": 2953 + }, + { + "epoch": 0.2698949291914116, + "grad_norm": 1.8984375, + "kl": 0.0, + "learning_rate": 8.324585936300881e-06, + "logits/chosen": 516194346.6666667, + "logits/rejected": 529383321.6, + "logps/chosen": -352.8806966145833, + "logps/rejected": -484.54189453125, + "loss": 0.0105, + "rewards/chosen": 3.9985790252685547, + "rewards/margins": 13.289020919799805, + "rewards/rejected": -9.29044189453125, + "step": 2954 + }, + { + "epoch": 0.26998629511192324, + "grad_norm": 3.078125, + "kl": 0.0, + "learning_rate": 8.323511881590922e-06, + "logits/chosen": 549088213.3333334, + "logits/rejected": 607575705.6, + "logps/chosen": -461.2188313802083, + "logps/rejected": -423.436669921875, + "loss": 0.0154, + "rewards/chosen": 3.546395937601725, + "rewards/margins": 11.279303042093913, + "rewards/rejected": -7.732907104492187, + "step": 2955 + }, + { + "epoch": 0.2700776610324349, + "grad_norm": 2.296875, + "kl": 0.0, + "learning_rate": 8.322437552057321e-06, + "logits/chosen": 756454656.0, + "logits/rejected": 505402709.3333333, + "logps/chosen": -276.1369140625, + "logps/rejected": -516.9432373046875, + "loss": 0.0178, + "rewards/chosen": 3.7112762451171877, + "rewards/margins": 10.86172472635905, + "rewards/rejected": -7.150448481241862, + "step": 2956 + }, + { + "epoch": 0.27016902695294653, + "grad_norm": 8.0625, + "kl": 0.0, + "learning_rate": 8.321362947788916e-06, + "logits/chosen": 600013184.0, + "logits/rejected": 294757248.0, + "logps/chosen": -440.5107421875, + "logps/rejected": -394.340771484375, + "loss": 0.0291, + "rewards/chosen": 3.223422368367513, + "rewards/margins": 9.795852025349935, + "rewards/rejected": -6.572429656982422, + "step": 2957 + }, + { + "epoch": 0.2702603928734582, + "grad_norm": 3.34375, + "kl": 0.0, + "learning_rate": 8.320288068874571e-06, + "logits/chosen": 559434496.0, + "logits/rejected": 358093909.3333333, + "logps/chosen": -279.8948974609375, + "logps/rejected": -485.3590494791667, + "loss": 0.0214, + "rewards/chosen": 3.7378890991210936, + "rewards/margins": 12.817153422037759, + "rewards/rejected": -9.079264322916666, + "step": 2958 + }, + { + "epoch": 0.27035175879396983, + "grad_norm": 1.4140625, + "kl": 0.0, + "learning_rate": 8.319212915403166e-06, + "logits/chosen": 669731379.2, + "logits/rejected": 665922560.0, + "logps/chosen": -423.0001953125, + "logps/rejected": -497.665771484375, + "loss": 0.0092, + "rewards/chosen": 4.768206024169922, + "rewards/margins": 15.64758275349935, + "rewards/rejected": -10.879376729329428, + "step": 2959 + }, + { + "epoch": 0.2704431247144815, + "grad_norm": 6.59375, + "kl": 0.0, + "learning_rate": 8.318137487463606e-06, + "logits/chosen": 543367360.0, + "logits/rejected": 679091584.0, + "logps/chosen": -206.8211669921875, + "logps/rejected": -475.3971862792969, + "loss": 0.1631, + "rewards/chosen": 0.5322089791297913, + "rewards/margins": 8.818789064884186, + "rewards/rejected": -8.286580085754395, + "step": 2960 + }, + { + "epoch": 0.27053449063499313, + "grad_norm": 2.640625, + "kl": 0.0, + "learning_rate": 8.317061785144817e-06, + "logits/chosen": 708211008.0, + "logits/rejected": 314739072.0, + "logps/chosen": -439.3348388671875, + "logps/rejected": -404.42340087890625, + "loss": 0.0143, + "rewards/chosen": 3.764186143875122, + "rewards/margins": 12.507949113845825, + "rewards/rejected": -8.743762969970703, + "step": 2961 + }, + { + "epoch": 0.2706258565555048, + "grad_norm": 5.0625, + "kl": 0.0, + "learning_rate": 8.315985808535754e-06, + "logits/chosen": 610789478.4, + "logits/rejected": 363339648.0, + "logps/chosen": -432.65322265625, + "logps/rejected": -409.0834554036458, + "loss": 0.0237, + "rewards/chosen": 3.916747283935547, + "rewards/margins": 14.544174702962241, + "rewards/rejected": -10.627427419026693, + "step": 2962 + }, + { + "epoch": 0.27071722247601643, + "grad_norm": 4.84375, + "kl": 0.0, + "learning_rate": 8.314909557725385e-06, + "logits/chosen": 517722453.3333333, + "logits/rejected": 295292928.0, + "logps/chosen": -291.4986572265625, + "logps/rejected": -383.46636962890625, + "loss": 0.0442, + "rewards/chosen": 2.834007898966471, + "rewards/margins": 12.087209383646647, + "rewards/rejected": -9.253201484680176, + "step": 2963 + }, + { + "epoch": 0.2708085883965281, + "grad_norm": 6.5625, + "kl": 0.0, + "learning_rate": 8.313833032802711e-06, + "logits/chosen": 383527296.0, + "logits/rejected": 530436928.0, + "logps/chosen": -212.80472819010416, + "logps/rejected": -351.8755798339844, + "loss": 0.1185, + "rewards/chosen": 3.0309327443440757, + "rewards/margins": 11.019673665364584, + "rewards/rejected": -7.988740921020508, + "step": 2964 + }, + { + "epoch": 0.2708999543170397, + "grad_norm": 1.2421875, + "kl": 0.0, + "learning_rate": 8.31275623385675e-06, + "logits/chosen": 252697770.66666666, + "logits/rejected": 386726809.6, + "logps/chosen": -345.720947265625, + "logps/rejected": -446.3109375, + "loss": 0.0056, + "rewards/chosen": 4.4737904866536455, + "rewards/margins": 12.846629842122397, + "rewards/rejected": -8.37283935546875, + "step": 2965 + }, + { + "epoch": 0.2709913202375514, + "grad_norm": 2.953125, + "kl": 0.0, + "learning_rate": 8.31167916097654e-06, + "logits/chosen": 644624170.6666666, + "logits/rejected": 389929753.6, + "logps/chosen": -339.68332926432294, + "logps/rejected": -409.99013671875, + "loss": 0.0164, + "rewards/chosen": 3.1250832875569663, + "rewards/margins": 11.05589230855306, + "rewards/rejected": -7.930809020996094, + "step": 2966 + }, + { + "epoch": 0.271082686158063, + "grad_norm": 31.25, + "kl": 0.0, + "learning_rate": 8.310601814251147e-06, + "logits/chosen": 831398912.0, + "logits/rejected": 839877120.0, + "logps/chosen": -439.13702392578125, + "logps/rejected": -351.98980712890625, + "loss": 0.0479, + "rewards/chosen": 2.9917025566101074, + "rewards/margins": 10.633235454559326, + "rewards/rejected": -7.641532897949219, + "step": 2967 + }, + { + "epoch": 0.2711740520785747, + "grad_norm": 1.859375, + "kl": 0.0, + "learning_rate": 8.309524193769655e-06, + "logits/chosen": 389398869.3333333, + "logits/rejected": 669466624.0, + "logps/chosen": -193.45100911458334, + "logps/rejected": -637.12373046875, + "loss": 0.0082, + "rewards/chosen": 4.620864232381185, + "rewards/margins": 12.967778905232748, + "rewards/rejected": -8.346914672851563, + "step": 2968 + }, + { + "epoch": 0.2712654179990863, + "grad_norm": 2.375, + "kl": 0.0, + "learning_rate": 8.308446299621179e-06, + "logits/chosen": 645967257.6, + "logits/rejected": 610139648.0, + "logps/chosen": -195.89647216796874, + "logps/rejected": -413.118408203125, + "loss": 0.0171, + "rewards/chosen": 3.9545921325683593, + "rewards/margins": 11.860391998291016, + "rewards/rejected": -7.905799865722656, + "step": 2969 + }, + { + "epoch": 0.271356783919598, + "grad_norm": 1.2265625, + "kl": 0.0, + "learning_rate": 8.307368131894846e-06, + "logits/chosen": 390029664.0, + "logits/rejected": 563865024.0, + "logps/chosen": -172.38412475585938, + "logps/rejected": -372.7660217285156, + "loss": 0.01, + "rewards/chosen": 4.207408905029297, + "rewards/margins": 11.74513292312622, + "rewards/rejected": -7.537724018096924, + "step": 2970 + }, + { + "epoch": 0.2714481498401096, + "grad_norm": 10.8125, + "kl": 0.0, + "learning_rate": 8.306289690679812e-06, + "logits/chosen": 992436480.0, + "logits/rejected": 352222939.4285714, + "logps/chosen": -287.71710205078125, + "logps/rejected": -299.6980678013393, + "loss": 0.1289, + "rewards/chosen": 4.790942668914795, + "rewards/margins": 10.587532111576625, + "rewards/rejected": -5.796589442661831, + "step": 2971 + }, + { + "epoch": 0.2715395157606213, + "grad_norm": 3.59375, + "kl": 0.0, + "learning_rate": 8.305210976065254e-06, + "logits/chosen": 686952768.0, + "logits/rejected": 440545088.0, + "logps/chosen": -252.99734497070312, + "logps/rejected": -670.1259765625, + "loss": 0.0281, + "rewards/chosen": 2.9141976833343506, + "rewards/margins": 15.789047002792358, + "rewards/rejected": -12.874849319458008, + "step": 2972 + }, + { + "epoch": 0.2716308816811329, + "grad_norm": 7.65625, + "kl": 0.0, + "learning_rate": 8.304131988140373e-06, + "logits/chosen": 576756992.0, + "logits/rejected": 566661440.0, + "logps/chosen": -251.45529174804688, + "logps/rejected": -495.1116943359375, + "loss": 0.0649, + "rewards/chosen": 3.2822718620300293, + "rewards/margins": 10.579915523529053, + "rewards/rejected": -7.297643661499023, + "step": 2973 + }, + { + "epoch": 0.2717222476016446, + "grad_norm": 3.765625, + "kl": 0.0, + "learning_rate": 8.30305272699439e-06, + "logits/chosen": 511898496.0, + "logits/rejected": 749967616.0, + "logps/chosen": -248.1343994140625, + "logps/rejected": -593.9895629882812, + "loss": 0.0219, + "rewards/chosen": 3.3803904056549072, + "rewards/margins": 12.018173456192017, + "rewards/rejected": -8.63778305053711, + "step": 2974 + }, + { + "epoch": 0.2718136135221562, + "grad_norm": 4.875, + "kl": 0.0, + "learning_rate": 8.301973192716549e-06, + "logits/chosen": 438630400.0, + "logits/rejected": 476420736.0, + "logps/chosen": -383.81201171875, + "logps/rejected": -547.2900390625, + "loss": 0.0242, + "rewards/chosen": 4.030763626098633, + "rewards/margins": 11.999953587849934, + "rewards/rejected": -7.969189961751302, + "step": 2975 + }, + { + "epoch": 0.2719049794426679, + "grad_norm": 4.34375, + "kl": 0.0, + "learning_rate": 8.300893385396116e-06, + "logits/chosen": 931433856.0, + "logits/rejected": 525931392.0, + "logps/chosen": -266.4215087890625, + "logps/rejected": -185.81475830078125, + "loss": 0.1292, + "rewards/chosen": 3.2606313228607178, + "rewards/margins": 8.784478902816772, + "rewards/rejected": -5.523847579956055, + "step": 2976 + }, + { + "epoch": 0.2719963453631795, + "grad_norm": 6.8125, + "kl": 0.0, + "learning_rate": 8.299813305122388e-06, + "logits/chosen": 498434099.2, + "logits/rejected": 409441962.6666667, + "logps/chosen": -309.800146484375, + "logps/rejected": -555.3503011067709, + "loss": 0.0489, + "rewards/chosen": 2.508297348022461, + "rewards/margins": 13.48269182840983, + "rewards/rejected": -10.97439448038737, + "step": 2977 + }, + { + "epoch": 0.2720877112836912, + "grad_norm": 2.140625, + "kl": 0.0, + "learning_rate": 8.29873295198467e-06, + "logits/chosen": 562191872.0, + "logits/rejected": 632782848.0, + "logps/chosen": -282.70916748046875, + "logps/rejected": -556.918212890625, + "loss": 0.0106, + "rewards/chosen": 4.059795379638672, + "rewards/margins": 13.862113952636719, + "rewards/rejected": -9.802318572998047, + "step": 2978 + }, + { + "epoch": 0.2721790772042028, + "grad_norm": 0.07421875, + "kl": 0.0, + "learning_rate": 8.297652326072301e-06, + "logits/rejected": 886741952.0, + "logps/rejected": -494.9320373535156, + "loss": 0.0002, + "rewards/rejected": -8.7496337890625, + "step": 2979 + }, + { + "epoch": 0.2722704431247145, + "grad_norm": 3.34375, + "kl": 0.0, + "learning_rate": 8.296571427474638e-06, + "logits/chosen": 520678080.0, + "logits/rejected": 586242880.0, + "logps/chosen": -349.01348876953125, + "logps/rejected": -591.7376098632812, + "loss": 0.0219, + "rewards/chosen": 3.180319309234619, + "rewards/margins": 12.887689113616943, + "rewards/rejected": -9.707369804382324, + "step": 2980 + }, + { + "epoch": 0.2723618090452261, + "grad_norm": 16.375, + "kl": 0.0, + "learning_rate": 8.295490256281063e-06, + "logits/chosen": 577868288.0, + "logits/rejected": 257818544.0, + "logps/chosen": -405.905517578125, + "logps/rejected": -313.4676513671875, + "loss": 0.1633, + "rewards/chosen": 1.6814907391866047, + "rewards/margins": 10.446147759755453, + "rewards/rejected": -8.764657020568848, + "step": 2981 + }, + { + "epoch": 0.2724531749657378, + "grad_norm": 3.421875, + "kl": 0.0, + "learning_rate": 8.294408812580975e-06, + "logits/chosen": 556131392.0, + "logits/rejected": 915875584.0, + "logps/chosen": -418.56103515625, + "logps/rejected": -812.1409912109375, + "loss": 0.0186, + "rewards/chosen": 3.62738299369812, + "rewards/margins": 14.73453688621521, + "rewards/rejected": -11.10715389251709, + "step": 2982 + }, + { + "epoch": 0.2725445408862494, + "grad_norm": 6.0625, + "kl": 0.0, + "learning_rate": 8.293327096463803e-06, + "logits/chosen": 508103253.3333333, + "logits/rejected": 1101589504.0, + "logps/chosen": -263.1458740234375, + "logps/rejected": -466.6146240234375, + "loss": 0.0377, + "rewards/chosen": 3.6828956604003906, + "rewards/margins": 11.84237003326416, + "rewards/rejected": -8.15947437286377, + "step": 2983 + }, + { + "epoch": 0.2726359068067611, + "grad_norm": 2.9375, + "kl": 0.0, + "learning_rate": 8.292245108018993e-06, + "logits/chosen": 718068138.6666666, + "logits/rejected": 688646144.0, + "logps/chosen": -343.330810546875, + "logps/rejected": -466.927392578125, + "loss": 0.0169, + "rewards/chosen": 3.1559251149495444, + "rewards/margins": 11.579995854695639, + "rewards/rejected": -8.424070739746094, + "step": 2984 + }, + { + "epoch": 0.2727272727272727, + "grad_norm": 1.46875, + "kl": 0.0, + "learning_rate": 8.291162847336016e-06, + "logits/chosen": 521817024.0, + "logits/rejected": 381184864.0, + "logps/chosen": -411.9560241699219, + "logps/rejected": -500.6103515625, + "loss": 0.0109, + "rewards/chosen": 4.200549125671387, + "rewards/margins": 12.012056827545166, + "rewards/rejected": -7.811507701873779, + "step": 2985 + }, + { + "epoch": 0.2728186386477844, + "grad_norm": 1.1796875, + "kl": 0.0, + "learning_rate": 8.290080314504364e-06, + "logits/chosen": 412503008.0, + "logits/rejected": 329497526.85714287, + "logps/chosen": -441.9188232421875, + "logps/rejected": -326.1393345424107, + "loss": 0.0042, + "rewards/chosen": 3.478472948074341, + "rewards/margins": 12.233166115624565, + "rewards/rejected": -8.754693167550224, + "step": 2986 + }, + { + "epoch": 0.272910004568296, + "grad_norm": 38.5, + "kl": 0.0, + "learning_rate": 8.288997509613554e-06, + "logits/chosen": 572914624.0, + "logits/rejected": 507972672.0, + "logps/chosen": -387.27203369140625, + "logps/rejected": -353.75250244140625, + "loss": 0.052, + "rewards/chosen": 2.7819528579711914, + "rewards/margins": 11.41301441192627, + "rewards/rejected": -8.631061553955078, + "step": 2987 + }, + { + "epoch": 0.2730013704888077, + "grad_norm": 6.0625, + "kl": 0.0, + "learning_rate": 8.287914432753123e-06, + "logits/chosen": 584451584.0, + "logits/rejected": 1384470869.3333333, + "logps/chosen": -289.411474609375, + "logps/rejected": -651.021484375, + "loss": 0.037, + "rewards/chosen": 3.211670684814453, + "rewards/margins": 11.862459945678712, + "rewards/rejected": -8.650789260864258, + "step": 2988 + }, + { + "epoch": 0.2730927364093193, + "grad_norm": 1.515625, + "kl": 0.0, + "learning_rate": 8.286831084012632e-06, + "logits/chosen": 787578965.3333334, + "logits/rejected": 550449766.4, + "logps/chosen": -444.0046793619792, + "logps/rejected": -473.1447265625, + "loss": 0.0084, + "rewards/chosen": 3.8956289291381836, + "rewards/margins": 11.64543285369873, + "rewards/rejected": -7.749803924560547, + "step": 2989 + }, + { + "epoch": 0.273184102329831, + "grad_norm": 3.40625, + "kl": 0.0, + "learning_rate": 8.285747463481664e-06, + "logits/chosen": 353610816.0, + "logits/rejected": 796365926.4, + "logps/chosen": -175.76798502604166, + "logps/rejected": -339.067724609375, + "loss": 0.024, + "rewards/chosen": 3.0945409138997397, + "rewards/margins": 11.806471761067709, + "rewards/rejected": -8.711930847167968, + "step": 2990 + }, + { + "epoch": 0.2732754682503426, + "grad_norm": 3.390625, + "kl": 0.0, + "learning_rate": 8.284663571249822e-06, + "logits/chosen": 604471978.6666666, + "logits/rejected": 621192832.0, + "logps/chosen": -277.57175699869794, + "logps/rejected": -848.8247680664062, + "loss": 0.0305, + "rewards/chosen": 3.530178705851237, + "rewards/margins": 11.691769282023111, + "rewards/rejected": -8.161590576171875, + "step": 2991 + }, + { + "epoch": 0.2733668341708543, + "grad_norm": 35.5, + "kl": 0.0, + "learning_rate": 8.283579407406738e-06, + "logits/chosen": 666068821.3333334, + "logits/rejected": 295164800.0, + "logps/chosen": -320.64475504557294, + "logps/rejected": -189.6558837890625, + "loss": 0.0663, + "rewards/chosen": 2.8848292032877603, + "rewards/margins": 10.237959543863932, + "rewards/rejected": -7.353130340576172, + "step": 2992 + }, + { + "epoch": 0.2734582000913659, + "grad_norm": 64.5, + "kl": 0.0, + "learning_rate": 8.282494972042058e-06, + "logits/chosen": 1080347776.0, + "logits/rejected": 692015872.0, + "logps/chosen": -361.6857604980469, + "logps/rejected": -418.59063720703125, + "loss": 0.0592, + "rewards/chosen": 4.3080573081970215, + "rewards/margins": 11.928131103515625, + "rewards/rejected": -7.6200737953186035, + "step": 2993 + }, + { + "epoch": 0.2735495660118776, + "grad_norm": 10.0625, + "kl": 0.0, + "learning_rate": 8.281410265245458e-06, + "logits/chosen": 1185085235.2, + "logits/rejected": 393549568.0, + "logps/chosen": -245.8036376953125, + "logps/rejected": -379.6570638020833, + "loss": 0.0885, + "rewards/chosen": 2.8308677673339844, + "rewards/margins": 11.44717534383138, + "rewards/rejected": -8.616307576497396, + "step": 2994 + }, + { + "epoch": 0.2736409319323892, + "grad_norm": 10.4375, + "kl": 0.0, + "learning_rate": 8.280325287106631e-06, + "logits/chosen": 316476885.3333333, + "logits/rejected": 453022688.0, + "logps/chosen": -356.2127278645833, + "logps/rejected": -249.662841796875, + "loss": 0.042, + "rewards/chosen": 3.365650177001953, + "rewards/margins": 11.15935230255127, + "rewards/rejected": -7.793702125549316, + "step": 2995 + }, + { + "epoch": 0.2737322978529009, + "grad_norm": 37.75, + "kl": 0.0, + "learning_rate": 8.279240037715297e-06, + "logits/chosen": 543887040.0, + "logits/rejected": 237762128.0, + "logps/chosen": -382.07611083984375, + "logps/rejected": -341.70343017578125, + "loss": 0.1007, + "rewards/chosen": 3.001251697540283, + "rewards/margins": 9.078238010406494, + "rewards/rejected": -6.076986312866211, + "step": 2996 + }, + { + "epoch": 0.2738236637734125, + "grad_norm": 2.046875, + "kl": 0.0, + "learning_rate": 8.278154517161194e-06, + "logits/chosen": 317073749.3333333, + "logits/rejected": 479463116.8, + "logps/chosen": -216.6991170247396, + "logps/rejected": -460.34326171875, + "loss": 0.0086, + "rewards/chosen": 4.667289733886719, + "rewards/margins": 13.18208999633789, + "rewards/rejected": -8.514800262451171, + "step": 2997 + }, + { + "epoch": 0.2739150296939242, + "grad_norm": 4.28125, + "kl": 0.0, + "learning_rate": 8.277068725534085e-06, + "logits/chosen": 986701414.4, + "logits/rejected": 458096896.0, + "logps/chosen": -325.1587646484375, + "logps/rejected": -431.7528483072917, + "loss": 0.0289, + "rewards/chosen": 3.5773845672607423, + "rewards/margins": 13.858637110392252, + "rewards/rejected": -10.28125254313151, + "step": 2998 + }, + { + "epoch": 0.2740063956144358, + "grad_norm": 3.765625, + "kl": 0.0, + "learning_rate": 8.275982662923756e-06, + "logits/chosen": 483785045.3333333, + "logits/rejected": 224925600.0, + "logps/chosen": -324.4012044270833, + "logps/rejected": -404.6444396972656, + "loss": 0.024, + "rewards/chosen": 3.8570032119750977, + "rewards/margins": 14.050719261169434, + "rewards/rejected": -10.193716049194336, + "step": 2999 + }, + { + "epoch": 0.27409776153494747, + "grad_norm": 1.7734375, + "kl": 0.0, + "learning_rate": 8.274896329420014e-06, + "logits/chosen": 462525235.2, + "logits/rejected": 542564010.6666666, + "logps/chosen": -247.1737060546875, + "logps/rejected": -439.04296875, + "loss": 0.0133, + "rewards/chosen": 4.154936981201172, + "rewards/margins": 13.643064371744792, + "rewards/rejected": -9.48812739054362, + "step": 3000 + }, + { + "epoch": 0.2741891274554591, + "grad_norm": 4.21875, + "kl": 0.0, + "learning_rate": 8.273809725112687e-06, + "logits/chosen": 634639462.4, + "logits/rejected": 1114350165.3333333, + "logps/chosen": -369.9818603515625, + "logps/rejected": -573.2442220052084, + "loss": 0.0276, + "rewards/chosen": 3.4287078857421873, + "rewards/margins": 13.481059519449868, + "rewards/rejected": -10.052351633707682, + "step": 3001 + }, + { + "epoch": 0.27428049337597077, + "grad_norm": 3.140625, + "kl": 0.0, + "learning_rate": 8.272722850091629e-06, + "logits/chosen": 621178112.0, + "logits/rejected": 522709504.0, + "logps/chosen": -293.45953369140625, + "logps/rejected": -736.23876953125, + "loss": 0.0182, + "rewards/chosen": 4.222645282745361, + "rewards/margins": 13.81675386428833, + "rewards/rejected": -9.594108581542969, + "step": 3002 + }, + { + "epoch": 0.2743718592964824, + "grad_norm": 20.0, + "kl": 0.0, + "learning_rate": 8.271635704446712e-06, + "logits/chosen": 308633472.0, + "logits/rejected": 406823577.6, + "logps/chosen": -280.4785970052083, + "logps/rejected": -417.703857421875, + "loss": 0.0734, + "rewards/chosen": 3.673764228820801, + "rewards/margins": 9.90003604888916, + "rewards/rejected": -6.2262718200683596, + "step": 3003 + }, + { + "epoch": 0.27446322521699407, + "grad_norm": 1.8359375, + "kl": 0.0, + "learning_rate": 8.270548288267836e-06, + "logits/chosen": 667564245.3333334, + "logits/rejected": 614170009.6, + "logps/chosen": -406.94287109375, + "logps/rejected": -302.070556640625, + "loss": 0.0091, + "rewards/chosen": 3.8127406438191733, + "rewards/margins": 12.138909467061362, + "rewards/rejected": -8.326168823242188, + "step": 3004 + }, + { + "epoch": 0.2745545911375057, + "grad_norm": 25.5, + "kl": 0.0, + "learning_rate": 8.269460601644919e-06, + "logits/chosen": 277693504.0, + "logits/rejected": 466240213.3333333, + "logps/chosen": -113.97335815429688, + "logps/rejected": -540.0859375, + "loss": 0.0398, + "rewards/chosen": 1.8177683353424072, + "rewards/margins": 9.834719578425089, + "rewards/rejected": -8.016951243082682, + "step": 3005 + }, + { + "epoch": 0.27464595705801736, + "grad_norm": 2.015625, + "kl": 0.0, + "learning_rate": 8.268372644667901e-06, + "logits/chosen": 786293248.0, + "logits/rejected": 423576704.0, + "logps/chosen": -264.7911376953125, + "logps/rejected": -444.6990966796875, + "loss": 0.0098, + "rewards/chosen": 3.2987382411956787, + "rewards/margins": 12.119615316390991, + "rewards/rejected": -8.820877075195312, + "step": 3006 + }, + { + "epoch": 0.274737322978529, + "grad_norm": 1.84375, + "kl": 0.0, + "learning_rate": 8.267284417426748e-06, + "logits/chosen": 328358720.0, + "logits/rejected": 384542880.0, + "logps/chosen": -254.16165161132812, + "logps/rejected": -474.9385986328125, + "loss": 0.0144, + "rewards/chosen": 4.080447673797607, + "rewards/margins": 11.97796106338501, + "rewards/rejected": -7.897513389587402, + "step": 3007 + }, + { + "epoch": 0.27482868889904066, + "grad_norm": 3.71875, + "kl": 0.0, + "learning_rate": 8.266195920011446e-06, + "logits/chosen": 700273834.6666666, + "logits/rejected": 334545152.0, + "logps/chosen": -263.0593668619792, + "logps/rejected": -309.2514953613281, + "loss": 0.0361, + "rewards/chosen": 3.2284463246663413, + "rewards/margins": 12.086048444112143, + "rewards/rejected": -8.8576021194458, + "step": 3008 + }, + { + "epoch": 0.2749200548195523, + "grad_norm": 3.515625, + "kl": 0.0, + "learning_rate": 8.265107152512005e-06, + "logits/chosen": 637111872.0, + "logits/rejected": 430900288.0, + "logps/chosen": -205.4044189453125, + "logps/rejected": -583.6731567382812, + "loss": 0.0208, + "rewards/chosen": 3.4344258308410645, + "rewards/margins": 12.382629871368408, + "rewards/rejected": -8.948204040527344, + "step": 3009 + }, + { + "epoch": 0.27501142074006396, + "grad_norm": 1.765625, + "kl": 0.0, + "learning_rate": 8.264018115018452e-06, + "logits/chosen": 605290240.0, + "logits/rejected": 409111765.3333333, + "logps/chosen": -377.9910888671875, + "logps/rejected": -343.9248860677083, + "loss": 0.0095, + "rewards/chosen": 4.331541538238525, + "rewards/margins": 11.744242191314697, + "rewards/rejected": -7.412700653076172, + "step": 3010 + }, + { + "epoch": 0.2751027866605756, + "grad_norm": 9.0, + "kl": 0.0, + "learning_rate": 8.262928807620843e-06, + "logits/chosen": 326711232.0, + "logits/rejected": 363300821.3333333, + "logps/chosen": -251.199951171875, + "logps/rejected": -506.9580485026042, + "loss": 0.0466, + "rewards/chosen": 4.419526100158691, + "rewards/margins": 11.406873385111492, + "rewards/rejected": -6.9873472849528, + "step": 3011 + }, + { + "epoch": 0.27519415258108726, + "grad_norm": 4.15625, + "kl": 0.0, + "learning_rate": 8.261839230409255e-06, + "logits/chosen": 740385344.0, + "logits/rejected": 299242656.0, + "logps/chosen": -630.392822265625, + "logps/rejected": -552.1110229492188, + "loss": 0.0159, + "rewards/chosen": 3.793107032775879, + "rewards/margins": 13.827548027038574, + "rewards/rejected": -10.034440994262695, + "step": 3012 + }, + { + "epoch": 0.2752855185015989, + "grad_norm": 2.5625, + "kl": 0.0, + "learning_rate": 8.260749383473783e-06, + "logits/chosen": 664630912.0, + "logits/rejected": 715401728.0, + "logps/chosen": -268.317626953125, + "logps/rejected": -467.20635986328125, + "loss": 0.0145, + "rewards/chosen": 3.878678321838379, + "rewards/margins": 11.976584434509277, + "rewards/rejected": -8.097906112670898, + "step": 3013 + }, + { + "epoch": 0.27537688442211056, + "grad_norm": 3.984375, + "kl": 0.0, + "learning_rate": 8.25965926690455e-06, + "logits/chosen": 629540266.6666666, + "logits/rejected": 826802944.0, + "logps/chosen": -526.8398844401041, + "logps/rejected": -508.357421875, + "loss": 0.018, + "rewards/chosen": 3.0334513982137046, + "rewards/margins": 12.884363110860189, + "rewards/rejected": -9.850911712646484, + "step": 3014 + }, + { + "epoch": 0.2754682503426222, + "grad_norm": 8.5625, + "kl": 0.0, + "learning_rate": 8.258568880791694e-06, + "logits/chosen": 821305856.0, + "logits/rejected": 729981440.0, + "logps/chosen": -447.5176086425781, + "logps/rejected": -342.926025390625, + "loss": 0.1305, + "rewards/chosen": 3.6213607788085938, + "rewards/margins": 9.26446533203125, + "rewards/rejected": -5.643104553222656, + "step": 3015 + }, + { + "epoch": 0.27555961626313386, + "grad_norm": 23.875, + "kl": 0.0, + "learning_rate": 8.257478225225385e-06, + "logits/chosen": 407740992.0, + "logits/rejected": 949295396.5714285, + "logps/chosen": -62.544776916503906, + "logps/rejected": -510.43404715401783, + "loss": 0.0588, + "rewards/chosen": 0.13784562051296234, + "rewards/margins": 8.904250998582159, + "rewards/rejected": -8.766405378069196, + "step": 3016 + }, + { + "epoch": 0.2756509821836455, + "grad_norm": 4.78125, + "kl": 0.0, + "learning_rate": 8.256387300295808e-06, + "logits/chosen": 215486016.0, + "logits/rejected": 525184307.2, + "logps/chosen": -314.04079182942706, + "logps/rejected": -772.572216796875, + "loss": 0.0249, + "rewards/chosen": 3.253956158955892, + "rewards/margins": 12.40628236134847, + "rewards/rejected": -9.152326202392578, + "step": 3017 + }, + { + "epoch": 0.27574234810415715, + "grad_norm": 5.09375, + "kl": 0.0, + "learning_rate": 8.255296106093172e-06, + "logits/chosen": 501772185.6, + "logits/rejected": 780756565.3333334, + "logps/chosen": -359.179443359375, + "logps/rejected": -501.5526123046875, + "loss": 0.0343, + "rewards/chosen": 2.955794906616211, + "rewards/margins": 10.40544090270996, + "rewards/rejected": -7.44964599609375, + "step": 3018 + }, + { + "epoch": 0.2758337140246688, + "grad_norm": 14.875, + "kl": 0.0, + "learning_rate": 8.254204642707708e-06, + "logits/chosen": 382198848.0, + "logits/rejected": 526703424.0, + "logps/chosen": -240.49365234375, + "logps/rejected": -551.9190063476562, + "loss": 0.0485, + "rewards/chosen": 3.1713433265686035, + "rewards/margins": 11.376351833343506, + "rewards/rejected": -8.205008506774902, + "step": 3019 + }, + { + "epoch": 0.27592507994518045, + "grad_norm": 3.109375, + "kl": 0.0, + "learning_rate": 8.253112910229672e-06, + "logits/chosen": 890296832.0, + "logits/rejected": 550267072.0, + "logps/chosen": -355.7593994140625, + "logps/rejected": -350.42816162109375, + "loss": 0.0223, + "rewards/chosen": 3.8954925537109375, + "rewards/margins": 11.240259170532227, + "rewards/rejected": -7.344766616821289, + "step": 3020 + }, + { + "epoch": 0.2760164458656921, + "grad_norm": 3.21875, + "kl": 0.0, + "learning_rate": 8.252020908749338e-06, + "logits/chosen": 1006282922.6666666, + "logits/rejected": 297175705.6, + "logps/chosen": -328.2762044270833, + "logps/rejected": -319.351123046875, + "loss": 0.0247, + "rewards/chosen": 2.855238914489746, + "rewards/margins": 10.701852989196777, + "rewards/rejected": -7.846614074707031, + "step": 3021 + }, + { + "epoch": 0.27610781178620375, + "grad_norm": 1.171875, + "kl": 0.0, + "learning_rate": 8.250928638357008e-06, + "logits/chosen": 539024384.0, + "logits/rejected": 433879808.0, + "logps/chosen": -408.14202880859375, + "logps/rejected": -428.8060607910156, + "loss": 0.0061, + "rewards/chosen": 4.489904403686523, + "rewards/margins": 13.89939022064209, + "rewards/rejected": -9.409485816955566, + "step": 3022 + }, + { + "epoch": 0.27619917770671537, + "grad_norm": 5.21875, + "kl": 0.0, + "learning_rate": 8.249836099142998e-06, + "logits/chosen": 533530521.6, + "logits/rejected": 454828245.3333333, + "logps/chosen": -318.7625, + "logps/rejected": -606.7555338541666, + "loss": 0.0424, + "rewards/chosen": 2.6567672729492187, + "rewards/margins": 11.553355916341147, + "rewards/rejected": -8.896588643391928, + "step": 3023 + }, + { + "epoch": 0.27629054362722705, + "grad_norm": 6.03125, + "kl": 0.0, + "learning_rate": 8.248743291197654e-06, + "logits/chosen": 755536768.0, + "logits/rejected": 460869973.3333333, + "logps/chosen": -215.04039001464844, + "logps/rejected": -501.7433268229167, + "loss": 0.0468, + "rewards/chosen": 2.017307758331299, + "rewards/margins": 10.59021806716919, + "rewards/rejected": -8.57291030883789, + "step": 3024 + }, + { + "epoch": 0.27638190954773867, + "grad_norm": 8.125, + "kl": 0.0, + "learning_rate": 8.247650214611339e-06, + "logits/chosen": 762374741.3333334, + "logits/rejected": 640589260.8, + "logps/chosen": -420.4742838541667, + "logps/rejected": -465.31845703125, + "loss": 0.0176, + "rewards/chosen": 3.295142491658529, + "rewards/margins": 11.822955449422201, + "rewards/rejected": -8.527812957763672, + "step": 3025 + }, + { + "epoch": 0.27647327546825035, + "grad_norm": 42.75, + "kl": 0.0, + "learning_rate": 8.246556869474443e-06, + "logits/chosen": 468603946.6666667, + "logits/rejected": 170586720.0, + "logps/chosen": -367.8457845052083, + "logps/rejected": -564.8251342773438, + "loss": 0.0585, + "rewards/chosen": 3.0708510080973306, + "rewards/margins": 10.983275572458902, + "rewards/rejected": -7.912424564361572, + "step": 3026 + }, + { + "epoch": 0.27656464138876197, + "grad_norm": 5.0625, + "kl": 0.0, + "learning_rate": 8.245463255877372e-06, + "logits/chosen": 572859392.0, + "logits/rejected": 406139946.6666667, + "logps/chosen": -339.844970703125, + "logps/rejected": -400.1909586588542, + "loss": 0.0315, + "rewards/chosen": 3.2904449462890626, + "rewards/margins": 11.95121726989746, + "rewards/rejected": -8.660772323608398, + "step": 3027 + }, + { + "epoch": 0.27665600730927364, + "grad_norm": 2.734375, + "kl": 0.0, + "learning_rate": 8.244369373910562e-06, + "logits/chosen": 601015488.0, + "logits/rejected": 570812245.3333334, + "logps/chosen": -437.6434631347656, + "logps/rejected": -545.1154378255209, + "loss": 0.0113, + "rewards/chosen": 3.2789430618286133, + "rewards/margins": 11.453005154927572, + "rewards/rejected": -8.174062093098959, + "step": 3028 + }, + { + "epoch": 0.27674737322978527, + "grad_norm": 2.03125, + "kl": 0.0, + "learning_rate": 8.243275223664462e-06, + "logits/chosen": 436814950.4, + "logits/rejected": 392174805.3333333, + "logps/chosen": -315.9781494140625, + "logps/rejected": -348.2694498697917, + "loss": 0.0118, + "rewards/chosen": 4.103354644775391, + "rewards/margins": 14.220769755045573, + "rewards/rejected": -10.117415110270182, + "step": 3029 + }, + { + "epoch": 0.27683873915029694, + "grad_norm": 6.8125, + "kl": 0.0, + "learning_rate": 8.242180805229551e-06, + "logits/chosen": 688450560.0, + "logits/rejected": 456938020.5714286, + "logps/chosen": -527.5379638671875, + "logps/rejected": -415.77364676339283, + "loss": 0.0159, + "rewards/chosen": 4.027203559875488, + "rewards/margins": 11.292558806283132, + "rewards/rejected": -7.265355246407645, + "step": 3030 + }, + { + "epoch": 0.27693010507080856, + "grad_norm": 1.7578125, + "kl": 0.0, + "learning_rate": 8.24108611869633e-06, + "logits/chosen": 490133248.0, + "logits/rejected": 398569152.0, + "logps/chosen": -382.60125732421875, + "logps/rejected": -414.35821533203125, + "loss": 0.0097, + "rewards/chosen": 4.582969665527344, + "rewards/margins": 13.000558853149414, + "rewards/rejected": -8.41758918762207, + "step": 3031 + }, + { + "epoch": 0.27702147099132024, + "grad_norm": 2.5625, + "kl": 0.0, + "learning_rate": 8.239991164155312e-06, + "logits/chosen": 523274272.0, + "logits/rejected": 950484992.0, + "logps/chosen": -289.780517578125, + "logps/rejected": -626.9597981770834, + "loss": 0.0121, + "rewards/chosen": 3.217080593109131, + "rewards/margins": 10.894664923350017, + "rewards/rejected": -7.677584330240886, + "step": 3032 + }, + { + "epoch": 0.27711283691183186, + "grad_norm": 4.3125, + "kl": 0.0, + "learning_rate": 8.238895941697048e-06, + "logits/chosen": 517434922.6666667, + "logits/rejected": 399853248.0, + "logps/chosen": -353.6717529296875, + "logps/rejected": -393.44622802734375, + "loss": 0.0343, + "rewards/chosen": 3.1891002655029297, + "rewards/margins": 10.851470947265625, + "rewards/rejected": -7.662370681762695, + "step": 3033 + }, + { + "epoch": 0.27720420283234354, + "grad_norm": 4.78125, + "kl": 0.0, + "learning_rate": 8.237800451412095e-06, + "logits/chosen": 501251712.0, + "logits/rejected": 473178675.2, + "logps/chosen": -357.6527913411458, + "logps/rejected": -575.3818359375, + "loss": 0.0257, + "rewards/chosen": 2.8833274841308594, + "rewards/margins": 12.945463562011719, + "rewards/rejected": -10.06213607788086, + "step": 3034 + }, + { + "epoch": 0.27729556875285516, + "grad_norm": 11.5625, + "kl": 0.0, + "learning_rate": 8.236704693391045e-06, + "logits/chosen": 510442592.0, + "logits/rejected": 437766560.0, + "logps/chosen": -312.7357482910156, + "logps/rejected": -489.0511169433594, + "loss": 0.0854, + "rewards/chosen": 3.7081217765808105, + "rewards/margins": 9.047075748443604, + "rewards/rejected": -5.338953971862793, + "step": 3035 + }, + { + "epoch": 0.27738693467336684, + "grad_norm": 1.9296875, + "kl": 0.0, + "learning_rate": 8.235608667724507e-06, + "logits/chosen": 701962496.0, + "logits/rejected": 515916288.0, + "logps/chosen": -360.0957336425781, + "logps/rejected": -394.937255859375, + "loss": 0.014, + "rewards/chosen": 3.074573040008545, + "rewards/margins": 11.079108079274496, + "rewards/rejected": -8.004535039265951, + "step": 3036 + }, + { + "epoch": 0.27747830059387846, + "grad_norm": 2.234375, + "kl": 0.0, + "learning_rate": 8.234512374503107e-06, + "logits/chosen": 1351571712.0, + "logits/rejected": 842874944.0, + "logps/chosen": -259.3297119140625, + "logps/rejected": -338.26251220703125, + "loss": 0.0127, + "rewards/chosen": 4.046699523925781, + "rewards/margins": 11.565753936767578, + "rewards/rejected": -7.519054412841797, + "step": 3037 + }, + { + "epoch": 0.27756966651439013, + "grad_norm": 3.15625, + "kl": 0.0, + "learning_rate": 8.233415813817507e-06, + "logits/chosen": 637361971.2, + "logits/rejected": 264238250.66666666, + "logps/chosen": -415.27998046875, + "logps/rejected": -394.2686360677083, + "loss": 0.0202, + "rewards/chosen": 3.7172592163085936, + "rewards/margins": 13.288819122314454, + "rewards/rejected": -9.57155990600586, + "step": 3038 + }, + { + "epoch": 0.27766103243490176, + "grad_norm": 4.34375, + "kl": 0.0, + "learning_rate": 8.232318985758374e-06, + "logits/chosen": 747503396.5714285, + "logits/rejected": 713389056.0, + "logps/chosen": -410.57700892857144, + "logps/rejected": -666.899169921875, + "loss": 0.0304, + "rewards/chosen": 3.708477020263672, + "rewards/margins": 13.513250350952148, + "rewards/rejected": -9.804773330688477, + "step": 3039 + }, + { + "epoch": 0.27775239835541343, + "grad_norm": 6.0, + "kl": 0.0, + "learning_rate": 8.23122189041641e-06, + "logits/chosen": 619083264.0, + "logits/rejected": 412864032.0, + "logps/chosen": -458.5518798828125, + "logps/rejected": -467.3875427246094, + "loss": 0.1014, + "rewards/chosen": 3.783900499343872, + "rewards/margins": 9.486751317977905, + "rewards/rejected": -5.702850818634033, + "step": 3040 + }, + { + "epoch": 0.27784376427592505, + "grad_norm": 2.109375, + "kl": 0.0, + "learning_rate": 8.230124527882332e-06, + "logits/chosen": 1200193024.0, + "logits/rejected": 500309418.6666667, + "logps/chosen": -432.266162109375, + "logps/rejected": -399.612548828125, + "loss": 0.0139, + "rewards/chosen": 4.236491394042969, + "rewards/margins": 13.732925923665366, + "rewards/rejected": -9.496434529622396, + "step": 3041 + }, + { + "epoch": 0.27793513019643673, + "grad_norm": 3.859375, + "kl": 0.0, + "learning_rate": 8.229026898246885e-06, + "logits/chosen": 695282176.0, + "logits/rejected": 344405205.3333333, + "logps/chosen": -428.03623046875, + "logps/rejected": -354.0045166015625, + "loss": 0.0267, + "rewards/chosen": 3.3905155181884767, + "rewards/margins": 9.93524538675944, + "rewards/rejected": -6.544729868570964, + "step": 3042 + }, + { + "epoch": 0.27802649611694835, + "grad_norm": 1.875, + "kl": 0.0, + "learning_rate": 8.227929001600832e-06, + "logits/chosen": 923421440.0, + "logits/rejected": 593213269.3333334, + "logps/chosen": -271.38043212890625, + "logps/rejected": -567.5873209635416, + "loss": 0.0079, + "rewards/chosen": 3.4981918334960938, + "rewards/margins": 13.461596171061197, + "rewards/rejected": -9.963404337565104, + "step": 3043 + }, + { + "epoch": 0.27811786203746003, + "grad_norm": 8.4375, + "kl": 0.0, + "learning_rate": 8.226830838034958e-06, + "logits/chosen": 507447082.6666667, + "logits/rejected": 378473728.0, + "logps/chosen": -398.4451497395833, + "logps/rejected": -762.0123901367188, + "loss": 0.0589, + "rewards/chosen": 2.7635927200317383, + "rewards/margins": 14.294753074645996, + "rewards/rejected": -11.531160354614258, + "step": 3044 + }, + { + "epoch": 0.27820922795797165, + "grad_norm": 3.28125, + "kl": 0.0, + "learning_rate": 8.225732407640068e-06, + "logits/chosen": 690552277.3333334, + "logits/rejected": 801779097.6, + "logps/chosen": -303.2985432942708, + "logps/rejected": -614.02900390625, + "loss": 0.0213, + "rewards/chosen": 2.983924229939779, + "rewards/margins": 14.475528844197592, + "rewards/rejected": -11.491604614257813, + "step": 3045 + }, + { + "epoch": 0.2783005938784833, + "grad_norm": 2.3125, + "kl": 0.0, + "learning_rate": 8.224633710506997e-06, + "logits/chosen": 715797845.3333334, + "logits/rejected": 365355571.2, + "logps/chosen": -316.11822509765625, + "logps/rejected": -315.408642578125, + "loss": 0.015, + "rewards/chosen": 3.6460208892822266, + "rewards/margins": 11.662925338745117, + "rewards/rejected": -8.01690444946289, + "step": 3046 + }, + { + "epoch": 0.27839195979899495, + "grad_norm": 1.3203125, + "kl": 0.0, + "learning_rate": 8.223534746726596e-06, + "logits/chosen": 176424752.0, + "logits/rejected": 348141952.0, + "logps/chosen": -147.29086303710938, + "logps/rejected": -553.2702026367188, + "loss": 0.0096, + "rewards/chosen": 4.140588283538818, + "rewards/margins": 15.097604274749756, + "rewards/rejected": -10.957015991210938, + "step": 3047 + }, + { + "epoch": 0.2784833257195066, + "grad_norm": 2.828125, + "kl": 0.0, + "learning_rate": 8.222435516389737e-06, + "logits/chosen": 592988032.0, + "logits/rejected": 618847872.0, + "logps/chosen": -363.44647216796875, + "logps/rejected": -706.9037475585938, + "loss": 0.0203, + "rewards/chosen": 3.353306770324707, + "rewards/margins": 15.589367866516113, + "rewards/rejected": -12.236061096191406, + "step": 3048 + }, + { + "epoch": 0.27857469164001825, + "grad_norm": 2.125, + "kl": 0.0, + "learning_rate": 8.221336019587317e-06, + "logits/chosen": 726425856.0, + "logits/rejected": 523749961.14285713, + "logps/chosen": -565.510986328125, + "logps/rejected": -608.8763950892857, + "loss": 0.0064, + "rewards/chosen": 2.9336793422698975, + "rewards/margins": 13.620307411466326, + "rewards/rejected": -10.686628069196429, + "step": 3049 + }, + { + "epoch": 0.2786660575605299, + "grad_norm": 1.0390625, + "kl": 0.0, + "learning_rate": 8.220236256410254e-06, + "logits/chosen": 417774528.0, + "logits/rejected": 567957589.3333334, + "logps/chosen": -401.390380859375, + "logps/rejected": -490.2533365885417, + "loss": 0.0044, + "rewards/chosen": 4.102418899536133, + "rewards/margins": 13.188144048055014, + "rewards/rejected": -9.08572514851888, + "step": 3050 + }, + { + "epoch": 0.27875742348104154, + "grad_norm": 4.3125, + "kl": 0.0, + "learning_rate": 8.21913622694949e-06, + "logits/chosen": 749554048.0, + "logits/rejected": 705521536.0, + "logps/chosen": -220.0754852294922, + "logps/rejected": -748.24169921875, + "loss": 0.0362, + "rewards/chosen": 2.776184558868408, + "rewards/margins": 13.249642848968506, + "rewards/rejected": -10.473458290100098, + "step": 3051 + }, + { + "epoch": 0.2788487894015532, + "grad_norm": 3.6875, + "kl": 0.0, + "learning_rate": 8.218035931295985e-06, + "logits/chosen": 606063872.0, + "logits/rejected": 372393856.0, + "logps/chosen": -404.8547119140625, + "logps/rejected": -259.8204752604167, + "loss": 0.0248, + "rewards/chosen": 3.4114990234375, + "rewards/margins": 10.935605367024738, + "rewards/rejected": -7.524106343587239, + "step": 3052 + }, + { + "epoch": 0.27894015532206484, + "grad_norm": 24.25, + "kl": 0.0, + "learning_rate": 8.216935369540725e-06, + "logits/chosen": 387204352.0, + "logits/rejected": 410895552.0, + "logps/chosen": -231.29812622070312, + "logps/rejected": -537.8119506835938, + "loss": 0.0962, + "rewards/chosen": 2.5911669731140137, + "rewards/margins": 9.958889484405518, + "rewards/rejected": -7.367722511291504, + "step": 3053 + }, + { + "epoch": 0.2790315212425765, + "grad_norm": 3.34375, + "kl": 0.0, + "learning_rate": 8.215834541774716e-06, + "logits/chosen": 658627413.3333334, + "logits/rejected": 463591392.0, + "logps/chosen": -375.6747639973958, + "logps/rejected": -588.4500122070312, + "loss": 0.0312, + "rewards/chosen": 3.5979512532552085, + "rewards/margins": 13.100493748982748, + "rewards/rejected": -9.502542495727539, + "step": 3054 + }, + { + "epoch": 0.2791228871630882, + "grad_norm": 8.625, + "kl": 0.0, + "learning_rate": 8.214733448088986e-06, + "logits/chosen": 759733811.2, + "logits/rejected": 650738944.0, + "logps/chosen": -448.058251953125, + "logps/rejected": -557.95556640625, + "loss": 0.0326, + "rewards/chosen": 3.229508972167969, + "rewards/margins": 9.26265614827474, + "rewards/rejected": -6.0331471761067705, + "step": 3055 + }, + { + "epoch": 0.2792142530835998, + "grad_norm": 23.125, + "kl": 0.0, + "learning_rate": 8.213632088574584e-06, + "logits/chosen": 596963840.0, + "logits/rejected": 624620544.0, + "logps/chosen": -396.520068359375, + "logps/rejected": -483.4095052083333, + "loss": 0.0504, + "rewards/chosen": 3.3279720306396485, + "rewards/margins": 9.811462656656902, + "rewards/rejected": -6.483490626017253, + "step": 3056 + }, + { + "epoch": 0.2793056190041115, + "grad_norm": 8.75, + "kl": 0.0, + "learning_rate": 8.212530463322584e-06, + "logits/chosen": 541342464.0, + "logits/rejected": 314302016.0, + "logps/chosen": -296.39263916015625, + "logps/rejected": -376.4380187988281, + "loss": 0.1227, + "rewards/chosen": 3.0559964179992676, + "rewards/margins": 8.693970203399658, + "rewards/rejected": -5.637973785400391, + "step": 3057 + }, + { + "epoch": 0.2793969849246231, + "grad_norm": 1.3671875, + "kl": 0.0, + "learning_rate": 8.211428572424079e-06, + "logits/chosen": 399188992.0, + "logits/rejected": 638532736.0, + "logps/chosen": -247.99603271484375, + "logps/rejected": -652.48583984375, + "loss": 0.0075, + "rewards/chosen": 4.498039245605469, + "rewards/margins": 14.996051788330078, + "rewards/rejected": -10.49801254272461, + "step": 3058 + }, + { + "epoch": 0.2794883508451348, + "grad_norm": 7.1875, + "kl": 0.0, + "learning_rate": 8.210326415970185e-06, + "logits/chosen": 510855338.6666667, + "logits/rejected": 307865408.0, + "logps/chosen": -329.26975504557294, + "logps/rejected": -353.72161865234375, + "loss": 0.0616, + "rewards/chosen": 2.5757875442504883, + "rewards/margins": 12.197335243225098, + "rewards/rejected": -9.62154769897461, + "step": 3059 + }, + { + "epoch": 0.2795797167656464, + "grad_norm": 5.0625, + "kl": 0.0, + "learning_rate": 8.20922399405204e-06, + "logits/chosen": 679115264.0, + "logits/rejected": 674427072.0, + "logps/chosen": -407.8277893066406, + "logps/rejected": -343.77886962890625, + "loss": 0.0329, + "rewards/chosen": 2.737936496734619, + "rewards/margins": 9.90712022781372, + "rewards/rejected": -7.169183731079102, + "step": 3060 + }, + { + "epoch": 0.2796710826861581, + "grad_norm": 13.0, + "kl": 0.0, + "learning_rate": 8.208121306760806e-06, + "logits/chosen": 679734464.0, + "logits/rejected": 586081344.0, + "logps/chosen": -427.2547607421875, + "logps/rejected": -300.28582763671875, + "loss": 0.0201, + "rewards/chosen": 3.8556785583496094, + "rewards/margins": 10.439730644226074, + "rewards/rejected": -6.584052085876465, + "step": 3061 + }, + { + "epoch": 0.2797624486066697, + "grad_norm": 2.28125, + "kl": 0.0, + "learning_rate": 8.207018354187664e-06, + "logits/chosen": 292359840.0, + "logits/rejected": 581893997.7142857, + "logps/chosen": -187.927978515625, + "logps/rejected": -392.3042689732143, + "loss": 0.0048, + "rewards/chosen": 4.825036525726318, + "rewards/margins": 12.430291652679443, + "rewards/rejected": -7.605255126953125, + "step": 3062 + }, + { + "epoch": 0.2798538145271814, + "grad_norm": 2.90625, + "kl": 0.0, + "learning_rate": 8.205915136423816e-06, + "logits/chosen": 408467968.0, + "logits/rejected": 443857408.0, + "logps/chosen": -209.412353515625, + "logps/rejected": -357.55450439453125, + "loss": 0.0261, + "rewards/chosen": 3.8588975270589194, + "rewards/margins": 10.4870236714681, + "rewards/rejected": -6.62812614440918, + "step": 3063 + }, + { + "epoch": 0.279945180447693, + "grad_norm": 3.203125, + "kl": 0.0, + "learning_rate": 8.204811653560493e-06, + "logits/chosen": 365953706.6666667, + "logits/rejected": 395383968.0, + "logps/chosen": -286.1824951171875, + "logps/rejected": -363.1982421875, + "loss": 0.023, + "rewards/chosen": 4.0024261474609375, + "rewards/margins": 13.42856502532959, + "rewards/rejected": -9.426138877868652, + "step": 3064 + }, + { + "epoch": 0.2800365463682047, + "grad_norm": 1.765625, + "kl": 0.0, + "learning_rate": 8.203707905688939e-06, + "logits/chosen": 473312096.0, + "logits/rejected": 925754752.0, + "logps/chosen": -223.49246215820312, + "logps/rejected": -824.0352783203125, + "loss": 0.0132, + "rewards/chosen": 3.8333704471588135, + "rewards/margins": 14.880981683731079, + "rewards/rejected": -11.047611236572266, + "step": 3065 + }, + { + "epoch": 0.2801279122887163, + "grad_norm": 2.140625, + "kl": 0.0, + "learning_rate": 8.202603892900422e-06, + "logits/chosen": 487941568.0, + "logits/rejected": 431385664.0, + "logps/chosen": -419.19696044921875, + "logps/rejected": -514.896484375, + "loss": 0.0124, + "rewards/chosen": 3.725245714187622, + "rewards/margins": 12.466853380203247, + "rewards/rejected": -8.741607666015625, + "step": 3066 + }, + { + "epoch": 0.280219278209228, + "grad_norm": 6.1875, + "kl": 0.0, + "learning_rate": 8.201499615286237e-06, + "logits/chosen": 540985813.3333334, + "logits/rejected": 441205657.6, + "logps/chosen": -319.8036702473958, + "logps/rejected": -391.476611328125, + "loss": 0.0269, + "rewards/chosen": 2.705572764078776, + "rewards/margins": 12.047555796305337, + "rewards/rejected": -9.341983032226562, + "step": 3067 + }, + { + "epoch": 0.2803106441297396, + "grad_norm": 0.68359375, + "kl": 0.0, + "learning_rate": 8.200395072937695e-06, + "logits/chosen": 414325120.0, + "logits/rejected": 1023691673.6, + "logps/chosen": -243.14847819010416, + "logps/rejected": -398.9661376953125, + "loss": 0.0035, + "rewards/chosen": 4.771161397298177, + "rewards/margins": 14.279071553548178, + "rewards/rejected": -9.50791015625, + "step": 3068 + }, + { + "epoch": 0.2804020100502513, + "grad_norm": 2.140625, + "kl": 0.0, + "learning_rate": 8.199290265946135e-06, + "logits/chosen": 570213312.0, + "logits/rejected": 450864480.0, + "logps/chosen": -304.69622802734375, + "logps/rejected": -221.18190002441406, + "loss": 0.1319, + "rewards/chosen": 2.0888211727142334, + "rewards/margins": 9.917696237564087, + "rewards/rejected": -7.8288750648498535, + "step": 3069 + }, + { + "epoch": 0.2804933759707629, + "grad_norm": 2.890625, + "kl": 0.0, + "learning_rate": 8.19818519440291e-06, + "logits/chosen": 658616960.0, + "logits/rejected": 404307392.0, + "logps/chosen": -439.2139892578125, + "logps/rejected": -476.22064208984375, + "loss": 0.017, + "rewards/chosen": 3.5250141620635986, + "rewards/margins": 13.22870945930481, + "rewards/rejected": -9.703695297241211, + "step": 3070 + }, + { + "epoch": 0.2805847418912746, + "grad_norm": 35.5, + "kl": 0.0, + "learning_rate": 8.197079858399403e-06, + "logits/chosen": 903075904.0, + "logits/rejected": 437627904.0, + "logps/chosen": -476.2049865722656, + "logps/rejected": -463.91709681919644, + "loss": 0.0374, + "rewards/chosen": 4.797616481781006, + "rewards/margins": 13.076391288212367, + "rewards/rejected": -8.278774806431361, + "step": 3071 + }, + { + "epoch": 0.2806761078117862, + "grad_norm": 3.015625, + "kl": 0.0, + "learning_rate": 8.195974258027013e-06, + "logits/chosen": 367482700.8, + "logits/rejected": 434476970.6666667, + "logps/chosen": -343.505810546875, + "logps/rejected": -455.7359212239583, + "loss": 0.0175, + "rewards/chosen": 3.820082092285156, + "rewards/margins": 13.871445083618164, + "rewards/rejected": -10.051362991333008, + "step": 3072 + }, + { + "epoch": 0.2807674737322979, + "grad_norm": 12.0, + "kl": 0.0, + "learning_rate": 8.194868393377162e-06, + "logits/chosen": 447544217.6, + "logits/rejected": 326455232.0, + "logps/chosen": -309.560791015625, + "logps/rejected": -344.3408203125, + "loss": 0.0756, + "rewards/chosen": 3.200128936767578, + "rewards/margins": 12.138857905069987, + "rewards/rejected": -8.938728968302408, + "step": 3073 + }, + { + "epoch": 0.2808588396528095, + "grad_norm": 8.25, + "kl": 0.0, + "learning_rate": 8.193762264541297e-06, + "logits/chosen": 266688352.0, + "logits/rejected": 455774890.6666667, + "logps/chosen": -698.681396484375, + "logps/rejected": -503.9095052083333, + "loss": 0.0295, + "rewards/chosen": 2.01763916015625, + "rewards/margins": 12.035603205362955, + "rewards/rejected": -10.017964045206705, + "step": 3074 + }, + { + "epoch": 0.2809502055733212, + "grad_norm": 5.625, + "kl": 0.0, + "learning_rate": 8.192655871610884e-06, + "logits/chosen": 587687552.0, + "logits/rejected": 414935392.0, + "logps/chosen": -427.4556884765625, + "logps/rejected": -400.9229736328125, + "loss": 0.0193, + "rewards/chosen": 3.602933406829834, + "rewards/margins": 11.950531482696533, + "rewards/rejected": -8.3475980758667, + "step": 3075 + }, + { + "epoch": 0.2810415714938328, + "grad_norm": 33.75, + "kl": 0.0, + "learning_rate": 8.19154921467741e-06, + "logits/chosen": 608426026.6666666, + "logits/rejected": 380899808.0, + "logps/chosen": -468.68896484375, + "logps/rejected": -425.5357360839844, + "loss": 0.1694, + "rewards/chosen": 2.271235624949137, + "rewards/margins": 11.369837919871012, + "rewards/rejected": -9.098602294921875, + "step": 3076 + }, + { + "epoch": 0.2811329374143445, + "grad_norm": 13.6875, + "kl": 0.0, + "learning_rate": 8.190442293832387e-06, + "logits/chosen": 565825728.0, + "logits/rejected": 271995904.0, + "logps/chosen": -291.6683349609375, + "logps/rejected": -450.4630126953125, + "loss": 0.1893, + "rewards/chosen": 1.5631277561187744, + "rewards/margins": 9.850818872451782, + "rewards/rejected": -8.287691116333008, + "step": 3077 + }, + { + "epoch": 0.2812243033348561, + "grad_norm": 22.125, + "kl": 0.0, + "learning_rate": 8.189335109167346e-06, + "logits/chosen": 323906464.0, + "logits/rejected": 428415552.0, + "logps/chosen": -164.0767822265625, + "logps/rejected": -712.7832641601562, + "loss": 0.1546, + "rewards/chosen": 1.606781005859375, + "rewards/margins": 11.862550735473633, + "rewards/rejected": -10.255769729614258, + "step": 3078 + }, + { + "epoch": 0.2813156692553678, + "grad_norm": 4.0625, + "kl": 0.0, + "learning_rate": 8.188227660773842e-06, + "logits/chosen": 481766092.8, + "logits/rejected": 343099242.6666667, + "logps/chosen": -395.150341796875, + "logps/rejected": -308.4763590494792, + "loss": 0.1444, + "rewards/chosen": 2.036910629272461, + "rewards/margins": 11.047796503702799, + "rewards/rejected": -9.010885874430338, + "step": 3079 + }, + { + "epoch": 0.2814070351758794, + "grad_norm": 34.25, + "kl": 0.0, + "learning_rate": 8.18711994874345e-06, + "logits/chosen": 767792213.3333334, + "logits/rejected": 486457036.8, + "logps/chosen": -198.49420166015625, + "logps/rejected": -512.14814453125, + "loss": 0.0925, + "rewards/chosen": 2.471357822418213, + "rewards/margins": 12.851937961578368, + "rewards/rejected": -10.380580139160156, + "step": 3080 + }, + { + "epoch": 0.28149840109639107, + "grad_norm": 3.90625, + "kl": 0.0, + "learning_rate": 8.186011973167767e-06, + "logits/chosen": 883876800.0, + "logits/rejected": 915413930.6666666, + "logps/chosen": -408.5592346191406, + "logps/rejected": -1137.5367838541667, + "loss": 0.0203, + "rewards/chosen": 2.47686767578125, + "rewards/margins": 12.79859733581543, + "rewards/rejected": -10.32172966003418, + "step": 3081 + }, + { + "epoch": 0.2815897670169027, + "grad_norm": 5.625, + "kl": 0.0, + "learning_rate": 8.184903734138412e-06, + "logits/chosen": 871509674.6666666, + "logits/rejected": 292082720.0, + "logps/chosen": -410.9979248046875, + "logps/rejected": -251.6814422607422, + "loss": 0.0441, + "rewards/chosen": 3.238344192504883, + "rewards/margins": 10.310154914855957, + "rewards/rejected": -7.071810722351074, + "step": 3082 + }, + { + "epoch": 0.28168113293741437, + "grad_norm": 0.6484375, + "kl": 0.0, + "learning_rate": 8.18379523174703e-06, + "logits/chosen": 364205184.0, + "logits/rejected": 606239780.5714285, + "logps/chosen": -437.77911376953125, + "logps/rejected": -356.09877232142856, + "loss": 0.0028, + "rewards/chosen": 4.217111110687256, + "rewards/margins": 12.103263650621686, + "rewards/rejected": -7.8861525399344305, + "step": 3083 + }, + { + "epoch": 0.281772498857926, + "grad_norm": 8.1875, + "kl": 0.0, + "learning_rate": 8.182686466085277e-06, + "logits/chosen": 872144896.0, + "logits/rejected": 385325269.3333333, + "logps/chosen": -277.111328125, + "logps/rejected": -338.9427490234375, + "loss": 0.0489, + "rewards/chosen": 3.2063674926757812, + "rewards/margins": 11.85691006978353, + "rewards/rejected": -8.650542577107748, + "step": 3084 + }, + { + "epoch": 0.28186386477843767, + "grad_norm": 1.265625, + "kl": 0.0, + "learning_rate": 8.181577437244843e-06, + "logits/chosen": 459717939.2, + "logits/rejected": 1319261866.6666667, + "logps/chosen": -177.54979248046874, + "logps/rejected": -525.7408854166666, + "loss": 0.0089, + "rewards/chosen": 4.523062133789063, + "rewards/margins": 11.683711751302084, + "rewards/rejected": -7.1606496175130205, + "step": 3085 + }, + { + "epoch": 0.2819552306989493, + "grad_norm": 10.5625, + "kl": 0.0, + "learning_rate": 8.180468145317435e-06, + "logits/chosen": 674530867.2, + "logits/rejected": 931520341.3333334, + "logps/chosen": -474.0671875, + "logps/rejected": -342.6129964192708, + "loss": 0.035, + "rewards/chosen": 3.349173736572266, + "rewards/margins": 10.500866444905599, + "rewards/rejected": -7.151692708333333, + "step": 3086 + }, + { + "epoch": 0.28204659661946097, + "grad_norm": 2.3125, + "kl": 0.0, + "learning_rate": 8.179358590394777e-06, + "logits/chosen": 349953237.3333333, + "logits/rejected": 713235968.0, + "logps/chosen": -202.164794921875, + "logps/rejected": -244.5270233154297, + "loss": 0.0212, + "rewards/chosen": 3.7433878580729165, + "rewards/margins": 9.476864020029703, + "rewards/rejected": -5.733476161956787, + "step": 3087 + }, + { + "epoch": 0.2821379625399726, + "grad_norm": 3.03125, + "kl": 0.0, + "learning_rate": 8.17824877256862e-06, + "logits/chosen": 715152281.6, + "logits/rejected": 510207914.6666667, + "logps/chosen": -292.20224609375, + "logps/rejected": -508.9053955078125, + "loss": 0.0186, + "rewards/chosen": 4.104469299316406, + "rewards/margins": 14.740415573120117, + "rewards/rejected": -10.635946273803711, + "step": 3088 + }, + { + "epoch": 0.28222932846048426, + "grad_norm": 7.5, + "kl": 0.0, + "learning_rate": 8.177138691930737e-06, + "logits/chosen": 849902080.0, + "logits/rejected": 506476330.6666667, + "logps/chosen": -372.404248046875, + "logps/rejected": -582.3385823567709, + "loss": 0.0375, + "rewards/chosen": 2.9923446655273436, + "rewards/margins": 11.891668446858723, + "rewards/rejected": -8.89932378133138, + "step": 3089 + }, + { + "epoch": 0.2823206943809959, + "grad_norm": 1.515625, + "kl": 0.0, + "learning_rate": 8.176028348572924e-06, + "logits/chosen": 462028064.0, + "logits/rejected": 429384704.0, + "logps/chosen": -451.9507141113281, + "logps/rejected": -492.6112060546875, + "loss": 0.007, + "rewards/chosen": 3.6280884742736816, + "rewards/margins": 12.036836465199789, + "rewards/rejected": -8.408747990926107, + "step": 3090 + }, + { + "epoch": 0.28241206030150756, + "grad_norm": 2.34375, + "kl": 0.0, + "learning_rate": 8.17491774258699e-06, + "logits/chosen": 411008480.0, + "logits/rejected": 561484480.0, + "logps/chosen": -279.6812438964844, + "logps/rejected": -494.11090087890625, + "loss": 0.0135, + "rewards/chosen": 3.785979747772217, + "rewards/margins": 13.499507427215576, + "rewards/rejected": -9.71352767944336, + "step": 3091 + }, + { + "epoch": 0.2825034262220192, + "grad_norm": 19.25, + "kl": 0.0, + "learning_rate": 8.173806874064777e-06, + "logits/chosen": 663859157.3333334, + "logits/rejected": 614285184.0, + "logps/chosen": -287.2915852864583, + "logps/rejected": -722.6337890625, + "loss": 0.2115, + "rewards/chosen": 2.1665353775024414, + "rewards/margins": 12.958356857299805, + "rewards/rejected": -10.791821479797363, + "step": 3092 + }, + { + "epoch": 0.28259479214253086, + "grad_norm": 5.4375, + "kl": 0.0, + "learning_rate": 8.172695743098139e-06, + "logits/chosen": 294413019.4285714, + "logits/rejected": 152930336.0, + "logps/chosen": -267.30862862723217, + "logps/rejected": -299.2389221191406, + "loss": 0.0507, + "rewards/chosen": 3.579617364065988, + "rewards/margins": 16.219099862234934, + "rewards/rejected": -12.639482498168945, + "step": 3093 + }, + { + "epoch": 0.2826861580630425, + "grad_norm": 4.90625, + "kl": 0.0, + "learning_rate": 8.17158434977896e-06, + "logits/chosen": 602948224.0, + "logits/rejected": 543485696.0, + "logps/chosen": -435.5113525390625, + "logps/rejected": -617.682861328125, + "loss": 0.0319, + "rewards/chosen": 3.092829465866089, + "rewards/margins": 11.776697874069214, + "rewards/rejected": -8.683868408203125, + "step": 3094 + }, + { + "epoch": 0.28277752398355416, + "grad_norm": 2.5625, + "kl": 0.0, + "learning_rate": 8.170472694199143e-06, + "logits/chosen": 382211008.0, + "logits/rejected": 509694122.6666667, + "logps/chosen": -506.456787109375, + "logps/rejected": -378.0971272786458, + "loss": 0.0105, + "rewards/chosen": 3.322079658508301, + "rewards/margins": 11.240799903869629, + "rewards/rejected": -7.918720245361328, + "step": 3095 + }, + { + "epoch": 0.2828688899040658, + "grad_norm": 6.3125, + "kl": 0.0, + "learning_rate": 8.169360776450606e-06, + "logits/chosen": 636826368.0, + "logits/rejected": 848141184.0, + "logps/chosen": -330.62632242838544, + "logps/rejected": -1192.6142578125, + "loss": 0.0406, + "rewards/chosen": 3.2477308909098306, + "rewards/margins": 17.976699511210125, + "rewards/rejected": -14.728968620300293, + "step": 3096 + }, + { + "epoch": 0.28296025582457746, + "grad_norm": 3.875, + "kl": 0.0, + "learning_rate": 8.168248596625299e-06, + "logits/chosen": 526774997.3333333, + "logits/rejected": 534178457.6, + "logps/chosen": -280.3968505859375, + "logps/rejected": -439.13076171875, + "loss": 0.0172, + "rewards/chosen": 3.508790651957194, + "rewards/margins": 11.592219607035318, + "rewards/rejected": -8.083428955078125, + "step": 3097 + }, + { + "epoch": 0.2830516217450891, + "grad_norm": 2.171875, + "kl": 0.0, + "learning_rate": 8.167136154815187e-06, + "logits/chosen": 530484480.0, + "logits/rejected": 479184384.0, + "logps/chosen": -375.7179768880208, + "logps/rejected": -451.01123046875, + "loss": 0.0176, + "rewards/chosen": 3.4610347747802734, + "rewards/margins": 11.699972152709961, + "rewards/rejected": -8.238937377929688, + "step": 3098 + }, + { + "epoch": 0.28314298766560075, + "grad_norm": 2.078125, + "kl": 0.0, + "learning_rate": 8.166023451112259e-06, + "logits/chosen": 505174272.0, + "logits/rejected": 386278758.4, + "logps/chosen": -442.73876953125, + "logps/rejected": -449.02265625, + "loss": 0.0129, + "rewards/chosen": 3.377732276916504, + "rewards/margins": 11.57979679107666, + "rewards/rejected": -8.202064514160156, + "step": 3099 + }, + { + "epoch": 0.2832343535861124, + "grad_norm": 3.390625, + "kl": 0.0, + "learning_rate": 8.164910485608526e-06, + "logits/chosen": 842847846.4, + "logits/rejected": 635976661.3333334, + "logps/chosen": -369.1385986328125, + "logps/rejected": -504.5325520833333, + "loss": 0.018, + "rewards/chosen": 3.944257354736328, + "rewards/margins": 11.729918416341146, + "rewards/rejected": -7.785661061604817, + "step": 3100 + }, + { + "epoch": 0.28332571950662405, + "grad_norm": 4.0625, + "kl": 0.0, + "learning_rate": 8.16379725839602e-06, + "logits/chosen": 675943744.0, + "logits/rejected": 1045911424.0, + "logps/chosen": -480.8173828125, + "logps/rejected": -374.53662109375, + "loss": 0.0226, + "rewards/chosen": 3.1504034996032715, + "rewards/margins": 12.043094158172607, + "rewards/rejected": -8.892690658569336, + "step": 3101 + }, + { + "epoch": 0.2834170854271357, + "grad_norm": 5.15625, + "kl": 0.0, + "learning_rate": 8.162683769566793e-06, + "logits/chosen": 466923776.0, + "logits/rejected": 455559552.0, + "logps/chosen": -229.32489013671875, + "logps/rejected": -307.15283203125, + "loss": 0.048, + "rewards/chosen": 2.4453306198120117, + "rewards/margins": 8.329588413238525, + "rewards/rejected": -5.884257793426514, + "step": 3102 + }, + { + "epoch": 0.28350845134764735, + "grad_norm": 7.96875, + "kl": 0.0, + "learning_rate": 8.161570019212921e-06, + "logits/chosen": 425438506.6666667, + "logits/rejected": 443760128.0, + "logps/chosen": -204.8287353515625, + "logps/rejected": -435.76943359375, + "loss": 0.0716, + "rewards/chosen": 3.078985850016276, + "rewards/margins": 9.635878245035807, + "rewards/rejected": -6.556892395019531, + "step": 3103 + }, + { + "epoch": 0.28359981726815897, + "grad_norm": 36.5, + "kl": 0.0, + "learning_rate": 8.1604560074265e-06, + "logits/chosen": 319158272.0, + "logits/rejected": 494019712.0, + "logps/chosen": -25.30396842956543, + "logps/rejected": -591.518310546875, + "loss": 0.0298, + "rewards/chosen": 2.368220806121826, + "rewards/margins": 11.278115113576254, + "rewards/rejected": -8.909894307454428, + "step": 3104 + }, + { + "epoch": 0.28369118318867065, + "grad_norm": 1.4765625, + "kl": 0.0, + "learning_rate": 8.159341734299652e-06, + "logits/chosen": 464642816.0, + "logits/rejected": 743030579.2, + "logps/chosen": -277.316650390625, + "logps/rejected": -306.4609375, + "loss": 0.0096, + "rewards/chosen": 4.119630177815755, + "rewards/margins": 11.565831502278645, + "rewards/rejected": -7.446201324462891, + "step": 3105 + }, + { + "epoch": 0.28378254910918227, + "grad_norm": 2.421875, + "kl": 0.0, + "learning_rate": 8.158227199924514e-06, + "logits/chosen": 540740403.2, + "logits/rejected": 279410624.0, + "logps/chosen": -318.3783203125, + "logps/rejected": -399.5398763020833, + "loss": 0.0155, + "rewards/chosen": 3.983686065673828, + "rewards/margins": 11.945464706420898, + "rewards/rejected": -7.96177864074707, + "step": 3106 + }, + { + "epoch": 0.28387391502969395, + "grad_norm": 2.53125, + "kl": 0.0, + "learning_rate": 8.157112404393248e-06, + "logits/chosen": 541083456.0, + "logits/rejected": 361316416.0, + "logps/chosen": -337.21685791015625, + "logps/rejected": -294.7637939453125, + "loss": 0.0144, + "rewards/chosen": 3.820941925048828, + "rewards/margins": 11.255925178527832, + "rewards/rejected": -7.434983253479004, + "step": 3107 + }, + { + "epoch": 0.28396528095020557, + "grad_norm": 1.2734375, + "kl": 0.0, + "learning_rate": 8.155997347798036e-06, + "logits/chosen": 277164586.6666667, + "logits/rejected": 529720729.6, + "logps/chosen": -208.771484375, + "logps/rejected": -417.875048828125, + "loss": 0.0081, + "rewards/chosen": 4.224762598673503, + "rewards/margins": 11.846831385294596, + "rewards/rejected": -7.622068786621094, + "step": 3108 + }, + { + "epoch": 0.28405664687071724, + "grad_norm": 20.875, + "kl": 0.0, + "learning_rate": 8.154882030231087e-06, + "logits/chosen": 619096832.0, + "logits/rejected": 416660544.0, + "logps/chosen": -375.9363098144531, + "logps/rejected": -405.7886047363281, + "loss": 0.0367, + "rewards/chosen": 3.2949090003967285, + "rewards/margins": 9.828378200531006, + "rewards/rejected": -6.533469200134277, + "step": 3109 + }, + { + "epoch": 0.28414801279122887, + "grad_norm": 4.03125, + "kl": 0.0, + "learning_rate": 8.153766451784621e-06, + "logits/chosen": 1277694464.0, + "logits/rejected": 680129920.0, + "logps/chosen": -345.2165222167969, + "logps/rejected": -437.644775390625, + "loss": 0.0357, + "rewards/chosen": 2.9787375926971436, + "rewards/margins": 11.936154127120972, + "rewards/rejected": -8.957416534423828, + "step": 3110 + }, + { + "epoch": 0.28423937871174054, + "grad_norm": 1.4375, + "kl": 0.0, + "learning_rate": 8.152650612550892e-06, + "logits/chosen": 425737536.0, + "logits/rejected": 409924096.0, + "logps/chosen": -216.72122192382812, + "logps/rejected": -321.5858154296875, + "loss": 0.1173, + "rewards/chosen": 4.009067535400391, + "rewards/margins": 10.792588551839192, + "rewards/rejected": -6.783521016438802, + "step": 3111 + }, + { + "epoch": 0.28433074463225216, + "grad_norm": 5.03125, + "kl": 0.0, + "learning_rate": 8.151534512622167e-06, + "logits/chosen": 518573363.2, + "logits/rejected": 1494954325.3333333, + "logps/chosen": -240.6185546875, + "logps/rejected": -321.9519856770833, + "loss": 0.0324, + "rewards/chosen": 4.427934265136718, + "rewards/margins": 11.822586313883463, + "rewards/rejected": -7.394652048746745, + "step": 3112 + }, + { + "epoch": 0.28442211055276384, + "grad_norm": 2.78125, + "kl": 0.0, + "learning_rate": 8.150418152090737e-06, + "logits/chosen": 1023912960.0, + "logits/rejected": 650915392.0, + "logps/chosen": -302.3218688964844, + "logps/rejected": -458.0728759765625, + "loss": 0.0204, + "rewards/chosen": 3.2915420532226562, + "rewards/margins": 11.180392742156982, + "rewards/rejected": -7.888850688934326, + "step": 3113 + }, + { + "epoch": 0.28451347647327546, + "grad_norm": 9.0, + "kl": 0.0, + "learning_rate": 8.149301531048917e-06, + "logits/chosen": 647245312.0, + "logits/rejected": 349252032.0, + "logps/chosen": -238.3970947265625, + "logps/rejected": -416.2752278645833, + "loss": 0.1008, + "rewards/chosen": 2.3532506942749025, + "rewards/margins": 10.564590644836425, + "rewards/rejected": -8.211339950561523, + "step": 3114 + }, + { + "epoch": 0.28460484239378714, + "grad_norm": 3.890625, + "kl": 0.0, + "learning_rate": 8.148184649589038e-06, + "logits/chosen": 1173208268.8, + "logits/rejected": 727693653.3333334, + "logps/chosen": -402.9712158203125, + "logps/rejected": -671.0913899739584, + "loss": 0.02, + "rewards/chosen": 3.7553585052490233, + "rewards/margins": 12.233299382527669, + "rewards/rejected": -8.477940877278646, + "step": 3115 + }, + { + "epoch": 0.28469620831429876, + "grad_norm": 15.875, + "kl": 0.0, + "learning_rate": 8.147067507803457e-06, + "logits/chosen": 468985600.0, + "logits/rejected": 619378790.4, + "logps/chosen": -361.9445393880208, + "logps/rejected": -511.07119140625, + "loss": 0.0272, + "rewards/chosen": 4.0287221272786455, + "rewards/margins": 10.454359181722005, + "rewards/rejected": -6.425637054443359, + "step": 3116 + }, + { + "epoch": 0.28478757423481044, + "grad_norm": 4.5, + "kl": 0.0, + "learning_rate": 8.14595010578455e-06, + "logits/chosen": 569604608.0, + "logits/rejected": 872584448.0, + "logps/chosen": -378.381591796875, + "logps/rejected": -455.6962076822917, + "loss": 0.0341, + "rewards/chosen": 3.0995983123779296, + "rewards/margins": 11.81753069559733, + "rewards/rejected": -8.7179323832194, + "step": 3117 + }, + { + "epoch": 0.28487894015532206, + "grad_norm": 7.3125, + "kl": 0.0, + "learning_rate": 8.14483244362472e-06, + "logits/chosen": 422391833.6, + "logits/rejected": 549793536.0, + "logps/chosen": -210.0215576171875, + "logps/rejected": -438.5589599609375, + "loss": 0.0345, + "rewards/chosen": 3.450786590576172, + "rewards/margins": 10.696853637695312, + "rewards/rejected": -7.246067047119141, + "step": 3118 + }, + { + "epoch": 0.28497030607583373, + "grad_norm": 6.625, + "kl": 0.0, + "learning_rate": 8.143714521416382e-06, + "logits/chosen": 439287808.0, + "logits/rejected": 209126698.66666666, + "logps/chosen": -307.9869873046875, + "logps/rejected": -253.41377766927084, + "loss": 0.0248, + "rewards/chosen": 3.695781707763672, + "rewards/margins": 9.952699661254883, + "rewards/rejected": -6.256917953491211, + "step": 3119 + }, + { + "epoch": 0.28506167199634536, + "grad_norm": 5.3125, + "kl": 0.0, + "learning_rate": 8.14259633925198e-06, + "logits/chosen": 436934656.0, + "logits/rejected": 645220522.6666666, + "logps/chosen": -428.2914123535156, + "logps/rejected": -361.9429931640625, + "loss": 0.0196, + "rewards/chosen": 2.6243927478790283, + "rewards/margins": 9.988743384679157, + "rewards/rejected": -7.36435063680013, + "step": 3120 + }, + { + "epoch": 0.28515303791685703, + "grad_norm": 3.546875, + "kl": 0.0, + "learning_rate": 8.14147789722398e-06, + "logits/chosen": 1320864896.0, + "logits/rejected": 661230336.0, + "logps/chosen": -537.304443359375, + "logps/rejected": -410.92315673828125, + "loss": 0.017, + "rewards/chosen": 3.668569326400757, + "rewards/margins": 12.384544134140015, + "rewards/rejected": -8.715974807739258, + "step": 3121 + }, + { + "epoch": 0.28524440383736865, + "grad_norm": 6.25, + "kl": 0.0, + "learning_rate": 8.140359195424864e-06, + "logits/chosen": 447091712.0, + "logits/rejected": 507300693.3333333, + "logps/chosen": -344.098388671875, + "logps/rejected": -458.6382649739583, + "loss": 0.0409, + "rewards/chosen": 2.968209648132324, + "rewards/margins": 10.90232474009196, + "rewards/rejected": -7.934115091959636, + "step": 3122 + }, + { + "epoch": 0.28533576975788033, + "grad_norm": 3.078125, + "kl": 0.0, + "learning_rate": 8.139240233947137e-06, + "logits/chosen": 386557226.6666667, + "logits/rejected": 291526348.8, + "logps/chosen": -307.44020589192706, + "logps/rejected": -437.69365234375, + "loss": 0.0145, + "rewards/chosen": 3.5951315561930337, + "rewards/margins": 11.904060999552408, + "rewards/rejected": -8.308929443359375, + "step": 3123 + }, + { + "epoch": 0.28542713567839195, + "grad_norm": 1.1953125, + "kl": 0.0, + "learning_rate": 8.138121012883329e-06, + "logits/chosen": 453563818.6666667, + "logits/rejected": 658349209.6, + "logps/chosen": -176.82552083333334, + "logps/rejected": -818.58935546875, + "loss": 0.0069, + "rewards/chosen": 4.22178300221761, + "rewards/margins": 15.582763226826984, + "rewards/rejected": -11.360980224609374, + "step": 3124 + }, + { + "epoch": 0.28551850159890363, + "grad_norm": 2.96875, + "kl": 0.0, + "learning_rate": 8.137001532325988e-06, + "logits/chosen": 463699558.4, + "logits/rejected": 471080405.3333333, + "logps/chosen": -212.271875, + "logps/rejected": -597.476806640625, + "loss": 0.0201, + "rewards/chosen": 3.894498825073242, + "rewards/margins": 14.226948420206705, + "rewards/rejected": -10.332449595133463, + "step": 3125 + }, + { + "epoch": 0.28560986751941525, + "grad_norm": 2.03125, + "kl": 0.0, + "learning_rate": 8.135881792367686e-06, + "logits/chosen": 523397171.2, + "logits/rejected": 673371306.6666666, + "logps/chosen": -370.4700439453125, + "logps/rejected": -470.8308919270833, + "loss": 0.0158, + "rewards/chosen": 3.822615051269531, + "rewards/margins": 11.961063130696616, + "rewards/rejected": -8.138448079427084, + "step": 3126 + }, + { + "epoch": 0.2857012334399269, + "grad_norm": 2.3125, + "kl": 0.0, + "learning_rate": 8.134761793101013e-06, + "logits/chosen": 471536512.0, + "logits/rejected": 408345753.6, + "logps/chosen": -335.526123046875, + "logps/rejected": -489.97587890625, + "loss": 0.0082, + "rewards/chosen": 4.071599960327148, + "rewards/margins": 13.795714950561523, + "rewards/rejected": -9.724114990234375, + "step": 3127 + }, + { + "epoch": 0.28579259936043855, + "grad_norm": 1.9140625, + "kl": 0.0, + "learning_rate": 8.133641534618585e-06, + "logits/chosen": 312037120.0, + "logits/rejected": 477484748.8, + "logps/chosen": -208.175537109375, + "logps/rejected": -476.073095703125, + "loss": 0.0091, + "rewards/chosen": 4.092272122701009, + "rewards/margins": 12.77833932240804, + "rewards/rejected": -8.686067199707031, + "step": 3128 + }, + { + "epoch": 0.2858839652809502, + "grad_norm": 4.75, + "kl": 0.0, + "learning_rate": 8.132521017013037e-06, + "logits/chosen": 586948224.0, + "logits/rejected": 229350080.0, + "logps/chosen": -434.9502360026042, + "logps/rejected": -212.20326232910156, + "loss": 0.0341, + "rewards/chosen": 3.1393213272094727, + "rewards/margins": 9.317338943481445, + "rewards/rejected": -6.178017616271973, + "step": 3129 + }, + { + "epoch": 0.28597533120146185, + "grad_norm": 2.484375, + "kl": 0.0, + "learning_rate": 8.131400240377023e-06, + "logits/chosen": 807105536.0, + "logits/rejected": 1313798016.0, + "logps/chosen": -410.29876708984375, + "logps/rejected": -693.667724609375, + "loss": 0.012, + "rewards/chosen": 4.0048675537109375, + "rewards/margins": 14.579103469848633, + "rewards/rejected": -10.574235916137695, + "step": 3130 + }, + { + "epoch": 0.2860666971219735, + "grad_norm": 5.46875, + "kl": 0.0, + "learning_rate": 8.130279204803222e-06, + "logits/chosen": 707764693.3333334, + "logits/rejected": 395079168.0, + "logps/chosen": -228.34611002604166, + "logps/rejected": -496.6191101074219, + "loss": 0.0494, + "rewards/chosen": 3.0615386962890625, + "rewards/margins": 12.709997177124023, + "rewards/rejected": -9.648458480834961, + "step": 3131 + }, + { + "epoch": 0.28615806304248514, + "grad_norm": 18.0, + "kl": 0.0, + "learning_rate": 8.129157910384334e-06, + "logits/chosen": 317786538.6666667, + "logits/rejected": 579545088.0, + "logps/chosen": -239.7608846028646, + "logps/rejected": -841.51015625, + "loss": 0.0191, + "rewards/chosen": 4.131045977274577, + "rewards/margins": 14.795237414042155, + "rewards/rejected": -10.664191436767577, + "step": 3132 + }, + { + "epoch": 0.2862494289629968, + "grad_norm": 3.40625, + "kl": 0.0, + "learning_rate": 8.12803635721308e-06, + "logits/chosen": 241531477.33333334, + "logits/rejected": 456981094.4, + "logps/chosen": -388.4872233072917, + "logps/rejected": -462.3275390625, + "loss": 0.0144, + "rewards/chosen": 3.827878952026367, + "rewards/margins": 11.576377487182617, + "rewards/rejected": -7.74849853515625, + "step": 3133 + }, + { + "epoch": 0.28634079488350844, + "grad_norm": 1.2734375, + "kl": 0.0, + "learning_rate": 8.1269145453822e-06, + "logits/chosen": 368705664.0, + "logits/rejected": 474181312.0, + "logps/chosen": -260.18597412109375, + "logps/rejected": -725.4758911132812, + "loss": 0.0091, + "rewards/chosen": 4.408977508544922, + "rewards/margins": 15.365030288696289, + "rewards/rejected": -10.956052780151367, + "step": 3134 + }, + { + "epoch": 0.2864321608040201, + "grad_norm": 3.0, + "kl": 0.0, + "learning_rate": 8.12579247498446e-06, + "logits/chosen": 640966016.0, + "logits/rejected": 1399882240.0, + "logps/chosen": -350.0343017578125, + "logps/rejected": -744.9652709960938, + "loss": 0.0182, + "rewards/chosen": 3.5110538005828857, + "rewards/margins": 12.73397707939148, + "rewards/rejected": -9.222923278808594, + "step": 3135 + }, + { + "epoch": 0.28652352672453174, + "grad_norm": 1.765625, + "kl": 0.0, + "learning_rate": 8.124670146112641e-06, + "logits/chosen": 457123328.0, + "logits/rejected": 340772288.0, + "logps/chosen": -332.70782470703125, + "logps/rejected": -429.52960205078125, + "loss": 0.0143, + "rewards/chosen": 3.744424819946289, + "rewards/margins": 12.591817855834961, + "rewards/rejected": -8.847393035888672, + "step": 3136 + }, + { + "epoch": 0.2866148926450434, + "grad_norm": 6.375, + "kl": 0.0, + "learning_rate": 8.123547558859553e-06, + "logits/chosen": 513947296.0, + "logits/rejected": 537790592.0, + "logps/chosen": -294.5638427734375, + "logps/rejected": -401.13287353515625, + "loss": 0.0254, + "rewards/chosen": 4.002154350280762, + "rewards/margins": 11.669487953186035, + "rewards/rejected": -7.667333602905273, + "step": 3137 + }, + { + "epoch": 0.28670625856555504, + "grad_norm": 41.25, + "kl": 0.0, + "learning_rate": 8.122424713318022e-06, + "logits/chosen": 326924416.0, + "logits/rejected": 512376027.4285714, + "logps/chosen": -200.93528747558594, + "logps/rejected": -386.96829659598217, + "loss": 0.075, + "rewards/chosen": 5.952815532684326, + "rewards/margins": 12.965273244040354, + "rewards/rejected": -7.012457711356027, + "step": 3138 + }, + { + "epoch": 0.2867976244860667, + "grad_norm": 1.109375, + "kl": 0.0, + "learning_rate": 8.121301609580898e-06, + "logits/chosen": 563478869.3333334, + "logits/rejected": 624662630.4, + "logps/chosen": -442.572998046875, + "logps/rejected": -566.46904296875, + "loss": 0.0064, + "rewards/chosen": 4.214175542195638, + "rewards/margins": 14.26466687520345, + "rewards/rejected": -10.050491333007812, + "step": 3139 + }, + { + "epoch": 0.28688899040657834, + "grad_norm": 4.0, + "kl": 0.0, + "learning_rate": 8.120178247741049e-06, + "logits/chosen": 633323161.6, + "logits/rejected": 563646250.6666666, + "logps/chosen": -265.7030517578125, + "logps/rejected": -466.5968017578125, + "loss": 0.034, + "rewards/chosen": 3.3305438995361327, + "rewards/margins": 11.394518915812174, + "rewards/rejected": -8.063975016276041, + "step": 3140 + }, + { + "epoch": 0.28698035632709, + "grad_norm": 4.34375, + "kl": 0.0, + "learning_rate": 8.119054627891369e-06, + "logits/chosen": 646484138.6666666, + "logits/rejected": 711190067.2, + "logps/chosen": -521.9698893229166, + "logps/rejected": -641.162548828125, + "loss": 0.0239, + "rewards/chosen": 2.739207903544108, + "rewards/margins": 11.560251299540202, + "rewards/rejected": -8.821043395996094, + "step": 3141 + }, + { + "epoch": 0.28707172224760164, + "grad_norm": 4.875, + "kl": 0.0, + "learning_rate": 8.11793075012477e-06, + "logits/chosen": 463960217.6, + "logits/rejected": 413964501.3333333, + "logps/chosen": -357.606982421875, + "logps/rejected": -701.87109375, + "loss": 0.031, + "rewards/chosen": 3.318825531005859, + "rewards/margins": 16.103059895833333, + "rewards/rejected": -12.784234364827475, + "step": 3142 + }, + { + "epoch": 0.2871630881681133, + "grad_norm": 36.5, + "kl": 0.0, + "learning_rate": 8.116806614534185e-06, + "logits/chosen": 476895104.0, + "logits/rejected": 577730901.3333334, + "logps/chosen": -418.5635986328125, + "logps/rejected": -532.00634765625, + "loss": 0.0725, + "rewards/chosen": 1.4478744268417358, + "rewards/margins": 9.39333124955495, + "rewards/rejected": -7.945456822713216, + "step": 3143 + }, + { + "epoch": 0.28725445408862493, + "grad_norm": 1.8203125, + "kl": 0.0, + "learning_rate": 8.115682221212571e-06, + "logits/chosen": 509578080.0, + "logits/rejected": 655775808.0, + "logps/chosen": -298.2341613769531, + "logps/rejected": -359.8809814453125, + "loss": 0.0108, + "rewards/chosen": 4.419393062591553, + "rewards/margins": 12.91228437423706, + "rewards/rejected": -8.492891311645508, + "step": 3144 + }, + { + "epoch": 0.2873458200091366, + "grad_norm": 6.75, + "kl": 0.0, + "learning_rate": 8.114557570252905e-06, + "logits/chosen": 443501209.6, + "logits/rejected": 653611605.3333334, + "logps/chosen": -223.202587890625, + "logps/rejected": -601.5392252604166, + "loss": 0.0416, + "rewards/chosen": 3.2457366943359376, + "rewards/margins": 12.406217829386392, + "rewards/rejected": -9.160481135050455, + "step": 3145 + }, + { + "epoch": 0.28743718592964823, + "grad_norm": 8.1875, + "kl": 0.0, + "learning_rate": 8.113432661748187e-06, + "logits/chosen": 649460394.6666666, + "logits/rejected": 797153920.0, + "logps/chosen": -329.6803385416667, + "logps/rejected": -960.1657104492188, + "loss": 0.0439, + "rewards/chosen": 3.646124839782715, + "rewards/margins": 15.292593955993652, + "rewards/rejected": -11.646469116210938, + "step": 3146 + }, + { + "epoch": 0.2875285518501599, + "grad_norm": 7.28125, + "kl": 0.0, + "learning_rate": 8.112307495791431e-06, + "logits/chosen": 988283221.3333334, + "logits/rejected": 580718208.0, + "logps/chosen": -643.2784016927084, + "logps/rejected": -495.2299499511719, + "loss": 0.0328, + "rewards/chosen": 3.24111270904541, + "rewards/margins": 12.03000259399414, + "rewards/rejected": -8.78888988494873, + "step": 3147 + }, + { + "epoch": 0.28761991777067153, + "grad_norm": 9.75, + "kl": 0.0, + "learning_rate": 8.111182072475683e-06, + "logits/chosen": 380047744.0, + "logits/rejected": 484748736.0, + "logps/chosen": -244.81607055664062, + "logps/rejected": -514.5626831054688, + "loss": 0.0279, + "rewards/chosen": 3.6421308517456055, + "rewards/margins": 11.349897861480713, + "rewards/rejected": -7.707767009735107, + "step": 3148 + }, + { + "epoch": 0.2877112836911832, + "grad_norm": 5.40625, + "kl": 0.0, + "learning_rate": 8.110056391894005e-06, + "logits/chosen": 403701760.0, + "logits/rejected": 385722026.6666667, + "logps/chosen": -370.0306640625, + "logps/rejected": -475.882568359375, + "loss": 0.0254, + "rewards/chosen": 4.078503799438477, + "rewards/margins": 12.49803784688314, + "rewards/rejected": -8.419534047444662, + "step": 3149 + }, + { + "epoch": 0.2878026496116948, + "grad_norm": 1.53125, + "kl": 0.0, + "learning_rate": 8.108930454139476e-06, + "logits/chosen": 460855296.0, + "logits/rejected": 356950374.4, + "logps/chosen": -346.8282877604167, + "logps/rejected": -437.844384765625, + "loss": 0.0076, + "rewards/chosen": 3.9141613642374673, + "rewards/margins": 14.672779528299968, + "rewards/rejected": -10.7586181640625, + "step": 3150 + }, + { + "epoch": 0.2878940155322065, + "grad_norm": 1.2109375, + "kl": 0.0, + "learning_rate": 8.107804259305205e-06, + "logits/chosen": 406054688.0, + "logits/rejected": 433411145.14285713, + "logps/chosen": -166.0860595703125, + "logps/rejected": -463.68174525669644, + "loss": 0.0056, + "rewards/chosen": 3.205737352371216, + "rewards/margins": 11.200089965547834, + "rewards/rejected": -7.994352613176618, + "step": 3151 + }, + { + "epoch": 0.2879853814527181, + "grad_norm": 3.9375, + "kl": 0.0, + "learning_rate": 8.106677807484315e-06, + "logits/chosen": 727899699.2, + "logits/rejected": 735071914.6666666, + "logps/chosen": -473.823095703125, + "logps/rejected": -773.505126953125, + "loss": 0.0162, + "rewards/chosen": 3.882454681396484, + "rewards/margins": 13.40649871826172, + "rewards/rejected": -9.524044036865234, + "step": 3152 + }, + { + "epoch": 0.2880767473732298, + "grad_norm": 3.390625, + "kl": 0.0, + "learning_rate": 8.105551098769954e-06, + "logits/chosen": 925459968.0, + "logits/rejected": 633717632.0, + "logps/chosen": -374.5634765625, + "logps/rejected": -660.4679565429688, + "loss": 0.0241, + "rewards/chosen": 3.061889171600342, + "rewards/margins": 12.772040843963623, + "rewards/rejected": -9.710151672363281, + "step": 3153 + }, + { + "epoch": 0.2881681132937414, + "grad_norm": 4.65625, + "kl": 0.0, + "learning_rate": 8.104424133255293e-06, + "logits/chosen": 622625066.6666666, + "logits/rejected": 233546928.0, + "logps/chosen": -437.050537109375, + "logps/rejected": -522.6491088867188, + "loss": 0.0299, + "rewards/chosen": 3.5816237131754556, + "rewards/margins": 10.238300959269205, + "rewards/rejected": -6.65667724609375, + "step": 3154 + }, + { + "epoch": 0.2882594792142531, + "grad_norm": 1.2421875, + "kl": 0.0, + "learning_rate": 8.103296911033518e-06, + "logits/chosen": 595961216.0, + "logits/rejected": 512445525.3333333, + "logps/chosen": -477.2911682128906, + "logps/rejected": -598.6549072265625, + "loss": 0.004, + "rewards/chosen": 4.462928771972656, + "rewards/margins": 14.746875762939453, + "rewards/rejected": -10.283946990966797, + "step": 3155 + }, + { + "epoch": 0.2883508451347647, + "grad_norm": 6.46875, + "kl": 0.0, + "learning_rate": 8.102169432197842e-06, + "logits/chosen": 694490316.8, + "logits/rejected": 525438378.6666667, + "logps/chosen": -163.1286865234375, + "logps/rejected": -452.7192789713542, + "loss": 0.0485, + "rewards/chosen": 3.460559844970703, + "rewards/margins": 9.679719924926758, + "rewards/rejected": -6.219160079956055, + "step": 3156 + }, + { + "epoch": 0.2884422110552764, + "grad_norm": 6.03125, + "kl": 0.0, + "learning_rate": 8.101041696841495e-06, + "logits/chosen": 459505472.0, + "logits/rejected": 727888896.0, + "logps/chosen": -218.41612243652344, + "logps/rejected": -440.12030029296875, + "loss": 0.0651, + "rewards/chosen": 2.3323068618774414, + "rewards/margins": 9.867997169494629, + "rewards/rejected": -7.5356903076171875, + "step": 3157 + }, + { + "epoch": 0.288533576975788, + "grad_norm": 7.5, + "kl": 0.0, + "learning_rate": 8.099913705057732e-06, + "logits/chosen": 757583923.2, + "logits/rejected": 384289877.3333333, + "logps/chosen": -284.17470703125, + "logps/rejected": -450.954833984375, + "loss": 0.0569, + "rewards/chosen": 2.9624822616577147, + "rewards/margins": 10.82561601003011, + "rewards/rejected": -7.8631337483723955, + "step": 3158 + }, + { + "epoch": 0.2886249428962997, + "grad_norm": 5.15625, + "kl": 0.0, + "learning_rate": 8.098785456939826e-06, + "logits/chosen": 721594880.0, + "logits/rejected": 742343680.0, + "logps/chosen": -307.1598307291667, + "logps/rejected": -508.95, + "loss": 0.0268, + "rewards/chosen": 2.9082908630371094, + "rewards/margins": 12.07834243774414, + "rewards/rejected": -9.170051574707031, + "step": 3159 + }, + { + "epoch": 0.2887163088168113, + "grad_norm": 3.71875, + "kl": 0.0, + "learning_rate": 8.097656952581074e-06, + "logits/chosen": 310616768.0, + "logits/rejected": 325717376.0, + "logps/chosen": -209.62161254882812, + "logps/rejected": -377.9285888671875, + "loss": 0.025, + "rewards/chosen": 3.577090263366699, + "rewards/margins": 11.56833553314209, + "rewards/rejected": -7.991245269775391, + "step": 3160 + }, + { + "epoch": 0.288807674737323, + "grad_norm": 3.5, + "kl": 0.0, + "learning_rate": 8.096528192074793e-06, + "logits/chosen": 254971754.66666666, + "logits/rejected": 664639808.0, + "logps/chosen": -264.0543619791667, + "logps/rejected": -1103.87255859375, + "loss": 0.0275, + "rewards/chosen": 3.6797892252604165, + "rewards/margins": 16.422588030497234, + "rewards/rejected": -12.742798805236816, + "step": 3161 + }, + { + "epoch": 0.2888990406578346, + "grad_norm": 1.5703125, + "kl": 0.0, + "learning_rate": 8.09539917551432e-06, + "logits/chosen": 410005184.0, + "logits/rejected": 649469184.0, + "logps/chosen": -411.61029052734375, + "logps/rejected": -567.052490234375, + "loss": 0.0061, + "rewards/chosen": 4.5312910079956055, + "rewards/margins": 14.448952039082846, + "rewards/rejected": -9.91766103108724, + "step": 3162 + }, + { + "epoch": 0.2889904065783463, + "grad_norm": 3.09375, + "kl": 0.0, + "learning_rate": 8.094269902993016e-06, + "logits/chosen": 436932224.0, + "logits/rejected": 382815539.2, + "logps/chosen": -346.41943359375, + "logps/rejected": -392.524609375, + "loss": 0.0187, + "rewards/chosen": 3.0010255177815757, + "rewards/margins": 11.999497350056966, + "rewards/rejected": -8.99847183227539, + "step": 3163 + }, + { + "epoch": 0.2890817724988579, + "grad_norm": 5.125, + "kl": 0.0, + "learning_rate": 8.093140374604258e-06, + "logits/chosen": 733304422.4, + "logits/rejected": 417778858.6666667, + "logps/chosen": -176.155029296875, + "logps/rejected": -542.2265625, + "loss": 0.0429, + "rewards/chosen": 2.7207496643066404, + "rewards/margins": 10.7765199025472, + "rewards/rejected": -8.05577023824056, + "step": 3164 + }, + { + "epoch": 0.2891731384193696, + "grad_norm": 3.734375, + "kl": 0.0, + "learning_rate": 8.09201059044145e-06, + "logits/chosen": 476602560.0, + "logits/rejected": 694751168.0, + "logps/chosen": -373.7246398925781, + "logps/rejected": -467.468994140625, + "loss": 0.0153, + "rewards/chosen": 4.374234199523926, + "rewards/margins": 14.315120697021484, + "rewards/rejected": -9.940886497497559, + "step": 3165 + }, + { + "epoch": 0.2892645043398812, + "grad_norm": 3.03125, + "kl": 0.0, + "learning_rate": 8.090880550598015e-06, + "logits/chosen": 604333184.0, + "logits/rejected": 734569792.0, + "logps/chosen": -350.8827819824219, + "logps/rejected": -491.36444091796875, + "loss": 0.0213, + "rewards/chosen": 3.4354658126831055, + "rewards/margins": 11.750974655151367, + "rewards/rejected": -8.315508842468262, + "step": 3166 + }, + { + "epoch": 0.2893558702603929, + "grad_norm": 3.328125, + "kl": 0.0, + "learning_rate": 8.089750255167396e-06, + "logits/chosen": 625301888.0, + "logits/rejected": 679351424.0, + "logps/chosen": -340.8411865234375, + "logps/rejected": -469.67608642578125, + "loss": 0.0172, + "rewards/chosen": 4.227900505065918, + "rewards/margins": 11.635674476623535, + "rewards/rejected": -7.407773971557617, + "step": 3167 + }, + { + "epoch": 0.2894472361809045, + "grad_norm": 1.640625, + "kl": 0.0, + "learning_rate": 8.088619704243058e-06, + "logits/chosen": 370039424.0, + "logits/rejected": 326490624.0, + "logps/chosen": -302.6526184082031, + "logps/rejected": -508.65869140625, + "loss": 0.1149, + "rewards/chosen": 3.9478468894958496, + "rewards/margins": 12.208842754364014, + "rewards/rejected": -8.260995864868164, + "step": 3168 + }, + { + "epoch": 0.2895386021014162, + "grad_norm": 4.71875, + "kl": 0.0, + "learning_rate": 8.087488897918488e-06, + "logits/chosen": 661676416.0, + "logits/rejected": 671070592.0, + "logps/chosen": -360.8673400878906, + "logps/rejected": -449.38031005859375, + "loss": 0.1285, + "rewards/chosen": 2.137897491455078, + "rewards/margins": 10.002208232879639, + "rewards/rejected": -7.8643107414245605, + "step": 3169 + }, + { + "epoch": 0.2896299680219278, + "grad_norm": 4.65625, + "kl": 0.0, + "learning_rate": 8.086357836287191e-06, + "logits/chosen": 357786602.6666667, + "logits/rejected": 423109248.0, + "logps/chosen": -261.3002522786458, + "logps/rejected": -390.2319641113281, + "loss": 0.0289, + "rewards/chosen": 3.611024856567383, + "rewards/margins": 12.368060111999512, + "rewards/rejected": -8.757035255432129, + "step": 3170 + }, + { + "epoch": 0.2897213339424395, + "grad_norm": 3.9375, + "kl": 0.0, + "learning_rate": 8.085226519442697e-06, + "logits/chosen": 858346368.0, + "logits/rejected": 586726400.0, + "logps/chosen": -240.98629760742188, + "logps/rejected": -495.76116943359375, + "loss": 0.0334, + "rewards/chosen": 2.8118605613708496, + "rewards/margins": 11.458268642425537, + "rewards/rejected": -8.646408081054688, + "step": 3171 + }, + { + "epoch": 0.2898126998629511, + "grad_norm": 36.5, + "kl": 0.0, + "learning_rate": 8.084094947478556e-06, + "logits/chosen": 409895475.2, + "logits/rejected": 522367573.3333333, + "logps/chosen": -167.2864501953125, + "logps/rejected": -842.9889322916666, + "loss": 0.0564, + "rewards/chosen": 3.2107826232910157, + "rewards/margins": 15.445157623291015, + "rewards/rejected": -12.234375, + "step": 3172 + }, + { + "epoch": 0.2899040657834628, + "grad_norm": 26.0, + "kl": 0.0, + "learning_rate": 8.082963120488337e-06, + "logits/chosen": 443851995.4285714, + "logits/rejected": 390626816.0, + "logps/chosen": -200.44049944196428, + "logps/rejected": -372.96746826171875, + "loss": 0.0493, + "rewards/chosen": 3.524350847516741, + "rewards/margins": 8.70217057636806, + "rewards/rejected": -5.177819728851318, + "step": 3173 + }, + { + "epoch": 0.2899954317039744, + "grad_norm": 1.9609375, + "kl": 0.0, + "learning_rate": 8.081831038565631e-06, + "logits/chosen": 587511296.0, + "logits/rejected": 478632512.0, + "logps/chosen": -320.45220947265625, + "logps/rejected": -537.582763671875, + "loss": 0.0127, + "rewards/chosen": 4.092085361480713, + "rewards/margins": 14.694143772125244, + "rewards/rejected": -10.602058410644531, + "step": 3174 + }, + { + "epoch": 0.2900867976244861, + "grad_norm": 4.75, + "kl": 0.0, + "learning_rate": 8.080698701804054e-06, + "logits/chosen": 498622924.8, + "logits/rejected": 507238357.3333333, + "logps/chosen": -383.0866943359375, + "logps/rejected": -717.9546712239584, + "loss": 0.0225, + "rewards/chosen": 4.300484466552734, + "rewards/margins": 14.206429290771485, + "rewards/rejected": -9.90594482421875, + "step": 3175 + }, + { + "epoch": 0.2901781635449977, + "grad_norm": 34.75, + "kl": 0.0, + "learning_rate": 8.079566110297237e-06, + "logits/chosen": 318884736.0, + "logits/rejected": 423949376.0, + "logps/chosen": -151.15847778320312, + "logps/rejected": -575.9681396484375, + "loss": 0.0532, + "rewards/chosen": 2.2561140060424805, + "rewards/margins": 12.032503128051758, + "rewards/rejected": -9.776389122009277, + "step": 3176 + }, + { + "epoch": 0.2902695294655094, + "grad_norm": 2.90625, + "kl": 0.0, + "learning_rate": 8.078433264138835e-06, + "logits/chosen": 413058304.0, + "logits/rejected": 228044256.0, + "logps/chosen": -247.783203125, + "logps/rejected": -461.1403503417969, + "loss": 0.0217, + "rewards/chosen": 3.388819694519043, + "rewards/margins": 11.11096715927124, + "rewards/rejected": -7.722147464752197, + "step": 3177 + }, + { + "epoch": 0.290360895386021, + "grad_norm": 2.296875, + "kl": 0.0, + "learning_rate": 8.077300163422525e-06, + "logits/chosen": 614850218.6666666, + "logits/rejected": 345240115.2, + "logps/chosen": -304.37660725911456, + "logps/rejected": -479.35849609375, + "loss": 0.0127, + "rewards/chosen": 3.971833864847819, + "rewards/margins": 15.535619417826334, + "rewards/rejected": -11.563785552978516, + "step": 3178 + }, + { + "epoch": 0.2904522613065327, + "grad_norm": 5.21875, + "kl": 0.0, + "learning_rate": 8.076166808242004e-06, + "logits/chosen": 532203648.0, + "logits/rejected": 763303232.0, + "logps/chosen": -320.403076171875, + "logps/rejected": -497.238037109375, + "loss": 0.0287, + "rewards/chosen": 3.2465474605560303, + "rewards/margins": 12.6592538356781, + "rewards/rejected": -9.41270637512207, + "step": 3179 + }, + { + "epoch": 0.2905436272270443, + "grad_norm": 3.75, + "kl": 0.0, + "learning_rate": 8.075033198690987e-06, + "logits/chosen": 696380224.0, + "logits/rejected": 478090464.0, + "logps/chosen": -443.6958312988281, + "logps/rejected": -451.60382080078125, + "loss": 0.0207, + "rewards/chosen": 3.567488670349121, + "rewards/margins": 12.579845428466797, + "rewards/rejected": -9.012356758117676, + "step": 3180 + }, + { + "epoch": 0.290634993147556, + "grad_norm": 5.59375, + "kl": 0.0, + "learning_rate": 8.073899334863215e-06, + "logits/chosen": 348247594.6666667, + "logits/rejected": 485071616.0, + "logps/chosen": -215.1669921875, + "logps/rejected": -597.303857421875, + "loss": 0.0376, + "rewards/chosen": 2.7158114115397134, + "rewards/margins": 12.645184580485026, + "rewards/rejected": -9.929373168945313, + "step": 3181 + }, + { + "epoch": 0.2907263590680676, + "grad_norm": 11.625, + "kl": 0.0, + "learning_rate": 8.072765216852451e-06, + "logits/chosen": 862843477.3333334, + "logits/rejected": 579162214.4, + "logps/chosen": -312.52272542317706, + "logps/rejected": -513.3109375, + "loss": 0.0742, + "rewards/chosen": 3.963237444559733, + "rewards/margins": 10.440843645731608, + "rewards/rejected": -6.477606201171875, + "step": 3182 + }, + { + "epoch": 0.2908177249885793, + "grad_norm": 3.515625, + "kl": 0.0, + "learning_rate": 8.071630844752471e-06, + "logits/chosen": 604790058.6666666, + "logits/rejected": 1733190784.0, + "logps/chosen": -250.8469441731771, + "logps/rejected": -683.4703979492188, + "loss": 0.0255, + "rewards/chosen": 3.631803512573242, + "rewards/margins": 11.434708595275879, + "rewards/rejected": -7.802905082702637, + "step": 3183 + }, + { + "epoch": 0.2909090909090909, + "grad_norm": 6.125, + "kl": 0.0, + "learning_rate": 8.07049621865708e-06, + "logits/chosen": 678506752.0, + "logits/rejected": 1196593664.0, + "logps/chosen": -277.1744384765625, + "logps/rejected": -551.89697265625, + "loss": 0.0321, + "rewards/chosen": 3.180103063583374, + "rewards/margins": 12.53248381614685, + "rewards/rejected": -9.352380752563477, + "step": 3184 + }, + { + "epoch": 0.29100045682960257, + "grad_norm": 4.3125, + "kl": 0.0, + "learning_rate": 8.069361338660102e-06, + "logits/chosen": 737963520.0, + "logits/rejected": 562405273.6, + "logps/chosen": -301.9694417317708, + "logps/rejected": -693.104296875, + "loss": 0.0187, + "rewards/chosen": 3.5824015935262046, + "rewards/margins": 14.008434613545736, + "rewards/rejected": -10.426033020019531, + "step": 3185 + }, + { + "epoch": 0.2910918227501142, + "grad_norm": 2.703125, + "kl": 0.0, + "learning_rate": 8.068226204855379e-06, + "logits/chosen": 531575744.0, + "logits/rejected": 516020992.0, + "logps/chosen": -465.03369140625, + "logps/rejected": -482.3238525390625, + "loss": 0.0191, + "rewards/chosen": 3.710556745529175, + "rewards/margins": 12.034662008285522, + "rewards/rejected": -8.324105262756348, + "step": 3186 + }, + { + "epoch": 0.29118318867062587, + "grad_norm": 5.84375, + "kl": 0.0, + "learning_rate": 8.067090817336777e-06, + "logits/chosen": 535131349.3333333, + "logits/rejected": 783948748.8, + "logps/chosen": -302.52581787109375, + "logps/rejected": -570.009814453125, + "loss": 0.031, + "rewards/chosen": 2.999976476033529, + "rewards/margins": 11.277417119344076, + "rewards/rejected": -8.277440643310547, + "step": 3187 + }, + { + "epoch": 0.2912745545911375, + "grad_norm": 8.75, + "kl": 0.0, + "learning_rate": 8.065955176198182e-06, + "logits/chosen": 320213632.0, + "logits/rejected": 312981973.3333333, + "logps/chosen": -311.29815673828125, + "logps/rejected": -441.5160319010417, + "loss": 0.0202, + "rewards/chosen": 3.8572497367858887, + "rewards/margins": 14.420728842417398, + "rewards/rejected": -10.56347910563151, + "step": 3188 + }, + { + "epoch": 0.29136592051164917, + "grad_norm": 2.015625, + "kl": 0.0, + "learning_rate": 8.064819281533499e-06, + "logits/chosen": 291797440.0, + "logits/rejected": 691435776.0, + "logps/chosen": -403.6463928222656, + "logps/rejected": -478.32208251953125, + "loss": 0.0094, + "rewards/chosen": 4.705949783325195, + "rewards/margins": 12.677422523498535, + "rewards/rejected": -7.97147274017334, + "step": 3189 + }, + { + "epoch": 0.2914572864321608, + "grad_norm": 4.21875, + "kl": 0.0, + "learning_rate": 8.063683133436657e-06, + "logits/chosen": 678936234.6666666, + "logits/rejected": 943862784.0, + "logps/chosen": -401.7197265625, + "logps/rejected": -496.194775390625, + "loss": 0.1212, + "rewards/chosen": 3.3097194035847983, + "rewards/margins": 10.581994183858235, + "rewards/rejected": -7.272274780273437, + "step": 3190 + }, + { + "epoch": 0.29154865235267247, + "grad_norm": 5.53125, + "kl": 0.0, + "learning_rate": 8.062546732001608e-06, + "logits/chosen": 590804480.0, + "logits/rejected": 658051754.6666666, + "logps/chosen": -391.7810546875, + "logps/rejected": -900.5841471354166, + "loss": 0.0292, + "rewards/chosen": 3.270555114746094, + "rewards/margins": 14.133798217773437, + "rewards/rejected": -10.863243103027344, + "step": 3191 + }, + { + "epoch": 0.2916400182731841, + "grad_norm": 2.328125, + "kl": 0.0, + "learning_rate": 8.061410077322319e-06, + "logits/chosen": 407943722.6666667, + "logits/rejected": 600662937.6, + "logps/chosen": -183.8393758138021, + "logps/rejected": -471.1130859375, + "loss": 0.0141, + "rewards/chosen": 3.451538403828939, + "rewards/margins": 11.84008363087972, + "rewards/rejected": -8.38854522705078, + "step": 3192 + }, + { + "epoch": 0.29173138419369576, + "grad_norm": 7.03125, + "kl": 0.0, + "learning_rate": 8.060273169492778e-06, + "logits/chosen": 1002131370.6666666, + "logits/rejected": 513567436.8, + "logps/chosen": -267.3661295572917, + "logps/rejected": -555.11630859375, + "loss": 0.0259, + "rewards/chosen": 2.8526767094930015, + "rewards/margins": 12.102849896748861, + "rewards/rejected": -9.250173187255859, + "step": 3193 + }, + { + "epoch": 0.2918227501142074, + "grad_norm": 5.0, + "kl": 0.0, + "learning_rate": 8.059136008607002e-06, + "logits/chosen": 1123287296.0, + "logits/rejected": 1031160320.0, + "logps/chosen": -345.3330485026042, + "logps/rejected": -488.112109375, + "loss": 0.0222, + "rewards/chosen": 3.356764793395996, + "rewards/margins": 11.194057273864747, + "rewards/rejected": -7.83729248046875, + "step": 3194 + }, + { + "epoch": 0.29191411603471906, + "grad_norm": 10.125, + "kl": 0.0, + "learning_rate": 8.057998594759022e-06, + "logits/chosen": 307045696.0, + "logits/rejected": 529407561.14285713, + "logps/chosen": -286.9096374511719, + "logps/rejected": -423.20835658482144, + "loss": 0.0874, + "rewards/chosen": 4.6415863037109375, + "rewards/margins": 11.04776872907366, + "rewards/rejected": -6.406182425362723, + "step": 3195 + }, + { + "epoch": 0.2920054819552307, + "grad_norm": 3.046875, + "kl": 0.0, + "learning_rate": 8.056860928042892e-06, + "logits/chosen": 618570240.0, + "logits/rejected": 624285286.4, + "logps/chosen": -401.8917643229167, + "logps/rejected": -491.541845703125, + "loss": 0.016, + "rewards/chosen": 3.255577723185221, + "rewards/margins": 11.929352442423502, + "rewards/rejected": -8.673774719238281, + "step": 3196 + }, + { + "epoch": 0.29209684787574236, + "grad_norm": 2.9375, + "kl": 0.0, + "learning_rate": 8.055723008552684e-06, + "logits/chosen": 380150208.0, + "logits/rejected": 507993600.0, + "logps/chosen": -346.64312744140625, + "logps/rejected": -562.3262329101562, + "loss": 0.0169, + "rewards/chosen": 3.7941575050354004, + "rewards/margins": 11.88698434829712, + "rewards/rejected": -8.092826843261719, + "step": 3197 + }, + { + "epoch": 0.292188213796254, + "grad_norm": 4.375, + "kl": 0.0, + "learning_rate": 8.054584836382496e-06, + "logits/chosen": 291929344.0, + "logits/rejected": 518127040.0, + "logps/chosen": -402.142822265625, + "logps/rejected": -566.2949829101562, + "loss": 0.0243, + "rewards/chosen": 3.3471858501434326, + "rewards/margins": 13.475264310836792, + "rewards/rejected": -10.12807846069336, + "step": 3198 + }, + { + "epoch": 0.29227957971676566, + "grad_norm": 5.625, + "kl": 0.0, + "learning_rate": 8.053446411626442e-06, + "logits/chosen": 668326400.0, + "logits/rejected": 526813988.5714286, + "logps/chosen": -172.0641632080078, + "logps/rejected": -645.6418805803571, + "loss": 0.0522, + "rewards/chosen": 3.050854444503784, + "rewards/margins": 11.767850773675102, + "rewards/rejected": -8.716996329171318, + "step": 3199 + }, + { + "epoch": 0.2923709456372773, + "grad_norm": 1.921875, + "kl": 0.0, + "learning_rate": 8.052307734378661e-06, + "logits/chosen": 379691808.0, + "logits/rejected": 779046016.0, + "logps/chosen": -244.1895751953125, + "logps/rejected": -495.9484558105469, + "loss": 0.0161, + "rewards/chosen": 3.5828661918640137, + "rewards/margins": 12.62703275680542, + "rewards/rejected": -9.044166564941406, + "step": 3200 + }, + { + "epoch": 0.29246231155778896, + "grad_norm": 4.375, + "kl": 0.0, + "learning_rate": 8.051168804733314e-06, + "logits/chosen": 657202995.2, + "logits/rejected": 509593216.0, + "logps/chosen": -426.6005859375, + "logps/rejected": -357.260986328125, + "loss": 0.0182, + "rewards/chosen": 4.044665145874023, + "rewards/margins": 12.69846102396647, + "rewards/rejected": -8.653795878092447, + "step": 3201 + }, + { + "epoch": 0.2925536774783006, + "grad_norm": 8.75, + "kl": 0.0, + "learning_rate": 8.050029622784573e-06, + "logits/chosen": 476375859.2, + "logits/rejected": 568653141.3333334, + "logps/chosen": -228.59501953125, + "logps/rejected": -257.8435465494792, + "loss": 0.0907, + "rewards/chosen": 3.2988914489746093, + "rewards/margins": 8.349683507283528, + "rewards/rejected": -5.050792058308919, + "step": 3202 + }, + { + "epoch": 0.29264504339881225, + "grad_norm": 3.625, + "kl": 0.0, + "learning_rate": 8.048890188626642e-06, + "logits/chosen": 737245184.0, + "logits/rejected": 740444330.6666666, + "logps/chosen": -580.56943359375, + "logps/rejected": -946.2545572916666, + "loss": 0.0193, + "rewards/chosen": 4.035494995117188, + "rewards/margins": 14.691341908772788, + "rewards/rejected": -10.6558469136556, + "step": 3203 + }, + { + "epoch": 0.2927364093193239, + "grad_norm": 2.671875, + "kl": 0.0, + "learning_rate": 8.047750502353741e-06, + "logits/chosen": 262697813.33333334, + "logits/rejected": 486920140.8, + "logps/chosen": -433.1221516927083, + "logps/rejected": -414.500390625, + "loss": 0.0145, + "rewards/chosen": 3.6262035369873047, + "rewards/margins": 11.918394088745117, + "rewards/rejected": -8.292190551757812, + "step": 3204 + }, + { + "epoch": 0.29282777523983555, + "grad_norm": 3.21875, + "kl": 0.0, + "learning_rate": 8.046610564060113e-06, + "logits/chosen": 971712085.3333334, + "logits/rejected": 599143372.8, + "logps/chosen": -535.3455403645834, + "logps/rejected": -538.66474609375, + "loss": 0.0179, + "rewards/chosen": 3.2023022969563804, + "rewards/margins": 13.058770497639975, + "rewards/rejected": -9.856468200683594, + "step": 3205 + }, + { + "epoch": 0.2929191411603472, + "grad_norm": 8.75, + "kl": 0.0, + "learning_rate": 8.045470373840016e-06, + "logits/chosen": 590353578.6666666, + "logits/rejected": 416546201.6, + "logps/chosen": -411.2044270833333, + "logps/rejected": -528.5041015625, + "loss": 0.0729, + "rewards/chosen": 3.647314707438151, + "rewards/margins": 13.053789774576822, + "rewards/rejected": -9.406475067138672, + "step": 3206 + }, + { + "epoch": 0.29301050708085885, + "grad_norm": 30.875, + "kl": 0.0, + "learning_rate": 8.04432993178774e-06, + "logits/chosen": 315314944.0, + "logits/rejected": 467760192.0, + "logps/chosen": -340.312255859375, + "logps/rejected": -510.9176025390625, + "loss": 0.0743, + "rewards/chosen": 2.8176183700561523, + "rewards/margins": 12.590734481811523, + "rewards/rejected": -9.773116111755371, + "step": 3207 + }, + { + "epoch": 0.29310187300137047, + "grad_norm": 4.5, + "kl": 0.0, + "learning_rate": 8.043189237997582e-06, + "logits/chosen": 865232998.4, + "logits/rejected": 347434688.0, + "logps/chosen": -337.845947265625, + "logps/rejected": -454.93798828125, + "loss": 0.0267, + "rewards/chosen": 3.3678489685058595, + "rewards/margins": 13.834691619873047, + "rewards/rejected": -10.466842651367188, + "step": 3208 + }, + { + "epoch": 0.29319323892188215, + "grad_norm": 0.7734375, + "kl": 0.0, + "learning_rate": 8.042048292563869e-06, + "logits/chosen": 290596352.0, + "logits/rejected": 696794752.0, + "logps/chosen": -179.24856567382812, + "logps/rejected": -381.08319091796875, + "loss": 0.0053, + "rewards/chosen": 4.840086936950684, + "rewards/margins": 12.575418949127197, + "rewards/rejected": -7.735332012176514, + "step": 3209 + }, + { + "epoch": 0.29328460484239377, + "grad_norm": 5.375, + "kl": 0.0, + "learning_rate": 8.040907095580948e-06, + "logits/chosen": 792353621.3333334, + "logits/rejected": 653171916.8, + "logps/chosen": -362.611083984375, + "logps/rejected": -702.42080078125, + "loss": 0.0263, + "rewards/chosen": 3.075697580973307, + "rewards/margins": 12.73779093424479, + "rewards/rejected": -9.662093353271484, + "step": 3210 + }, + { + "epoch": 0.29337597076290545, + "grad_norm": 4.90625, + "kl": 0.0, + "learning_rate": 8.039765647143187e-06, + "logits/chosen": 590942617.6, + "logits/rejected": 547714090.6666666, + "logps/chosen": -317.3958984375, + "logps/rejected": -698.07373046875, + "loss": 0.0329, + "rewards/chosen": 3.2599891662597655, + "rewards/margins": 13.810484313964844, + "rewards/rejected": -10.550495147705078, + "step": 3211 + }, + { + "epoch": 0.29346733668341707, + "grad_norm": 2.046875, + "kl": 0.0, + "learning_rate": 8.03862394734497e-06, + "logits/chosen": 823988565.3333334, + "logits/rejected": 684096256.0, + "logps/chosen": -341.0434977213542, + "logps/rejected": -535.30615234375, + "loss": 0.01, + "rewards/chosen": 4.420324325561523, + "rewards/margins": 12.97792320251465, + "rewards/rejected": -8.557598876953126, + "step": 3212 + }, + { + "epoch": 0.29355870260392875, + "grad_norm": 3.140625, + "kl": 0.0, + "learning_rate": 8.037481996280705e-06, + "logits/chosen": 771657152.0, + "logits/rejected": 933505408.0, + "logps/chosen": -309.660888671875, + "logps/rejected": -505.31005859375, + "loss": 0.0303, + "rewards/chosen": 2.9706273078918457, + "rewards/margins": 10.981173992156982, + "rewards/rejected": -8.010546684265137, + "step": 3213 + }, + { + "epoch": 0.29365006852444037, + "grad_norm": 6.375, + "kl": 0.0, + "learning_rate": 8.036339794044822e-06, + "logits/chosen": 620539699.2, + "logits/rejected": 690652202.6666666, + "logps/chosen": -477.25693359375, + "logps/rejected": -541.3173828125, + "loss": 0.0355, + "rewards/chosen": 3.255967712402344, + "rewards/margins": 11.775544738769531, + "rewards/rejected": -8.519577026367188, + "step": 3214 + }, + { + "epoch": 0.29374143444495204, + "grad_norm": 10.1875, + "kl": 9.855539321899414, + "learning_rate": 8.03519734073177e-06, + "logits/chosen": 340117184.0, + "logps/chosen": -217.91116333007812, + "loss": 0.0944, + "rewards/chosen": 3.918428421020508, + "step": 3215 + }, + { + "epoch": 0.29383280036546366, + "grad_norm": 3.71875, + "kl": 0.0, + "learning_rate": 8.034054636436022e-06, + "logits/chosen": 725689344.0, + "logits/rejected": 605128021.3333334, + "logps/chosen": -373.88323974609375, + "logps/rejected": -603.7901204427084, + "loss": 0.0171, + "rewards/chosen": 2.6234560012817383, + "rewards/margins": 13.337133725484213, + "rewards/rejected": -10.713677724202475, + "step": 3216 + }, + { + "epoch": 0.29392416628597534, + "grad_norm": 0.97265625, + "kl": 0.0, + "learning_rate": 8.032911681252064e-06, + "logits/chosen": 876089536.0, + "logits/rejected": 377371428.5714286, + "logps/chosen": -714.573486328125, + "logps/rejected": -487.97757393973217, + "loss": 0.0028, + "rewards/chosen": 3.812023878097534, + "rewards/margins": 14.159425088337489, + "rewards/rejected": -10.347401210239955, + "step": 3217 + }, + { + "epoch": 0.29401553220648696, + "grad_norm": 3.4375, + "kl": 0.0, + "learning_rate": 8.031768475274412e-06, + "logits/chosen": 617628501.3333334, + "logits/rejected": 457879091.2, + "logps/chosen": -350.7324625651042, + "logps/rejected": -406.914306640625, + "loss": 0.0192, + "rewards/chosen": 3.099721908569336, + "rewards/margins": 12.343479537963868, + "rewards/rejected": -9.243757629394532, + "step": 3218 + }, + { + "epoch": 0.29410689812699864, + "grad_norm": 1.265625, + "kl": 0.0, + "learning_rate": 8.030625018597598e-06, + "logits/chosen": 758896768.0, + "logits/rejected": 406820242.28571427, + "logps/chosen": -127.5328369140625, + "logps/rejected": -436.5897739955357, + "loss": 0.117, + "rewards/chosen": 3.128610372543335, + "rewards/margins": 11.593940768923078, + "rewards/rejected": -8.465330396379743, + "step": 3219 + }, + { + "epoch": 0.29419826404751026, + "grad_norm": 6.59375, + "kl": 0.0, + "learning_rate": 8.029481311316172e-06, + "logits/chosen": 594028117.3333334, + "logits/rejected": 374897715.2, + "logps/chosen": -379.8101399739583, + "logps/rejected": -437.302783203125, + "loss": 0.0295, + "rewards/chosen": 2.660581588745117, + "rewards/margins": 12.105808639526368, + "rewards/rejected": -9.44522705078125, + "step": 3220 + }, + { + "epoch": 0.29428962996802194, + "grad_norm": 27.125, + "kl": 0.0, + "learning_rate": 8.028337353524712e-06, + "logits/chosen": 638952448.0, + "logits/rejected": 531693920.0, + "logps/chosen": -309.82220458984375, + "logps/rejected": -422.0125732421875, + "loss": 0.1223, + "rewards/chosen": 1.600706934928894, + "rewards/margins": 9.643583178520203, + "rewards/rejected": -8.042876243591309, + "step": 3221 + }, + { + "epoch": 0.29438099588853356, + "grad_norm": 9.0, + "kl": 9.813898086547852, + "learning_rate": 8.02719314531781e-06, + "logits/chosen": 401519872.0, + "logits/rejected": 239988096.0, + "logps/chosen": -364.3318568638393, + "logps/rejected": -281.95904541015625, + "loss": 0.0632, + "rewards/chosen": 3.8450230189732144, + "rewards/margins": 10.392796857016428, + "rewards/rejected": -6.547773838043213, + "step": 3222 + }, + { + "epoch": 0.29447236180904524, + "grad_norm": 3.828125, + "kl": 0.0, + "learning_rate": 8.026048686790082e-06, + "logits/chosen": 916064768.0, + "logits/rejected": 895017642.6666666, + "logps/chosen": -240.410791015625, + "logps/rejected": -634.7274576822916, + "loss": 0.0275, + "rewards/chosen": 3.961060333251953, + "rewards/margins": 13.291934458414712, + "rewards/rejected": -9.33087412516276, + "step": 3223 + }, + { + "epoch": 0.29456372772955686, + "grad_norm": 4.3125, + "kl": 0.0, + "learning_rate": 8.024903978036166e-06, + "logits/chosen": 654206805.3333334, + "logits/rejected": 1133332992.0, + "logps/chosen": -439.885009765625, + "logps/rejected": -718.672265625, + "loss": 0.0207, + "rewards/chosen": 3.208070755004883, + "rewards/margins": 13.982570266723632, + "rewards/rejected": -10.77449951171875, + "step": 3224 + }, + { + "epoch": 0.29465509365006853, + "grad_norm": 1.578125, + "kl": 0.0, + "learning_rate": 8.023759019150716e-06, + "logits/chosen": 443780224.0, + "logits/rejected": 762788659.2, + "logps/chosen": -230.56380208333334, + "logps/rejected": -514.08916015625, + "loss": 0.0093, + "rewards/chosen": 4.099125226338704, + "rewards/margins": 14.160030682881672, + "rewards/rejected": -10.060905456542969, + "step": 3225 + }, + { + "epoch": 0.29474645957058015, + "grad_norm": 3.765625, + "kl": 0.0, + "learning_rate": 8.022613810228413e-06, + "logits/chosen": 865030144.0, + "logits/rejected": 641876096.0, + "logps/chosen": -299.98486328125, + "logps/rejected": -446.9568684895833, + "loss": 0.0278, + "rewards/chosen": 3.1503681182861327, + "rewards/margins": 11.151033782958985, + "rewards/rejected": -8.000665664672852, + "step": 3226 + }, + { + "epoch": 0.29483782549109183, + "grad_norm": 4.03125, + "kl": 0.0, + "learning_rate": 8.02146835136395e-06, + "logits/chosen": 558444288.0, + "logits/rejected": 799254080.0, + "logps/chosen": -194.6627197265625, + "logps/rejected": -575.5543212890625, + "loss": 0.0433, + "rewards/chosen": 3.198603947957357, + "rewards/margins": 9.24795134862264, + "rewards/rejected": -6.049347400665283, + "step": 3227 + }, + { + "epoch": 0.29492919141160345, + "grad_norm": 33.25, + "kl": 0.0, + "learning_rate": 8.020322642652053e-06, + "logits/chosen": 1014392320.0, + "logits/rejected": 415789472.0, + "logps/chosen": -313.4767150878906, + "logps/rejected": -286.2862854003906, + "loss": 0.0895, + "rewards/chosen": 3.7202091217041016, + "rewards/margins": 8.914484024047852, + "rewards/rejected": -5.19427490234375, + "step": 3228 + }, + { + "epoch": 0.29502055733211513, + "grad_norm": 4.875, + "kl": 0.0, + "learning_rate": 8.019176684187453e-06, + "logits/chosen": 1151370854.4, + "logits/rejected": 719469397.3333334, + "logps/chosen": -418.99169921875, + "logps/rejected": -370.7864990234375, + "loss": 0.0291, + "rewards/chosen": 3.6307411193847656, + "rewards/margins": 11.76506487528483, + "rewards/rejected": -8.134323755900065, + "step": 3229 + }, + { + "epoch": 0.29511192325262675, + "grad_norm": 6.84375, + "kl": 0.0, + "learning_rate": 8.018030476064918e-06, + "logits/chosen": 842784576.0, + "logits/rejected": 785259434.6666666, + "logps/chosen": -513.7980346679688, + "logps/rejected": -451.7753092447917, + "loss": 0.0198, + "rewards/chosen": 2.9075927734375, + "rewards/margins": 11.071755727132162, + "rewards/rejected": -8.164162953694662, + "step": 3230 + }, + { + "epoch": 0.29520328917313843, + "grad_norm": 4.0, + "kl": 0.0, + "learning_rate": 8.016884018379223e-06, + "logits/chosen": 658093977.6, + "logits/rejected": 659233749.3333334, + "logps/chosen": -394.0104736328125, + "logps/rejected": -375.2217610677083, + "loss": 0.0276, + "rewards/chosen": 3.4338077545166015, + "rewards/margins": 10.087467575073243, + "rewards/rejected": -6.653659820556641, + "step": 3231 + }, + { + "epoch": 0.29529465509365005, + "grad_norm": 26.0, + "kl": 0.0, + "learning_rate": 8.015737311225173e-06, + "logits/chosen": 485524736.0, + "logits/rejected": 350723891.2, + "logps/chosen": -309.922119140625, + "logps/rejected": -489.1689453125, + "loss": 0.0885, + "rewards/chosen": 3.6869370142618814, + "rewards/margins": 13.267498842875161, + "rewards/rejected": -9.58056182861328, + "step": 3232 + }, + { + "epoch": 0.2953860210141617, + "grad_norm": 32.5, + "kl": 0.0, + "learning_rate": 8.014590354697588e-06, + "logits/chosen": 457583680.0, + "logits/rejected": 503948288.0, + "logps/chosen": -342.4151611328125, + "logps/rejected": -508.3761393229167, + "loss": 0.0398, + "rewards/chosen": 3.5746231079101562, + "rewards/margins": 11.903231302897135, + "rewards/rejected": -8.328608194986979, + "step": 3233 + }, + { + "epoch": 0.29547738693467335, + "grad_norm": 3.453125, + "kl": 0.0, + "learning_rate": 8.01344314889131e-06, + "logits/chosen": 395885536.0, + "logits/rejected": 539732096.0, + "logps/chosen": -256.11492919921875, + "logps/rejected": -363.336669921875, + "loss": 0.0261, + "rewards/chosen": 3.2195801734924316, + "rewards/margins": 12.598259449005127, + "rewards/rejected": -9.378679275512695, + "step": 3234 + }, + { + "epoch": 0.295568752855185, + "grad_norm": 6.6875, + "kl": 0.0, + "learning_rate": 8.012295693901206e-06, + "logits/chosen": 441625161.14285713, + "logits/rejected": 1072398848.0, + "logps/chosen": -296.06937081473217, + "logps/rejected": -530.6112060546875, + "loss": 0.0509, + "rewards/chosen": 2.9882896968296597, + "rewards/margins": 11.37036977495466, + "rewards/rejected": -8.382080078125, + "step": 3235 + }, + { + "epoch": 0.29566011877569665, + "grad_norm": 2.921875, + "kl": 0.0, + "learning_rate": 8.011147989822157e-06, + "logits/chosen": 675693824.0, + "logits/rejected": 580706261.3333334, + "logps/chosen": -443.235400390625, + "logps/rejected": -475.7483723958333, + "loss": 0.0196, + "rewards/chosen": 3.534934234619141, + "rewards/margins": 12.882281494140624, + "rewards/rejected": -9.347347259521484, + "step": 3236 + }, + { + "epoch": 0.2957514846962083, + "grad_norm": 1.375, + "kl": 0.0, + "learning_rate": 8.010000036749068e-06, + "logits/chosen": 445757120.0, + "logits/rejected": 554372778.6666666, + "logps/chosen": -276.849365234375, + "logps/rejected": -670.4480387369791, + "loss": 0.0049, + "rewards/chosen": 4.30526065826416, + "rewards/margins": 14.52077833811442, + "rewards/rejected": -10.21551767985026, + "step": 3237 + }, + { + "epoch": 0.29584285061671994, + "grad_norm": 1.7109375, + "kl": 0.0, + "learning_rate": 8.008851834776865e-06, + "logits/chosen": 509400160.0, + "logits/rejected": 674206805.3333334, + "logps/chosen": -334.60321044921875, + "logps/rejected": -436.4808756510417, + "loss": 0.0147, + "rewards/chosen": 2.9575142860412598, + "rewards/margins": 12.101988315582275, + "rewards/rejected": -9.144474029541016, + "step": 3238 + }, + { + "epoch": 0.2959342165372316, + "grad_norm": 4.0, + "kl": 0.0, + "learning_rate": 8.007703384000494e-06, + "logits/chosen": 639735552.0, + "logits/rejected": 1154309504.0, + "logps/chosen": -217.36669921875, + "logps/rejected": -588.2557983398438, + "loss": 0.0176, + "rewards/chosen": 3.7021584510803223, + "rewards/margins": 14.951201915740967, + "rewards/rejected": -11.249043464660645, + "step": 3239 + }, + { + "epoch": 0.29602558245774324, + "grad_norm": 5.40625, + "kl": 0.0, + "learning_rate": 8.006554684514917e-06, + "logits/chosen": 684748928.0, + "logits/rejected": 535658048.0, + "logps/chosen": -309.60101318359375, + "logps/rejected": -396.4810791015625, + "loss": 0.0284, + "rewards/chosen": 3.110116958618164, + "rewards/margins": 9.932804107666016, + "rewards/rejected": -6.822687149047852, + "step": 3240 + }, + { + "epoch": 0.2961169483782549, + "grad_norm": 2.046875, + "kl": 0.0, + "learning_rate": 8.005405736415127e-06, + "logits/chosen": 567232000.0, + "logits/rejected": 259308697.6, + "logps/chosen": -178.2129109700521, + "logps/rejected": -347.5696533203125, + "loss": 0.0156, + "rewards/chosen": 3.3870598475138345, + "rewards/margins": 12.221410433451334, + "rewards/rejected": -8.8343505859375, + "step": 3241 + }, + { + "epoch": 0.29620831429876654, + "grad_norm": 2.875, + "kl": 0.0, + "learning_rate": 8.004256539796127e-06, + "logits/chosen": 339246464.0, + "logits/rejected": 596181696.0, + "logps/chosen": -243.2178192138672, + "logps/rejected": -356.07440185546875, + "loss": 0.0111, + "rewards/chosen": 5.026717662811279, + "rewards/margins": 13.027191638946533, + "rewards/rejected": -8.000473976135254, + "step": 3242 + }, + { + "epoch": 0.2962996802192782, + "grad_norm": 0.70703125, + "kl": 0.0, + "learning_rate": 8.003107094752945e-06, + "logits/chosen": 555500714.6666666, + "logits/rejected": 705984512.0, + "logps/chosen": -214.65877278645834, + "logps/rejected": -717.0935546875, + "loss": 0.0043, + "rewards/chosen": 4.543590545654297, + "rewards/margins": 14.109381866455077, + "rewards/rejected": -9.56579132080078, + "step": 3243 + }, + { + "epoch": 0.29639104613978984, + "grad_norm": 2.03125, + "kl": 0.0, + "learning_rate": 8.001957401380633e-06, + "logits/chosen": 593754112.0, + "logits/rejected": 474495692.8, + "logps/chosen": -316.52838134765625, + "logps/rejected": -651.10712890625, + "loss": 0.0136, + "rewards/chosen": 3.5489794413248696, + "rewards/margins": 15.0519655863444, + "rewards/rejected": -11.50298614501953, + "step": 3244 + }, + { + "epoch": 0.2964824120603015, + "grad_norm": 1.1875, + "kl": 0.0, + "learning_rate": 8.000807459774256e-06, + "logits/chosen": 366837418.6666667, + "logits/rejected": 363886438.4, + "logps/chosen": -183.58577473958334, + "logps/rejected": -519.04365234375, + "loss": 0.0097, + "rewards/chosen": 3.664576848347982, + "rewards/margins": 12.322767766316732, + "rewards/rejected": -8.65819091796875, + "step": 3245 + }, + { + "epoch": 0.29657377798081314, + "grad_norm": 2.78125, + "kl": 0.0, + "learning_rate": 7.999657270028904e-06, + "logits/chosen": 1083552384.0, + "logits/rejected": 538316842.6666666, + "logps/chosen": -100.06808471679688, + "logps/rejected": -505.3080240885417, + "loss": 0.0275, + "rewards/chosen": 2.5082969665527344, + "rewards/margins": 13.466423034667969, + "rewards/rejected": -10.958126068115234, + "step": 3246 + }, + { + "epoch": 0.2966651439013248, + "grad_norm": 4.125, + "kl": 0.0, + "learning_rate": 7.998506832239689e-06, + "logits/chosen": 485651840.0, + "logits/rejected": 596132992.0, + "logps/chosen": -309.82861328125, + "logps/rejected": -631.5264282226562, + "loss": 0.0251, + "rewards/chosen": 3.1858246326446533, + "rewards/margins": 11.844376802444458, + "rewards/rejected": -8.658552169799805, + "step": 3247 + }, + { + "epoch": 0.29675650982183643, + "grad_norm": 1.4765625, + "kl": 0.0, + "learning_rate": 7.997356146501743e-06, + "logits/chosen": 489047008.0, + "logits/rejected": 492465920.0, + "logps/chosen": -363.2611999511719, + "logps/rejected": -565.8915201822916, + "loss": 0.0048, + "rewards/chosen": 4.264848232269287, + "rewards/margins": 14.317329565684, + "rewards/rejected": -10.052481333414713, + "step": 3248 + }, + { + "epoch": 0.2968478757423481, + "grad_norm": 0.498046875, + "kl": 0.0, + "learning_rate": 7.996205212910214e-06, + "logits/chosen": 225386848.0, + "logits/rejected": 513697280.0, + "logps/chosen": -222.22137451171875, + "logps/rejected": -535.5574137369791, + "loss": 0.0028, + "rewards/chosen": 5.2101945877075195, + "rewards/margins": 14.599578539530436, + "rewards/rejected": -9.389383951822916, + "step": 3249 + }, + { + "epoch": 0.29693924166285973, + "grad_norm": 34.0, + "kl": 0.0, + "learning_rate": 7.995054031560271e-06, + "logits/chosen": 383139865.6, + "logits/rejected": 266024896.0, + "logps/chosen": -230.8485107421875, + "logps/rejected": -422.6077880859375, + "loss": 0.0865, + "rewards/chosen": 2.517159271240234, + "rewards/margins": 10.957394409179688, + "rewards/rejected": -8.440235137939453, + "step": 3250 + }, + { + "epoch": 0.2970306075833714, + "grad_norm": 2.84375, + "kl": 0.0, + "learning_rate": 7.993902602547113e-06, + "logits/chosen": 756242090.6666666, + "logits/rejected": 531978592.0, + "logps/chosen": -295.2711588541667, + "logps/rejected": -426.4436340332031, + "loss": 0.0186, + "rewards/chosen": 4.113508224487305, + "rewards/margins": 13.818010330200195, + "rewards/rejected": -9.70450210571289, + "step": 3251 + }, + { + "epoch": 0.29712197350388303, + "grad_norm": 1.6796875, + "kl": 0.0, + "learning_rate": 7.99275092596595e-06, + "logits/chosen": 450431680.0, + "logits/rejected": 785089194.6666666, + "logps/chosen": -287.0353088378906, + "logps/rejected": -238.6031697591146, + "loss": 0.0098, + "rewards/chosen": 3.566314220428467, + "rewards/margins": 10.937854290008545, + "rewards/rejected": -7.371540069580078, + "step": 3252 + }, + { + "epoch": 0.2972133394243947, + "grad_norm": 4.4375, + "kl": 0.0, + "learning_rate": 7.99159900191201e-06, + "logits/chosen": 1327911850.6666667, + "logits/rejected": 677645004.8, + "logps/chosen": -317.213623046875, + "logps/rejected": -476.672265625, + "loss": 0.0272, + "rewards/chosen": 2.8215576807657876, + "rewards/margins": 10.708324495951334, + "rewards/rejected": -7.886766815185547, + "step": 3253 + }, + { + "epoch": 0.29730470534490633, + "grad_norm": 4.375, + "kl": 0.0, + "learning_rate": 7.990446830480551e-06, + "logits/chosen": 475519648.0, + "logits/rejected": 833160832.0, + "logps/chosen": -278.38287353515625, + "logps/rejected": -446.6842041015625, + "loss": 0.0383, + "rewards/chosen": 3.295286178588867, + "rewards/margins": 12.066217422485352, + "rewards/rejected": -8.770931243896484, + "step": 3254 + }, + { + "epoch": 0.297396071265418, + "grad_norm": 2.25, + "kl": 0.0, + "learning_rate": 7.989294411766848e-06, + "logits/chosen": 499439206.4, + "logits/rejected": 348138752.0, + "logps/chosen": -338.4810546875, + "logps/rejected": -310.45263671875, + "loss": 0.02, + "rewards/chosen": 3.713713836669922, + "rewards/margins": 11.196660232543945, + "rewards/rejected": -7.482946395874023, + "step": 3255 + }, + { + "epoch": 0.2974874371859296, + "grad_norm": 2.65625, + "kl": 0.0, + "learning_rate": 7.988141745866194e-06, + "logits/chosen": 569884480.0, + "logits/rejected": 660772096.0, + "logps/chosen": -355.4984436035156, + "logps/rejected": -573.05126953125, + "loss": 0.0149, + "rewards/chosen": 3.8657729625701904, + "rewards/margins": 13.62246584892273, + "rewards/rejected": -9.756692886352539, + "step": 3256 + }, + { + "epoch": 0.2975788031064413, + "grad_norm": 5.5625, + "kl": 0.0, + "learning_rate": 7.986988832873903e-06, + "logits/chosen": 404735680.0, + "logits/rejected": 555360128.0, + "logps/chosen": -352.68292236328125, + "logps/rejected": -314.33343505859375, + "loss": 0.036, + "rewards/chosen": 3.412823438644409, + "rewards/margins": 10.086719274520874, + "rewards/rejected": -6.673895835876465, + "step": 3257 + }, + { + "epoch": 0.2976701690269529, + "grad_norm": 0.326171875, + "kl": 0.0, + "learning_rate": 7.98583567288531e-06, + "logits/chosen": 146441168.0, + "logits/rejected": 510537252.5714286, + "logps/chosen": -70.19683837890625, + "logps/rejected": -477.3886021205357, + "loss": 0.002, + "rewards/chosen": 4.5194292068481445, + "rewards/margins": 12.834608214242119, + "rewards/rejected": -8.315179007393974, + "step": 3258 + }, + { + "epoch": 0.2977615349474646, + "grad_norm": 3.203125, + "kl": 0.0, + "learning_rate": 7.98468226599577e-06, + "logits/chosen": 567625301.3333334, + "logits/rejected": 535146336.0, + "logps/chosen": -376.9883219401042, + "logps/rejected": -548.6029052734375, + "loss": 0.0239, + "rewards/chosen": 3.739566167195638, + "rewards/margins": 13.268868764241537, + "rewards/rejected": -9.529302597045898, + "step": 3259 + }, + { + "epoch": 0.2978529008679762, + "grad_norm": 1.5546875, + "kl": 0.0, + "learning_rate": 7.983528612300662e-06, + "logits/chosen": 270171616.0, + "logits/rejected": 560206336.0, + "logps/chosen": -198.5438232421875, + "logps/rejected": -546.0262858072916, + "loss": 0.0053, + "rewards/chosen": 4.407126426696777, + "rewards/margins": 13.55057684580485, + "rewards/rejected": -9.143450419108072, + "step": 3260 + }, + { + "epoch": 0.2979442667884879, + "grad_norm": 3.078125, + "kl": 0.0, + "learning_rate": 7.98237471189538e-06, + "logits/chosen": 569111990.8571428, + "logits/rejected": 524079264.0, + "logps/chosen": -286.61631556919644, + "logps/rejected": -498.29339599609375, + "loss": 0.019, + "rewards/chosen": 4.433739798409598, + "rewards/margins": 14.778313773018972, + "rewards/rejected": -10.344573974609375, + "step": 3261 + }, + { + "epoch": 0.2980356327089995, + "grad_norm": 3.890625, + "kl": 0.0, + "learning_rate": 7.981220564875341e-06, + "logits/chosen": 560542037.3333334, + "logits/rejected": 294409472.0, + "logps/chosen": -351.0951334635417, + "logps/rejected": -637.2733764648438, + "loss": 0.0235, + "rewards/chosen": 3.6838061014811196, + "rewards/margins": 18.09034029642741, + "rewards/rejected": -14.406534194946289, + "step": 3262 + }, + { + "epoch": 0.2981269986295112, + "grad_norm": 1.8671875, + "kl": 0.0, + "learning_rate": 7.980066171335984e-06, + "logits/chosen": 589804970.6666666, + "logits/rejected": 405324646.4, + "logps/chosen": -313.5090738932292, + "logps/rejected": -477.71943359375, + "loss": 0.0089, + "rewards/chosen": 4.216057459513347, + "rewards/margins": 12.410255304972331, + "rewards/rejected": -8.194197845458984, + "step": 3263 + }, + { + "epoch": 0.2982183645500228, + "grad_norm": 2.71875, + "kl": 0.0, + "learning_rate": 7.978911531372764e-06, + "logits/chosen": 424693376.0, + "logits/rejected": 597855451.4285715, + "logps/chosen": -164.65762329101562, + "logps/rejected": -651.2140066964286, + "loss": 0.0127, + "rewards/chosen": 2.1922242641448975, + "rewards/margins": 12.057619946343559, + "rewards/rejected": -9.865395682198661, + "step": 3264 + }, + { + "epoch": 0.2983097304705345, + "grad_norm": 4.8125, + "kl": 0.0, + "learning_rate": 7.977756645081163e-06, + "logits/chosen": 376683093.3333333, + "logits/rejected": 688451276.8, + "logps/chosen": -96.724609375, + "logps/rejected": -614.7201171875, + "loss": 0.0311, + "rewards/chosen": 2.530913829803467, + "rewards/margins": 12.508533763885499, + "rewards/rejected": -9.977619934082032, + "step": 3265 + }, + { + "epoch": 0.2984010963910461, + "grad_norm": 2.59375, + "kl": 0.0, + "learning_rate": 7.976601512556676e-06, + "logits/chosen": 665918016.0, + "logits/rejected": 422527872.0, + "logps/chosen": -217.97683715820312, + "logps/rejected": -490.0048828125, + "loss": 0.0128, + "rewards/chosen": 3.0291876792907715, + "rewards/margins": 11.909727891286215, + "rewards/rejected": -8.880540211995443, + "step": 3266 + }, + { + "epoch": 0.2984924623115578, + "grad_norm": 3.40625, + "kl": 0.0, + "learning_rate": 7.975446133894822e-06, + "logits/chosen": 751012864.0, + "logits/rejected": 572739797.3333334, + "logps/chosen": -328.4127685546875, + "logps/rejected": -428.2778727213542, + "loss": 0.0243, + "rewards/chosen": 3.40106086730957, + "rewards/margins": 11.373856989542643, + "rewards/rejected": -7.972796122233073, + "step": 3267 + }, + { + "epoch": 0.2985838282320694, + "grad_norm": 6.40625, + "kl": 0.0, + "learning_rate": 7.974290509191143e-06, + "logits/chosen": 774568362.6666666, + "logits/rejected": 657542720.0, + "logps/chosen": -312.71270751953125, + "logps/rejected": -655.2421875, + "loss": 0.0531, + "rewards/chosen": 3.1053234736124673, + "rewards/margins": 14.050542513529459, + "rewards/rejected": -10.945219039916992, + "step": 3268 + }, + { + "epoch": 0.2986751941525811, + "grad_norm": 2.0, + "kl": 0.0, + "learning_rate": 7.973134638541193e-06, + "logits/chosen": 650797482.6666666, + "logits/rejected": 958938931.2, + "logps/chosen": -363.5299072265625, + "logps/rejected": -708.088330078125, + "loss": 0.0098, + "rewards/chosen": 3.9489920934041343, + "rewards/margins": 12.788343747456869, + "rewards/rejected": -8.839351654052734, + "step": 3269 + }, + { + "epoch": 0.2987665600730927, + "grad_norm": 2.828125, + "kl": 0.0, + "learning_rate": 7.971978522040558e-06, + "logits/chosen": 563880448.0, + "logits/rejected": 592123712.0, + "logps/chosen": -333.2642415364583, + "logps/rejected": -433.4660339355469, + "loss": 0.0216, + "rewards/chosen": 3.6873671213785806, + "rewards/margins": 11.608577410380045, + "rewards/rejected": -7.921210289001465, + "step": 3270 + }, + { + "epoch": 0.2988579259936044, + "grad_norm": 3.140625, + "kl": 0.0, + "learning_rate": 7.970822159784832e-06, + "logits/chosen": 565619328.0, + "logits/rejected": 794462037.3333334, + "logps/chosen": -361.59979248046875, + "logps/rejected": -402.2019856770833, + "loss": 0.0176, + "rewards/chosen": 2.604029893875122, + "rewards/margins": 12.120425462722778, + "rewards/rejected": -9.516395568847656, + "step": 3271 + }, + { + "epoch": 0.298949291914116, + "grad_norm": 1.2421875, + "kl": 0.0, + "learning_rate": 7.969665551869641e-06, + "logits/chosen": 429915264.0, + "logits/rejected": 418425881.6, + "logps/chosen": -253.3579305013021, + "logps/rejected": -529.4830078125, + "loss": 0.0068, + "rewards/chosen": 4.540092468261719, + "rewards/margins": 14.231309509277343, + "rewards/rejected": -9.691217041015625, + "step": 3272 + }, + { + "epoch": 0.2990406578346277, + "grad_norm": 1.1015625, + "kl": 0.0, + "learning_rate": 7.968508698390624e-06, + "logits/chosen": 216607136.0, + "logits/rejected": 400483541.3333333, + "logps/chosen": -200.98031616210938, + "logps/rejected": -433.0032145182292, + "loss": 0.0066, + "rewards/chosen": 4.5986175537109375, + "rewards/margins": 12.492815653483074, + "rewards/rejected": -7.894198099772136, + "step": 3273 + }, + { + "epoch": 0.2991320237551393, + "grad_norm": 3.765625, + "kl": 0.0, + "learning_rate": 7.96735159944344e-06, + "logits/chosen": 767701811.2, + "logits/rejected": 1284521472.0, + "logps/chosen": -382.2423095703125, + "logps/rejected": -620.9397379557291, + "loss": 0.0268, + "rewards/chosen": 3.2266307830810548, + "rewards/margins": 11.324595387776693, + "rewards/rejected": -8.097964604695639, + "step": 3274 + }, + { + "epoch": 0.299223389675651, + "grad_norm": 1.015625, + "kl": 0.0, + "learning_rate": 7.966194255123774e-06, + "logits/chosen": 286023424.0, + "logits/rejected": 580724019.2, + "logps/chosen": -325.5729573567708, + "logps/rejected": -505.61015625, + "loss": 0.0061, + "rewards/chosen": 4.494382858276367, + "rewards/margins": 13.243400192260742, + "rewards/rejected": -8.749017333984375, + "step": 3275 + }, + { + "epoch": 0.2993147555961626, + "grad_norm": 1.7109375, + "kl": 0.0, + "learning_rate": 7.965036665527323e-06, + "logits/chosen": 298470912.0, + "logits/rejected": 413715488.0, + "logps/chosen": -170.9093017578125, + "logps/rejected": -628.9863891601562, + "loss": 0.016, + "rewards/chosen": 4.02370548248291, + "rewards/margins": 13.938782691955566, + "rewards/rejected": -9.915077209472656, + "step": 3276 + }, + { + "epoch": 0.2994061215166743, + "grad_norm": 3.078125, + "kl": 0.0, + "learning_rate": 7.963878830749811e-06, + "logits/chosen": 667469440.0, + "logits/rejected": 434180192.0, + "logps/chosen": -543.9610595703125, + "logps/rejected": -499.4549560546875, + "loss": 0.0146, + "rewards/chosen": 3.7236833572387695, + "rewards/margins": 15.34482479095459, + "rewards/rejected": -11.62114143371582, + "step": 3277 + }, + { + "epoch": 0.2994974874371859, + "grad_norm": 2.84375, + "kl": 0.0, + "learning_rate": 7.962720750886983e-06, + "logits/chosen": 750095308.8, + "logits/rejected": 618841002.6666666, + "logps/chosen": -403.1693359375, + "logps/rejected": -587.7589111328125, + "loss": 0.0183, + "rewards/chosen": 3.7297897338867188, + "rewards/margins": 11.704694112141926, + "rewards/rejected": -7.974904378255208, + "step": 3278 + }, + { + "epoch": 0.2995888533576976, + "grad_norm": 10.625, + "kl": 0.0, + "learning_rate": 7.961562426034599e-06, + "logits/chosen": 471778944.0, + "logits/rejected": 563941568.0, + "logps/chosen": -269.27878824869794, + "logps/rejected": -681.8883056640625, + "loss": 0.1319, + "rewards/chosen": 2.706292470296224, + "rewards/margins": 12.230230649312338, + "rewards/rejected": -9.523938179016113, + "step": 3279 + }, + { + "epoch": 0.2996802192782092, + "grad_norm": 1.15625, + "kl": 0.0, + "learning_rate": 7.96040385628844e-06, + "logits/chosen": 164804032.0, + "logits/rejected": 491081088.0, + "logps/chosen": -432.78302001953125, + "logps/rejected": -334.5054524739583, + "loss": 0.0042, + "rewards/chosen": 4.384849548339844, + "rewards/margins": 12.613262176513672, + "rewards/rejected": -8.228412628173828, + "step": 3280 + }, + { + "epoch": 0.2997715851987209, + "grad_norm": 6.75, + "kl": 0.0, + "learning_rate": 7.959245041744313e-06, + "logits/chosen": 716510208.0, + "logits/rejected": 446220864.0, + "logps/chosen": -150.6805419921875, + "logps/rejected": -532.775634765625, + "loss": 0.0545, + "rewards/chosen": 2.707144021987915, + "rewards/margins": 12.921700716018677, + "rewards/rejected": -10.214556694030762, + "step": 3281 + }, + { + "epoch": 0.2998629511192325, + "grad_norm": 3.34375, + "kl": 0.0, + "learning_rate": 7.958085982498037e-06, + "logits/chosen": 573789952.0, + "logits/rejected": 341997216.0, + "logps/chosen": -292.3345947265625, + "logps/rejected": -380.5333251953125, + "loss": 0.018, + "rewards/chosen": 3.685340404510498, + "rewards/margins": 11.317705154418945, + "rewards/rejected": -7.632364749908447, + "step": 3282 + }, + { + "epoch": 0.2999543170397442, + "grad_norm": 6.15625, + "kl": 0.0, + "learning_rate": 7.956926678645458e-06, + "logits/chosen": 440585685.3333333, + "logits/rejected": 803341120.0, + "logps/chosen": -313.22149658203125, + "logps/rejected": -669.5177001953125, + "loss": 0.0519, + "rewards/chosen": 2.896864573160807, + "rewards/margins": 11.202991167704264, + "rewards/rejected": -8.306126594543457, + "step": 3283 + }, + { + "epoch": 0.3000456829602558, + "grad_norm": 1.890625, + "kl": 0.0, + "learning_rate": 7.95576713028244e-06, + "logits/chosen": 421951744.0, + "logits/rejected": 569953024.0, + "logps/chosen": -282.59002685546875, + "logps/rejected": -516.896142578125, + "loss": 0.0111, + "rewards/chosen": 3.7735671997070312, + "rewards/margins": 12.932525634765625, + "rewards/rejected": -9.158958435058594, + "step": 3284 + }, + { + "epoch": 0.3001370488807675, + "grad_norm": 3.234375, + "kl": 0.0, + "learning_rate": 7.954607337504866e-06, + "logits/chosen": 727927040.0, + "logits/rejected": 891543424.0, + "logps/chosen": -367.6972961425781, + "logps/rejected": -533.3400268554688, + "loss": 0.0185, + "rewards/chosen": 3.5838022232055664, + "rewards/margins": 13.150460243225098, + "rewards/rejected": -9.566658020019531, + "step": 3285 + }, + { + "epoch": 0.3002284148012791, + "grad_norm": 3.5625, + "kl": 0.0, + "learning_rate": 7.95344730040864e-06, + "logits/chosen": 448987712.0, + "logits/rejected": 432970410.6666667, + "logps/chosen": -394.97357177734375, + "logps/rejected": -537.4957275390625, + "loss": 0.0175, + "rewards/chosen": 2.6199052333831787, + "rewards/margins": 11.204964717229208, + "rewards/rejected": -8.58505948384603, + "step": 3286 + }, + { + "epoch": 0.3003197807217908, + "grad_norm": 3.78125, + "kl": 0.0, + "learning_rate": 7.952287019089686e-06, + "logits/chosen": 1796554752.0, + "logits/rejected": 532484992.0, + "logps/chosen": -361.10931396484375, + "logps/rejected": -384.88134765625, + "loss": 0.0245, + "rewards/chosen": 3.374267578125, + "rewards/margins": 11.775283813476562, + "rewards/rejected": -8.401016235351562, + "step": 3287 + }, + { + "epoch": 0.3004111466423024, + "grad_norm": 1.9921875, + "kl": 0.0, + "learning_rate": 7.95112649364395e-06, + "logits/chosen": 538613376.0, + "logits/rejected": 245012787.2, + "logps/chosen": -390.9920247395833, + "logps/rejected": -208.42109375, + "loss": 0.0124, + "rewards/chosen": 3.739527384440104, + "rewards/margins": 9.99672991434733, + "rewards/rejected": -6.257202529907227, + "step": 3288 + }, + { + "epoch": 0.3005025125628141, + "grad_norm": 68.5, + "kl": 0.0, + "learning_rate": 7.949965724167394e-06, + "logits/chosen": 866578944.0, + "logits/rejected": 469505024.0, + "logps/chosen": -405.50927734375, + "logps/rejected": -404.0683349609375, + "loss": 0.0586, + "rewards/chosen": 4.101452827453613, + "rewards/margins": 11.30444278717041, + "rewards/rejected": -7.202989959716797, + "step": 3289 + }, + { + "epoch": 0.3005938784833257, + "grad_norm": 2.5, + "kl": 0.0, + "learning_rate": 7.948804710756007e-06, + "logits/chosen": 580594602.6666666, + "logits/rejected": 437336729.6, + "logps/chosen": -341.2132975260417, + "logps/rejected": -377.674658203125, + "loss": 0.0253, + "rewards/chosen": 3.213303565979004, + "rewards/margins": 10.701194190979004, + "rewards/rejected": -7.487890625, + "step": 3290 + }, + { + "epoch": 0.30068524440383737, + "grad_norm": 2.78125, + "kl": 0.0, + "learning_rate": 7.947643453505791e-06, + "logits/chosen": 759921493.3333334, + "logits/rejected": 389200000.0, + "logps/chosen": -237.8240966796875, + "logps/rejected": -234.571337890625, + "loss": 0.0166, + "rewards/chosen": 3.422316233317057, + "rewards/margins": 11.203409067789714, + "rewards/rejected": -7.781092834472656, + "step": 3291 + }, + { + "epoch": 0.300776610324349, + "grad_norm": 5.03125, + "kl": 0.0, + "learning_rate": 7.946481952512772e-06, + "logits/chosen": 393355776.0, + "logits/rejected": 272814336.0, + "logps/chosen": -199.9178466796875, + "logps/rejected": -259.26527913411456, + "loss": 0.1191, + "rewards/chosen": 4.100884628295899, + "rewards/margins": 8.59795748392741, + "rewards/rejected": -4.497072855631511, + "step": 3292 + }, + { + "epoch": 0.30086797624486067, + "grad_norm": 3.03125, + "kl": 0.0, + "learning_rate": 7.945320207872995e-06, + "logits/chosen": 583692480.0, + "logits/rejected": 433722336.0, + "logps/chosen": -339.50164794921875, + "logps/rejected": -560.662353515625, + "loss": 0.0233, + "rewards/chosen": 3.1550557613372803, + "rewards/margins": 12.106317281723022, + "rewards/rejected": -8.951261520385742, + "step": 3293 + }, + { + "epoch": 0.3009593421653723, + "grad_norm": 6.1875, + "kl": 0.0, + "learning_rate": 7.944158219682526e-06, + "logits/chosen": 408854169.6, + "logits/rejected": 568335786.6666666, + "logps/chosen": -246.31591796875, + "logps/rejected": -547.3324788411459, + "loss": 0.1581, + "rewards/chosen": 1.5472413063049317, + "rewards/margins": 11.302134291330972, + "rewards/rejected": -9.754892985026041, + "step": 3294 + }, + { + "epoch": 0.30105070808588397, + "grad_norm": 2.765625, + "kl": 0.0, + "learning_rate": 7.94299598803745e-06, + "logits/chosen": 634749141.3333334, + "logits/rejected": 733715200.0, + "logps/chosen": -305.44482421875, + "logps/rejected": -641.5341796875, + "loss": 0.0171, + "rewards/chosen": 3.0672486623128257, + "rewards/margins": 12.736425908406575, + "rewards/rejected": -9.66917724609375, + "step": 3295 + }, + { + "epoch": 0.3011420740063956, + "grad_norm": 3.421875, + "kl": 0.0, + "learning_rate": 7.941833513033873e-06, + "logits/chosen": 696138342.4, + "logits/rejected": 524836309.3333333, + "logps/chosen": -236.064453125, + "logps/rejected": -562.1453450520834, + "loss": 0.028, + "rewards/chosen": 3.1502458572387697, + "rewards/margins": 11.05337734222412, + "rewards/rejected": -7.903131484985352, + "step": 3296 + }, + { + "epoch": 0.30123343992690726, + "grad_norm": 2.953125, + "kl": 0.0, + "learning_rate": 7.940670794767922e-06, + "logits/chosen": 520436309.3333333, + "logits/rejected": 444715212.8, + "logps/chosen": -404.1779378255208, + "logps/rejected": -462.50048828125, + "loss": 0.0191, + "rewards/chosen": 3.027067502339681, + "rewards/margins": 11.583082516988119, + "rewards/rejected": -8.556015014648438, + "step": 3297 + }, + { + "epoch": 0.3013248058474189, + "grad_norm": 1.375, + "kl": 0.0, + "learning_rate": 7.939507833335742e-06, + "logits/chosen": 957488640.0, + "logits/rejected": 808607061.3333334, + "logps/chosen": -368.6295471191406, + "logps/rejected": -763.8191731770834, + "loss": 0.0051, + "rewards/chosen": 4.252748489379883, + "rewards/margins": 14.589414596557617, + "rewards/rejected": -10.336666107177734, + "step": 3298 + }, + { + "epoch": 0.30141617176793056, + "grad_norm": 1.78125, + "kl": 0.0, + "learning_rate": 7.9383446288335e-06, + "logits/chosen": 697708373.3333334, + "logits/rejected": 711359385.6, + "logps/chosen": -460.6191813151042, + "logps/rejected": -682.90302734375, + "loss": 0.0105, + "rewards/chosen": 3.6693716049194336, + "rewards/margins": 11.847235679626465, + "rewards/rejected": -8.177864074707031, + "step": 3299 + }, + { + "epoch": 0.3015075376884422, + "grad_norm": 3.0, + "kl": 0.0, + "learning_rate": 7.937181181357378e-06, + "logits/chosen": 372815552.0, + "logits/rejected": 453585920.0, + "logps/chosen": -329.265380859375, + "logps/rejected": -545.417236328125, + "loss": 0.0142, + "rewards/chosen": 4.058221817016602, + "rewards/margins": 13.294929504394531, + "rewards/rejected": -9.23670768737793, + "step": 3300 + }, + { + "epoch": 0.30159890360895386, + "grad_norm": 10.3125, + "kl": 0.0, + "learning_rate": 7.93601749100359e-06, + "logits/chosen": 969239259.4285715, + "logits/rejected": 324279136.0, + "logps/chosen": -343.8086635044643, + "logps/rejected": -290.532470703125, + "loss": 0.0437, + "rewards/chosen": 3.1712570190429688, + "rewards/margins": 8.7479829788208, + "rewards/rejected": -5.576725959777832, + "step": 3301 + }, + { + "epoch": 0.3016902695294655, + "grad_norm": 4.0625, + "kl": 0.0, + "learning_rate": 7.934853557868356e-06, + "logits/chosen": 369572394.6666667, + "logits/rejected": 329721792.0, + "logps/chosen": -233.10884602864584, + "logps/rejected": -352.37005615234375, + "loss": 0.0288, + "rewards/chosen": 3.536236127217611, + "rewards/margins": 9.8802703221639, + "rewards/rejected": -6.344034194946289, + "step": 3302 + }, + { + "epoch": 0.30178163544997716, + "grad_norm": 0.48046875, + "kl": 0.0, + "learning_rate": 7.933689382047927e-06, + "logits/chosen": 389536042.6666667, + "logits/rejected": 491217203.2, + "logps/chosen": -136.2852783203125, + "logps/rejected": -626.252587890625, + "loss": 0.0033, + "rewards/chosen": 4.886033058166504, + "rewards/margins": 15.515796089172364, + "rewards/rejected": -10.62976303100586, + "step": 3303 + }, + { + "epoch": 0.3018730013704888, + "grad_norm": 3.609375, + "kl": 0.0, + "learning_rate": 7.932524963638565e-06, + "logits/chosen": 578200640.0, + "logits/rejected": 1588926592.0, + "logps/chosen": -200.21392822265625, + "logps/rejected": -698.1854248046875, + "loss": 0.134, + "rewards/chosen": 1.7159850597381592, + "rewards/margins": 11.231902837753296, + "rewards/rejected": -9.515917778015137, + "step": 3304 + }, + { + "epoch": 0.30196436729100046, + "grad_norm": 3.34375, + "kl": 0.0, + "learning_rate": 7.931360302736559e-06, + "logits/chosen": 448698794.6666667, + "logits/rejected": 905116672.0, + "logps/chosen": -333.1518961588542, + "logps/rejected": -1131.8194580078125, + "loss": 0.0444, + "rewards/chosen": 3.2206125259399414, + "rewards/margins": 13.984684944152832, + "rewards/rejected": -10.76407241821289, + "step": 3305 + }, + { + "epoch": 0.3020557332115121, + "grad_norm": 2.765625, + "kl": 0.0, + "learning_rate": 7.930195399438217e-06, + "logits/chosen": 649765632.0, + "logits/rejected": 354451872.0, + "logps/chosen": -267.68646240234375, + "logps/rejected": -558.232421875, + "loss": 0.0182, + "rewards/chosen": 3.7093818187713623, + "rewards/margins": 13.508013486862183, + "rewards/rejected": -9.79863166809082, + "step": 3306 + }, + { + "epoch": 0.30214709913202376, + "grad_norm": 19.625, + "kl": 0.0, + "learning_rate": 7.929030253839864e-06, + "logits/chosen": 341740928.0, + "logits/rejected": 239610304.0, + "logps/chosen": -196.8353271484375, + "logps/rejected": -340.632568359375, + "loss": 0.0563, + "rewards/chosen": 2.6471128463745117, + "rewards/margins": 9.767524719238281, + "rewards/rejected": -7.1204118728637695, + "step": 3307 + }, + { + "epoch": 0.3022384650525354, + "grad_norm": 1.3125, + "kl": 0.0, + "learning_rate": 7.927864866037848e-06, + "logits/chosen": 847627904.0, + "logits/rejected": 1106018816.0, + "logps/chosen": -344.28582763671875, + "logps/rejected": -391.755615234375, + "loss": 0.0076, + "rewards/chosen": 4.407861709594727, + "rewards/margins": 13.129173278808594, + "rewards/rejected": -8.721311569213867, + "step": 3308 + }, + { + "epoch": 0.30232983097304705, + "grad_norm": 16.375, + "kl": 0.0, + "learning_rate": 7.926699236128534e-06, + "logits/chosen": 364265216.0, + "logits/rejected": 301289932.8, + "logps/chosen": -231.41253662109375, + "logps/rejected": -303.328662109375, + "loss": 0.0306, + "rewards/chosen": 3.0247459411621094, + "rewards/margins": 9.060018157958984, + "rewards/rejected": -6.035272216796875, + "step": 3309 + }, + { + "epoch": 0.30242119689355873, + "grad_norm": 2.96875, + "kl": 0.0, + "learning_rate": 7.925533364208308e-06, + "logits/chosen": 738196992.0, + "logits/rejected": 736678528.0, + "logps/chosen": -503.9637451171875, + "logps/rejected": -613.0616455078125, + "loss": 0.0185, + "rewards/chosen": 4.409533500671387, + "rewards/margins": 12.209892272949219, + "rewards/rejected": -7.800358772277832, + "step": 3310 + }, + { + "epoch": 0.30251256281407035, + "grad_norm": 3.453125, + "kl": 0.0, + "learning_rate": 7.92436725037358e-06, + "logits/chosen": 656266342.4, + "logits/rejected": 419921621.3333333, + "logps/chosen": -319.723486328125, + "logps/rejected": -476.1772054036458, + "loss": 0.022, + "rewards/chosen": 3.844984436035156, + "rewards/margins": 12.76993153889974, + "rewards/rejected": -8.924947102864584, + "step": 3311 + }, + { + "epoch": 0.30260392873458203, + "grad_norm": 2.25, + "kl": 0.0, + "learning_rate": 7.923200894720776e-06, + "logits/chosen": 650656128.0, + "logits/rejected": 512348608.0, + "logps/chosen": -339.787353515625, + "logps/rejected": -209.55157470703125, + "loss": 0.0174, + "rewards/chosen": 3.382183313369751, + "rewards/margins": 11.170390367507935, + "rewards/rejected": -7.788207054138184, + "step": 3312 + }, + { + "epoch": 0.30269529465509365, + "grad_norm": 1.7109375, + "kl": 0.0, + "learning_rate": 7.922034297346342e-06, + "logits/chosen": 576407040.0, + "logits/rejected": 430352998.4, + "logps/chosen": -338.70998128255206, + "logps/rejected": -557.951611328125, + "loss": 0.0183, + "rewards/chosen": 3.491260210673014, + "rewards/margins": 12.532365862528483, + "rewards/rejected": -9.041105651855469, + "step": 3313 + }, + { + "epoch": 0.3027866605756053, + "grad_norm": 1.8828125, + "kl": 0.0, + "learning_rate": 7.920867458346743e-06, + "logits/chosen": 450492586.6666667, + "logits/rejected": 689540096.0, + "logps/chosen": -298.4061279296875, + "logps/rejected": -495.12265625, + "loss": 0.0154, + "rewards/chosen": 3.649014472961426, + "rewards/margins": 12.890544319152832, + "rewards/rejected": -9.241529846191407, + "step": 3314 + }, + { + "epoch": 0.30287802649611695, + "grad_norm": 0.95703125, + "kl": 0.0, + "learning_rate": 7.919700377818468e-06, + "logits/chosen": 295307392.0, + "logits/rejected": 860673536.0, + "logps/chosen": -164.04678344726562, + "logps/rejected": -526.9490792410714, + "loss": 0.0042, + "rewards/chosen": 3.9953033924102783, + "rewards/margins": 10.80321376664298, + "rewards/rejected": -6.8079103742327005, + "step": 3315 + }, + { + "epoch": 0.3029693924166286, + "grad_norm": 1.4453125, + "kl": 0.0, + "learning_rate": 7.918533055858026e-06, + "logits/chosen": 445642112.0, + "logits/rejected": 506405248.0, + "logps/chosen": -186.032958984375, + "logps/rejected": -442.9664713541667, + "loss": 0.0096, + "rewards/chosen": 3.5121381282806396, + "rewards/margins": 11.654233058293661, + "rewards/rejected": -8.142094930013021, + "step": 3316 + }, + { + "epoch": 0.30306075833714025, + "grad_norm": 10.875, + "kl": 0.0, + "learning_rate": 7.917365492561938e-06, + "logits/chosen": 452526233.6, + "logits/rejected": 224850816.0, + "logps/chosen": -241.0892822265625, + "logps/rejected": -239.91654459635416, + "loss": 0.0887, + "rewards/chosen": 3.7301780700683596, + "rewards/margins": 8.57238311767578, + "rewards/rejected": -4.842205047607422, + "step": 3317 + }, + { + "epoch": 0.3031521242576519, + "grad_norm": 62.75, + "kl": 0.0, + "learning_rate": 7.916197688026754e-06, + "logits/chosen": 423723827.2, + "logits/rejected": 391717034.6666667, + "logps/chosen": -277.7246337890625, + "logps/rejected": -395.5460611979167, + "loss": 0.1101, + "rewards/chosen": 2.704991340637207, + "rewards/margins": 11.775650978088379, + "rewards/rejected": -9.070659637451172, + "step": 3318 + }, + { + "epoch": 0.30324349017816354, + "grad_norm": 33.0, + "kl": 0.0, + "learning_rate": 7.915029642349042e-06, + "logits/chosen": 588003072.0, + "logits/rejected": 972084352.0, + "logps/chosen": -314.54575602213544, + "logps/rejected": -123.42605590820312, + "loss": 0.0493, + "rewards/chosen": 3.990481694539388, + "rewards/margins": 8.175353844960531, + "rewards/rejected": -4.184872150421143, + "step": 3319 + }, + { + "epoch": 0.3033348560986752, + "grad_norm": 2.578125, + "kl": 0.0, + "learning_rate": 7.913861355625388e-06, + "logits/chosen": 433588288.0, + "logits/rejected": 508901696.0, + "logps/chosen": -412.711181640625, + "logps/rejected": -557.9849853515625, + "loss": 0.0156, + "rewards/chosen": 3.6091980934143066, + "rewards/margins": 12.330336093902588, + "rewards/rejected": -8.721138000488281, + "step": 3320 + }, + { + "epoch": 0.30342622201918684, + "grad_norm": 18.125, + "kl": 0.0, + "learning_rate": 7.912692827952395e-06, + "logits/chosen": 478010560.0, + "logits/rejected": 346014592.0, + "logps/chosen": -326.858642578125, + "logps/rejected": -534.3135986328125, + "loss": 0.0409, + "rewards/chosen": 3.2382073402404785, + "rewards/margins": 9.534367084503174, + "rewards/rejected": -6.296159744262695, + "step": 3321 + }, + { + "epoch": 0.3035175879396985, + "grad_norm": 0.4453125, + "kl": 0.0, + "learning_rate": 7.911524059426693e-06, + "logits/chosen": 222966016.0, + "logits/rejected": 664856268.8, + "logps/chosen": -117.2754618326823, + "logps/rejected": -640.09931640625, + "loss": 0.0251, + "rewards/chosen": 3.94219970703125, + "rewards/margins": 14.292176818847656, + "rewards/rejected": -10.349977111816406, + "step": 3322 + }, + { + "epoch": 0.30360895386021014, + "grad_norm": 4.15625, + "kl": 0.0, + "learning_rate": 7.910355050144926e-06, + "logits/chosen": 474461088.0, + "logits/rejected": 423754464.0, + "logps/chosen": -440.1097412109375, + "logps/rejected": -532.50634765625, + "loss": 0.0173, + "rewards/chosen": 3.677570343017578, + "rewards/margins": 12.350330352783203, + "rewards/rejected": -8.672760009765625, + "step": 3323 + }, + { + "epoch": 0.3037003197807218, + "grad_norm": 12.9375, + "kl": 0.0, + "learning_rate": 7.909185800203763e-06, + "logits/chosen": 570103125.3333334, + "logits/rejected": 442088345.6, + "logps/chosen": -263.64589436848956, + "logps/rejected": -392.2997802734375, + "loss": 0.1104, + "rewards/chosen": 2.975099245707194, + "rewards/margins": 9.668863741556804, + "rewards/rejected": -6.693764495849609, + "step": 3324 + }, + { + "epoch": 0.30379168570123344, + "grad_norm": 4.34375, + "kl": 0.0, + "learning_rate": 7.908016309699886e-06, + "logits/chosen": 521227861.3333333, + "logits/rejected": 544487104.0, + "logps/chosen": -398.6548665364583, + "logps/rejected": -474.7853698730469, + "loss": 0.0242, + "rewards/chosen": 4.069246927897136, + "rewards/margins": 12.544825236002605, + "rewards/rejected": -8.475578308105469, + "step": 3325 + }, + { + "epoch": 0.3038830516217451, + "grad_norm": 1.6875, + "kl": 0.0, + "learning_rate": 7.906846578730006e-06, + "logits/chosen": 442121216.0, + "logits/rejected": 589306368.0, + "logps/chosen": -207.1547088623047, + "logps/rejected": -584.8980712890625, + "loss": 0.0086, + "rewards/chosen": 4.622409820556641, + "rewards/margins": 14.181753158569336, + "rewards/rejected": -9.559343338012695, + "step": 3326 + }, + { + "epoch": 0.30397441754225674, + "grad_norm": 5.6875, + "kl": 0.0, + "learning_rate": 7.905676607390845e-06, + "logits/chosen": 523014176.0, + "logits/rejected": 584473344.0, + "logps/chosen": -282.30169677734375, + "logps/rejected": -576.6014404296875, + "loss": 0.0418, + "rewards/chosen": 2.8817951679229736, + "rewards/margins": 11.5482656955719, + "rewards/rejected": -8.666470527648926, + "step": 3327 + }, + { + "epoch": 0.3040657834627684, + "grad_norm": 0.9296875, + "kl": 0.0, + "learning_rate": 7.904506395779152e-06, + "logits/chosen": 479692896.0, + "logits/rejected": 629928960.0, + "logps/chosen": -279.2819519042969, + "logps/rejected": -593.1780133928571, + "loss": 0.0039, + "rewards/chosen": 3.4515349864959717, + "rewards/margins": 15.250089543206352, + "rewards/rejected": -11.79855455671038, + "step": 3328 + }, + { + "epoch": 0.30415714938328003, + "grad_norm": 2.109375, + "kl": 0.0, + "learning_rate": 7.903335943991689e-06, + "logits/chosen": 424558592.0, + "logits/rejected": 416371456.0, + "logps/chosen": -295.9343566894531, + "logps/rejected": -316.32037353515625, + "loss": 0.0142, + "rewards/chosen": 3.877500295639038, + "rewards/margins": 12.321561574935913, + "rewards/rejected": -8.444061279296875, + "step": 3329 + }, + { + "epoch": 0.3042485153037917, + "grad_norm": 34.25, + "kl": 0.0, + "learning_rate": 7.902165252125245e-06, + "logits/chosen": 500324949.3333333, + "logits/rejected": 428379776.0, + "logps/chosen": -286.44594319661456, + "logps/rejected": -520.9505615234375, + "loss": 0.0634, + "rewards/chosen": 3.320144017537435, + "rewards/margins": 14.800762494405111, + "rewards/rejected": -11.480618476867676, + "step": 3330 + }, + { + "epoch": 0.30433988122430333, + "grad_norm": 4.09375, + "kl": 0.0, + "learning_rate": 7.900994320276623e-06, + "logits/chosen": 780978005.3333334, + "logits/rejected": 565636224.0, + "logps/chosen": -318.8227945963542, + "logps/rejected": -714.3790283203125, + "loss": 0.0391, + "rewards/chosen": 3.155148188273112, + "rewards/margins": 12.301800409952799, + "rewards/rejected": -9.146652221679688, + "step": 3331 + }, + { + "epoch": 0.304431247144815, + "grad_norm": 3.625, + "kl": 0.0, + "learning_rate": 7.899823148542648e-06, + "logits/chosen": 566079872.0, + "logits/rejected": 400201536.0, + "logps/chosen": -269.5376281738281, + "logps/rejected": -583.6842041015625, + "loss": 0.0252, + "rewards/chosen": 3.2501327991485596, + "rewards/margins": 12.74028992652893, + "rewards/rejected": -9.490157127380371, + "step": 3332 + }, + { + "epoch": 0.30452261306532663, + "grad_norm": 4.75, + "kl": 0.0, + "learning_rate": 7.898651737020166e-06, + "logits/chosen": 825501269.3333334, + "logits/rejected": 673401856.0, + "logps/chosen": -364.9672444661458, + "logps/rejected": -472.4913330078125, + "loss": 0.0331, + "rewards/chosen": 3.3320000966389975, + "rewards/margins": 13.05984910329183, + "rewards/rejected": -9.727849006652832, + "step": 3333 + }, + { + "epoch": 0.3046139789858383, + "grad_norm": 4.3125, + "kl": 0.0, + "learning_rate": 7.897480085806044e-06, + "logits/chosen": 713704601.6, + "logits/rejected": 604578432.0, + "logps/chosen": -304.8059814453125, + "logps/rejected": -272.57257080078125, + "loss": 0.0251, + "rewards/chosen": 4.063545227050781, + "rewards/margins": 10.50540542602539, + "rewards/rejected": -6.441860198974609, + "step": 3334 + }, + { + "epoch": 0.30470534490634993, + "grad_norm": 2.609375, + "kl": 0.0, + "learning_rate": 7.896308194997164e-06, + "logits/chosen": 706849536.0, + "logits/rejected": 434215884.8, + "logps/chosen": -316.7298990885417, + "logps/rejected": -510.777734375, + "loss": 0.0132, + "rewards/chosen": 4.650401433308919, + "rewards/margins": 14.545722325642902, + "rewards/rejected": -9.895320892333984, + "step": 3335 + }, + { + "epoch": 0.3047967108268616, + "grad_norm": 3.71875, + "kl": 0.0, + "learning_rate": 7.895136064690432e-06, + "logits/chosen": 685188693.3333334, + "logits/rejected": 1170931404.8, + "logps/chosen": -242.96305338541666, + "logps/rejected": -672.484130859375, + "loss": 0.0196, + "rewards/chosen": 3.3939088185628257, + "rewards/margins": 13.57996915181478, + "rewards/rejected": -10.186060333251953, + "step": 3336 + }, + { + "epoch": 0.3048880767473732, + "grad_norm": 14.0, + "kl": 0.0, + "learning_rate": 7.893963694982769e-06, + "logits/chosen": 1460614144.0, + "logits/rejected": 775948970.6666666, + "logps/chosen": -254.674560546875, + "logps/rejected": -406.8088785807292, + "loss": 0.1196, + "rewards/chosen": 1.9294617176055908, + "rewards/margins": 9.16706395149231, + "rewards/rejected": -7.237602233886719, + "step": 3337 + }, + { + "epoch": 0.3049794426678849, + "grad_norm": 1.171875, + "kl": 0.0, + "learning_rate": 7.892791085971124e-06, + "logits/chosen": 430923059.2, + "logits/rejected": 425905322.6666667, + "logps/chosen": -308.698583984375, + "logps/rejected": -754.5763346354166, + "loss": 0.0142, + "rewards/chosen": 4.265194320678711, + "rewards/margins": 18.677883529663085, + "rewards/rejected": -14.412689208984375, + "step": 3338 + }, + { + "epoch": 0.3050708085883965, + "grad_norm": 28.875, + "kl": 0.0, + "learning_rate": 7.891618237752457e-06, + "logits/chosen": 521511509.3333333, + "logits/rejected": 953293721.6, + "logps/chosen": -389.8830159505208, + "logps/rejected": -448.2482421875, + "loss": 0.0683, + "rewards/chosen": 2.8959859212239585, + "rewards/margins": 11.119427235921224, + "rewards/rejected": -8.223441314697265, + "step": 3339 + }, + { + "epoch": 0.3051621745089082, + "grad_norm": 5.8125, + "kl": 0.0, + "learning_rate": 7.890445150423754e-06, + "logits/chosen": 567188480.0, + "logits/rejected": 572663381.3333334, + "logps/chosen": -343.6455078125, + "logps/rejected": -556.28857421875, + "loss": 0.0367, + "rewards/chosen": 2.884404182434082, + "rewards/margins": 12.75560474395752, + "rewards/rejected": -9.871200561523438, + "step": 3340 + }, + { + "epoch": 0.3052535404294198, + "grad_norm": 6.34375, + "kl": 0.0, + "learning_rate": 7.889271824082017e-06, + "logits/chosen": 878832493.7142857, + "logits/rejected": 467290624.0, + "logps/chosen": -329.24169921875, + "logps/rejected": -195.90921020507812, + "loss": 0.051, + "rewards/chosen": 3.1914002554757253, + "rewards/margins": 9.183352674756732, + "rewards/rejected": -5.991952419281006, + "step": 3341 + }, + { + "epoch": 0.3053449063499315, + "grad_norm": 3.359375, + "kl": 0.0, + "learning_rate": 7.88809825882427e-06, + "logits/chosen": 430177331.2, + "logits/rejected": 337785429.3333333, + "logps/chosen": -392.64228515625, + "logps/rejected": -350.6086832682292, + "loss": 0.023, + "rewards/chosen": 3.5416046142578126, + "rewards/margins": 11.676764424641927, + "rewards/rejected": -8.135159810384115, + "step": 3342 + }, + { + "epoch": 0.3054362722704431, + "grad_norm": 4.5625, + "kl": 0.0, + "learning_rate": 7.886924454747556e-06, + "logits/chosen": 370212778.6666667, + "logits/rejected": 531062656.0, + "logps/chosen": -281.4005940755208, + "logps/rejected": -410.1417236328125, + "loss": 0.0339, + "rewards/chosen": 3.825861612955729, + "rewards/margins": 13.40676180521647, + "rewards/rejected": -9.580900192260742, + "step": 3343 + }, + { + "epoch": 0.3055276381909548, + "grad_norm": 9.5625, + "kl": 0.0, + "learning_rate": 7.885750411948938e-06, + "logits/chosen": 553174336.0, + "logits/rejected": 538403401.1428572, + "logps/chosen": -624.1209716796875, + "logps/rejected": -425.462646484375, + "loss": 0.0682, + "rewards/chosen": 3.068310499191284, + "rewards/margins": 9.677542720522199, + "rewards/rejected": -6.609232221330915, + "step": 3344 + }, + { + "epoch": 0.3056190041114664, + "grad_norm": 1.2265625, + "kl": 0.0, + "learning_rate": 7.884576130525497e-06, + "logits/chosen": 847747264.0, + "logits/rejected": 729483776.0, + "logps/chosen": -366.77435302734375, + "logps/rejected": -733.4981689453125, + "loss": 0.0075, + "rewards/chosen": 4.273959159851074, + "rewards/margins": 12.729940414428711, + "rewards/rejected": -8.455981254577637, + "step": 3345 + }, + { + "epoch": 0.3057103700319781, + "grad_norm": 3.8125, + "kl": 0.0, + "learning_rate": 7.883401610574338e-06, + "logits/chosen": 490762368.0, + "logits/rejected": 789639219.2, + "logps/chosen": -184.5042928059896, + "logps/rejected": -441.45546875, + "loss": 0.0266, + "rewards/chosen": 3.6398531595865884, + "rewards/margins": 10.270469919840494, + "rewards/rejected": -6.630616760253906, + "step": 3346 + }, + { + "epoch": 0.3058017359524897, + "grad_norm": 2.875, + "kl": 0.0, + "learning_rate": 7.882226852192579e-06, + "logits/chosen": 960689493.3333334, + "logits/rejected": 905539686.4, + "logps/chosen": -376.6138916015625, + "logps/rejected": -608.465234375, + "loss": 0.0134, + "rewards/chosen": 4.203552881876628, + "rewards/margins": 13.818192164103191, + "rewards/rejected": -9.614639282226562, + "step": 3347 + }, + { + "epoch": 0.3058931018730014, + "grad_norm": 2.46875, + "kl": 0.0, + "learning_rate": 7.881051855477363e-06, + "logits/chosen": 537664256.0, + "logits/rejected": 729451212.8, + "logps/chosen": -299.76904296875, + "logps/rejected": -868.90732421875, + "loss": 0.0142, + "rewards/chosen": 3.3964560826619468, + "rewards/margins": 12.583520444234212, + "rewards/rejected": -9.187064361572265, + "step": 3348 + }, + { + "epoch": 0.305984467793513, + "grad_norm": 5.78125, + "kl": 0.0, + "learning_rate": 7.879876620525854e-06, + "logits/chosen": 538802907.4285715, + "logits/rejected": 369302016.0, + "logps/chosen": -360.35484095982144, + "logps/rejected": -577.7787475585938, + "loss": 0.0408, + "rewards/chosen": 3.322813034057617, + "rewards/margins": 17.016179084777832, + "rewards/rejected": -13.693366050720215, + "step": 3349 + }, + { + "epoch": 0.3060758337140247, + "grad_norm": 2.359375, + "kl": 0.0, + "learning_rate": 7.878701147435231e-06, + "logits/chosen": 750729728.0, + "logits/rejected": 395801907.2, + "logps/chosen": -684.1949869791666, + "logps/rejected": -310.7254638671875, + "loss": 0.0091, + "rewards/chosen": 4.15997314453125, + "rewards/margins": 12.472618103027344, + "rewards/rejected": -8.312644958496094, + "step": 3350 + }, + { + "epoch": 0.3061671996345363, + "grad_norm": 4.75, + "kl": 0.0, + "learning_rate": 7.877525436302696e-06, + "logits/chosen": 504682752.0, + "logits/rejected": 669399680.0, + "logps/chosen": -356.19244384765625, + "logps/rejected": -301.90576171875, + "loss": 0.0132, + "rewards/chosen": 4.845307350158691, + "rewards/margins": 11.023515701293945, + "rewards/rejected": -6.178208351135254, + "step": 3351 + }, + { + "epoch": 0.306258565555048, + "grad_norm": 3.046875, + "kl": 0.0, + "learning_rate": 7.876349487225466e-06, + "logits/chosen": 740400256.0, + "logits/rejected": 441942880.0, + "logps/chosen": -395.056396484375, + "logps/rejected": -358.99945068359375, + "loss": 0.1151, + "rewards/chosen": 3.9974420070648193, + "rewards/margins": 9.203500986099243, + "rewards/rejected": -5.206058979034424, + "step": 3352 + }, + { + "epoch": 0.3063499314755596, + "grad_norm": 6.40625, + "kl": 0.0, + "learning_rate": 7.875173300300784e-06, + "logits/chosen": 532280032.0, + "logits/rejected": 242915936.0, + "logps/chosen": -333.2607116699219, + "logps/rejected": -427.92633056640625, + "loss": 0.0327, + "rewards/chosen": 3.0724215507507324, + "rewards/margins": 14.211866855621338, + "rewards/rejected": -11.139445304870605, + "step": 3353 + }, + { + "epoch": 0.3064412973960713, + "grad_norm": 3.046875, + "kl": 0.0, + "learning_rate": 7.873996875625908e-06, + "logits/chosen": 482156800.0, + "logits/rejected": 560567808.0, + "logps/chosen": -358.0587463378906, + "logps/rejected": -623.6229248046875, + "loss": 0.0223, + "rewards/chosen": 3.5079145431518555, + "rewards/margins": 14.079212188720703, + "rewards/rejected": -10.571297645568848, + "step": 3354 + }, + { + "epoch": 0.3065326633165829, + "grad_norm": 3.796875, + "kl": 0.0, + "learning_rate": 7.87282021329812e-06, + "logits/chosen": 1064105676.8, + "logits/rejected": 847315114.6666666, + "logps/chosen": -435.77841796875, + "logps/rejected": -734.1224772135416, + "loss": 0.0213, + "rewards/chosen": 3.9064300537109373, + "rewards/margins": 13.740550486246743, + "rewards/rejected": -9.834120432535807, + "step": 3355 + }, + { + "epoch": 0.3066240292370946, + "grad_norm": 4.3125, + "kl": 0.0, + "learning_rate": 7.871643313414718e-06, + "logits/chosen": 379553450.6666667, + "logits/rejected": 591588403.2, + "logps/chosen": -244.26700846354166, + "logps/rejected": -590.768408203125, + "loss": 0.0267, + "rewards/chosen": 2.7401037216186523, + "rewards/margins": 11.467451286315917, + "rewards/rejected": -8.727347564697265, + "step": 3356 + }, + { + "epoch": 0.3067153951576062, + "grad_norm": 2.15625, + "kl": 0.0, + "learning_rate": 7.87046617607302e-06, + "logits/chosen": 453046144.0, + "logits/rejected": 322128588.8, + "logps/chosen": -351.0432942708333, + "logps/rejected": -449.96669921875, + "loss": 0.0102, + "rewards/chosen": 3.9752279917399087, + "rewards/margins": 12.537431208292643, + "rewards/rejected": -8.562203216552735, + "step": 3357 + }, + { + "epoch": 0.3068067610781179, + "grad_norm": 1.4375, + "kl": 0.0, + "learning_rate": 7.869288801370365e-06, + "logits/chosen": 482037312.0, + "logits/rejected": 749797632.0, + "logps/chosen": -360.0473327636719, + "logps/rejected": -584.4180908203125, + "loss": 0.0107, + "rewards/chosen": 4.232297897338867, + "rewards/margins": 12.14872121810913, + "rewards/rejected": -7.916423320770264, + "step": 3358 + }, + { + "epoch": 0.3068981269986295, + "grad_norm": 1.234375, + "kl": 0.0, + "learning_rate": 7.868111189404111e-06, + "logits/chosen": 1036883029.3333334, + "logits/rejected": 486558976.0, + "logps/chosen": -337.9230550130208, + "logps/rejected": -440.10029296875, + "loss": 0.006, + "rewards/chosen": 4.449913342793782, + "rewards/margins": 13.24588197072347, + "rewards/rejected": -8.795968627929687, + "step": 3359 + }, + { + "epoch": 0.3069894929191412, + "grad_norm": 1.3828125, + "kl": 0.0, + "learning_rate": 7.866933340271634e-06, + "logits/chosen": 751167829.3333334, + "logits/rejected": 603757465.6, + "logps/chosen": -384.15234375, + "logps/rejected": -392.156005859375, + "loss": 0.0079, + "rewards/chosen": 4.30984624226888, + "rewards/margins": 14.081651051839192, + "rewards/rejected": -9.771804809570312, + "step": 3360 + }, + { + "epoch": 0.3070808588396528, + "grad_norm": 1.8671875, + "kl": 0.0, + "learning_rate": 7.865755254070333e-06, + "logits/chosen": 625571968.0, + "logits/rejected": 391792576.0, + "logps/chosen": -298.4412841796875, + "logps/rejected": -578.21240234375, + "loss": 0.0101, + "rewards/chosen": 4.21053409576416, + "rewards/margins": 14.969502449035645, + "rewards/rejected": -10.758968353271484, + "step": 3361 + }, + { + "epoch": 0.3071722247601645, + "grad_norm": 3.21875, + "kl": 0.0, + "learning_rate": 7.864576930897625e-06, + "logits/chosen": 602353600.0, + "logits/rejected": 585421226.6666666, + "logps/chosen": -347.6923828125, + "logps/rejected": -797.43505859375, + "loss": 0.0147, + "rewards/chosen": 2.979233503341675, + "rewards/margins": 12.180172681808472, + "rewards/rejected": -9.200939178466797, + "step": 3362 + }, + { + "epoch": 0.3072635906806761, + "grad_norm": 3.640625, + "kl": 0.0, + "learning_rate": 7.863398370850946e-06, + "logits/chosen": 415335808.0, + "logits/rejected": 510279776.0, + "logps/chosen": -343.75469970703125, + "logps/rejected": -566.8280029296875, + "loss": 0.0279, + "rewards/chosen": 3.4970016479492188, + "rewards/margins": 14.510337829589844, + "rewards/rejected": -11.013336181640625, + "step": 3363 + }, + { + "epoch": 0.3073549566011878, + "grad_norm": 2.015625, + "kl": 0.0, + "learning_rate": 7.862219574027751e-06, + "logits/chosen": 612553557.3333334, + "logits/rejected": 587007385.6, + "logps/chosen": -460.1614990234375, + "logps/rejected": -520.246728515625, + "loss": 0.0123, + "rewards/chosen": 3.4956836700439453, + "rewards/margins": 12.504642105102539, + "rewards/rejected": -9.008958435058593, + "step": 3364 + }, + { + "epoch": 0.3074463225216994, + "grad_norm": 2.203125, + "kl": 0.0, + "learning_rate": 7.861040540525516e-06, + "logits/chosen": 1119618304.0, + "logits/rejected": 709703488.0, + "logps/chosen": -242.14797973632812, + "logps/rejected": -547.0508422851562, + "loss": 0.0186, + "rewards/chosen": 3.610015392303467, + "rewards/margins": 12.916951656341553, + "rewards/rejected": -9.306936264038086, + "step": 3365 + }, + { + "epoch": 0.3075376884422111, + "grad_norm": 3.421875, + "kl": 0.0, + "learning_rate": 7.859861270441736e-06, + "logits/chosen": 421136793.6, + "logits/rejected": 738868480.0, + "logps/chosen": -239.525537109375, + "logps/rejected": -443.0667724609375, + "loss": 0.0246, + "rewards/chosen": 3.6562610626220704, + "rewards/margins": 12.678420639038086, + "rewards/rejected": -9.022159576416016, + "step": 3366 + }, + { + "epoch": 0.3076290543627227, + "grad_norm": 1.421875, + "kl": 0.0, + "learning_rate": 7.858681763873927e-06, + "logits/chosen": 418296345.6, + "logits/rejected": 573430869.3333334, + "logps/chosen": -186.08455810546874, + "logps/rejected": -401.1959228515625, + "loss": 0.0209, + "rewards/chosen": 4.133396148681641, + "rewards/margins": 13.460796991984049, + "rewards/rejected": -9.327400843302408, + "step": 3367 + }, + { + "epoch": 0.3077204202832344, + "grad_norm": 5.625, + "kl": 0.0, + "learning_rate": 7.857502020919621e-06, + "logits/chosen": 839192448.0, + "logits/rejected": 456660565.3333333, + "logps/chosen": -387.2950439453125, + "logps/rejected": -390.6898600260417, + "loss": 0.0269, + "rewards/chosen": 2.1322433948516846, + "rewards/margins": 11.174863735834757, + "rewards/rejected": -9.042620340983072, + "step": 3368 + }, + { + "epoch": 0.307811786203746, + "grad_norm": 29.375, + "kl": 0.0, + "learning_rate": 7.856322041676374e-06, + "logits/chosen": 1225013760.0, + "logits/rejected": 1045573184.0, + "logps/chosen": -274.0872497558594, + "logps/rejected": -373.6873474121094, + "loss": 0.1113, + "rewards/chosen": 2.1269326210021973, + "rewards/margins": 11.614874362945557, + "rewards/rejected": -9.48794174194336, + "step": 3369 + }, + { + "epoch": 0.3079031521242577, + "grad_norm": 5.875, + "kl": 0.0, + "learning_rate": 7.855141826241759e-06, + "logits/chosen": 766742528.0, + "logits/rejected": 371110336.0, + "logps/chosen": -268.3052164713542, + "logps/rejected": -481.02667236328125, + "loss": 0.047, + "rewards/chosen": 2.972747484842936, + "rewards/margins": 13.915866533915201, + "rewards/rejected": -10.943119049072266, + "step": 3370 + }, + { + "epoch": 0.3079945180447693, + "grad_norm": 1.25, + "kl": 0.0, + "learning_rate": 7.853961374713367e-06, + "logits/chosen": 449795584.0, + "logits/rejected": 636966553.6, + "logps/chosen": -342.6387125651042, + "logps/rejected": -591.219287109375, + "loss": 0.0067, + "rewards/chosen": 4.479925791422526, + "rewards/margins": 13.508624521891278, + "rewards/rejected": -9.02869873046875, + "step": 3371 + }, + { + "epoch": 0.30808588396528097, + "grad_norm": 5.21875, + "kl": 0.0, + "learning_rate": 7.852780687188815e-06, + "logits/chosen": 422076928.0, + "logits/rejected": 372309024.0, + "logps/chosen": -314.58265904017856, + "logps/rejected": -960.0396118164062, + "loss": 0.0398, + "rewards/chosen": 3.512676783970424, + "rewards/margins": 18.120849200657435, + "rewards/rejected": -14.608172416687012, + "step": 3372 + }, + { + "epoch": 0.3081772498857926, + "grad_norm": 2.15625, + "kl": 0.0, + "learning_rate": 7.85159976376573e-06, + "logits/chosen": 714728746.6666666, + "logits/rejected": 1052837376.0, + "logps/chosen": -239.62483723958334, + "logps/rejected": -376.04130859375, + "loss": 0.0147, + "rewards/chosen": 3.27320925394694, + "rewards/margins": 12.688100306193034, + "rewards/rejected": -9.414891052246094, + "step": 3373 + }, + { + "epoch": 0.30826861580630427, + "grad_norm": 4.59375, + "kl": 0.0, + "learning_rate": 7.850418604541764e-06, + "logits/chosen": 461760512.0, + "logits/rejected": 218352544.0, + "logps/chosen": -337.26068115234375, + "logps/rejected": -277.0101623535156, + "loss": 0.031, + "rewards/chosen": 3.605144500732422, + "rewards/margins": 8.752141952514648, + "rewards/rejected": -5.146997451782227, + "step": 3374 + }, + { + "epoch": 0.3083599817268159, + "grad_norm": 1.9921875, + "kl": 0.0, + "learning_rate": 7.84923720961459e-06, + "logits/chosen": 602409344.0, + "logits/rejected": 658402048.0, + "logps/chosen": -369.18707275390625, + "logps/rejected": -376.04986572265625, + "loss": 0.0146, + "rewards/chosen": 3.5206353664398193, + "rewards/margins": 12.591062307357788, + "rewards/rejected": -9.070426940917969, + "step": 3375 + }, + { + "epoch": 0.30845134764732757, + "grad_norm": 3.078125, + "kl": 0.0, + "learning_rate": 7.8480555790819e-06, + "logits/chosen": 552288554.6666666, + "logits/rejected": 422840601.6, + "logps/chosen": -324.4908854166667, + "logps/rejected": -535.422509765625, + "loss": 0.0197, + "rewards/chosen": 3.197005271911621, + "rewards/margins": 12.076164817810058, + "rewards/rejected": -8.879159545898437, + "step": 3376 + }, + { + "epoch": 0.3085427135678392, + "grad_norm": 22.0, + "kl": 0.0, + "learning_rate": 7.846873713041399e-06, + "logits/chosen": 355965184.0, + "logits/rejected": 337368384.0, + "logps/chosen": -298.4744567871094, + "logps/rejected": -195.19566345214844, + "loss": 0.107, + "rewards/chosen": 3.8487274646759033, + "rewards/margins": 9.395910501480103, + "rewards/rejected": -5.547183036804199, + "step": 3377 + }, + { + "epoch": 0.30863407948835087, + "grad_norm": 7.75, + "kl": 0.0, + "learning_rate": 7.845691611590818e-06, + "logits/chosen": 515537920.0, + "logits/rejected": 605593408.0, + "logps/chosen": -232.3128662109375, + "logps/rejected": -340.6075744628906, + "loss": 0.0936, + "rewards/chosen": 3.7213692665100098, + "rewards/margins": 9.940882205963135, + "rewards/rejected": -6.219512939453125, + "step": 3378 + }, + { + "epoch": 0.3087254454088625, + "grad_norm": 9.625, + "kl": 0.0, + "learning_rate": 7.844509274827907e-06, + "logits/chosen": 489962304.0, + "logits/rejected": 615710464.0, + "logps/chosen": -357.95166015625, + "logps/rejected": -451.08526611328125, + "loss": 0.0865, + "rewards/chosen": 3.8022279739379883, + "rewards/margins": 11.39067029953003, + "rewards/rejected": -7.588442325592041, + "step": 3379 + }, + { + "epoch": 0.30881681132937416, + "grad_norm": 3.15625, + "kl": 0.0, + "learning_rate": 7.843326702850436e-06, + "logits/chosen": 454663893.3333333, + "logits/rejected": 445569638.4, + "logps/chosen": -188.82686360677084, + "logps/rejected": -539.78671875, + "loss": 0.0217, + "rewards/chosen": 3.3389104207356772, + "rewards/margins": 12.20366948445638, + "rewards/rejected": -8.864759063720703, + "step": 3380 + }, + { + "epoch": 0.3089081772498858, + "grad_norm": 19.875, + "kl": 0.0, + "learning_rate": 7.84214389575619e-06, + "logits/chosen": 727119232.0, + "logits/rejected": 781289408.0, + "logps/chosen": -376.950439453125, + "logps/rejected": -479.5126647949219, + "loss": 0.0478, + "rewards/chosen": 3.2379486560821533, + "rewards/margins": 11.488544702529907, + "rewards/rejected": -8.250596046447754, + "step": 3381 + }, + { + "epoch": 0.30899954317039746, + "grad_norm": 1.6796875, + "kl": 0.0, + "learning_rate": 7.840960853642974e-06, + "logits/chosen": 276428224.0, + "logits/rejected": 473270186.6666667, + "logps/chosen": -191.59274291992188, + "logps/rejected": -515.7494303385416, + "loss": 0.0107, + "rewards/chosen": 4.0050530433654785, + "rewards/margins": 14.189351876576742, + "rewards/rejected": -10.184298833211264, + "step": 3382 + }, + { + "epoch": 0.3090909090909091, + "grad_norm": 4.78125, + "kl": 0.0, + "learning_rate": 7.83977757660862e-06, + "logits/chosen": 717704789.3333334, + "logits/rejected": 459487264.0, + "logps/chosen": -421.7561848958333, + "logps/rejected": -510.5594787597656, + "loss": 0.0309, + "rewards/chosen": 3.6215794881184897, + "rewards/margins": 13.48959477742513, + "rewards/rejected": -9.86801528930664, + "step": 3383 + }, + { + "epoch": 0.30918227501142076, + "grad_norm": 1.6484375, + "kl": 0.0, + "learning_rate": 7.838594064750971e-06, + "logits/chosen": 522077994.6666667, + "logits/rejected": 1127928422.4, + "logps/chosen": -232.4592081705729, + "logps/rejected": -591.688330078125, + "loss": 0.0092, + "rewards/chosen": 3.7703399658203125, + "rewards/margins": 13.501876831054688, + "rewards/rejected": -9.731536865234375, + "step": 3384 + }, + { + "epoch": 0.3092736409319324, + "grad_norm": 4.75, + "kl": 0.0, + "learning_rate": 7.837410318167892e-06, + "logits/chosen": 626582820.5714285, + "logits/rejected": 468668096.0, + "logps/chosen": -242.60909598214286, + "logps/rejected": -493.39361572265625, + "loss": 0.0503, + "rewards/chosen": 2.8201936994280135, + "rewards/margins": 10.624985149928502, + "rewards/rejected": -7.804791450500488, + "step": 3385 + }, + { + "epoch": 0.30936500685244406, + "grad_norm": 10.625, + "kl": 0.0, + "learning_rate": 7.836226336957271e-06, + "logits/chosen": 440347904.0, + "logits/rejected": 378118741.3333333, + "logps/chosen": -273.13555908203125, + "logps/rejected": -461.8917643229167, + "loss": 0.058, + "rewards/chosen": 2.876896858215332, + "rewards/margins": 12.315016110738119, + "rewards/rejected": -9.438119252522787, + "step": 3386 + }, + { + "epoch": 0.3094563727729557, + "grad_norm": 7.71875, + "kl": 0.0, + "learning_rate": 7.835042121217008e-06, + "logits/chosen": 535154080.0, + "logits/rejected": 781291392.0, + "logps/chosen": -371.45758056640625, + "logps/rejected": -514.9834594726562, + "loss": 0.0192, + "rewards/chosen": 3.6398935317993164, + "rewards/margins": 13.8666353225708, + "rewards/rejected": -10.226741790771484, + "step": 3387 + }, + { + "epoch": 0.30954773869346736, + "grad_norm": 4.34375, + "kl": 0.0, + "learning_rate": 7.83385767104503e-06, + "logits/chosen": 727710848.0, + "logits/rejected": 404055072.0, + "logps/chosen": -435.70404052734375, + "logps/rejected": -572.1983032226562, + "loss": 0.0225, + "rewards/chosen": 3.3883910179138184, + "rewards/margins": 16.10659170150757, + "rewards/rejected": -12.71820068359375, + "step": 3388 + }, + { + "epoch": 0.309639104613979, + "grad_norm": 3.5625, + "kl": 0.0, + "learning_rate": 7.832672986539281e-06, + "logits/chosen": 398786346.6666667, + "logits/rejected": 386652979.2, + "logps/chosen": -417.2147216796875, + "logps/rejected": -521.9021484375, + "loss": 0.031, + "rewards/chosen": 3.1084108352661133, + "rewards/margins": 12.835657691955566, + "rewards/rejected": -9.727246856689453, + "step": 3389 + }, + { + "epoch": 0.30973047053449065, + "grad_norm": 2.03125, + "kl": 0.0, + "learning_rate": 7.831488067797718e-06, + "logits/chosen": 637936085.3333334, + "logits/rejected": 982548684.8, + "logps/chosen": -194.37369791666666, + "logps/rejected": -659.51337890625, + "loss": 0.0174, + "rewards/chosen": 3.420260747273763, + "rewards/margins": 13.48935178120931, + "rewards/rejected": -10.069091033935546, + "step": 3390 + }, + { + "epoch": 0.3098218364550023, + "grad_norm": 5.59375, + "kl": 0.0, + "learning_rate": 7.830302914918329e-06, + "logits/chosen": 502841184.0, + "logits/rejected": 641842944.0, + "logps/chosen": -213.79393005371094, + "logps/rejected": -341.735595703125, + "loss": 0.0468, + "rewards/chosen": 2.346236228942871, + "rewards/margins": 9.600199222564697, + "rewards/rejected": -7.253962993621826, + "step": 3391 + }, + { + "epoch": 0.30991320237551395, + "grad_norm": 6.96875, + "kl": 0.0, + "learning_rate": 7.82911752799911e-06, + "logits/chosen": 534683392.0, + "logits/rejected": 357262873.6, + "logps/chosen": -383.2812906901042, + "logps/rejected": -559.251513671875, + "loss": 0.1116, + "rewards/chosen": 2.1267380714416504, + "rewards/margins": 12.37853708267212, + "rewards/rejected": -10.251799011230469, + "step": 3392 + }, + { + "epoch": 0.3100045682960256, + "grad_norm": 4.125, + "kl": 0.0, + "learning_rate": 7.827931907138085e-06, + "logits/chosen": 658223701.3333334, + "logits/rejected": 285062912.0, + "logps/chosen": -276.9874267578125, + "logps/rejected": -460.68857421875, + "loss": 0.0209, + "rewards/chosen": 3.0968310038248696, + "rewards/margins": 12.629356638590494, + "rewards/rejected": -9.532525634765625, + "step": 3393 + }, + { + "epoch": 0.31009593421653725, + "grad_norm": 6.46875, + "kl": 0.0, + "learning_rate": 7.826746052433291e-06, + "logits/chosen": 282364160.0, + "logits/rejected": 427596768.0, + "logps/chosen": -147.82757568359375, + "logps/rejected": -559.3489990234375, + "loss": 0.0381, + "rewards/chosen": 3.6617140769958496, + "rewards/margins": 15.033328533172607, + "rewards/rejected": -11.371614456176758, + "step": 3394 + }, + { + "epoch": 0.31018730013704887, + "grad_norm": 2.40625, + "kl": 0.0, + "learning_rate": 7.82555996398279e-06, + "logits/chosen": 309162336.0, + "logits/rejected": 508906624.0, + "logps/chosen": -173.7482147216797, + "logps/rejected": -524.8917236328125, + "loss": 0.0159, + "rewards/chosen": 4.281315803527832, + "rewards/margins": 13.484135627746582, + "rewards/rejected": -9.20281982421875, + "step": 3395 + }, + { + "epoch": 0.31027866605756055, + "grad_norm": 3.84375, + "kl": 0.0, + "learning_rate": 7.82437364188466e-06, + "logits/chosen": 406117408.0, + "logits/rejected": 392109440.0, + "logps/chosen": -314.50958251953125, + "logps/rejected": -423.517333984375, + "loss": 0.014, + "rewards/chosen": 3.7720322608947754, + "rewards/margins": 14.68904733657837, + "rewards/rejected": -10.917015075683594, + "step": 3396 + }, + { + "epoch": 0.31037003197807217, + "grad_norm": 6.65625, + "kl": 0.0, + "learning_rate": 7.823187086236996e-06, + "logits/chosen": 490617685.3333333, + "logits/rejected": 446376345.6, + "logps/chosen": -264.1774088541667, + "logps/rejected": -545.73857421875, + "loss": 0.0457, + "rewards/chosen": 2.0100380579630532, + "rewards/margins": 11.129867617289225, + "rewards/rejected": -9.119829559326172, + "step": 3397 + }, + { + "epoch": 0.31046139789858385, + "grad_norm": 4.625, + "kl": 0.0, + "learning_rate": 7.82200029713792e-06, + "logits/chosen": 451371392.0, + "logits/rejected": 293205824.0, + "logps/chosen": -278.871826171875, + "logps/rejected": -425.6331787109375, + "loss": 0.0303, + "rewards/chosen": 2.8497657775878906, + "rewards/margins": 12.275846481323242, + "rewards/rejected": -9.426080703735352, + "step": 3398 + }, + { + "epoch": 0.31055276381909547, + "grad_norm": 4.46875, + "kl": 0.0, + "learning_rate": 7.820813274685563e-06, + "logits/chosen": 477000288.0, + "logits/rejected": 480645632.0, + "logps/chosen": -306.3069763183594, + "logps/rejected": -547.5255126953125, + "loss": 0.0309, + "rewards/chosen": 2.9105381965637207, + "rewards/margins": 12.920313358306885, + "rewards/rejected": -10.009775161743164, + "step": 3399 + }, + { + "epoch": 0.31064412973960714, + "grad_norm": 3.734375, + "kl": 0.0, + "learning_rate": 7.819626018978087e-06, + "logits/chosen": 570672682.6666666, + "logits/rejected": 649317939.2, + "logps/chosen": -425.8907063802083, + "logps/rejected": -499.7486328125, + "loss": 0.0173, + "rewards/chosen": 3.1798229217529297, + "rewards/margins": 12.707705307006837, + "rewards/rejected": -9.527882385253907, + "step": 3400 + }, + { + "epoch": 0.31073549566011877, + "grad_norm": 8.0625, + "kl": 0.0, + "learning_rate": 7.818438530113661e-06, + "logits/chosen": 696559762.2857143, + "logits/rejected": 530825280.0, + "logps/chosen": -312.7963169642857, + "logps/rejected": -503.88458251953125, + "loss": 0.0745, + "rewards/chosen": 2.623398917061942, + "rewards/margins": 10.788578169686453, + "rewards/rejected": -8.165179252624512, + "step": 3401 + }, + { + "epoch": 0.31082686158063044, + "grad_norm": 2.46875, + "kl": 0.0, + "learning_rate": 7.817250808190483e-06, + "logits/chosen": 1436775808.0, + "logits/rejected": 651456192.0, + "logps/chosen": -402.1632385253906, + "logps/rejected": -474.3134765625, + "loss": 0.0179, + "rewards/chosen": 3.676206588745117, + "rewards/margins": 13.183160781860352, + "rewards/rejected": -9.506954193115234, + "step": 3402 + }, + { + "epoch": 0.31091822750114206, + "grad_norm": 5.15625, + "kl": 0.0, + "learning_rate": 7.816062853306767e-06, + "logits/chosen": 530110822.4, + "logits/rejected": 440809472.0, + "logps/chosen": -212.769189453125, + "logps/rejected": -389.924560546875, + "loss": 0.0237, + "rewards/chosen": 4.053179168701172, + "rewards/margins": 12.86299680074056, + "rewards/rejected": -8.809817632039389, + "step": 3403 + }, + { + "epoch": 0.31100959342165374, + "grad_norm": 4.5, + "kl": 0.0, + "learning_rate": 7.814874665560742e-06, + "logits/chosen": 395693141.3333333, + "logits/rejected": 450354112.0, + "logps/chosen": -260.7348225911458, + "logps/rejected": -476.3865966796875, + "loss": 0.0307, + "rewards/chosen": 3.735497792561849, + "rewards/margins": 14.564536412556967, + "rewards/rejected": -10.829038619995117, + "step": 3404 + }, + { + "epoch": 0.31110095934216536, + "grad_norm": 2.921875, + "kl": 0.0, + "learning_rate": 7.813686245050663e-06, + "logits/chosen": 520392789.3333333, + "logits/rejected": 796230451.2, + "logps/chosen": -253.73697916666666, + "logps/rejected": -476.833203125, + "loss": 0.0191, + "rewards/chosen": 3.4265000025431314, + "rewards/margins": 12.465155092875163, + "rewards/rejected": -9.038655090332032, + "step": 3405 + }, + { + "epoch": 0.31119232526267704, + "grad_norm": 5.71875, + "kl": 0.0, + "learning_rate": 7.8124975918748e-06, + "logits/chosen": 472366304.0, + "logits/rejected": 622353216.0, + "logps/chosen": -271.8887939453125, + "logps/rejected": -352.2694091796875, + "loss": 0.091, + "rewards/chosen": 3.4170985221862793, + "rewards/margins": 10.013566493988037, + "rewards/rejected": -6.596467971801758, + "step": 3406 + }, + { + "epoch": 0.31128369118318866, + "grad_norm": 4.3125, + "kl": 0.0, + "learning_rate": 7.811308706131447e-06, + "logits/chosen": 523818560.0, + "logits/rejected": 288225728.0, + "logps/chosen": -311.0443420410156, + "logps/rejected": -415.50933837890625, + "loss": 0.0251, + "rewards/chosen": 3.051499366760254, + "rewards/margins": 11.789700508117676, + "rewards/rejected": -8.738201141357422, + "step": 3407 + }, + { + "epoch": 0.31137505710370034, + "grad_norm": 1.46875, + "kl": 0.0, + "learning_rate": 7.810119587918911e-06, + "logits/chosen": 708423488.0, + "logits/rejected": 816581705.1428572, + "logps/chosen": -473.5667724609375, + "logps/rejected": -394.38905552455356, + "loss": 0.0055, + "rewards/chosen": 3.265185594558716, + "rewards/margins": 11.586083378110613, + "rewards/rejected": -8.320897783551898, + "step": 3408 + }, + { + "epoch": 0.31146642302421196, + "grad_norm": 2.0625, + "kl": 0.0, + "learning_rate": 7.808930237335521e-06, + "logits/chosen": 270521642.6666667, + "logits/rejected": 537052416.0, + "logps/chosen": -227.22420247395834, + "logps/rejected": -659.41015625, + "loss": 0.0098, + "rewards/chosen": 3.9798997243245444, + "rewards/margins": 12.736959966023763, + "rewards/rejected": -8.757060241699218, + "step": 3409 + }, + { + "epoch": 0.31155778894472363, + "grad_norm": 17.375, + "kl": 0.0, + "learning_rate": 7.807740654479625e-06, + "logits/chosen": 699817258.6666666, + "logits/rejected": 538105344.0, + "logps/chosen": -292.8133544921875, + "logps/rejected": -569.438623046875, + "loss": 0.0977, + "rewards/chosen": 1.6033166249593098, + "rewards/margins": 11.950924046834311, + "rewards/rejected": -10.347607421875, + "step": 3410 + }, + { + "epoch": 0.31164915486523526, + "grad_norm": 2.5625, + "kl": 0.0, + "learning_rate": 7.80655083944959e-06, + "logits/chosen": 501655082.6666667, + "logits/rejected": 505635532.8, + "logps/chosen": -210.97049967447916, + "logps/rejected": -501.1921875, + "loss": 0.0134, + "rewards/chosen": 3.6417369842529297, + "rewards/margins": 15.011227798461913, + "rewards/rejected": -11.369490814208984, + "step": 3411 + }, + { + "epoch": 0.31174052078574693, + "grad_norm": 10.25, + "kl": 0.0, + "learning_rate": 7.805360792343806e-06, + "logits/chosen": 574819882.6666666, + "logits/rejected": 358911744.0, + "logps/chosen": -348.4935709635417, + "logps/rejected": -301.8054504394531, + "loss": 0.0909, + "rewards/chosen": 3.642559051513672, + "rewards/margins": 8.674269199371338, + "rewards/rejected": -5.031710147857666, + "step": 3412 + }, + { + "epoch": 0.31183188670625855, + "grad_norm": 1.0625, + "kl": 0.0, + "learning_rate": 7.804170513260677e-06, + "logits/chosen": 781971968.0, + "logits/rejected": 479060114.28571427, + "logps/chosen": -761.3094482421875, + "logps/rejected": -486.61007254464283, + "loss": 0.0036, + "rewards/chosen": 3.7121827602386475, + "rewards/margins": 12.710860695157733, + "rewards/rejected": -8.998677934919085, + "step": 3413 + }, + { + "epoch": 0.31192325262677023, + "grad_norm": 2.75, + "kl": 0.0, + "learning_rate": 7.802980002298625e-06, + "logits/chosen": 768264640.0, + "logits/rejected": 504495264.0, + "logps/chosen": -624.904052734375, + "logps/rejected": -527.71826171875, + "loss": 0.0155, + "rewards/chosen": 3.6738250255584717, + "rewards/margins": 13.094571828842163, + "rewards/rejected": -9.420746803283691, + "step": 3414 + }, + { + "epoch": 0.31201461854728185, + "grad_norm": 3.71875, + "kl": 0.0, + "learning_rate": 7.801789259556101e-06, + "logits/chosen": 719005824.0, + "logits/rejected": 445976160.0, + "logps/chosen": -381.9263610839844, + "logps/rejected": -475.0135803222656, + "loss": 0.0228, + "rewards/chosen": 3.1385064125061035, + "rewards/margins": 12.831371784210205, + "rewards/rejected": -9.692865371704102, + "step": 3415 + }, + { + "epoch": 0.31210598446779353, + "grad_norm": 2.234375, + "kl": 0.0, + "learning_rate": 7.800598285131562e-06, + "logits/chosen": 531818752.0, + "logits/rejected": 372618393.6, + "logps/chosen": -280.29974365234375, + "logps/rejected": -543.527490234375, + "loss": 0.0097, + "rewards/chosen": 4.235352834065755, + "rewards/margins": 13.578804524739581, + "rewards/rejected": -9.343451690673827, + "step": 3416 + }, + { + "epoch": 0.31219735038830515, + "grad_norm": 3.515625, + "kl": 0.0, + "learning_rate": 7.799407079123493e-06, + "logits/chosen": 444866688.0, + "logits/rejected": 898861465.6, + "logps/chosen": -322.9446614583333, + "logps/rejected": -627.80615234375, + "loss": 0.0096, + "rewards/chosen": 4.362834294637044, + "rewards/margins": 14.526661809285482, + "rewards/rejected": -10.163827514648437, + "step": 3417 + }, + { + "epoch": 0.3122887163088168, + "grad_norm": 1.546875, + "kl": 0.0, + "learning_rate": 7.798215641630395e-06, + "logits/chosen": 379786912.0, + "logits/rejected": 554049280.0, + "logps/chosen": -247.40753173828125, + "logps/rejected": -529.4814046223959, + "loss": 0.0067, + "rewards/chosen": 3.9748740196228027, + "rewards/margins": 12.419004599253336, + "rewards/rejected": -8.444130579630533, + "step": 3418 + }, + { + "epoch": 0.31238008222932845, + "grad_norm": 3.28125, + "kl": 0.0, + "learning_rate": 7.797023972750792e-06, + "logits/chosen": 375605120.0, + "logits/rejected": 388052416.0, + "logps/chosen": -368.7978515625, + "logps/rejected": -322.1955871582031, + "loss": 0.013, + "rewards/chosen": 3.869032382965088, + "rewards/margins": 12.875783443450928, + "rewards/rejected": -9.00675106048584, + "step": 3419 + }, + { + "epoch": 0.3124714481498401, + "grad_norm": 0.53515625, + "kl": 0.0, + "learning_rate": 7.795832072583219e-06, + "logits/chosen": 513779584.0, + "logits/rejected": 1474440704.0, + "logps/chosen": -393.9639892578125, + "logps/rejected": -828.035888671875, + "loss": 0.0029, + "rewards/chosen": 5.332388401031494, + "rewards/margins": 16.526442050933838, + "rewards/rejected": -11.194053649902344, + "step": 3420 + }, + { + "epoch": 0.31256281407035175, + "grad_norm": 5.90625, + "kl": 0.0, + "learning_rate": 7.794639941226238e-06, + "logits/chosen": 823263914.6666666, + "logits/rejected": 1585969792.0, + "logps/chosen": -413.8788248697917, + "logps/rejected": -370.37890625, + "loss": 0.0421, + "rewards/chosen": 3.3623857498168945, + "rewards/margins": 11.589299201965332, + "rewards/rejected": -8.226913452148438, + "step": 3421 + }, + { + "epoch": 0.3126541799908634, + "grad_norm": 1.3515625, + "kl": 0.0, + "learning_rate": 7.793447578778427e-06, + "logits/chosen": 546864213.3333334, + "logits/rejected": 438172768.0, + "logps/chosen": -281.94126383463544, + "logps/rejected": -419.2176818847656, + "loss": 0.0097, + "rewards/chosen": 4.736151695251465, + "rewards/margins": 14.3575439453125, + "rewards/rejected": -9.621392250061035, + "step": 3422 + }, + { + "epoch": 0.31274554591137504, + "grad_norm": 1.921875, + "kl": 0.0, + "learning_rate": 7.792254985338383e-06, + "logits/chosen": 416667733.3333333, + "logits/rejected": 443364710.4, + "logps/chosen": -311.7737630208333, + "logps/rejected": -561.62763671875, + "loss": 0.0087, + "rewards/chosen": 4.061034520467122, + "rewards/margins": 12.86493771870931, + "rewards/rejected": -8.803903198242187, + "step": 3423 + }, + { + "epoch": 0.3128369118318867, + "grad_norm": 2.25, + "kl": 0.0, + "learning_rate": 7.791062161004724e-06, + "logits/chosen": 1268524672.0, + "logits/rejected": 661504981.3333334, + "logps/chosen": -412.721923828125, + "logps/rejected": -422.9894205729167, + "loss": 0.0097, + "rewards/chosen": 3.38166880607605, + "rewards/margins": 13.020185550053915, + "rewards/rejected": -9.638516743977865, + "step": 3424 + }, + { + "epoch": 0.31292827775239834, + "grad_norm": 1.390625, + "kl": 0.0, + "learning_rate": 7.789869105876083e-06, + "logits/chosen": 602814549.3333334, + "logits/rejected": 389439923.2, + "logps/chosen": -190.41288248697916, + "logps/rejected": -477.42529296875, + "loss": 0.0094, + "rewards/chosen": 4.2228959401448565, + "rewards/margins": 14.326352818806967, + "rewards/rejected": -10.10345687866211, + "step": 3425 + }, + { + "epoch": 0.31301964367291, + "grad_norm": 2.6875, + "kl": 0.0, + "learning_rate": 7.788675820051113e-06, + "logits/chosen": 467117397.3333333, + "logits/rejected": 368252928.0, + "logps/chosen": -313.9114176432292, + "logps/rejected": -400.4098876953125, + "loss": 0.0156, + "rewards/chosen": 3.617189089457194, + "rewards/margins": 12.506884066263835, + "rewards/rejected": -8.889694976806641, + "step": 3426 + }, + { + "epoch": 0.31311100959342164, + "grad_norm": 4.75, + "kl": 0.0, + "learning_rate": 7.787482303628494e-06, + "logits/chosen": 477349888.0, + "logits/rejected": 383200512.0, + "logps/chosen": -266.1, + "logps/rejected": -379.3208821614583, + "loss": 0.0291, + "rewards/chosen": 3.3688213348388674, + "rewards/margins": 12.316958618164062, + "rewards/rejected": -8.948137283325195, + "step": 3427 + }, + { + "epoch": 0.3132023755139333, + "grad_norm": 2.75, + "kl": 0.0, + "learning_rate": 7.786288556706913e-06, + "logits/chosen": 520030112.0, + "logits/rejected": 461519104.0, + "logps/chosen": -340.363525390625, + "logps/rejected": -514.3336181640625, + "loss": 0.0166, + "rewards/chosen": 3.455514907836914, + "rewards/margins": 12.674713134765625, + "rewards/rejected": -9.219198226928711, + "step": 3428 + }, + { + "epoch": 0.31329374143444494, + "grad_norm": 4.59375, + "kl": 0.0, + "learning_rate": 7.785094579385086e-06, + "logits/chosen": 490809600.0, + "logits/rejected": 523812960.0, + "logps/chosen": -291.4854329427083, + "logps/rejected": -662.06201171875, + "loss": 0.0346, + "rewards/chosen": 3.413632074991862, + "rewards/margins": 12.047977129618326, + "rewards/rejected": -8.634345054626465, + "step": 3429 + }, + { + "epoch": 0.3133851073549566, + "grad_norm": 8.8125, + "kl": 0.0, + "learning_rate": 7.783900371761741e-06, + "logits/chosen": 918278784.0, + "logits/rejected": 528189184.0, + "logps/chosen": -448.10626220703125, + "logps/rejected": -528.3553466796875, + "loss": 0.045, + "rewards/chosen": 3.45365309715271, + "rewards/margins": 10.900380849838257, + "rewards/rejected": -7.446727752685547, + "step": 3430 + }, + { + "epoch": 0.31347647327546824, + "grad_norm": 16.125, + "kl": 0.0, + "learning_rate": 7.782705933935629e-06, + "logits/chosen": 798183424.0, + "logits/rejected": 544639360.0, + "logps/chosen": -281.6935628255208, + "logps/rejected": -527.9147338867188, + "loss": 0.0401, + "rewards/chosen": 4.159964879353841, + "rewards/margins": 14.12805684407552, + "rewards/rejected": -9.96809196472168, + "step": 3431 + }, + { + "epoch": 0.3135678391959799, + "grad_norm": 3.421875, + "kl": 0.0, + "learning_rate": 7.781511266005517e-06, + "logits/chosen": 532187072.0, + "logits/rejected": 1404048128.0, + "logps/chosen": -115.3485107421875, + "logps/rejected": -373.006103515625, + "loss": 0.0187, + "rewards/chosen": 4.135491371154785, + "rewards/margins": 11.276946544647217, + "rewards/rejected": -7.141455173492432, + "step": 3432 + }, + { + "epoch": 0.31365920511649154, + "grad_norm": 5.875, + "kl": 0.0, + "learning_rate": 7.780316368070196e-06, + "logits/chosen": 685089280.0, + "logits/rejected": 466846421.3333333, + "logps/chosen": -295.2703369140625, + "logps/rejected": -524.9766845703125, + "loss": 0.0299, + "rewards/chosen": 3.5472888946533203, + "rewards/margins": 11.563555399576822, + "rewards/rejected": -8.016266504923502, + "step": 3433 + }, + { + "epoch": 0.3137505710370032, + "grad_norm": 4.46875, + "kl": 0.0, + "learning_rate": 7.779121240228468e-06, + "logits/chosen": 276790476.8, + "logits/rejected": 266065237.33333334, + "logps/chosen": -328.95576171875, + "logps/rejected": -518.3880208333334, + "loss": 0.0378, + "rewards/chosen": 3.036176300048828, + "rewards/margins": 11.426566060384115, + "rewards/rejected": -8.390389760335287, + "step": 3434 + }, + { + "epoch": 0.31384193695751483, + "grad_norm": 10.3125, + "kl": 0.0, + "learning_rate": 7.777925882579167e-06, + "logits/chosen": 400042112.0, + "logits/rejected": 461555814.4, + "logps/chosen": -349.2785237630208, + "logps/rejected": -612.70517578125, + "loss": 0.052, + "rewards/chosen": 2.1172210375467935, + "rewards/margins": 12.588723723093668, + "rewards/rejected": -10.471502685546875, + "step": 3435 + }, + { + "epoch": 0.3139333028780265, + "grad_norm": 3.375, + "kl": 0.0, + "learning_rate": 7.776730295221131e-06, + "logits/chosen": 481077248.0, + "logits/rejected": 520736819.2, + "logps/chosen": -224.75752766927084, + "logps/rejected": -584.162646484375, + "loss": 0.0182, + "rewards/chosen": 3.8179848988850913, + "rewards/margins": 12.959435399373373, + "rewards/rejected": -9.141450500488281, + "step": 3436 + }, + { + "epoch": 0.31402466879853813, + "grad_norm": 7.625, + "kl": 0.0, + "learning_rate": 7.775534478253228e-06, + "logits/chosen": 505962666.6666667, + "logits/rejected": 752163072.0, + "logps/chosen": -379.3493245442708, + "logps/rejected": -539.3035888671875, + "loss": 0.0419, + "rewards/chosen": 3.339667638142904, + "rewards/margins": 13.53673013051351, + "rewards/rejected": -10.197062492370605, + "step": 3437 + }, + { + "epoch": 0.3141160347190498, + "grad_norm": 3.46875, + "kl": 0.0, + "learning_rate": 7.77433843177434e-06, + "logits/chosen": 298720192.0, + "logits/rejected": 598857557.3333334, + "logps/chosen": -215.64321899414062, + "logps/rejected": -526.1598714192709, + "loss": 0.0127, + "rewards/chosen": 3.7104601860046387, + "rewards/margins": 13.772671540578207, + "rewards/rejected": -10.062211354573568, + "step": 3438 + }, + { + "epoch": 0.31420740063956143, + "grad_norm": 59.0, + "kl": 0.0, + "learning_rate": 7.773142155883366e-06, + "logits/chosen": 608433664.0, + "logits/rejected": 1104524185.6, + "logps/chosen": -289.1824951171875, + "logps/rejected": -474.90283203125, + "loss": 0.0876, + "rewards/chosen": 1.8746037483215332, + "rewards/margins": 8.268253803253174, + "rewards/rejected": -6.393650054931641, + "step": 3439 + }, + { + "epoch": 0.3142987665600731, + "grad_norm": 1.3203125, + "kl": 0.0, + "learning_rate": 7.771945650679232e-06, + "logits/chosen": 481257504.0, + "logits/rejected": 363302656.0, + "logps/chosen": -324.315185546875, + "logps/rejected": -389.682861328125, + "loss": 0.0081, + "rewards/chosen": 4.168407917022705, + "rewards/margins": 12.724005222320557, + "rewards/rejected": -8.555597305297852, + "step": 3440 + }, + { + "epoch": 0.3143901324805847, + "grad_norm": 2.015625, + "kl": 0.0, + "learning_rate": 7.770748916260875e-06, + "logits/chosen": 545510976.0, + "logits/rejected": 419965184.0, + "logps/chosen": -203.4532928466797, + "logps/rejected": -555.2247721354166, + "loss": 0.0111, + "rewards/chosen": 3.233792304992676, + "rewards/margins": 13.975771268208822, + "rewards/rejected": -10.741978963216146, + "step": 3441 + }, + { + "epoch": 0.3144814984010964, + "grad_norm": 12.0625, + "kl": 0.0, + "learning_rate": 7.769551952727255e-06, + "logits/chosen": 630717738.6666666, + "logits/rejected": 998548224.0, + "logps/chosen": -448.1518961588542, + "logps/rejected": -279.5130615234375, + "loss": 0.0442, + "rewards/chosen": 3.1440658569335938, + "rewards/margins": 9.488170623779297, + "rewards/rejected": -6.344104766845703, + "step": 3442 + }, + { + "epoch": 0.314572864321608, + "grad_norm": 3.203125, + "kl": 0.0, + "learning_rate": 7.768354760177346e-06, + "logits/chosen": 639574272.0, + "logits/rejected": 564709760.0, + "logps/chosen": -293.37255859375, + "logps/rejected": -669.6825561523438, + "loss": 0.0113, + "rewards/chosen": 4.465263366699219, + "rewards/margins": 14.99534797668457, + "rewards/rejected": -10.530084609985352, + "step": 3443 + }, + { + "epoch": 0.3146642302421197, + "grad_norm": 5.78125, + "kl": 0.0, + "learning_rate": 7.767157338710149e-06, + "logits/chosen": 399600810.6666667, + "logits/rejected": 681809536.0, + "logps/chosen": -300.00771077473956, + "logps/rejected": -838.697021484375, + "loss": 0.0441, + "rewards/chosen": 3.4676682154337564, + "rewards/margins": 10.978517214457193, + "rewards/rejected": -7.5108489990234375, + "step": 3444 + }, + { + "epoch": 0.3147555961626313, + "grad_norm": 2.671875, + "kl": 0.0, + "learning_rate": 7.765959688424679e-06, + "logits/chosen": 619266432.0, + "logits/rejected": 534033568.0, + "logps/chosen": -452.2040710449219, + "logps/rejected": -470.074951171875, + "loss": 0.0125, + "rewards/chosen": 4.0172014236450195, + "rewards/margins": 12.612977981567383, + "rewards/rejected": -8.595776557922363, + "step": 3445 + }, + { + "epoch": 0.314846962083143, + "grad_norm": 1.546875, + "kl": 0.0, + "learning_rate": 7.764761809419969e-06, + "logits/chosen": 439052800.0, + "logits/rejected": 646172864.0, + "logps/chosen": -265.64739990234375, + "logps/rejected": -580.9871215820312, + "loss": 0.0078, + "rewards/chosen": 4.45556640625, + "rewards/margins": 13.956483840942383, + "rewards/rejected": -9.500917434692383, + "step": 3446 + }, + { + "epoch": 0.3149383280036546, + "grad_norm": 2.25, + "kl": 0.0, + "learning_rate": 7.763563701795074e-06, + "logits/chosen": 621139584.0, + "logits/rejected": 311998131.2, + "logps/chosen": -353.6490885416667, + "logps/rejected": -389.866796875, + "loss": 0.0124, + "rewards/chosen": 3.837792714436849, + "rewards/margins": 13.4296506245931, + "rewards/rejected": -9.59185791015625, + "step": 3447 + }, + { + "epoch": 0.3150296939241663, + "grad_norm": 1.4453125, + "kl": 0.0, + "learning_rate": 7.762365365649068e-06, + "logits/chosen": 643836586.6666666, + "logits/rejected": 546901299.2, + "logps/chosen": -482.0414632161458, + "logps/rejected": -835.9181640625, + "loss": 0.0084, + "rewards/chosen": 4.061717351277669, + "rewards/margins": 16.084612401326496, + "rewards/rejected": -12.022895050048827, + "step": 3448 + }, + { + "epoch": 0.3151210598446779, + "grad_norm": 5.90625, + "kl": 0.0, + "learning_rate": 7.761166801081037e-06, + "logits/chosen": 836176042.6666666, + "logits/rejected": 730681600.0, + "logps/chosen": -229.0634765625, + "logps/rejected": -420.78330078125, + "loss": 0.0215, + "rewards/chosen": 3.275029500325521, + "rewards/margins": 11.568048604329428, + "rewards/rejected": -8.293019104003907, + "step": 3449 + }, + { + "epoch": 0.3152124257651896, + "grad_norm": 2.953125, + "kl": 0.0, + "learning_rate": 7.759968008190095e-06, + "logits/chosen": 653541760.0, + "logits/rejected": 733059737.6, + "logps/chosen": -356.5403645833333, + "logps/rejected": -539.382861328125, + "loss": 0.0147, + "rewards/chosen": 3.540008544921875, + "rewards/margins": 12.640187835693359, + "rewards/rejected": -9.100179290771484, + "step": 3450 + }, + { + "epoch": 0.3153037916857012, + "grad_norm": 2.515625, + "kl": 0.0, + "learning_rate": 7.758768987075371e-06, + "logits/chosen": 826006937.6, + "logits/rejected": 516453504.0, + "logps/chosen": -230.74072265625, + "logps/rejected": -690.3024088541666, + "loss": 0.0175, + "rewards/chosen": 4.142966842651367, + "rewards/margins": 17.829153315226236, + "rewards/rejected": -13.68618647257487, + "step": 3451 + }, + { + "epoch": 0.3153951576062129, + "grad_norm": 3.0, + "kl": 0.0, + "learning_rate": 7.757569737836013e-06, + "logits/chosen": 416185651.2, + "logits/rejected": 406890368.0, + "logps/chosen": -284.290771484375, + "logps/rejected": -425.1213785807292, + "loss": 0.0264, + "rewards/chosen": 3.2904991149902343, + "rewards/margins": 11.310079956054688, + "rewards/rejected": -8.019580841064453, + "step": 3452 + }, + { + "epoch": 0.3154865235267245, + "grad_norm": 6.15625, + "kl": 0.0, + "learning_rate": 7.756370260571184e-06, + "logits/chosen": 794247509.3333334, + "logits/rejected": 459987558.4, + "logps/chosen": -449.1168619791667, + "logps/rejected": -595.39794921875, + "loss": 0.0297, + "rewards/chosen": 3.2498130798339844, + "rewards/margins": 13.125489044189454, + "rewards/rejected": -9.87567596435547, + "step": 3453 + }, + { + "epoch": 0.3155778894472362, + "grad_norm": 0.734375, + "kl": 0.0, + "learning_rate": 7.755170555380075e-06, + "logits/chosen": 718890496.0, + "logits/rejected": 307578880.0, + "logps/chosen": -307.4938049316406, + "logps/rejected": -430.2274576822917, + "loss": 0.0045, + "rewards/chosen": 4.2322845458984375, + "rewards/margins": 13.606023152669271, + "rewards/rejected": -9.373738606770834, + "step": 3454 + }, + { + "epoch": 0.3156692553677478, + "grad_norm": 14.625, + "kl": 0.0, + "learning_rate": 7.75397062236189e-06, + "logits/chosen": 695517909.3333334, + "logits/rejected": 325442816.0, + "logps/chosen": -252.29901123046875, + "logps/rejected": -279.1183166503906, + "loss": 0.0385, + "rewards/chosen": 3.2777894337972007, + "rewards/margins": 7.133628209431967, + "rewards/rejected": -3.8558387756347656, + "step": 3455 + }, + { + "epoch": 0.3157606212882595, + "grad_norm": 4.75, + "kl": 0.0, + "learning_rate": 7.752770461615849e-06, + "logits/chosen": 468455392.0, + "logits/rejected": 387680448.0, + "logps/chosen": -290.56243896484375, + "logps/rejected": -575.4432373046875, + "loss": 0.0256, + "rewards/chosen": 3.449648380279541, + "rewards/margins": 13.926950931549072, + "rewards/rejected": -10.477302551269531, + "step": 3456 + }, + { + "epoch": 0.3158519872087711, + "grad_norm": 3.25, + "kl": 0.0, + "learning_rate": 7.751570073241196e-06, + "logits/chosen": 375128224.0, + "logits/rejected": 624344746.6666666, + "logps/chosen": -472.4405822753906, + "logps/rejected": -424.828125, + "loss": 0.0127, + "rewards/chosen": 3.1069741249084473, + "rewards/margins": 11.3319624265035, + "rewards/rejected": -8.224988301595053, + "step": 3457 + }, + { + "epoch": 0.3159433531292828, + "grad_norm": 2.25, + "kl": 0.0, + "learning_rate": 7.750369457337191e-06, + "logits/chosen": 661703680.0, + "logits/rejected": 372269994.6666667, + "logps/chosen": -422.628173828125, + "logps/rejected": -499.9309895833333, + "loss": 0.0091, + "rewards/chosen": 4.130183219909668, + "rewards/margins": 11.178052584330242, + "rewards/rejected": -7.047869364420573, + "step": 3458 + }, + { + "epoch": 0.3160347190497944, + "grad_norm": 3.453125, + "kl": 0.0, + "learning_rate": 7.749168614003116e-06, + "logits/chosen": 865181184.0, + "logits/rejected": 1232576768.0, + "logps/chosen": -449.4404296875, + "logps/rejected": -406.7541910807292, + "loss": 0.0224, + "rewards/chosen": 3.448224639892578, + "rewards/margins": 13.649388885498047, + "rewards/rejected": -10.201164245605469, + "step": 3459 + }, + { + "epoch": 0.3161260849703061, + "grad_norm": 2.90625, + "kl": 0.0, + "learning_rate": 7.747967543338267e-06, + "logits/chosen": 500538265.6, + "logits/rejected": 295180522.6666667, + "logps/chosen": -342.63740234375, + "logps/rejected": -377.0652262369792, + "loss": 0.0234, + "rewards/chosen": 3.352290725708008, + "rewards/margins": 11.747148768107095, + "rewards/rejected": -8.394858042399088, + "step": 3460 + }, + { + "epoch": 0.3162174508908177, + "grad_norm": 6.375, + "kl": 0.0, + "learning_rate": 7.746766245441963e-06, + "logits/chosen": 605674368.0, + "logits/rejected": 740704384.0, + "logps/chosen": -369.640869140625, + "logps/rejected": -636.3416748046875, + "loss": 0.0461, + "rewards/chosen": 3.4479265213012695, + "rewards/margins": 12.791237831115723, + "rewards/rejected": -9.343311309814453, + "step": 3461 + }, + { + "epoch": 0.3163088168113294, + "grad_norm": 0.68359375, + "kl": 0.0, + "learning_rate": 7.74556472041354e-06, + "logits/chosen": 304699904.0, + "logits/rejected": 719339008.0, + "logps/chosen": -126.73442840576172, + "logps/rejected": -616.9921177455357, + "loss": 0.0046, + "rewards/chosen": 3.786017656326294, + "rewards/margins": 12.928992509841919, + "rewards/rejected": -9.142974853515625, + "step": 3462 + }, + { + "epoch": 0.316400182731841, + "grad_norm": 4.375, + "kl": 0.0, + "learning_rate": 7.744362968352352e-06, + "logits/chosen": 323962675.2, + "logits/rejected": 596729685.3333334, + "logps/chosen": -281.0546875, + "logps/rejected": -390.8270263671875, + "loss": 0.0341, + "rewards/chosen": 3.6419776916503905, + "rewards/margins": 12.242664082845053, + "rewards/rejected": -8.600686391194662, + "step": 3463 + }, + { + "epoch": 0.3164915486523527, + "grad_norm": 2.84375, + "kl": 0.0, + "learning_rate": 7.743160989357774e-06, + "logits/chosen": 535948960.0, + "logits/rejected": 543118592.0, + "logps/chosen": -346.71533203125, + "logps/rejected": -483.3018493652344, + "loss": 0.0153, + "rewards/chosen": 3.718686819076538, + "rewards/margins": 12.636586904525757, + "rewards/rejected": -8.917900085449219, + "step": 3464 + }, + { + "epoch": 0.3165829145728643, + "grad_norm": 2.59375, + "kl": 0.0, + "learning_rate": 7.7419587835292e-06, + "logits/chosen": 651516992.0, + "logits/rejected": 554441344.0, + "logps/chosen": -401.6859436035156, + "logps/rejected": -580.830322265625, + "loss": 0.0166, + "rewards/chosen": 3.4614877700805664, + "rewards/margins": 11.779807090759277, + "rewards/rejected": -8.318319320678711, + "step": 3465 + }, + { + "epoch": 0.316674280493376, + "grad_norm": 2.265625, + "kl": 0.0, + "learning_rate": 7.740756350966037e-06, + "logits/chosen": 215637120.0, + "logits/rejected": 394748672.0, + "logps/chosen": -212.463134765625, + "logps/rejected": -421.819873046875, + "loss": 0.0107, + "rewards/chosen": 4.494548797607422, + "rewards/margins": 12.552710723876952, + "rewards/rejected": -8.05816192626953, + "step": 3466 + }, + { + "epoch": 0.3167656464138876, + "grad_norm": 4.8125, + "kl": 0.0, + "learning_rate": 7.739553691767718e-06, + "logits/chosen": 515575637.3333333, + "logits/rejected": 450966560.0, + "logps/chosen": -440.8587646484375, + "logps/rejected": -496.2210693359375, + "loss": 0.0238, + "rewards/chosen": 3.763487180074056, + "rewards/margins": 16.14120896657308, + "rewards/rejected": -12.377721786499023, + "step": 3467 + }, + { + "epoch": 0.3168570123343993, + "grad_norm": 1.4921875, + "kl": 0.0, + "learning_rate": 7.738350806033692e-06, + "logits/chosen": 379078229.3333333, + "logits/rejected": 571427840.0, + "logps/chosen": -189.8680419921875, + "logps/rejected": -774.5560913085938, + "loss": 0.0129, + "rewards/chosen": 4.140391985575358, + "rewards/margins": 13.50517241160075, + "rewards/rejected": -9.36478042602539, + "step": 3468 + }, + { + "epoch": 0.3169483782549109, + "grad_norm": 3.875, + "kl": 0.0, + "learning_rate": 7.737147693863427e-06, + "logits/chosen": 563773610.6666666, + "logits/rejected": 444824704.0, + "logps/chosen": -322.9339192708333, + "logps/rejected": -336.57073974609375, + "loss": 0.0325, + "rewards/chosen": 3.3494949340820312, + "rewards/margins": 9.04418659210205, + "rewards/rejected": -5.6946916580200195, + "step": 3469 + }, + { + "epoch": 0.3170397441754226, + "grad_norm": 1.453125, + "kl": 0.0, + "learning_rate": 7.735944355356407e-06, + "logits/chosen": 622889344.0, + "logits/rejected": 632428544.0, + "logps/chosen": -270.6987711588542, + "logps/rejected": -421.41103515625, + "loss": 0.0086, + "rewards/chosen": 4.0381418863932295, + "rewards/margins": 13.02288309733073, + "rewards/rejected": -8.9847412109375, + "step": 3470 + }, + { + "epoch": 0.3171311100959342, + "grad_norm": 47.5, + "kl": 0.0, + "learning_rate": 7.734740790612137e-06, + "logits/chosen": 703277226.6666666, + "logits/rejected": 577966540.8, + "logps/chosen": -345.05029296875, + "logps/rejected": -509.40849609375, + "loss": 0.0377, + "rewards/chosen": 2.978858311971029, + "rewards/margins": 12.52528928120931, + "rewards/rejected": -9.546430969238282, + "step": 3471 + }, + { + "epoch": 0.3172224760164459, + "grad_norm": 7.25, + "kl": 0.0, + "learning_rate": 7.73353699973014e-06, + "logits/chosen": 595774976.0, + "logits/rejected": 476081450.6666667, + "logps/chosen": -89.16378021240234, + "logps/rejected": -402.870849609375, + "loss": 0.0257, + "rewards/chosen": 2.6886439323425293, + "rewards/margins": 10.105869452158611, + "rewards/rejected": -7.417225519816081, + "step": 3472 + }, + { + "epoch": 0.3173138419369575, + "grad_norm": 4.9375, + "kl": 0.0, + "learning_rate": 7.732332982809962e-06, + "logits/chosen": 424869120.0, + "logits/rejected": 711738709.3333334, + "logps/chosen": -308.4091796875, + "logps/rejected": -496.8771565755208, + "loss": 0.0189, + "rewards/chosen": 2.9101028442382812, + "rewards/margins": 11.076881408691406, + "rewards/rejected": -8.166778564453125, + "step": 3473 + }, + { + "epoch": 0.3174052078574692, + "grad_norm": 4.96875, + "kl": 0.0, + "learning_rate": 7.731128739951162e-06, + "logits/chosen": 667440486.4, + "logits/rejected": 908683264.0, + "logps/chosen": -191.05146484375, + "logps/rejected": -778.27392578125, + "loss": 0.1326, + "rewards/chosen": 2.6101394653320313, + "rewards/margins": 13.356879170735677, + "rewards/rejected": -10.746739705403646, + "step": 3474 + }, + { + "epoch": 0.3174965737779808, + "grad_norm": 15.0, + "kl": 0.0, + "learning_rate": 7.729924271253319e-06, + "logits/chosen": 279329792.0, + "logits/rejected": 431087189.3333333, + "logps/chosen": -319.994189453125, + "logps/rejected": -359.9637044270833, + "loss": 0.0871, + "rewards/chosen": 3.493834686279297, + "rewards/margins": 10.595259221394857, + "rewards/rejected": -7.10142453511556, + "step": 3475 + }, + { + "epoch": 0.31758793969849247, + "grad_norm": 11.5625, + "kl": 0.0, + "learning_rate": 7.728719576816032e-06, + "logits/chosen": 523715712.0, + "logits/rejected": 725851648.0, + "logps/chosen": -279.58070882161456, + "logps/rejected": -505.5736328125, + "loss": 0.0731, + "rewards/chosen": 2.880986531575521, + "rewards/margins": 9.800042470296225, + "rewards/rejected": -6.919055938720703, + "step": 3476 + }, + { + "epoch": 0.3176793056190041, + "grad_norm": 2.171875, + "kl": 0.0, + "learning_rate": 7.727514656738918e-06, + "logits/chosen": 311777877.3333333, + "logits/rejected": 551413145.6, + "logps/chosen": -309.65565999348956, + "logps/rejected": -658.75458984375, + "loss": 0.0123, + "rewards/chosen": 3.762333552042643, + "rewards/margins": 13.927173487345376, + "rewards/rejected": -10.164839935302734, + "step": 3477 + }, + { + "epoch": 0.31777067153951577, + "grad_norm": 3.09375, + "kl": 0.0, + "learning_rate": 7.726309511121614e-06, + "logits/chosen": 703069696.0, + "logits/rejected": 435451093.3333333, + "logps/chosen": -408.240185546875, + "logps/rejected": -437.8251139322917, + "loss": 0.0205, + "rewards/chosen": 3.499002456665039, + "rewards/margins": 13.844573338826498, + "rewards/rejected": -10.345570882161459, + "step": 3478 + }, + { + "epoch": 0.3178620374600274, + "grad_norm": 1.8359375, + "kl": 0.0, + "learning_rate": 7.725104140063772e-06, + "logits/chosen": 554745514.6666666, + "logits/rejected": 560716390.4, + "logps/chosen": -421.9391276041667, + "logps/rejected": -739.44599609375, + "loss": 0.0088, + "rewards/chosen": 4.327060063680013, + "rewards/margins": 15.996472295125326, + "rewards/rejected": -11.669412231445312, + "step": 3479 + }, + { + "epoch": 0.31795340338053907, + "grad_norm": 6.6875, + "kl": 0.0, + "learning_rate": 7.723898543665067e-06, + "logits/chosen": 404746112.0, + "logits/rejected": 217117909.33333334, + "logps/chosen": -421.104296875, + "logps/rejected": -334.6854654947917, + "loss": 0.0262, + "rewards/chosen": 4.334239196777344, + "rewards/margins": 14.262740961710612, + "rewards/rejected": -9.928501764933268, + "step": 3480 + }, + { + "epoch": 0.3180447693010507, + "grad_norm": 2.6875, + "kl": 0.0, + "learning_rate": 7.722692722025188e-06, + "logits/chosen": 549416857.6, + "logits/rejected": 795500032.0, + "logps/chosen": -267.2623291015625, + "logps/rejected": -726.0113118489584, + "loss": 0.0323, + "rewards/chosen": 3.357133483886719, + "rewards/margins": 14.340673065185547, + "rewards/rejected": -10.983539581298828, + "step": 3481 + }, + { + "epoch": 0.31813613522156237, + "grad_norm": 2.796875, + "kl": 0.0, + "learning_rate": 7.721486675243851e-06, + "logits/chosen": 597018368.0, + "logits/rejected": 1079542886.4, + "logps/chosen": -385.8282877604167, + "logps/rejected": -1018.1896484375, + "loss": 0.0179, + "rewards/chosen": 3.0571155548095703, + "rewards/margins": 15.94550666809082, + "rewards/rejected": -12.88839111328125, + "step": 3482 + }, + { + "epoch": 0.318227501142074, + "grad_norm": 1.796875, + "kl": 0.0, + "learning_rate": 7.72028040342078e-06, + "logits/chosen": 472969625.6, + "logits/rejected": 511507072.0, + "logps/chosen": -230.32421875, + "logps/rejected": -787.0514322916666, + "loss": 0.0139, + "rewards/chosen": 3.9374122619628906, + "rewards/margins": 18.709748586018883, + "rewards/rejected": -14.77233632405599, + "step": 3483 + }, + { + "epoch": 0.31831886706258566, + "grad_norm": 2.515625, + "kl": 0.0, + "learning_rate": 7.719073906655722e-06, + "logits/chosen": 617619200.0, + "logits/rejected": 665212224.0, + "logps/chosen": -282.627685546875, + "logps/rejected": -537.625, + "loss": 0.0175, + "rewards/chosen": 3.4350409507751465, + "rewards/margins": 11.097041130065918, + "rewards/rejected": -7.6620001792907715, + "step": 3484 + }, + { + "epoch": 0.3184102329830973, + "grad_norm": 3.703125, + "kl": 0.0, + "learning_rate": 7.717867185048446e-06, + "logits/chosen": 989571008.0, + "logits/rejected": 537067712.0, + "logps/chosen": -348.594970703125, + "logps/rejected": -418.17230224609375, + "loss": 0.0312, + "rewards/chosen": 2.985234260559082, + "rewards/margins": 11.160344123840332, + "rewards/rejected": -8.17510986328125, + "step": 3485 + }, + { + "epoch": 0.31850159890360896, + "grad_norm": 12.3125, + "kl": 0.0, + "learning_rate": 7.716660238698734e-06, + "logits/chosen": 865922133.3333334, + "logits/rejected": 684521830.4, + "logps/chosen": -335.6634928385417, + "logps/rejected": -465.41259765625, + "loss": 0.1095, + "rewards/chosen": 2.7960468928019204, + "rewards/margins": 8.80057455698649, + "rewards/rejected": -6.00452766418457, + "step": 3486 + }, + { + "epoch": 0.3185929648241206, + "grad_norm": 3.5, + "kl": 0.0, + "learning_rate": 7.715453067706392e-06, + "logits/chosen": 743346090.6666666, + "logits/rejected": 994792550.4, + "logps/chosen": -378.9117838541667, + "logps/rejected": -508.6646484375, + "loss": 0.0142, + "rewards/chosen": 3.5184192657470703, + "rewards/margins": 13.217700576782226, + "rewards/rejected": -9.699281311035156, + "step": 3487 + }, + { + "epoch": 0.31868433074463226, + "grad_norm": 4.125, + "kl": 0.0, + "learning_rate": 7.714245672171238e-06, + "logits/chosen": 577154252.8, + "logits/rejected": 697029546.6666666, + "logps/chosen": -448.78935546875, + "logps/rejected": -495.1392415364583, + "loss": 0.0252, + "rewards/chosen": 3.7847030639648436, + "rewards/margins": 14.85539576212565, + "rewards/rejected": -11.070692698160807, + "step": 3488 + }, + { + "epoch": 0.3187756966651439, + "grad_norm": 1.8515625, + "kl": 0.0, + "learning_rate": 7.713038052193118e-06, + "logits/chosen": 375603904.0, + "logits/rejected": 415628842.6666667, + "logps/chosen": -284.3910217285156, + "logps/rejected": -578.0147705078125, + "loss": 0.0096, + "rewards/chosen": 3.257314443588257, + "rewards/margins": 12.436604579289755, + "rewards/rejected": -9.179290135701498, + "step": 3489 + }, + { + "epoch": 0.31886706258565556, + "grad_norm": 1.9453125, + "kl": 0.0, + "learning_rate": 7.711830207871887e-06, + "logits/chosen": 759994368.0, + "logits/rejected": 416431424.0, + "logps/chosen": -576.0037841796875, + "logps/rejected": -423.955078125, + "loss": 0.0081, + "rewards/chosen": 4.669650554656982, + "rewards/margins": 13.232934474945068, + "rewards/rejected": -8.563283920288086, + "step": 3490 + }, + { + "epoch": 0.3189584285061672, + "grad_norm": 1.3046875, + "kl": 0.0, + "learning_rate": 7.710622139307423e-06, + "logits/chosen": 526662400.0, + "logits/rejected": 539708211.2, + "logps/chosen": -433.7515462239583, + "logps/rejected": -453.809765625, + "loss": 0.0084, + "rewards/chosen": 4.25010363260905, + "rewards/margins": 12.7016783396403, + "rewards/rejected": -8.45157470703125, + "step": 3491 + }, + { + "epoch": 0.31904979442667886, + "grad_norm": 4.0625, + "kl": 0.0, + "learning_rate": 7.709413846599622e-06, + "logits/chosen": 441979712.0, + "logits/rejected": 360871552.0, + "logps/chosen": -392.1941833496094, + "logps/rejected": -665.2188720703125, + "loss": 0.0206, + "rewards/chosen": 3.8335556983947754, + "rewards/margins": 16.36267328262329, + "rewards/rejected": -12.529117584228516, + "step": 3492 + }, + { + "epoch": 0.3191411603471905, + "grad_norm": 2.984375, + "kl": 0.0, + "learning_rate": 7.708205329848401e-06, + "logits/chosen": 412928281.6, + "logits/rejected": 917454250.6666666, + "logps/chosen": -311.606689453125, + "logps/rejected": -884.08642578125, + "loss": 0.0234, + "rewards/chosen": 4.1370697021484375, + "rewards/margins": 20.261683146158855, + "rewards/rejected": -16.124613444010418, + "step": 3493 + }, + { + "epoch": 0.31923252626770215, + "grad_norm": 1.859375, + "kl": 0.0, + "learning_rate": 7.70699658915369e-06, + "logits/chosen": 525892608.0, + "logits/rejected": 731764633.6, + "logps/chosen": -303.6075032552083, + "logps/rejected": -544.7478515625, + "loss": 0.0094, + "rewards/chosen": 4.126698811848958, + "rewards/margins": 13.959853108723959, + "rewards/rejected": -9.833154296875, + "step": 3494 + }, + { + "epoch": 0.3193238921882138, + "grad_norm": 2.171875, + "kl": 0.0, + "learning_rate": 7.705787624615442e-06, + "logits/chosen": 275174656.0, + "logits/rejected": 813937664.0, + "logps/chosen": -106.53079986572266, + "logps/rejected": -504.59521484375, + "loss": 0.0115, + "rewards/chosen": 2.4309470653533936, + "rewards/margins": 10.933882202420916, + "rewards/rejected": -8.502935137067523, + "step": 3495 + }, + { + "epoch": 0.31941525810872545, + "grad_norm": 3.296875, + "kl": 0.0, + "learning_rate": 7.70457843633363e-06, + "logits/chosen": 350600256.0, + "logits/rejected": 569698602.6666666, + "logps/chosen": -396.5909423828125, + "logps/rejected": -450.2544759114583, + "loss": 0.0168, + "rewards/chosen": 2.7768936157226562, + "rewards/margins": 11.207903544108072, + "rewards/rejected": -8.431009928385416, + "step": 3496 + }, + { + "epoch": 0.3195066240292371, + "grad_norm": 1.046875, + "kl": 0.0, + "learning_rate": 7.703369024408235e-06, + "logits/chosen": 519195861.3333333, + "logits/rejected": 385344409.6, + "logps/chosen": -225.43265787760416, + "logps/rejected": -589.53896484375, + "loss": 0.0064, + "rewards/chosen": 4.201023101806641, + "rewards/margins": 12.788056945800781, + "rewards/rejected": -8.58703384399414, + "step": 3497 + }, + { + "epoch": 0.31959798994974875, + "grad_norm": 0.7265625, + "kl": 0.0, + "learning_rate": 7.702159388939272e-06, + "logits/chosen": 693953152.0, + "logits/rejected": 421224704.0, + "logps/chosen": -367.4750671386719, + "logps/rejected": -589.6660563151041, + "loss": 0.0036, + "rewards/chosen": 4.3067426681518555, + "rewards/margins": 14.72754192352295, + "rewards/rejected": -10.420799255371094, + "step": 3498 + }, + { + "epoch": 0.31968935587026037, + "grad_norm": 4.0625, + "kl": 0.0, + "learning_rate": 7.700949530026764e-06, + "logits/chosen": 888468260.5714285, + "logits/rejected": 626215552.0, + "logps/chosen": -231.32561383928572, + "logps/rejected": -592.0045166015625, + "loss": 0.0441, + "rewards/chosen": 3.285249710083008, + "rewards/margins": 12.458650588989258, + "rewards/rejected": -9.17340087890625, + "step": 3499 + }, + { + "epoch": 0.31978072179077205, + "grad_norm": 4.34375, + "kl": 0.0, + "learning_rate": 7.699739447770752e-06, + "logits/chosen": 668429696.0, + "logits/rejected": 526815552.0, + "logps/chosen": -358.2160949707031, + "logps/rejected": -296.47100830078125, + "loss": 0.0435, + "rewards/chosen": 2.569131851196289, + "rewards/margins": 11.005615234375, + "rewards/rejected": -8.436483383178711, + "step": 3500 + }, + { + "epoch": 0.31987208771128367, + "grad_norm": 11.625, + "kl": 0.0, + "learning_rate": 7.698529142271303e-06, + "logits/chosen": 626519808.0, + "logits/rejected": 236622496.0, + "logps/chosen": -475.1646321614583, + "logps/rejected": -307.06329345703125, + "loss": 0.0746, + "rewards/chosen": 2.860457420349121, + "rewards/margins": 8.696475505828857, + "rewards/rejected": -5.836018085479736, + "step": 3501 + }, + { + "epoch": 0.31996345363179535, + "grad_norm": 5.625, + "kl": 0.0, + "learning_rate": 7.697318613628496e-06, + "logits/chosen": 476932096.0, + "logits/rejected": 440773760.0, + "logps/chosen": -377.4140218098958, + "logps/rejected": -470.4162902832031, + "loss": 0.0394, + "rewards/chosen": 3.2880468368530273, + "rewards/margins": 13.807888984680176, + "rewards/rejected": -10.519842147827148, + "step": 3502 + }, + { + "epoch": 0.32005481955230697, + "grad_norm": 1.671875, + "kl": 0.0, + "learning_rate": 7.696107861942431e-06, + "logits/chosen": 775561045.3333334, + "logits/rejected": 1135749836.8, + "logps/chosen": -370.452392578125, + "logps/rejected": -739.701806640625, + "loss": 0.0099, + "rewards/chosen": 3.8760757446289062, + "rewards/margins": 11.988414764404297, + "rewards/rejected": -8.11233901977539, + "step": 3503 + }, + { + "epoch": 0.32014618547281865, + "grad_norm": 19.875, + "kl": 0.0, + "learning_rate": 7.694896887313225e-06, + "logits/chosen": 749623552.0, + "logits/rejected": 710310208.0, + "logps/chosen": -295.5060729980469, + "logps/rejected": -502.6708984375, + "loss": 0.0462, + "rewards/chosen": 2.874950408935547, + "rewards/margins": 12.690422058105469, + "rewards/rejected": -9.815471649169922, + "step": 3504 + }, + { + "epoch": 0.32023755139333027, + "grad_norm": 2.15625, + "kl": 0.0, + "learning_rate": 7.693685689841018e-06, + "logits/chosen": 298200384.0, + "logits/rejected": 949649846.8571428, + "logps/chosen": -196.6048126220703, + "logps/rejected": -395.2351771763393, + "loss": 0.0094, + "rewards/chosen": 2.7500107288360596, + "rewards/margins": 9.692549398967198, + "rewards/rejected": -6.942538670131138, + "step": 3505 + }, + { + "epoch": 0.32032891731384194, + "grad_norm": 5.90625, + "kl": 0.0, + "learning_rate": 7.692474269625961e-06, + "logits/chosen": 341782826.6666667, + "logits/rejected": 524228454.4, + "logps/chosen": -401.7542317708333, + "logps/rejected": -604.00205078125, + "loss": 0.0225, + "rewards/chosen": 4.338537851969401, + "rewards/margins": 13.1805414835612, + "rewards/rejected": -8.842003631591798, + "step": 3506 + }, + { + "epoch": 0.32042028323435356, + "grad_norm": 2.078125, + "kl": 0.0, + "learning_rate": 7.691262626768226e-06, + "logits/chosen": 518894720.0, + "logits/rejected": 296086976.0, + "logps/chosen": -372.5334167480469, + "logps/rejected": -498.31817626953125, + "loss": 0.0137, + "rewards/chosen": 3.6533761024475098, + "rewards/margins": 12.478479862213135, + "rewards/rejected": -8.825103759765625, + "step": 3507 + }, + { + "epoch": 0.32051164915486524, + "grad_norm": 3.1875, + "kl": 0.0, + "learning_rate": 7.690050761368007e-06, + "logits/chosen": 524522956.8, + "logits/rejected": 445234261.3333333, + "logps/chosen": -375.708251953125, + "logps/rejected": -397.1079508463542, + "loss": 0.0192, + "rewards/chosen": 3.878936004638672, + "rewards/margins": 12.722331237792968, + "rewards/rejected": -8.843395233154297, + "step": 3508 + }, + { + "epoch": 0.32060301507537686, + "grad_norm": 4.8125, + "kl": 0.0, + "learning_rate": 7.688838673525516e-06, + "logits/chosen": 424487168.0, + "logits/rejected": 421402291.2, + "logps/chosen": -392.5672200520833, + "logps/rejected": -470.221142578125, + "loss": 0.0214, + "rewards/chosen": 3.4138005574544272, + "rewards/margins": 12.884073384602866, + "rewards/rejected": -9.470272827148438, + "step": 3509 + }, + { + "epoch": 0.32069438099588854, + "grad_norm": 7.4375, + "kl": 0.0, + "learning_rate": 7.687626363340978e-06, + "logits/chosen": 473996160.0, + "logits/rejected": 576565717.3333334, + "logps/chosen": -236.81549072265625, + "logps/rejected": -527.9121907552084, + "loss": 0.0158, + "rewards/chosen": 4.250545024871826, + "rewards/margins": 12.674894491831461, + "rewards/rejected": -8.424349466959635, + "step": 3510 + }, + { + "epoch": 0.32078574691640016, + "grad_norm": 3.609375, + "kl": 0.0, + "learning_rate": 7.686413830914642e-06, + "logits/chosen": 872037376.0, + "logits/rejected": 371453152.0, + "logps/chosen": -539.339599609375, + "logps/rejected": -267.1338806152344, + "loss": 0.0193, + "rewards/chosen": 3.395831346511841, + "rewards/margins": 11.668587923049927, + "rewards/rejected": -8.272756576538086, + "step": 3511 + }, + { + "epoch": 0.32087711283691184, + "grad_norm": 0.7734375, + "kl": 0.0, + "learning_rate": 7.685201076346773e-06, + "logits/chosen": 1690945024.0, + "logits/rejected": 656072106.6666666, + "logps/chosen": -316.6110534667969, + "logps/rejected": -365.354736328125, + "loss": 0.0051, + "rewards/chosen": 4.150899887084961, + "rewards/margins": 13.627246220906576, + "rewards/rejected": -9.476346333821615, + "step": 3512 + }, + { + "epoch": 0.32096847875742346, + "grad_norm": 1.09375, + "kl": 0.0, + "learning_rate": 7.683988099737652e-06, + "logits/chosen": 471577002.6666667, + "logits/rejected": 510425395.2, + "logps/chosen": -160.84266153971353, + "logps/rejected": -427.988818359375, + "loss": 0.0136, + "rewards/chosen": 3.7046705881754556, + "rewards/margins": 12.360972468058268, + "rewards/rejected": -8.656301879882813, + "step": 3513 + }, + { + "epoch": 0.32105984467793514, + "grad_norm": 1.3828125, + "kl": 0.0, + "learning_rate": 7.682774901187585e-06, + "logits/chosen": 376804906.6666667, + "logits/rejected": 509510604.8, + "logps/chosen": -222.34639485677084, + "logps/rejected": -408.830517578125, + "loss": 0.0089, + "rewards/chosen": 3.826775550842285, + "rewards/margins": 11.815529823303223, + "rewards/rejected": -7.9887542724609375, + "step": 3514 + }, + { + "epoch": 0.32115121059844676, + "grad_norm": 6.8125, + "kl": 0.0, + "learning_rate": 7.681561480796886e-06, + "logits/chosen": 644684185.6, + "logits/rejected": 402750549.3333333, + "logps/chosen": -429.529052734375, + "logps/rejected": -466.787841796875, + "loss": 0.0481, + "rewards/chosen": 2.8912534713745117, + "rewards/margins": 11.741796811421713, + "rewards/rejected": -8.850543340047201, + "step": 3515 + }, + { + "epoch": 0.32124257651895843, + "grad_norm": 4.46875, + "kl": 0.0, + "learning_rate": 7.680347838665902e-06, + "logits/chosen": 586810368.0, + "logits/rejected": 384413866.6666667, + "logps/chosen": -233.434228515625, + "logps/rejected": -407.1798502604167, + "loss": 0.0173, + "rewards/chosen": 4.414683532714844, + "rewards/margins": 13.719681294759116, + "rewards/rejected": -9.304997762044271, + "step": 3516 + }, + { + "epoch": 0.32133394243947005, + "grad_norm": 1.1796875, + "kl": 0.0, + "learning_rate": 7.679133974894984e-06, + "logits/chosen": 311553578.6666667, + "logits/rejected": 454318694.4, + "logps/chosen": -312.0650227864583, + "logps/rejected": -516.95771484375, + "loss": 0.0048, + "rewards/chosen": 4.640145619710286, + "rewards/margins": 13.774175771077473, + "rewards/rejected": -9.134030151367188, + "step": 3517 + }, + { + "epoch": 0.32142530835998173, + "grad_norm": 1.2578125, + "kl": 0.0, + "learning_rate": 7.677919889584508e-06, + "logits/chosen": 540472832.0, + "logits/rejected": 617997397.3333334, + "logps/chosen": -429.79864501953125, + "logps/rejected": -557.6952311197916, + "loss": 0.0056, + "rewards/chosen": 3.920515537261963, + "rewards/margins": 13.799014886220297, + "rewards/rejected": -9.878499348958334, + "step": 3518 + }, + { + "epoch": 0.32151667428049335, + "grad_norm": 4.09375, + "kl": 0.0, + "learning_rate": 7.676705582834871e-06, + "logits/chosen": 1095517388.8, + "logits/rejected": 565442048.0, + "logps/chosen": -420.39306640625, + "logps/rejected": -440.1455891927083, + "loss": 0.026, + "rewards/chosen": 3.797773742675781, + "rewards/margins": 11.684413655598958, + "rewards/rejected": -7.886639912923177, + "step": 3519 + }, + { + "epoch": 0.32160804020100503, + "grad_norm": 3.0, + "kl": 0.0, + "learning_rate": 7.67549105474648e-06, + "logits/chosen": 469922986.6666667, + "logits/rejected": 355104256.0, + "logps/chosen": -225.38094075520834, + "logps/rejected": -395.602490234375, + "loss": 0.0194, + "rewards/chosen": 3.006908098856608, + "rewards/margins": 12.229810015360513, + "rewards/rejected": -9.222901916503906, + "step": 3520 + }, + { + "epoch": 0.32169940612151665, + "grad_norm": 2.609375, + "kl": 0.0, + "learning_rate": 7.67427630541977e-06, + "logits/chosen": 670353408.0, + "logits/rejected": 587463594.6666666, + "logps/chosen": -495.865380859375, + "logps/rejected": -529.1979166666666, + "loss": 0.0148, + "rewards/chosen": 4.356261825561523, + "rewards/margins": 13.026249821980795, + "rewards/rejected": -8.669987996419271, + "step": 3521 + }, + { + "epoch": 0.32179077204202833, + "grad_norm": 3.171875, + "kl": 0.0, + "learning_rate": 7.673061334955187e-06, + "logits/chosen": 952867840.0, + "logits/rejected": 1055940437.3333334, + "logps/chosen": -387.11220703125, + "logps/rejected": -491.9967041015625, + "loss": 0.0199, + "rewards/chosen": 4.103912734985352, + "rewards/margins": 12.02161954243978, + "rewards/rejected": -7.917706807454427, + "step": 3522 + }, + { + "epoch": 0.32188213796253995, + "grad_norm": 2.34375, + "kl": 0.0, + "learning_rate": 7.671846143453198e-06, + "logits/chosen": 382105728.0, + "logits/rejected": 372444352.0, + "logps/chosen": -331.4906412760417, + "logps/rejected": -299.0704650878906, + "loss": 0.0167, + "rewards/chosen": 4.327219009399414, + "rewards/margins": 11.894097328186035, + "rewards/rejected": -7.566878318786621, + "step": 3523 + }, + { + "epoch": 0.3219735038830516, + "grad_norm": 33.0, + "kl": 0.0, + "learning_rate": 7.670630731014288e-06, + "logits/chosen": 406128713.14285713, + "logits/rejected": 371714816.0, + "logps/chosen": -240.86872209821428, + "logps/rejected": -252.40484619140625, + "loss": 0.2396, + "rewards/chosen": 1.951380729675293, + "rewards/margins": 9.45545482635498, + "rewards/rejected": -7.5040740966796875, + "step": 3524 + }, + { + "epoch": 0.32206486980356325, + "grad_norm": 2.4375, + "kl": 0.0, + "learning_rate": 7.66941509773896e-06, + "logits/chosen": 491752362.6666667, + "logits/rejected": 495583334.4, + "logps/chosen": -218.5654093424479, + "logps/rejected": -535.50263671875, + "loss": 0.0264, + "rewards/chosen": 2.9970954259236655, + "rewards/margins": 11.699063809712728, + "rewards/rejected": -8.701968383789062, + "step": 3525 + }, + { + "epoch": 0.3221562357240749, + "grad_norm": 2.015625, + "kl": 0.0, + "learning_rate": 7.668199243727736e-06, + "logits/chosen": 750668736.0, + "logits/rejected": 492791637.3333333, + "logps/chosen": -596.4805908203125, + "logps/rejected": -359.6081136067708, + "loss": 0.0082, + "rewards/chosen": 3.7245211601257324, + "rewards/margins": 12.24434487024943, + "rewards/rejected": -8.519823710123697, + "step": 3526 + }, + { + "epoch": 0.32224760164458655, + "grad_norm": 1.6875, + "kl": 0.0, + "learning_rate": 7.666983169081157e-06, + "logits/chosen": 340364800.0, + "logits/rejected": 386479488.0, + "logps/chosen": -219.8671875, + "logps/rejected": -280.2060241699219, + "loss": 0.0094, + "rewards/chosen": 4.741456031799316, + "rewards/margins": 13.603341102600098, + "rewards/rejected": -8.861885070800781, + "step": 3527 + }, + { + "epoch": 0.3223389675650982, + "grad_norm": 1.796875, + "kl": 0.0, + "learning_rate": 7.665766873899778e-06, + "logits/chosen": 481774944.0, + "logits/rejected": 767269888.0, + "logps/chosen": -372.32037353515625, + "logps/rejected": -588.9557291666666, + "loss": 0.0076, + "rewards/chosen": 3.5020217895507812, + "rewards/margins": 14.071699778238932, + "rewards/rejected": -10.56967798868815, + "step": 3528 + }, + { + "epoch": 0.32243033348560984, + "grad_norm": 2.125, + "kl": 0.0, + "learning_rate": 7.66455035828418e-06, + "logits/chosen": 654651264.0, + "logits/rejected": 690777728.0, + "logps/chosen": -438.23907470703125, + "logps/rejected": -592.7564697265625, + "loss": 0.0111, + "rewards/chosen": 4.152948379516602, + "rewards/margins": 14.535615921020508, + "rewards/rejected": -10.382667541503906, + "step": 3529 + }, + { + "epoch": 0.3225216994061215, + "grad_norm": 8.3125, + "kl": 0.0, + "learning_rate": 7.663333622334953e-06, + "logits/chosen": 621706069.3333334, + "logits/rejected": 959329894.4, + "logps/chosen": -462.9033203125, + "logps/rejected": -401.372900390625, + "loss": 0.0313, + "rewards/chosen": 3.4005514780680337, + "rewards/margins": 11.945413080851237, + "rewards/rejected": -8.544861602783204, + "step": 3530 + }, + { + "epoch": 0.32261306532663314, + "grad_norm": 3.5, + "kl": 0.0, + "learning_rate": 7.662116666152713e-06, + "logits/chosen": 450313824.0, + "logits/rejected": 501921536.0, + "logps/chosen": -311.87890625, + "logps/rejected": -629.564697265625, + "loss": 0.0179, + "rewards/chosen": 3.940645694732666, + "rewards/margins": 11.955507755279541, + "rewards/rejected": -8.014862060546875, + "step": 3531 + }, + { + "epoch": 0.3227044312471448, + "grad_norm": 11.625, + "kl": 0.0, + "learning_rate": 7.660899489838087e-06, + "logits/chosen": 954523340.8, + "logits/rejected": 342732736.0, + "logps/chosen": -266.5600830078125, + "logps/rejected": -391.9936116536458, + "loss": 0.0543, + "rewards/chosen": 3.7979087829589844, + "rewards/margins": 11.965655008951822, + "rewards/rejected": -8.167746225992838, + "step": 3532 + }, + { + "epoch": 0.32279579716765644, + "grad_norm": 2.875, + "kl": 0.0, + "learning_rate": 7.65968209349173e-06, + "logits/chosen": 708080725.3333334, + "logits/rejected": 366679884.8, + "logps/chosen": -239.0866902669271, + "logps/rejected": -452.71513671875, + "loss": 0.0205, + "rewards/chosen": 2.8813584645589194, + "rewards/margins": 13.697419865926108, + "rewards/rejected": -10.816061401367188, + "step": 3533 + }, + { + "epoch": 0.3228871630881681, + "grad_norm": 15.875, + "kl": 0.0, + "learning_rate": 7.658464477214305e-06, + "logits/chosen": 359515552.0, + "logits/rejected": 738609792.0, + "logps/chosen": -158.34967041015625, + "logps/rejected": -486.96490478515625, + "loss": 0.1574, + "rewards/chosen": 3.2248294353485107, + "rewards/margins": 10.188109159469604, + "rewards/rejected": -6.963279724121094, + "step": 3534 + }, + { + "epoch": 0.32297852900867974, + "grad_norm": 2.359375, + "kl": 0.0, + "learning_rate": 7.6572466411065e-06, + "logits/chosen": 671441728.0, + "logits/rejected": 1156373333.3333333, + "logps/chosen": -383.23541259765625, + "logps/rejected": -731.9771321614584, + "loss": 0.0089, + "rewards/chosen": 3.773817539215088, + "rewards/margins": 13.12322727839152, + "rewards/rejected": -9.349409739176432, + "step": 3535 + }, + { + "epoch": 0.3230698949291914, + "grad_norm": 1.7109375, + "kl": 0.0, + "learning_rate": 7.656028585269017e-06, + "logits/chosen": 488802496.0, + "logits/rejected": 503717088.0, + "logps/chosen": -295.2570495605469, + "logps/rejected": -484.045654296875, + "loss": 0.0102, + "rewards/chosen": 4.1095781326293945, + "rewards/margins": 13.050618171691895, + "rewards/rejected": -8.9410400390625, + "step": 3536 + }, + { + "epoch": 0.32316126084970304, + "grad_norm": 2.609375, + "kl": 0.0, + "learning_rate": 7.65481030980258e-06, + "logits/chosen": 632543027.2, + "logits/rejected": 366328490.6666667, + "logps/chosen": -394.296826171875, + "logps/rejected": -434.4597981770833, + "loss": 0.0168, + "rewards/chosen": 3.6814136505126953, + "rewards/margins": 14.17878278096517, + "rewards/rejected": -10.497369130452475, + "step": 3537 + }, + { + "epoch": 0.3232526267702147, + "grad_norm": 4.5625, + "kl": 0.0, + "learning_rate": 7.653591814807925e-06, + "logits/chosen": 360125792.0, + "logits/rejected": 663352256.0, + "logps/chosen": -320.62872314453125, + "logps/rejected": -685.7225341796875, + "loss": 0.0238, + "rewards/chosen": 4.035454273223877, + "rewards/margins": 12.657652378082275, + "rewards/rejected": -8.622198104858398, + "step": 3538 + }, + { + "epoch": 0.32334399269072633, + "grad_norm": 2.40625, + "kl": 0.0, + "learning_rate": 7.652373100385816e-06, + "logits/chosen": 660823168.0, + "logits/rejected": 405591488.0, + "logps/chosen": -428.74359130859375, + "logps/rejected": -456.8280334472656, + "loss": 0.0152, + "rewards/chosen": 3.5272529125213623, + "rewards/margins": 12.999765157699585, + "rewards/rejected": -9.472512245178223, + "step": 3539 + }, + { + "epoch": 0.323435358611238, + "grad_norm": 3.625, + "kl": 0.0, + "learning_rate": 7.651154166637025e-06, + "logits/chosen": 596333184.0, + "logits/rejected": 450702688.0, + "logps/chosen": -420.1725667317708, + "logps/rejected": -510.88653564453125, + "loss": 0.0228, + "rewards/chosen": 3.8109003702799478, + "rewards/margins": 13.22792116800944, + "rewards/rejected": -9.417020797729492, + "step": 3540 + }, + { + "epoch": 0.32352672453174963, + "grad_norm": 5.0625, + "kl": 0.0, + "learning_rate": 7.649935013662347e-06, + "logits/chosen": 401044480.0, + "logits/rejected": 455890176.0, + "logps/chosen": -304.3531901041667, + "logps/rejected": -479.725244140625, + "loss": 0.0288, + "rewards/chosen": 2.6255900065104165, + "rewards/margins": 11.380399576822915, + "rewards/rejected": -8.7548095703125, + "step": 3541 + }, + { + "epoch": 0.3236180904522613, + "grad_norm": 3.78125, + "kl": 0.0, + "learning_rate": 7.648715641562598e-06, + "logits/chosen": 372940288.0, + "logits/rejected": 607386752.0, + "logps/chosen": -218.81241861979166, + "logps/rejected": -907.2904052734375, + "loss": 0.0256, + "rewards/chosen": 3.996970812479655, + "rewards/margins": 13.77043596903483, + "rewards/rejected": -9.773465156555176, + "step": 3542 + }, + { + "epoch": 0.32370945637277293, + "grad_norm": 10.875, + "kl": 0.0, + "learning_rate": 7.647496050438603e-06, + "logits/chosen": 696745536.0, + "logits/rejected": 409338752.0, + "logps/chosen": -518.391845703125, + "logps/rejected": -452.4021911621094, + "loss": 0.0583, + "rewards/chosen": 3.449815273284912, + "rewards/margins": 10.105623722076416, + "rewards/rejected": -6.655808448791504, + "step": 3543 + }, + { + "epoch": 0.3238008222932846, + "grad_norm": 4.1875, + "kl": 0.0, + "learning_rate": 7.646276240391218e-06, + "logits/chosen": 420133888.0, + "logits/rejected": 447536608.0, + "logps/chosen": -288.92457798549106, + "logps/rejected": -426.8572692871094, + "loss": 0.0398, + "rewards/chosen": 3.9072159358433316, + "rewards/margins": 12.857810837881907, + "rewards/rejected": -8.950594902038574, + "step": 3544 + }, + { + "epoch": 0.32389218821379623, + "grad_norm": 2.984375, + "kl": 0.0, + "learning_rate": 7.645056211521302e-06, + "logits/chosen": 581609002.6666666, + "logits/rejected": 464348467.2, + "logps/chosen": -342.8544108072917, + "logps/rejected": -453.72470703125, + "loss": 0.0173, + "rewards/chosen": 3.0693699518839517, + "rewards/margins": 11.076668230692546, + "rewards/rejected": -8.007298278808594, + "step": 3545 + }, + { + "epoch": 0.3239835541343079, + "grad_norm": 26.875, + "kl": 0.0, + "learning_rate": 7.643835963929747e-06, + "logits/chosen": 588190464.0, + "logits/rejected": 486093653.3333333, + "logps/chosen": -435.3044921875, + "logps/rejected": -446.9014078776042, + "loss": 0.0705, + "rewards/chosen": 3.0587604522705076, + "rewards/margins": 12.743658192952473, + "rewards/rejected": -9.684897740681967, + "step": 3546 + }, + { + "epoch": 0.3240749200548195, + "grad_norm": 2.421875, + "kl": 0.0, + "learning_rate": 7.642615497717452e-06, + "logits/chosen": 1180874496.0, + "logits/rejected": 375401045.3333333, + "logps/chosen": -179.45074462890625, + "logps/rejected": -412.1807861328125, + "loss": 0.0109, + "rewards/chosen": 3.994352102279663, + "rewards/margins": 12.345067739486694, + "rewards/rejected": -8.350715637207031, + "step": 3547 + }, + { + "epoch": 0.3241662859753312, + "grad_norm": 3.59375, + "kl": 0.0, + "learning_rate": 7.641394812985343e-06, + "logits/chosen": 346390476.8, + "logits/rejected": 459713536.0, + "logps/chosen": -243.723974609375, + "logps/rejected": -656.6234944661459, + "loss": 0.0156, + "rewards/chosen": 4.362246704101563, + "rewards/margins": 14.410938771565757, + "rewards/rejected": -10.048692067464193, + "step": 3548 + }, + { + "epoch": 0.3242576518958428, + "grad_norm": 19.0, + "kl": 0.0, + "learning_rate": 7.640173909834353e-06, + "logits/chosen": 773927253.3333334, + "logits/rejected": 620843929.6, + "logps/chosen": -129.2515869140625, + "logps/rejected": -417.320703125, + "loss": 0.0404, + "rewards/chosen": 3.316929499308268, + "rewards/margins": 13.553498713175456, + "rewards/rejected": -10.236569213867188, + "step": 3549 + }, + { + "epoch": 0.3243490178163545, + "grad_norm": 37.75, + "kl": 0.0, + "learning_rate": 7.638952788365444e-06, + "logits/chosen": 553948864.0, + "logits/rejected": 390338432.0, + "logps/chosen": -235.96511840820312, + "logps/rejected": -551.8512573242188, + "loss": 0.0526, + "rewards/chosen": 2.7972030639648438, + "rewards/margins": 11.520793914794922, + "rewards/rejected": -8.723590850830078, + "step": 3550 + }, + { + "epoch": 0.3244403837368661, + "grad_norm": 7.03125, + "kl": 0.0, + "learning_rate": 7.637731448679588e-06, + "logits/chosen": 906967616.0, + "logits/rejected": 624579754.6666666, + "logps/chosen": -467.19561767578125, + "logps/rejected": -349.97021484375, + "loss": 0.0269, + "rewards/chosen": 2.603817939758301, + "rewards/margins": 9.525110562642414, + "rewards/rejected": -6.921292622884114, + "step": 3551 + }, + { + "epoch": 0.3245317496573778, + "grad_norm": 2.6875, + "kl": 0.0, + "learning_rate": 7.636509890877782e-06, + "logits/chosen": 540432320.0, + "logits/rejected": 562658752.0, + "logps/chosen": -301.2670593261719, + "logps/rejected": -528.643798828125, + "loss": 0.0174, + "rewards/chosen": 3.8930113315582275, + "rewards/margins": 13.594901323318481, + "rewards/rejected": -9.701889991760254, + "step": 3552 + }, + { + "epoch": 0.3246231155778894, + "grad_norm": 11.5625, + "kl": 0.0, + "learning_rate": 7.635288115061035e-06, + "logits/chosen": 500130688.0, + "logits/rejected": 309535963.4285714, + "logps/chosen": -278.9001770019531, + "logps/rejected": -444.97408621651783, + "loss": 0.0265, + "rewards/chosen": 2.9352691173553467, + "rewards/margins": 10.665610347475324, + "rewards/rejected": -7.730341230119977, + "step": 3553 + }, + { + "epoch": 0.3247144814984011, + "grad_norm": 11.3125, + "kl": 0.0, + "learning_rate": 7.634066121330377e-06, + "logits/chosen": 373315584.0, + "logits/rejected": 297784917.3333333, + "logps/chosen": -341.9939453125, + "logps/rejected": -339.03525797526044, + "loss": 0.1245, + "rewards/chosen": 3.2781768798828126, + "rewards/margins": 8.145467376708984, + "rewards/rejected": -4.867290496826172, + "step": 3554 + }, + { + "epoch": 0.3248058474189127, + "grad_norm": 4.4375, + "kl": 0.0, + "learning_rate": 7.632843909786855e-06, + "logits/chosen": 472152160.0, + "logits/rejected": 961284096.0, + "logps/chosen": -179.28941345214844, + "logps/rejected": -399.177734375, + "loss": 0.045, + "rewards/chosen": 2.4535746574401855, + "rewards/margins": 9.752543449401855, + "rewards/rejected": -7.29896879196167, + "step": 3555 + }, + { + "epoch": 0.3248972133394244, + "grad_norm": 4.65625, + "kl": 0.0, + "learning_rate": 7.631621480531538e-06, + "logits/chosen": 588043562.6666666, + "logits/rejected": 656451712.0, + "logps/chosen": -424.509033203125, + "logps/rejected": -300.07537841796875, + "loss": 0.0342, + "rewards/chosen": 3.35546875, + "rewards/margins": 12.637853622436523, + "rewards/rejected": -9.282384872436523, + "step": 3556 + }, + { + "epoch": 0.324988579259936, + "grad_norm": 1.6015625, + "kl": 0.0, + "learning_rate": 7.630398833665507e-06, + "logits/chosen": 470959829.3333333, + "logits/rejected": 719301017.6, + "logps/chosen": -177.513671875, + "logps/rejected": -882.1490234375, + "loss": 0.01, + "rewards/chosen": 3.718776067097982, + "rewards/margins": 15.583603032430014, + "rewards/rejected": -11.864826965332032, + "step": 3557 + }, + { + "epoch": 0.3250799451804477, + "grad_norm": 8.1875, + "kl": 0.0, + "learning_rate": 7.6291759692898616e-06, + "logits/rejected": 609967744.0, + "logps/rejected": -439.651611328125, + "loss": 0.0908, + "rewards/rejected": -7.3602495193481445, + "step": 3558 + }, + { + "epoch": 0.3251713111009593, + "grad_norm": 30.5, + "kl": 0.0, + "learning_rate": 7.627952887505725e-06, + "logits/chosen": 294166336.0, + "logits/rejected": 433024128.0, + "logps/chosen": -295.2568359375, + "logps/rejected": -367.4286804199219, + "loss": 0.0869, + "rewards/chosen": 4.41558313369751, + "rewards/margins": 9.979160785675049, + "rewards/rejected": -5.563577651977539, + "step": 3559 + }, + { + "epoch": 0.325262677021471, + "grad_norm": 6.59375, + "kl": 0.0, + "learning_rate": 7.6267295884142325e-06, + "logits/chosen": 1024673075.2, + "logits/rejected": 638681429.3333334, + "logps/chosen": -323.335986328125, + "logps/rejected": -407.2169189453125, + "loss": 0.0255, + "rewards/chosen": 3.765419769287109, + "rewards/margins": 11.587452697753907, + "rewards/rejected": -7.822032928466797, + "step": 3560 + }, + { + "epoch": 0.3253540429419826, + "grad_norm": 4.71875, + "kl": 0.0, + "learning_rate": 7.62550607211654e-06, + "logits/chosen": 612391424.0, + "logits/rejected": 374417237.3333333, + "logps/chosen": -327.0685546875, + "logps/rejected": -324.46185302734375, + "loss": 0.0271, + "rewards/chosen": 3.4561389923095702, + "rewards/margins": 12.404968643188477, + "rewards/rejected": -8.948829650878906, + "step": 3561 + }, + { + "epoch": 0.3254454088624943, + "grad_norm": 3.21875, + "kl": 0.0, + "learning_rate": 7.624282338713821e-06, + "logits/chosen": 280383200.0, + "logits/rejected": 375059264.0, + "logps/chosen": -181.0042724609375, + "logps/rejected": -372.177978515625, + "loss": 0.0238, + "rewards/chosen": 3.777161121368408, + "rewards/margins": 13.72275972366333, + "rewards/rejected": -9.945598602294922, + "step": 3562 + }, + { + "epoch": 0.3255367747830059, + "grad_norm": 4.65625, + "kl": 0.0, + "learning_rate": 7.62305838830727e-06, + "logits/chosen": 354907690.6666667, + "logits/rejected": 371300288.0, + "logps/chosen": -304.62082926432294, + "logps/rejected": -604.8528442382812, + "loss": 0.0369, + "rewards/chosen": 3.1377700169881186, + "rewards/margins": 17.24368699391683, + "rewards/rejected": -14.105916976928711, + "step": 3563 + }, + { + "epoch": 0.3256281407035176, + "grad_norm": 12.0625, + "kl": 0.0, + "learning_rate": 7.62183422099809e-06, + "logits/chosen": 629403072.0, + "logits/rejected": 561072054.8571428, + "logps/chosen": -572.2310791015625, + "logps/rejected": -462.1843959263393, + "loss": 0.0552, + "rewards/chosen": 2.732067823410034, + "rewards/margins": 9.225808450153895, + "rewards/rejected": -6.493740626743862, + "step": 3564 + }, + { + "epoch": 0.32571950662402926, + "grad_norm": 2.015625, + "kl": 0.0, + "learning_rate": 7.620609836887513e-06, + "logits/chosen": 516970784.0, + "logits/rejected": 431429717.3333333, + "logps/chosen": -340.6190490722656, + "logps/rejected": -496.025634765625, + "loss": 0.0098, + "rewards/chosen": 3.247534990310669, + "rewards/margins": 11.966127316157023, + "rewards/rejected": -8.718592325846354, + "step": 3565 + }, + { + "epoch": 0.3258108725445409, + "grad_norm": 13.3125, + "kl": 0.0, + "learning_rate": 7.6193852360767815e-06, + "logits/chosen": 797865984.0, + "logits/rejected": 1021747264.0, + "logps/chosen": -231.30706787109375, + "logps/rejected": -779.04541015625, + "loss": 0.0886, + "rewards/chosen": 2.0476126670837402, + "rewards/margins": 11.947971820831299, + "rewards/rejected": -9.900359153747559, + "step": 3566 + }, + { + "epoch": 0.32590223846505256, + "grad_norm": 6.34375, + "kl": 0.0, + "learning_rate": 7.618160418667162e-06, + "logits/chosen": 434715520.0, + "logits/rejected": 420952448.0, + "logps/chosen": -217.19918823242188, + "logps/rejected": -489.557373046875, + "loss": 0.0322, + "rewards/chosen": 1.9687203168869019, + "rewards/margins": 12.276921153068542, + "rewards/rejected": -10.30820083618164, + "step": 3567 + }, + { + "epoch": 0.3259936043855642, + "grad_norm": 4.6875, + "kl": 0.0, + "learning_rate": 7.616935384759933e-06, + "logits/chosen": 311918336.0, + "logits/rejected": 422041124.5714286, + "logps/chosen": -197.14779663085938, + "logps/rejected": -465.4518345424107, + "loss": 0.0258, + "rewards/chosen": 2.8301467895507812, + "rewards/margins": 11.495713370186943, + "rewards/rejected": -8.665566580636161, + "step": 3568 + }, + { + "epoch": 0.32608497030607586, + "grad_norm": 2.390625, + "kl": 0.0, + "learning_rate": 7.615710134456394e-06, + "logits/chosen": 439669418.6666667, + "logits/rejected": 596057395.2, + "logps/chosen": -235.384033203125, + "logps/rejected": -532.88408203125, + "loss": 0.0163, + "rewards/chosen": 3.126840909322103, + "rewards/margins": 13.603376706441244, + "rewards/rejected": -10.47653579711914, + "step": 3569 + }, + { + "epoch": 0.3261763362265875, + "grad_norm": 2.84375, + "kl": 0.0, + "learning_rate": 7.614484667857862e-06, + "logits/chosen": 514062037.3333333, + "logits/rejected": 642391142.4, + "logps/chosen": -253.26436360677084, + "logps/rejected": -464.7732421875, + "loss": 0.0156, + "rewards/chosen": 3.580852190653483, + "rewards/margins": 13.440855089823405, + "rewards/rejected": -9.860002899169922, + "step": 3570 + }, + { + "epoch": 0.32626770214709916, + "grad_norm": 2.953125, + "kl": 0.0, + "learning_rate": 7.613258985065672e-06, + "logits/chosen": 559056832.0, + "logits/rejected": 355557440.0, + "logps/chosen": -358.1488037109375, + "logps/rejected": -350.7337646484375, + "loss": 0.0147, + "rewards/chosen": 3.880054235458374, + "rewards/margins": 11.521663904190063, + "rewards/rejected": -7.6416096687316895, + "step": 3571 + }, + { + "epoch": 0.3263590680676108, + "grad_norm": 5.75, + "kl": 0.0, + "learning_rate": 7.612033086181176e-06, + "logits/chosen": 735165824.0, + "logits/rejected": 865426176.0, + "logps/chosen": -357.83892822265625, + "logps/rejected": -383.28973388671875, + "loss": 0.0364, + "rewards/chosen": 3.0949716567993164, + "rewards/margins": 12.08259105682373, + "rewards/rejected": -8.987619400024414, + "step": 3572 + }, + { + "epoch": 0.32645043398812246, + "grad_norm": 4.15625, + "kl": 0.0, + "learning_rate": 7.610806971305745e-06, + "logits/chosen": 356944716.8, + "logits/rejected": 989017941.3333334, + "logps/chosen": -306.48486328125, + "logps/rejected": -585.6981608072916, + "loss": 0.0255, + "rewards/chosen": 3.3898849487304688, + "rewards/margins": 12.575257619222006, + "rewards/rejected": -9.185372670491537, + "step": 3573 + }, + { + "epoch": 0.3265417999086341, + "grad_norm": 0.76171875, + "kl": 0.0, + "learning_rate": 7.609580640540768e-06, + "logits/chosen": 1370565376.0, + "logits/rejected": 757657139.2, + "logps/chosen": -900.564453125, + "logps/rejected": -498.9056640625, + "loss": 0.0034, + "rewards/chosen": 4.895041147867839, + "rewards/margins": 13.7147829691569, + "rewards/rejected": -8.819741821289062, + "step": 3574 + }, + { + "epoch": 0.32663316582914576, + "grad_norm": 3.171875, + "kl": 0.0, + "learning_rate": 7.608354093987649e-06, + "logits/chosen": 478627328.0, + "logits/rejected": 282475584.0, + "logps/chosen": -273.4754333496094, + "logps/rejected": -488.1600341796875, + "loss": 0.0206, + "rewards/chosen": 3.3138580322265625, + "rewards/margins": 13.643468856811523, + "rewards/rejected": -10.329610824584961, + "step": 3575 + }, + { + "epoch": 0.3267245317496574, + "grad_norm": 2.078125, + "kl": 0.0, + "learning_rate": 7.607127331747815e-06, + "logits/chosen": 721371520.0, + "logits/rejected": 455052096.0, + "logps/chosen": -330.2841796875, + "logps/rejected": -445.0611572265625, + "loss": 0.0107, + "rewards/chosen": 4.265005111694336, + "rewards/margins": 12.10598373413086, + "rewards/rejected": -7.840978622436523, + "step": 3576 + }, + { + "epoch": 0.32681589767016905, + "grad_norm": 2.6875, + "kl": 0.0, + "learning_rate": 7.605900353922704e-06, + "logits/chosen": 461766041.6, + "logits/rejected": 567539669.3333334, + "logps/chosen": -239.00263671875, + "logps/rejected": -609.5101318359375, + "loss": 0.0203, + "rewards/chosen": 3.6431529998779295, + "rewards/margins": 13.79297777811686, + "rewards/rejected": -10.149824778238932, + "step": 3577 + }, + { + "epoch": 0.3269072635906807, + "grad_norm": 3.984375, + "kl": 0.0, + "learning_rate": 7.604673160613778e-06, + "logits/chosen": 553506304.0, + "logits/rejected": 328632960.0, + "logps/chosen": -307.54560546875, + "logps/rejected": -326.27512613932294, + "loss": 0.0282, + "rewards/chosen": 3.5999214172363283, + "rewards/margins": 11.951904805501304, + "rewards/rejected": -8.351983388264975, + "step": 3578 + }, + { + "epoch": 0.32699862951119235, + "grad_norm": 4.96875, + "kl": 0.0, + "learning_rate": 7.603445751922516e-06, + "logits/chosen": 822510592.0, + "logits/rejected": 540683434.6666666, + "logps/chosen": -338.9902099609375, + "logps/rejected": -449.9728597005208, + "loss": 0.03, + "rewards/chosen": 3.1861244201660157, + "rewards/margins": 13.047892506917318, + "rewards/rejected": -9.861768086751303, + "step": 3579 + }, + { + "epoch": 0.327089995431704, + "grad_norm": 6.96875, + "kl": 0.0, + "learning_rate": 7.602218127950412e-06, + "logits/chosen": 391475264.0, + "logits/rejected": 629388416.0, + "logps/chosen": -290.41650390625, + "logps/rejected": -482.4542541503906, + "loss": 0.0312, + "rewards/chosen": 3.2702889442443848, + "rewards/margins": 10.061282634735107, + "rewards/rejected": -6.790993690490723, + "step": 3580 + }, + { + "epoch": 0.32718136135221565, + "grad_norm": 3.015625, + "kl": 0.0, + "learning_rate": 7.6009902887989775e-06, + "logits/chosen": 574438912.0, + "logits/rejected": 570099797.3333334, + "logps/chosen": -302.546875, + "logps/rejected": -414.7227376302083, + "loss": 0.0311, + "rewards/chosen": 3.0690322875976563, + "rewards/margins": 13.686229960123697, + "rewards/rejected": -10.617197672526041, + "step": 3581 + }, + { + "epoch": 0.32727272727272727, + "grad_norm": 8.8125, + "kl": 0.0, + "learning_rate": 7.599762234569745e-06, + "logits/chosen": 481661805.71428573, + "logits/rejected": 1918576896.0, + "logps/chosen": -394.360107421875, + "logps/rejected": -1437.334716796875, + "loss": 0.0552, + "rewards/chosen": 3.046075003487723, + "rewards/margins": 20.76936640058245, + "rewards/rejected": -17.723291397094727, + "step": 3582 + }, + { + "epoch": 0.32736409319323895, + "grad_norm": 4.59375, + "kl": 0.0, + "learning_rate": 7.598533965364263e-06, + "logits/chosen": 327843200.0, + "logits/rejected": 349317568.0, + "logps/chosen": -306.4650065104167, + "logps/rejected": -441.47869873046875, + "loss": 0.0336, + "rewards/chosen": 3.4408251444498696, + "rewards/margins": 12.886825243631998, + "rewards/rejected": -9.446000099182129, + "step": 3583 + }, + { + "epoch": 0.32745545911375057, + "grad_norm": 1.6328125, + "kl": 0.0, + "learning_rate": 7.5973054812841e-06, + "logits/chosen": 598730432.0, + "logits/rejected": 444271488.0, + "logps/chosen": -409.9486083984375, + "logps/rejected": -509.00390625, + "loss": 0.0048, + "rewards/chosen": 4.3331098556518555, + "rewards/margins": 14.228793144226074, + "rewards/rejected": -9.895683288574219, + "step": 3584 + }, + { + "epoch": 0.32754682503426225, + "grad_norm": 7.0625, + "kl": 0.0, + "learning_rate": 7.596076782430836e-06, + "logits/chosen": 995423360.0, + "logits/rejected": 913141589.3333334, + "logps/chosen": -328.799560546875, + "logps/rejected": -542.8427734375, + "loss": 0.0176, + "rewards/chosen": 3.4448609352111816, + "rewards/margins": 10.94551642735799, + "rewards/rejected": -7.50065549214681, + "step": 3585 + }, + { + "epoch": 0.32763819095477387, + "grad_norm": 43.0, + "kl": 0.0, + "learning_rate": 7.594847868906076e-06, + "logits/chosen": 421745749.3333333, + "logits/rejected": 526833356.8, + "logps/chosen": -386.4227701822917, + "logps/rejected": -583.8734375, + "loss": 0.025, + "rewards/chosen": 3.664440155029297, + "rewards/margins": 13.236930084228515, + "rewards/rejected": -9.572489929199218, + "step": 3586 + }, + { + "epoch": 0.32772955687528554, + "grad_norm": 2.015625, + "kl": 0.0, + "learning_rate": 7.593618740811439e-06, + "logits/chosen": 561654592.0, + "logits/rejected": 793346816.0, + "logps/chosen": -300.21697998046875, + "logps/rejected": -400.90838623046875, + "loss": 0.0133, + "rewards/chosen": 3.7250118255615234, + "rewards/margins": 14.775339126586914, + "rewards/rejected": -11.05032730102539, + "step": 3587 + }, + { + "epoch": 0.32782092279579716, + "grad_norm": 0.80859375, + "kl": 0.0, + "learning_rate": 7.592389398248563e-06, + "logits/chosen": 323172032.0, + "logits/rejected": 536423104.0, + "logps/chosen": -232.06051635742188, + "logps/rejected": -592.1127319335938, + "loss": 0.0046, + "rewards/chosen": 5.2295241355896, + "rewards/margins": 16.064584255218506, + "rewards/rejected": -10.835060119628906, + "step": 3588 + }, + { + "epoch": 0.32791228871630884, + "grad_norm": 3.78125, + "kl": 0.0, + "learning_rate": 7.5911598413191025e-06, + "logits/chosen": 693297280.0, + "logits/rejected": 362652501.3333333, + "logps/chosen": -266.7084655761719, + "logps/rejected": -462.1373291015625, + "loss": 0.1253, + "rewards/chosen": 3.40285587310791, + "rewards/margins": 9.714383761088055, + "rewards/rejected": -6.3115278879801435, + "step": 3589 + }, + { + "epoch": 0.32800365463682046, + "grad_norm": 3.296875, + "kl": 0.0, + "learning_rate": 7.5899300701247315e-06, + "logits/chosen": 465677920.0, + "logits/rejected": 629264576.0, + "logps/chosen": -326.98931884765625, + "logps/rejected": -415.51513671875, + "loss": 0.0221, + "rewards/chosen": 3.354719638824463, + "rewards/margins": 11.896350383758545, + "rewards/rejected": -8.541630744934082, + "step": 3590 + }, + { + "epoch": 0.32809502055733214, + "grad_norm": 11.3125, + "kl": 0.0, + "learning_rate": 7.588700084767138e-06, + "logits/chosen": 548250197.3333334, + "logits/rejected": 415141580.8, + "logps/chosen": -324.27626546223956, + "logps/rejected": -289.86455078125, + "loss": 0.0261, + "rewards/chosen": 3.4325714111328125, + "rewards/margins": 10.21783905029297, + "rewards/rejected": -6.785267639160156, + "step": 3591 + }, + { + "epoch": 0.32818638647784376, + "grad_norm": 11.6875, + "kl": 0.0, + "learning_rate": 7.5874698853480345e-06, + "logits/chosen": 969113088.0, + "logits/rejected": 500512640.0, + "logps/chosen": -338.614013671875, + "logps/rejected": -665.3138834635416, + "loss": 0.0544, + "rewards/chosen": 3.6963970184326174, + "rewards/margins": 9.30965092976888, + "rewards/rejected": -5.613253911336263, + "step": 3592 + }, + { + "epoch": 0.32827775239835544, + "grad_norm": 34.0, + "kl": 0.0, + "learning_rate": 7.586239471969144e-06, + "logits/chosen": 526783317.3333333, + "logits/rejected": 570035200.0, + "logps/chosen": -376.4921468098958, + "logps/rejected": -750.55322265625, + "loss": 0.1004, + "rewards/chosen": 2.8815199534098306, + "rewards/margins": 13.428376833597818, + "rewards/rejected": -10.546856880187988, + "step": 3593 + }, + { + "epoch": 0.32836911831886706, + "grad_norm": 5.40625, + "kl": 0.0, + "learning_rate": 7.585008844732211e-06, + "logits/chosen": 605296281.6, + "logits/rejected": 571246592.0, + "logps/chosen": -343.57509765625, + "logps/rejected": -912.1593424479166, + "loss": 0.0299, + "rewards/chosen": 3.680756759643555, + "rewards/margins": 12.020164362589519, + "rewards/rejected": -8.339407602945963, + "step": 3594 + }, + { + "epoch": 0.32846048423937874, + "grad_norm": 3.34375, + "kl": 0.0, + "learning_rate": 7.583778003738997e-06, + "logits/chosen": 1276967552.0, + "logits/rejected": 1012322688.0, + "logps/chosen": -437.5440368652344, + "logps/rejected": -591.8616943359375, + "loss": 0.0175, + "rewards/chosen": 3.903036594390869, + "rewards/margins": 12.36679220199585, + "rewards/rejected": -8.46375560760498, + "step": 3595 + }, + { + "epoch": 0.32855185015989036, + "grad_norm": 3.28125, + "kl": 0.0, + "learning_rate": 7.58254694909128e-06, + "logits/chosen": 454131968.0, + "logits/rejected": 345009024.0, + "logps/chosen": -315.60540771484375, + "logps/rejected": -377.0872802734375, + "loss": 0.0253, + "rewards/chosen": 3.693141460418701, + "rewards/margins": 11.637171745300293, + "rewards/rejected": -7.944030284881592, + "step": 3596 + }, + { + "epoch": 0.32864321608040203, + "grad_norm": 3.421875, + "kl": 0.0, + "learning_rate": 7.581315680890862e-06, + "logits/chosen": 671719082.6666666, + "logits/rejected": 362022630.4, + "logps/chosen": -278.2054850260417, + "logps/rejected": -433.356396484375, + "loss": 0.0211, + "rewards/chosen": 3.1967347462972007, + "rewards/margins": 11.46735699971517, + "rewards/rejected": -8.270622253417969, + "step": 3597 + }, + { + "epoch": 0.32873458200091366, + "grad_norm": 2.75, + "kl": 0.0, + "learning_rate": 7.58008419923955e-06, + "logits/chosen": 433997482.6666667, + "logits/rejected": 511869952.0, + "logps/chosen": -235.3245849609375, + "logps/rejected": -356.7771484375, + "loss": 0.012, + "rewards/chosen": 4.031391779581706, + "rewards/margins": 11.881635920206705, + "rewards/rejected": -7.850244140625, + "step": 3598 + }, + { + "epoch": 0.32882594792142533, + "grad_norm": 6.90625, + "kl": 0.0, + "learning_rate": 7.578852504239181e-06, + "logits/chosen": 658385356.8, + "logits/rejected": 691949354.6666666, + "logps/chosen": -236.512890625, + "logps/rejected": -506.8216959635417, + "loss": 0.0457, + "rewards/chosen": 3.530155563354492, + "rewards/margins": 12.904393895467123, + "rewards/rejected": -9.37423833211263, + "step": 3599 + }, + { + "epoch": 0.32891731384193695, + "grad_norm": 2.25, + "kl": 0.0, + "learning_rate": 7.577620595991604e-06, + "logits/chosen": 518026093.71428573, + "logits/rejected": 531893376.0, + "logps/chosen": -298.58558872767856, + "logps/rejected": -414.7852783203125, + "loss": 0.0189, + "rewards/chosen": 4.1409615107945035, + "rewards/margins": 11.994339806692942, + "rewards/rejected": -7.8533782958984375, + "step": 3600 + }, + { + "epoch": 0.32900867976244863, + "grad_norm": 2.640625, + "kl": 0.0, + "learning_rate": 7.576388474598686e-06, + "logits/chosen": 429551296.0, + "logits/rejected": 453648064.0, + "logps/chosen": -306.8807373046875, + "logps/rejected": -462.7275390625, + "loss": 0.0157, + "rewards/chosen": 3.5998363494873047, + "rewards/margins": 11.652779579162598, + "rewards/rejected": -8.052943229675293, + "step": 3601 + }, + { + "epoch": 0.32910004568296025, + "grad_norm": 8.75, + "kl": 0.0, + "learning_rate": 7.57515614016231e-06, + "logits/chosen": 765962752.0, + "logits/rejected": 280597952.0, + "logps/chosen": -492.3476867675781, + "logps/rejected": -286.15301513671875, + "loss": 0.1281, + "rewards/chosen": 2.331425666809082, + "rewards/margins": 9.716078281402588, + "rewards/rejected": -7.384652614593506, + "step": 3602 + }, + { + "epoch": 0.32919141160347193, + "grad_norm": 1.9140625, + "kl": 0.0, + "learning_rate": 7.5739235927843835e-06, + "logits/chosen": 339743456.0, + "logits/rejected": 687547136.0, + "logps/chosen": -253.88540649414062, + "logps/rejected": -496.6671142578125, + "loss": 0.012, + "rewards/chosen": 3.897507429122925, + "rewards/margins": 12.668022871017456, + "rewards/rejected": -8.770515441894531, + "step": 3603 + }, + { + "epoch": 0.32928277752398355, + "grad_norm": 1.703125, + "kl": 0.0, + "learning_rate": 7.572690832566821e-06, + "logits/chosen": 635268736.0, + "logits/rejected": 467285184.0, + "logps/chosen": -313.4454345703125, + "logps/rejected": -555.2905883789062, + "loss": 0.0142, + "rewards/chosen": 4.016937732696533, + "rewards/margins": 14.129106998443604, + "rewards/rejected": -10.11216926574707, + "step": 3604 + }, + { + "epoch": 0.3293741434444952, + "grad_norm": 56.0, + "kl": 0.0, + "learning_rate": 7.571457859611565e-06, + "logits/chosen": 267332032.0, + "logits/rejected": 543669632.0, + "logps/chosen": -515.194580078125, + "logps/rejected": -370.2745056152344, + "loss": 0.0539, + "rewards/chosen": 3.4792332649230957, + "rewards/margins": 9.168188095092773, + "rewards/rejected": -5.688954830169678, + "step": 3605 + }, + { + "epoch": 0.32946550936500685, + "grad_norm": 1.921875, + "kl": 0.0, + "learning_rate": 7.570224674020568e-06, + "logits/chosen": 478134208.0, + "logits/rejected": 508150016.0, + "logps/chosen": -310.38739013671875, + "logps/rejected": -483.13525390625, + "loss": 0.01, + "rewards/chosen": 3.904184103012085, + "rewards/margins": 12.570885101954142, + "rewards/rejected": -8.666700998942057, + "step": 3606 + }, + { + "epoch": 0.3295568752855185, + "grad_norm": 0.345703125, + "kl": 0.0, + "learning_rate": 7.568991275895805e-06, + "logits/chosen": 285201408.0, + "logits/rejected": 644084565.3333334, + "logps/chosen": -210.84632873535156, + "logps/rejected": -397.3350016276042, + "loss": 0.0023, + "rewards/chosen": 4.918859481811523, + "rewards/margins": 14.956579208374023, + "rewards/rejected": -10.0377197265625, + "step": 3607 + }, + { + "epoch": 0.32964824120603015, + "grad_norm": 3.9375, + "kl": 0.0, + "learning_rate": 7.5677576653392645e-06, + "logits/chosen": 613425472.0, + "logits/rejected": 759228864.0, + "logps/chosen": -268.0890808105469, + "logps/rejected": -783.18115234375, + "loss": 0.0286, + "rewards/chosen": 2.9511780738830566, + "rewards/margins": 15.789552211761475, + "rewards/rejected": -12.838374137878418, + "step": 3608 + }, + { + "epoch": 0.3297396071265418, + "grad_norm": 4.125, + "kl": 0.0, + "learning_rate": 7.566523842452958e-06, + "logits/chosen": 1285790208.0, + "logits/rejected": 506001817.6, + "logps/chosen": -438.3120930989583, + "logps/rejected": -327.2440673828125, + "loss": 0.028, + "rewards/chosen": 2.583947022755941, + "rewards/margins": 10.515212281545004, + "rewards/rejected": -7.931265258789063, + "step": 3609 + }, + { + "epoch": 0.32983097304705344, + "grad_norm": 2.640625, + "kl": 0.0, + "learning_rate": 7.5652898073389066e-06, + "logits/chosen": 558193920.0, + "logits/rejected": 523213397.3333333, + "logps/chosen": -358.197265625, + "logps/rejected": -516.339599609375, + "loss": 0.0158, + "rewards/chosen": 3.9459674835205076, + "rewards/margins": 12.611103185017903, + "rewards/rejected": -8.665135701497396, + "step": 3610 + }, + { + "epoch": 0.3299223389675651, + "grad_norm": 4.1875, + "kl": 0.0, + "learning_rate": 7.564055560099157e-06, + "logits/chosen": 564877772.8, + "logits/rejected": 566165930.6666666, + "logps/chosen": -339.8201171875, + "logps/rejected": -497.5491129557292, + "loss": 0.035, + "rewards/chosen": 3.0005443572998045, + "rewards/margins": 12.516046015421548, + "rewards/rejected": -9.515501658121744, + "step": 3611 + }, + { + "epoch": 0.33001370488807674, + "grad_norm": 2.234375, + "kl": 0.0, + "learning_rate": 7.562821100835771e-06, + "logits/chosen": 757355861.3333334, + "logits/rejected": 1140126924.8, + "logps/chosen": -342.5710856119792, + "logps/rejected": -418.68935546875, + "loss": 0.0144, + "rewards/chosen": 3.4013408025105796, + "rewards/margins": 12.14391892751058, + "rewards/rejected": -8.742578125, + "step": 3612 + }, + { + "epoch": 0.3301050708085884, + "grad_norm": 32.25, + "kl": 0.0, + "learning_rate": 7.561586429650822e-06, + "logits/chosen": 452963123.2, + "logits/rejected": 572051669.3333334, + "logps/chosen": -265.68310546875, + "logps/rejected": -438.359619140625, + "loss": 0.0478, + "rewards/chosen": 3.3940216064453126, + "rewards/margins": 12.038941955566406, + "rewards/rejected": -8.644920349121094, + "step": 3613 + }, + { + "epoch": 0.33019643672910004, + "grad_norm": 4.25, + "kl": 0.0, + "learning_rate": 7.560351546646411e-06, + "logits/chosen": 556306090.6666666, + "logits/rejected": 386152012.8, + "logps/chosen": -330.23345947265625, + "logps/rejected": -684.10791015625, + "loss": 0.0193, + "rewards/chosen": 3.8125292460123696, + "rewards/margins": 12.846756998697916, + "rewards/rejected": -9.034227752685547, + "step": 3614 + }, + { + "epoch": 0.3302878026496117, + "grad_norm": 8.6875, + "kl": 0.0, + "learning_rate": 7.559116451924648e-06, + "logits/chosen": 681606144.0, + "logits/rejected": 413846144.0, + "logps/chosen": -512.281005859375, + "logps/rejected": -391.2212727864583, + "loss": 0.0395, + "rewards/chosen": 3.2009532928466795, + "rewards/margins": 11.10315081278483, + "rewards/rejected": -7.902197519938151, + "step": 3615 + }, + { + "epoch": 0.33037916857012334, + "grad_norm": 4.15625, + "kl": 0.0, + "learning_rate": 7.557881145587667e-06, + "logits/chosen": 424181333.3333333, + "logits/rejected": 355921344.0, + "logps/chosen": -265.02537027994794, + "logps/rejected": -279.79010009765625, + "loss": 0.1314, + "rewards/chosen": 2.7873751322428384, + "rewards/margins": 11.313320795694986, + "rewards/rejected": -8.525945663452148, + "step": 3616 + }, + { + "epoch": 0.330470534490635, + "grad_norm": 40.25, + "kl": 0.0, + "learning_rate": 7.5566456277376135e-06, + "logits/chosen": 655342016.0, + "logits/rejected": 269002720.0, + "logps/chosen": -332.95098876953125, + "logps/rejected": -341.936279296875, + "loss": 0.1802, + "rewards/chosen": 1.021521806716919, + "rewards/margins": 8.815137147903442, + "rewards/rejected": -7.793615341186523, + "step": 3617 + }, + { + "epoch": 0.33056190041114664, + "grad_norm": 2.25, + "kl": 0.0, + "learning_rate": 7.555409898476653e-06, + "logits/chosen": 510060832.0, + "logits/rejected": 653721344.0, + "logps/chosen": -311.28057861328125, + "logps/rejected": -476.3183288574219, + "loss": 0.0156, + "rewards/chosen": 3.9663100242614746, + "rewards/margins": 11.665923595428467, + "rewards/rejected": -7.699613571166992, + "step": 3618 + }, + { + "epoch": 0.3306532663316583, + "grad_norm": 13.4375, + "kl": 0.0, + "learning_rate": 7.5541739579069714e-06, + "logits/chosen": 592764352.0, + "logits/rejected": 433804288.0, + "logps/chosen": -329.41888427734375, + "logps/rejected": -438.40484619140625, + "loss": 0.051, + "rewards/chosen": 2.633474826812744, + "rewards/margins": 11.912085056304932, + "rewards/rejected": -9.278610229492188, + "step": 3619 + }, + { + "epoch": 0.33074463225216993, + "grad_norm": 2.296875, + "kl": 0.0, + "learning_rate": 7.552937806130769e-06, + "logits/chosen": 537407129.6, + "logits/rejected": 592940970.6666666, + "logps/chosen": -311.17421875, + "logps/rejected": -524.1295979817709, + "loss": 0.0135, + "rewards/chosen": 3.9736228942871095, + "rewards/margins": 14.255119578043619, + "rewards/rejected": -10.28149668375651, + "step": 3620 + }, + { + "epoch": 0.3308359981726816, + "grad_norm": 3.1875, + "kl": 0.0, + "learning_rate": 7.551701443250263e-06, + "logits/chosen": 371737514.6666667, + "logits/rejected": 596120166.4, + "logps/chosen": -127.50811767578125, + "logps/rejected": -416.07861328125, + "loss": 0.0316, + "rewards/chosen": 2.9354171752929688, + "rewards/margins": 11.18843765258789, + "rewards/rejected": -8.253020477294921, + "step": 3621 + }, + { + "epoch": 0.33092736409319323, + "grad_norm": 2.984375, + "kl": 0.0, + "learning_rate": 7.5504648693676905e-06, + "logits/chosen": 450449728.0, + "logits/rejected": 338230656.0, + "logps/chosen": -403.4126892089844, + "logps/rejected": -331.43365478515625, + "loss": 0.0147, + "rewards/chosen": 3.869609832763672, + "rewards/margins": 11.188897132873535, + "rewards/rejected": -7.319287300109863, + "step": 3622 + }, + { + "epoch": 0.3310187300137049, + "grad_norm": 2.265625, + "kl": 0.0, + "learning_rate": 7.549228084585303e-06, + "logits/chosen": 442318848.0, + "logits/rejected": 582227840.0, + "logps/chosen": -255.7685546875, + "logps/rejected": -493.2293701171875, + "loss": 0.011, + "rewards/chosen": 4.275862693786621, + "rewards/margins": 14.92802619934082, + "rewards/rejected": -10.6521635055542, + "step": 3623 + }, + { + "epoch": 0.33111009593421653, + "grad_norm": 4.84375, + "kl": 0.0, + "learning_rate": 7.547991089005371e-06, + "logits/chosen": 414336486.4, + "logits/rejected": 355476010.6666667, + "logps/chosen": -218.6902587890625, + "logps/rejected": -261.33424886067706, + "loss": 0.0386, + "rewards/chosen": 2.8899478912353516, + "rewards/margins": 11.733189900716146, + "rewards/rejected": -8.843242009480795, + "step": 3624 + }, + { + "epoch": 0.3312014618547282, + "grad_norm": 3.84375, + "kl": 0.0, + "learning_rate": 7.5467538827301844e-06, + "logits/chosen": 808617130.6666666, + "logits/rejected": 707205120.0, + "logps/chosen": -323.134765625, + "logps/rejected": -443.053125, + "loss": 0.0286, + "rewards/chosen": 2.6387507120768228, + "rewards/margins": 11.395652262369792, + "rewards/rejected": -8.75690155029297, + "step": 3625 + }, + { + "epoch": 0.33129282777523983, + "grad_norm": 4.1875, + "kl": 0.0, + "learning_rate": 7.545516465862048e-06, + "logits/chosen": 493297024.0, + "logits/rejected": 1121327488.0, + "logps/chosen": -247.4792938232422, + "logps/rejected": -505.6912536621094, + "loss": 0.0275, + "rewards/chosen": 3.355734348297119, + "rewards/margins": 11.064725399017334, + "rewards/rejected": -7.708991050720215, + "step": 3626 + }, + { + "epoch": 0.3313841936957515, + "grad_norm": 5.03125, + "kl": 0.0, + "learning_rate": 7.544278838503286e-06, + "logits/chosen": 815074816.0, + "logits/rejected": 457793536.0, + "logps/chosen": -297.5144287109375, + "logps/rejected": -464.0750325520833, + "loss": 0.0299, + "rewards/chosen": 3.295553970336914, + "rewards/margins": 13.853674697875977, + "rewards/rejected": -10.558120727539062, + "step": 3627 + }, + { + "epoch": 0.3314755596162631, + "grad_norm": 3.984375, + "kl": 0.0, + "learning_rate": 7.543041000756236e-06, + "logits/chosen": 706284288.0, + "logits/rejected": 381923712.0, + "logps/chosen": -459.5030212402344, + "logps/rejected": -362.9996337890625, + "loss": 0.025, + "rewards/chosen": 3.041236400604248, + "rewards/margins": 11.74998426437378, + "rewards/rejected": -8.708747863769531, + "step": 3628 + }, + { + "epoch": 0.3315669255367748, + "grad_norm": 3.328125, + "kl": 0.0, + "learning_rate": 7.541802952723257e-06, + "logits/chosen": 1058709248.0, + "logits/rejected": 549409792.0, + "logps/chosen": -423.879150390625, + "logps/rejected": -485.7927734375, + "loss": 0.0094, + "rewards/chosen": 4.223453839619954, + "rewards/margins": 12.897832806905111, + "rewards/rejected": -8.674378967285156, + "step": 3629 + }, + { + "epoch": 0.3316582914572864, + "grad_norm": 2.21875, + "kl": 0.0, + "learning_rate": 7.540564694506724e-06, + "logits/chosen": 306822208.0, + "logits/rejected": 373315904.0, + "logps/chosen": -152.37661743164062, + "logps/rejected": -442.4317626953125, + "loss": 0.0178, + "rewards/chosen": 3.8208723068237305, + "rewards/margins": 13.179664611816406, + "rewards/rejected": -9.358792304992676, + "step": 3630 + }, + { + "epoch": 0.3317496573777981, + "grad_norm": 1.9140625, + "kl": 0.0, + "learning_rate": 7.539326226209032e-06, + "logits/chosen": 477584832.0, + "logits/rejected": 521563648.0, + "logps/chosen": -354.2781677246094, + "logps/rejected": -656.800537109375, + "loss": 0.012, + "rewards/chosen": 4.3464860916137695, + "rewards/margins": 14.752897262573242, + "rewards/rejected": -10.406411170959473, + "step": 3631 + }, + { + "epoch": 0.3318410232983097, + "grad_norm": 2.046875, + "kl": 0.0, + "learning_rate": 7.5380875479325855e-06, + "logits/chosen": 571168563.2, + "logits/rejected": 784776106.6666666, + "logps/chosen": -474.8228515625, + "logps/rejected": -683.767578125, + "loss": 0.014, + "rewards/chosen": 4.076969909667969, + "rewards/margins": 12.868355560302735, + "rewards/rejected": -8.791385650634766, + "step": 3632 + }, + { + "epoch": 0.3319323892188214, + "grad_norm": 3.5, + "kl": 0.0, + "learning_rate": 7.536848659779817e-06, + "logits/chosen": 417812531.2, + "logits/rejected": 484744021.3333333, + "logps/chosen": -390.975830078125, + "logps/rejected": -565.3649495442709, + "loss": 0.0174, + "rewards/chosen": 4.052693939208984, + "rewards/margins": 11.468453979492187, + "rewards/rejected": -7.415760040283203, + "step": 3633 + }, + { + "epoch": 0.332023755139333, + "grad_norm": 5.90625, + "kl": 0.0, + "learning_rate": 7.535609561853165e-06, + "logits/chosen": 585108544.0, + "logits/rejected": 423488480.0, + "logps/chosen": -137.12725830078125, + "logps/rejected": -286.19512939453125, + "loss": 0.0294, + "rewards/chosen": 3.3789172172546387, + "rewards/margins": 10.46677017211914, + "rewards/rejected": -7.087852954864502, + "step": 3634 + }, + { + "epoch": 0.3321151210598447, + "grad_norm": 1.7578125, + "kl": 0.0, + "learning_rate": 7.534370254255099e-06, + "logits/chosen": 252889770.66666666, + "logits/rejected": 475055820.8, + "logps/chosen": -116.37799072265625, + "logps/rejected": -354.635400390625, + "loss": 0.0264, + "rewards/chosen": 3.433791478474935, + "rewards/margins": 10.500088628133138, + "rewards/rejected": -7.066297149658203, + "step": 3635 + }, + { + "epoch": 0.3322064869803563, + "grad_norm": 6.125, + "kl": 0.0, + "learning_rate": 7.533130737088092e-06, + "logits/chosen": 392791104.0, + "logits/rejected": 533655808.0, + "logps/chosen": -301.44427490234375, + "logps/rejected": -361.7335510253906, + "loss": 0.0409, + "rewards/chosen": 3.2922682762145996, + "rewards/margins": 10.078078746795654, + "rewards/rejected": -6.785810470581055, + "step": 3636 + }, + { + "epoch": 0.332297852900868, + "grad_norm": 6.78125, + "kl": 0.0, + "learning_rate": 7.531891010454644e-06, + "logits/chosen": 1256033024.0, + "logits/rejected": 591484117.3333334, + "logps/chosen": -333.8414001464844, + "logps/rejected": -507.7961018880208, + "loss": 0.0342, + "rewards/chosen": 3.3115129470825195, + "rewards/margins": 10.490127245585125, + "rewards/rejected": -7.1786142985026045, + "step": 3637 + }, + { + "epoch": 0.3323892188213796, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 7.530651074457268e-06, + "logits/chosen": 456850880.0, + "logits/rejected": 654376192.0, + "logps/chosen": -241.34532165527344, + "logps/rejected": -308.97064208984375, + "loss": 0.0648, + "rewards/chosen": 3.0482845306396484, + "rewards/margins": 10.941783428192139, + "rewards/rejected": -7.89349889755249, + "step": 3638 + }, + { + "epoch": 0.3324805847418913, + "grad_norm": 26.875, + "kl": 0.0, + "learning_rate": 7.529410929198495e-06, + "logits/chosen": 475651114.6666667, + "logits/rejected": 716855424.0, + "logps/chosen": -285.08827718098956, + "logps/rejected": -605.486083984375, + "loss": 0.0584, + "rewards/chosen": 3.1839030583699546, + "rewards/margins": 10.689006169637045, + "rewards/rejected": -7.50510311126709, + "step": 3639 + }, + { + "epoch": 0.3325719506624029, + "grad_norm": 3.796875, + "kl": 0.0, + "learning_rate": 7.528170574780873e-06, + "logits/chosen": 635239168.0, + "logits/rejected": 416116384.0, + "logps/chosen": -505.8870035807292, + "logps/rejected": -526.9509887695312, + "loss": 0.0213, + "rewards/chosen": 3.7563597361246743, + "rewards/margins": 14.9845978418986, + "rewards/rejected": -11.228238105773926, + "step": 3640 + }, + { + "epoch": 0.3326633165829146, + "grad_norm": 2.390625, + "kl": 0.0, + "learning_rate": 7.52693001130697e-06, + "logits/chosen": 430594048.0, + "logits/rejected": 346038144.0, + "logps/chosen": -503.5148111979167, + "logps/rejected": -347.41064453125, + "loss": 0.0123, + "rewards/chosen": 3.6550944646199546, + "rewards/margins": 11.071686871846516, + "rewards/rejected": -7.416592407226562, + "step": 3641 + }, + { + "epoch": 0.3327546825034262, + "grad_norm": 1.7109375, + "kl": 0.0, + "learning_rate": 7.525689238879367e-06, + "logits/chosen": 279112960.0, + "logits/rejected": 582207963.4285715, + "logps/chosen": -196.0700225830078, + "logps/rejected": -498.58939034598217, + "loss": 0.0079, + "rewards/chosen": 2.7851531505584717, + "rewards/margins": 11.64860006741115, + "rewards/rejected": -8.863446916852679, + "step": 3642 + }, + { + "epoch": 0.3328460484239379, + "grad_norm": 8.8125, + "kl": 0.0, + "learning_rate": 7.524448257600665e-06, + "logits/chosen": 854576298.6666666, + "logits/rejected": 812640051.2, + "logps/chosen": -250.59513346354166, + "logps/rejected": -374.360986328125, + "loss": 0.0925, + "rewards/chosen": 3.712376912434896, + "rewards/margins": 9.931432851155598, + "rewards/rejected": -6.219055938720703, + "step": 3643 + }, + { + "epoch": 0.3329374143444495, + "grad_norm": 9.75, + "kl": 0.0, + "learning_rate": 7.523207067573482e-06, + "logits/chosen": 607443584.0, + "logits/rejected": 380756736.0, + "logps/chosen": -255.30328369140625, + "logps/rejected": -421.2858479817708, + "loss": 0.0174, + "rewards/chosen": 3.1510040760040283, + "rewards/margins": 11.152426958084106, + "rewards/rejected": -8.001422882080078, + "step": 3644 + }, + { + "epoch": 0.3330287802649612, + "grad_norm": 36.75, + "kl": 0.0, + "learning_rate": 7.5219656689004525e-06, + "logits/chosen": 496263680.0, + "logits/rejected": 565833024.0, + "logps/chosen": -224.55462646484375, + "logps/rejected": -262.376953125, + "loss": 0.1135, + "rewards/chosen": 3.076684315999349, + "rewards/margins": 10.679421742757162, + "rewards/rejected": -7.6027374267578125, + "step": 3645 + }, + { + "epoch": 0.3331201461854728, + "grad_norm": 4.15625, + "kl": 1.6031150817871094, + "learning_rate": 7.520724061684227e-06, + "logits/chosen": 408453302.85714287, + "logits/rejected": 170155680.0, + "logps/chosen": -271.154296875, + "logps/rejected": -204.99713134765625, + "loss": 0.0468, + "rewards/chosen": 3.4053916931152344, + "rewards/margins": 10.643598079681396, + "rewards/rejected": -7.238206386566162, + "step": 3646 + }, + { + "epoch": 0.3332115121059845, + "grad_norm": 3.0, + "kl": 0.0, + "learning_rate": 7.5194822460274785e-06, + "logits/chosen": 692699306.6666666, + "logits/rejected": 671349606.4, + "logps/chosen": -520.1565348307291, + "logps/rejected": -627.7685546875, + "loss": 0.015, + "rewards/chosen": 3.921865463256836, + "rewards/margins": 13.586146926879882, + "rewards/rejected": -9.664281463623047, + "step": 3647 + }, + { + "epoch": 0.3333028780264961, + "grad_norm": 10.0625, + "kl": 0.0, + "learning_rate": 7.518240222032891e-06, + "logits/chosen": 172619541.33333334, + "logits/rejected": 249670963.2, + "logps/chosen": -147.2935994466146, + "logps/rejected": -382.26513671875, + "loss": 0.1142, + "rewards/chosen": 2.7523622512817383, + "rewards/margins": 11.578413581848144, + "rewards/rejected": -8.826051330566406, + "step": 3648 + }, + { + "epoch": 0.3333942439470078, + "grad_norm": 7.03125, + "kl": 0.0, + "learning_rate": 7.51699798980317e-06, + "logits/chosen": 539333034.6666666, + "logits/rejected": 380665292.8, + "logps/chosen": -561.8061930338541, + "logps/rejected": -572.871484375, + "loss": 0.0213, + "rewards/chosen": 3.207556406656901, + "rewards/margins": 12.210619608561197, + "rewards/rejected": -9.003063201904297, + "step": 3649 + }, + { + "epoch": 0.3334856098675194, + "grad_norm": 3.578125, + "kl": 0.0, + "learning_rate": 7.515755549441035e-06, + "logits/chosen": 1009729664.0, + "logits/rejected": 401881600.0, + "logps/chosen": -364.14739990234375, + "logps/rejected": -382.12310791015625, + "loss": 0.025, + "rewards/chosen": 3.44964599609375, + "rewards/margins": 12.698784828186035, + "rewards/rejected": -9.249138832092285, + "step": 3650 + }, + { + "epoch": 0.3335769757880311, + "grad_norm": 1.9140625, + "kl": 0.0, + "learning_rate": 7.514512901049224e-06, + "logits/chosen": 1171044864.0, + "logits/rejected": 550122240.0, + "logps/chosen": -529.0106811523438, + "logps/rejected": -464.3973911830357, + "loss": 0.0055, + "rewards/chosen": 3.326733350753784, + "rewards/margins": 10.693238564899989, + "rewards/rejected": -7.366505214146206, + "step": 3651 + }, + { + "epoch": 0.3336683417085427, + "grad_norm": 1.8671875, + "kl": 0.0, + "learning_rate": 7.5132700447304955e-06, + "logits/chosen": 431576416.0, + "logits/rejected": 397467392.0, + "logps/chosen": -268.5662841796875, + "logps/rejected": -421.49554443359375, + "loss": 0.0134, + "rewards/chosen": 3.8952693939208984, + "rewards/margins": 12.116375923156738, + "rewards/rejected": -8.22110652923584, + "step": 3652 + }, + { + "epoch": 0.3337597076290544, + "grad_norm": 4.03125, + "kl": 0.0, + "learning_rate": 7.512026980587619e-06, + "logits/chosen": 392300074.6666667, + "logits/rejected": 386766976.0, + "logps/chosen": -322.3240966796875, + "logps/rejected": -567.5828125, + "loss": 0.0101, + "rewards/chosen": 3.9879582722981772, + "rewards/margins": 13.887368520100912, + "rewards/rejected": -9.899410247802734, + "step": 3653 + }, + { + "epoch": 0.333851073549566, + "grad_norm": 2.921875, + "kl": 0.0, + "learning_rate": 7.5107837087233855e-06, + "logits/chosen": 452924928.0, + "logits/rejected": 297936448.0, + "logps/chosen": -279.437841796875, + "logps/rejected": -279.9251708984375, + "loss": 0.0203, + "rewards/chosen": 3.524144744873047, + "rewards/margins": 10.667150115966797, + "rewards/rejected": -7.14300537109375, + "step": 3654 + }, + { + "epoch": 0.3339424394700777, + "grad_norm": 3.140625, + "kl": 0.0, + "learning_rate": 7.509540229240601e-06, + "logits/chosen": 1110734848.0, + "logits/rejected": 589544064.0, + "logps/chosen": -286.11224365234375, + "logps/rejected": -519.3759765625, + "loss": 0.0154, + "rewards/chosen": 4.506412506103516, + "rewards/margins": 12.199877738952637, + "rewards/rejected": -7.693465232849121, + "step": 3655 + }, + { + "epoch": 0.3340338053905893, + "grad_norm": 13.1875, + "kl": 5.325374603271484, + "learning_rate": 7.508296542242095e-06, + "logits/chosen": 557161106.2857143, + "logits/rejected": 680285568.0, + "logps/chosen": -276.24368722098217, + "logps/rejected": -734.280029296875, + "loss": 0.0759, + "rewards/chosen": 3.294021333966936, + "rewards/margins": 14.930947984967913, + "rewards/rejected": -11.636926651000977, + "step": 3656 + }, + { + "epoch": 0.334125171311101, + "grad_norm": 3.078125, + "kl": 0.0, + "learning_rate": 7.5070526478307025e-06, + "logits/chosen": 308137376.0, + "logits/rejected": 282295232.0, + "logps/chosen": -317.05303955078125, + "logps/rejected": -376.42010498046875, + "loss": 0.0174, + "rewards/chosen": 3.4310495853424072, + "rewards/margins": 12.566299676895142, + "rewards/rejected": -9.135250091552734, + "step": 3657 + }, + { + "epoch": 0.3342165372316126, + "grad_norm": 6.0, + "kl": 4.030971527099609, + "learning_rate": 7.5058085461092855e-06, + "logits/chosen": 684146907.4285715, + "logits/rejected": 774748288.0, + "logps/chosen": -370.3369838169643, + "logps/rejected": -635.2813720703125, + "loss": 0.0457, + "rewards/chosen": 3.6129188537597656, + "rewards/margins": 14.498282432556152, + "rewards/rejected": -10.885363578796387, + "step": 3658 + }, + { + "epoch": 0.3343079031521243, + "grad_norm": 4.0625, + "kl": 0.0, + "learning_rate": 7.504564237180716e-06, + "logits/chosen": 378387520.0, + "logits/rejected": 411506112.0, + "logps/chosen": -291.38861083984375, + "logps/rejected": -382.1190185546875, + "loss": 0.0226, + "rewards/chosen": 3.7374565601348877, + "rewards/margins": 12.09672474861145, + "rewards/rejected": -8.359268188476562, + "step": 3659 + }, + { + "epoch": 0.3343992690726359, + "grad_norm": 15.0, + "kl": 0.8443107604980469, + "learning_rate": 7.503319721147892e-06, + "logits/chosen": 758640025.6, + "logits/rejected": 435405056.0, + "logps/chosen": -365.4556640625, + "logps/rejected": -485.2076822916667, + "loss": 0.124, + "rewards/chosen": 4.620694732666015, + "rewards/margins": 8.23703187306722, + "rewards/rejected": -3.6163371404012046, + "step": 3660 + }, + { + "epoch": 0.3344906349931476, + "grad_norm": 12.25, + "kl": 0.0, + "learning_rate": 7.502074998113718e-06, + "logits/chosen": 488375840.0, + "logits/rejected": 593960320.0, + "logps/chosen": -298.4363708496094, + "logps/rejected": -630.3829345703125, + "loss": 0.0555, + "rewards/chosen": 2.4970812797546387, + "rewards/margins": 12.71649980545044, + "rewards/rejected": -10.2194185256958, + "step": 3661 + }, + { + "epoch": 0.3345820009136592, + "grad_norm": 2.484375, + "kl": 0.0, + "learning_rate": 7.500830068181127e-06, + "logits/chosen": 533093888.0, + "logits/rejected": 992275163.4285715, + "logps/chosen": -151.00247192382812, + "logps/rejected": -651.2767857142857, + "loss": 0.008, + "rewards/chosen": 3.383305311203003, + "rewards/margins": 10.780522108078003, + "rewards/rejected": -7.397216796875, + "step": 3662 + }, + { + "epoch": 0.33467336683417087, + "grad_norm": 3.828125, + "kl": 0.0, + "learning_rate": 7.499584931453059e-06, + "logits/chosen": 608626090.6666666, + "logits/rejected": 608803584.0, + "logps/chosen": -288.6032307942708, + "logps/rejected": -446.220458984375, + "loss": 0.0318, + "rewards/chosen": 3.3914000193277993, + "rewards/margins": 11.505008379618326, + "rewards/rejected": -8.113608360290527, + "step": 3663 + }, + { + "epoch": 0.3347647327546825, + "grad_norm": 2.953125, + "kl": 0.0, + "learning_rate": 7.498339588032475e-06, + "logits/chosen": 910341222.4, + "logits/rejected": 600754730.6666666, + "logps/chosen": -374.1223388671875, + "logps/rejected": -440.765625, + "loss": 0.0224, + "rewards/chosen": 3.3492225646972655, + "rewards/margins": 11.04389928181966, + "rewards/rejected": -7.6946767171223955, + "step": 3664 + }, + { + "epoch": 0.33485609867519417, + "grad_norm": 3.578125, + "kl": 0.0, + "learning_rate": 7.497094038022357e-06, + "logits/chosen": 601144192.0, + "logits/rejected": 570685110.8571428, + "logps/chosen": -464.15789794921875, + "logps/rejected": -555.563720703125, + "loss": 0.0103, + "rewards/chosen": 3.00384521484375, + "rewards/margins": 10.3211304800851, + "rewards/rejected": -7.317285265241351, + "step": 3665 + }, + { + "epoch": 0.3349474645957058, + "grad_norm": 3.46875, + "kl": 0.0, + "learning_rate": 7.495848281525695e-06, + "logits/chosen": 562812288.0, + "logits/rejected": 529492224.0, + "logps/chosen": -273.3332112630208, + "logps/rejected": -553.1773681640625, + "loss": 0.0368, + "rewards/chosen": 3.5413792928059897, + "rewards/margins": 12.84613068898519, + "rewards/rejected": -9.3047513961792, + "step": 3666 + }, + { + "epoch": 0.33503883051621747, + "grad_norm": 4.6875, + "kl": 0.0, + "learning_rate": 7.494602318645507e-06, + "logits/chosen": 1197053952.0, + "logits/rejected": 785755264.0, + "logps/chosen": -216.9801483154297, + "logps/rejected": -554.177001953125, + "loss": 0.0377, + "rewards/chosen": 3.341398239135742, + "rewards/margins": 10.094400405883789, + "rewards/rejected": -6.753002166748047, + "step": 3667 + }, + { + "epoch": 0.3351301964367291, + "grad_norm": 2.734375, + "kl": 0.0, + "learning_rate": 7.49335614948482e-06, + "logits/chosen": 761711872.0, + "logits/rejected": 792446310.4, + "logps/chosen": -345.152099609375, + "logps/rejected": -375.0705078125, + "loss": 0.0141, + "rewards/chosen": 3.570640246073405, + "rewards/margins": 12.327935473124185, + "rewards/rejected": -8.757295227050781, + "step": 3668 + }, + { + "epoch": 0.33522156235724077, + "grad_norm": 4.8125, + "kl": 0.0, + "learning_rate": 7.492109774146681e-06, + "logits/chosen": 863044242.2857143, + "logits/rejected": 892086656.0, + "logps/chosen": -371.51402064732144, + "logps/rejected": -505.2837219238281, + "loss": 0.0319, + "rewards/chosen": 3.7150780814034596, + "rewards/margins": 11.282677854810442, + "rewards/rejected": -7.567599773406982, + "step": 3669 + }, + { + "epoch": 0.3353129282777524, + "grad_norm": 2.578125, + "kl": 0.0, + "learning_rate": 7.490863192734153e-06, + "logits/chosen": 595246016.0, + "logits/rejected": 340911616.0, + "logps/chosen": -425.4549560546875, + "logps/rejected": -439.95166015625, + "loss": 0.0162, + "rewards/chosen": 3.542600154876709, + "rewards/margins": 10.810200691223145, + "rewards/rejected": -7.2676005363464355, + "step": 3670 + }, + { + "epoch": 0.33540429419826406, + "grad_norm": 4.3125, + "kl": 0.0, + "learning_rate": 7.489616405350319e-06, + "logits/chosen": 604122112.0, + "logits/rejected": 486747989.3333333, + "logps/chosen": -348.92001953125, + "logps/rejected": -564.8169352213541, + "loss": 0.0431, + "rewards/chosen": 2.8546749114990235, + "rewards/margins": 12.74298947652181, + "rewards/rejected": -9.888314565022787, + "step": 3671 + }, + { + "epoch": 0.3354956601187757, + "grad_norm": 2.671875, + "kl": 0.0, + "learning_rate": 7.488369412098273e-06, + "logits/chosen": 639696076.8, + "logits/rejected": 849612629.3333334, + "logps/chosen": -269.186279296875, + "logps/rejected": -624.1733805338541, + "loss": 0.0199, + "rewards/chosen": 3.9139209747314454, + "rewards/margins": 13.867150243123373, + "rewards/rejected": -9.953229268391928, + "step": 3672 + }, + { + "epoch": 0.33558702603928736, + "grad_norm": 6.78125, + "kl": 0.0, + "learning_rate": 7.487122213081133e-06, + "logits/chosen": 454450240.0, + "logits/rejected": 420365482.6666667, + "logps/chosen": -382.15594482421875, + "logps/rejected": -447.813720703125, + "loss": 0.0142, + "rewards/chosen": 4.135266304016113, + "rewards/margins": 12.668144543965658, + "rewards/rejected": -8.532878239949545, + "step": 3673 + }, + { + "epoch": 0.335678391959799, + "grad_norm": 5.90625, + "kl": 0.0, + "learning_rate": 7.485874808402028e-06, + "logits/chosen": 544868736.0, + "logits/rejected": 538871168.0, + "logps/chosen": -459.16632080078125, + "logps/rejected": -371.6145833333333, + "loss": 0.0303, + "rewards/chosen": 2.6315064430236816, + "rewards/margins": 9.525816440582275, + "rewards/rejected": -6.894309997558594, + "step": 3674 + }, + { + "epoch": 0.33576975788031066, + "grad_norm": 2.375, + "kl": 0.0, + "learning_rate": 7.484627198164109e-06, + "logits/chosen": 408309845.3333333, + "logits/rejected": 504485683.2, + "logps/chosen": -320.33042399088544, + "logps/rejected": -508.8697265625, + "loss": 0.012, + "rewards/chosen": 3.5063130060831704, + "rewards/margins": 13.30709654490153, + "rewards/rejected": -9.80078353881836, + "step": 3675 + }, + { + "epoch": 0.3358611238008223, + "grad_norm": 4.0625, + "kl": 0.0, + "learning_rate": 7.483379382470542e-06, + "logits/chosen": 628105216.0, + "logits/rejected": 362136256.0, + "logps/chosen": -332.61309814453125, + "logps/rejected": -493.5596618652344, + "loss": 0.0185, + "rewards/chosen": 3.4338185787200928, + "rewards/margins": 13.478626012802124, + "rewards/rejected": -10.044807434082031, + "step": 3676 + }, + { + "epoch": 0.33595248972133396, + "grad_norm": 1.4453125, + "kl": 0.0, + "learning_rate": 7.48213136142451e-06, + "logits/chosen": 579771904.0, + "logits/rejected": 310182314.6666667, + "logps/chosen": -283.1235595703125, + "logps/rejected": -489.0528971354167, + "loss": 0.0087, + "rewards/chosen": 4.696963119506836, + "rewards/margins": 14.911727396647134, + "rewards/rejected": -10.214764277140299, + "step": 3677 + }, + { + "epoch": 0.3360438556418456, + "grad_norm": 4.125, + "kl": 0.0, + "learning_rate": 7.480883135129211e-06, + "logits/chosen": 620329408.0, + "logits/rejected": 729786816.0, + "logps/chosen": -326.99041748046875, + "logps/rejected": -609.0922241210938, + "loss": 0.028, + "rewards/chosen": 3.2946205139160156, + "rewards/margins": 12.889019012451172, + "rewards/rejected": -9.594398498535156, + "step": 3678 + }, + { + "epoch": 0.33613522156235726, + "grad_norm": 3.921875, + "kl": 0.0, + "learning_rate": 7.4796347036878625e-06, + "logits/chosen": 651826730.6666666, + "logits/rejected": 511859302.4, + "logps/chosen": -375.9875081380208, + "logps/rejected": -556.4787109375, + "loss": 0.0231, + "rewards/chosen": 3.22261651357015, + "rewards/margins": 11.698077328999839, + "rewards/rejected": -8.475460815429688, + "step": 3679 + }, + { + "epoch": 0.3362265874828689, + "grad_norm": 2.84375, + "kl": 0.0, + "learning_rate": 7.478386067203699e-06, + "logits/chosen": 407799734.85714287, + "logits/rejected": 976349568.0, + "logps/chosen": -318.96285574776783, + "logps/rejected": -1298.962646484375, + "loss": 0.0195, + "rewards/chosen": 4.184075764247349, + "rewards/margins": 15.033392361232213, + "rewards/rejected": -10.849316596984863, + "step": 3680 + }, + { + "epoch": 0.33631795340338055, + "grad_norm": 36.0, + "kl": 0.0, + "learning_rate": 7.477137225779969e-06, + "logits/chosen": 1007768780.8, + "logits/rejected": 482113706.6666667, + "logps/chosen": -345.1248046875, + "logps/rejected": -547.7545979817709, + "loss": 0.0917, + "rewards/chosen": 3.5116130828857424, + "rewards/margins": 10.64462776184082, + "rewards/rejected": -7.133014678955078, + "step": 3681 + }, + { + "epoch": 0.3364093193238922, + "grad_norm": 15.125, + "kl": 0.0, + "learning_rate": 7.475888179519943e-06, + "logits/chosen": 763730261.3333334, + "logits/rejected": 642629632.0, + "logps/chosen": -522.324951171875, + "logps/rejected": -299.412841796875, + "loss": 0.0516, + "rewards/chosen": 2.4150559107462564, + "rewards/margins": 10.239617983500162, + "rewards/rejected": -7.824562072753906, + "step": 3682 + }, + { + "epoch": 0.33650068524440385, + "grad_norm": 0.47265625, + "kl": 0.0, + "learning_rate": 7.474638928526904e-06, + "logits/chosen": 603968320.0, + "logits/rejected": 623152597.3333334, + "logps/chosen": -488.1578063964844, + "logps/rejected": -858.8733723958334, + "loss": 0.0021, + "rewards/chosen": 4.845202445983887, + "rewards/margins": 16.569463411966957, + "rewards/rejected": -11.724260965983072, + "step": 3683 + }, + { + "epoch": 0.3365920511649155, + "grad_norm": 3.765625, + "kl": 0.0, + "learning_rate": 7.473389472904154e-06, + "logits/chosen": 745719637.3333334, + "logits/rejected": 1046661017.6, + "logps/chosen": -417.3810221354167, + "logps/rejected": -551.52890625, + "loss": 0.0171, + "rewards/chosen": 3.6597798665364585, + "rewards/margins": 13.110804494222005, + "rewards/rejected": -9.451024627685547, + "step": 3684 + }, + { + "epoch": 0.33668341708542715, + "grad_norm": 22.0, + "kl": 0.0, + "learning_rate": 7.47213981275501e-06, + "logits/rejected": 441844288.0, + "logps/rejected": -420.23321533203125, + "loss": 0.082, + "rewards/rejected": -8.706583976745605, + "step": 3685 + }, + { + "epoch": 0.33677478300593877, + "grad_norm": 2.703125, + "kl": 0.0, + "learning_rate": 7.470889948182809e-06, + "logits/chosen": 476332160.0, + "logits/rejected": 242959616.0, + "logps/chosen": -328.0923258463542, + "logps/rejected": -429.7696838378906, + "loss": 0.0368, + "rewards/chosen": 3.853351910909017, + "rewards/margins": 15.452276547749838, + "rewards/rejected": -11.59892463684082, + "step": 3686 + }, + { + "epoch": 0.33686614892645045, + "grad_norm": 2.421875, + "kl": 0.0, + "learning_rate": 7.4696398792909016e-06, + "logits/chosen": 1034100672.0, + "logits/rejected": 418635520.0, + "logps/chosen": -505.5388488769531, + "logps/rejected": -455.0576477050781, + "loss": 0.0148, + "rewards/chosen": 3.6190414428710938, + "rewards/margins": 12.889989852905273, + "rewards/rejected": -9.27094841003418, + "step": 3687 + }, + { + "epoch": 0.33695751484696207, + "grad_norm": 0.87109375, + "kl": 0.0, + "learning_rate": 7.4683896061826586e-06, + "logits/chosen": 272842410.6666667, + "logits/rejected": 343339724.8, + "logps/chosen": -362.3979085286458, + "logps/rejected": -294.6058837890625, + "loss": 0.0044, + "rewards/chosen": 4.938645680745442, + "rewards/margins": 12.936841328938801, + "rewards/rejected": -7.998195648193359, + "step": 3688 + }, + { + "epoch": 0.33704888076747375, + "grad_norm": 3.0625, + "kl": 0.0, + "learning_rate": 7.467139128961465e-06, + "logits/chosen": 661875626.6666666, + "logits/rejected": 694885939.2, + "logps/chosen": -476.8406575520833, + "logps/rejected": -683.34345703125, + "loss": 0.0144, + "rewards/chosen": 3.358138084411621, + "rewards/margins": 11.900084495544434, + "rewards/rejected": -8.541946411132812, + "step": 3689 + }, + { + "epoch": 0.33714024668798537, + "grad_norm": 7.15625, + "kl": 0.0, + "learning_rate": 7.465888447730724e-06, + "logits/chosen": 472722880.0, + "logits/rejected": 418701568.0, + "logps/chosen": -338.91015625, + "logps/rejected": -378.5125427246094, + "loss": 0.0502, + "rewards/chosen": 2.3309402465820312, + "rewards/margins": 11.486647605895996, + "rewards/rejected": -9.155707359313965, + "step": 3690 + }, + { + "epoch": 0.33723161260849704, + "grad_norm": 5.28125, + "kl": 0.0, + "learning_rate": 7.464637562593856e-06, + "logits/chosen": 406186410.6666667, + "logits/rejected": 708513996.8, + "logps/chosen": -309.0227864583333, + "logps/rejected": -565.083544921875, + "loss": 0.0177, + "rewards/chosen": 4.137335777282715, + "rewards/margins": 11.561791038513183, + "rewards/rejected": -7.424455261230468, + "step": 3691 + }, + { + "epoch": 0.33732297852900867, + "grad_norm": 0.79296875, + "kl": 0.0, + "learning_rate": 7.463386473654297e-06, + "logits/chosen": 566786944.0, + "logits/rejected": 498614400.0, + "logps/chosen": -321.1593933105469, + "logps/rejected": -405.2257486979167, + "loss": 0.0033, + "rewards/chosen": 4.703546524047852, + "rewards/margins": 12.419195810953777, + "rewards/rejected": -7.715649286905925, + "step": 3692 + }, + { + "epoch": 0.33741434444952034, + "grad_norm": 3.984375, + "kl": 0.0, + "learning_rate": 7.4621351810154996e-06, + "logits/chosen": 1371617365.3333333, + "logits/rejected": 722467840.0, + "logps/chosen": -457.2676595052083, + "logps/rejected": -654.343505859375, + "loss": 0.0277, + "rewards/chosen": 3.507439931233724, + "rewards/margins": 15.24389394124349, + "rewards/rejected": -11.736454010009766, + "step": 3693 + }, + { + "epoch": 0.33750571037003196, + "grad_norm": 13.25, + "kl": 0.0, + "learning_rate": 7.460883684780935e-06, + "logits/chosen": 486407372.8, + "logits/rejected": 652201045.3333334, + "logps/chosen": -252.7752197265625, + "logps/rejected": -666.4293619791666, + "loss": 0.034, + "rewards/chosen": 3.0813175201416017, + "rewards/margins": 17.828822708129884, + "rewards/rejected": -14.747505187988281, + "step": 3694 + }, + { + "epoch": 0.33759707629054364, + "grad_norm": 49.0, + "kl": 0.0, + "learning_rate": 7.459631985054092e-06, + "logits/chosen": 794101657.6, + "logits/rejected": 243565354.66666666, + "logps/chosen": -235.174853515625, + "logps/rejected": -480.81494140625, + "loss": 0.0962, + "rewards/chosen": 2.1697566986083983, + "rewards/margins": 9.921852238972981, + "rewards/rejected": -7.752095540364583, + "step": 3695 + }, + { + "epoch": 0.33768844221105526, + "grad_norm": 1.6484375, + "kl": 0.0, + "learning_rate": 7.45838008193847e-06, + "logits/chosen": 765346048.0, + "logits/rejected": 512046182.4, + "logps/chosen": -545.2935791015625, + "logps/rejected": -413.659619140625, + "loss": 0.0098, + "rewards/chosen": 4.0295836130778, + "rewards/margins": 11.996747461954753, + "rewards/rejected": -7.967163848876953, + "step": 3696 + }, + { + "epoch": 0.33777980813156694, + "grad_norm": 4.0625, + "kl": 0.0, + "learning_rate": 7.4571279755375945e-06, + "logits/chosen": 591704128.0, + "logits/rejected": 797615616.0, + "logps/chosen": -241.22996520996094, + "logps/rejected": -605.030517578125, + "loss": 0.0172, + "rewards/chosen": 4.503502368927002, + "rewards/margins": 13.178072452545166, + "rewards/rejected": -8.674570083618164, + "step": 3697 + }, + { + "epoch": 0.33787117405207856, + "grad_norm": 2.234375, + "kl": 0.0, + "learning_rate": 7.455875665955001e-06, + "logits/chosen": 611793920.0, + "logits/rejected": 421394176.0, + "logps/chosen": -306.21771240234375, + "logps/rejected": -495.4023742675781, + "loss": 0.0121, + "rewards/chosen": 3.9584274291992188, + "rewards/margins": 13.146869659423828, + "rewards/rejected": -9.18844223022461, + "step": 3698 + }, + { + "epoch": 0.33796253997259024, + "grad_norm": 2.578125, + "kl": 0.0, + "learning_rate": 7.454623153294243e-06, + "logits/chosen": 607495424.0, + "logits/rejected": 841104320.0, + "logps/chosen": -272.60919189453125, + "logps/rejected": -434.38677978515625, + "loss": 0.0176, + "rewards/chosen": 3.4172170162200928, + "rewards/margins": 11.689813375473022, + "rewards/rejected": -8.27259635925293, + "step": 3699 + }, + { + "epoch": 0.33805390589310186, + "grad_norm": 0.419921875, + "kl": 0.0, + "learning_rate": 7.453370437658896e-06, + "logits/chosen": 320010922.6666667, + "logits/rejected": 360416870.4, + "logps/chosen": -360.7508951822917, + "logps/rejected": -452.19365234375, + "loss": 0.002, + "rewards/chosen": 5.3411204020182295, + "rewards/margins": 15.41891886393229, + "rewards/rejected": -10.077798461914062, + "step": 3700 + }, + { + "epoch": 0.33814527181361353, + "grad_norm": 1.203125, + "kl": 0.0, + "learning_rate": 7.452117519152542e-06, + "logits/chosen": 443121152.0, + "logits/rejected": 558438016.0, + "logps/chosen": -402.9605712890625, + "logps/rejected": -619.911376953125, + "loss": 0.0057, + "rewards/chosen": 4.748363494873047, + "rewards/margins": 14.549347877502441, + "rewards/rejected": -9.800984382629395, + "step": 3701 + }, + { + "epoch": 0.33823663773412516, + "grad_norm": 4.84375, + "kl": 0.0, + "learning_rate": 7.450864397878789e-06, + "logits/chosen": 673288192.0, + "logits/rejected": 491029760.0, + "logps/chosen": -465.62969970703125, + "logps/rejected": -615.720458984375, + "loss": 0.0215, + "rewards/chosen": 3.6155967712402344, + "rewards/margins": 13.269834518432617, + "rewards/rejected": -9.654237747192383, + "step": 3702 + }, + { + "epoch": 0.33832800365463683, + "grad_norm": 1.71875, + "kl": 0.0, + "learning_rate": 7.449611073941258e-06, + "logits/chosen": 443954329.6, + "logits/rejected": 290359957.3333333, + "logps/chosen": -309.4937255859375, + "logps/rejected": -307.85113525390625, + "loss": 0.0116, + "rewards/chosen": 4.02094841003418, + "rewards/margins": 11.8393980662028, + "rewards/rejected": -7.81844965616862, + "step": 3703 + }, + { + "epoch": 0.33841936957514845, + "grad_norm": 0.90625, + "kl": 0.0, + "learning_rate": 7.448357547443587e-06, + "logits/chosen": 456807168.0, + "logits/rejected": 343796864.0, + "logps/chosen": -354.6939697265625, + "logps/rejected": -522.802490234375, + "loss": 0.0037, + "rewards/chosen": 4.8681640625, + "rewards/margins": 16.224197387695312, + "rewards/rejected": -11.356033325195312, + "step": 3704 + }, + { + "epoch": 0.33851073549566013, + "grad_norm": 3.25, + "kl": 0.0, + "learning_rate": 7.447103818489432e-06, + "logits/chosen": 762620480.0, + "logits/rejected": 571872256.0, + "logps/chosen": -373.57916259765625, + "logps/rejected": -560.8748779296875, + "loss": 0.0192, + "rewards/chosen": 3.497893810272217, + "rewards/margins": 13.321715831756592, + "rewards/rejected": -9.823822021484375, + "step": 3705 + }, + { + "epoch": 0.33860210141617175, + "grad_norm": 4.09375, + "kl": 0.0, + "learning_rate": 7.445849887182464e-06, + "logits/chosen": 575142272.0, + "logits/rejected": 856154368.0, + "logps/chosen": -244.99862670898438, + "logps/rejected": -565.6048583984375, + "loss": 0.0345, + "rewards/chosen": 2.6525535583496094, + "rewards/margins": 11.457534790039062, + "rewards/rejected": -8.804981231689453, + "step": 3706 + }, + { + "epoch": 0.33869346733668343, + "grad_norm": 2.796875, + "kl": 0.0, + "learning_rate": 7.4445957536263715e-06, + "logits/chosen": 286623462.4, + "logits/rejected": 350257920.0, + "logps/chosen": -270.0154296875, + "logps/rejected": -471.6072591145833, + "loss": 0.0406, + "rewards/chosen": 3.8267280578613283, + "rewards/margins": 12.169427744547527, + "rewards/rejected": -8.342699686686197, + "step": 3707 + }, + { + "epoch": 0.33878483325719505, + "grad_norm": 1.6171875, + "kl": 0.0, + "learning_rate": 7.4433414179248585e-06, + "logits/chosen": 624482880.0, + "logits/rejected": 563760128.0, + "logps/chosen": -308.2993469238281, + "logps/rejected": -709.299560546875, + "loss": 0.0057, + "rewards/chosen": 4.19804573059082, + "rewards/margins": 13.488606770833334, + "rewards/rejected": -9.290561040242514, + "step": 3708 + }, + { + "epoch": 0.3388761991777067, + "grad_norm": 2.390625, + "kl": 0.0, + "learning_rate": 7.4420868801816514e-06, + "logits/chosen": 797839424.0, + "logits/rejected": 983164160.0, + "logps/chosen": -393.131103515625, + "logps/rejected": -632.2738037109375, + "loss": 0.0163, + "rewards/chosen": 3.5708487033843994, + "rewards/margins": 12.070144891738892, + "rewards/rejected": -8.499296188354492, + "step": 3709 + }, + { + "epoch": 0.33896756509821835, + "grad_norm": 3.859375, + "kl": 0.0, + "learning_rate": 7.440832140500484e-06, + "logits/chosen": 401159552.0, + "logits/rejected": 357981866.6666667, + "logps/chosen": -263.6877685546875, + "logps/rejected": -398.54345703125, + "loss": 0.0274, + "rewards/chosen": 3.4249923706054686, + "rewards/margins": 12.99427172342936, + "rewards/rejected": -9.569279352823893, + "step": 3710 + }, + { + "epoch": 0.33905893101873, + "grad_norm": 1.578125, + "kl": 0.0, + "learning_rate": 7.439577198985114e-06, + "logits/chosen": 756288307.2, + "logits/rejected": 401259434.6666667, + "logps/chosen": -443.3998046875, + "logps/rejected": -566.0338948567709, + "loss": 0.0105, + "rewards/chosen": 4.245056915283203, + "rewards/margins": 12.072794087727864, + "rewards/rejected": -7.827737172444661, + "step": 3711 + }, + { + "epoch": 0.33915029693924165, + "grad_norm": 0.671875, + "kl": 0.0, + "learning_rate": 7.438322055739311e-06, + "logits/chosen": 396476160.0, + "logits/rejected": 480337254.4, + "logps/chosen": -282.52427164713544, + "logps/rejected": -317.314013671875, + "loss": 0.005, + "rewards/chosen": 4.664223353068034, + "rewards/margins": 11.703354517618816, + "rewards/rejected": -7.039131164550781, + "step": 3712 + }, + { + "epoch": 0.3392416628597533, + "grad_norm": 2.28125, + "kl": 0.0, + "learning_rate": 7.437066710866867e-06, + "logits/chosen": 516388480.0, + "logits/rejected": 604251904.0, + "logps/chosen": -307.78466796875, + "logps/rejected": -616.1472778320312, + "loss": 0.0161, + "rewards/chosen": 3.852526903152466, + "rewards/margins": 14.130433320999146, + "rewards/rejected": -10.27790641784668, + "step": 3713 + }, + { + "epoch": 0.33933302878026494, + "grad_norm": 1.5859375, + "kl": 0.0, + "learning_rate": 7.435811164471586e-06, + "logits/chosen": 696825984.0, + "logits/rejected": 1188077738.6666667, + "logps/chosen": -322.1207275390625, + "logps/rejected": -527.4781901041666, + "loss": 0.0093, + "rewards/chosen": 3.3547608852386475, + "rewards/margins": 12.827843109766642, + "rewards/rejected": -9.473082224527994, + "step": 3714 + }, + { + "epoch": 0.3394243947007766, + "grad_norm": 5.25, + "kl": 0.0, + "learning_rate": 7.43455541665729e-06, + "logits/chosen": 520203040.0, + "logits/rejected": 486593792.0, + "logps/chosen": -447.578369140625, + "logps/rejected": -529.971923828125, + "loss": 0.0261, + "rewards/chosen": 3.3245673179626465, + "rewards/margins": 14.059889316558838, + "rewards/rejected": -10.735321998596191, + "step": 3715 + }, + { + "epoch": 0.33951576062128824, + "grad_norm": 5.8125, + "kl": 0.0, + "learning_rate": 7.433299467527818e-06, + "logits/chosen": 639825706.6666666, + "logits/rejected": 428306483.2, + "logps/chosen": -280.10528564453125, + "logps/rejected": -465.55361328125, + "loss": 0.0237, + "rewards/chosen": 2.9416097005208335, + "rewards/margins": 11.151993306477864, + "rewards/rejected": -8.21038360595703, + "step": 3716 + }, + { + "epoch": 0.3396071265417999, + "grad_norm": 3.484375, + "kl": 0.0, + "learning_rate": 7.432043317187023e-06, + "logits/chosen": 383634858.6666667, + "logits/rejected": 350507622.4, + "logps/chosen": -200.1671346028646, + "logps/rejected": -343.764697265625, + "loss": 0.0266, + "rewards/chosen": 3.249892234802246, + "rewards/margins": 10.621929740905761, + "rewards/rejected": -7.372037506103515, + "step": 3717 + }, + { + "epoch": 0.33969849246231154, + "grad_norm": 1.625, + "kl": 0.0, + "learning_rate": 7.430786965738782e-06, + "logits/chosen": 982172928.0, + "logits/rejected": 563292672.0, + "logps/chosen": -357.8436279296875, + "logps/rejected": -474.83856201171875, + "loss": 0.0097, + "rewards/chosen": 4.12779426574707, + "rewards/margins": 12.817422866821289, + "rewards/rejected": -8.689628601074219, + "step": 3718 + }, + { + "epoch": 0.3397898583828232, + "grad_norm": 2.421875, + "kl": 0.0, + "learning_rate": 7.429530413286978e-06, + "logits/chosen": 296464332.8, + "logits/rejected": 515860437.3333333, + "logps/chosen": -215.7351806640625, + "logps/rejected": -586.9331868489584, + "loss": 0.0146, + "rewards/chosen": 4.1853900909423825, + "rewards/margins": 14.87245012919108, + "rewards/rejected": -10.687060038248697, + "step": 3719 + }, + { + "epoch": 0.33988122430333484, + "grad_norm": 1.515625, + "kl": 0.0, + "learning_rate": 7.428273659935521e-06, + "logits/chosen": 429011541.3333333, + "logits/rejected": 390566988.8, + "logps/chosen": -219.2193603515625, + "logps/rejected": -480.003076171875, + "loss": 0.0063, + "rewards/chosen": 4.493231455485026, + "rewards/margins": 13.034704081217448, + "rewards/rejected": -8.541472625732421, + "step": 3720 + }, + { + "epoch": 0.3399725902238465, + "grad_norm": 6.1875, + "kl": 0.0, + "learning_rate": 7.4270167057883295e-06, + "logits/chosen": 510564717.71428573, + "logits/rejected": 555578240.0, + "logps/chosen": -255.92731584821428, + "logps/rejected": -376.81976318359375, + "loss": 0.0469, + "rewards/chosen": 3.3185626438685825, + "rewards/margins": 10.1159473827907, + "rewards/rejected": -6.797384738922119, + "step": 3721 + }, + { + "epoch": 0.34006395614435814, + "grad_norm": 16.5, + "kl": 0.0, + "learning_rate": 7.4257595509493445e-06, + "logits/chosen": 246205856.0, + "logits/rejected": 435437536.0, + "logps/chosen": -257.11627197265625, + "logps/rejected": -390.7657165527344, + "loss": 0.1106, + "rewards/chosen": 2.720240592956543, + "rewards/margins": 13.571623802185059, + "rewards/rejected": -10.851383209228516, + "step": 3722 + }, + { + "epoch": 0.3401553220648698, + "grad_norm": 7.625, + "kl": 0.0, + "learning_rate": 7.424502195522518e-06, + "logits/chosen": 363619328.0, + "logits/rejected": 389357312.0, + "logps/chosen": -304.63609095982144, + "logps/rejected": -345.7856140136719, + "loss": 0.1344, + "rewards/chosen": 2.8735899244035994, + "rewards/margins": 11.138039997645787, + "rewards/rejected": -8.264450073242188, + "step": 3723 + }, + { + "epoch": 0.34024668798538144, + "grad_norm": 2.828125, + "kl": 0.0, + "learning_rate": 7.4232446396118265e-06, + "logits/chosen": 428574003.2, + "logits/rejected": 190124202.66666666, + "logps/chosen": -402.4324462890625, + "logps/rejected": -295.9416910807292, + "loss": 0.0154, + "rewards/chosen": 4.224127197265625, + "rewards/margins": 13.692957305908203, + "rewards/rejected": -9.468830108642578, + "step": 3724 + }, + { + "epoch": 0.3403380539058931, + "grad_norm": 3.109375, + "kl": 0.0, + "learning_rate": 7.421986883321252e-06, + "logits/chosen": 719337574.4, + "logits/rejected": 972459349.3333334, + "logps/chosen": -340.2147216796875, + "logps/rejected": -542.2633870442709, + "loss": 0.0235, + "rewards/chosen": 3.489247512817383, + "rewards/margins": 12.117599995930991, + "rewards/rejected": -8.628352483113607, + "step": 3725 + }, + { + "epoch": 0.34042941982640473, + "grad_norm": 2.34375, + "kl": 0.0, + "learning_rate": 7.420728926754803e-06, + "logits/chosen": 449608618.6666667, + "logits/rejected": 281247078.4, + "logps/chosen": -286.81032307942706, + "logps/rejected": -284.37900390625, + "loss": 0.0133, + "rewards/chosen": 3.9413474400838218, + "rewards/margins": 12.905075772603354, + "rewards/rejected": -8.963728332519532, + "step": 3726 + }, + { + "epoch": 0.3405207857469164, + "grad_norm": 3.21875, + "kl": 0.0, + "learning_rate": 7.419470770016501e-06, + "logits/chosen": 352337493.3333333, + "logits/rejected": 593323648.0, + "logps/chosen": -266.78741455078125, + "logps/rejected": -586.8673706054688, + "loss": 0.0263, + "rewards/chosen": 3.7988545099894204, + "rewards/margins": 10.770478884379068, + "rewards/rejected": -6.971624374389648, + "step": 3727 + }, + { + "epoch": 0.34061215166742803, + "grad_norm": 1.109375, + "kl": 0.0, + "learning_rate": 7.418212413210384e-06, + "logits/chosen": 462927274.6666667, + "logits/rejected": 331098393.6, + "logps/chosen": -223.64007568359375, + "logps/rejected": -293.398681640625, + "loss": 0.0234, + "rewards/chosen": 3.274075190226237, + "rewards/margins": 11.14324328104655, + "rewards/rejected": -7.869168090820312, + "step": 3728 + }, + { + "epoch": 0.3407035175879397, + "grad_norm": 6.34375, + "kl": 0.0, + "learning_rate": 7.416953856440503e-06, + "logits/chosen": 722546816.0, + "logits/rejected": 747099328.0, + "logps/chosen": -234.22293090820312, + "logps/rejected": -356.41339111328125, + "loss": 0.043, + "rewards/chosen": 3.3676023483276367, + "rewards/margins": 9.007184982299805, + "rewards/rejected": -5.639582633972168, + "step": 3729 + }, + { + "epoch": 0.34079488350845133, + "grad_norm": 3.25, + "kl": 0.0, + "learning_rate": 7.415695099810934e-06, + "logits/chosen": 541220249.6, + "logits/rejected": 488585002.6666667, + "logps/chosen": -263.98798828125, + "logps/rejected": -566.1295572916666, + "loss": 0.0296, + "rewards/chosen": 3.3895828247070314, + "rewards/margins": 13.193481318155925, + "rewards/rejected": -9.803898493448893, + "step": 3730 + }, + { + "epoch": 0.340886249428963, + "grad_norm": 4.46875, + "kl": 0.0, + "learning_rate": 7.414436143425763e-06, + "logits/chosen": 400526796.8, + "logits/rejected": 595980373.3333334, + "logps/chosen": -293.9591796875, + "logps/rejected": -496.2156575520833, + "loss": 0.012, + "rewards/chosen": 4.6596935272216795, + "rewards/margins": 13.412991333007813, + "rewards/rejected": -8.753297805786133, + "step": 3731 + }, + { + "epoch": 0.3409776153494746, + "grad_norm": 0.69921875, + "kl": 0.0, + "learning_rate": 7.413176987389091e-06, + "logits/chosen": 367571136.0, + "logits/rejected": 902449493.3333334, + "logps/chosen": -265.55670166015625, + "logps/rejected": -684.3181966145834, + "loss": 0.0026, + "rewards/chosen": 4.5649566650390625, + "rewards/margins": 15.253519694010416, + "rewards/rejected": -10.688563028971354, + "step": 3732 + }, + { + "epoch": 0.3410689812699863, + "grad_norm": 3.984375, + "kl": 0.0, + "learning_rate": 7.4119176318050415e-06, + "logits/chosen": 542087296.0, + "logits/rejected": 480052800.0, + "logps/chosen": -265.4325358072917, + "logps/rejected": -439.38238525390625, + "loss": 0.0254, + "rewards/chosen": 4.061118761698405, + "rewards/margins": 13.76710859934489, + "rewards/rejected": -9.705989837646484, + "step": 3733 + }, + { + "epoch": 0.3411603471904979, + "grad_norm": 3.515625, + "kl": 0.0, + "learning_rate": 7.41065807677775e-06, + "logits/chosen": 503978342.4, + "logits/rejected": 578550613.3333334, + "logps/chosen": -270.431396484375, + "logps/rejected": -645.127685546875, + "loss": 0.0396, + "rewards/chosen": 2.8118505477905273, + "rewards/margins": 15.842678387959799, + "rewards/rejected": -13.030827840169271, + "step": 3734 + }, + { + "epoch": 0.3412517131110096, + "grad_norm": 30.0, + "kl": 0.0, + "learning_rate": 7.409398322411372e-06, + "logits/chosen": 633351680.0, + "logits/rejected": 321959456.0, + "logps/chosen": -353.7451869419643, + "logps/rejected": -689.309814453125, + "loss": 0.0648, + "rewards/chosen": 2.8127523149762834, + "rewards/margins": 14.501106807163783, + "rewards/rejected": -11.6883544921875, + "step": 3735 + }, + { + "epoch": 0.3413430790315212, + "grad_norm": 4.15625, + "kl": 0.0, + "learning_rate": 7.408138368810075e-06, + "logits/chosen": 542947328.0, + "logits/rejected": 505351424.0, + "logps/chosen": -238.815478515625, + "logps/rejected": -358.5875651041667, + "loss": 0.0408, + "rewards/chosen": 2.7294488906860352, + "rewards/margins": 11.125504748026529, + "rewards/rejected": -8.396055857340494, + "step": 3736 + }, + { + "epoch": 0.3414344449520329, + "grad_norm": 3.5625, + "kl": 0.0, + "learning_rate": 7.406878216078048e-06, + "logits/chosen": 354448864.0, + "logits/rejected": 298249408.0, + "logps/chosen": -449.23291015625, + "logps/rejected": -365.4901428222656, + "loss": 0.0189, + "rewards/chosen": 3.717960834503174, + "rewards/margins": 11.64149522781372, + "rewards/rejected": -7.923534393310547, + "step": 3737 + }, + { + "epoch": 0.3415258108725445, + "grad_norm": 1.765625, + "kl": 0.0, + "learning_rate": 7.4056178643194935e-06, + "logits/chosen": 462276672.0, + "logits/rejected": 870437034.6666666, + "logps/chosen": -130.12704467773438, + "logps/rejected": -492.7635498046875, + "loss": 0.013, + "rewards/chosen": 3.0348377227783203, + "rewards/margins": 12.203702290852865, + "rewards/rejected": -9.168864568074545, + "step": 3738 + }, + { + "epoch": 0.3416171767930562, + "grad_norm": 3.234375, + "kl": 0.0, + "learning_rate": 7.404357313638628e-06, + "logits/chosen": 571925632.0, + "logits/rejected": 604915776.0, + "logps/chosen": -600.49951171875, + "logps/rejected": -413.6722412109375, + "loss": 0.0168, + "rewards/chosen": 3.4494752883911133, + "rewards/margins": 13.120537757873535, + "rewards/rejected": -9.671062469482422, + "step": 3739 + }, + { + "epoch": 0.3417085427135678, + "grad_norm": 3.734375, + "kl": 0.0, + "learning_rate": 7.403096564139691e-06, + "logits/chosen": 448596377.6, + "logits/rejected": 551836160.0, + "logps/chosen": -165.110986328125, + "logps/rejected": -647.3857421875, + "loss": 0.0316, + "rewards/chosen": 3.314207077026367, + "rewards/margins": 12.781895319620768, + "rewards/rejected": -9.4676882425944, + "step": 3740 + }, + { + "epoch": 0.3417999086340795, + "grad_norm": 8.8125, + "kl": 0.0, + "learning_rate": 7.401835615926934e-06, + "logits/chosen": 226366890.66666666, + "logits/rejected": 583830220.8, + "logps/chosen": -200.0374552408854, + "logps/rejected": -239.3624267578125, + "loss": 0.1674, + "rewards/chosen": 3.0048192342122397, + "rewards/margins": 8.321877415974935, + "rewards/rejected": -5.317058181762695, + "step": 3741 + }, + { + "epoch": 0.3418912745545911, + "grad_norm": 4.84375, + "kl": 0.0, + "learning_rate": 7.4005744691046225e-06, + "logits/chosen": 428772544.0, + "logits/rejected": 510805504.0, + "logps/chosen": -377.2447509765625, + "logps/rejected": -448.943603515625, + "loss": 0.0199, + "rewards/chosen": 3.4076995849609375, + "rewards/margins": 14.893651008605957, + "rewards/rejected": -11.48595142364502, + "step": 3742 + }, + { + "epoch": 0.3419826404751028, + "grad_norm": 3.796875, + "kl": 0.0, + "learning_rate": 7.399313123777046e-06, + "logits/chosen": 461826121.14285713, + "logits/rejected": 554987776.0, + "logps/chosen": -228.10850306919642, + "logps/rejected": -514.7501220703125, + "loss": 0.0312, + "rewards/chosen": 3.8951508658272878, + "rewards/margins": 13.7351348059518, + "rewards/rejected": -9.839983940124512, + "step": 3743 + }, + { + "epoch": 0.3420740063956144, + "grad_norm": 42.25, + "kl": 0.0, + "learning_rate": 7.398051580048503e-06, + "logits/chosen": 706887424.0, + "logits/rejected": 791703680.0, + "logps/chosen": -387.8341064453125, + "logps/rejected": -582.4095458984375, + "loss": 0.0734, + "rewards/chosen": 2.281243324279785, + "rewards/margins": 12.505200386047363, + "rewards/rejected": -10.223957061767578, + "step": 3744 + }, + { + "epoch": 0.3421653723161261, + "grad_norm": 1.84375, + "kl": 0.0, + "learning_rate": 7.396789838023315e-06, + "logits/chosen": 556956842.6666666, + "logits/rejected": 654100480.0, + "logps/chosen": -346.0941569010417, + "logps/rejected": -481.882763671875, + "loss": 0.0107, + "rewards/chosen": 3.5517171223958335, + "rewards/margins": 12.768851979573569, + "rewards/rejected": -9.217134857177735, + "step": 3745 + }, + { + "epoch": 0.3422567382366377, + "grad_norm": 1.5234375, + "kl": 0.0, + "learning_rate": 7.395527897805812e-06, + "logits/chosen": 556783488.0, + "logits/rejected": 397328896.0, + "logps/chosen": -272.45196533203125, + "logps/rejected": -465.9503580729167, + "loss": 0.0088, + "rewards/chosen": 3.324493408203125, + "rewards/margins": 12.974543253580729, + "rewards/rejected": -9.650049845377604, + "step": 3746 + }, + { + "epoch": 0.3423481041571494, + "grad_norm": 8.0, + "kl": 0.0, + "learning_rate": 7.394265759500348e-06, + "logits/chosen": 456912000.0, + "logits/rejected": 446214553.6, + "logps/chosen": -321.0303955078125, + "logps/rejected": -474.83720703125, + "loss": 0.0245, + "rewards/chosen": 3.4025160471598306, + "rewards/margins": 13.11934954325358, + "rewards/rejected": -9.71683349609375, + "step": 3747 + }, + { + "epoch": 0.342439470077661, + "grad_norm": 10.9375, + "kl": 0.0, + "learning_rate": 7.393003423211289e-06, + "logits/chosen": 430826624.0, + "logits/rejected": 796069120.0, + "logps/chosen": -332.2226257324219, + "logps/rejected": -411.0264485677083, + "loss": 0.0936, + "rewards/chosen": 4.86444616317749, + "rewards/margins": 12.150675614674885, + "rewards/rejected": -7.2862294514973955, + "step": 3748 + }, + { + "epoch": 0.3425308359981727, + "grad_norm": 4.84375, + "kl": 0.0, + "learning_rate": 7.391740889043018e-06, + "logits/chosen": 498464153.6, + "logits/rejected": 562963413.3333334, + "logps/chosen": -253.49248046875, + "logps/rejected": -643.8561197916666, + "loss": 0.0351, + "rewards/chosen": 3.378081512451172, + "rewards/margins": 13.637797546386718, + "rewards/rejected": -10.259716033935547, + "step": 3749 + }, + { + "epoch": 0.3426222019186843, + "grad_norm": 2.9375, + "kl": 0.0, + "learning_rate": 7.390478157099936e-06, + "logits/chosen": 540103040.0, + "logits/rejected": 842908352.0, + "logps/chosen": -322.4116516113281, + "logps/rejected": -705.278564453125, + "loss": 0.0146, + "rewards/chosen": 4.216836929321289, + "rewards/margins": 15.141271591186523, + "rewards/rejected": -10.924434661865234, + "step": 3750 + }, + { + "epoch": 0.342713567839196, + "grad_norm": 5.59375, + "kl": 0.0, + "learning_rate": 7.389215227486459e-06, + "logits/chosen": 874249728.0, + "logits/rejected": 622606950.4, + "logps/chosen": -372.4867350260417, + "logps/rejected": -542.8076171875, + "loss": 0.0251, + "rewards/chosen": 2.879648208618164, + "rewards/margins": 11.539183425903321, + "rewards/rejected": -8.659535217285157, + "step": 3751 + }, + { + "epoch": 0.3428049337597076, + "grad_norm": 1.6875, + "kl": 0.0, + "learning_rate": 7.387952100307019e-06, + "logits/chosen": 547020672.0, + "logits/rejected": 750529536.0, + "logps/chosen": -499.7442932128906, + "logps/rejected": -864.9892578125, + "loss": 0.0065, + "rewards/chosen": 4.769506931304932, + "rewards/margins": 15.32721471786499, + "rewards/rejected": -10.557707786560059, + "step": 3752 + }, + { + "epoch": 0.3428962996802193, + "grad_norm": 2.671875, + "kl": 0.0, + "learning_rate": 7.386688775666065e-06, + "logits/chosen": 315748640.0, + "logits/rejected": 475948032.0, + "logps/chosen": -279.476318359375, + "logps/rejected": -364.9230651855469, + "loss": 0.1233, + "rewards/chosen": 3.6311962604522705, + "rewards/margins": 10.044644594192505, + "rewards/rejected": -6.413448333740234, + "step": 3753 + }, + { + "epoch": 0.3429876656007309, + "grad_norm": 0.50390625, + "kl": 0.0, + "learning_rate": 7.385425253668063e-06, + "logits/chosen": 338515456.0, + "logits/rejected": 514587936.0, + "logps/chosen": -302.1690673828125, + "logps/rejected": -476.58233642578125, + "loss": 0.0037, + "rewards/chosen": 5.620241641998291, + "rewards/margins": 12.431655883789062, + "rewards/rejected": -6.8114142417907715, + "step": 3754 + }, + { + "epoch": 0.3430790315212426, + "grad_norm": 3.984375, + "kl": 0.0, + "learning_rate": 7.384161534417492e-06, + "logits/chosen": 498316288.0, + "logits/rejected": 465655978.6666667, + "logps/chosen": -314.790966796875, + "logps/rejected": -521.6134847005209, + "loss": 0.0314, + "rewards/chosen": 3.083039474487305, + "rewards/margins": 12.168652725219726, + "rewards/rejected": -9.085613250732422, + "step": 3755 + }, + { + "epoch": 0.3431703974417542, + "grad_norm": 5.53125, + "kl": 0.0, + "learning_rate": 7.382897618018854e-06, + "logits/chosen": 383849420.8, + "logits/rejected": 463679744.0, + "logps/chosen": -324.476611328125, + "logps/rejected": -534.431884765625, + "loss": 0.0237, + "rewards/chosen": 4.239872741699219, + "rewards/margins": 13.98650690714518, + "rewards/rejected": -9.746634165445963, + "step": 3756 + }, + { + "epoch": 0.3432617633622659, + "grad_norm": 3.546875, + "kl": 0.0, + "learning_rate": 7.381633504576659e-06, + "logits/chosen": 489367398.4, + "logits/rejected": 639445632.0, + "logps/chosen": -267.80888671875, + "logps/rejected": -499.49853515625, + "loss": 0.0239, + "rewards/chosen": 3.8262779235839846, + "rewards/margins": 11.2118350982666, + "rewards/rejected": -7.385557174682617, + "step": 3757 + }, + { + "epoch": 0.3433531292827775, + "grad_norm": 13.3125, + "kl": 0.0, + "learning_rate": 7.380369194195441e-06, + "logits/chosen": 719125632.0, + "logits/rejected": 386130474.6666667, + "logps/chosen": -328.025390625, + "logps/rejected": -382.6588541666667, + "loss": 0.0656, + "rewards/chosen": 4.603400230407715, + "rewards/margins": 12.01225694020589, + "rewards/rejected": -7.408856709798177, + "step": 3758 + }, + { + "epoch": 0.3434444952032892, + "grad_norm": 3.0, + "kl": 0.0, + "learning_rate": 7.379104686979746e-06, + "logits/chosen": 1036097984.0, + "logits/rejected": 491580544.0, + "logps/chosen": -296.37933349609375, + "logps/rejected": -618.5849609375, + "loss": 0.0193, + "rewards/chosen": 3.264453411102295, + "rewards/margins": 14.958512783050537, + "rewards/rejected": -11.694059371948242, + "step": 3759 + }, + { + "epoch": 0.3435358611238008, + "grad_norm": 6.8125, + "kl": 0.0, + "learning_rate": 7.377839983034135e-06, + "logits/chosen": 539303350.8571428, + "logits/rejected": 178753184.0, + "logps/chosen": -336.05517578125, + "logps/rejected": -314.20477294921875, + "loss": 0.0555, + "rewards/chosen": 3.4321411677769254, + "rewards/margins": 15.379709107535227, + "rewards/rejected": -11.9475679397583, + "step": 3760 + }, + { + "epoch": 0.3436272270443125, + "grad_norm": 3.578125, + "kl": 1.7968635559082031, + "learning_rate": 7.376575082463188e-06, + "logits/chosen": 658563498.6666666, + "logits/rejected": 448020544.0, + "logps/chosen": -405.0470377604167, + "logps/rejected": -286.96588134765625, + "loss": 0.0275, + "rewards/chosen": 3.652815500895182, + "rewards/margins": 8.970245997111002, + "rewards/rejected": -5.31743049621582, + "step": 3761 + }, + { + "epoch": 0.3437185929648241, + "grad_norm": 3.8125, + "kl": 0.0, + "learning_rate": 7.375309985371502e-06, + "logits/chosen": 449010688.0, + "logits/rejected": 274113254.4, + "logps/chosen": -413.8279215494792, + "logps/rejected": -392.51884765625, + "loss": 0.0201, + "rewards/chosen": 2.995395024617513, + "rewards/margins": 12.386004002888997, + "rewards/rejected": -9.390608978271484, + "step": 3762 + }, + { + "epoch": 0.3438099588853358, + "grad_norm": 3.359375, + "kl": 0.0, + "learning_rate": 7.374044691863689e-06, + "logits/chosen": 509743232.0, + "logits/rejected": 599693632.0, + "logps/chosen": -271.4343566894531, + "logps/rejected": -908.12646484375, + "loss": 0.0286, + "rewards/chosen": 2.9567999839782715, + "rewards/margins": 12.824429035186768, + "rewards/rejected": -9.867629051208496, + "step": 3763 + }, + { + "epoch": 0.3439013248058474, + "grad_norm": 4.5, + "kl": 0.0, + "learning_rate": 7.372779202044376e-06, + "logits/chosen": 725091123.2, + "logits/rejected": 240918058.66666666, + "logps/chosen": -487.6673828125, + "logps/rejected": -242.19685872395834, + "loss": 0.0273, + "rewards/chosen": 3.4089855194091796, + "rewards/margins": 10.689040247599284, + "rewards/rejected": -7.2800547281901045, + "step": 3764 + }, + { + "epoch": 0.3439926907263591, + "grad_norm": 3.28125, + "kl": 0.0, + "learning_rate": 7.371513516018205e-06, + "logits/chosen": 560205824.0, + "logits/rejected": 420060864.0, + "logps/chosen": -404.3731994628906, + "logps/rejected": -502.2850646972656, + "loss": 0.0145, + "rewards/chosen": 3.641698122024536, + "rewards/margins": 12.416774034500122, + "rewards/rejected": -8.775075912475586, + "step": 3765 + }, + { + "epoch": 0.3440840566468707, + "grad_norm": 2.875, + "kl": 0.0, + "learning_rate": 7.37024763388984e-06, + "logits/chosen": 596523929.6, + "logits/rejected": 1009812394.6666666, + "logps/chosen": -350.78583984375, + "logps/rejected": -538.7224527994791, + "loss": 0.0158, + "rewards/chosen": 4.284150695800781, + "rewards/margins": 13.986299133300781, + "rewards/rejected": -9.7021484375, + "step": 3766 + }, + { + "epoch": 0.34417542256738237, + "grad_norm": 4.875, + "kl": 0.0, + "learning_rate": 7.368981555763956e-06, + "logits/chosen": 497401280.0, + "logps/chosen": -200.43936157226562, + "loss": 0.0464, + "rewards/chosen": 3.5172231197357178, + "step": 3767 + }, + { + "epoch": 0.344266788487894, + "grad_norm": 45.25, + "kl": 0.0, + "learning_rate": 7.367715281745247e-06, + "logits/chosen": 832471680.0, + "logits/rejected": 606778965.3333334, + "logps/chosen": -817.3096313476562, + "logps/rejected": -437.7210693359375, + "loss": 0.0612, + "rewards/chosen": 3.2778778076171875, + "rewards/margins": 11.296435038248697, + "rewards/rejected": -8.01855723063151, + "step": 3768 + }, + { + "epoch": 0.34435815440840567, + "grad_norm": 5.1875, + "kl": 0.0, + "learning_rate": 7.366448811938422e-06, + "logits/chosen": 433974912.0, + "logits/rejected": 509303872.0, + "logps/chosen": -385.6719055175781, + "logps/rejected": -545.278076171875, + "loss": 0.0302, + "rewards/chosen": 2.77250599861145, + "rewards/margins": 12.014403581619263, + "rewards/rejected": -9.241897583007812, + "step": 3769 + }, + { + "epoch": 0.3444495203289173, + "grad_norm": 1.7578125, + "kl": 0.0, + "learning_rate": 7.365182146448205e-06, + "logits/chosen": 788959914.6666666, + "logits/rejected": 760577587.2, + "logps/chosen": -422.8927001953125, + "logps/rejected": -468.06357421875, + "loss": 0.0089, + "rewards/chosen": 4.020861307779948, + "rewards/margins": 11.943644205729168, + "rewards/rejected": -7.922782897949219, + "step": 3770 + }, + { + "epoch": 0.34454088624942897, + "grad_norm": 2.34375, + "kl": 0.0, + "learning_rate": 7.36391528537934e-06, + "logits/chosen": 728175411.2, + "logits/rejected": 1374977536.0, + "logps/chosen": -391.55966796875, + "logps/rejected": -828.877685546875, + "loss": 0.016, + "rewards/chosen": 3.800258255004883, + "rewards/margins": 11.350120162963867, + "rewards/rejected": -7.549861907958984, + "step": 3771 + }, + { + "epoch": 0.3446322521699406, + "grad_norm": 9.5, + "kl": 0.0, + "learning_rate": 7.362648228836581e-06, + "logits/chosen": 619789888.0, + "logits/rejected": 502213952.0, + "logps/chosen": -361.7986755371094, + "logps/rejected": -417.30078125, + "loss": 0.0454, + "rewards/chosen": 4.332194805145264, + "rewards/margins": 12.772092342376709, + "rewards/rejected": -8.439897537231445, + "step": 3772 + }, + { + "epoch": 0.34472361809045227, + "grad_norm": 1.78125, + "kl": 0.0, + "learning_rate": 7.361380976924705e-06, + "logits/chosen": 828418944.0, + "logits/rejected": 463741824.0, + "logps/chosen": -328.3192138671875, + "logps/rejected": -432.1324768066406, + "loss": 0.0118, + "rewards/chosen": 3.8548340797424316, + "rewards/margins": 12.179583072662354, + "rewards/rejected": -8.324748992919922, + "step": 3773 + }, + { + "epoch": 0.3448149840109639, + "grad_norm": 1.609375, + "kl": 0.0, + "learning_rate": 7.3601135297485e-06, + "logits/chosen": 892938112.0, + "logits/rejected": 476810496.0, + "logps/chosen": -355.73504638671875, + "logps/rejected": -449.1507873535156, + "loss": 0.0095, + "rewards/chosen": 4.213146209716797, + "rewards/margins": 13.539811134338379, + "rewards/rejected": -9.326664924621582, + "step": 3774 + }, + { + "epoch": 0.34490634993147556, + "grad_norm": 1.671875, + "kl": 0.0, + "learning_rate": 7.358845887412773e-06, + "logits/chosen": 318986240.0, + "logits/rejected": 510741440.0, + "logps/chosen": -212.05300903320312, + "logps/rejected": -216.87879943847656, + "loss": 0.0094, + "rewards/chosen": 4.326613426208496, + "rewards/margins": 11.377804279327393, + "rewards/rejected": -7.0511908531188965, + "step": 3775 + }, + { + "epoch": 0.3449977158519872, + "grad_norm": 2.609375, + "kl": 0.0, + "learning_rate": 7.357578050022347e-06, + "logits/chosen": 407923328.0, + "logits/rejected": 681120640.0, + "logps/chosen": -249.1962432861328, + "logps/rejected": -346.1769714355469, + "loss": 0.0182, + "rewards/chosen": 3.5478174686431885, + "rewards/margins": 10.703475713729858, + "rewards/rejected": -7.15565824508667, + "step": 3776 + }, + { + "epoch": 0.34508908177249886, + "grad_norm": 1.859375, + "kl": 0.0, + "learning_rate": 7.356310017682061e-06, + "logits/chosen": 446222560.0, + "logits/rejected": 388227285.3333333, + "logps/chosen": -248.74041748046875, + "logps/rejected": -392.8070475260417, + "loss": 0.0218, + "rewards/chosen": 2.618353843688965, + "rewards/margins": 10.826870282491049, + "rewards/rejected": -8.208516438802084, + "step": 3777 + }, + { + "epoch": 0.3451804476930105, + "grad_norm": 2.734375, + "kl": 0.0, + "learning_rate": 7.355041790496766e-06, + "logits/chosen": 441502822.4, + "logits/rejected": 474147840.0, + "logps/chosen": -342.06865234375, + "logps/rejected": -466.1247151692708, + "loss": 0.0158, + "rewards/chosen": 4.444078063964843, + "rewards/margins": 14.576710001627603, + "rewards/rejected": -10.13263193766276, + "step": 3778 + }, + { + "epoch": 0.34527181361352216, + "grad_norm": 4.78125, + "kl": 0.0, + "learning_rate": 7.353773368571336e-06, + "logits/chosen": 351445440.0, + "logits/rejected": 384338752.0, + "logps/chosen": -338.06988525390625, + "logps/rejected": -567.961181640625, + "loss": 0.0184, + "rewards/chosen": 3.872997522354126, + "rewards/margins": 13.811132669448853, + "rewards/rejected": -9.938135147094727, + "step": 3779 + }, + { + "epoch": 0.3453631795340338, + "grad_norm": 3.25, + "kl": 0.0, + "learning_rate": 7.352504752010656e-06, + "logits/chosen": 471873088.0, + "logits/rejected": 576430912.0, + "logps/chosen": -268.4776611328125, + "logps/rejected": -391.2613830566406, + "loss": 0.0234, + "rewards/chosen": 3.2206358909606934, + "rewards/margins": 10.561183452606201, + "rewards/rejected": -7.340547561645508, + "step": 3780 + }, + { + "epoch": 0.34545454545454546, + "grad_norm": 0.349609375, + "kl": 0.0, + "learning_rate": 7.351235940919631e-06, + "logits/chosen": 309237120.0, + "logits/rejected": 498342229.3333333, + "logps/chosen": -536.742431640625, + "logps/rejected": -526.8958333333334, + "loss": 0.0014, + "rewards/chosen": 5.553738594055176, + "rewards/margins": 14.43708070119222, + "rewards/rejected": -8.883342107137045, + "step": 3781 + }, + { + "epoch": 0.3455459113750571, + "grad_norm": 3.0, + "kl": 0.0, + "learning_rate": 7.349966935403177e-06, + "logits/chosen": 492966080.0, + "logits/rejected": 449277120.0, + "logps/chosen": -287.23724365234375, + "logps/rejected": -376.6574401855469, + "loss": 0.0208, + "rewards/chosen": 3.5760278701782227, + "rewards/margins": 12.96984577178955, + "rewards/rejected": -9.393817901611328, + "step": 3782 + }, + { + "epoch": 0.34563727729556876, + "grad_norm": 5.03125, + "kl": 0.0, + "learning_rate": 7.34869773556623e-06, + "logits/chosen": 928196198.4, + "logits/rejected": 695262037.3333334, + "logps/chosen": -383.2734375, + "logps/rejected": -654.3655598958334, + "loss": 0.0354, + "rewards/chosen": 3.3803192138671876, + "rewards/margins": 15.02898686726888, + "rewards/rejected": -11.648667653401693, + "step": 3783 + }, + { + "epoch": 0.3457286432160804, + "grad_norm": 2.890625, + "kl": 0.0, + "learning_rate": 7.347428341513743e-06, + "logits/chosen": 396573619.2, + "logits/rejected": 917902421.3333334, + "logps/chosen": -216.067822265625, + "logps/rejected": -738.7223307291666, + "loss": 0.0195, + "rewards/chosen": 3.796992874145508, + "rewards/margins": 17.369924799601236, + "rewards/rejected": -13.572931925455729, + "step": 3784 + }, + { + "epoch": 0.34582000913659205, + "grad_norm": 0.93359375, + "kl": 0.0, + "learning_rate": 7.346158753350679e-06, + "logits/chosen": 408221482.6666667, + "logits/rejected": 592608000.0, + "logps/chosen": -275.6983642578125, + "logps/rejected": -470.1761169433594, + "loss": 0.0071, + "rewards/chosen": 4.722205479939778, + "rewards/margins": 12.649839719136555, + "rewards/rejected": -7.927634239196777, + "step": 3785 + }, + { + "epoch": 0.3459113750571037, + "grad_norm": 1.8671875, + "kl": 0.0, + "learning_rate": 7.344888971182027e-06, + "logits/chosen": 614359082.6666666, + "logits/rejected": 439329280.0, + "logps/chosen": -351.7647298177083, + "logps/rejected": -610.5880737304688, + "loss": 0.0131, + "rewards/chosen": 4.194072723388672, + "rewards/margins": 13.38232421875, + "rewards/rejected": -9.188251495361328, + "step": 3786 + }, + { + "epoch": 0.34600274097761535, + "grad_norm": 4.0625, + "kl": 0.0, + "learning_rate": 7.34361899511278e-06, + "logits/chosen": 696918118.4, + "logits/rejected": 497686613.3333333, + "logps/chosen": -487.026416015625, + "logps/rejected": -636.6324462890625, + "loss": 0.0247, + "rewards/chosen": 4.014548492431641, + "rewards/margins": 14.04531021118164, + "rewards/rejected": -10.03076171875, + "step": 3787 + }, + { + "epoch": 0.346094106898127, + "grad_norm": 5.71875, + "kl": 0.0, + "learning_rate": 7.342348825247958e-06, + "logits/chosen": 366612224.0, + "logits/rejected": 430080768.0, + "logps/chosen": -291.9582275390625, + "logps/rejected": -405.614013671875, + "loss": 0.0315, + "rewards/chosen": 3.283565902709961, + "rewards/margins": 11.772611363728842, + "rewards/rejected": -8.48904546101888, + "step": 3788 + }, + { + "epoch": 0.34618547281863865, + "grad_norm": 3.671875, + "kl": 0.0, + "learning_rate": 7.3410784616925905e-06, + "logits/chosen": 572748236.8, + "logits/rejected": 892618410.6666666, + "logps/chosen": -334.7153076171875, + "logps/rejected": -405.8800862630208, + "loss": 0.0271, + "rewards/chosen": 3.3980789184570312, + "rewards/margins": 11.38544209798177, + "rewards/rejected": -7.987363179524739, + "step": 3789 + }, + { + "epoch": 0.34627683873915027, + "grad_norm": 8.4375, + "kl": 0.0, + "learning_rate": 7.3398079045517245e-06, + "logits/chosen": 353039232.0, + "logits/rejected": 705198738.2857143, + "logps/chosen": -226.3491668701172, + "logps/rejected": -410.59476143973217, + "loss": 0.0804, + "rewards/chosen": 5.453004360198975, + "rewards/margins": 12.732253279004778, + "rewards/rejected": -7.279248918805804, + "step": 3790 + }, + { + "epoch": 0.34636820465966195, + "grad_norm": 11.0625, + "kl": 0.0, + "learning_rate": 7.338537153930423e-06, + "logits/chosen": 1050596522.6666666, + "logits/rejected": 437999104.0, + "logps/chosen": -339.19744873046875, + "logps/rejected": -410.62333984375, + "loss": 0.0117, + "rewards/chosen": 4.407373110453288, + "rewards/margins": 11.3299103418986, + "rewards/rejected": -6.922537231445313, + "step": 3791 + }, + { + "epoch": 0.34645957058017357, + "grad_norm": 4.5, + "kl": 0.0, + "learning_rate": 7.337266209933766e-06, + "logits/chosen": 778678080.0, + "logits/rejected": 700041344.0, + "logps/chosen": -395.5816955566406, + "logps/rejected": -668.4019775390625, + "loss": 0.0226, + "rewards/chosen": 3.7111568450927734, + "rewards/margins": 14.182222366333008, + "rewards/rejected": -10.471065521240234, + "step": 3792 + }, + { + "epoch": 0.34655093650068525, + "grad_norm": 3.046875, + "kl": 0.0, + "learning_rate": 7.335995072666848e-06, + "logits/chosen": 492452192.0, + "logits/rejected": 608009792.0, + "logps/chosen": -352.5959167480469, + "logps/rejected": -697.57470703125, + "loss": 0.0194, + "rewards/chosen": 3.336946487426758, + "rewards/margins": 13.107330322265625, + "rewards/rejected": -9.770383834838867, + "step": 3793 + }, + { + "epoch": 0.34664230242119687, + "grad_norm": 1.6796875, + "kl": 0.0, + "learning_rate": 7.334723742234784e-06, + "logits/chosen": 509377450.6666667, + "logits/rejected": 616903782.4, + "logps/chosen": -384.0085042317708, + "logps/rejected": -571.037158203125, + "loss": 0.0113, + "rewards/chosen": 4.124800046284993, + "rewards/margins": 14.641419919331867, + "rewards/rejected": -10.516619873046874, + "step": 3794 + }, + { + "epoch": 0.34673366834170855, + "grad_norm": 1.9765625, + "kl": 0.0, + "learning_rate": 7.333452218742695e-06, + "logits/chosen": 442961056.0, + "logits/rejected": 456692736.0, + "logps/chosen": -361.6756591796875, + "logps/rejected": -449.3871765136719, + "loss": 0.0105, + "rewards/chosen": 4.157966613769531, + "rewards/margins": 12.913848876953125, + "rewards/rejected": -8.755882263183594, + "step": 3795 + }, + { + "epoch": 0.34682503426222017, + "grad_norm": 3.734375, + "kl": 0.0, + "learning_rate": 7.332180502295729e-06, + "logits/chosen": 396949888.0, + "logits/rejected": 439976832.0, + "logps/chosen": -365.5125427246094, + "logps/rejected": -521.389404296875, + "loss": 0.0284, + "rewards/chosen": 4.195628643035889, + "rewards/margins": 12.086010456085205, + "rewards/rejected": -7.890381813049316, + "step": 3796 + }, + { + "epoch": 0.34691640018273184, + "grad_norm": 8.25, + "kl": 0.0, + "learning_rate": 7.330908592999042e-06, + "logits/chosen": 652885376.0, + "logits/rejected": 408064960.0, + "logps/chosen": -297.8097330729167, + "logps/rejected": -469.42279052734375, + "loss": 0.0414, + "rewards/chosen": 3.2553440729777017, + "rewards/margins": 11.089743296305338, + "rewards/rejected": -7.834399223327637, + "step": 3797 + }, + { + "epoch": 0.34700776610324346, + "grad_norm": 2.265625, + "kl": 0.0, + "learning_rate": 7.329636490957812e-06, + "logits/chosen": 507549760.0, + "logits/rejected": 495968128.0, + "logps/chosen": -136.4568634033203, + "logps/rejected": -531.9795735677084, + "loss": 0.0085, + "rewards/chosen": 3.730264186859131, + "rewards/margins": 14.347137610117594, + "rewards/rejected": -10.616873423258463, + "step": 3798 + }, + { + "epoch": 0.34709913202375514, + "grad_norm": 2.484375, + "kl": 0.0, + "learning_rate": 7.328364196277229e-06, + "logits/chosen": 1101920768.0, + "logits/rejected": 556772394.6666666, + "logps/chosen": -519.4757690429688, + "logps/rejected": -552.9111328125, + "loss": 0.0106, + "rewards/chosen": 3.169360399246216, + "rewards/margins": 13.230902910232544, + "rewards/rejected": -10.061542510986328, + "step": 3799 + }, + { + "epoch": 0.34719049794426676, + "grad_norm": 2.03125, + "kl": 0.0, + "learning_rate": 7.327091709062499e-06, + "logits/chosen": 335169472.0, + "logits/rejected": 450300006.4, + "logps/chosen": -179.8890177408854, + "logps/rejected": -304.436669921875, + "loss": 0.0142, + "rewards/chosen": 3.544636090596517, + "rewards/margins": 13.11457322438558, + "rewards/rejected": -9.569937133789063, + "step": 3800 + }, + { + "epoch": 0.34728186386477844, + "grad_norm": 3.265625, + "kl": 0.0, + "learning_rate": 7.325819029418847e-06, + "logits/chosen": 693750374.4, + "logits/rejected": 1267738624.0, + "logps/chosen": -331.228759765625, + "logps/rejected": -576.2662760416666, + "loss": 0.0225, + "rewards/chosen": 3.4702217102050783, + "rewards/margins": 14.171851348876952, + "rewards/rejected": -10.701629638671875, + "step": 3801 + }, + { + "epoch": 0.34737322978529006, + "grad_norm": 2.125, + "kl": 0.0, + "learning_rate": 7.324546157451509e-06, + "logits/chosen": 373176640.0, + "logits/rejected": 491681248.0, + "logps/chosen": -357.4365234375, + "logps/rejected": -560.2606201171875, + "loss": 0.0103, + "rewards/chosen": 4.498589515686035, + "rewards/margins": 12.974383354187012, + "rewards/rejected": -8.475793838500977, + "step": 3802 + }, + { + "epoch": 0.34746459570580174, + "grad_norm": 2.375, + "kl": 0.0, + "learning_rate": 7.323273093265742e-06, + "logits/chosen": 537071744.0, + "logits/rejected": 373427379.2, + "logps/chosen": -348.1189371744792, + "logps/rejected": -308.309716796875, + "loss": 0.0105, + "rewards/chosen": 3.875694910685221, + "rewards/margins": 10.36249148050944, + "rewards/rejected": -6.486796569824219, + "step": 3803 + }, + { + "epoch": 0.34755596162631336, + "grad_norm": 11.8125, + "kl": 0.0, + "learning_rate": 7.3219998369668155e-06, + "logits/chosen": 785934592.0, + "logits/rejected": 702969536.0, + "logps/chosen": -226.4619903564453, + "logps/rejected": -276.97869873046875, + "loss": 0.0692, + "rewards/chosen": 3.228419542312622, + "rewards/margins": 9.064515829086304, + "rewards/rejected": -5.836096286773682, + "step": 3804 + }, + { + "epoch": 0.34764732754682504, + "grad_norm": 2.40625, + "kl": 0.0, + "learning_rate": 7.320726388660017e-06, + "logits/chosen": 459209216.0, + "logits/rejected": 382271488.0, + "logps/chosen": -249.0597381591797, + "logps/rejected": -463.2477213541667, + "loss": 0.0122, + "rewards/chosen": 3.0152297019958496, + "rewards/margins": 11.702125072479248, + "rewards/rejected": -8.686895370483398, + "step": 3805 + }, + { + "epoch": 0.34773869346733666, + "grad_norm": 2.703125, + "kl": 0.0, + "learning_rate": 7.319452748450646e-06, + "logits/chosen": 391086848.0, + "logits/rejected": 366116778.6666667, + "logps/chosen": -322.2142578125, + "logps/rejected": -428.59765625, + "loss": 0.0227, + "rewards/chosen": 3.7721317291259764, + "rewards/margins": 12.991499710083009, + "rewards/rejected": -9.219367980957031, + "step": 3806 + }, + { + "epoch": 0.34783005938784833, + "grad_norm": 3.984375, + "kl": 0.0, + "learning_rate": 7.3181789164440255e-06, + "logits/chosen": 607070592.0, + "logits/rejected": 422037248.0, + "logps/chosen": -465.4757080078125, + "logps/rejected": -671.2926635742188, + "loss": 0.1302, + "rewards/chosen": 2.05668044090271, + "rewards/margins": 12.552253484725952, + "rewards/rejected": -10.495573043823242, + "step": 3807 + }, + { + "epoch": 0.34792142530835996, + "grad_norm": 6.625, + "kl": 0.0, + "learning_rate": 7.3169048927454844e-06, + "logits/chosen": 590181312.0, + "logits/rejected": 508677856.0, + "logps/chosen": -188.42422485351562, + "logps/rejected": -520.7213134765625, + "loss": 0.0289, + "rewards/chosen": 3.579357624053955, + "rewards/margins": 10.76618242263794, + "rewards/rejected": -7.186824798583984, + "step": 3808 + }, + { + "epoch": 0.34801279122887163, + "grad_norm": 1.7578125, + "kl": 0.0, + "learning_rate": 7.315630677460378e-06, + "logits/chosen": 282582784.0, + "logits/rejected": 573227690.6666666, + "logps/chosen": -232.717333984375, + "logps/rejected": -514.4271647135416, + "loss": 0.0129, + "rewards/chosen": 4.878913497924804, + "rewards/margins": 13.669804509480795, + "rewards/rejected": -8.79089101155599, + "step": 3809 + }, + { + "epoch": 0.34810415714938325, + "grad_norm": 2.953125, + "kl": 0.0, + "learning_rate": 7.314356270694067e-06, + "logits/chosen": 558995968.0, + "logits/rejected": 500409395.2, + "logps/chosen": -344.3064778645833, + "logps/rejected": -527.18359375, + "loss": 0.0161, + "rewards/chosen": 3.271538416544596, + "rewards/margins": 11.29011141459147, + "rewards/rejected": -8.018572998046874, + "step": 3810 + }, + { + "epoch": 0.34819552306989493, + "grad_norm": 1.1171875, + "kl": 0.0, + "learning_rate": 7.313081672551937e-06, + "logits/chosen": 500956064.0, + "logits/rejected": 968893781.3333334, + "logps/chosen": -182.90350341796875, + "logps/rejected": -589.4551595052084, + "loss": 0.1207, + "rewards/chosen": 1.2880252599716187, + "rewards/margins": 11.672745744387308, + "rewards/rejected": -10.38472048441569, + "step": 3811 + }, + { + "epoch": 0.34828688899040655, + "grad_norm": 2.53125, + "kl": 0.0, + "learning_rate": 7.311806883139383e-06, + "logits/chosen": 520783701.3333333, + "logits/rejected": 330974105.6, + "logps/chosen": -336.2624918619792, + "logps/rejected": -349.46591796875, + "loss": 0.0124, + "rewards/chosen": 3.4069722493489585, + "rewards/margins": 11.682740529378256, + "rewards/rejected": -8.275768280029297, + "step": 3812 + }, + { + "epoch": 0.34837825491091823, + "grad_norm": 3.015625, + "kl": 0.0, + "learning_rate": 7.31053190256182e-06, + "logits/chosen": 494283161.6, + "logits/rejected": 479058005.3333333, + "logps/chosen": -301.873681640625, + "logps/rejected": -518.6407877604166, + "loss": 0.0201, + "rewards/chosen": 3.6979110717773436, + "rewards/margins": 12.681268183390298, + "rewards/rejected": -8.983357111612955, + "step": 3813 + }, + { + "epoch": 0.34846962083142985, + "grad_norm": 38.5, + "kl": 0.0, + "learning_rate": 7.3092567309246745e-06, + "logits/chosen": 579472691.2, + "logits/rejected": 521928789.3333333, + "logps/chosen": -391.558544921875, + "logps/rejected": -481.9388834635417, + "loss": 0.0566, + "rewards/chosen": 2.3664960861206055, + "rewards/margins": 12.251371065775553, + "rewards/rejected": -9.884874979654947, + "step": 3814 + }, + { + "epoch": 0.3485609867519415, + "grad_norm": 2.8125, + "kl": 0.0, + "learning_rate": 7.307981368333394e-06, + "logits/chosen": 1642018099.2, + "logits/rejected": 708538197.3333334, + "logps/chosen": -463.627685546875, + "logps/rejected": -454.6624348958333, + "loss": 0.0217, + "rewards/chosen": 3.5928024291992187, + "rewards/margins": 12.951694869995118, + "rewards/rejected": -9.358892440795898, + "step": 3815 + }, + { + "epoch": 0.34865235267245315, + "grad_norm": 2.15625, + "kl": 0.0, + "learning_rate": 7.30670581489344e-06, + "logits/chosen": 607592618.6666666, + "logits/rejected": 553440358.4, + "logps/chosen": -205.14774576822916, + "logps/rejected": -443.655078125, + "loss": 0.0177, + "rewards/chosen": 3.2913843790690103, + "rewards/margins": 14.15292765299479, + "rewards/rejected": -10.861543273925781, + "step": 3816 + }, + { + "epoch": 0.3487437185929648, + "grad_norm": 5.0625, + "kl": 0.0, + "learning_rate": 7.305430070710287e-06, + "logits/chosen": 539692800.0, + "logits/rejected": 523338624.0, + "logps/chosen": -279.3632080078125, + "logps/rejected": -275.8220621744792, + "loss": 0.0082, + "rewards/chosen": 5.389655303955078, + "rewards/margins": 11.314128112792968, + "rewards/rejected": -5.924472808837891, + "step": 3817 + }, + { + "epoch": 0.34883508451347645, + "grad_norm": 1.5, + "kl": 0.0, + "learning_rate": 7.304154135889425e-06, + "logits/chosen": 193959637.33333334, + "logits/rejected": 456180377.6, + "logps/chosen": -140.4205118815104, + "logps/rejected": -554.21796875, + "loss": 0.013, + "rewards/chosen": 4.147693951924642, + "rewards/margins": 14.614399274190266, + "rewards/rejected": -10.466705322265625, + "step": 3818 + }, + { + "epoch": 0.3489264504339881, + "grad_norm": 4.1875, + "kl": 1.1145057678222656, + "learning_rate": 7.302878010536365e-06, + "logits/chosen": 627107547.4285715, + "logits/rejected": 415759232.0, + "logps/chosen": -333.36558314732144, + "logps/rejected": -426.3476867675781, + "loss": 0.0363, + "rewards/chosen": 3.454958234514509, + "rewards/margins": 12.322219167436872, + "rewards/rejected": -8.867260932922363, + "step": 3819 + }, + { + "epoch": 0.3490178163544998, + "grad_norm": 1.828125, + "kl": 0.0, + "learning_rate": 7.301601694756632e-06, + "logits/chosen": 363911594.6666667, + "logits/rejected": 361331040.0, + "logps/chosen": -297.74294026692706, + "logps/rejected": -324.443359375, + "loss": 0.1346, + "rewards/chosen": 2.8150370915730796, + "rewards/margins": 10.650577863057455, + "rewards/rejected": -7.835540771484375, + "step": 3820 + }, + { + "epoch": 0.3491091822750114, + "grad_norm": 2.65625, + "kl": 0.0, + "learning_rate": 7.300325188655762e-06, + "logits/chosen": 345696128.0, + "logits/rejected": 703523264.0, + "logps/chosen": -195.1586456298828, + "logps/rejected": -686.7589111328125, + "loss": 0.1302, + "rewards/chosen": 1.9966580867767334, + "rewards/margins": 11.286760091781616, + "rewards/rejected": -9.290102005004883, + "step": 3821 + }, + { + "epoch": 0.3492005481955231, + "grad_norm": 15.5, + "kl": 0.0, + "learning_rate": 7.299048492339312e-06, + "logits/chosen": 936526643.2, + "logits/rejected": 1147426645.3333333, + "logps/chosen": -564.7109375, + "logps/rejected": -563.0357259114584, + "loss": 0.0509, + "rewards/chosen": 3.268389892578125, + "rewards/margins": 11.407272847493491, + "rewards/rejected": -8.138882954915365, + "step": 3822 + }, + { + "epoch": 0.3492919141160347, + "grad_norm": 3.03125, + "kl": 0.0, + "learning_rate": 7.297771605912853e-06, + "logits/chosen": 654338304.0, + "logits/rejected": 436043264.0, + "logps/chosen": -228.49288940429688, + "logps/rejected": -576.2464599609375, + "loss": 0.0246, + "rewards/chosen": 3.1723923683166504, + "rewards/margins": 12.047473430633545, + "rewards/rejected": -8.875081062316895, + "step": 3823 + }, + { + "epoch": 0.3493832800365464, + "grad_norm": 98.5, + "kl": 0.0, + "learning_rate": 7.296494529481972e-06, + "logits/chosen": 449466624.0, + "logits/rejected": 501638016.0, + "logps/chosen": -194.33082580566406, + "logps/rejected": -477.0855305989583, + "loss": 0.0496, + "rewards/chosen": 1.509484887123108, + "rewards/margins": 11.443612694740295, + "rewards/rejected": -9.934127807617188, + "step": 3824 + }, + { + "epoch": 0.349474645957058, + "grad_norm": 5.09375, + "kl": 0.0, + "learning_rate": 7.29521726315227e-06, + "logits/chosen": 569692544.0, + "logits/rejected": 430618752.0, + "logps/chosen": -461.40985107421875, + "logps/rejected": -357.0872802734375, + "loss": 0.0305, + "rewards/chosen": 2.832780599594116, + "rewards/margins": 11.428025960922241, + "rewards/rejected": -8.595245361328125, + "step": 3825 + }, + { + "epoch": 0.3495660118775697, + "grad_norm": 2.078125, + "kl": 0.0, + "learning_rate": 7.2939398070293665e-06, + "logits/chosen": 486617792.0, + "logits/rejected": 380761173.3333333, + "logps/chosen": -246.09475708007812, + "logps/rejected": -382.9635009765625, + "loss": 0.0102, + "rewards/chosen": 3.4303460121154785, + "rewards/margins": 11.708779493967691, + "rewards/rejected": -8.278433481852213, + "step": 3826 + }, + { + "epoch": 0.3496573777980813, + "grad_norm": 8.8125, + "kl": 0.0, + "learning_rate": 7.292662161218894e-06, + "logits/chosen": 620923099.4285715, + "logits/rejected": 678174144.0, + "logps/chosen": -336.7857142857143, + "logps/rejected": -602.1377563476562, + "loss": 0.0376, + "rewards/chosen": 3.4370215279715404, + "rewards/margins": 12.547733579363141, + "rewards/rejected": -9.110712051391602, + "step": 3827 + }, + { + "epoch": 0.349748743718593, + "grad_norm": 4.21875, + "kl": 0.0, + "learning_rate": 7.291384325826505e-06, + "logits/chosen": 341718491.4285714, + "logits/rejected": 665083264.0, + "logps/chosen": -304.2725306919643, + "logps/rejected": -535.2525634765625, + "loss": 0.0287, + "rewards/chosen": 3.985279083251953, + "rewards/margins": 13.429126739501953, + "rewards/rejected": -9.44384765625, + "step": 3828 + }, + { + "epoch": 0.3498401096391046, + "grad_norm": 3.03125, + "kl": 0.0, + "learning_rate": 7.29010630095786e-06, + "logits/chosen": 627516970.6666666, + "logits/rejected": 603003596.8, + "logps/chosen": -274.568115234375, + "logps/rejected": -621.28515625, + "loss": 0.0177, + "rewards/chosen": 4.186128298441569, + "rewards/margins": 14.196998659769694, + "rewards/rejected": -10.010870361328125, + "step": 3829 + }, + { + "epoch": 0.3499314755596163, + "grad_norm": 2.765625, + "kl": 0.0, + "learning_rate": 7.288828086718643e-06, + "logits/chosen": 469683232.0, + "logits/rejected": 386417056.0, + "logps/chosen": -244.0639190673828, + "logps/rejected": -642.4949951171875, + "loss": 0.0193, + "rewards/chosen": 3.3580541610717773, + "rewards/margins": 13.824042320251465, + "rewards/rejected": -10.465988159179688, + "step": 3830 + }, + { + "epoch": 0.3500228414801279, + "grad_norm": 3.21875, + "kl": 0.0, + "learning_rate": 7.287549683214551e-06, + "logits/chosen": 347497770.6666667, + "logits/rejected": 279185856.0, + "logps/chosen": -263.2764892578125, + "logps/rejected": -395.4250183105469, + "loss": 0.0213, + "rewards/chosen": 4.026732444763184, + "rewards/margins": 12.969364166259766, + "rewards/rejected": -8.942631721496582, + "step": 3831 + }, + { + "epoch": 0.3501142074006396, + "grad_norm": 1.6640625, + "kl": 0.0, + "learning_rate": 7.286271090551293e-06, + "logits/chosen": 286462822.4, + "logits/rejected": 270819114.6666667, + "logps/chosen": -223.8414794921875, + "logps/rejected": -293.87115478515625, + "loss": 0.0115, + "rewards/chosen": 4.074515533447266, + "rewards/margins": 12.844332885742187, + "rewards/rejected": -8.769817352294922, + "step": 3832 + }, + { + "epoch": 0.3502055733211512, + "grad_norm": 4.84375, + "kl": 0.0, + "learning_rate": 7.2849923088345995e-06, + "logits/chosen": 533627699.2, + "logits/rejected": 952267605.3333334, + "logps/chosen": -183.4870361328125, + "logps/rejected": -953.111083984375, + "loss": 0.0403, + "rewards/chosen": 2.8481197357177734, + "rewards/margins": 15.866968154907227, + "rewards/rejected": -13.018848419189453, + "step": 3833 + }, + { + "epoch": 0.3502969392416629, + "grad_norm": 6.40625, + "kl": 0.0, + "learning_rate": 7.283713338170213e-06, + "logits/chosen": 541247744.0, + "logits/rejected": 766563904.0, + "logps/chosen": -381.7866516113281, + "logps/rejected": -537.8013916015625, + "loss": 0.0261, + "rewards/chosen": 3.6544189453125, + "rewards/margins": 13.213239669799805, + "rewards/rejected": -9.558820724487305, + "step": 3834 + }, + { + "epoch": 0.3503883051621745, + "grad_norm": 3.28125, + "kl": 0.0, + "learning_rate": 7.282434178663892e-06, + "logits/chosen": 762796339.2, + "logits/rejected": 1401845418.6666667, + "logps/chosen": -468.756689453125, + "logps/rejected": -627.8651123046875, + "loss": 0.0246, + "rewards/chosen": 3.240275573730469, + "rewards/margins": 12.23400421142578, + "rewards/rejected": -8.993728637695312, + "step": 3835 + }, + { + "epoch": 0.3504796710826862, + "grad_norm": 2.453125, + "kl": 0.0, + "learning_rate": 7.281154830421413e-06, + "logits/chosen": 331892787.2, + "logits/rejected": 545128021.3333334, + "logps/chosen": -236.6799072265625, + "logps/rejected": -733.4632975260416, + "loss": 0.0133, + "rewards/chosen": 4.594272613525391, + "rewards/margins": 14.14007822672526, + "rewards/rejected": -9.54580561319987, + "step": 3836 + }, + { + "epoch": 0.3505710370031978, + "grad_norm": 0.92578125, + "kl": 0.0, + "learning_rate": 7.2798752935485635e-06, + "logits/chosen": 275759232.0, + "logits/rejected": 552338176.0, + "logps/chosen": -287.84490966796875, + "logps/rejected": -442.4962565104167, + "loss": 0.0042, + "rewards/chosen": 5.064267158508301, + "rewards/margins": 13.250981330871582, + "rewards/rejected": -8.186714172363281, + "step": 3837 + }, + { + "epoch": 0.3506624029237095, + "grad_norm": 0.53515625, + "kl": 0.0, + "learning_rate": 7.27859556815115e-06, + "logits/chosen": 188520160.0, + "logits/rejected": 549582336.0, + "logps/chosen": -75.02328491210938, + "logps/rejected": -488.1024693080357, + "loss": 0.0023, + "rewards/chosen": 4.102206707000732, + "rewards/margins": 13.036706175122942, + "rewards/rejected": -8.93449946812221, + "step": 3838 + }, + { + "epoch": 0.3507537688442211, + "grad_norm": 5.1875, + "kl": 0.0, + "learning_rate": 7.2773156543349965e-06, + "logits/chosen": 712119125.3333334, + "logits/rejected": 325168672.0, + "logps/chosen": -422.3382568359375, + "logps/rejected": -534.8964233398438, + "loss": 0.1317, + "rewards/chosen": 2.6272644996643066, + "rewards/margins": 12.809390544891357, + "rewards/rejected": -10.18212604522705, + "step": 3839 + }, + { + "epoch": 0.3508451347647328, + "grad_norm": 1.046875, + "kl": 0.0, + "learning_rate": 7.276035552205937e-06, + "logits/chosen": 1281424768.0, + "logits/rejected": 365745792.0, + "logps/chosen": -325.8314208984375, + "logps/rejected": -408.2576090494792, + "loss": 0.0065, + "rewards/chosen": 3.739743232727051, + "rewards/margins": 12.430143674214682, + "rewards/rejected": -8.69040044148763, + "step": 3840 + }, + { + "epoch": 0.3509365006852444, + "grad_norm": 7.46875, + "kl": 0.0, + "learning_rate": 7.274755261869826e-06, + "logits/chosen": 634287616.0, + "logits/rejected": 653764300.8, + "logps/chosen": -346.8839925130208, + "logps/rejected": -366.9388671875, + "loss": 0.0261, + "rewards/chosen": 3.228546142578125, + "rewards/margins": 10.188617706298828, + "rewards/rejected": -6.960071563720703, + "step": 3841 + }, + { + "epoch": 0.3510278666057561, + "grad_norm": 14.375, + "kl": 0.0, + "learning_rate": 7.27347478343253e-06, + "logits/chosen": 542890410.6666666, + "logits/rejected": 939781222.4, + "logps/chosen": -189.30623372395834, + "logps/rejected": -296.516357421875, + "loss": 0.2337, + "rewards/chosen": 1.393214225769043, + "rewards/margins": 7.753273963928223, + "rewards/rejected": -6.36005973815918, + "step": 3842 + }, + { + "epoch": 0.3511192325262677, + "grad_norm": 1.03125, + "kl": 0.0, + "learning_rate": 7.272194116999936e-06, + "logits/chosen": 445705216.0, + "logits/rejected": 535201237.3333333, + "logps/chosen": -242.4439453125, + "logps/rejected": -240.98311360677084, + "loss": 0.0073, + "rewards/chosen": 4.728411102294922, + "rewards/margins": 11.442673110961914, + "rewards/rejected": -6.714262008666992, + "step": 3843 + }, + { + "epoch": 0.3512105984467794, + "grad_norm": 34.75, + "kl": 0.0, + "learning_rate": 7.270913262677939e-06, + "logits/chosen": 473351765.3333333, + "logits/rejected": 598081152.0, + "logps/chosen": -233.42728678385416, + "logps/rejected": -580.2431030273438, + "loss": 0.0911, + "rewards/chosen": 3.1666313807169595, + "rewards/margins": 12.798945109049479, + "rewards/rejected": -9.63231372833252, + "step": 3844 + }, + { + "epoch": 0.351301964367291, + "grad_norm": 4.46875, + "kl": 0.0, + "learning_rate": 7.269632220572456e-06, + "logits/chosen": 512651366.4, + "logits/rejected": 392895786.6666667, + "logps/chosen": -392.3275390625, + "logps/rejected": -342.710205078125, + "loss": 0.0258, + "rewards/chosen": 3.8306304931640627, + "rewards/margins": 10.594723892211913, + "rewards/rejected": -6.764093399047852, + "step": 3845 + }, + { + "epoch": 0.3513933302878027, + "grad_norm": 14.6875, + "kl": 0.0, + "learning_rate": 7.268350990789415e-06, + "logits/chosen": 420892313.6, + "logits/rejected": 802984618.6666666, + "logps/chosen": -211.9040283203125, + "logps/rejected": -174.0784912109375, + "loss": 0.1286, + "rewards/chosen": 3.182788848876953, + "rewards/margins": 8.324147860209148, + "rewards/rejected": -5.141359011332194, + "step": 3846 + }, + { + "epoch": 0.3514846962083143, + "grad_norm": 2.71875, + "kl": 0.0, + "learning_rate": 7.267069573434767e-06, + "logits/chosen": 560721664.0, + "logits/rejected": 417365350.4, + "logps/chosen": -213.24454752604166, + "logps/rejected": -241.8505859375, + "loss": 0.0136, + "rewards/chosen": 4.229048411051433, + "rewards/margins": 11.400560251871745, + "rewards/rejected": -7.171511840820313, + "step": 3847 + }, + { + "epoch": 0.351576062128826, + "grad_norm": 2.421875, + "kl": 0.0, + "learning_rate": 7.265787968614467e-06, + "logits/chosen": 579563306.6666666, + "logits/rejected": 426535705.6, + "logps/chosen": -422.3262939453125, + "logps/rejected": -472.925, + "loss": 0.0186, + "rewards/chosen": 3.252307891845703, + "rewards/margins": 12.189362335205079, + "rewards/rejected": -8.937054443359376, + "step": 3848 + }, + { + "epoch": 0.3516674280493376, + "grad_norm": 0.62109375, + "kl": 0.0, + "learning_rate": 7.264506176434498e-06, + "logits/chosen": 446666410.6666667, + "logits/rejected": 495019929.6, + "logps/chosen": -272.5458577473958, + "logps/rejected": -598.801318359375, + "loss": 0.0036, + "rewards/chosen": 4.9945831298828125, + "rewards/margins": 13.822615051269532, + "rewards/rejected": -8.828031921386719, + "step": 3849 + }, + { + "epoch": 0.35175879396984927, + "grad_norm": 34.75, + "kl": 0.0, + "learning_rate": 7.263224197000847e-06, + "logits/chosen": 595889216.0, + "logits/rejected": 311213632.0, + "logps/chosen": -253.34375, + "logps/rejected": -352.8233947753906, + "loss": 0.0351, + "rewards/chosen": 3.1941685676574707, + "rewards/margins": 10.300743579864502, + "rewards/rejected": -7.106575012207031, + "step": 3850 + }, + { + "epoch": 0.3518501598903609, + "grad_norm": 0.99609375, + "kl": 0.0, + "learning_rate": 7.261942030419525e-06, + "logits/chosen": 404270933.3333333, + "logits/rejected": 665717350.4, + "logps/chosen": -207.43513997395834, + "logps/rejected": -423.502294921875, + "loss": 0.0311, + "rewards/chosen": 3.803011894226074, + "rewards/margins": 11.740316581726074, + "rewards/rejected": -7.9373046875, + "step": 3851 + }, + { + "epoch": 0.35194152581087257, + "grad_norm": 0.2353515625, + "kl": 0.0, + "learning_rate": 7.260659676796555e-06, + "logits/rejected": 732233216.0, + "logps/rejected": -511.0384521484375, + "loss": 0.0011, + "rewards/rejected": -8.064594268798828, + "step": 3852 + }, + { + "epoch": 0.3520328917313842, + "grad_norm": 4.90625, + "kl": 0.0, + "learning_rate": 7.259377136237975e-06, + "logits/chosen": 467576729.6, + "logits/rejected": 492047018.6666667, + "logps/chosen": -362.09716796875, + "logps/rejected": -286.71213785807294, + "loss": 0.1391, + "rewards/chosen": 3.6997108459472656, + "rewards/margins": 8.103664080301922, + "rewards/rejected": -4.403953234354655, + "step": 3853 + }, + { + "epoch": 0.35212425765189587, + "grad_norm": 2.40625, + "kl": 0.0, + "learning_rate": 7.258094408849839e-06, + "logits/chosen": 1040138432.0, + "logits/rejected": 590919552.0, + "logps/chosen": -497.4123229980469, + "logps/rejected": -543.7957153320312, + "loss": 0.0128, + "rewards/chosen": 4.067012786865234, + "rewards/margins": 15.454602241516113, + "rewards/rejected": -11.387589454650879, + "step": 3854 + }, + { + "epoch": 0.3522156235724075, + "grad_norm": 17.0, + "kl": 0.0, + "learning_rate": 7.256811494738217e-06, + "logits/chosen": 534634592.0, + "logits/rejected": 931212288.0, + "logps/chosen": -327.1011962890625, + "logps/rejected": -319.7493082682292, + "loss": 0.0233, + "rewards/chosen": 3.2706665992736816, + "rewards/margins": 11.023633162180584, + "rewards/rejected": -7.752966562906901, + "step": 3855 + }, + { + "epoch": 0.35230698949291916, + "grad_norm": 22.25, + "kl": 0.0, + "learning_rate": 7.255528394009195e-06, + "logits/chosen": 608125184.0, + "logits/rejected": 373423402.6666667, + "logps/chosen": -293.3912353515625, + "logps/rejected": -413.1270345052083, + "loss": 0.0222, + "rewards/chosen": 3.1190261840820312, + "rewards/margins": 11.065316518147785, + "rewards/rejected": -7.946290334065755, + "step": 3856 + }, + { + "epoch": 0.3523983554134308, + "grad_norm": 1.2890625, + "kl": 0.0, + "learning_rate": 7.254245106768872e-06, + "logits/chosen": 256368486.4, + "logits/rejected": 540784768.0, + "logps/chosen": -229.8222412109375, + "logps/rejected": -587.7676188151041, + "loss": 0.0062, + "rewards/chosen": 4.8846794128417965, + "rewards/margins": 14.09194590250651, + "rewards/rejected": -9.207266489664713, + "step": 3857 + }, + { + "epoch": 0.35248972133394246, + "grad_norm": 3.765625, + "kl": 0.0, + "learning_rate": 7.252961633123365e-06, + "logits/chosen": 509500245.3333333, + "logits/rejected": 332816486.4, + "logps/chosen": -373.0017903645833, + "logps/rejected": -421.1998046875, + "loss": 0.0191, + "rewards/chosen": 3.2747678756713867, + "rewards/margins": 13.607937431335449, + "rewards/rejected": -10.333169555664062, + "step": 3858 + }, + { + "epoch": 0.3525810872544541, + "grad_norm": 2.703125, + "kl": 0.0, + "learning_rate": 7.251677973178805e-06, + "logits/chosen": 847225216.0, + "logits/rejected": 383315029.3333333, + "logps/chosen": -696.7574462890625, + "logps/rejected": -417.3138834635417, + "loss": 0.0131, + "rewards/chosen": 3.2351105213165283, + "rewards/margins": 12.298131227493286, + "rewards/rejected": -9.063020706176758, + "step": 3859 + }, + { + "epoch": 0.35267245317496576, + "grad_norm": 3.140625, + "kl": 0.0, + "learning_rate": 7.25039412704134e-06, + "logits/chosen": 642641578.6666666, + "logits/rejected": 561902745.6, + "logps/chosen": -249.24102783203125, + "logps/rejected": -467.81484375, + "loss": 0.0165, + "rewards/chosen": 3.165520668029785, + "rewards/margins": 13.566233253479004, + "rewards/rejected": -10.400712585449218, + "step": 3860 + }, + { + "epoch": 0.3527638190954774, + "grad_norm": 2.3125, + "kl": 0.0, + "learning_rate": 7.249110094817129e-06, + "logits/chosen": 516671914.6666667, + "logits/rejected": 566387302.4, + "logps/chosen": -112.6350606282552, + "logps/rejected": -430.267529296875, + "loss": 0.0227, + "rewards/chosen": 3.3314825693766275, + "rewards/margins": 11.764491907755533, + "rewards/rejected": -8.433009338378906, + "step": 3861 + }, + { + "epoch": 0.35285518501598906, + "grad_norm": 3.453125, + "kl": 0.0, + "learning_rate": 7.247825876612353e-06, + "logits/chosen": 530171296.0, + "logits/rejected": 520974528.0, + "logps/chosen": -298.9043884277344, + "logps/rejected": -388.3620300292969, + "loss": 0.0353, + "rewards/chosen": 2.8294081687927246, + "rewards/margins": 10.96375322341919, + "rewards/rejected": -8.134345054626465, + "step": 3862 + }, + { + "epoch": 0.3529465509365007, + "grad_norm": 3.203125, + "kl": 0.0, + "learning_rate": 7.246541472533203e-06, + "logits/chosen": 578640486.4, + "logits/rejected": 477191424.0, + "logps/chosen": -304.3432861328125, + "logps/rejected": -544.4869791666666, + "loss": 0.0224, + "rewards/chosen": 3.7936779022216798, + "rewards/margins": 11.507926559448242, + "rewards/rejected": -7.7142486572265625, + "step": 3863 + }, + { + "epoch": 0.35303791685701236, + "grad_norm": 6.4375, + "kl": 0.0, + "learning_rate": 7.245256882685888e-06, + "logits/chosen": 422706240.0, + "logits/rejected": 332703040.0, + "logps/chosen": -224.4334716796875, + "logps/rejected": -435.47662353515625, + "loss": 0.0406, + "rewards/chosen": 3.2269883155822754, + "rewards/margins": 12.535254955291748, + "rewards/rejected": -9.308266639709473, + "step": 3864 + }, + { + "epoch": 0.353129282777524, + "grad_norm": 2.109375, + "kl": 0.0, + "learning_rate": 7.243972107176631e-06, + "logits/chosen": 384489760.0, + "logits/rejected": 625181952.0, + "logps/chosen": -232.5093994140625, + "logps/rejected": -443.47198486328125, + "loss": 0.0104, + "rewards/chosen": 4.576013565063477, + "rewards/margins": 12.24324893951416, + "rewards/rejected": -7.667235374450684, + "step": 3865 + }, + { + "epoch": 0.35322064869803566, + "grad_norm": 5.28125, + "kl": 0.0, + "learning_rate": 7.2426871461116735e-06, + "logits/chosen": 459887257.6, + "logits/rejected": 724297472.0, + "logps/chosen": -270.49130859375, + "logps/rejected": -304.3618977864583, + "loss": 0.1227, + "rewards/chosen": 4.022213745117187, + "rewards/margins": 7.8151961644490555, + "rewards/rejected": -3.7929824193318686, + "step": 3866 + }, + { + "epoch": 0.3533120146185473, + "grad_norm": 1.8046875, + "kl": 0.0, + "learning_rate": 7.241401999597266e-06, + "logits/chosen": 369983846.4, + "logits/rejected": 371460053.3333333, + "logps/chosen": -255.1275634765625, + "logps/rejected": -279.26145426432294, + "loss": 0.1176, + "rewards/chosen": 4.39642562866211, + "rewards/margins": 9.615682983398438, + "rewards/rejected": -5.219257354736328, + "step": 3867 + }, + { + "epoch": 0.35340338053905895, + "grad_norm": 6.46875, + "kl": 0.0, + "learning_rate": 7.240116667739681e-06, + "logits/chosen": 456651673.6, + "logits/rejected": 575878314.6666666, + "logps/chosen": -201.153076171875, + "logps/rejected": -660.5867513020834, + "loss": 0.0501, + "rewards/chosen": 2.500537109375, + "rewards/margins": 14.525305938720702, + "rewards/rejected": -12.024768829345703, + "step": 3868 + }, + { + "epoch": 0.3534947464595706, + "grad_norm": 3.78125, + "kl": 0.0, + "learning_rate": 7.238831150645203e-06, + "logits/chosen": 567206528.0, + "logits/rejected": 1054445440.0, + "logps/chosen": -386.617431640625, + "logps/rejected": -570.2594604492188, + "loss": 0.0166, + "rewards/chosen": 4.159183025360107, + "rewards/margins": 13.961961269378662, + "rewards/rejected": -9.802778244018555, + "step": 3869 + }, + { + "epoch": 0.35358611238008225, + "grad_norm": 9.375, + "kl": 0.0, + "learning_rate": 7.237545448420131e-06, + "logits/chosen": 951807104.0, + "logits/rejected": 531540699.4285714, + "logps/chosen": -164.8682403564453, + "logps/rejected": -464.8615025111607, + "loss": 0.091, + "rewards/chosen": 3.9494400024414062, + "rewards/margins": 10.844130379813059, + "rewards/rejected": -6.894690377371652, + "step": 3870 + }, + { + "epoch": 0.3536774783005939, + "grad_norm": 4.03125, + "kl": 0.0, + "learning_rate": 7.236259561170783e-06, + "logits/chosen": 565338496.0, + "logits/rejected": 643641600.0, + "logps/chosen": -337.8201904296875, + "logps/rejected": -639.94677734375, + "loss": 0.0217, + "rewards/chosen": 3.3495001792907715, + "rewards/margins": 13.287034511566162, + "rewards/rejected": -9.93753433227539, + "step": 3871 + }, + { + "epoch": 0.35376884422110555, + "grad_norm": 7.46875, + "kl": 0.0, + "learning_rate": 7.234973489003488e-06, + "logits/chosen": 220832864.0, + "logits/rejected": 574237269.3333334, + "logps/chosen": -323.2057189941406, + "logps/rejected": -354.8424479166667, + "loss": 0.0131, + "rewards/chosen": 3.8722658157348633, + "rewards/margins": 10.791982332865398, + "rewards/rejected": -6.919716517130534, + "step": 3872 + }, + { + "epoch": 0.35386021014161717, + "grad_norm": 7.71875, + "kl": 0.0, + "learning_rate": 7.233687232024595e-06, + "logits/chosen": 361747413.3333333, + "logits/rejected": 410177984.0, + "logps/chosen": -261.7204182942708, + "logps/rejected": -570.4317016601562, + "loss": 0.0442, + "rewards/chosen": 3.9684365590413413, + "rewards/margins": 8.008907636006674, + "rewards/rejected": -4.040471076965332, + "step": 3873 + }, + { + "epoch": 0.35395157606212885, + "grad_norm": 3.515625, + "kl": 0.0, + "learning_rate": 7.23240079034046e-06, + "logits/chosen": 350134112.0, + "logits/rejected": 697669504.0, + "logps/chosen": -262.54827880859375, + "logps/rejected": -407.4431457519531, + "loss": 0.0306, + "rewards/chosen": 2.901254653930664, + "rewards/margins": 11.713285446166992, + "rewards/rejected": -8.812030792236328, + "step": 3874 + }, + { + "epoch": 0.35404294198264047, + "grad_norm": 3.15625, + "kl": 0.0, + "learning_rate": 7.2311141640574655e-06, + "logits/chosen": 656144192.0, + "logits/rejected": 734623872.0, + "logps/chosen": -264.65118408203125, + "logps/rejected": -684.5430908203125, + "loss": 0.0191, + "rewards/chosen": 3.6347620487213135, + "rewards/margins": 14.227676630020142, + "rewards/rejected": -10.592914581298828, + "step": 3875 + }, + { + "epoch": 0.35413430790315215, + "grad_norm": 25.0, + "kl": 0.0, + "learning_rate": 7.2298273532819995e-06, + "logits/chosen": 662546816.0, + "logits/rejected": 384563168.0, + "logps/chosen": -323.3411560058594, + "logps/rejected": -510.93096923828125, + "loss": 0.0661, + "rewards/chosen": 3.0691635608673096, + "rewards/margins": 14.12847638130188, + "rewards/rejected": -11.05931282043457, + "step": 3876 + }, + { + "epoch": 0.35422567382366377, + "grad_norm": 0.65625, + "kl": 0.0, + "learning_rate": 7.228540358120474e-06, + "logits/chosen": 335302848.0, + "logits/rejected": 362083498.6666667, + "logps/chosen": -155.8950958251953, + "logps/rejected": -480.0091959635417, + "loss": 0.0034, + "rewards/chosen": 4.46588134765625, + "rewards/margins": 13.571797688802084, + "rewards/rejected": -9.105916341145834, + "step": 3877 + }, + { + "epoch": 0.35431703974417544, + "grad_norm": 3.765625, + "kl": 0.0, + "learning_rate": 7.227253178679306e-06, + "logits/chosen": 845376341.3333334, + "logits/rejected": 430068940.8, + "logps/chosen": -245.94510904947916, + "logps/rejected": -425.3908203125, + "loss": 0.0217, + "rewards/chosen": 2.961310068766276, + "rewards/margins": 12.254293314615884, + "rewards/rejected": -9.292983245849609, + "step": 3878 + }, + { + "epoch": 0.35440840566468707, + "grad_norm": 2.859375, + "kl": 0.0, + "learning_rate": 7.225965815064938e-06, + "logits/chosen": 601690624.0, + "logits/rejected": 563989913.6, + "logps/chosen": -571.6348470052084, + "logps/rejected": -433.80087890625, + "loss": 0.014, + "rewards/chosen": 3.658135096232096, + "rewards/margins": 12.461633936564127, + "rewards/rejected": -8.803498840332031, + "step": 3879 + }, + { + "epoch": 0.35449977158519874, + "grad_norm": 2.796875, + "kl": 0.0, + "learning_rate": 7.22467826738382e-06, + "logits/chosen": 379859328.0, + "logits/rejected": 353099161.6, + "logps/chosen": -313.2201334635417, + "logps/rejected": -370.76650390625, + "loss": 0.0114, + "rewards/chosen": 4.000447591145833, + "rewards/margins": 13.560244496663412, + "rewards/rejected": -9.559796905517578, + "step": 3880 + }, + { + "epoch": 0.35459113750571036, + "grad_norm": 3.03125, + "kl": 0.0, + "learning_rate": 7.223390535742422e-06, + "logits/chosen": 530161152.0, + "logits/rejected": 383333632.0, + "logps/chosen": -270.8453857421875, + "logps/rejected": -626.0499674479166, + "loss": 0.0204, + "rewards/chosen": 4.086801528930664, + "rewards/margins": 13.050166447957357, + "rewards/rejected": -8.963364919026693, + "step": 3881 + }, + { + "epoch": 0.35468250342622204, + "grad_norm": 2.984375, + "kl": 0.0, + "learning_rate": 7.222102620247226e-06, + "logits/chosen": 386674261.3333333, + "logits/rejected": 427332172.8, + "logps/chosen": -313.87892659505206, + "logps/rejected": -616.982275390625, + "loss": 0.0154, + "rewards/chosen": 3.9328206380208335, + "rewards/margins": 14.073918660481771, + "rewards/rejected": -10.141098022460938, + "step": 3882 + }, + { + "epoch": 0.35477386934673366, + "grad_norm": 6.75, + "kl": 0.0, + "learning_rate": 7.220814521004732e-06, + "logits/chosen": 1139908710.4, + "logits/rejected": 530900778.6666667, + "logps/chosen": -349.05234375, + "logps/rejected": -575.7441813151041, + "loss": 0.0345, + "rewards/chosen": 3.0650508880615233, + "rewards/margins": 12.126584498087563, + "rewards/rejected": -9.061533610026041, + "step": 3883 + }, + { + "epoch": 0.35486523526724534, + "grad_norm": 3.4375, + "kl": 0.0, + "learning_rate": 7.219526238121455e-06, + "logits/chosen": 383223488.0, + "logits/rejected": 458620970.6666667, + "logps/chosen": -314.04229736328125, + "logps/rejected": -335.7789713541667, + "loss": 0.0171, + "rewards/chosen": 5.031262397766113, + "rewards/margins": 13.560901959737143, + "rewards/rejected": -8.52963956197103, + "step": 3884 + }, + { + "epoch": 0.35495660118775696, + "grad_norm": 6.90625, + "kl": 0.0, + "learning_rate": 7.218237771703921e-06, + "logits/chosen": 784707072.0, + "logits/rejected": 1260390912.0, + "logps/chosen": -318.557763671875, + "logps/rejected": -377.1080729166667, + "loss": 0.046, + "rewards/chosen": 2.7284530639648437, + "rewards/margins": 10.63251978556315, + "rewards/rejected": -7.904066721598308, + "step": 3885 + }, + { + "epoch": 0.35504796710826864, + "grad_norm": 2.625, + "kl": 0.0, + "learning_rate": 7.2169491218586755e-06, + "logits/chosen": 436629312.0, + "logits/rejected": 561111424.0, + "logps/chosen": -305.3471984863281, + "logps/rejected": -598.8347778320312, + "loss": 0.0163, + "rewards/chosen": 4.0499267578125, + "rewards/margins": 12.727766036987305, + "rewards/rejected": -8.677839279174805, + "step": 3886 + }, + { + "epoch": 0.35513933302878026, + "grad_norm": 2.359375, + "kl": 0.0, + "learning_rate": 7.2156602886922785e-06, + "logits/chosen": 469612236.8, + "logits/rejected": 508259498.6666667, + "logps/chosen": -418.783837890625, + "logps/rejected": -459.3522135416667, + "loss": 0.0136, + "rewards/chosen": 4.105417633056641, + "rewards/margins": 13.088976287841797, + "rewards/rejected": -8.983558654785156, + "step": 3887 + }, + { + "epoch": 0.35523069894929193, + "grad_norm": 4.09375, + "kl": 0.0, + "learning_rate": 7.214371272311303e-06, + "logits/chosen": 567016149.3333334, + "logits/rejected": 445749350.4, + "logps/chosen": -347.9984537760417, + "logps/rejected": -531.32119140625, + "loss": 0.0185, + "rewards/chosen": 3.297821044921875, + "rewards/margins": 11.90030517578125, + "rewards/rejected": -8.602484130859375, + "step": 3888 + }, + { + "epoch": 0.35532206486980356, + "grad_norm": 1.1953125, + "kl": 0.0, + "learning_rate": 7.213082072822342e-06, + "logits/chosen": 677200426.6666666, + "logits/rejected": 600372326.4, + "logps/chosen": -226.50408935546875, + "logps/rejected": -718.78681640625, + "loss": 0.0072, + "rewards/chosen": 4.221008936564128, + "rewards/margins": 14.721782557169597, + "rewards/rejected": -10.500773620605468, + "step": 3889 + }, + { + "epoch": 0.35541343079031523, + "grad_norm": 2.515625, + "kl": 0.0, + "learning_rate": 7.211792690331998e-06, + "logits/chosen": 515439957.3333333, + "logits/rejected": 242722841.6, + "logps/chosen": -342.7497151692708, + "logps/rejected": -327.22685546875, + "loss": 0.0195, + "rewards/chosen": 3.0983495712280273, + "rewards/margins": 11.780800437927246, + "rewards/rejected": -8.682450866699218, + "step": 3890 + }, + { + "epoch": 0.35550479671082685, + "grad_norm": 0.859375, + "kl": 0.0, + "learning_rate": 7.210503124946889e-06, + "logits/chosen": 514440448.0, + "logits/rejected": 786693851.4285715, + "logps/chosen": -386.0860595703125, + "logps/rejected": -497.547607421875, + "loss": 0.0028, + "rewards/chosen": 3.8603882789611816, + "rewards/margins": 14.10858120237078, + "rewards/rejected": -10.248192923409599, + "step": 3891 + }, + { + "epoch": 0.35559616263133853, + "grad_norm": 1.40625, + "kl": 0.0, + "learning_rate": 7.209213376773654e-06, + "logits/chosen": 573797120.0, + "logits/rejected": 1088673280.0, + "logps/chosen": -505.2295227050781, + "logps/rejected": -621.0369698660714, + "loss": 0.0038, + "rewards/chosen": 3.5181732177734375, + "rewards/margins": 12.390033176967076, + "rewards/rejected": -8.871859959193639, + "step": 3892 + }, + { + "epoch": 0.35568752855185015, + "grad_norm": 3.6875, + "kl": 0.0, + "learning_rate": 7.20792344591894e-06, + "logits/chosen": 558616576.0, + "logits/rejected": 467806528.0, + "logps/chosen": -373.2669372558594, + "logps/rejected": -450.427734375, + "loss": 0.0248, + "rewards/chosen": 3.0538525581359863, + "rewards/margins": 13.771608829498291, + "rewards/rejected": -10.717756271362305, + "step": 3893 + }, + { + "epoch": 0.35577889447236183, + "grad_norm": 4.84375, + "kl": 0.0, + "learning_rate": 7.2066333324894164e-06, + "logits/chosen": 433161152.0, + "logits/rejected": 597277696.0, + "logps/chosen": -262.0128173828125, + "logps/rejected": -344.70819091796875, + "loss": 0.0334, + "rewards/chosen": 2.8695731163024902, + "rewards/margins": 11.066978931427002, + "rewards/rejected": -8.197405815124512, + "step": 3894 + }, + { + "epoch": 0.35587026039287345, + "grad_norm": 3.6875, + "kl": 0.0, + "learning_rate": 7.20534303659176e-06, + "logits/chosen": 779727168.0, + "logits/rejected": 710179328.0, + "logps/chosen": -595.4882202148438, + "logps/rejected": -666.8179931640625, + "loss": 0.0245, + "rewards/chosen": 3.026738166809082, + "rewards/margins": 10.56127405166626, + "rewards/rejected": -7.534535884857178, + "step": 3895 + }, + { + "epoch": 0.3559616263133851, + "grad_norm": 2.375, + "kl": 0.0, + "learning_rate": 7.204052558332668e-06, + "logits/chosen": 538897344.0, + "logits/rejected": 338014229.3333333, + "logps/chosen": -309.81146240234375, + "logps/rejected": -410.7445068359375, + "loss": 0.013, + "rewards/chosen": 3.1503982543945312, + "rewards/margins": 10.45589828491211, + "rewards/rejected": -7.305500030517578, + "step": 3896 + }, + { + "epoch": 0.35605299223389675, + "grad_norm": 1.625, + "kl": 0.0, + "learning_rate": 7.20276189781885e-06, + "logits/chosen": -29319510.0, + "logits/rejected": 561195849.1428572, + "logps/chosen": -999.05615234375, + "logps/rejected": -568.4223981584821, + "loss": 0.004, + "rewards/chosen": 3.6045167446136475, + "rewards/margins": 13.253703015191215, + "rewards/rejected": -9.649186270577568, + "step": 3897 + }, + { + "epoch": 0.3561443581544084, + "grad_norm": 5.28125, + "kl": 0.0, + "learning_rate": 7.2014710551570324e-06, + "logits/chosen": 896134144.0, + "logits/rejected": 351415232.0, + "logps/chosen": -394.4573567708333, + "logps/rejected": -336.4488220214844, + "loss": 0.0513, + "rewards/chosen": 3.2170416514078775, + "rewards/margins": 13.010956446329752, + "rewards/rejected": -9.793914794921875, + "step": 3898 + }, + { + "epoch": 0.35623572407492005, + "grad_norm": 4.90625, + "kl": 0.0, + "learning_rate": 7.200180030453954e-06, + "logits/chosen": 487690752.0, + "logits/rejected": 420482918.4, + "logps/chosen": -317.3374837239583, + "logps/rejected": -428.81884765625, + "loss": 0.1144, + "rewards/chosen": 3.8200887044270835, + "rewards/margins": 11.230415089925131, + "rewards/rejected": -7.410326385498047, + "step": 3899 + }, + { + "epoch": 0.3563270899954317, + "grad_norm": 3.609375, + "kl": 0.0, + "learning_rate": 7.198888823816373e-06, + "logits/chosen": 458118553.6, + "logits/rejected": 415186474.6666667, + "logps/chosen": -390.908203125, + "logps/rejected": -452.4849446614583, + "loss": 0.0238, + "rewards/chosen": 3.2911285400390624, + "rewards/margins": 14.318490091959635, + "rewards/rejected": -11.027361551920572, + "step": 3900 + }, + { + "epoch": 0.35641845591594334, + "grad_norm": 5.1875, + "kl": 0.0, + "learning_rate": 7.19759743535106e-06, + "logits/chosen": 558777139.2, + "logits/rejected": 538459733.3333334, + "logps/chosen": -476.985498046875, + "logps/rejected": -532.7329915364584, + "loss": 0.0192, + "rewards/chosen": 3.842079925537109, + "rewards/margins": 12.84981486002604, + "rewards/rejected": -9.007734934488932, + "step": 3901 + }, + { + "epoch": 0.356509821836455, + "grad_norm": 8.5, + "kl": 0.0, + "learning_rate": 7.196305865164801e-06, + "logits/chosen": 827895193.6, + "logits/rejected": 620661504.0, + "logps/chosen": -528.4091796875, + "logps/rejected": -515.072021484375, + "loss": 0.1237, + "rewards/chosen": 2.8518911361694337, + "rewards/margins": 11.03858496348063, + "rewards/rejected": -8.186693827311197, + "step": 3902 + }, + { + "epoch": 0.35660118775696664, + "grad_norm": 3.34375, + "kl": 0.0, + "learning_rate": 7.195014113364394e-06, + "logits/chosen": 504645632.0, + "logits/rejected": 298157408.0, + "logps/chosen": -315.7429504394531, + "logps/rejected": -532.0016479492188, + "loss": 0.0084, + "rewards/chosen": 4.908468246459961, + "rewards/margins": 11.343165397644043, + "rewards/rejected": -6.434697151184082, + "step": 3903 + }, + { + "epoch": 0.3566925536774783, + "grad_norm": 4.34375, + "kl": 0.0, + "learning_rate": 7.1937221800566596e-06, + "logits/chosen": 760694186.6666666, + "logits/rejected": 361199712.0, + "logps/chosen": -401.3262125651042, + "logps/rejected": -547.786865234375, + "loss": 0.026, + "rewards/chosen": 3.4631067911783853, + "rewards/margins": 13.855934778849283, + "rewards/rejected": -10.392827987670898, + "step": 3904 + }, + { + "epoch": 0.35678391959798994, + "grad_norm": 1.9375, + "kl": 0.0, + "learning_rate": 7.192430065348424e-06, + "logits/chosen": 564923733.3333334, + "logits/rejected": 623235276.8, + "logps/chosen": -271.7808837890625, + "logps/rejected": -571.21630859375, + "loss": 0.0082, + "rewards/chosen": 4.369709014892578, + "rewards/margins": 13.976348876953125, + "rewards/rejected": -9.606639862060547, + "step": 3905 + }, + { + "epoch": 0.3568752855185016, + "grad_norm": 5.0, + "kl": 0.0, + "learning_rate": 7.191137769346535e-06, + "logits/chosen": 493800405.3333333, + "logits/rejected": 161456832.0, + "logps/chosen": -390.3256429036458, + "logps/rejected": -270.8317565917969, + "loss": 0.0392, + "rewards/chosen": 3.0919577280680337, + "rewards/margins": 13.662474314371744, + "rewards/rejected": -10.570516586303711, + "step": 3906 + }, + { + "epoch": 0.35696665143901324, + "grad_norm": 6.53125, + "kl": 0.0, + "learning_rate": 7.189845292157856e-06, + "logits/chosen": 829113856.0, + "logits/rejected": 748833536.0, + "logps/chosen": -584.57041015625, + "logps/rejected": -532.8829345703125, + "loss": 0.0347, + "rewards/chosen": 3.0685089111328123, + "rewards/margins": 14.787250010172524, + "rewards/rejected": -11.718741099039713, + "step": 3907 + }, + { + "epoch": 0.3570580173595249, + "grad_norm": 3.671875, + "kl": 0.0, + "learning_rate": 7.18855263388926e-06, + "logits/chosen": 558345728.0, + "logits/rejected": 341068138.6666667, + "logps/chosen": -260.9326904296875, + "logps/rejected": -407.208251953125, + "loss": 0.0272, + "rewards/chosen": 3.511149597167969, + "rewards/margins": 13.124757639567058, + "rewards/rejected": -9.613608042399088, + "step": 3908 + }, + { + "epoch": 0.35714938328003654, + "grad_norm": 3.53125, + "kl": 0.0, + "learning_rate": 7.187259794647638e-06, + "logits/chosen": 535406890.6666667, + "logits/rejected": 768833088.0, + "logps/chosen": -234.130126953125, + "logps/rejected": -329.3364562988281, + "loss": 0.0327, + "rewards/chosen": 4.109676361083984, + "rewards/margins": 11.548977851867676, + "rewards/rejected": -7.439301490783691, + "step": 3909 + }, + { + "epoch": 0.3572407492005482, + "grad_norm": 3.828125, + "kl": 0.0, + "learning_rate": 7.1859667745398965e-06, + "logits/chosen": 833549312.0, + "logits/rejected": 651347968.0, + "logps/chosen": -477.3546956380208, + "logps/rejected": -522.511328125, + "loss": 0.0174, + "rewards/chosen": 3.137120564778646, + "rewards/margins": 12.796185811360678, + "rewards/rejected": -9.659065246582031, + "step": 3910 + }, + { + "epoch": 0.35733211512105983, + "grad_norm": 1.3515625, + "kl": 0.0, + "learning_rate": 7.184673573672958e-06, + "logits/chosen": 695342208.0, + "logits/rejected": 680032128.0, + "logps/chosen": -330.20245361328125, + "logps/rejected": -590.22509765625, + "loss": 0.0063, + "rewards/chosen": 4.7523345947265625, + "rewards/margins": 14.798677444458008, + "rewards/rejected": -10.046342849731445, + "step": 3911 + }, + { + "epoch": 0.3574234810415715, + "grad_norm": 44.5, + "kl": 0.0, + "learning_rate": 7.183380192153754e-06, + "logits/chosen": 399146304.0, + "logits/rejected": 691829802.6666666, + "logps/chosen": -265.8298034667969, + "logps/rejected": -621.4419352213541, + "loss": 0.0831, + "rewards/chosen": 1.624809741973877, + "rewards/margins": 10.838350137074789, + "rewards/rejected": -9.213540395100912, + "step": 3912 + }, + { + "epoch": 0.35751484696208313, + "grad_norm": 4.40625, + "kl": 0.0, + "learning_rate": 7.182086630089238e-06, + "logits/chosen": 272083968.0, + "logits/rejected": 567713194.6666666, + "logps/chosen": -321.7215576171875, + "logps/rejected": -671.7695719401041, + "loss": 0.016, + "rewards/chosen": 4.081381225585938, + "rewards/margins": 14.934044901529948, + "rewards/rejected": -10.85266367594401, + "step": 3913 + }, + { + "epoch": 0.3576062128825948, + "grad_norm": 0.921875, + "kl": 0.0, + "learning_rate": 7.1807928875863756e-06, + "logits/chosen": 468223264.0, + "logits/rejected": 548075605.3333334, + "logps/chosen": -382.78643798828125, + "logps/rejected": -633.2232259114584, + "loss": 0.0031, + "rewards/chosen": 4.939118385314941, + "rewards/margins": 15.07811450958252, + "rewards/rejected": -10.138996124267578, + "step": 3914 + }, + { + "epoch": 0.35769757880310643, + "grad_norm": 0.9921875, + "kl": 0.0, + "learning_rate": 7.179498964752147e-06, + "logits/chosen": 34596704.0, + "logits/rejected": 364760758.85714287, + "logps/chosen": -358.683837890625, + "logps/rejected": -417.0034877232143, + "loss": 0.0035, + "rewards/chosen": 3.6412017345428467, + "rewards/margins": 12.349323443004064, + "rewards/rejected": -8.708121708461217, + "step": 3915 + }, + { + "epoch": 0.3577889447236181, + "grad_norm": 3.28125, + "kl": 0.0, + "learning_rate": 7.178204861693546e-06, + "logits/chosen": 471756492.8, + "logits/rejected": 604133717.3333334, + "logps/chosen": -309.2581787109375, + "logps/rejected": -712.5555013020834, + "loss": 0.0194, + "rewards/chosen": 3.775652313232422, + "rewards/margins": 14.63159662882487, + "rewards/rejected": -10.855944315592447, + "step": 3916 + }, + { + "epoch": 0.35788031064412973, + "grad_norm": 3.6875, + "kl": 0.0, + "learning_rate": 7.176910578517586e-06, + "logits/chosen": 467218624.0, + "logits/rejected": 387208704.0, + "logps/chosen": -327.77960205078125, + "logps/rejected": -393.6181640625, + "loss": 0.026, + "rewards/chosen": 3.052534580230713, + "rewards/margins": 11.159294605255127, + "rewards/rejected": -8.106760025024414, + "step": 3917 + }, + { + "epoch": 0.3579716765646414, + "grad_norm": 2.609375, + "kl": 0.0, + "learning_rate": 7.175616115331291e-06, + "logits/chosen": 578511274.6666666, + "logits/rejected": 455138304.0, + "logps/chosen": -438.1425374348958, + "logps/rejected": -637.96201171875, + "loss": 0.0128, + "rewards/chosen": 3.4134581883748374, + "rewards/margins": 13.472660001118978, + "rewards/rejected": -10.05920181274414, + "step": 3918 + }, + { + "epoch": 0.358063042485153, + "grad_norm": 1.9296875, + "kl": 0.0, + "learning_rate": 7.1743214722417e-06, + "logits/chosen": 426054784.0, + "logits/rejected": 632575701.3333334, + "logps/chosen": -256.5801086425781, + "logps/rejected": -619.1638590494791, + "loss": 0.0079, + "rewards/chosen": 4.187535285949707, + "rewards/margins": 14.128490130106607, + "rewards/rejected": -9.9409548441569, + "step": 3919 + }, + { + "epoch": 0.3581544084056647, + "grad_norm": 6.59375, + "kl": 0.0, + "learning_rate": 7.17302664935587e-06, + "logits/chosen": 462060748.8, + "logits/rejected": 1113246720.0, + "logps/chosen": -332.4681640625, + "logps/rejected": -457.2770182291667, + "loss": 0.037, + "rewards/chosen": 3.342931365966797, + "rewards/margins": 12.436725234985351, + "rewards/rejected": -9.093793869018555, + "step": 3920 + }, + { + "epoch": 0.3582457743261763, + "grad_norm": 5.84375, + "kl": 0.0, + "learning_rate": 7.171731646780867e-06, + "logits/chosen": 486505685.3333333, + "logits/rejected": 424262604.8, + "logps/chosen": -329.35784912109375, + "logps/rejected": -392.8944091796875, + "loss": 0.1169, + "rewards/chosen": 3.4702768325805664, + "rewards/margins": 8.968437385559081, + "rewards/rejected": -5.498160552978516, + "step": 3921 + }, + { + "epoch": 0.358337140246688, + "grad_norm": 1.2734375, + "kl": 0.0, + "learning_rate": 7.1704364646237814e-06, + "logits/chosen": 561802026.6666666, + "logits/rejected": 495289241.6, + "logps/chosen": -212.0146687825521, + "logps/rejected": -240.07294921875, + "loss": 0.0093, + "rewards/chosen": 4.457517306009929, + "rewards/margins": 12.628621737162273, + "rewards/rejected": -8.171104431152344, + "step": 3922 + }, + { + "epoch": 0.3584285061671996, + "grad_norm": 1.6796875, + "kl": 0.0, + "learning_rate": 7.169141102991708e-06, + "logits/chosen": 569923264.0, + "logits/rejected": 461052842.6666667, + "logps/chosen": -287.45758056640625, + "logps/rejected": -470.1732177734375, + "loss": 0.0089, + "rewards/chosen": 4.371522426605225, + "rewards/margins": 11.942517757415771, + "rewards/rejected": -7.570995330810547, + "step": 3923 + }, + { + "epoch": 0.3585198720877113, + "grad_norm": 43.25, + "kl": 0.0, + "learning_rate": 7.167845561991763e-06, + "logits/chosen": 348848512.0, + "logits/rejected": 543717056.0, + "logps/chosen": -378.47174072265625, + "logps/rejected": -463.8782043457031, + "loss": 0.1003, + "rewards/chosen": 2.002425193786621, + "rewards/margins": 9.800251960754395, + "rewards/rejected": -7.797826766967773, + "step": 3924 + }, + { + "epoch": 0.3586112380082229, + "grad_norm": 5.59375, + "kl": 0.0, + "learning_rate": 7.1665498417310764e-06, + "logits/chosen": 304111104.0, + "logits/rejected": 598687104.0, + "logps/chosen": -260.9918212890625, + "logps/rejected": -528.33447265625, + "loss": 0.0401, + "rewards/chosen": 3.3802382946014404, + "rewards/margins": 11.47360110282898, + "rewards/rejected": -8.093362808227539, + "step": 3925 + }, + { + "epoch": 0.3587026039287346, + "grad_norm": 2.296875, + "kl": 0.0, + "learning_rate": 7.165253942316791e-06, + "logits/chosen": 588157568.0, + "logits/rejected": 464800224.0, + "logps/chosen": -315.78411865234375, + "logps/rejected": -425.30218505859375, + "loss": 0.0114, + "rewards/chosen": 4.558518886566162, + "rewards/margins": 12.848872661590576, + "rewards/rejected": -8.290353775024414, + "step": 3926 + }, + { + "epoch": 0.3587939698492462, + "grad_norm": 1.0390625, + "kl": 0.0, + "learning_rate": 7.163957863856066e-06, + "logits/chosen": 649847210.6666666, + "logits/rejected": 391441536.0, + "logps/chosen": -227.6287638346354, + "logps/rejected": -480.9390625, + "loss": 0.0073, + "rewards/chosen": 4.021043459574382, + "rewards/margins": 12.448477236429852, + "rewards/rejected": -8.42743377685547, + "step": 3927 + }, + { + "epoch": 0.3588853357697579, + "grad_norm": 2.703125, + "kl": 0.0, + "learning_rate": 7.162661606456075e-06, + "logits/chosen": 834748800.0, + "logits/rejected": 658355968.0, + "logps/chosen": -352.8830261230469, + "logps/rejected": -594.3553466796875, + "loss": 0.0169, + "rewards/chosen": 3.4832143783569336, + "rewards/margins": 11.809952735900879, + "rewards/rejected": -8.326738357543945, + "step": 3928 + }, + { + "epoch": 0.3589767016902695, + "grad_norm": 3.890625, + "kl": 0.0, + "learning_rate": 7.161365170224006e-06, + "logits/chosen": 685469491.2, + "logits/rejected": 800539733.3333334, + "logps/chosen": -267.40458984375, + "logps/rejected": -394.7109781901042, + "loss": 0.1433, + "rewards/chosen": 2.0653152465820312, + "rewards/margins": 10.203231811523438, + "rewards/rejected": -8.137916564941406, + "step": 3929 + }, + { + "epoch": 0.3590680676107812, + "grad_norm": 14.6875, + "kl": 0.0, + "learning_rate": 7.160068555267063e-06, + "logits/chosen": 525532202.6666667, + "logits/rejected": 377693030.4, + "logps/chosen": -414.50537109375, + "logps/rejected": -465.140673828125, + "loss": 0.0226, + "rewards/chosen": 4.178128878275554, + "rewards/margins": 13.095366732279459, + "rewards/rejected": -8.917237854003906, + "step": 3930 + }, + { + "epoch": 0.3591594335312928, + "grad_norm": 36.25, + "kl": 0.0, + "learning_rate": 7.158771761692464e-06, + "logits/chosen": 425478826.6666667, + "logits/rejected": 665462080.0, + "logps/chosen": -299.7196044921875, + "logps/rejected": -486.0699157714844, + "loss": 0.0932, + "rewards/chosen": 3.168416976928711, + "rewards/margins": 8.001975059509277, + "rewards/rejected": -4.833558082580566, + "step": 3931 + }, + { + "epoch": 0.3592507994518045, + "grad_norm": 3.765625, + "kl": 0.0, + "learning_rate": 7.157474789607443e-06, + "logits/chosen": 380839705.6, + "logits/rejected": 400529322.6666667, + "logps/chosen": -306.5506591796875, + "logps/rejected": -553.6551513671875, + "loss": 0.0232, + "rewards/chosen": 3.98235969543457, + "rewards/margins": 11.689205932617188, + "rewards/rejected": -7.706846237182617, + "step": 3932 + }, + { + "epoch": 0.3593421653723161, + "grad_norm": 3.5625, + "kl": 0.0, + "learning_rate": 7.156177639119246e-06, + "logits/chosen": 454826496.0, + "logits/rejected": 730820522.6666666, + "logps/chosen": -369.403662109375, + "logps/rejected": -433.0303141276042, + "loss": 0.0167, + "rewards/chosen": 4.578081893920898, + "rewards/margins": 13.060744094848634, + "rewards/rejected": -8.482662200927734, + "step": 3933 + }, + { + "epoch": 0.3594335312928278, + "grad_norm": 2.46875, + "kl": 0.0, + "learning_rate": 7.154880310335136e-06, + "logits/chosen": 718474581.3333334, + "logits/rejected": 627170304.0, + "logps/chosen": -348.2618001302083, + "logps/rejected": -603.18564453125, + "loss": 0.012, + "rewards/chosen": 3.5356502532958984, + "rewards/margins": 13.746189498901368, + "rewards/rejected": -10.210539245605469, + "step": 3934 + }, + { + "epoch": 0.3595248972133394, + "grad_norm": 30.0, + "kl": 0.0, + "learning_rate": 7.153582803362389e-06, + "logits/chosen": 540618816.0, + "logits/rejected": 680799296.0, + "logps/chosen": -434.63360595703125, + "logps/rejected": -435.9117431640625, + "loss": 0.0743, + "rewards/chosen": 3.5054948329925537, + "rewards/margins": 9.07600712776184, + "rewards/rejected": -5.570512294769287, + "step": 3935 + }, + { + "epoch": 0.3596162631338511, + "grad_norm": 6.09375, + "kl": 0.0, + "learning_rate": 7.152285118308298e-06, + "logits/chosen": 675350720.0, + "logits/rejected": 759115264.0, + "logps/chosen": -432.92169189453125, + "logps/rejected": -351.3656005859375, + "loss": 0.0273, + "rewards/chosen": 3.498739719390869, + "rewards/margins": 9.857304096221924, + "rewards/rejected": -6.358564376831055, + "step": 3936 + }, + { + "epoch": 0.3597076290543627, + "grad_norm": 2.796875, + "kl": 0.0, + "learning_rate": 7.150987255280169e-06, + "logits/chosen": 607782848.0, + "logits/rejected": 799878528.0, + "logps/chosen": -394.3822937011719, + "logps/rejected": -550.9066772460938, + "loss": 0.0141, + "rewards/chosen": 3.969416856765747, + "rewards/margins": 12.343850374221802, + "rewards/rejected": -8.374433517456055, + "step": 3937 + }, + { + "epoch": 0.3597989949748744, + "grad_norm": 1.921875, + "kl": 0.0, + "learning_rate": 7.149689214385323e-06, + "logits/chosen": 509165184.0, + "logits/rejected": 629279786.6666666, + "logps/chosen": -351.8628234863281, + "logps/rejected": -395.2628580729167, + "loss": 0.0085, + "rewards/chosen": 3.3948683738708496, + "rewards/margins": 12.124233404795328, + "rewards/rejected": -8.729365030924479, + "step": 3938 + }, + { + "epoch": 0.359890360895386, + "grad_norm": 3.0625, + "kl": 0.0, + "learning_rate": 7.148390995731096e-06, + "logits/chosen": 430785952.0, + "logits/rejected": 562999680.0, + "logps/chosen": -249.2706298828125, + "logps/rejected": -622.1796875, + "loss": 0.0202, + "rewards/chosen": 3.6074085235595703, + "rewards/margins": 12.54966926574707, + "rewards/rejected": -8.9422607421875, + "step": 3939 + }, + { + "epoch": 0.3599817268158977, + "grad_norm": 2.140625, + "kl": 0.0, + "learning_rate": 7.14709259942484e-06, + "logits/chosen": 358079402.6666667, + "logits/rejected": 409885107.2, + "logps/chosen": -202.83260091145834, + "logps/rejected": -453.80087890625, + "loss": 0.0163, + "rewards/chosen": 3.645387649536133, + "rewards/margins": 11.976762771606445, + "rewards/rejected": -8.331375122070312, + "step": 3940 + }, + { + "epoch": 0.3600730927364093, + "grad_norm": 5.4375, + "kl": 0.0, + "learning_rate": 7.145794025573919e-06, + "logits/chosen": 1297051904.0, + "logits/rejected": 555543168.0, + "logps/chosen": -226.21133422851562, + "logps/rejected": -464.5162760416667, + "loss": 0.1149, + "rewards/chosen": 0.6677219271659851, + "rewards/margins": 9.12218270699183, + "rewards/rejected": -8.454460779825846, + "step": 3941 + }, + { + "epoch": 0.360164458656921, + "grad_norm": 2.25, + "kl": 0.0, + "learning_rate": 7.144495274285712e-06, + "logits/chosen": 865668864.0, + "logits/rejected": 675715072.0, + "logps/chosen": -582.1607666015625, + "logps/rejected": -441.5334065755208, + "loss": 0.0062, + "rewards/chosen": 4.158969402313232, + "rewards/margins": 13.1273086865743, + "rewards/rejected": -8.968339284261068, + "step": 3942 + }, + { + "epoch": 0.3602558245774326, + "grad_norm": 4.90625, + "kl": 0.0, + "learning_rate": 7.143196345667616e-06, + "logits/chosen": 549009152.0, + "logits/rejected": 758276992.0, + "logps/chosen": -265.85540771484375, + "logps/rejected": -720.8634033203125, + "loss": 0.0331, + "rewards/chosen": 2.833596706390381, + "rewards/margins": 13.0188889503479, + "rewards/rejected": -10.18529224395752, + "step": 3943 + }, + { + "epoch": 0.3603471904979443, + "grad_norm": 4.375, + "kl": 0.0, + "learning_rate": 7.14189723982704e-06, + "logits/chosen": 272688064.0, + "logits/rejected": 679597933.7142857, + "logps/chosen": -33.67451477050781, + "logps/rejected": -371.5488978794643, + "loss": 0.0244, + "rewards/chosen": 1.627374291419983, + "rewards/margins": 8.97008560385023, + "rewards/rejected": -7.342711312430246, + "step": 3944 + }, + { + "epoch": 0.3604385564184559, + "grad_norm": 2.734375, + "kl": 0.0, + "learning_rate": 7.140597956871407e-06, + "logits/chosen": 476156501.3333333, + "logits/rejected": 412798464.0, + "logps/chosen": -305.9820963541667, + "logps/rejected": -417.6962890625, + "loss": 0.0144, + "rewards/chosen": 3.5414307912190757, + "rewards/margins": 12.042668279012045, + "rewards/rejected": -8.501237487792968, + "step": 3945 + }, + { + "epoch": 0.3605299223389676, + "grad_norm": 2.84375, + "kl": 0.0, + "learning_rate": 7.139298496908155e-06, + "logits/chosen": 772025241.6, + "logits/rejected": 539650560.0, + "logps/chosen": -392.3760009765625, + "logps/rejected": -527.3212483723959, + "loss": 0.0163, + "rewards/chosen": 3.7992313385009764, + "rewards/margins": 13.997081883748372, + "rewards/rejected": -10.197850545247396, + "step": 3946 + }, + { + "epoch": 0.3606212882594792, + "grad_norm": 1.46875, + "kl": 0.0, + "learning_rate": 7.137998860044739e-06, + "logits/chosen": 719237632.0, + "logits/rejected": 568918101.3333334, + "logps/chosen": -429.26263427734375, + "logps/rejected": -716.92529296875, + "loss": 0.0072, + "rewards/chosen": 3.7540009021759033, + "rewards/margins": 14.534767707188925, + "rewards/rejected": -10.780766805013021, + "step": 3947 + }, + { + "epoch": 0.3607126541799909, + "grad_norm": 25.5, + "kl": 0.0, + "learning_rate": 7.136699046388625e-06, + "logits/chosen": 592275046.4, + "logits/rejected": 418699520.0, + "logps/chosen": -405.4122314453125, + "logps/rejected": -355.6458333333333, + "loss": 0.0383, + "rewards/chosen": 3.2228355407714844, + "rewards/margins": 12.399815241495768, + "rewards/rejected": -9.176979700724283, + "step": 3948 + }, + { + "epoch": 0.3608040201005025, + "grad_norm": 9.125, + "kl": 0.0, + "learning_rate": 7.1353990560472995e-06, + "logits/chosen": 1132592981.3333333, + "logits/rejected": 647140249.6, + "logps/chosen": -444.3046875, + "logps/rejected": -390.48857421875, + "loss": 0.0933, + "rewards/chosen": 3.350837071736654, + "rewards/margins": 12.415555699666342, + "rewards/rejected": -9.064718627929688, + "step": 3949 + }, + { + "epoch": 0.3608953860210142, + "grad_norm": 2.625, + "kl": 0.0, + "learning_rate": 7.134098889128255e-06, + "logits/chosen": 554391961.6, + "logits/rejected": 338291328.0, + "logps/chosen": -483.92294921875, + "logps/rejected": -501.0538736979167, + "loss": 0.0169, + "rewards/chosen": 3.8927772521972654, + "rewards/margins": 14.759771474202473, + "rewards/rejected": -10.866994222005209, + "step": 3950 + }, + { + "epoch": 0.3609867519415258, + "grad_norm": 0.51953125, + "kl": 0.0, + "learning_rate": 7.132798545739007e-06, + "logits/chosen": 1039654336.0, + "logits/rejected": 538499200.0, + "logps/chosen": -246.38800048828125, + "logps/rejected": -519.1668294270834, + "loss": 0.0028, + "rewards/chosen": 4.7672929763793945, + "rewards/margins": 14.834951718648275, + "rewards/rejected": -10.06765874226888, + "step": 3951 + }, + { + "epoch": 0.3610781178620375, + "grad_norm": 3.734375, + "kl": 0.0, + "learning_rate": 7.131498025987078e-06, + "logits/chosen": 1237281109.3333333, + "logits/rejected": 718220236.8, + "logps/chosen": -310.80824788411456, + "logps/rejected": -439.0056640625, + "loss": 0.025, + "rewards/chosen": 3.0756282806396484, + "rewards/margins": 11.68422203063965, + "rewards/rejected": -8.60859375, + "step": 3952 + }, + { + "epoch": 0.3611694837825491, + "grad_norm": 0.58984375, + "kl": 0.0, + "learning_rate": 7.130197329980013e-06, + "logits/rejected": 750986048.0, + "logps/rejected": -487.8730773925781, + "loss": 0.0021, + "rewards/rejected": -8.258295059204102, + "step": 3953 + }, + { + "epoch": 0.36126084970306077, + "grad_norm": 3.390625, + "kl": 0.0, + "learning_rate": 7.128896457825364e-06, + "logits/chosen": 671987404.8, + "logits/rejected": 572904021.3333334, + "logps/chosen": -411.969580078125, + "logps/rejected": -678.6511637369791, + "loss": 0.0254, + "rewards/chosen": 3.27368049621582, + "rewards/margins": 12.94915402730306, + "rewards/rejected": -9.67547353108724, + "step": 3954 + }, + { + "epoch": 0.3613522156235724, + "grad_norm": 4.78125, + "kl": 0.0, + "learning_rate": 7.127595409630703e-06, + "logits/chosen": 557803008.0, + "logits/rejected": 502031072.0, + "logps/chosen": -449.40093994140625, + "logps/rejected": -495.6030578613281, + "loss": 0.0256, + "rewards/chosen": 3.3258583545684814, + "rewards/margins": 13.29605221748352, + "rewards/rejected": -9.970193862915039, + "step": 3955 + }, + { + "epoch": 0.36144358154408407, + "grad_norm": 43.75, + "kl": 0.0, + "learning_rate": 7.126294185503614e-06, + "logits/chosen": 735645866.6666666, + "logits/rejected": 1016042598.4, + "logps/chosen": -288.1068115234375, + "logps/rejected": -656.54541015625, + "loss": 0.0313, + "rewards/chosen": 3.1519447962443032, + "rewards/margins": 12.864957491556803, + "rewards/rejected": -9.7130126953125, + "step": 3956 + }, + { + "epoch": 0.3615349474645957, + "grad_norm": 3.828125, + "kl": 0.0, + "learning_rate": 7.124992785551698e-06, + "logits/chosen": 474690016.0, + "logits/rejected": 396985984.0, + "logps/chosen": -390.2701110839844, + "logps/rejected": -724.321533203125, + "loss": 0.0207, + "rewards/chosen": 3.4028687477111816, + "rewards/margins": 16.36249589920044, + "rewards/rejected": -12.959627151489258, + "step": 3957 + }, + { + "epoch": 0.36162631338510737, + "grad_norm": 6.9375, + "kl": 0.0, + "learning_rate": 7.123691209882566e-06, + "logits/chosen": 474807705.6, + "logits/rejected": 471376384.0, + "logps/chosen": -315.20146484375, + "logps/rejected": -648.1817626953125, + "loss": 0.0306, + "rewards/chosen": 4.097568130493164, + "rewards/margins": 16.037704086303712, + "rewards/rejected": -11.940135955810547, + "step": 3958 + }, + { + "epoch": 0.361717679305619, + "grad_norm": 2.453125, + "kl": 0.0, + "learning_rate": 7.122389458603847e-06, + "logits/chosen": 451996000.0, + "logits/rejected": 337989248.0, + "logps/chosen": -291.7870178222656, + "logps/rejected": -559.0211181640625, + "loss": 0.0164, + "rewards/chosen": 3.7640252113342285, + "rewards/margins": 11.421825885772705, + "rewards/rejected": -7.657800674438477, + "step": 3959 + }, + { + "epoch": 0.36180904522613067, + "grad_norm": 3.484375, + "kl": 0.0, + "learning_rate": 7.121087531823184e-06, + "logits/chosen": 505379669.3333333, + "logits/rejected": 531992544.0, + "logps/chosen": -364.2393391927083, + "logps/rejected": -437.9120788574219, + "loss": 0.0183, + "rewards/chosen": 4.164546648661296, + "rewards/margins": 13.546571413675945, + "rewards/rejected": -9.382024765014648, + "step": 3960 + }, + { + "epoch": 0.3619004111466423, + "grad_norm": 3.390625, + "kl": 0.0, + "learning_rate": 7.119785429648234e-06, + "logits/chosen": 449987754.6666667, + "logits/rejected": 436355264.0, + "logps/chosen": -284.53786214192706, + "logps/rejected": -244.045166015625, + "loss": 0.0228, + "rewards/chosen": 3.8649876912434897, + "rewards/margins": 10.831153710683187, + "rewards/rejected": -6.966166019439697, + "step": 3961 + }, + { + "epoch": 0.36199177706715396, + "grad_norm": 2.125, + "kl": 0.0, + "learning_rate": 7.11848315218667e-06, + "logits/chosen": 702490214.4, + "logits/rejected": 1091035221.3333333, + "logps/chosen": -329.8268798828125, + "logps/rejected": -410.0071614583333, + "loss": 0.0136, + "rewards/chosen": 4.112530517578125, + "rewards/margins": 10.781619517008464, + "rewards/rejected": -6.669088999430339, + "step": 3962 + }, + { + "epoch": 0.3620831429876656, + "grad_norm": 1.8125, + "kl": 0.0, + "learning_rate": 7.117180699546178e-06, + "logits/chosen": 1619455658.6666667, + "logits/rejected": 717232640.0, + "logps/chosen": -401.3430582682292, + "logps/rejected": -753.7263671875, + "loss": 0.0094, + "rewards/chosen": 4.057496388753255, + "rewards/margins": 12.11908009847005, + "rewards/rejected": -8.061583709716796, + "step": 3963 + }, + { + "epoch": 0.36217450890817726, + "grad_norm": 6.53125, + "kl": 0.0, + "learning_rate": 7.115878071834459e-06, + "logits/chosen": 461640832.0, + "logits/rejected": 417345817.6, + "logps/chosen": -393.441650390625, + "logps/rejected": -502.98134765625, + "loss": 0.0227, + "rewards/chosen": 3.629810651143392, + "rewards/margins": 12.527631696065267, + "rewards/rejected": -8.897821044921875, + "step": 3964 + }, + { + "epoch": 0.3622658748286889, + "grad_norm": 7.65625, + "kl": 0.0, + "learning_rate": 7.114575269159225e-06, + "logits/chosen": 809459456.0, + "logits/rejected": 764136009.1428572, + "logps/chosen": -398.2864074707031, + "logps/rejected": -462.77211216517856, + "loss": 0.022, + "rewards/chosen": 4.489813327789307, + "rewards/margins": 13.900925431932722, + "rewards/rejected": -9.411112104143415, + "step": 3965 + }, + { + "epoch": 0.36235724074920056, + "grad_norm": 4.15625, + "kl": 0.0, + "learning_rate": 7.11327229162821e-06, + "logits/chosen": 993092812.8, + "logits/rejected": 558750805.3333334, + "logps/chosen": -183.79495849609376, + "logps/rejected": -588.9156087239584, + "loss": 0.0327, + "rewards/chosen": 3.523316192626953, + "rewards/margins": 11.293479410807292, + "rewards/rejected": -7.770163218180339, + "step": 3966 + }, + { + "epoch": 0.3624486066697122, + "grad_norm": 0.7109375, + "kl": 0.0, + "learning_rate": 7.111969139349156e-06, + "logits/chosen": 196162730.66666666, + "logits/rejected": 374908928.0, + "logps/chosen": -154.46729532877603, + "logps/rejected": -435.052734375, + "loss": 0.0045, + "rewards/chosen": 4.755504608154297, + "rewards/margins": 13.295189666748048, + "rewards/rejected": -8.53968505859375, + "step": 3967 + }, + { + "epoch": 0.36253997259022386, + "grad_norm": 2.71875, + "kl": 0.0, + "learning_rate": 7.110665812429822e-06, + "logits/chosen": 531270848.0, + "logits/rejected": 791138880.0, + "logps/chosen": -337.8740234375, + "logps/rejected": -491.7481994628906, + "loss": 0.0165, + "rewards/chosen": 3.8637688159942627, + "rewards/margins": 12.598635911941528, + "rewards/rejected": -8.734867095947266, + "step": 3968 + }, + { + "epoch": 0.3626313385107355, + "grad_norm": 4.625, + "kl": 0.0, + "learning_rate": 7.109362310977981e-06, + "logits/chosen": 436624608.0, + "logits/rejected": 855262720.0, + "logps/chosen": -604.2435913085938, + "logps/rejected": -393.15936279296875, + "loss": 0.0243, + "rewards/chosen": 3.851254463195801, + "rewards/margins": 12.681282043457031, + "rewards/rejected": -8.83002758026123, + "step": 3969 + }, + { + "epoch": 0.36272270443124716, + "grad_norm": 2.328125, + "kl": 0.0, + "learning_rate": 7.10805863510142e-06, + "logits/chosen": 491604704.0, + "logits/rejected": 502336640.0, + "logps/chosen": -267.14215087890625, + "logps/rejected": -612.354248046875, + "loss": 0.0142, + "rewards/chosen": 3.6170239448547363, + "rewards/margins": 13.691896915435791, + "rewards/rejected": -10.074872970581055, + "step": 3970 + }, + { + "epoch": 0.3628140703517588, + "grad_norm": 2.609375, + "kl": 0.0, + "learning_rate": 7.106754784907942e-06, + "logits/chosen": 548898496.0, + "logits/rejected": 934581930.6666666, + "logps/chosen": -316.6631164550781, + "logps/rejected": -446.58544921875, + "loss": 0.0136, + "rewards/chosen": 3.452469825744629, + "rewards/margins": 11.747036298116049, + "rewards/rejected": -8.29456647237142, + "step": 3971 + }, + { + "epoch": 0.36290543627227045, + "grad_norm": 4.0, + "kl": 0.0, + "learning_rate": 7.105450760505364e-06, + "logits/chosen": 467935232.0, + "logits/rejected": 434271914.6666667, + "logps/chosen": -392.220068359375, + "logps/rejected": -556.9890950520834, + "loss": 0.0233, + "rewards/chosen": 3.432250213623047, + "rewards/margins": 12.897711944580077, + "rewards/rejected": -9.465461730957031, + "step": 3972 + }, + { + "epoch": 0.3629968021927821, + "grad_norm": 5.1875, + "kl": 0.0, + "learning_rate": 7.104146562001514e-06, + "logits/chosen": 594233270.8571428, + "logits/rejected": 433581216.0, + "logps/chosen": -302.3060825892857, + "logps/rejected": -781.7393798828125, + "loss": 0.0354, + "rewards/chosen": 3.5752928597586497, + "rewards/margins": 16.85706833430699, + "rewards/rejected": -13.28177547454834, + "step": 3973 + }, + { + "epoch": 0.36308816811329375, + "grad_norm": 4.5625, + "kl": 0.0, + "learning_rate": 7.102842189504239e-06, + "logits/chosen": 518995660.8, + "logits/rejected": 469340330.6666667, + "logps/chosen": -380.3749755859375, + "logps/rejected": -693.2527669270834, + "loss": 0.0188, + "rewards/chosen": 4.236686325073242, + "rewards/margins": 15.438457107543945, + "rewards/rejected": -11.201770782470703, + "step": 3974 + }, + { + "epoch": 0.3631795340338054, + "grad_norm": 3.53125, + "kl": 0.0, + "learning_rate": 7.101537643121399e-06, + "logits/chosen": 520535744.0, + "logits/rejected": 581635072.0, + "logps/chosen": -341.21826171875, + "logps/rejected": -573.66748046875, + "loss": 0.0184, + "rewards/chosen": 3.856999635696411, + "rewards/margins": 13.212184190750122, + "rewards/rejected": -9.355184555053711, + "step": 3975 + }, + { + "epoch": 0.36327089995431705, + "grad_norm": 2.21875, + "kl": 0.0, + "learning_rate": 7.100232922960865e-06, + "logits/chosen": 495939379.2, + "logits/rejected": 558778282.6666666, + "logps/chosen": -309.2333984375, + "logps/rejected": -642.463134765625, + "loss": 0.0204, + "rewards/chosen": 3.6571125030517577, + "rewards/margins": 14.285597229003907, + "rewards/rejected": -10.628484725952148, + "step": 3976 + }, + { + "epoch": 0.36336226587482867, + "grad_norm": 1.71875, + "kl": 0.0, + "learning_rate": 7.098928029130529e-06, + "logits/chosen": 494161408.0, + "logits/rejected": 563113301.3333334, + "logps/chosen": -296.77496337890625, + "logps/rejected": -641.96875, + "loss": 0.0077, + "rewards/chosen": 3.6776719093322754, + "rewards/margins": 14.89676014582316, + "rewards/rejected": -11.219088236490885, + "step": 3977 + }, + { + "epoch": 0.36345363179534035, + "grad_norm": 2.90625, + "kl": 0.0, + "learning_rate": 7.097622961738292e-06, + "logits/chosen": 1278349824.0, + "logits/rejected": 1502875904.0, + "logps/chosen": -457.15240478515625, + "logps/rejected": -546.0460205078125, + "loss": 0.017, + "rewards/chosen": 3.575124502182007, + "rewards/margins": 12.552383184432983, + "rewards/rejected": -8.977258682250977, + "step": 3978 + }, + { + "epoch": 0.36354499771585197, + "grad_norm": 0.458984375, + "kl": 0.0, + "learning_rate": 7.096317720892072e-06, + "logits/chosen": 360554368.0, + "logits/rejected": 946037674.6666666, + "logps/chosen": -179.55010986328125, + "logps/rejected": -559.5701497395834, + "loss": 0.0025, + "rewards/chosen": 4.7709059715271, + "rewards/margins": 14.541511058807373, + "rewards/rejected": -9.770605087280273, + "step": 3979 + }, + { + "epoch": 0.36363636363636365, + "grad_norm": 4.1875, + "kl": 0.0, + "learning_rate": 7.095012306699797e-06, + "logits/chosen": 503844915.2, + "logits/rejected": 480581717.3333333, + "logps/chosen": -286.094580078125, + "logps/rejected": -582.4535319010416, + "loss": 0.1333, + "rewards/chosen": 3.377178955078125, + "rewards/margins": 8.436013984680176, + "rewards/rejected": -5.058835029602051, + "step": 3980 + }, + { + "epoch": 0.36372772955687527, + "grad_norm": 4.3125, + "kl": 0.0, + "learning_rate": 7.093706719269416e-06, + "logits/chosen": 930868004.5714285, + "logits/rejected": 463640704.0, + "logps/chosen": -296.7783203125, + "logps/rejected": -617.0106201171875, + "loss": 0.0414, + "rewards/chosen": 3.17279543195452, + "rewards/margins": 18.576564925057546, + "rewards/rejected": -15.403769493103027, + "step": 3981 + }, + { + "epoch": 0.36381909547738694, + "grad_norm": 4.03125, + "kl": 0.0, + "learning_rate": 7.092400958708887e-06, + "logits/chosen": 426649190.4, + "logits/rejected": 619794901.3333334, + "logps/chosen": -260.497705078125, + "logps/rejected": -431.5057779947917, + "loss": 0.0241, + "rewards/chosen": 3.86609001159668, + "rewards/margins": 13.499065272013347, + "rewards/rejected": -9.632975260416666, + "step": 3982 + }, + { + "epoch": 0.36391046139789857, + "grad_norm": 0.55078125, + "kl": 0.0, + "learning_rate": 7.0910950251261865e-06, + "logits/chosen": 280525269.3333333, + "logits/rejected": 249826892.8, + "logps/chosen": -184.25520833333334, + "logps/rejected": -398.1271484375, + "loss": 0.0038, + "rewards/chosen": 4.900183359781901, + "rewards/margins": 14.171800486246745, + "rewards/rejected": -9.271617126464843, + "step": 3983 + }, + { + "epoch": 0.36400182731841024, + "grad_norm": 2.84375, + "kl": 0.0, + "learning_rate": 7.0897889186293015e-06, + "logits/chosen": 400386240.0, + "logits/rejected": 1121153664.0, + "logps/chosen": -434.2312316894531, + "logps/rejected": -633.3651123046875, + "loss": 0.0116, + "rewards/chosen": 4.221441745758057, + "rewards/margins": 13.72570276260376, + "rewards/rejected": -9.504261016845703, + "step": 3984 + }, + { + "epoch": 0.36409319323892186, + "grad_norm": 2.0, + "kl": 0.0, + "learning_rate": 7.088482639326235e-06, + "logits/chosen": 421950592.0, + "logits/rejected": 387870816.0, + "logps/chosen": -392.015869140625, + "logps/rejected": -539.5767211914062, + "loss": 0.0114, + "rewards/chosen": 4.041889190673828, + "rewards/margins": 12.501920700073242, + "rewards/rejected": -8.460031509399414, + "step": 3985 + }, + { + "epoch": 0.36418455915943354, + "grad_norm": 4.46875, + "kl": 0.0, + "learning_rate": 7.087176187325005e-06, + "logits/chosen": 542524842.6666666, + "logits/rejected": 632055091.2, + "logps/chosen": -360.7372639973958, + "logps/rejected": -602.62421875, + "loss": 0.0136, + "rewards/chosen": 3.953091303507487, + "rewards/margins": 12.363836542765299, + "rewards/rejected": -8.410745239257812, + "step": 3986 + }, + { + "epoch": 0.36427592507994516, + "grad_norm": 4.875, + "kl": 0.0, + "learning_rate": 7.085869562733641e-06, + "logits/chosen": 453526886.4, + "logits/rejected": 408805418.6666667, + "logps/chosen": -403.78974609375, + "logps/rejected": -539.8896891276041, + "loss": 0.0244, + "rewards/chosen": 3.7470123291015627, + "rewards/margins": 14.425985209147136, + "rewards/rejected": -10.678972880045572, + "step": 3987 + }, + { + "epoch": 0.36436729100045684, + "grad_norm": 1.3203125, + "kl": 0.0, + "learning_rate": 7.084562765660191e-06, + "logits/chosen": 445790848.0, + "logits/rejected": 431203218.28571427, + "logps/chosen": -378.452880859375, + "logps/rejected": -490.1652134486607, + "loss": 0.0065, + "rewards/chosen": 3.1359527111053467, + "rewards/margins": 12.409165688923427, + "rewards/rejected": -9.27321297781808, + "step": 3988 + }, + { + "epoch": 0.36445865692096846, + "grad_norm": 3.5, + "kl": 0.0, + "learning_rate": 7.083255796212716e-06, + "logits/chosen": 615085696.0, + "logits/rejected": 416382912.0, + "logps/chosen": -245.395263671875, + "logps/rejected": -623.726318359375, + "loss": 0.0313, + "rewards/chosen": 3.569260279337565, + "rewards/margins": 14.303953806559244, + "rewards/rejected": -10.73469352722168, + "step": 3989 + }, + { + "epoch": 0.36455002284148014, + "grad_norm": 13.5, + "kl": 0.0, + "learning_rate": 7.0819486544992865e-06, + "logits/chosen": 551709900.8, + "logits/rejected": 720175274.6666666, + "logps/chosen": -337.238232421875, + "logps/rejected": -587.1344401041666, + "loss": 0.0498, + "rewards/chosen": 2.8104475021362303, + "rewards/margins": 17.123578198750813, + "rewards/rejected": -14.313130696614584, + "step": 3990 + }, + { + "epoch": 0.36464138876199176, + "grad_norm": 4.28125, + "kl": 0.0, + "learning_rate": 7.080641340627994e-06, + "logits/chosen": 971398848.0, + "logits/rejected": 1051239277.7142857, + "logps/chosen": -734.7299194335938, + "logps/rejected": -640.9118303571429, + "loss": 0.007, + "rewards/chosen": 4.607537746429443, + "rewards/margins": 14.046524388449532, + "rewards/rejected": -9.438986642020089, + "step": 3991 + }, + { + "epoch": 0.36473275468250344, + "grad_norm": 0.890625, + "kl": 0.0, + "learning_rate": 7.079333854706938e-06, + "logits/chosen": 501252992.0, + "logits/rejected": 400456345.6, + "logps/chosen": -255.11958821614584, + "logps/rejected": -451.57958984375, + "loss": 0.0052, + "rewards/chosen": 4.726830800374349, + "rewards/margins": 12.780746968587238, + "rewards/rejected": -8.05391616821289, + "step": 3992 + }, + { + "epoch": 0.36482412060301506, + "grad_norm": 3.6875, + "kl": 0.0, + "learning_rate": 7.07802619684424e-06, + "logits/chosen": 849329664.0, + "logits/rejected": 394165077.3333333, + "logps/chosen": -556.434814453125, + "logps/rejected": -432.2490234375, + "loss": 0.0122, + "rewards/chosen": 3.286007881164551, + "rewards/margins": 13.684544563293457, + "rewards/rejected": -10.398536682128906, + "step": 3993 + }, + { + "epoch": 0.36491548652352673, + "grad_norm": 3.609375, + "kl": 0.0, + "learning_rate": 7.076718367148029e-06, + "logits/chosen": 628150374.4, + "logits/rejected": 467624149.3333333, + "logps/chosen": -418.416015625, + "logps/rejected": -559.865966796875, + "loss": 0.0214, + "rewards/chosen": 3.799456787109375, + "rewards/margins": 11.587041727701823, + "rewards/rejected": -7.787584940592448, + "step": 3994 + }, + { + "epoch": 0.36500685244403835, + "grad_norm": 31.375, + "kl": 0.0, + "learning_rate": 7.075410365726449e-06, + "logits/chosen": 469767040.0, + "logits/rejected": 530950092.8, + "logps/chosen": -222.27925618489584, + "logps/rejected": -365.9202880859375, + "loss": 0.0735, + "rewards/chosen": 3.851963678995768, + "rewards/margins": 11.175507990519206, + "rewards/rejected": -7.323544311523437, + "step": 3995 + }, + { + "epoch": 0.36509821836455003, + "grad_norm": 36.75, + "kl": 0.0, + "learning_rate": 7.07410219268766e-06, + "logits/chosen": 1149940224.0, + "logits/rejected": 401462604.8, + "logps/chosen": -319.5985107421875, + "logps/rejected": -376.415185546875, + "loss": 0.1083, + "rewards/chosen": 1.5250523885091145, + "rewards/margins": 10.677439626057943, + "rewards/rejected": -9.152387237548828, + "step": 3996 + }, + { + "epoch": 0.36518958428506165, + "grad_norm": 1.546875, + "kl": 0.0, + "learning_rate": 7.072793848139838e-06, + "logits/chosen": 1015539648.0, + "logits/rejected": 648482346.6666666, + "logps/chosen": -265.41119384765625, + "logps/rejected": -542.4825032552084, + "loss": 0.0095, + "rewards/chosen": 3.3759186267852783, + "rewards/margins": 12.0930384794871, + "rewards/rejected": -8.717119852701822, + "step": 3997 + }, + { + "epoch": 0.36528095020557333, + "grad_norm": 3.15625, + "kl": 0.0, + "learning_rate": 7.07148533219117e-06, + "logits/chosen": 680873386.6666666, + "logits/rejected": 668988262.4, + "logps/chosen": -405.6565755208333, + "logps/rejected": -444.03603515625, + "loss": 0.017, + "rewards/chosen": 3.2571868896484375, + "rewards/margins": 12.306668090820313, + "rewards/rejected": -9.049481201171876, + "step": 3998 + }, + { + "epoch": 0.36537231612608495, + "grad_norm": 2.328125, + "kl": 0.0, + "learning_rate": 7.070176644949857e-06, + "logits/chosen": 1390935040.0, + "logits/rejected": 521523821.71428573, + "logps/chosen": -183.61117553710938, + "logps/rejected": -440.79220145089283, + "loss": 0.0083, + "rewards/chosen": 3.388070821762085, + "rewards/margins": 11.104627915791102, + "rewards/rejected": -7.716557094029018, + "step": 3999 + }, + { + "epoch": 0.3654636820465966, + "grad_norm": 5.4375, + "kl": 0.0, + "learning_rate": 7.068867786524116e-06, + "logits/chosen": 351390229.3333333, + "logits/rejected": 904500096.0, + "logps/chosen": -294.0291748046875, + "logps/rejected": -591.1060791015625, + "loss": 0.143, + "rewards/chosen": 2.8060550689697266, + "rewards/margins": 10.391968250274658, + "rewards/rejected": -7.585913181304932, + "step": 4000 + }, + { + "epoch": 0.36555504796710825, + "grad_norm": 3.359375, + "kl": 0.0, + "learning_rate": 7.067558757022177e-06, + "logits/chosen": 289195424.0, + "logits/rejected": 544652736.0, + "logps/chosen": -204.5388946533203, + "logps/rejected": -565.0079956054688, + "loss": 0.0241, + "rewards/chosen": 3.7199766635894775, + "rewards/margins": 13.624685525894165, + "rewards/rejected": -9.904708862304688, + "step": 4001 + }, + { + "epoch": 0.3656464138876199, + "grad_norm": 0.92578125, + "kl": 0.0, + "learning_rate": 7.066249556552287e-06, + "logits/chosen": 555244970.6666666, + "logits/rejected": 732444979.2, + "logps/chosen": -183.5909423828125, + "logps/rejected": -518.1322265625, + "loss": 0.007, + "rewards/chosen": 4.070408821105957, + "rewards/margins": 12.797839546203614, + "rewards/rejected": -8.727430725097657, + "step": 4002 + }, + { + "epoch": 0.36573777980813155, + "grad_norm": 1.921875, + "kl": 0.0, + "learning_rate": 7.064940185222701e-06, + "logits/chosen": 520214880.0, + "logits/rejected": 569737642.6666666, + "logps/chosen": -235.31796264648438, + "logps/rejected": -428.1612548828125, + "loss": 0.0131, + "rewards/chosen": 2.962686538696289, + "rewards/margins": 11.532668431599935, + "rewards/rejected": -8.569981892903646, + "step": 4003 + }, + { + "epoch": 0.3658291457286432, + "grad_norm": 2.859375, + "kl": 0.0, + "learning_rate": 7.063630643141696e-06, + "logits/chosen": 439996211.2, + "logits/rejected": 744690346.6666666, + "logps/chosen": -269.7732421875, + "logps/rejected": -614.4817301432291, + "loss": 0.0182, + "rewards/chosen": 3.8635490417480467, + "rewards/margins": 12.363951492309571, + "rewards/rejected": -8.500402450561523, + "step": 4004 + }, + { + "epoch": 0.36592051164915484, + "grad_norm": 3.828125, + "kl": 0.0, + "learning_rate": 7.0623209304175555e-06, + "logits/chosen": 492582758.4, + "logits/rejected": 466590122.6666667, + "logps/chosen": -340.0845703125, + "logps/rejected": -351.1519368489583, + "loss": 0.0281, + "rewards/chosen": 3.2075035095214846, + "rewards/margins": 12.338902028401694, + "rewards/rejected": -9.131398518880209, + "step": 4005 + }, + { + "epoch": 0.3660118775696665, + "grad_norm": 3.34375, + "kl": 0.0, + "learning_rate": 7.061011047158581e-06, + "logits/chosen": 412721194.6666667, + "logits/rejected": 469584793.6, + "logps/chosen": -174.13724772135416, + "logps/rejected": -478.22783203125, + "loss": 0.0259, + "rewards/chosen": 2.7064174016316733, + "rewards/margins": 10.470218976338705, + "rewards/rejected": -7.763801574707031, + "step": 4006 + }, + { + "epoch": 0.36610324349017814, + "grad_norm": 3.765625, + "kl": 0.0, + "learning_rate": 7.05970099347309e-06, + "logits/chosen": 688799872.0, + "logits/rejected": 411559296.0, + "logps/chosen": -463.4678649902344, + "logps/rejected": -408.25201416015625, + "loss": 0.0229, + "rewards/chosen": 3.2254085540771484, + "rewards/margins": 12.84109115600586, + "rewards/rejected": -9.615682601928711, + "step": 4007 + }, + { + "epoch": 0.3661946094106898, + "grad_norm": 3.59375, + "kl": 0.0, + "learning_rate": 7.0583907694694095e-06, + "logits/chosen": 650919360.0, + "logits/rejected": 745755264.0, + "logps/chosen": -373.564208984375, + "logps/rejected": -550.7469482421875, + "loss": 0.0183, + "rewards/chosen": 3.559735059738159, + "rewards/margins": 13.399868726730347, + "rewards/rejected": -9.840133666992188, + "step": 4008 + }, + { + "epoch": 0.36628597533120144, + "grad_norm": 5.03125, + "kl": 0.0, + "learning_rate": 7.057080375255885e-06, + "logits/chosen": 991353920.0, + "logits/rejected": 793199808.0, + "logps/chosen": -492.79345703125, + "logps/rejected": -345.55499267578125, + "loss": 0.0248, + "rewards/chosen": 3.487454891204834, + "rewards/margins": 11.016758441925049, + "rewards/rejected": -7.529303550720215, + "step": 4009 + }, + { + "epoch": 0.3663773412517131, + "grad_norm": 0.546875, + "kl": 0.0, + "learning_rate": 7.055769810940871e-06, + "logits/chosen": 341409877.3333333, + "logits/rejected": 250664089.6, + "logps/chosen": -314.99444580078125, + "logps/rejected": -382.051416015625, + "loss": 0.003, + "rewards/chosen": 4.934879302978516, + "rewards/margins": 13.801793670654297, + "rewards/rejected": -8.866914367675781, + "step": 4010 + }, + { + "epoch": 0.36646870717222474, + "grad_norm": 4.75, + "kl": 0.0, + "learning_rate": 7.054459076632742e-06, + "logits/chosen": 600299434.6666666, + "logits/rejected": 515015782.4, + "logps/chosen": -231.474365234375, + "logps/rejected": -523.740234375, + "loss": 0.0189, + "rewards/chosen": 4.276098887125651, + "rewards/margins": 14.289839426676433, + "rewards/rejected": -10.013740539550781, + "step": 4011 + }, + { + "epoch": 0.3665600730927364, + "grad_norm": 2.953125, + "kl": 0.0, + "learning_rate": 7.053148172439881e-06, + "logits/chosen": 678995353.6, + "logits/rejected": 788204202.6666666, + "logps/chosen": -381.2430419921875, + "logps/rejected": -586.78466796875, + "loss": 0.0184, + "rewards/chosen": 3.9258148193359377, + "rewards/margins": 14.403522364298503, + "rewards/rejected": -10.477707544962565, + "step": 4012 + }, + { + "epoch": 0.36665143901324804, + "grad_norm": 10.5, + "kl": 0.0, + "learning_rate": 7.051837098470691e-06, + "logits/chosen": 507556821.3333333, + "logits/rejected": 940428416.0, + "logps/chosen": -459.3861490885417, + "logps/rejected": -656.6183471679688, + "loss": 0.0182, + "rewards/chosen": 4.3771413167317705, + "rewards/margins": 12.242763360341389, + "rewards/rejected": -7.865622043609619, + "step": 4013 + }, + { + "epoch": 0.3667428049337597, + "grad_norm": 0.6484375, + "kl": 0.0, + "learning_rate": 7.050525854833582e-06, + "logits/chosen": 985283200.0, + "logits/rejected": 536438820.5714286, + "logps/chosen": -754.3847045898438, + "logps/rejected": -552.1171875, + "loss": 0.0026, + "rewards/chosen": 4.157080173492432, + "rewards/margins": 13.841109888894218, + "rewards/rejected": -9.684029715401786, + "step": 4014 + }, + { + "epoch": 0.36683417085427134, + "grad_norm": 3.625, + "kl": 0.0, + "learning_rate": 7.049214441636984e-06, + "logits/chosen": 826668373.3333334, + "logits/rejected": 366900352.0, + "logps/chosen": -470.3468424479167, + "logps/rejected": -312.3070068359375, + "loss": 0.023, + "rewards/chosen": 3.639706293741862, + "rewards/margins": 12.47443930308024, + "rewards/rejected": -8.834733009338379, + "step": 4015 + }, + { + "epoch": 0.366925536774783, + "grad_norm": 82.0, + "kl": 0.0, + "learning_rate": 7.047902858989337e-06, + "logits/chosen": 564815433.1428572, + "logits/rejected": 402249856.0, + "logps/chosen": -313.22970145089283, + "logps/rejected": -385.94293212890625, + "loss": 0.0766, + "rewards/chosen": 2.931849343436105, + "rewards/margins": 12.653144700186594, + "rewards/rejected": -9.721295356750488, + "step": 4016 + }, + { + "epoch": 0.36701690269529463, + "grad_norm": 1.75, + "kl": 0.0, + "learning_rate": 7.046591106999099e-06, + "logits/chosen": 382356505.6, + "logits/rejected": 489581013.3333333, + "logps/chosen": -200.5326171875, + "logps/rejected": -526.7086995442709, + "loss": 0.0262, + "rewards/chosen": 3.8451751708984374, + "rewards/margins": 11.16718266805013, + "rewards/rejected": -7.322007497151692, + "step": 4017 + }, + { + "epoch": 0.3671082686158063, + "grad_norm": 2.390625, + "kl": 0.0, + "learning_rate": 7.045279185774739e-06, + "logits/chosen": 406023200.0, + "logits/rejected": 414817728.0, + "logps/chosen": -355.56756591796875, + "logps/rejected": -814.4451293945312, + "loss": 0.0129, + "rewards/chosen": 4.30857515335083, + "rewards/margins": 17.281668186187744, + "rewards/rejected": -12.973093032836914, + "step": 4018 + }, + { + "epoch": 0.36719963453631793, + "grad_norm": 3.796875, + "kl": 0.0, + "learning_rate": 7.04396709542474e-06, + "logits/chosen": 579126528.0, + "logits/rejected": 466331084.8, + "logps/chosen": -414.130859375, + "logps/rejected": -394.741748046875, + "loss": 0.0275, + "rewards/chosen": 2.8242292404174805, + "rewards/margins": 11.584737968444824, + "rewards/rejected": -8.760508728027343, + "step": 4019 + }, + { + "epoch": 0.3672910004568296, + "grad_norm": 1.703125, + "kl": 0.0, + "learning_rate": 7.042654836057599e-06, + "logits/chosen": 614950442.6666666, + "logits/rejected": 330574182.4, + "logps/chosen": -451.1561279296875, + "logps/rejected": -489.26552734375, + "loss": 0.0085, + "rewards/chosen": 3.898522694905599, + "rewards/margins": 14.0643185933431, + "rewards/rejected": -10.1657958984375, + "step": 4020 + }, + { + "epoch": 0.36738236637734123, + "grad_norm": 35.75, + "kl": 0.0, + "learning_rate": 7.04134240778183e-06, + "logits/chosen": 585139814.4, + "logits/rejected": 461552213.3333333, + "logps/chosen": -416.534716796875, + "logps/rejected": -432.5340983072917, + "loss": 0.0458, + "rewards/chosen": 3.1465253829956055, + "rewards/margins": 8.27332083384196, + "rewards/rejected": -5.1267954508463545, + "step": 4021 + }, + { + "epoch": 0.3674737322978529, + "grad_norm": 1.6875, + "kl": 0.0, + "learning_rate": 7.040029810705956e-06, + "logits/chosen": 439324704.0, + "logits/rejected": 530829280.0, + "logps/chosen": -285.49261474609375, + "logps/rejected": -637.031005859375, + "loss": 0.0128, + "rewards/chosen": 3.869656562805176, + "rewards/margins": 13.224365234375, + "rewards/rejected": -9.354708671569824, + "step": 4022 + }, + { + "epoch": 0.3675650982183645, + "grad_norm": 2.34375, + "kl": 0.0, + "learning_rate": 7.038717044938519e-06, + "logits/chosen": 411126848.0, + "logits/rejected": 404448288.0, + "logps/chosen": -288.5695495605469, + "logps/rejected": -645.7338256835938, + "loss": 0.0159, + "rewards/chosen": 3.9028940200805664, + "rewards/margins": 15.227062225341797, + "rewards/rejected": -11.32416820526123, + "step": 4023 + }, + { + "epoch": 0.3676564641388762, + "grad_norm": 1.3984375, + "kl": 0.0, + "learning_rate": 7.037404110588071e-06, + "logits/chosen": 387662656.0, + "logits/rejected": 440366208.0, + "logps/chosen": -219.26559448242188, + "logps/rejected": -445.8433837890625, + "loss": 0.01, + "rewards/chosen": 4.526506423950195, + "rewards/margins": 12.789144515991211, + "rewards/rejected": -8.262638092041016, + "step": 4024 + }, + { + "epoch": 0.3677478300593878, + "grad_norm": 3.328125, + "kl": 0.0, + "learning_rate": 7.03609100776318e-06, + "logits/chosen": 424491161.6, + "logits/rejected": 474870058.6666667, + "logps/chosen": -371.6886474609375, + "logps/rejected": -540.6346842447916, + "loss": 0.0179, + "rewards/chosen": 3.7589900970458983, + "rewards/margins": 12.906409072875977, + "rewards/rejected": -9.147418975830078, + "step": 4025 + }, + { + "epoch": 0.3678391959798995, + "grad_norm": 51.25, + "kl": 0.0, + "learning_rate": 7.034777736572428e-06, + "logits/chosen": 592456874.6666666, + "logits/rejected": 515149977.6, + "logps/chosen": -488.3731282552083, + "logps/rejected": -521.686865234375, + "loss": 0.0754, + "rewards/chosen": 3.8897666931152344, + "rewards/margins": 11.322933197021484, + "rewards/rejected": -7.43316650390625, + "step": 4026 + }, + { + "epoch": 0.3679305619004111, + "grad_norm": 1.1015625, + "kl": 0.0, + "learning_rate": 7.033464297124408e-06, + "logits/chosen": 469819200.0, + "logits/rejected": 387242898.28571427, + "logps/chosen": -314.8192138671875, + "logps/rejected": -511.8732212611607, + "loss": 0.0045, + "rewards/chosen": 3.323474168777466, + "rewards/margins": 13.766757249832153, + "rewards/rejected": -10.443283081054688, + "step": 4027 + }, + { + "epoch": 0.3680219278209228, + "grad_norm": 2.921875, + "kl": 0.0, + "learning_rate": 7.032150689527733e-06, + "logits/chosen": 458262016.0, + "logits/rejected": 395248426.6666667, + "logps/chosen": -237.19873046875, + "logps/rejected": -561.6118570963541, + "loss": 0.0198, + "rewards/chosen": 4.13465576171875, + "rewards/margins": 15.738294728597005, + "rewards/rejected": -11.603638966878256, + "step": 4028 + }, + { + "epoch": 0.3681132937414344, + "grad_norm": 2.890625, + "kl": 0.0, + "learning_rate": 7.030836913891024e-06, + "logits/chosen": 424778265.6, + "logits/rejected": 573039061.3333334, + "logps/chosen": -239.9576171875, + "logps/rejected": -493.6819661458333, + "loss": 0.022, + "rewards/chosen": 3.8179443359375, + "rewards/margins": 13.260239410400391, + "rewards/rejected": -9.44229507446289, + "step": 4029 + }, + { + "epoch": 0.3682046596619461, + "grad_norm": 3.0625, + "kl": 0.0, + "learning_rate": 7.029522970322917e-06, + "logits/chosen": 438600960.0, + "logits/rejected": 563280896.0, + "logps/chosen": -297.1455383300781, + "logps/rejected": -374.10235595703125, + "loss": 0.0175, + "rewards/chosen": 3.4592795372009277, + "rewards/margins": 12.18227243423462, + "rewards/rejected": -8.722992897033691, + "step": 4030 + }, + { + "epoch": 0.3682960255824577, + "grad_norm": 1.015625, + "kl": 0.0, + "learning_rate": 7.028208858932066e-06, + "logits/chosen": 349697664.0, + "logits/rejected": 761951829.3333334, + "logps/chosen": -249.34584045410156, + "logps/rejected": -611.378173828125, + "loss": 0.003, + "rewards/chosen": 5.550567626953125, + "rewards/margins": 16.073381423950195, + "rewards/rejected": -10.52281379699707, + "step": 4031 + }, + { + "epoch": 0.3683873915029694, + "grad_norm": 2.484375, + "kl": 0.0, + "learning_rate": 7.026894579827135e-06, + "logits/chosen": 437888426.6666667, + "logits/rejected": 257297280.0, + "logps/chosen": -267.39699300130206, + "logps/rejected": -368.3528564453125, + "loss": 0.0176, + "rewards/chosen": 3.0451930363972983, + "rewards/margins": 13.037273724873861, + "rewards/rejected": -9.992080688476562, + "step": 4032 + }, + { + "epoch": 0.368478757423481, + "grad_norm": 4.75, + "kl": 0.0, + "learning_rate": 7.025580133116799e-06, + "logits/chosen": 719396480.0, + "logits/rejected": 804058240.0, + "logps/chosen": -310.187744140625, + "logps/rejected": -581.641357421875, + "loss": 0.0195, + "rewards/chosen": 3.7662386894226074, + "rewards/margins": 12.291079044342041, + "rewards/rejected": -8.524840354919434, + "step": 4033 + }, + { + "epoch": 0.3685701233439927, + "grad_norm": 1.640625, + "kl": 0.0, + "learning_rate": 7.024265518909755e-06, + "logits/chosen": 381527456.0, + "logits/rejected": 512380608.0, + "logps/chosen": -379.97674560546875, + "logps/rejected": -686.498046875, + "loss": 0.0109, + "rewards/chosen": 3.866342544555664, + "rewards/margins": 14.808279991149902, + "rewards/rejected": -10.941937446594238, + "step": 4034 + }, + { + "epoch": 0.3686614892645043, + "grad_norm": 3.890625, + "kl": 0.0, + "learning_rate": 7.022950737314707e-06, + "logits/chosen": 515925760.0, + "logits/rejected": 521071104.0, + "logps/chosen": -347.871337890625, + "logps/rejected": -820.9759928385416, + "loss": 0.0238, + "rewards/chosen": 3.581622314453125, + "rewards/margins": 15.906544240315757, + "rewards/rejected": -12.32492192586263, + "step": 4035 + }, + { + "epoch": 0.368752855185016, + "grad_norm": 34.5, + "kl": 0.0, + "learning_rate": 7.021635788440377e-06, + "logits/chosen": 554543808.0, + "logits/rejected": 623460096.0, + "logps/chosen": -352.7615966796875, + "logps/rejected": -475.0503845214844, + "loss": 0.0869, + "rewards/chosen": 2.6065804958343506, + "rewards/margins": 13.147394895553589, + "rewards/rejected": -10.540814399719238, + "step": 4036 + }, + { + "epoch": 0.3688442211055276, + "grad_norm": 2.78125, + "kl": 0.0, + "learning_rate": 7.0203206723954974e-06, + "logits/chosen": 394091712.0, + "logits/rejected": 394469120.0, + "logps/chosen": -250.95811462402344, + "logps/rejected": -542.5769653320312, + "loss": 0.0221, + "rewards/chosen": 3.8446483612060547, + "rewards/margins": 15.835672378540039, + "rewards/rejected": -11.991024017333984, + "step": 4037 + }, + { + "epoch": 0.3689355870260393, + "grad_norm": 7.71875, + "kl": 0.0, + "learning_rate": 7.019005389288818e-06, + "logits/chosen": 652202432.0, + "logits/rejected": 647816362.6666666, + "logps/chosen": -469.90496826171875, + "logps/rejected": -389.26708984375, + "loss": 0.023, + "rewards/chosen": 2.9040908813476562, + "rewards/margins": 10.837501525878906, + "rewards/rejected": -7.93341064453125, + "step": 4038 + }, + { + "epoch": 0.3690269529465509, + "grad_norm": 1.5859375, + "kl": 0.0, + "learning_rate": 7.017689939229098e-06, + "logits/chosen": 424545408.0, + "logits/rejected": 516954208.0, + "logps/chosen": -335.9462890625, + "logps/rejected": -476.4100036621094, + "loss": 0.0089, + "rewards/chosen": 4.439635276794434, + "rewards/margins": 12.311574935913086, + "rewards/rejected": -7.871939659118652, + "step": 4039 + }, + { + "epoch": 0.3691183188670626, + "grad_norm": 1.0, + "kl": 0.0, + "learning_rate": 7.016374322325116e-06, + "logits/chosen": 452034901.3333333, + "logits/rejected": 603800166.4, + "logps/chosen": -269.3166097005208, + "logps/rejected": -644.570703125, + "loss": 0.0053, + "rewards/chosen": 4.263886451721191, + "rewards/margins": 13.986429786682129, + "rewards/rejected": -9.722543334960937, + "step": 4040 + }, + { + "epoch": 0.3692096847875742, + "grad_norm": 7.78125, + "kl": 0.0, + "learning_rate": 7.015058538685658e-06, + "logits/chosen": 871601971.2, + "logits/rejected": 476466005.3333333, + "logps/chosen": -470.0904296875, + "logps/rejected": -583.7013346354166, + "loss": 0.0406, + "rewards/chosen": 3.106697845458984, + "rewards/margins": 12.84589080810547, + "rewards/rejected": -9.739192962646484, + "step": 4041 + }, + { + "epoch": 0.3693010507080859, + "grad_norm": 13.875, + "kl": 8.306819915771484, + "learning_rate": 7.01374258841953e-06, + "logits/chosen": 666181705.1428572, + "logits/rejected": 504695872.0, + "logps/chosen": -469.55995396205356, + "logps/rejected": -455.9174499511719, + "loss": 0.1285, + "rewards/chosen": 2.8642234802246094, + "rewards/margins": 12.16662883758545, + "rewards/rejected": -9.30240535736084, + "step": 4042 + }, + { + "epoch": 0.3693924166285975, + "grad_norm": 3.25, + "kl": 0.0, + "learning_rate": 7.0124264716355474e-06, + "logits/chosen": 432866816.0, + "logits/rejected": 294564992.0, + "logps/chosen": -368.4157958984375, + "logps/rejected": -320.349609375, + "loss": 0.1333, + "rewards/chosen": 2.4914344787597655, + "rewards/margins": 10.862244669596354, + "rewards/rejected": -8.370810190836588, + "step": 4043 + }, + { + "epoch": 0.3694837825491092, + "grad_norm": 2.625, + "kl": 0.0, + "learning_rate": 7.011110188442541e-06, + "logits/chosen": 452657216.0, + "logits/rejected": 494745600.0, + "logps/chosen": -189.4274139404297, + "logps/rejected": -482.60595703125, + "loss": 0.02, + "rewards/chosen": 3.8342316150665283, + "rewards/margins": 12.119421243667603, + "rewards/rejected": -8.285189628601074, + "step": 4044 + }, + { + "epoch": 0.3695751484696208, + "grad_norm": 2.328125, + "kl": 0.0, + "learning_rate": 7.0097937389493555e-06, + "logits/chosen": 536741312.0, + "logits/rejected": 458778624.0, + "logps/chosen": -292.1031494140625, + "logps/rejected": -460.79736328125, + "loss": 0.0171, + "rewards/chosen": 3.440678596496582, + "rewards/margins": 12.436575889587402, + "rewards/rejected": -8.99589729309082, + "step": 4045 + }, + { + "epoch": 0.3696665143901325, + "grad_norm": 4.59375, + "kl": 0.0, + "learning_rate": 7.008477123264849e-06, + "logits/chosen": 814937600.0, + "logits/rejected": 1247927552.0, + "logps/chosen": -342.30926513671875, + "logps/rejected": -520.613525390625, + "loss": 0.0321, + "rewards/chosen": 2.7381091117858887, + "rewards/margins": 11.839818477630615, + "rewards/rejected": -9.101709365844727, + "step": 4046 + }, + { + "epoch": 0.3697578803106441, + "grad_norm": 47.25, + "kl": 0.0, + "learning_rate": 7.007160341497893e-06, + "logits/chosen": 384147029.3333333, + "logits/rejected": 744209766.4, + "logps/chosen": -191.56058756510416, + "logps/rejected": -632.77041015625, + "loss": 0.0852, + "rewards/chosen": 2.6288676261901855, + "rewards/margins": 12.170366191864014, + "rewards/rejected": -9.541498565673828, + "step": 4047 + }, + { + "epoch": 0.3698492462311558, + "grad_norm": 1.1015625, + "kl": 0.0, + "learning_rate": 7.005843393757372e-06, + "logits/chosen": 506686080.0, + "logits/rejected": 318764192.0, + "logps/chosen": -246.21005249023438, + "logps/rejected": -261.22650146484375, + "loss": 0.0084, + "rewards/chosen": 4.472720146179199, + "rewards/margins": 11.30128288269043, + "rewards/rejected": -6.8285627365112305, + "step": 4048 + }, + { + "epoch": 0.3699406121516674, + "grad_norm": 1.8984375, + "kl": 0.0, + "learning_rate": 7.00452628015219e-06, + "logits/chosen": 418836864.0, + "logits/rejected": 507393536.0, + "logps/chosen": -246.1425984700521, + "logps/rejected": -574.91376953125, + "loss": 0.0132, + "rewards/chosen": 3.407158533732096, + "rewards/margins": 12.188974634806314, + "rewards/rejected": -8.781816101074218, + "step": 4049 + }, + { + "epoch": 0.3700319780721791, + "grad_norm": 2.15625, + "kl": 0.0, + "learning_rate": 7.003209000791255e-06, + "logits/chosen": 890380390.4, + "logits/rejected": 563433728.0, + "logps/chosen": -253.9204833984375, + "logps/rejected": -452.5470784505208, + "loss": 0.0155, + "rewards/chosen": 4.385169219970703, + "rewards/margins": 11.449706904093425, + "rewards/rejected": -7.064537684122722, + "step": 4050 + }, + { + "epoch": 0.3701233439926907, + "grad_norm": 3.5625, + "kl": 0.0, + "learning_rate": 7.001891555783497e-06, + "logits/chosen": 257959637.33333334, + "logits/rejected": 431011328.0, + "logps/chosen": -182.21488444010416, + "logps/rejected": -385.960498046875, + "loss": 0.0165, + "rewards/chosen": 4.128513336181641, + "rewards/margins": 11.526155853271485, + "rewards/rejected": -7.397642517089844, + "step": 4051 + }, + { + "epoch": 0.3702147099132024, + "grad_norm": 3.6875, + "kl": 0.0, + "learning_rate": 7.000573945237853e-06, + "logits/chosen": 482352025.6, + "logits/rejected": 614739626.6666666, + "logps/chosen": -321.876904296875, + "logps/rejected": -612.6185302734375, + "loss": 0.147, + "rewards/chosen": 1.7380382537841796, + "rewards/margins": 11.000988515218099, + "rewards/rejected": -9.26295026143392, + "step": 4052 + }, + { + "epoch": 0.370306075833714, + "grad_norm": 0.58203125, + "kl": 0.0, + "learning_rate": 6.999256169263282e-06, + "logits/chosen": 306453077.3333333, + "logits/rejected": 432580608.0, + "logps/chosen": -200.5239461263021, + "logps/rejected": -389.059130859375, + "loss": 0.0042, + "rewards/chosen": 4.640159924825032, + "rewards/margins": 12.202099927266438, + "rewards/rejected": -7.561940002441406, + "step": 4053 + }, + { + "epoch": 0.3703974417542257, + "grad_norm": 0.8828125, + "kl": 0.0, + "learning_rate": 6.997938227968747e-06, + "logits/chosen": 591037909.3333334, + "logits/rejected": 434535526.4, + "logps/chosen": -311.1540934244792, + "logps/rejected": -696.065771484375, + "loss": 0.0056, + "rewards/chosen": 4.512733459472656, + "rewards/margins": 13.492455291748048, + "rewards/rejected": -8.979721832275391, + "step": 4054 + }, + { + "epoch": 0.3704888076747373, + "grad_norm": 6.25, + "kl": 11.098735809326172, + "learning_rate": 6.9966201214632335e-06, + "logits/chosen": 627844352.0, + "logps/chosen": -290.0552978515625, + "loss": 0.0462, + "rewards/chosen": 4.662961006164551, + "step": 4055 + }, + { + "epoch": 0.370580173595249, + "grad_norm": 3.328125, + "kl": 4.06646728515625, + "learning_rate": 6.9953018498557345e-06, + "logits/chosen": 534029860.5714286, + "logits/rejected": 500575424.0, + "logps/chosen": -393.25655691964283, + "logps/rejected": -527.78759765625, + "loss": 0.0227, + "rewards/chosen": 4.295560564313616, + "rewards/margins": 14.431998934064593, + "rewards/rejected": -10.136438369750977, + "step": 4056 + }, + { + "epoch": 0.3706715395157606, + "grad_norm": 2.015625, + "kl": 0.0, + "learning_rate": 6.9939834132552595e-06, + "logits/chosen": 552613120.0, + "logits/rejected": 456892128.0, + "logps/chosen": -398.1683044433594, + "logps/rejected": -453.31707763671875, + "loss": 0.014, + "rewards/chosen": 4.152582168579102, + "rewards/margins": 11.60701847076416, + "rewards/rejected": -7.454436302185059, + "step": 4057 + }, + { + "epoch": 0.37076290543627227, + "grad_norm": 14.5625, + "kl": 0.0, + "learning_rate": 6.992664811770832e-06, + "logits/chosen": 751408981.3333334, + "logits/rejected": 394204224.0, + "logps/chosen": -338.57177734375, + "logps/rejected": -418.4685974121094, + "loss": 0.1633, + "rewards/chosen": 2.0623346964518228, + "rewards/margins": 10.803311030069986, + "rewards/rejected": -8.740976333618164, + "step": 4058 + }, + { + "epoch": 0.3708542713567839, + "grad_norm": 4.4375, + "kl": 0.0, + "learning_rate": 6.991346045511487e-06, + "logits/chosen": 646140672.0, + "logits/rejected": 490300608.0, + "logps/chosen": -504.1751403808594, + "logps/rejected": -461.0527038574219, + "loss": 0.0213, + "rewards/chosen": 3.4140777587890625, + "rewards/margins": 13.52580451965332, + "rewards/rejected": -10.111726760864258, + "step": 4059 + }, + { + "epoch": 0.37094563727729557, + "grad_norm": 2.453125, + "kl": 0.0, + "learning_rate": 6.990027114586272e-06, + "logits/chosen": 523702732.8, + "logits/rejected": 206757632.0, + "logps/chosen": -354.534375, + "logps/rejected": -192.1122843424479, + "loss": 0.1339, + "rewards/chosen": 2.350691223144531, + "rewards/margins": 8.859620793660481, + "rewards/rejected": -6.50892957051595, + "step": 4060 + }, + { + "epoch": 0.3710370031978072, + "grad_norm": 5.40625, + "kl": 0.0, + "learning_rate": 6.988708019104255e-06, + "logits/chosen": 452499552.0, + "logits/rejected": 363669504.0, + "logps/chosen": -334.2120056152344, + "logps/rejected": -616.4290771484375, + "loss": 0.0393, + "rewards/chosen": 2.6228184700012207, + "rewards/margins": 12.593897342681885, + "rewards/rejected": -9.971078872680664, + "step": 4061 + }, + { + "epoch": 0.37112836911831887, + "grad_norm": 42.25, + "kl": 0.0, + "learning_rate": 6.987388759174509e-06, + "logits/chosen": 695553728.0, + "logits/rejected": 383577344.0, + "logps/chosen": -332.1477966308594, + "logps/rejected": -478.8951822916667, + "loss": 0.0421, + "rewards/chosen": 2.328486680984497, + "rewards/margins": 12.195562283198038, + "rewards/rejected": -9.867075602213541, + "step": 4062 + }, + { + "epoch": 0.3712197350388305, + "grad_norm": 3.78125, + "kl": 0.0, + "learning_rate": 6.986069334906126e-06, + "logits/chosen": 577910528.0, + "logits/rejected": 773749376.0, + "logps/chosen": -334.70111083984375, + "logps/rejected": -566.0531616210938, + "loss": 0.0243, + "rewards/chosen": 3.177614688873291, + "rewards/margins": 13.07832384109497, + "rewards/rejected": -9.90070915222168, + "step": 4063 + }, + { + "epoch": 0.37131110095934217, + "grad_norm": 59.5, + "kl": 0.0, + "learning_rate": 6.9847497464082115e-06, + "logits/chosen": 577571737.6, + "logits/rejected": 430296917.3333333, + "logps/chosen": -323.90302734375, + "logps/rejected": -277.7537434895833, + "loss": 0.103, + "rewards/chosen": 3.0248064041137694, + "rewards/margins": 11.441474978129069, + "rewards/rejected": -8.416668574015299, + "step": 4064 + }, + { + "epoch": 0.3714024668798538, + "grad_norm": 3.109375, + "kl": 0.0, + "learning_rate": 6.983429993789881e-06, + "logits/chosen": 647545344.0, + "logits/rejected": 563417307.4285715, + "logps/chosen": -330.12139892578125, + "logps/rejected": -472.81033761160717, + "loss": 0.0109, + "rewards/chosen": 2.4143950939178467, + "rewards/margins": 11.591284785951887, + "rewards/rejected": -9.17688969203404, + "step": 4065 + }, + { + "epoch": 0.37149383280036546, + "grad_norm": 3.8125, + "kl": 0.0, + "learning_rate": 6.982110077160268e-06, + "logits/chosen": 437685824.0, + "logits/rejected": 418565600.0, + "logps/chosen": -270.125, + "logps/rejected": -541.2286376953125, + "loss": 0.0271, + "rewards/chosen": 3.1146304607391357, + "rewards/margins": 12.221177816390991, + "rewards/rejected": -9.106547355651855, + "step": 4066 + }, + { + "epoch": 0.3715851987208771, + "grad_norm": 2.609375, + "kl": 0.0, + "learning_rate": 6.980789996628515e-06, + "logits/chosen": 366101536.0, + "logits/rejected": 674634240.0, + "logps/chosen": -315.8408203125, + "logps/rejected": -731.5573120117188, + "loss": 0.0135, + "rewards/chosen": 4.2912797927856445, + "rewards/margins": 13.210907936096191, + "rewards/rejected": -8.919628143310547, + "step": 4067 + }, + { + "epoch": 0.37167656464138876, + "grad_norm": 1.3359375, + "kl": 0.0, + "learning_rate": 6.979469752303783e-06, + "logits/chosen": 488572032.0, + "logits/rejected": 416916480.0, + "logps/chosen": -303.7012939453125, + "logps/rejected": -397.820556640625, + "loss": 0.0062, + "rewards/chosen": 3.075061082839966, + "rewards/margins": 10.94811122758048, + "rewards/rejected": -7.873050144740513, + "step": 4068 + }, + { + "epoch": 0.3717679305619004, + "grad_norm": 17.375, + "kl": 0.0, + "learning_rate": 6.978149344295242e-06, + "logits/chosen": 519399424.0, + "logits/rejected": 443282636.8, + "logps/chosen": -345.5895182291667, + "logps/rejected": -307.124365234375, + "loss": 0.0243, + "rewards/chosen": 2.869379679361979, + "rewards/margins": 11.762430826822916, + "rewards/rejected": -8.893051147460938, + "step": 4069 + }, + { + "epoch": 0.37185929648241206, + "grad_norm": 0.8984375, + "kl": 0.0, + "learning_rate": 6.976828772712079e-06, + "logits/chosen": 540518144.0, + "logits/rejected": 505308876.8, + "logps/chosen": -323.6169026692708, + "logps/rejected": -425.4203125, + "loss": 0.0035, + "rewards/chosen": 5.070759137471517, + "rewards/margins": 14.417020734151205, + "rewards/rejected": -9.346261596679687, + "step": 4070 + }, + { + "epoch": 0.3719506624029237, + "grad_norm": 3.71875, + "kl": 0.0, + "learning_rate": 6.97550803766349e-06, + "logits/chosen": 956488618.6666666, + "logits/rejected": 630940160.0, + "logps/chosen": -610.7442626953125, + "logps/rejected": -517.93935546875, + "loss": 0.012, + "rewards/chosen": 4.241825421651204, + "rewards/margins": 12.77910722096761, + "rewards/rejected": -8.537281799316407, + "step": 4071 + }, + { + "epoch": 0.37204202832343536, + "grad_norm": 1.7109375, + "kl": 0.0, + "learning_rate": 6.974187139258693e-06, + "logits/chosen": 524589738.6666667, + "logits/rejected": 1001414656.0, + "logps/chosen": -241.5599568684896, + "logps/rejected": -590.6474609375, + "loss": 0.0091, + "rewards/chosen": 4.129473050435384, + "rewards/margins": 12.523917325337727, + "rewards/rejected": -8.394444274902344, + "step": 4072 + }, + { + "epoch": 0.37213339424394704, + "grad_norm": 1.8828125, + "kl": 0.0, + "learning_rate": 6.972866077606909e-06, + "logits/chosen": 489814869.3333333, + "logits/rejected": 443477964.8, + "logps/chosen": -282.79746500651044, + "logps/rejected": -375.6846923828125, + "loss": 0.0172, + "rewards/chosen": 3.18851629892985, + "rewards/margins": 12.31412893931071, + "rewards/rejected": -9.12561264038086, + "step": 4073 + }, + { + "epoch": 0.37222476016445866, + "grad_norm": 0.6015625, + "kl": 0.0, + "learning_rate": 6.97154485281738e-06, + "logits/chosen": 852270080.0, + "logits/rejected": 519180970.6666667, + "logps/chosen": -335.15447998046875, + "logps/rejected": -470.8539225260417, + "loss": 0.0036, + "rewards/chosen": 4.373735427856445, + "rewards/margins": 13.10520108540853, + "rewards/rejected": -8.731465657552084, + "step": 4074 + }, + { + "epoch": 0.37231612608497033, + "grad_norm": 2.78125, + "kl": 0.0, + "learning_rate": 6.970223464999358e-06, + "logits/chosen": 437631658.6666667, + "logits/rejected": 156855984.0, + "logps/chosen": -333.5445963541667, + "logps/rejected": -180.41754150390625, + "loss": 0.0228, + "rewards/chosen": 3.5652221043904624, + "rewards/margins": 9.53385623296102, + "rewards/rejected": -5.968634128570557, + "step": 4075 + }, + { + "epoch": 0.37240749200548195, + "grad_norm": 5.96875, + "kl": 0.0, + "learning_rate": 6.9689019142621095e-06, + "logits/chosen": 502889164.8, + "logits/rejected": 538151424.0, + "logps/chosen": -328.369482421875, + "logps/rejected": -511.3856201171875, + "loss": 0.0437, + "rewards/chosen": 3.0819772720336913, + "rewards/margins": 12.117108090718588, + "rewards/rejected": -9.035130818684896, + "step": 4076 + }, + { + "epoch": 0.37249885792599363, + "grad_norm": 0.85546875, + "kl": 0.0, + "learning_rate": 6.9675802007149155e-06, + "logits/chosen": 366819040.0, + "logits/rejected": 563851605.3333334, + "logps/chosen": -214.66085815429688, + "logps/rejected": -465.9630533854167, + "loss": 0.007, + "rewards/chosen": 3.6167688369750977, + "rewards/margins": 12.960651079813639, + "rewards/rejected": -9.343882242838541, + "step": 4077 + }, + { + "epoch": 0.37259022384650525, + "grad_norm": 0.376953125, + "kl": 0.0, + "learning_rate": 6.966258324467069e-06, + "logits/chosen": 378155264.0, + "logits/rejected": 458463573.3333333, + "logps/chosen": -239.63958740234375, + "logps/rejected": -482.1740315755208, + "loss": 0.0022, + "rewards/chosen": 4.816314697265625, + "rewards/margins": 14.339049657185873, + "rewards/rejected": -9.522734959920248, + "step": 4078 + }, + { + "epoch": 0.37268158976701693, + "grad_norm": 2.96875, + "kl": 0.0, + "learning_rate": 6.964936285627877e-06, + "logits/chosen": 624835379.2, + "logits/rejected": 1296498176.0, + "logps/chosen": -363.3140625, + "logps/rejected": -649.5689290364584, + "loss": 0.0125, + "rewards/chosen": 4.660498046875, + "rewards/margins": 14.98390998840332, + "rewards/rejected": -10.32341194152832, + "step": 4079 + }, + { + "epoch": 0.37277295568752855, + "grad_norm": 5.09375, + "kl": 0.0, + "learning_rate": 6.963614084306659e-06, + "logits/chosen": 672523776.0, + "logits/rejected": 252552085.33333334, + "logps/chosen": -383.93681640625, + "logps/rejected": -319.9571533203125, + "loss": 0.0317, + "rewards/chosen": 3.1799530029296874, + "rewards/margins": 10.411322021484375, + "rewards/rejected": -7.2313690185546875, + "step": 4080 + }, + { + "epoch": 0.37286432160804023, + "grad_norm": 11.625, + "kl": 14.62924575805664, + "learning_rate": 6.96229172061275e-06, + "logits/chosen": 490439936.0, + "logps/chosen": -360.1082763671875, + "loss": 0.1359, + "rewards/chosen": 3.65651273727417, + "step": 4081 + }, + { + "epoch": 0.37295568752855185, + "grad_norm": 3.5, + "kl": 0.0, + "learning_rate": 6.960969194655496e-06, + "logits/chosen": 233462476.8, + "logits/rejected": 274596736.0, + "logps/chosen": -144.843408203125, + "logps/rejected": -191.27632649739584, + "loss": 0.0261, + "rewards/chosen": 4.396509170532227, + "rewards/margins": 11.94318135579427, + "rewards/rejected": -7.546672185262044, + "step": 4082 + }, + { + "epoch": 0.3730470534490635, + "grad_norm": 9.875, + "kl": 0.0, + "learning_rate": 6.9596465065442595e-06, + "logits/chosen": 655436032.0, + "logits/rejected": 385991338.6666667, + "logps/chosen": -326.571484375, + "logps/rejected": -417.9774169921875, + "loss": 0.0559, + "rewards/chosen": 2.9279251098632812, + "rewards/margins": 9.262827555338543, + "rewards/rejected": -6.334902445475261, + "step": 4083 + }, + { + "epoch": 0.37313841936957515, + "grad_norm": 3.84375, + "kl": 0.0, + "learning_rate": 6.958323656388413e-06, + "logits/chosen": 284808362.6666667, + "logits/rejected": 497740768.0, + "logps/chosen": -273.3852945963542, + "logps/rejected": -892.06298828125, + "loss": 0.0307, + "rewards/chosen": 3.4590282440185547, + "rewards/margins": 15.106232643127441, + "rewards/rejected": -11.647204399108887, + "step": 4084 + }, + { + "epoch": 0.3732297852900868, + "grad_norm": 3.25, + "kl": 0.0, + "learning_rate": 6.957000644297346e-06, + "logits/chosen": 839079296.0, + "logits/rejected": 696892288.0, + "logps/chosen": -317.72625732421875, + "logps/rejected": -441.06927490234375, + "loss": 0.0167, + "rewards/chosen": 3.668482780456543, + "rewards/margins": 12.27265453338623, + "rewards/rejected": -8.604171752929688, + "step": 4085 + }, + { + "epoch": 0.37332115121059845, + "grad_norm": 4.4375, + "kl": 0.0, + "learning_rate": 6.955677470380455e-06, + "logits/chosen": 1088278613.3333333, + "logits/rejected": 366177408.0, + "logps/chosen": -316.1858723958333, + "logps/rejected": -361.22308349609375, + "loss": 0.035, + "rewards/chosen": 3.2187576293945312, + "rewards/margins": 12.652422904968262, + "rewards/rejected": -9.43366527557373, + "step": 4086 + }, + { + "epoch": 0.3734125171311101, + "grad_norm": 3.453125, + "kl": 0.0, + "learning_rate": 6.954354134747159e-06, + "logits/chosen": 715368857.6, + "logits/rejected": 709869866.6666666, + "logps/chosen": -514.77412109375, + "logps/rejected": -659.6027018229166, + "loss": 0.0161, + "rewards/chosen": 4.129439926147461, + "rewards/margins": 15.068717575073242, + "rewards/rejected": -10.939277648925781, + "step": 4087 + }, + { + "epoch": 0.37350388305162174, + "grad_norm": 3.171875, + "kl": 0.0, + "learning_rate": 6.9530306375068825e-06, + "logits/chosen": 766201173.3333334, + "logits/rejected": 448955443.2, + "logps/chosen": -423.2705078125, + "logps/rejected": -597.72607421875, + "loss": 0.0147, + "rewards/chosen": 3.6361653010050454, + "rewards/margins": 14.58663069407145, + "rewards/rejected": -10.950465393066406, + "step": 4088 + }, + { + "epoch": 0.3735952489721334, + "grad_norm": 28.5, + "kl": 0.0, + "learning_rate": 6.951706978769068e-06, + "logits/chosen": 381058144.0, + "logits/rejected": 471031744.0, + "logps/chosen": -249.73133850097656, + "logps/rejected": -467.6312561035156, + "loss": 0.0705, + "rewards/chosen": 3.303638458251953, + "rewards/margins": 12.327041625976562, + "rewards/rejected": -9.02340316772461, + "step": 4089 + }, + { + "epoch": 0.37368661489264504, + "grad_norm": 3.484375, + "kl": 0.0, + "learning_rate": 6.950383158643168e-06, + "logits/chosen": 677957632.0, + "logits/rejected": 478275925.3333333, + "logps/chosen": -295.6120910644531, + "logps/rejected": -666.303955078125, + "loss": 0.0154, + "rewards/chosen": 2.8171303272247314, + "rewards/margins": 12.01426879564921, + "rewards/rejected": -9.197138468424479, + "step": 4090 + }, + { + "epoch": 0.3737779808131567, + "grad_norm": 2.515625, + "kl": 0.0, + "learning_rate": 6.949059177238653e-06, + "logits/chosen": 636960896.0, + "logits/rejected": 584044117.3333334, + "logps/chosen": -332.68218994140625, + "logps/rejected": -438.7265218098958, + "loss": 0.0106, + "rewards/chosen": 3.597548007965088, + "rewards/margins": 11.68007198969523, + "rewards/rejected": -8.082523981730143, + "step": 4091 + }, + { + "epoch": 0.37386934673366834, + "grad_norm": 5.3125, + "kl": 0.0, + "learning_rate": 6.9477350346650016e-06, + "logits/chosen": 567909120.0, + "logits/rejected": 412764416.0, + "logps/chosen": -153.0958251953125, + "logps/rejected": -429.09527587890625, + "loss": 0.0363, + "rewards/chosen": 2.8954215049743652, + "rewards/margins": 12.889970302581787, + "rewards/rejected": -9.994548797607422, + "step": 4092 + }, + { + "epoch": 0.37396071265418, + "grad_norm": 13.375, + "kl": 0.0, + "learning_rate": 6.94641073103171e-06, + "logits/chosen": 459090261.3333333, + "logits/rejected": 489592012.8, + "logps/chosen": -139.95304361979166, + "logps/rejected": -623.01416015625, + "loss": 0.1096, + "rewards/chosen": 1.4352951049804688, + "rewards/margins": 12.584127807617188, + "rewards/rejected": -11.148832702636719, + "step": 4093 + }, + { + "epoch": 0.37405207857469164, + "grad_norm": 0.734375, + "kl": 0.0, + "learning_rate": 6.945086266448284e-06, + "logits/chosen": 856177856.0, + "logits/rejected": 703100928.0, + "logps/chosen": -330.0036315917969, + "logps/rejected": -635.807373046875, + "loss": 0.0036, + "rewards/chosen": 4.607844352722168, + "rewards/margins": 13.29510529836019, + "rewards/rejected": -8.687260945638021, + "step": 4094 + }, + { + "epoch": 0.3741434444952033, + "grad_norm": 2.40625, + "kl": 0.0, + "learning_rate": 6.943761641024245e-06, + "logits/chosen": 687414976.0, + "logits/rejected": 1052272256.0, + "logps/chosen": -364.4603271484375, + "logps/rejected": -567.3016967773438, + "loss": 0.0189, + "rewards/chosen": 3.3035144805908203, + "rewards/margins": 13.580448150634766, + "rewards/rejected": -10.276933670043945, + "step": 4095 + }, + { + "epoch": 0.37423481041571494, + "grad_norm": 1.4296875, + "kl": 0.0, + "learning_rate": 6.942436854869129e-06, + "logits/chosen": 455383072.0, + "logits/rejected": 611901513.1428572, + "logps/chosen": -195.44268798828125, + "logps/rejected": -655.8642578125, + "loss": 0.0061, + "rewards/chosen": 3.126171827316284, + "rewards/margins": 12.017583199909755, + "rewards/rejected": -8.89141137259347, + "step": 4096 + }, + { + "epoch": 0.3743261763362266, + "grad_norm": 3.859375, + "kl": 0.0, + "learning_rate": 6.941111908092481e-06, + "logits/chosen": 526519398.4, + "logits/rejected": 423677440.0, + "logps/chosen": -298.073095703125, + "logps/rejected": -651.7601725260416, + "loss": 0.03, + "rewards/chosen": 3.0745948791503905, + "rewards/margins": 14.576654815673828, + "rewards/rejected": -11.502059936523438, + "step": 4097 + }, + { + "epoch": 0.37441754225673823, + "grad_norm": 1.4765625, + "kl": 0.0, + "learning_rate": 6.939786800803865e-06, + "logits/chosen": 597439829.3333334, + "logits/rejected": 495410380.8, + "logps/chosen": -322.5732421875, + "logps/rejected": -411.35078125, + "loss": 0.0094, + "rewards/chosen": 3.903458913167318, + "rewards/margins": 11.273615010579427, + "rewards/rejected": -7.370156097412109, + "step": 4098 + }, + { + "epoch": 0.3745089081772499, + "grad_norm": 1.765625, + "kl": 0.0, + "learning_rate": 6.93846153311285e-06, + "logits/chosen": 683519488.0, + "logits/rejected": 539549354.6666666, + "logps/chosen": -435.17254638671875, + "logps/rejected": -527.54443359375, + "loss": 0.0079, + "rewards/chosen": 3.914628505706787, + "rewards/margins": 12.13724915186564, + "rewards/rejected": -8.222620646158854, + "step": 4099 + }, + { + "epoch": 0.37460027409776153, + "grad_norm": 14.125, + "kl": 0.0, + "learning_rate": 6.9371361051290286e-06, + "logits/chosen": 289497792.0, + "logits/rejected": 432723072.0, + "logps/chosen": -176.78363037109375, + "logps/rejected": -408.5773518880208, + "loss": 0.0784, + "rewards/chosen": 2.9803333282470703, + "rewards/margins": 9.159802754720051, + "rewards/rejected": -6.1794694264729815, + "step": 4100 + }, + { + "epoch": 0.3746916400182732, + "grad_norm": 4.90625, + "kl": 0.0, + "learning_rate": 6.935810516961998e-06, + "logits/chosen": 615748928.0, + "logits/rejected": 1050107328.0, + "logps/chosen": -405.9329833984375, + "logps/rejected": -671.3449096679688, + "loss": 0.0338, + "rewards/chosen": 2.7345707416534424, + "rewards/margins": 13.52453351020813, + "rewards/rejected": -10.789962768554688, + "step": 4101 + }, + { + "epoch": 0.37478300593878483, + "grad_norm": 3.546875, + "kl": 0.0, + "learning_rate": 6.934484768721375e-06, + "logits/chosen": 851504947.2, + "logits/rejected": 704393216.0, + "logps/chosen": -317.731787109375, + "logps/rejected": -416.013916015625, + "loss": 0.0225, + "rewards/chosen": 3.812146759033203, + "rewards/margins": 14.345399475097656, + "rewards/rejected": -10.533252716064453, + "step": 4102 + }, + { + "epoch": 0.3748743718592965, + "grad_norm": 4.90625, + "kl": 0.0, + "learning_rate": 6.9331588605167835e-06, + "logits/chosen": 376314931.2, + "logits/rejected": 322494378.6666667, + "logps/chosen": -274.3484375, + "logps/rejected": -317.6925862630208, + "loss": 0.0442, + "rewards/chosen": 2.7981319427490234, + "rewards/margins": 10.024465560913086, + "rewards/rejected": -7.2263336181640625, + "step": 4103 + }, + { + "epoch": 0.37496573777980813, + "grad_norm": 2.15625, + "kl": 0.0, + "learning_rate": 6.931832792457865e-06, + "logits/chosen": 916329856.0, + "logits/rejected": 389315242.6666667, + "logps/chosen": -357.58160400390625, + "logps/rejected": -423.0567220052083, + "loss": 0.0077, + "rewards/chosen": 4.525672912597656, + "rewards/margins": 14.429255167643229, + "rewards/rejected": -9.903582255045572, + "step": 4104 + }, + { + "epoch": 0.3750571037003198, + "grad_norm": 1.796875, + "kl": 0.0, + "learning_rate": 6.930506564654275e-06, + "logits/chosen": 377919658.6666667, + "logits/rejected": 612225433.6, + "logps/chosen": -201.85396321614584, + "logps/rejected": -681.55, + "loss": 0.007, + "rewards/chosen": 4.47059981028239, + "rewards/margins": 15.556726519266764, + "rewards/rejected": -11.086126708984375, + "step": 4105 + }, + { + "epoch": 0.3751484696208314, + "grad_norm": 27.75, + "kl": 0.0, + "learning_rate": 6.9291801772156775e-06, + "logits/chosen": 796804480.0, + "logits/rejected": 804373376.0, + "logps/chosen": -439.9009704589844, + "logps/rejected": -667.9774169921875, + "loss": 0.1125, + "rewards/chosen": 3.7705330848693848, + "rewards/margins": 12.486050128936768, + "rewards/rejected": -8.715517044067383, + "step": 4106 + }, + { + "epoch": 0.3752398355413431, + "grad_norm": 3.546875, + "kl": 0.0, + "learning_rate": 6.927853630251753e-06, + "logits/chosen": 501700864.0, + "logits/rejected": 342293504.0, + "logps/chosen": -413.71826171875, + "logps/rejected": -545.2335611979166, + "loss": 0.0217, + "rewards/chosen": 3.6948970794677733, + "rewards/margins": 14.141184870402018, + "rewards/rejected": -10.446287790934244, + "step": 4107 + }, + { + "epoch": 0.3753312014618547, + "grad_norm": 37.0, + "kl": 0.0, + "learning_rate": 6.926526923872197e-06, + "logits/chosen": 613164352.0, + "logits/rejected": 542002176.0, + "logps/chosen": -203.39584350585938, + "logps/rejected": -528.14794921875, + "loss": 0.2053, + "rewards/chosen": 1.3377748727798462, + "rewards/margins": 8.47108805179596, + "rewards/rejected": -7.133313179016113, + "step": 4108 + }, + { + "epoch": 0.3754225673823664, + "grad_norm": 1.765625, + "kl": 0.0, + "learning_rate": 6.925200058186714e-06, + "logits/chosen": 941338880.0, + "logits/rejected": 581726412.8, + "logps/chosen": -288.8098958333333, + "logps/rejected": -573.3369140625, + "loss": 0.0094, + "rewards/chosen": 3.8878164291381836, + "rewards/margins": 14.586957359313965, + "rewards/rejected": -10.699140930175782, + "step": 4109 + }, + { + "epoch": 0.375513933302878, + "grad_norm": 1.8046875, + "kl": 0.0, + "learning_rate": 6.923873033305022e-06, + "logits/chosen": 457024704.0, + "logits/rejected": 401563221.3333333, + "logps/chosen": -257.72625732421875, + "logps/rejected": -613.0816243489584, + "loss": 0.0074, + "rewards/chosen": 3.8582496643066406, + "rewards/margins": 15.217851003011068, + "rewards/rejected": -11.359601338704428, + "step": 4110 + }, + { + "epoch": 0.3756052992233897, + "grad_norm": 3.359375, + "kl": 0.0, + "learning_rate": 6.9225458493368565e-06, + "logits/chosen": 1339560832.0, + "logits/rejected": 546767872.0, + "logps/chosen": -305.4685974121094, + "logps/rejected": -440.9712829589844, + "loss": 0.0309, + "rewards/chosen": 3.0793325901031494, + "rewards/margins": 11.993170022964478, + "rewards/rejected": -8.913837432861328, + "step": 4111 + }, + { + "epoch": 0.3756966651439013, + "grad_norm": 2.328125, + "kl": 0.0, + "learning_rate": 6.921218506391963e-06, + "logits/chosen": 638621013.3333334, + "logits/rejected": 515545984.0, + "logps/chosen": -206.9379679361979, + "logps/rejected": -470.94366455078125, + "loss": 0.0196, + "rewards/chosen": 3.869426409403483, + "rewards/margins": 10.355440298716227, + "rewards/rejected": -6.486013889312744, + "step": 4112 + }, + { + "epoch": 0.375788031064413, + "grad_norm": 2.5625, + "kl": 0.0, + "learning_rate": 6.919891004580097e-06, + "logits/chosen": 441098137.6, + "logits/rejected": 451245653.3333333, + "logps/chosen": -271.1743408203125, + "logps/rejected": -410.2122802734375, + "loss": 0.0199, + "rewards/chosen": 3.6443035125732424, + "rewards/margins": 11.464872868855794, + "rewards/rejected": -7.820569356282552, + "step": 4113 + }, + { + "epoch": 0.3758793969849246, + "grad_norm": 3.515625, + "kl": 0.0, + "learning_rate": 6.9185633440110344e-06, + "logits/chosen": 424398182.4, + "logits/rejected": 506987434.6666667, + "logps/chosen": -302.582080078125, + "logps/rejected": -520.4322102864584, + "loss": 0.0216, + "rewards/chosen": 3.7040145874023436, + "rewards/margins": 11.500522104899089, + "rewards/rejected": -7.796507517496745, + "step": 4114 + }, + { + "epoch": 0.3759707629054363, + "grad_norm": 4.125, + "kl": 0.0, + "learning_rate": 6.9172355247945586e-06, + "logits/chosen": 678820181.3333334, + "logits/rejected": 300726681.6, + "logps/chosen": -516.1901448567709, + "logps/rejected": -381.575927734375, + "loss": 0.0249, + "rewards/chosen": 2.7997140884399414, + "rewards/margins": 12.357830238342284, + "rewards/rejected": -9.558116149902343, + "step": 4115 + }, + { + "epoch": 0.3760621288259479, + "grad_norm": 3.03125, + "kl": 0.0, + "learning_rate": 6.915907547040469e-06, + "logits/chosen": 477009817.6, + "logits/rejected": 518629717.3333333, + "logps/chosen": -370.53486328125, + "logps/rejected": -614.401611328125, + "loss": 0.0197, + "rewards/chosen": 3.630980682373047, + "rewards/margins": 12.868343861897788, + "rewards/rejected": -9.23736317952474, + "step": 4116 + }, + { + "epoch": 0.3761534947464596, + "grad_norm": 2.65625, + "kl": 0.0, + "learning_rate": 6.9145794108585775e-06, + "logits/chosen": 1301461632.0, + "logits/rejected": 668311232.0, + "logps/chosen": -511.7623596191406, + "logps/rejected": -376.0538330078125, + "loss": 0.0163, + "rewards/chosen": 3.562208652496338, + "rewards/margins": 12.394535541534424, + "rewards/rejected": -8.832326889038086, + "step": 4117 + }, + { + "epoch": 0.3762448606669712, + "grad_norm": 4.5, + "kl": 0.0, + "learning_rate": 6.913251116358705e-06, + "logits/chosen": 883580854.8571428, + "logits/rejected": 479011616.0, + "logps/chosen": -349.81766183035717, + "logps/rejected": -318.0325622558594, + "loss": 0.0288, + "rewards/chosen": 4.052298409598214, + "rewards/margins": 14.431354386465891, + "rewards/rejected": -10.379055976867676, + "step": 4118 + }, + { + "epoch": 0.3763362265874829, + "grad_norm": 6.875, + "kl": 0.0, + "learning_rate": 6.9119226636506945e-06, + "logits/chosen": 633127782.4, + "logits/rejected": 514140928.0, + "logps/chosen": -333.57001953125, + "logps/rejected": -501.6600341796875, + "loss": 0.0375, + "rewards/chosen": 3.259899139404297, + "rewards/margins": 13.803922017415365, + "rewards/rejected": -10.544022878011068, + "step": 4119 + }, + { + "epoch": 0.3764275925079945, + "grad_norm": 3.65625, + "kl": 0.0, + "learning_rate": 6.9105940528443915e-06, + "logits/chosen": 805667968.0, + "logits/rejected": 384585386.6666667, + "logps/chosen": -305.7948303222656, + "logps/rejected": -332.33807373046875, + "loss": 0.0182, + "rewards/chosen": 2.5950615406036377, + "rewards/margins": 11.447244564692179, + "rewards/rejected": -8.852183024088541, + "step": 4120 + }, + { + "epoch": 0.3765189584285062, + "grad_norm": 6.71875, + "kl": 0.0, + "learning_rate": 6.909265284049664e-06, + "logits/chosen": 495876352.0, + "logits/rejected": 535194624.0, + "logps/chosen": -409.944970703125, + "logps/rejected": -460.8319498697917, + "loss": 0.037, + "rewards/chosen": 3.565782928466797, + "rewards/margins": 14.023713175455729, + "rewards/rejected": -10.457930246988932, + "step": 4121 + }, + { + "epoch": 0.3766103243490178, + "grad_norm": 3.828125, + "kl": 0.0, + "learning_rate": 6.907936357376387e-06, + "logits/chosen": 572176042.6666666, + "logits/rejected": 440498304.0, + "logps/chosen": -422.5417073567708, + "logps/rejected": -393.5290222167969, + "loss": 0.0261, + "rewards/chosen": 3.766895294189453, + "rewards/margins": 12.08491325378418, + "rewards/rejected": -8.318017959594727, + "step": 4122 + }, + { + "epoch": 0.3767016902695295, + "grad_norm": 3.171875, + "kl": 0.0, + "learning_rate": 6.906607272934451e-06, + "logits/chosen": 934419029.3333334, + "logits/rejected": 420534988.8, + "logps/chosen": -428.257080078125, + "logps/rejected": -372.8248291015625, + "loss": 0.0125, + "rewards/chosen": 3.7651440302530923, + "rewards/margins": 12.316143480936686, + "rewards/rejected": -8.550999450683594, + "step": 4123 + }, + { + "epoch": 0.3767930561900411, + "grad_norm": 12.3125, + "kl": 0.0, + "learning_rate": 6.905278030833759e-06, + "logits/chosen": 1395723904.0, + "logits/rejected": 710290560.0, + "logps/chosen": -223.62033081054688, + "logps/rejected": -598.120361328125, + "loss": 0.0719, + "rewards/chosen": 3.0450804233551025, + "rewards/margins": 12.55565857887268, + "rewards/rejected": -9.510578155517578, + "step": 4124 + }, + { + "epoch": 0.3768844221105528, + "grad_norm": 3.40625, + "kl": 0.0, + "learning_rate": 6.903948631184226e-06, + "logits/chosen": 815923882.6666666, + "logits/rejected": 1338433536.0, + "logps/chosen": -411.8690592447917, + "logps/rejected": -695.877294921875, + "loss": 0.0149, + "rewards/chosen": 3.4828904469807944, + "rewards/margins": 13.897956212361654, + "rewards/rejected": -10.41506576538086, + "step": 4125 + }, + { + "epoch": 0.3769757880310644, + "grad_norm": 1.609375, + "kl": 0.0, + "learning_rate": 6.902619074095782e-06, + "logits/chosen": 787453248.0, + "logits/rejected": 934290432.0, + "logps/chosen": -338.31451416015625, + "logps/rejected": -423.7711486816406, + "loss": 0.0095, + "rewards/chosen": 4.354532241821289, + "rewards/margins": 14.387290954589844, + "rewards/rejected": -10.032758712768555, + "step": 4126 + }, + { + "epoch": 0.3770671539515761, + "grad_norm": 6.1875, + "kl": 0.0, + "learning_rate": 6.9012893596783696e-06, + "logits/chosen": 545848106.6666666, + "logits/rejected": 561546956.8, + "logps/chosen": -377.8850911458333, + "logps/rejected": -509.49560546875, + "loss": 0.0302, + "rewards/chosen": 2.9505138397216797, + "rewards/margins": 12.871369552612304, + "rewards/rejected": -9.920855712890624, + "step": 4127 + }, + { + "epoch": 0.3771585198720877, + "grad_norm": 3.25, + "kl": 0.0, + "learning_rate": 6.89995948804194e-06, + "logits/chosen": 611488614.4, + "logits/rejected": 916143616.0, + "logps/chosen": -412.916943359375, + "logps/rejected": -503.5340576171875, + "loss": 0.0249, + "rewards/chosen": 3.7378673553466797, + "rewards/margins": 11.561555862426758, + "rewards/rejected": -7.823688507080078, + "step": 4128 + }, + { + "epoch": 0.3772498857925994, + "grad_norm": 2.109375, + "kl": 0.0, + "learning_rate": 6.898629459296466e-06, + "logits/chosen": 979163562.6666666, + "logits/rejected": 363676544.0, + "logps/chosen": -628.1089274088541, + "logps/rejected": -334.7962890625, + "loss": 0.0101, + "rewards/chosen": 3.705885887145996, + "rewards/margins": 11.79697322845459, + "rewards/rejected": -8.091087341308594, + "step": 4129 + }, + { + "epoch": 0.377341251713111, + "grad_norm": 7.09375, + "kl": 0.0, + "learning_rate": 6.897299273551929e-06, + "logits/chosen": 466750208.0, + "logits/rejected": 475206656.0, + "logps/chosen": -331.6283203125, + "logps/rejected": -556.1739095052084, + "loss": 0.1307, + "rewards/chosen": 2.2360111236572267, + "rewards/margins": 13.692536290486654, + "rewards/rejected": -11.456525166829428, + "step": 4130 + }, + { + "epoch": 0.3774326176336227, + "grad_norm": 2.78125, + "kl": 0.0, + "learning_rate": 6.895968930918318e-06, + "logits/chosen": 527424938.6666667, + "logits/rejected": 1034423360.0, + "logps/chosen": -487.510986328125, + "logps/rejected": -424.66412353515625, + "loss": 0.0161, + "rewards/chosen": 4.308011372884114, + "rewards/margins": 14.524705251057942, + "rewards/rejected": -10.216693878173828, + "step": 4131 + }, + { + "epoch": 0.3775239835541343, + "grad_norm": 2.859375, + "kl": 0.0, + "learning_rate": 6.894638431505645e-06, + "logits/chosen": 636120627.2, + "logits/rejected": 1044624469.3333334, + "logps/chosen": -216.0426025390625, + "logps/rejected": -561.343505859375, + "loss": 0.0292, + "rewards/chosen": 3.0835981369018555, + "rewards/margins": 11.719515800476074, + "rewards/rejected": -8.635917663574219, + "step": 4132 + }, + { + "epoch": 0.377615349474646, + "grad_norm": 5.15625, + "kl": 0.0, + "learning_rate": 6.893307775423925e-06, + "logits/chosen": 662320179.2, + "logits/rejected": 686080256.0, + "logps/chosen": -323.5839599609375, + "logps/rejected": -472.01025390625, + "loss": 0.029, + "rewards/chosen": 3.6397415161132813, + "rewards/margins": 13.064811325073242, + "rewards/rejected": -9.425069808959961, + "step": 4133 + }, + { + "epoch": 0.3777067153951576, + "grad_norm": 3.15625, + "kl": 0.0, + "learning_rate": 6.891976962783197e-06, + "logits/chosen": 565147306.6666666, + "logits/rejected": 535901216.0, + "logps/chosen": -233.82489013671875, + "logps/rejected": -529.9815673828125, + "loss": 0.0502, + "rewards/chosen": 3.349313735961914, + "rewards/margins": 11.28892993927002, + "rewards/rejected": -7.9396162033081055, + "step": 4134 + }, + { + "epoch": 0.3777980813156693, + "grad_norm": 3.578125, + "kl": 0.0, + "learning_rate": 6.8906459936935015e-06, + "logits/chosen": 743061196.8, + "logits/rejected": 467614549.3333333, + "logps/chosen": -379.7659912109375, + "logps/rejected": -422.7838134765625, + "loss": 0.0225, + "rewards/chosen": 3.5865432739257814, + "rewards/margins": 14.641054026285808, + "rewards/rejected": -11.054510752360025, + "step": 4135 + }, + { + "epoch": 0.3778894472361809, + "grad_norm": 6.21875, + "kl": 0.0, + "learning_rate": 6.889314868264901e-06, + "logits/chosen": 569528960.0, + "logits/rejected": 344340684.8, + "logps/chosen": -350.1769205729167, + "logps/rejected": -308.688525390625, + "loss": 0.0388, + "rewards/chosen": 2.399717330932617, + "rewards/margins": 9.454018783569335, + "rewards/rejected": -7.054301452636719, + "step": 4136 + }, + { + "epoch": 0.3779808131566926, + "grad_norm": 1.3125, + "kl": 0.0, + "learning_rate": 6.887983586607464e-06, + "logits/chosen": 377768874.6666667, + "logits/rejected": 909971968.0, + "logps/chosen": -252.94620768229166, + "logps/rejected": -679.10087890625, + "loss": 0.0068, + "rewards/chosen": 4.282371520996094, + "rewards/margins": 12.63311309814453, + "rewards/rejected": -8.350741577148437, + "step": 4137 + }, + { + "epoch": 0.3780721790772042, + "grad_norm": 1.9296875, + "kl": 0.0, + "learning_rate": 6.88665214883128e-06, + "logits/chosen": 439895210.6666667, + "logits/rejected": 577979289.6, + "logps/chosen": -425.9815673828125, + "logps/rejected": -615.2091796875, + "loss": 0.0106, + "rewards/chosen": 3.6738100051879883, + "rewards/margins": 12.70940113067627, + "rewards/rejected": -9.035591125488281, + "step": 4138 + }, + { + "epoch": 0.3781635449977159, + "grad_norm": 5.25, + "kl": 0.0, + "learning_rate": 6.885320555046441e-06, + "logits/chosen": 532471808.0, + "logits/rejected": 348003430.4, + "logps/chosen": -270.0140380859375, + "logps/rejected": -431.253125, + "loss": 0.0283, + "rewards/chosen": 2.551504135131836, + "rewards/margins": 12.270670700073243, + "rewards/rejected": -9.719166564941407, + "step": 4139 + }, + { + "epoch": 0.3782549109182275, + "grad_norm": 1.0390625, + "kl": 0.0, + "learning_rate": 6.883988805363061e-06, + "logits/chosen": 467966336.0, + "logits/rejected": 441897941.3333333, + "logps/chosen": -380.3897705078125, + "logps/rejected": -583.8907063802084, + "loss": 0.0039, + "rewards/chosen": 4.44789981842041, + "rewards/margins": 14.436025937398275, + "rewards/rejected": -9.988126118977865, + "step": 4140 + }, + { + "epoch": 0.37834627683873917, + "grad_norm": 4.25, + "kl": 0.0, + "learning_rate": 6.882656899891261e-06, + "logits/chosen": 467336000.0, + "logits/rejected": 415267072.0, + "logps/chosen": -290.9722595214844, + "logps/rejected": -451.07147216796875, + "loss": 0.0287, + "rewards/chosen": 3.0207839012145996, + "rewards/margins": 10.96436071395874, + "rewards/rejected": -7.943576812744141, + "step": 4141 + }, + { + "epoch": 0.3784376427592508, + "grad_norm": 2.4375, + "kl": 0.0, + "learning_rate": 6.881324838741181e-06, + "logits/chosen": 379732650.6666667, + "logits/rejected": 448974489.6, + "logps/chosen": -247.05460611979166, + "logps/rejected": -627.74404296875, + "loss": 0.0149, + "rewards/chosen": 3.404596964518229, + "rewards/margins": 12.885991923014322, + "rewards/rejected": -9.481394958496093, + "step": 4142 + }, + { + "epoch": 0.37852900867976247, + "grad_norm": 3.109375, + "kl": 0.0, + "learning_rate": 6.8799926220229655e-06, + "logits/chosen": 487593280.0, + "logits/rejected": 388194048.0, + "logps/chosen": -335.9537353515625, + "logps/rejected": -535.3037109375, + "loss": 0.0223, + "rewards/chosen": 3.126638889312744, + "rewards/margins": 12.425195217132568, + "rewards/rejected": -9.298556327819824, + "step": 4143 + }, + { + "epoch": 0.3786203746002741, + "grad_norm": 2.75, + "kl": 0.0, + "learning_rate": 6.8786602498467805e-06, + "logits/chosen": 571602432.0, + "logits/rejected": 634413440.0, + "logps/chosen": -225.463134765625, + "logps/rejected": -490.60699462890625, + "loss": 0.0201, + "rewards/chosen": 3.271883964538574, + "rewards/margins": 12.454633712768555, + "rewards/rejected": -9.18274974822998, + "step": 4144 + }, + { + "epoch": 0.37871174052078577, + "grad_norm": 3.046875, + "kl": 0.0, + "learning_rate": 6.877327722322797e-06, + "logits/chosen": 834511744.0, + "logits/rejected": 637465600.0, + "logps/chosen": -365.573486328125, + "logps/rejected": -308.3180236816406, + "loss": 0.0186, + "rewards/chosen": 3.582882881164551, + "rewards/margins": 10.553184509277344, + "rewards/rejected": -6.970301628112793, + "step": 4145 + }, + { + "epoch": 0.3788031064412974, + "grad_norm": 2.0, + "kl": 0.0, + "learning_rate": 6.875995039561206e-06, + "logits/chosen": 251892522.66666666, + "logits/rejected": 378351539.2, + "logps/chosen": -481.2478841145833, + "logps/rejected": -380.8396240234375, + "loss": 0.0094, + "rewards/chosen": 3.9427013397216797, + "rewards/margins": 11.269099807739257, + "rewards/rejected": -7.326398468017578, + "step": 4146 + }, + { + "epoch": 0.37889447236180906, + "grad_norm": 4.5625, + "kl": 0.0, + "learning_rate": 6.874662201672206e-06, + "logits/chosen": 333730560.0, + "logits/rejected": 705514304.0, + "logps/chosen": -184.67111206054688, + "logps/rejected": -369.3436279296875, + "loss": 0.1348, + "rewards/chosen": 1.9720042943954468, + "rewards/margins": 9.13822877407074, + "rewards/rejected": -7.166224479675293, + "step": 4147 + }, + { + "epoch": 0.3789858382823207, + "grad_norm": 2.96875, + "kl": 0.0, + "learning_rate": 6.873329208766012e-06, + "logits/chosen": 632524544.0, + "logits/rejected": 254756864.0, + "logps/chosen": -206.85910034179688, + "logps/rejected": -494.1273193359375, + "loss": 0.0197, + "rewards/chosen": 3.778186798095703, + "rewards/margins": 17.454060554504395, + "rewards/rejected": -13.675873756408691, + "step": 4148 + }, + { + "epoch": 0.37907720420283236, + "grad_norm": 5.84375, + "kl": 0.0, + "learning_rate": 6.8719960609528495e-06, + "logits/chosen": 354804736.0, + "logits/rejected": 550803520.0, + "logps/chosen": -472.53076171875, + "logps/rejected": -770.9581909179688, + "loss": 0.0139, + "rewards/chosen": 4.34299898147583, + "rewards/margins": 14.601199626922607, + "rewards/rejected": -10.258200645446777, + "step": 4149 + }, + { + "epoch": 0.379168570123344, + "grad_norm": 3.0625, + "kl": 0.0, + "learning_rate": 6.870662758342955e-06, + "logits/chosen": 564279872.0, + "logits/rejected": 559809984.0, + "logps/chosen": -299.3729248046875, + "logps/rejected": -376.8933410644531, + "loss": 0.0211, + "rewards/chosen": 3.75536847114563, + "rewards/margins": 11.792470216751099, + "rewards/rejected": -8.037101745605469, + "step": 4150 + }, + { + "epoch": 0.37925993604385566, + "grad_norm": 1.59375, + "kl": 0.0, + "learning_rate": 6.869329301046584e-06, + "logits/chosen": 651967616.0, + "logits/rejected": 757246025.1428572, + "logps/chosen": -360.1997375488281, + "logps/rejected": -497.1073521205357, + "loss": 0.0069, + "rewards/chosen": 2.969735860824585, + "rewards/margins": 12.608150516237531, + "rewards/rejected": -9.638414655412946, + "step": 4151 + }, + { + "epoch": 0.3793513019643673, + "grad_norm": 3.15625, + "kl": 0.0, + "learning_rate": 6.867995689173999e-06, + "logits/chosen": 565393883.4285715, + "logits/rejected": 524158784.0, + "logps/chosen": -410.66859654017856, + "logps/rejected": -563.2807006835938, + "loss": 0.0248, + "rewards/chosen": 3.9173355102539062, + "rewards/margins": 12.179142951965332, + "rewards/rejected": -8.261807441711426, + "step": 4152 + }, + { + "epoch": 0.37944266788487896, + "grad_norm": 2.6875, + "kl": 0.0, + "learning_rate": 6.866661922835481e-06, + "logits/chosen": 798000213.3333334, + "logits/rejected": 631577292.8, + "logps/chosen": -314.9595540364583, + "logps/rejected": -716.444384765625, + "loss": 0.011, + "rewards/chosen": 3.913974126180013, + "rewards/margins": 16.165438969930012, + "rewards/rejected": -12.25146484375, + "step": 4153 + }, + { + "epoch": 0.3795340338053906, + "grad_norm": 3.71875, + "kl": 0.0, + "learning_rate": 6.865328002141311e-06, + "logits/chosen": 537014125.7142857, + "logits/rejected": 581358848.0, + "logps/chosen": -371.75425502232144, + "logps/rejected": -282.9253234863281, + "loss": 0.0248, + "rewards/chosen": 3.761410576956613, + "rewards/margins": 12.820590836661204, + "rewards/rejected": -9.05918025970459, + "step": 4154 + }, + { + "epoch": 0.37962539972590226, + "grad_norm": 5.125, + "kl": 0.0, + "learning_rate": 6.863993927201803e-06, + "logits/chosen": 495882956.8, + "logits/rejected": 705658069.3333334, + "logps/chosen": -351.990380859375, + "logps/rejected": -732.434814453125, + "loss": 0.0485, + "rewards/chosen": 2.684834671020508, + "rewards/margins": 13.442775599161784, + "rewards/rejected": -10.757940928141275, + "step": 4155 + }, + { + "epoch": 0.3797167656464139, + "grad_norm": 2.71875, + "kl": 0.0, + "learning_rate": 6.8626596981272655e-06, + "logits/chosen": 615594547.2, + "logits/rejected": 666484138.6666666, + "logps/chosen": -395.317138671875, + "logps/rejected": -758.604248046875, + "loss": 0.0156, + "rewards/chosen": 3.7460540771484374, + "rewards/margins": 12.933646901448569, + "rewards/rejected": -9.18759282430013, + "step": 4156 + }, + { + "epoch": 0.37980813156692556, + "grad_norm": 2.921875, + "kl": 0.0, + "learning_rate": 6.861325315028029e-06, + "logits/chosen": 486529177.6, + "logits/rejected": 268687360.0, + "logps/chosen": -378.9869140625, + "logps/rejected": -347.0970865885417, + "loss": 0.0172, + "rewards/chosen": 4.001793670654297, + "rewards/margins": 14.254044342041016, + "rewards/rejected": -10.252250671386719, + "step": 4157 + }, + { + "epoch": 0.3798994974874372, + "grad_norm": 3.125, + "kl": 0.0, + "learning_rate": 6.859990778014434e-06, + "logits/chosen": 748479744.0, + "logits/rejected": 464464960.0, + "logps/chosen": -339.1172281901042, + "logps/rejected": -371.5670166015625, + "loss": 0.1411, + "rewards/chosen": 2.6463491121927896, + "rewards/margins": 11.653696219126383, + "rewards/rejected": -9.007347106933594, + "step": 4158 + }, + { + "epoch": 0.37999086340794885, + "grad_norm": 1.7109375, + "kl": 0.0, + "learning_rate": 6.858656087196836e-06, + "logits/chosen": 482840384.0, + "logits/rejected": 446995029.3333333, + "logps/chosen": -352.5911560058594, + "logps/rejected": -491.9144694010417, + "loss": 0.0085, + "rewards/chosen": 3.3715057373046875, + "rewards/margins": 14.409585316975912, + "rewards/rejected": -11.038079579671225, + "step": 4159 + }, + { + "epoch": 0.3800822293284605, + "grad_norm": 2.984375, + "kl": 0.0, + "learning_rate": 6.857321242685599e-06, + "logits/chosen": 494094165.3333333, + "logits/rejected": 432640864.0, + "logps/chosen": -303.31492106119794, + "logps/rejected": -415.4508972167969, + "loss": 0.0174, + "rewards/chosen": 4.553000450134277, + "rewards/margins": 14.048745155334473, + "rewards/rejected": -9.495744705200195, + "step": 4160 + }, + { + "epoch": 0.38017359524897215, + "grad_norm": 5.0625, + "kl": 0.0, + "learning_rate": 6.855986244591104e-06, + "logits/chosen": 468531404.8, + "logits/rejected": 329569664.0, + "logps/chosen": -255.4302734375, + "logps/rejected": -321.88531494140625, + "loss": 0.033, + "rewards/chosen": 3.298238754272461, + "rewards/margins": 12.080934524536133, + "rewards/rejected": -8.782695770263672, + "step": 4161 + }, + { + "epoch": 0.3802649611694838, + "grad_norm": 6.28125, + "kl": 0.0, + "learning_rate": 6.854651093023744e-06, + "logits/chosen": 567295616.0, + "logits/rejected": 536738048.0, + "logps/chosen": -311.4078063964844, + "logps/rejected": -424.6973876953125, + "loss": 0.0248, + "rewards/chosen": 3.4596991539001465, + "rewards/margins": 13.313353061676025, + "rewards/rejected": -9.853653907775879, + "step": 4162 + }, + { + "epoch": 0.38035632708999545, + "grad_norm": 71.5, + "kl": 0.0, + "learning_rate": 6.853315788093921e-06, + "logits/chosen": 392235520.0, + "logits/rejected": 532538880.0, + "logps/chosen": -395.296142578125, + "logps/rejected": -665.4866943359375, + "loss": 0.0879, + "rewards/chosen": 3.2996156215667725, + "rewards/margins": 13.061063528060913, + "rewards/rejected": -9.76144790649414, + "step": 4163 + }, + { + "epoch": 0.38044769301050707, + "grad_norm": 4.15625, + "kl": 0.0, + "learning_rate": 6.8519803299120545e-06, + "logits/chosen": 584505642.6666666, + "logits/rejected": 369726310.4, + "logps/chosen": -359.2342529296875, + "logps/rejected": -286.623876953125, + "loss": 0.0142, + "rewards/chosen": 3.7583630879720054, + "rewards/margins": 10.450001271565755, + "rewards/rejected": -6.69163818359375, + "step": 4164 + }, + { + "epoch": 0.38053905893101875, + "grad_norm": 7.25, + "kl": 0.0, + "learning_rate": 6.8506447185885735e-06, + "logits/chosen": 714259968.0, + "logits/rejected": 494800554.6666667, + "logps/chosen": -350.2574951171875, + "logps/rejected": -529.1389973958334, + "loss": 0.0276, + "rewards/chosen": 3.8076705932617188, + "rewards/margins": 12.557096481323242, + "rewards/rejected": -8.749425888061523, + "step": 4165 + }, + { + "epoch": 0.38063042485153037, + "grad_norm": 17.0, + "kl": 0.0, + "learning_rate": 6.84930895423392e-06, + "logits/chosen": 541456320.0, + "logits/rejected": 380960064.0, + "logps/chosen": -287.2293395996094, + "logps/rejected": -527.60107421875, + "loss": 0.1213, + "rewards/chosen": 3.4360642433166504, + "rewards/margins": 10.30016565322876, + "rewards/rejected": -6.864101409912109, + "step": 4166 + }, + { + "epoch": 0.38072179077204205, + "grad_norm": 31.5, + "kl": 0.0, + "learning_rate": 6.847973036958552e-06, + "logits/chosen": 1395377834.6666667, + "logits/rejected": 703043993.6, + "logps/chosen": -292.9520670572917, + "logps/rejected": -555.48076171875, + "loss": 0.1135, + "rewards/chosen": 1.5088577270507812, + "rewards/margins": 11.454345703125, + "rewards/rejected": -9.945487976074219, + "step": 4167 + }, + { + "epoch": 0.38081315669255367, + "grad_norm": 2.359375, + "kl": 0.0, + "learning_rate": 6.846636966872935e-06, + "logits/chosen": 775297945.6, + "logits/rejected": 714431914.6666666, + "logps/chosen": -386.221337890625, + "logps/rejected": -307.3260091145833, + "loss": 0.016, + "rewards/chosen": 4.4924358367919925, + "rewards/margins": 13.314121627807618, + "rewards/rejected": -8.821685791015625, + "step": 4168 + }, + { + "epoch": 0.38090452261306534, + "grad_norm": 2.0625, + "kl": 0.0, + "learning_rate": 6.845300744087548e-06, + "logits/chosen": 570088448.0, + "logits/rejected": 640096960.0, + "logps/chosen": -386.8800354003906, + "logps/rejected": -636.073486328125, + "loss": 0.0126, + "rewards/chosen": 3.7751970291137695, + "rewards/margins": 13.485871315002441, + "rewards/rejected": -9.710674285888672, + "step": 4169 + }, + { + "epoch": 0.38099588853357697, + "grad_norm": 2.53125, + "kl": 0.0, + "learning_rate": 6.8439643687128896e-06, + "logits/chosen": 1131380096.0, + "logits/rejected": 513724224.0, + "logps/chosen": -382.22503662109375, + "logps/rejected": -465.0853576660156, + "loss": 0.0153, + "rewards/chosen": 3.792130470275879, + "rewards/margins": 13.046196937561035, + "rewards/rejected": -9.254066467285156, + "step": 4170 + }, + { + "epoch": 0.38108725445408864, + "grad_norm": 5.0625, + "kl": 0.0, + "learning_rate": 6.842627840859461e-06, + "logits/chosen": 564839731.2, + "logits/rejected": 441586005.3333333, + "logps/chosen": -404.72451171875, + "logps/rejected": -531.4991048177084, + "loss": 0.0395, + "rewards/chosen": 2.8573408126831055, + "rewards/margins": 10.097980817159016, + "rewards/rejected": -7.240640004475911, + "step": 4171 + }, + { + "epoch": 0.38117862037460026, + "grad_norm": 2.375, + "kl": 0.0, + "learning_rate": 6.841291160637783e-06, + "logits/chosen": 509330944.0, + "logits/rejected": 536045792.0, + "logps/chosen": -420.04742431640625, + "logps/rejected": -448.17926025390625, + "loss": 0.0114, + "rewards/chosen": 4.300558090209961, + "rewards/margins": 12.676163673400879, + "rewards/rejected": -8.375605583190918, + "step": 4172 + }, + { + "epoch": 0.38126998629511194, + "grad_norm": 3.328125, + "kl": 0.0, + "learning_rate": 6.839954328158385e-06, + "logits/chosen": 712701952.0, + "logits/rejected": 689227776.0, + "logps/chosen": -285.36376953125, + "logps/rejected": -419.5334065755208, + "loss": 0.021, + "rewards/chosen": 3.6936878204345702, + "rewards/margins": 11.634294891357422, + "rewards/rejected": -7.940607070922852, + "step": 4173 + }, + { + "epoch": 0.38136135221562356, + "grad_norm": 29.375, + "kl": 0.0, + "learning_rate": 6.838617343531813e-06, + "logits/chosen": 585070165.3333334, + "logits/rejected": 543443072.0, + "logps/chosen": -344.0928548177083, + "logps/rejected": -602.1136474609375, + "loss": 0.1066, + "rewards/chosen": 3.0984977086385093, + "rewards/margins": 10.780347188313803, + "rewards/rejected": -7.681849479675293, + "step": 4174 + }, + { + "epoch": 0.38145271813613524, + "grad_norm": 5.71875, + "kl": 0.0, + "learning_rate": 6.837280206868621e-06, + "logits/chosen": 503561173.3333333, + "logits/rejected": 544934041.6, + "logps/chosen": -418.4933268229167, + "logps/rejected": -559.7529296875, + "loss": 0.0175, + "rewards/chosen": 3.471408208211263, + "rewards/margins": 11.16514752705892, + "rewards/rejected": -7.693739318847657, + "step": 4175 + }, + { + "epoch": 0.38154408405664686, + "grad_norm": 1.78125, + "kl": 0.0, + "learning_rate": 6.835942918279379e-06, + "logits/chosen": 455982720.0, + "logits/rejected": 719159040.0, + "logps/chosen": -231.90237426757812, + "logps/rejected": -376.51214599609375, + "loss": 0.0132, + "rewards/chosen": 3.8605217933654785, + "rewards/margins": 13.224274158477783, + "rewards/rejected": -9.363752365112305, + "step": 4176 + }, + { + "epoch": 0.38163544997715854, + "grad_norm": 24.875, + "kl": 0.0, + "learning_rate": 6.8346054778746674e-06, + "logits/chosen": 345754777.6, + "logits/rejected": 601568213.3333334, + "logps/chosen": -220.6043212890625, + "logps/rejected": -385.60595703125, + "loss": 0.1327, + "rewards/chosen": 2.2363462448120117, + "rewards/margins": 11.7357816696167, + "rewards/rejected": -9.499435424804688, + "step": 4177 + }, + { + "epoch": 0.38172681589767016, + "grad_norm": 2.546875, + "kl": 0.0, + "learning_rate": 6.833267885765081e-06, + "logits/chosen": 604273237.3333334, + "logits/rejected": 390295680.0, + "logps/chosen": -441.3340657552083, + "logps/rejected": -359.31103515625, + "loss": 0.0139, + "rewards/chosen": 3.3905344009399414, + "rewards/margins": 11.169993019104004, + "rewards/rejected": -7.779458618164062, + "step": 4178 + }, + { + "epoch": 0.38181818181818183, + "grad_norm": 1.265625, + "kl": 0.0, + "learning_rate": 6.831930142061227e-06, + "logits/chosen": 387541376.0, + "logits/rejected": 326667840.0, + "logps/chosen": -400.863037109375, + "logps/rejected": -422.3148498535156, + "loss": 0.0067, + "rewards/chosen": 4.48941707611084, + "rewards/margins": 14.526373863220215, + "rewards/rejected": -10.036956787109375, + "step": 4179 + }, + { + "epoch": 0.38190954773869346, + "grad_norm": 1.765625, + "kl": 0.0, + "learning_rate": 6.830592246873724e-06, + "logits/chosen": 611692288.0, + "logits/rejected": 447087456.0, + "logps/chosen": -265.245849609375, + "logps/rejected": -437.70086669921875, + "loss": 0.0124, + "rewards/chosen": 4.010968208312988, + "rewards/margins": 12.687853813171387, + "rewards/rejected": -8.676885604858398, + "step": 4180 + }, + { + "epoch": 0.38200091365920513, + "grad_norm": 3.015625, + "kl": 0.0, + "learning_rate": 6.829254200313201e-06, + "logits/chosen": 467270848.0, + "logits/rejected": 564968576.0, + "logps/chosen": -236.01889038085938, + "logps/rejected": -404.31842041015625, + "loss": 0.019, + "rewards/chosen": 3.6532578468322754, + "rewards/margins": 12.144667148590088, + "rewards/rejected": -8.491409301757812, + "step": 4181 + }, + { + "epoch": 0.38209227957971675, + "grad_norm": 4.0625, + "kl": 0.0, + "learning_rate": 6.827916002490305e-06, + "logits/chosen": 442120640.0, + "logits/rejected": 503358336.0, + "logps/chosen": -329.60516357421875, + "logps/rejected": -391.0672912597656, + "loss": 0.0225, + "rewards/chosen": 3.453819513320923, + "rewards/margins": 11.246382474899292, + "rewards/rejected": -7.792562961578369, + "step": 4182 + }, + { + "epoch": 0.38218364550022843, + "grad_norm": 4.0625, + "kl": 0.0, + "learning_rate": 6.8265776535156924e-06, + "logits/chosen": 595942080.0, + "logits/rejected": 957339968.0, + "logps/chosen": -285.4519958496094, + "logps/rejected": -214.58724975585938, + "loss": 0.025, + "rewards/chosen": 3.338897466659546, + "rewards/margins": 9.289569616317749, + "rewards/rejected": -5.950672149658203, + "step": 4183 + }, + { + "epoch": 0.38227501142074005, + "grad_norm": 0.96875, + "kl": 0.0, + "learning_rate": 6.825239153500029e-06, + "logits/chosen": 965059328.0, + "logits/rejected": 556432896.0, + "logps/chosen": -246.00950622558594, + "logps/rejected": -298.7962646484375, + "loss": 0.0058, + "rewards/chosen": 4.599185943603516, + "rewards/margins": 13.679645538330078, + "rewards/rejected": -9.080459594726562, + "step": 4184 + }, + { + "epoch": 0.38236637734125173, + "grad_norm": 5.5, + "kl": 0.0, + "learning_rate": 6.823900502554001e-06, + "logits/chosen": 549065728.0, + "logits/rejected": 548719360.0, + "logps/chosen": -259.3340657552083, + "logps/rejected": -511.3990478515625, + "loss": 0.0424, + "rewards/chosen": 3.942898750305176, + "rewards/margins": 11.634726524353027, + "rewards/rejected": -7.691827774047852, + "step": 4185 + }, + { + "epoch": 0.38245774326176335, + "grad_norm": 5.6875, + "kl": 0.0, + "learning_rate": 6.8225617007883e-06, + "logits/chosen": 462827946.6666667, + "logits/rejected": 438365568.0, + "logps/chosen": -476.6581217447917, + "logps/rejected": -653.7290649414062, + "loss": 0.0335, + "rewards/chosen": 3.2489026387532554, + "rewards/margins": 14.34978993733724, + "rewards/rejected": -11.100887298583984, + "step": 4186 + }, + { + "epoch": 0.382549109182275, + "grad_norm": 3.8125, + "kl": 0.0, + "learning_rate": 6.821222748313632e-06, + "logits/chosen": 687592550.4, + "logits/rejected": 617524010.6666666, + "logps/chosen": -405.0614013671875, + "logps/rejected": -570.1925455729166, + "loss": 0.028, + "rewards/chosen": 3.1103858947753906, + "rewards/margins": 13.053768157958984, + "rewards/rejected": -9.943382263183594, + "step": 4187 + }, + { + "epoch": 0.38264047510278665, + "grad_norm": 2.5625, + "kl": 0.0, + "learning_rate": 6.819883645240717e-06, + "logits/chosen": 655596202.6666666, + "logits/rejected": 448670464.0, + "logps/chosen": -407.5154215494792, + "logps/rejected": -518.08046875, + "loss": 0.0135, + "rewards/chosen": 3.6945457458496094, + "rewards/margins": 13.16250991821289, + "rewards/rejected": -9.467964172363281, + "step": 4188 + }, + { + "epoch": 0.3827318410232983, + "grad_norm": 2.796875, + "kl": 0.0, + "learning_rate": 6.818544391680286e-06, + "logits/chosen": 514834368.0, + "logits/rejected": 410935808.0, + "logps/chosen": -422.48419189453125, + "logps/rejected": -353.292724609375, + "loss": 0.012, + "rewards/chosen": 4.340229034423828, + "rewards/margins": 13.333032608032227, + "rewards/rejected": -8.992803573608398, + "step": 4189 + }, + { + "epoch": 0.38282320694380995, + "grad_norm": 5.65625, + "kl": 0.0, + "learning_rate": 6.817204987743083e-06, + "logits/chosen": 647355904.0, + "logits/rejected": 346504192.0, + "logps/chosen": -286.5091959635417, + "logps/rejected": -287.9005432128906, + "loss": 0.1317, + "rewards/chosen": 2.8217480977376304, + "rewards/margins": 11.051029523213705, + "rewards/rejected": -8.229281425476074, + "step": 4190 + }, + { + "epoch": 0.3829145728643216, + "grad_norm": 3.421875, + "kl": 0.0, + "learning_rate": 6.8158654335398655e-06, + "logits/chosen": 458554794.6666667, + "logits/rejected": 1095276134.4, + "logps/chosen": -297.1865234375, + "logps/rejected": -433.58349609375, + "loss": 0.0203, + "rewards/chosen": 3.4170681635538735, + "rewards/margins": 11.771023241678874, + "rewards/rejected": -8.353955078125, + "step": 4191 + }, + { + "epoch": 0.38300593878483324, + "grad_norm": 4.65625, + "kl": 0.0, + "learning_rate": 6.814525729181401e-06, + "logits/chosen": 737608601.6, + "logits/rejected": 383135914.6666667, + "logps/chosen": -275.435888671875, + "logps/rejected": -463.1250813802083, + "loss": 0.0273, + "rewards/chosen": 3.2912620544433593, + "rewards/margins": 12.313918813069662, + "rewards/rejected": -9.022656758626303, + "step": 4192 + }, + { + "epoch": 0.3830973047053449, + "grad_norm": 1.34375, + "kl": 0.0, + "learning_rate": 6.8131858747784695e-06, + "logits/chosen": 633257813.3333334, + "logits/rejected": 617089331.2, + "logps/chosen": -229.18623860677084, + "logps/rejected": -452.093994140625, + "loss": 0.0062, + "rewards/chosen": 4.4265085856119795, + "rewards/margins": 11.797666422526042, + "rewards/rejected": -7.371157836914063, + "step": 4193 + }, + { + "epoch": 0.38318867062585654, + "grad_norm": 3.53125, + "kl": 0.0, + "learning_rate": 6.811845870441864e-06, + "logits/chosen": 738832822.8571428, + "logits/rejected": 214172704.0, + "logps/chosen": -357.2640904017857, + "logps/rejected": -325.5697937011719, + "loss": 0.0267, + "rewards/chosen": 3.8751539502825056, + "rewards/margins": 14.410795620509557, + "rewards/rejected": -10.53564167022705, + "step": 4194 + }, + { + "epoch": 0.3832800365463682, + "grad_norm": 1.421875, + "kl": 0.0, + "learning_rate": 6.810505716282395e-06, + "logits/chosen": 251713331.2, + "logits/rejected": 394260608.0, + "logps/chosen": -208.1271240234375, + "logps/rejected": -594.8288981119791, + "loss": 0.0122, + "rewards/chosen": 4.074815368652343, + "rewards/margins": 14.529855346679687, + "rewards/rejected": -10.455039978027344, + "step": 4195 + }, + { + "epoch": 0.38337140246687984, + "grad_norm": 7.90625, + "kl": 0.0, + "learning_rate": 6.8091654124108765e-06, + "logits/chosen": 401605728.0, + "logits/rejected": 613794560.0, + "logps/chosen": -330.8143005371094, + "logps/rejected": -614.5836181640625, + "loss": 0.1031, + "rewards/chosen": 4.6232428550720215, + "rewards/margins": 11.826402187347412, + "rewards/rejected": -7.203159332275391, + "step": 4196 + }, + { + "epoch": 0.3834627683873915, + "grad_norm": 1.6875, + "kl": 0.0, + "learning_rate": 6.807824958938142e-06, + "logits/chosen": 419159296.0, + "logits/rejected": 498029465.6, + "logps/chosen": -384.3117268880208, + "logps/rejected": -481.590380859375, + "loss": 0.0083, + "rewards/chosen": 3.9260737101236978, + "rewards/margins": 12.046485392252604, + "rewards/rejected": -8.120411682128907, + "step": 4197 + }, + { + "epoch": 0.38355413430790314, + "grad_norm": 3.84375, + "kl": 0.0, + "learning_rate": 6.806484355975032e-06, + "logits/chosen": 681732992.0, + "logits/rejected": 798751232.0, + "logps/chosen": -253.2493896484375, + "logps/rejected": -683.951953125, + "loss": 0.0226, + "rewards/chosen": 2.966517766316732, + "rewards/margins": 15.046487553914389, + "rewards/rejected": -12.079969787597657, + "step": 4198 + }, + { + "epoch": 0.3836455002284148, + "grad_norm": 4.78125, + "kl": 0.0, + "learning_rate": 6.8051436036324034e-06, + "logits/chosen": 936055466.6666666, + "logits/rejected": 717217177.6, + "logps/chosen": -276.2496337890625, + "logps/rejected": -676.24375, + "loss": 0.023, + "rewards/chosen": 3.156709353129069, + "rewards/margins": 13.3877472559611, + "rewards/rejected": -10.231037902832032, + "step": 4199 + }, + { + "epoch": 0.38373686614892644, + "grad_norm": 2.96875, + "kl": 0.0, + "learning_rate": 6.803802702021125e-06, + "logits/chosen": 658301866.6666666, + "logits/rejected": 857420492.8, + "logps/chosen": -299.3902994791667, + "logps/rejected": -715.311376953125, + "loss": 0.0148, + "rewards/chosen": 3.520310719807943, + "rewards/margins": 14.43799031575521, + "rewards/rejected": -10.917679595947266, + "step": 4200 + }, + { + "epoch": 0.3838282320694381, + "grad_norm": 0.88671875, + "kl": 0.0, + "learning_rate": 6.802461651252073e-06, + "logits/chosen": 458773589.3333333, + "logits/rejected": 760675020.8, + "logps/chosen": -297.7156575520833, + "logps/rejected": -437.3177734375, + "loss": 0.0053, + "rewards/chosen": 4.616636276245117, + "rewards/margins": 13.20131187438965, + "rewards/rejected": -8.584675598144532, + "step": 4201 + }, + { + "epoch": 0.38391959798994973, + "grad_norm": 14.625, + "kl": 0.0, + "learning_rate": 6.801120451436146e-06, + "logits/chosen": 1048671744.0, + "logits/rejected": 592833344.0, + "logps/chosen": -201.94558715820312, + "logps/rejected": -468.58013916015625, + "loss": 0.134, + "rewards/chosen": 2.050081968307495, + "rewards/margins": 9.357462644577026, + "rewards/rejected": -7.307380676269531, + "step": 4202 + }, + { + "epoch": 0.3840109639104614, + "grad_norm": 45.5, + "kl": 0.0, + "learning_rate": 6.799779102684243e-06, + "logits/chosen": 777121792.0, + "logits/rejected": 578781312.0, + "logps/chosen": -331.67568359375, + "logps/rejected": -487.4454345703125, + "loss": 0.1392, + "rewards/chosen": 1.8876434326171876, + "rewards/margins": 10.809686279296875, + "rewards/rejected": -8.922042846679688, + "step": 4203 + }, + { + "epoch": 0.38410232983097303, + "grad_norm": 0.357421875, + "kl": 0.0, + "learning_rate": 6.798437605107284e-06, + "logits/chosen": 261685808.0, + "logits/rejected": 675615597.7142857, + "logps/chosen": -197.4237060546875, + "logps/rejected": -491.55740792410717, + "loss": 0.0014, + "rewards/chosen": 4.545492649078369, + "rewards/margins": 14.902615887778145, + "rewards/rejected": -10.357123238699776, + "step": 4204 + }, + { + "epoch": 0.3841936957514847, + "grad_norm": 4.25, + "kl": 0.0, + "learning_rate": 6.7970959588162e-06, + "logits/chosen": 445641113.6, + "logits/rejected": 524734805.3333333, + "logps/chosen": -260.0133056640625, + "logps/rejected": -481.4798990885417, + "loss": 0.03, + "rewards/chosen": 3.302149200439453, + "rewards/margins": 12.528482182820639, + "rewards/rejected": -9.226332982381185, + "step": 4205 + }, + { + "epoch": 0.38428506167199633, + "grad_norm": 1.671875, + "kl": 0.0, + "learning_rate": 6.795754163921929e-06, + "logits/chosen": 570726698.6666666, + "logits/rejected": 421798048.0, + "logps/chosen": -358.9976399739583, + "logps/rejected": -712.2377319335938, + "loss": 0.0116, + "rewards/chosen": 4.340915044148763, + "rewards/margins": 14.388797124226887, + "rewards/rejected": -10.047882080078125, + "step": 4206 + }, + { + "epoch": 0.384376427592508, + "grad_norm": 2.625, + "kl": 0.0, + "learning_rate": 6.794412220535426e-06, + "logits/chosen": 424357696.0, + "logits/rejected": 433742762.6666667, + "logps/chosen": -292.80072021484375, + "logps/rejected": -463.4402262369792, + "loss": 0.0123, + "rewards/chosen": 3.8442206382751465, + "rewards/margins": 12.393370469411215, + "rewards/rejected": -8.549149831136068, + "step": 4207 + }, + { + "epoch": 0.38446779351301963, + "grad_norm": 6.03125, + "kl": 0.0, + "learning_rate": 6.7930701287676595e-06, + "logits/chosen": 363209600.0, + "logits/rejected": 574835008.0, + "logps/chosen": -246.01895141601562, + "logps/rejected": -365.8022155761719, + "loss": 0.0297, + "rewards/chosen": 3.33327054977417, + "rewards/margins": 10.743795394897461, + "rewards/rejected": -7.410524845123291, + "step": 4208 + }, + { + "epoch": 0.3845591594335313, + "grad_norm": 15.125, + "kl": 2.055919647216797, + "learning_rate": 6.791727888729605e-06, + "logits/chosen": 631338496.0, + "logits/rejected": 637694464.0, + "logps/chosen": -353.94740513392856, + "logps/rejected": -688.0010986328125, + "loss": 0.0983, + "rewards/chosen": 2.8106182643345425, + "rewards/margins": 11.502855164664133, + "rewards/rejected": -8.69223690032959, + "step": 4209 + }, + { + "epoch": 0.3846505253540429, + "grad_norm": 3.71875, + "kl": 0.0, + "learning_rate": 6.790385500532255e-06, + "logits/chosen": 419313177.6, + "logits/rejected": 346129216.0, + "logps/chosen": -291.3367919921875, + "logps/rejected": -459.8291829427083, + "loss": 0.0245, + "rewards/chosen": 4.205583190917968, + "rewards/margins": 12.75173594156901, + "rewards/rejected": -8.546152750651041, + "step": 4210 + }, + { + "epoch": 0.3847418912745546, + "grad_norm": 5.8125, + "kl": 0.0, + "learning_rate": 6.789042964286613e-06, + "logits/chosen": 634721408.0, + "logits/rejected": 459470080.0, + "logps/chosen": -403.5743408203125, + "logps/rejected": -425.5732421875, + "loss": 0.0384, + "rewards/chosen": 2.5415475368499756, + "rewards/margins": 10.763253927230835, + "rewards/rejected": -8.22170639038086, + "step": 4211 + }, + { + "epoch": 0.3848332571950662, + "grad_norm": 2.0625, + "kl": 0.0, + "learning_rate": 6.787700280103694e-06, + "logits/chosen": 597951232.0, + "logits/rejected": 1134331136.0, + "logps/chosen": -325.3292236328125, + "logps/rejected": -545.1719360351562, + "loss": 0.012, + "rewards/chosen": 4.151429176330566, + "rewards/margins": 13.487259864807129, + "rewards/rejected": -9.335830688476562, + "step": 4212 + }, + { + "epoch": 0.3849246231155779, + "grad_norm": 4.40625, + "kl": 0.0, + "learning_rate": 6.7863574480945235e-06, + "logits/chosen": 745245184.0, + "logits/rejected": 468354133.3333333, + "logps/chosen": -477.85185546875, + "logps/rejected": -459.3122965494792, + "loss": 0.0189, + "rewards/chosen": 3.8681949615478515, + "rewards/margins": 12.071014785766602, + "rewards/rejected": -8.20281982421875, + "step": 4213 + }, + { + "epoch": 0.3850159890360895, + "grad_norm": 4.125, + "kl": 0.0, + "learning_rate": 6.785014468370146e-06, + "logits/chosen": 529887658.6666667, + "logits/rejected": 379004672.0, + "logps/chosen": -302.157470703125, + "logps/rejected": -529.446044921875, + "loss": 0.037, + "rewards/chosen": 3.074777921040853, + "rewards/margins": 11.048840840657553, + "rewards/rejected": -7.974062919616699, + "step": 4214 + }, + { + "epoch": 0.3851073549566012, + "grad_norm": 0.6171875, + "kl": 0.0, + "learning_rate": 6.783671341041609e-06, + "logits/chosen": 636728362.6666666, + "logits/rejected": 534328320.0, + "logps/chosen": -408.8590494791667, + "logps/rejected": -447.620947265625, + "loss": 0.0032, + "rewards/chosen": 4.835729598999023, + "rewards/margins": 13.74961051940918, + "rewards/rejected": -8.913880920410156, + "step": 4215 + }, + { + "epoch": 0.3851987208771128, + "grad_norm": 3.65625, + "kl": 0.0, + "learning_rate": 6.782328066219976e-06, + "logits/chosen": 568610304.0, + "logits/rejected": 881565824.0, + "logps/chosen": -317.875, + "logps/rejected": -420.1864929199219, + "loss": 0.0259, + "rewards/chosen": 3.0636258125305176, + "rewards/margins": 13.476780414581299, + "rewards/rejected": -10.413154602050781, + "step": 4216 + }, + { + "epoch": 0.3852900867976245, + "grad_norm": 2.96875, + "kl": 0.0, + "learning_rate": 6.780984644016329e-06, + "logits/chosen": 895929139.2, + "logits/rejected": 460543274.6666667, + "logps/chosen": -409.59453125, + "logps/rejected": -319.72108968098956, + "loss": 0.0189, + "rewards/chosen": 3.730290985107422, + "rewards/margins": 11.574580764770507, + "rewards/rejected": -7.844289779663086, + "step": 4217 + }, + { + "epoch": 0.3853814527181361, + "grad_norm": 4.4375, + "kl": 0.0, + "learning_rate": 6.77964107454175e-06, + "logits/chosen": 857365824.0, + "logits/rejected": 498783456.0, + "logps/chosen": -406.64813232421875, + "logps/rejected": -366.44677734375, + "loss": 0.0166, + "rewards/chosen": 4.17708683013916, + "rewards/margins": 12.113200664520264, + "rewards/rejected": -7.9361138343811035, + "step": 4218 + }, + { + "epoch": 0.3854728186386478, + "grad_norm": 4.9375, + "kl": 0.0, + "learning_rate": 6.778297357907346e-06, + "logits/chosen": 657932288.0, + "logits/rejected": 322527712.0, + "logps/chosen": -587.3605346679688, + "logps/rejected": -392.1028137207031, + "loss": 0.0212, + "rewards/chosen": 3.7750213146209717, + "rewards/margins": 12.702475309371948, + "rewards/rejected": -8.927453994750977, + "step": 4219 + }, + { + "epoch": 0.3855641845591594, + "grad_norm": 1.8203125, + "kl": 0.0, + "learning_rate": 6.7769534942242254e-06, + "logits/chosen": 502808490.6666667, + "logits/rejected": 488225228.8, + "logps/chosen": -565.3280436197916, + "logps/rejected": -492.5416015625, + "loss": 0.0074, + "rewards/chosen": 4.0639495849609375, + "rewards/margins": 13.957434844970702, + "rewards/rejected": -9.893485260009765, + "step": 4220 + }, + { + "epoch": 0.3856555504796711, + "grad_norm": 3.640625, + "kl": 0.0, + "learning_rate": 6.775609483603516e-06, + "logits/chosen": 633386752.0, + "logits/rejected": 237652832.0, + "logps/chosen": -361.1687825520833, + "logps/rejected": -465.5794677734375, + "loss": 0.025, + "rewards/chosen": 3.7381807963053384, + "rewards/margins": 16.147030512491863, + "rewards/rejected": -12.408849716186523, + "step": 4221 + }, + { + "epoch": 0.3857469164001827, + "grad_norm": 2.734375, + "kl": 0.0, + "learning_rate": 6.774265326156351e-06, + "logits/chosen": 596834176.0, + "logits/rejected": 779946048.0, + "logps/chosen": -357.92254638671875, + "logps/rejected": -419.39483642578125, + "loss": 0.0207, + "rewards/chosen": 3.232365608215332, + "rewards/margins": 11.020708084106445, + "rewards/rejected": -7.788342475891113, + "step": 4222 + }, + { + "epoch": 0.3858382823206944, + "grad_norm": 4.84375, + "kl": 0.0, + "learning_rate": 6.772921021993885e-06, + "logits/chosen": 493104853.3333333, + "logits/rejected": 498467232.0, + "logps/chosen": -376.1630859375, + "logps/rejected": -437.9393310546875, + "loss": 0.0355, + "rewards/chosen": 3.5849135716756186, + "rewards/margins": 10.493688901265463, + "rewards/rejected": -6.908775329589844, + "step": 4223 + }, + { + "epoch": 0.385929648241206, + "grad_norm": 1.6953125, + "kl": 0.0, + "learning_rate": 6.771576571227274e-06, + "logits/chosen": 621462784.0, + "logits/rejected": 382313088.0, + "logps/chosen": -337.16748046875, + "logps/rejected": -274.93377685546875, + "loss": 0.0111, + "rewards/chosen": 4.610989888509114, + "rewards/margins": 10.753710110982258, + "rewards/rejected": -6.1427202224731445, + "step": 4224 + }, + { + "epoch": 0.3860210141617177, + "grad_norm": 72.5, + "kl": 0.0, + "learning_rate": 6.770231973967697e-06, + "logits/chosen": 319401173.3333333, + "logits/rejected": 357639372.8, + "logps/chosen": -162.75845336914062, + "logps/rejected": -482.709033203125, + "loss": 0.0492, + "rewards/chosen": 2.516297181447347, + "rewards/margins": 11.670928223927817, + "rewards/rejected": -9.154631042480469, + "step": 4225 + }, + { + "epoch": 0.3861123800822293, + "grad_norm": 31.5, + "kl": 0.0, + "learning_rate": 6.7688872303263365e-06, + "logits/chosen": 470942304.0, + "logits/rejected": 437826176.0, + "logps/chosen": -412.20697021484375, + "logps/rejected": -477.3814697265625, + "loss": 0.1168, + "rewards/chosen": 2.0860517024993896, + "rewards/margins": 10.61863112449646, + "rewards/rejected": -8.53257942199707, + "step": 4226 + }, + { + "epoch": 0.386203746002741, + "grad_norm": 12.1875, + "kl": 0.0, + "learning_rate": 6.767542340414391e-06, + "logits/chosen": 360194304.0, + "logits/rejected": 576927283.2, + "logps/chosen": -150.5701904296875, + "logps/rejected": -389.8601806640625, + "loss": 0.1167, + "rewards/chosen": 2.1548666954040527, + "rewards/margins": 10.44523458480835, + "rewards/rejected": -8.290367889404298, + "step": 4227 + }, + { + "epoch": 0.3862951119232526, + "grad_norm": 2.25, + "kl": 0.0, + "learning_rate": 6.7661973043430704e-06, + "logits/chosen": 570516544.0, + "logits/rejected": 429619104.0, + "logps/chosen": -324.32415771484375, + "logps/rejected": -472.6939697265625, + "loss": 0.0106, + "rewards/chosen": 4.538198471069336, + "rewards/margins": 13.206991195678711, + "rewards/rejected": -8.668792724609375, + "step": 4228 + }, + { + "epoch": 0.3863864778437643, + "grad_norm": 3.8125, + "kl": 0.0, + "learning_rate": 6.764852122223597e-06, + "logits/chosen": 569681920.0, + "logits/rejected": 524780448.0, + "logps/chosen": -440.68572998046875, + "logps/rejected": -496.8712463378906, + "loss": 0.0206, + "rewards/chosen": 3.8507680892944336, + "rewards/margins": 11.997580528259277, + "rewards/rejected": -8.146812438964844, + "step": 4229 + }, + { + "epoch": 0.3864778437642759, + "grad_norm": 2.484375, + "kl": 0.0, + "learning_rate": 6.763506794167207e-06, + "logits/chosen": 547564441.6, + "logits/rejected": 334102592.0, + "logps/chosen": -333.2380859375, + "logps/rejected": -324.2704264322917, + "loss": 0.0201, + "rewards/chosen": 3.541342926025391, + "rewards/margins": 10.355604680379232, + "rewards/rejected": -6.814261754353841, + "step": 4230 + }, + { + "epoch": 0.3865692096847876, + "grad_norm": 2.765625, + "kl": 0.0, + "learning_rate": 6.7621613202851415e-06, + "logits/chosen": 640918357.3333334, + "logits/rejected": 575836006.4, + "logps/chosen": -513.810546875, + "logps/rejected": -559.88994140625, + "loss": 0.0118, + "rewards/chosen": 3.617892583211263, + "rewards/margins": 13.29067014058431, + "rewards/rejected": -9.672777557373047, + "step": 4231 + }, + { + "epoch": 0.3866605756052992, + "grad_norm": 2.515625, + "kl": 0.0, + "learning_rate": 6.7608157006886635e-06, + "logits/chosen": 379169280.0, + "logits/rejected": 645613120.0, + "logps/chosen": -235.61588541666666, + "logps/rejected": -584.4696044921875, + "loss": 0.0167, + "rewards/chosen": 4.273638407389323, + "rewards/margins": 13.183694521586101, + "rewards/rejected": -8.910056114196777, + "step": 4232 + }, + { + "epoch": 0.3867519415258109, + "grad_norm": 4.96875, + "kl": 0.0, + "learning_rate": 6.7594699354890405e-06, + "logits/chosen": 875791808.0, + "logits/rejected": 1113325184.0, + "logps/chosen": -375.709716796875, + "logps/rejected": -469.1663513183594, + "loss": 0.0348, + "rewards/chosen": 2.8257174491882324, + "rewards/margins": 12.036098003387451, + "rewards/rejected": -9.210380554199219, + "step": 4233 + }, + { + "epoch": 0.3868433074463225, + "grad_norm": 7.46875, + "kl": 0.0, + "learning_rate": 6.758124024797556e-06, + "logits/rejected": 678179072.0, + "logps/rejected": -317.48870849609375, + "loss": 0.0623, + "rewards/rejected": -6.737871170043945, + "step": 4234 + }, + { + "epoch": 0.3869346733668342, + "grad_norm": 0.9921875, + "kl": 0.0, + "learning_rate": 6.756777968725503e-06, + "logits/chosen": 798104234.6666666, + "logits/rejected": 346628761.6, + "logps/chosen": -526.7736409505209, + "logps/rejected": -526.998828125, + "loss": 0.0049, + "rewards/chosen": 4.376940091451009, + "rewards/margins": 15.344954617818196, + "rewards/rejected": -10.968014526367188, + "step": 4235 + }, + { + "epoch": 0.3870260392873458, + "grad_norm": 5.625, + "kl": 0.0, + "learning_rate": 6.755431767384191e-06, + "logits/chosen": 906622634.6666666, + "logits/rejected": 743682176.0, + "logps/chosen": -238.53340657552084, + "logps/rejected": -555.1870727539062, + "loss": 0.0332, + "rewards/chosen": 3.4334805806477866, + "rewards/margins": 15.112996419270834, + "rewards/rejected": -11.679515838623047, + "step": 4236 + }, + { + "epoch": 0.3871174052078575, + "grad_norm": 2.796875, + "kl": 0.0, + "learning_rate": 6.754085420884934e-06, + "logits/chosen": 597811392.0, + "logits/rejected": 328007872.0, + "logps/chosen": -342.910888671875, + "logps/rejected": -421.0909423828125, + "loss": 0.0142, + "rewards/chosen": 4.017705917358398, + "rewards/margins": 12.821920394897461, + "rewards/rejected": -8.804214477539062, + "step": 4237 + }, + { + "epoch": 0.3872087711283691, + "grad_norm": 2.875, + "kl": 0.0, + "learning_rate": 6.752738929339066e-06, + "logits/chosen": 668599296.0, + "logits/rejected": 548936896.0, + "logps/chosen": -414.5232238769531, + "logps/rejected": -354.13616943359375, + "loss": 0.0167, + "rewards/chosen": 4.015214920043945, + "rewards/margins": 13.360823631286621, + "rewards/rejected": -9.345608711242676, + "step": 4238 + }, + { + "epoch": 0.3873001370488808, + "grad_norm": 13.9375, + "kl": 0.0, + "learning_rate": 6.7513922928579265e-06, + "logits/chosen": 322960768.0, + "logits/rejected": 528886752.0, + "logps/chosen": -488.2561950683594, + "logps/rejected": -580.6339111328125, + "loss": 0.0393, + "rewards/chosen": 2.9966659545898438, + "rewards/margins": 14.806474685668945, + "rewards/rejected": -11.809808731079102, + "step": 4239 + }, + { + "epoch": 0.3873915029693924, + "grad_norm": 20.875, + "kl": 0.0, + "learning_rate": 6.750045511552874e-06, + "logits/chosen": 554251349.3333334, + "logits/rejected": 549285888.0, + "logps/chosen": -282.22332763671875, + "logps/rejected": -372.09063720703125, + "loss": 0.0482, + "rewards/chosen": 3.349353790283203, + "rewards/margins": 9.839337825775146, + "rewards/rejected": -6.489984035491943, + "step": 4240 + }, + { + "epoch": 0.3874828688899041, + "grad_norm": 4.125, + "kl": 0.0, + "learning_rate": 6.7486985855352695e-06, + "logits/chosen": 413042688.0, + "logits/rejected": 404398464.0, + "logps/chosen": -263.71014404296875, + "logps/rejected": -326.5203857421875, + "loss": 0.0295, + "rewards/chosen": 3.4451891581217446, + "rewards/margins": 13.608159701029459, + "rewards/rejected": -10.162970542907715, + "step": 4241 + }, + { + "epoch": 0.3875742348104157, + "grad_norm": 1.71875, + "kl": 0.0, + "learning_rate": 6.747351514916496e-06, + "logits/chosen": 1399506688.0, + "logits/rejected": 583570986.6666666, + "logps/chosen": -530.5728759765625, + "logps/rejected": -560.2607421875, + "loss": 0.0083, + "rewards/chosen": 3.653420925140381, + "rewards/margins": 14.979240576426188, + "rewards/rejected": -11.325819651285807, + "step": 4242 + }, + { + "epoch": 0.3876656007309274, + "grad_norm": 1.890625, + "kl": 0.0, + "learning_rate": 6.74600429980794e-06, + "logits/chosen": 564709440.0, + "logits/rejected": 1128349013.3333333, + "logps/chosen": -203.2229461669922, + "logps/rejected": -581.7528889973959, + "loss": 0.0102, + "rewards/chosen": 3.185293674468994, + "rewards/margins": 11.902040322621664, + "rewards/rejected": -8.71674664815267, + "step": 4243 + }, + { + "epoch": 0.387756966651439, + "grad_norm": 1.109375, + "kl": 0.0, + "learning_rate": 6.744656940321007e-06, + "logits/chosen": 611251285.3333334, + "logits/rejected": 700864614.4, + "logps/chosen": -235.83085123697916, + "logps/rejected": -446.43447265625, + "loss": 0.0058, + "rewards/chosen": 4.789445241292317, + "rewards/margins": 13.1246945699056, + "rewards/rejected": -8.335249328613282, + "step": 4244 + }, + { + "epoch": 0.38784833257195067, + "grad_norm": 1.640625, + "kl": 0.0, + "learning_rate": 6.743309436567109e-06, + "logits/chosen": 454410784.0, + "logits/rejected": 486889045.3333333, + "logps/chosen": -388.8586120605469, + "logps/rejected": -598.0482991536459, + "loss": 0.0054, + "rewards/chosen": 4.599951267242432, + "rewards/margins": 12.79269520441691, + "rewards/rejected": -8.192743937174479, + "step": 4245 + }, + { + "epoch": 0.3879396984924623, + "grad_norm": 3.6875, + "kl": 0.0, + "learning_rate": 6.7419617886576735e-06, + "logits/chosen": 645741056.0, + "logits/rejected": 492486336.0, + "logps/chosen": -370.56341552734375, + "logps/rejected": -622.9173583984375, + "loss": 0.0179, + "rewards/chosen": 3.8529863357543945, + "rewards/margins": 12.330574035644531, + "rewards/rejected": -8.477587699890137, + "step": 4246 + }, + { + "epoch": 0.38803106441297397, + "grad_norm": 1.84375, + "kl": 0.0, + "learning_rate": 6.7406139967041375e-06, + "logits/chosen": 351951232.0, + "logits/rejected": 511941120.0, + "logps/chosen": -285.2317708333333, + "logps/rejected": -485.5111328125, + "loss": 0.0103, + "rewards/chosen": 4.369958877563477, + "rewards/margins": 14.667055130004883, + "rewards/rejected": -10.297096252441406, + "step": 4247 + }, + { + "epoch": 0.3881224303334856, + "grad_norm": 3.484375, + "kl": 0.0, + "learning_rate": 6.73926606081795e-06, + "logits/chosen": 682605738.6666666, + "logits/rejected": 624668774.4, + "logps/chosen": -246.08642578125, + "logps/rejected": -556.60712890625, + "loss": 0.0284, + "rewards/chosen": 2.690007209777832, + "rewards/margins": 12.364324378967286, + "rewards/rejected": -9.674317169189454, + "step": 4248 + }, + { + "epoch": 0.38821379625399727, + "grad_norm": 51.75, + "kl": 0.0, + "learning_rate": 6.737917981110575e-06, + "logits/chosen": 847798528.0, + "logits/rejected": 326679808.0, + "logps/chosen": -486.76005859375, + "logps/rejected": -261.4368896484375, + "loss": 0.0576, + "rewards/chosen": 3.5587982177734374, + "rewards/margins": 8.72180627187093, + "rewards/rejected": -5.163008054097493, + "step": 4249 + }, + { + "epoch": 0.3883051621745089, + "grad_norm": 2.828125, + "kl": 0.0, + "learning_rate": 6.736569757693485e-06, + "logits/chosen": 522622293.3333333, + "logits/rejected": 430505267.2, + "logps/chosen": -304.8290608723958, + "logps/rejected": -285.1998291015625, + "loss": 0.0169, + "rewards/chosen": 3.1946798960367837, + "rewards/margins": 10.142251459757487, + "rewards/rejected": -6.947571563720703, + "step": 4250 + }, + { + "epoch": 0.38839652809502057, + "grad_norm": 3.65625, + "kl": 0.0, + "learning_rate": 6.735221390678166e-06, + "logits/chosen": 434806118.4, + "logits/rejected": 634296490.6666666, + "logps/chosen": -277.3278564453125, + "logps/rejected": -354.1919352213542, + "loss": 0.028, + "rewards/chosen": 3.319113922119141, + "rewards/margins": 12.183339945475261, + "rewards/rejected": -8.86422602335612, + "step": 4251 + }, + { + "epoch": 0.3884878940155322, + "grad_norm": 4.625, + "kl": 0.0, + "learning_rate": 6.733872880176114e-06, + "logits/chosen": 524550400.0, + "logits/rejected": 557431872.0, + "logps/chosen": -245.10171508789062, + "logps/rejected": -410.7591552734375, + "loss": 0.0294, + "rewards/chosen": 3.36384654045105, + "rewards/margins": 12.589295148849487, + "rewards/rejected": -9.225448608398438, + "step": 4252 + }, + { + "epoch": 0.38857925993604386, + "grad_norm": 33.0, + "kl": 0.0, + "learning_rate": 6.732524226298841e-06, + "logits/chosen": 414659123.2, + "logits/rejected": 296293888.0, + "logps/chosen": -234.4600341796875, + "logps/rejected": -339.9206949869792, + "loss": 0.1767, + "rewards/chosen": 2.404168701171875, + "rewards/margins": 9.612914657592773, + "rewards/rejected": -7.208745956420898, + "step": 4253 + }, + { + "epoch": 0.3886706258565555, + "grad_norm": 5.1875, + "kl": 0.0, + "learning_rate": 6.7311754291578655e-06, + "logits/chosen": 452910976.0, + "logits/rejected": 608966144.0, + "logps/chosen": -208.5657958984375, + "logps/rejected": -483.68896484375, + "loss": 0.0424, + "rewards/chosen": 2.836803913116455, + "rewards/margins": 12.80547571182251, + "rewards/rejected": -9.968671798706055, + "step": 4254 + }, + { + "epoch": 0.38876199177706716, + "grad_norm": 3.234375, + "kl": 0.0, + "learning_rate": 6.729826488864722e-06, + "logits/chosen": 381974592.0, + "logits/rejected": 602942976.0, + "logps/chosen": -360.9842529296875, + "logps/rejected": -463.15155029296875, + "loss": 0.0146, + "rewards/chosen": 4.937726020812988, + "rewards/margins": 13.042256355285645, + "rewards/rejected": -8.104530334472656, + "step": 4255 + }, + { + "epoch": 0.3888533576975788, + "grad_norm": 4.1875, + "kl": 0.0, + "learning_rate": 6.728477405530954e-06, + "logits/chosen": 719393865.1428572, + "logits/rejected": 354544896.0, + "logps/chosen": -321.424072265625, + "logps/rejected": -322.7745056152344, + "loss": 0.0355, + "rewards/chosen": 3.472038541521345, + "rewards/margins": 11.277409826006208, + "rewards/rejected": -7.805371284484863, + "step": 4256 + }, + { + "epoch": 0.38894472361809046, + "grad_norm": 38.5, + "kl": 0.0, + "learning_rate": 6.727128179268121e-06, + "logits/chosen": 531885184.0, + "logits/rejected": 935158272.0, + "logps/chosen": -239.8420867919922, + "logps/rejected": -417.67608642578125, + "loss": 0.116, + "rewards/chosen": 3.477771043777466, + "rewards/margins": 10.853142023086548, + "rewards/rejected": -7.375370979309082, + "step": 4257 + }, + { + "epoch": 0.3890360895386021, + "grad_norm": 10.9375, + "kl": 0.0, + "learning_rate": 6.725778810187789e-06, + "logits/chosen": 444724522.6666667, + "logits/rejected": 638831718.4, + "logps/chosen": -167.5422159830729, + "logps/rejected": -401.85458984375, + "loss": 0.0353, + "rewards/chosen": 2.7121461232503257, + "rewards/margins": 11.740817387898764, + "rewards/rejected": -9.028671264648438, + "step": 4258 + }, + { + "epoch": 0.38912745545911376, + "grad_norm": 3.703125, + "kl": 0.0, + "learning_rate": 6.72442929840154e-06, + "logits/chosen": 564228608.0, + "logits/rejected": 668650496.0, + "logps/chosen": -251.859375, + "logps/rejected": -409.0356140136719, + "loss": 0.0252, + "rewards/chosen": 3.3995888233184814, + "rewards/margins": 12.727240800857544, + "rewards/rejected": -9.327651977539062, + "step": 4259 + }, + { + "epoch": 0.3892188213796254, + "grad_norm": 2.53125, + "kl": 0.0, + "learning_rate": 6.723079644020964e-06, + "logits/chosen": 289048883.2, + "logits/rejected": 515081088.0, + "logps/chosen": -260.0442626953125, + "logps/rejected": -418.1192220052083, + "loss": 0.0177, + "rewards/chosen": 3.9240543365478517, + "rewards/margins": 11.887650680541991, + "rewards/rejected": -7.963596343994141, + "step": 4260 + }, + { + "epoch": 0.38931018730013706, + "grad_norm": 0.984375, + "kl": 0.0, + "learning_rate": 6.7217298471576675e-06, + "logits/chosen": 420654208.0, + "logits/rejected": 326859059.2, + "logps/chosen": -267.56968180338544, + "logps/rejected": -470.7751953125, + "loss": 0.0062, + "rewards/chosen": 4.204512596130371, + "rewards/margins": 13.07111873626709, + "rewards/rejected": -8.866606140136719, + "step": 4261 + }, + { + "epoch": 0.3894015532206487, + "grad_norm": 1.4609375, + "kl": 0.0, + "learning_rate": 6.7203799079232635e-06, + "logits/chosen": 661064704.0, + "logits/rejected": 719599360.0, + "logps/chosen": -594.9044799804688, + "logps/rejected": -648.065185546875, + "loss": 0.0081, + "rewards/chosen": 4.411077976226807, + "rewards/margins": 13.591542720794678, + "rewards/rejected": -9.180464744567871, + "step": 4262 + }, + { + "epoch": 0.38949291914116035, + "grad_norm": 1.859375, + "kl": 0.0, + "learning_rate": 6.719029826429382e-06, + "logits/chosen": 851216725.3333334, + "logits/rejected": 387416883.2, + "logps/chosen": -435.9190266927083, + "logps/rejected": -499.264013671875, + "loss": 0.0079, + "rewards/chosen": 4.862258275349935, + "rewards/margins": 15.286005528767902, + "rewards/rejected": -10.423747253417968, + "step": 4263 + }, + { + "epoch": 0.389584285061672, + "grad_norm": 1.4140625, + "kl": 0.0, + "learning_rate": 6.717679602787659e-06, + "logits/chosen": 499809578.6666667, + "logits/rejected": 589775462.4, + "logps/chosen": -316.71044921875, + "logps/rejected": -435.224755859375, + "loss": 0.0068, + "rewards/chosen": 4.364644368489583, + "rewards/margins": 13.255719502766926, + "rewards/rejected": -8.891075134277344, + "step": 4264 + }, + { + "epoch": 0.38967565098218365, + "grad_norm": 2.203125, + "kl": 0.0, + "learning_rate": 6.716329237109749e-06, + "logits/chosen": 320794005.3333333, + "logits/rejected": 532064640.0, + "logps/chosen": -191.35498046875, + "logps/rejected": -530.2034912109375, + "loss": 0.0178, + "rewards/chosen": 4.324486414591472, + "rewards/margins": 13.971190134684246, + "rewards/rejected": -9.646703720092773, + "step": 4265 + }, + { + "epoch": 0.3897670169026953, + "grad_norm": 7.3125, + "kl": 0.0, + "learning_rate": 6.714978729507313e-06, + "logits/chosen": 392120768.0, + "logits/rejected": 635444522.6666666, + "logps/chosen": -204.42041015625, + "logps/rejected": -662.7020670572916, + "loss": 0.1181, + "rewards/chosen": 0.7502385377883911, + "rewards/margins": 11.405702710151672, + "rewards/rejected": -10.655464172363281, + "step": 4266 + }, + { + "epoch": 0.38985838282320695, + "grad_norm": 2.53125, + "kl": 0.0, + "learning_rate": 6.7136280800920265e-06, + "logits/chosen": 641143680.0, + "logits/rejected": 227061088.0, + "logps/chosen": -352.3852233886719, + "logps/rejected": -282.8667297363281, + "loss": 0.0149, + "rewards/chosen": 3.801285743713379, + "rewards/margins": 12.083298683166504, + "rewards/rejected": -8.282012939453125, + "step": 4267 + }, + { + "epoch": 0.38994974874371857, + "grad_norm": 0.423828125, + "kl": 0.0, + "learning_rate": 6.7122772889755735e-06, + "logits/chosen": 770725632.0, + "logits/rejected": 422325174.85714287, + "logps/chosen": -758.1795043945312, + "logps/rejected": -324.48081752232144, + "loss": 0.0014, + "rewards/chosen": 4.653558254241943, + "rewards/margins": 13.941301958901542, + "rewards/rejected": -9.287743704659599, + "step": 4268 + }, + { + "epoch": 0.39004111466423025, + "grad_norm": 2.265625, + "kl": 0.0, + "learning_rate": 6.710926356269654e-06, + "logits/chosen": 472118592.0, + "logits/rejected": 538915264.0, + "logps/chosen": -297.4051513671875, + "logps/rejected": -651.268310546875, + "loss": 0.0117, + "rewards/chosen": 4.543888092041016, + "rewards/margins": 13.405115127563477, + "rewards/rejected": -8.861227035522461, + "step": 4269 + }, + { + "epoch": 0.39013248058474187, + "grad_norm": 1.71875, + "kl": 0.0, + "learning_rate": 6.709575282085977e-06, + "logits/chosen": 885170517.3333334, + "logits/rejected": 529944217.6, + "logps/chosen": -340.54437255859375, + "logps/rejected": -367.6306884765625, + "loss": 0.011, + "rewards/chosen": 3.845982869466146, + "rewards/margins": 10.602728017171223, + "rewards/rejected": -6.756745147705078, + "step": 4270 + }, + { + "epoch": 0.39022384650525355, + "grad_norm": 5.40625, + "kl": 0.0, + "learning_rate": 6.708224066536263e-06, + "logits/chosen": 1069109824.0, + "logits/rejected": 619896384.0, + "logps/chosen": -231.62423706054688, + "logps/rejected": -370.072998046875, + "loss": 0.0317, + "rewards/chosen": 3.3832359313964844, + "rewards/margins": 11.311936855316162, + "rewards/rejected": -7.928700923919678, + "step": 4271 + }, + { + "epoch": 0.39031521242576517, + "grad_norm": 3.34375, + "kl": 0.0, + "learning_rate": 6.706872709732247e-06, + "logits/chosen": 472331468.8, + "logits/rejected": 503926016.0, + "logps/chosen": -375.443603515625, + "logps/rejected": -526.4957682291666, + "loss": 0.0235, + "rewards/chosen": 3.633187484741211, + "rewards/margins": 12.043782424926757, + "rewards/rejected": -8.410594940185547, + "step": 4272 + }, + { + "epoch": 0.39040657834627684, + "grad_norm": 0.302734375, + "kl": 0.0, + "learning_rate": 6.705521211785672e-06, + "logits/chosen": 232582997.33333334, + "logits/rejected": 493646950.4, + "logps/chosen": -243.1820068359375, + "logps/rejected": -559.10517578125, + "loss": 0.0017, + "rewards/chosen": 5.825097401936849, + "rewards/margins": 14.063729604085285, + "rewards/rejected": -8.238632202148438, + "step": 4273 + }, + { + "epoch": 0.39049794426678847, + "grad_norm": 4.46875, + "kl": 0.0, + "learning_rate": 6.704169572808294e-06, + "logits/chosen": 452800877.71428573, + "logits/rejected": 528980672.0, + "logps/chosen": -332.36928013392856, + "logps/rejected": -583.609619140625, + "loss": 0.0407, + "rewards/chosen": 3.450878143310547, + "rewards/margins": 11.402080535888672, + "rewards/rejected": -7.951202392578125, + "step": 4274 + }, + { + "epoch": 0.39058931018730014, + "grad_norm": 2.890625, + "kl": 0.0, + "learning_rate": 6.702817792911882e-06, + "logits/chosen": 412411989.3333333, + "logits/rejected": 323922585.6, + "logps/chosen": -469.8658040364583, + "logps/rejected": -263.966455078125, + "loss": 0.014, + "rewards/chosen": 3.6280040740966797, + "rewards/margins": 11.120378494262695, + "rewards/rejected": -7.492374420166016, + "step": 4275 + }, + { + "epoch": 0.39068067610781176, + "grad_norm": 7.125, + "kl": 0.0, + "learning_rate": 6.701465872208216e-06, + "logits/chosen": 642004377.6, + "logits/rejected": 388080341.3333333, + "logps/chosen": -555.721484375, + "logps/rejected": -553.0078531901041, + "loss": 0.0421, + "rewards/chosen": 2.7734052658081056, + "rewards/margins": 14.682668876647949, + "rewards/rejected": -11.909263610839844, + "step": 4276 + }, + { + "epoch": 0.39077204202832344, + "grad_norm": 2.765625, + "kl": 0.0, + "learning_rate": 6.700113810809085e-06, + "logits/chosen": 715588288.0, + "logits/rejected": 733815744.0, + "logps/chosen": -452.3260803222656, + "logps/rejected": -568.66748046875, + "loss": 0.0176, + "rewards/chosen": 3.4131479263305664, + "rewards/margins": 12.69931411743164, + "rewards/rejected": -9.286166191101074, + "step": 4277 + }, + { + "epoch": 0.39086340794883506, + "grad_norm": 5.59375, + "kl": 0.0, + "learning_rate": 6.698761608826297e-06, + "logits/chosen": 416007082.6666667, + "logits/rejected": 180507136.0, + "logps/chosen": -330.7978922526042, + "logps/rejected": -416.19488525390625, + "loss": 0.0453, + "rewards/chosen": 3.354034423828125, + "rewards/margins": 14.579681396484375, + "rewards/rejected": -11.22564697265625, + "step": 4278 + }, + { + "epoch": 0.39095477386934674, + "grad_norm": 1.6171875, + "kl": 0.0, + "learning_rate": 6.697409266371662e-06, + "logits/chosen": 368893088.0, + "logits/rejected": 458989696.0, + "logps/chosen": -355.3214416503906, + "logps/rejected": -362.63775634765625, + "loss": 0.0114, + "rewards/chosen": 3.88815975189209, + "rewards/margins": 12.746993064880371, + "rewards/rejected": -8.858833312988281, + "step": 4279 + }, + { + "epoch": 0.39104613978985836, + "grad_norm": 3.0, + "kl": 0.0, + "learning_rate": 6.696056783557007e-06, + "logits/chosen": 536319936.0, + "logits/rejected": 711774122.6666666, + "logps/chosen": -282.8929443359375, + "logps/rejected": -692.6451822916666, + "loss": 0.0136, + "rewards/chosen": 3.707597494125366, + "rewards/margins": 12.81568439801534, + "rewards/rejected": -9.108086903889975, + "step": 4280 + }, + { + "epoch": 0.39113750571037004, + "grad_norm": 2.171875, + "kl": 0.0, + "learning_rate": 6.6947041604941695e-06, + "logits/chosen": 420786227.2, + "logits/rejected": 413466538.6666667, + "logps/chosen": -372.0478515625, + "logps/rejected": -319.2386881510417, + "loss": 0.01, + "rewards/chosen": 4.603509521484375, + "rewards/margins": 12.685915883382162, + "rewards/rejected": -8.082406361897787, + "step": 4281 + }, + { + "epoch": 0.39122887163088166, + "grad_norm": 3.453125, + "kl": 0.0, + "learning_rate": 6.693351397295001e-06, + "logits/chosen": 592711808.0, + "logits/rejected": 667971840.0, + "logps/chosen": -330.204345703125, + "logps/rejected": -318.39361572265625, + "loss": 0.0192, + "rewards/chosen": 4.032447814941406, + "rewards/margins": 12.525344848632812, + "rewards/rejected": -8.492897033691406, + "step": 4282 + }, + { + "epoch": 0.39132023755139334, + "grad_norm": 3.8125, + "kl": 0.0, + "learning_rate": 6.691998494071361e-06, + "logits/chosen": 662486698.6666666, + "logits/rejected": 488639692.8, + "logps/chosen": -458.5535481770833, + "logps/rejected": -662.76875, + "loss": 0.0181, + "rewards/chosen": 3.0506633122762046, + "rewards/margins": 14.56520799001058, + "rewards/rejected": -11.514544677734374, + "step": 4283 + }, + { + "epoch": 0.39141160347190496, + "grad_norm": 2.453125, + "kl": 0.0, + "learning_rate": 6.690645450935123e-06, + "logits/chosen": 617860949.3333334, + "logits/rejected": 637846400.0, + "logps/chosen": -387.8191731770833, + "logps/rejected": -432.7530822753906, + "loss": 0.015, + "rewards/chosen": 4.106848398844401, + "rewards/margins": 13.054931322733562, + "rewards/rejected": -8.94808292388916, + "step": 4284 + }, + { + "epoch": 0.39150296939241663, + "grad_norm": 3.203125, + "kl": 0.0, + "learning_rate": 6.689292267998169e-06, + "logits/chosen": 650913280.0, + "logits/rejected": 502315861.3333333, + "logps/chosen": -368.134033203125, + "logps/rejected": -388.37890625, + "loss": 0.0256, + "rewards/chosen": 3.2666481018066404, + "rewards/margins": 9.93298797607422, + "rewards/rejected": -6.666339874267578, + "step": 4285 + }, + { + "epoch": 0.39159433531292825, + "grad_norm": 36.5, + "kl": 0.0, + "learning_rate": 6.687938945372397e-06, + "logits/chosen": 385715430.4, + "logits/rejected": 419086592.0, + "logps/chosen": -160.2885498046875, + "logps/rejected": -392.149658203125, + "loss": 0.0688, + "rewards/chosen": 2.91788330078125, + "rewards/margins": 8.069877688090006, + "rewards/rejected": -5.151994387308757, + "step": 4286 + }, + { + "epoch": 0.39168570123343993, + "grad_norm": 4.875, + "kl": 0.0, + "learning_rate": 6.6865854831697145e-06, + "logits/chosen": 955044249.6, + "logits/rejected": 476754944.0, + "logps/chosen": -268.364599609375, + "logps/rejected": -318.4770914713542, + "loss": 0.0414, + "rewards/chosen": 3.01769962310791, + "rewards/margins": 12.472968482971192, + "rewards/rejected": -9.455268859863281, + "step": 4287 + }, + { + "epoch": 0.39177706715395155, + "grad_norm": 3.578125, + "kl": 0.0, + "learning_rate": 6.6852318815020355e-06, + "logits/chosen": 327106752.0, + "logits/rejected": 329786400.0, + "logps/chosen": -288.6999918619792, + "logps/rejected": -659.06982421875, + "loss": 0.1438, + "rewards/chosen": 2.554114500681559, + "rewards/margins": 15.898772398630777, + "rewards/rejected": -13.344657897949219, + "step": 4288 + }, + { + "epoch": 0.39186843307446323, + "grad_norm": 2.4375, + "kl": 0.0, + "learning_rate": 6.683878140481297e-06, + "logits/chosen": 553076531.2, + "logits/rejected": 551385258.6666666, + "logps/chosen": -328.957666015625, + "logps/rejected": -597.1717122395834, + "loss": 0.0312, + "rewards/chosen": 3.5080745697021483, + "rewards/margins": 11.48604850769043, + "rewards/rejected": -7.977973937988281, + "step": 4289 + }, + { + "epoch": 0.39195979899497485, + "grad_norm": 2.921875, + "kl": 0.0, + "learning_rate": 6.682524260219436e-06, + "logits/chosen": 794349440.0, + "logits/rejected": 488450240.0, + "logps/chosen": -291.60565185546875, + "logps/rejected": -548.5921630859375, + "loss": 0.0171, + "rewards/chosen": 3.3641257286071777, + "rewards/margins": 13.911400318145752, + "rewards/rejected": -10.547274589538574, + "step": 4290 + }, + { + "epoch": 0.3920511649154865, + "grad_norm": 2.671875, + "kl": 0.0, + "learning_rate": 6.6811702408284085e-06, + "logits/chosen": 536559786.6666667, + "logits/rejected": 172178576.0, + "logps/chosen": -397.24267578125, + "logps/rejected": -268.29327392578125, + "loss": 0.0187, + "rewards/chosen": 3.919891357421875, + "rewards/margins": 13.243188858032227, + "rewards/rejected": -9.323297500610352, + "step": 4291 + }, + { + "epoch": 0.39214253083599815, + "grad_norm": 17.375, + "kl": 0.0, + "learning_rate": 6.679816082420179e-06, + "logits/chosen": 572453034.6666666, + "logits/rejected": 987083520.0, + "logps/chosen": -376.5232747395833, + "logps/rejected": -634.993896484375, + "loss": 0.1365, + "rewards/chosen": 2.5339158376057944, + "rewards/margins": 12.575818379720053, + "rewards/rejected": -10.041902542114258, + "step": 4292 + }, + { + "epoch": 0.3922338967565098, + "grad_norm": 5.59375, + "kl": 0.0, + "learning_rate": 6.678461785106722e-06, + "logits/chosen": 484157344.0, + "logits/rejected": 588140288.0, + "logps/chosen": -356.4754638671875, + "logps/rejected": -499.8551330566406, + "loss": 0.0248, + "rewards/chosen": 3.526942253112793, + "rewards/margins": 11.861112594604492, + "rewards/rejected": -8.3341703414917, + "step": 4293 + }, + { + "epoch": 0.39232526267702145, + "grad_norm": 52.0, + "kl": 0.0, + "learning_rate": 6.677107349000027e-06, + "logits/chosen": 475725696.0, + "logits/rejected": 776591923.2, + "logps/chosen": -238.7816162109375, + "logps/rejected": -742.72900390625, + "loss": 0.0612, + "rewards/chosen": 2.814822514851888, + "rewards/margins": 14.461618932088216, + "rewards/rejected": -11.646796417236327, + "step": 4294 + }, + { + "epoch": 0.3924166285975331, + "grad_norm": 3.703125, + "kl": 0.0, + "learning_rate": 6.675752774212093e-06, + "logits/chosen": 726760618.6666666, + "logits/rejected": 507700256.0, + "logps/chosen": -348.7314860026042, + "logps/rejected": -460.42340087890625, + "loss": 0.0326, + "rewards/chosen": 3.3246285120646157, + "rewards/margins": 10.388910929361979, + "rewards/rejected": -7.064282417297363, + "step": 4295 + }, + { + "epoch": 0.39250799451804474, + "grad_norm": 7.28125, + "kl": 0.0, + "learning_rate": 6.674398060854931e-06, + "logits/chosen": 497935981.71428573, + "logits/rejected": 601536576.0, + "logps/chosen": -307.1099330357143, + "logps/rejected": -495.53167724609375, + "loss": 0.0431, + "rewards/chosen": 3.5534910474504744, + "rewards/margins": 11.992468288966588, + "rewards/rejected": -8.438977241516113, + "step": 4296 + }, + { + "epoch": 0.3925993604385564, + "grad_norm": 5.1875, + "kl": 0.0, + "learning_rate": 6.673043209040564e-06, + "logits/chosen": 547041865.1428572, + "logits/rejected": 430228160.0, + "logps/chosen": -310.62088448660717, + "logps/rejected": -314.44952392578125, + "loss": 0.0404, + "rewards/chosen": 3.384977340698242, + "rewards/margins": 10.483288288116455, + "rewards/rejected": -7.098310947418213, + "step": 4297 + }, + { + "epoch": 0.39269072635906804, + "grad_norm": 3.78125, + "kl": 0.0, + "learning_rate": 6.671688218881024e-06, + "logits/chosen": 704730304.0, + "logits/rejected": 676584448.0, + "logps/chosen": -496.17498779296875, + "logps/rejected": -546.4842529296875, + "loss": 0.0238, + "rewards/chosen": 3.3922111988067627, + "rewards/margins": 13.053319215774536, + "rewards/rejected": -9.661108016967773, + "step": 4298 + }, + { + "epoch": 0.3927820922795797, + "grad_norm": 2.984375, + "kl": 0.0, + "learning_rate": 6.670333090488357e-06, + "logits/chosen": 578031308.8, + "logits/rejected": 302786133.3333333, + "logps/chosen": -393.12177734375, + "logps/rejected": -404.6143391927083, + "loss": 0.0194, + "rewards/chosen": 3.545802688598633, + "rewards/margins": 15.608007685343424, + "rewards/rejected": -12.062204996744791, + "step": 4299 + }, + { + "epoch": 0.39287345820009134, + "grad_norm": 9.5, + "kl": 0.0, + "learning_rate": 6.668977823974619e-06, + "logits/chosen": 824933504.0, + "logits/rejected": 382178048.0, + "logps/chosen": -483.937744140625, + "logps/rejected": -500.0154622395833, + "loss": 0.0109, + "rewards/chosen": 4.7304840087890625, + "rewards/margins": 12.615139643351238, + "rewards/rejected": -7.884655634562175, + "step": 4300 + }, + { + "epoch": 0.392964824120603, + "grad_norm": 3.015625, + "kl": 0.0, + "learning_rate": 6.667622419451878e-06, + "logits/chosen": 424420121.6, + "logits/rejected": 774259029.3333334, + "logps/chosen": -253.04560546875, + "logps/rejected": -384.7281901041667, + "loss": 0.027, + "rewards/chosen": 3.282329559326172, + "rewards/margins": 13.167755126953125, + "rewards/rejected": -9.885425567626953, + "step": 4301 + }, + { + "epoch": 0.39305619004111464, + "grad_norm": 2.796875, + "kl": 0.0, + "learning_rate": 6.6662668770322136e-06, + "logits/chosen": 709977429.3333334, + "logits/rejected": 593876531.2, + "logps/chosen": -513.7803548177084, + "logps/rejected": -440.094189453125, + "loss": 0.0147, + "rewards/chosen": 3.5972280502319336, + "rewards/margins": 15.701524925231933, + "rewards/rejected": -12.104296875, + "step": 4302 + }, + { + "epoch": 0.3931475559616263, + "grad_norm": 20.0, + "kl": 0.0, + "learning_rate": 6.664911196827717e-06, + "logits/chosen": 432478784.0, + "logits/rejected": 538600384.0, + "logps/chosen": -309.8326416015625, + "logps/rejected": -522.6845092773438, + "loss": 0.0377, + "rewards/chosen": 3.109467029571533, + "rewards/margins": 11.803680896759033, + "rewards/rejected": -8.6942138671875, + "step": 4303 + }, + { + "epoch": 0.39323892188213794, + "grad_norm": 1.09375, + "kl": 0.0, + "learning_rate": 6.663555378950492e-06, + "logits/chosen": 235359936.0, + "logits/rejected": 511019840.0, + "logps/chosen": -174.7540283203125, + "logps/rejected": -580.11865234375, + "loss": 0.0068, + "rewards/chosen": 4.728506565093994, + "rewards/margins": 13.20779275894165, + "rewards/rejected": -8.479286193847656, + "step": 4304 + }, + { + "epoch": 0.3933302878026496, + "grad_norm": 2.03125, + "kl": 0.0, + "learning_rate": 6.662199423512648e-06, + "logits/chosen": 611557760.0, + "logits/rejected": 394535321.6, + "logps/chosen": -542.1968587239584, + "logps/rejected": -318.1768798828125, + "loss": 0.011, + "rewards/chosen": 3.596681594848633, + "rewards/margins": 11.167197036743165, + "rewards/rejected": -7.570515441894531, + "step": 4305 + }, + { + "epoch": 0.39342165372316124, + "grad_norm": 41.0, + "kl": 0.0, + "learning_rate": 6.660843330626313e-06, + "logits/chosen": 815358336.0, + "logits/rejected": 614749440.0, + "logps/chosen": -619.3587646484375, + "logps/rejected": -324.0624186197917, + "loss": 0.0932, + "rewards/chosen": 3.3150575160980225, + "rewards/margins": 10.151498556137085, + "rewards/rejected": -6.8364410400390625, + "step": 4306 + }, + { + "epoch": 0.3935130196436729, + "grad_norm": 8.5625, + "kl": 0.0, + "learning_rate": 6.659487100403622e-06, + "logits/chosen": 278733952.0, + "logits/rejected": 540317248.0, + "logps/chosen": -156.8043975830078, + "logps/rejected": -450.863037109375, + "loss": 0.0151, + "rewards/chosen": 4.7779459953308105, + "rewards/margins": 12.779716968536377, + "rewards/rejected": -8.001770973205566, + "step": 4307 + }, + { + "epoch": 0.39360438556418453, + "grad_norm": 2.75, + "kl": 0.0, + "learning_rate": 6.6581307329567245e-06, + "logits/chosen": 640892458.6666666, + "logits/rejected": 427695936.0, + "logps/chosen": -282.3768310546875, + "logps/rejected": -564.8568115234375, + "loss": 0.0189, + "rewards/chosen": 4.232265472412109, + "rewards/margins": 14.73965072631836, + "rewards/rejected": -10.50738525390625, + "step": 4308 + }, + { + "epoch": 0.3936957514846962, + "grad_norm": 2.984375, + "kl": 0.0, + "learning_rate": 6.656774228397777e-06, + "logits/chosen": 654790963.2, + "logits/rejected": 475636778.6666667, + "logps/chosen": -339.697998046875, + "logps/rejected": -472.0365397135417, + "loss": 0.0161, + "rewards/chosen": 4.162152481079102, + "rewards/margins": 14.706814956665038, + "rewards/rejected": -10.544662475585938, + "step": 4309 + }, + { + "epoch": 0.39378711740520783, + "grad_norm": 4.125, + "kl": 0.0, + "learning_rate": 6.655417586838952e-06, + "logits/chosen": 928094208.0, + "logits/rejected": 1335783424.0, + "logps/chosen": -339.17087809244794, + "logps/rejected": -347.3169250488281, + "loss": 0.0288, + "rewards/chosen": 3.4521484375, + "rewards/margins": 11.121315002441406, + "rewards/rejected": -7.669166564941406, + "step": 4310 + }, + { + "epoch": 0.3938784833257195, + "grad_norm": 2.5, + "kl": 0.0, + "learning_rate": 6.654060808392431e-06, + "logits/chosen": 616504128.0, + "logits/rejected": 468363008.0, + "logps/chosen": -444.0788269042969, + "logps/rejected": -537.4740600585938, + "loss": 0.0197, + "rewards/chosen": 3.5589499473571777, + "rewards/margins": 12.849416255950928, + "rewards/rejected": -9.29046630859375, + "step": 4311 + }, + { + "epoch": 0.39396984924623113, + "grad_norm": 1.1640625, + "kl": 0.0, + "learning_rate": 6.6527038931704056e-06, + "logits/chosen": 503010560.0, + "logits/rejected": 708652697.6, + "logps/chosen": -260.5882568359375, + "logps/rejected": -675.34072265625, + "loss": 0.009, + "rewards/chosen": 4.101327896118164, + "rewards/margins": 11.666371536254882, + "rewards/rejected": -7.565043640136719, + "step": 4312 + }, + { + "epoch": 0.3940612151667428, + "grad_norm": 1.1640625, + "kl": 0.0, + "learning_rate": 6.651346841285081e-06, + "logits/chosen": 326426368.0, + "logits/rejected": 644471424.0, + "logps/chosen": -118.27487182617188, + "logps/rejected": -304.06396484375, + "loss": 0.021, + "rewards/chosen": 4.0074896812438965, + "rewards/margins": 11.359753608703613, + "rewards/rejected": -7.352263927459717, + "step": 4313 + }, + { + "epoch": 0.39415258108725443, + "grad_norm": 12.6875, + "kl": 0.0, + "learning_rate": 6.649989652848674e-06, + "logits/chosen": 537964748.8, + "logits/rejected": 378822229.3333333, + "logps/chosen": -334.9557861328125, + "logps/rejected": -501.52734375, + "loss": 0.125, + "rewards/chosen": 2.246855926513672, + "rewards/margins": 13.143094889322917, + "rewards/rejected": -10.896238962809244, + "step": 4314 + }, + { + "epoch": 0.3942439470077661, + "grad_norm": 2.421875, + "kl": 0.0, + "learning_rate": 6.648632327973408e-06, + "logits/chosen": 606918848.0, + "logits/rejected": 503832800.0, + "logps/chosen": -332.8084716796875, + "logps/rejected": -518.6275634765625, + "loss": 0.0135, + "rewards/chosen": 4.281949043273926, + "rewards/margins": 13.014388084411621, + "rewards/rejected": -8.732439041137695, + "step": 4315 + }, + { + "epoch": 0.3943353129282777, + "grad_norm": 14.875, + "kl": 0.0, + "learning_rate": 6.647274866771525e-06, + "logits/chosen": 651504170.6666666, + "logits/rejected": 766906163.2, + "logps/chosen": -289.84409586588544, + "logps/rejected": -635.332421875, + "loss": 0.1127, + "rewards/chosen": 2.0518458684285483, + "rewards/margins": 9.747693188985188, + "rewards/rejected": -7.6958473205566404, + "step": 4316 + }, + { + "epoch": 0.3944266788487894, + "grad_norm": 2.921875, + "kl": 0.0, + "learning_rate": 6.645917269355271e-06, + "logits/chosen": 588501350.4, + "logits/rejected": 980370261.3333334, + "logps/chosen": -250.74375, + "logps/rejected": -541.2919108072916, + "loss": 0.0267, + "rewards/chosen": 3.125438690185547, + "rewards/margins": 13.380872090657553, + "rewards/rejected": -10.255433400472006, + "step": 4317 + }, + { + "epoch": 0.394518044769301, + "grad_norm": 3.328125, + "kl": 0.0, + "learning_rate": 6.644559535836909e-06, + "logits/chosen": 410252384.0, + "logits/rejected": 647550805.3333334, + "logps/chosen": -228.44931030273438, + "logps/rejected": -377.06591796875, + "loss": 0.0146, + "rewards/chosen": 4.0257439613342285, + "rewards/margins": 11.787593046824139, + "rewards/rejected": -7.761849085489909, + "step": 4318 + }, + { + "epoch": 0.3946094106898127, + "grad_norm": 1.6953125, + "kl": 0.0, + "learning_rate": 6.643201666328713e-06, + "logits/chosen": 510892416.0, + "logits/rejected": 254423500.8, + "logps/chosen": -447.7178548177083, + "logps/rejected": -393.00283203125, + "loss": 0.0106, + "rewards/chosen": 3.662781079610189, + "rewards/margins": 11.662797864278158, + "rewards/rejected": -8.000016784667968, + "step": 4319 + }, + { + "epoch": 0.3947007766103243, + "grad_norm": 4.46875, + "kl": 0.0, + "learning_rate": 6.641843660942961e-06, + "logits/chosen": 531228928.0, + "logits/rejected": 526658355.2, + "logps/chosen": -371.4711507161458, + "logps/rejected": -496.806591796875, + "loss": 0.0222, + "rewards/chosen": 3.4484640757242837, + "rewards/margins": 10.78834597269694, + "rewards/rejected": -7.339881896972656, + "step": 4320 + }, + { + "epoch": 0.394792142530836, + "grad_norm": 2.40625, + "kl": 0.0, + "learning_rate": 6.640485519791953e-06, + "logits/chosen": 677680640.0, + "logits/rejected": 360574400.0, + "logps/chosen": -393.89544677734375, + "logps/rejected": -454.3970947265625, + "loss": 0.0158, + "rewards/chosen": 3.629831075668335, + "rewards/margins": 12.004853963851929, + "rewards/rejected": -8.375022888183594, + "step": 4321 + }, + { + "epoch": 0.3948835084513476, + "grad_norm": 15.0625, + "kl": 0.0, + "learning_rate": 6.6391272429879886e-06, + "logits/chosen": 790580992.0, + "logits/rejected": 536174624.0, + "logps/chosen": -275.5173645019531, + "logps/rejected": -369.9266357421875, + "loss": 0.0372, + "rewards/chosen": 2.7795467376708984, + "rewards/margins": 11.263252258300781, + "rewards/rejected": -8.483705520629883, + "step": 4322 + }, + { + "epoch": 0.3949748743718593, + "grad_norm": 3.1875, + "kl": 0.0, + "learning_rate": 6.637768830643391e-06, + "logits/chosen": 541081770.6666666, + "logits/rejected": 493461606.4, + "logps/chosen": -125.73000081380208, + "logps/rejected": -542.945947265625, + "loss": 0.0256, + "rewards/chosen": 3.5718180338541665, + "rewards/margins": 14.77933603922526, + "rewards/rejected": -11.207518005371094, + "step": 4323 + }, + { + "epoch": 0.3950662402923709, + "grad_norm": 4.375, + "kl": 0.0, + "learning_rate": 6.6364102828704826e-06, + "logits/chosen": 1871845120.0, + "logits/rejected": 652518912.0, + "logps/chosen": -287.30499267578125, + "logps/rejected": -484.0457356770833, + "loss": 0.0178, + "rewards/chosen": 2.6640868186950684, + "rewards/margins": 11.432552496592203, + "rewards/rejected": -8.768465677897135, + "step": 4324 + }, + { + "epoch": 0.3951576062128826, + "grad_norm": 3.5625, + "kl": 0.0, + "learning_rate": 6.635051599781608e-06, + "logits/chosen": 393765120.0, + "logits/rejected": 140183456.0, + "logps/chosen": -389.090087890625, + "logps/rejected": -165.59429931640625, + "loss": 0.0264, + "rewards/chosen": 3.814722696940104, + "rewards/margins": 11.031832377115885, + "rewards/rejected": -7.217109680175781, + "step": 4325 + }, + { + "epoch": 0.3952489721333942, + "grad_norm": 2.359375, + "kl": 0.0, + "learning_rate": 6.633692781489114e-06, + "logits/chosen": 390960960.0, + "logits/rejected": 403680000.0, + "logps/chosen": -341.2232666015625, + "logps/rejected": -591.8218587239584, + "loss": 0.009, + "rewards/chosen": 3.8916168212890625, + "rewards/margins": 13.342076619466146, + "rewards/rejected": -9.450459798177084, + "step": 4326 + }, + { + "epoch": 0.3953403380539059, + "grad_norm": 2.671875, + "kl": 0.0, + "learning_rate": 6.632333828105364e-06, + "logits/chosen": 454518869.3333333, + "logits/rejected": 470747238.4, + "logps/chosen": -292.9319661458333, + "logps/rejected": -563.5431640625, + "loss": 0.017, + "rewards/chosen": 3.7747484842936196, + "rewards/margins": 14.185049692789713, + "rewards/rejected": -10.410301208496094, + "step": 4327 + }, + { + "epoch": 0.39543170397441757, + "grad_norm": 2.90625, + "kl": 0.0, + "learning_rate": 6.630974739742731e-06, + "logits/chosen": 334915008.0, + "logits/rejected": 607237440.0, + "logps/chosen": -269.0240783691406, + "logps/rejected": -591.6922607421875, + "loss": 0.0141, + "rewards/chosen": 4.259449481964111, + "rewards/margins": 12.678987979888916, + "rewards/rejected": -8.419538497924805, + "step": 4328 + }, + { + "epoch": 0.3955230698949292, + "grad_norm": 25.5, + "kl": 0.0, + "learning_rate": 6.629615516513597e-06, + "logits/chosen": 440639692.8, + "logits/rejected": 633060309.3333334, + "logps/chosen": -332.076171875, + "logps/rejected": -948.95947265625, + "loss": 0.074, + "rewards/chosen": 2.6557804107666017, + "rewards/margins": 15.048620478312174, + "rewards/rejected": -12.392840067545572, + "step": 4329 + }, + { + "epoch": 0.39561443581544087, + "grad_norm": 2.4375, + "kl": 0.0, + "learning_rate": 6.628256158530357e-06, + "logits/chosen": 416229580.8, + "logits/rejected": 472267520.0, + "logps/chosen": -259.5396728515625, + "logps/rejected": -535.154052734375, + "loss": 0.0135, + "rewards/chosen": 4.631600189208984, + "rewards/margins": 13.423046239217122, + "rewards/rejected": -8.791446050008139, + "step": 4330 + }, + { + "epoch": 0.3957058017359525, + "grad_norm": 2.546875, + "kl": 0.0, + "learning_rate": 6.626896665905422e-06, + "logits/chosen": 625143488.0, + "logits/rejected": 437817344.0, + "logps/chosen": -321.311279296875, + "logps/rejected": -375.8924560546875, + "loss": 0.0193, + "rewards/chosen": 3.5227794647216797, + "rewards/margins": 10.951282501220703, + "rewards/rejected": -7.428503036499023, + "step": 4331 + }, + { + "epoch": 0.39579716765646417, + "grad_norm": 6.4375, + "kl": 0.0, + "learning_rate": 6.625537038751204e-06, + "logits/chosen": 538213120.0, + "logits/rejected": 513744128.0, + "logps/chosen": -339.3900669642857, + "logps/rejected": -580.6787719726562, + "loss": 0.0469, + "rewards/chosen": 3.086426326206752, + "rewards/margins": 11.336313792637416, + "rewards/rejected": -8.249887466430664, + "step": 4332 + }, + { + "epoch": 0.3958885335769758, + "grad_norm": 1.7734375, + "kl": 0.0, + "learning_rate": 6.624177277180135e-06, + "logits/chosen": 700148531.2, + "logits/rejected": 551609770.6666666, + "logps/chosen": -285.43330078125, + "logps/rejected": -324.30275472005206, + "loss": 0.0115, + "rewards/chosen": 4.48409423828125, + "rewards/margins": 11.092761611938476, + "rewards/rejected": -6.608667373657227, + "step": 4333 + }, + { + "epoch": 0.39597989949748746, + "grad_norm": 3.109375, + "kl": 0.0, + "learning_rate": 6.622817381304654e-06, + "logits/chosen": 1040089088.0, + "logits/rejected": 761245824.0, + "logps/chosen": -412.6556803385417, + "logps/rejected": -789.9685668945312, + "loss": 0.0152, + "rewards/chosen": 4.62466557820638, + "rewards/margins": 18.19864304860433, + "rewards/rejected": -13.57397747039795, + "step": 4334 + }, + { + "epoch": 0.3960712654179991, + "grad_norm": 3.296875, + "kl": 0.0, + "learning_rate": 6.621457351237211e-06, + "logits/chosen": 640455014.4, + "logits/rejected": 797050624.0, + "logps/chosen": -300.9551025390625, + "logps/rejected": -490.0662027994792, + "loss": 0.0201, + "rewards/chosen": 3.5987205505371094, + "rewards/margins": 11.902936299641928, + "rewards/rejected": -8.304215749104818, + "step": 4335 + }, + { + "epoch": 0.39616263133851076, + "grad_norm": 2.328125, + "kl": 0.0, + "learning_rate": 6.620097187090268e-06, + "logits/chosen": 446977331.2, + "logits/rejected": 371356501.3333333, + "logps/chosen": -251.451220703125, + "logps/rejected": -763.7522786458334, + "loss": 0.0132, + "rewards/chosen": 4.513470077514649, + "rewards/margins": 14.478023910522461, + "rewards/rejected": -9.964553833007812, + "step": 4336 + }, + { + "epoch": 0.3962539972590224, + "grad_norm": 8.125, + "kl": 0.0, + "learning_rate": 6.618736888976296e-06, + "logits/chosen": 210861920.0, + "logits/rejected": 532886496.0, + "logps/chosen": -182.5554962158203, + "logps/rejected": -477.28472900390625, + "loss": 0.0102, + "rewards/chosen": 5.488494396209717, + "rewards/margins": 12.260512351989746, + "rewards/rejected": -6.772017955780029, + "step": 4337 + }, + { + "epoch": 0.39634536317953406, + "grad_norm": 3.0, + "kl": 0.0, + "learning_rate": 6.617376457007785e-06, + "logits/chosen": 517662634.6666667, + "logits/rejected": 409118412.8, + "logps/chosen": -197.6715087890625, + "logps/rejected": -317.06884765625, + "loss": 0.0177, + "rewards/chosen": 3.479497273763021, + "rewards/margins": 10.477345784505209, + "rewards/rejected": -6.9978485107421875, + "step": 4338 + }, + { + "epoch": 0.3964367291000457, + "grad_norm": 2.140625, + "kl": 0.0, + "learning_rate": 6.616015891297224e-06, + "logits/chosen": 618769536.0, + "logits/rejected": 632883968.0, + "logps/chosen": -420.23577880859375, + "logps/rejected": -831.8905639648438, + "loss": 0.0133, + "rewards/chosen": 3.8953301906585693, + "rewards/margins": 13.14006781578064, + "rewards/rejected": -9.24473762512207, + "step": 4339 + }, + { + "epoch": 0.39652809502055736, + "grad_norm": 2.46875, + "kl": 0.0, + "learning_rate": 6.614655191957122e-06, + "logits/chosen": 665814912.0, + "logits/rejected": 496200160.0, + "logps/chosen": -318.12884521484375, + "logps/rejected": -403.8653869628906, + "loss": 0.0099, + "rewards/chosen": 4.643791198730469, + "rewards/margins": 12.583093166351318, + "rewards/rejected": -7.93930196762085, + "step": 4340 + }, + { + "epoch": 0.396619460941069, + "grad_norm": 2.671875, + "kl": 0.0, + "learning_rate": 6.613294359099995e-06, + "logits/chosen": 378697258.6666667, + "logits/rejected": 522479411.2, + "logps/chosen": -253.2997029622396, + "logps/rejected": -334.84248046875, + "loss": 0.014, + "rewards/chosen": 4.090354283650716, + "rewards/margins": 11.243584569295248, + "rewards/rejected": -7.153230285644531, + "step": 4341 + }, + { + "epoch": 0.39671082686158066, + "grad_norm": 2.578125, + "kl": 0.0, + "learning_rate": 6.611933392838373e-06, + "logits/chosen": 387516672.0, + "logits/rejected": 475192192.0, + "logps/chosen": -236.34183756510416, + "logps/rejected": -414.6155700683594, + "loss": 0.0265, + "rewards/chosen": 3.4388370513916016, + "rewards/margins": 11.2553391456604, + "rewards/rejected": -7.816502094268799, + "step": 4342 + }, + { + "epoch": 0.3968021927820923, + "grad_norm": 1.609375, + "kl": 0.0, + "learning_rate": 6.610572293284794e-06, + "logits/chosen": 574609510.4, + "logits/rejected": 810570240.0, + "logps/chosen": -451.946875, + "logps/rejected": -753.8002115885416, + "loss": 0.0081, + "rewards/chosen": 4.887380599975586, + "rewards/margins": 13.939617156982422, + "rewards/rejected": -9.052236557006836, + "step": 4343 + }, + { + "epoch": 0.39689355870260395, + "grad_norm": 2.359375, + "kl": 0.0, + "learning_rate": 6.609211060551809e-06, + "logits/chosen": 910951040.0, + "logits/rejected": 590753152.0, + "logps/chosen": -188.04983520507812, + "logps/rejected": -564.1046142578125, + "loss": 0.0198, + "rewards/chosen": 3.247932195663452, + "rewards/margins": 12.148031949996948, + "rewards/rejected": -8.900099754333496, + "step": 4344 + }, + { + "epoch": 0.3969849246231156, + "grad_norm": 1.9453125, + "kl": 0.0, + "learning_rate": 6.607849694751978e-06, + "logits/chosen": 785427328.0, + "logits/rejected": 785861504.0, + "logps/chosen": -344.46331787109375, + "logps/rejected": -759.91943359375, + "loss": 0.0126, + "rewards/chosen": 3.7126545906066895, + "rewards/margins": 16.10058832168579, + "rewards/rejected": -12.387933731079102, + "step": 4345 + }, + { + "epoch": 0.39707629054362725, + "grad_norm": 3.234375, + "kl": 0.0, + "learning_rate": 6.606488195997876e-06, + "logits/chosen": 491112448.0, + "logits/rejected": 409164352.0, + "logps/chosen": -315.7589111328125, + "logps/rejected": -554.99169921875, + "loss": 0.0211, + "rewards/chosen": 4.301278432210286, + "rewards/margins": 15.809762318929035, + "rewards/rejected": -11.50848388671875, + "step": 4346 + }, + { + "epoch": 0.3971676564641389, + "grad_norm": 3.703125, + "kl": 0.0, + "learning_rate": 6.605126564402082e-06, + "logits/chosen": 451968204.8, + "logits/rejected": 233303424.0, + "logps/chosen": -273.9863037109375, + "logps/rejected": -243.0531005859375, + "loss": 0.0399, + "rewards/chosen": 3.0222551345825197, + "rewards/margins": 11.488249015808105, + "rewards/rejected": -8.465993881225586, + "step": 4347 + }, + { + "epoch": 0.39725902238465055, + "grad_norm": 5.90625, + "kl": 0.0, + "learning_rate": 6.603764800077195e-06, + "logits/chosen": 490034474.6666667, + "logits/rejected": 629010124.8, + "logps/chosen": -422.8594563802083, + "logps/rejected": -625.06474609375, + "loss": 0.0366, + "rewards/chosen": 3.213102340698242, + "rewards/margins": 11.449888992309571, + "rewards/rejected": -8.236786651611329, + "step": 4348 + }, + { + "epoch": 0.39735038830516217, + "grad_norm": 3.296875, + "kl": 0.0, + "learning_rate": 6.602402903135817e-06, + "logits/chosen": 426603872.0, + "logits/rejected": 363309952.0, + "logps/chosen": -371.9720153808594, + "logps/rejected": -391.42510986328125, + "loss": 0.0189, + "rewards/chosen": 3.6677403450012207, + "rewards/margins": 12.994947910308838, + "rewards/rejected": -9.327207565307617, + "step": 4349 + }, + { + "epoch": 0.39744175422567385, + "grad_norm": 2.4375, + "kl": 0.0, + "learning_rate": 6.601040873690568e-06, + "logits/chosen": 503272996.5714286, + "logits/rejected": 726620352.0, + "logps/chosen": -355.82101004464283, + "logps/rejected": -510.6225891113281, + "loss": 0.0222, + "rewards/chosen": 3.9978697640555247, + "rewards/margins": 14.287972722734724, + "rewards/rejected": -10.2901029586792, + "step": 4350 + }, + { + "epoch": 0.39753312014618547, + "grad_norm": 0.6328125, + "kl": 0.0, + "learning_rate": 6.599678711854071e-06, + "logits/chosen": 209943125.33333334, + "logits/rejected": 352196096.0, + "logps/chosen": -292.4035237630208, + "logps/rejected": -589.85283203125, + "loss": 0.0215, + "rewards/chosen": 3.882382392883301, + "rewards/margins": 13.876329231262208, + "rewards/rejected": -9.993946838378907, + "step": 4351 + }, + { + "epoch": 0.39762448606669715, + "grad_norm": 2.8125, + "kl": 0.0, + "learning_rate": 6.598316417738965e-06, + "logits/chosen": 1013352512.0, + "logits/rejected": 770188288.0, + "logps/chosen": -276.9830322265625, + "logps/rejected": -777.0496826171875, + "loss": 0.0163, + "rewards/chosen": 3.823383331298828, + "rewards/margins": 11.169564723968506, + "rewards/rejected": -7.346181392669678, + "step": 4352 + }, + { + "epoch": 0.39771585198720877, + "grad_norm": 8.25, + "kl": 0.0, + "learning_rate": 6.5969539914579e-06, + "logits/chosen": 516019456.0, + "logits/rejected": 530632806.4, + "logps/chosen": -336.41233317057294, + "logps/rejected": -585.985546875, + "loss": 0.0414, + "rewards/chosen": 4.22718620300293, + "rewards/margins": 11.726867294311523, + "rewards/rejected": -7.499681091308593, + "step": 4353 + }, + { + "epoch": 0.39780721790772045, + "grad_norm": 20.25, + "kl": 0.0, + "learning_rate": 6.595591433123538e-06, + "logits/chosen": 484008243.2, + "logits/rejected": 457478954.6666667, + "logps/chosen": -382.903125, + "logps/rejected": -363.1507161458333, + "loss": 0.0435, + "rewards/chosen": 3.072879409790039, + "rewards/margins": 9.473603185017904, + "rewards/rejected": -6.400723775227864, + "step": 4354 + }, + { + "epoch": 0.39789858382823207, + "grad_norm": 3.734375, + "kl": 0.0, + "learning_rate": 6.594228742848546e-06, + "logits/chosen": 533564211.2, + "logits/rejected": 452742656.0, + "logps/chosen": -171.92822265625, + "logps/rejected": -469.9211832682292, + "loss": 0.1446, + "rewards/chosen": 1.9049137115478516, + "rewards/margins": 11.09524294535319, + "rewards/rejected": -9.190329233805338, + "step": 4355 + }, + { + "epoch": 0.39798994974874374, + "grad_norm": 2.078125, + "kl": 0.0, + "learning_rate": 6.592865920745609e-06, + "logits/chosen": 453411136.0, + "logits/rejected": 440594688.0, + "logps/chosen": -302.7352294921875, + "logps/rejected": -453.48150634765625, + "loss": 0.0141, + "rewards/chosen": 4.312169075012207, + "rewards/margins": 13.093132972717285, + "rewards/rejected": -8.780963897705078, + "step": 4356 + }, + { + "epoch": 0.39808131566925536, + "grad_norm": 8.625, + "kl": 0.0, + "learning_rate": 6.591502966927419e-06, + "logits/chosen": 594749610.6666666, + "logits/rejected": 543148339.2, + "logps/chosen": -333.6369222005208, + "logps/rejected": -424.7279296875, + "loss": 0.0386, + "rewards/chosen": 3.905879338582357, + "rewards/margins": 10.087297185262045, + "rewards/rejected": -6.181417846679688, + "step": 4357 + }, + { + "epoch": 0.39817268158976704, + "grad_norm": 2.359375, + "kl": 0.0, + "learning_rate": 6.590139881506679e-06, + "logits/chosen": 707791513.6, + "logits/rejected": 571974400.0, + "logps/chosen": -371.5548828125, + "logps/rejected": -499.7289632161458, + "loss": 0.0165, + "rewards/chosen": 3.8358169555664063, + "rewards/margins": 13.204621887207031, + "rewards/rejected": -9.368804931640625, + "step": 4358 + }, + { + "epoch": 0.39826404751027866, + "grad_norm": 4.71875, + "kl": 0.0, + "learning_rate": 6.588776664596102e-06, + "logits/chosen": 409299882.6666667, + "logits/rejected": 543267225.6, + "logps/chosen": -203.64139811197916, + "logps/rejected": -367.195166015625, + "loss": 0.0183, + "rewards/chosen": 4.052190780639648, + "rewards/margins": 12.145031356811524, + "rewards/rejected": -8.092840576171875, + "step": 4359 + }, + { + "epoch": 0.39835541343079034, + "grad_norm": 6.40625, + "kl": 0.0, + "learning_rate": 6.587413316308417e-06, + "logits/chosen": 823870720.0, + "logits/rejected": 896105386.6666666, + "logps/chosen": -296.8605224609375, + "logps/rejected": -581.1520182291666, + "loss": 0.025, + "rewards/chosen": 3.569710922241211, + "rewards/margins": 14.058395258585612, + "rewards/rejected": -10.4886843363444, + "step": 4360 + }, + { + "epoch": 0.39844677935130196, + "grad_norm": 17.75, + "kl": 0.0, + "learning_rate": 6.586049836756357e-06, + "logits/chosen": 529795413.3333333, + "logits/rejected": 397238220.8, + "logps/chosen": -335.9657796223958, + "logps/rejected": -464.07236328125, + "loss": 0.0623, + "rewards/chosen": 4.1000722249348955, + "rewards/margins": 11.569983418782552, + "rewards/rejected": -7.469911193847656, + "step": 4361 + }, + { + "epoch": 0.39853814527181364, + "grad_norm": 1.1484375, + "kl": 0.0, + "learning_rate": 6.5846862260526715e-06, + "logits/chosen": 661342080.0, + "logits/rejected": 358016768.0, + "logps/chosen": -428.62908935546875, + "logps/rejected": -436.804931640625, + "loss": 0.0057, + "rewards/chosen": 3.9242324829101562, + "rewards/margins": 13.411605834960938, + "rewards/rejected": -9.487373352050781, + "step": 4362 + }, + { + "epoch": 0.39862951119232526, + "grad_norm": 6.25, + "kl": 0.0, + "learning_rate": 6.583322484310117e-06, + "logits/chosen": 515413440.0, + "logits/rejected": 403956480.0, + "logps/chosen": -310.7935791015625, + "logps/rejected": -463.6884460449219, + "loss": 0.0244, + "rewards/chosen": 3.4430322647094727, + "rewards/margins": 11.065501689910889, + "rewards/rejected": -7.622469425201416, + "step": 4363 + }, + { + "epoch": 0.39872087711283694, + "grad_norm": 2.890625, + "kl": 0.0, + "learning_rate": 6.581958611641464e-06, + "logits/chosen": 676427008.0, + "logits/rejected": 605293363.2, + "logps/chosen": -481.0278727213542, + "logps/rejected": -607.447998046875, + "loss": 0.0137, + "rewards/chosen": 3.597819964090983, + "rewards/margins": 13.84454085032145, + "rewards/rejected": -10.246720886230468, + "step": 4364 + }, + { + "epoch": 0.39881224303334856, + "grad_norm": 2.234375, + "kl": 0.0, + "learning_rate": 6.5805946081594904e-06, + "logits/chosen": 520696960.0, + "logits/rejected": 543690069.3333334, + "logps/chosen": -271.9595031738281, + "logps/rejected": -357.11962890625, + "loss": 0.0108, + "rewards/chosen": 3.5272674560546875, + "rewards/margins": 12.132726033528646, + "rewards/rejected": -8.605458577473959, + "step": 4365 + }, + { + "epoch": 0.39890360895386023, + "grad_norm": 3.78125, + "kl": 0.0, + "learning_rate": 6.579230473976988e-06, + "logits/chosen": 699936358.4, + "logits/rejected": 377552554.6666667, + "logps/chosen": -308.075830078125, + "logps/rejected": -421.3959147135417, + "loss": 0.0204, + "rewards/chosen": 3.6879859924316407, + "rewards/margins": 14.0156862894694, + "rewards/rejected": -10.32770029703776, + "step": 4366 + }, + { + "epoch": 0.39899497487437185, + "grad_norm": 1.640625, + "kl": 0.0, + "learning_rate": 6.577866209206757e-06, + "logits/chosen": 952189184.0, + "logits/rejected": 496320554.6666667, + "logps/chosen": -324.576171875, + "logps/rejected": -513.7481282552084, + "loss": 0.0071, + "rewards/chosen": 3.7277679443359375, + "rewards/margins": 12.097223917643229, + "rewards/rejected": -8.369455973307291, + "step": 4367 + }, + { + "epoch": 0.39908634079488353, + "grad_norm": 1.609375, + "kl": 0.0, + "learning_rate": 6.576501813961609e-06, + "logits/chosen": 896996672.0, + "logits/rejected": 481737045.3333333, + "logps/chosen": -594.00146484375, + "logps/rejected": -371.4449869791667, + "loss": 0.0056, + "rewards/chosen": 4.174838542938232, + "rewards/margins": 12.871744314829508, + "rewards/rejected": -8.696905771891275, + "step": 4368 + }, + { + "epoch": 0.39917770671539515, + "grad_norm": 2.921875, + "kl": 0.0, + "learning_rate": 6.575137288354368e-06, + "logits/chosen": 760908851.2, + "logits/rejected": 474928554.6666667, + "logps/chosen": -430.331689453125, + "logps/rejected": -533.0735677083334, + "loss": 0.0219, + "rewards/chosen": 3.384398651123047, + "rewards/margins": 13.379725646972656, + "rewards/rejected": -9.99532699584961, + "step": 4369 + }, + { + "epoch": 0.39926907263590683, + "grad_norm": 4.25, + "kl": 0.0, + "learning_rate": 6.573772632497866e-06, + "logits/chosen": 583346995.2, + "logits/rejected": 207291541.33333334, + "logps/chosen": -455.18212890625, + "logps/rejected": -289.43983968098956, + "loss": 0.0305, + "rewards/chosen": 3.4316104888916015, + "rewards/margins": 12.66975695292155, + "rewards/rejected": -9.238146464029947, + "step": 4370 + }, + { + "epoch": 0.39936043855641845, + "grad_norm": 3.640625, + "kl": 0.0, + "learning_rate": 6.57240784650495e-06, + "logits/chosen": 354664960.0, + "logits/rejected": 536206336.0, + "logps/chosen": -368.6024475097656, + "logps/rejected": -529.1689860026041, + "loss": 0.0149, + "rewards/chosen": 3.160630941390991, + "rewards/margins": 11.49853746096293, + "rewards/rejected": -8.33790651957194, + "step": 4371 + }, + { + "epoch": 0.39945180447693013, + "grad_norm": 7.5, + "kl": 0.0, + "learning_rate": 6.571042930488474e-06, + "logits/chosen": 513329194.6666667, + "logits/rejected": 763845760.0, + "logps/chosen": -341.9046223958333, + "logps/rejected": -745.1132202148438, + "loss": 0.0564, + "rewards/chosen": 2.6206235885620117, + "rewards/margins": 10.526299953460693, + "rewards/rejected": -7.905676364898682, + "step": 4372 + }, + { + "epoch": 0.39954317039744175, + "grad_norm": 5.21875, + "kl": 0.0, + "learning_rate": 6.569677884561303e-06, + "logits/chosen": 278831168.0, + "logits/rejected": 560042965.3333334, + "logps/chosen": -483.0753479003906, + "logps/rejected": -308.4444986979167, + "loss": 0.0162, + "rewards/chosen": 3.0680344104766846, + "rewards/margins": 11.083902915318808, + "rewards/rejected": -8.015868504842123, + "step": 4373 + }, + { + "epoch": 0.3996345363179534, + "grad_norm": 16.125, + "kl": 0.0, + "learning_rate": 6.568312708836314e-06, + "logits/chosen": 594016768.0, + "logits/rejected": 450855232.0, + "logps/chosen": -387.74481201171875, + "logps/rejected": -323.2188415527344, + "loss": 0.1153, + "rewards/chosen": 3.495645046234131, + "rewards/margins": 10.201093196868896, + "rewards/rejected": -6.705448150634766, + "step": 4374 + }, + { + "epoch": 0.39972590223846505, + "grad_norm": 3.3125, + "kl": 0.0, + "learning_rate": 6.5669474034263946e-06, + "logits/chosen": 607314739.2, + "logits/rejected": 532674133.3333333, + "logps/chosen": -180.5157470703125, + "logps/rejected": -463.2747395833333, + "loss": 0.0354, + "rewards/chosen": 2.9833969116210937, + "rewards/margins": 10.153421147664387, + "rewards/rejected": -7.170024236043294, + "step": 4375 + }, + { + "epoch": 0.3998172681589767, + "grad_norm": 51.0, + "kl": 0.0, + "learning_rate": 6.565581968444444e-06, + "logits/chosen": 934508160.0, + "logits/rejected": 525661056.0, + "logps/chosen": -351.18133544921875, + "logps/rejected": -577.4327392578125, + "loss": 0.0456, + "rewards/chosen": 2.222564697265625, + "rewards/margins": 11.969083786010742, + "rewards/rejected": -9.746519088745117, + "step": 4376 + }, + { + "epoch": 0.39990863407948835, + "grad_norm": 4.9375, + "kl": 0.0, + "learning_rate": 6.56421640400337e-06, + "logits/chosen": 872545152.0, + "logits/rejected": 560175744.0, + "logps/chosen": -561.9747314453125, + "logps/rejected": -448.12542724609375, + "loss": 0.0263, + "rewards/chosen": 3.68691349029541, + "rewards/margins": 11.972976684570312, + "rewards/rejected": -8.286063194274902, + "step": 4377 + }, + { + "epoch": 0.4, + "grad_norm": 3.71875, + "kl": 0.0, + "learning_rate": 6.562850710216091e-06, + "logits/chosen": 918657536.0, + "logits/rejected": 2073595008.0, + "logps/chosen": -379.9868977864583, + "logps/rejected": -759.4581909179688, + "loss": 0.0266, + "rewards/chosen": 3.6155223846435547, + "rewards/margins": 15.55347728729248, + "rewards/rejected": -11.937954902648926, + "step": 4378 + }, + { + "epoch": 0.40009136592051164, + "grad_norm": 2.75, + "kl": 0.0, + "learning_rate": 6.561484887195541e-06, + "logits/chosen": 463156138.6666667, + "logits/rejected": 778475200.0, + "logps/chosen": -312.387451171875, + "logps/rejected": -724.6710815429688, + "loss": 0.0186, + "rewards/chosen": 3.9826412200927734, + "rewards/margins": 11.996294975280762, + "rewards/rejected": -8.013653755187988, + "step": 4379 + }, + { + "epoch": 0.4001827318410233, + "grad_norm": 3.390625, + "kl": 0.0, + "learning_rate": 6.5601189350546555e-06, + "logits/chosen": 595402956.8, + "logits/rejected": 631083008.0, + "logps/chosen": -471.30166015625, + "logps/rejected": -622.3444010416666, + "loss": 0.0403, + "rewards/chosen": 3.4471580505371096, + "rewards/margins": 13.12638142903646, + "rewards/rejected": -9.67922337849935, + "step": 4380 + }, + { + "epoch": 0.40027409776153494, + "grad_norm": 3.421875, + "kl": 0.0, + "learning_rate": 6.558752853906389e-06, + "logits/chosen": 589659238.4, + "logits/rejected": 651955072.0, + "logps/chosen": -275.1173583984375, + "logps/rejected": -380.5076497395833, + "loss": 0.0214, + "rewards/chosen": 3.827379608154297, + "rewards/margins": 11.16032002766927, + "rewards/rejected": -7.332940419514974, + "step": 4381 + }, + { + "epoch": 0.4003654636820466, + "grad_norm": 3.328125, + "kl": 0.0, + "learning_rate": 6.557386643863706e-06, + "logits/chosen": 589104844.8, + "logits/rejected": 255325610.66666666, + "logps/chosen": -390.98310546875, + "logps/rejected": -450.4736735026042, + "loss": 0.1396, + "rewards/chosen": 1.9342718124389648, + "rewards/margins": 12.757857322692871, + "rewards/rejected": -10.823585510253906, + "step": 4382 + }, + { + "epoch": 0.40045682960255824, + "grad_norm": 3.796875, + "kl": 0.0, + "learning_rate": 6.556020305039574e-06, + "logits/chosen": 657523328.0, + "logits/rejected": 518206272.0, + "logps/chosen": -400.4474283854167, + "logps/rejected": -228.0069580078125, + "loss": 0.0258, + "rewards/chosen": 3.597379684448242, + "rewards/margins": 9.04440689086914, + "rewards/rejected": -5.447027206420898, + "step": 4383 + }, + { + "epoch": 0.4005481955230699, + "grad_norm": 4.3125, + "kl": 0.0, + "learning_rate": 6.554653837546983e-06, + "logits/chosen": 719050547.2, + "logits/rejected": 473839445.3333333, + "logps/chosen": -388.6937744140625, + "logps/rejected": -455.5550537109375, + "loss": 0.0334, + "rewards/chosen": 3.3964996337890625, + "rewards/margins": 12.515426635742188, + "rewards/rejected": -9.118927001953125, + "step": 4384 + }, + { + "epoch": 0.40063956144358154, + "grad_norm": 1.78125, + "kl": 0.0, + "learning_rate": 6.553287241498921e-06, + "logits/chosen": 723187200.0, + "logits/rejected": 374013269.3333333, + "logps/chosen": -352.01396484375, + "logps/rejected": -236.27115885416666, + "loss": 0.0126, + "rewards/chosen": 4.171974182128906, + "rewards/margins": 11.983287175496418, + "rewards/rejected": -7.811312993367513, + "step": 4385 + }, + { + "epoch": 0.4007309273640932, + "grad_norm": 14.25, + "kl": 14.084522247314453, + "learning_rate": 6.551920517008396e-06, + "logits/chosen": 641839030.8571428, + "logits/rejected": 209421792.0, + "logps/chosen": -436.61202566964283, + "logps/rejected": -273.7412109375, + "loss": 0.138, + "rewards/chosen": 3.300097329275949, + "rewards/margins": 11.037438733237131, + "rewards/rejected": -7.737341403961182, + "step": 4386 + }, + { + "epoch": 0.40082229328460484, + "grad_norm": 2.65625, + "kl": 0.0, + "learning_rate": 6.550553664188426e-06, + "logits/chosen": 513740160.0, + "logits/rejected": 411254464.0, + "logps/chosen": -260.81634521484375, + "logps/rejected": -642.1143188476562, + "loss": 0.0192, + "rewards/chosen": 3.3371334075927734, + "rewards/margins": 13.085819244384766, + "rewards/rejected": -9.748685836791992, + "step": 4387 + }, + { + "epoch": 0.4009136592051165, + "grad_norm": 4.46875, + "kl": 0.0, + "learning_rate": 6.549186683152031e-06, + "logits/chosen": 720434380.8, + "logits/rejected": 627242496.0, + "logps/chosen": -348.21298828125, + "logps/rejected": -467.0375569661458, + "loss": 0.0305, + "rewards/chosen": 3.207170104980469, + "rewards/margins": 13.00170669555664, + "rewards/rejected": -9.794536590576172, + "step": 4388 + }, + { + "epoch": 0.40100502512562813, + "grad_norm": 2.046875, + "kl": 0.0, + "learning_rate": 6.547819574012252e-06, + "logits/chosen": 838573909.3333334, + "logits/rejected": 594341171.2, + "logps/chosen": -404.4192708333333, + "logps/rejected": -612.0232421875, + "loss": 0.0112, + "rewards/chosen": 3.6000744501749673, + "rewards/margins": 12.396612993876138, + "rewards/rejected": -8.796538543701171, + "step": 4389 + }, + { + "epoch": 0.4010963910461398, + "grad_norm": 1.6015625, + "kl": 0.0, + "learning_rate": 6.546452336882134e-06, + "logits/chosen": 366399658.6666667, + "logits/rejected": 362772928.0, + "logps/chosen": -241.95674641927084, + "logps/rejected": -323.30462646484375, + "loss": 0.0136, + "rewards/chosen": 4.548195838928223, + "rewards/margins": 11.603933334350586, + "rewards/rejected": -7.055737495422363, + "step": 4390 + }, + { + "epoch": 0.40118775696665143, + "grad_norm": 0.60546875, + "kl": 0.0, + "learning_rate": 6.545084971874738e-06, + "logits/chosen": 1095973120.0, + "logits/rejected": 571702528.0, + "logps/chosen": -252.9530792236328, + "logps/rejected": -614.2862141927084, + "loss": 0.0036, + "rewards/chosen": 4.403883457183838, + "rewards/margins": 15.315303643544516, + "rewards/rejected": -10.911420186360678, + "step": 4391 + }, + { + "epoch": 0.4012791228871631, + "grad_norm": 0.87890625, + "kl": 0.0, + "learning_rate": 6.543717479103129e-06, + "logits/chosen": 440240725.3333333, + "logits/rejected": 800308531.2, + "logps/chosen": -223.84493001302084, + "logps/rejected": -506.86259765625, + "loss": 0.0048, + "rewards/chosen": 4.629942576090495, + "rewards/margins": 13.299786631266276, + "rewards/rejected": -8.669844055175782, + "step": 4392 + }, + { + "epoch": 0.40137048880767473, + "grad_norm": 1.296875, + "kl": 0.0, + "learning_rate": 6.542349858680388e-06, + "logits/chosen": 192267456.0, + "logits/rejected": 500331593.14285713, + "logps/chosen": -78.8961410522461, + "logps/rejected": -576.0086146763393, + "loss": 0.0079, + "rewards/chosen": 2.7231407165527344, + "rewards/margins": 13.401824406215123, + "rewards/rejected": -10.678683689662389, + "step": 4393 + }, + { + "epoch": 0.4014618547281864, + "grad_norm": 1.84375, + "kl": 0.0, + "learning_rate": 6.540982110719604e-06, + "logits/chosen": 279340416.0, + "logits/rejected": 314272384.0, + "logps/chosen": -350.4701741536458, + "logps/rejected": -409.318896484375, + "loss": 0.0095, + "rewards/chosen": 4.1478697458903, + "rewards/margins": 13.680619684855145, + "rewards/rejected": -9.532749938964844, + "step": 4394 + }, + { + "epoch": 0.40155322064869803, + "grad_norm": 3.8125, + "kl": 0.0, + "learning_rate": 6.539614235333877e-06, + "logits/chosen": 1145147733.3333333, + "logits/rejected": 541083596.8, + "logps/chosen": -245.52461751302084, + "logps/rejected": -452.2912109375, + "loss": 0.0192, + "rewards/chosen": 3.5723307927449546, + "rewards/margins": 12.020473798116049, + "rewards/rejected": -8.448143005371094, + "step": 4395 + }, + { + "epoch": 0.4016445865692097, + "grad_norm": 1.3359375, + "kl": 0.0, + "learning_rate": 6.538246232636316e-06, + "logits/chosen": 427660992.0, + "logits/rejected": 474996224.0, + "logps/chosen": -253.48562622070312, + "logps/rejected": -455.45758056640625, + "loss": 0.0083, + "rewards/chosen": 4.619162559509277, + "rewards/margins": 14.358979225158691, + "rewards/rejected": -9.739816665649414, + "step": 4396 + }, + { + "epoch": 0.4017359524897213, + "grad_norm": 2.828125, + "kl": 0.0, + "learning_rate": 6.5368781027400455e-06, + "logits/chosen": 638553036.8, + "logits/rejected": 483077290.6666667, + "logps/chosen": -242.8537353515625, + "logps/rejected": -445.912353515625, + "loss": 0.0203, + "rewards/chosen": 4.270709228515625, + "rewards/margins": 11.830454381306966, + "rewards/rejected": -7.559745152791341, + "step": 4397 + }, + { + "epoch": 0.401827318410233, + "grad_norm": 0.50390625, + "kl": 0.0, + "learning_rate": 6.535509845758193e-06, + "logits/chosen": 936861184.0, + "logits/rejected": 389043456.0, + "logps/chosen": -388.776611328125, + "logps/rejected": -433.059765625, + "loss": 0.0034, + "rewards/chosen": 4.9265391031901045, + "rewards/margins": 14.308892313639323, + "rewards/rejected": -9.382353210449219, + "step": 4398 + }, + { + "epoch": 0.4019186843307446, + "grad_norm": 3.0625, + "kl": 0.0, + "learning_rate": 6.534141461803906e-06, + "logits/chosen": 587571712.0, + "logits/rejected": 750344448.0, + "logps/chosen": -268.2740478515625, + "logps/rejected": -323.0643310546875, + "loss": 0.0173, + "rewards/chosen": 3.753488540649414, + "rewards/margins": 10.959060668945312, + "rewards/rejected": -7.205572128295898, + "step": 4399 + }, + { + "epoch": 0.4020100502512563, + "grad_norm": 2.453125, + "kl": 0.0, + "learning_rate": 6.532772950990331e-06, + "logits/chosen": 629920870.4, + "logits/rejected": 835041024.0, + "logps/chosen": -375.5560546875, + "logps/rejected": -528.7742919921875, + "loss": 0.0151, + "rewards/chosen": 3.951642608642578, + "rewards/margins": 12.209264373779297, + "rewards/rejected": -8.257621765136719, + "step": 4400 + }, + { + "epoch": 0.4021014161717679, + "grad_norm": 3.75, + "kl": 0.0, + "learning_rate": 6.531404313430638e-06, + "logits/chosen": 385245696.0, + "logits/rejected": 748443968.0, + "logps/chosen": -269.31667073567706, + "logps/rejected": -494.39697265625, + "loss": 0.0315, + "rewards/chosen": 3.3997672398885093, + "rewards/margins": 12.132061322530111, + "rewards/rejected": -8.732294082641602, + "step": 4401 + }, + { + "epoch": 0.4021927820922796, + "grad_norm": 1.8046875, + "kl": 0.0, + "learning_rate": 6.5300355492379945e-06, + "logits/chosen": 729509290.6666666, + "logits/rejected": 572081766.4, + "logps/chosen": -570.1771647135416, + "logps/rejected": -578.0201171875, + "loss": 0.0082, + "rewards/chosen": 3.8875722885131836, + "rewards/margins": 14.163844108581543, + "rewards/rejected": -10.27627182006836, + "step": 4402 + }, + { + "epoch": 0.4022841480127912, + "grad_norm": 4.8125, + "kl": 0.0, + "learning_rate": 6.528666658525589e-06, + "logits/chosen": 828728896.0, + "logits/rejected": 621974400.0, + "logps/chosen": -307.62298583984375, + "logps/rejected": -453.044677734375, + "loss": 0.0408, + "rewards/chosen": 3.129573106765747, + "rewards/margins": 12.812360048294067, + "rewards/rejected": -9.68278694152832, + "step": 4403 + }, + { + "epoch": 0.4023755139333029, + "grad_norm": 2.078125, + "kl": 0.0, + "learning_rate": 6.5272976414066135e-06, + "logits/chosen": 611815509.3333334, + "logits/rejected": 707316121.6, + "logps/chosen": -511.799072265625, + "logps/rejected": -705.81552734375, + "loss": 0.0106, + "rewards/chosen": 3.7366453806559243, + "rewards/margins": 15.170278803507486, + "rewards/rejected": -11.433633422851562, + "step": 4404 + }, + { + "epoch": 0.4024668798538145, + "grad_norm": 2.515625, + "kl": 0.0, + "learning_rate": 6.525928497994273e-06, + "logits/chosen": 553582208.0, + "logits/rejected": 499204096.0, + "logps/chosen": -347.9231872558594, + "logps/rejected": -411.16552734375, + "loss": 0.0169, + "rewards/chosen": 3.5764577388763428, + "rewards/margins": 12.012428998947144, + "rewards/rejected": -8.4359712600708, + "step": 4405 + }, + { + "epoch": 0.4025582457743262, + "grad_norm": 4.9375, + "kl": 0.0, + "learning_rate": 6.524559228401784e-06, + "logits/chosen": 411784106.6666667, + "logits/rejected": 725021491.2, + "logps/chosen": -239.8266805013021, + "logps/rejected": -525.772998046875, + "loss": 0.0119, + "rewards/chosen": 4.582172711690267, + "rewards/margins": 12.905647595723469, + "rewards/rejected": -8.323474884033203, + "step": 4406 + }, + { + "epoch": 0.4026496116948378, + "grad_norm": 27.375, + "kl": 0.0, + "learning_rate": 6.523189832742374e-06, + "logits/chosen": 504064608.0, + "logits/rejected": 646236800.0, + "logps/chosen": -164.7547607421875, + "logps/rejected": -429.567138671875, + "loss": 0.1167, + "rewards/chosen": 2.5002918243408203, + "rewards/margins": 10.514778137207031, + "rewards/rejected": -8.014486312866211, + "step": 4407 + }, + { + "epoch": 0.4027409776153495, + "grad_norm": 3.640625, + "kl": 0.0, + "learning_rate": 6.5218203111292755e-06, + "logits/chosen": 835242752.0, + "logits/rejected": 591098572.8, + "logps/chosen": -363.7793782552083, + "logps/rejected": -450.7435546875, + "loss": 0.0151, + "rewards/chosen": 3.6279783248901367, + "rewards/margins": 12.148359489440917, + "rewards/rejected": -8.52038116455078, + "step": 4408 + }, + { + "epoch": 0.4028323435358611, + "grad_norm": 3.515625, + "kl": 0.0, + "learning_rate": 6.5204506636757366e-06, + "logits/chosen": 437514794.6666667, + "logits/rejected": 350111667.2, + "logps/chosen": -503.2257486979167, + "logps/rejected": -532.7578125, + "loss": 0.0205, + "rewards/chosen": 2.9984334309895835, + "rewards/margins": 11.301365407307943, + "rewards/rejected": -8.302931976318359, + "step": 4409 + }, + { + "epoch": 0.4029237094563728, + "grad_norm": 2.921875, + "kl": 0.0, + "learning_rate": 6.519080890495018e-06, + "logits/chosen": 393912448.0, + "logits/rejected": 517504170.6666667, + "logps/chosen": -164.393798828125, + "logps/rejected": -474.4687906901042, + "loss": 0.01, + "rewards/chosen": 3.941471576690674, + "rewards/margins": 13.253504912058512, + "rewards/rejected": -9.312033335367838, + "step": 4410 + }, + { + "epoch": 0.4030150753768844, + "grad_norm": 31.25, + "kl": 0.0, + "learning_rate": 6.517710991700381e-06, + "logits/chosen": 1313084672.0, + "logits/rejected": 569232256.0, + "logps/chosen": -528.2085571289062, + "logps/rejected": -523.0653076171875, + "loss": 0.1125, + "rewards/chosen": 3.1348044872283936, + "rewards/margins": 12.761414289474487, + "rewards/rejected": -9.626609802246094, + "step": 4411 + }, + { + "epoch": 0.4031064412973961, + "grad_norm": 2.546875, + "kl": 0.0, + "learning_rate": 6.5163409674051094e-06, + "logits/chosen": 554587852.8, + "logits/rejected": 724779434.6666666, + "logps/chosen": -316.07041015625, + "logps/rejected": -601.051513671875, + "loss": 0.0101, + "rewards/chosen": 4.614093780517578, + "rewards/margins": 13.744611740112305, + "rewards/rejected": -9.130517959594727, + "step": 4412 + }, + { + "epoch": 0.4031978072179077, + "grad_norm": 1.859375, + "kl": 0.0, + "learning_rate": 6.5149708177224876e-06, + "logits/chosen": 400173824.0, + "logits/rejected": 599678464.0, + "logps/chosen": -373.422216796875, + "logps/rejected": -653.5050455729166, + "loss": 0.0105, + "rewards/chosen": 4.2741943359375, + "rewards/margins": 14.742753601074218, + "rewards/rejected": -10.468559265136719, + "step": 4413 + }, + { + "epoch": 0.4032891731384194, + "grad_norm": 3.046875, + "kl": 0.0, + "learning_rate": 6.513600542765816e-06, + "logits/chosen": 347597312.0, + "logits/rejected": 617431142.4, + "logps/chosen": -352.19580078125, + "logps/rejected": -643.50380859375, + "loss": 0.0177, + "rewards/chosen": 3.208926518758138, + "rewards/margins": 13.188737614949545, + "rewards/rejected": -9.979811096191407, + "step": 4414 + }, + { + "epoch": 0.403380539058931, + "grad_norm": 51.75, + "kl": 0.0, + "learning_rate": 6.512230142648403e-06, + "logits/chosen": 545619456.0, + "logits/rejected": 996995413.3333334, + "logps/chosen": -268.302392578125, + "logps/rejected": -521.9702962239584, + "loss": 0.0568, + "rewards/chosen": 2.782289505004883, + "rewards/margins": 10.53601582845052, + "rewards/rejected": -7.753726323445638, + "step": 4415 + }, + { + "epoch": 0.4034719049794427, + "grad_norm": 3.359375, + "kl": 0.0, + "learning_rate": 6.5108596174835696e-06, + "logits/chosen": 824772800.0, + "logits/rejected": 380937632.0, + "logps/chosen": -504.974609375, + "logps/rejected": -277.2989196777344, + "loss": 0.0151, + "rewards/chosen": 3.782622814178467, + "rewards/margins": 10.922442436218262, + "rewards/rejected": -7.139819622039795, + "step": 4416 + }, + { + "epoch": 0.4035632708999543, + "grad_norm": 2.359375, + "kl": 0.0, + "learning_rate": 6.509488967384643e-06, + "logits/chosen": 892298752.0, + "logits/rejected": 545037875.2, + "logps/chosen": -460.2552490234375, + "logps/rejected": -405.9736083984375, + "loss": 0.0068, + "rewards/chosen": 4.770214080810547, + "rewards/margins": 12.970102691650391, + "rewards/rejected": -8.199888610839844, + "step": 4417 + }, + { + "epoch": 0.403654636820466, + "grad_norm": 4.84375, + "kl": 0.0, + "learning_rate": 6.508118192464966e-06, + "logits/chosen": 541311283.2, + "logits/rejected": 728894976.0, + "logps/chosen": -375.00634765625, + "logps/rejected": -294.35280354817706, + "loss": 0.0327, + "rewards/chosen": 3.1015556335449217, + "rewards/margins": 10.476418940226237, + "rewards/rejected": -7.374863306681315, + "step": 4418 + }, + { + "epoch": 0.4037460027409776, + "grad_norm": 4.03125, + "kl": 0.0, + "learning_rate": 6.5067472928378875e-06, + "logits/chosen": 432954976.0, + "logits/rejected": 491099072.0, + "logps/chosen": -265.3220520019531, + "logps/rejected": -684.4494018554688, + "loss": 0.0224, + "rewards/chosen": 3.6405467987060547, + "rewards/margins": 13.611587524414062, + "rewards/rejected": -9.971040725708008, + "step": 4419 + }, + { + "epoch": 0.4038373686614893, + "grad_norm": 1.984375, + "kl": 0.0, + "learning_rate": 6.5053762686167665e-06, + "logits/chosen": 438652364.8, + "logits/rejected": 533830400.0, + "logps/chosen": -310.5828125, + "logps/rejected": -387.3556315104167, + "loss": 0.0106, + "rewards/chosen": 4.451026916503906, + "rewards/margins": 12.315767923990887, + "rewards/rejected": -7.8647410074869795, + "step": 4420 + }, + { + "epoch": 0.4039287345820009, + "grad_norm": 63.5, + "kl": 0.0, + "learning_rate": 6.5040051199149755e-06, + "logits/chosen": 849800704.0, + "logits/rejected": 560232832.0, + "logps/chosen": -291.4453938802083, + "logps/rejected": -437.6746826171875, + "loss": 0.0776, + "rewards/chosen": 3.314170519510905, + "rewards/margins": 12.398330370585123, + "rewards/rejected": -9.084159851074219, + "step": 4421 + }, + { + "epoch": 0.4040201005025126, + "grad_norm": 0.99609375, + "kl": 0.0, + "learning_rate": 6.502633846845898e-06, + "logits/chosen": 825431756.8, + "logits/rejected": 349741226.6666667, + "logps/chosen": -379.3716064453125, + "logps/rejected": -422.9972330729167, + "loss": 0.0059, + "rewards/chosen": 4.791484069824219, + "rewards/margins": 14.764812978108726, + "rewards/rejected": -9.973328908284506, + "step": 4422 + }, + { + "epoch": 0.4041114664230242, + "grad_norm": 2.03125, + "kl": 0.0, + "learning_rate": 6.501262449522923e-06, + "logits/chosen": 385695573.3333333, + "logits/rejected": 447288883.2, + "logps/chosen": -235.55863444010416, + "logps/rejected": -369.523681640625, + "loss": 0.0136, + "rewards/chosen": 3.67122491200765, + "rewards/margins": 12.204663022359213, + "rewards/rejected": -8.533438110351563, + "step": 4423 + }, + { + "epoch": 0.4042028323435359, + "grad_norm": 14.75, + "kl": 0.0, + "learning_rate": 6.499890928059453e-06, + "logits/chosen": 484445984.0, + "logits/rejected": 378457344.0, + "logps/chosen": -457.51513671875, + "logps/rejected": -460.68560791015625, + "loss": 0.0559, + "rewards/chosen": 3.9851784706115723, + "rewards/margins": 9.634881973266602, + "rewards/rejected": -5.649703502655029, + "step": 4424 + }, + { + "epoch": 0.4042941982640475, + "grad_norm": 3.34375, + "kl": 0.0, + "learning_rate": 6.4985192825688995e-06, + "logits/chosen": 737737344.0, + "logits/rejected": 577873408.0, + "logps/chosen": -448.36810302734375, + "logps/rejected": -551.2406005859375, + "loss": 0.0206, + "rewards/chosen": 3.251783847808838, + "rewards/margins": 13.867990016937256, + "rewards/rejected": -10.616206169128418, + "step": 4425 + }, + { + "epoch": 0.4043855641845592, + "grad_norm": 1.828125, + "kl": 0.0, + "learning_rate": 6.497147513164684e-06, + "logits/chosen": 337589632.0, + "logits/rejected": 528811872.0, + "logps/chosen": -362.92279052734375, + "logps/rejected": -586.9747924804688, + "loss": 0.0108, + "rewards/chosen": 4.401371002197266, + "rewards/margins": 14.10928726196289, + "rewards/rejected": -9.707916259765625, + "step": 4426 + }, + { + "epoch": 0.4044769301050708, + "grad_norm": 2.609375, + "kl": 0.0, + "learning_rate": 6.495775619960243e-06, + "logits/chosen": 386852096.0, + "logits/rejected": 336543488.0, + "logps/chosen": -279.3194580078125, + "logps/rejected": -500.5615234375, + "loss": 0.0153, + "rewards/chosen": 3.673231760660807, + "rewards/margins": 14.819529978434245, + "rewards/rejected": -11.146298217773438, + "step": 4427 + }, + { + "epoch": 0.4045682960255825, + "grad_norm": 19.25, + "kl": 0.0, + "learning_rate": 6.494403603069015e-06, + "logits/chosen": 519410995.2, + "logits/rejected": 207967701.33333334, + "logps/chosen": -396.624609375, + "logps/rejected": -236.75813802083334, + "loss": 0.039, + "rewards/chosen": 3.188619613647461, + "rewards/margins": 8.628991444905598, + "rewards/rejected": -5.440371831258138, + "step": 4428 + }, + { + "epoch": 0.4046596619460941, + "grad_norm": 2.453125, + "kl": 0.0, + "learning_rate": 6.493031462604457e-06, + "logits/chosen": 450605952.0, + "logits/rejected": 561590848.0, + "logps/chosen": -318.7010498046875, + "logps/rejected": -524.1553955078125, + "loss": 0.0134, + "rewards/chosen": 4.131339073181152, + "rewards/margins": 14.439103126525879, + "rewards/rejected": -10.307764053344727, + "step": 4429 + }, + { + "epoch": 0.4047510278666058, + "grad_norm": 7.28125, + "kl": 0.0, + "learning_rate": 6.491659198680028e-06, + "logits/chosen": 492555136.0, + "logits/rejected": 650777088.0, + "logps/chosen": -377.04669189453125, + "logps/rejected": -406.4571228027344, + "loss": 0.118, + "rewards/chosen": 3.4124703407287598, + "rewards/margins": 8.768335819244385, + "rewards/rejected": -5.355865478515625, + "step": 4430 + }, + { + "epoch": 0.4048423937871174, + "grad_norm": 5.9375, + "kl": 0.0, + "learning_rate": 6.490286811409205e-06, + "logits/chosen": 485411712.0, + "logits/rejected": 428967776.0, + "logps/chosen": -307.8162841796875, + "logps/rejected": -511.6842041015625, + "loss": 0.0389, + "rewards/chosen": 2.8114006519317627, + "rewards/margins": 13.07570195198059, + "rewards/rejected": -10.264301300048828, + "step": 4431 + }, + { + "epoch": 0.40493375970762907, + "grad_norm": 9.6875, + "kl": 0.0, + "learning_rate": 6.48891430090547e-06, + "logits/chosen": 434420576.0, + "logits/rejected": 528359680.0, + "logps/chosen": -247.92495727539062, + "logps/rejected": -406.167236328125, + "loss": 0.0356, + "rewards/chosen": 3.247159481048584, + "rewards/margins": 9.773085435231526, + "rewards/rejected": -6.525925954182942, + "step": 4432 + }, + { + "epoch": 0.4050251256281407, + "grad_norm": 5.4375, + "kl": 0.0, + "learning_rate": 6.48754166728232e-06, + "logits/chosen": 353304780.8, + "logits/rejected": 270110506.6666667, + "logps/chosen": -358.6643798828125, + "logps/rejected": -425.6734212239583, + "loss": 0.0358, + "rewards/chosen": 2.8469005584716798, + "rewards/margins": 13.274885431925455, + "rewards/rejected": -10.427984873453775, + "step": 4433 + }, + { + "epoch": 0.40511649154865237, + "grad_norm": 1.75, + "kl": 0.0, + "learning_rate": 6.486168910653253e-06, + "logits/chosen": 361364864.0, + "logits/rejected": 499511392.0, + "logps/chosen": -213.5211385091146, + "logps/rejected": -347.3348388671875, + "loss": 0.0127, + "rewards/chosen": 4.531729380289714, + "rewards/margins": 11.27045504252116, + "rewards/rejected": -6.738725662231445, + "step": 4434 + }, + { + "epoch": 0.405207857469164, + "grad_norm": 17.625, + "kl": 0.0, + "learning_rate": 6.484796031131789e-06, + "logits/chosen": 503501568.0, + "logits/rejected": 414794282.6666667, + "logps/chosen": -298.60458984375, + "logps/rejected": -460.125732421875, + "loss": 0.0503, + "rewards/chosen": 2.7074729919433596, + "rewards/margins": 10.385451507568359, + "rewards/rejected": -7.677978515625, + "step": 4435 + }, + { + "epoch": 0.40529922338967567, + "grad_norm": 1.1484375, + "kl": 0.0, + "learning_rate": 6.483423028831448e-06, + "logits/chosen": 900138624.0, + "logits/rejected": 582467254.8571428, + "logps/chosen": -681.5909423828125, + "logps/rejected": -385.02818080357144, + "loss": 0.0038, + "rewards/chosen": 3.557159423828125, + "rewards/margins": 12.234966278076172, + "rewards/rejected": -8.677806854248047, + "step": 4436 + }, + { + "epoch": 0.4053905893101873, + "grad_norm": 3.875, + "kl": 0.0, + "learning_rate": 6.4820499038657695e-06, + "logits/chosen": 535309482.6666667, + "logits/rejected": 614243020.8, + "logps/chosen": -323.25030517578125, + "logps/rejected": -343.71005859375, + "loss": 0.0184, + "rewards/chosen": 3.334181785583496, + "rewards/margins": 11.62037410736084, + "rewards/rejected": -8.286192321777344, + "step": 4437 + }, + { + "epoch": 0.40548195523069896, + "grad_norm": 3.09375, + "kl": 0.0, + "learning_rate": 6.480676656348293e-06, + "logits/chosen": 532404224.0, + "logits/rejected": 338818022.4, + "logps/chosen": -449.1787923177083, + "logps/rejected": -495.940576171875, + "loss": 0.0166, + "rewards/chosen": 3.0981658299764, + "rewards/margins": 12.649842007954916, + "rewards/rejected": -9.551676177978516, + "step": 4438 + }, + { + "epoch": 0.4055733211512106, + "grad_norm": 10.875, + "kl": 0.0, + "learning_rate": 6.479303286392578e-06, + "logits/chosen": 585052032.0, + "logits/rejected": 597601856.0, + "logps/chosen": -473.2599182128906, + "logps/rejected": -587.05322265625, + "loss": 0.0911, + "rewards/chosen": 3.3495125770568848, + "rewards/margins": 9.616138935089111, + "rewards/rejected": -6.266626358032227, + "step": 4439 + }, + { + "epoch": 0.40566468707172226, + "grad_norm": 2.609375, + "kl": 0.0, + "learning_rate": 6.477929794112186e-06, + "logits/chosen": 500015786.6666667, + "logits/rejected": 361083808.0, + "logps/chosen": -271.5842692057292, + "logps/rejected": -203.8302764892578, + "loss": 0.1248, + "rewards/chosen": 3.947068532307943, + "rewards/margins": 6.0356028874715175, + "rewards/rejected": -2.088534355163574, + "step": 4440 + }, + { + "epoch": 0.4057560529922339, + "grad_norm": 4.3125, + "kl": 0.0, + "learning_rate": 6.4765561796206925e-06, + "logits/chosen": 495327061.3333333, + "logits/rejected": 596709171.2, + "logps/chosen": -375.3036295572917, + "logps/rejected": -538.50791015625, + "loss": 0.0198, + "rewards/chosen": 2.987663904825846, + "rewards/margins": 12.487885157267252, + "rewards/rejected": -9.500221252441406, + "step": 4441 + }, + { + "epoch": 0.40584741891274556, + "grad_norm": 3.78125, + "kl": 0.0, + "learning_rate": 6.475182443031684e-06, + "logits/chosen": 1282919424.0, + "logits/rejected": 923503616.0, + "logps/chosen": -325.1453125, + "logps/rejected": -664.2284749348959, + "loss": 0.0245, + "rewards/chosen": 3.2483139038085938, + "rewards/margins": 11.358367919921875, + "rewards/rejected": -8.110054016113281, + "step": 4442 + }, + { + "epoch": 0.4059387848332572, + "grad_norm": 2.40625, + "kl": 0.0, + "learning_rate": 6.473808584458756e-06, + "logits/chosen": 747703488.0, + "logits/rejected": 497560128.0, + "logps/chosen": -254.02894592285156, + "logps/rejected": -478.4239501953125, + "loss": 0.0174, + "rewards/chosen": 3.711200714111328, + "rewards/margins": 12.718002319335938, + "rewards/rejected": -9.00680160522461, + "step": 4443 + }, + { + "epoch": 0.40603015075376886, + "grad_norm": 37.25, + "kl": 0.0, + "learning_rate": 6.472434604015514e-06, + "logits/chosen": 559328704.0, + "logits/rejected": 435301376.0, + "logps/chosen": -290.3040771484375, + "logps/rejected": -319.84417724609375, + "loss": 0.0918, + "rewards/chosen": 3.2764153480529785, + "rewards/margins": 9.614735126495361, + "rewards/rejected": -6.338319778442383, + "step": 4444 + }, + { + "epoch": 0.4061215166742805, + "grad_norm": 56.5, + "kl": 0.0, + "learning_rate": 6.47106050181557e-06, + "logits/chosen": 511333344.0, + "logits/rejected": 603751744.0, + "logps/chosen": -365.0503234863281, + "logps/rejected": -447.8798828125, + "loss": 0.0624, + "rewards/chosen": 3.702813148498535, + "rewards/margins": 11.918646812438965, + "rewards/rejected": -8.21583366394043, + "step": 4445 + }, + { + "epoch": 0.40621288259479216, + "grad_norm": 4.53125, + "kl": 0.0, + "learning_rate": 6.469686277972556e-06, + "logits/chosen": 1644435285.3333333, + "logits/rejected": 589535027.2, + "logps/chosen": -400.1842447916667, + "logps/rejected": -491.2203125, + "loss": 0.0249, + "rewards/chosen": 3.8559557596842446, + "rewards/margins": 13.915451304117838, + "rewards/rejected": -10.059495544433593, + "step": 4446 + }, + { + "epoch": 0.4063042485153038, + "grad_norm": 0.234375, + "kl": 0.0, + "learning_rate": 6.468311932600101e-06, + "logits/chosen": 571421440.0, + "logits/rejected": 590934400.0, + "logps/chosen": -297.6914978027344, + "logps/rejected": -554.556640625, + "loss": 0.0013, + "rewards/chosen": 5.67542028427124, + "rewards/margins": 15.351817607879639, + "rewards/rejected": -9.676397323608398, + "step": 4447 + }, + { + "epoch": 0.40639561443581546, + "grad_norm": 4.6875, + "kl": 0.0, + "learning_rate": 6.466937465811854e-06, + "logits/chosen": 670389760.0, + "logits/rejected": 821033088.0, + "logps/chosen": -343.9469401041667, + "logps/rejected": -665.2395629882812, + "loss": 0.0326, + "rewards/chosen": 3.3327109018961587, + "rewards/margins": 13.925738016764322, + "rewards/rejected": -10.593027114868164, + "step": 4448 + }, + { + "epoch": 0.4064869803563271, + "grad_norm": 1.875, + "kl": 0.0, + "learning_rate": 6.46556287772147e-06, + "logits/chosen": 885751637.3333334, + "logits/rejected": 672412736.0, + "logps/chosen": -507.996826171875, + "logps/rejected": -531.80419921875, + "loss": 0.0098, + "rewards/chosen": 4.88139279683431, + "rewards/margins": 14.038985570271809, + "rewards/rejected": -9.1575927734375, + "step": 4449 + }, + { + "epoch": 0.40657834627683875, + "grad_norm": 3.03125, + "kl": 0.0, + "learning_rate": 6.464188168442615e-06, + "logits/chosen": 457085286.4, + "logits/rejected": 321720021.3333333, + "logps/chosen": -334.1621826171875, + "logps/rejected": -340.8444010416667, + "loss": 0.0205, + "rewards/chosen": 4.005291366577149, + "rewards/margins": 11.854845428466797, + "rewards/rejected": -7.849554061889648, + "step": 4450 + }, + { + "epoch": 0.4066697121973504, + "grad_norm": 5.90625, + "kl": 0.0, + "learning_rate": 6.462813338088963e-06, + "logits/chosen": 607503872.0, + "logits/rejected": 590630464.0, + "logps/chosen": -307.2855224609375, + "logps/rejected": -576.9749755859375, + "loss": 0.0326, + "rewards/chosen": 3.5365063349405923, + "rewards/margins": 12.4845978418986, + "rewards/rejected": -8.948091506958008, + "step": 4451 + }, + { + "epoch": 0.40676107811786205, + "grad_norm": 3.140625, + "kl": 0.0, + "learning_rate": 6.461438386774204e-06, + "logits/chosen": 543654144.0, + "logits/rejected": 471249365.3333333, + "logps/chosen": -233.73495483398438, + "logps/rejected": -531.5819091796875, + "loss": 0.0168, + "rewards/chosen": 2.6413414478302, + "rewards/margins": 12.149968385696411, + "rewards/rejected": -9.508626937866211, + "step": 4452 + }, + { + "epoch": 0.4068524440383737, + "grad_norm": 13.1875, + "kl": 0.0, + "learning_rate": 6.460063314612029e-06, + "logits/chosen": 648496682.6666666, + "logits/rejected": 415933952.0, + "logps/chosen": -228.3424072265625, + "logps/rejected": -490.63779296875, + "loss": 0.0504, + "rewards/chosen": 2.284523328145345, + "rewards/margins": 12.269473584493001, + "rewards/rejected": -9.984950256347656, + "step": 4453 + }, + { + "epoch": 0.40694380995888535, + "grad_norm": 3.625, + "kl": 0.0, + "learning_rate": 6.458688121716145e-06, + "logits/chosen": 463775008.0, + "logits/rejected": 316427328.0, + "logps/chosen": -480.49395751953125, + "logps/rejected": -391.66046142578125, + "loss": 0.0155, + "rewards/chosen": 3.8535842895507812, + "rewards/margins": 14.55868911743164, + "rewards/rejected": -10.70510482788086, + "step": 4454 + }, + { + "epoch": 0.40703517587939697, + "grad_norm": 2.5, + "kl": 0.0, + "learning_rate": 6.457312808200269e-06, + "logits/chosen": 432992832.0, + "logits/rejected": 395176789.3333333, + "logps/chosen": -295.1771240234375, + "logps/rejected": -536.843994140625, + "loss": 0.0145, + "rewards/chosen": 2.8104844093322754, + "rewards/margins": 11.825188159942627, + "rewards/rejected": -9.014703750610352, + "step": 4455 + }, + { + "epoch": 0.40712654179990865, + "grad_norm": 3.8125, + "kl": 0.0, + "learning_rate": 6.455937374178127e-06, + "logits/chosen": 509410560.0, + "logits/rejected": 359049898.6666667, + "logps/chosen": -259.18017578125, + "logps/rejected": -580.3789876302084, + "loss": 0.0237, + "rewards/chosen": 3.4548480987548826, + "rewards/margins": 14.273980585734048, + "rewards/rejected": -10.819132486979166, + "step": 4456 + }, + { + "epoch": 0.40721790772042027, + "grad_norm": 2.5, + "kl": 0.0, + "learning_rate": 6.454561819763454e-06, + "logits/chosen": 608810368.0, + "logits/rejected": 588543360.0, + "logps/chosen": -332.0316162109375, + "logps/rejected": -448.1102294921875, + "loss": 0.0139, + "rewards/chosen": 3.809898853302002, + "rewards/margins": 11.195696353912354, + "rewards/rejected": -7.385797500610352, + "step": 4457 + }, + { + "epoch": 0.40730927364093195, + "grad_norm": 5.90625, + "kl": 0.0, + "learning_rate": 6.453186145069994e-06, + "logits/chosen": 376058752.0, + "logits/rejected": 912632448.0, + "logps/chosen": -365.6332702636719, + "logps/rejected": -599.806396484375, + "loss": 0.0397, + "rewards/chosen": 3.1415047645568848, + "rewards/margins": 12.327769756317139, + "rewards/rejected": -9.186264991760254, + "step": 4458 + }, + { + "epoch": 0.40740063956144357, + "grad_norm": 2.09375, + "kl": 0.0, + "learning_rate": 6.451810350211504e-06, + "logits/chosen": 322124672.0, + "logits/rejected": 454372937.14285713, + "logps/chosen": -213.51007080078125, + "logps/rejected": -571.069091796875, + "loss": 0.0093, + "rewards/chosen": 2.6835954189300537, + "rewards/margins": 12.246725388935634, + "rewards/rejected": -9.56312997000558, + "step": 4459 + }, + { + "epoch": 0.40749200548195524, + "grad_norm": 2.796875, + "kl": 0.0, + "learning_rate": 6.450434435301751e-06, + "logits/chosen": 461997772.8, + "logits/rejected": 560557482.6666666, + "logps/chosen": -326.278515625, + "logps/rejected": -633.1754557291666, + "loss": 0.0194, + "rewards/chosen": 3.714960479736328, + "rewards/margins": 11.71121826171875, + "rewards/rejected": -7.996257781982422, + "step": 4460 + }, + { + "epoch": 0.40758337140246687, + "grad_norm": 2.71875, + "kl": 0.0, + "learning_rate": 6.449058400454507e-06, + "logits/chosen": 719024998.4, + "logits/rejected": 1756623872.0, + "logps/chosen": -217.0612548828125, + "logps/rejected": -675.36083984375, + "loss": 0.0235, + "rewards/chosen": 3.442601776123047, + "rewards/margins": 13.354206975301107, + "rewards/rejected": -9.91160519917806, + "step": 4461 + }, + { + "epoch": 0.40767473732297854, + "grad_norm": 7.40625, + "kl": 0.0, + "learning_rate": 6.44768224578356e-06, + "logits/chosen": 746535296.0, + "logits/rejected": 665023317.3333334, + "logps/chosen": -66.71263885498047, + "logps/rejected": -565.712890625, + "loss": 0.0445, + "rewards/chosen": 1.5593645572662354, + "rewards/margins": 10.59880026181539, + "rewards/rejected": -9.039435704549154, + "step": 4462 + }, + { + "epoch": 0.40776610324349016, + "grad_norm": 5.0, + "kl": 0.0, + "learning_rate": 6.446305971402704e-06, + "logits/chosen": 649955114.6666666, + "logits/rejected": 301848704.0, + "logps/chosen": -400.8047281901042, + "logps/rejected": -106.15403747558594, + "loss": 0.143, + "rewards/chosen": 3.3166097005208335, + "rewards/margins": 6.33286460240682, + "rewards/rejected": -3.0162549018859863, + "step": 4463 + }, + { + "epoch": 0.40785746916400184, + "grad_norm": 3.375, + "kl": 0.0, + "learning_rate": 6.444929577425743e-06, + "logits/chosen": 685478400.0, + "logits/rejected": 643311488.0, + "logps/chosen": -322.715771484375, + "logps/rejected": -397.314697265625, + "loss": 0.0216, + "rewards/chosen": 3.671722412109375, + "rewards/margins": 11.961009979248047, + "rewards/rejected": -8.289287567138672, + "step": 4464 + }, + { + "epoch": 0.40794883508451346, + "grad_norm": 2.921875, + "kl": 0.0, + "learning_rate": 6.443553063966497e-06, + "logits/chosen": 401454506.6666667, + "logits/rejected": 172820480.0, + "logps/chosen": -346.3561197916667, + "logps/rejected": -303.4457702636719, + "loss": 0.0186, + "rewards/chosen": 4.030371983846028, + "rewards/margins": 13.839890797932942, + "rewards/rejected": -9.809518814086914, + "step": 4465 + }, + { + "epoch": 0.40804020100502514, + "grad_norm": 10.4375, + "kl": 0.0, + "learning_rate": 6.442176431138784e-06, + "logits/chosen": 235508512.0, + "logits/rejected": 517703899.4285714, + "logps/chosen": -312.08245849609375, + "logps/rejected": -520.3007463727679, + "loss": 0.0073, + "rewards/chosen": 5.783026218414307, + "rewards/margins": 13.730088029588972, + "rewards/rejected": -7.947061811174665, + "step": 4466 + }, + { + "epoch": 0.40813156692553676, + "grad_norm": 1.4765625, + "kl": 0.0, + "learning_rate": 6.440799679056444e-06, + "logits/chosen": 427239936.0, + "logits/rejected": 469063987.2, + "logps/chosen": -321.5592854817708, + "logps/rejected": -518.474169921875, + "loss": 0.0063, + "rewards/chosen": 4.538496653238933, + "rewards/margins": 13.051122538248698, + "rewards/rejected": -8.512625885009765, + "step": 4467 + }, + { + "epoch": 0.40822293284604844, + "grad_norm": 1.859375, + "kl": 0.0, + "learning_rate": 6.439422807833319e-06, + "logits/chosen": 1110888448.0, + "logits/rejected": 652495829.3333334, + "logps/chosen": -253.962451171875, + "logps/rejected": -333.7093505859375, + "loss": 0.0153, + "rewards/chosen": 4.319021224975586, + "rewards/margins": 10.54823621114095, + "rewards/rejected": -6.229214986165364, + "step": 4468 + }, + { + "epoch": 0.40831429876656006, + "grad_norm": 2.359375, + "kl": 0.0, + "learning_rate": 6.438045817583265e-06, + "logits/chosen": 554121728.0, + "logits/rejected": 397886208.0, + "logps/chosen": -312.6141357421875, + "logps/rejected": -403.4603759765625, + "loss": 0.0141, + "rewards/chosen": 3.3501593271891275, + "rewards/margins": 13.115830866495768, + "rewards/rejected": -9.765671539306641, + "step": 4469 + }, + { + "epoch": 0.40840566468707173, + "grad_norm": 33.25, + "kl": 0.0, + "learning_rate": 6.436668708420145e-06, + "logits/chosen": 540730282.6666666, + "logits/rejected": 633803264.0, + "logps/chosen": -316.2576497395833, + "logps/rejected": -557.767333984375, + "loss": 0.0612, + "rewards/chosen": 2.894353230794271, + "rewards/margins": 12.188075383504232, + "rewards/rejected": -9.293722152709961, + "step": 4470 + }, + { + "epoch": 0.40849703060758336, + "grad_norm": 1.0546875, + "kl": 0.0, + "learning_rate": 6.4352914804578345e-06, + "logits/rejected": 618774080.0, + "logps/rejected": -475.64752197265625, + "loss": 0.0022, + "rewards/rejected": -8.520281791687012, + "step": 4471 + }, + { + "epoch": 0.40858839652809503, + "grad_norm": 2.90625, + "kl": 0.0, + "learning_rate": 6.433914133810217e-06, + "logits/chosen": 562977408.0, + "logits/rejected": 500509132.8, + "logps/chosen": -334.1943766276042, + "logps/rejected": -535.8064453125, + "loss": 0.012, + "rewards/chosen": 3.582733154296875, + "rewards/margins": 12.060472106933593, + "rewards/rejected": -8.477738952636718, + "step": 4472 + }, + { + "epoch": 0.40867976244860665, + "grad_norm": 3.03125, + "kl": 0.0, + "learning_rate": 6.432536668591187e-06, + "logits/chosen": 165039978.66666666, + "logits/rejected": 242567347.2, + "logps/chosen": -111.201904296875, + "logps/rejected": -308.4607421875, + "loss": 0.0255, + "rewards/chosen": 3.018369992574056, + "rewards/margins": 12.480586942036947, + "rewards/rejected": -9.46221694946289, + "step": 4473 + }, + { + "epoch": 0.40877112836911833, + "grad_norm": 5.75, + "kl": 0.0, + "learning_rate": 6.4311590849146446e-06, + "logits/chosen": 698918144.0, + "logits/rejected": 1508116736.0, + "logps/chosen": -347.8342692057292, + "logps/rejected": -822.395751953125, + "loss": 0.0418, + "rewards/chosen": 4.004557927449544, + "rewards/margins": 17.676707585652668, + "rewards/rejected": -13.672149658203125, + "step": 4474 + }, + { + "epoch": 0.40886249428962995, + "grad_norm": 1.3203125, + "kl": 0.0, + "learning_rate": 6.4297813828945086e-06, + "logits/chosen": 502925440.0, + "logits/rejected": 967112192.0, + "logps/chosen": -216.8332722981771, + "logps/rejected": -410.04755859375, + "loss": 0.1299, + "rewards/chosen": 0.8731324672698975, + "rewards/margins": 9.61640486717224, + "rewards/rejected": -8.743272399902343, + "step": 4475 + }, + { + "epoch": 0.40895386021014163, + "grad_norm": 3.765625, + "kl": 0.0, + "learning_rate": 6.428403562644698e-06, + "logits/chosen": 574749504.0, + "logits/rejected": 476849216.0, + "logps/chosen": -475.06201171875, + "logps/rejected": -577.2875366210938, + "loss": 0.022, + "rewards/chosen": 3.375013828277588, + "rewards/margins": 12.765329837799072, + "rewards/rejected": -9.390316009521484, + "step": 4476 + }, + { + "epoch": 0.40904522613065325, + "grad_norm": 8.6875, + "kl": 0.0, + "learning_rate": 6.427025624279148e-06, + "logits/chosen": 305563562.6666667, + "logits/rejected": 538967193.6, + "logps/chosen": -279.4976399739583, + "logps/rejected": -582.13681640625, + "loss": 0.0859, + "rewards/chosen": 4.79278564453125, + "rewards/margins": 12.292013549804688, + "rewards/rejected": -7.499227905273438, + "step": 4477 + }, + { + "epoch": 0.4091365920511649, + "grad_norm": 4.03125, + "kl": 0.0, + "learning_rate": 6.425647567911799e-06, + "logits/chosen": 832960085.3333334, + "logits/rejected": 599383424.0, + "logps/chosen": -323.0551350911458, + "logps/rejected": -850.5719604492188, + "loss": 0.029, + "rewards/chosen": 3.445887565612793, + "rewards/margins": 13.287369728088379, + "rewards/rejected": -9.841482162475586, + "step": 4478 + }, + { + "epoch": 0.40922795797167655, + "grad_norm": 5.96875, + "kl": 0.0, + "learning_rate": 6.4242693936566054e-06, + "logits/chosen": 593566412.8, + "logits/rejected": 609416320.0, + "logps/chosen": -507.05380859375, + "logps/rejected": -934.29052734375, + "loss": 0.0213, + "rewards/chosen": 4.270150375366211, + "rewards/margins": 14.313584518432616, + "rewards/rejected": -10.043434143066406, + "step": 4479 + }, + { + "epoch": 0.4093193238921882, + "grad_norm": 0.796875, + "kl": 0.0, + "learning_rate": 6.422891101627531e-06, + "logits/chosen": 438823488.0, + "logits/rejected": 635290112.0, + "logps/chosen": -322.4565734863281, + "logps/rejected": -583.395263671875, + "loss": 0.0043, + "rewards/chosen": 5.102448463439941, + "rewards/margins": 13.974883079528809, + "rewards/rejected": -8.872434616088867, + "step": 4480 + }, + { + "epoch": 0.40941068981269985, + "grad_norm": 4.9375, + "kl": 0.0, + "learning_rate": 6.421512691938543e-06, + "logits/chosen": 471155507.2, + "logits/rejected": 509681834.6666667, + "logps/chosen": -337.961181640625, + "logps/rejected": -712.6669921875, + "loss": 0.0339, + "rewards/chosen": 3.275988006591797, + "rewards/margins": 13.033029937744141, + "rewards/rejected": -9.757041931152344, + "step": 4481 + }, + { + "epoch": 0.4095020557332115, + "grad_norm": 58.25, + "kl": 0.0, + "learning_rate": 6.420134164703629e-06, + "logits/chosen": 564903936.0, + "logits/rejected": 720739520.0, + "logps/chosen": -428.7319030761719, + "logps/rejected": -496.9945068359375, + "loss": 0.0458, + "rewards/chosen": 3.3082268238067627, + "rewards/margins": 11.146874189376831, + "rewards/rejected": -7.838647365570068, + "step": 4482 + }, + { + "epoch": 0.40959342165372314, + "grad_norm": 2.125, + "kl": 0.0, + "learning_rate": 6.418755520036775e-06, + "logits/chosen": 810157184.0, + "logits/rejected": 676951680.0, + "logps/chosen": -426.79144287109375, + "logps/rejected": -389.36895751953125, + "loss": 0.0126, + "rewards/chosen": 3.726409673690796, + "rewards/margins": 11.102461099624634, + "rewards/rejected": -7.376051425933838, + "step": 4483 + }, + { + "epoch": 0.4096847875742348, + "grad_norm": 2.84375, + "kl": 0.0, + "learning_rate": 6.417376758051985e-06, + "logits/chosen": 441256704.0, + "logits/rejected": 845446400.0, + "logps/chosen": -220.78287760416666, + "logps/rejected": -846.966796875, + "loss": 0.0143, + "rewards/chosen": 3.9325364430745444, + "rewards/margins": 13.416638310750326, + "rewards/rejected": -9.484101867675781, + "step": 4484 + }, + { + "epoch": 0.40977615349474644, + "grad_norm": 10.5625, + "kl": 0.0, + "learning_rate": 6.415997878863271e-06, + "logits/chosen": 832491178.6666666, + "logits/rejected": 1012227584.0, + "logps/chosen": -185.82222493489584, + "logps/rejected": -190.1357421875, + "loss": 0.1117, + "rewards/chosen": 3.5178712209065757, + "rewards/margins": 5.128211339314779, + "rewards/rejected": -1.6103401184082031, + "step": 4485 + }, + { + "epoch": 0.4098675194152581, + "grad_norm": 4.90625, + "kl": 0.0, + "learning_rate": 6.41461888258465e-06, + "logits/chosen": 403223296.0, + "logits/rejected": 683775488.0, + "logps/chosen": -309.2836100260417, + "logps/rejected": -425.82841796875, + "loss": 0.0106, + "rewards/chosen": 4.132473627726237, + "rewards/margins": 13.21591631571452, + "rewards/rejected": -9.083442687988281, + "step": 4486 + }, + { + "epoch": 0.40995888533576974, + "grad_norm": 1.8984375, + "kl": 0.0, + "learning_rate": 6.413239769330156e-06, + "logits/chosen": 520689024.0, + "logits/rejected": 348888352.0, + "logps/chosen": -344.4286804199219, + "logps/rejected": -467.88958740234375, + "loss": 0.0082, + "rewards/chosen": 4.540630340576172, + "rewards/margins": 14.116094589233398, + "rewards/rejected": -9.575464248657227, + "step": 4487 + }, + { + "epoch": 0.4100502512562814, + "grad_norm": 5.09375, + "kl": 0.0, + "learning_rate": 6.411860539213826e-06, + "logits/chosen": 1146728618.6666667, + "logits/rejected": 910840768.0, + "logps/chosen": -297.04347737630206, + "logps/rejected": -245.626708984375, + "loss": 0.0311, + "rewards/chosen": 3.641683578491211, + "rewards/margins": 10.77296257019043, + "rewards/rejected": -7.131278991699219, + "step": 4488 + }, + { + "epoch": 0.41014161717679304, + "grad_norm": 3.3125, + "kl": 0.0, + "learning_rate": 6.4104811923497105e-06, + "logits/chosen": 509141034.6666667, + "logits/rejected": 575851929.6, + "logps/chosen": -431.87451171875, + "logps/rejected": -329.511572265625, + "loss": 0.0184, + "rewards/chosen": 3.208183924357096, + "rewards/margins": 10.731600062052408, + "rewards/rejected": -7.523416137695312, + "step": 4489 + }, + { + "epoch": 0.4102329830973047, + "grad_norm": 4.125, + "kl": 0.0, + "learning_rate": 6.40910172885187e-06, + "logits/chosen": 616038848.0, + "logits/rejected": 1164614656.0, + "logps/chosen": -417.3842468261719, + "logps/rejected": -617.1102294921875, + "loss": 0.0244, + "rewards/chosen": 3.308112382888794, + "rewards/margins": 12.59804654121399, + "rewards/rejected": -9.289934158325195, + "step": 4490 + }, + { + "epoch": 0.41032434901781634, + "grad_norm": 3.359375, + "kl": 0.0, + "learning_rate": 6.40772214883437e-06, + "logits/chosen": 707193472.0, + "logits/rejected": 641382016.0, + "logps/chosen": -452.5425109863281, + "logps/rejected": -536.7091064453125, + "loss": 0.0227, + "rewards/chosen": 3.692628860473633, + "rewards/margins": 11.62161922454834, + "rewards/rejected": -7.928990364074707, + "step": 4491 + }, + { + "epoch": 0.410415714938328, + "grad_norm": 1.8046875, + "kl": 0.0, + "learning_rate": 6.4063424524112915e-06, + "logits/chosen": 434714538.6666667, + "logits/rejected": 686122432.0, + "logps/chosen": -264.3995361328125, + "logps/rejected": -733.4613647460938, + "loss": 0.0145, + "rewards/chosen": 4.035567283630371, + "rewards/margins": 15.064289093017578, + "rewards/rejected": -11.028721809387207, + "step": 4492 + }, + { + "epoch": 0.41050708085883963, + "grad_norm": 2.296875, + "kl": 0.0, + "learning_rate": 6.404962639696724e-06, + "logits/chosen": 509692448.0, + "logits/rejected": 427106752.0, + "logps/chosen": -309.0225830078125, + "logps/rejected": -386.46319580078125, + "loss": 0.0133, + "rewards/chosen": 4.008599281311035, + "rewards/margins": 13.877545356750488, + "rewards/rejected": -9.868946075439453, + "step": 4493 + }, + { + "epoch": 0.4105984467793513, + "grad_norm": 1.90625, + "kl": 0.0, + "learning_rate": 6.403582710804762e-06, + "logits/chosen": 624686400.0, + "logits/rejected": 585491200.0, + "logps/chosen": -309.244873046875, + "logps/rejected": -459.3121032714844, + "loss": 0.011, + "rewards/chosen": 4.5080885887146, + "rewards/margins": 13.216667652130127, + "rewards/rejected": -8.708579063415527, + "step": 4494 + }, + { + "epoch": 0.41068981269986293, + "grad_norm": 4.96875, + "kl": 0.0, + "learning_rate": 6.402202665849515e-06, + "logits/chosen": 526067814.4, + "logits/rejected": 374612522.6666667, + "logps/chosen": -330.9091796875, + "logps/rejected": -485.7035725911458, + "loss": 0.0519, + "rewards/chosen": 2.8430931091308596, + "rewards/margins": 13.804515075683593, + "rewards/rejected": -10.961421966552734, + "step": 4495 + }, + { + "epoch": 0.4107811786203746, + "grad_norm": 3.65625, + "kl": 0.0, + "learning_rate": 6.4008225049450974e-06, + "logits/chosen": 441929113.6, + "logits/rejected": 208507264.0, + "logps/chosen": -234.3573486328125, + "logps/rejected": -379.1046956380208, + "loss": 0.0196, + "rewards/chosen": 4.148098754882812, + "rewards/margins": 13.751593399047852, + "rewards/rejected": -9.603494644165039, + "step": 4496 + }, + { + "epoch": 0.41087254454088623, + "grad_norm": 1.3515625, + "kl": 0.0, + "learning_rate": 6.3994422282056405e-06, + "logits/chosen": 619235712.0, + "logits/rejected": 456135731.2, + "logps/chosen": -235.73734537760416, + "logps/rejected": -330.19345703125, + "loss": 0.0096, + "rewards/chosen": 3.875586827596029, + "rewards/margins": 11.772519048055013, + "rewards/rejected": -7.896932220458984, + "step": 4497 + }, + { + "epoch": 0.4109639104613979, + "grad_norm": 5.28125, + "kl": 0.0, + "learning_rate": 6.398061835745275e-06, + "logits/chosen": 778241536.0, + "logits/rejected": 456331968.0, + "logps/chosen": -333.18959554036456, + "logps/rejected": -474.02069091796875, + "loss": 0.0286, + "rewards/chosen": 3.699070930480957, + "rewards/margins": 10.405817031860352, + "rewards/rejected": -6.7067461013793945, + "step": 4498 + }, + { + "epoch": 0.41105527638190953, + "grad_norm": 3.84375, + "kl": 0.0, + "learning_rate": 6.3966813276781505e-06, + "logits/chosen": 712290764.8, + "logits/rejected": 299218837.3333333, + "logps/chosen": -412.840625, + "logps/rejected": -331.7195638020833, + "loss": 0.0223, + "rewards/chosen": 3.5241268157958983, + "rewards/margins": 13.107580947875977, + "rewards/rejected": -9.583454132080078, + "step": 4499 + }, + { + "epoch": 0.4111466423024212, + "grad_norm": 1.328125, + "kl": 0.0, + "learning_rate": 6.3953007041184216e-06, + "logits/chosen": 414621888.0, + "logits/rejected": 322773568.0, + "logps/chosen": -165.69955444335938, + "logps/rejected": -310.7240295410156, + "loss": 0.1127, + "rewards/chosen": 4.250679016113281, + "rewards/margins": 11.270493984222412, + "rewards/rejected": -7.019814968109131, + "step": 4500 + }, + { + "epoch": 0.4112380082229328, + "grad_norm": 0.578125, + "kl": 0.0, + "learning_rate": 6.3939199651802505e-06, + "logits/chosen": 263797280.0, + "logits/rejected": 564552850.2857143, + "logps/chosen": -265.751953125, + "logps/rejected": -699.8189174107143, + "loss": 0.0026, + "rewards/chosen": 4.060998439788818, + "rewards/margins": 13.929164682115827, + "rewards/rejected": -9.868166242327009, + "step": 4501 + }, + { + "epoch": 0.4113293741434445, + "grad_norm": 0.90234375, + "kl": 0.0, + "learning_rate": 6.392539110977813e-06, + "logits/chosen": 637999744.0, + "logits/rejected": 584358058.6666666, + "logps/chosen": -215.61407470703125, + "logps/rejected": -529.8720703125, + "loss": 0.005, + "rewards/chosen": 3.9739112854003906, + "rewards/margins": 14.061841328938803, + "rewards/rejected": -10.087930043538412, + "step": 4502 + }, + { + "epoch": 0.4114207400639561, + "grad_norm": 2.703125, + "kl": 0.0, + "learning_rate": 6.3911581416252945e-06, + "logits/chosen": 785837056.0, + "logits/rejected": 518933632.0, + "logps/chosen": -800.6881103515625, + "logps/rejected": -482.3002115885417, + "loss": 0.0079, + "rewards/chosen": 3.432882785797119, + "rewards/margins": 13.624477863311768, + "rewards/rejected": -10.191595077514648, + "step": 4503 + }, + { + "epoch": 0.4115121059844678, + "grad_norm": 3.4375, + "kl": 0.0, + "learning_rate": 6.389777057236886e-06, + "logits/chosen": 557793216.0, + "logits/rejected": 668638848.0, + "logps/chosen": -355.6676330566406, + "logps/rejected": -583.5321044921875, + "loss": 0.0219, + "rewards/chosen": 3.1936216354370117, + "rewards/margins": 13.48148250579834, + "rewards/rejected": -10.287860870361328, + "step": 4504 + }, + { + "epoch": 0.4116034719049794, + "grad_norm": 5.15625, + "kl": 0.0, + "learning_rate": 6.388395857926792e-06, + "logits/chosen": 871636992.0, + "logits/rejected": 391077120.0, + "logps/chosen": -329.01055908203125, + "logps/rejected": -597.269775390625, + "loss": 0.0308, + "rewards/chosen": 3.812811533610026, + "rewards/margins": 14.17928377787272, + "rewards/rejected": -10.366472244262695, + "step": 4505 + }, + { + "epoch": 0.4116948378254911, + "grad_norm": 2.8125, + "kl": 0.0, + "learning_rate": 6.387014543809224e-06, + "logits/chosen": 750479769.6, + "logits/rejected": 500557653.3333333, + "logps/chosen": -252.842578125, + "logps/rejected": -407.48876953125, + "loss": 0.0211, + "rewards/chosen": 4.098832321166992, + "rewards/margins": 13.00677858988444, + "rewards/rejected": -8.907946268717447, + "step": 4506 + }, + { + "epoch": 0.4117862037460027, + "grad_norm": 0.6953125, + "kl": 0.0, + "learning_rate": 6.3856331149984045e-06, + "logits/chosen": 641840896.0, + "logits/rejected": 451195494.4, + "logps/chosen": -181.21573893229166, + "logps/rejected": -400.3950927734375, + "loss": 0.0043, + "rewards/chosen": 4.824743588765462, + "rewards/margins": 13.395255978902181, + "rewards/rejected": -8.570512390136718, + "step": 4507 + }, + { + "epoch": 0.4118775696665144, + "grad_norm": 0.37109375, + "kl": 0.0, + "learning_rate": 6.384251571608564e-06, + "logits/chosen": 261940368.0, + "logits/rejected": 394788132.5714286, + "logps/chosen": -376.7149658203125, + "logps/rejected": -629.1967075892857, + "loss": 0.0015, + "rewards/chosen": 4.743847846984863, + "rewards/margins": 13.880085945129395, + "rewards/rejected": -9.136238098144531, + "step": 4508 + }, + { + "epoch": 0.411968935587026, + "grad_norm": 2.609375, + "kl": 0.0, + "learning_rate": 6.382869913753944e-06, + "logits/chosen": 536489472.0, + "logits/rejected": 641037952.0, + "logps/chosen": -314.16843668619794, + "logps/rejected": -602.47802734375, + "loss": 0.0205, + "rewards/chosen": 3.8046862284342446, + "rewards/margins": 11.921895662943522, + "rewards/rejected": -8.117209434509277, + "step": 4509 + }, + { + "epoch": 0.4120603015075377, + "grad_norm": 4.875, + "kl": 0.0, + "learning_rate": 6.381488141548795e-06, + "logits/chosen": 507364928.0, + "logits/rejected": 344507584.0, + "logps/chosen": -302.5910339355469, + "logps/rejected": -531.2176513671875, + "loss": 0.0341, + "rewards/chosen": 2.9496378898620605, + "rewards/margins": 12.418229579925537, + "rewards/rejected": -9.468591690063477, + "step": 4510 + }, + { + "epoch": 0.4121516674280493, + "grad_norm": 6.1875, + "kl": 0.0, + "learning_rate": 6.380106255107376e-06, + "logits/chosen": 619849472.0, + "logits/rejected": 685630720.0, + "logps/chosen": -430.5277099609375, + "logps/rejected": -429.08172607421875, + "loss": 0.0331, + "rewards/chosen": 2.803107500076294, + "rewards/margins": 10.983608484268188, + "rewards/rejected": -8.180500984191895, + "step": 4511 + }, + { + "epoch": 0.412243033348561, + "grad_norm": 33.25, + "kl": 0.0, + "learning_rate": 6.3787242545439575e-06, + "logits/chosen": 609887232.0, + "logits/rejected": 498870752.0, + "logps/chosen": -276.3624267578125, + "logps/rejected": -397.6780090332031, + "loss": 0.0336, + "rewards/chosen": 3.630737781524658, + "rewards/margins": 12.141412258148193, + "rewards/rejected": -8.510674476623535, + "step": 4512 + }, + { + "epoch": 0.4123343992690726, + "grad_norm": 1.84375, + "kl": 0.0, + "learning_rate": 6.377342139972818e-06, + "logits/chosen": 945154048.0, + "logits/rejected": 827588266.6666666, + "logps/chosen": -442.7138671875, + "logps/rejected": -357.0870768229167, + "loss": 0.0131, + "rewards/chosen": 4.069953918457031, + "rewards/margins": 14.035750071207682, + "rewards/rejected": -9.96579615275065, + "step": 4513 + }, + { + "epoch": 0.4124257651895843, + "grad_norm": 5.0, + "kl": 0.0, + "learning_rate": 6.375959911508244e-06, + "logits/chosen": 416088217.6, + "logits/rejected": 528595925.3333333, + "logps/chosen": -281.6296875, + "logps/rejected": -510.6525065104167, + "loss": 0.039, + "rewards/chosen": 2.8575054168701173, + "rewards/margins": 11.49039166768392, + "rewards/rejected": -8.632886250813803, + "step": 4514 + }, + { + "epoch": 0.4125171311100959, + "grad_norm": 2.171875, + "kl": 0.0, + "learning_rate": 6.374577569264533e-06, + "logits/chosen": 533844000.0, + "logits/rejected": 481427529.14285713, + "logps/chosen": -541.875, + "logps/rejected": -593.1574358258929, + "loss": 0.0087, + "rewards/chosen": 2.8472900390625, + "rewards/margins": 11.195232936314174, + "rewards/rejected": -8.347942897251674, + "step": 4515 + }, + { + "epoch": 0.4126084970306076, + "grad_norm": 4.84375, + "kl": 0.0, + "learning_rate": 6.373195113355994e-06, + "logits/chosen": 592529408.0, + "logits/rejected": 607530624.0, + "logps/chosen": -390.3392333984375, + "logps/rejected": -323.4460856119792, + "loss": 0.0272, + "rewards/chosen": 3.6205848693847655, + "rewards/margins": 10.665974044799805, + "rewards/rejected": -7.045389175415039, + "step": 4516 + }, + { + "epoch": 0.4126998629511192, + "grad_norm": 1.3046875, + "kl": 0.0, + "learning_rate": 6.37181254389694e-06, + "logits/chosen": 354132256.0, + "logits/rejected": 564481472.0, + "logps/chosen": -248.6935577392578, + "logps/rejected": -706.0155639648438, + "loss": 0.129, + "rewards/chosen": 2.0821046829223633, + "rewards/margins": 12.05594539642334, + "rewards/rejected": -9.973840713500977, + "step": 4517 + }, + { + "epoch": 0.4127912288716309, + "grad_norm": 5.3125, + "kl": 0.0, + "learning_rate": 6.3704298610017015e-06, + "logits/chosen": 584220876.8, + "logits/rejected": 493787477.3333333, + "logps/chosen": -357.50185546875, + "logps/rejected": -640.3885498046875, + "loss": 0.0311, + "rewards/chosen": 3.3893970489501952, + "rewards/margins": 12.766582870483399, + "rewards/rejected": -9.377185821533203, + "step": 4518 + }, + { + "epoch": 0.4128825947921425, + "grad_norm": 1.4609375, + "kl": 0.0, + "learning_rate": 6.369047064784609e-06, + "logits/chosen": 595439530.6666666, + "logits/rejected": 550511104.0, + "logps/chosen": -134.55219523111978, + "logps/rejected": -549.29951171875, + "loss": 0.0105, + "rewards/chosen": 3.696443239847819, + "rewards/margins": 11.753585879007975, + "rewards/rejected": -8.057142639160157, + "step": 4519 + }, + { + "epoch": 0.4129739607126542, + "grad_norm": 3.546875, + "kl": 0.0, + "learning_rate": 6.367664155360009e-06, + "logits/chosen": 918957465.6, + "logits/rejected": 572780330.6666666, + "logps/chosen": -294.561376953125, + "logps/rejected": -441.0432942708333, + "loss": 0.0236, + "rewards/chosen": 3.569855499267578, + "rewards/margins": 12.740162658691407, + "rewards/rejected": -9.170307159423828, + "step": 4520 + }, + { + "epoch": 0.4130653266331658, + "grad_norm": 6.34375, + "kl": 0.0, + "learning_rate": 6.366281132842256e-06, + "logits/chosen": 553358976.0, + "logps/chosen": -295.116943359375, + "loss": 0.049, + "rewards/chosen": 3.223153591156006, + "step": 4521 + }, + { + "epoch": 0.4131566925536775, + "grad_norm": 0.150390625, + "kl": 0.0, + "learning_rate": 6.3648979973457115e-06, + "logits/rejected": 583492032.0, + "logps/rejected": -358.9548034667969, + "loss": 0.0005, + "rewards/rejected": -9.15054702758789, + "step": 4522 + }, + { + "epoch": 0.4132480584741891, + "grad_norm": 2.484375, + "kl": 0.0, + "learning_rate": 6.3635147489847485e-06, + "logits/chosen": 784137830.4, + "logits/rejected": 854821034.6666666, + "logps/chosen": -540.75029296875, + "logps/rejected": -738.5265299479166, + "loss": 0.0145, + "rewards/chosen": 3.9053226470947267, + "rewards/margins": 13.272650655110677, + "rewards/rejected": -9.367328008015951, + "step": 4523 + }, + { + "epoch": 0.4133394243947008, + "grad_norm": 1.9296875, + "kl": 0.0, + "learning_rate": 6.362131387873749e-06, + "logits/chosen": 408042880.0, + "logits/rejected": 503401318.4, + "logps/chosen": -283.7421061197917, + "logps/rejected": -314.45029296875, + "loss": 0.0081, + "rewards/chosen": 4.428730328877767, + "rewards/margins": 11.57770036061605, + "rewards/rejected": -7.148970031738282, + "step": 4524 + }, + { + "epoch": 0.4134307903152124, + "grad_norm": 2.859375, + "kl": 0.0, + "learning_rate": 6.360747914127106e-06, + "logits/chosen": 925653430.8571428, + "logits/rejected": 597415424.0, + "logps/chosen": -287.80894252232144, + "logps/rejected": -258.2852783203125, + "loss": 0.0265, + "rewards/chosen": 3.6878444126674106, + "rewards/margins": 10.903525965554373, + "rewards/rejected": -7.215681552886963, + "step": 4525 + }, + { + "epoch": 0.4135221562357241, + "grad_norm": 2.0625, + "kl": 0.0, + "learning_rate": 6.3593643278592145e-06, + "logits/chosen": 750156544.0, + "logits/rejected": 427357280.0, + "logps/chosen": -322.5117492675781, + "logps/rejected": -328.72052001953125, + "loss": 0.012, + "rewards/chosen": 4.0215325355529785, + "rewards/margins": 13.385591983795166, + "rewards/rejected": -9.364059448242188, + "step": 4526 + }, + { + "epoch": 0.4136135221562357, + "grad_norm": 3.90625, + "kl": 0.0, + "learning_rate": 6.3579806291844904e-06, + "logits/chosen": 373712554.6666667, + "logits/rejected": 359424102.4, + "logps/chosen": -252.475830078125, + "logps/rejected": -474.319482421875, + "loss": 0.0168, + "rewards/chosen": 3.468231519063314, + "rewards/margins": 13.330008252461752, + "rewards/rejected": -9.861776733398438, + "step": 4527 + }, + { + "epoch": 0.4137048880767474, + "grad_norm": 1.5546875, + "kl": 0.0, + "learning_rate": 6.356596818217351e-06, + "logits/chosen": 354746560.0, + "logits/rejected": 415933269.3333333, + "logps/chosen": -253.4788818359375, + "logps/rejected": -418.32080078125, + "loss": 0.0065, + "rewards/chosen": 3.911123752593994, + "rewards/margins": 12.897051334381104, + "rewards/rejected": -8.98592758178711, + "step": 4528 + }, + { + "epoch": 0.413796253997259, + "grad_norm": 1.21875, + "kl": 0.0, + "learning_rate": 6.355212895072223e-06, + "logits/chosen": 675807488.0, + "logits/rejected": 337711466.6666667, + "logps/chosen": -287.2298583984375, + "logps/rejected": -399.61474609375, + "loss": 0.0072, + "rewards/chosen": 3.5944008827209473, + "rewards/margins": 12.345820585886637, + "rewards/rejected": -8.75141970316569, + "step": 4529 + }, + { + "epoch": 0.4138876199177707, + "grad_norm": 1.7890625, + "kl": 0.0, + "learning_rate": 6.353828859863543e-06, + "logits/chosen": 625781802.6666666, + "logits/rejected": 449948224.0, + "logps/chosen": -470.9971516927083, + "logps/rejected": -297.98504638671875, + "loss": 0.0111, + "rewards/chosen": 4.972279866536458, + "rewards/margins": 11.671010812123615, + "rewards/rejected": -6.698730945587158, + "step": 4530 + }, + { + "epoch": 0.4139789858382823, + "grad_norm": 3.84375, + "kl": 0.0, + "learning_rate": 6.352444712705762e-06, + "logits/chosen": 1096651008.0, + "logits/rejected": 659396608.0, + "logps/chosen": -411.6824544270833, + "logps/rejected": -488.496533203125, + "loss": 0.0113, + "rewards/chosen": 4.077551205952962, + "rewards/margins": 13.932867368062336, + "rewards/rejected": -9.855316162109375, + "step": 4531 + }, + { + "epoch": 0.414070351758794, + "grad_norm": 64.5, + "kl": 0.0, + "learning_rate": 6.351060453713333e-06, + "logits/chosen": 760681523.2, + "logits/rejected": 455510442.6666667, + "logps/chosen": -290.049072265625, + "logps/rejected": -453.1070556640625, + "loss": 0.0702, + "rewards/chosen": 2.941905212402344, + "rewards/margins": 14.799522908528647, + "rewards/rejected": -11.857617696126303, + "step": 4532 + }, + { + "epoch": 0.4141617176793056, + "grad_norm": 2.875, + "kl": 0.0, + "learning_rate": 6.3496760830007235e-06, + "logits/chosen": 708906240.0, + "logits/rejected": 364015872.0, + "logps/chosen": -305.84112548828125, + "logps/rejected": -459.1285095214844, + "loss": 0.0155, + "rewards/chosen": 3.749807119369507, + "rewards/margins": 13.06936240196228, + "rewards/rejected": -9.319555282592773, + "step": 4533 + }, + { + "epoch": 0.4142530835998173, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 6.348291600682404e-06, + "logits/chosen": 619917260.8, + "logits/rejected": 491499093.3333333, + "logps/chosen": -263.17333984375, + "logps/rejected": -370.3177083333333, + "loss": 0.1284, + "rewards/chosen": 2.404261589050293, + "rewards/margins": 11.879989941914877, + "rewards/rejected": -9.475728352864584, + "step": 4534 + }, + { + "epoch": 0.4143444495203289, + "grad_norm": 0.55859375, + "kl": 0.0, + "learning_rate": 6.3469070068728644e-06, + "logits/chosen": 482083104.0, + "logits/rejected": 381361834.6666667, + "logps/chosen": -450.6437072753906, + "logps/rejected": -366.7686360677083, + "loss": 0.0031, + "rewards/chosen": 4.6399431228637695, + "rewards/margins": 12.61158339182536, + "rewards/rejected": -7.971640268961589, + "step": 4535 + }, + { + "epoch": 0.41443581544084057, + "grad_norm": 2.234375, + "kl": 0.0, + "learning_rate": 6.345522301686593e-06, + "logits/chosen": 391598400.0, + "logits/rejected": 571843392.0, + "logps/chosen": -233.35748291015625, + "logps/rejected": -531.794921875, + "loss": 0.0163, + "rewards/chosen": 4.116488933563232, + "rewards/margins": 14.416567325592041, + "rewards/rejected": -10.300078392028809, + "step": 4536 + }, + { + "epoch": 0.4145271813613522, + "grad_norm": 5.15625, + "kl": 0.0, + "learning_rate": 6.344137485238095e-06, + "logits/chosen": 407552960.0, + "logits/rejected": 353284928.0, + "logps/chosen": -216.19869995117188, + "logps/rejected": -300.305908203125, + "loss": 0.0303, + "rewards/chosen": 3.7743773460388184, + "rewards/margins": 10.61518907546997, + "rewards/rejected": -6.840811729431152, + "step": 4537 + }, + { + "epoch": 0.41461854728186387, + "grad_norm": 3.796875, + "kl": 0.0, + "learning_rate": 6.342752557641879e-06, + "logits/chosen": 365820117.3333333, + "logits/rejected": 793844019.2, + "logps/chosen": -183.72395833333334, + "logps/rejected": -554.194580078125, + "loss": 0.0149, + "rewards/chosen": 3.735703468322754, + "rewards/margins": 14.50982837677002, + "rewards/rejected": -10.774124908447266, + "step": 4538 + }, + { + "epoch": 0.4147099132023755, + "grad_norm": 1.5, + "kl": 0.0, + "learning_rate": 6.341367519012469e-06, + "logits/chosen": 443206229.3333333, + "logits/rejected": 484511328.0, + "logps/chosen": -267.6600748697917, + "logps/rejected": -285.80029296875, + "loss": 0.1292, + "rewards/chosen": 3.2413352330525718, + "rewards/margins": 10.829764684041342, + "rewards/rejected": -7.5884294509887695, + "step": 4539 + }, + { + "epoch": 0.41480127912288717, + "grad_norm": 4.625, + "kl": 0.0, + "learning_rate": 6.339982369464394e-06, + "logits/chosen": 247029994.66666666, + "logits/rejected": 491141068.8, + "logps/chosen": -395.2912190755208, + "logps/rejected": -425.6583984375, + "loss": 0.0158, + "rewards/chosen": 4.1276295979817705, + "rewards/margins": 13.475895436604816, + "rewards/rejected": -9.348265838623046, + "step": 4540 + }, + { + "epoch": 0.4148926450433988, + "grad_norm": 2.78125, + "kl": 0.0, + "learning_rate": 6.338597109112191e-06, + "logits/chosen": 302638400.0, + "logits/rejected": 346462080.0, + "logps/chosen": -239.1063690185547, + "logps/rejected": -510.2806396484375, + "loss": 0.0237, + "rewards/chosen": 3.3339638710021973, + "rewards/margins": 12.017889499664307, + "rewards/rejected": -8.68392562866211, + "step": 4541 + }, + { + "epoch": 0.41498401096391047, + "grad_norm": 2.84375, + "kl": 0.0, + "learning_rate": 6.337211738070409e-06, + "logits/chosen": 399891840.0, + "logits/rejected": 510023040.0, + "logps/chosen": -405.5045166015625, + "logps/rejected": -473.8484191894531, + "loss": 0.0158, + "rewards/chosen": 3.504265785217285, + "rewards/margins": 13.15324878692627, + "rewards/rejected": -9.648983001708984, + "step": 4542 + }, + { + "epoch": 0.4150753768844221, + "grad_norm": 3.0625, + "kl": 0.0, + "learning_rate": 6.335826256453606e-06, + "logits/chosen": 390455520.0, + "logits/rejected": 491601728.0, + "logps/chosen": -171.1936492919922, + "logps/rejected": -476.50799560546875, + "loss": 0.0243, + "rewards/chosen": 3.7344112396240234, + "rewards/margins": 11.498111724853516, + "rewards/rejected": -7.763700485229492, + "step": 4543 + }, + { + "epoch": 0.41516674280493376, + "grad_norm": 7.3125, + "kl": 0.0, + "learning_rate": 6.334440664376348e-06, + "logits/chosen": 482041258.6666667, + "logits/rejected": 553797312.0, + "logps/chosen": -401.0734049479167, + "logps/rejected": -460.62225341796875, + "loss": 0.0409, + "rewards/chosen": 3.576377550760905, + "rewards/margins": 9.147753397623697, + "rewards/rejected": -5.571375846862793, + "step": 4544 + }, + { + "epoch": 0.4152581087254454, + "grad_norm": 1.5703125, + "kl": 0.0, + "learning_rate": 6.333054961953212e-06, + "logits/chosen": 283464256.0, + "logits/rejected": 439287808.0, + "logps/chosen": -335.52227783203125, + "logps/rejected": -605.1060384114584, + "loss": 0.0075, + "rewards/chosen": 4.113531589508057, + "rewards/margins": 16.765773614247642, + "rewards/rejected": -12.652242024739584, + "step": 4545 + }, + { + "epoch": 0.41534947464595706, + "grad_norm": 4.125, + "kl": 0.0, + "learning_rate": 6.331669149298781e-06, + "logits/chosen": 730079914.6666666, + "logits/rejected": 509088192.0, + "logps/chosen": -347.2012532552083, + "logps/rejected": -814.3255615234375, + "loss": 0.0302, + "rewards/chosen": 3.8171310424804688, + "rewards/margins": 15.985956192016602, + "rewards/rejected": -12.168825149536133, + "step": 4546 + }, + { + "epoch": 0.4154408405664687, + "grad_norm": 3.375, + "kl": 0.0, + "learning_rate": 6.330283226527649e-06, + "logits/chosen": 428787072.0, + "logits/rejected": 492196576.0, + "logps/chosen": -222.0836181640625, + "logps/rejected": -532.5087890625, + "loss": 0.0209, + "rewards/chosen": 3.363597869873047, + "rewards/margins": 13.06912612915039, + "rewards/rejected": -9.705528259277344, + "step": 4547 + }, + { + "epoch": 0.41553220648698036, + "grad_norm": 3.21875, + "kl": 0.0, + "learning_rate": 6.328897193754422e-06, + "logits/chosen": 394636864.0, + "logits/rejected": 363615680.0, + "logps/chosen": -299.9481201171875, + "logps/rejected": -664.0540161132812, + "loss": 0.0229, + "rewards/chosen": 3.672119617462158, + "rewards/margins": 12.731424808502197, + "rewards/rejected": -9.059305191040039, + "step": 4548 + }, + { + "epoch": 0.415623572407492, + "grad_norm": 4.6875, + "kl": 0.0, + "learning_rate": 6.327511051093707e-06, + "logits/chosen": 674612309.3333334, + "logits/rejected": 502501056.0, + "logps/chosen": -319.6858723958333, + "logps/rejected": -511.57281494140625, + "loss": 0.0334, + "rewards/chosen": 3.5313549041748047, + "rewards/margins": 13.261173248291016, + "rewards/rejected": -9.729818344116211, + "step": 4549 + }, + { + "epoch": 0.41571493832800366, + "grad_norm": 2.5625, + "kl": 0.0, + "learning_rate": 6.326124798660128e-06, + "logits/chosen": 703646933.3333334, + "logits/rejected": 1140502528.0, + "logps/chosen": -269.300048828125, + "logps/rejected": -577.6263671875, + "loss": 0.0168, + "rewards/chosen": 3.5870850880940757, + "rewards/margins": 13.501745732625325, + "rewards/rejected": -9.91466064453125, + "step": 4550 + }, + { + "epoch": 0.4158063042485153, + "grad_norm": 2.453125, + "kl": 0.0, + "learning_rate": 6.324738436568315e-06, + "logits/chosen": 1332557994.6666667, + "logits/rejected": 719226572.8, + "logps/chosen": -344.2970784505208, + "logps/rejected": -324.129052734375, + "loss": 0.0137, + "rewards/chosen": 3.4121840794881186, + "rewards/margins": 10.953328641255696, + "rewards/rejected": -7.541144561767578, + "step": 4551 + }, + { + "epoch": 0.41589767016902696, + "grad_norm": 2.53125, + "kl": 0.0, + "learning_rate": 6.323351964932909e-06, + "logits/chosen": 668395776.0, + "logits/rejected": 689569024.0, + "logps/chosen": -302.26654052734375, + "logps/rejected": -639.559326171875, + "loss": 0.016, + "rewards/chosen": 3.6570396423339844, + "rewards/margins": 15.134361267089844, + "rewards/rejected": -11.47732162475586, + "step": 4552 + }, + { + "epoch": 0.4159890360895386, + "grad_norm": 4.71875, + "kl": 0.0, + "learning_rate": 6.321965383868554e-06, + "logits/chosen": 726921216.0, + "logits/rejected": 360944544.0, + "logps/chosen": -515.2835286458334, + "logps/rejected": -308.0548095703125, + "loss": 0.0344, + "rewards/chosen": 3.0887238184611, + "rewards/margins": 10.939149538675943, + "rewards/rejected": -7.850425720214844, + "step": 4553 + }, + { + "epoch": 0.41608040201005025, + "grad_norm": 4.5625, + "kl": 0.0, + "learning_rate": 6.320578693489913e-06, + "logits/chosen": 587410841.6, + "logits/rejected": 396103893.3333333, + "logps/chosen": -294.1167236328125, + "logps/rejected": -472.2215983072917, + "loss": 0.0326, + "rewards/chosen": 3.0947540283203123, + "rewards/margins": 11.340095647176106, + "rewards/rejected": -8.245341618855795, + "step": 4554 + }, + { + "epoch": 0.4161717679305619, + "grad_norm": 20.875, + "kl": 0.0, + "learning_rate": 6.319191893911647e-06, + "logits/chosen": 511605077.3333333, + "logits/rejected": 580761472.0, + "logps/chosen": -305.4736735026042, + "logps/rejected": -428.73577880859375, + "loss": 0.1477, + "rewards/chosen": 2.4854714075724282, + "rewards/margins": 8.99255625406901, + "rewards/rejected": -6.507084846496582, + "step": 4555 + }, + { + "epoch": 0.41626313385107355, + "grad_norm": 1.6015625, + "kl": 0.0, + "learning_rate": 6.317804985248436e-06, + "logits/chosen": 755628480.0, + "logits/rejected": 567743232.0, + "logps/chosen": -333.1418151855469, + "logps/rejected": -394.9005126953125, + "loss": 0.0067, + "rewards/chosen": 5.044198036193848, + "rewards/margins": 13.592988967895508, + "rewards/rejected": -8.54879093170166, + "step": 4556 + }, + { + "epoch": 0.4163544997715852, + "grad_norm": 2.28125, + "kl": 0.0, + "learning_rate": 6.316417967614961e-06, + "logits/chosen": 864044288.0, + "logits/rejected": 674935040.0, + "logps/chosen": -347.6575520833333, + "logps/rejected": -572.8397216796875, + "loss": 0.0166, + "rewards/chosen": 3.8527088165283203, + "rewards/margins": 14.56259822845459, + "rewards/rejected": -10.70988941192627, + "step": 4557 + }, + { + "epoch": 0.41644586569209685, + "grad_norm": 1.359375, + "kl": 0.0, + "learning_rate": 6.315030841125917e-06, + "logits/chosen": 276161877.3333333, + "logits/rejected": 487207424.0, + "logps/chosen": -161.48947143554688, + "logps/rejected": -840.87646484375, + "loss": 0.0086, + "rewards/chosen": 4.987771670023601, + "rewards/margins": 15.697753969828288, + "rewards/rejected": -10.709982299804688, + "step": 4558 + }, + { + "epoch": 0.41653723161260847, + "grad_norm": 2.875, + "kl": 0.0, + "learning_rate": 6.313643605896006e-06, + "logits/chosen": 381652633.6, + "logits/rejected": 497791317.3333333, + "logps/chosen": -299.635400390625, + "logps/rejected": -426.2358805338542, + "loss": 0.0188, + "rewards/chosen": 4.250167465209961, + "rewards/margins": 15.155198033650716, + "rewards/rejected": -10.905030568440756, + "step": 4559 + }, + { + "epoch": 0.41662859753312015, + "grad_norm": 9.625, + "kl": 0.0, + "learning_rate": 6.312256262039941e-06, + "logits/chosen": 355454656.0, + "logits/rejected": 757754624.0, + "logps/chosen": -114.07667541503906, + "logps/rejected": -364.422119140625, + "loss": 0.0569, + "rewards/chosen": 4.110965728759766, + "rewards/margins": 10.832106590270996, + "rewards/rejected": -6.7211408615112305, + "step": 4560 + }, + { + "epoch": 0.41671996345363177, + "grad_norm": 25.5, + "kl": 0.0, + "learning_rate": 6.3108688096724415e-06, + "logits/chosen": 437076181.3333333, + "logits/rejected": 546818304.0, + "logps/chosen": -333.83725992838544, + "logps/rejected": -457.85146484375, + "loss": 0.089, + "rewards/chosen": 2.1120173136393228, + "rewards/margins": 11.306208292643229, + "rewards/rejected": -9.194190979003906, + "step": 4561 + }, + { + "epoch": 0.41681132937414345, + "grad_norm": 47.75, + "kl": 0.0, + "learning_rate": 6.309481248908235e-06, + "logits/chosen": 507048746.6666667, + "logits/rejected": 456016128.0, + "logps/chosen": -398.4240315755208, + "logps/rejected": -450.611376953125, + "loss": 0.0745, + "rewards/chosen": 2.157193978627523, + "rewards/margins": 11.153298409779866, + "rewards/rejected": -8.996104431152343, + "step": 4562 + }, + { + "epoch": 0.41690269529465507, + "grad_norm": 6.96875, + "kl": 0.0, + "learning_rate": 6.308093579862065e-06, + "logits/chosen": 494990080.0, + "logits/rejected": 604973760.0, + "logps/chosen": -313.9035237630208, + "logps/rejected": -510.2896728515625, + "loss": 0.0554, + "rewards/chosen": 2.613730271657308, + "rewards/margins": 10.399210294087728, + "rewards/rejected": -7.78548002243042, + "step": 4563 + }, + { + "epoch": 0.41699406121516674, + "grad_norm": 2.0625, + "kl": 0.0, + "learning_rate": 6.306705802648674e-06, + "logits/chosen": 438403123.2, + "logits/rejected": 317107754.6666667, + "logps/chosen": -278.568798828125, + "logps/rejected": -561.3503824869791, + "loss": 0.0115, + "rewards/chosen": 4.418827056884766, + "rewards/margins": 14.326705932617188, + "rewards/rejected": -9.907878875732422, + "step": 4564 + }, + { + "epoch": 0.41708542713567837, + "grad_norm": 1.53125, + "kl": 0.0, + "learning_rate": 6.305317917382821e-06, + "logits/chosen": 564867456.0, + "logits/rejected": 310084672.0, + "logps/chosen": -285.6788024902344, + "logps/rejected": -387.195556640625, + "loss": 0.0093, + "rewards/chosen": 4.099982261657715, + "rewards/margins": 13.913224220275879, + "rewards/rejected": -9.813241958618164, + "step": 4565 + }, + { + "epoch": 0.41717679305619004, + "grad_norm": 2.8125, + "kl": 0.0, + "learning_rate": 6.303929924179267e-06, + "logits/chosen": 593566464.0, + "logits/rejected": 852057728.0, + "logps/chosen": -297.7724304199219, + "logps/rejected": -715.9576416015625, + "loss": 0.0168, + "rewards/chosen": 3.65985107421875, + "rewards/margins": 12.965124130249023, + "rewards/rejected": -9.305273056030273, + "step": 4566 + }, + { + "epoch": 0.41726815897670166, + "grad_norm": 12.4375, + "kl": 0.0, + "learning_rate": 6.3025418231527946e-06, + "logits/chosen": 618600064.0, + "logits/rejected": 697195739.4285715, + "logps/chosen": -180.60598754882812, + "logps/rejected": -255.73944963727678, + "loss": 0.0154, + "rewards/chosen": 3.8997132778167725, + "rewards/margins": 10.946898903165545, + "rewards/rejected": -7.047185625348773, + "step": 4567 + }, + { + "epoch": 0.41735952489721334, + "grad_norm": 2.0625, + "kl": 0.0, + "learning_rate": 6.301153614418177e-06, + "logits/chosen": 288632678.4, + "logits/rejected": 296875648.0, + "logps/chosen": -239.099951171875, + "logps/rejected": -427.6034749348958, + "loss": 0.0118, + "rewards/chosen": 4.462881469726563, + "rewards/margins": 11.432998402913412, + "rewards/rejected": -6.970116933186849, + "step": 4568 + }, + { + "epoch": 0.41745089081772496, + "grad_norm": 2.859375, + "kl": 0.0, + "learning_rate": 6.2997652980902144e-06, + "logits/chosen": 548043110.4, + "logits/rejected": 479837482.6666667, + "logps/chosen": -314.65830078125, + "logps/rejected": -506.8149007161458, + "loss": 0.0194, + "rewards/chosen": 3.8367027282714843, + "rewards/margins": 14.82788314819336, + "rewards/rejected": -10.991180419921875, + "step": 4569 + }, + { + "epoch": 0.41754225673823664, + "grad_norm": 1.8515625, + "kl": 0.0, + "learning_rate": 6.298376874283701e-06, + "logits/chosen": 659198080.0, + "logits/rejected": 285765248.0, + "logps/chosen": -384.2559509277344, + "logps/rejected": -347.5693054199219, + "loss": 0.0079, + "rewards/chosen": 4.48677396774292, + "rewards/margins": 15.142514705657959, + "rewards/rejected": -10.655740737915039, + "step": 4570 + }, + { + "epoch": 0.41763362265874826, + "grad_norm": 3.90625, + "kl": 0.0, + "learning_rate": 6.296988343113453e-06, + "logits/chosen": 552942272.0, + "logits/rejected": 576965760.0, + "logps/chosen": -215.599365234375, + "logps/rejected": -432.08929443359375, + "loss": 0.0276, + "rewards/chosen": 2.9365055561065674, + "rewards/margins": 11.979995012283325, + "rewards/rejected": -9.043489456176758, + "step": 4571 + }, + { + "epoch": 0.41772498857925994, + "grad_norm": 1.140625, + "kl": 0.0, + "learning_rate": 6.295599704694284e-06, + "logits/chosen": 682145024.0, + "logits/rejected": 596609894.4, + "logps/chosen": -256.04750569661456, + "logps/rejected": -625.5779296875, + "loss": 0.012, + "rewards/chosen": 3.910313924153646, + "rewards/margins": 13.885767110188803, + "rewards/rejected": -9.975453186035157, + "step": 4572 + }, + { + "epoch": 0.41781635449977156, + "grad_norm": 3.078125, + "kl": 0.0, + "learning_rate": 6.294210959141024e-06, + "logits/chosen": 552663296.0, + "logits/rejected": 703047232.0, + "logps/chosen": -467.2187805175781, + "logps/rejected": -551.1071166992188, + "loss": 0.0147, + "rewards/chosen": 4.25759220123291, + "rewards/margins": 12.344654083251953, + "rewards/rejected": -8.087061882019043, + "step": 4573 + }, + { + "epoch": 0.41790772042028324, + "grad_norm": 3.0, + "kl": 0.0, + "learning_rate": 6.292822106568508e-06, + "logits/chosen": 306403788.8, + "logits/rejected": 394820693.3333333, + "logps/chosen": -350.8976318359375, + "logps/rejected": -540.8016764322916, + "loss": 0.0197, + "rewards/chosen": 3.565739059448242, + "rewards/margins": 12.650856399536133, + "rewards/rejected": -9.08511734008789, + "step": 4574 + }, + { + "epoch": 0.41799908634079486, + "grad_norm": 1.9375, + "kl": 0.0, + "learning_rate": 6.291433147091583e-06, + "logits/chosen": 455649792.0, + "logits/rejected": 294768224.0, + "logps/chosen": -379.50323486328125, + "logps/rejected": -485.3603820800781, + "loss": 0.0093, + "rewards/chosen": 4.654605388641357, + "rewards/margins": 15.145270824432373, + "rewards/rejected": -10.490665435791016, + "step": 4575 + }, + { + "epoch": 0.41809045226130653, + "grad_norm": 13.0625, + "kl": 0.0, + "learning_rate": 6.290044080825103e-06, + "logits/chosen": 886281728.0, + "logits/rejected": 592549376.0, + "logps/chosen": -497.39296875, + "logps/rejected": -666.27001953125, + "loss": 0.0141, + "rewards/chosen": 4.7391822814941404, + "rewards/margins": 13.139330673217774, + "rewards/rejected": -8.400148391723633, + "step": 4576 + }, + { + "epoch": 0.41818181818181815, + "grad_norm": 38.0, + "kl": 0.0, + "learning_rate": 6.288654907883928e-06, + "logits/chosen": 469241088.0, + "logits/rejected": 865684544.0, + "logps/chosen": -257.2436218261719, + "logps/rejected": -648.1883544921875, + "loss": 0.0356, + "rewards/chosen": 3.2917637825012207, + "rewards/margins": 11.901297092437744, + "rewards/rejected": -8.609533309936523, + "step": 4577 + }, + { + "epoch": 0.41827318410232983, + "grad_norm": 3.515625, + "kl": 0.0, + "learning_rate": 6.287265628382933e-06, + "logits/chosen": 1260983296.0, + "logits/rejected": 619730358.8571428, + "logps/chosen": -75.66160583496094, + "logps/rejected": -480.88936941964283, + "loss": 0.0185, + "rewards/chosen": 1.8399101495742798, + "rewards/margins": 10.148361665861946, + "rewards/rejected": -8.308451516287667, + "step": 4578 + }, + { + "epoch": 0.41836455002284145, + "grad_norm": 3.078125, + "kl": 0.0, + "learning_rate": 6.285876242436996e-06, + "logits/chosen": 316236032.0, + "logits/rejected": 332935466.6666667, + "logps/chosen": -247.38275146484375, + "logps/rejected": -559.7962239583334, + "loss": 0.0117, + "rewards/chosen": 3.7326507568359375, + "rewards/margins": 13.98562494913737, + "rewards/rejected": -10.252974192301432, + "step": 4579 + }, + { + "epoch": 0.41845591594335313, + "grad_norm": 3.890625, + "kl": 0.0, + "learning_rate": 6.284486750161009e-06, + "logits/chosen": 604736640.0, + "logits/rejected": 503085120.0, + "logps/chosen": -284.5167236328125, + "logps/rejected": -582.81396484375, + "loss": 0.1284, + "rewards/chosen": 3.058137575785319, + "rewards/margins": 11.038139502207438, + "rewards/rejected": -7.980001926422119, + "step": 4580 + }, + { + "epoch": 0.41854728186386475, + "grad_norm": 3.921875, + "kl": 0.0, + "learning_rate": 6.283097151669869e-06, + "logits/chosen": 561948569.6, + "logits/rejected": 517910272.0, + "logps/chosen": -381.4111083984375, + "logps/rejected": -618.7117919921875, + "loss": 0.0287, + "rewards/chosen": 3.35263671875, + "rewards/margins": 13.416905466715495, + "rewards/rejected": -10.064268747965494, + "step": 4581 + }, + { + "epoch": 0.4186386477843764, + "grad_norm": 61.75, + "kl": 0.0, + "learning_rate": 6.281707447078483e-06, + "logits/chosen": 505895509.3333333, + "logits/rejected": 723837248.0, + "logps/chosen": -304.43674723307294, + "logps/rejected": -430.2439880371094, + "loss": 0.1503, + "rewards/chosen": 2.6242502530415854, + "rewards/margins": 9.620911439259848, + "rewards/rejected": -6.996661186218262, + "step": 4582 + }, + { + "epoch": 0.4187300137048881, + "grad_norm": 3.421875, + "kl": 0.0, + "learning_rate": 6.280317636501765e-06, + "logits/chosen": 613261184.0, + "logits/rejected": 600799488.0, + "logps/chosen": -332.66900634765625, + "logps/rejected": -467.90606689453125, + "loss": 0.0305, + "rewards/chosen": 3.5423333644866943, + "rewards/margins": 10.894567251205444, + "rewards/rejected": -7.35223388671875, + "step": 4583 + }, + { + "epoch": 0.4188213796253997, + "grad_norm": 10.125, + "kl": 0.0, + "learning_rate": 6.278927720054642e-06, + "logits/chosen": 463786336.0, + "logits/rejected": 322527584.0, + "logps/chosen": -379.2812194824219, + "logps/rejected": -335.112548828125, + "loss": 0.0354, + "rewards/chosen": 3.241269588470459, + "rewards/margins": 9.854710102081299, + "rewards/rejected": -6.61344051361084, + "step": 4584 + }, + { + "epoch": 0.4189127455459114, + "grad_norm": 2.0625, + "kl": 0.0, + "learning_rate": 6.277537697852045e-06, + "logits/chosen": 534083737.6, + "logits/rejected": 655266090.6666666, + "logps/chosen": -325.36591796875, + "logps/rejected": -504.5858561197917, + "loss": 0.0164, + "rewards/chosen": 3.635303497314453, + "rewards/margins": 13.45810317993164, + "rewards/rejected": -9.822799682617188, + "step": 4585 + }, + { + "epoch": 0.419004111466423, + "grad_norm": 1.015625, + "kl": 0.0, + "learning_rate": 6.276147570008917e-06, + "logits/chosen": 369258432.0, + "logits/rejected": 1105423872.0, + "logps/chosen": -114.37749481201172, + "logps/rejected": -462.263671875, + "loss": 0.1448, + "rewards/chosen": 2.188533306121826, + "rewards/margins": 10.489213466644287, + "rewards/rejected": -8.300680160522461, + "step": 4586 + }, + { + "epoch": 0.4190954773869347, + "grad_norm": 2.171875, + "kl": 0.0, + "learning_rate": 6.274757336640209e-06, + "logits/chosen": 453173184.0, + "logits/rejected": 476637312.0, + "logps/chosen": -426.98553466796875, + "logps/rejected": -404.3134765625, + "loss": 0.0103, + "rewards/chosen": 4.418386459350586, + "rewards/margins": 13.599024772644043, + "rewards/rejected": -9.180638313293457, + "step": 4587 + }, + { + "epoch": 0.4191868433074463, + "grad_norm": 4.09375, + "kl": 0.0, + "learning_rate": 6.2733669978608815e-06, + "logits/chosen": 521604736.0, + "logits/rejected": 509029273.6, + "logps/chosen": -250.33772786458334, + "logps/rejected": -418.89453125, + "loss": 0.0169, + "rewards/chosen": 3.632887840270996, + "rewards/margins": 12.985704612731933, + "rewards/rejected": -9.352816772460937, + "step": 4588 + }, + { + "epoch": 0.419278209227958, + "grad_norm": 4.03125, + "kl": 0.0, + "learning_rate": 6.2719765537858986e-06, + "logits/chosen": 401056682.6666667, + "logits/rejected": 503752601.6, + "logps/chosen": -411.5819091796875, + "logps/rejected": -360.1282470703125, + "loss": 0.017, + "rewards/chosen": 4.432427724202474, + "rewards/margins": 12.342185719807944, + "rewards/rejected": -7.909757995605469, + "step": 4589 + }, + { + "epoch": 0.4193695751484696, + "grad_norm": 4.0625, + "kl": 0.0, + "learning_rate": 6.270586004530243e-06, + "logits/chosen": 563698688.0, + "logits/rejected": 803699797.3333334, + "logps/chosen": -387.9373046875, + "logps/rejected": -843.2913411458334, + "loss": 0.0281, + "rewards/chosen": 3.5357059478759765, + "rewards/margins": 18.7091983795166, + "rewards/rejected": -15.173492431640625, + "step": 4590 + }, + { + "epoch": 0.4194609410689813, + "grad_norm": 2.0625, + "kl": 0.0, + "learning_rate": 6.269195350208894e-06, + "logits/chosen": 727744614.4, + "logits/rejected": 752877226.6666666, + "logps/chosen": -290.618798828125, + "logps/rejected": -580.4646402994791, + "loss": 0.0131, + "rewards/chosen": 4.298300933837891, + "rewards/margins": 14.056781005859374, + "rewards/rejected": -9.758480072021484, + "step": 4591 + }, + { + "epoch": 0.4195523069894929, + "grad_norm": 2.078125, + "kl": 0.0, + "learning_rate": 6.267804590936851e-06, + "logits/chosen": 680642218.6666666, + "logits/rejected": 607158630.4, + "logps/chosen": -224.48189290364584, + "logps/rejected": -578.363037109375, + "loss": 0.01, + "rewards/chosen": 3.732203165690104, + "rewards/margins": 12.572216288248697, + "rewards/rejected": -8.840013122558593, + "step": 4592 + }, + { + "epoch": 0.4196436729100046, + "grad_norm": 5.40625, + "kl": 0.0, + "learning_rate": 6.266413726829115e-06, + "logits/chosen": 525808640.0, + "logits/rejected": 705064192.0, + "logps/chosen": -270.3001953125, + "logps/rejected": -473.1298014322917, + "loss": 0.1289, + "rewards/chosen": 3.1100847244262697, + "rewards/margins": 11.576406033833823, + "rewards/rejected": -8.466321309407553, + "step": 4593 + }, + { + "epoch": 0.4197350388305162, + "grad_norm": 3.203125, + "kl": 0.0, + "learning_rate": 6.265022758000697e-06, + "logits/chosen": 272635200.0, + "logits/rejected": 401653606.4, + "logps/chosen": -212.21964518229166, + "logps/rejected": -606.626953125, + "loss": 0.0128, + "rewards/chosen": 3.9746058781941733, + "rewards/margins": 13.251604779561362, + "rewards/rejected": -9.276998901367188, + "step": 4594 + }, + { + "epoch": 0.4198264047510279, + "grad_norm": 4.65625, + "kl": 0.0, + "learning_rate": 6.2636316845666165e-06, + "logits/chosen": 595738922.6666666, + "logits/rejected": 749021747.2, + "logps/chosen": -276.295166015625, + "logps/rejected": -620.7302734375, + "loss": 0.024, + "rewards/chosen": 3.354841868082682, + "rewards/margins": 13.050058237711587, + "rewards/rejected": -9.695216369628906, + "step": 4595 + }, + { + "epoch": 0.4199177706715395, + "grad_norm": 25.875, + "kl": 0.0, + "learning_rate": 6.2622405066419046e-06, + "logits/chosen": 363028224.0, + "logits/rejected": 358417536.0, + "logps/chosen": -142.59853515625, + "logps/rejected": -483.7225748697917, + "loss": 0.0679, + "rewards/chosen": 2.629836654663086, + "rewards/margins": 13.721201197306314, + "rewards/rejected": -11.091364542643229, + "step": 4596 + }, + { + "epoch": 0.4200091365920512, + "grad_norm": 0.8828125, + "kl": 0.0, + "learning_rate": 6.2608492243415975e-06, + "logits/chosen": 1105798144.0, + "logits/rejected": 848328806.4, + "logps/chosen": -247.290283203125, + "logps/rejected": -785.66416015625, + "loss": 0.0057, + "rewards/chosen": 4.248238245646159, + "rewards/margins": 16.473076502482098, + "rewards/rejected": -12.224838256835938, + "step": 4597 + }, + { + "epoch": 0.4201005025125628, + "grad_norm": 1.1640625, + "kl": 0.0, + "learning_rate": 6.259457837780741e-06, + "logits/chosen": 454894336.0, + "logits/rejected": 499128320.0, + "logps/chosen": -70.94227600097656, + "logps/rejected": -542.0638020833334, + "loss": 0.0093, + "rewards/chosen": 3.315796136856079, + "rewards/margins": 11.569506565729776, + "rewards/rejected": -8.253710428873697, + "step": 4598 + }, + { + "epoch": 0.4201918684330745, + "grad_norm": 61.75, + "kl": 0.0, + "learning_rate": 6.258066347074392e-06, + "logits/chosen": 629719808.0, + "logits/rejected": 584574122.6666666, + "logps/chosen": -274.908642578125, + "logps/rejected": -494.5440266927083, + "loss": 0.0889, + "rewards/chosen": 2.409640884399414, + "rewards/margins": 11.940336481730142, + "rewards/rejected": -9.530695597330729, + "step": 4599 + }, + { + "epoch": 0.4202832343535861, + "grad_norm": 2.3125, + "kl": 0.0, + "learning_rate": 6.25667475233761e-06, + "logits/chosen": 316091552.0, + "logits/rejected": 764612736.0, + "logps/chosen": -219.43954467773438, + "logps/rejected": -640.191650390625, + "loss": 0.0136, + "rewards/chosen": 4.0413713455200195, + "rewards/margins": 13.78453254699707, + "rewards/rejected": -9.74316120147705, + "step": 4600 + }, + { + "epoch": 0.4203746002740978, + "grad_norm": 3.625, + "kl": 0.0, + "learning_rate": 6.25528305368547e-06, + "logits/chosen": 530551142.4, + "logits/rejected": 818140501.3333334, + "logps/chosen": -193.6365478515625, + "logps/rejected": -839.4522298177084, + "loss": 0.032, + "rewards/chosen": 3.5025283813476564, + "rewards/margins": 13.2635617574056, + "rewards/rejected": -9.761033376057943, + "step": 4601 + }, + { + "epoch": 0.4204659661946094, + "grad_norm": 1.859375, + "kl": 0.0, + "learning_rate": 6.253891251233053e-06, + "logits/chosen": 299547136.0, + "logits/rejected": 187931541.33333334, + "logps/chosen": -231.78916015625, + "logps/rejected": -378.4121500651042, + "loss": 0.0109, + "rewards/chosen": 4.698324584960938, + "rewards/margins": 14.904375457763672, + "rewards/rejected": -10.206050872802734, + "step": 4602 + }, + { + "epoch": 0.4205573321151211, + "grad_norm": 5.96875, + "kl": 0.0, + "learning_rate": 6.2524993450954465e-06, + "logits/chosen": 511938112.0, + "logits/rejected": 438304597.3333333, + "logps/chosen": -193.9616241455078, + "logps/rejected": -434.7351888020833, + "loss": 0.0337, + "rewards/chosen": 1.9975059032440186, + "rewards/margins": 10.927373965581259, + "rewards/rejected": -8.92986806233724, + "step": 4603 + }, + { + "epoch": 0.4206486980356327, + "grad_norm": 6.75, + "kl": 0.0, + "learning_rate": 6.251107335387748e-06, + "logits/chosen": 357994026.6666667, + "logits/rejected": 659806720.0, + "logps/chosen": -294.3189697265625, + "logps/rejected": -469.82501220703125, + "loss": 0.0357, + "rewards/chosen": 3.8508682250976562, + "rewards/margins": 12.336448669433594, + "rewards/rejected": -8.485580444335938, + "step": 4604 + }, + { + "epoch": 0.4207400639561444, + "grad_norm": 5.03125, + "kl": 0.0, + "learning_rate": 6.249715222225066e-06, + "logits/chosen": 552978995.2, + "logits/rejected": 721626965.3333334, + "logps/chosen": -330.18642578125, + "logps/rejected": -601.6980794270834, + "loss": 0.0396, + "rewards/chosen": 2.846193313598633, + "rewards/margins": 11.928410212198893, + "rewards/rejected": -9.08221689860026, + "step": 4605 + }, + { + "epoch": 0.420831429876656, + "grad_norm": 26.75, + "kl": 0.0, + "learning_rate": 6.248323005722513e-06, + "logits/chosen": 228136106.66666666, + "logits/rejected": 377948108.8, + "logps/chosen": -289.3939208984375, + "logps/rejected": -407.7146484375, + "loss": 0.0331, + "rewards/chosen": 4.546457926432292, + "rewards/margins": 10.954019419352214, + "rewards/rejected": -6.407561492919922, + "step": 4606 + }, + { + "epoch": 0.4209227957971677, + "grad_norm": 2.5625, + "kl": 0.0, + "learning_rate": 6.246930685995215e-06, + "logits/chosen": 546170496.0, + "logits/rejected": 431965056.0, + "logps/chosen": -332.93798828125, + "logps/rejected": -517.8203735351562, + "loss": 0.0211, + "rewards/chosen": 3.2389140129089355, + "rewards/margins": 13.03507375717163, + "rewards/rejected": -9.796159744262695, + "step": 4607 + }, + { + "epoch": 0.4210141617176793, + "grad_norm": 1.109375, + "kl": 0.0, + "learning_rate": 6.2455382631583005e-06, + "logits/chosen": 477238688.0, + "logits/rejected": 398902357.3333333, + "logps/chosen": -284.275634765625, + "logps/rejected": -367.7762451171875, + "loss": 0.007, + "rewards/chosen": 3.609832763671875, + "rewards/margins": 12.177165349324545, + "rewards/rejected": -8.56733258565267, + "step": 4608 + }, + { + "epoch": 0.421105527638191, + "grad_norm": 1.0, + "kl": 0.0, + "learning_rate": 6.2441457373269135e-06, + "logits/chosen": 430135253.3333333, + "logits/rejected": 728973260.8, + "logps/chosen": -323.7974446614583, + "logps/rejected": -841.50146484375, + "loss": 0.005, + "rewards/chosen": 4.50694465637207, + "rewards/margins": 13.432651138305664, + "rewards/rejected": -8.925706481933593, + "step": 4609 + }, + { + "epoch": 0.4211968935587026, + "grad_norm": 0.59765625, + "kl": 0.0, + "learning_rate": 6.2427531086162e-06, + "logits/chosen": 432648000.0, + "logits/rejected": 416912810.6666667, + "logps/chosen": -437.02490234375, + "logps/rejected": -463.9662679036458, + "loss": 0.0023, + "rewards/chosen": 5.50689697265625, + "rewards/margins": 14.302112579345703, + "rewards/rejected": -8.795215606689453, + "step": 4610 + }, + { + "epoch": 0.4212882594792143, + "grad_norm": 3.015625, + "kl": 0.0, + "learning_rate": 6.241360377141319e-06, + "logits/chosen": 522414250.6666667, + "logits/rejected": 435358528.0, + "logps/chosen": -291.1396077473958, + "logps/rejected": -892.2381591796875, + "loss": 0.0197, + "rewards/chosen": 3.8562533060709634, + "rewards/margins": 13.153351465861002, + "rewards/rejected": -9.297098159790039, + "step": 4611 + }, + { + "epoch": 0.4213796253997259, + "grad_norm": 2.515625, + "kl": 0.0, + "learning_rate": 6.239967543017435e-06, + "logits/chosen": 713035212.8, + "logits/rejected": 323654592.0, + "logps/chosen": -421.690185546875, + "logps/rejected": -294.62872314453125, + "loss": 0.0175, + "rewards/chosen": 3.6603702545166015, + "rewards/margins": 10.495860163370768, + "rewards/rejected": -6.835489908854167, + "step": 4612 + }, + { + "epoch": 0.4214709913202376, + "grad_norm": 0.9765625, + "kl": 0.0, + "learning_rate": 6.238574606359727e-06, + "logits/chosen": 824761088.0, + "logits/rejected": 371829162.6666667, + "logps/chosen": -500.67022705078125, + "logps/rejected": -581.3306070963541, + "loss": 0.0042, + "rewards/chosen": 4.174452304840088, + "rewards/margins": 13.93935759862264, + "rewards/rejected": -9.764905293782553, + "step": 4613 + }, + { + "epoch": 0.4215623572407492, + "grad_norm": 1.5, + "kl": 0.0, + "learning_rate": 6.237181567283372e-06, + "logits/chosen": 383221568.0, + "logits/rejected": 499508064.0, + "logps/chosen": -305.1274108886719, + "logps/rejected": -520.4147338867188, + "loss": 0.0073, + "rewards/chosen": 4.987271785736084, + "rewards/margins": 15.064174175262451, + "rewards/rejected": -10.076902389526367, + "step": 4614 + }, + { + "epoch": 0.4216537231612609, + "grad_norm": 2.765625, + "kl": 0.0, + "learning_rate": 6.235788425903565e-06, + "logits/chosen": 493681600.0, + "logits/rejected": 365638784.0, + "logps/chosen": -294.1899108886719, + "logps/rejected": -321.19805908203125, + "loss": 0.0138, + "rewards/chosen": 4.204965591430664, + "rewards/margins": 12.186760425567627, + "rewards/rejected": -7.981794834136963, + "step": 4615 + }, + { + "epoch": 0.4217450890817725, + "grad_norm": 3.84375, + "kl": 0.0, + "learning_rate": 6.234395182335505e-06, + "logits/chosen": 1094421120.0, + "logits/rejected": 580519424.0, + "logps/chosen": -438.43511962890625, + "logps/rejected": -382.3373107910156, + "loss": 0.0174, + "rewards/chosen": 3.9280388355255127, + "rewards/margins": 13.122390031814575, + "rewards/rejected": -9.194351196289062, + "step": 4616 + }, + { + "epoch": 0.42183645500228417, + "grad_norm": 2.8125, + "kl": 0.0, + "learning_rate": 6.233001836694399e-06, + "logits/chosen": 433810227.2, + "logits/rejected": 745376682.6666666, + "logps/chosen": -227.2211181640625, + "logps/rejected": -452.8833821614583, + "loss": 0.0221, + "rewards/chosen": 3.8049739837646483, + "rewards/margins": 11.849090321858725, + "rewards/rejected": -8.044116338094076, + "step": 4617 + }, + { + "epoch": 0.4219278209227958, + "grad_norm": 11.3125, + "kl": 0.0, + "learning_rate": 6.231608389095467e-06, + "logits/chosen": 598477738.6666666, + "logits/rejected": 380393952.0, + "logps/chosen": -176.4832763671875, + "logps/rejected": -389.4657287597656, + "loss": 0.0646, + "rewards/chosen": 3.306491216023763, + "rewards/margins": 11.114962418874105, + "rewards/rejected": -7.808471202850342, + "step": 4618 + }, + { + "epoch": 0.42201918684330747, + "grad_norm": 0.185546875, + "kl": 0.0, + "learning_rate": 6.230214839653932e-06, + "logits/chosen": 228208592.0, + "logits/rejected": 447419392.0, + "logps/chosen": -120.15324401855469, + "logps/rejected": -371.39163643973217, + "loss": 0.001, + "rewards/chosen": 5.3274827003479, + "rewards/margins": 13.481409004756383, + "rewards/rejected": -8.153926304408483, + "step": 4619 + }, + { + "epoch": 0.4221105527638191, + "grad_norm": 1.2421875, + "kl": 0.0, + "learning_rate": 6.228821188485028e-06, + "logits/chosen": 475300693.3333333, + "logits/rejected": 277183488.0, + "logps/chosen": -268.20302327473956, + "logps/rejected": -295.1900329589844, + "loss": 0.0087, + "rewards/chosen": 4.760146458943685, + "rewards/margins": 13.120588620503742, + "rewards/rejected": -8.360442161560059, + "step": 4620 + }, + { + "epoch": 0.42220191868433077, + "grad_norm": 0.0419921875, + "kl": 0.0, + "learning_rate": 6.227427435703997e-06, + "logits/rejected": 605538496.0, + "logps/rejected": -623.301025390625, + "loss": 0.0002, + "rewards/rejected": -9.410636901855469, + "step": 4621 + }, + { + "epoch": 0.4222932846048424, + "grad_norm": 1.6640625, + "kl": 0.0, + "learning_rate": 6.22603358142609e-06, + "logits/chosen": 707413333.3333334, + "logits/rejected": 346570009.6, + "logps/chosen": -236.7575887044271, + "logps/rejected": -492.5884765625, + "loss": 0.014, + "rewards/chosen": 3.981396039326986, + "rewards/margins": 12.918325360616048, + "rewards/rejected": -8.936929321289062, + "step": 4622 + }, + { + "epoch": 0.42238465052535407, + "grad_norm": 4.21875, + "kl": 0.0, + "learning_rate": 6.224639625766563e-06, + "logits/chosen": 759706931.2, + "logits/rejected": 557538389.3333334, + "logps/chosen": -230.968359375, + "logps/rejected": -638.0342610677084, + "loss": 0.041, + "rewards/chosen": 3.1666259765625, + "rewards/margins": 12.805107116699219, + "rewards/rejected": -9.638481140136719, + "step": 4623 + }, + { + "epoch": 0.4224760164458657, + "grad_norm": 5.03125, + "kl": 0.0, + "learning_rate": 6.223245568840687e-06, + "logits/chosen": 322681408.0, + "logits/rejected": 453617254.4, + "logps/chosen": -215.88846842447916, + "logps/rejected": -460.95576171875, + "loss": 0.0345, + "rewards/chosen": 2.54604705174764, + "rewards/margins": 11.706937249501546, + "rewards/rejected": -9.160890197753906, + "step": 4624 + }, + { + "epoch": 0.42256738236637736, + "grad_norm": 1.859375, + "kl": 0.0, + "learning_rate": 6.221851410763735e-06, + "logits/chosen": 831403840.0, + "logits/rejected": 317341888.0, + "logps/chosen": -257.53546142578125, + "logps/rejected": -389.0282389322917, + "loss": 0.0141, + "rewards/chosen": 2.885715961456299, + "rewards/margins": 11.97070042292277, + "rewards/rejected": -9.08498446146647, + "step": 4625 + }, + { + "epoch": 0.422658748286889, + "grad_norm": 2.40625, + "kl": 0.0, + "learning_rate": 6.220457151650994e-06, + "logits/chosen": 849267660.8, + "logits/rejected": 516490922.6666667, + "logps/chosen": -435.13251953125, + "logps/rejected": -510.0879720052083, + "loss": 0.0154, + "rewards/chosen": 4.114696884155274, + "rewards/margins": 11.628204727172852, + "rewards/rejected": -7.513507843017578, + "step": 4626 + }, + { + "epoch": 0.42275011420740066, + "grad_norm": 3.4375, + "kl": 0.0, + "learning_rate": 6.219062791617753e-06, + "logits/chosen": 445421440.0, + "logits/rejected": 513130624.0, + "logps/chosen": -458.6860656738281, + "logps/rejected": -243.56060791015625, + "loss": 0.0222, + "rewards/chosen": 3.106574296951294, + "rewards/margins": 11.532443284988403, + "rewards/rejected": -8.42586898803711, + "step": 4627 + }, + { + "epoch": 0.4228414801279123, + "grad_norm": 1.4296875, + "kl": 0.0, + "learning_rate": 6.217668330779314e-06, + "logits/chosen": 685209472.0, + "logits/rejected": 498277973.3333333, + "logps/chosen": -393.26348876953125, + "logps/rejected": -496.3352864583333, + "loss": 0.007, + "rewards/chosen": 3.729684352874756, + "rewards/margins": 12.008836269378662, + "rewards/rejected": -8.279151916503906, + "step": 4628 + }, + { + "epoch": 0.42293284604842396, + "grad_norm": 2.390625, + "kl": 0.0, + "learning_rate": 6.216273769250987e-06, + "logits/chosen": 599295027.2, + "logits/rejected": 527413760.0, + "logps/chosen": -304.708984375, + "logps/rejected": -474.386962890625, + "loss": 0.0182, + "rewards/chosen": 4.034893035888672, + "rewards/margins": 11.56601397196452, + "rewards/rejected": -7.531120936075847, + "step": 4629 + }, + { + "epoch": 0.4230242119689356, + "grad_norm": 8.5625, + "kl": 0.0, + "learning_rate": 6.214879107148088e-06, + "logits/chosen": 541374208.0, + "logits/rejected": 394747968.0, + "logps/chosen": -329.09674072265625, + "logps/rejected": -611.44970703125, + "loss": 0.0372, + "rewards/chosen": 2.962855815887451, + "rewards/margins": 13.164927005767822, + "rewards/rejected": -10.202071189880371, + "step": 4630 + }, + { + "epoch": 0.42311557788944726, + "grad_norm": 0.9765625, + "kl": 0.0, + "learning_rate": 6.213484344585942e-06, + "logits/chosen": 450853273.6, + "logits/rejected": 877784746.6666666, + "logps/chosen": -208.814794921875, + "logps/rejected": -821.16845703125, + "loss": 0.1309, + "rewards/chosen": 2.6686477661132812, + "rewards/margins": 12.268123626708984, + "rewards/rejected": -9.599475860595703, + "step": 4631 + }, + { + "epoch": 0.4232069438099589, + "grad_norm": 3.25, + "kl": 0.0, + "learning_rate": 6.212089481679884e-06, + "logits/chosen": 672759091.2, + "logits/rejected": 623415381.3333334, + "logps/chosen": -344.39091796875, + "logps/rejected": -592.255126953125, + "loss": 0.0219, + "rewards/chosen": 3.333971405029297, + "rewards/margins": 12.871102396647135, + "rewards/rejected": -9.537130991617838, + "step": 4632 + }, + { + "epoch": 0.42329830973047056, + "grad_norm": 9.125, + "kl": 0.0, + "learning_rate": 6.2106945185452575e-06, + "logits/chosen": 512869056.0, + "logits/rejected": 718326528.0, + "logps/chosen": -280.6827697753906, + "logps/rejected": -736.6870727539062, + "loss": 0.0421, + "rewards/chosen": 3.2407708168029785, + "rewards/margins": 9.783321857452393, + "rewards/rejected": -6.542551040649414, + "step": 4633 + }, + { + "epoch": 0.4233896756509822, + "grad_norm": 3.015625, + "kl": 0.0, + "learning_rate": 6.2092994552974104e-06, + "logits/chosen": 573572608.0, + "logits/rejected": 351831872.0, + "logps/chosen": -385.4025634765625, + "logps/rejected": -589.2450764973959, + "loss": 0.0227, + "rewards/chosen": 3.379780960083008, + "rewards/margins": 14.916307957967124, + "rewards/rejected": -11.536526997884115, + "step": 4634 + }, + { + "epoch": 0.42348104157149385, + "grad_norm": 1.734375, + "kl": 0.0, + "learning_rate": 6.207904292051704e-06, + "logits/chosen": 579415680.0, + "logits/rejected": 931215725.7142857, + "logps/chosen": -205.9256591796875, + "logps/rejected": -704.3165457589286, + "loss": 0.0066, + "rewards/chosen": 3.2182297706604004, + "rewards/margins": 12.837219987596784, + "rewards/rejected": -9.618990216936384, + "step": 4635 + }, + { + "epoch": 0.4235724074920055, + "grad_norm": 6.03125, + "kl": 0.0, + "learning_rate": 6.206509028923502e-06, + "logits/chosen": 436711765.3333333, + "logits/rejected": 646942720.0, + "logps/chosen": -484.0492350260417, + "logps/rejected": -467.75908203125, + "loss": 0.024, + "rewards/chosen": 3.103588422139486, + "rewards/margins": 12.559272321065267, + "rewards/rejected": -9.45568389892578, + "step": 4636 + }, + { + "epoch": 0.42366377341251715, + "grad_norm": 18.375, + "kl": 0.0, + "learning_rate": 6.2051136660281845e-06, + "logits/chosen": 693782997.3333334, + "logits/rejected": 753722777.6, + "logps/chosen": -414.5115559895833, + "logps/rejected": -774.850439453125, + "loss": 0.0998, + "rewards/chosen": 2.249562899271647, + "rewards/margins": 13.086708132425944, + "rewards/rejected": -10.837145233154297, + "step": 4637 + }, + { + "epoch": 0.4237551393330288, + "grad_norm": 2.578125, + "kl": 0.0, + "learning_rate": 6.20371820348113e-06, + "logits/chosen": 703949414.4, + "logits/rejected": 1199152640.0, + "logps/chosen": -380.091259765625, + "logps/rejected": -600.71435546875, + "loss": 0.0203, + "rewards/chosen": 3.397943878173828, + "rewards/margins": 13.56952387491862, + "rewards/rejected": -10.171579996744791, + "step": 4638 + }, + { + "epoch": 0.42384650525354045, + "grad_norm": 1.5859375, + "kl": 0.0, + "learning_rate": 6.202322641397733e-06, + "logits/chosen": 839554730.6666666, + "logits/rejected": 685970534.4, + "logps/chosen": -175.275634765625, + "logps/rejected": -447.59638671875, + "loss": 0.014, + "rewards/chosen": 3.3970038096110025, + "rewards/margins": 12.366745630900065, + "rewards/rejected": -8.969741821289062, + "step": 4639 + }, + { + "epoch": 0.42393787117405207, + "grad_norm": 2.140625, + "kl": 0.0, + "learning_rate": 6.200926979893394e-06, + "logits/chosen": 911201024.0, + "logits/rejected": 495600128.0, + "logps/chosen": -653.860107421875, + "logps/rejected": -288.43837483723956, + "loss": 0.1156, + "rewards/chosen": 5.305658340454102, + "rewards/margins": 10.766213099161785, + "rewards/rejected": -5.460554758707683, + "step": 4640 + }, + { + "epoch": 0.42402923709456375, + "grad_norm": 3.25, + "kl": 0.0, + "learning_rate": 6.199531219083521e-06, + "logits/chosen": 682793088.0, + "logits/rejected": 800058752.0, + "logps/chosen": -357.16949462890625, + "logps/rejected": -371.1666259765625, + "loss": 0.025, + "rewards/chosen": 3.0663681030273438, + "rewards/margins": 12.016311645507812, + "rewards/rejected": -8.949943542480469, + "step": 4641 + }, + { + "epoch": 0.42412060301507537, + "grad_norm": 4.21875, + "kl": 0.0, + "learning_rate": 6.198135359083529e-06, + "logits/chosen": 422289834.6666667, + "logits/rejected": 293247744.0, + "logps/chosen": -304.6317545572917, + "logps/rejected": -353.093212890625, + "loss": 0.0224, + "rewards/chosen": 3.161219596862793, + "rewards/margins": 11.472770500183106, + "rewards/rejected": -8.311550903320313, + "step": 4642 + }, + { + "epoch": 0.42421196893558705, + "grad_norm": 0.7265625, + "kl": 0.0, + "learning_rate": 6.196739400008846e-06, + "logits/chosen": 423471552.0, + "logits/rejected": 549057322.6666666, + "logps/chosen": -144.5133819580078, + "logps/rejected": -391.607666015625, + "loss": 0.0041, + "rewards/chosen": 4.183347702026367, + "rewards/margins": 13.577863693237305, + "rewards/rejected": -9.394515991210938, + "step": 4643 + }, + { + "epoch": 0.42430333485609867, + "grad_norm": 6.03125, + "kl": 0.0, + "learning_rate": 6.1953433419748995e-06, + "logits/chosen": 749508096.0, + "logits/rejected": 545901760.0, + "logps/chosen": -270.1225891113281, + "logps/rejected": -421.245361328125, + "loss": 0.0535, + "rewards/chosen": 2.410921573638916, + "rewards/margins": 9.581561088562012, + "rewards/rejected": -7.170639514923096, + "step": 4644 + }, + { + "epoch": 0.42439470077661035, + "grad_norm": 10.8125, + "kl": 0.0, + "learning_rate": 6.1939471850971365e-06, + "logits/chosen": 675748096.0, + "logits/rejected": 485806208.0, + "logps/chosen": -383.171630859375, + "logps/rejected": -438.435302734375, + "loss": 0.0257, + "rewards/chosen": 3.574476718902588, + "rewards/margins": 10.531773090362549, + "rewards/rejected": -6.957296371459961, + "step": 4645 + }, + { + "epoch": 0.42448606669712197, + "grad_norm": 1.8671875, + "kl": 0.0, + "learning_rate": 6.192550929491002e-06, + "logits/chosen": 574287104.0, + "logits/rejected": 489915648.0, + "logps/chosen": -300.801025390625, + "logps/rejected": -493.92523193359375, + "loss": 0.0129, + "rewards/chosen": 3.796236038208008, + "rewards/margins": 12.02218246459961, + "rewards/rejected": -8.225946426391602, + "step": 4646 + }, + { + "epoch": 0.42457743261763364, + "grad_norm": 3.109375, + "kl": 0.0, + "learning_rate": 6.191154575271956e-06, + "logits/chosen": 472671552.0, + "logits/rejected": 568204544.0, + "logps/chosen": -267.2550048828125, + "logps/rejected": -614.4835205078125, + "loss": 0.0137, + "rewards/chosen": 4.324825763702393, + "rewards/margins": 13.182669162750244, + "rewards/rejected": -8.857843399047852, + "step": 4647 + }, + { + "epoch": 0.42466879853814526, + "grad_norm": 1.53125, + "kl": 0.0, + "learning_rate": 6.189758122555463e-06, + "logits/chosen": 348945356.8, + "logits/rejected": 379366912.0, + "logps/chosen": -194.7981689453125, + "logps/rejected": -526.6551106770834, + "loss": 0.0286, + "rewards/chosen": 4.327626419067383, + "rewards/margins": 12.353852462768554, + "rewards/rejected": -8.026226043701172, + "step": 4648 + }, + { + "epoch": 0.42476016445865694, + "grad_norm": 4.40625, + "kl": 0.11798954010009766, + "learning_rate": 6.188361571456996e-06, + "logits/chosen": 429203370.6666667, + "logits/rejected": 446007296.0, + "logps/chosen": -339.66143798828125, + "logps/rejected": -372.45819091796875, + "loss": 0.0353, + "rewards/chosen": 3.5516532262166343, + "rewards/margins": 11.517664273579916, + "rewards/rejected": -7.966011047363281, + "step": 4649 + }, + { + "epoch": 0.42485153037916856, + "grad_norm": 1.015625, + "kl": 0.0, + "learning_rate": 6.186964922092037e-06, + "logits/chosen": 364754005.3333333, + "logits/rejected": 336536576.0, + "logps/chosen": -425.8591715494792, + "logps/rejected": -503.052294921875, + "loss": 0.0047, + "rewards/chosen": 4.597968737284343, + "rewards/margins": 13.026632372538248, + "rewards/rejected": -8.428663635253907, + "step": 4650 + }, + { + "epoch": 0.42494289629968024, + "grad_norm": 2.546875, + "kl": 0.0, + "learning_rate": 6.185568174576075e-06, + "logits/chosen": 462507093.3333333, + "logits/rejected": 559918080.0, + "logps/chosen": -312.3971761067708, + "logps/rejected": -529.7213745117188, + "loss": 0.022, + "rewards/chosen": 3.781164805094401, + "rewards/margins": 13.912555376688639, + "rewards/rejected": -10.131390571594238, + "step": 4651 + }, + { + "epoch": 0.42503426222019186, + "grad_norm": 2.078125, + "kl": 0.0, + "learning_rate": 6.184171329024613e-06, + "logits/chosen": 525139904.0, + "logits/rejected": 233881941.33333334, + "logps/chosen": -183.77024841308594, + "logps/rejected": -319.6232503255208, + "loss": 0.0127, + "rewards/chosen": 3.886369228363037, + "rewards/margins": 12.496636231740316, + "rewards/rejected": -8.61026700337728, + "step": 4652 + }, + { + "epoch": 0.42512562814070354, + "grad_norm": 22.75, + "kl": 0.0, + "learning_rate": 6.182774385553151e-06, + "logits/chosen": 853088426.6666666, + "logits/rejected": 845399040.0, + "logps/chosen": -214.5306396484375, + "logps/rejected": -453.8885192871094, + "loss": 0.0546, + "rewards/chosen": 2.8391345342000327, + "rewards/margins": 11.849345525105795, + "rewards/rejected": -9.010210990905762, + "step": 4653 + }, + { + "epoch": 0.42521699406121516, + "grad_norm": 0.796875, + "kl": 0.0, + "learning_rate": 6.181377344277207e-06, + "logits/chosen": 284041514.6666667, + "logits/rejected": 624304128.0, + "logps/chosen": -229.3389892578125, + "logps/rejected": -672.58193359375, + "loss": 0.0049, + "rewards/chosen": 4.4976654052734375, + "rewards/margins": 13.767288208007812, + "rewards/rejected": -9.269622802734375, + "step": 4654 + }, + { + "epoch": 0.42530835998172684, + "grad_norm": 43.0, + "kl": 0.0, + "learning_rate": 6.179980205312301e-06, + "logits/chosen": 733132970.6666666, + "logits/rejected": 351400192.0, + "logps/chosen": -339.09779866536456, + "logps/rejected": -299.1168212890625, + "loss": 0.0684, + "rewards/chosen": 3.46297295888265, + "rewards/margins": 9.718826134999594, + "rewards/rejected": -6.255853176116943, + "step": 4655 + }, + { + "epoch": 0.42539972590223846, + "grad_norm": 1.2421875, + "kl": 0.0, + "learning_rate": 6.1785829687739675e-06, + "logits/chosen": 206166368.0, + "logits/rejected": 658361685.3333334, + "logps/chosen": -136.9038848876953, + "logps/rejected": -551.2031656901041, + "loss": 0.0066, + "rewards/chosen": 4.162255764007568, + "rewards/margins": 14.029945532480875, + "rewards/rejected": -9.867689768473307, + "step": 4656 + }, + { + "epoch": 0.42549109182275013, + "grad_norm": 3.78125, + "kl": 0.0, + "learning_rate": 6.177185634777739e-06, + "logits/chosen": 970955776.0, + "logits/rejected": 592218453.3333334, + "logps/chosen": -351.1952819824219, + "logps/rejected": -374.7314860026042, + "loss": 0.0167, + "rewards/chosen": 2.6849403381347656, + "rewards/margins": 11.188674926757812, + "rewards/rejected": -8.503734588623047, + "step": 4657 + }, + { + "epoch": 0.42558245774326176, + "grad_norm": 4.625, + "kl": 0.0, + "learning_rate": 6.175788203439168e-06, + "logits/chosen": 533697682.28571427, + "logits/rejected": 451441216.0, + "logps/chosen": -327.93331473214283, + "logps/rejected": -389.5646667480469, + "loss": 0.0338, + "rewards/chosen": 3.715674809047154, + "rewards/margins": 12.28278296334403, + "rewards/rejected": -8.567108154296875, + "step": 4658 + }, + { + "epoch": 0.42567382366377343, + "grad_norm": 3.296875, + "kl": 0.0, + "learning_rate": 6.174390674873805e-06, + "logits/chosen": 616707242.6666666, + "logits/rejected": 315410784.0, + "logps/chosen": -240.6835734049479, + "logps/rejected": -405.53802490234375, + "loss": 0.0243, + "rewards/chosen": 3.834347724914551, + "rewards/margins": 13.283825874328613, + "rewards/rejected": -9.449478149414062, + "step": 4659 + }, + { + "epoch": 0.42576518958428505, + "grad_norm": 3.03125, + "kl": 0.0, + "learning_rate": 6.172993049197216e-06, + "logits/chosen": 743136512.0, + "logits/rejected": 438612949.3333333, + "logps/chosen": -530.757080078125, + "logps/rejected": -347.6841634114583, + "loss": 0.1121, + "rewards/chosen": 3.442094326019287, + "rewards/margins": 11.65598440170288, + "rewards/rejected": -8.213890075683594, + "step": 4660 + }, + { + "epoch": 0.42585655550479673, + "grad_norm": 3.3125, + "kl": 0.0, + "learning_rate": 6.171595326524969e-06, + "logits/chosen": 501712352.0, + "logits/rejected": 453483872.0, + "logps/chosen": -351.482177734375, + "logps/rejected": -439.0302734375, + "loss": 0.0149, + "rewards/chosen": 3.984384059906006, + "rewards/margins": 11.37248182296753, + "rewards/rejected": -7.388097763061523, + "step": 4661 + }, + { + "epoch": 0.42594792142530835, + "grad_norm": 3.15625, + "kl": 0.0, + "learning_rate": 6.170197506972643e-06, + "logits/chosen": 504747562.6666667, + "logits/rejected": 385736288.0, + "logps/chosen": -390.6194661458333, + "logps/rejected": -623.2731323242188, + "loss": 0.0215, + "rewards/chosen": 3.920042037963867, + "rewards/margins": 15.498985290527344, + "rewards/rejected": -11.578943252563477, + "step": 4662 + }, + { + "epoch": 0.42603928734582003, + "grad_norm": 4.5625, + "kl": 0.0, + "learning_rate": 6.168799590655825e-06, + "logits/chosen": 777604437.3333334, + "logits/rejected": 413510592.0, + "logps/chosen": -227.2374267578125, + "logps/rejected": -435.1368103027344, + "loss": 0.0294, + "rewards/chosen": 4.097341537475586, + "rewards/margins": 11.8682222366333, + "rewards/rejected": -7.770880699157715, + "step": 4663 + }, + { + "epoch": 0.42613065326633165, + "grad_norm": 1.875, + "kl": 0.0, + "learning_rate": 6.1674015776901105e-06, + "logits/chosen": 565876906.6666666, + "logits/rejected": 912522240.0, + "logps/chosen": -314.5609130859375, + "logps/rejected": -762.971875, + "loss": 0.0077, + "rewards/chosen": 4.266580263773601, + "rewards/margins": 16.18429495493571, + "rewards/rejected": -11.917714691162109, + "step": 4664 + }, + { + "epoch": 0.4262220191868433, + "grad_norm": 3.5625, + "kl": 0.0, + "learning_rate": 6.166003468191102e-06, + "logits/chosen": 538161971.2, + "logits/rejected": 400146688.0, + "logps/chosen": -340.649462890625, + "logps/rejected": -568.7022298177084, + "loss": 0.0175, + "rewards/chosen": 4.250592803955078, + "rewards/margins": 14.336414591471353, + "rewards/rejected": -10.085821787516275, + "step": 4665 + }, + { + "epoch": 0.42631338510735495, + "grad_norm": 3.046875, + "kl": 0.0, + "learning_rate": 6.164605262274409e-06, + "logits/chosen": 197359504.0, + "logits/rejected": 479811744.0, + "logps/chosen": -313.9195556640625, + "logps/rejected": -568.8488159179688, + "loss": 0.0188, + "rewards/chosen": 3.566169023513794, + "rewards/margins": 13.331011056900024, + "rewards/rejected": -9.76484203338623, + "step": 4666 + }, + { + "epoch": 0.4264047510278666, + "grad_norm": 17.875, + "kl": 0.0, + "learning_rate": 6.163206960055652e-06, + "logits/chosen": 302679136.0, + "logits/rejected": 470775698.28571427, + "logps/chosen": -110.530029296875, + "logps/rejected": -512.9206194196429, + "loss": 0.0349, + "rewards/chosen": 2.323925733566284, + "rewards/margins": 10.350368125098091, + "rewards/rejected": -8.026442391531807, + "step": 4667 + }, + { + "epoch": 0.42649611694837825, + "grad_norm": 1.421875, + "kl": 0.0, + "learning_rate": 6.161808561650456e-06, + "logits/chosen": 647008128.0, + "logits/rejected": 458383616.0, + "logps/chosen": -425.2272033691406, + "logps/rejected": -525.7620239257812, + "loss": 0.0081, + "rewards/chosen": 4.208550453186035, + "rewards/margins": 12.688715934753418, + "rewards/rejected": -8.480165481567383, + "step": 4668 + }, + { + "epoch": 0.4265874828688899, + "grad_norm": 8.5625, + "kl": 0.0, + "learning_rate": 6.160410067174457e-06, + "logits/chosen": 586258841.6, + "logits/rejected": 767961941.3333334, + "logps/chosen": -392.63974609375, + "logps/rejected": -662.218994140625, + "loss": 0.0364, + "rewards/chosen": 3.2645626068115234, + "rewards/margins": 12.50594393412272, + "rewards/rejected": -9.241381327311197, + "step": 4669 + }, + { + "epoch": 0.42667884878940154, + "grad_norm": 1.7421875, + "kl": 0.0, + "learning_rate": 6.159011476743296e-06, + "logits/chosen": 448391680.0, + "logits/rejected": 507694880.0, + "logps/chosen": -235.87460327148438, + "logps/rejected": -557.2353515625, + "loss": 0.0089, + "rewards/chosen": 5.0620927810668945, + "rewards/margins": 16.82590961456299, + "rewards/rejected": -11.763816833496094, + "step": 4670 + }, + { + "epoch": 0.4267702147099132, + "grad_norm": 0.390625, + "kl": 0.0, + "learning_rate": 6.157612790472626e-06, + "logits/rejected": 436790784.0, + "logps/rejected": -547.4111938476562, + "loss": 0.0013, + "rewards/rejected": -7.991422653198242, + "step": 4671 + }, + { + "epoch": 0.42686158063042484, + "grad_norm": 6.5625, + "kl": 0.0, + "learning_rate": 6.156214008478103e-06, + "logits/chosen": 544609587.2, + "logits/rejected": 487941632.0, + "logps/chosen": -521.58154296875, + "logps/rejected": -620.10693359375, + "loss": 0.0321, + "rewards/chosen": 3.192473602294922, + "rewards/margins": 13.379787953694663, + "rewards/rejected": -10.18731435139974, + "step": 4672 + }, + { + "epoch": 0.4269529465509365, + "grad_norm": 3.84375, + "kl": 0.0, + "learning_rate": 6.154815130875394e-06, + "logits/chosen": 506121874.28571427, + "logits/rejected": 659417152.0, + "logps/chosen": -348.4806431361607, + "logps/rejected": -879.6422119140625, + "loss": 0.0345, + "rewards/chosen": 3.695714678083147, + "rewards/margins": 14.094304765973773, + "rewards/rejected": -10.398590087890625, + "step": 4673 + }, + { + "epoch": 0.42704431247144814, + "grad_norm": 3.90625, + "kl": 0.0, + "learning_rate": 6.1534161577801735e-06, + "logits/chosen": 361129932.8, + "logits/rejected": 406126848.0, + "logps/chosen": -295.906103515625, + "logps/rejected": -382.2491861979167, + "loss": 0.0268, + "rewards/chosen": 3.3835403442382814, + "rewards/margins": 12.20221316019694, + "rewards/rejected": -8.818672815958658, + "step": 4674 + }, + { + "epoch": 0.4271356783919598, + "grad_norm": 0.373046875, + "kl": 0.0, + "learning_rate": 6.152017089308124e-06, + "logits/chosen": 906514816.0, + "logits/rejected": 535270619.4285714, + "logps/chosen": -319.14141845703125, + "logps/rejected": -669.5851702008929, + "loss": 0.0011, + "rewards/chosen": 5.104150295257568, + "rewards/margins": 14.567112854548864, + "rewards/rejected": -9.462962559291295, + "step": 4675 + }, + { + "epoch": 0.42722704431247144, + "grad_norm": 1.15625, + "kl": 0.0, + "learning_rate": 6.1506179255749335e-06, + "logits/chosen": 296669525.3333333, + "logits/rejected": 582060595.2, + "logps/chosen": -258.27490234375, + "logps/rejected": -658.897119140625, + "loss": 0.0057, + "rewards/chosen": 4.6862897872924805, + "rewards/margins": 15.24840145111084, + "rewards/rejected": -10.56211166381836, + "step": 4676 + }, + { + "epoch": 0.4273184102329831, + "grad_norm": 3.703125, + "kl": 0.0, + "learning_rate": 6.149218666696302e-06, + "logits/chosen": 776948736.0, + "logits/rejected": 628779712.0, + "logps/chosen": -429.48077392578125, + "logps/rejected": -353.64642333984375, + "loss": 0.0218, + "rewards/chosen": 3.353257656097412, + "rewards/margins": 11.212273120880127, + "rewards/rejected": -7.859015464782715, + "step": 4677 + }, + { + "epoch": 0.42740977615349474, + "grad_norm": 2.21875, + "kl": 0.0, + "learning_rate": 6.1478193127879325e-06, + "logits/chosen": 583067456.0, + "logits/rejected": 677618304.0, + "logps/chosen": -231.9088897705078, + "logps/rejected": -707.6492919921875, + "loss": 0.0164, + "rewards/chosen": 3.4657652378082275, + "rewards/margins": 12.852203607559204, + "rewards/rejected": -9.386438369750977, + "step": 4678 + }, + { + "epoch": 0.4275011420740064, + "grad_norm": 0.86328125, + "kl": 0.0, + "learning_rate": 6.146419863965544e-06, + "logits/chosen": 597519957.3333334, + "logits/rejected": 623401369.6, + "logps/chosen": -352.9481201171875, + "logps/rejected": -476.36005859375, + "loss": 0.0051, + "rewards/chosen": 4.329903284708659, + "rewards/margins": 13.536706415812176, + "rewards/rejected": -9.206803131103516, + "step": 4679 + }, + { + "epoch": 0.42759250799451803, + "grad_norm": 18.875, + "kl": 0.0, + "learning_rate": 6.145020320344852e-06, + "logits/chosen": 402525568.0, + "logits/rejected": 426141542.4, + "logps/chosen": -355.4449055989583, + "logps/rejected": -427.966357421875, + "loss": 0.0118, + "rewards/chosen": 4.149351119995117, + "rewards/margins": 12.710233688354492, + "rewards/rejected": -8.560882568359375, + "step": 4680 + }, + { + "epoch": 0.4276838739150297, + "grad_norm": 3.984375, + "kl": 0.0, + "learning_rate": 6.143620682041589e-06, + "logits/chosen": 509629184.0, + "logits/rejected": 538463658.6666666, + "logps/chosen": -303.14306640625, + "logps/rejected": -581.5436197916666, + "loss": 0.0309, + "rewards/chosen": 3.4791095733642576, + "rewards/margins": 12.595401255289712, + "rewards/rejected": -9.116291681925455, + "step": 4681 + }, + { + "epoch": 0.42777523983554133, + "grad_norm": 1.3359375, + "kl": 0.0, + "learning_rate": 6.142220949171493e-06, + "logits/chosen": 352188053.3333333, + "logits/rejected": 369395200.0, + "logps/chosen": -212.3219197591146, + "logps/rejected": -558.3529296875, + "loss": 0.0083, + "rewards/chosen": 4.298760732014974, + "rewards/margins": 13.09134343465169, + "rewards/rejected": -8.792582702636718, + "step": 4682 + }, + { + "epoch": 0.427866605756053, + "grad_norm": 4.8125, + "kl": 0.0, + "learning_rate": 6.1408211218503055e-06, + "logits/chosen": 496967872.0, + "logits/rejected": 430025408.0, + "logps/chosen": -256.6072998046875, + "logps/rejected": -459.8016662597656, + "loss": 0.0382, + "rewards/chosen": 2.5236706733703613, + "rewards/margins": 9.858044624328613, + "rewards/rejected": -7.334373950958252, + "step": 4683 + }, + { + "epoch": 0.42795797167656463, + "grad_norm": 3.34375, + "kl": 0.0, + "learning_rate": 6.139421200193782e-06, + "logits/chosen": 452815072.0, + "logits/rejected": 505521984.0, + "logps/chosen": -316.3345947265625, + "logps/rejected": -746.5700073242188, + "loss": 0.0187, + "rewards/chosen": 3.6103224754333496, + "rewards/margins": 15.133119106292725, + "rewards/rejected": -11.522796630859375, + "step": 4684 + }, + { + "epoch": 0.4280493375970763, + "grad_norm": 11.1875, + "kl": 0.0, + "learning_rate": 6.1380211843176815e-06, + "logits/chosen": 517454592.0, + "logits/rejected": 436801024.0, + "logps/chosen": -620.152587890625, + "logps/rejected": -504.46888950892856, + "loss": 0.0869, + "rewards/chosen": 4.015973091125488, + "rewards/margins": 11.87545599256243, + "rewards/rejected": -7.859482901436942, + "step": 4685 + }, + { + "epoch": 0.42814070351758793, + "grad_norm": 1.5078125, + "kl": 0.0, + "learning_rate": 6.136621074337774e-06, + "logits/chosen": 538352512.0, + "logits/rejected": 1392347136.0, + "logps/chosen": -387.8190104166667, + "logps/rejected": -464.016650390625, + "loss": 0.0061, + "rewards/chosen": 4.359114329020183, + "rewards/margins": 13.710096232096355, + "rewards/rejected": -9.350981903076171, + "step": 4686 + }, + { + "epoch": 0.4282320694380996, + "grad_norm": 4.46875, + "kl": 0.0, + "learning_rate": 6.135220870369835e-06, + "logits/chosen": 619101644.8, + "logits/rejected": 456002944.0, + "logps/chosen": -441.170703125, + "logps/rejected": -399.8865152994792, + "loss": 0.0297, + "rewards/chosen": 3.174397659301758, + "rewards/margins": 10.996728134155273, + "rewards/rejected": -7.822330474853516, + "step": 4687 + }, + { + "epoch": 0.4283234353586112, + "grad_norm": 3.75, + "kl": 0.0, + "learning_rate": 6.133820572529647e-06, + "logits/chosen": 794618521.6, + "logits/rejected": 453643605.3333333, + "logps/chosen": -276.6318115234375, + "logps/rejected": -362.7317301432292, + "loss": 0.026, + "rewards/chosen": 3.2390403747558594, + "rewards/margins": 11.816048940022787, + "rewards/rejected": -8.577008565266928, + "step": 4688 + }, + { + "epoch": 0.4284148012791229, + "grad_norm": 4.0625, + "kl": 0.0, + "learning_rate": 6.132420180933003e-06, + "logits/chosen": 426110762.6666667, + "logits/rejected": 442623168.0, + "logps/chosen": -142.32759602864584, + "logps/rejected": -571.45947265625, + "loss": 0.0482, + "rewards/chosen": 3.207493464152018, + "rewards/margins": 12.346450487772623, + "rewards/rejected": -9.138957023620605, + "step": 4689 + }, + { + "epoch": 0.4285061671996345, + "grad_norm": 1.0703125, + "kl": 0.0, + "learning_rate": 6.131019695695702e-06, + "logits/chosen": 648532224.0, + "logits/rejected": 404998400.0, + "logps/chosen": -524.3568725585938, + "logps/rejected": -361.9511311848958, + "loss": 0.0047, + "rewards/chosen": 4.368438720703125, + "rewards/margins": 12.273399353027344, + "rewards/rejected": -7.904960632324219, + "step": 4690 + }, + { + "epoch": 0.4285975331201462, + "grad_norm": 1.4609375, + "kl": 0.0, + "learning_rate": 6.1296191169335515e-06, + "logits/chosen": 291149600.0, + "logits/rejected": 700637056.0, + "logps/chosen": -174.19847106933594, + "logps/rejected": -386.1473795572917, + "loss": 0.0071, + "rewards/chosen": 4.267212867736816, + "rewards/margins": 13.108663241068522, + "rewards/rejected": -8.841450373331705, + "step": 4691 + }, + { + "epoch": 0.4286888990406578, + "grad_norm": 3.453125, + "kl": 0.0, + "learning_rate": 6.128218444762367e-06, + "logits/chosen": 495712597.3333333, + "logits/rejected": 610776780.8, + "logps/chosen": -270.7399495442708, + "logps/rejected": -675.9189453125, + "loss": 0.0233, + "rewards/chosen": 2.775158246358236, + "rewards/margins": 13.02019837697347, + "rewards/rejected": -10.245040130615234, + "step": 4692 + }, + { + "epoch": 0.4287802649611695, + "grad_norm": 1.84375, + "kl": 0.0, + "learning_rate": 6.126817679297968e-06, + "logits/chosen": 510018406.4, + "logits/rejected": 517887360.0, + "logps/chosen": -299.619775390625, + "logps/rejected": -522.9286702473959, + "loss": 0.0176, + "rewards/chosen": 3.781299591064453, + "rewards/margins": 13.18798828125, + "rewards/rejected": -9.406688690185547, + "step": 4693 + }, + { + "epoch": 0.4288716308816811, + "grad_norm": 32.5, + "kl": 0.0, + "learning_rate": 6.125416820656189e-06, + "logits/chosen": 744155264.0, + "logits/rejected": 789327296.0, + "logps/chosen": -164.52548217773438, + "logps/rejected": -359.6156005859375, + "loss": 0.0489, + "rewards/chosen": 2.6883983612060547, + "rewards/margins": 9.890144348144531, + "rewards/rejected": -7.201745986938477, + "step": 4694 + }, + { + "epoch": 0.4289629968021928, + "grad_norm": 4.71875, + "kl": 0.0, + "learning_rate": 6.124015868952866e-06, + "logits/chosen": 467100160.0, + "logits/rejected": 418027040.0, + "logps/chosen": -262.91457112630206, + "logps/rejected": -408.07684326171875, + "loss": 0.0426, + "rewards/chosen": 3.144280751546224, + "rewards/margins": 10.996296246846518, + "rewards/rejected": -7.852015495300293, + "step": 4695 + }, + { + "epoch": 0.4290543627227044, + "grad_norm": 3.21875, + "kl": 0.0, + "learning_rate": 6.122614824303845e-06, + "logits/chosen": 389511082.6666667, + "logits/rejected": 466171392.0, + "logps/chosen": -425.1529947916667, + "logps/rejected": -439.894140625, + "loss": 0.0127, + "rewards/chosen": 3.5959672927856445, + "rewards/margins": 11.9149751663208, + "rewards/rejected": -8.319007873535156, + "step": 4696 + }, + { + "epoch": 0.4291457286432161, + "grad_norm": 8.8125, + "kl": 0.0, + "learning_rate": 6.121213686824979e-06, + "logits/chosen": 697654784.0, + "logits/rejected": 784880256.0, + "logps/chosen": -364.3827311197917, + "logps/rejected": -666.35107421875, + "loss": 0.0393, + "rewards/chosen": 3.4259821573893228, + "rewards/margins": 14.008965174357096, + "rewards/rejected": -10.582983016967773, + "step": 4697 + }, + { + "epoch": 0.4292370945637277, + "grad_norm": 8.8125, + "kl": 0.0, + "learning_rate": 6.119812456632129e-06, + "logits/chosen": 400227200.0, + "logits/rejected": 456786208.0, + "logps/chosen": -229.69815063476562, + "logps/rejected": -529.8700561523438, + "loss": 0.1256, + "rewards/chosen": 2.37920880317688, + "rewards/margins": 11.588576555252075, + "rewards/rejected": -9.209367752075195, + "step": 4698 + }, + { + "epoch": 0.4293284604842394, + "grad_norm": 0.6953125, + "kl": 0.0, + "learning_rate": 6.118411133841164e-06, + "logits/chosen": 345957536.0, + "logits/rejected": 571547904.0, + "logps/chosen": -229.33123779296875, + "logps/rejected": -513.38671875, + "loss": 0.0036, + "rewards/chosen": 4.284170627593994, + "rewards/margins": 13.494706948598227, + "rewards/rejected": -9.210536321004232, + "step": 4699 + }, + { + "epoch": 0.429419826404751, + "grad_norm": 4.53125, + "kl": 0.0, + "learning_rate": 6.117009718567961e-06, + "logits/chosen": 855014656.0, + "logits/rejected": 834192213.3333334, + "logps/chosen": -283.863818359375, + "logps/rejected": -804.98828125, + "loss": 0.0442, + "rewards/chosen": 3.19726676940918, + "rewards/margins": 13.421890767415366, + "rewards/rejected": -10.224623998006185, + "step": 4700 + }, + { + "epoch": 0.4295111923252627, + "grad_norm": 1.9921875, + "kl": 0.0, + "learning_rate": 6.115608210928404e-06, + "logits/chosen": 1010918229.3333334, + "logits/rejected": 735043968.0, + "logps/chosen": -361.7737630208333, + "logps/rejected": -552.6760864257812, + "loss": 0.0136, + "rewards/chosen": 4.412845929463704, + "rewards/margins": 13.55399735768636, + "rewards/rejected": -9.141151428222656, + "step": 4701 + }, + { + "epoch": 0.4296025582457743, + "grad_norm": 2.984375, + "kl": 0.0, + "learning_rate": 6.114206611038384e-06, + "logits/chosen": 484655718.4, + "logits/rejected": 401956181.3333333, + "logps/chosen": -337.6914794921875, + "logps/rejected": -625.1837565104166, + "loss": 0.0192, + "rewards/chosen": 3.9119842529296873, + "rewards/margins": 13.346910603841145, + "rewards/rejected": -9.434926350911459, + "step": 4702 + }, + { + "epoch": 0.429693924166286, + "grad_norm": 7.625, + "kl": 0.0, + "learning_rate": 6.1128049190138014e-06, + "logits/chosen": 654717056.0, + "logits/rejected": 480823552.0, + "logps/chosen": -379.03924560546875, + "logps/rejected": -491.4265441894531, + "loss": 0.0385, + "rewards/chosen": 2.8762879371643066, + "rewards/margins": 11.557273387908936, + "rewards/rejected": -8.680985450744629, + "step": 4703 + }, + { + "epoch": 0.4297852900867976, + "grad_norm": 3.1875, + "kl": 0.0, + "learning_rate": 6.111403134970562e-06, + "logits/chosen": 600342848.0, + "logits/rejected": 593943936.0, + "logps/chosen": -185.26588439941406, + "logps/rejected": -550.3991088867188, + "loss": 0.0224, + "rewards/chosen": 3.3455770015716553, + "rewards/margins": 12.503303289413452, + "rewards/rejected": -9.157726287841797, + "step": 4704 + }, + { + "epoch": 0.4298766560073093, + "grad_norm": 0.2099609375, + "kl": 0.0, + "learning_rate": 6.11000125902458e-06, + "logits/rejected": 380190688.0, + "logps/rejected": -621.945556640625, + "loss": 0.0003, + "rewards/rejected": -10.474740982055664, + "step": 4705 + }, + { + "epoch": 0.4299680219278209, + "grad_norm": 4.84375, + "kl": 0.0, + "learning_rate": 6.108599291291778e-06, + "logits/chosen": 709006037.3333334, + "logits/rejected": 559811968.0, + "logps/chosen": -514.8957112630209, + "logps/rejected": -466.309326171875, + "loss": 0.0307, + "rewards/chosen": 3.7365099589029946, + "rewards/margins": 12.701428095499674, + "rewards/rejected": -8.96491813659668, + "step": 4706 + }, + { + "epoch": 0.4300593878483326, + "grad_norm": 1.1875, + "kl": 0.0, + "learning_rate": 6.1071972318880865e-06, + "logits/chosen": 547091712.0, + "logits/rejected": 382581376.0, + "logps/chosen": -512.1649169921875, + "logps/rejected": -518.1900227864584, + "loss": 0.0044, + "rewards/chosen": 4.1201324462890625, + "rewards/margins": 13.941972096761068, + "rewards/rejected": -9.821839650472006, + "step": 4707 + }, + { + "epoch": 0.4301507537688442, + "grad_norm": 2.390625, + "kl": 0.0, + "learning_rate": 6.105795080929442e-06, + "logits/chosen": 499855957.3333333, + "logits/rejected": 235597728.0, + "logps/chosen": -247.0539347330729, + "logps/rejected": -408.04541015625, + "loss": 0.0181, + "rewards/chosen": 3.932765324910482, + "rewards/margins": 13.833274205525717, + "rewards/rejected": -9.900508880615234, + "step": 4708 + }, + { + "epoch": 0.4302421196893559, + "grad_norm": 2.703125, + "kl": 0.0, + "learning_rate": 6.104392838531789e-06, + "logits/chosen": 523820320.0, + "logits/rejected": 472636000.0, + "logps/chosen": -444.2149658203125, + "logps/rejected": -607.08642578125, + "loss": 0.0114, + "rewards/chosen": 4.241453170776367, + "rewards/margins": 12.942512512207031, + "rewards/rejected": -8.701059341430664, + "step": 4709 + }, + { + "epoch": 0.4303334856098675, + "grad_norm": 1.9765625, + "kl": 0.0, + "learning_rate": 6.1029905048110825e-06, + "logits/chosen": 750839104.0, + "logits/rejected": 999216128.0, + "logps/chosen": -401.25213623046875, + "logps/rejected": -441.306884765625, + "loss": 0.0112, + "rewards/chosen": 3.880143880844116, + "rewards/margins": 13.605562925338745, + "rewards/rejected": -9.725419044494629, + "step": 4710 + }, + { + "epoch": 0.4304248515303792, + "grad_norm": 6.96875, + "kl": 0.0, + "learning_rate": 6.101588079883281e-06, + "logits/chosen": 638272768.0, + "logits/rejected": 606829781.3333334, + "logps/chosen": -258.30712890625, + "logps/rejected": -452.6991373697917, + "loss": 0.1208, + "rewards/chosen": 3.336897277832031, + "rewards/margins": 9.210259501139323, + "rewards/rejected": -5.873362223307292, + "step": 4711 + }, + { + "epoch": 0.4305162174508908, + "grad_norm": 2.546875, + "kl": 0.0, + "learning_rate": 6.1001855638643496e-06, + "logits/chosen": 639506329.6, + "logits/rejected": 332242005.3333333, + "logps/chosen": -517.8890625, + "logps/rejected": -415.142822265625, + "loss": 0.0153, + "rewards/chosen": 3.7898303985595705, + "rewards/margins": 12.260989379882812, + "rewards/rejected": -8.471158981323242, + "step": 4712 + }, + { + "epoch": 0.4306075833714025, + "grad_norm": 2.125, + "kl": 0.0, + "learning_rate": 6.098782956870266e-06, + "logits/chosen": 594510677.3333334, + "logits/rejected": 344129945.6, + "logps/chosen": -379.6613362630208, + "logps/rejected": -460.6751953125, + "loss": 0.0178, + "rewards/chosen": 3.1915661493937173, + "rewards/margins": 11.41960875193278, + "rewards/rejected": -8.228042602539062, + "step": 4713 + }, + { + "epoch": 0.4306989492919141, + "grad_norm": 2.0625, + "kl": 0.0, + "learning_rate": 6.097380259017014e-06, + "logits/chosen": 410050240.0, + "logits/rejected": 700860864.0, + "logps/chosen": -209.35928344726562, + "logps/rejected": -698.1777954101562, + "loss": 0.015, + "rewards/chosen": 3.584728717803955, + "rewards/margins": 11.998826503753662, + "rewards/rejected": -8.414097785949707, + "step": 4714 + }, + { + "epoch": 0.4307903152124258, + "grad_norm": 1.765625, + "kl": 0.0, + "learning_rate": 6.095977470420581e-06, + "logits/chosen": 420977056.0, + "logits/rejected": 404983168.0, + "logps/chosen": -301.4022216796875, + "logps/rejected": -655.4949951171875, + "loss": 0.0087, + "rewards/chosen": 4.393619537353516, + "rewards/margins": 17.51806926727295, + "rewards/rejected": -13.124449729919434, + "step": 4715 + }, + { + "epoch": 0.4308816811329374, + "grad_norm": 6.25, + "kl": 0.0, + "learning_rate": 6.094574591196965e-06, + "logits/chosen": 695142546.2857143, + "logits/rejected": 235797472.0, + "logps/chosen": -327.80604771205356, + "logps/rejected": -159.85191345214844, + "loss": 0.059, + "rewards/chosen": 3.2630530766078403, + "rewards/margins": 10.689872877938406, + "rewards/rejected": -7.426819801330566, + "step": 4716 + }, + { + "epoch": 0.4309730470534491, + "grad_norm": 4.3125, + "kl": 0.0, + "learning_rate": 6.093171621462173e-06, + "logits/chosen": 657549970.2857143, + "logits/rejected": 542636416.0, + "logps/chosen": -379.9183872767857, + "logps/rejected": -172.83856201171875, + "loss": 0.0285, + "rewards/chosen": 3.7428741455078125, + "rewards/margins": 12.17701244354248, + "rewards/rejected": -8.434138298034668, + "step": 4717 + }, + { + "epoch": 0.4310644129739607, + "grad_norm": 1.890625, + "kl": 0.0, + "learning_rate": 6.091768561332216e-06, + "logits/chosen": 692906816.0, + "logits/rejected": 412602432.0, + "logps/chosen": -258.7525634765625, + "logps/rejected": -396.26019287109375, + "loss": 0.0117, + "rewards/chosen": 3.917259693145752, + "rewards/margins": 13.771048069000244, + "rewards/rejected": -9.853788375854492, + "step": 4718 + }, + { + "epoch": 0.4311557788944724, + "grad_norm": 8.8125, + "kl": 0.0, + "learning_rate": 6.090365410923113e-06, + "logits/chosen": 428031744.0, + "logits/rejected": 384476448.0, + "logps/chosen": -269.75823974609375, + "logps/rejected": -349.33209228515625, + "loss": 0.0395, + "rewards/chosen": 3.1367383003234863, + "rewards/margins": 10.378806591033936, + "rewards/rejected": -7.242068290710449, + "step": 4719 + }, + { + "epoch": 0.431247144814984, + "grad_norm": 0.55078125, + "kl": 0.0, + "learning_rate": 6.088962170350895e-06, + "logits/chosen": 574048768.0, + "logits/rejected": 642648704.0, + "logps/chosen": -527.5980224609375, + "logps/rejected": -459.9771321614583, + "loss": 0.0028, + "rewards/chosen": 4.686090469360352, + "rewards/margins": 13.332012176513672, + "rewards/rejected": -8.64592170715332, + "step": 4720 + }, + { + "epoch": 0.4313385107354957, + "grad_norm": 4.1875, + "kl": 0.0, + "learning_rate": 6.087558839731594e-06, + "logits/chosen": 1120251562.6666667, + "logits/rejected": 535448288.0, + "logps/chosen": -422.914306640625, + "logps/rejected": -323.52947998046875, + "loss": 0.0239, + "rewards/chosen": 3.8952722549438477, + "rewards/margins": 10.733224868774414, + "rewards/rejected": -6.837952613830566, + "step": 4721 + }, + { + "epoch": 0.4314298766560073, + "grad_norm": 3.390625, + "kl": 0.0, + "learning_rate": 6.086155419181254e-06, + "logits/chosen": 564437888.0, + "logits/rejected": 484774784.0, + "logps/chosen": -283.234130859375, + "logps/rejected": -664.39111328125, + "loss": 0.0164, + "rewards/chosen": 4.161798477172852, + "rewards/margins": 15.730693817138672, + "rewards/rejected": -11.56889533996582, + "step": 4722 + }, + { + "epoch": 0.43152124257651897, + "grad_norm": 75.5, + "kl": 0.0, + "learning_rate": 6.0847519088159225e-06, + "logits/chosen": 319667865.6, + "logits/rejected": 424285696.0, + "logps/chosen": -231.53681640625, + "logps/rejected": -508.7857666015625, + "loss": 0.0523, + "rewards/chosen": 2.924471855163574, + "rewards/margins": 12.187699317932129, + "rewards/rejected": -9.263227462768555, + "step": 4723 + }, + { + "epoch": 0.4316126084970306, + "grad_norm": 5.8125, + "kl": 0.0, + "learning_rate": 6.08334830875166e-06, + "logits/chosen": 452614297.6, + "logits/rejected": 360268202.6666667, + "logps/chosen": -400.0806396484375, + "logps/rejected": -456.7556559244792, + "loss": 0.0303, + "rewards/chosen": 3.703809356689453, + "rewards/margins": 13.259395853678384, + "rewards/rejected": -9.555586496988932, + "step": 4724 + }, + { + "epoch": 0.43170397441754227, + "grad_norm": 1.28125, + "kl": 0.0, + "learning_rate": 6.081944619104529e-06, + "logits/chosen": 692002560.0, + "logits/rejected": 711672490.6666666, + "logps/chosen": -501.171630859375, + "logps/rejected": -816.6842447916666, + "loss": 0.0051, + "rewards/chosen": 4.242750644683838, + "rewards/margins": 14.851633548736572, + "rewards/rejected": -10.608882904052734, + "step": 4725 + }, + { + "epoch": 0.4317953403380539, + "grad_norm": 2.296875, + "kl": 0.0, + "learning_rate": 6.080540839990604e-06, + "logits/chosen": 633544896.0, + "logits/rejected": 390641056.0, + "logps/chosen": -345.8310241699219, + "logps/rejected": -574.2813720703125, + "loss": 0.0135, + "rewards/chosen": 3.6457581520080566, + "rewards/margins": 15.065002918243408, + "rewards/rejected": -11.419244766235352, + "step": 4726 + }, + { + "epoch": 0.43188670625856557, + "grad_norm": 3.984375, + "kl": 0.0, + "learning_rate": 6.07913697152596e-06, + "logits/chosen": 382705578.6666667, + "logits/rejected": 469203148.8, + "logps/chosen": -135.8587443033854, + "logps/rejected": -422.946435546875, + "loss": 0.0217, + "rewards/chosen": 3.181259791056315, + "rewards/margins": 11.14347292582194, + "rewards/rejected": -7.962213134765625, + "step": 4727 + }, + { + "epoch": 0.4319780721790772, + "grad_norm": 2.0, + "kl": 0.0, + "learning_rate": 6.077733013826692e-06, + "logits/chosen": 710713984.0, + "logits/rejected": 503272960.0, + "logps/chosen": -257.58717854817706, + "logps/rejected": -627.035546875, + "loss": 0.0121, + "rewards/chosen": 4.05779234568278, + "rewards/margins": 13.802886644999187, + "rewards/rejected": -9.745094299316406, + "step": 4728 + }, + { + "epoch": 0.43206943809958887, + "grad_norm": 2.6875, + "kl": 0.0, + "learning_rate": 6.076328967008886e-06, + "logits/chosen": 536066720.0, + "logits/rejected": 518372394.6666667, + "logps/chosen": -296.2675476074219, + "logps/rejected": -366.4961344401042, + "loss": 0.0075, + "rewards/chosen": 4.701496124267578, + "rewards/margins": 11.145087560017902, + "rewards/rejected": -6.443591435750325, + "step": 4729 + }, + { + "epoch": 0.4321608040201005, + "grad_norm": 13.4375, + "kl": 0.0, + "learning_rate": 6.074924831188647e-06, + "logits/chosen": 542228906.6666666, + "logits/rejected": 640106240.0, + "logps/chosen": -358.8658040364583, + "logps/rejected": -520.8178100585938, + "loss": 0.0343, + "rewards/chosen": 3.928725242614746, + "rewards/margins": 13.58565902709961, + "rewards/rejected": -9.656933784484863, + "step": 4730 + }, + { + "epoch": 0.43225216994061216, + "grad_norm": 3.75, + "kl": 0.0, + "learning_rate": 6.073520606482083e-06, + "logits/chosen": 283309004.8, + "logits/rejected": 547107200.0, + "logps/chosen": -234.398291015625, + "logps/rejected": -510.3452962239583, + "loss": 0.0352, + "rewards/chosen": 3.654659652709961, + "rewards/margins": 13.268775049845377, + "rewards/rejected": -9.614115397135416, + "step": 4731 + }, + { + "epoch": 0.4323435358611238, + "grad_norm": 4.0625, + "kl": 0.0, + "learning_rate": 6.072116293005312e-06, + "logits/chosen": 593652480.0, + "logits/rejected": 543672256.0, + "logps/chosen": -459.8378092447917, + "logps/rejected": -502.38232421875, + "loss": 0.0345, + "rewards/chosen": 3.107450803120931, + "rewards/margins": 14.675596555074057, + "rewards/rejected": -11.568145751953125, + "step": 4732 + }, + { + "epoch": 0.43243490178163546, + "grad_norm": 4.25, + "kl": 0.0, + "learning_rate": 6.070711890874456e-06, + "logits/chosen": 579766016.0, + "logits/rejected": 510110432.0, + "logps/chosen": -310.4814860026042, + "logps/rejected": -745.5521850585938, + "loss": 0.0307, + "rewards/chosen": 3.4467976888020835, + "rewards/margins": 13.401146252950033, + "rewards/rejected": -9.95434856414795, + "step": 4733 + }, + { + "epoch": 0.4325262677021471, + "grad_norm": 28.375, + "kl": 0.0, + "learning_rate": 6.06930740020565e-06, + "logits/chosen": 875422617.6, + "logits/rejected": 804042922.6666666, + "logps/chosen": -394.71298828125, + "logps/rejected": -442.47509765625, + "loss": 0.0437, + "rewards/chosen": 3.3567951202392576, + "rewards/margins": 9.829719670613606, + "rewards/rejected": -6.472924550374349, + "step": 4734 + }, + { + "epoch": 0.43261763362265876, + "grad_norm": 0.9921875, + "kl": 0.0, + "learning_rate": 6.067902821115027e-06, + "logits/chosen": 489415808.0, + "logits/rejected": 578836864.0, + "logps/chosen": -298.13665771484375, + "logps/rejected": -705.7452392578125, + "loss": 0.009, + "rewards/chosen": 4.491037845611572, + "rewards/margins": 14.519117832183838, + "rewards/rejected": -10.028079986572266, + "step": 4735 + }, + { + "epoch": 0.4327089995431704, + "grad_norm": 3.875, + "kl": 0.0, + "learning_rate": 6.066498153718735e-06, + "logits/chosen": 505395046.4, + "logits/rejected": 300671061.3333333, + "logps/chosen": -253.6462890625, + "logps/rejected": -416.2357584635417, + "loss": 0.0332, + "rewards/chosen": 3.557600402832031, + "rewards/margins": 14.065735880533854, + "rewards/rejected": -10.508135477701822, + "step": 4736 + }, + { + "epoch": 0.43280036546368206, + "grad_norm": 3.515625, + "kl": 0.0, + "learning_rate": 6.065093398132927e-06, + "logits/chosen": 215383216.0, + "logits/rejected": 403599725.71428573, + "logps/chosen": -164.71395874023438, + "logps/rejected": -488.66929408482144, + "loss": 0.0149, + "rewards/chosen": 2.0268402099609375, + "rewards/margins": 12.210659572056361, + "rewards/rejected": -10.183819362095424, + "step": 4737 + }, + { + "epoch": 0.4328917313841937, + "grad_norm": 3.78125, + "kl": 0.0, + "learning_rate": 6.063688554473763e-06, + "logits/chosen": 271866304.0, + "logits/rejected": 551486464.0, + "logps/chosen": -401.8771667480469, + "logps/rejected": -512.4989624023438, + "loss": 0.0199, + "rewards/chosen": 3.6203856468200684, + "rewards/margins": 12.08494520187378, + "rewards/rejected": -8.464559555053711, + "step": 4738 + }, + { + "epoch": 0.43298309730470536, + "grad_norm": 2.71875, + "kl": 0.0, + "learning_rate": 6.062283622857412e-06, + "logits/chosen": 654259660.8, + "logits/rejected": 1203633493.3333333, + "logps/chosen": -314.5146484375, + "logps/rejected": -388.2625732421875, + "loss": 0.0188, + "rewards/chosen": 3.803203582763672, + "rewards/margins": 13.533796946207682, + "rewards/rejected": -9.73059336344401, + "step": 4739 + }, + { + "epoch": 0.433074463225217, + "grad_norm": 1.171875, + "kl": 0.0, + "learning_rate": 6.060878603400046e-06, + "logits/chosen": 421990016.0, + "logits/rejected": 574445525.3333334, + "logps/chosen": -329.25006103515625, + "logps/rejected": -584.0643310546875, + "loss": 0.006, + "rewards/chosen": 4.551753044128418, + "rewards/margins": 12.754159609476725, + "rewards/rejected": -8.202406565348307, + "step": 4740 + }, + { + "epoch": 0.43316582914572865, + "grad_norm": 4.25, + "kl": 0.0, + "learning_rate": 6.059473496217852e-06, + "logits/chosen": 641724313.6, + "logits/rejected": 507370538.6666667, + "logps/chosen": -415.508154296875, + "logps/rejected": -543.525146484375, + "loss": 0.027, + "rewards/chosen": 3.4169593811035157, + "rewards/margins": 12.888539886474609, + "rewards/rejected": -9.471580505371094, + "step": 4741 + }, + { + "epoch": 0.4332571950662403, + "grad_norm": 1.6328125, + "kl": 0.0, + "learning_rate": 6.058068301427014e-06, + "logits/chosen": 538702677.3333334, + "logits/rejected": 445649305.6, + "logps/chosen": -428.0201822916667, + "logps/rejected": -430.307763671875, + "loss": 0.0087, + "rewards/chosen": 3.8490660985310874, + "rewards/margins": 12.175997098286947, + "rewards/rejected": -8.32693099975586, + "step": 4742 + }, + { + "epoch": 0.43334856098675195, + "grad_norm": 1.8984375, + "kl": 0.0, + "learning_rate": 6.056663019143731e-06, + "logits/chosen": 632186470.4, + "logits/rejected": 668737792.0, + "logps/chosen": -401.1285400390625, + "logps/rejected": -645.9681803385416, + "loss": 0.0141, + "rewards/chosen": 4.167246627807617, + "rewards/margins": 13.118788528442384, + "rewards/rejected": -8.951541900634766, + "step": 4743 + }, + { + "epoch": 0.4334399269072636, + "grad_norm": 2.171875, + "kl": 0.0, + "learning_rate": 6.055257649484205e-06, + "logits/chosen": 649371776.0, + "logits/rejected": 500296576.0, + "logps/chosen": -432.9201354980469, + "logps/rejected": -423.8553771972656, + "loss": 0.014, + "rewards/chosen": 3.667290449142456, + "rewards/margins": 12.97441840171814, + "rewards/rejected": -9.307127952575684, + "step": 4744 + }, + { + "epoch": 0.43353129282777525, + "grad_norm": 3.140625, + "kl": 0.0, + "learning_rate": 6.053852192564651e-06, + "logits/chosen": 539099648.0, + "logits/rejected": 524211424.0, + "logps/chosen": -337.45994059244794, + "logps/rejected": -493.6079406738281, + "loss": 0.0199, + "rewards/chosen": 4.079202651977539, + "rewards/margins": 13.71548843383789, + "rewards/rejected": -9.636285781860352, + "step": 4745 + }, + { + "epoch": 0.43362265874828687, + "grad_norm": 0.89453125, + "kl": 0.0, + "learning_rate": 6.052446648501283e-06, + "logits/chosen": 331834410.6666667, + "logits/rejected": 369790028.8, + "logps/chosen": -244.91337076822916, + "logps/rejected": -369.913037109375, + "loss": 0.005, + "rewards/chosen": 4.642565091451009, + "rewards/margins": 13.11590798695882, + "rewards/rejected": -8.473342895507812, + "step": 4746 + }, + { + "epoch": 0.43371402466879855, + "grad_norm": 6.03125, + "kl": 0.0, + "learning_rate": 6.051041017410329e-06, + "logits/chosen": 488654848.0, + "logits/rejected": 409977184.0, + "logps/chosen": -425.353759765625, + "logps/rejected": -316.67364501953125, + "loss": 0.0475, + "rewards/chosen": 2.7549994786580405, + "rewards/margins": 10.860193570454916, + "rewards/rejected": -8.105194091796875, + "step": 4747 + }, + { + "epoch": 0.43380539058931017, + "grad_norm": 3.515625, + "kl": 0.0, + "learning_rate": 6.049635299408021e-06, + "logits/chosen": 519579187.2, + "logits/rejected": 585189717.3333334, + "logps/chosen": -443.671240234375, + "logps/rejected": -661.152587890625, + "loss": 0.0186, + "rewards/chosen": 4.315400695800781, + "rewards/margins": 14.312279637654623, + "rewards/rejected": -9.996878941853842, + "step": 4748 + }, + { + "epoch": 0.43389675650982185, + "grad_norm": 4.625, + "kl": 0.0, + "learning_rate": 6.048229494610599e-06, + "logits/chosen": 533608448.0, + "logits/rejected": 1226311082.6666667, + "logps/chosen": -409.2414306640625, + "logps/rejected": -811.4964192708334, + "loss": 0.0256, + "rewards/chosen": 3.2501148223876952, + "rewards/margins": 13.562614186604819, + "rewards/rejected": -10.312499364217123, + "step": 4749 + }, + { + "epoch": 0.43398812243033347, + "grad_norm": 10.625, + "kl": 0.0, + "learning_rate": 6.046823603134309e-06, + "logits/chosen": 292098346.6666667, + "logits/rejected": 669530265.6, + "logps/chosen": -209.20418294270834, + "logps/rejected": -394.623974609375, + "loss": 0.0645, + "rewards/chosen": 3.9143091837565103, + "rewards/margins": 11.678585306803384, + "rewards/rejected": -7.764276123046875, + "step": 4750 + }, + { + "epoch": 0.43407948835084514, + "grad_norm": 2.171875, + "kl": 0.0, + "learning_rate": 6.045417625095407e-06, + "logits/chosen": 465774250.6666667, + "logits/rejected": 504465408.0, + "logps/chosen": -335.53163655598956, + "logps/rejected": -520.98388671875, + "loss": 0.0205, + "rewards/chosen": 3.9458643595377603, + "rewards/margins": 12.867111841837565, + "rewards/rejected": -8.921247482299805, + "step": 4751 + }, + { + "epoch": 0.43417085427135677, + "grad_norm": 1.296875, + "kl": 0.0, + "learning_rate": 6.044011560610154e-06, + "logits/chosen": 1941943978.6666667, + "logits/rejected": 783950233.6, + "logps/chosen": -539.2881673177084, + "logps/rejected": -553.03388671875, + "loss": 0.0076, + "rewards/chosen": 4.325013478597005, + "rewards/margins": 14.479253896077473, + "rewards/rejected": -10.15424041748047, + "step": 4752 + }, + { + "epoch": 0.43426222019186844, + "grad_norm": 1.6328125, + "kl": 0.0, + "learning_rate": 6.042605409794816e-06, + "logits/chosen": 655453798.4, + "logits/rejected": 411948202.6666667, + "logps/chosen": -360.1626953125, + "logps/rejected": -339.843994140625, + "loss": 0.0113, + "rewards/chosen": 4.2586925506591795, + "rewards/margins": 11.964754231770833, + "rewards/rejected": -7.706061681111653, + "step": 4753 + }, + { + "epoch": 0.43435358611238006, + "grad_norm": 1.1484375, + "kl": 0.0, + "learning_rate": 6.041199172765673e-06, + "logits/chosen": 654278208.0, + "logits/rejected": 911671466.6666666, + "logps/chosen": -239.64642333984375, + "logps/rejected": -487.5375162760417, + "loss": 0.007, + "rewards/chosen": 3.6528372764587402, + "rewards/margins": 12.17283328374227, + "rewards/rejected": -8.51999600728353, + "step": 4754 + }, + { + "epoch": 0.43444495203289174, + "grad_norm": 2.84375, + "kl": 0.0, + "learning_rate": 6.039792849639003e-06, + "logits/chosen": 469438890.6666667, + "logits/rejected": 620529305.6, + "logps/chosen": -396.084228515625, + "logps/rejected": -531.43759765625, + "loss": 0.0127, + "rewards/chosen": 4.180509567260742, + "rewards/margins": 12.82081871032715, + "rewards/rejected": -8.640309143066407, + "step": 4755 + }, + { + "epoch": 0.43453631795340336, + "grad_norm": 3.0, + "kl": 0.0, + "learning_rate": 6.0383864405311e-06, + "logits/chosen": 531401472.0, + "logits/rejected": 456136192.0, + "logps/chosen": -382.9422607421875, + "logps/rejected": -477.1494954427083, + "loss": 0.0149, + "rewards/chosen": 2.832416534423828, + "rewards/margins": 12.213062286376953, + "rewards/rejected": -9.380645751953125, + "step": 4756 + }, + { + "epoch": 0.43462768387391504, + "grad_norm": 3.109375, + "kl": 0.0, + "learning_rate": 6.036979945558258e-06, + "logits/chosen": 712656064.0, + "logits/rejected": 358079488.0, + "logps/chosen": -494.78912353515625, + "logps/rejected": -273.8092346191406, + "loss": 0.0191, + "rewards/chosen": 3.8815865516662598, + "rewards/margins": 10.440235137939453, + "rewards/rejected": -6.558648586273193, + "step": 4757 + }, + { + "epoch": 0.43471904979442666, + "grad_norm": 4.15625, + "kl": 0.0, + "learning_rate": 6.035573364836783e-06, + "logits/chosen": 604717465.6, + "logits/rejected": 404017322.6666667, + "logps/chosen": -305.314501953125, + "logps/rejected": -543.583984375, + "loss": 0.0384, + "rewards/chosen": 2.9279455184936523, + "rewards/margins": 12.875557009379069, + "rewards/rejected": -9.947611490885416, + "step": 4758 + }, + { + "epoch": 0.43481041571493834, + "grad_norm": 1.7578125, + "kl": 0.0, + "learning_rate": 6.034166698482984e-06, + "logits/chosen": 217781888.0, + "logits/rejected": 444149248.0, + "logps/chosen": -216.1980183919271, + "logps/rejected": -471.6302734375, + "loss": 0.0101, + "rewards/chosen": 4.000211079915364, + "rewards/margins": 13.523308308919269, + "rewards/rejected": -9.523097229003906, + "step": 4759 + }, + { + "epoch": 0.43490178163544996, + "grad_norm": 1.2890625, + "kl": 0.0, + "learning_rate": 6.032759946613183e-06, + "logits/chosen": 623632256.0, + "logits/rejected": 930699776.0, + "logps/chosen": -212.2069854736328, + "logps/rejected": -413.99560546875, + "loss": 0.006, + "rewards/chosen": 3.8151140213012695, + "rewards/margins": 12.721216519673666, + "rewards/rejected": -8.906102498372396, + "step": 4760 + }, + { + "epoch": 0.43499314755596163, + "grad_norm": 66.0, + "kl": 0.0, + "learning_rate": 6.031353109343702e-06, + "logits/chosen": 666269056.0, + "logits/rejected": 631987498.6666666, + "logps/chosen": -425.82684326171875, + "logps/rejected": -480.7640787760417, + "loss": 0.0511, + "rewards/chosen": 3.0530900955200195, + "rewards/margins": 11.21589438120524, + "rewards/rejected": -8.16280428568522, + "step": 4761 + }, + { + "epoch": 0.43508451347647326, + "grad_norm": 14.0, + "kl": 0.0, + "learning_rate": 6.029946186790875e-06, + "logits/chosen": 557567488.0, + "logits/rejected": 644397568.0, + "logps/chosen": -259.69525146484375, + "logps/rejected": -525.6553955078125, + "loss": 0.0825, + "rewards/chosen": 3.2296156883239746, + "rewards/margins": 9.8500337600708, + "rewards/rejected": -6.620418071746826, + "step": 4762 + }, + { + "epoch": 0.43517587939698493, + "grad_norm": 4.09375, + "kl": 0.0, + "learning_rate": 6.0285391790710405e-06, + "logits/chosen": 1035695001.6, + "logits/rejected": 771153664.0, + "logps/chosen": -419.79794921875, + "logps/rejected": -283.078125, + "loss": 0.0313, + "rewards/chosen": 3.064390182495117, + "rewards/margins": 11.95077896118164, + "rewards/rejected": -8.886388778686523, + "step": 4763 + }, + { + "epoch": 0.43526724531749655, + "grad_norm": 4.03125, + "kl": 0.0, + "learning_rate": 6.027132086300546e-06, + "logits/chosen": 727281322.6666666, + "logits/rejected": 350775449.6, + "logps/chosen": -443.5859375, + "logps/rejected": -352.522705078125, + "loss": 0.0153, + "rewards/chosen": 3.4338115056355796, + "rewards/margins": 14.81784699757894, + "rewards/rejected": -11.38403549194336, + "step": 4764 + }, + { + "epoch": 0.43535861123800823, + "grad_norm": 1.765625, + "kl": 0.0, + "learning_rate": 6.025724908595744e-06, + "logits/chosen": 306539264.0, + "logits/rejected": 600014189.7142857, + "logps/chosen": -222.670654296875, + "logps/rejected": -433.3841029575893, + "loss": 0.0064, + "rewards/chosen": 2.978318929672241, + "rewards/margins": 11.92349192074367, + "rewards/rejected": -8.945172991071429, + "step": 4765 + }, + { + "epoch": 0.43544997715851985, + "grad_norm": 2.71875, + "kl": 0.0, + "learning_rate": 6.0243176460729955e-06, + "logits/chosen": 686599850.6666666, + "logits/rejected": 525718220.8, + "logps/chosen": -402.5002034505208, + "logps/rejected": -489.7126953125, + "loss": 0.0183, + "rewards/chosen": 3.0979830423990884, + "rewards/margins": 12.261134592692057, + "rewards/rejected": -9.16315155029297, + "step": 4766 + }, + { + "epoch": 0.43554134307903153, + "grad_norm": 3.46875, + "kl": 0.0, + "learning_rate": 6.022910298848669e-06, + "logits/chosen": 344748416.0, + "logits/rejected": 450357760.0, + "logps/chosen": -310.656494140625, + "logps/rejected": -637.63330078125, + "loss": 0.0226, + "rewards/chosen": 2.8885498046875, + "rewards/margins": 12.11374282836914, + "rewards/rejected": -9.22519302368164, + "step": 4767 + }, + { + "epoch": 0.43563270899954315, + "grad_norm": 0.6640625, + "kl": 0.0, + "learning_rate": 6.021502867039138e-06, + "logits/chosen": 524416896.0, + "logits/rejected": 476290336.0, + "logps/chosen": -339.03363037109375, + "logps/rejected": -574.2489013671875, + "loss": 0.0041, + "rewards/chosen": 4.892927169799805, + "rewards/margins": 16.732481002807617, + "rewards/rejected": -11.839553833007812, + "step": 4768 + }, + { + "epoch": 0.4357240749200548, + "grad_norm": 2.40625, + "kl": 0.0, + "learning_rate": 6.020095350760784e-06, + "logits/chosen": 642921088.0, + "logits/rejected": 478271040.0, + "logps/chosen": -280.0689392089844, + "logps/rejected": -690.9483032226562, + "loss": 0.0172, + "rewards/chosen": 3.65018367767334, + "rewards/margins": 13.604395866394043, + "rewards/rejected": -9.954212188720703, + "step": 4769 + }, + { + "epoch": 0.43581544084056645, + "grad_norm": 2.90625, + "kl": 0.0, + "learning_rate": 6.018687750129997e-06, + "logits/chosen": 791243584.0, + "logits/rejected": 551217749.3333334, + "logps/chosen": -239.14657592773438, + "logps/rejected": -512.3266194661459, + "loss": 0.0154, + "rewards/chosen": 2.8053817749023438, + "rewards/margins": 13.199552536010742, + "rewards/rejected": -10.394170761108398, + "step": 4770 + }, + { + "epoch": 0.4359068067610781, + "grad_norm": 3.34375, + "kl": 0.0, + "learning_rate": 6.0172800652631706e-06, + "logits/chosen": 470491776.0, + "logits/rejected": 653399859.2, + "logps/chosen": -192.0814208984375, + "logps/rejected": -455.16318359375, + "loss": 0.1243, + "rewards/chosen": 1.433563232421875, + "rewards/margins": 10.114548492431641, + "rewards/rejected": -8.680985260009766, + "step": 4771 + }, + { + "epoch": 0.43599817268158975, + "grad_norm": 10.375, + "kl": 0.0, + "learning_rate": 6.015872296276707e-06, + "logits/chosen": 803744341.3333334, + "logits/rejected": 460029542.4, + "logps/chosen": -221.7115478515625, + "logps/rejected": -404.10009765625, + "loss": 0.0567, + "rewards/chosen": 2.5486604372660318, + "rewards/margins": 11.51944595972697, + "rewards/rejected": -8.970785522460938, + "step": 4772 + }, + { + "epoch": 0.4360895386021014, + "grad_norm": 44.75, + "kl": 0.0, + "learning_rate": 6.0144644432870184e-06, + "logits/chosen": 512971161.6, + "logits/rejected": 863111850.6666666, + "logps/chosen": -301.3361083984375, + "logps/rejected": -593.3230794270834, + "loss": 0.0346, + "rewards/chosen": 3.981366729736328, + "rewards/margins": 11.686062749226888, + "rewards/rejected": -7.70469601949056, + "step": 4773 + }, + { + "epoch": 0.43618090452261304, + "grad_norm": 3.390625, + "kl": 0.0, + "learning_rate": 6.013056506410518e-06, + "logits/chosen": 510557952.0, + "logits/rejected": 459472512.0, + "logps/chosen": -322.00860595703125, + "logps/rejected": -338.016357421875, + "loss": 0.0229, + "rewards/chosen": 3.856156826019287, + "rewards/margins": 11.825790882110596, + "rewards/rejected": -7.969634056091309, + "step": 4774 + }, + { + "epoch": 0.4362722704431247, + "grad_norm": 4.25, + "kl": 0.0, + "learning_rate": 6.011648485763633e-06, + "logits/chosen": 940749619.2, + "logits/rejected": 348410837.3333333, + "logps/chosen": -243.425439453125, + "logps/rejected": -490.9560546875, + "loss": 0.0323, + "rewards/chosen": 3.3211288452148438, + "rewards/margins": 13.684401194254557, + "rewards/rejected": -10.363272349039713, + "step": 4775 + }, + { + "epoch": 0.43636363636363634, + "grad_norm": 2.078125, + "kl": 0.0, + "learning_rate": 6.010240381462789e-06, + "logits/chosen": 635575168.0, + "logits/rejected": 568038070.8571428, + "logps/chosen": -169.7364044189453, + "logps/rejected": -537.4468470982143, + "loss": 0.009, + "rewards/chosen": 2.5652847290039062, + "rewards/margins": 12.279741559709821, + "rewards/rejected": -9.714456830705915, + "step": 4776 + }, + { + "epoch": 0.436455002284148, + "grad_norm": 3.0625, + "kl": 0.0, + "learning_rate": 6.008832193624426e-06, + "logits/chosen": 559100800.0, + "logits/rejected": 836384000.0, + "logps/chosen": -524.683837890625, + "logps/rejected": -732.3740234375, + "loss": 0.0159, + "rewards/chosen": 3.639380693435669, + "rewards/margins": 14.033307313919067, + "rewards/rejected": -10.393926620483398, + "step": 4777 + }, + { + "epoch": 0.43654636820465964, + "grad_norm": 3.90625, + "kl": 0.0, + "learning_rate": 6.007423922364988e-06, + "logits/chosen": 1068482730.6666666, + "logits/rejected": 607337267.2, + "logps/chosen": -555.3709309895834, + "logps/rejected": -461.678173828125, + "loss": 0.0209, + "rewards/chosen": 2.87186590830485, + "rewards/margins": 11.001183382670083, + "rewards/rejected": -8.129317474365234, + "step": 4778 + }, + { + "epoch": 0.4366377341251713, + "grad_norm": 3.125, + "kl": 0.0, + "learning_rate": 6.006015567800927e-06, + "logits/chosen": 1012421504.0, + "logits/rejected": 1015416768.0, + "logps/chosen": -229.69432067871094, + "logps/rejected": -585.507080078125, + "loss": 0.0292, + "rewards/chosen": 3.0701799392700195, + "rewards/margins": 12.523426055908203, + "rewards/rejected": -9.453246116638184, + "step": 4779 + }, + { + "epoch": 0.43672910004568294, + "grad_norm": 3.796875, + "kl": 0.0, + "learning_rate": 6.004607130048697e-06, + "logits/chosen": 433651370.6666667, + "logits/rejected": 322984601.6, + "logps/chosen": -351.7093098958333, + "logps/rejected": -400.6822265625, + "loss": 0.0259, + "rewards/chosen": 2.886593818664551, + "rewards/margins": 11.647017860412598, + "rewards/rejected": -8.760424041748047, + "step": 4780 + }, + { + "epoch": 0.4368204659661946, + "grad_norm": 5.4375, + "kl": 0.0, + "learning_rate": 6.0031986092247686e-06, + "logits/chosen": 447269888.0, + "logits/rejected": 395764992.0, + "logps/chosen": -351.5177001953125, + "logps/rejected": -504.7069905598958, + "loss": 0.0305, + "rewards/chosen": 3.322043609619141, + "rewards/margins": 12.76706746419271, + "rewards/rejected": -9.445023854573568, + "step": 4781 + }, + { + "epoch": 0.43691183188670624, + "grad_norm": 8.5625, + "kl": 0.0, + "learning_rate": 6.001790005445607e-06, + "logits/chosen": 599625472.0, + "logits/rejected": 338118118.4, + "logps/chosen": -340.4086100260417, + "logps/rejected": -366.74921875, + "loss": 0.0715, + "rewards/chosen": 2.8486769994099936, + "rewards/margins": 10.433018430074057, + "rewards/rejected": -7.584341430664063, + "step": 4782 + }, + { + "epoch": 0.4370031978072179, + "grad_norm": 3.75, + "kl": 0.0, + "learning_rate": 6.000381318827695e-06, + "logits/chosen": 696673024.0, + "logits/rejected": 743752768.0, + "logps/chosen": -195.72152709960938, + "logps/rejected": -571.1241455078125, + "loss": 0.0425, + "rewards/chosen": 2.9835996627807617, + "rewards/margins": 13.025187492370605, + "rewards/rejected": -10.041587829589844, + "step": 4783 + }, + { + "epoch": 0.43709456372772953, + "grad_norm": 3.265625, + "kl": 0.0, + "learning_rate": 5.9989725494875165e-06, + "logits/chosen": 454056243.2, + "logits/rejected": 547018538.6666666, + "logps/chosen": -198.01534423828124, + "logps/rejected": -426.1305338541667, + "loss": 0.0232, + "rewards/chosen": 3.5747169494628905, + "rewards/margins": 11.898636881510416, + "rewards/rejected": -8.323919932047525, + "step": 4784 + }, + { + "epoch": 0.4371859296482412, + "grad_norm": 4.1875, + "kl": 0.0, + "learning_rate": 5.997563697541563e-06, + "logits/chosen": 791890944.0, + "logits/rejected": 296291942.4, + "logps/chosen": -380.5064290364583, + "logps/rejected": -394.8585693359375, + "loss": 0.024, + "rewards/chosen": 2.711761474609375, + "rewards/margins": 13.23369140625, + "rewards/rejected": -10.521929931640624, + "step": 4785 + }, + { + "epoch": 0.43727729556875283, + "grad_norm": 0.46484375, + "kl": 0.0, + "learning_rate": 5.9961547631063354e-06, + "logits/chosen": 702514176.0, + "logits/rejected": 554014902.8571428, + "logps/chosen": -426.2069091796875, + "logps/rejected": -360.88773018973217, + "loss": 0.0024, + "rewards/chosen": 4.24965238571167, + "rewards/margins": 12.667851107461113, + "rewards/rejected": -8.418198721749443, + "step": 4786 + }, + { + "epoch": 0.4373686614892645, + "grad_norm": 2.265625, + "kl": 0.0, + "learning_rate": 5.994745746298339e-06, + "logits/chosen": 984576426.6666666, + "logits/rejected": 359012864.0, + "logps/chosen": -408.4662272135417, + "logps/rejected": -370.1929931640625, + "loss": 0.014, + "rewards/chosen": 3.285267194112142, + "rewards/margins": 11.853679974873861, + "rewards/rejected": -8.568412780761719, + "step": 4787 + }, + { + "epoch": 0.43746002740977613, + "grad_norm": 2.5625, + "kl": 0.0, + "learning_rate": 5.993336647234085e-06, + "logits/chosen": 647118400.0, + "logits/rejected": 358641920.0, + "logps/chosen": -235.5767822265625, + "logps/rejected": -395.3544108072917, + "loss": 0.0112, + "rewards/chosen": 3.6443305015563965, + "rewards/margins": 11.841898759206137, + "rewards/rejected": -8.19756825764974, + "step": 4788 + }, + { + "epoch": 0.4375513933302878, + "grad_norm": 2.984375, + "kl": 0.0, + "learning_rate": 5.991927466030093e-06, + "logits/chosen": 526703456.0, + "logits/rejected": 481031058.28571427, + "logps/chosen": -254.86170959472656, + "logps/rejected": -393.5061732700893, + "loss": 0.0123, + "rewards/chosen": 2.255070447921753, + "rewards/margins": 10.935425519943237, + "rewards/rejected": -8.680355072021484, + "step": 4789 + }, + { + "epoch": 0.43764275925079943, + "grad_norm": 1.8515625, + "kl": 0.0, + "learning_rate": 5.990518202802891e-06, + "logits/chosen": 817782464.0, + "logits/rejected": 371455786.6666667, + "logps/chosen": -387.25592041015625, + "logps/rejected": -394.5096028645833, + "loss": 0.0097, + "rewards/chosen": 3.319744825363159, + "rewards/margins": 11.213229576746624, + "rewards/rejected": -7.893484751383464, + "step": 4790 + }, + { + "epoch": 0.4377341251713111, + "grad_norm": 1.6875, + "kl": 0.0, + "learning_rate": 5.9891088576690106e-06, + "logits/chosen": 573913728.0, + "logits/rejected": 414785312.0, + "logps/chosen": -324.06927490234375, + "logps/rejected": -459.7337951660156, + "loss": 0.0114, + "rewards/chosen": 4.02414608001709, + "rewards/margins": 13.281462669372559, + "rewards/rejected": -9.257316589355469, + "step": 4791 + }, + { + "epoch": 0.4378254910918227, + "grad_norm": 2.015625, + "kl": 0.0, + "learning_rate": 5.987699430744992e-06, + "logits/chosen": 430311552.0, + "logits/rejected": 386416576.0, + "logps/chosen": -318.53271484375, + "logps/rejected": -498.38848876953125, + "loss": 0.0115, + "rewards/chosen": 4.114405632019043, + "rewards/margins": 12.319378852844238, + "rewards/rejected": -8.204973220825195, + "step": 4792 + }, + { + "epoch": 0.4379168570123344, + "grad_norm": 2.609375, + "kl": 0.0, + "learning_rate": 5.9862899221473814e-06, + "logits/chosen": 563051050.6666666, + "logits/rejected": 513155072.0, + "logps/chosen": -278.6888834635417, + "logps/rejected": -490.96904296875, + "loss": 0.0158, + "rewards/chosen": 3.653285344441732, + "rewards/margins": 12.495241673787435, + "rewards/rejected": -8.841956329345702, + "step": 4793 + }, + { + "epoch": 0.438008222932846, + "grad_norm": 2.34375, + "kl": 0.0, + "learning_rate": 5.984880331992734e-06, + "logits/chosen": 582169088.0, + "logits/rejected": 789242316.8, + "logps/chosen": -207.1648966471354, + "logps/rejected": -465.990283203125, + "loss": 0.0108, + "rewards/chosen": 4.4593550364176435, + "rewards/margins": 11.319991938273112, + "rewards/rejected": -6.860636901855469, + "step": 4794 + }, + { + "epoch": 0.4380995888533577, + "grad_norm": 2.71875, + "kl": 0.0, + "learning_rate": 5.983470660397607e-06, + "logits/chosen": 421986218.6666667, + "logits/rejected": 393156147.2, + "logps/chosen": -181.384765625, + "logps/rejected": -689.236474609375, + "loss": 0.1386, + "rewards/chosen": 0.5404570897420248, + "rewards/margins": 10.711771329243978, + "rewards/rejected": -10.171314239501953, + "step": 4795 + }, + { + "epoch": 0.4381909547738693, + "grad_norm": 2.609375, + "kl": 0.0, + "learning_rate": 5.982060907478568e-06, + "logits/chosen": 814922069.3333334, + "logits/rejected": 481692096.0, + "logps/chosen": -307.7733561197917, + "logps/rejected": -515.7400512695312, + "loss": 0.0176, + "rewards/chosen": 3.9736105600992837, + "rewards/margins": 13.110817591349283, + "rewards/rejected": -9.13720703125, + "step": 4796 + }, + { + "epoch": 0.438282320694381, + "grad_norm": 1.9375, + "kl": 0.0, + "learning_rate": 5.980651073352191e-06, + "logits/chosen": 369957568.0, + "logits/rejected": 699827840.0, + "logps/chosen": -364.87322998046875, + "logps/rejected": -574.801025390625, + "loss": 0.0119, + "rewards/chosen": 4.156486988067627, + "rewards/margins": 13.40561056137085, + "rewards/rejected": -9.249123573303223, + "step": 4797 + }, + { + "epoch": 0.4383736866148926, + "grad_norm": 3.40625, + "kl": 0.0, + "learning_rate": 5.979241158135058e-06, + "logits/chosen": 458808217.6, + "logits/rejected": 557438976.0, + "logps/chosen": -255.70654296875, + "logps/rejected": -598.0071614583334, + "loss": 0.0305, + "rewards/chosen": 3.7092025756835936, + "rewards/margins": 17.994925944010415, + "rewards/rejected": -14.285723368326822, + "step": 4798 + }, + { + "epoch": 0.4384650525354043, + "grad_norm": 1.7421875, + "kl": 0.0, + "learning_rate": 5.977831161943751e-06, + "logits/chosen": 342072896.0, + "logits/rejected": 810320128.0, + "logps/chosen": -237.89486694335938, + "logps/rejected": -599.8119303385416, + "loss": 0.0054, + "rewards/chosen": 4.352200984954834, + "rewards/margins": 14.015515804290771, + "rewards/rejected": -9.663314819335938, + "step": 4799 + }, + { + "epoch": 0.4385564184559159, + "grad_norm": 3.234375, + "kl": 0.0, + "learning_rate": 5.976421084894869e-06, + "logits/chosen": 525163008.0, + "logits/rejected": 399750432.0, + "logps/chosen": -375.5152282714844, + "logps/rejected": -530.4891357421875, + "loss": 0.0222, + "rewards/chosen": 3.1231658458709717, + "rewards/margins": 13.29344916343689, + "rewards/rejected": -10.170283317565918, + "step": 4800 + }, + { + "epoch": 0.4386477843764276, + "grad_norm": 6.6875, + "kl": 0.0, + "learning_rate": 5.975010927105009e-06, + "logits/chosen": 443663360.0, + "logits/rejected": 383175168.0, + "logps/chosen": -197.264501953125, + "logps/rejected": -436.76220703125, + "loss": 0.0441, + "rewards/chosen": 3.2845767974853515, + "rewards/margins": 12.519602839152018, + "rewards/rejected": -9.235026041666666, + "step": 4801 + }, + { + "epoch": 0.4387391502969392, + "grad_norm": 1.75, + "kl": 0.0, + "learning_rate": 5.973600688690781e-06, + "logits/chosen": 652243200.0, + "logits/rejected": 710706322.2857143, + "logps/chosen": -522.1695556640625, + "logps/rejected": -462.9920131138393, + "loss": 0.0059, + "rewards/chosen": 3.9943604469299316, + "rewards/margins": 12.175931862422399, + "rewards/rejected": -8.181571415492467, + "step": 4802 + }, + { + "epoch": 0.4388305162174509, + "grad_norm": 2.84375, + "kl": 0.0, + "learning_rate": 5.9721903697687945e-06, + "logits/chosen": 540623872.0, + "logits/rejected": 437960288.0, + "logps/chosen": -353.2235107421875, + "logps/rejected": -529.3270263671875, + "loss": 0.0154, + "rewards/chosen": 3.721780300140381, + "rewards/margins": 13.529787540435791, + "rewards/rejected": -9.80800724029541, + "step": 4803 + }, + { + "epoch": 0.4389218821379625, + "grad_norm": 15.4375, + "kl": 0.0, + "learning_rate": 5.970779970455672e-06, + "logits/chosen": 576663893.3333334, + "logits/rejected": 267849088.0, + "logps/chosen": -413.5257975260417, + "logps/rejected": -369.66571044921875, + "loss": 0.1168, + "rewards/chosen": 3.114969571431478, + "rewards/margins": 11.343809445699057, + "rewards/rejected": -8.228839874267578, + "step": 4804 + }, + { + "epoch": 0.4390132480584742, + "grad_norm": 3.125, + "kl": 0.0, + "learning_rate": 5.969369490868042e-06, + "logits/chosen": 581729877.3333334, + "logits/rejected": 324324064.0, + "logps/chosen": -436.0007731119792, + "logps/rejected": -442.46490478515625, + "loss": 0.0215, + "rewards/chosen": 3.9031473795572915, + "rewards/margins": 13.947260538736979, + "rewards/rejected": -10.044113159179688, + "step": 4805 + }, + { + "epoch": 0.4391046139789858, + "grad_norm": 4.90625, + "kl": 0.0, + "learning_rate": 5.967958931122535e-06, + "logits/chosen": 477544874.6666667, + "logits/rejected": 423120025.6, + "logps/chosen": -410.52783203125, + "logps/rejected": -461.278173828125, + "loss": 0.0253, + "rewards/chosen": 2.7394164403279624, + "rewards/margins": 12.399879391988119, + "rewards/rejected": -9.660462951660156, + "step": 4806 + }, + { + "epoch": 0.4391959798994975, + "grad_norm": 3.78125, + "kl": 1.390838623046875, + "learning_rate": 5.966548291335793e-06, + "logits/chosen": 585796608.0, + "logits/rejected": 258678496.0, + "logps/chosen": -452.117919921875, + "logps/rejected": -598.5076904296875, + "loss": 0.0277, + "rewards/chosen": 3.787611552647182, + "rewards/margins": 12.185817309788295, + "rewards/rejected": -8.398205757141113, + "step": 4807 + }, + { + "epoch": 0.4392873458200091, + "grad_norm": 10.9375, + "kl": 0.0, + "learning_rate": 5.965137571624463e-06, + "logits/chosen": 601877888.0, + "logits/rejected": 522459221.3333333, + "logps/chosen": -307.38079833984375, + "logps/rejected": -471.692138671875, + "loss": 0.0891, + "rewards/chosen": 3.25007700920105, + "rewards/margins": 11.731706698735556, + "rewards/rejected": -8.481629689534506, + "step": 4808 + }, + { + "epoch": 0.4393787117405208, + "grad_norm": 11.8125, + "kl": 0.0, + "learning_rate": 5.963726772105198e-06, + "logits/chosen": 427247146.6666667, + "logits/rejected": 436688486.4, + "logps/chosen": -268.5802408854167, + "logps/rejected": -524.88388671875, + "loss": 0.0654, + "rewards/chosen": 5.658178965250651, + "rewards/margins": 13.126668039957682, + "rewards/rejected": -7.468489074707032, + "step": 4809 + }, + { + "epoch": 0.4394700776610324, + "grad_norm": 2.4375, + "kl": 0.0, + "learning_rate": 5.9623158928946575e-06, + "logits/chosen": 348417280.0, + "logits/rejected": 416105045.3333333, + "logps/chosen": -305.88385009765625, + "logps/rejected": -527.7578125, + "loss": 0.0096, + "rewards/chosen": 3.2597336769104004, + "rewards/margins": 13.868558088938395, + "rewards/rejected": -10.608824412027994, + "step": 4810 + }, + { + "epoch": 0.4395614435815441, + "grad_norm": 1.875, + "kl": 0.0, + "learning_rate": 5.960904934109511e-06, + "logits/chosen": 254861418.66666666, + "logits/rejected": 540239974.4, + "logps/chosen": -294.00233968098956, + "logps/rejected": -660.586376953125, + "loss": 0.0092, + "rewards/chosen": 3.87136173248291, + "rewards/margins": 13.211385536193848, + "rewards/rejected": -9.340023803710938, + "step": 4811 + }, + { + "epoch": 0.4396528095020557, + "grad_norm": 0.84765625, + "kl": 0.0, + "learning_rate": 5.9594938958664265e-06, + "logits/chosen": 583745792.0, + "logits/rejected": 510298922.6666667, + "logps/chosen": -311.979248046875, + "logps/rejected": -515.8531901041666, + "loss": 0.0043, + "rewards/chosen": 4.275870323181152, + "rewards/margins": 12.8071502049764, + "rewards/rejected": -8.531279881795248, + "step": 4812 + }, + { + "epoch": 0.4397441754225674, + "grad_norm": 3.90625, + "kl": 0.0, + "learning_rate": 5.958082778282088e-06, + "logits/chosen": 763201536.0, + "logits/rejected": 346362624.0, + "logps/chosen": -284.2584716796875, + "logps/rejected": -515.631103515625, + "loss": 0.0264, + "rewards/chosen": 3.3573287963867187, + "rewards/margins": 14.019113922119141, + "rewards/rejected": -10.661785125732422, + "step": 4813 + }, + { + "epoch": 0.439835541343079, + "grad_norm": 1.5703125, + "kl": 0.0, + "learning_rate": 5.95667158147318e-06, + "logits/chosen": 276861184.0, + "logits/rejected": 421909760.0, + "logps/chosen": -108.50679016113281, + "logps/rejected": -448.9791259765625, + "loss": 0.0089, + "rewards/chosen": 3.9455809593200684, + "rewards/margins": 13.135742982228598, + "rewards/rejected": -9.19016202290853, + "step": 4814 + }, + { + "epoch": 0.4399269072635907, + "grad_norm": 1.0078125, + "kl": 0.0, + "learning_rate": 5.955260305556396e-06, + "logits/chosen": 359118421.3333333, + "logits/rejected": 754971852.8, + "logps/chosen": -112.63803100585938, + "logps/rejected": -430.442578125, + "loss": 0.0065, + "rewards/chosen": 4.191034317016602, + "rewards/margins": 14.147136306762695, + "rewards/rejected": -9.956101989746093, + "step": 4815 + }, + { + "epoch": 0.4400182731841023, + "grad_norm": 4.5, + "kl": 0.0, + "learning_rate": 5.9538489506484355e-06, + "logits/chosen": 548522368.0, + "logits/rejected": 552353344.0, + "logps/chosen": -449.20965576171875, + "logps/rejected": -400.2178039550781, + "loss": 0.0301, + "rewards/chosen": 2.990924835205078, + "rewards/margins": 12.138274192810059, + "rewards/rejected": -9.14734935760498, + "step": 4816 + }, + { + "epoch": 0.440109639104614, + "grad_norm": 6.0625, + "kl": 0.0, + "learning_rate": 5.952437516866006e-06, + "logits/chosen": 481877376.0, + "logits/rejected": 711613312.0, + "logps/chosen": -366.8071594238281, + "logps/rejected": -582.8518676757812, + "loss": 0.0285, + "rewards/chosen": 3.5425329208374023, + "rewards/margins": 13.728116989135742, + "rewards/rejected": -10.18558406829834, + "step": 4817 + }, + { + "epoch": 0.4402010050251256, + "grad_norm": 4.21875, + "kl": 0.0, + "learning_rate": 5.951026004325817e-06, + "logits/chosen": 555527616.0, + "logits/rejected": 454999008.0, + "logps/chosen": -343.98291015625, + "logps/rejected": -495.89373779296875, + "loss": 0.0308, + "rewards/chosen": 2.9563286304473877, + "rewards/margins": 11.901891946792603, + "rewards/rejected": -8.945563316345215, + "step": 4818 + }, + { + "epoch": 0.4402923709456373, + "grad_norm": 1.578125, + "kl": 0.0, + "learning_rate": 5.949614413144588e-06, + "logits/chosen": 287064780.8, + "logits/rejected": 380360960.0, + "logps/chosen": -249.493505859375, + "logps/rejected": -457.314208984375, + "loss": 0.0103, + "rewards/chosen": 4.452531814575195, + "rewards/margins": 15.451316197713217, + "rewards/rejected": -10.998784383138021, + "step": 4819 + }, + { + "epoch": 0.4403837368661489, + "grad_norm": 2.578125, + "kl": 0.0, + "learning_rate": 5.948202743439046e-06, + "logits/chosen": 704761856.0, + "logits/rejected": 467659093.3333333, + "logps/chosen": -254.9289306640625, + "logps/rejected": -471.7594807942708, + "loss": 0.0188, + "rewards/chosen": 3.665923309326172, + "rewards/margins": 12.624614588419597, + "rewards/rejected": -8.958691279093424, + "step": 4820 + }, + { + "epoch": 0.4404751027866606, + "grad_norm": 3.140625, + "kl": 0.0, + "learning_rate": 5.946790995325924e-06, + "logits/chosen": 1340143360.0, + "logits/rejected": 967638016.0, + "logps/chosen": -468.89312744140625, + "logps/rejected": -653.4057006835938, + "loss": 0.0167, + "rewards/chosen": 4.226054668426514, + "rewards/margins": 12.557258129119873, + "rewards/rejected": -8.33120346069336, + "step": 4821 + }, + { + "epoch": 0.4405664687071722, + "grad_norm": 3.5625, + "kl": 0.0, + "learning_rate": 5.945379168921956e-06, + "logits/chosen": 221345664.0, + "logits/rejected": 294195865.6, + "logps/chosen": -130.58477783203125, + "logps/rejected": -320.025146484375, + "loss": 0.0234, + "rewards/chosen": 3.146214803059896, + "rewards/margins": 11.832110341389974, + "rewards/rejected": -8.685895538330078, + "step": 4822 + }, + { + "epoch": 0.4406578346276839, + "grad_norm": 7.53125, + "kl": 0.0, + "learning_rate": 5.943967264343893e-06, + "logits/chosen": 1048590336.0, + "logits/rejected": 623202176.0, + "logps/chosen": -368.8959147135417, + "logps/rejected": -718.30322265625, + "loss": 0.0436, + "rewards/chosen": 3.910696029663086, + "rewards/margins": 13.75216007232666, + "rewards/rejected": -9.841464042663574, + "step": 4823 + }, + { + "epoch": 0.4407492005481955, + "grad_norm": 3.90625, + "kl": 0.0, + "learning_rate": 5.942555281708482e-06, + "logits/chosen": 668034624.0, + "logits/rejected": 767483648.0, + "logps/chosen": -390.0791015625, + "logps/rejected": -527.4088745117188, + "loss": 0.0206, + "rewards/chosen": 3.5068066120147705, + "rewards/margins": 14.33899712562561, + "rewards/rejected": -10.83219051361084, + "step": 4824 + }, + { + "epoch": 0.4408405664687072, + "grad_norm": 4.09375, + "kl": 0.0, + "learning_rate": 5.941143221132482e-06, + "logits/chosen": 622261376.0, + "logits/rejected": 522099264.0, + "logps/chosen": -259.44610595703125, + "logps/rejected": -445.85791015625, + "loss": 0.0299, + "rewards/chosen": 3.1732120513916016, + "rewards/margins": 12.595195770263672, + "rewards/rejected": -9.42198371887207, + "step": 4825 + }, + { + "epoch": 0.4409319323892188, + "grad_norm": 0.5546875, + "kl": 0.0, + "learning_rate": 5.939731082732657e-06, + "logits/chosen": 1278561280.0, + "logits/rejected": 590417203.2, + "logps/chosen": -292.27760823567706, + "logps/rejected": -413.207421875, + "loss": 0.0033, + "rewards/chosen": 5.16859753926595, + "rewards/margins": 14.236866124471028, + "rewards/rejected": -9.068268585205079, + "step": 4826 + }, + { + "epoch": 0.44102329830973047, + "grad_norm": 2.296875, + "kl": 0.0, + "learning_rate": 5.938318866625779e-06, + "logits/chosen": 689437312.0, + "logits/rejected": 465522624.0, + "logps/chosen": -505.9794006347656, + "logps/rejected": -505.2578125, + "loss": 0.0141, + "rewards/chosen": 3.6764698028564453, + "rewards/margins": 12.580657958984375, + "rewards/rejected": -8.90418815612793, + "step": 4827 + }, + { + "epoch": 0.4411146642302421, + "grad_norm": 6.5, + "kl": 0.0, + "learning_rate": 5.936906572928625e-06, + "logits/chosen": 341157888.0, + "logits/rejected": 490703462.4, + "logps/chosen": -301.2766927083333, + "logps/rejected": -471.52412109375, + "loss": 0.0341, + "rewards/chosen": 3.2732534408569336, + "rewards/margins": 13.49702205657959, + "rewards/rejected": -10.223768615722657, + "step": 4828 + }, + { + "epoch": 0.44120603015075377, + "grad_norm": 2.75, + "kl": 0.0, + "learning_rate": 5.935494201757976e-06, + "logits/chosen": 325000128.0, + "logits/rejected": 638037760.0, + "logps/chosen": -309.19573974609375, + "logps/rejected": -633.874755859375, + "loss": 0.0177, + "rewards/chosen": 3.563591480255127, + "rewards/margins": 12.968483448028564, + "rewards/rejected": -9.404891967773438, + "step": 4829 + }, + { + "epoch": 0.4412973960712654, + "grad_norm": 2.09375, + "kl": 0.0, + "learning_rate": 5.934081753230625e-06, + "logits/chosen": 495707648.0, + "logits/rejected": 851906048.0, + "logps/chosen": -373.6422932942708, + "logps/rejected": -762.503173828125, + "loss": 0.0138, + "rewards/chosen": 4.255771636962891, + "rewards/margins": 14.658875465393066, + "rewards/rejected": -10.403103828430176, + "step": 4830 + }, + { + "epoch": 0.44138876199177707, + "grad_norm": 1.796875, + "kl": 0.0, + "learning_rate": 5.932669227463368e-06, + "logits/chosen": 779967616.0, + "logits/rejected": 687346688.0, + "logps/chosen": -337.516845703125, + "logps/rejected": -388.2923889160156, + "loss": 0.0102, + "rewards/chosen": 4.279444694519043, + "rewards/margins": 13.196282386779785, + "rewards/rejected": -8.916837692260742, + "step": 4831 + }, + { + "epoch": 0.4414801279122887, + "grad_norm": 3.5625, + "kl": 0.0, + "learning_rate": 5.931256624573007e-06, + "logits/chosen": 587944384.0, + "logits/rejected": 618499584.0, + "logps/chosen": -330.72900390625, + "logps/rejected": -388.1677551269531, + "loss": 0.0249, + "rewards/chosen": 3.2527036666870117, + "rewards/margins": 10.416342735290527, + "rewards/rejected": -7.163639068603516, + "step": 4832 + }, + { + "epoch": 0.44157149383280037, + "grad_norm": 1.2578125, + "kl": 0.0, + "learning_rate": 5.929843944676351e-06, + "logits/chosen": 410234688.0, + "logits/rejected": 393342805.3333333, + "logps/chosen": -315.9169921875, + "logps/rejected": -459.4248046875, + "loss": 0.0075, + "rewards/chosen": 3.671574115753174, + "rewards/margins": 12.111635684967041, + "rewards/rejected": -8.440061569213867, + "step": 4833 + }, + { + "epoch": 0.441662859753312, + "grad_norm": 1.7421875, + "kl": 0.0, + "learning_rate": 5.928431187890217e-06, + "logits/chosen": 758908928.0, + "logits/rejected": 472787370.6666667, + "logps/chosen": -366.1195983886719, + "logps/rejected": -490.9095865885417, + "loss": 0.0102, + "rewards/chosen": 3.4496910572052, + "rewards/margins": 12.368855079015097, + "rewards/rejected": -8.919164021809896, + "step": 4834 + }, + { + "epoch": 0.44175422567382366, + "grad_norm": 49.5, + "kl": 0.0, + "learning_rate": 5.927018354331425e-06, + "logits/chosen": 450910720.0, + "logits/rejected": 604937216.0, + "logps/chosen": -255.30316162109375, + "logps/rejected": -422.4595947265625, + "loss": 0.0619, + "rewards/chosen": 2.9833056926727295, + "rewards/margins": 12.862780332565308, + "rewards/rejected": -9.879474639892578, + "step": 4835 + }, + { + "epoch": 0.4418455915943353, + "grad_norm": 3.40625, + "kl": 0.0, + "learning_rate": 5.925605444116805e-06, + "logits/chosen": 649764608.0, + "logits/rejected": 473797376.0, + "logps/chosen": -441.612060546875, + "logps/rejected": -606.8245239257812, + "loss": 0.0203, + "rewards/chosen": 3.196568250656128, + "rewards/margins": 14.057983160018921, + "rewards/rejected": -10.861414909362793, + "step": 4836 + }, + { + "epoch": 0.44193695751484696, + "grad_norm": 14.1875, + "kl": 0.0, + "learning_rate": 5.924192457363189e-06, + "logits/chosen": 663787776.0, + "logits/rejected": 520919040.0, + "logps/chosen": -268.87774658203125, + "logps/rejected": -303.43658447265625, + "loss": 0.0741, + "rewards/chosen": 3.7629504203796387, + "rewards/margins": 10.011143207550049, + "rewards/rejected": -6.24819278717041, + "step": 4837 + }, + { + "epoch": 0.44202832343535864, + "grad_norm": 2.28125, + "kl": 0.0, + "learning_rate": 5.922779394187422e-06, + "logits/chosen": 416847530.6666667, + "logits/rejected": 633140032.0, + "logps/chosen": -280.5659993489583, + "logps/rejected": -576.9013061523438, + "loss": 0.0146, + "rewards/chosen": 4.256518681844075, + "rewards/margins": 12.20631726582845, + "rewards/rejected": -7.949798583984375, + "step": 4838 + }, + { + "epoch": 0.44211968935587026, + "grad_norm": 39.25, + "kl": 0.0, + "learning_rate": 5.921366254706348e-06, + "logits/chosen": 1417117440.0, + "logits/rejected": 729490688.0, + "logps/chosen": -252.6140899658203, + "logps/rejected": -439.40594482421875, + "loss": 0.0775, + "rewards/chosen": 2.3588614463806152, + "rewards/margins": 12.005615711212158, + "rewards/rejected": -9.646754264831543, + "step": 4839 + }, + { + "epoch": 0.44221105527638194, + "grad_norm": 1.1875, + "kl": 0.0, + "learning_rate": 5.91995303903682e-06, + "logits/chosen": 334355285.3333333, + "logits/rejected": 272088064.0, + "logps/chosen": -215.8551025390625, + "logps/rejected": -344.39520263671875, + "loss": 0.0077, + "rewards/chosen": 4.8597103754679365, + "rewards/margins": 13.849308649698894, + "rewards/rejected": -8.989598274230957, + "step": 4840 + }, + { + "epoch": 0.44230242119689356, + "grad_norm": 2.296875, + "kl": 0.0, + "learning_rate": 5.9185397472957005e-06, + "logits/chosen": 434800736.0, + "logits/rejected": 358389056.0, + "logps/chosen": -289.08837890625, + "logps/rejected": -411.8046875, + "loss": 0.016, + "rewards/chosen": 3.8046398162841797, + "rewards/margins": 14.00117301940918, + "rewards/rejected": -10.196533203125, + "step": 4841 + }, + { + "epoch": 0.44239378711740523, + "grad_norm": 4.125, + "kl": 0.0, + "learning_rate": 5.917126379599853e-06, + "logits/chosen": 590407082.6666666, + "logits/rejected": 665786675.2, + "logps/chosen": -238.95939127604166, + "logps/rejected": -439.2673828125, + "loss": 0.0286, + "rewards/chosen": 2.5463587443033853, + "rewards/margins": 11.221090189615884, + "rewards/rejected": -8.6747314453125, + "step": 4842 + }, + { + "epoch": 0.44248515303791686, + "grad_norm": 3.65625, + "kl": 0.0, + "learning_rate": 5.9157129360661536e-06, + "logits/chosen": 803570176.0, + "logits/rejected": 479884224.0, + "logps/chosen": -603.0418701171875, + "logps/rejected": -489.441162109375, + "loss": 0.0161, + "rewards/chosen": 3.719926357269287, + "rewards/margins": 13.226111888885498, + "rewards/rejected": -9.506185531616211, + "step": 4843 + }, + { + "epoch": 0.44257651895842853, + "grad_norm": 2.1875, + "kl": 0.0, + "learning_rate": 5.914299416811476e-06, + "logits/chosen": 840513280.0, + "logits/rejected": 433394212.5714286, + "logps/chosen": -470.4871826171875, + "logps/rejected": -622.6739676339286, + "loss": 0.0063, + "rewards/chosen": 2.989694356918335, + "rewards/margins": 12.926235233034406, + "rewards/rejected": -9.936540876116071, + "step": 4844 + }, + { + "epoch": 0.44266788487894015, + "grad_norm": 1.1484375, + "kl": 0.0, + "learning_rate": 5.9128858219527095e-06, + "logits/chosen": 472684352.0, + "logits/rejected": 398491904.0, + "logps/chosen": -225.02684020996094, + "logps/rejected": -541.9517415364584, + "loss": 0.0072, + "rewards/chosen": 3.5476181507110596, + "rewards/margins": 13.692413250605265, + "rewards/rejected": -10.144795099894205, + "step": 4845 + }, + { + "epoch": 0.44275925079945183, + "grad_norm": 3.078125, + "kl": 0.0, + "learning_rate": 5.911472151606743e-06, + "logits/chosen": 776479808.0, + "logits/rejected": 490768512.0, + "logps/chosen": -434.2593688964844, + "logps/rejected": -652.8126831054688, + "loss": 0.0209, + "rewards/chosen": 3.194528102874756, + "rewards/margins": 14.04574728012085, + "rewards/rejected": -10.851219177246094, + "step": 4846 + }, + { + "epoch": 0.44285061671996345, + "grad_norm": 3.515625, + "kl": 0.0, + "learning_rate": 5.910058405890475e-06, + "logits/chosen": 483793817.6, + "logits/rejected": 636803242.6666666, + "logps/chosen": -217.4346435546875, + "logps/rejected": -316.53228759765625, + "loss": 0.0278, + "rewards/chosen": 3.298078918457031, + "rewards/margins": 12.59626490275065, + "rewards/rejected": -9.29818598429362, + "step": 4847 + }, + { + "epoch": 0.44294198264047513, + "grad_norm": 2.90625, + "kl": 0.0, + "learning_rate": 5.90864458492081e-06, + "logits/chosen": 452769408.0, + "logits/rejected": 426765696.0, + "logps/chosen": -204.49520874023438, + "logps/rejected": -342.4733581542969, + "loss": 0.1298, + "rewards/chosen": 2.0978801250457764, + "rewards/margins": 10.036190748214722, + "rewards/rejected": -7.938310623168945, + "step": 4848 + }, + { + "epoch": 0.44303334856098675, + "grad_norm": 2.078125, + "kl": 0.0, + "learning_rate": 5.907230688814655e-06, + "logits/chosen": 609577600.0, + "logits/rejected": 571050581.3333334, + "logps/chosen": -333.83673095703125, + "logps/rejected": -399.0069173177083, + "loss": 0.0099, + "rewards/chosen": 3.3650221824645996, + "rewards/margins": 12.381917476654053, + "rewards/rejected": -9.016895294189453, + "step": 4849 + }, + { + "epoch": 0.4431247144814984, + "grad_norm": 3.671875, + "kl": 0.0, + "learning_rate": 5.905816717688927e-06, + "logits/chosen": 511112874.6666667, + "logits/rejected": 1075620480.0, + "logps/chosen": -268.53220621744794, + "logps/rejected": -574.0726928710938, + "loss": 0.042, + "rewards/chosen": 3.194142977396647, + "rewards/margins": 12.11287752787272, + "rewards/rejected": -8.918734550476074, + "step": 4850 + }, + { + "epoch": 0.44321608040201005, + "grad_norm": 4.625, + "kl": 0.0, + "learning_rate": 5.904402671660551e-06, + "logits/chosen": 543719104.0, + "logits/rejected": 273684288.0, + "logps/chosen": -435.4229736328125, + "logps/rejected": -364.28741455078125, + "loss": 0.0305, + "rewards/chosen": 2.9177021980285645, + "rewards/margins": 11.910078525543213, + "rewards/rejected": -8.992376327514648, + "step": 4851 + }, + { + "epoch": 0.4433074463225217, + "grad_norm": 1.125, + "kl": 0.0, + "learning_rate": 5.902988550846451e-06, + "logits/chosen": 437517397.3333333, + "logits/rejected": 473811609.6, + "logps/chosen": -216.0020548502604, + "logps/rejected": -385.7152587890625, + "loss": 0.0074, + "rewards/chosen": 4.088003794352214, + "rewards/margins": 12.488252512613933, + "rewards/rejected": -8.400248718261718, + "step": 4852 + }, + { + "epoch": 0.44339881224303335, + "grad_norm": 5.3125, + "kl": 0.0, + "learning_rate": 5.901574355363567e-06, + "logits/chosen": 412957525.3333333, + "logits/rejected": 433352908.8, + "logps/chosen": -246.7957560221354, + "logps/rejected": -405.747412109375, + "loss": 0.0291, + "rewards/chosen": 3.2909088134765625, + "rewards/margins": 12.888157653808594, + "rewards/rejected": -9.597248840332032, + "step": 4853 + }, + { + "epoch": 0.443490178163545, + "grad_norm": 4.75, + "kl": 0.0, + "learning_rate": 5.900160085328836e-06, + "logits/chosen": 261892256.0, + "logits/rejected": 619067072.0, + "logps/chosen": -206.35720825195312, + "logps/rejected": -630.5106811523438, + "loss": 0.019, + "rewards/chosen": 4.146006107330322, + "rewards/margins": 13.1242356300354, + "rewards/rejected": -8.978229522705078, + "step": 4854 + }, + { + "epoch": 0.44358154408405664, + "grad_norm": 9.125, + "kl": 0.0, + "learning_rate": 5.898745740859204e-06, + "logits/chosen": 350287957.3333333, + "logits/rejected": 398121856.0, + "logps/chosen": -430.1507975260417, + "logps/rejected": -507.58173828125, + "loss": 0.0276, + "rewards/chosen": 2.878933906555176, + "rewards/margins": 12.725413703918457, + "rewards/rejected": -9.846479797363282, + "step": 4855 + }, + { + "epoch": 0.4436729100045683, + "grad_norm": 0.6640625, + "kl": 0.0, + "learning_rate": 5.897331322071629e-06, + "logits/chosen": 660220800.0, + "logits/rejected": 406344777.14285713, + "logps/chosen": -495.61083984375, + "logps/rejected": -608.3999720982143, + "loss": 0.0023, + "rewards/chosen": 4.21392822265625, + "rewards/margins": 15.216509137834821, + "rewards/rejected": -11.002580915178571, + "step": 4856 + }, + { + "epoch": 0.44376427592507994, + "grad_norm": 4.78125, + "kl": 0.0, + "learning_rate": 5.895916829083066e-06, + "logits/chosen": 455549388.8, + "logits/rejected": 494704426.6666667, + "logps/chosen": -339.94521484375, + "logps/rejected": -326.68988037109375, + "loss": 0.0223, + "rewards/chosen": 4.142810821533203, + "rewards/margins": 11.174782435099285, + "rewards/rejected": -7.031971613566081, + "step": 4857 + }, + { + "epoch": 0.4438556418455916, + "grad_norm": 4.90625, + "kl": 0.0, + "learning_rate": 5.894502262010483e-06, + "logits/chosen": 639196569.6, + "logits/rejected": 744695296.0, + "logps/chosen": -325.062109375, + "logps/rejected": -378.923828125, + "loss": 0.1199, + "rewards/chosen": 3.7916446685791017, + "rewards/margins": 9.430202738444011, + "rewards/rejected": -5.638558069864909, + "step": 4858 + }, + { + "epoch": 0.44394700776610324, + "grad_norm": 2.71875, + "kl": 0.0, + "learning_rate": 5.89308762097085e-06, + "logits/chosen": 857482188.8, + "logits/rejected": 717956693.3333334, + "logps/chosen": -427.70810546875, + "logps/rejected": -559.8865559895834, + "loss": 0.0138, + "rewards/chosen": 3.843312072753906, + "rewards/margins": 14.043034998575845, + "rewards/rejected": -10.19972292582194, + "step": 4859 + }, + { + "epoch": 0.4440383736866149, + "grad_norm": 2.3125, + "kl": 0.0, + "learning_rate": 5.891672906081146e-06, + "logits/chosen": 1109903872.0, + "logits/rejected": 656035840.0, + "logps/chosen": -460.0318298339844, + "logps/rejected": -610.22021484375, + "loss": 0.0064, + "rewards/chosen": 2.9674041271209717, + "rewards/margins": 12.157331364495414, + "rewards/rejected": -9.189927237374443, + "step": 4860 + }, + { + "epoch": 0.44412973960712654, + "grad_norm": 3.65625, + "kl": 0.0, + "learning_rate": 5.890258117458355e-06, + "logits/chosen": 599661363.2, + "logits/rejected": 1249885781.3333333, + "logps/chosen": -355.163427734375, + "logps/rejected": -787.076904296875, + "loss": 0.0132, + "rewards/chosen": 4.642981719970703, + "rewards/margins": 14.725567372639976, + "rewards/rejected": -10.082585652669271, + "step": 4861 + }, + { + "epoch": 0.4442211055276382, + "grad_norm": 51.25, + "kl": 0.0, + "learning_rate": 5.8888432552194655e-06, + "logits/chosen": 537253376.0, + "logits/rejected": 506840192.0, + "logps/chosen": -379.2316080729167, + "logps/rejected": -276.793212890625, + "loss": 0.0607, + "rewards/chosen": 3.680049260457357, + "rewards/margins": 13.366936047871908, + "rewards/rejected": -9.68688678741455, + "step": 4862 + }, + { + "epoch": 0.44431247144814984, + "grad_norm": 1.7734375, + "kl": 0.0, + "learning_rate": 5.887428319481473e-06, + "logits/chosen": 528609024.0, + "logits/rejected": 396571084.8, + "logps/chosen": -325.90879313151044, + "logps/rejected": -438.211376953125, + "loss": 0.0094, + "rewards/chosen": 4.441961288452148, + "rewards/margins": 12.542696762084962, + "rewards/rejected": -8.100735473632813, + "step": 4863 + }, + { + "epoch": 0.4444038373686615, + "grad_norm": 3.75, + "kl": 0.0, + "learning_rate": 5.886013310361383e-06, + "logits/chosen": 692209792.0, + "logits/rejected": 279434464.0, + "logps/chosen": -445.2331237792969, + "logps/rejected": -326.28070068359375, + "loss": 0.0218, + "rewards/chosen": 3.221669912338257, + "rewards/margins": 11.208671808242798, + "rewards/rejected": -7.987001895904541, + "step": 4864 + }, + { + "epoch": 0.44449520328917314, + "grad_norm": 1.4453125, + "kl": 0.0, + "learning_rate": 5.884598227976198e-06, + "logits/chosen": 555860160.0, + "logits/rejected": 503970889.14285713, + "logps/chosen": -268.7115478515625, + "logps/rejected": -548.2952008928571, + "loss": 0.0053, + "rewards/chosen": 3.1795411109924316, + "rewards/margins": 11.95843471799578, + "rewards/rejected": -8.778893607003349, + "step": 4865 + }, + { + "epoch": 0.4445865692096848, + "grad_norm": 34.75, + "kl": 0.0, + "learning_rate": 5.883183072442938e-06, + "logits/chosen": 551542656.0, + "logits/rejected": 1243565397.3333333, + "logps/chosen": -236.28414916992188, + "logps/rejected": -487.3560384114583, + "loss": 0.0795, + "rewards/chosen": 1.5176769495010376, + "rewards/margins": 10.456495563189188, + "rewards/rejected": -8.93881861368815, + "step": 4866 + }, + { + "epoch": 0.44467793513019643, + "grad_norm": 0.61328125, + "kl": 0.0, + "learning_rate": 5.881767843878619e-06, + "logits/chosen": 613122816.0, + "logits/rejected": 332996373.3333333, + "logps/chosen": -254.07203674316406, + "logps/rejected": -256.593994140625, + "loss": 0.0028, + "rewards/chosen": 5.539253234863281, + "rewards/margins": 13.006520589192707, + "rewards/rejected": -7.467267354329427, + "step": 4867 + }, + { + "epoch": 0.4447693010507081, + "grad_norm": 6.625, + "kl": 0.0, + "learning_rate": 5.880352542400272e-06, + "logits/chosen": 326975846.4, + "logits/rejected": 185654122.66666666, + "logps/chosen": -223.601953125, + "logps/rejected": -384.5381673177083, + "loss": 0.0448, + "rewards/chosen": 2.961730194091797, + "rewards/margins": 13.207804870605468, + "rewards/rejected": -10.246074676513672, + "step": 4868 + }, + { + "epoch": 0.44486066697121973, + "grad_norm": 1.3046875, + "kl": 0.0, + "learning_rate": 5.878937168124923e-06, + "logits/chosen": 807771776.0, + "logits/rejected": 862887850.6666666, + "logps/chosen": -469.83575439453125, + "logps/rejected": -638.730712890625, + "loss": 0.0049, + "rewards/chosen": 4.240286350250244, + "rewards/margins": 14.760532220204672, + "rewards/rejected": -10.520245869954428, + "step": 4869 + }, + { + "epoch": 0.4449520328917314, + "grad_norm": 1.8828125, + "kl": 0.0, + "learning_rate": 5.877521721169615e-06, + "logits/chosen": 438679347.2, + "logits/rejected": 279761408.0, + "logps/chosen": -274.86806640625, + "logps/rejected": -365.2286376953125, + "loss": 0.0126, + "rewards/chosen": 4.1053108215332035, + "rewards/margins": 13.2237730662028, + "rewards/rejected": -9.118462244669596, + "step": 4870 + }, + { + "epoch": 0.44504339881224303, + "grad_norm": 1.3359375, + "kl": 0.0, + "learning_rate": 5.876106201651392e-06, + "logits/chosen": 1247110485.3333333, + "logits/rejected": 645324748.8, + "logps/chosen": -451.2172444661458, + "logps/rejected": -581.6623046875, + "loss": 0.0079, + "rewards/chosen": 4.023897171020508, + "rewards/margins": 12.231557846069336, + "rewards/rejected": -8.207660675048828, + "step": 4871 + }, + { + "epoch": 0.4451347647327547, + "grad_norm": 5.84375, + "kl": 0.0, + "learning_rate": 5.874690609687303e-06, + "logits/chosen": 643432550.4, + "logits/rejected": 644602026.6666666, + "logps/chosen": -299.48994140625, + "logps/rejected": -557.9658203125, + "loss": 0.029, + "rewards/chosen": 3.243146514892578, + "rewards/margins": 10.627788798014324, + "rewards/rejected": -7.384642283121745, + "step": 4872 + }, + { + "epoch": 0.4452261306532663, + "grad_norm": 1.640625, + "kl": 0.0, + "learning_rate": 5.873274945394405e-06, + "logits/chosen": 492899200.0, + "logits/rejected": 442550323.2, + "logps/chosen": -275.7546793619792, + "logps/rejected": -277.66982421875, + "loss": 0.0085, + "rewards/chosen": 4.254673322041829, + "rewards/margins": 12.513939984639485, + "rewards/rejected": -8.259266662597657, + "step": 4873 + }, + { + "epoch": 0.445317496573778, + "grad_norm": 0.35546875, + "kl": 0.0, + "learning_rate": 5.871859208889759e-06, + "logits/chosen": 133443936.0, + "logits/rejected": 391049691.4285714, + "logps/chosen": -62.17791748046875, + "logps/rejected": -568.4769112723214, + "loss": 0.0032, + "rewards/chosen": 3.714895725250244, + "rewards/margins": 14.217537675585065, + "rewards/rejected": -10.502641950334821, + "step": 4874 + }, + { + "epoch": 0.4454088624942896, + "grad_norm": 3.5625, + "kl": 0.0, + "learning_rate": 5.870443400290436e-06, + "logits/chosen": 353602969.6, + "logits/rejected": 683809109.3333334, + "logps/chosen": -247.9315673828125, + "logps/rejected": -542.3072509765625, + "loss": 0.0256, + "rewards/chosen": 3.5637958526611326, + "rewards/margins": 13.55719388326009, + "rewards/rejected": -9.993398030598959, + "step": 4875 + }, + { + "epoch": 0.4455002284148013, + "grad_norm": 7.96875, + "kl": 0.0, + "learning_rate": 5.869027519713507e-06, + "logits/chosen": 505578784.0, + "logits/rejected": 695639168.0, + "logps/chosen": -322.9875793457031, + "logps/rejected": -516.4574788411459, + "loss": 0.0548, + "rewards/chosen": 2.8029356002807617, + "rewards/margins": 10.977234840393066, + "rewards/rejected": -8.174299240112305, + "step": 4876 + }, + { + "epoch": 0.4455915943353129, + "grad_norm": 0.87109375, + "kl": 0.0, + "learning_rate": 5.8676115672760545e-06, + "logits/chosen": 864765098.6666666, + "logits/rejected": 470150297.6, + "logps/chosen": -274.06553141276044, + "logps/rejected": -452.16533203125, + "loss": 0.0052, + "rewards/chosen": 4.88297176361084, + "rewards/margins": 14.950702476501466, + "rewards/rejected": -10.067730712890626, + "step": 4877 + }, + { + "epoch": 0.4456829602558246, + "grad_norm": 1.4609375, + "kl": 0.0, + "learning_rate": 5.8661955430951644e-06, + "logits/chosen": 709235840.0, + "logits/rejected": 820693824.0, + "logps/chosen": -432.1640625, + "logps/rejected": -567.18408203125, + "loss": 0.0099, + "rewards/chosen": 4.716673851013184, + "rewards/margins": 13.844243049621582, + "rewards/rejected": -9.127569198608398, + "step": 4878 + }, + { + "epoch": 0.4457743261763362, + "grad_norm": 0.037841796875, + "kl": 0.0, + "learning_rate": 5.864779447287928e-06, + "logits/rejected": 352132544.0, + "logps/rejected": -586.2421875, + "loss": 0.0001, + "rewards/rejected": -10.468146324157715, + "step": 4879 + }, + { + "epoch": 0.4458656920968479, + "grad_norm": 0.61328125, + "kl": 0.0, + "learning_rate": 5.863363279971443e-06, + "logits/chosen": 710365781.3333334, + "logits/rejected": 446622259.2, + "logps/chosen": -262.1731363932292, + "logps/rejected": -521.2009765625, + "loss": 0.0031, + "rewards/chosen": 4.8943297068278, + "rewards/margins": 13.919092432657877, + "rewards/rejected": -9.024762725830078, + "step": 4880 + }, + { + "epoch": 0.4459570580173595, + "grad_norm": 1.25, + "kl": 0.0, + "learning_rate": 5.861947041262815e-06, + "logits/chosen": 373838549.3333333, + "logits/rejected": 445689292.8, + "logps/chosen": -473.5853678385417, + "logps/rejected": -361.1521728515625, + "loss": 0.0051, + "rewards/chosen": 4.693945566813151, + "rewards/margins": 13.765155283610028, + "rewards/rejected": -9.071209716796876, + "step": 4881 + }, + { + "epoch": 0.4460484239378712, + "grad_norm": 4.46875, + "kl": 0.0, + "learning_rate": 5.860530731279151e-06, + "logits/chosen": 1189328691.2, + "logits/rejected": 402426624.0, + "logps/chosen": -436.961767578125, + "logps/rejected": -507.9471028645833, + "loss": 0.0244, + "rewards/chosen": 3.6019779205322267, + "rewards/margins": 16.96435127258301, + "rewards/rejected": -13.362373352050781, + "step": 4882 + }, + { + "epoch": 0.4461397898583828, + "grad_norm": 1.6484375, + "kl": 0.0, + "learning_rate": 5.859114350137571e-06, + "logits/chosen": 891505920.0, + "logits/rejected": 897322432.0, + "logps/chosen": -388.29986572265625, + "logps/rejected": -504.7374267578125, + "loss": 0.0089, + "rewards/chosen": 4.418592929840088, + "rewards/margins": 15.044665813446045, + "rewards/rejected": -10.626072883605957, + "step": 4883 + }, + { + "epoch": 0.4462311557788945, + "grad_norm": 1.96875, + "kl": 0.0, + "learning_rate": 5.8576978979551924e-06, + "logits/chosen": 346837696.0, + "logits/rejected": 570677760.0, + "logps/chosen": -359.5769856770833, + "logps/rejected": -617.632568359375, + "loss": 0.0115, + "rewards/chosen": 4.3828786214192705, + "rewards/margins": 13.473253568013508, + "rewards/rejected": -9.090374946594238, + "step": 4884 + }, + { + "epoch": 0.4463225216994061, + "grad_norm": 2.6875, + "kl": 0.0, + "learning_rate": 5.856281374849145e-06, + "logits/chosen": 718596928.0, + "logits/rejected": 438228320.0, + "logps/chosen": -390.4579772949219, + "logps/rejected": -514.3501586914062, + "loss": 0.0141, + "rewards/chosen": 3.8748679161071777, + "rewards/margins": 14.004234790802002, + "rewards/rejected": -10.129366874694824, + "step": 4885 + }, + { + "epoch": 0.4464138876199178, + "grad_norm": 4.0, + "kl": 0.0, + "learning_rate": 5.854864780936564e-06, + "logits/chosen": 1219654656.0, + "logits/rejected": 515137792.0, + "logps/chosen": -400.972900390625, + "logps/rejected": -523.84228515625, + "loss": 0.0194, + "rewards/chosen": 3.6173667907714844, + "rewards/margins": 11.356879234313965, + "rewards/rejected": -7.7395124435424805, + "step": 4886 + }, + { + "epoch": 0.4465052535404294, + "grad_norm": 25.25, + "kl": 0.0, + "learning_rate": 5.853448116334582e-06, + "logits/chosen": 414690517.3333333, + "logits/rejected": 193143184.0, + "logps/chosen": -384.02392578125, + "logps/rejected": -354.3119812011719, + "loss": 0.0593, + "rewards/chosen": 3.441523234049479, + "rewards/margins": 10.995742956797281, + "rewards/rejected": -7.554219722747803, + "step": 4887 + }, + { + "epoch": 0.4465966194609411, + "grad_norm": 3.4375, + "kl": 0.0, + "learning_rate": 5.85203138116035e-06, + "logits/chosen": 1002857813.3333334, + "logits/rejected": 563105228.8, + "logps/chosen": -483.2442626953125, + "logps/rejected": -496.647314453125, + "loss": 0.0171, + "rewards/chosen": 3.6763264338175454, + "rewards/margins": 13.05693556467692, + "rewards/rejected": -9.380609130859375, + "step": 4888 + }, + { + "epoch": 0.4466879853814527, + "grad_norm": 3.078125, + "kl": 0.0, + "learning_rate": 5.850614575531017e-06, + "logits/chosen": 853932236.8, + "logits/rejected": 682521941.3333334, + "logps/chosen": -477.3982421875, + "logps/rejected": -493.6802978515625, + "loss": 0.0212, + "rewards/chosen": 3.7683570861816404, + "rewards/margins": 10.8185120900472, + "rewards/rejected": -7.05015500386556, + "step": 4889 + }, + { + "epoch": 0.4467793513019644, + "grad_norm": 1.421875, + "kl": 0.0, + "learning_rate": 5.849197699563739e-06, + "logits/chosen": 852808064.0, + "logits/rejected": 522726368.0, + "logps/chosen": -324.60986328125, + "logps/rejected": -533.57080078125, + "loss": 0.0097, + "rewards/chosen": 4.3554816246032715, + "rewards/margins": 14.142943859100342, + "rewards/rejected": -9.78746223449707, + "step": 4890 + }, + { + "epoch": 0.446870717222476, + "grad_norm": 1.8515625, + "kl": 0.0, + "learning_rate": 5.84778075337568e-06, + "logits/chosen": 225428896.0, + "logits/rejected": 375091882.6666667, + "logps/chosen": -203.02645874023438, + "logps/rejected": -573.8463541666666, + "loss": 0.0098, + "rewards/chosen": 3.7016868591308594, + "rewards/margins": 14.033702850341797, + "rewards/rejected": -10.332015991210938, + "step": 4891 + }, + { + "epoch": 0.4469620831429877, + "grad_norm": 23.0, + "kl": 0.0, + "learning_rate": 5.846363737084008e-06, + "logits/chosen": 768820940.8, + "logits/rejected": 318291072.0, + "logps/chosen": -343.101123046875, + "logps/rejected": -399.189453125, + "loss": 0.1189, + "rewards/chosen": 3.2618434906005858, + "rewards/margins": 8.20006955464681, + "rewards/rejected": -4.938226064046224, + "step": 4892 + }, + { + "epoch": 0.4470534490634993, + "grad_norm": 3.21875, + "kl": 0.0, + "learning_rate": 5.844946650805896e-06, + "logits/chosen": 485846101.3333333, + "logits/rejected": 288749056.0, + "logps/chosen": -352.2237955729167, + "logps/rejected": -373.5891357421875, + "loss": 0.0176, + "rewards/chosen": 3.0231590270996094, + "rewards/margins": 12.333119964599609, + "rewards/rejected": -9.3099609375, + "step": 4893 + }, + { + "epoch": 0.447144814984011, + "grad_norm": 5.5625, + "kl": 0.0, + "learning_rate": 5.8435294946585254e-06, + "logits/chosen": 253587616.0, + "logits/rejected": 683018048.0, + "logps/chosen": -364.32867431640625, + "logps/rejected": -427.9400634765625, + "loss": 0.0181, + "rewards/chosen": 4.3253374099731445, + "rewards/margins": 13.156206130981445, + "rewards/rejected": -8.8308687210083, + "step": 4894 + }, + { + "epoch": 0.4472361809045226, + "grad_norm": 2.5, + "kl": 0.0, + "learning_rate": 5.842112268759079e-06, + "logits/chosen": 718824874.6666666, + "logits/rejected": 378976384.0, + "logps/chosen": -559.4680582682291, + "logps/rejected": -451.817529296875, + "loss": 0.0126, + "rewards/chosen": 3.4847803115844727, + "rewards/margins": 13.374360084533691, + "rewards/rejected": -9.889579772949219, + "step": 4895 + }, + { + "epoch": 0.4473275468250343, + "grad_norm": 1.609375, + "kl": 0.0, + "learning_rate": 5.840694973224752e-06, + "logits/chosen": 358622336.0, + "logits/rejected": 686455296.0, + "logps/chosen": -165.81643676757812, + "logps/rejected": -548.021484375, + "loss": 0.014, + "rewards/chosen": 3.438868522644043, + "rewards/margins": 11.525616264343261, + "rewards/rejected": -8.086747741699218, + "step": 4896 + }, + { + "epoch": 0.4474189127455459, + "grad_norm": 28.75, + "kl": 0.0, + "learning_rate": 5.839277608172739e-06, + "logits/chosen": 397215317.3333333, + "logits/rejected": 730951424.0, + "logps/chosen": -236.81241861979166, + "logps/rejected": -733.23974609375, + "loss": 0.1156, + "rewards/chosen": 3.4357872009277344, + "rewards/margins": 9.840325355529785, + "rewards/rejected": -6.404538154602051, + "step": 4897 + }, + { + "epoch": 0.4475102786660576, + "grad_norm": 2.078125, + "kl": 0.0, + "learning_rate": 5.837860173720245e-06, + "logits/chosen": 384008832.0, + "logits/rejected": 276480448.0, + "logps/chosen": -280.39141845703125, + "logps/rejected": -443.9347229003906, + "loss": 0.0182, + "rewards/chosen": 4.335728168487549, + "rewards/margins": 16.011026859283447, + "rewards/rejected": -11.675298690795898, + "step": 4898 + }, + { + "epoch": 0.4476016445865692, + "grad_norm": 3.03125, + "kl": 0.0, + "learning_rate": 5.836442669984477e-06, + "logits/chosen": 497009664.0, + "logits/rejected": 430181717.3333333, + "logps/chosen": -192.28921508789062, + "logps/rejected": -365.958984375, + "loss": 0.0218, + "rewards/chosen": 2.625197649002075, + "rewards/margins": 10.222870429356892, + "rewards/rejected": -7.597672780354817, + "step": 4899 + }, + { + "epoch": 0.4476930105070809, + "grad_norm": 17.0, + "kl": 0.0, + "learning_rate": 5.835025097082649e-06, + "logits/chosen": 247484437.33333334, + "logits/rejected": 501388902.4, + "logps/chosen": -208.6728515625, + "logps/rejected": -470.402490234375, + "loss": 0.0414, + "rewards/chosen": 4.388926823933919, + "rewards/margins": 12.098504765828451, + "rewards/rejected": -7.709577941894532, + "step": 4900 + }, + { + "epoch": 0.4477843764275925, + "grad_norm": 20.875, + "kl": 0.0, + "learning_rate": 5.833607455131983e-06, + "logits/chosen": 446995200.0, + "logits/rejected": 381457728.0, + "logps/chosen": -344.2025146484375, + "logps/rejected": -432.6322937011719, + "loss": 0.0229, + "rewards/chosen": 4.3323163986206055, + "rewards/margins": 11.983745574951172, + "rewards/rejected": -7.651429176330566, + "step": 4901 + }, + { + "epoch": 0.4478757423481042, + "grad_norm": 2.046875, + "kl": 0.0, + "learning_rate": 5.832189744249705e-06, + "logits/chosen": 541763072.0, + "logits/rejected": 838505574.4, + "logps/chosen": -247.7911173502604, + "logps/rejected": -400.272021484375, + "loss": 0.0145, + "rewards/chosen": 3.6922035217285156, + "rewards/margins": 11.89664077758789, + "rewards/rejected": -8.204437255859375, + "step": 4902 + }, + { + "epoch": 0.4479671082686158, + "grad_norm": 6.5, + "kl": 0.0, + "learning_rate": 5.830771964553044e-06, + "logits/chosen": 650039509.3333334, + "logits/rejected": 589703296.0, + "logps/chosen": -407.4332682291667, + "logps/rejected": -444.259765625, + "loss": 0.0458, + "rewards/chosen": 3.094390551249186, + "rewards/margins": 8.910112539927164, + "rewards/rejected": -5.8157219886779785, + "step": 4903 + }, + { + "epoch": 0.4480584741891275, + "grad_norm": 3.625, + "kl": 0.0, + "learning_rate": 5.829354116159242e-06, + "logits/chosen": 382025045.3333333, + "logits/rejected": 400090944.0, + "logps/chosen": -183.99466959635416, + "logps/rejected": -521.1746826171875, + "loss": 0.0413, + "rewards/chosen": 3.087953567504883, + "rewards/margins": 12.921043395996094, + "rewards/rejected": -9.833089828491211, + "step": 4904 + }, + { + "epoch": 0.4481498401096391, + "grad_norm": 2.984375, + "kl": 0.0, + "learning_rate": 5.827936199185534e-06, + "logits/chosen": 403134378.6666667, + "logits/rejected": 406487904.0, + "logps/chosen": -305.9296468098958, + "logps/rejected": -631.6724853515625, + "loss": 0.014, + "rewards/chosen": 4.624207814534505, + "rewards/margins": 13.734406789143879, + "rewards/rejected": -9.110198974609375, + "step": 4905 + }, + { + "epoch": 0.4482412060301508, + "grad_norm": 3.265625, + "kl": 0.0, + "learning_rate": 5.826518213749177e-06, + "logits/chosen": 396798668.8, + "logits/rejected": 503504042.6666667, + "logps/chosen": -213.642333984375, + "logps/rejected": -427.3785807291667, + "loss": 0.0227, + "rewards/chosen": 3.8162391662597654, + "rewards/margins": 11.294729359944661, + "rewards/rejected": -7.4784901936848955, + "step": 4906 + }, + { + "epoch": 0.4483325719506624, + "grad_norm": 0.66015625, + "kl": 0.0, + "learning_rate": 5.825100159967421e-06, + "logits/chosen": 258173653.33333334, + "logits/rejected": 314855910.4, + "logps/chosen": -258.8310139973958, + "logps/rejected": -445.86044921875, + "loss": 0.0031, + "rewards/chosen": 5.122914950052897, + "rewards/margins": 15.074715487162273, + "rewards/rejected": -9.951800537109374, + "step": 4907 + }, + { + "epoch": 0.44842393787117407, + "grad_norm": 1.96875, + "kl": 0.0, + "learning_rate": 5.823682037957524e-06, + "logits/chosen": 634510762.6666666, + "logits/rejected": 394265958.4, + "logps/chosen": -520.4466959635416, + "logps/rejected": -539.960302734375, + "loss": 0.0103, + "rewards/chosen": 3.694711367289225, + "rewards/margins": 14.92195364634196, + "rewards/rejected": -11.227242279052735, + "step": 4908 + }, + { + "epoch": 0.4485153037916857, + "grad_norm": 1.4296875, + "kl": 0.0, + "learning_rate": 5.822263847836757e-06, + "logits/chosen": 590584640.0, + "logits/rejected": 746504320.0, + "logps/chosen": -333.37713623046875, + "logps/rejected": -813.00341796875, + "loss": 0.0092, + "rewards/chosen": 4.684758186340332, + "rewards/margins": 13.22910213470459, + "rewards/rejected": -8.544343948364258, + "step": 4909 + }, + { + "epoch": 0.44860666971219737, + "grad_norm": 3.921875, + "kl": 0.0, + "learning_rate": 5.820845589722386e-06, + "logits/chosen": 386792089.6, + "logits/rejected": 281641472.0, + "logps/chosen": -291.964208984375, + "logps/rejected": -262.07375081380206, + "loss": 0.0293, + "rewards/chosen": 3.4130790710449217, + "rewards/margins": 9.916588592529298, + "rewards/rejected": -6.503509521484375, + "step": 4910 + }, + { + "epoch": 0.448698035632709, + "grad_norm": 1.9296875, + "kl": 0.0, + "learning_rate": 5.819427263731692e-06, + "logits/chosen": 514115104.0, + "logits/rejected": 648805824.0, + "logps/chosen": -216.70352172851562, + "logps/rejected": -471.3908386230469, + "loss": 0.0273, + "rewards/chosen": 3.1254584789276123, + "rewards/margins": 11.437111616134644, + "rewards/rejected": -8.311653137207031, + "step": 4911 + }, + { + "epoch": 0.44878940155322067, + "grad_norm": 7.3125, + "kl": 0.0, + "learning_rate": 5.8180088699819525e-06, + "logits/chosen": 710001749.3333334, + "logits/rejected": 948143104.0, + "logps/chosen": -351.2971598307292, + "logps/rejected": -360.5392761230469, + "loss": 0.0367, + "rewards/chosen": 3.8473536173502603, + "rewards/margins": 12.973487536112467, + "rewards/rejected": -9.126133918762207, + "step": 4912 + }, + { + "epoch": 0.4488807674737323, + "grad_norm": 2.640625, + "kl": 0.0, + "learning_rate": 5.816590408590461e-06, + "logits/chosen": 371164245.3333333, + "logits/rejected": 480360396.8, + "logps/chosen": -218.73274739583334, + "logps/rejected": -471.295751953125, + "loss": 0.0151, + "rewards/chosen": 3.7962611516316733, + "rewards/margins": 14.490227063496908, + "rewards/rejected": -10.693965911865234, + "step": 4913 + }, + { + "epoch": 0.44897213339424397, + "grad_norm": 2.25, + "kl": 0.0, + "learning_rate": 5.815171879674508e-06, + "logits/chosen": 279736992.0, + "logits/rejected": 632888704.0, + "logps/chosen": -232.74005126953125, + "logps/rejected": -475.6410217285156, + "loss": 0.0134, + "rewards/chosen": 3.9564366340637207, + "rewards/margins": 11.961602687835693, + "rewards/rejected": -8.005166053771973, + "step": 4914 + }, + { + "epoch": 0.4490634993147556, + "grad_norm": 2.0625, + "kl": 0.0, + "learning_rate": 5.813753283351394e-06, + "logits/chosen": 824174592.0, + "logits/rejected": 844533888.0, + "logps/chosen": -557.6737670898438, + "logps/rejected": -517.7051391601562, + "loss": 0.0097, + "rewards/chosen": 4.643714427947998, + "rewards/margins": 15.620242595672607, + "rewards/rejected": -10.97652816772461, + "step": 4915 + }, + { + "epoch": 0.44915486523526726, + "grad_norm": 2.703125, + "kl": 0.0, + "learning_rate": 5.812334619738422e-06, + "logits/chosen": 508143808.0, + "logits/rejected": 338721749.3333333, + "logps/chosen": -514.1700439453125, + "logps/rejected": -397.0213216145833, + "loss": 0.0122, + "rewards/chosen": 3.0290374755859375, + "rewards/margins": 11.844551086425781, + "rewards/rejected": -8.815513610839844, + "step": 4916 + }, + { + "epoch": 0.4492462311557789, + "grad_norm": 20.625, + "kl": 0.0, + "learning_rate": 5.810915888952905e-06, + "logits/chosen": 526971968.0, + "logits/rejected": 500452906.6666667, + "logps/chosen": -460.29022216796875, + "logps/rejected": -485.57275390625, + "loss": 0.0335, + "rewards/chosen": 2.500065803527832, + "rewards/margins": 12.024365425109863, + "rewards/rejected": -9.524299621582031, + "step": 4917 + }, + { + "epoch": 0.44933759707629056, + "grad_norm": 52.5, + "kl": 0.0, + "learning_rate": 5.809497091112155e-06, + "logits/chosen": 1076093440.0, + "logits/rejected": 643485952.0, + "logps/chosen": -432.8037109375, + "logps/rejected": -332.2548828125, + "loss": 0.1138, + "rewards/chosen": 3.252413749694824, + "rewards/margins": 6.7808990478515625, + "rewards/rejected": -3.5284852981567383, + "step": 4918 + }, + { + "epoch": 0.4494289629968022, + "grad_norm": 3.515625, + "kl": 0.0, + "learning_rate": 5.808078226333497e-06, + "logits/chosen": 421099776.0, + "logits/rejected": 650281984.0, + "logps/chosen": -265.0177307128906, + "logps/rejected": -528.8195190429688, + "loss": 0.019, + "rewards/chosen": 3.9554452896118164, + "rewards/margins": 13.8811674118042, + "rewards/rejected": -9.925722122192383, + "step": 4919 + }, + { + "epoch": 0.44952032891731386, + "grad_norm": 4.65625, + "kl": 0.0, + "learning_rate": 5.806659294734256e-06, + "logits/chosen": 482679808.0, + "logits/rejected": 441716544.0, + "logps/chosen": -331.8585205078125, + "logps/rejected": -632.3228759765625, + "loss": 0.0262, + "rewards/chosen": 3.2740445137023926, + "rewards/margins": 13.270058155059814, + "rewards/rejected": -9.996013641357422, + "step": 4920 + }, + { + "epoch": 0.4496116948378255, + "grad_norm": 2.8125, + "kl": 0.0, + "learning_rate": 5.805240296431765e-06, + "logits/chosen": 266936256.0, + "logits/rejected": 600985045.3333334, + "logps/chosen": -112.77851104736328, + "logps/rejected": -569.4254557291666, + "loss": 0.0266, + "rewards/chosen": 2.2213051319122314, + "rewards/margins": 13.795562028884888, + "rewards/rejected": -11.574256896972656, + "step": 4921 + }, + { + "epoch": 0.44970306075833716, + "grad_norm": 1.4921875, + "kl": 0.0, + "learning_rate": 5.803821231543363e-06, + "logits/chosen": 313048810.6666667, + "logits/rejected": 768870809.6, + "logps/chosen": -197.60738118489584, + "logps/rejected": -675.8619140625, + "loss": 0.0095, + "rewards/chosen": 4.160050710042317, + "rewards/margins": 12.769790395100912, + "rewards/rejected": -8.609739685058594, + "step": 4922 + }, + { + "epoch": 0.4497944266788488, + "grad_norm": 0.74609375, + "kl": 0.0, + "learning_rate": 5.802402100186393e-06, + "logits/chosen": 477440192.0, + "logits/rejected": 319407402.6666667, + "logps/chosen": -425.1847229003906, + "logps/rejected": -438.8623860677083, + "loss": 0.0041, + "rewards/chosen": 4.16562032699585, + "rewards/margins": 14.071420192718506, + "rewards/rejected": -9.905799865722656, + "step": 4923 + }, + { + "epoch": 0.44988579259936046, + "grad_norm": 1.359375, + "kl": 0.0, + "learning_rate": 5.8009829024782035e-06, + "logits/chosen": 601928064.0, + "logits/rejected": 376247680.0, + "logps/chosen": -348.89715576171875, + "logps/rejected": -525.0037027994791, + "loss": 0.0058, + "rewards/chosen": 3.8178482055664062, + "rewards/margins": 13.804656982421875, + "rewards/rejected": -9.986808776855469, + "step": 4924 + }, + { + "epoch": 0.4499771585198721, + "grad_norm": 2.828125, + "kl": 0.0, + "learning_rate": 5.799563638536149e-06, + "logits/chosen": 830117990.4, + "logits/rejected": 521378816.0, + "logps/chosen": -269.294140625, + "logps/rejected": -519.60498046875, + "loss": 0.0366, + "rewards/chosen": 3.2371936798095704, + "rewards/margins": 10.9260072072347, + "rewards/rejected": -7.68881352742513, + "step": 4925 + }, + { + "epoch": 0.45006852444038375, + "grad_norm": 6.6875, + "kl": 0.0, + "learning_rate": 5.79814430847759e-06, + "logits/chosen": 1078023808.0, + "logits/rejected": 561778496.0, + "logps/chosen": -315.9206848144531, + "logps/rejected": -390.29443359375, + "loss": 0.0323, + "rewards/chosen": 3.2381629943847656, + "rewards/margins": 12.648900985717773, + "rewards/rejected": -9.410737991333008, + "step": 4926 + }, + { + "epoch": 0.4501598903608954, + "grad_norm": 2.796875, + "kl": 0.0, + "learning_rate": 5.796724912419891e-06, + "logits/chosen": 536293760.0, + "logits/rejected": 372845728.0, + "logps/chosen": -219.77278645833334, + "logps/rejected": -355.52008056640625, + "loss": 0.0286, + "rewards/chosen": 3.595133145650228, + "rewards/margins": 12.493168195088705, + "rewards/rejected": -8.898035049438477, + "step": 4927 + }, + { + "epoch": 0.45025125628140705, + "grad_norm": 1.40625, + "kl": 0.0, + "learning_rate": 5.7953054504804255e-06, + "logits/chosen": 780489216.0, + "logits/rejected": 331131808.0, + "logps/chosen": -321.3285827636719, + "logps/rejected": -374.4612731933594, + "loss": 0.0111, + "rewards/chosen": 4.40004825592041, + "rewards/margins": 13.748578071594238, + "rewards/rejected": -9.348529815673828, + "step": 4928 + }, + { + "epoch": 0.4503426222019187, + "grad_norm": 3.453125, + "kl": 0.0, + "learning_rate": 5.793885922776565e-06, + "logits/chosen": 502018368.0, + "logits/rejected": 322646752.0, + "logps/chosen": -332.03033447265625, + "logps/rejected": -399.2918395996094, + "loss": 0.0187, + "rewards/chosen": 3.9975852966308594, + "rewards/margins": 12.949505805969238, + "rewards/rejected": -8.951920509338379, + "step": 4929 + }, + { + "epoch": 0.45043398812243035, + "grad_norm": 33.75, + "kl": 0.0, + "learning_rate": 5.792466329425697e-06, + "logits/chosen": 755722410.6666666, + "logits/rejected": 698482112.0, + "logps/chosen": -290.37339274088544, + "logps/rejected": -419.06414794921875, + "loss": 0.0722, + "rewards/chosen": 2.6686347325642905, + "rewards/margins": 9.642360051472982, + "rewards/rejected": -6.973725318908691, + "step": 4930 + }, + { + "epoch": 0.450525354042942, + "grad_norm": 1.9453125, + "kl": 0.0, + "learning_rate": 5.791046670545203e-06, + "logits/chosen": 956735658.6666666, + "logits/rejected": 570246809.6, + "logps/chosen": -280.9065755208333, + "logps/rejected": -317.443359375, + "loss": 0.0131, + "rewards/chosen": 3.7898642222086587, + "rewards/margins": 11.236951319376628, + "rewards/rejected": -7.447087097167969, + "step": 4931 + }, + { + "epoch": 0.45061671996345365, + "grad_norm": 4.03125, + "kl": 0.0, + "learning_rate": 5.78962694625248e-06, + "logits/chosen": 610615936.0, + "logits/rejected": 343423296.0, + "logps/chosen": -185.6077423095703, + "logps/rejected": -384.50738525390625, + "loss": 0.0244, + "rewards/chosen": 3.5989465713500977, + "rewards/margins": 11.111421585083008, + "rewards/rejected": -7.51247501373291, + "step": 4932 + }, + { + "epoch": 0.45070808588396527, + "grad_norm": 0.9375, + "kl": 0.0, + "learning_rate": 5.788207156664924e-06, + "logits/chosen": 462851276.8, + "logits/rejected": 380624981.3333333, + "logps/chosen": -293.206591796875, + "logps/rejected": -411.47802734375, + "loss": 0.0068, + "rewards/chosen": 4.642395782470703, + "rewards/margins": 14.070130793253579, + "rewards/rejected": -9.427735010782877, + "step": 4933 + }, + { + "epoch": 0.45079945180447695, + "grad_norm": 5.84375, + "kl": 0.0, + "learning_rate": 5.786787301899941e-06, + "logits/chosen": 514959360.0, + "logits/rejected": 973728597.3333334, + "logps/chosen": -142.76943359375, + "logps/rejected": -685.2921549479166, + "loss": 0.1443, + "rewards/chosen": 2.245359420776367, + "rewards/margins": 8.946050643920898, + "rewards/rejected": -6.700691223144531, + "step": 4934 + }, + { + "epoch": 0.45089081772498857, + "grad_norm": 2.921875, + "kl": 0.0, + "learning_rate": 5.7853673820749355e-06, + "logits/chosen": 630742464.0, + "logits/rejected": 456577024.0, + "logps/chosen": -548.8803100585938, + "logps/rejected": -700.3416341145834, + "loss": 0.0122, + "rewards/chosen": 3.089324951171875, + "rewards/margins": 13.313678741455078, + "rewards/rejected": -10.224353790283203, + "step": 4935 + }, + { + "epoch": 0.45098218364550025, + "grad_norm": 66.0, + "kl": 0.0, + "learning_rate": 5.783947397307326e-06, + "logits/chosen": 568060501.3333334, + "logits/rejected": 310168217.6, + "logps/chosen": -348.78271484375, + "logps/rejected": -359.5528564453125, + "loss": 0.0675, + "rewards/chosen": 3.5206273396809897, + "rewards/margins": 10.459814961751302, + "rewards/rejected": -6.939187622070312, + "step": 4936 + }, + { + "epoch": 0.45107354956601187, + "grad_norm": 4.625, + "kl": 0.0, + "learning_rate": 5.7825273477145285e-06, + "logits/chosen": 650473514.6666666, + "logits/rejected": 657422336.0, + "logps/chosen": -297.9150390625, + "logps/rejected": -625.7138671875, + "loss": 0.032, + "rewards/chosen": 3.6590449015299478, + "rewards/margins": 14.732680956522623, + "rewards/rejected": -11.073636054992676, + "step": 4937 + }, + { + "epoch": 0.45116491548652354, + "grad_norm": 2.03125, + "kl": 0.0, + "learning_rate": 5.78110723341397e-06, + "logits/chosen": 590909696.0, + "logits/rejected": 514861408.0, + "logps/chosen": -224.589111328125, + "logps/rejected": -461.24420166015625, + "loss": 0.0189, + "rewards/chosen": 3.7697633107503257, + "rewards/margins": 14.553378423055014, + "rewards/rejected": -10.783615112304688, + "step": 4938 + }, + { + "epoch": 0.45125628140703516, + "grad_norm": 2.328125, + "kl": 0.0, + "learning_rate": 5.779687054523082e-06, + "logits/chosen": 390850816.0, + "logits/rejected": 447123404.8, + "logps/chosen": -242.7730712890625, + "logps/rejected": -558.470654296875, + "loss": 0.0159, + "rewards/chosen": 3.1325642267862954, + "rewards/margins": 13.044945208231608, + "rewards/rejected": -9.912380981445313, + "step": 4939 + }, + { + "epoch": 0.45134764732754684, + "grad_norm": 1.8046875, + "kl": 0.0, + "learning_rate": 5.778266811159297e-06, + "logits/chosen": 476888320.0, + "logits/rejected": 697676288.0, + "logps/chosen": -361.6363118489583, + "logps/rejected": -567.34453125, + "loss": 0.0099, + "rewards/chosen": 4.287870407104492, + "rewards/margins": 13.69752540588379, + "rewards/rejected": -9.409654998779297, + "step": 4940 + }, + { + "epoch": 0.45143901324805846, + "grad_norm": 2.53125, + "kl": 0.0, + "learning_rate": 5.7768465034400575e-06, + "logits/chosen": 511120320.0, + "logits/rejected": 523689344.0, + "logps/chosen": -330.99383544921875, + "logps/rejected": -440.7930603027344, + "loss": 0.0173, + "rewards/chosen": 3.468018054962158, + "rewards/margins": 12.28572130203247, + "rewards/rejected": -8.817703247070312, + "step": 4941 + }, + { + "epoch": 0.45153037916857014, + "grad_norm": 2.390625, + "kl": 0.0, + "learning_rate": 5.775426131482811e-06, + "logits/chosen": 495911424.0, + "logits/rejected": 747750400.0, + "logps/chosen": -287.7563171386719, + "logps/rejected": -589.4246419270834, + "loss": 0.0134, + "rewards/chosen": 2.922738552093506, + "rewards/margins": 12.033750057220459, + "rewards/rejected": -9.111011505126953, + "step": 4942 + }, + { + "epoch": 0.45162174508908176, + "grad_norm": 23.25, + "kl": 0.0, + "learning_rate": 5.774005695405008e-06, + "logits/chosen": 599404330.6666666, + "logits/rejected": 714369689.6, + "logps/chosen": -484.8409830729167, + "logps/rejected": -368.67109375, + "loss": 0.022, + "rewards/chosen": 3.384585698445638, + "rewards/margins": 12.288260014851888, + "rewards/rejected": -8.90367431640625, + "step": 4943 + }, + { + "epoch": 0.45171311100959344, + "grad_norm": 3.1875, + "kl": 0.0, + "learning_rate": 5.772585195324104e-06, + "logits/chosen": 412568256.0, + "logits/rejected": 524230656.0, + "logps/chosen": -294.00213623046875, + "logps/rejected": -638.9038696289062, + "loss": 0.0169, + "rewards/chosen": 3.6474368572235107, + "rewards/margins": 15.631882429122925, + "rewards/rejected": -11.984445571899414, + "step": 4944 + }, + { + "epoch": 0.45180447693010506, + "grad_norm": 0.9375, + "kl": 0.0, + "learning_rate": 5.771164631357563e-06, + "logits/chosen": 354339541.3333333, + "logits/rejected": 431242598.4, + "logps/chosen": -222.07731119791666, + "logps/rejected": -369.8420166015625, + "loss": 0.0064, + "rewards/chosen": 4.4678160349528, + "rewards/margins": 12.750959650675455, + "rewards/rejected": -8.283143615722656, + "step": 4945 + }, + { + "epoch": 0.45189584285061674, + "grad_norm": 37.75, + "kl": 0.0, + "learning_rate": 5.769744003622852e-06, + "logits/chosen": 258476880.0, + "logits/rejected": 447811488.0, + "logps/chosen": -173.39576721191406, + "logps/rejected": -427.1263427734375, + "loss": 0.099, + "rewards/chosen": 3.057821750640869, + "rewards/margins": 12.119884014129639, + "rewards/rejected": -9.06206226348877, + "step": 4946 + }, + { + "epoch": 0.45198720877112836, + "grad_norm": 44.5, + "kl": 0.0, + "learning_rate": 5.768323312237444e-06, + "logits/chosen": 608732416.0, + "logits/rejected": 432571562.6666667, + "logps/chosen": -375.0275146484375, + "logps/rejected": -571.1922200520834, + "loss": 0.0933, + "rewards/chosen": 2.15602970123291, + "rewards/margins": 12.803692817687988, + "rewards/rejected": -10.647663116455078, + "step": 4947 + }, + { + "epoch": 0.45207857469164003, + "grad_norm": 1.1640625, + "kl": 0.0, + "learning_rate": 5.766902557318815e-06, + "logits/chosen": 1222996992.0, + "logits/rejected": 844983978.6666666, + "logps/chosen": -345.8154052734375, + "logps/rejected": -481.6549886067708, + "loss": 0.0057, + "rewards/chosen": 5.082927703857422, + "rewards/margins": 14.085704803466797, + "rewards/rejected": -9.002777099609375, + "step": 4948 + }, + { + "epoch": 0.45216994061215166, + "grad_norm": 3.140625, + "kl": 0.0, + "learning_rate": 5.765481738984453e-06, + "logits/chosen": 560040448.0, + "logits/rejected": 500822186.6666667, + "logps/chosen": -465.314794921875, + "logps/rejected": -390.1417643229167, + "loss": 0.022, + "rewards/chosen": 3.518640899658203, + "rewards/margins": 12.562304560343424, + "rewards/rejected": -9.04366366068522, + "step": 4949 + }, + { + "epoch": 0.45226130653266333, + "grad_norm": 4.1875, + "kl": 0.0, + "learning_rate": 5.764060857351841e-06, + "logits/chosen": 499127850.6666667, + "logits/rejected": 520603136.0, + "logps/chosen": -405.71337890625, + "logps/rejected": -628.00146484375, + "loss": 0.1286, + "rewards/chosen": 1.5021292368570964, + "rewards/margins": 11.681498591105143, + "rewards/rejected": -10.179369354248047, + "step": 4950 + }, + { + "epoch": 0.45235267245317495, + "grad_norm": 3.78125, + "kl": 0.0, + "learning_rate": 5.762639912538477e-06, + "logits/chosen": 916115148.8, + "logits/rejected": 794500352.0, + "logps/chosen": -230.18125, + "logps/rejected": -623.9136149088541, + "loss": 0.0326, + "rewards/chosen": 3.5604888916015627, + "rewards/margins": 13.469978841145835, + "rewards/rejected": -9.909489949544271, + "step": 4951 + }, + { + "epoch": 0.45244403837368663, + "grad_norm": 23.5, + "kl": 0.0, + "learning_rate": 5.761218904661856e-06, + "logits/chosen": 788830464.0, + "logits/rejected": 1011786752.0, + "logps/chosen": -426.506396484375, + "logps/rejected": -392.4645589192708, + "loss": 0.111, + "rewards/chosen": 2.3711374282836912, + "rewards/margins": 10.90670248667399, + "rewards/rejected": -8.535565058390299, + "step": 4952 + }, + { + "epoch": 0.45253540429419825, + "grad_norm": 3.734375, + "kl": 0.0, + "learning_rate": 5.759797833839486e-06, + "logits/chosen": 440300096.0, + "logits/rejected": 559137728.0, + "logps/chosen": -369.57525634765625, + "logps/rejected": -597.88818359375, + "loss": 0.0182, + "rewards/chosen": 3.980907440185547, + "rewards/margins": 15.379337310791016, + "rewards/rejected": -11.398429870605469, + "step": 4953 + }, + { + "epoch": 0.45262677021470993, + "grad_norm": 1.765625, + "kl": 0.0, + "learning_rate": 5.758376700188876e-06, + "logits/chosen": 1042912870.4, + "logits/rejected": 559410858.6666666, + "logps/chosen": -528.645166015625, + "logps/rejected": -339.78851318359375, + "loss": 0.0112, + "rewards/chosen": 4.4012397766113285, + "rewards/margins": 11.95715560913086, + "rewards/rejected": -7.555915832519531, + "step": 4954 + }, + { + "epoch": 0.45271813613522155, + "grad_norm": 2.234375, + "kl": 0.0, + "learning_rate": 5.7569555038275395e-06, + "logits/chosen": 1177810858.6666667, + "logits/rejected": 765924198.4, + "logps/chosen": -265.4492594401042, + "logps/rejected": -577.988232421875, + "loss": 0.0127, + "rewards/chosen": 3.558190027872721, + "rewards/margins": 12.917075220743815, + "rewards/rejected": -9.358885192871094, + "step": 4955 + }, + { + "epoch": 0.4528095020557332, + "grad_norm": 4.15625, + "kl": 0.0, + "learning_rate": 5.755534244872993e-06, + "logits/chosen": 1197109504.0, + "logits/rejected": 786600806.4, + "logps/chosen": -151.40594482421875, + "logps/rejected": -555.3509765625, + "loss": 0.0195, + "rewards/chosen": 3.9971323013305664, + "rewards/margins": 14.275502967834473, + "rewards/rejected": -10.278370666503907, + "step": 4956 + }, + { + "epoch": 0.45290086797624485, + "grad_norm": 14.5625, + "kl": 0.0, + "learning_rate": 5.754112923442767e-06, + "logits/chosen": 863254784.0, + "logits/rejected": 1123729024.0, + "logps/chosen": -311.3262939453125, + "logps/rejected": -520.3007202148438, + "loss": 0.0276, + "rewards/chosen": 3.1773829460144043, + "rewards/margins": 11.457021236419678, + "rewards/rejected": -8.279638290405273, + "step": 4957 + }, + { + "epoch": 0.4529922338967565, + "grad_norm": 5.5, + "kl": 0.0, + "learning_rate": 5.752691539654388e-06, + "logits/chosen": 561758566.4, + "logits/rejected": 422939605.3333333, + "logps/chosen": -310.501904296875, + "logps/rejected": -576.4478352864584, + "loss": 0.0429, + "rewards/chosen": 2.891316604614258, + "rewards/margins": 13.03975601196289, + "rewards/rejected": -10.148439407348633, + "step": 4958 + }, + { + "epoch": 0.45308359981726815, + "grad_norm": 2.25, + "kl": 0.8816184997558594, + "learning_rate": 5.751270093625394e-06, + "logits/chosen": 555147337.1428572, + "logits/rejected": 214499440.0, + "logps/chosen": -278.7308349609375, + "logps/rejected": -352.847900390625, + "loss": 0.0304, + "rewards/chosen": 4.029056004115513, + "rewards/margins": 13.89115469796317, + "rewards/rejected": -9.862098693847656, + "step": 4959 + }, + { + "epoch": 0.4531749657377798, + "grad_norm": 4.96875, + "kl": 0.0, + "learning_rate": 5.7498485854733245e-06, + "logits/chosen": 619538176.0, + "logits/rejected": 408697920.0, + "logps/chosen": -364.5516052246094, + "logps/rejected": -502.9990539550781, + "loss": 0.0328, + "rewards/chosen": 3.132453441619873, + "rewards/margins": 12.391309261322021, + "rewards/rejected": -9.258855819702148, + "step": 4960 + }, + { + "epoch": 0.45326633165829144, + "grad_norm": 2.859375, + "kl": 0.0, + "learning_rate": 5.7484270153157215e-06, + "logits/chosen": 630693973.3333334, + "logits/rejected": 711180185.6, + "logps/chosen": -351.298583984375, + "logps/rejected": -415.012060546875, + "loss": 0.1238, + "rewards/chosen": 3.2412001291910806, + "rewards/margins": 9.865332667032877, + "rewards/rejected": -6.624132537841797, + "step": 4961 + }, + { + "epoch": 0.4533576975788031, + "grad_norm": 0.828125, + "kl": 0.0, + "learning_rate": 5.747005383270141e-06, + "logits/chosen": 362618016.0, + "logits/rejected": 407220565.3333333, + "logps/chosen": -254.87619018554688, + "logps/rejected": -534.3366292317709, + "loss": 0.0034, + "rewards/chosen": 4.595189094543457, + "rewards/margins": 13.425177574157715, + "rewards/rejected": -8.829988479614258, + "step": 4962 + }, + { + "epoch": 0.45344906349931474, + "grad_norm": 1.5625, + "kl": 0.0, + "learning_rate": 5.745583689454135e-06, + "logits/chosen": 737660979.2, + "logits/rejected": 542713728.0, + "logps/chosen": -257.10712890625, + "logps/rejected": -592.831787109375, + "loss": 0.0129, + "rewards/chosen": 4.272158813476563, + "rewards/margins": 13.8614621480306, + "rewards/rejected": -9.589303334554037, + "step": 4963 + }, + { + "epoch": 0.4535404294198264, + "grad_norm": 1.1953125, + "kl": 0.0, + "learning_rate": 5.744161933985267e-06, + "logits/chosen": 557754432.0, + "logits/rejected": 492993024.0, + "logps/chosen": -426.93023681640625, + "logps/rejected": -380.7967122395833, + "loss": 0.0054, + "rewards/chosen": 4.005265712738037, + "rewards/margins": 11.386901378631592, + "rewards/rejected": -7.381635665893555, + "step": 4964 + }, + { + "epoch": 0.45363179534033804, + "grad_norm": 4.5625, + "kl": 0.0, + "learning_rate": 5.742740116981102e-06, + "logits/chosen": 416850858.6666667, + "logits/rejected": 613621350.4, + "logps/chosen": -331.61659749348956, + "logps/rejected": -540.244873046875, + "loss": 0.0171, + "rewards/chosen": 3.7735137939453125, + "rewards/margins": 12.67047576904297, + "rewards/rejected": -8.896961975097657, + "step": 4965 + }, + { + "epoch": 0.4537231612608497, + "grad_norm": 10.8125, + "kl": 0.0, + "learning_rate": 5.74131823855921e-06, + "logits/chosen": 703658496.0, + "logits/rejected": 567802453.3333334, + "logps/chosen": -351.4634765625, + "logps/rejected": -611.4142659505209, + "loss": 0.0937, + "rewards/chosen": 3.629891204833984, + "rewards/margins": 11.123034286499024, + "rewards/rejected": -7.493143081665039, + "step": 4966 + }, + { + "epoch": 0.45381452718136134, + "grad_norm": 26.375, + "kl": 0.0, + "learning_rate": 5.739896298837169e-06, + "logits/chosen": 368239616.0, + "logits/rejected": 550603904.0, + "logps/chosen": -160.35155232747397, + "logps/rejected": -360.46234130859375, + "loss": 0.0849, + "rewards/chosen": 3.0057290395100913, + "rewards/margins": 9.28632672627767, + "rewards/rejected": -6.280597686767578, + "step": 4967 + }, + { + "epoch": 0.453905893101873, + "grad_norm": 6.625, + "kl": 0.0, + "learning_rate": 5.738474297932559e-06, + "logits/chosen": 350849248.0, + "logits/rejected": 372976320.0, + "logps/chosen": -383.8675231933594, + "logps/rejected": -542.004150390625, + "loss": 0.0253, + "rewards/chosen": 3.337434768676758, + "rewards/margins": 12.732950210571289, + "rewards/rejected": -9.395515441894531, + "step": 4968 + }, + { + "epoch": 0.45399725902238464, + "grad_norm": 38.0, + "kl": 0.0, + "learning_rate": 5.7370522359629665e-06, + "logits/chosen": 1093099520.0, + "logits/rejected": 511075225.6, + "logps/chosen": -194.8881632486979, + "logps/rejected": -484.2794921875, + "loss": 0.0878, + "rewards/chosen": 2.1454097429911294, + "rewards/margins": 10.916933027903239, + "rewards/rejected": -8.77152328491211, + "step": 4969 + }, + { + "epoch": 0.4540886249428963, + "grad_norm": 3.796875, + "kl": 0.0, + "learning_rate": 5.735630113045985e-06, + "logits/chosen": 714194534.4, + "logits/rejected": 423590186.6666667, + "logps/chosen": -234.4654296875, + "logps/rejected": -342.1653645833333, + "loss": 0.0221, + "rewards/chosen": 4.572838973999024, + "rewards/margins": 14.137880833943687, + "rewards/rejected": -9.565041859944662, + "step": 4970 + }, + { + "epoch": 0.45417999086340793, + "grad_norm": 29.875, + "kl": 0.0, + "learning_rate": 5.734207929299206e-06, + "logits/chosen": 1049730752.0, + "logits/rejected": 424133034.6666667, + "logps/chosen": -166.13314819335938, + "logps/rejected": -372.4280598958333, + "loss": 0.1173, + "rewards/chosen": 3.759030342102051, + "rewards/margins": 9.314653714497883, + "rewards/rejected": -5.555623372395833, + "step": 4971 + }, + { + "epoch": 0.4542713567839196, + "grad_norm": 8.125, + "kl": 0.0, + "learning_rate": 5.732785684840235e-06, + "logits/chosen": 926348202.6666666, + "logits/rejected": 509889920.0, + "logps/chosen": -209.41581217447916, + "logps/rejected": -394.34063720703125, + "loss": 0.0603, + "rewards/chosen": 2.9624706904093423, + "rewards/margins": 11.901939074198404, + "rewards/rejected": -8.939468383789062, + "step": 4972 + }, + { + "epoch": 0.45436272270443123, + "grad_norm": 2.40625, + "kl": 0.0, + "learning_rate": 5.731363379786676e-06, + "logits/chosen": 540238336.0, + "logits/rejected": 820842752.0, + "logps/chosen": -363.99176025390625, + "logps/rejected": -544.8240356445312, + "loss": 0.0152, + "rewards/chosen": 3.6144652366638184, + "rewards/margins": 12.865548610687256, + "rewards/rejected": -9.251083374023438, + "step": 4973 + }, + { + "epoch": 0.4544540886249429, + "grad_norm": 3.40625, + "kl": 0.0, + "learning_rate": 5.729941014256144e-06, + "logits/chosen": 616948053.3333334, + "logits/rejected": 636862336.0, + "logps/chosen": -334.19036865234375, + "logps/rejected": -489.8641357421875, + "loss": 0.0281, + "rewards/chosen": 3.5834147135416665, + "rewards/margins": 11.799469629923502, + "rewards/rejected": -8.216054916381836, + "step": 4974 + }, + { + "epoch": 0.45454545454545453, + "grad_norm": 1.796875, + "kl": 0.0, + "learning_rate": 5.728518588366253e-06, + "logits/chosen": 847454208.0, + "logits/rejected": 712591104.0, + "logps/chosen": -433.4183756510417, + "logps/rejected": -647.69921875, + "loss": 0.0086, + "rewards/chosen": 4.8463090260823565, + "rewards/margins": 15.581658299763998, + "rewards/rejected": -10.73534927368164, + "step": 4975 + }, + { + "epoch": 0.4546368204659662, + "grad_norm": 4.25, + "kl": 0.0, + "learning_rate": 5.7270961022346224e-06, + "logits/chosen": 350176972.8, + "logits/rejected": 453725781.3333333, + "logps/chosen": -248.4465087890625, + "logps/rejected": -691.54833984375, + "loss": 0.0384, + "rewards/chosen": 3.2223960876464846, + "rewards/margins": 14.069569142659507, + "rewards/rejected": -10.847173055013021, + "step": 4976 + }, + { + "epoch": 0.45472818638647783, + "grad_norm": 3.703125, + "kl": 0.0, + "learning_rate": 5.725673555978884e-06, + "logits/chosen": 441909145.6, + "logits/rejected": 412224981.3333333, + "logps/chosen": -386.64384765625, + "logps/rejected": -390.725830078125, + "loss": 0.0248, + "rewards/chosen": 3.605915832519531, + "rewards/margins": 13.485544967651368, + "rewards/rejected": -9.879629135131836, + "step": 4977 + }, + { + "epoch": 0.4548195523069895, + "grad_norm": 0.9921875, + "kl": 0.0, + "learning_rate": 5.724250949716663e-06, + "logits/chosen": 627180096.0, + "logits/rejected": 434721376.0, + "logps/chosen": -369.3172607421875, + "logps/rejected": -431.70452880859375, + "loss": 0.0056, + "rewards/chosen": 4.656719207763672, + "rewards/margins": 13.04111099243164, + "rewards/rejected": -8.384391784667969, + "step": 4978 + }, + { + "epoch": 0.4549109182275011, + "grad_norm": 2.5625, + "kl": 0.0, + "learning_rate": 5.7228282835656e-06, + "logits/chosen": 509627443.2, + "logits/rejected": 440087040.0, + "logps/chosen": -288.43212890625, + "logps/rejected": -433.6447347005208, + "loss": 0.0216, + "rewards/chosen": 3.6400306701660154, + "rewards/margins": 11.923557408650716, + "rewards/rejected": -8.283526738484701, + "step": 4979 + }, + { + "epoch": 0.4550022841480128, + "grad_norm": 2.46875, + "kl": 0.0, + "learning_rate": 5.721405557643335e-06, + "logits/chosen": 391651520.0, + "logits/rejected": 479062848.0, + "logps/chosen": -209.12945556640625, + "logps/rejected": -496.6796875, + "loss": 0.0135, + "rewards/chosen": 4.107879161834717, + "rewards/margins": 12.292381763458252, + "rewards/rejected": -8.184502601623535, + "step": 4980 + }, + { + "epoch": 0.4550936500685244, + "grad_norm": 2.03125, + "kl": 0.0, + "learning_rate": 5.719982772067515e-06, + "logits/chosen": 657814208.0, + "logits/rejected": 321253152.0, + "logps/chosen": -219.22007751464844, + "logps/rejected": -465.444580078125, + "loss": 0.0169, + "rewards/chosen": 3.7091288566589355, + "rewards/margins": 12.368425846099854, + "rewards/rejected": -8.659296989440918, + "step": 4981 + }, + { + "epoch": 0.4551850159890361, + "grad_norm": 2.75, + "kl": 0.0, + "learning_rate": 5.71855992695579e-06, + "logits/chosen": 686635264.0, + "logits/rejected": 493033216.0, + "logps/chosen": -331.6197509765625, + "logps/rejected": -636.6360473632812, + "loss": 0.0326, + "rewards/chosen": 3.474762439727783, + "rewards/margins": 14.211649417877197, + "rewards/rejected": -10.736886978149414, + "step": 4982 + }, + { + "epoch": 0.4552763819095477, + "grad_norm": 4.71875, + "kl": 0.0, + "learning_rate": 5.7171370224258176e-06, + "logits/chosen": 487863296.0, + "logits/rejected": 393365162.6666667, + "logps/chosen": -401.880029296875, + "logps/rejected": -331.70574951171875, + "loss": 0.0317, + "rewards/chosen": 3.2763526916503904, + "rewards/margins": 11.439139811197915, + "rewards/rejected": -8.162787119547525, + "step": 4983 + }, + { + "epoch": 0.4553677478300594, + "grad_norm": 4.90625, + "kl": 0.0, + "learning_rate": 5.715714058595257e-06, + "logits/chosen": 604018739.2, + "logits/rejected": 574787157.3333334, + "logps/chosen": -263.8609375, + "logps/rejected": -491.736083984375, + "loss": 0.045, + "rewards/chosen": 2.9997182846069337, + "rewards/margins": 12.258353487650552, + "rewards/rejected": -9.25863520304362, + "step": 4984 + }, + { + "epoch": 0.455459113750571, + "grad_norm": 1.8671875, + "kl": 0.0, + "learning_rate": 5.714291035581774e-06, + "logits/chosen": 533135257.6, + "logits/rejected": 442536960.0, + "logps/chosen": -338.1359619140625, + "logps/rejected": -443.2655436197917, + "loss": 0.0104, + "rewards/chosen": 4.8103485107421875, + "rewards/margins": 13.843134562174479, + "rewards/rejected": -9.032786051432291, + "step": 4985 + }, + { + "epoch": 0.4555504796710827, + "grad_norm": 4.4375, + "kl": 0.0, + "learning_rate": 5.712867953503041e-06, + "logits/chosen": 438453504.0, + "logits/rejected": 505078357.3333333, + "logps/chosen": -346.164794921875, + "logps/rejected": -508.2379557291667, + "loss": 0.0159, + "rewards/chosen": 3.3364105224609375, + "rewards/margins": 12.170480092366537, + "rewards/rejected": -8.8340695699056, + "step": 4986 + }, + { + "epoch": 0.4556418455915943, + "grad_norm": 2.421875, + "kl": 0.0, + "learning_rate": 5.711444812476734e-06, + "logits/chosen": 544113280.0, + "logits/rejected": 299805248.0, + "logps/chosen": -406.30987548828125, + "logps/rejected": -382.85076904296875, + "loss": 0.0131, + "rewards/chosen": 3.7994017601013184, + "rewards/margins": 13.461150646209717, + "rewards/rejected": -9.661748886108398, + "step": 4987 + }, + { + "epoch": 0.455733211512106, + "grad_norm": 3.828125, + "kl": 0.0, + "learning_rate": 5.710021612620531e-06, + "logits/chosen": 813246208.0, + "logits/rejected": 665979072.0, + "logps/chosen": -218.19911193847656, + "logps/rejected": -411.5029296875, + "loss": 0.1106, + "rewards/chosen": 3.802791118621826, + "rewards/margins": 11.467248439788818, + "rewards/rejected": -7.664457321166992, + "step": 4988 + }, + { + "epoch": 0.4558245774326176, + "grad_norm": 30.25, + "kl": 0.0, + "learning_rate": 5.708598354052122e-06, + "logits/chosen": 1026708608.0, + "logits/rejected": 622392064.0, + "logps/chosen": -345.10107421875, + "logps/rejected": -438.4854431152344, + "loss": 0.106, + "rewards/chosen": 3.171271324157715, + "rewards/margins": 9.499754428863525, + "rewards/rejected": -6.3284831047058105, + "step": 4989 + }, + { + "epoch": 0.4559159433531293, + "grad_norm": 11.1875, + "kl": 0.0, + "learning_rate": 5.707175036889191e-06, + "logits/chosen": 514443072.0, + "logits/rejected": 533150368.0, + "logps/chosen": -298.3773193359375, + "logps/rejected": -474.6545104980469, + "loss": 0.1195, + "rewards/chosen": 3.1757242679595947, + "rewards/margins": 11.054055452346802, + "rewards/rejected": -7.878331184387207, + "step": 4990 + }, + { + "epoch": 0.4560073092736409, + "grad_norm": 3.296875, + "kl": 0.0, + "learning_rate": 5.705751661249439e-06, + "logits/chosen": 798643353.6, + "logits/rejected": 566261845.3333334, + "logps/chosen": -427.196142578125, + "logps/rejected": -699.515380859375, + "loss": 0.0201, + "rewards/chosen": 3.5915943145751954, + "rewards/margins": 14.163554763793945, + "rewards/rejected": -10.57196044921875, + "step": 4991 + }, + { + "epoch": 0.4560986751941526, + "grad_norm": 1.8203125, + "kl": 0.0, + "learning_rate": 5.704328227250563e-06, + "logits/chosen": 402698282.6666667, + "logits/rejected": 521584076.8, + "logps/chosen": -289.8900146484375, + "logps/rejected": -460.11982421875, + "loss": 0.0099, + "rewards/chosen": 4.020345687866211, + "rewards/margins": 12.952756881713867, + "rewards/rejected": -8.932411193847656, + "step": 4992 + }, + { + "epoch": 0.4561900411146642, + "grad_norm": 2.421875, + "kl": 0.0, + "learning_rate": 5.702904735010268e-06, + "logits/chosen": 383211904.0, + "logits/rejected": 424566272.0, + "logps/chosen": -205.4822235107422, + "logps/rejected": -711.10107421875, + "loss": 0.0182, + "rewards/chosen": 3.4322242736816406, + "rewards/margins": 13.956998825073242, + "rewards/rejected": -10.524774551391602, + "step": 4993 + }, + { + "epoch": 0.4562814070351759, + "grad_norm": 4.25, + "kl": 0.0, + "learning_rate": 5.701481184646265e-06, + "logits/chosen": 615400128.0, + "logits/rejected": 352187872.0, + "logps/chosen": -257.4197998046875, + "logps/rejected": -404.8031311035156, + "loss": 0.0274, + "rewards/chosen": 3.71616268157959, + "rewards/margins": 11.211793899536133, + "rewards/rejected": -7.495631217956543, + "step": 4994 + }, + { + "epoch": 0.4563727729556875, + "grad_norm": 0.388671875, + "kl": 0.0, + "learning_rate": 5.700057576276267e-06, + "logits/chosen": 670845184.0, + "logits/rejected": 445522090.6666667, + "logps/chosen": -599.8955078125, + "logps/rejected": -488.7993977864583, + "loss": 0.0014, + "rewards/chosen": 5.235721111297607, + "rewards/margins": 14.976591269175211, + "rewards/rejected": -9.740870157877604, + "step": 4995 + }, + { + "epoch": 0.4564641388761992, + "grad_norm": 6.15625, + "kl": 0.0, + "learning_rate": 5.698633910017993e-06, + "logits/chosen": 584030361.6, + "logits/rejected": 283687402.6666667, + "logps/chosen": -547.03447265625, + "logps/rejected": -232.787841796875, + "loss": 0.0244, + "rewards/chosen": 3.5000648498535156, + "rewards/margins": 10.490208943684895, + "rewards/rejected": -6.99014409383138, + "step": 4996 + }, + { + "epoch": 0.4565555047967108, + "grad_norm": 1.2734375, + "kl": 0.0, + "learning_rate": 5.697210185989167e-06, + "logits/chosen": 464800665.6, + "logits/rejected": 638420522.6666666, + "logps/chosen": -295.1061767578125, + "logps/rejected": -671.2127685546875, + "loss": 0.0088, + "rewards/chosen": 4.310756683349609, + "rewards/margins": 15.926630655924479, + "rewards/rejected": -11.61587397257487, + "step": 4997 + }, + { + "epoch": 0.4566468707172225, + "grad_norm": 1.8984375, + "kl": 0.0, + "learning_rate": 5.6957864043075195e-06, + "logits/chosen": 603160780.8, + "logits/rejected": 994023253.3333334, + "logps/chosen": -338.6032958984375, + "logps/rejected": -503.79296875, + "loss": 0.0139, + "rewards/chosen": 4.1348213195800785, + "rewards/margins": 13.835653686523438, + "rewards/rejected": -9.70083236694336, + "step": 4998 + }, + { + "epoch": 0.4567382366377341, + "grad_norm": 0.875, + "kl": 0.0, + "learning_rate": 5.694362565090783e-06, + "logits/chosen": 696848896.0, + "logits/rejected": 579321408.0, + "logps/chosen": -265.2198181152344, + "logps/rejected": -727.78564453125, + "loss": 0.0064, + "rewards/chosen": 4.590270042419434, + "rewards/margins": 15.552178382873535, + "rewards/rejected": -10.961908340454102, + "step": 4999 + }, + { + "epoch": 0.4568296025582458, + "grad_norm": 9.8125, + "kl": 0.0, + "learning_rate": 5.692938668456696e-06, + "logits/chosen": 427612928.0, + "logits/rejected": 324402688.0, + "logps/chosen": -345.7411295572917, + "logps/rejected": -428.233154296875, + "loss": 0.0151, + "rewards/chosen": 4.311573028564453, + "rewards/margins": 13.122507476806641, + "rewards/rejected": -8.810934448242188, + "step": 5000 + }, + { + "epoch": 0.4569209684787574, + "grad_norm": 6.96875, + "kl": 0.0, + "learning_rate": 5.691514714523e-06, + "logits/chosen": 958369792.0, + "logits/rejected": 577748160.0, + "logps/chosen": -253.8743896484375, + "logps/rejected": -401.90478515625, + "loss": 0.034, + "rewards/chosen": 3.450737635294596, + "rewards/margins": 10.432659784952799, + "rewards/rejected": -6.981922149658203, + "step": 5001 + }, + { + "epoch": 0.4570123343992691, + "grad_norm": 1.4453125, + "kl": 0.0, + "learning_rate": 5.690090703407445e-06, + "logits/chosen": 392550080.0, + "logits/rejected": 430729152.0, + "logps/chosen": -319.5418701171875, + "logps/rejected": -423.2432556152344, + "loss": 0.0085, + "rewards/chosen": 4.207536697387695, + "rewards/margins": 12.161168098449707, + "rewards/rejected": -7.953631401062012, + "step": 5002 + }, + { + "epoch": 0.4571037003197807, + "grad_norm": 1.9765625, + "kl": 0.0, + "learning_rate": 5.688666635227782e-06, + "logits/chosen": 641940224.0, + "logits/rejected": 519720064.0, + "logps/chosen": -462.8865051269531, + "logps/rejected": -391.3066813151042, + "loss": 0.0114, + "rewards/chosen": 3.177841901779175, + "rewards/margins": 11.935426791508993, + "rewards/rejected": -8.757584889729818, + "step": 5003 + }, + { + "epoch": 0.4571950662402924, + "grad_norm": 2.140625, + "kl": 0.0, + "learning_rate": 5.687242510101769e-06, + "logits/chosen": 424387200.0, + "logits/rejected": 415139456.0, + "logps/chosen": -221.03775024414062, + "logps/rejected": -565.571044921875, + "loss": 0.0155, + "rewards/chosen": 3.726651906967163, + "rewards/margins": 13.480741262435913, + "rewards/rejected": -9.75408935546875, + "step": 5004 + }, + { + "epoch": 0.457286432160804, + "grad_norm": 1.8828125, + "kl": 0.0, + "learning_rate": 5.685818328147168e-06, + "logits/chosen": 667004979.2, + "logits/rejected": 556190464.0, + "logps/chosen": -275.6887451171875, + "logps/rejected": -430.7703043619792, + "loss": 0.0136, + "rewards/chosen": 4.416342544555664, + "rewards/margins": 11.792016474405925, + "rewards/rejected": -7.375673929850261, + "step": 5005 + }, + { + "epoch": 0.4573777980813157, + "grad_norm": 3.28125, + "kl": 0.0, + "learning_rate": 5.684394089481746e-06, + "logits/chosen": 771233152.0, + "logits/rejected": 584086784.0, + "logps/chosen": -342.93951416015625, + "logps/rejected": -660.095703125, + "loss": 0.0214, + "rewards/chosen": 3.615046977996826, + "rewards/margins": 12.571338176727295, + "rewards/rejected": -8.956291198730469, + "step": 5006 + }, + { + "epoch": 0.4574691640018273, + "grad_norm": 13.5, + "kl": 0.0, + "learning_rate": 5.682969794223275e-06, + "logits/chosen": 845708202.6666666, + "logits/rejected": 699542118.4, + "logps/chosen": -258.3188883463542, + "logps/rejected": -495.68505859375, + "loss": 0.0616, + "rewards/chosen": 4.201900482177734, + "rewards/margins": 11.923755645751953, + "rewards/rejected": -7.721855163574219, + "step": 5007 + }, + { + "epoch": 0.457560529922339, + "grad_norm": 3.625, + "kl": 0.0, + "learning_rate": 5.6815454424895275e-06, + "logits/chosen": 429539328.0, + "logits/rejected": 485134720.0, + "logps/chosen": -343.34893798828125, + "logps/rejected": -480.5826721191406, + "loss": 0.0211, + "rewards/chosen": 3.686309337615967, + "rewards/margins": 12.805545330047607, + "rewards/rejected": -9.11923599243164, + "step": 5008 + }, + { + "epoch": 0.4576518958428506, + "grad_norm": 4.1875, + "kl": 0.0, + "learning_rate": 5.680121034398288e-06, + "logits/chosen": 543151513.6, + "logits/rejected": 422754901.3333333, + "logps/chosen": -352.23564453125, + "logps/rejected": -450.5569661458333, + "loss": 0.0391, + "rewards/chosen": 2.909083366394043, + "rewards/margins": 14.311040560404459, + "rewards/rejected": -11.401957194010416, + "step": 5009 + }, + { + "epoch": 0.4577432617633623, + "grad_norm": 1.109375, + "kl": 0.0, + "learning_rate": 5.678696570067341e-06, + "logits/chosen": 310129984.0, + "logits/rejected": 452902101.3333333, + "logps/chosen": -240.66024780273438, + "logps/rejected": -516.4962565104166, + "loss": 0.0052, + "rewards/chosen": 4.219037055969238, + "rewards/margins": 15.123737017313639, + "rewards/rejected": -10.9046999613444, + "step": 5010 + }, + { + "epoch": 0.4578346276838739, + "grad_norm": 1.1953125, + "kl": 0.0, + "learning_rate": 5.677272049614474e-06, + "logits/chosen": 678330538.6666666, + "logits/rejected": 410808256.0, + "logps/chosen": -360.3695475260417, + "logps/rejected": -505.55255126953125, + "loss": 0.0069, + "rewards/chosen": 4.97725772857666, + "rewards/margins": 16.70795726776123, + "rewards/rejected": -11.73069953918457, + "step": 5011 + }, + { + "epoch": 0.4579259936043856, + "grad_norm": 5.34375, + "kl": 0.0, + "learning_rate": 5.675847473157485e-06, + "logits/chosen": 533659340.8, + "logits/rejected": 704936533.3333334, + "logps/chosen": -364.54208984375, + "logps/rejected": -630.63232421875, + "loss": 0.0409, + "rewards/chosen": 3.0278175354003904, + "rewards/margins": 13.87957331339518, + "rewards/rejected": -10.851755777994791, + "step": 5012 + }, + { + "epoch": 0.4580173595248972, + "grad_norm": 1.8515625, + "kl": 0.0, + "learning_rate": 5.674422840814172e-06, + "logits/chosen": 480186197.3333333, + "logits/rejected": 441982016.0, + "logps/chosen": -316.98419189453125, + "logps/rejected": -362.61798095703125, + "loss": 0.0116, + "rewards/chosen": 4.8777726491292315, + "rewards/margins": 13.750312169392902, + "rewards/rejected": -8.872539520263672, + "step": 5013 + }, + { + "epoch": 0.45810872544540887, + "grad_norm": 5.8125, + "kl": 0.0, + "learning_rate": 5.672998152702338e-06, + "logits/chosen": 397269248.0, + "logits/rejected": 198584368.0, + "logps/chosen": -247.82002766927084, + "logps/rejected": -339.2835998535156, + "loss": 0.0593, + "rewards/chosen": 2.9357732137044272, + "rewards/margins": 15.315565427144369, + "rewards/rejected": -12.379792213439941, + "step": 5014 + }, + { + "epoch": 0.4582000913659205, + "grad_norm": 7.46875, + "kl": 0.0, + "learning_rate": 5.6715734089397925e-06, + "logits/chosen": 566685184.0, + "logits/rejected": 1055043157.3333334, + "logps/chosen": -330.3526123046875, + "logps/rejected": -495.2923990885417, + "loss": 0.1276, + "rewards/chosen": 3.3690673828125, + "rewards/margins": 13.162774276733398, + "rewards/rejected": -9.793706893920898, + "step": 5015 + }, + { + "epoch": 0.45829145728643217, + "grad_norm": 6.46875, + "kl": 0.0, + "learning_rate": 5.670148609644348e-06, + "logits/chosen": 407034880.0, + "logits/rejected": 498834176.0, + "logps/chosen": -263.01141357421875, + "logps/rejected": -571.4200439453125, + "loss": 0.0376, + "rewards/chosen": 3.2964847087860107, + "rewards/margins": 11.857184171676636, + "rewards/rejected": -8.560699462890625, + "step": 5016 + }, + { + "epoch": 0.4583828232069438, + "grad_norm": 2.734375, + "kl": 0.0, + "learning_rate": 5.6687237549338225e-06, + "logits/chosen": 515604736.0, + "logits/rejected": 316139434.6666667, + "logps/chosen": -237.8876220703125, + "logps/rejected": -283.5101725260417, + "loss": 0.018, + "rewards/chosen": 3.8952308654785157, + "rewards/margins": 12.659609731038412, + "rewards/rejected": -8.764378865559896, + "step": 5017 + }, + { + "epoch": 0.45847418912745547, + "grad_norm": 2.21875, + "kl": 0.0, + "learning_rate": 5.667298844926038e-06, + "logits/chosen": 743686912.0, + "logits/rejected": 596075328.0, + "logps/chosen": -272.4415283203125, + "logps/rejected": -545.9297485351562, + "loss": 0.0129, + "rewards/chosen": 4.12259578704834, + "rewards/margins": 15.59902572631836, + "rewards/rejected": -11.47642993927002, + "step": 5018 + }, + { + "epoch": 0.4585655550479671, + "grad_norm": 2.609375, + "kl": 0.0, + "learning_rate": 5.665873879738824e-06, + "logits/chosen": 581385258.6666666, + "logits/rejected": 589383321.6, + "logps/chosen": -352.8252766927083, + "logps/rejected": -600.94716796875, + "loss": 0.0148, + "rewards/chosen": 3.311647415161133, + "rewards/margins": 14.991476821899415, + "rewards/rejected": -11.679829406738282, + "step": 5019 + }, + { + "epoch": 0.45865692096847877, + "grad_norm": 1.8359375, + "kl": 0.0, + "learning_rate": 5.664448859490006e-06, + "logits/chosen": 328701728.0, + "logits/rejected": 448323669.3333333, + "logps/chosen": -171.35183715820312, + "logps/rejected": -537.8124593098959, + "loss": 0.0085, + "rewards/chosen": 3.821043014526367, + "rewards/margins": 12.806281407674154, + "rewards/rejected": -8.985238393147787, + "step": 5020 + }, + { + "epoch": 0.4587482868889904, + "grad_norm": 5.25, + "kl": 0.0, + "learning_rate": 5.663023784297426e-06, + "logits/chosen": 521542432.0, + "logits/rejected": 311325440.0, + "logps/chosen": -380.1177978515625, + "logps/rejected": -372.37335205078125, + "loss": 0.0239, + "rewards/chosen": 3.738044023513794, + "rewards/margins": 12.52064061164856, + "rewards/rejected": -8.782596588134766, + "step": 5021 + }, + { + "epoch": 0.45883965280950206, + "grad_norm": 5.4375, + "kl": 0.0, + "learning_rate": 5.6615986542789226e-06, + "logits/chosen": 382292224.0, + "logits/rejected": 285248768.0, + "logps/chosen": -268.5440266927083, + "logps/rejected": -404.0771789550781, + "loss": 0.0304, + "rewards/chosen": 3.9160283406575522, + "rewards/margins": 13.462249120076498, + "rewards/rejected": -9.546220779418945, + "step": 5022 + }, + { + "epoch": 0.4589310187300137, + "grad_norm": 10.0625, + "kl": 0.0, + "learning_rate": 5.660173469552339e-06, + "logits/chosen": 521527722.6666667, + "logits/rejected": 398514662.4, + "logps/chosen": -245.8248087565104, + "logps/rejected": -366.525244140625, + "loss": 0.016, + "rewards/chosen": 3.3578758239746094, + "rewards/margins": 13.357848358154296, + "rewards/rejected": -9.999972534179687, + "step": 5023 + }, + { + "epoch": 0.45902238465052536, + "grad_norm": 1.28125, + "kl": 0.0, + "learning_rate": 5.6587482302355255e-06, + "logits/chosen": 736825984.0, + "logits/rejected": 694688426.6666666, + "logps/chosen": -94.79998779296875, + "logps/rejected": -596.8132731119791, + "loss": 0.0066, + "rewards/chosen": 4.056617259979248, + "rewards/margins": 14.414522965749105, + "rewards/rejected": -10.357905705769857, + "step": 5024 + }, + { + "epoch": 0.459113750571037, + "grad_norm": 3.515625, + "kl": 0.0, + "learning_rate": 5.657322936446339e-06, + "logits/chosen": 492868266.6666667, + "logits/rejected": 727722752.0, + "logps/chosen": -283.4395345052083, + "logps/rejected": -572.3544921875, + "loss": 0.0237, + "rewards/chosen": 4.129851659138997, + "rewards/margins": 14.941404660542805, + "rewards/rejected": -10.811553001403809, + "step": 5025 + }, + { + "epoch": 0.45920511649154866, + "grad_norm": 4.53125, + "kl": 0.0, + "learning_rate": 5.6558975883026335e-06, + "logits/chosen": 1230503321.6, + "logits/rejected": 362687104.0, + "logps/chosen": -263.0542236328125, + "logps/rejected": -243.53678385416666, + "loss": 0.122, + "rewards/chosen": 3.7498817443847656, + "rewards/margins": 10.011770248413086, + "rewards/rejected": -6.26188850402832, + "step": 5026 + }, + { + "epoch": 0.4592964824120603, + "grad_norm": 4.0625, + "kl": 0.0, + "learning_rate": 5.654472185922277e-06, + "logits/chosen": 338676326.4, + "logits/rejected": 488244778.6666667, + "logps/chosen": -267.95078125, + "logps/rejected": -314.57338460286456, + "loss": 0.0254, + "rewards/chosen": 3.6651153564453125, + "rewards/margins": 12.883519490559896, + "rewards/rejected": -9.218404134114584, + "step": 5027 + }, + { + "epoch": 0.45938784833257196, + "grad_norm": 11.125, + "kl": 0.0, + "learning_rate": 5.653046729423133e-06, + "logits/chosen": 300822848.0, + "logits/rejected": 560906496.0, + "logps/chosen": -145.35337829589844, + "logps/rejected": -469.4570007324219, + "loss": 0.051, + "rewards/chosen": 3.6358978748321533, + "rewards/margins": 9.726416826248169, + "rewards/rejected": -6.090518951416016, + "step": 5028 + }, + { + "epoch": 0.4594792142530836, + "grad_norm": 2.203125, + "kl": 0.0, + "learning_rate": 5.651621218923074e-06, + "logits/chosen": 414954816.0, + "logits/rejected": 295969504.0, + "logps/chosen": -421.9808044433594, + "logps/rejected": -422.7002868652344, + "loss": 0.0097, + "rewards/chosen": 4.78209924697876, + "rewards/margins": 14.222218036651611, + "rewards/rejected": -9.440118789672852, + "step": 5029 + }, + { + "epoch": 0.45957058017359526, + "grad_norm": 2.6875, + "kl": 0.0, + "learning_rate": 5.6501956545399795e-06, + "logits/chosen": 515302698.6666667, + "logits/rejected": 632790937.6, + "logps/chosen": -445.1643880208333, + "logps/rejected": -555.7890625, + "loss": 0.0174, + "rewards/chosen": 3.24173641204834, + "rewards/margins": 10.878780174255372, + "rewards/rejected": -7.637043762207031, + "step": 5030 + }, + { + "epoch": 0.4596619460941069, + "grad_norm": 5.1875, + "kl": 0.0, + "learning_rate": 5.6487700363917275e-06, + "logits/chosen": 387756373.3333333, + "logits/rejected": 481708646.4, + "logps/chosen": -411.0904134114583, + "logps/rejected": -446.1529296875, + "loss": 0.0294, + "rewards/chosen": 2.6505513191223145, + "rewards/margins": 11.503801441192627, + "rewards/rejected": -8.853250122070312, + "step": 5031 + }, + { + "epoch": 0.45975331201461855, + "grad_norm": 3.796875, + "kl": 0.0, + "learning_rate": 5.647344364596204e-06, + "logits/chosen": 558384170.6666666, + "logits/rejected": 360800512.0, + "logps/chosen": -382.6684977213542, + "logps/rejected": -483.7562255859375, + "loss": 0.0254, + "rewards/chosen": 3.7581297556559243, + "rewards/margins": 13.807562510172525, + "rewards/rejected": -10.049432754516602, + "step": 5032 + }, + { + "epoch": 0.4598446779351302, + "grad_norm": 3.28125, + "kl": 0.0, + "learning_rate": 5.6459186392713e-06, + "logits/chosen": 507136938.6666667, + "logits/rejected": 709650483.2, + "logps/chosen": -239.30291748046875, + "logps/rejected": -455.15400390625, + "loss": 0.0174, + "rewards/chosen": 3.810645421346029, + "rewards/margins": 13.732209904988608, + "rewards/rejected": -9.921564483642578, + "step": 5033 + }, + { + "epoch": 0.45993604385564185, + "grad_norm": 5.0, + "kl": 0.0, + "learning_rate": 5.6444928605349105e-06, + "logits/chosen": 768740249.6, + "logits/rejected": 711788800.0, + "logps/chosen": -414.82275390625, + "logps/rejected": -362.7821451822917, + "loss": 0.0229, + "rewards/chosen": 3.8301631927490236, + "rewards/margins": 12.910604985555015, + "rewards/rejected": -9.08044179280599, + "step": 5034 + }, + { + "epoch": 0.4600274097761535, + "grad_norm": 3.265625, + "kl": 0.0, + "learning_rate": 5.643067028504931e-06, + "logits/chosen": 443140096.0, + "logits/rejected": 594142208.0, + "logps/chosen": -355.9571228027344, + "logps/rejected": -541.1942749023438, + "loss": 0.0222, + "rewards/chosen": 3.643247365951538, + "rewards/margins": 13.68927550315857, + "rewards/rejected": -10.046028137207031, + "step": 5035 + }, + { + "epoch": 0.46011877569666515, + "grad_norm": 3.9375, + "kl": 0.0, + "learning_rate": 5.641641143299268e-06, + "logits/chosen": 490692010.6666667, + "logits/rejected": 528968384.0, + "logps/chosen": -176.7811279296875, + "logps/rejected": -785.8796997070312, + "loss": 0.0286, + "rewards/chosen": 3.577002207438151, + "rewards/margins": 14.427965799967447, + "rewards/rejected": -10.850963592529297, + "step": 5036 + }, + { + "epoch": 0.46021014161717677, + "grad_norm": 3.28125, + "kl": 0.0, + "learning_rate": 5.640215205035826e-06, + "logits/chosen": 373449113.6, + "logits/rejected": 378930517.3333333, + "logps/chosen": -302.3161376953125, + "logps/rejected": -429.41357421875, + "loss": 0.0192, + "rewards/chosen": 3.741571044921875, + "rewards/margins": 11.937849299112955, + "rewards/rejected": -8.19627825419108, + "step": 5037 + }, + { + "epoch": 0.46030150753768845, + "grad_norm": 2.671875, + "kl": 0.0, + "learning_rate": 5.638789213832518e-06, + "logits/chosen": 519130432.0, + "logits/rejected": 645351936.0, + "logps/chosen": -193.761962890625, + "logps/rejected": -607.7025756835938, + "loss": 0.0188, + "rewards/chosen": 3.4641785621643066, + "rewards/margins": 13.257524013519287, + "rewards/rejected": -9.79334545135498, + "step": 5038 + }, + { + "epoch": 0.46039287345820007, + "grad_norm": 2.65625, + "kl": 0.0, + "learning_rate": 5.6373631698072615e-06, + "logits/chosen": 645017152.0, + "logits/rejected": 256648384.0, + "logps/chosen": -156.33267211914062, + "logps/rejected": -396.93719482421875, + "loss": 0.0186, + "rewards/chosen": 3.6674857139587402, + "rewards/margins": 11.583299160003662, + "rewards/rejected": -7.915813446044922, + "step": 5039 + }, + { + "epoch": 0.46048423937871175, + "grad_norm": 66.0, + "kl": 0.0, + "learning_rate": 5.635937073077976e-06, + "logits/chosen": 893222400.0, + "logits/rejected": 867251814.4, + "logps/chosen": -321.43841552734375, + "logps/rejected": -383.0729736328125, + "loss": 0.051, + "rewards/chosen": 2.932445844014486, + "rewards/margins": 12.079115613301596, + "rewards/rejected": -9.14666976928711, + "step": 5040 + }, + { + "epoch": 0.46057560529922337, + "grad_norm": 3.171875, + "kl": 0.0, + "learning_rate": 5.634510923762586e-06, + "logits/chosen": 546000281.6, + "logits/rejected": 188616085.33333334, + "logps/chosen": -424.5783203125, + "logps/rejected": -352.1068929036458, + "loss": 0.0169, + "rewards/chosen": 4.126482391357422, + "rewards/margins": 15.079597218831381, + "rewards/rejected": -10.953114827473959, + "step": 5041 + }, + { + "epoch": 0.46066697121973504, + "grad_norm": 2.1875, + "kl": 0.0, + "learning_rate": 5.633084721979022e-06, + "logits/chosen": 478480128.0, + "logits/rejected": 606958080.0, + "logps/chosen": -301.1283447265625, + "logps/rejected": -459.8732096354167, + "loss": 0.0145, + "rewards/chosen": 3.9279972076416017, + "rewards/margins": 13.151355361938476, + "rewards/rejected": -9.223358154296875, + "step": 5042 + }, + { + "epoch": 0.46075833714024667, + "grad_norm": 2.765625, + "kl": 0.0, + "learning_rate": 5.631658467845218e-06, + "logits/chosen": 416260181.3333333, + "logits/rejected": 506468352.0, + "logps/chosen": -315.6909586588542, + "logps/rejected": -665.2373046875, + "loss": 0.019, + "rewards/chosen": 4.11610730489095, + "rewards/margins": 11.876129468282063, + "rewards/rejected": -7.760022163391113, + "step": 5043 + }, + { + "epoch": 0.46084970306075834, + "grad_norm": 3.578125, + "kl": 0.0, + "learning_rate": 5.630232161479109e-06, + "logits/chosen": 714416332.8, + "logits/rejected": 500603477.3333333, + "logps/chosen": -501.5283203125, + "logps/rejected": -618.1260172526041, + "loss": 0.0251, + "rewards/chosen": 3.4076358795166017, + "rewards/margins": 12.953557840983073, + "rewards/rejected": -9.54592196146647, + "step": 5044 + }, + { + "epoch": 0.46094106898126996, + "grad_norm": 1.453125, + "kl": 0.0, + "learning_rate": 5.6288058029986416e-06, + "logits/chosen": 1009432320.0, + "logits/rejected": 517440877.71428573, + "logps/chosen": -147.06101989746094, + "logps/rejected": -387.76967075892856, + "loss": 0.0075, + "rewards/chosen": 3.1159470081329346, + "rewards/margins": 12.39004499571664, + "rewards/rejected": -9.274097987583705, + "step": 5045 + }, + { + "epoch": 0.46103243490178164, + "grad_norm": 3.015625, + "kl": 0.0, + "learning_rate": 5.627379392521758e-06, + "logits/chosen": 618378240.0, + "logits/rejected": 616488038.4, + "logps/chosen": -387.9722900390625, + "logps/rejected": -554.4359375, + "loss": 0.0175, + "rewards/chosen": 3.1917807261149087, + "rewards/margins": 12.208061854044596, + "rewards/rejected": -9.016281127929688, + "step": 5046 + }, + { + "epoch": 0.46112380082229326, + "grad_norm": 4.71875, + "kl": 0.0, + "learning_rate": 5.625952930166414e-06, + "logits/chosen": 384089002.6666667, + "logits/rejected": 618084480.0, + "logps/chosen": -314.0574951171875, + "logps/rejected": -422.4921875, + "loss": 0.04, + "rewards/chosen": 3.0790958404541016, + "rewards/margins": 11.816293716430664, + "rewards/rejected": -8.737197875976562, + "step": 5047 + }, + { + "epoch": 0.46121516674280494, + "grad_norm": 5.09375, + "kl": 0.0, + "learning_rate": 5.62452641605056e-06, + "logits/chosen": 518178272.0, + "logits/rejected": 412988480.0, + "logps/chosen": -295.4505920410156, + "logps/rejected": -369.7721862792969, + "loss": 0.0282, + "rewards/chosen": 3.5107481479644775, + "rewards/margins": 12.050909280776978, + "rewards/rejected": -8.5401611328125, + "step": 5048 + }, + { + "epoch": 0.46130653266331656, + "grad_norm": 2.65625, + "kl": 0.0, + "learning_rate": 5.623099850292161e-06, + "logits/chosen": 529697344.0, + "logits/rejected": 402703488.0, + "logps/chosen": -355.322021484375, + "logps/rejected": -444.57073974609375, + "loss": 0.0138, + "rewards/chosen": 4.182153224945068, + "rewards/margins": 14.296633243560791, + "rewards/rejected": -10.114480018615723, + "step": 5049 + }, + { + "epoch": 0.46139789858382824, + "grad_norm": 20.25, + "kl": 0.0, + "learning_rate": 5.621673233009174e-06, + "logits/chosen": 646144614.4, + "logits/rejected": 734378496.0, + "logps/chosen": -399.6764892578125, + "logps/rejected": -443.4298095703125, + "loss": 0.0465, + "rewards/chosen": 3.076754570007324, + "rewards/margins": 10.274296124776203, + "rewards/rejected": -7.19754155476888, + "step": 5050 + }, + { + "epoch": 0.46148926450433986, + "grad_norm": 10.625, + "kl": 0.0, + "learning_rate": 5.620246564319573e-06, + "logits/chosen": 749993472.0, + "logits/rejected": 1300115456.0, + "logps/chosen": -378.14501953125, + "logps/rejected": -349.52825927734375, + "loss": 0.0233, + "rewards/chosen": 3.942676544189453, + "rewards/margins": 10.248135089874268, + "rewards/rejected": -6.3054585456848145, + "step": 5051 + }, + { + "epoch": 0.46158063042485153, + "grad_norm": 0.74609375, + "kl": 0.0, + "learning_rate": 5.618819844341325e-06, + "logits/chosen": 208163120.0, + "logits/rejected": 424453632.0, + "logps/chosen": -289.12286376953125, + "logps/rejected": -440.74448939732144, + "loss": 0.0028, + "rewards/chosen": 4.460452556610107, + "rewards/margins": 13.098117351531982, + "rewards/rejected": -8.637664794921875, + "step": 5052 + }, + { + "epoch": 0.46167199634536316, + "grad_norm": 3.109375, + "kl": 0.0, + "learning_rate": 5.617393073192412e-06, + "logits/chosen": 502097305.6, + "logits/rejected": 395806592.0, + "logps/chosen": -384.659619140625, + "logps/rejected": -512.1247151692709, + "loss": 0.0188, + "rewards/chosen": 3.968292999267578, + "rewards/margins": 12.732188669840493, + "rewards/rejected": -8.763895670572916, + "step": 5053 + }, + { + "epoch": 0.46176336226587483, + "grad_norm": 3.515625, + "kl": 0.0, + "learning_rate": 5.615966250990811e-06, + "logits/chosen": 734590976.0, + "logits/rejected": 376365824.0, + "logps/chosen": -579.47578125, + "logps/rejected": -474.6894124348958, + "loss": 0.0193, + "rewards/chosen": 3.676968002319336, + "rewards/margins": 14.875263595581055, + "rewards/rejected": -11.198295593261719, + "step": 5054 + }, + { + "epoch": 0.46185472818638645, + "grad_norm": 32.0, + "kl": 0.0, + "learning_rate": 5.614539377854509e-06, + "logits/chosen": 345770922.6666667, + "logits/rejected": 450074880.0, + "logps/chosen": -253.34869384765625, + "logps/rejected": -558.8157958984375, + "loss": 0.1401, + "rewards/chosen": 1.883877436319987, + "rewards/margins": 10.758938471476236, + "rewards/rejected": -8.87506103515625, + "step": 5055 + }, + { + "epoch": 0.46194609410689813, + "grad_norm": 2.671875, + "kl": 0.0, + "learning_rate": 5.613112453901493e-06, + "logits/chosen": 565775360.0, + "logits/rejected": 666093824.0, + "logps/chosen": -309.167041015625, + "logps/rejected": -546.894287109375, + "loss": 0.0174, + "rewards/chosen": 4.022698974609375, + "rewards/margins": 12.499685541788736, + "rewards/rejected": -8.476986567179361, + "step": 5056 + }, + { + "epoch": 0.46203746002740975, + "grad_norm": 4.8125, + "kl": 0.0, + "learning_rate": 5.611685479249759e-06, + "logits/chosen": 581175756.8, + "logits/rejected": 393282389.3333333, + "logps/chosen": -286.996484375, + "logps/rejected": -406.7569173177083, + "loss": 0.0293, + "rewards/chosen": 3.895854187011719, + "rewards/margins": 11.579213968912761, + "rewards/rejected": -7.683359781901042, + "step": 5057 + }, + { + "epoch": 0.46212882594792143, + "grad_norm": 3.09375, + "kl": 0.0, + "learning_rate": 5.610258454017301e-06, + "logits/chosen": 435427712.0, + "logits/rejected": 431911680.0, + "logps/chosen": -367.540283203125, + "logps/rejected": -556.4826049804688, + "loss": 0.0252, + "rewards/chosen": 3.624596277872721, + "rewards/margins": 14.546285311381022, + "rewards/rejected": -10.9216890335083, + "step": 5058 + }, + { + "epoch": 0.46222019186843305, + "grad_norm": 4.96875, + "kl": 0.0, + "learning_rate": 5.608831378322126e-06, + "logits/rejected": 594063424.0, + "logps/rejected": -406.3751220703125, + "loss": 0.0808, + "rewards/rejected": -8.17310619354248, + "step": 5059 + }, + { + "epoch": 0.4623115577889447, + "grad_norm": 2.78125, + "kl": 0.0, + "learning_rate": 5.607404252282236e-06, + "logits/chosen": 848507392.0, + "logits/rejected": 541843609.6, + "logps/chosen": -451.4456380208333, + "logps/rejected": -510.89814453125, + "loss": 0.0108, + "rewards/chosen": 4.275009791056315, + "rewards/margins": 13.846241633097332, + "rewards/rejected": -9.571231842041016, + "step": 5060 + }, + { + "epoch": 0.46240292370945635, + "grad_norm": 1.6640625, + "kl": 0.0, + "learning_rate": 5.605977076015641e-06, + "logits/chosen": 707833173.3333334, + "logits/rejected": 363868876.8, + "logps/chosen": -452.1402587890625, + "logps/rejected": -496.93232421875, + "loss": 0.014, + "rewards/chosen": 3.486447016398112, + "rewards/margins": 11.504712549845378, + "rewards/rejected": -8.018265533447266, + "step": 5061 + }, + { + "epoch": 0.462494289629968, + "grad_norm": 1.640625, + "kl": 0.0, + "learning_rate": 5.604549849640357e-06, + "logits/chosen": 972520320.0, + "logits/rejected": 718909952.0, + "logps/chosen": -381.995361328125, + "logps/rejected": -362.44195556640625, + "loss": 0.0096, + "rewards/chosen": 4.316830158233643, + "rewards/margins": 12.505003452301025, + "rewards/rejected": -8.188173294067383, + "step": 5062 + }, + { + "epoch": 0.46258565555047965, + "grad_norm": 9.9375, + "kl": 0.0, + "learning_rate": 5.6031225732744025e-06, + "logits/chosen": 420451552.0, + "logits/rejected": 407442752.0, + "logps/chosen": -358.159912109375, + "logps/rejected": -472.2441101074219, + "loss": 0.019, + "rewards/chosen": 3.7363624572753906, + "rewards/margins": 12.505783081054688, + "rewards/rejected": -8.769420623779297, + "step": 5063 + }, + { + "epoch": 0.4626770214709913, + "grad_norm": 2.453125, + "kl": 0.0, + "learning_rate": 5.6016952470357985e-06, + "logits/chosen": 492443238.4, + "logits/rejected": 479316992.0, + "logps/chosen": -335.79326171875, + "logps/rejected": -479.3202718098958, + "loss": 0.0154, + "rewards/chosen": 3.908704376220703, + "rewards/margins": 12.168768819173177, + "rewards/rejected": -8.260064442952475, + "step": 5064 + }, + { + "epoch": 0.46276838739150294, + "grad_norm": 3.953125, + "kl": 0.0, + "learning_rate": 5.6002678710425715e-06, + "logits/chosen": 261176624.0, + "logits/rejected": 527909952.0, + "logps/chosen": -368.58306884765625, + "logps/rejected": -614.1788940429688, + "loss": 0.0192, + "rewards/chosen": 3.709628105163574, + "rewards/margins": 16.955758094787598, + "rewards/rejected": -13.246129989624023, + "step": 5065 + }, + { + "epoch": 0.4628597533120146, + "grad_norm": 1.265625, + "kl": 0.0, + "learning_rate": 5.598840445412755e-06, + "logits/chosen": 991309653.3333334, + "logits/rejected": 429222988.8, + "logps/chosen": -517.2134602864584, + "logps/rejected": -424.53779296875, + "loss": 0.0058, + "rewards/chosen": 4.3205006917317705, + "rewards/margins": 14.040619405110675, + "rewards/rejected": -9.720118713378906, + "step": 5066 + }, + { + "epoch": 0.46295111923252624, + "grad_norm": 2.34375, + "kl": 0.0, + "learning_rate": 5.5974129702643795e-06, + "logits/chosen": 478473984.0, + "logits/rejected": 402053990.4, + "logps/chosen": -166.20709228515625, + "logps/rejected": -493.78359375, + "loss": 0.0209, + "rewards/chosen": 3.051210403442383, + "rewards/margins": 12.889182662963867, + "rewards/rejected": -9.837972259521484, + "step": 5067 + }, + { + "epoch": 0.4630424851530379, + "grad_norm": 3.0625, + "kl": 0.0, + "learning_rate": 5.595985445715489e-06, + "logits/chosen": 413439411.2, + "logits/rejected": 268197930.66666666, + "logps/chosen": -377.3976806640625, + "logps/rejected": -332.4922688802083, + "loss": 0.02, + "rewards/chosen": 3.8374774932861326, + "rewards/margins": 12.817136255900063, + "rewards/rejected": -8.979658762613932, + "step": 5068 + }, + { + "epoch": 0.46313385107354954, + "grad_norm": 1.2734375, + "kl": 0.0, + "learning_rate": 5.594557871884122e-06, + "logits/chosen": 714355328.0, + "logits/rejected": 331738176.0, + "logps/chosen": -638.9061279296875, + "logps/rejected": -429.2996419270833, + "loss": 0.0058, + "rewards/chosen": 3.806732177734375, + "rewards/margins": 13.083883921305338, + "rewards/rejected": -9.277151743570963, + "step": 5069 + }, + { + "epoch": 0.4632252169940612, + "grad_norm": 3.25, + "kl": 0.0, + "learning_rate": 5.5931302488883295e-06, + "logits/chosen": 562783104.0, + "logits/rejected": 476993638.4, + "logps/chosen": -563.7096761067709, + "logps/rejected": -598.2908203125, + "loss": 0.0164, + "rewards/chosen": 3.162085215250651, + "rewards/margins": 13.894941965738932, + "rewards/rejected": -10.732856750488281, + "step": 5070 + }, + { + "epoch": 0.46331658291457284, + "grad_norm": 2.03125, + "kl": 0.0, + "learning_rate": 5.59170257684616e-06, + "logits/chosen": 622958336.0, + "logits/rejected": 553085988.5714285, + "logps/chosen": -225.8356475830078, + "logps/rejected": -387.67313058035717, + "loss": 0.0103, + "rewards/chosen": 2.5314254760742188, + "rewards/margins": 10.407847268240793, + "rewards/rejected": -7.876421792166574, + "step": 5071 + }, + { + "epoch": 0.4634079488350845, + "grad_norm": 1.4375, + "kl": 0.0, + "learning_rate": 5.590274855875671e-06, + "logits/chosen": 533618176.0, + "logits/rejected": 708707328.0, + "logps/chosen": -514.11376953125, + "logps/rejected": -552.07607421875, + "loss": 0.0068, + "rewards/chosen": 4.148386637369792, + "rewards/margins": 14.649072519938152, + "rewards/rejected": -10.500685882568359, + "step": 5072 + }, + { + "epoch": 0.46349931475559614, + "grad_norm": 12.5, + "kl": 0.0, + "learning_rate": 5.588847086094917e-06, + "logits/rejected": 507216832.0, + "logps/rejected": -490.1281433105469, + "loss": 0.0092, + "rewards/rejected": -8.330785751342773, + "step": 5073 + }, + { + "epoch": 0.4635906806761078, + "grad_norm": 2.5, + "kl": 0.0, + "learning_rate": 5.587419267621968e-06, + "logits/chosen": 512530528.0, + "logits/rejected": 317832384.0, + "logps/chosen": -214.40472412109375, + "logps/rejected": -458.3150227864583, + "loss": 0.0121, + "rewards/chosen": 3.0376405715942383, + "rewards/margins": 13.28182315826416, + "rewards/rejected": -10.244182586669922, + "step": 5074 + }, + { + "epoch": 0.46368204659661943, + "grad_norm": 2.53125, + "kl": 0.0, + "learning_rate": 5.585991400574887e-06, + "logits/chosen": 504168352.0, + "logits/rejected": 355383296.0, + "logps/chosen": -370.67724609375, + "logps/rejected": -426.6878967285156, + "loss": 0.0147, + "rewards/chosen": 3.7710301876068115, + "rewards/margins": 12.778341054916382, + "rewards/rejected": -9.00731086730957, + "step": 5075 + }, + { + "epoch": 0.4637734125171311, + "grad_norm": 1.78125, + "kl": 0.0, + "learning_rate": 5.584563485071748e-06, + "logits/chosen": 454879027.2, + "logits/rejected": 479089877.3333333, + "logps/chosen": -235.63525390625, + "logps/rejected": -395.3131103515625, + "loss": 0.011, + "rewards/chosen": 4.174658966064453, + "rewards/margins": 14.375564448038737, + "rewards/rejected": -10.200905481974283, + "step": 5076 + }, + { + "epoch": 0.46386477843764273, + "grad_norm": 2.578125, + "kl": 0.0, + "learning_rate": 5.583135521230623e-06, + "logits/chosen": 643257941.3333334, + "logits/rejected": 461813862.4, + "logps/chosen": -145.29436238606772, + "logps/rejected": -378.6001953125, + "loss": 0.1207, + "rewards/chosen": 4.304358164469401, + "rewards/margins": 10.891497294108074, + "rewards/rejected": -6.587139129638672, + "step": 5077 + }, + { + "epoch": 0.4639561443581544, + "grad_norm": 1.046875, + "kl": 0.0, + "learning_rate": 5.581707509169594e-06, + "logits/chosen": 297996949.3333333, + "logits/rejected": 564044902.4, + "logps/chosen": -139.0689697265625, + "logps/rejected": -468.001904296875, + "loss": 0.01, + "rewards/chosen": 3.8012475967407227, + "rewards/margins": 11.896759986877441, + "rewards/rejected": -8.095512390136719, + "step": 5078 + }, + { + "epoch": 0.46404751027866603, + "grad_norm": 1.953125, + "kl": 0.0, + "learning_rate": 5.5802794490067424e-06, + "logits/chosen": 703155097.6, + "logits/rejected": 575660672.0, + "logps/chosen": -409.574853515625, + "logps/rejected": -504.6326497395833, + "loss": 0.0109, + "rewards/chosen": 4.550188446044922, + "rewards/margins": 14.142071787516276, + "rewards/rejected": -9.591883341471354, + "step": 5079 + }, + { + "epoch": 0.4641388761991777, + "grad_norm": 5.15625, + "kl": 0.0, + "learning_rate": 5.578851340860159e-06, + "logits/chosen": 528959061.3333333, + "logits/rejected": 376891136.0, + "logps/chosen": -437.5388997395833, + "logps/rejected": -325.89215087890625, + "loss": 0.0257, + "rewards/chosen": 4.282419522603353, + "rewards/margins": 10.22181208928426, + "rewards/rejected": -5.939392566680908, + "step": 5080 + }, + { + "epoch": 0.46423024211968933, + "grad_norm": 4.125, + "kl": 0.0, + "learning_rate": 5.577423184847932e-06, + "logits/chosen": 895310677.3333334, + "logits/rejected": 906861184.0, + "logps/chosen": -212.6836954752604, + "logps/rejected": -307.0877685546875, + "loss": 0.0362, + "rewards/chosen": 3.067275365193685, + "rewards/margins": 12.035497029622396, + "rewards/rejected": -8.968221664428711, + "step": 5081 + }, + { + "epoch": 0.464321608040201, + "grad_norm": 4.9375, + "kl": 0.0, + "learning_rate": 5.575994981088157e-06, + "logits/chosen": 1194122854.4, + "logits/rejected": 599056128.0, + "logps/chosen": -394.1864990234375, + "logps/rejected": -484.9117838541667, + "loss": 0.035, + "rewards/chosen": 3.1520275115966796, + "rewards/margins": 13.125986480712891, + "rewards/rejected": -9.973958969116211, + "step": 5082 + }, + { + "epoch": 0.4644129739607126, + "grad_norm": 2.296875, + "kl": 0.0, + "learning_rate": 5.574566729698933e-06, + "logits/chosen": 630637056.0, + "logits/rejected": 695856298.6666666, + "logps/chosen": -287.42880859375, + "logps/rejected": -484.652099609375, + "loss": 0.0156, + "rewards/chosen": 4.700996017456054, + "rewards/margins": 12.658147430419922, + "rewards/rejected": -7.957151412963867, + "step": 5083 + }, + { + "epoch": 0.4645043398812243, + "grad_norm": 3.09375, + "kl": 0.0, + "learning_rate": 5.573138430798365e-06, + "logits/chosen": 351854336.0, + "logits/rejected": 385868467.2, + "logps/chosen": -285.9096272786458, + "logps/rejected": -450.38076171875, + "loss": 0.0114, + "rewards/chosen": 4.5288950602213545, + "rewards/margins": 14.875090281168621, + "rewards/rejected": -10.346195220947266, + "step": 5084 + }, + { + "epoch": 0.4645957058017359, + "grad_norm": 3.21875, + "kl": 0.0, + "learning_rate": 5.571710084504559e-06, + "logits/chosen": 595307648.0, + "logits/rejected": 1105770496.0, + "logps/chosen": -268.9348449707031, + "logps/rejected": -860.8720703125, + "loss": 0.027, + "rewards/chosen": 2.9831204414367676, + "rewards/margins": 14.042194843292236, + "rewards/rejected": -11.059074401855469, + "step": 5085 + }, + { + "epoch": 0.4646870717222476, + "grad_norm": 7.53125, + "kl": 0.0, + "learning_rate": 5.570281690935626e-06, + "logits/chosen": 416977312.0, + "logits/rejected": 464138496.0, + "logps/chosen": -268.3107604980469, + "logps/rejected": -440.3563537597656, + "loss": 0.0427, + "rewards/chosen": 3.1226561069488525, + "rewards/margins": 11.99056363105774, + "rewards/rejected": -8.867907524108887, + "step": 5086 + }, + { + "epoch": 0.4647784376427592, + "grad_norm": 10.3125, + "kl": 0.0, + "learning_rate": 5.568853250209681e-06, + "logits/chosen": 528570709.3333333, + "logits/rejected": 418837862.4, + "logps/chosen": -339.9635416666667, + "logps/rejected": -425.5033203125, + "loss": 0.0228, + "rewards/chosen": 2.913095474243164, + "rewards/margins": 11.940794754028321, + "rewards/rejected": -9.027699279785157, + "step": 5087 + }, + { + "epoch": 0.4648698035632709, + "grad_norm": 2.65625, + "kl": 0.0, + "learning_rate": 5.5674247624448415e-06, + "logits/chosen": 566455168.0, + "logits/rejected": 1053180288.0, + "logps/chosen": -322.7344970703125, + "logps/rejected": -381.389892578125, + "loss": 0.012, + "rewards/chosen": 4.202683448791504, + "rewards/margins": 12.361348152160645, + "rewards/rejected": -8.15866470336914, + "step": 5088 + }, + { + "epoch": 0.4649611694837825, + "grad_norm": 1.90625, + "kl": 0.0, + "learning_rate": 5.565996227759235e-06, + "logits/chosen": 816370636.8, + "logits/rejected": 1998634837.3333333, + "logps/chosen": -288.8866455078125, + "logps/rejected": -576.8951009114584, + "loss": 0.015, + "rewards/chosen": 3.8077766418457033, + "rewards/margins": 13.998127492268882, + "rewards/rejected": -10.190350850423178, + "step": 5089 + }, + { + "epoch": 0.4650525354042942, + "grad_norm": 19.875, + "kl": 0.0, + "learning_rate": 5.564567646270982e-06, + "logits/chosen": 323357653.3333333, + "logits/rejected": 997772390.4, + "logps/chosen": -211.36092122395834, + "logps/rejected": -592.22744140625, + "loss": 0.035, + "rewards/chosen": 4.579758326212565, + "rewards/margins": 13.982842127482098, + "rewards/rejected": -9.403083801269531, + "step": 5090 + }, + { + "epoch": 0.4651439013248059, + "grad_norm": 2.421875, + "kl": 0.0, + "learning_rate": 5.563139018098216e-06, + "logits/chosen": 558505574.4, + "logits/rejected": 594449152.0, + "logps/chosen": -296.862890625, + "logps/rejected": -396.1200358072917, + "loss": 0.0183, + "rewards/chosen": 4.098238372802735, + "rewards/margins": 14.135877990722657, + "rewards/rejected": -10.037639617919922, + "step": 5091 + }, + { + "epoch": 0.4652352672453175, + "grad_norm": 1.890625, + "kl": 0.0, + "learning_rate": 5.5617103433590695e-06, + "logits/chosen": 612883456.0, + "logits/rejected": 734385600.0, + "logps/chosen": -358.0764567057292, + "logps/rejected": -409.7615966796875, + "loss": 0.0129, + "rewards/chosen": 4.5434830983479815, + "rewards/margins": 14.795193990071613, + "rewards/rejected": -10.251710891723633, + "step": 5092 + }, + { + "epoch": 0.4653266331658292, + "grad_norm": 3.046875, + "kl": 0.0, + "learning_rate": 5.560281622171684e-06, + "logits/chosen": 465550677.3333333, + "logits/rejected": 387633600.0, + "logps/chosen": -339.18532307942706, + "logps/rejected": -660.5784912109375, + "loss": 0.0208, + "rewards/chosen": 3.770270347595215, + "rewards/margins": 16.425739288330078, + "rewards/rejected": -12.655468940734863, + "step": 5093 + }, + { + "epoch": 0.4654179990863408, + "grad_norm": 3.25, + "kl": 0.0, + "learning_rate": 5.558852854654197e-06, + "logits/chosen": 422029376.0, + "logits/rejected": 523084992.0, + "logps/chosen": -282.5392761230469, + "logps/rejected": -565.224609375, + "loss": 0.0219, + "rewards/chosen": 3.3977813720703125, + "rewards/margins": 13.608755111694336, + "rewards/rejected": -10.210973739624023, + "step": 5094 + }, + { + "epoch": 0.46550936500685247, + "grad_norm": 1.7890625, + "kl": 0.0, + "learning_rate": 5.5574240409247595e-06, + "logits/chosen": 655533107.2, + "logits/rejected": 570915840.0, + "logps/chosen": -423.70087890625, + "logps/rejected": -464.3865966796875, + "loss": 0.0125, + "rewards/chosen": 4.477366256713867, + "rewards/margins": 13.725474166870118, + "rewards/rejected": -9.24810791015625, + "step": 5095 + }, + { + "epoch": 0.4656007309273641, + "grad_norm": 2.4375, + "kl": 0.0, + "learning_rate": 5.555995181101517e-06, + "logits/chosen": 446984396.8, + "logits/rejected": 788763733.3333334, + "logps/chosen": -196.23433837890624, + "logps/rejected": -540.9817708333334, + "loss": 0.0254, + "rewards/chosen": 3.5629295349121093, + "rewards/margins": 14.425933583577475, + "rewards/rejected": -10.863004048665365, + "step": 5096 + }, + { + "epoch": 0.46569209684787577, + "grad_norm": 4.34375, + "kl": 0.0, + "learning_rate": 5.5545662753026244e-06, + "logits/chosen": 269180992.0, + "logits/rejected": 308784128.0, + "logps/chosen": -475.63604736328125, + "logps/rejected": -397.1536051432292, + "loss": 0.0141, + "rewards/chosen": 2.8322739601135254, + "rewards/margins": 11.7500106493632, + "rewards/rejected": -8.917736689249674, + "step": 5097 + }, + { + "epoch": 0.4657834627683874, + "grad_norm": 0.8671875, + "kl": 0.0, + "learning_rate": 5.55313732364624e-06, + "logits/chosen": 709195605.3333334, + "logits/rejected": 831294771.2, + "logps/chosen": -241.11954752604166, + "logps/rejected": -620.99833984375, + "loss": 0.0049, + "rewards/chosen": 4.756425221761067, + "rewards/margins": 13.526029713948567, + "rewards/rejected": -8.7696044921875, + "step": 5098 + }, + { + "epoch": 0.46587482868889907, + "grad_norm": 0.65625, + "kl": 0.0, + "learning_rate": 5.551708326250522e-06, + "logits/chosen": 754126208.0, + "logits/rejected": 441868800.0, + "logps/chosen": -198.56683349609375, + "logps/rejected": -440.6246337890625, + "loss": 0.0055, + "rewards/chosen": 3.9926514625549316, + "rewards/margins": 12.39907439549764, + "rewards/rejected": -8.406422932942709, + "step": 5099 + }, + { + "epoch": 0.4659661946094107, + "grad_norm": 1.5859375, + "kl": 0.0, + "learning_rate": 5.550279283233638e-06, + "logits/chosen": 506840704.0, + "logits/rejected": 358715562.6666667, + "logps/chosen": -359.5555419921875, + "logps/rejected": -352.9982096354167, + "loss": 0.0079, + "rewards/chosen": 3.5401124954223633, + "rewards/margins": 12.145832379659018, + "rewards/rejected": -8.605719884236654, + "step": 5100 + }, + { + "epoch": 0.46605756052992237, + "grad_norm": 1.2890625, + "kl": 0.0, + "learning_rate": 5.548850194713756e-06, + "logits/chosen": 456473664.0, + "logits/rejected": 413837536.0, + "logps/chosen": -321.94671630859375, + "logps/rejected": -428.8758544921875, + "loss": 0.0074, + "rewards/chosen": 4.669597625732422, + "rewards/margins": 13.974479675292969, + "rewards/rejected": -9.304882049560547, + "step": 5101 + }, + { + "epoch": 0.466148926450434, + "grad_norm": 3.0625, + "kl": 0.0, + "learning_rate": 5.54742106080905e-06, + "logits/chosen": 524449194.6666667, + "logits/rejected": 575341465.6, + "logps/chosen": -265.06260172526044, + "logps/rejected": -453.388037109375, + "loss": 0.0148, + "rewards/chosen": 4.197283426920573, + "rewards/margins": 12.971022288004558, + "rewards/rejected": -8.773738861083984, + "step": 5102 + }, + { + "epoch": 0.46624029237094566, + "grad_norm": 4.09375, + "kl": 0.0, + "learning_rate": 5.545991881637691e-06, + "logits/chosen": 516101333.3333333, + "logits/rejected": 340943360.0, + "logps/chosen": -271.0216064453125, + "logps/rejected": -241.7889404296875, + "loss": 0.029, + "rewards/chosen": 3.541014035542806, + "rewards/margins": 9.82521136601766, + "rewards/rejected": -6.2841973304748535, + "step": 5103 + }, + { + "epoch": 0.4663316582914573, + "grad_norm": 3.09375, + "kl": 0.0, + "learning_rate": 5.544562657317863e-06, + "logits/chosen": 444833024.0, + "logits/rejected": 758417578.6666666, + "logps/chosen": -240.84521484375, + "logps/rejected": -401.09521484375, + "loss": 0.0234, + "rewards/chosen": 3.6962406158447267, + "rewards/margins": 14.110244623819987, + "rewards/rejected": -10.41400400797526, + "step": 5104 + }, + { + "epoch": 0.46642302421196896, + "grad_norm": 3.921875, + "kl": 0.0, + "learning_rate": 5.54313338796775e-06, + "logits/chosen": 658670421.3333334, + "logits/rejected": 822985830.4, + "logps/chosen": -342.1618245442708, + "logps/rejected": -347.90947265625, + "loss": 0.1182, + "rewards/chosen": 4.548119227091472, + "rewards/margins": 11.245375696818034, + "rewards/rejected": -6.697256469726563, + "step": 5105 + }, + { + "epoch": 0.4665143901324806, + "grad_norm": 4.8125, + "kl": 0.0, + "learning_rate": 5.5417040737055374e-06, + "logits/chosen": 645565030.4, + "logits/rejected": 773921962.6666666, + "logps/chosen": -451.711572265625, + "logps/rejected": -520.4518636067709, + "loss": 0.0258, + "rewards/chosen": 3.772264862060547, + "rewards/margins": 13.05187136332194, + "rewards/rejected": -9.279606501261393, + "step": 5106 + }, + { + "epoch": 0.46660575605299226, + "grad_norm": 2.828125, + "kl": 0.0, + "learning_rate": 5.540274714649415e-06, + "logits/chosen": 390268074.6666667, + "logits/rejected": 304154470.4, + "logps/chosen": -205.30802408854166, + "logps/rejected": -336.112841796875, + "loss": 0.0181, + "rewards/chosen": 3.672316551208496, + "rewards/margins": 12.739567375183105, + "rewards/rejected": -9.067250823974609, + "step": 5107 + }, + { + "epoch": 0.4666971219735039, + "grad_norm": 1.4140625, + "kl": 0.0, + "learning_rate": 5.538845310917582e-06, + "logits/chosen": 349949568.0, + "logits/rejected": 609379264.0, + "logps/chosen": -231.0889892578125, + "logps/rejected": -603.166748046875, + "loss": 0.0066, + "rewards/chosen": 4.729969501495361, + "rewards/margins": 14.544841289520264, + "rewards/rejected": -9.814871788024902, + "step": 5108 + }, + { + "epoch": 0.46678848789401556, + "grad_norm": 2.640625, + "kl": 0.0, + "learning_rate": 5.537415862628232e-06, + "logits/chosen": 552544298.6666666, + "logits/rejected": 541676800.0, + "logps/chosen": -505.4212646484375, + "logps/rejected": -500.622802734375, + "loss": 0.0195, + "rewards/chosen": 3.9885692596435547, + "rewards/margins": 14.288232803344727, + "rewards/rejected": -10.299663543701172, + "step": 5109 + }, + { + "epoch": 0.4668798538145272, + "grad_norm": 2.875, + "kl": 0.0, + "learning_rate": 5.535986369899572e-06, + "logits/chosen": 258870720.0, + "logits/rejected": 675003562.6666666, + "logps/chosen": -280.2539978027344, + "logps/rejected": -607.7827962239584, + "loss": 0.1206, + "rewards/chosen": 3.0166923999786377, + "rewards/margins": 10.958234707514446, + "rewards/rejected": -7.941542307535808, + "step": 5110 + }, + { + "epoch": 0.46697121973503886, + "grad_norm": 3.984375, + "kl": 0.0, + "learning_rate": 5.534556832849803e-06, + "logits/chosen": 723656064.0, + "logits/rejected": 505472192.0, + "logps/chosen": -209.0741729736328, + "logps/rejected": -457.989990234375, + "loss": 0.0248, + "rewards/chosen": 3.5314579010009766, + "rewards/margins": 12.064645767211914, + "rewards/rejected": -8.533187866210938, + "step": 5111 + }, + { + "epoch": 0.4670625856555505, + "grad_norm": 3.78125, + "kl": 0.0, + "learning_rate": 5.533127251597139e-06, + "logits/chosen": 480695398.4, + "logits/rejected": 260741696.0, + "logps/chosen": -299.246923828125, + "logps/rejected": -447.7364095052083, + "loss": 0.0271, + "rewards/chosen": 3.3222766876220704, + "rewards/margins": 12.809392547607422, + "rewards/rejected": -9.487115859985352, + "step": 5112 + }, + { + "epoch": 0.46715395157606215, + "grad_norm": 2.875, + "kl": 0.0, + "learning_rate": 5.53169762625979e-06, + "logits/chosen": 404073696.0, + "logits/rejected": 418756224.0, + "logps/chosen": -356.93310546875, + "logps/rejected": -335.9015197753906, + "loss": 0.0152, + "rewards/chosen": 3.9486145973205566, + "rewards/margins": 11.100704193115234, + "rewards/rejected": -7.152089595794678, + "step": 5113 + }, + { + "epoch": 0.4672453174965738, + "grad_norm": 1.59375, + "kl": 0.0, + "learning_rate": 5.530267956955973e-06, + "logits/chosen": 586489728.0, + "logits/rejected": 479829418.6666667, + "logps/chosen": -402.885009765625, + "logps/rejected": -550.9307454427084, + "loss": 0.0075, + "rewards/chosen": 3.5234971046447754, + "rewards/margins": 12.90730079015096, + "rewards/rejected": -9.383803685506185, + "step": 5114 + }, + { + "epoch": 0.46733668341708545, + "grad_norm": 2.921875, + "kl": 0.0, + "learning_rate": 5.528838243803909e-06, + "logits/chosen": 811717376.0, + "logits/rejected": 526859093.3333333, + "logps/chosen": -342.488330078125, + "logps/rejected": -474.5503336588542, + "loss": 0.0158, + "rewards/chosen": 4.321478271484375, + "rewards/margins": 13.880185190836588, + "rewards/rejected": -9.558706919352213, + "step": 5115 + }, + { + "epoch": 0.4674280493375971, + "grad_norm": 2.125, + "kl": 0.0, + "learning_rate": 5.527408486921823e-06, + "logits/chosen": 699502976.0, + "logits/rejected": 606331776.0, + "logps/chosen": -306.033935546875, + "logps/rejected": -433.72344970703125, + "loss": 0.0171, + "rewards/chosen": 3.773620128631592, + "rewards/margins": 10.82617998123169, + "rewards/rejected": -7.052559852600098, + "step": 5116 + }, + { + "epoch": 0.46751941525810875, + "grad_norm": 1.25, + "kl": 0.0, + "learning_rate": 5.5259786864279415e-06, + "logits/chosen": 1540168704.0, + "logits/rejected": 549949056.0, + "logps/chosen": -430.8099670410156, + "logps/rejected": -333.05702718098956, + "loss": 0.0072, + "rewards/chosen": 4.072161674499512, + "rewards/margins": 11.978603680928547, + "rewards/rejected": -7.906442006429036, + "step": 5117 + }, + { + "epoch": 0.46761078117862037, + "grad_norm": 1.3828125, + "kl": 0.0, + "learning_rate": 5.5245488424404945e-06, + "logits/chosen": 598550848.0, + "logits/rejected": 320033536.0, + "logps/chosen": -215.1535186767578, + "logps/rejected": -538.276123046875, + "loss": 0.0088, + "rewards/chosen": 4.568170547485352, + "rewards/margins": 16.202905654907227, + "rewards/rejected": -11.634735107421875, + "step": 5118 + }, + { + "epoch": 0.46770214709913205, + "grad_norm": 4.28125, + "kl": 0.0, + "learning_rate": 5.5231189550777195e-06, + "logits/chosen": 565978026.6666666, + "logits/rejected": 346269056.0, + "logps/chosen": -379.1978352864583, + "logps/rejected": -545.4981079101562, + "loss": 0.0252, + "rewards/chosen": 3.7379652659098306, + "rewards/margins": 16.849016825358074, + "rewards/rejected": -13.111051559448242, + "step": 5119 + }, + { + "epoch": 0.46779351301964367, + "grad_norm": 3.265625, + "kl": 0.0, + "learning_rate": 5.521689024457853e-06, + "logits/chosen": 730535168.0, + "logits/rejected": 620601344.0, + "logps/chosen": -310.58868408203125, + "logps/rejected": -570.2466430664062, + "loss": 0.018, + "rewards/chosen": 3.5946755409240723, + "rewards/margins": 13.105064868927002, + "rewards/rejected": -9.51038932800293, + "step": 5120 + }, + { + "epoch": 0.46788487894015535, + "grad_norm": 3.359375, + "kl": 0.0, + "learning_rate": 5.520259050699138e-06, + "logits/chosen": 665364480.0, + "logits/rejected": 250746240.0, + "logps/chosen": -337.26727294921875, + "logps/rejected": -285.3107604980469, + "loss": 0.0254, + "rewards/chosen": 3.7375787099202475, + "rewards/margins": 15.524020512898764, + "rewards/rejected": -11.786441802978516, + "step": 5121 + }, + { + "epoch": 0.46797624486066697, + "grad_norm": 3.609375, + "kl": 0.0, + "learning_rate": 5.5188290339198195e-06, + "logits/chosen": 451970880.0, + "logits/rejected": 446414016.0, + "logps/chosen": -293.3050231933594, + "logps/rejected": -262.33734130859375, + "loss": 0.029, + "rewards/chosen": 3.083028793334961, + "rewards/margins": 11.175548553466797, + "rewards/rejected": -8.092519760131836, + "step": 5122 + }, + { + "epoch": 0.46806761078117864, + "grad_norm": 2.0625, + "kl": 0.0, + "learning_rate": 5.517398974238148e-06, + "logits/chosen": 450925792.0, + "logits/rejected": 476631616.0, + "logps/chosen": -384.68280029296875, + "logps/rejected": -465.6513366699219, + "loss": 0.011, + "rewards/chosen": 4.315479278564453, + "rewards/margins": 12.932608604431152, + "rewards/rejected": -8.6171293258667, + "step": 5123 + }, + { + "epoch": 0.46815897670169027, + "grad_norm": 2.734375, + "kl": 0.0, + "learning_rate": 5.515968871772373e-06, + "logits/chosen": 544596992.0, + "logits/rejected": 481939541.3333333, + "logps/chosen": -296.48492431640625, + "logps/rejected": -538.68896484375, + "loss": 0.0163, + "rewards/chosen": 2.6872284412384033, + "rewards/margins": 11.918166399002075, + "rewards/rejected": -9.230937957763672, + "step": 5124 + }, + { + "epoch": 0.46825034262220194, + "grad_norm": 0.2216796875, + "kl": 0.0, + "learning_rate": 5.514538726640755e-06, + "logits/chosen": 206986352.0, + "logits/rejected": 529005275.4285714, + "logps/chosen": -317.4033508300781, + "logps/rejected": -569.2476283482143, + "loss": 0.0014, + "rewards/chosen": 5.2884063720703125, + "rewards/margins": 13.881349836077009, + "rewards/rejected": -8.592943464006696, + "step": 5125 + }, + { + "epoch": 0.46834170854271356, + "grad_norm": 0.92578125, + "kl": 0.0, + "learning_rate": 5.513108538961551e-06, + "logits/chosen": 587067520.0, + "logits/rejected": 947231158.8571428, + "logps/chosen": -258.6014404296875, + "logps/rejected": -573.4627859933036, + "loss": 0.004, + "rewards/chosen": 3.4552552700042725, + "rewards/margins": 13.550005810601371, + "rewards/rejected": -10.094750540597099, + "step": 5126 + }, + { + "epoch": 0.46843307446322524, + "grad_norm": 11.125, + "kl": 0.0, + "learning_rate": 5.5116783088530255e-06, + "logits/chosen": 1582557184.0, + "logits/rejected": 1127344384.0, + "logps/chosen": -257.9627685546875, + "logps/rejected": -457.0113932291667, + "loss": 0.0569, + "rewards/chosen": 2.4119009971618652, + "rewards/margins": 11.169960180918375, + "rewards/rejected": -8.75805918375651, + "step": 5127 + }, + { + "epoch": 0.46852444038373686, + "grad_norm": 2.296875, + "kl": 0.0, + "learning_rate": 5.510248036433444e-06, + "logits/chosen": 587732480.0, + "logits/rejected": 526850624.0, + "logps/chosen": -277.15545654296875, + "logps/rejected": -377.5904541015625, + "loss": 0.015, + "rewards/chosen": 3.805290699005127, + "rewards/margins": 11.516741275787354, + "rewards/rejected": -7.711450576782227, + "step": 5128 + }, + { + "epoch": 0.46861580630424854, + "grad_norm": 8.5, + "kl": 0.0, + "learning_rate": 5.508817721821078e-06, + "logits/chosen": 437496934.4, + "logits/rejected": 353142442.6666667, + "logps/chosen": -245.339794921875, + "logps/rejected": -445.44482421875, + "loss": 0.1157, + "rewards/chosen": 3.397035598754883, + "rewards/margins": 13.149632136027018, + "rewards/rejected": -9.752596537272135, + "step": 5129 + }, + { + "epoch": 0.46870717222476016, + "grad_norm": 9.0, + "kl": 0.0, + "learning_rate": 5.5073873651342e-06, + "logits/chosen": 431748659.2, + "logits/rejected": 703001258.6666666, + "logps/chosen": -296.79228515625, + "logps/rejected": -307.48671468098956, + "loss": 0.0962, + "rewards/chosen": 4.278787231445312, + "rewards/margins": 9.414263661702474, + "rewards/rejected": -5.135476430257161, + "step": 5130 + }, + { + "epoch": 0.46879853814527184, + "grad_norm": 1.203125, + "kl": 0.0, + "learning_rate": 5.505956966491088e-06, + "logits/chosen": 680292864.0, + "logits/rejected": 678010922.6666666, + "logps/chosen": -229.19769287109375, + "logps/rejected": -381.1313069661458, + "loss": 0.0065, + "rewards/chosen": 4.391841888427734, + "rewards/margins": 12.247667948404949, + "rewards/rejected": -7.855826059977214, + "step": 5131 + }, + { + "epoch": 0.46888990406578346, + "grad_norm": 3.609375, + "kl": 0.0, + "learning_rate": 5.504526526010023e-06, + "logits/chosen": 452304341.3333333, + "logits/rejected": 460825984.0, + "logps/chosen": -309.9427490234375, + "logps/rejected": -330.2998046875, + "loss": 0.031, + "rewards/chosen": 3.3930883407592773, + "rewards/margins": 10.392127990722656, + "rewards/rejected": -6.999039649963379, + "step": 5132 + }, + { + "epoch": 0.46898126998629514, + "grad_norm": 3.890625, + "kl": 0.0, + "learning_rate": 5.503096043809288e-06, + "logits/chosen": 836721344.0, + "logits/rejected": 581199040.0, + "logps/chosen": -365.68841552734375, + "logps/rejected": -527.0664672851562, + "loss": 0.0139, + "rewards/chosen": 4.111273288726807, + "rewards/margins": 14.809303760528564, + "rewards/rejected": -10.698030471801758, + "step": 5133 + }, + { + "epoch": 0.46907263590680676, + "grad_norm": 18.25, + "kl": 0.0, + "learning_rate": 5.501665520007173e-06, + "logits/chosen": 386688768.0, + "logits/rejected": 547545497.6, + "logps/chosen": -308.4764811197917, + "logps/rejected": -357.1633544921875, + "loss": 0.024, + "rewards/chosen": 4.740427335103353, + "rewards/margins": 12.496520932515462, + "rewards/rejected": -7.756093597412109, + "step": 5134 + }, + { + "epoch": 0.46916400182731843, + "grad_norm": 12.25, + "kl": 0.0, + "learning_rate": 5.500234954721966e-06, + "logits/chosen": 445353472.0, + "logits/rejected": 526050611.2, + "logps/chosen": -307.86676025390625, + "logps/rejected": -536.098876953125, + "loss": 0.0403, + "rewards/chosen": 3.379047711690267, + "rewards/margins": 11.748942120869955, + "rewards/rejected": -8.369894409179688, + "step": 5135 + }, + { + "epoch": 0.46925536774783005, + "grad_norm": 5.1875, + "kl": 0.0, + "learning_rate": 5.498804348071965e-06, + "logits/chosen": 588792320.0, + "logits/rejected": 335430336.0, + "logps/chosen": -380.18477957589283, + "logps/rejected": -380.87457275390625, + "loss": 0.0352, + "rewards/chosen": 3.5404104505266463, + "rewards/margins": 10.723680904933385, + "rewards/rejected": -7.183270454406738, + "step": 5136 + }, + { + "epoch": 0.46934673366834173, + "grad_norm": 1.4765625, + "kl": 0.0, + "learning_rate": 5.497373700175464e-06, + "logits/chosen": 485616128.0, + "logits/rejected": 470914368.0, + "logps/chosen": -308.8695068359375, + "logps/rejected": -541.0916748046875, + "loss": 0.0082, + "rewards/chosen": 4.380725860595703, + "rewards/margins": 13.16439151763916, + "rewards/rejected": -8.783665657043457, + "step": 5137 + }, + { + "epoch": 0.46943809958885335, + "grad_norm": 1.2578125, + "kl": 0.0, + "learning_rate": 5.495943011150766e-06, + "logits/chosen": 747784768.0, + "logits/rejected": 973335210.6666666, + "logps/chosen": -302.7806396484375, + "logps/rejected": -527.1951497395834, + "loss": 0.0067, + "rewards/chosen": 3.7987136840820312, + "rewards/margins": 12.27668571472168, + "rewards/rejected": -8.477972030639648, + "step": 5138 + }, + { + "epoch": 0.46952946550936503, + "grad_norm": 1.734375, + "kl": 0.0, + "learning_rate": 5.494512281116177e-06, + "logits/chosen": 589103872.0, + "logits/rejected": 488594602.6666667, + "logps/chosen": -353.85076904296875, + "logps/rejected": -584.50244140625, + "loss": 0.0078, + "rewards/chosen": 3.4781715869903564, + "rewards/margins": 13.453169425328573, + "rewards/rejected": -9.974997838338217, + "step": 5139 + }, + { + "epoch": 0.46962083142987665, + "grad_norm": 3.265625, + "kl": 0.0, + "learning_rate": 5.493081510190004e-06, + "logits/chosen": 999744960.0, + "logits/rejected": 565846784.0, + "logps/chosen": -316.61663818359375, + "logps/rejected": -662.0557250976562, + "loss": 0.135, + "rewards/chosen": 1.9843635559082031, + "rewards/margins": 13.370851516723633, + "rewards/rejected": -11.38648796081543, + "step": 5140 + }, + { + "epoch": 0.4697121973503883, + "grad_norm": 3.65625, + "kl": 0.0, + "learning_rate": 5.491650698490558e-06, + "logits/chosen": 617680426.6666666, + "logits/rejected": 252763904.0, + "logps/chosen": -226.1837158203125, + "logps/rejected": -381.93914794921875, + "loss": 0.0342, + "rewards/chosen": 3.318990389506022, + "rewards/margins": 13.59036127726237, + "rewards/rejected": -10.271370887756348, + "step": 5141 + }, + { + "epoch": 0.46980356327089995, + "grad_norm": 3.921875, + "kl": 0.0, + "learning_rate": 5.490219846136155e-06, + "logits/chosen": 458579456.0, + "logits/rejected": 278827980.8, + "logps/chosen": -288.238525390625, + "logps/rejected": -343.3332763671875, + "loss": 0.0213, + "rewards/chosen": 2.966460863749186, + "rewards/margins": 12.161146227518717, + "rewards/rejected": -9.194685363769532, + "step": 5142 + }, + { + "epoch": 0.4698949291914116, + "grad_norm": 2.015625, + "kl": 0.0, + "learning_rate": 5.488788953245114e-06, + "logits/chosen": 351825484.8, + "logits/rejected": 749534208.0, + "logps/chosen": -212.75810546875, + "logps/rejected": -894.4991861979166, + "loss": 0.0129, + "rewards/chosen": 4.548378372192383, + "rewards/margins": 17.349617385864256, + "rewards/rejected": -12.801239013671875, + "step": 5143 + }, + { + "epoch": 0.46998629511192325, + "grad_norm": 2.765625, + "kl": 0.0, + "learning_rate": 5.487358019935754e-06, + "logits/chosen": 492080341.3333333, + "logits/rejected": 432606560.0, + "logps/chosen": -410.1735432942708, + "logps/rejected": -527.4642333984375, + "loss": 0.0154, + "rewards/chosen": 4.363739967346191, + "rewards/margins": 12.31693983078003, + "rewards/rejected": -7.953199863433838, + "step": 5144 + }, + { + "epoch": 0.4700776610324349, + "grad_norm": 53.5, + "kl": 0.0, + "learning_rate": 5.485927046326401e-06, + "logits/chosen": 357025280.0, + "logits/rejected": 472043562.6666667, + "logps/chosen": -260.188818359375, + "logps/rejected": -307.73402913411456, + "loss": 0.0672, + "rewards/chosen": 3.2763946533203123, + "rewards/margins": 13.452417500813802, + "rewards/rejected": -10.17602284749349, + "step": 5145 + }, + { + "epoch": 0.47016902695294654, + "grad_norm": 2.90625, + "kl": 0.0, + "learning_rate": 5.484496032535385e-06, + "logits/chosen": 653787392.0, + "logits/rejected": 745581482.6666666, + "logps/chosen": -369.70126953125, + "logps/rejected": -730.5545247395834, + "loss": 0.0138, + "rewards/chosen": 4.729788208007813, + "rewards/margins": 13.681220753987631, + "rewards/rejected": -8.951432545979818, + "step": 5146 + }, + { + "epoch": 0.4702603928734582, + "grad_norm": 58.75, + "kl": 0.0, + "learning_rate": 5.483064978681033e-06, + "logits/chosen": 469272064.0, + "logits/rejected": 491188633.6, + "logps/chosen": -297.02972412109375, + "logps/rejected": -509.88564453125, + "loss": 0.0709, + "rewards/chosen": 2.8266121546427407, + "rewards/margins": 11.280486361185709, + "rewards/rejected": -8.453874206542968, + "step": 5147 + }, + { + "epoch": 0.47035175879396984, + "grad_norm": 1.1484375, + "kl": 0.0, + "learning_rate": 5.481633884881685e-06, + "logits/chosen": 393382720.0, + "logits/rejected": 532554069.3333333, + "logps/chosen": -489.7737731933594, + "logps/rejected": -524.8961588541666, + "loss": 0.0046, + "rewards/chosen": 4.120152473449707, + "rewards/margins": 13.441198666890463, + "rewards/rejected": -9.321046193440756, + "step": 5148 + }, + { + "epoch": 0.4704431247144815, + "grad_norm": 13.5, + "kl": 0.0, + "learning_rate": 5.480202751255678e-06, + "logits/chosen": 418893132.8, + "logits/rejected": 338830805.3333333, + "logps/chosen": -254.2560546875, + "logps/rejected": -334.69753011067706, + "loss": 0.135, + "rewards/chosen": 2.0573936462402345, + "rewards/margins": 10.684932963053384, + "rewards/rejected": -8.62753931681315, + "step": 5149 + }, + { + "epoch": 0.47053449063499314, + "grad_norm": 2.171875, + "kl": 0.0, + "learning_rate": 5.478771577921351e-06, + "logits/chosen": 413969322.6666667, + "logits/rejected": 642920704.0, + "logps/chosen": -223.75895182291666, + "logps/rejected": -786.503662109375, + "loss": 0.014, + "rewards/chosen": 4.452958106994629, + "rewards/margins": 11.68445873260498, + "rewards/rejected": -7.231500625610352, + "step": 5150 + }, + { + "epoch": 0.4706258565555048, + "grad_norm": 2.609375, + "kl": 0.0, + "learning_rate": 5.477340364997051e-06, + "logits/chosen": 597456128.0, + "logits/rejected": 454388384.0, + "logps/chosen": -407.2907409667969, + "logps/rejected": -475.4057312011719, + "loss": 0.0111, + "rewards/chosen": 4.323764801025391, + "rewards/margins": 13.758665084838867, + "rewards/rejected": -9.434900283813477, + "step": 5151 + }, + { + "epoch": 0.47071722247601644, + "grad_norm": 3.234375, + "kl": 0.0, + "learning_rate": 5.475909112601125e-06, + "logits/chosen": 414009120.0, + "logits/rejected": 902280960.0, + "logps/chosen": -191.19239807128906, + "logps/rejected": -387.1333312988281, + "loss": 0.019, + "rewards/chosen": 3.87785267829895, + "rewards/margins": 11.65884518623352, + "rewards/rejected": -7.78099250793457, + "step": 5152 + }, + { + "epoch": 0.4708085883965281, + "grad_norm": 4.53125, + "kl": 0.0, + "learning_rate": 5.474477820851926e-06, + "logits/chosen": 468599193.6, + "logits/rejected": 594608896.0, + "logps/chosen": -322.3183349609375, + "logps/rejected": -797.5929361979166, + "loss": 0.0278, + "rewards/chosen": 3.575095367431641, + "rewards/margins": 14.898950449625652, + "rewards/rejected": -11.32385508219401, + "step": 5153 + }, + { + "epoch": 0.47089995431703974, + "grad_norm": 54.25, + "kl": 0.0, + "learning_rate": 5.473046489867806e-06, + "logits/chosen": 385227178.6666667, + "logits/rejected": 481641267.2, + "logps/chosen": -136.66691080729166, + "logps/rejected": -313.867529296875, + "loss": 0.0878, + "rewards/chosen": 4.07852045694987, + "rewards/margins": 11.220074717203776, + "rewards/rejected": -7.141554260253907, + "step": 5154 + }, + { + "epoch": 0.4709913202375514, + "grad_norm": 28.625, + "kl": 0.0, + "learning_rate": 5.471615119767125e-06, + "logits/chosen": 895084748.8, + "logits/rejected": 434003541.3333333, + "logps/chosen": -351.012451171875, + "logps/rejected": -338.7443033854167, + "loss": 0.078, + "rewards/chosen": 2.5108362197875977, + "rewards/margins": 7.299130694071453, + "rewards/rejected": -4.7882944742838545, + "step": 5155 + }, + { + "epoch": 0.47108268615806304, + "grad_norm": 3.90625, + "kl": 0.0, + "learning_rate": 5.470183710668244e-06, + "logits/chosen": 777331251.2, + "logits/rejected": 801777322.6666666, + "logps/chosen": -223.370263671875, + "logps/rejected": -307.2668050130208, + "loss": 0.1406, + "rewards/chosen": 2.0644145965576173, + "rewards/margins": 10.70982577006022, + "rewards/rejected": -8.645411173502604, + "step": 5156 + }, + { + "epoch": 0.4711740520785747, + "grad_norm": 5.125, + "kl": 0.0, + "learning_rate": 5.468752262689525e-06, + "logits/chosen": 467045478.4, + "logits/rejected": 492818474.6666667, + "logps/chosen": -265.271484375, + "logps/rejected": -484.0111490885417, + "loss": 0.0378, + "rewards/chosen": 3.155093955993652, + "rewards/margins": 11.861102867126466, + "rewards/rejected": -8.706008911132812, + "step": 5157 + }, + { + "epoch": 0.47126541799908633, + "grad_norm": 6.15625, + "kl": 0.0, + "learning_rate": 5.467320775949337e-06, + "logits/chosen": 435160217.6, + "logits/rejected": 307442389.3333333, + "logps/chosen": -447.14599609375, + "logps/rejected": -398.4259440104167, + "loss": 0.0258, + "rewards/chosen": 3.8820823669433593, + "rewards/margins": 13.168198649088541, + "rewards/rejected": -9.286116282145182, + "step": 5158 + }, + { + "epoch": 0.471356783919598, + "grad_norm": 2.71875, + "kl": 0.0, + "learning_rate": 5.465889250566052e-06, + "logits/chosen": 1249545557.3333333, + "logits/rejected": 867834470.4, + "logps/chosen": -523.6954752604166, + "logps/rejected": -462.844140625, + "loss": 0.0138, + "rewards/chosen": 3.3663371404012046, + "rewards/margins": 12.130398114522299, + "rewards/rejected": -8.764060974121094, + "step": 5159 + }, + { + "epoch": 0.47144814984010963, + "grad_norm": 0.1552734375, + "kl": 0.0, + "learning_rate": 5.464457686658041e-06, + "logits/chosen": 335873984.0, + "logits/rejected": 774729069.7142857, + "logps/chosen": -271.6330871582031, + "logps/rejected": -460.8617466517857, + "loss": 0.0006, + "rewards/chosen": 5.443869113922119, + "rewards/margins": 16.083487033843994, + "rewards/rejected": -10.639617919921875, + "step": 5160 + }, + { + "epoch": 0.4715395157606213, + "grad_norm": 2.59375, + "kl": 0.0, + "learning_rate": 5.463026084343683e-06, + "logits/chosen": 445442645.3333333, + "logits/rejected": 419470368.0, + "logps/chosen": -358.9731038411458, + "logps/rejected": -280.443603515625, + "loss": 0.0195, + "rewards/chosen": 4.137128512064616, + "rewards/margins": 12.138824144999187, + "rewards/rejected": -8.00169563293457, + "step": 5161 + }, + { + "epoch": 0.47163088168113293, + "grad_norm": 1.46875, + "kl": 0.0, + "learning_rate": 5.461594443741358e-06, + "logits/chosen": 516330965.3333333, + "logits/rejected": 796980121.6, + "logps/chosen": -332.6182861328125, + "logps/rejected": -488.488916015625, + "loss": 0.008, + "rewards/chosen": 4.037542978922526, + "rewards/margins": 13.129880015055338, + "rewards/rejected": -9.092337036132813, + "step": 5162 + }, + { + "epoch": 0.4717222476016446, + "grad_norm": 2.21875, + "kl": 0.0, + "learning_rate": 5.46016276496945e-06, + "logits/chosen": 484962816.0, + "logits/rejected": 502560448.0, + "logps/chosen": -251.40855407714844, + "logps/rejected": -389.0435485839844, + "loss": 0.0103, + "rewards/chosen": 4.521487236022949, + "rewards/margins": 13.9518461227417, + "rewards/rejected": -9.43035888671875, + "step": 5163 + }, + { + "epoch": 0.4718136135221562, + "grad_norm": 1.3984375, + "kl": 0.0, + "learning_rate": 5.458731048146344e-06, + "logits/chosen": 366878165.3333333, + "logits/rejected": 466818150.4, + "logps/chosen": -290.07623291015625, + "logps/rejected": -550.23857421875, + "loss": 0.0154, + "rewards/chosen": 4.1443430582682295, + "rewards/margins": 14.964580790201822, + "rewards/rejected": -10.820237731933593, + "step": 5164 + }, + { + "epoch": 0.4719049794426679, + "grad_norm": 1.8125, + "kl": 0.0, + "learning_rate": 5.45729929339043e-06, + "logits/chosen": 432738560.0, + "logits/rejected": 518653952.0, + "logps/chosen": -238.1265411376953, + "logps/rejected": -661.3033040364584, + "loss": 0.0078, + "rewards/chosen": 3.740330696105957, + "rewards/margins": 12.110550244649252, + "rewards/rejected": -8.370219548543295, + "step": 5165 + }, + { + "epoch": 0.4719963453631795, + "grad_norm": 0.9609375, + "kl": 0.0, + "learning_rate": 5.455867500820103e-06, + "logits/chosen": 314342912.0, + "logits/rejected": 389727078.4, + "logps/chosen": -327.41807047526044, + "logps/rejected": -313.381787109375, + "loss": 0.0056, + "rewards/chosen": 4.53799565633138, + "rewards/margins": 12.586228688557942, + "rewards/rejected": -8.048233032226562, + "step": 5166 + }, + { + "epoch": 0.4720877112836912, + "grad_norm": 3.25, + "kl": 0.0, + "learning_rate": 5.454435670553756e-06, + "logits/chosen": 563643392.0, + "logits/rejected": 461388096.0, + "logps/chosen": -329.1001790364583, + "logps/rejected": -197.4563751220703, + "loss": 0.0216, + "rewards/chosen": 3.7444985707600913, + "rewards/margins": 11.490219910939535, + "rewards/rejected": -7.745721340179443, + "step": 5167 + }, + { + "epoch": 0.4721790772042028, + "grad_norm": 3.109375, + "kl": 0.0, + "learning_rate": 5.453003802709791e-06, + "logits/chosen": 806133248.0, + "logits/rejected": 1519828650.6666667, + "logps/chosen": -495.3439453125, + "logps/rejected": -770.8668619791666, + "loss": 0.0178, + "rewards/chosen": 4.012981033325195, + "rewards/margins": 12.91780751546224, + "rewards/rejected": -8.904826482137045, + "step": 5168 + }, + { + "epoch": 0.4722704431247145, + "grad_norm": 0.35546875, + "kl": 0.0, + "learning_rate": 5.451571897406608e-06, + "logits/chosen": 376336298.6666667, + "logits/rejected": 323432115.2, + "logps/chosen": -321.1995849609375, + "logps/rejected": -424.86337890625, + "loss": 0.0022, + "rewards/chosen": 5.861979802449544, + "rewards/margins": 13.957547124226888, + "rewards/rejected": -8.095567321777343, + "step": 5169 + }, + { + "epoch": 0.4723618090452261, + "grad_norm": 2.453125, + "kl": 0.0, + "learning_rate": 5.450139954762613e-06, + "logits/chosen": 472490688.0, + "logits/rejected": 566076928.0, + "logps/chosen": -378.5589294433594, + "logps/rejected": -525.712890625, + "loss": 0.0125, + "rewards/chosen": 4.146095275878906, + "rewards/margins": 14.263057708740234, + "rewards/rejected": -10.116962432861328, + "step": 5170 + }, + { + "epoch": 0.4724531749657378, + "grad_norm": 2.4375, + "kl": 0.0, + "learning_rate": 5.448707974896214e-06, + "logits/chosen": 492293888.0, + "logits/rejected": 381539788.8, + "logps/chosen": -381.3546142578125, + "logps/rejected": -286.9437255859375, + "loss": 0.0066, + "rewards/chosen": 4.9111175537109375, + "rewards/margins": 12.242401123046875, + "rewards/rejected": -7.3312835693359375, + "step": 5171 + }, + { + "epoch": 0.4725445408862494, + "grad_norm": 3.609375, + "kl": 4.098941802978516, + "learning_rate": 5.4472759579258225e-06, + "logits/chosen": 531664201.14285713, + "logits/rejected": 536291392.0, + "logps/chosen": -402.78274972098217, + "logps/rejected": -589.66015625, + "loss": 0.0267, + "rewards/chosen": 4.020463398524693, + "rewards/margins": 12.25513117653983, + "rewards/rejected": -8.234667778015137, + "step": 5172 + }, + { + "epoch": 0.4726359068067611, + "grad_norm": 27.5, + "kl": 0.0, + "learning_rate": 5.445843903969854e-06, + "logits/chosen": 564484608.0, + "logits/rejected": 906467200.0, + "logps/chosen": -276.9992370605469, + "logps/rejected": -302.14068603515625, + "loss": 0.0449, + "rewards/chosen": 3.0038254261016846, + "rewards/margins": 10.930976152420044, + "rewards/rejected": -7.927150726318359, + "step": 5173 + }, + { + "epoch": 0.4727272727272727, + "grad_norm": 1.734375, + "kl": 0.0, + "learning_rate": 5.444411813146725e-06, + "logits/chosen": 592596224.0, + "logits/rejected": 1108862361.6, + "logps/chosen": -238.86848958333334, + "logps/rejected": -480.764306640625, + "loss": 0.0089, + "rewards/chosen": 4.47867997487386, + "rewards/margins": 14.596786053975421, + "rewards/rejected": -10.118106079101562, + "step": 5174 + }, + { + "epoch": 0.4728186386477844, + "grad_norm": 4.09375, + "kl": 0.0, + "learning_rate": 5.442979685574856e-06, + "logits/chosen": 678825984.0, + "logits/rejected": 305426496.0, + "logps/chosen": -352.22251674107144, + "logps/rejected": -639.6728515625, + "loss": 0.035, + "rewards/chosen": 3.239733559744699, + "rewards/margins": 17.096615655081614, + "rewards/rejected": -13.856882095336914, + "step": 5175 + }, + { + "epoch": 0.472910004568296, + "grad_norm": 4.375, + "kl": 0.0, + "learning_rate": 5.4415475213726736e-06, + "logits/chosen": 552167936.0, + "logits/rejected": 448281856.0, + "logps/chosen": -348.119384765625, + "logps/rejected": -273.1778564453125, + "loss": 0.1339, + "rewards/chosen": 3.2323861122131348, + "rewards/margins": 8.113253116607666, + "rewards/rejected": -4.880867004394531, + "step": 5176 + }, + { + "epoch": 0.4730013704888077, + "grad_norm": 2.890625, + "kl": 0.0, + "learning_rate": 5.4401153206586e-06, + "logits/chosen": 395168480.0, + "logits/rejected": 381152448.0, + "logps/chosen": -333.072021484375, + "logps/rejected": -473.8780517578125, + "loss": 0.0183, + "rewards/chosen": 3.618927478790283, + "rewards/margins": 12.357462406158447, + "rewards/rejected": -8.738534927368164, + "step": 5177 + }, + { + "epoch": 0.4730927364093193, + "grad_norm": 0.53125, + "kl": 0.0, + "learning_rate": 5.4386830835510685e-06, + "logits/chosen": 291307861.3333333, + "logits/rejected": 219351449.6, + "logps/chosen": -306.7943522135417, + "logps/rejected": -382.952587890625, + "loss": 0.0024, + "rewards/chosen": 5.1743119557698565, + "rewards/margins": 15.601309331258136, + "rewards/rejected": -10.42699737548828, + "step": 5178 + }, + { + "epoch": 0.473184102329831, + "grad_norm": 3.875, + "kl": 0.0, + "learning_rate": 5.437250810168509e-06, + "logits/chosen": 547953024.0, + "logits/rejected": 303343552.0, + "logps/chosen": -252.752685546875, + "logps/rejected": -363.5001627604167, + "loss": 0.0213, + "rewards/chosen": 2.5698533058166504, + "rewards/margins": 11.971793333689371, + "rewards/rejected": -9.40194002787272, + "step": 5179 + }, + { + "epoch": 0.4732754682503426, + "grad_norm": 1.3125, + "kl": 0.0, + "learning_rate": 5.435818500629361e-06, + "logits/chosen": 677412352.0, + "logits/rejected": 406072544.0, + "logps/chosen": -403.5430908203125, + "logps/rejected": -404.21588134765625, + "loss": 0.0077, + "rewards/chosen": 4.257610321044922, + "rewards/margins": 12.774678230285645, + "rewards/rejected": -8.517067909240723, + "step": 5180 + }, + { + "epoch": 0.4733668341708543, + "grad_norm": 0.236328125, + "kl": 0.0, + "learning_rate": 5.434386155052059e-06, + "logits/chosen": 274735936.0, + "logits/rejected": 862742613.3333334, + "logps/chosen": -85.3503189086914, + "logps/rejected": -722.8880208333334, + "loss": 0.0016, + "rewards/chosen": 5.449808120727539, + "rewards/margins": 16.529148737589516, + "rewards/rejected": -11.079340616861979, + "step": 5181 + }, + { + "epoch": 0.4734582000913659, + "grad_norm": 3.4375, + "kl": 0.0, + "learning_rate": 5.4329537735550465e-06, + "logits/chosen": 560678400.0, + "logits/rejected": 609755776.0, + "logps/chosen": -326.862548828125, + "logps/rejected": -583.0894775390625, + "loss": 0.0226, + "rewards/chosen": 3.329176425933838, + "rewards/margins": 15.844974994659424, + "rewards/rejected": -12.515798568725586, + "step": 5182 + }, + { + "epoch": 0.4735495660118776, + "grad_norm": 2.890625, + "kl": 0.0, + "learning_rate": 5.43152135625677e-06, + "logits/chosen": 568701824.0, + "logits/rejected": 579034931.2, + "logps/chosen": -286.6591389973958, + "logps/rejected": -439.8912109375, + "loss": 0.0196, + "rewards/chosen": 2.917741139729818, + "rewards/margins": 11.615662129720052, + "rewards/rejected": -8.697920989990234, + "step": 5183 + }, + { + "epoch": 0.4736409319323892, + "grad_norm": 3.0625, + "kl": 0.0, + "learning_rate": 5.430088903275675e-06, + "logits/chosen": 1100695936.0, + "logits/rejected": 438462688.0, + "logps/chosen": -348.94952392578125, + "logps/rejected": -367.74078369140625, + "loss": 0.0143, + "rewards/chosen": 4.627634048461914, + "rewards/margins": 12.115307807922363, + "rewards/rejected": -7.487673759460449, + "step": 5184 + }, + { + "epoch": 0.4737322978529009, + "grad_norm": 1.8046875, + "kl": 0.0, + "learning_rate": 5.4286564147302126e-06, + "logits/chosen": 794332224.0, + "logits/rejected": 529862229.3333333, + "logps/chosen": -727.2288818359375, + "logps/rejected": -469.8980305989583, + "loss": 0.0075, + "rewards/chosen": 3.5664916038513184, + "rewards/margins": 12.617232163747152, + "rewards/rejected": -9.050740559895834, + "step": 5185 + }, + { + "epoch": 0.4738236637734125, + "grad_norm": 2.5625, + "kl": 0.0, + "learning_rate": 5.427223890738835e-06, + "logits/chosen": 976759296.0, + "logits/rejected": 988437094.4, + "logps/chosen": -363.9419352213542, + "logps/rejected": -380.97275390625, + "loss": 0.0155, + "rewards/chosen": 3.2342713673909507, + "rewards/margins": 11.017799504597981, + "rewards/rejected": -7.783528137207031, + "step": 5186 + }, + { + "epoch": 0.4739150296939242, + "grad_norm": 2.671875, + "kl": 0.0, + "learning_rate": 5.425791331420002e-06, + "logits/chosen": 564090470.4, + "logits/rejected": 510247125.3333333, + "logps/chosen": -406.5893310546875, + "logps/rejected": -669.4873860677084, + "loss": 0.0174, + "rewards/chosen": 3.863450622558594, + "rewards/margins": 18.273563639322916, + "rewards/rejected": -14.410113016764322, + "step": 5187 + }, + { + "epoch": 0.4740063956144358, + "grad_norm": 2.34375, + "kl": 0.0, + "learning_rate": 5.424358736892168e-06, + "logits/chosen": 1602813696.0, + "logits/rejected": 758575542.8571428, + "logps/chosen": -752.47314453125, + "logps/rejected": -541.1639229910714, + "loss": 0.0065, + "rewards/chosen": 2.971386671066284, + "rewards/margins": 12.14439082145691, + "rewards/rejected": -9.173004150390625, + "step": 5188 + }, + { + "epoch": 0.4740977615349475, + "grad_norm": 15.9375, + "kl": 0.0, + "learning_rate": 5.4229261072738e-06, + "logits/chosen": 654621226.6666666, + "logits/rejected": 760522854.4, + "logps/chosen": -466.8530680338542, + "logps/rejected": -534.407421875, + "loss": 0.0273, + "rewards/chosen": 3.284094492594401, + "rewards/margins": 11.077804819742838, + "rewards/rejected": -7.793710327148437, + "step": 5189 + }, + { + "epoch": 0.4741891274554591, + "grad_norm": 4.40625, + "kl": 0.0, + "learning_rate": 5.421493442683362e-06, + "logits/chosen": 889487667.2, + "logits/rejected": 540063146.6666666, + "logps/chosen": -260.9333740234375, + "logps/rejected": -349.7124837239583, + "loss": 0.0342, + "rewards/chosen": 3.403730010986328, + "rewards/margins": 11.644898986816406, + "rewards/rejected": -8.241168975830078, + "step": 5190 + }, + { + "epoch": 0.4742804933759708, + "grad_norm": 2.96875, + "kl": 0.0, + "learning_rate": 5.42006074323932e-06, + "logits/chosen": 851558997.3333334, + "logits/rejected": 563612006.4, + "logps/chosen": -204.72509765625, + "logps/rejected": -463.8787109375, + "loss": 0.027, + "rewards/chosen": 2.781535784403483, + "rewards/margins": 12.611262957255045, + "rewards/rejected": -9.829727172851562, + "step": 5191 + }, + { + "epoch": 0.4743718592964824, + "grad_norm": 4.03125, + "kl": 0.0, + "learning_rate": 5.418628009060147e-06, + "logits/chosen": 451213248.0, + "logits/rejected": 927256960.0, + "logps/chosen": -170.14837646484375, + "logps/rejected": -487.6427917480469, + "loss": 0.0231, + "rewards/chosen": 3.70802903175354, + "rewards/margins": 12.796968698501587, + "rewards/rejected": -9.088939666748047, + "step": 5192 + }, + { + "epoch": 0.4744632252169941, + "grad_norm": 2.34375, + "kl": 2.7571258544921875, + "learning_rate": 5.4171952402643176e-06, + "logits/chosen": 860095853.7142857, + "logits/rejected": 365294112.0, + "logps/chosen": -472.39404296875, + "logps/rejected": -133.4508514404297, + "loss": 0.0131, + "rewards/chosen": 5.402054922921317, + "rewards/margins": 10.96320595060076, + "rewards/rejected": -5.561151027679443, + "step": 5193 + }, + { + "epoch": 0.4745545911375057, + "grad_norm": 3.5, + "kl": 0.0, + "learning_rate": 5.4157624369703055e-06, + "logits/chosen": 634761318.4, + "logits/rejected": 207052032.0, + "logps/chosen": -362.5711181640625, + "logps/rejected": -368.6629231770833, + "loss": 0.0229, + "rewards/chosen": 3.4230796813964846, + "rewards/margins": 14.152833811442058, + "rewards/rejected": -10.729754130045572, + "step": 5194 + }, + { + "epoch": 0.4746459570580174, + "grad_norm": 3.875, + "kl": 0.0, + "learning_rate": 5.414329599296594e-06, + "logits/chosen": 523779136.0, + "logits/rejected": 508775594.6666667, + "logps/chosen": -349.3050231933594, + "logps/rejected": -475.6348876953125, + "loss": 0.0185, + "rewards/chosen": 2.7781600952148438, + "rewards/margins": 12.199530919392904, + "rewards/rejected": -9.42137082417806, + "step": 5195 + }, + { + "epoch": 0.474737322978529, + "grad_norm": 1.046875, + "kl": 0.0, + "learning_rate": 5.412896727361663e-06, + "logits/chosen": 1133491456.0, + "logits/rejected": 738779989.3333334, + "logps/chosen": -378.3470458984375, + "logps/rejected": -514.019775390625, + "loss": 0.0051, + "rewards/chosen": 4.5371904373168945, + "rewards/margins": 13.854799588521322, + "rewards/rejected": -9.317609151204428, + "step": 5196 + }, + { + "epoch": 0.4748286888990407, + "grad_norm": 6.71875, + "kl": 6.91362190246582, + "learning_rate": 5.411463821283998e-06, + "logits/chosen": 657366528.0, + "logps/chosen": -466.65155029296875, + "loss": 0.0501, + "rewards/chosen": 4.113079071044922, + "step": 5197 + }, + { + "epoch": 0.4749200548195523, + "grad_norm": 0.3828125, + "kl": 0.0, + "learning_rate": 5.410030881182085e-06, + "logits/chosen": 305839488.0, + "logits/rejected": 479026278.4, + "logps/chosen": -331.4619954427083, + "logps/rejected": -569.838330078125, + "loss": 0.0019, + "rewards/chosen": 5.536734263102214, + "rewards/margins": 15.570589701334637, + "rewards/rejected": -10.033855438232422, + "step": 5198 + }, + { + "epoch": 0.47501142074006397, + "grad_norm": 2.125, + "kl": 0.0, + "learning_rate": 5.40859790717442e-06, + "logits/chosen": 594551936.0, + "logits/rejected": 783219865.6, + "logps/chosen": -478.312744140625, + "logps/rejected": -581.2544921875, + "loss": 0.0088, + "rewards/chosen": 4.175742785135905, + "rewards/margins": 13.39721342722575, + "rewards/rejected": -9.221470642089844, + "step": 5199 + }, + { + "epoch": 0.4751027866605756, + "grad_norm": 1.609375, + "kl": 0.0, + "learning_rate": 5.407164899379492e-06, + "logits/chosen": 582202197.3333334, + "logits/rejected": 433374617.6, + "logps/chosen": -403.30712890625, + "logps/rejected": -451.521484375, + "loss": 0.0095, + "rewards/chosen": 3.810508410135905, + "rewards/margins": 13.075520006815593, + "rewards/rejected": -9.265011596679688, + "step": 5200 + }, + { + "epoch": 0.47519415258108727, + "grad_norm": 46.0, + "kl": 0.0, + "learning_rate": 5.405731857915801e-06, + "logits/chosen": 657345664.0, + "logits/rejected": 485604704.0, + "logps/chosen": -325.50189208984375, + "logps/rejected": -456.68328857421875, + "loss": 0.0587, + "rewards/chosen": 3.185886859893799, + "rewards/margins": 12.634876728057861, + "rewards/rejected": -9.448989868164062, + "step": 5201 + }, + { + "epoch": 0.4752855185015989, + "grad_norm": 3.5, + "kl": 0.0, + "learning_rate": 5.404298782901844e-06, + "logits/chosen": 1093226922.6666667, + "logits/rejected": 1054371635.2, + "logps/chosen": -372.547607421875, + "logps/rejected": -685.212109375, + "loss": 0.0176, + "rewards/chosen": 3.321962038675944, + "rewards/margins": 14.174409548441568, + "rewards/rejected": -10.852447509765625, + "step": 5202 + }, + { + "epoch": 0.47537688442211057, + "grad_norm": 3.015625, + "kl": 0.0, + "learning_rate": 5.402865674456122e-06, + "logits/chosen": 531620710.4, + "logits/rejected": 813657770.6666666, + "logps/chosen": -341.64931640625, + "logps/rejected": -325.2047119140625, + "loss": 0.0161, + "rewards/chosen": 4.392319107055664, + "rewards/margins": 11.73505948384603, + "rewards/rejected": -7.342740376790364, + "step": 5203 + }, + { + "epoch": 0.4754682503426222, + "grad_norm": 1.6171875, + "kl": 0.0, + "learning_rate": 5.401432532697143e-06, + "logits/chosen": 581429845.3333334, + "logits/rejected": 572908083.2, + "logps/chosen": -415.403564453125, + "logps/rejected": -508.494140625, + "loss": 0.0079, + "rewards/chosen": 4.28839111328125, + "rewards/margins": 12.293264770507813, + "rewards/rejected": -8.004873657226563, + "step": 5204 + }, + { + "epoch": 0.47555961626313387, + "grad_norm": 0.875, + "kl": 0.0, + "learning_rate": 5.399999357743412e-06, + "logits/chosen": 313756032.0, + "logits/rejected": 265995494.4, + "logps/chosen": -178.89998372395834, + "logps/rejected": -349.297216796875, + "loss": 0.0042, + "rewards/chosen": 4.996512413024902, + "rewards/margins": 13.723256492614746, + "rewards/rejected": -8.726744079589844, + "step": 5205 + }, + { + "epoch": 0.4756509821836455, + "grad_norm": 3.828125, + "kl": 0.0, + "learning_rate": 5.398566149713441e-06, + "logits/chosen": 622313685.3333334, + "logits/rejected": 462682009.6, + "logps/chosen": -308.8945719401042, + "logps/rejected": -560.2466796875, + "loss": 0.0182, + "rewards/chosen": 3.208070755004883, + "rewards/margins": 12.181883621215821, + "rewards/rejected": -8.973812866210938, + "step": 5206 + }, + { + "epoch": 0.47574234810415716, + "grad_norm": 22.0, + "kl": 0.0, + "learning_rate": 5.39713290872574e-06, + "logits/chosen": 622488064.0, + "logits/rejected": 773301248.0, + "logps/chosen": -427.77099609375, + "logps/rejected": -673.32421875, + "loss": 0.1112, + "rewards/chosen": 2.654181957244873, + "rewards/margins": 12.864020824432373, + "rewards/rejected": -10.2098388671875, + "step": 5207 + }, + { + "epoch": 0.4758337140246688, + "grad_norm": 6.96875, + "kl": 0.0, + "learning_rate": 5.39569963489883e-06, + "logits/chosen": 661640533.3333334, + "logits/rejected": 663287744.0, + "logps/chosen": -349.858154296875, + "logps/rejected": -882.0464477539062, + "loss": 0.0439, + "rewards/chosen": 3.015124956766764, + "rewards/margins": 11.107903162638346, + "rewards/rejected": -8.092778205871582, + "step": 5208 + }, + { + "epoch": 0.47592507994518046, + "grad_norm": 3.296875, + "kl": 0.0, + "learning_rate": 5.394266328351226e-06, + "logits/chosen": 468308181.3333333, + "logits/rejected": 717590579.2, + "logps/chosen": -454.5159505208333, + "logps/rejected": -554.3248046875, + "loss": 0.0118, + "rewards/chosen": 4.338123639424642, + "rewards/margins": 12.742814191182454, + "rewards/rejected": -8.404690551757813, + "step": 5209 + }, + { + "epoch": 0.4760164458656921, + "grad_norm": 2.65625, + "kl": 0.0, + "learning_rate": 5.392832989201449e-06, + "logits/chosen": 495600128.0, + "logits/rejected": 726997162.6666666, + "logps/chosen": -224.24951171875, + "logps/rejected": -601.036865234375, + "loss": 0.0229, + "rewards/chosen": 3.6352848052978515, + "rewards/margins": 14.49422747294108, + "rewards/rejected": -10.858942667643229, + "step": 5210 + }, + { + "epoch": 0.47610781178620376, + "grad_norm": 20.25, + "kl": 0.0, + "learning_rate": 5.391399617568022e-06, + "logits/chosen": 572484761.6, + "logits/rejected": 794287957.3333334, + "logps/chosen": -271.70380859375, + "logps/rejected": -405.0859375, + "loss": 0.0217, + "rewards/chosen": 4.7772216796875, + "rewards/margins": 12.04148546854655, + "rewards/rejected": -7.26426378885905, + "step": 5211 + }, + { + "epoch": 0.4761991777067154, + "grad_norm": 4.0, + "kl": 0.0, + "learning_rate": 5.389966213569475e-06, + "logits/chosen": 500593600.0, + "logits/rejected": 614113024.0, + "logps/chosen": -402.31817626953125, + "logps/rejected": -570.5341186523438, + "loss": 0.022, + "rewards/chosen": 3.418800115585327, + "rewards/margins": 12.853978872299194, + "rewards/rejected": -9.435178756713867, + "step": 5212 + }, + { + "epoch": 0.47629054362722706, + "grad_norm": 0.8828125, + "kl": 0.0, + "learning_rate": 5.388532777324335e-06, + "logits/chosen": 578309120.0, + "logits/rejected": 527290368.0, + "logps/chosen": -325.2430419921875, + "logps/rejected": -639.68408203125, + "loss": 0.0058, + "rewards/chosen": 4.397255897521973, + "rewards/margins": 15.383510780334472, + "rewards/rejected": -10.9862548828125, + "step": 5213 + }, + { + "epoch": 0.4763819095477387, + "grad_norm": 3.15625, + "kl": 0.0, + "learning_rate": 5.387099308951133e-06, + "logits/chosen": 561164245.3333334, + "logits/rejected": 703572940.8, + "logps/chosen": -300.7270100911458, + "logps/rejected": -651.536328125, + "loss": 0.0166, + "rewards/chosen": 3.714043935139974, + "rewards/margins": 13.1278683980306, + "rewards/rejected": -9.413824462890625, + "step": 5214 + }, + { + "epoch": 0.47647327546825036, + "grad_norm": 3.9375, + "kl": 0.0, + "learning_rate": 5.385665808568405e-06, + "logits/chosen": 531117129.14285713, + "logits/rejected": 647002496.0, + "logps/chosen": -417.64432198660717, + "logps/rejected": -477.3490295410156, + "loss": 0.03, + "rewards/chosen": 3.685224805559431, + "rewards/margins": 14.452967916216169, + "rewards/rejected": -10.767743110656738, + "step": 5215 + }, + { + "epoch": 0.476564641388762, + "grad_norm": 5.96875, + "kl": 0.0, + "learning_rate": 5.384232276294689e-06, + "logits/chosen": 363172960.0, + "logits/rejected": 337512320.0, + "logps/chosen": -408.0602722167969, + "logps/rejected": -417.2882486979167, + "loss": 0.0198, + "rewards/chosen": 3.8048553466796875, + "rewards/margins": 11.90344492594401, + "rewards/rejected": -8.098589579264322, + "step": 5216 + }, + { + "epoch": 0.47665600730927365, + "grad_norm": 4.21875, + "kl": 0.0, + "learning_rate": 5.382798712248524e-06, + "logits/chosen": 358473088.0, + "logits/rejected": 766275840.0, + "logps/chosen": -258.25946044921875, + "logps/rejected": -520.6679077148438, + "loss": 0.0384, + "rewards/chosen": 3.2437222003936768, + "rewards/margins": 10.213958024978638, + "rewards/rejected": -6.970235824584961, + "step": 5217 + }, + { + "epoch": 0.4767473732297853, + "grad_norm": 2.234375, + "kl": 0.0, + "learning_rate": 5.38136511654845e-06, + "logits/chosen": 451473493.3333333, + "logits/rejected": 431282790.4, + "logps/chosen": -390.5882161458333, + "logps/rejected": -529.64560546875, + "loss": 0.0128, + "rewards/chosen": 3.935903549194336, + "rewards/margins": 14.02175407409668, + "rewards/rejected": -10.085850524902344, + "step": 5218 + }, + { + "epoch": 0.47683873915029695, + "grad_norm": 4.375, + "kl": 0.0, + "learning_rate": 5.379931489313016e-06, + "logits/chosen": 648627558.4, + "logits/rejected": 415962240.0, + "logps/chosen": -244.928564453125, + "logps/rejected": -485.5384521484375, + "loss": 0.0295, + "rewards/chosen": 3.391803741455078, + "rewards/margins": 11.979795455932617, + "rewards/rejected": -8.587991714477539, + "step": 5219 + }, + { + "epoch": 0.4769301050708086, + "grad_norm": 0.404296875, + "kl": 0.0, + "learning_rate": 5.378497830660767e-06, + "logits/chosen": 474561792.0, + "logits/rejected": 617153024.0, + "logps/chosen": -240.59517415364584, + "logps/rejected": -527.724609375, + "loss": 0.003, + "rewards/chosen": 4.987893422444661, + "rewards/margins": 16.01048863728841, + "rewards/rejected": -11.02259521484375, + "step": 5220 + }, + { + "epoch": 0.47702147099132025, + "grad_norm": 3.8125, + "kl": 0.0, + "learning_rate": 5.3770641407102554e-06, + "logits/chosen": 615543734.8571428, + "logits/rejected": 2144972672.0, + "logps/chosen": -353.62618582589283, + "logps/rejected": -215.08241271972656, + "loss": 0.0265, + "rewards/chosen": 3.9527293613978793, + "rewards/margins": 12.49540056501116, + "rewards/rejected": -8.542671203613281, + "step": 5221 + }, + { + "epoch": 0.4771128369118319, + "grad_norm": 17.75, + "kl": 0.0, + "learning_rate": 5.375630419580032e-06, + "logits/chosen": 324502357.3333333, + "logits/rejected": 261184307.2, + "logps/chosen": -158.71021525065103, + "logps/rejected": -359.9296875, + "loss": 0.0392, + "rewards/chosen": 2.5659262339274087, + "rewards/margins": 11.679619471232096, + "rewards/rejected": -9.113693237304688, + "step": 5222 + }, + { + "epoch": 0.47720420283234355, + "grad_norm": 2.125, + "kl": 0.0, + "learning_rate": 5.374196667388655e-06, + "logits/chosen": 584136806.4, + "logits/rejected": 293692138.6666667, + "logps/chosen": -277.268994140625, + "logps/rejected": -340.3011474609375, + "loss": 0.0253, + "rewards/chosen": 3.40093994140625, + "rewards/margins": 13.23319943745931, + "rewards/rejected": -9.83225949605306, + "step": 5223 + }, + { + "epoch": 0.47729556875285517, + "grad_norm": 33.0, + "kl": 0.0, + "learning_rate": 5.372762884254678e-06, + "logits/chosen": 982098432.0, + "logits/rejected": 850512725.3333334, + "logps/chosen": -382.36201171875, + "logps/rejected": -656.543701171875, + "loss": 0.1418, + "rewards/chosen": 2.7498023986816404, + "rewards/margins": 11.061889139811196, + "rewards/rejected": -8.312086741129557, + "step": 5224 + }, + { + "epoch": 0.47738693467336685, + "grad_norm": 1.15625, + "kl": 0.0, + "learning_rate": 5.371329070296667e-06, + "logits/chosen": 619700070.4, + "logits/rejected": 983834282.6666666, + "logps/chosen": -388.9915283203125, + "logps/rejected": -684.6124674479166, + "loss": 0.0075, + "rewards/chosen": 4.760008239746094, + "rewards/margins": 13.40128428141276, + "rewards/rejected": -8.641276041666666, + "step": 5225 + }, + { + "epoch": 0.47747830059387847, + "grad_norm": 6.875, + "kl": 0.0, + "learning_rate": 5.369895225633179e-06, + "logits/chosen": 553610752.0, + "logits/rejected": 230673632.0, + "logps/chosen": -384.1138916015625, + "logps/rejected": -336.12890625, + "loss": 0.0366, + "rewards/chosen": 3.046072006225586, + "rewards/margins": 10.704287052154541, + "rewards/rejected": -7.658215045928955, + "step": 5226 + }, + { + "epoch": 0.47756966651439015, + "grad_norm": 3.203125, + "kl": 0.0, + "learning_rate": 5.368461350382785e-06, + "logits/chosen": 534028083.2, + "logits/rejected": 589427072.0, + "logps/chosen": -394.5974853515625, + "logps/rejected": -691.1207682291666, + "loss": 0.0174, + "rewards/chosen": 4.061959075927734, + "rewards/margins": 13.305362192789712, + "rewards/rejected": -9.243403116861979, + "step": 5227 + }, + { + "epoch": 0.47766103243490177, + "grad_norm": 3.828125, + "kl": 0.0, + "learning_rate": 5.3670274446640514e-06, + "logits/chosen": 557291264.0, + "logits/rejected": 485538880.0, + "logps/chosen": -332.6070556640625, + "logps/rejected": -479.51605224609375, + "loss": 0.0266, + "rewards/chosen": 2.952026128768921, + "rewards/margins": 12.769627332687378, + "rewards/rejected": -9.817601203918457, + "step": 5228 + }, + { + "epoch": 0.47775239835541344, + "grad_norm": 2.5, + "kl": 0.0, + "learning_rate": 5.365593508595548e-06, + "logits/chosen": 361715302.4, + "logits/rejected": 277011456.0, + "logps/chosen": -233.0429931640625, + "logps/rejected": -436.5123697916667, + "loss": 0.0241, + "rewards/chosen": 3.5611019134521484, + "rewards/margins": 14.011229832967123, + "rewards/rejected": -10.450127919514975, + "step": 5229 + }, + { + "epoch": 0.47784376427592506, + "grad_norm": 1.4453125, + "kl": 0.0, + "learning_rate": 5.3641595422958485e-06, + "logits/chosen": 766993856.0, + "logits/rejected": 904700342.8571428, + "logps/chosen": -543.2488403320312, + "logps/rejected": -391.4208286830357, + "loss": 0.0045, + "rewards/chosen": 3.36932373046875, + "rewards/margins": 11.976246970040458, + "rewards/rejected": -8.606923239571708, + "step": 5230 + }, + { + "epoch": 0.47793513019643674, + "grad_norm": 2.90625, + "kl": 0.0, + "learning_rate": 5.362725545883529e-06, + "logits/chosen": 393120128.0, + "logits/rejected": 551314816.0, + "logps/chosen": -219.8768768310547, + "logps/rejected": -758.1461791992188, + "loss": 0.0198, + "rewards/chosen": 3.5142171382904053, + "rewards/margins": 15.320485830307007, + "rewards/rejected": -11.806268692016602, + "step": 5231 + }, + { + "epoch": 0.47802649611694836, + "grad_norm": 0.90234375, + "kl": 0.0, + "learning_rate": 5.3612915194771685e-06, + "logits/chosen": 995679232.0, + "logits/rejected": 580549427.2, + "logps/chosen": -344.3734537760417, + "logps/rejected": -423.01748046875, + "loss": 0.0056, + "rewards/chosen": 4.24723752339681, + "rewards/margins": 13.199469884236652, + "rewards/rejected": -8.952232360839844, + "step": 5232 + }, + { + "epoch": 0.47811786203746004, + "grad_norm": 8.0625, + "kl": 0.0, + "learning_rate": 5.359857463195346e-06, + "logits/chosen": 569659289.6, + "logits/rejected": 427467136.0, + "logps/chosen": -302.783056640625, + "logps/rejected": -608.1965738932291, + "loss": 0.0417, + "rewards/chosen": 2.826456069946289, + "rewards/margins": 13.537346522013346, + "rewards/rejected": -10.710890452067057, + "step": 5233 + }, + { + "epoch": 0.47820922795797166, + "grad_norm": 41.5, + "kl": 0.0, + "learning_rate": 5.358423377156647e-06, + "logits/chosen": 537937216.0, + "logits/rejected": 578569728.0, + "logps/chosen": -193.6643829345703, + "logps/rejected": -429.8861389160156, + "loss": 0.0495, + "rewards/chosen": 3.2507219314575195, + "rewards/margins": 12.613673210144043, + "rewards/rejected": -9.362951278686523, + "step": 5234 + }, + { + "epoch": 0.47830059387848334, + "grad_norm": 4.28125, + "kl": 0.0, + "learning_rate": 5.356989261479652e-06, + "logits/chosen": 873096021.3333334, + "logits/rejected": 496500864.0, + "logps/chosen": -401.1612141927083, + "logps/rejected": -561.4503173828125, + "loss": 0.0245, + "rewards/chosen": 3.925461451212565, + "rewards/margins": 13.075573603312174, + "rewards/rejected": -9.15011215209961, + "step": 5235 + }, + { + "epoch": 0.47839195979899496, + "grad_norm": 1.8046875, + "kl": 0.0, + "learning_rate": 5.355555116282956e-06, + "logits/chosen": 1061950976.0, + "logits/rejected": 1365822037.3333333, + "logps/chosen": -185.0770263671875, + "logps/rejected": -471.2464192708333, + "loss": 0.0096, + "rewards/chosen": 3.4336953163146973, + "rewards/margins": 12.64271910985311, + "rewards/rejected": -9.209023793538412, + "step": 5236 + }, + { + "epoch": 0.47848332571950664, + "grad_norm": 2.03125, + "kl": 0.0, + "learning_rate": 5.354120941685146e-06, + "logits/chosen": 476782944.0, + "logits/rejected": 588237952.0, + "logps/chosen": -332.31781005859375, + "logps/rejected": -325.67877197265625, + "loss": 0.0117, + "rewards/chosen": 4.197788238525391, + "rewards/margins": 12.356002807617188, + "rewards/rejected": -8.158214569091797, + "step": 5237 + }, + { + "epoch": 0.47857469164001826, + "grad_norm": 0.765625, + "kl": 0.0, + "learning_rate": 5.352686737804814e-06, + "logits/chosen": 344858528.0, + "logits/rejected": 902070016.0, + "logps/chosen": -286.71087646484375, + "logps/rejected": -823.7701822916666, + "loss": 0.0048, + "rewards/chosen": 4.572150230407715, + "rewards/margins": 16.675973574320473, + "rewards/rejected": -12.10382334391276, + "step": 5238 + }, + { + "epoch": 0.47866605756052993, + "grad_norm": 2.78125, + "kl": 0.0, + "learning_rate": 5.3512525047605555e-06, + "logits/chosen": 456338858.6666667, + "logits/rejected": 586565120.0, + "logps/chosen": -537.5554606119791, + "logps/rejected": -331.5457763671875, + "loss": 0.0172, + "rewards/chosen": 3.9756650924682617, + "rewards/margins": 14.040499687194824, + "rewards/rejected": -10.064834594726562, + "step": 5239 + }, + { + "epoch": 0.47875742348104156, + "grad_norm": 3.71875, + "kl": 0.0, + "learning_rate": 5.349818242670971e-06, + "logits/chosen": 651409920.0, + "logits/rejected": 774426624.0, + "logps/chosen": -267.35748291015625, + "logps/rejected": -491.4188232421875, + "loss": 0.0175, + "rewards/chosen": 3.684393882751465, + "rewards/margins": 12.13314151763916, + "rewards/rejected": -8.448747634887695, + "step": 5240 + }, + { + "epoch": 0.47884878940155323, + "grad_norm": 2.890625, + "kl": 0.0, + "learning_rate": 5.348383951654658e-06, + "logits/chosen": 523948288.0, + "logits/rejected": 485727200.0, + "logps/chosen": -307.962646484375, + "logps/rejected": -570.5653686523438, + "loss": 0.0155, + "rewards/chosen": 3.8962490558624268, + "rewards/margins": 13.572701692581177, + "rewards/rejected": -9.67645263671875, + "step": 5241 + }, + { + "epoch": 0.47894015532206485, + "grad_norm": 10.5, + "kl": 0.0, + "learning_rate": 5.346949631830221e-06, + "logits/chosen": 797666474.6666666, + "logits/rejected": 694821785.6, + "logps/chosen": -485.6361897786458, + "logps/rejected": -396.4347900390625, + "loss": 0.0279, + "rewards/chosen": 3.551005999247233, + "rewards/margins": 10.12902037302653, + "rewards/rejected": -6.578014373779297, + "step": 5242 + }, + { + "epoch": 0.47903152124257653, + "grad_norm": 3.640625, + "kl": 0.0, + "learning_rate": 5.345515283316263e-06, + "logits/chosen": 769600409.6, + "logits/rejected": 1147501994.6666667, + "logps/chosen": -286.848876953125, + "logps/rejected": -382.1504313151042, + "loss": 0.0257, + "rewards/chosen": 3.433568572998047, + "rewards/margins": 9.796631749471029, + "rewards/rejected": -6.3630631764729815, + "step": 5243 + }, + { + "epoch": 0.47912288716308815, + "grad_norm": 1.7109375, + "kl": 0.0, + "learning_rate": 5.344080906231393e-06, + "logits/chosen": 370627413.3333333, + "logits/rejected": 471999692.8, + "logps/chosen": -257.74761962890625, + "logps/rejected": -665.85576171875, + "loss": 0.0109, + "rewards/chosen": 3.5441411336263022, + "rewards/margins": 13.324258168538412, + "rewards/rejected": -9.78011703491211, + "step": 5244 + }, + { + "epoch": 0.47921425308359983, + "grad_norm": 1.9609375, + "kl": 0.0, + "learning_rate": 5.34264650069422e-06, + "logits/chosen": 511640064.0, + "logits/rejected": 626522828.8, + "logps/chosen": -225.33992513020834, + "logps/rejected": -355.5859130859375, + "loss": 0.0142, + "rewards/chosen": 3.6230786641438804, + "rewards/margins": 11.578808339436849, + "rewards/rejected": -7.955729675292969, + "step": 5245 + }, + { + "epoch": 0.47930561900411145, + "grad_norm": 1.8828125, + "kl": 0.0, + "learning_rate": 5.341212066823356e-06, + "logits/chosen": 578501632.0, + "logits/rejected": 818378581.3333334, + "logps/chosen": -334.52744140625, + "logps/rejected": -789.8460286458334, + "loss": 0.0111, + "rewards/chosen": 4.661408233642578, + "rewards/margins": 14.566839981079102, + "rewards/rejected": -9.905431747436523, + "step": 5246 + }, + { + "epoch": 0.4793969849246231, + "grad_norm": 3.3125, + "kl": 0.0, + "learning_rate": 5.339777604737414e-06, + "logits/chosen": 595626598.4, + "logits/rejected": 642140245.3333334, + "logps/chosen": -457.171435546875, + "logps/rejected": -576.7238362630209, + "loss": 0.0184, + "rewards/chosen": 4.070602798461914, + "rewards/margins": 12.078954442342123, + "rewards/rejected": -8.008351643880209, + "step": 5247 + }, + { + "epoch": 0.47948835084513475, + "grad_norm": 4.53125, + "kl": 0.0, + "learning_rate": 5.338343114555015e-06, + "logits/chosen": 523560576.0, + "logits/rejected": 492279910.4, + "logps/chosen": -444.5382486979167, + "logps/rejected": -456.2658203125, + "loss": 0.025, + "rewards/chosen": 2.663403352101644, + "rewards/margins": 13.446465333302816, + "rewards/rejected": -10.783061981201172, + "step": 5248 + }, + { + "epoch": 0.4795797167656464, + "grad_norm": 25.875, + "kl": 0.0, + "learning_rate": 5.336908596394773e-06, + "logits/chosen": 946741120.0, + "logits/rejected": 668873344.0, + "logps/chosen": -212.94189453125, + "logps/rejected": -486.83197021484375, + "loss": 0.1273, + "rewards/chosen": 2.252558708190918, + "rewards/margins": 11.63223648071289, + "rewards/rejected": -9.379677772521973, + "step": 5249 + }, + { + "epoch": 0.47967108268615805, + "grad_norm": 1.53125, + "kl": 0.0, + "learning_rate": 5.335474050375313e-06, + "logits/chosen": 1408163840.0, + "logits/rejected": 563091882.6666666, + "logps/chosen": -240.0682373046875, + "logps/rejected": -487.842041015625, + "loss": 0.0074, + "rewards/chosen": 3.7805612087249756, + "rewards/margins": 12.438578049341837, + "rewards/rejected": -8.658016840616861, + "step": 5250 + }, + { + "epoch": 0.4797624486066697, + "grad_norm": 5.28125, + "kl": 0.0, + "learning_rate": 5.334039476615256e-06, + "logits/chosen": 662729011.2, + "logits/rejected": 821359872.0, + "logps/chosen": -597.57412109375, + "logps/rejected": -595.3949381510416, + "loss": 0.0275, + "rewards/chosen": 3.2338871002197265, + "rewards/margins": 13.465254211425782, + "rewards/rejected": -10.231367111206055, + "step": 5251 + }, + { + "epoch": 0.47985381452718134, + "grad_norm": 10.4375, + "kl": 0.0, + "learning_rate": 5.33260487523323e-06, + "logits/chosen": 750578907.4285715, + "logits/rejected": 366996032.0, + "logps/chosen": -420.51834542410717, + "logps/rejected": -579.668701171875, + "loss": 0.1165, + "rewards/chosen": 3.5254993438720703, + "rewards/margins": 12.47728443145752, + "rewards/rejected": -8.95178508758545, + "step": 5252 + }, + { + "epoch": 0.479945180447693, + "grad_norm": 1.515625, + "kl": 0.0, + "learning_rate": 5.3311702463478635e-06, + "logits/chosen": 545908906.6666666, + "logits/rejected": 437876633.6, + "logps/chosen": -373.0413004557292, + "logps/rejected": -503.369921875, + "loss": 0.0083, + "rewards/chosen": 3.985261599222819, + "rewards/margins": 13.857214419047038, + "rewards/rejected": -9.87195281982422, + "step": 5253 + }, + { + "epoch": 0.48003654636820464, + "grad_norm": 12.0625, + "kl": 0.0, + "learning_rate": 5.3297355900777844e-06, + "logits/chosen": 687080618.6666666, + "logits/rejected": 522076518.4, + "logps/chosen": -235.89717610677084, + "logps/rejected": -313.1785888671875, + "loss": 0.0379, + "rewards/chosen": 3.806072235107422, + "rewards/margins": 10.831349182128907, + "rewards/rejected": -7.025276947021484, + "step": 5254 + }, + { + "epoch": 0.4801279122887163, + "grad_norm": 64.5, + "kl": 0.0, + "learning_rate": 5.3283009065416284e-06, + "logits/chosen": 791999488.0, + "logits/rejected": 449083872.0, + "logps/chosen": -199.6425018310547, + "logps/rejected": -434.68670654296875, + "loss": 0.179, + "rewards/chosen": 0.9193295240402222, + "rewards/margins": 10.975215792655945, + "rewards/rejected": -10.055886268615723, + "step": 5255 + }, + { + "epoch": 0.48021927820922794, + "grad_norm": 3.703125, + "kl": 0.0, + "learning_rate": 5.326866195858027e-06, + "logits/chosen": 667447509.3333334, + "logits/rejected": 393523136.0, + "logps/chosen": -401.3648274739583, + "logps/rejected": -490.1014709472656, + "loss": 0.0231, + "rewards/chosen": 3.7703857421875, + "rewards/margins": 12.671748161315918, + "rewards/rejected": -8.901362419128418, + "step": 5256 + }, + { + "epoch": 0.4803106441297396, + "grad_norm": 1.703125, + "kl": 0.0, + "learning_rate": 5.325431458145621e-06, + "logits/chosen": 1170495232.0, + "logits/rejected": 614288256.0, + "logps/chosen": -367.3238525390625, + "logps/rejected": -450.1955159505208, + "loss": 0.0097, + "rewards/chosen": 3.6227684020996094, + "rewards/margins": 12.545589447021484, + "rewards/rejected": -8.922821044921875, + "step": 5257 + }, + { + "epoch": 0.48040201005025124, + "grad_norm": 3.140625, + "kl": 0.0, + "learning_rate": 5.3239966935230496e-06, + "logits/chosen": 355844437.3333333, + "logits/rejected": 863950912.0, + "logps/chosen": -250.7006632486979, + "logps/rejected": -576.510498046875, + "loss": 0.0213, + "rewards/chosen": 4.121458689371745, + "rewards/margins": 13.017317454020183, + "rewards/rejected": -8.895858764648438, + "step": 5258 + }, + { + "epoch": 0.4804933759707629, + "grad_norm": 2.890625, + "kl": 0.0, + "learning_rate": 5.322561902108953e-06, + "logits/chosen": 541094400.0, + "logits/rejected": 532315221.3333333, + "logps/chosen": -209.235693359375, + "logps/rejected": -677.4488118489584, + "loss": 0.0215, + "rewards/chosen": 4.034902191162109, + "rewards/margins": 13.590779876708984, + "rewards/rejected": -9.555877685546875, + "step": 5259 + }, + { + "epoch": 0.48058474189127454, + "grad_norm": 2.09375, + "kl": 0.0, + "learning_rate": 5.321127084021974e-06, + "logits/chosen": 934009685.3333334, + "logits/rejected": 360716211.2, + "logps/chosen": -416.89892578125, + "logps/rejected": -322.4626953125, + "loss": 0.0142, + "rewards/chosen": 3.870851516723633, + "rewards/margins": 12.305072402954101, + "rewards/rejected": -8.434220886230468, + "step": 5260 + }, + { + "epoch": 0.4806761078117862, + "grad_norm": 4.84375, + "kl": 0.0, + "learning_rate": 5.319692239380761e-06, + "logits/chosen": 509991648.0, + "logits/rejected": 638417280.0, + "logps/chosen": -330.5419616699219, + "logps/rejected": -382.764404296875, + "loss": 0.0136, + "rewards/chosen": 4.260709762573242, + "rewards/margins": 12.274505615234375, + "rewards/rejected": -8.013795852661133, + "step": 5261 + }, + { + "epoch": 0.48076747373229783, + "grad_norm": 0.7734375, + "kl": 0.0, + "learning_rate": 5.318257368303963e-06, + "logits/chosen": 707205120.0, + "logits/rejected": 534216294.4, + "logps/chosen": -351.4180094401042, + "logps/rejected": -526.923095703125, + "loss": 0.0038, + "rewards/chosen": 4.659027735392253, + "rewards/margins": 13.921225611368815, + "rewards/rejected": -9.262197875976563, + "step": 5262 + }, + { + "epoch": 0.4808588396528095, + "grad_norm": 2.984375, + "kl": 0.0, + "learning_rate": 5.31682247091023e-06, + "logits/chosen": 598104320.0, + "logits/rejected": 716222464.0, + "logps/chosen": -353.7860514322917, + "logps/rejected": -508.544677734375, + "loss": 0.0155, + "rewards/chosen": 3.196516990661621, + "rewards/margins": 14.569189262390136, + "rewards/rejected": -11.372672271728515, + "step": 5263 + }, + { + "epoch": 0.48095020557332113, + "grad_norm": 10.125, + "kl": 0.0, + "learning_rate": 5.315387547318212e-06, + "logits/chosen": 546257962.6666666, + "logits/rejected": 476097632.0, + "logps/chosen": -277.56256103515625, + "logps/rejected": -410.818603515625, + "loss": 0.0463, + "rewards/chosen": 3.4009002049764, + "rewards/margins": 11.650644620259603, + "rewards/rejected": -8.249744415283203, + "step": 5264 + }, + { + "epoch": 0.4810415714938328, + "grad_norm": 2.03125, + "kl": 0.0, + "learning_rate": 5.3139525976465675e-06, + "logits/chosen": 545096064.0, + "logits/rejected": 307919914.6666667, + "logps/chosen": -282.68572998046875, + "logps/rejected": -421.2350260416667, + "loss": 0.0093, + "rewards/chosen": 3.297978401184082, + "rewards/margins": 12.18351141611735, + "rewards/rejected": -8.885533014933268, + "step": 5265 + }, + { + "epoch": 0.48113293741434443, + "grad_norm": 3.84375, + "kl": 0.0, + "learning_rate": 5.312517622013952e-06, + "logits/chosen": 569047125.3333334, + "logits/rejected": 754718822.4, + "logps/chosen": -360.977294921875, + "logps/rejected": -549.0359375, + "loss": 0.0225, + "rewards/chosen": 2.760582605997721, + "rewards/margins": 12.528556696573892, + "rewards/rejected": -9.767974090576171, + "step": 5266 + }, + { + "epoch": 0.4812243033348561, + "grad_norm": 4.5, + "kl": 0.0, + "learning_rate": 5.3110826205390246e-06, + "logits/chosen": 505731626.6666667, + "logits/rejected": 441315635.2, + "logps/chosen": -375.8826497395833, + "logps/rejected": -410.57080078125, + "loss": 0.0181, + "rewards/chosen": 3.5605716705322266, + "rewards/margins": 13.170650100708007, + "rewards/rejected": -9.610078430175781, + "step": 5267 + }, + { + "epoch": 0.48131566925536773, + "grad_norm": 3.28125, + "kl": 0.0, + "learning_rate": 5.309647593340445e-06, + "logits/chosen": 392176736.0, + "logits/rejected": 540613376.0, + "logps/chosen": -356.72930908203125, + "logps/rejected": -419.0888671875, + "loss": 0.0174, + "rewards/chosen": 3.273670196533203, + "rewards/margins": 12.542917887369791, + "rewards/rejected": -9.269247690836588, + "step": 5268 + }, + { + "epoch": 0.4814070351758794, + "grad_norm": 2.4375, + "kl": 0.0, + "learning_rate": 5.30821254053688e-06, + "logits/chosen": 288932608.0, + "logits/rejected": 302205312.0, + "logps/chosen": -320.2195129394531, + "logps/rejected": -497.4329833984375, + "loss": 0.0149, + "rewards/chosen": 4.296334266662598, + "rewards/margins": 14.921233177185059, + "rewards/rejected": -10.624898910522461, + "step": 5269 + }, + { + "epoch": 0.481498401096391, + "grad_norm": 3.265625, + "kl": 0.0, + "learning_rate": 5.3067774622469935e-06, + "logits/chosen": 687660544.0, + "logits/rejected": 453166165.3333333, + "logps/chosen": -592.98828125, + "logps/rejected": -375.5856119791667, + "loss": 0.0133, + "rewards/chosen": 4.551963424682617, + "rewards/margins": 12.962700144449869, + "rewards/rejected": -8.410736719767252, + "step": 5270 + }, + { + "epoch": 0.4815897670169027, + "grad_norm": 1.671875, + "kl": 0.0, + "learning_rate": 5.305342358589452e-06, + "logits/chosen": 509340160.0, + "logits/rejected": 480850133.3333333, + "logps/chosen": -516.5118408203125, + "logps/rejected": -458.9302571614583, + "loss": 0.0073, + "rewards/chosen": 4.199549674987793, + "rewards/margins": 12.331366539001465, + "rewards/rejected": -8.131816864013672, + "step": 5271 + }, + { + "epoch": 0.4816811329374143, + "grad_norm": 4.46875, + "kl": 0.0, + "learning_rate": 5.303907229682929e-06, + "logits/chosen": 844123072.0, + "logits/rejected": 362709952.0, + "logps/chosen": -400.1026611328125, + "logps/rejected": -449.9058837890625, + "loss": 0.0277, + "rewards/chosen": 3.1479015350341797, + "rewards/margins": 10.842555046081543, + "rewards/rejected": -7.694653511047363, + "step": 5272 + }, + { + "epoch": 0.481772498857926, + "grad_norm": 3.625, + "kl": 0.0, + "learning_rate": 5.302472075646091e-06, + "logits/chosen": 614963712.0, + "logits/rejected": 509401173.3333333, + "logps/chosen": -378.78623046875, + "logps/rejected": -382.5681559244792, + "loss": 0.0249, + "rewards/chosen": 3.6815296173095704, + "rewards/margins": 12.544976806640625, + "rewards/rejected": -8.863447189331055, + "step": 5273 + }, + { + "epoch": 0.4818638647784376, + "grad_norm": 4.03125, + "kl": 0.0, + "learning_rate": 5.301036896597617e-06, + "logits/chosen": 499104672.0, + "logits/rejected": 577787264.0, + "logps/chosen": -314.331298828125, + "logps/rejected": -472.2860412597656, + "loss": 0.0208, + "rewards/chosen": 4.02775764465332, + "rewards/margins": 12.169763565063477, + "rewards/rejected": -8.142005920410156, + "step": 5274 + }, + { + "epoch": 0.4819552306989493, + "grad_norm": 3.203125, + "kl": 0.0, + "learning_rate": 5.2996016926561785e-06, + "logits/chosen": 573277098.6666666, + "logits/rejected": 731357568.0, + "logps/chosen": -414.1348470052083, + "logps/rejected": -1483.5836181640625, + "loss": 0.0182, + "rewards/chosen": 4.098331133524577, + "rewards/margins": 20.571994463602703, + "rewards/rejected": -16.473663330078125, + "step": 5275 + }, + { + "epoch": 0.4820465966194609, + "grad_norm": 2.734375, + "kl": 0.0, + "learning_rate": 5.298166463940459e-06, + "logits/chosen": 552631296.0, + "logits/rejected": 381374438.4, + "logps/chosen": -356.2780354817708, + "logps/rejected": -365.2296142578125, + "loss": 0.0149, + "rewards/chosen": 3.4087273279825845, + "rewards/margins": 12.0928253809611, + "rewards/rejected": -8.684098052978516, + "step": 5276 + }, + { + "epoch": 0.4821379625399726, + "grad_norm": 1.546875, + "kl": 0.0, + "learning_rate": 5.296731210569134e-06, + "logits/chosen": 485324842.6666667, + "logits/rejected": 502123392.0, + "logps/chosen": -320.9903971354167, + "logps/rejected": -365.46136474609375, + "loss": 0.0092, + "rewards/chosen": 4.709121068318685, + "rewards/margins": 13.18744119008382, + "rewards/rejected": -8.478320121765137, + "step": 5277 + }, + { + "epoch": 0.4822293284604842, + "grad_norm": 2.421875, + "kl": 0.0, + "learning_rate": 5.295295932660887e-06, + "logits/chosen": 727553792.0, + "logits/rejected": 569319680.0, + "logps/chosen": -400.6181945800781, + "logps/rejected": -404.95257568359375, + "loss": 0.0182, + "rewards/chosen": 3.539834976196289, + "rewards/margins": 9.91247844696045, + "rewards/rejected": -6.37264347076416, + "step": 5278 + }, + { + "epoch": 0.4823206943809959, + "grad_norm": 33.25, + "kl": 0.0, + "learning_rate": 5.293860630334402e-06, + "logits/chosen": 550746709.3333334, + "logits/rejected": 553931968.0, + "logps/chosen": -406.6672770182292, + "logps/rejected": -542.6063232421875, + "loss": 0.1216, + "rewards/chosen": 3.062291145324707, + "rewards/margins": 12.462370872497559, + "rewards/rejected": -9.400079727172852, + "step": 5279 + }, + { + "epoch": 0.4824120603015075, + "grad_norm": 2.75, + "kl": 0.0, + "learning_rate": 5.292425303708366e-06, + "logits/chosen": 418014432.0, + "logits/rejected": 675289856.0, + "logps/chosen": -125.88615417480469, + "logps/rejected": -332.32391357421875, + "loss": 0.0247, + "rewards/chosen": 3.2010974884033203, + "rewards/margins": 11.383151054382324, + "rewards/rejected": -8.182053565979004, + "step": 5280 + }, + { + "epoch": 0.4825034262220192, + "grad_norm": 2.921875, + "kl": 0.0, + "learning_rate": 5.2909899529014665e-06, + "logits/chosen": 633825280.0, + "logits/rejected": 998904320.0, + "logps/chosen": -411.0878499348958, + "logps/rejected": -458.264208984375, + "loss": 0.013, + "rewards/chosen": 3.835054397583008, + "rewards/margins": 11.636471176147461, + "rewards/rejected": -7.8014167785644535, + "step": 5281 + }, + { + "epoch": 0.4825947921425308, + "grad_norm": 1.703125, + "kl": 0.0, + "learning_rate": 5.289554578032394e-06, + "logits/chosen": 621921066.6666666, + "logits/rejected": 1065422028.8, + "logps/chosen": -189.7888387044271, + "logps/rejected": -449.0185546875, + "loss": 0.01, + "rewards/chosen": 4.263742446899414, + "rewards/margins": 11.931071853637695, + "rewards/rejected": -7.667329406738281, + "step": 5282 + }, + { + "epoch": 0.4826861580630425, + "grad_norm": 5.3125, + "kl": 0.0, + "learning_rate": 5.28811917921984e-06, + "logits/chosen": 448987584.0, + "logits/rejected": 363398720.0, + "logps/chosen": -329.300537109375, + "logps/rejected": -509.26123046875, + "loss": 0.0283, + "rewards/chosen": 3.2019968032836914, + "rewards/margins": 13.122004508972168, + "rewards/rejected": -9.920007705688477, + "step": 5283 + }, + { + "epoch": 0.4827775239835541, + "grad_norm": 3.9375, + "kl": 0.0, + "learning_rate": 5.286683756582502e-06, + "logits/chosen": 596890240.0, + "logits/rejected": 1104660992.0, + "logps/chosen": -437.8806966145833, + "logps/rejected": -746.5670166015625, + "loss": 0.0284, + "rewards/chosen": 3.8389625549316406, + "rewards/margins": 17.51310443878174, + "rewards/rejected": -13.674141883850098, + "step": 5284 + }, + { + "epoch": 0.4828688899040658, + "grad_norm": 3.8125, + "kl": 0.0, + "learning_rate": 5.285248310239072e-06, + "logits/chosen": 435887206.4, + "logits/rejected": 671970389.3333334, + "logps/chosen": -299.9088134765625, + "logps/rejected": -483.8894856770833, + "loss": 0.0253, + "rewards/chosen": 3.6970958709716797, + "rewards/margins": 13.043575922648111, + "rewards/rejected": -9.346480051676432, + "step": 5285 + }, + { + "epoch": 0.4829602558245774, + "grad_norm": 5.53125, + "kl": 0.0, + "learning_rate": 5.283812840308248e-06, + "logits/chosen": 670926540.8, + "logits/rejected": 718611370.6666666, + "logps/chosen": -279.6783447265625, + "logps/rejected": -560.132568359375, + "loss": 0.047, + "rewards/chosen": 3.030657196044922, + "rewards/margins": 11.303274536132813, + "rewards/rejected": -8.27261734008789, + "step": 5286 + }, + { + "epoch": 0.4830516217450891, + "grad_norm": 45.0, + "kl": 0.0, + "learning_rate": 5.282377346908734e-06, + "logits/chosen": 1382626304.0, + "logits/rejected": 934423381.3333334, + "logps/chosen": -290.99151611328125, + "logps/rejected": -475.1785481770833, + "loss": 0.0531, + "rewards/chosen": 1.9724899530410767, + "rewards/margins": 11.064740777015686, + "rewards/rejected": -9.09225082397461, + "step": 5287 + }, + { + "epoch": 0.4831429876656007, + "grad_norm": 3.484375, + "kl": 0.0, + "learning_rate": 5.280941830159228e-06, + "logits/chosen": 530013696.0, + "logits/rejected": 960419123.2, + "logps/chosen": -242.68575032552084, + "logps/rejected": -566.4671875, + "loss": 0.1344, + "rewards/chosen": 0.838732639948527, + "rewards/margins": 11.234402958552042, + "rewards/rejected": -10.395670318603516, + "step": 5288 + }, + { + "epoch": 0.4832343535861124, + "grad_norm": 1.7265625, + "kl": 0.0, + "learning_rate": 5.2795062901784366e-06, + "logits/chosen": 1445903232.0, + "logits/rejected": 600487296.0, + "logps/chosen": -642.6473388671875, + "logps/rejected": -648.69482421875, + "loss": 0.0078, + "rewards/chosen": 4.348509311676025, + "rewards/margins": 15.0509934425354, + "rewards/rejected": -10.702484130859375, + "step": 5289 + }, + { + "epoch": 0.483325719506624, + "grad_norm": 8.5, + "kl": 0.0, + "learning_rate": 5.278070727085065e-06, + "logits/chosen": 1292256256.0, + "logits/rejected": 612949162.6666666, + "logps/chosen": -406.2723876953125, + "logps/rejected": -526.4811604817709, + "loss": 0.0328, + "rewards/chosen": 4.196435165405274, + "rewards/margins": 13.727553558349609, + "rewards/rejected": -9.531118392944336, + "step": 5290 + }, + { + "epoch": 0.4834170854271357, + "grad_norm": 0.859375, + "kl": 0.0, + "learning_rate": 5.27663514099782e-06, + "logits/chosen": 632732928.0, + "logits/rejected": 514922666.6666667, + "logps/chosen": -514.626220703125, + "logps/rejected": -621.2924397786459, + "loss": 0.0037, + "rewards/chosen": 4.248512268066406, + "rewards/margins": 16.89093907674154, + "rewards/rejected": -12.64242680867513, + "step": 5291 + }, + { + "epoch": 0.4835084513476473, + "grad_norm": 4.625, + "kl": 0.0, + "learning_rate": 5.275199532035412e-06, + "logits/chosen": 548963242.6666666, + "logits/rejected": 486222540.8, + "logps/chosen": -197.9941609700521, + "logps/rejected": -319.491455078125, + "loss": 0.0107, + "rewards/chosen": 4.682536443074544, + "rewards/margins": 11.738823064168294, + "rewards/rejected": -7.05628662109375, + "step": 5292 + }, + { + "epoch": 0.483599817268159, + "grad_norm": 4.21875, + "kl": 0.0, + "learning_rate": 5.273763900316553e-06, + "logits/chosen": 575310116.5714285, + "logits/rejected": 174967776.0, + "logps/chosen": -413.6767578125, + "logps/rejected": -165.160888671875, + "loss": 0.1544, + "rewards/chosen": 2.4559642246791293, + "rewards/margins": 9.974349634987966, + "rewards/rejected": -7.518385410308838, + "step": 5293 + }, + { + "epoch": 0.4836911831886706, + "grad_norm": 1.796875, + "kl": 0.0, + "learning_rate": 5.2723282459599545e-06, + "logits/chosen": 1708024064.0, + "logits/rejected": 690878293.3333334, + "logps/chosen": -320.0047607421875, + "logps/rejected": -592.0120442708334, + "loss": 0.0101, + "rewards/chosen": 3.4864959716796875, + "rewards/margins": 12.531449635823568, + "rewards/rejected": -9.04495366414388, + "step": 5294 + }, + { + "epoch": 0.4837825491091823, + "grad_norm": 0.84765625, + "kl": 0.0, + "learning_rate": 5.270892569084335e-06, + "logits/chosen": 544575296.0, + "logits/rejected": 422324608.0, + "logps/chosen": -221.69070434570312, + "logps/rejected": -556.4176025390625, + "loss": 0.0046, + "rewards/chosen": 5.3918843269348145, + "rewards/margins": 16.06778383255005, + "rewards/rejected": -10.675899505615234, + "step": 5295 + }, + { + "epoch": 0.4838739150296939, + "grad_norm": 2.296875, + "kl": 0.0, + "learning_rate": 5.2694568698084085e-06, + "logits/chosen": 383565056.0, + "logits/rejected": 225731690.66666666, + "logps/chosen": -274.241650390625, + "logps/rejected": -291.1644694010417, + "loss": 0.0131, + "rewards/chosen": 4.107277679443359, + "rewards/margins": 12.810222117106118, + "rewards/rejected": -8.70294443766276, + "step": 5296 + }, + { + "epoch": 0.4839652809502056, + "grad_norm": 3.328125, + "kl": 0.0, + "learning_rate": 5.268021148250896e-06, + "logits/chosen": 706665984.0, + "logits/rejected": 359792810.6666667, + "logps/chosen": -266.9984375, + "logps/rejected": -296.4337158203125, + "loss": 0.0222, + "rewards/chosen": 3.758221435546875, + "rewards/margins": 12.521759796142579, + "rewards/rejected": -8.763538360595703, + "step": 5297 + }, + { + "epoch": 0.4840566468707172, + "grad_norm": 6.40625, + "kl": 0.0, + "learning_rate": 5.266585404530517e-06, + "logits/chosen": 498176853.3333333, + "logits/rejected": 719117209.6, + "logps/chosen": -246.88871256510416, + "logps/rejected": -364.50478515625, + "loss": 0.0324, + "rewards/chosen": 2.666165828704834, + "rewards/margins": 10.025174617767334, + "rewards/rejected": -7.3590087890625, + "step": 5298 + }, + { + "epoch": 0.4841480127912289, + "grad_norm": 3.15625, + "kl": 0.0, + "learning_rate": 5.265149638765996e-06, + "logits/chosen": 516651861.3333333, + "logits/rejected": 450707097.6, + "logps/chosen": -353.4793294270833, + "logps/rejected": -355.131884765625, + "loss": 0.0176, + "rewards/chosen": 3.0711965560913086, + "rewards/margins": 13.163680076599121, + "rewards/rejected": -10.092483520507812, + "step": 5299 + }, + { + "epoch": 0.4842393787117405, + "grad_norm": 1.3515625, + "kl": 0.0, + "learning_rate": 5.2637138510760556e-06, + "logits/chosen": 878396330.6666666, + "logits/rejected": 711769856.0, + "logps/chosen": -382.9817708333333, + "logps/rejected": -719.2994140625, + "loss": 0.0064, + "rewards/chosen": 4.420084635416667, + "rewards/margins": 14.348093668619793, + "rewards/rejected": -9.928009033203125, + "step": 5300 + }, + { + "epoch": 0.4843307446322522, + "grad_norm": 2.859375, + "kl": 0.0, + "learning_rate": 5.262278041579425e-06, + "logits/chosen": 339549162.6666667, + "logits/rejected": 538641971.2, + "logps/chosen": -244.42803955078125, + "logps/rejected": -344.010205078125, + "loss": 0.0134, + "rewards/chosen": 4.4724547068278, + "rewards/margins": 13.22470537821452, + "rewards/rejected": -8.752250671386719, + "step": 5301 + }, + { + "epoch": 0.4844221105527638, + "grad_norm": 2.125, + "kl": 0.0, + "learning_rate": 5.26084221039483e-06, + "logits/chosen": 579412480.0, + "logits/rejected": 548994406.4, + "logps/chosen": -374.434814453125, + "logps/rejected": -381.6267578125, + "loss": 0.0098, + "rewards/chosen": 3.856135050455729, + "rewards/margins": 13.712972513834634, + "rewards/rejected": -9.856837463378906, + "step": 5302 + }, + { + "epoch": 0.4845134764732755, + "grad_norm": 5.75, + "kl": 0.0, + "learning_rate": 5.259406357641001e-06, + "logits/chosen": 503508416.0, + "logits/rejected": 322291904.0, + "logps/chosen": -298.26007080078125, + "logps/rejected": -437.471923828125, + "loss": 0.0255, + "rewards/chosen": 3.195795774459839, + "rewards/margins": 14.577200651168823, + "rewards/rejected": -11.381404876708984, + "step": 5303 + }, + { + "epoch": 0.4846048423937871, + "grad_norm": 6.875, + "kl": 0.0, + "learning_rate": 5.257970483436668e-06, + "logits/chosen": 605288704.0, + "logits/rejected": 493140096.0, + "logps/chosen": -420.892822265625, + "logps/rejected": -646.7892456054688, + "loss": 0.0502, + "rewards/chosen": 3.4331795374552407, + "rewards/margins": 13.661693255106607, + "rewards/rejected": -10.228513717651367, + "step": 5304 + }, + { + "epoch": 0.48469620831429877, + "grad_norm": 1.0546875, + "kl": 0.0, + "learning_rate": 5.256534587900567e-06, + "logits/chosen": 302421824.0, + "logits/rejected": 832385331.2, + "logps/chosen": -212.44120279947916, + "logps/rejected": -450.660888671875, + "loss": 0.0064, + "rewards/chosen": 4.914298375447591, + "rewards/margins": 14.49041379292806, + "rewards/rejected": -9.57611541748047, + "step": 5305 + }, + { + "epoch": 0.4847875742348104, + "grad_norm": 1.5625, + "kl": 0.0, + "learning_rate": 5.255098671151433e-06, + "logits/chosen": 627688345.6, + "logits/rejected": 425923200.0, + "logps/chosen": -264.743603515625, + "logps/rejected": -494.9413248697917, + "loss": 0.0082, + "rewards/chosen": 5.006095886230469, + "rewards/margins": 15.788700103759766, + "rewards/rejected": -10.782604217529297, + "step": 5306 + }, + { + "epoch": 0.48487894015532207, + "grad_norm": 4.46875, + "kl": 0.0, + "learning_rate": 5.253662733308001e-06, + "logits/chosen": 526396928.0, + "logits/rejected": 578787114.6666666, + "logps/chosen": -415.552490234375, + "logps/rejected": -523.198974609375, + "loss": 0.0162, + "rewards/chosen": 3.5523910522460938, + "rewards/margins": 12.818122863769531, + "rewards/rejected": -9.265731811523438, + "step": 5307 + }, + { + "epoch": 0.4849703060758337, + "grad_norm": 0.251953125, + "kl": 0.0, + "learning_rate": 5.2522267744890135e-06, + "logits/chosen": 187782656.0, + "logits/rejected": 437150939.4285714, + "logps/chosen": -176.31805419921875, + "logps/rejected": -656.6540178571429, + "loss": 0.001, + "rewards/chosen": 4.919600009918213, + "rewards/margins": 15.490695067814418, + "rewards/rejected": -10.571095057896205, + "step": 5308 + }, + { + "epoch": 0.48506167199634537, + "grad_norm": 2.546875, + "kl": 0.0, + "learning_rate": 5.250790794813205e-06, + "logits/chosen": 434002517.3333333, + "logits/rejected": 535271168.0, + "logps/chosen": -324.8064371744792, + "logps/rejected": -179.91180419921875, + "loss": 0.0243, + "rewards/chosen": 3.98381773630778, + "rewards/margins": 12.336385409037272, + "rewards/rejected": -8.352567672729492, + "step": 5309 + }, + { + "epoch": 0.485153037916857, + "grad_norm": 1.09375, + "kl": 0.0, + "learning_rate": 5.2493547943993254e-06, + "logits/chosen": 384511360.0, + "logits/rejected": 777930188.8, + "logps/chosen": -322.5297444661458, + "logps/rejected": -538.693359375, + "loss": 0.0064, + "rewards/chosen": 4.181989034016927, + "rewards/margins": 12.979366048177084, + "rewards/rejected": -8.797377014160157, + "step": 5310 + }, + { + "epoch": 0.48524440383736867, + "grad_norm": 3.265625, + "kl": 0.0, + "learning_rate": 5.247918773366112e-06, + "logits/chosen": 497445120.0, + "logits/rejected": 681973440.0, + "logps/chosen": -301.0858968098958, + "logps/rejected": -961.0496826171875, + "loss": 0.03, + "rewards/chosen": 3.272507667541504, + "rewards/margins": 14.02841854095459, + "rewards/rejected": -10.755910873413086, + "step": 5311 + }, + { + "epoch": 0.4853357697578803, + "grad_norm": 5.21875, + "kl": 0.0, + "learning_rate": 5.246482731832314e-06, + "logits/chosen": 467358464.0, + "logits/rejected": 570931200.0, + "logps/chosen": -534.406005859375, + "logps/rejected": -986.4501953125, + "loss": 0.0196, + "rewards/chosen": 3.8455581665039062, + "rewards/margins": 15.986165046691895, + "rewards/rejected": -12.140606880187988, + "step": 5312 + }, + { + "epoch": 0.48542713567839196, + "grad_norm": 5.1875, + "kl": 0.0, + "learning_rate": 5.245046669916677e-06, + "logits/chosen": 352407552.0, + "logits/rejected": 299095072.0, + "logps/chosen": -334.8118896484375, + "logps/rejected": -442.1982421875, + "loss": 0.0317, + "rewards/chosen": 3.5058958530426025, + "rewards/margins": 12.83510422706604, + "rewards/rejected": -9.329208374023438, + "step": 5313 + }, + { + "epoch": 0.4855185015989036, + "grad_norm": 2.265625, + "kl": 0.0, + "learning_rate": 5.2436105877379525e-06, + "logits/chosen": 665569664.0, + "logits/rejected": 423543232.0, + "logps/chosen": -268.0768127441406, + "logps/rejected": -455.519287109375, + "loss": 0.0173, + "rewards/chosen": 3.711169958114624, + "rewards/margins": 14.199439764022827, + "rewards/rejected": -10.488269805908203, + "step": 5314 + }, + { + "epoch": 0.48560986751941526, + "grad_norm": 3.859375, + "kl": 0.0, + "learning_rate": 5.242174485414888e-06, + "logits/chosen": 470265856.0, + "logits/rejected": 486652245.3333333, + "logps/chosen": -263.9041748046875, + "logps/rejected": -452.7438151041667, + "loss": 0.0322, + "rewards/chosen": 3.0230878829956054, + "rewards/margins": 12.772743288675944, + "rewards/rejected": -9.749655405680338, + "step": 5315 + }, + { + "epoch": 0.4857012334399269, + "grad_norm": 2.453125, + "kl": 0.0, + "learning_rate": 5.240738363066238e-06, + "logits/chosen": 520562073.6, + "logits/rejected": 541330432.0, + "logps/chosen": -295.84970703125, + "logps/rejected": -378.3128662109375, + "loss": 0.0109, + "rewards/chosen": 4.819482421875, + "rewards/margins": 13.931106185913086, + "rewards/rejected": -9.111623764038086, + "step": 5316 + }, + { + "epoch": 0.48579259936043856, + "grad_norm": 3.34375, + "kl": 0.0, + "learning_rate": 5.239302220810757e-06, + "logits/chosen": 733347174.4, + "logits/rejected": 1090288810.6666667, + "logps/chosen": -379.82197265625, + "logps/rejected": -581.9347330729166, + "loss": 0.0163, + "rewards/chosen": 4.03515396118164, + "rewards/margins": 12.151115671793619, + "rewards/rejected": -8.115961710611979, + "step": 5317 + }, + { + "epoch": 0.4858839652809502, + "grad_norm": 3.03125, + "kl": 0.0, + "learning_rate": 5.237866058767198e-06, + "logits/chosen": 609875136.0, + "logits/rejected": 964706816.0, + "logps/chosen": -222.9258575439453, + "logps/rejected": -543.73876953125, + "loss": 0.0174, + "rewards/chosen": 2.6351099014282227, + "rewards/margins": 11.87194538116455, + "rewards/rejected": -9.236835479736328, + "step": 5318 + }, + { + "epoch": 0.48597533120146186, + "grad_norm": 1.8203125, + "kl": 0.0, + "learning_rate": 5.23642987705432e-06, + "logits/chosen": 394077440.0, + "logits/rejected": 429133504.0, + "logps/chosen": -288.40716552734375, + "logps/rejected": -528.4795532226562, + "loss": 0.1291, + "rewards/chosen": 2.6370344161987305, + "rewards/margins": 12.52255630493164, + "rewards/rejected": -9.88552188873291, + "step": 5319 + }, + { + "epoch": 0.4860666971219735, + "grad_norm": 4.6875, + "kl": 0.0, + "learning_rate": 5.234993675790882e-06, + "logits/chosen": 389365101.71428573, + "logits/rejected": 524941376.0, + "logps/chosen": -378.20431082589283, + "logps/rejected": -952.3719482421875, + "loss": 0.0354, + "rewards/chosen": 3.637529100690569, + "rewards/margins": 13.18226787022182, + "rewards/rejected": -9.54473876953125, + "step": 5320 + }, + { + "epoch": 0.48615806304248516, + "grad_norm": 4.21875, + "kl": 0.0, + "learning_rate": 5.233557455095645e-06, + "logits/chosen": 638739236.5714285, + "logits/rejected": 193260736.0, + "logps/chosen": -407.73374720982144, + "logps/rejected": -407.1934509277344, + "loss": 0.0438, + "rewards/chosen": 3.1576290130615234, + "rewards/margins": 14.107284545898438, + "rewards/rejected": -10.949655532836914, + "step": 5321 + }, + { + "epoch": 0.4862494289629968, + "grad_norm": 3.546875, + "kl": 0.0, + "learning_rate": 5.232121215087369e-06, + "logits/chosen": 526273600.0, + "logits/rejected": 357656064.0, + "logps/chosen": -355.3182373046875, + "logps/rejected": -464.8501790364583, + "loss": 0.0187, + "rewards/chosen": 2.5373566150665283, + "rewards/margins": 12.155720313390097, + "rewards/rejected": -9.618363698323568, + "step": 5322 + }, + { + "epoch": 0.48634079488350845, + "grad_norm": 4.15625, + "kl": 0.0, + "learning_rate": 5.230684955884819e-06, + "logits/chosen": 364747081.14285713, + "logits/rejected": 280097824.0, + "logps/chosen": -171.84395926339286, + "logps/rejected": -643.6722412109375, + "loss": 0.052, + "rewards/chosen": 3.100493839808873, + "rewards/margins": 19.192258289882115, + "rewards/rejected": -16.091764450073242, + "step": 5323 + }, + { + "epoch": 0.4864321608040201, + "grad_norm": 8.875, + "kl": 0.0, + "learning_rate": 5.229248677606762e-06, + "logits/chosen": 664301632.0, + "logits/rejected": 1000726912.0, + "logps/chosen": -341.8314208984375, + "logps/rejected": -631.9744873046875, + "loss": 0.017, + "rewards/chosen": 4.003870964050293, + "rewards/margins": 12.649554252624512, + "rewards/rejected": -8.645683288574219, + "step": 5324 + }, + { + "epoch": 0.48652352672453175, + "grad_norm": 5.125, + "kl": 0.0, + "learning_rate": 5.227812380371962e-06, + "logits/chosen": 553837926.4, + "logits/rejected": 547794261.3333334, + "logps/chosen": -405.1856689453125, + "logps/rejected": -662.9322916666666, + "loss": 0.034, + "rewards/chosen": 3.6635948181152345, + "rewards/margins": 14.079107538859049, + "rewards/rejected": -10.415512720743815, + "step": 5325 + }, + { + "epoch": 0.4866148926450434, + "grad_norm": 15.6875, + "kl": 0.0, + "learning_rate": 5.226376064299189e-06, + "logits/chosen": 587872256.0, + "logits/rejected": 434510293.3333333, + "logps/chosen": -204.2034423828125, + "logps/rejected": -313.29713948567706, + "loss": 0.0361, + "rewards/chosen": 3.8009647369384765, + "rewards/margins": 13.627490107218424, + "rewards/rejected": -9.826525370279947, + "step": 5326 + }, + { + "epoch": 0.48670625856555505, + "grad_norm": 0.58984375, + "kl": 0.0, + "learning_rate": 5.2249397295072145e-06, + "logits/chosen": 2190723072.0, + "logits/rejected": 616512219.4285715, + "logps/chosen": -1011.0405883789062, + "logps/rejected": -630.4328962053571, + "loss": 0.0015, + "rewards/chosen": 4.554437160491943, + "rewards/margins": 13.849378653935023, + "rewards/rejected": -9.29494149344308, + "step": 5327 + }, + { + "epoch": 0.48679762448606667, + "grad_norm": 64.0, + "kl": 0.0, + "learning_rate": 5.2235033761148064e-06, + "logits/chosen": 432961248.0, + "logits/rejected": 529416704.0, + "logps/chosen": -438.48077392578125, + "logps/rejected": -275.0291748046875, + "loss": 0.0347, + "rewards/chosen": 4.536317348480225, + "rewards/margins": 11.057507991790771, + "rewards/rejected": -6.521190643310547, + "step": 5328 + }, + { + "epoch": 0.48688899040657835, + "grad_norm": 1.515625, + "kl": 0.0, + "learning_rate": 5.222067004240742e-06, + "logits/chosen": 540722048.0, + "logits/rejected": 387743872.0, + "logps/chosen": -619.573974609375, + "logps/rejected": -573.8519287109375, + "loss": 0.0072, + "rewards/chosen": 4.407024383544922, + "rewards/margins": 13.716259956359863, + "rewards/rejected": -9.309235572814941, + "step": 5329 + }, + { + "epoch": 0.48698035632708997, + "grad_norm": 1.265625, + "kl": 0.0, + "learning_rate": 5.220630614003792e-06, + "logits/chosen": 363316544.0, + "logits/rejected": 450055424.0, + "logps/chosen": -369.45452880859375, + "logps/rejected": -566.1236572265625, + "loss": 0.0062, + "rewards/chosen": 5.10399055480957, + "rewards/margins": 14.282610893249512, + "rewards/rejected": -9.178620338439941, + "step": 5330 + }, + { + "epoch": 0.48707172224760165, + "grad_norm": 1.8359375, + "kl": 0.0, + "learning_rate": 5.219194205522736e-06, + "logits/chosen": 589422464.0, + "logits/rejected": 537325952.0, + "logps/chosen": -330.8034261067708, + "logps/rejected": -546.552001953125, + "loss": 0.0146, + "rewards/chosen": 4.313037554423015, + "rewards/margins": 14.262039820353191, + "rewards/rejected": -9.949002265930176, + "step": 5331 + }, + { + "epoch": 0.48716308816811327, + "grad_norm": 31.875, + "kl": 0.0, + "learning_rate": 5.217757778916349e-06, + "logits/chosen": 885375146.6666666, + "logits/rejected": 657026406.4, + "logps/chosen": -203.22115071614584, + "logps/rejected": -349.32587890625, + "loss": 0.0889, + "rewards/chosen": 2.185542424519857, + "rewards/margins": 9.0389466603597, + "rewards/rejected": -6.853404235839844, + "step": 5332 + }, + { + "epoch": 0.48725445408862494, + "grad_norm": 0.73828125, + "kl": 0.0, + "learning_rate": 5.2163213343034115e-06, + "logits/chosen": 603818176.0, + "logits/rejected": 580960469.3333334, + "logps/chosen": -333.41192626953125, + "logps/rejected": -543.5454915364584, + "loss": 0.0038, + "rewards/chosen": 4.228529453277588, + "rewards/margins": 15.996634006500244, + "rewards/rejected": -11.768104553222656, + "step": 5333 + }, + { + "epoch": 0.48734582000913657, + "grad_norm": 2.765625, + "kl": 0.0, + "learning_rate": 5.214884871802703e-06, + "logits/chosen": 681972032.0, + "logits/rejected": 298178112.0, + "logps/chosen": -564.6602172851562, + "logps/rejected": -411.6263732910156, + "loss": 0.0174, + "rewards/chosen": 3.6200859546661377, + "rewards/margins": 13.704887628555298, + "rewards/rejected": -10.08480167388916, + "step": 5334 + }, + { + "epoch": 0.48743718592964824, + "grad_norm": 2.390625, + "kl": 0.0, + "learning_rate": 5.213448391533008e-06, + "logits/chosen": 375217715.2, + "logits/rejected": 419086122.6666667, + "logps/chosen": -296.766455078125, + "logps/rejected": -511.5955810546875, + "loss": 0.0155, + "rewards/chosen": 4.361305236816406, + "rewards/margins": 11.876754760742188, + "rewards/rejected": -7.515449523925781, + "step": 5335 + }, + { + "epoch": 0.48752855185015986, + "grad_norm": 2.015625, + "kl": 0.0, + "learning_rate": 5.212011893613108e-06, + "logits/chosen": 457661098.6666667, + "logits/rejected": 456544716.8, + "logps/chosen": -454.3429768880208, + "logps/rejected": -390.2271240234375, + "loss": 0.0118, + "rewards/chosen": 3.9267094930013022, + "rewards/margins": 13.89871571858724, + "rewards/rejected": -9.972006225585938, + "step": 5336 + }, + { + "epoch": 0.48761991777067154, + "grad_norm": 11.5625, + "kl": 0.0, + "learning_rate": 5.210575378161789e-06, + "logits/chosen": 622601216.0, + "logits/rejected": 151172992.0, + "logps/chosen": -344.709228515625, + "logps/rejected": -154.32986450195312, + "loss": 0.0274, + "rewards/chosen": 3.8739021846226285, + "rewards/margins": 12.242179734366282, + "rewards/rejected": -8.368277549743652, + "step": 5337 + }, + { + "epoch": 0.48771128369118316, + "grad_norm": 8.4375, + "kl": 0.0, + "learning_rate": 5.209138845297837e-06, + "logits/chosen": 368508117.3333333, + "logits/rejected": 411037824.0, + "logps/chosen": -249.62615966796875, + "logps/rejected": -217.92982482910156, + "loss": 0.0151, + "rewards/chosen": 4.437145233154297, + "rewards/margins": 11.161986351013184, + "rewards/rejected": -6.724841117858887, + "step": 5338 + }, + { + "epoch": 0.48780264961169484, + "grad_norm": 2.515625, + "kl": 0.0, + "learning_rate": 5.2077022951400405e-06, + "logits/chosen": 513035093.3333333, + "logits/rejected": 660675737.6, + "logps/chosen": -324.70200602213544, + "logps/rejected": -479.906396484375, + "loss": 0.0123, + "rewards/chosen": 3.8646841049194336, + "rewards/margins": 13.60394115447998, + "rewards/rejected": -9.739257049560546, + "step": 5339 + }, + { + "epoch": 0.48789401553220646, + "grad_norm": 2.53125, + "kl": 0.0, + "learning_rate": 5.20626572780719e-06, + "logits/chosen": 356225749.3333333, + "logits/rejected": 755239424.0, + "logps/chosen": -295.8883463541667, + "logps/rejected": -846.1141357421875, + "loss": 0.0166, + "rewards/chosen": 3.990896224975586, + "rewards/margins": 14.608834266662598, + "rewards/rejected": -10.617938041687012, + "step": 5340 + }, + { + "epoch": 0.48798538145271814, + "grad_norm": 2.234375, + "kl": 0.0, + "learning_rate": 5.204829143418072e-06, + "logits/chosen": 542002176.0, + "logits/rejected": 539247274.6666666, + "logps/chosen": -393.616748046875, + "logps/rejected": -467.3707275390625, + "loss": 0.0226, + "rewards/chosen": 3.576785659790039, + "rewards/margins": 13.470111465454101, + "rewards/rejected": -9.893325805664062, + "step": 5341 + }, + { + "epoch": 0.48807674737322976, + "grad_norm": 1.6953125, + "kl": 0.0, + "learning_rate": 5.203392542091483e-06, + "logits/chosen": 778764800.0, + "logits/rejected": 381215744.0, + "logps/chosen": -588.0069580078125, + "logps/rejected": -531.1904703776041, + "loss": 0.0075, + "rewards/chosen": 3.5300230979919434, + "rewards/margins": 14.019863923390707, + "rewards/rejected": -10.489840825398764, + "step": 5342 + }, + { + "epoch": 0.48816811329374143, + "grad_norm": 2.328125, + "kl": 0.0, + "learning_rate": 5.201955923946216e-06, + "logits/chosen": 718104192.0, + "logits/rejected": 816428800.0, + "logps/chosen": -287.4652099609375, + "logps/rejected": -496.8084309895833, + "loss": 0.0124, + "rewards/chosen": 2.9686522483825684, + "rewards/margins": 13.357274532318115, + "rewards/rejected": -10.388622283935547, + "step": 5343 + }, + { + "epoch": 0.48825947921425306, + "grad_norm": 2.71875, + "kl": 0.0, + "learning_rate": 5.200519289101065e-06, + "logits/chosen": 620604928.0, + "logits/rejected": 468764373.3333333, + "logps/chosen": -394.4275390625, + "logps/rejected": -439.7552490234375, + "loss": 0.0191, + "rewards/chosen": 3.7335697174072267, + "rewards/margins": 12.315977350870767, + "rewards/rejected": -8.582407633463541, + "step": 5344 + }, + { + "epoch": 0.48835084513476473, + "grad_norm": 1.9453125, + "kl": 0.0, + "learning_rate": 5.199082637674825e-06, + "logits/chosen": 552550656.0, + "logits/rejected": 467425075.2, + "logps/chosen": -429.72314453125, + "logps/rejected": -495.29873046875, + "loss": 0.0087, + "rewards/chosen": 4.254734357198079, + "rewards/margins": 12.244912274678548, + "rewards/rejected": -7.990177917480469, + "step": 5345 + }, + { + "epoch": 0.4884422110552764, + "grad_norm": 2.171875, + "kl": 0.0, + "learning_rate": 5.197645969786297e-06, + "logits/chosen": 943180544.0, + "logits/rejected": 623521126.4, + "logps/chosen": -297.59619140625, + "logps/rejected": -866.685546875, + "loss": 0.0164, + "rewards/chosen": 3.8659540812174478, + "rewards/margins": 16.019664255777997, + "rewards/rejected": -12.153710174560548, + "step": 5346 + }, + { + "epoch": 0.48853357697578803, + "grad_norm": 2.265625, + "kl": 0.0, + "learning_rate": 5.196209285554276e-06, + "logits/chosen": 593436800.0, + "logits/rejected": 422076416.0, + "logps/chosen": -433.7159423828125, + "logps/rejected": -167.4398651123047, + "loss": 0.0175, + "rewards/chosen": 3.5946977138519287, + "rewards/margins": 9.400506734848022, + "rewards/rejected": -5.805809020996094, + "step": 5347 + }, + { + "epoch": 0.4886249428962997, + "grad_norm": 1.1328125, + "kl": 0.0, + "learning_rate": 5.194772585097569e-06, + "logits/chosen": 405510976.0, + "logits/rejected": 497590314.6666667, + "logps/chosen": -259.5001220703125, + "logps/rejected": -579.6396484375, + "loss": 0.0042, + "rewards/chosen": 4.518968105316162, + "rewards/margins": 13.723592281341553, + "rewards/rejected": -9.20462417602539, + "step": 5348 + }, + { + "epoch": 0.48871630881681133, + "grad_norm": 1.6484375, + "kl": 0.0, + "learning_rate": 5.193335868534971e-06, + "logits/chosen": 915226965.3333334, + "logits/rejected": 590049587.2, + "logps/chosen": -209.3001708984375, + "logps/rejected": -579.943017578125, + "loss": 0.0108, + "rewards/chosen": 3.9819348653157554, + "rewards/margins": 11.79213383992513, + "rewards/rejected": -7.810198974609375, + "step": 5349 + }, + { + "epoch": 0.488807674737323, + "grad_norm": 3.421875, + "kl": 0.0, + "learning_rate": 5.191899135985289e-06, + "logits/chosen": 573411993.6, + "logits/rejected": 339208789.3333333, + "logps/chosen": -341.75048828125, + "logps/rejected": -564.6227213541666, + "loss": 0.0217, + "rewards/chosen": 3.694741058349609, + "rewards/margins": 13.962453969319661, + "rewards/rejected": -10.267712910970053, + "step": 5350 + }, + { + "epoch": 0.4888990406578346, + "grad_norm": 1.6875, + "kl": 0.0, + "learning_rate": 5.1904623875673255e-06, + "logits/chosen": 427589034.6666667, + "logits/rejected": 704328192.0, + "logps/chosen": -286.6568603515625, + "logps/rejected": -484.9962890625, + "loss": 0.0069, + "rewards/chosen": 5.04014778137207, + "rewards/margins": 13.947879409790039, + "rewards/rejected": -8.907731628417968, + "step": 5351 + }, + { + "epoch": 0.4889904065783463, + "grad_norm": 1.8046875, + "kl": 0.0, + "learning_rate": 5.189025623399888e-06, + "logits/chosen": 444755360.0, + "logits/rejected": 554048042.6666666, + "logps/chosen": -332.2471008300781, + "logps/rejected": -360.378173828125, + "loss": 0.0078, + "rewards/chosen": 3.7536797523498535, + "rewards/margins": 12.811167240142822, + "rewards/rejected": -9.057487487792969, + "step": 5352 + }, + { + "epoch": 0.4890817724988579, + "grad_norm": 2.390625, + "kl": 0.0, + "learning_rate": 5.187588843601784e-06, + "logits/chosen": 389142048.0, + "logits/rejected": 280053312.0, + "logps/chosen": -337.0047607421875, + "logps/rejected": -270.24700927734375, + "loss": 0.0187, + "rewards/chosen": 3.408130407333374, + "rewards/margins": 9.842619180679321, + "rewards/rejected": -6.434488773345947, + "step": 5353 + }, + { + "epoch": 0.4891731384193696, + "grad_norm": 2.265625, + "kl": 0.0, + "learning_rate": 5.186152048291818e-06, + "logits/chosen": 451991210.6666667, + "logits/rejected": 369298764.8, + "logps/chosen": -234.13264973958334, + "logps/rejected": -373.9052978515625, + "loss": 0.0146, + "rewards/chosen": 3.4575862884521484, + "rewards/margins": 10.697532272338867, + "rewards/rejected": -7.239945983886718, + "step": 5354 + }, + { + "epoch": 0.4892645043398812, + "grad_norm": 4.03125, + "kl": 0.0, + "learning_rate": 5.1847152375888046e-06, + "logits/chosen": 691122389.3333334, + "logits/rejected": 486143520.0, + "logps/chosen": -336.8363037109375, + "logps/rejected": -675.4507446289062, + "loss": 0.0268, + "rewards/chosen": 3.4673503239949546, + "rewards/margins": 14.960313161214193, + "rewards/rejected": -11.492962837219238, + "step": 5355 + }, + { + "epoch": 0.4893558702603929, + "grad_norm": 2.84375, + "kl": 0.0, + "learning_rate": 5.183278411611551e-06, + "logits/chosen": 617986816.0, + "logits/rejected": 492498656.0, + "logps/chosen": -426.98345947265625, + "logps/rejected": -664.3333740234375, + "loss": 0.0181, + "rewards/chosen": 3.409518003463745, + "rewards/margins": 14.091130018234253, + "rewards/rejected": -10.681612014770508, + "step": 5356 + }, + { + "epoch": 0.4894472361809045, + "grad_norm": 1.609375, + "kl": 0.0, + "learning_rate": 5.1818415704788725e-06, + "logits/chosen": 590218112.0, + "logits/rejected": 407468672.0, + "logps/chosen": -317.3399658203125, + "logps/rejected": -463.94775390625, + "loss": 0.0113, + "rewards/chosen": 3.9883127212524414, + "rewards/margins": 14.463610649108887, + "rewards/rejected": -10.475297927856445, + "step": 5357 + }, + { + "epoch": 0.4895386021014162, + "grad_norm": 2.546875, + "kl": 0.0, + "learning_rate": 5.180404714309581e-06, + "logits/chosen": 617442816.0, + "logits/rejected": 569547929.6, + "logps/chosen": -124.62996419270833, + "logps/rejected": -686.48837890625, + "loss": 0.0211, + "rewards/chosen": 2.8636757532755532, + "rewards/margins": 13.172733370463053, + "rewards/rejected": -10.3090576171875, + "step": 5358 + }, + { + "epoch": 0.4896299680219278, + "grad_norm": 1.3515625, + "kl": 0.0, + "learning_rate": 5.1789678432224895e-06, + "logits/chosen": 510296704.0, + "logits/rejected": 624488704.0, + "logps/chosen": -393.00531005859375, + "logps/rejected": -634.55712890625, + "loss": 0.0059, + "rewards/chosen": 5.219898223876953, + "rewards/margins": 16.094970703125, + "rewards/rejected": -10.875072479248047, + "step": 5359 + }, + { + "epoch": 0.4897213339424395, + "grad_norm": 4.84375, + "kl": 0.0, + "learning_rate": 5.177530957336416e-06, + "logits/chosen": 652394410.6666666, + "logits/rejected": 538663488.0, + "logps/chosen": -284.3795572916667, + "logps/rejected": -258.2293701171875, + "loss": 0.0437, + "rewards/chosen": 3.0130430857340493, + "rewards/margins": 11.708956400553385, + "rewards/rejected": -8.695913314819336, + "step": 5360 + }, + { + "epoch": 0.4898126998629511, + "grad_norm": 51.75, + "kl": 0.0, + "learning_rate": 5.176094056770178e-06, + "logits/chosen": 382487987.2, + "logits/rejected": 454055552.0, + "logps/chosen": -262.865966796875, + "logps/rejected": -378.751953125, + "loss": 0.0822, + "rewards/chosen": 3.3790077209472655, + "rewards/margins": 10.71483777364095, + "rewards/rejected": -7.335830052693685, + "step": 5361 + }, + { + "epoch": 0.4899040657834628, + "grad_norm": 3.5625, + "kl": 0.0, + "learning_rate": 5.174657141642591e-06, + "logits/chosen": 531313049.6, + "logits/rejected": 512021162.6666667, + "logps/chosen": -202.0541259765625, + "logps/rejected": -416.5383707682292, + "loss": 0.042, + "rewards/chosen": 2.8594200134277346, + "rewards/margins": 13.100352478027343, + "rewards/rejected": -10.24093246459961, + "step": 5362 + }, + { + "epoch": 0.4899954317039744, + "grad_norm": 1.6875, + "kl": 0.0, + "learning_rate": 5.173220212072478e-06, + "logits/chosen": 841989120.0, + "logits/rejected": 653791744.0, + "logps/chosen": -432.09307861328125, + "logps/rejected": -441.8356119791667, + "loss": 0.0073, + "rewards/chosen": 3.768510341644287, + "rewards/margins": 12.038549582163492, + "rewards/rejected": -8.270039240519205, + "step": 5363 + }, + { + "epoch": 0.4900867976244861, + "grad_norm": 3.390625, + "kl": 0.0, + "learning_rate": 5.171783268178659e-06, + "logits/chosen": 485151360.0, + "logits/rejected": 215836083.2, + "logps/chosen": -331.3564860026042, + "logps/rejected": -459.675390625, + "loss": 0.0174, + "rewards/chosen": 3.297844886779785, + "rewards/margins": 14.783121681213379, + "rewards/rejected": -11.485276794433593, + "step": 5364 + }, + { + "epoch": 0.4901781635449977, + "grad_norm": 2.203125, + "kl": 0.0, + "learning_rate": 5.170346310079955e-06, + "logits/chosen": 430021785.6, + "logits/rejected": 430599424.0, + "logps/chosen": -282.47060546875, + "logps/rejected": -502.0145670572917, + "loss": 0.0147, + "rewards/chosen": 3.9057849884033202, + "rewards/margins": 12.329555892944336, + "rewards/rejected": -8.423770904541016, + "step": 5365 + }, + { + "epoch": 0.4902695294655094, + "grad_norm": 4.3125, + "kl": 0.0, + "learning_rate": 5.1689093378951885e-06, + "logits/chosen": 618877184.0, + "logits/rejected": 649715763.2, + "logps/chosen": -377.181884765625, + "logps/rejected": -532.736572265625, + "loss": 0.0247, + "rewards/chosen": 2.8358243306477866, + "rewards/margins": 12.763319142659507, + "rewards/rejected": -9.92749481201172, + "step": 5366 + }, + { + "epoch": 0.490360895386021, + "grad_norm": 1.828125, + "kl": 0.0, + "learning_rate": 5.167472351743186e-06, + "logits/chosen": 570444352.0, + "logits/rejected": 371961280.0, + "logps/chosen": -302.6470947265625, + "logps/rejected": -295.11285400390625, + "loss": 0.0144, + "rewards/chosen": 3.547963857650757, + "rewards/margins": 12.31900668144226, + "rewards/rejected": -8.771042823791504, + "step": 5367 + }, + { + "epoch": 0.4904522613065327, + "grad_norm": 0.9921875, + "kl": 0.0, + "learning_rate": 5.166035351742769e-06, + "logits/chosen": 966717610.6666666, + "logits/rejected": 640890265.6, + "logps/chosen": -270.09765625, + "logps/rejected": -385.674462890625, + "loss": 0.0061, + "rewards/chosen": 4.267733891805013, + "rewards/margins": 12.714978154500326, + "rewards/rejected": -8.447244262695312, + "step": 5368 + }, + { + "epoch": 0.4905436272270443, + "grad_norm": 6.9375, + "kl": 0.0, + "learning_rate": 5.164598338012769e-06, + "logits/chosen": 548690389.3333334, + "logits/rejected": 482233824.0, + "logps/chosen": -321.3334147135417, + "logps/rejected": -458.8070068359375, + "loss": 0.0396, + "rewards/chosen": 3.389370918273926, + "rewards/margins": 10.879409790039062, + "rewards/rejected": -7.490038871765137, + "step": 5369 + }, + { + "epoch": 0.490634993147556, + "grad_norm": 1.734375, + "kl": 0.0, + "learning_rate": 5.163161310672013e-06, + "logits/chosen": 513590144.0, + "logits/rejected": 593781145.6, + "logps/chosen": -413.7616373697917, + "logps/rejected": -429.5599609375, + "loss": 0.0109, + "rewards/chosen": 3.5564260482788086, + "rewards/margins": 12.731171226501464, + "rewards/rejected": -9.174745178222656, + "step": 5370 + }, + { + "epoch": 0.4907263590680676, + "grad_norm": 1.5234375, + "kl": 0.0, + "learning_rate": 5.1617242698393265e-06, + "logits/chosen": 568979072.0, + "logits/rejected": 785980586.6666666, + "logps/chosen": -191.13101196289062, + "logps/rejected": -519.9276936848959, + "loss": 0.0091, + "rewards/chosen": 3.3397340774536133, + "rewards/margins": 13.222735404968262, + "rewards/rejected": -9.883001327514648, + "step": 5371 + }, + { + "epoch": 0.4908177249885793, + "grad_norm": 2.046875, + "kl": 0.0, + "learning_rate": 5.160287215633541e-06, + "logits/chosen": 496037696.0, + "logits/rejected": 494570666.6666667, + "logps/chosen": -213.1434326171875, + "logps/rejected": -430.6557210286458, + "loss": 0.0084, + "rewards/chosen": 3.3719496726989746, + "rewards/margins": 12.612485408782959, + "rewards/rejected": -9.240535736083984, + "step": 5372 + }, + { + "epoch": 0.4909090909090909, + "grad_norm": 0.50390625, + "kl": 0.0, + "learning_rate": 5.158850148173489e-06, + "logits/chosen": 168681968.0, + "logits/rejected": 423341056.0, + "logps/chosen": -140.04322814941406, + "logps/rejected": -550.2344563802084, + "loss": 0.0023, + "rewards/chosen": 4.768919944763184, + "rewards/margins": 14.2446928024292, + "rewards/rejected": -9.475772857666016, + "step": 5373 + }, + { + "epoch": 0.4910004568296026, + "grad_norm": 3.84375, + "kl": 0.0, + "learning_rate": 5.157413067578001e-06, + "logits/chosen": 767083161.6, + "logits/rejected": 523808597.3333333, + "logps/chosen": -283.8960693359375, + "logps/rejected": -568.8350016276041, + "loss": 0.0279, + "rewards/chosen": 3.2973194122314453, + "rewards/margins": 10.560434341430664, + "rewards/rejected": -7.263114929199219, + "step": 5374 + }, + { + "epoch": 0.4910918227501142, + "grad_norm": 0.83984375, + "kl": 0.0, + "learning_rate": 5.15597597396591e-06, + "logits/chosen": 766782976.0, + "logits/rejected": 900075520.0, + "logps/chosen": -308.9331461588542, + "logps/rejected": -506.59326171875, + "loss": 0.0047, + "rewards/chosen": 4.588498433430989, + "rewards/margins": 15.269955190022785, + "rewards/rejected": -10.681456756591796, + "step": 5375 + }, + { + "epoch": 0.4911831886706259, + "grad_norm": 3.46875, + "kl": 0.0, + "learning_rate": 5.154538867456052e-06, + "logits/chosen": 583241318.4, + "logits/rejected": 601948074.6666666, + "logps/chosen": -350.3351318359375, + "logps/rejected": -460.26416015625, + "loss": 0.0245, + "rewards/chosen": 3.238277053833008, + "rewards/margins": 13.578854497273763, + "rewards/rejected": -10.340577443440756, + "step": 5376 + }, + { + "epoch": 0.4912745545911375, + "grad_norm": 2.71875, + "kl": 0.0, + "learning_rate": 5.153101748167261e-06, + "logits/chosen": 426722355.2, + "logits/rejected": 331994069.3333333, + "logps/chosen": -162.8506591796875, + "logps/rejected": -381.7239583333333, + "loss": 0.0248, + "rewards/chosen": 3.460614776611328, + "rewards/margins": 12.786657587687174, + "rewards/rejected": -9.326042811075846, + "step": 5377 + }, + { + "epoch": 0.4913659205116492, + "grad_norm": 5.0625, + "kl": 0.0, + "learning_rate": 5.151664616218373e-06, + "logits/chosen": 885602201.6, + "logits/rejected": 580882602.6666666, + "logps/chosen": -468.04658203125, + "logps/rejected": -476.9044189453125, + "loss": 0.03, + "rewards/chosen": 3.4750667572021485, + "rewards/margins": 11.332470830281576, + "rewards/rejected": -7.857404073079427, + "step": 5378 + }, + { + "epoch": 0.4914572864321608, + "grad_norm": 4.15625, + "kl": 0.0, + "learning_rate": 5.150227471728229e-06, + "logits/chosen": 649206067.2, + "logits/rejected": 712411434.6666666, + "logps/chosen": -307.12080078125, + "logps/rejected": -408.2450358072917, + "loss": 0.0174, + "rewards/chosen": 4.180097579956055, + "rewards/margins": 13.190240859985352, + "rewards/rejected": -9.010143280029297, + "step": 5379 + }, + { + "epoch": 0.4915486523526725, + "grad_norm": 1.421875, + "kl": 0.0, + "learning_rate": 5.148790314815662e-06, + "logits/chosen": 1226625024.0, + "logits/rejected": 488850976.0, + "logps/chosen": -307.4637756347656, + "logps/rejected": -512.6007690429688, + "loss": 0.0084, + "rewards/chosen": 4.275625228881836, + "rewards/margins": 15.04615592956543, + "rewards/rejected": -10.770530700683594, + "step": 5380 + }, + { + "epoch": 0.4916400182731841, + "grad_norm": 2.328125, + "kl": 0.0, + "learning_rate": 5.147353145599516e-06, + "logits/chosen": 690683562.6666666, + "logits/rejected": 422797152.0, + "logps/chosen": -404.2482503255208, + "logps/rejected": -399.5323181152344, + "loss": 0.0163, + "rewards/chosen": 4.147113800048828, + "rewards/margins": 12.7445068359375, + "rewards/rejected": -8.597393035888672, + "step": 5381 + }, + { + "epoch": 0.4917313841936958, + "grad_norm": 2.1875, + "kl": 0.0, + "learning_rate": 5.145915964198631e-06, + "logits/chosen": 354803296.0, + "logits/rejected": 378749888.0, + "logps/chosen": -206.99160766601562, + "logps/rejected": -365.240478515625, + "loss": 0.0196, + "rewards/chosen": 3.3069353103637695, + "rewards/margins": 11.534086227416992, + "rewards/rejected": -8.227150917053223, + "step": 5382 + }, + { + "epoch": 0.4918227501142074, + "grad_norm": 1.609375, + "kl": 0.0, + "learning_rate": 5.144478770731847e-06, + "logits/chosen": 343570112.0, + "logits/rejected": 539397546.6666666, + "logps/chosen": -180.5540771484375, + "logps/rejected": -521.681640625, + "loss": 0.0066, + "rewards/chosen": 3.7688233852386475, + "rewards/margins": 14.331591049830118, + "rewards/rejected": -10.56276766459147, + "step": 5383 + }, + { + "epoch": 0.4919141160347191, + "grad_norm": 2.59375, + "kl": 0.0, + "learning_rate": 5.143041565318008e-06, + "logits/chosen": 442549568.0, + "logits/rejected": 525929472.0, + "logps/chosen": -364.8676452636719, + "logps/rejected": -679.2842407226562, + "loss": 0.0133, + "rewards/chosen": 4.510472297668457, + "rewards/margins": 14.007365226745605, + "rewards/rejected": -9.496892929077148, + "step": 5384 + }, + { + "epoch": 0.4920054819552307, + "grad_norm": 7.4375, + "kl": 0.0, + "learning_rate": 5.141604348075956e-06, + "logits/chosen": 783956565.3333334, + "logits/rejected": 632225792.0, + "logps/chosen": -223.25227864583334, + "logps/rejected": -549.712353515625, + "loss": 0.1644, + "rewards/chosen": 0.04009779294331869, + "rewards/margins": 9.495674880345662, + "rewards/rejected": -9.455577087402343, + "step": 5385 + }, + { + "epoch": 0.49209684787574237, + "grad_norm": 3.71875, + "kl": 0.0, + "learning_rate": 5.140167119124536e-06, + "logits/chosen": 347197056.0, + "logits/rejected": 395031200.0, + "logps/chosen": -191.2996826171875, + "logps/rejected": -449.75103759765625, + "loss": 0.0331, + "rewards/chosen": 2.695923089981079, + "rewards/margins": 13.079899072647095, + "rewards/rejected": -10.383975982666016, + "step": 5386 + }, + { + "epoch": 0.492188213796254, + "grad_norm": 0.640625, + "kl": 0.0, + "learning_rate": 5.138729878582594e-06, + "logits/chosen": 480028416.0, + "logits/rejected": 745125632.0, + "logps/chosen": -148.84628295898438, + "logps/rejected": -495.77880859375, + "loss": 0.0033, + "rewards/chosen": 4.592684268951416, + "rewards/margins": 14.132366021474203, + "rewards/rejected": -9.539681752522787, + "step": 5387 + }, + { + "epoch": 0.49227957971676567, + "grad_norm": 1.2421875, + "kl": 0.0, + "learning_rate": 5.1372926265689775e-06, + "logits/chosen": 560632320.0, + "logits/rejected": 474392234.6666667, + "logps/chosen": -303.03436279296875, + "logps/rejected": -779.57861328125, + "loss": 0.0066, + "rewards/chosen": 3.712552070617676, + "rewards/margins": 18.361602465311684, + "rewards/rejected": -14.64905039469401, + "step": 5388 + }, + { + "epoch": 0.4923709456372773, + "grad_norm": 2.3125, + "kl": 0.30710411071777344, + "learning_rate": 5.135855363202531e-06, + "logits/chosen": 407422976.0, + "logits/rejected": 597872000.0, + "logps/chosen": -263.02347237723217, + "logps/rejected": -157.63368225097656, + "loss": 0.0203, + "rewards/chosen": 4.060838971819196, + "rewards/margins": 12.58208874293736, + "rewards/rejected": -8.521249771118164, + "step": 5389 + }, + { + "epoch": 0.49246231155778897, + "grad_norm": 2.9375, + "kl": 0.0, + "learning_rate": 5.1344180886021065e-06, + "logits/chosen": 555809462.8571428, + "logits/rejected": 365361472.0, + "logps/chosen": -409.149658203125, + "logps/rejected": -482.5516052246094, + "loss": 0.0255, + "rewards/chosen": 3.757324763706752, + "rewards/margins": 10.33964306967599, + "rewards/rejected": -6.582318305969238, + "step": 5390 + }, + { + "epoch": 0.4925536774783006, + "grad_norm": 2.734375, + "kl": 0.0, + "learning_rate": 5.1329808028865515e-06, + "logits/chosen": 806867968.0, + "logits/rejected": 533377066.6666667, + "logps/chosen": -411.279296875, + "logps/rejected": -799.322509765625, + "loss": 0.0184, + "rewards/chosen": 4.14795150756836, + "rewards/margins": 15.9052858988444, + "rewards/rejected": -11.757334391276041, + "step": 5391 + }, + { + "epoch": 0.49264504339881227, + "grad_norm": 1.453125, + "kl": 0.0, + "learning_rate": 5.131543506174714e-06, + "logits/chosen": 466443520.0, + "logits/rejected": 391224106.6666667, + "logps/chosen": -381.9941650390625, + "logps/rejected": -446.65625, + "loss": 0.0081, + "rewards/chosen": 4.722749328613281, + "rewards/margins": 13.68971913655599, + "rewards/rejected": -8.966969807942709, + "step": 5392 + }, + { + "epoch": 0.4927364093193239, + "grad_norm": 2.59375, + "kl": 0.0, + "learning_rate": 5.13010619858545e-06, + "logits/chosen": 571993088.0, + "logits/rejected": 715203968.0, + "logps/chosen": -347.5184326171875, + "logps/rejected": -829.952392578125, + "loss": 0.0189, + "rewards/chosen": 3.3019890785217285, + "rewards/margins": 13.522008419036865, + "rewards/rejected": -10.220019340515137, + "step": 5393 + }, + { + "epoch": 0.49282777523983556, + "grad_norm": 2.921875, + "kl": 0.0, + "learning_rate": 5.1286688802376075e-06, + "logits/chosen": 362525504.0, + "logits/rejected": 1027011392.0, + "logps/chosen": -177.13070678710938, + "logps/rejected": -469.0625, + "loss": 0.0124, + "rewards/chosen": 4.34586238861084, + "rewards/margins": 13.356165885925293, + "rewards/rejected": -9.010303497314453, + "step": 5394 + }, + { + "epoch": 0.4929191411603472, + "grad_norm": 3.265625, + "kl": 0.0, + "learning_rate": 5.12723155125004e-06, + "logits/chosen": 616851328.0, + "logits/rejected": 507646656.0, + "logps/chosen": -425.3274841308594, + "logps/rejected": -513.49658203125, + "loss": 0.0229, + "rewards/chosen": 3.1520283222198486, + "rewards/margins": 11.638095140457153, + "rewards/rejected": -8.486066818237305, + "step": 5395 + }, + { + "epoch": 0.49301050708085886, + "grad_norm": 2.859375, + "kl": 0.0, + "learning_rate": 5.125794211741602e-06, + "logits/chosen": 639996160.0, + "logits/rejected": 471721664.0, + "logps/chosen": -307.6857604980469, + "logps/rejected": -539.07568359375, + "loss": 0.0203, + "rewards/chosen": 3.2677948474884033, + "rewards/margins": 11.883875131607056, + "rewards/rejected": -8.616080284118652, + "step": 5396 + }, + { + "epoch": 0.4931018730013705, + "grad_norm": 2.265625, + "kl": 0.0, + "learning_rate": 5.124356861831152e-06, + "logits/chosen": 527741030.4, + "logits/rejected": 423267157.3333333, + "logps/chosen": -282.268994140625, + "logps/rejected": -390.3255615234375, + "loss": 0.0157, + "rewards/chosen": 3.9978179931640625, + "rewards/margins": 12.438232421875, + "rewards/rejected": -8.440414428710938, + "step": 5397 + }, + { + "epoch": 0.49319323892188216, + "grad_norm": 0.5234375, + "kl": 0.0, + "learning_rate": 5.122919501637538e-06, + "logits/chosen": 257680640.0, + "logits/rejected": 411157674.6666667, + "logps/chosen": -226.9871063232422, + "logps/rejected": -460.0485026041667, + "loss": 0.0025, + "rewards/chosen": 5.212623596191406, + "rewards/margins": 14.047002156575521, + "rewards/rejected": -8.834378560384115, + "step": 5398 + }, + { + "epoch": 0.4932846048423938, + "grad_norm": 4.625, + "kl": 0.0, + "learning_rate": 5.121482131279621e-06, + "logits/chosen": 1215019008.0, + "logits/rejected": 481141589.3333333, + "logps/chosen": -567.25107421875, + "logps/rejected": -340.3042399088542, + "loss": 0.0267, + "rewards/chosen": 3.4083763122558595, + "rewards/margins": 11.55118204752604, + "rewards/rejected": -8.142805735270182, + "step": 5399 + }, + { + "epoch": 0.49337597076290546, + "grad_norm": 2.21875, + "kl": 0.0, + "learning_rate": 5.120044750876257e-06, + "logits/chosen": 593757994.6666666, + "logits/rejected": 422834816.0, + "logps/chosen": -312.34792073567706, + "logps/rejected": -479.0446472167969, + "loss": 0.017, + "rewards/chosen": 3.980410893758138, + "rewards/margins": 13.179675420125326, + "rewards/rejected": -9.199264526367188, + "step": 5400 + }, + { + "epoch": 0.4934673366834171, + "grad_norm": 3.671875, + "kl": 0.0, + "learning_rate": 5.118607360546305e-06, + "logits/chosen": 356115872.0, + "logits/rejected": 613654144.0, + "logps/chosen": -364.2789306640625, + "logps/rejected": -556.618408203125, + "loss": 0.0183, + "rewards/chosen": 4.436705589294434, + "rewards/margins": 13.58552074432373, + "rewards/rejected": -9.148815155029297, + "step": 5401 + }, + { + "epoch": 0.49355870260392876, + "grad_norm": 5.53125, + "kl": 0.0, + "learning_rate": 5.1171699604086224e-06, + "logits/chosen": 320800768.0, + "logits/rejected": 407125862.4, + "logps/chosen": -190.203369140625, + "logps/rejected": -633.179443359375, + "loss": 0.0284, + "rewards/chosen": 3.654820442199707, + "rewards/margins": 13.06396427154541, + "rewards/rejected": -9.409143829345703, + "step": 5402 + }, + { + "epoch": 0.4936500685244404, + "grad_norm": 2.765625, + "kl": 0.0, + "learning_rate": 5.11573255058207e-06, + "logits/chosen": 684954419.2, + "logits/rejected": 540732032.0, + "logps/chosen": -279.3132568359375, + "logps/rejected": -708.1143391927084, + "loss": 0.0211, + "rewards/chosen": 3.4567550659179687, + "rewards/margins": 14.239705149332682, + "rewards/rejected": -10.782950083414713, + "step": 5403 + }, + { + "epoch": 0.49374143444495205, + "grad_norm": 12.0, + "kl": 0.0, + "learning_rate": 5.1142951311855075e-06, + "logits/chosen": 283586944.0, + "logits/rejected": 576203059.2, + "logps/chosen": -121.62967936197917, + "logps/rejected": -485.68564453125, + "loss": 0.1374, + "rewards/chosen": 1.0467562675476074, + "rewards/margins": 9.074074077606202, + "rewards/rejected": -8.027317810058594, + "step": 5404 + }, + { + "epoch": 0.4938328003654637, + "grad_norm": 2.03125, + "kl": 0.0, + "learning_rate": 5.1128577023377975e-06, + "logits/chosen": 1101217536.0, + "logits/rejected": 723395392.0, + "logps/chosen": -404.9083251953125, + "logps/rejected": -557.1181030273438, + "loss": 0.0108, + "rewards/chosen": 4.074664115905762, + "rewards/margins": 13.524785995483398, + "rewards/rejected": -9.450121879577637, + "step": 5405 + }, + { + "epoch": 0.49392416628597535, + "grad_norm": 3.953125, + "kl": 0.0, + "learning_rate": 5.111420264157801e-06, + "logits/chosen": 849558272.0, + "logits/rejected": 634005888.0, + "logps/chosen": -278.3179524739583, + "logps/rejected": -273.2464599609375, + "loss": 0.0302, + "rewards/chosen": 3.360189437866211, + "rewards/margins": 10.939116477966309, + "rewards/rejected": -7.578927040100098, + "step": 5406 + }, + { + "epoch": 0.494015532206487, + "grad_norm": 3.765625, + "kl": 0.0, + "learning_rate": 5.109982816764381e-06, + "logits/chosen": 505425203.2, + "logits/rejected": 639654997.3333334, + "logps/chosen": -314.266015625, + "logps/rejected": -404.1534016927083, + "loss": 0.0248, + "rewards/chosen": 3.4831390380859375, + "rewards/margins": 10.278779347737629, + "rewards/rejected": -6.795640309651692, + "step": 5407 + }, + { + "epoch": 0.49410689812699865, + "grad_norm": 3.078125, + "kl": 0.0, + "learning_rate": 5.108545360276401e-06, + "logits/chosen": 519142016.0, + "logits/rejected": 383242272.0, + "logps/chosen": -329.2647705078125, + "logps/rejected": -464.91729736328125, + "loss": 0.0204, + "rewards/chosen": 3.22432804107666, + "rewards/margins": 11.895317077636719, + "rewards/rejected": -8.670989036560059, + "step": 5408 + }, + { + "epoch": 0.49419826404751027, + "grad_norm": 3.78125, + "kl": 0.0, + "learning_rate": 5.107107894812724e-06, + "logits/chosen": 626007168.0, + "logits/rejected": 314243232.0, + "logps/chosen": -470.5646158854167, + "logps/rejected": -468.8474426269531, + "loss": 0.023, + "rewards/chosen": 4.032384872436523, + "rewards/margins": 13.641437530517578, + "rewards/rejected": -9.609052658081055, + "step": 5409 + }, + { + "epoch": 0.49428962996802195, + "grad_norm": 1.3125, + "kl": 0.0, + "learning_rate": 5.105670420492219e-06, + "logits/chosen": 1100093781.3333333, + "logits/rejected": 569900646.4, + "logps/chosen": -494.5791829427083, + "logps/rejected": -530.011279296875, + "loss": 0.0061, + "rewards/chosen": 4.227692286173503, + "rewards/margins": 13.0861021677653, + "rewards/rejected": -8.858409881591797, + "step": 5410 + }, + { + "epoch": 0.49438099588853357, + "grad_norm": 2.265625, + "kl": 0.0, + "learning_rate": 5.104232937433748e-06, + "logits/chosen": 634854336.0, + "logits/rejected": 586820288.0, + "logps/chosen": -522.9025268554688, + "logps/rejected": -479.14825439453125, + "loss": 0.0134, + "rewards/chosen": 3.865135669708252, + "rewards/margins": 13.260019779205322, + "rewards/rejected": -9.39488410949707, + "step": 5411 + }, + { + "epoch": 0.49447236180904525, + "grad_norm": 1.875, + "kl": 0.0, + "learning_rate": 5.102795445756179e-06, + "logits/chosen": 471057120.0, + "logits/rejected": 501111104.0, + "logps/chosen": -342.413330078125, + "logps/rejected": -414.839599609375, + "loss": 0.0108, + "rewards/chosen": 4.657207489013672, + "rewards/margins": 13.509735107421875, + "rewards/rejected": -8.852527618408203, + "step": 5412 + }, + { + "epoch": 0.49456372772955687, + "grad_norm": 3.28125, + "kl": 0.0, + "learning_rate": 5.101357945578378e-06, + "logits/chosen": 369107916.8, + "logits/rejected": 516175189.3333333, + "logps/chosen": -98.32230224609376, + "logps/rejected": -755.0204264322916, + "loss": 0.0403, + "rewards/chosen": 3.5453773498535157, + "rewards/margins": 11.70430908203125, + "rewards/rejected": -8.158931732177734, + "step": 5413 + }, + { + "epoch": 0.49465509365006854, + "grad_norm": 3.5, + "kl": 0.0, + "learning_rate": 5.099920437019215e-06, + "logits/chosen": 707583078.4, + "logits/rejected": 591224234.6666666, + "logps/chosen": -303.477587890625, + "logps/rejected": -520.800048828125, + "loss": 0.0215, + "rewards/chosen": 4.220873260498047, + "rewards/margins": 13.195637130737305, + "rewards/rejected": -8.974763870239258, + "step": 5414 + }, + { + "epoch": 0.49474645957058017, + "grad_norm": 4.21875, + "kl": 0.0, + "learning_rate": 5.098482920197557e-06, + "logits/chosen": 800184320.0, + "logits/rejected": 1091350656.0, + "logps/chosen": -301.44683837890625, + "logps/rejected": -407.18792724609375, + "loss": 0.0215, + "rewards/chosen": 3.759962558746338, + "rewards/margins": 12.290990352630615, + "rewards/rejected": -8.531027793884277, + "step": 5415 + }, + { + "epoch": 0.49483782549109184, + "grad_norm": 2.546875, + "kl": 0.0, + "learning_rate": 5.097045395232275e-06, + "logits/chosen": 806096128.0, + "logits/rejected": 1031598250.6666666, + "logps/chosen": -312.76181640625, + "logps/rejected": -429.1793619791667, + "loss": 0.0207, + "rewards/chosen": 3.5063018798828125, + "rewards/margins": 11.701044718424479, + "rewards/rejected": -8.194742838541666, + "step": 5416 + }, + { + "epoch": 0.49492919141160346, + "grad_norm": 2.25, + "kl": 0.0, + "learning_rate": 5.095607862242238e-06, + "logits/chosen": 531643168.0, + "logits/rejected": 682993536.0, + "logps/chosen": -193.0126953125, + "logps/rejected": -457.12725830078125, + "loss": 0.0189, + "rewards/chosen": 3.9744160175323486, + "rewards/margins": 16.58379864692688, + "rewards/rejected": -12.609382629394531, + "step": 5417 + }, + { + "epoch": 0.49502055733211514, + "grad_norm": 22.25, + "kl": 0.0, + "learning_rate": 5.094170321346316e-06, + "logits/chosen": 574626611.2, + "logits/rejected": 860253184.0, + "logps/chosen": -204.689453125, + "logps/rejected": -424.3981119791667, + "loss": 0.1225, + "rewards/chosen": 2.1370330810546876, + "rewards/margins": 11.314786529541015, + "rewards/rejected": -9.177753448486328, + "step": 5418 + }, + { + "epoch": 0.49511192325262676, + "grad_norm": 3.15625, + "kl": 0.0, + "learning_rate": 5.09273277266338e-06, + "logits/chosen": 490860953.6, + "logits/rejected": 249532160.0, + "logps/chosen": -286.96201171875, + "logps/rejected": -488.5826822916667, + "loss": 0.021, + "rewards/chosen": 3.5530517578125, + "rewards/margins": 15.986056772867837, + "rewards/rejected": -12.433005015055338, + "step": 5419 + }, + { + "epoch": 0.49520328917313844, + "grad_norm": 3.40625, + "kl": 0.0, + "learning_rate": 5.091295216312304e-06, + "logits/chosen": 651739989.3333334, + "logits/rejected": 370411776.0, + "logps/chosen": -275.1916910807292, + "logps/rejected": -367.10272216796875, + "loss": 0.0275, + "rewards/chosen": 3.5310948689778647, + "rewards/margins": 11.98697026570638, + "rewards/rejected": -8.455875396728516, + "step": 5420 + }, + { + "epoch": 0.49529465509365006, + "grad_norm": 3.671875, + "kl": 0.0, + "learning_rate": 5.089857652411961e-06, + "logits/chosen": 434091136.0, + "logits/rejected": 497892147.2, + "logps/chosen": -289.8437093098958, + "logps/rejected": -580.14951171875, + "loss": 0.0186, + "rewards/chosen": 3.2032833099365234, + "rewards/margins": 12.981748580932617, + "rewards/rejected": -9.778465270996094, + "step": 5421 + }, + { + "epoch": 0.49538602101416174, + "grad_norm": 3.3125, + "kl": 0.0, + "learning_rate": 5.08842008108122e-06, + "logits/chosen": 771120486.4, + "logits/rejected": 1156943445.3333333, + "logps/chosen": -334.56875, + "logps/rejected": -461.7256266276042, + "loss": 0.0245, + "rewards/chosen": 3.28817138671875, + "rewards/margins": 13.0237429300944, + "rewards/rejected": -9.73557154337565, + "step": 5422 + }, + { + "epoch": 0.49547738693467336, + "grad_norm": 4.125, + "kl": 0.0, + "learning_rate": 5.086982502438959e-06, + "logits/chosen": 514382438.4, + "logits/rejected": 963831125.3333334, + "logps/chosen": -304.2896728515625, + "logps/rejected": -558.6986490885416, + "loss": 0.0306, + "rewards/chosen": 3.5034206390380858, + "rewards/margins": 12.935436375935872, + "rewards/rejected": -9.432015736897787, + "step": 5423 + }, + { + "epoch": 0.49556875285518504, + "grad_norm": 3.265625, + "kl": 0.0, + "learning_rate": 5.08554491660405e-06, + "logits/chosen": 652813226.6666666, + "logits/rejected": 403457177.6, + "logps/chosen": -498.4564615885417, + "logps/rejected": -377.486865234375, + "loss": 0.0157, + "rewards/chosen": 3.283906936645508, + "rewards/margins": 11.838725662231445, + "rewards/rejected": -8.554818725585937, + "step": 5424 + }, + { + "epoch": 0.49566011877569666, + "grad_norm": 3.359375, + "kl": 0.0, + "learning_rate": 5.08410732369537e-06, + "logits/chosen": 527836467.2, + "logits/rejected": 483075029.3333333, + "logps/chosen": -280.9625, + "logps/rejected": -411.7587890625, + "loss": 0.0244, + "rewards/chosen": 3.445661163330078, + "rewards/margins": 11.822213745117187, + "rewards/rejected": -8.37655258178711, + "step": 5425 + }, + { + "epoch": 0.49575148469620833, + "grad_norm": 8.8125, + "kl": 0.0, + "learning_rate": 5.082669723831793e-06, + "logits/chosen": 745960128.0, + "logits/rejected": 711308653.7142857, + "logps/chosen": -432.4884033203125, + "logps/rejected": -490.4608677455357, + "loss": 0.029, + "rewards/chosen": 3.516977071762085, + "rewards/margins": 11.337620905467443, + "rewards/rejected": -7.820643833705357, + "step": 5426 + }, + { + "epoch": 0.49584285061671995, + "grad_norm": 2.0, + "kl": 0.0, + "learning_rate": 5.0812321171321965e-06, + "logits/chosen": 806852266.6666666, + "logits/rejected": 790434560.0, + "logps/chosen": -435.0929768880208, + "logps/rejected": -642.9853515625, + "loss": 0.0134, + "rewards/chosen": 4.393394470214844, + "rewards/margins": 14.663107872009277, + "rewards/rejected": -10.269713401794434, + "step": 5427 + }, + { + "epoch": 0.49593421653723163, + "grad_norm": 3.125, + "kl": 0.0, + "learning_rate": 5.079794503715455e-06, + "logits/chosen": 385058304.0, + "logits/rejected": 498816614.4, + "logps/chosen": -215.363525390625, + "logps/rejected": -502.54580078125, + "loss": 0.0207, + "rewards/chosen": 2.90751043955485, + "rewards/margins": 11.014178530375162, + "rewards/rejected": -8.106668090820312, + "step": 5428 + }, + { + "epoch": 0.49602558245774325, + "grad_norm": 2.296875, + "kl": 0.0, + "learning_rate": 5.07835688370045e-06, + "logits/chosen": 1253255680.0, + "logits/rejected": 973148569.6, + "logps/chosen": -463.9608154296875, + "logps/rejected": -892.10322265625, + "loss": 0.0133, + "rewards/chosen": 3.3339064915974936, + "rewards/margins": 15.911972745259604, + "rewards/rejected": -12.57806625366211, + "step": 5429 + }, + { + "epoch": 0.49611694837825493, + "grad_norm": 2.4375, + "kl": 0.0, + "learning_rate": 5.076919257206053e-06, + "logits/chosen": 641356480.0, + "logits/rejected": 517781942.85714287, + "logps/chosen": -251.2061004638672, + "logps/rejected": -402.39756556919644, + "loss": 0.0058, + "rewards/chosen": 3.768019199371338, + "rewards/margins": 11.752418177468435, + "rewards/rejected": -7.984398978097098, + "step": 5430 + }, + { + "epoch": 0.49620831429876655, + "grad_norm": 1.890625, + "kl": 0.0, + "learning_rate": 5.0754816243511496e-06, + "logits/chosen": 769852672.0, + "logits/rejected": 816891328.0, + "logps/chosen": -408.2239176432292, + "logps/rejected": -871.7772827148438, + "loss": 0.0128, + "rewards/chosen": 4.333277702331543, + "rewards/margins": 15.448349952697754, + "rewards/rejected": -11.115072250366211, + "step": 5431 + }, + { + "epoch": 0.4962996802192782, + "grad_norm": 28.875, + "kl": 0.0, + "learning_rate": 5.074043985254613e-06, + "logits/chosen": 493046592.0, + "logits/rejected": 303169728.0, + "logps/chosen": -252.20521545410156, + "logps/rejected": -211.05247497558594, + "loss": 0.0554, + "rewards/chosen": 3.3082275390625, + "rewards/margins": 9.596496105194092, + "rewards/rejected": -6.288268566131592, + "step": 5432 + }, + { + "epoch": 0.49639104613978985, + "grad_norm": 0.88671875, + "kl": 0.0, + "learning_rate": 5.0726063400353256e-06, + "logits/chosen": 811059541.3333334, + "logits/rejected": 844157081.6, + "logps/chosen": -499.9794921875, + "logps/rejected": -584.14052734375, + "loss": 0.0041, + "rewards/chosen": 4.917805035909017, + "rewards/margins": 14.969657452901203, + "rewards/rejected": -10.051852416992187, + "step": 5433 + }, + { + "epoch": 0.4964824120603015, + "grad_norm": 5.09375, + "kl": 0.0, + "learning_rate": 5.0711686888121654e-06, + "logits/chosen": 881931178.6666666, + "logits/rejected": 523009408.0, + "logps/chosen": -297.4486897786458, + "logps/rejected": -472.34222412109375, + "loss": 0.0483, + "rewards/chosen": 2.823205312093099, + "rewards/margins": 12.348103841145834, + "rewards/rejected": -9.524898529052734, + "step": 5434 + }, + { + "epoch": 0.49657377798081315, + "grad_norm": 5.0625, + "kl": 0.0, + "learning_rate": 5.069731031704016e-06, + "logits/chosen": 644919509.3333334, + "logits/rejected": 625056256.0, + "logps/chosen": -306.48122151692706, + "logps/rejected": -732.7462890625, + "loss": 0.0209, + "rewards/chosen": 3.1906744639078775, + "rewards/margins": 13.922231165568034, + "rewards/rejected": -10.731556701660157, + "step": 5435 + }, + { + "epoch": 0.4966651439013248, + "grad_norm": 1.4453125, + "kl": 0.0, + "learning_rate": 5.068293368829755e-06, + "logits/chosen": 1264015104.0, + "logits/rejected": 759132608.0, + "logps/chosen": -579.1464233398438, + "logps/rejected": -855.223876953125, + "loss": 0.0078, + "rewards/chosen": 4.346066474914551, + "rewards/margins": 17.44705295562744, + "rewards/rejected": -13.10098648071289, + "step": 5436 + }, + { + "epoch": 0.49675650982183644, + "grad_norm": 1.8203125, + "kl": 0.0, + "learning_rate": 5.066855700308268e-06, + "logits/chosen": 535880661.3333333, + "logits/rejected": 585478016.0, + "logps/chosen": -286.40757242838544, + "logps/rejected": -624.4095458984375, + "loss": 0.0117, + "rewards/chosen": 4.56166394551595, + "rewards/margins": 14.656855901082356, + "rewards/rejected": -10.095191955566406, + "step": 5437 + }, + { + "epoch": 0.4968478757423481, + "grad_norm": 3.40625, + "kl": 0.0, + "learning_rate": 5.06541802625843e-06, + "logits/chosen": 842503168.0, + "logits/rejected": 673482752.0, + "logps/chosen": -284.0099853515625, + "logps/rejected": -581.9219970703125, + "loss": 0.0327, + "rewards/chosen": 3.0801712036132813, + "rewards/margins": 14.306399281819662, + "rewards/rejected": -11.22622807820638, + "step": 5438 + }, + { + "epoch": 0.49693924166285974, + "grad_norm": 4.46875, + "kl": 0.0, + "learning_rate": 5.06398034679913e-06, + "logits/chosen": 471535168.0, + "logits/rejected": 343987424.0, + "logps/chosen": -258.9710998535156, + "logps/rejected": -510.49444580078125, + "loss": 0.0375, + "rewards/chosen": 2.7249226570129395, + "rewards/margins": 14.288336277008057, + "rewards/rejected": -11.563413619995117, + "step": 5439 + }, + { + "epoch": 0.4970306075833714, + "grad_norm": 40.75, + "kl": 0.0, + "learning_rate": 5.062542662049247e-06, + "logits/chosen": 413043925.3333333, + "logits/rejected": 376465817.6, + "logps/chosen": -214.31526692708334, + "logps/rejected": -374.758544921875, + "loss": 0.1004, + "rewards/chosen": 2.706002871195475, + "rewards/margins": 11.148114840189615, + "rewards/rejected": -8.44211196899414, + "step": 5440 + }, + { + "epoch": 0.49712197350388304, + "grad_norm": 2.34375, + "kl": 0.0, + "learning_rate": 5.061104972127666e-06, + "logits/chosen": 556067968.0, + "logits/rejected": 663749760.0, + "logps/chosen": -280.7811584472656, + "logps/rejected": -789.717529296875, + "loss": 0.0147, + "rewards/chosen": 3.676968574523926, + "rewards/margins": 13.020197868347168, + "rewards/rejected": -9.343229293823242, + "step": 5441 + }, + { + "epoch": 0.4972133394243947, + "grad_norm": 16.0, + "kl": 0.0, + "learning_rate": 5.0596672771532696e-06, + "logits/chosen": 905146368.0, + "logits/rejected": 456335744.0, + "logps/chosen": -128.2867889404297, + "logps/rejected": -518.9549153645834, + "loss": 0.0269, + "rewards/chosen": 3.857870578765869, + "rewards/margins": 11.44656292597453, + "rewards/rejected": -7.588692347208659, + "step": 5442 + }, + { + "epoch": 0.49730470534490634, + "grad_norm": 2.921875, + "kl": 0.0, + "learning_rate": 5.058229577244942e-06, + "logits/chosen": 631736448.0, + "logits/rejected": 701677760.0, + "logps/chosen": -371.9018249511719, + "logps/rejected": -454.975830078125, + "loss": 0.0162, + "rewards/chosen": 3.7623977661132812, + "rewards/margins": 14.463787078857422, + "rewards/rejected": -10.70138931274414, + "step": 5443 + }, + { + "epoch": 0.497396071265418, + "grad_norm": 0.76953125, + "kl": 0.0, + "learning_rate": 5.056791872521568e-06, + "logits/chosen": 425683114.6666667, + "logits/rejected": 648933785.6, + "logps/chosen": -364.5904134114583, + "logps/rejected": -327.21328125, + "loss": 0.0039, + "rewards/chosen": 4.951443672180176, + "rewards/margins": 13.568892097473144, + "rewards/rejected": -8.617448425292968, + "step": 5444 + }, + { + "epoch": 0.49748743718592964, + "grad_norm": 1.7890625, + "kl": 0.0, + "learning_rate": 5.055354163102032e-06, + "logits/chosen": 658968166.4, + "logits/rejected": 510374656.0, + "logps/chosen": -332.397705078125, + "logps/rejected": -443.6180826822917, + "loss": 0.0139, + "rewards/chosen": 4.091712188720703, + "rewards/margins": 12.952371470133464, + "rewards/rejected": -8.86065928141276, + "step": 5445 + }, + { + "epoch": 0.4975788031064413, + "grad_norm": 2.125, + "kl": 0.0, + "learning_rate": 5.053916449105219e-06, + "logits/chosen": 502208544.0, + "logits/rejected": 1214030336.0, + "logps/chosen": -355.47674560546875, + "logps/rejected": -689.7811279296875, + "loss": 0.0107, + "rewards/chosen": 4.845688819885254, + "rewards/margins": 15.45029354095459, + "rewards/rejected": -10.604604721069336, + "step": 5446 + }, + { + "epoch": 0.49767016902695294, + "grad_norm": 1.0, + "kl": 0.0, + "learning_rate": 5.0524787306500165e-06, + "logits/chosen": 748751018.6666666, + "logits/rejected": 755651379.2, + "logps/chosen": -293.322265625, + "logps/rejected": -438.65810546875, + "loss": 0.0051, + "rewards/chosen": 4.542152404785156, + "rewards/margins": 14.648128509521484, + "rewards/rejected": -10.105976104736328, + "step": 5447 + }, + { + "epoch": 0.4977615349474646, + "grad_norm": 1.8515625, + "kl": 0.0, + "learning_rate": 5.051041007855308e-06, + "logits/chosen": 470200480.0, + "logits/rejected": 584473984.0, + "logps/chosen": -386.7332763671875, + "logps/rejected": -433.9996337890625, + "loss": 0.0101, + "rewards/chosen": 4.237594127655029, + "rewards/margins": 13.116651058197021, + "rewards/rejected": -8.879056930541992, + "step": 5448 + }, + { + "epoch": 0.49785290086797623, + "grad_norm": 1.2265625, + "kl": 0.0, + "learning_rate": 5.049603280839982e-06, + "logits/chosen": 471582208.0, + "logits/rejected": 507519283.2, + "logps/chosen": -198.46614583333334, + "logps/rejected": -411.63603515625, + "loss": 0.0115, + "rewards/chosen": 3.5765660603841147, + "rewards/margins": 12.719016774495444, + "rewards/rejected": -9.142450714111328, + "step": 5449 + }, + { + "epoch": 0.4979442667884879, + "grad_norm": 1.7421875, + "kl": 0.0, + "learning_rate": 5.0481655497229245e-06, + "logits/chosen": 602637260.8, + "logits/rejected": 464101205.3333333, + "logps/chosen": -346.1502685546875, + "logps/rejected": -760.8876953125, + "loss": 0.0134, + "rewards/chosen": 4.092953109741211, + "rewards/margins": 14.05037473042806, + "rewards/rejected": -9.95742162068685, + "step": 5450 + }, + { + "epoch": 0.49803563270899953, + "grad_norm": 4.96875, + "kl": 0.0, + "learning_rate": 5.046727814623022e-06, + "logits/chosen": 516550400.0, + "logits/rejected": 513862528.0, + "logps/chosen": -273.118408203125, + "logps/rejected": -516.3072102864584, + "loss": 0.0327, + "rewards/chosen": 3.6977108001708983, + "rewards/margins": 12.432686614990235, + "rewards/rejected": -8.734975814819336, + "step": 5451 + }, + { + "epoch": 0.4981269986295112, + "grad_norm": 3.5, + "kl": 0.0, + "learning_rate": 5.045290075659163e-06, + "logits/chosen": 348641280.0, + "logits/rejected": 372113056.0, + "logps/chosen": -364.8356628417969, + "logps/rejected": -455.9609680175781, + "loss": 0.0172, + "rewards/chosen": 4.029764652252197, + "rewards/margins": 14.00883150100708, + "rewards/rejected": -9.979066848754883, + "step": 5452 + }, + { + "epoch": 0.49821836455002283, + "grad_norm": 2.59375, + "kl": 0.0, + "learning_rate": 5.043852332950233e-06, + "logits/chosen": 659874176.0, + "logits/rejected": 802094080.0, + "logps/chosen": -445.3255310058594, + "logps/rejected": -531.7547200520834, + "loss": 0.0107, + "rewards/chosen": 3.392230272293091, + "rewards/margins": 11.8507289091746, + "rewards/rejected": -8.45849863688151, + "step": 5453 + }, + { + "epoch": 0.4983097304705345, + "grad_norm": 9.375, + "kl": 0.0, + "learning_rate": 5.042414586615122e-06, + "logits/chosen": 479862674.28571427, + "logits/rejected": 1098382464.0, + "logps/chosen": -351.80758231026783, + "logps/rejected": -86.71878051757812, + "loss": 0.1084, + "rewards/chosen": 3.092708042689732, + "rewards/margins": 3.014944671520165, + "rewards/rejected": 0.07776337116956711, + "step": 5454 + }, + { + "epoch": 0.49840109639104613, + "grad_norm": 3.421875, + "kl": 0.0, + "learning_rate": 5.040976836772716e-06, + "logits/chosen": 694914457.6, + "logits/rejected": 378379776.0, + "logps/chosen": -396.1919921875, + "logps/rejected": -620.04248046875, + "loss": 0.1449, + "rewards/chosen": 1.7440853118896484, + "rewards/margins": 9.796814600626627, + "rewards/rejected": -8.052729288736979, + "step": 5455 + }, + { + "epoch": 0.4984924623115578, + "grad_norm": 0.875, + "kl": 0.0, + "learning_rate": 5.039539083541908e-06, + "logits/chosen": 719952128.0, + "logits/rejected": 520556501.3333333, + "logps/chosen": -302.6820068359375, + "logps/rejected": -612.0306803385416, + "loss": 0.003, + "rewards/chosen": 4.542470455169678, + "rewards/margins": 17.210886796315513, + "rewards/rejected": -12.668416341145834, + "step": 5456 + }, + { + "epoch": 0.4985838282320694, + "grad_norm": 2.859375, + "kl": 0.0, + "learning_rate": 5.038101327041582e-06, + "logits/chosen": 798185536.0, + "logits/rejected": 1000476352.0, + "logps/chosen": -275.51824951171875, + "logps/rejected": -410.91357421875, + "loss": 0.024, + "rewards/chosen": 3.1253128051757812, + "rewards/margins": 13.363748550415039, + "rewards/rejected": -10.238435745239258, + "step": 5457 + }, + { + "epoch": 0.4986751941525811, + "grad_norm": 3.84375, + "kl": 0.0, + "learning_rate": 5.0366635673906305e-06, + "logits/chosen": 684662997.3333334, + "logits/rejected": 373355110.4, + "logps/chosen": -525.06884765625, + "logps/rejected": -409.678125, + "loss": 0.0193, + "rewards/chosen": 3.2718070348103843, + "rewards/margins": 12.356461270650229, + "rewards/rejected": -9.084654235839844, + "step": 5458 + }, + { + "epoch": 0.4987665600730927, + "grad_norm": 2.984375, + "kl": 0.0, + "learning_rate": 5.035225804707941e-06, + "logits/chosen": 687897941.3333334, + "logits/rejected": 722335232.0, + "logps/chosen": -552.3165283203125, + "logps/rejected": -444.152783203125, + "loss": 0.0142, + "rewards/chosen": 3.6319878896077475, + "rewards/margins": 12.210566075642904, + "rewards/rejected": -8.578578186035156, + "step": 5459 + }, + { + "epoch": 0.4988579259936044, + "grad_norm": 1.2734375, + "kl": 0.0, + "learning_rate": 5.033788039112404e-06, + "logits/chosen": 354722090.6666667, + "logits/rejected": 327058380.8, + "logps/chosen": -168.5379638671875, + "logps/rejected": -684.396630859375, + "loss": 0.0095, + "rewards/chosen": 4.189186414082845, + "rewards/margins": 14.858249982198078, + "rewards/rejected": -10.669063568115234, + "step": 5460 + }, + { + "epoch": 0.498949291914116, + "grad_norm": 8.6875, + "kl": 0.0, + "learning_rate": 5.03235027072291e-06, + "logits/chosen": 393791200.0, + "logits/rejected": 523567744.0, + "logps/chosen": -241.80316162109375, + "logps/rejected": -534.614990234375, + "loss": 0.1175, + "rewards/chosen": 2.6367297172546387, + "rewards/margins": 11.710561275482178, + "rewards/rejected": -9.073831558227539, + "step": 5461 + }, + { + "epoch": 0.4990406578346277, + "grad_norm": 4.21875, + "kl": 0.0, + "learning_rate": 5.030912499658347e-06, + "logits/chosen": 789400960.0, + "logits/rejected": 693314304.0, + "logps/chosen": -491.0408935546875, + "logps/rejected": -457.9169921875, + "loss": 0.0153, + "rewards/chosen": 3.1356735229492188, + "rewards/margins": 10.999238967895508, + "rewards/rejected": -7.863565444946289, + "step": 5462 + }, + { + "epoch": 0.4991320237551393, + "grad_norm": 1.4765625, + "kl": 0.0, + "learning_rate": 5.029474726037607e-06, + "logits/chosen": 1339957888.0, + "logits/rejected": 721018112.0, + "logps/chosen": -489.4862365722656, + "logps/rejected": -525.7689208984375, + "loss": 0.0103, + "rewards/chosen": 4.085245609283447, + "rewards/margins": 13.957369327545166, + "rewards/rejected": -9.872123718261719, + "step": 5463 + }, + { + "epoch": 0.499223389675651, + "grad_norm": 3.0, + "kl": 0.0, + "learning_rate": 5.028036949979583e-06, + "logits/chosen": 789660825.6, + "logits/rejected": 493117952.0, + "logps/chosen": -354.8205322265625, + "logps/rejected": -463.8959147135417, + "loss": 0.015, + "rewards/chosen": 4.3278045654296875, + "rewards/margins": 14.385986963907877, + "rewards/rejected": -10.05818239847819, + "step": 5464 + }, + { + "epoch": 0.4993147555961626, + "grad_norm": 46.75, + "kl": 0.0, + "learning_rate": 5.02659917160316e-06, + "logits/chosen": 491996057.6, + "logits/rejected": 184412373.33333334, + "logps/chosen": -279.55400390625, + "logps/rejected": -206.9423828125, + "loss": 0.0388, + "rewards/chosen": 4.471352386474609, + "rewards/margins": 10.118597793579102, + "rewards/rejected": -5.647245407104492, + "step": 5465 + }, + { + "epoch": 0.4994061215166743, + "grad_norm": 1.90625, + "kl": 0.0, + "learning_rate": 5.025161391027232e-06, + "logits/chosen": 1135163187.2, + "logits/rejected": 739879936.0, + "logps/chosen": -276.7830078125, + "logps/rejected": -292.53892008463544, + "loss": 0.0129, + "rewards/chosen": 4.105921936035156, + "rewards/margins": 11.566071065266927, + "rewards/rejected": -7.4601491292317705, + "step": 5466 + }, + { + "epoch": 0.4994974874371859, + "grad_norm": 1.453125, + "kl": 0.0, + "learning_rate": 5.023723608370691e-06, + "logits/chosen": 589233194.6666666, + "logits/rejected": 653923020.8, + "logps/chosen": -281.20554606119794, + "logps/rejected": -433.986669921875, + "loss": 0.0105, + "rewards/chosen": 3.590529441833496, + "rewards/margins": 12.361274528503419, + "rewards/rejected": -8.770745086669923, + "step": 5467 + }, + { + "epoch": 0.4995888533576976, + "grad_norm": 0.62890625, + "kl": 0.0, + "learning_rate": 5.022285823752426e-06, + "logits/chosen": 279114304.0, + "logits/rejected": 365210777.6, + "logps/chosen": -264.95652262369794, + "logps/rejected": -451.8244140625, + "loss": 0.0035, + "rewards/chosen": 5.07012144724528, + "rewards/margins": 14.176746050516766, + "rewards/rejected": -9.106624603271484, + "step": 5468 + }, + { + "epoch": 0.4996802192782092, + "grad_norm": 3.703125, + "kl": 0.0, + "learning_rate": 5.020848037291332e-06, + "logits/chosen": 552784576.0, + "logits/rejected": 433721984.0, + "logps/chosen": -393.0338134765625, + "logps/rejected": -517.5908203125, + "loss": 0.0178, + "rewards/chosen": 4.241298675537109, + "rewards/margins": 14.717557907104492, + "rewards/rejected": -10.476259231567383, + "step": 5469 + }, + { + "epoch": 0.4997715851987209, + "grad_norm": 3.703125, + "kl": 0.0, + "learning_rate": 5.019410249106297e-06, + "logits/chosen": 487919923.2, + "logits/rejected": 250437376.0, + "logps/chosen": -267.9065673828125, + "logps/rejected": -405.7261149088542, + "loss": 0.0317, + "rewards/chosen": 3.293932342529297, + "rewards/margins": 10.177364730834961, + "rewards/rejected": -6.883432388305664, + "step": 5470 + }, + { + "epoch": 0.4998629511192325, + "grad_norm": 1.375, + "kl": 0.0, + "learning_rate": 5.0179724593162146e-06, + "logits/chosen": 478144960.0, + "logits/rejected": 434358893.71428573, + "logps/chosen": -221.98358154296875, + "logps/rejected": -509.50802176339283, + "loss": 0.0056, + "rewards/chosen": 3.120626926422119, + "rewards/margins": 12.135643209729876, + "rewards/rejected": -9.015016283307757, + "step": 5471 + }, + { + "epoch": 0.4999543170397442, + "grad_norm": 1.0546875, + "kl": 0.0, + "learning_rate": 5.016534668039976e-06, + "logits/chosen": 530200512.0, + "logits/rejected": 560184000.0, + "logps/chosen": -410.9021301269531, + "logps/rejected": -596.7879638671875, + "loss": 0.0042, + "rewards/chosen": 5.169556617736816, + "rewards/margins": 15.302693367004395, + "rewards/rejected": -10.133136749267578, + "step": 5472 + }, + { + "epoch": 0.5000456829602559, + "grad_norm": 3.84375, + "kl": 0.0, + "learning_rate": 5.015096875396474e-06, + "logits/chosen": 395327786.6666667, + "logits/rejected": 538944896.0, + "logps/chosen": -224.39701334635416, + "logps/rejected": -644.2719116210938, + "loss": 0.0348, + "rewards/chosen": 3.729302088419596, + "rewards/margins": 13.805667559305826, + "rewards/rejected": -10.07636547088623, + "step": 5473 + }, + { + "epoch": 0.5001370488807675, + "grad_norm": 3.265625, + "kl": 0.0, + "learning_rate": 5.0136590815046005e-06, + "logits/chosen": 446396006.4, + "logits/rejected": 241050794.66666666, + "logps/chosen": -339.61220703125, + "logps/rejected": -399.7394612630208, + "loss": 0.0288, + "rewards/chosen": 3.3635818481445314, + "rewards/margins": 13.38582026163737, + "rewards/rejected": -10.022238413492838, + "step": 5474 + }, + { + "epoch": 0.5002284148012791, + "grad_norm": 5.6875, + "kl": 0.0, + "learning_rate": 5.012221286483246e-06, + "logits/chosen": 428549162.6666667, + "logits/rejected": 924197952.0, + "logps/chosen": -250.2933349609375, + "logps/rejected": -256.7949523925781, + "loss": 0.0372, + "rewards/chosen": 4.205356915791829, + "rewards/margins": 11.342319806416828, + "rewards/rejected": -7.136962890625, + "step": 5475 + }, + { + "epoch": 0.5003197807217907, + "grad_norm": 5.09375, + "kl": 0.0, + "learning_rate": 5.010783490451306e-06, + "logits/chosen": 642318016.0, + "logits/rejected": 521515744.0, + "logps/chosen": -295.15655517578125, + "logps/rejected": -414.8359375, + "loss": 0.0459, + "rewards/chosen": 2.903794765472412, + "rewards/margins": 11.98531198501587, + "rewards/rejected": -9.081517219543457, + "step": 5476 + }, + { + "epoch": 0.5004111466423025, + "grad_norm": 1.546875, + "kl": 0.0, + "learning_rate": 5.009345693527672e-06, + "logits/chosen": 435137877.3333333, + "logits/rejected": 516628121.6, + "logps/chosen": -343.7852783203125, + "logps/rejected": -431.79853515625, + "loss": 0.0086, + "rewards/chosen": 3.955591837565104, + "rewards/margins": 13.266019694010415, + "rewards/rejected": -9.310427856445312, + "step": 5477 + }, + { + "epoch": 0.5005025125628141, + "grad_norm": 4.0625, + "kl": 0.0, + "learning_rate": 5.007907895831235e-06, + "logits/chosen": 558468044.8, + "logits/rejected": 717808981.3333334, + "logps/chosen": -192.9289306640625, + "logps/rejected": -672.2470703125, + "loss": 0.0276, + "rewards/chosen": 3.7109115600585936, + "rewards/margins": 11.646232732137044, + "rewards/rejected": -7.93532117207845, + "step": 5478 + }, + { + "epoch": 0.5005938784833257, + "grad_norm": 13.8125, + "kl": 0.0, + "learning_rate": 5.006470097480887e-06, + "logits/chosen": 396991104.0, + "logits/rejected": 343075424.0, + "logps/chosen": -380.17413330078125, + "logps/rejected": -447.90850830078125, + "loss": 0.0225, + "rewards/chosen": 3.6150593757629395, + "rewards/margins": 11.436429500579834, + "rewards/rejected": -7.8213701248168945, + "step": 5479 + }, + { + "epoch": 0.5006852444038373, + "grad_norm": 2.28125, + "kl": 0.0, + "learning_rate": 5.005032298595524e-06, + "logits/chosen": 562147123.2, + "logits/rejected": 747177301.3333334, + "logps/chosen": -333.606201171875, + "logps/rejected": -607.2246500651041, + "loss": 0.0159, + "rewards/chosen": 4.147359466552734, + "rewards/margins": 14.029593404134115, + "rewards/rejected": -9.88223393758138, + "step": 5480 + }, + { + "epoch": 0.5007766103243491, + "grad_norm": 3.953125, + "kl": 0.0, + "learning_rate": 5.003594499294034e-06, + "logits/chosen": 347393996.8, + "logits/rejected": 234679722.66666666, + "logps/chosen": -229.1732421875, + "logps/rejected": -258.16074625651044, + "loss": 0.1361, + "rewards/chosen": 2.2861690521240234, + "rewards/margins": 8.773260752360027, + "rewards/rejected": -6.487091700236003, + "step": 5481 + }, + { + "epoch": 0.5008679762448607, + "grad_norm": 2.390625, + "kl": 3.042074203491211, + "learning_rate": 5.002156699695314e-06, + "logits/chosen": 308602773.3333333, + "logits/rejected": 308457120.0, + "logps/chosen": -291.7379557291667, + "logps/rejected": -346.1661376953125, + "loss": 0.0148, + "rewards/chosen": 4.988670984903972, + "rewards/margins": 14.03720537821452, + "rewards/rejected": -9.048534393310547, + "step": 5482 + }, + { + "epoch": 0.5009593421653723, + "grad_norm": 1.296875, + "kl": 0.0, + "learning_rate": 5.0007188999182535e-06, + "logits/chosen": 907961472.0, + "logits/rejected": 731605952.0, + "logps/chosen": -395.30828857421875, + "logps/rejected": -723.9701538085938, + "loss": 0.0116, + "rewards/chosen": 4.232239246368408, + "rewards/margins": 14.15409803390503, + "rewards/rejected": -9.921858787536621, + "step": 5483 + }, + { + "epoch": 0.5010507080858839, + "grad_norm": 1.8125, + "kl": 0.0, + "learning_rate": 4.999281100081748e-06, + "logits/chosen": 758746560.0, + "logits/rejected": 521909845.3333333, + "logps/chosen": -207.81797790527344, + "logps/rejected": -520.9411214192709, + "loss": 0.0103, + "rewards/chosen": 3.181993246078491, + "rewards/margins": 12.405210256576538, + "rewards/rejected": -9.223217010498047, + "step": 5484 + }, + { + "epoch": 0.5011420740063957, + "grad_norm": 2.515625, + "kl": 0.0, + "learning_rate": 4.997843300304688e-06, + "logits/chosen": 565473996.8, + "logits/rejected": 770170709.3333334, + "logps/chosen": -338.18583984375, + "logps/rejected": -825.9175618489584, + "loss": 0.0151, + "rewards/chosen": 4.381341934204102, + "rewards/margins": 13.076674779256185, + "rewards/rejected": -8.695332845052084, + "step": 5485 + }, + { + "epoch": 0.5012334399269073, + "grad_norm": 3.125, + "kl": 0.0, + "learning_rate": 4.996405500705967e-06, + "logits/chosen": 760792746.6666666, + "logits/rejected": 652063436.8, + "logps/chosen": -459.4110921223958, + "logps/rejected": -661.096875, + "loss": 0.0158, + "rewards/chosen": 3.292701085408529, + "rewards/margins": 13.421132787068686, + "rewards/rejected": -10.128431701660157, + "step": 5486 + }, + { + "epoch": 0.5013248058474189, + "grad_norm": 1.6875, + "kl": 0.0, + "learning_rate": 4.994967701404478e-06, + "logits/chosen": 608006272.0, + "logits/rejected": 573474133.3333334, + "logps/chosen": -361.24407958984375, + "logps/rejected": -607.5104166666666, + "loss": 0.0079, + "rewards/chosen": 3.4542860984802246, + "rewards/margins": 14.205436865488688, + "rewards/rejected": -10.751150767008463, + "step": 5487 + }, + { + "epoch": 0.5014161717679305, + "grad_norm": 1.890625, + "kl": 0.0, + "learning_rate": 4.993529902519114e-06, + "logits/chosen": 544225382.4, + "logits/rejected": 836460885.3333334, + "logps/chosen": -296.434375, + "logps/rejected": -294.9369710286458, + "loss": 0.0123, + "rewards/chosen": 4.222883987426758, + "rewards/margins": 12.273239262898763, + "rewards/rejected": -8.050355275472006, + "step": 5488 + }, + { + "epoch": 0.5015075376884423, + "grad_norm": 1.6796875, + "kl": 0.0, + "learning_rate": 4.992092104168766e-06, + "logits/chosen": 464632576.0, + "logits/rejected": 354650752.0, + "logps/chosen": -300.5286865234375, + "logps/rejected": -654.8309936523438, + "loss": 0.0116, + "rewards/chosen": 4.512617429097493, + "rewards/margins": 16.95101769765218, + "rewards/rejected": -12.438400268554688, + "step": 5489 + }, + { + "epoch": 0.5015989036089539, + "grad_norm": 2.09375, + "kl": 0.0, + "learning_rate": 4.99065430647233e-06, + "logits/chosen": 615835776.0, + "logits/rejected": 687784192.0, + "logps/chosen": -264.7966003417969, + "logps/rejected": -558.6956380208334, + "loss": 0.0098, + "rewards/chosen": 3.2836570739746094, + "rewards/margins": 11.948953628540039, + "rewards/rejected": -8.66529655456543, + "step": 5490 + }, + { + "epoch": 0.5016902695294655, + "grad_norm": 2.28125, + "kl": 0.0, + "learning_rate": 4.9892165095486945e-06, + "logits/chosen": 1003681280.0, + "logits/rejected": 558988390.4, + "logps/chosen": -367.196044921875, + "logps/rejected": -406.239501953125, + "loss": 0.0096, + "rewards/chosen": 4.365035057067871, + "rewards/margins": 15.117275810241699, + "rewards/rejected": -10.752240753173828, + "step": 5491 + }, + { + "epoch": 0.5017816354499771, + "grad_norm": 2.203125, + "kl": 0.0, + "learning_rate": 4.987778713516753e-06, + "logits/chosen": 696959552.0, + "logits/rejected": 680978176.0, + "logps/chosen": -310.904296875, + "logps/rejected": -799.7421875, + "loss": 0.0133, + "rewards/chosen": 3.9212098121643066, + "rewards/margins": 15.240221500396729, + "rewards/rejected": -11.319011688232422, + "step": 5492 + }, + { + "epoch": 0.5018730013704888, + "grad_norm": 2.609375, + "kl": 0.0, + "learning_rate": 4.9863409184954e-06, + "logits/chosen": 568736384.0, + "logits/rejected": 941436800.0, + "logps/chosen": -250.21725463867188, + "logps/rejected": -426.3273010253906, + "loss": 0.0173, + "rewards/chosen": 3.9093809127807617, + "rewards/margins": 11.899297714233398, + "rewards/rejected": -7.989916801452637, + "step": 5493 + }, + { + "epoch": 0.5019643672910005, + "grad_norm": 2.796875, + "kl": 0.0, + "learning_rate": 4.9849031246035265e-06, + "logits/chosen": 706320213.3333334, + "logits/rejected": 362173184.0, + "logps/chosen": -291.89658610026044, + "logps/rejected": -429.967236328125, + "loss": 0.132, + "rewards/chosen": 1.0210429032643635, + "rewards/margins": 9.251777919133504, + "rewards/rejected": -8.23073501586914, + "step": 5494 + }, + { + "epoch": 0.5020557332115121, + "grad_norm": 3.484375, + "kl": 0.0, + "learning_rate": 4.983465331960025e-06, + "logits/chosen": 1176240128.0, + "logits/rejected": 698076288.0, + "logps/chosen": -429.95843505859375, + "logps/rejected": -674.098876953125, + "loss": 0.0221, + "rewards/chosen": 3.234835624694824, + "rewards/margins": 11.883400917053223, + "rewards/rejected": -8.648565292358398, + "step": 5495 + }, + { + "epoch": 0.5021470991320237, + "grad_norm": 1.34375, + "kl": 0.0, + "learning_rate": 4.982027540683785e-06, + "logits/chosen": 1030551424.0, + "logits/rejected": 419833920.0, + "logps/chosen": -202.30795288085938, + "logps/rejected": -495.17022705078125, + "loss": 0.024, + "rewards/chosen": 3.515758752822876, + "rewards/margins": 12.476545572280884, + "rewards/rejected": -8.960786819458008, + "step": 5496 + }, + { + "epoch": 0.5022384650525354, + "grad_norm": 2.5625, + "kl": 0.0, + "learning_rate": 4.980589750893704e-06, + "logits/chosen": 644223168.0, + "logits/rejected": 602981290.6666666, + "logps/chosen": -596.9644165039062, + "logps/rejected": -516.9239908854166, + "loss": 0.01, + "rewards/chosen": 3.2626495361328125, + "rewards/margins": 12.827540715535482, + "rewards/rejected": -9.56489117940267, + "step": 5497 + }, + { + "epoch": 0.5023298309730471, + "grad_norm": 4.59375, + "kl": 0.0, + "learning_rate": 4.979151962708671e-06, + "logits/chosen": 896755507.2, + "logits/rejected": 778480128.0, + "logps/chosen": -256.011669921875, + "logps/rejected": -380.6287841796875, + "loss": 0.0423, + "rewards/chosen": 2.883331298828125, + "rewards/margins": 11.159748077392578, + "rewards/rejected": -8.276416778564453, + "step": 5498 + }, + { + "epoch": 0.5024211968935587, + "grad_norm": 5.09375, + "kl": 0.0, + "learning_rate": 4.977714176247576e-06, + "logits/chosen": 658282598.4, + "logits/rejected": 364612821.3333333, + "logps/chosen": -384.6918212890625, + "logps/rejected": -552.3683268229166, + "loss": 0.031, + "rewards/chosen": 3.283092498779297, + "rewards/margins": 11.993220647176107, + "rewards/rejected": -8.71012814839681, + "step": 5499 + }, + { + "epoch": 0.5025125628140703, + "grad_norm": 2.78125, + "kl": 0.0, + "learning_rate": 4.976276391629311e-06, + "logits/chosen": 515103530.6666667, + "logits/rejected": 249369728.0, + "logps/chosen": -366.6539713541667, + "logps/rejected": -680.6176147460938, + "loss": 0.017, + "rewards/chosen": 4.293382962544759, + "rewards/margins": 13.859410603841145, + "rewards/rejected": -9.566027641296387, + "step": 5500 + }, + { + "epoch": 0.502603928734582, + "grad_norm": 2.296875, + "kl": 0.0, + "learning_rate": 4.97483860897277e-06, + "logits/chosen": 613623744.0, + "logits/rejected": 989856704.0, + "logps/chosen": -358.8055114746094, + "logps/rejected": -563.7869873046875, + "loss": 0.0122, + "rewards/chosen": 3.912749767303467, + "rewards/margins": 14.73195505142212, + "rewards/rejected": -10.819205284118652, + "step": 5501 + }, + { + "epoch": 0.5026952946550937, + "grad_norm": 0.41796875, + "kl": 0.0, + "learning_rate": 4.973400828396842e-06, + "logits/chosen": 771554176.0, + "logits/rejected": 457800704.0, + "logps/chosen": -589.88623046875, + "logps/rejected": -392.85191127232144, + "loss": 0.0023, + "rewards/chosen": 5.09578275680542, + "rewards/margins": 13.743285655975342, + "rewards/rejected": -8.647502899169922, + "step": 5502 + }, + { + "epoch": 0.5027866605756053, + "grad_norm": 2.59375, + "kl": 0.0, + "learning_rate": 4.97196305002042e-06, + "logits/chosen": 689815449.6, + "logits/rejected": 541036629.3333334, + "logps/chosen": -390.48095703125, + "logps/rejected": -650.9338785807291, + "loss": 0.0161, + "rewards/chosen": 3.9159080505371096, + "rewards/margins": 13.684259287516277, + "rewards/rejected": -9.768351236979166, + "step": 5503 + }, + { + "epoch": 0.5028780264961169, + "grad_norm": 2.75, + "kl": 0.0, + "learning_rate": 4.9705252739623935e-06, + "logits/chosen": 448912281.6, + "logits/rejected": 374189440.0, + "logps/chosen": -232.115869140625, + "logps/rejected": -348.9837646484375, + "loss": 0.0214, + "rewards/chosen": 3.6207672119140626, + "rewards/margins": 12.529981231689453, + "rewards/rejected": -8.90921401977539, + "step": 5504 + }, + { + "epoch": 0.5029693924166286, + "grad_norm": 2.3125, + "kl": 0.0, + "learning_rate": 4.9690875003416545e-06, + "logits/chosen": 614409792.0, + "logits/rejected": 274527424.0, + "logps/chosen": -326.7455139160156, + "logps/rejected": -289.5227355957031, + "loss": 0.017, + "rewards/chosen": 3.430354356765747, + "rewards/margins": 13.049170732498169, + "rewards/rejected": -9.618816375732422, + "step": 5505 + }, + { + "epoch": 0.5030607583371403, + "grad_norm": 0.3515625, + "kl": 0.0, + "learning_rate": 4.9676497292770915e-06, + "logits/chosen": 155073888.0, + "logits/rejected": 383350381.71428573, + "logps/chosen": -64.61748504638672, + "logps/rejected": -437.05569893973217, + "loss": 0.0022, + "rewards/chosen": 4.2972564697265625, + "rewards/margins": 12.852140699114118, + "rewards/rejected": -8.554884229387556, + "step": 5506 + }, + { + "epoch": 0.5031521242576519, + "grad_norm": 1.765625, + "kl": 0.0, + "learning_rate": 4.966211960887597e-06, + "logits/chosen": 657325141.3333334, + "logits/rejected": 565749555.2, + "logps/chosen": -637.7415771484375, + "logps/rejected": -538.963671875, + "loss": 0.0085, + "rewards/chosen": 3.9394963582356772, + "rewards/margins": 12.825784810384116, + "rewards/rejected": -8.886288452148438, + "step": 5507 + }, + { + "epoch": 0.5032434901781635, + "grad_norm": 3.328125, + "kl": 0.0, + "learning_rate": 4.9647741952920595e-06, + "logits/chosen": 371755289.6, + "logits/rejected": 272360618.6666667, + "logps/chosen": -259.4068359375, + "logps/rejected": -496.0514322916667, + "loss": 0.0218, + "rewards/chosen": 3.90469970703125, + "rewards/margins": 13.641184234619141, + "rewards/rejected": -9.73648452758789, + "step": 5508 + }, + { + "epoch": 0.5033348560986752, + "grad_norm": 2.890625, + "kl": 0.0, + "learning_rate": 4.963336432609371e-06, + "logits/chosen": 1013771008.0, + "logits/rejected": 515145779.2, + "logps/chosen": -323.4538981119792, + "logps/rejected": -394.4494384765625, + "loss": 0.0166, + "rewards/chosen": 3.2178306579589844, + "rewards/margins": 11.508020782470703, + "rewards/rejected": -8.290190124511719, + "step": 5509 + }, + { + "epoch": 0.5034262220191869, + "grad_norm": 3.0, + "kl": 0.0, + "learning_rate": 4.96189867295842e-06, + "logits/chosen": 498655872.0, + "logits/rejected": 564392320.0, + "logps/chosen": -297.4862060546875, + "logps/rejected": -450.0987854003906, + "loss": 0.0176, + "rewards/chosen": 3.7010116577148438, + "rewards/margins": 14.008903503417969, + "rewards/rejected": -10.307891845703125, + "step": 5510 + }, + { + "epoch": 0.5035175879396985, + "grad_norm": 3.0, + "kl": 0.0, + "learning_rate": 4.9604609164580935e-06, + "logits/chosen": 395412864.0, + "logits/rejected": 542049728.0, + "logps/chosen": -279.11785888671875, + "logps/rejected": -617.524658203125, + "loss": 0.0199, + "rewards/chosen": 3.9040047327677407, + "rewards/margins": 13.09685198465983, + "rewards/rejected": -9.19284725189209, + "step": 5511 + }, + { + "epoch": 0.5036089538602101, + "grad_norm": 0.43359375, + "kl": 0.0, + "learning_rate": 4.959023163227285e-06, + "logits/chosen": 301812704.0, + "logits/rejected": 378290389.3333333, + "logps/chosen": -373.0135803222656, + "logps/rejected": -432.1314290364583, + "loss": 0.0017, + "rewards/chosen": 5.43765926361084, + "rewards/margins": 13.617809613545736, + "rewards/rejected": -8.180150349934896, + "step": 5512 + }, + { + "epoch": 0.5037003197807218, + "grad_norm": 3.84375, + "kl": 0.0, + "learning_rate": 4.957585413384879e-06, + "logits/chosen": 318523605.3333333, + "logits/rejected": 501863628.8, + "logps/chosen": -235.62740071614584, + "logps/rejected": -504.7541015625, + "loss": 0.0225, + "rewards/chosen": 2.810211181640625, + "rewards/margins": 12.154361724853516, + "rewards/rejected": -9.34415054321289, + "step": 5513 + }, + { + "epoch": 0.5037916857012334, + "grad_norm": 0.91015625, + "kl": 0.0, + "learning_rate": 4.956147667049769e-06, + "logits/chosen": 792144256.0, + "logits/rejected": 539373568.0, + "logps/chosen": -395.930908203125, + "logps/rejected": -481.0997314453125, + "loss": 0.0048, + "rewards/chosen": 4.453709602355957, + "rewards/margins": 11.775205930074055, + "rewards/rejected": -7.321496327718099, + "step": 5514 + }, + { + "epoch": 0.5038830516217451, + "grad_norm": 54.75, + "kl": 0.0, + "learning_rate": 4.954709924340839e-06, + "logits/chosen": 366222336.0, + "logits/rejected": 388652096.0, + "logps/chosen": -206.14205932617188, + "logps/rejected": -508.65423583984375, + "loss": 0.1057, + "rewards/chosen": 1.9951257705688477, + "rewards/margins": 13.053631782531738, + "rewards/rejected": -11.05850601196289, + "step": 5515 + }, + { + "epoch": 0.5039744175422567, + "grad_norm": 0.734375, + "kl": 0.0, + "learning_rate": 4.953272185376979e-06, + "logits/chosen": 512875040.0, + "logits/rejected": 372639232.0, + "logps/chosen": -380.9923095703125, + "logps/rejected": -531.4216715494791, + "loss": 0.0031, + "rewards/chosen": 4.933407783508301, + "rewards/margins": 15.48067315419515, + "rewards/rejected": -10.54726537068685, + "step": 5516 + }, + { + "epoch": 0.5040657834627684, + "grad_norm": 2.4375, + "kl": 0.0, + "learning_rate": 4.9518344502770755e-06, + "logits/chosen": 494392661.3333333, + "logits/rejected": 583718195.2, + "logps/chosen": -386.431884765625, + "logps/rejected": -543.8158203125, + "loss": 0.0124, + "rewards/chosen": 3.6394999821980796, + "rewards/margins": 12.265009625752768, + "rewards/rejected": -8.625509643554688, + "step": 5517 + }, + { + "epoch": 0.50415714938328, + "grad_norm": 1.90625, + "kl": 0.0, + "learning_rate": 4.950396719160019e-06, + "logits/chosen": 714804288.0, + "logits/rejected": 1303427200.0, + "logps/chosen": -400.2000732421875, + "logps/rejected": -861.243896484375, + "loss": 0.0086, + "rewards/chosen": 4.592284202575684, + "rewards/margins": 15.144906044006348, + "rewards/rejected": -10.552621841430664, + "step": 5518 + }, + { + "epoch": 0.5042485153037917, + "grad_norm": 3.28125, + "kl": 0.0, + "learning_rate": 4.948958992144691e-06, + "logits/chosen": 974305109.3333334, + "logits/rejected": 932923776.0, + "logps/chosen": -344.1364339192708, + "logps/rejected": -669.0404052734375, + "loss": 0.0235, + "rewards/chosen": 3.630404790242513, + "rewards/margins": 13.147293408711752, + "rewards/rejected": -9.516888618469238, + "step": 5519 + }, + { + "epoch": 0.5043398812243033, + "grad_norm": 57.75, + "kl": 0.0, + "learning_rate": 4.947521269349984e-06, + "logits/chosen": 974343270.4, + "logits/rejected": 801524821.3333334, + "logps/chosen": -196.44627685546874, + "logps/rejected": -725.7916666666666, + "loss": 0.0733, + "rewards/chosen": 3.2402923583984373, + "rewards/margins": 14.605440266927083, + "rewards/rejected": -11.365147908528646, + "step": 5520 + }, + { + "epoch": 0.504431247144815, + "grad_norm": 52.75, + "kl": 0.0, + "learning_rate": 4.946083550894782e-06, + "logits/chosen": 513927744.0, + "logits/rejected": 553367808.0, + "logps/chosen": -334.27459716796875, + "logps/rejected": -410.4946594238281, + "loss": 0.0587, + "rewards/chosen": 3.672760486602783, + "rewards/margins": 9.890403270721436, + "rewards/rejected": -6.217642784118652, + "step": 5521 + }, + { + "epoch": 0.5045226130653266, + "grad_norm": 1.40625, + "kl": 0.0, + "learning_rate": 4.944645836897969e-06, + "logits/chosen": 835636992.0, + "logits/rejected": 292409344.0, + "logps/chosen": -549.179443359375, + "logps/rejected": -465.526611328125, + "loss": 0.0059, + "rewards/chosen": 3.7613251209259033, + "rewards/margins": 14.525613705317179, + "rewards/rejected": -10.764288584391275, + "step": 5522 + }, + { + "epoch": 0.5046139789858383, + "grad_norm": 3.515625, + "kl": 0.0, + "learning_rate": 4.943208127478434e-06, + "logits/chosen": 716169625.6, + "logits/rejected": 553472426.6666666, + "logps/chosen": -301.45927734375, + "logps/rejected": -275.81899007161456, + "loss": 0.0269, + "rewards/chosen": 3.495616912841797, + "rewards/margins": 10.997333526611328, + "rewards/rejected": -7.501716613769531, + "step": 5523 + }, + { + "epoch": 0.5047053449063499, + "grad_norm": 4.71875, + "kl": 0.0, + "learning_rate": 4.941770422755061e-06, + "logits/chosen": 728225689.6, + "logits/rejected": 516545066.6666667, + "logps/chosen": -440.641162109375, + "logps/rejected": -637.69775390625, + "loss": 0.0329, + "rewards/chosen": 2.9825727462768556, + "rewards/margins": 14.841620953877769, + "rewards/rejected": -11.859048207600912, + "step": 5524 + }, + { + "epoch": 0.5047967108268616, + "grad_norm": 2.3125, + "kl": 0.0, + "learning_rate": 4.940332722846733e-06, + "logits/chosen": 745705344.0, + "logits/rejected": 560816000.0, + "logps/chosen": -416.0718994140625, + "logps/rejected": -410.2120361328125, + "loss": 0.0133, + "rewards/chosen": 3.9114162921905518, + "rewards/margins": 13.822298288345337, + "rewards/rejected": -9.910881996154785, + "step": 5525 + }, + { + "epoch": 0.5048880767473732, + "grad_norm": 10.1875, + "kl": 0.0, + "learning_rate": 4.9388950278723365e-06, + "logits/chosen": 393629120.0, + "logits/rejected": 315113536.0, + "logps/chosen": -374.29583740234375, + "logps/rejected": -225.2412567138672, + "loss": 0.0432, + "rewards/chosen": 3.2697019577026367, + "rewards/margins": 10.64141845703125, + "rewards/rejected": -7.371716499328613, + "step": 5526 + }, + { + "epoch": 0.5049794426678849, + "grad_norm": 4.1875, + "kl": 0.0, + "learning_rate": 4.937457337950754e-06, + "logits/chosen": 669782630.4, + "logits/rejected": 803037866.6666666, + "logps/chosen": -343.592138671875, + "logps/rejected": -396.3561197916667, + "loss": 0.0319, + "rewards/chosen": 3.142373466491699, + "rewards/margins": 11.764839871724448, + "rewards/rejected": -8.622466405232748, + "step": 5527 + }, + { + "epoch": 0.5050708085883965, + "grad_norm": 3.9375, + "kl": 0.0, + "learning_rate": 4.936019653200873e-06, + "logits/chosen": 540143104.0, + "logits/rejected": 445371904.0, + "logps/chosen": -236.5978271484375, + "logps/rejected": -373.461669921875, + "loss": 0.0291, + "rewards/chosen": 3.33886604309082, + "rewards/margins": 12.704906590779622, + "rewards/rejected": -9.366040547688803, + "step": 5528 + }, + { + "epoch": 0.5051621745089082, + "grad_norm": 2.84375, + "kl": 0.0, + "learning_rate": 4.9345819737415704e-06, + "logits/chosen": 552683306.6666666, + "logits/rejected": 422859443.2, + "logps/chosen": -296.92138671875, + "logps/rejected": -630.91806640625, + "loss": 0.0148, + "rewards/chosen": 4.090047200520833, + "rewards/margins": 16.565601094563803, + "rewards/rejected": -12.475553894042969, + "step": 5529 + }, + { + "epoch": 0.5052535404294198, + "grad_norm": 2.921875, + "kl": 0.0, + "learning_rate": 4.933144299691735e-06, + "logits/chosen": 423371520.0, + "logits/rejected": 642646784.0, + "logps/chosen": -476.353515625, + "logps/rejected": -684.62451171875, + "loss": 0.0158, + "rewards/chosen": 3.598532199859619, + "rewards/margins": 14.6382155418396, + "rewards/rejected": -11.03968334197998, + "step": 5530 + }, + { + "epoch": 0.5053449063499315, + "grad_norm": 4.0, + "kl": 0.0, + "learning_rate": 4.931706631170246e-06, + "logits/chosen": 672912947.2, + "logits/rejected": 435007914.6666667, + "logps/chosen": -388.1225341796875, + "logps/rejected": -515.4820149739584, + "loss": 0.0271, + "rewards/chosen": 3.5728225708007812, + "rewards/margins": 12.206368764241537, + "rewards/rejected": -8.633546193440756, + "step": 5531 + }, + { + "epoch": 0.5054362722704431, + "grad_norm": 1.7421875, + "kl": 0.0, + "learning_rate": 4.930268968295985e-06, + "logits/chosen": 1361755648.0, + "logits/rejected": 653078058.6666666, + "logps/chosen": -179.1156768798828, + "logps/rejected": -455.1036783854167, + "loss": 0.0101, + "rewards/chosen": 3.49485445022583, + "rewards/margins": 12.269424279530844, + "rewards/rejected": -8.774569829305014, + "step": 5532 + }, + { + "epoch": 0.5055276381909548, + "grad_norm": 2.578125, + "kl": 0.0, + "learning_rate": 4.928831311187835e-06, + "logits/chosen": 648311338.6666666, + "logits/rejected": 489589600.0, + "logps/chosen": -286.213623046875, + "logps/rejected": -648.9044189453125, + "loss": 0.0152, + "rewards/chosen": 4.163331985473633, + "rewards/margins": 16.11203956604004, + "rewards/rejected": -11.948707580566406, + "step": 5533 + }, + { + "epoch": 0.5056190041114664, + "grad_norm": 3.140625, + "kl": 0.0, + "learning_rate": 4.927393659964675e-06, + "logits/chosen": 389483161.6, + "logits/rejected": 486213461.3333333, + "logps/chosen": -286.836962890625, + "logps/rejected": -430.7122395833333, + "loss": 0.0201, + "rewards/chosen": 3.9120792388916015, + "rewards/margins": 12.675991439819336, + "rewards/rejected": -8.763912200927734, + "step": 5534 + }, + { + "epoch": 0.505710370031978, + "grad_norm": 4.3125, + "kl": 0.0, + "learning_rate": 4.925956014745389e-06, + "logits/chosen": 472928960.0, + "logits/rejected": 357807200.0, + "logps/chosen": -235.12242126464844, + "logps/rejected": -502.77593994140625, + "loss": 0.03, + "rewards/chosen": 3.1304261684417725, + "rewards/margins": 14.48070502281189, + "rewards/rejected": -11.350278854370117, + "step": 5535 + }, + { + "epoch": 0.5058017359524897, + "grad_norm": 3.5, + "kl": 0.0, + "learning_rate": 4.924518375648852e-06, + "logits/chosen": 616610944.0, + "logits/rejected": 676914176.0, + "logps/chosen": -368.8391418457031, + "logps/rejected": -299.9583740234375, + "loss": 0.0208, + "rewards/chosen": 3.6305084228515625, + "rewards/margins": 11.628902912139893, + "rewards/rejected": -7.99839448928833, + "step": 5536 + }, + { + "epoch": 0.5058931018730014, + "grad_norm": 2.875, + "kl": 0.0, + "learning_rate": 4.923080742793948e-06, + "logits/chosen": 475458048.0, + "logits/rejected": 559112140.8, + "logps/chosen": -369.544677734375, + "logps/rejected": -484.648291015625, + "loss": 0.0145, + "rewards/chosen": 3.5582895278930664, + "rewards/margins": 12.776617622375488, + "rewards/rejected": -9.218328094482422, + "step": 5537 + }, + { + "epoch": 0.505984467793513, + "grad_norm": 2.46875, + "kl": 0.0, + "learning_rate": 4.921643116299552e-06, + "logits/chosen": 300837610.6666667, + "logits/rejected": 518144512.0, + "logps/chosen": -288.9028727213542, + "logps/rejected": -371.07880859375, + "loss": 0.0143, + "rewards/chosen": 3.62781556447347, + "rewards/margins": 12.664599927266439, + "rewards/rejected": -9.036784362792968, + "step": 5538 + }, + { + "epoch": 0.5060758337140246, + "grad_norm": 2.53125, + "kl": 0.0, + "learning_rate": 4.920205496284546e-06, + "logits/chosen": 815152640.0, + "logits/rejected": 574074931.2, + "logps/chosen": -726.6871744791666, + "logps/rejected": -553.829150390625, + "loss": 0.0105, + "rewards/chosen": 3.6066150665283203, + "rewards/margins": 13.405446243286132, + "rewards/rejected": -9.798831176757812, + "step": 5539 + }, + { + "epoch": 0.5061671996345363, + "grad_norm": 4.65625, + "kl": 0.0, + "learning_rate": 4.918767882867805e-06, + "logits/chosen": 663575488.0, + "logits/rejected": 395610496.0, + "logps/chosen": -386.6013488769531, + "logps/rejected": -385.4573669433594, + "loss": 0.0275, + "rewards/chosen": 3.578458309173584, + "rewards/margins": 13.228623867034912, + "rewards/rejected": -9.650165557861328, + "step": 5540 + }, + { + "epoch": 0.506258565555048, + "grad_norm": 3.125, + "kl": 0.0, + "learning_rate": 4.917330276168208e-06, + "logits/chosen": 563125077.3333334, + "logits/rejected": 638246092.8, + "logps/chosen": -312.7578125, + "logps/rejected": -377.280078125, + "loss": 0.0216, + "rewards/chosen": 2.898914655049642, + "rewards/margins": 10.630374463399251, + "rewards/rejected": -7.7314598083496096, + "step": 5541 + }, + { + "epoch": 0.5063499314755596, + "grad_norm": 3.71875, + "kl": 0.0, + "learning_rate": 4.915892676304632e-06, + "logits/chosen": 565054976.0, + "logits/rejected": 461350656.0, + "logps/chosen": -403.968994140625, + "logps/rejected": -560.7638346354166, + "loss": 0.0185, + "rewards/chosen": 2.772111654281616, + "rewards/margins": 11.644970337549845, + "rewards/rejected": -8.872858683268229, + "step": 5542 + }, + { + "epoch": 0.5064412973960712, + "grad_norm": 2.171875, + "kl": 0.0, + "learning_rate": 4.9144550833959506e-06, + "logits/chosen": 593580330.6666666, + "logits/rejected": 322835558.4, + "logps/chosen": -228.8326619466146, + "logps/rejected": -412.851904296875, + "loss": 0.0117, + "rewards/chosen": 3.79132080078125, + "rewards/margins": 13.465118408203125, + "rewards/rejected": -9.673797607421875, + "step": 5543 + }, + { + "epoch": 0.5065326633165829, + "grad_norm": 2.484375, + "kl": 0.0, + "learning_rate": 4.913017497561043e-06, + "logits/chosen": 753687040.0, + "logits/rejected": 607297664.0, + "logps/chosen": -303.6041666666667, + "logps/rejected": -546.3041381835938, + "loss": 0.0186, + "rewards/chosen": 4.021251042683919, + "rewards/margins": 15.100243886311848, + "rewards/rejected": -11.07899284362793, + "step": 5544 + }, + { + "epoch": 0.5066240292370946, + "grad_norm": 19.625, + "kl": 0.0, + "learning_rate": 4.911579918918781e-06, + "logits/chosen": 399638336.0, + "logits/rejected": 461083200.0, + "logps/chosen": -227.80636596679688, + "logps/rejected": -496.283935546875, + "loss": 0.0216, + "rewards/chosen": 3.5704987049102783, + "rewards/margins": 10.598981142044067, + "rewards/rejected": -7.028482437133789, + "step": 5545 + }, + { + "epoch": 0.5067153951576062, + "grad_norm": 72.0, + "kl": 0.0, + "learning_rate": 4.910142347588041e-06, + "logits/chosen": 337905066.6666667, + "logits/rejected": 318492006.4, + "logps/chosen": -249.58740234375, + "logps/rejected": -431.12607421875, + "loss": 0.035, + "rewards/chosen": 3.5058806737264, + "rewards/margins": 14.448648007710775, + "rewards/rejected": -10.942767333984374, + "step": 5546 + }, + { + "epoch": 0.5068067610781178, + "grad_norm": 1.6015625, + "kl": 0.0, + "learning_rate": 4.908704783687695e-06, + "logits/chosen": 400380586.6666667, + "logits/rejected": 429094502.4, + "logps/chosen": -261.26259358723956, + "logps/rejected": -574.797119140625, + "loss": 0.0097, + "rewards/chosen": 4.366260528564453, + "rewards/margins": 15.87183074951172, + "rewards/rejected": -11.505570220947266, + "step": 5547 + }, + { + "epoch": 0.5068981269986295, + "grad_norm": 4.59375, + "kl": 0.0, + "learning_rate": 4.9072672273366204e-06, + "logits/chosen": 579817045.3333334, + "logits/rejected": 402495334.4, + "logps/chosen": -463.9925537109375, + "logps/rejected": -481.833154296875, + "loss": 0.0198, + "rewards/chosen": 3.4235623677571616, + "rewards/margins": 12.925820668538412, + "rewards/rejected": -9.50225830078125, + "step": 5548 + }, + { + "epoch": 0.5069894929191412, + "grad_norm": 6.5, + "kl": 0.0, + "learning_rate": 4.905829678653685e-06, + "logits/chosen": 554429988.5714285, + "logits/rejected": 586171456.0, + "logps/chosen": -421.8490513392857, + "logps/rejected": -452.84881591796875, + "loss": 0.0469, + "rewards/chosen": 3.0673065185546875, + "rewards/margins": 13.57019329071045, + "rewards/rejected": -10.502886772155762, + "step": 5549 + }, + { + "epoch": 0.5070808588396528, + "grad_norm": 4.5625, + "kl": 0.0, + "learning_rate": 4.904392137757763e-06, + "logits/chosen": 536068864.0, + "logits/rejected": 468172586.6666667, + "logps/chosen": -364.15, + "logps/rejected": -375.049072265625, + "loss": 0.0296, + "rewards/chosen": 3.3640167236328127, + "rewards/margins": 11.355597686767577, + "rewards/rejected": -7.991580963134766, + "step": 5550 + }, + { + "epoch": 0.5071722247601644, + "grad_norm": 1.9765625, + "kl": 0.0, + "learning_rate": 4.9029546047677265e-06, + "logits/chosen": 584711360.0, + "logits/rejected": 703348416.0, + "logps/chosen": -355.19818115234375, + "logps/rejected": -615.14111328125, + "loss": 0.0107, + "rewards/chosen": 3.957700252532959, + "rewards/margins": 14.245898723602295, + "rewards/rejected": -10.288198471069336, + "step": 5551 + }, + { + "epoch": 0.5072635906806761, + "grad_norm": 17.375, + "kl": 0.0, + "learning_rate": 4.9015170798024445e-06, + "logits/chosen": 455481124.5714286, + "logits/rejected": 376844064.0, + "logps/chosen": -359.00718470982144, + "logps/rejected": -139.07676696777344, + "loss": 0.1457, + "rewards/chosen": 2.9234215872628346, + "rewards/margins": 6.838887180600848, + "rewards/rejected": -3.9154655933380127, + "step": 5552 + }, + { + "epoch": 0.5073549566011878, + "grad_norm": 1.2265625, + "kl": 0.0, + "learning_rate": 4.900079562980787e-06, + "logits/chosen": 549412608.0, + "logits/rejected": 404826112.0, + "logps/chosen": -386.07373046875, + "logps/rejected": -470.3336588541667, + "loss": 0.0061, + "rewards/chosen": 4.172989845275879, + "rewards/margins": 14.188806851704916, + "rewards/rejected": -10.015817006429037, + "step": 5553 + }, + { + "epoch": 0.5074463225216994, + "grad_norm": 1.8984375, + "kl": 0.0, + "learning_rate": 4.898642054421624e-06, + "logits/chosen": 611642304.0, + "logits/rejected": 502851328.0, + "logps/chosen": -466.4941101074219, + "logps/rejected": -824.5838623046875, + "loss": 0.0116, + "rewards/chosen": 3.8016183376312256, + "rewards/margins": 14.39185118675232, + "rewards/rejected": -10.590232849121094, + "step": 5554 + }, + { + "epoch": 0.507537688442211, + "grad_norm": 2.3125, + "kl": 0.0, + "learning_rate": 4.897204554243823e-06, + "logits/chosen": 511281715.2, + "logits/rejected": 585385898.6666666, + "logps/chosen": -382.3938232421875, + "logps/rejected": -587.2621256510416, + "loss": 0.019, + "rewards/chosen": 3.6647377014160156, + "rewards/margins": 14.232648213704428, + "rewards/rejected": -10.567910512288412, + "step": 5555 + }, + { + "epoch": 0.5076290543627227, + "grad_norm": 1.96875, + "kl": 0.0, + "learning_rate": 4.895767062566254e-06, + "logits/chosen": 505925674.6666667, + "logits/rejected": 734233958.4, + "logps/chosen": -299.22088623046875, + "logps/rejected": -519.71591796875, + "loss": 0.0202, + "rewards/chosen": 3.5402514139811196, + "rewards/margins": 14.33527857462565, + "rewards/rejected": -10.795027160644532, + "step": 5556 + }, + { + "epoch": 0.5077204202832344, + "grad_norm": 7.34375, + "kl": 0.0, + "learning_rate": 4.894329579507782e-06, + "logits/chosen": 386935104.0, + "logits/rejected": 355269856.0, + "logps/chosen": -284.7190246582031, + "logps/rejected": -316.62103271484375, + "loss": 0.0213, + "rewards/chosen": 4.299409866333008, + "rewards/margins": 10.864335060119629, + "rewards/rejected": -6.564925193786621, + "step": 5557 + }, + { + "epoch": 0.507811786203746, + "grad_norm": 5.15625, + "kl": 0.0, + "learning_rate": 4.892892105187277e-06, + "logits/chosen": 373573205.3333333, + "logits/rejected": 386589465.6, + "logps/chosen": -276.27740478515625, + "logps/rejected": -504.92255859375, + "loss": 0.0954, + "rewards/chosen": 4.000090916951497, + "rewards/margins": 12.68401730855306, + "rewards/rejected": -8.683926391601563, + "step": 5558 + }, + { + "epoch": 0.5079031521242576, + "grad_norm": 7.5625, + "kl": 0.0, + "learning_rate": 4.891454639723601e-06, + "logits/chosen": 1056183978.6666666, + "logits/rejected": 666586560.0, + "logps/chosen": -486.8184407552083, + "logps/rejected": -613.7330322265625, + "loss": 0.0374, + "rewards/chosen": 3.8143234252929688, + "rewards/margins": 16.334774017333984, + "rewards/rejected": -12.520450592041016, + "step": 5559 + }, + { + "epoch": 0.5079945180447692, + "grad_norm": 23.75, + "kl": 0.0, + "learning_rate": 4.890017183235621e-06, + "logits/chosen": 649663744.0, + "logits/rejected": 440684224.0, + "logps/chosen": -301.8980407714844, + "logps/rejected": -489.6542053222656, + "loss": 0.0305, + "rewards/chosen": 3.3066134452819824, + "rewards/margins": 12.017465114593506, + "rewards/rejected": -8.710851669311523, + "step": 5560 + }, + { + "epoch": 0.508085883965281, + "grad_norm": 3.65625, + "kl": 0.0, + "learning_rate": 4.8885797358422e-06, + "logits/chosen": 370653568.0, + "logits/rejected": 452852480.0, + "logps/chosen": -332.59747314453125, + "logps/rejected": -700.9921875, + "loss": 0.033, + "rewards/chosen": 3.283081531524658, + "rewards/margins": 15.753973484039307, + "rewards/rejected": -12.470891952514648, + "step": 5561 + }, + { + "epoch": 0.5081772498857926, + "grad_norm": 16.5, + "kl": 0.0, + "learning_rate": 4.887142297662204e-06, + "logits/chosen": 640959936.0, + "logits/rejected": 925758805.3333334, + "logps/chosen": -253.4786376953125, + "logps/rejected": -342.946044921875, + "loss": 0.0462, + "rewards/chosen": 4.427878379821777, + "rewards/margins": 11.295044263203938, + "rewards/rejected": -6.867165883382161, + "step": 5562 + }, + { + "epoch": 0.5082686158063042, + "grad_norm": 1.3203125, + "kl": 0.0, + "learning_rate": 4.885704868814493e-06, + "logits/chosen": 583649177.6, + "logits/rejected": 326003157.3333333, + "logps/chosen": -375.16171875, + "logps/rejected": -277.4717203776042, + "loss": 0.0087, + "rewards/chosen": 4.497867584228516, + "rewards/margins": 12.766688028971354, + "rewards/rejected": -8.268820444742838, + "step": 5563 + }, + { + "epoch": 0.5083599817268158, + "grad_norm": 2.046875, + "kl": 0.0, + "learning_rate": 4.884267449417932e-06, + "logits/chosen": 1085544960.0, + "logits/rejected": 1014701824.0, + "logps/chosen": -243.779541015625, + "logps/rejected": -380.10406494140625, + "loss": 0.0152, + "rewards/chosen": 3.637474536895752, + "rewards/margins": 12.311760425567627, + "rewards/rejected": -8.674285888671875, + "step": 5564 + }, + { + "epoch": 0.5084513476473276, + "grad_norm": 43.0, + "kl": 0.0, + "learning_rate": 4.88283003959138e-06, + "logits/chosen": 771587669.3333334, + "logits/rejected": 523348070.4, + "logps/chosen": -428.2801920572917, + "logps/rejected": -431.31455078125, + "loss": 0.088, + "rewards/chosen": 3.343472162882487, + "rewards/margins": 12.874104181925455, + "rewards/rejected": -9.530632019042969, + "step": 5565 + }, + { + "epoch": 0.5085427135678392, + "grad_norm": 4.25, + "kl": 0.0, + "learning_rate": 4.881392639453697e-06, + "logits/chosen": 484498346.6666667, + "logits/rejected": 311337280.0, + "logps/chosen": -249.44283040364584, + "logps/rejected": -428.1231689453125, + "loss": 0.0379, + "rewards/chosen": 3.080023765563965, + "rewards/margins": 13.650555610656738, + "rewards/rejected": -10.570531845092773, + "step": 5566 + }, + { + "epoch": 0.5086340794883508, + "grad_norm": 1.515625, + "kl": 0.0, + "learning_rate": 4.879955249123744e-06, + "logits/chosen": 526731477.3333333, + "logits/rejected": 435728486.4, + "logps/chosen": -442.0281575520833, + "logps/rejected": -456.831689453125, + "loss": 0.0074, + "rewards/chosen": 3.9846649169921875, + "rewards/margins": 12.856925964355469, + "rewards/rejected": -8.872261047363281, + "step": 5567 + }, + { + "epoch": 0.5087254454088624, + "grad_norm": 4.34375, + "kl": 0.0, + "learning_rate": 4.87851786872038e-06, + "logits/chosen": 998053632.0, + "logits/rejected": 734034368.0, + "logps/chosen": -386.6663818359375, + "logps/rejected": -644.476806640625, + "loss": 0.0288, + "rewards/chosen": 3.108492851257324, + "rewards/margins": 14.665846824645996, + "rewards/rejected": -11.557353973388672, + "step": 5568 + }, + { + "epoch": 0.5088168113293742, + "grad_norm": 3.140625, + "kl": 0.0, + "learning_rate": 4.877080498362464e-06, + "logits/chosen": 567375462.4, + "logits/rejected": 886328576.0, + "logps/chosen": -395.0998046875, + "logps/rejected": -594.7366536458334, + "loss": 0.0184, + "rewards/chosen": 4.1177619934082035, + "rewards/margins": 14.033611043294272, + "rewards/rejected": -9.915849049886068, + "step": 5569 + }, + { + "epoch": 0.5089081772498858, + "grad_norm": 5.1875, + "kl": 0.0, + "learning_rate": 4.875643138168849e-06, + "logits/chosen": 455536512.0, + "logits/rejected": 334648672.0, + "logps/chosen": -258.11842854817706, + "logps/rejected": -472.2568359375, + "loss": 0.0403, + "rewards/chosen": 3.3220558166503906, + "rewards/margins": 11.73628044128418, + "rewards/rejected": -8.414224624633789, + "step": 5570 + }, + { + "epoch": 0.5089995431703974, + "grad_norm": 2.875, + "kl": 0.0, + "learning_rate": 4.874205788258397e-06, + "logits/chosen": 596811392.0, + "logits/rejected": 584874944.0, + "logps/chosen": -237.02450561523438, + "logps/rejected": -312.4188232421875, + "loss": 0.0235, + "rewards/chosen": 3.231905698776245, + "rewards/margins": 11.380720853805542, + "rewards/rejected": -8.148815155029297, + "step": 5571 + }, + { + "epoch": 0.509090909090909, + "grad_norm": 1.3515625, + "kl": 0.0, + "learning_rate": 4.87276844874996e-06, + "logits/chosen": 748944512.0, + "logits/rejected": 935602346.6666666, + "logps/chosen": -137.9418182373047, + "logps/rejected": -533.708740234375, + "loss": 0.0109, + "rewards/chosen": 3.4950733184814453, + "rewards/margins": 13.298900604248047, + "rewards/rejected": -9.803827285766602, + "step": 5572 + }, + { + "epoch": 0.5091822750114208, + "grad_norm": 2.28125, + "kl": 0.0, + "learning_rate": 4.871331119762393e-06, + "logits/chosen": 641246634.6666666, + "logits/rejected": 691752294.4, + "logps/chosen": -328.84206136067706, + "logps/rejected": -512.6837890625, + "loss": 0.0086, + "rewards/chosen": 4.32468827565511, + "rewards/margins": 14.248864301045735, + "rewards/rejected": -9.924176025390626, + "step": 5573 + }, + { + "epoch": 0.5092736409319324, + "grad_norm": 29.875, + "kl": 0.0, + "learning_rate": 4.869893801414551e-06, + "logits/chosen": 686201770.6666666, + "logits/rejected": 832043520.0, + "logps/chosen": -245.29058837890625, + "logps/rejected": -485.98759765625, + "loss": 0.0334, + "rewards/chosen": 3.453479766845703, + "rewards/margins": 10.443000793457031, + "rewards/rejected": -6.989521026611328, + "step": 5574 + }, + { + "epoch": 0.509365006852444, + "grad_norm": 5.90625, + "kl": 0.0, + "learning_rate": 4.868456493825285e-06, + "logits/chosen": 1076749226.6666667, + "logits/rejected": 732270976.0, + "logps/chosen": -300.5687255859375, + "logps/rejected": -291.51171875, + "loss": 0.0489, + "rewards/chosen": 2.8488407135009766, + "rewards/margins": 11.00918197631836, + "rewards/rejected": -8.160341262817383, + "step": 5575 + }, + { + "epoch": 0.5094563727729556, + "grad_norm": 1.75, + "kl": 0.0, + "learning_rate": 4.86701919711345e-06, + "logits/chosen": 440351072.0, + "logits/rejected": 606193792.0, + "logps/chosen": -305.091796875, + "logps/rejected": -699.937255859375, + "loss": 0.0111, + "rewards/chosen": 4.075971603393555, + "rewards/margins": 13.15172004699707, + "rewards/rejected": -9.075748443603516, + "step": 5576 + }, + { + "epoch": 0.5095477386934674, + "grad_norm": 2.3125, + "kl": 0.0, + "learning_rate": 4.8655819113978935e-06, + "logits/chosen": 629934848.0, + "logits/rejected": 602331289.6, + "logps/chosen": -410.5862630208333, + "logps/rejected": -478.4841796875, + "loss": 0.0093, + "rewards/chosen": 4.251251220703125, + "rewards/margins": 13.649742126464844, + "rewards/rejected": -9.398490905761719, + "step": 5577 + }, + { + "epoch": 0.509639104613979, + "grad_norm": 2.328125, + "kl": 0.0, + "learning_rate": 4.86414463679747e-06, + "logits/chosen": 594017945.6, + "logits/rejected": 439258368.0, + "logps/chosen": -324.81865234375, + "logps/rejected": -424.3162841796875, + "loss": 0.0176, + "rewards/chosen": 4.094099426269532, + "rewards/margins": 14.581350708007813, + "rewards/rejected": -10.487251281738281, + "step": 5578 + }, + { + "epoch": 0.5097304705344906, + "grad_norm": 4.875, + "kl": 0.0, + "learning_rate": 4.862707373431025e-06, + "logits/chosen": 433467904.0, + "logits/rejected": 478424490.6666667, + "logps/chosen": -306.25625, + "logps/rejected": -605.115966796875, + "loss": 0.1457, + "rewards/chosen": 1.9746068954467773, + "rewards/margins": 11.778103574117026, + "rewards/rejected": -9.803496678670248, + "step": 5579 + }, + { + "epoch": 0.5098218364550022, + "grad_norm": 4.34375, + "kl": 0.0, + "learning_rate": 4.8612701214174075e-06, + "logits/chosen": 569706112.0, + "logits/rejected": 305510048.0, + "logps/chosen": -341.05517578125, + "logps/rejected": -522.3049926757812, + "loss": 0.0289, + "rewards/chosen": 3.459933598836263, + "rewards/margins": 15.165456136067709, + "rewards/rejected": -11.705522537231445, + "step": 5580 + }, + { + "epoch": 0.509913202375514, + "grad_norm": 0.0245361328125, + "kl": 0.0, + "learning_rate": 4.8598328808754665e-06, + "logits/rejected": 534714336.0, + "logps/rejected": -557.5421142578125, + "loss": 0.0001, + "rewards/rejected": -10.276476860046387, + "step": 5581 + }, + { + "epoch": 0.5100045682960256, + "grad_norm": 2.9375, + "kl": 0.0, + "learning_rate": 4.858395651924046e-06, + "logits/chosen": 362610739.2, + "logits/rejected": 427368021.3333333, + "logps/chosen": -222.48623046875, + "logps/rejected": -312.90797932942706, + "loss": 0.0277, + "rewards/chosen": 3.1284555435180663, + "rewards/margins": 10.809279187520344, + "rewards/rejected": -7.680823644002278, + "step": 5582 + }, + { + "epoch": 0.5100959342165372, + "grad_norm": 38.0, + "kl": 0.0, + "learning_rate": 4.8569584346819945e-06, + "logits/chosen": 700082585.6, + "logits/rejected": 458844416.0, + "logps/chosen": -291.567626953125, + "logps/rejected": -492.9290364583333, + "loss": 0.0871, + "rewards/chosen": 4.12042007446289, + "rewards/margins": 10.123334503173828, + "rewards/rejected": -6.0029144287109375, + "step": 5583 + }, + { + "epoch": 0.5101873001370488, + "grad_norm": 2.578125, + "kl": 0.0, + "learning_rate": 4.8555212292681556e-06, + "logits/chosen": 440867788.8, + "logits/rejected": 455892906.6666667, + "logps/chosen": -221.0539794921875, + "logps/rejected": -360.0439046223958, + "loss": 0.0269, + "rewards/chosen": 3.5596057891845705, + "rewards/margins": 11.154703648885091, + "rewards/rejected": -7.5950978597005205, + "step": 5584 + }, + { + "epoch": 0.5102786660575606, + "grad_norm": 2.546875, + "kl": 0.0, + "learning_rate": 4.85408403580137e-06, + "logits/chosen": 546740608.0, + "logits/rejected": 592470570.6666666, + "logps/chosen": -263.792724609375, + "logps/rejected": -576.1790771484375, + "loss": 0.0136, + "rewards/chosen": 2.925121307373047, + "rewards/margins": 13.310083389282227, + "rewards/rejected": -10.38496208190918, + "step": 5585 + }, + { + "epoch": 0.5103700319780722, + "grad_norm": 4.1875, + "kl": 0.9607143402099609, + "learning_rate": 4.8526468544004855e-06, + "logits/chosen": 493401216.0, + "logits/rejected": 342935712.0, + "logps/chosen": -295.3855387369792, + "logps/rejected": -199.91152954101562, + "loss": 0.0364, + "rewards/chosen": 3.564891497294108, + "rewards/margins": 10.768003145853678, + "rewards/rejected": -7.20311164855957, + "step": 5586 + }, + { + "epoch": 0.5104613978985838, + "grad_norm": 4.15625, + "kl": 0.0, + "learning_rate": 4.851209685184339e-06, + "logits/chosen": 710871424.0, + "logits/rejected": 610248106.6666666, + "logps/chosen": -523.3101196289062, + "logps/rejected": -732.73388671875, + "loss": 0.1217, + "rewards/chosen": 0.5435340404510498, + "rewards/margins": 12.092962344487509, + "rewards/rejected": -11.549428304036459, + "step": 5587 + }, + { + "epoch": 0.5105527638190954, + "grad_norm": 1.5078125, + "kl": 0.0, + "learning_rate": 4.849772528271773e-06, + "logits/chosen": 571099776.0, + "logits/rejected": 414755648.0, + "logps/chosen": -415.3286946614583, + "logps/rejected": -672.4796142578125, + "loss": 0.0103, + "rewards/chosen": 4.745004018147786, + "rewards/margins": 18.278470357259113, + "rewards/rejected": -13.533466339111328, + "step": 5588 + }, + { + "epoch": 0.5106441297396072, + "grad_norm": 0.84375, + "kl": 0.0, + "learning_rate": 4.8483353837816275e-06, + "logits/chosen": 448070752.0, + "logits/rejected": 492857856.0, + "logps/chosen": -301.6109313964844, + "logps/rejected": -533.2616141183036, + "loss": 0.0032, + "rewards/chosen": 3.7061767578125, + "rewards/margins": 12.821150643484932, + "rewards/rejected": -9.114973885672432, + "step": 5589 + }, + { + "epoch": 0.5107354956601188, + "grad_norm": 3.609375, + "kl": 0.0, + "learning_rate": 4.846898251832741e-06, + "logits/chosen": 673289088.0, + "logits/rejected": 514943040.0, + "logps/chosen": -248.194580078125, + "logps/rejected": -477.8666076660156, + "loss": 0.0183, + "rewards/chosen": 3.6396446228027344, + "rewards/margins": 12.781574249267578, + "rewards/rejected": -9.141929626464844, + "step": 5590 + }, + { + "epoch": 0.5108268615806304, + "grad_norm": 6.21875, + "kl": 0.0, + "learning_rate": 4.845461132543949e-06, + "logits/chosen": 953176064.0, + "logits/rejected": 340112640.0, + "logps/chosen": -209.6036376953125, + "logps/rejected": -337.6242370605469, + "loss": 0.0515, + "rewards/chosen": 3.227609316507975, + "rewards/margins": 13.288962999979654, + "rewards/rejected": -10.06135368347168, + "step": 5591 + }, + { + "epoch": 0.510918227501142, + "grad_norm": 0.8125, + "kl": 0.0, + "learning_rate": 4.844024026034092e-06, + "logits/chosen": 767944448.0, + "logits/rejected": 713458645.3333334, + "logps/chosen": -157.34458923339844, + "logps/rejected": -607.41015625, + "loss": 0.0062, + "rewards/chosen": 3.7326889038085938, + "rewards/margins": 14.528429667154947, + "rewards/rejected": -10.795740763346354, + "step": 5592 + }, + { + "epoch": 0.5110095934216538, + "grad_norm": 3.265625, + "kl": 0.0, + "learning_rate": 4.842586932422001e-06, + "logits/chosen": 508576853.3333333, + "logits/rejected": 849104486.4, + "logps/chosen": -325.5009358723958, + "logps/rejected": -508.01142578125, + "loss": 0.0199, + "rewards/chosen": 3.9707616170247397, + "rewards/margins": 12.153345235188803, + "rewards/rejected": -8.182583618164063, + "step": 5593 + }, + { + "epoch": 0.5111009593421654, + "grad_norm": 2.90625, + "kl": 0.0, + "learning_rate": 4.8411498518265125e-06, + "logits/chosen": 532008064.0, + "logits/rejected": 291463372.8, + "logps/chosen": -370.9153238932292, + "logps/rejected": -458.1666015625, + "loss": 0.0152, + "rewards/chosen": 3.621007283528646, + "rewards/margins": 12.872846730550132, + "rewards/rejected": -9.251839447021485, + "step": 5594 + }, + { + "epoch": 0.511192325262677, + "grad_norm": 2.59375, + "kl": 0.0, + "learning_rate": 4.8397127843664606e-06, + "logits/chosen": 371261747.2, + "logits/rejected": 343417152.0, + "logps/chosen": -324.8361083984375, + "logps/rejected": -391.8202311197917, + "loss": 0.0173, + "rewards/chosen": 3.9383045196533204, + "rewards/margins": 15.01298484802246, + "rewards/rejected": -11.07468032836914, + "step": 5595 + }, + { + "epoch": 0.5112836911831886, + "grad_norm": 3.671875, + "kl": 0.0, + "learning_rate": 4.838275730160675e-06, + "logits/chosen": 763658922.6666666, + "logits/rejected": 566987584.0, + "logps/chosen": -386.5409342447917, + "logps/rejected": -615.6119384765625, + "loss": 0.0258, + "rewards/chosen": 3.6581878662109375, + "rewards/margins": 10.4222993850708, + "rewards/rejected": -6.764111518859863, + "step": 5596 + }, + { + "epoch": 0.5113750571037003, + "grad_norm": 1.2265625, + "kl": 0.0, + "learning_rate": 4.836838689327989e-06, + "logits/chosen": 543224448.0, + "logits/rejected": 909395008.0, + "logps/chosen": -255.89419555664062, + "logps/rejected": -739.4298095703125, + "loss": 0.0076, + "rewards/chosen": 4.46799373626709, + "rewards/margins": 14.574370384216309, + "rewards/rejected": -10.106376647949219, + "step": 5597 + }, + { + "epoch": 0.511466423024212, + "grad_norm": 1.3203125, + "kl": 0.0, + "learning_rate": 4.835401661987231e-06, + "logits/chosen": 338495360.0, + "logits/rejected": 423764000.0, + "logps/chosen": -219.26382446289062, + "logps/rejected": -562.3400268554688, + "loss": 0.0075, + "rewards/chosen": 4.77628231048584, + "rewards/margins": 14.445574760437012, + "rewards/rejected": -9.669292449951172, + "step": 5598 + }, + { + "epoch": 0.5115577889447236, + "grad_norm": 2.828125, + "kl": 0.0, + "learning_rate": 4.833964648257231e-06, + "logits/chosen": 562227200.0, + "logits/rejected": 723097770.6666666, + "logps/chosen": -171.818212890625, + "logps/rejected": -690.5186360677084, + "loss": 0.019, + "rewards/chosen": 3.895513153076172, + "rewards/margins": 14.741165415445963, + "rewards/rejected": -10.845652262369791, + "step": 5599 + }, + { + "epoch": 0.5116491548652353, + "grad_norm": 3.34375, + "kl": 0.0, + "learning_rate": 4.832527648256815e-06, + "logits/chosen": 781782528.0, + "logits/rejected": 402445619.2, + "logps/chosen": -416.8241373697917, + "logps/rejected": -430.771826171875, + "loss": 0.0147, + "rewards/chosen": 3.745438575744629, + "rewards/margins": 13.64387912750244, + "rewards/rejected": -9.898440551757812, + "step": 5600 + }, + { + "epoch": 0.5117405207857469, + "grad_norm": 1.671875, + "kl": 0.0, + "learning_rate": 4.831090662104812e-06, + "logits/chosen": 410361024.0, + "logits/rejected": 415037696.0, + "logps/chosen": -345.26385498046875, + "logps/rejected": -539.4840087890625, + "loss": 0.009, + "rewards/chosen": 4.490360260009766, + "rewards/margins": 14.594765663146973, + "rewards/rejected": -10.104405403137207, + "step": 5601 + }, + { + "epoch": 0.5118318867062586, + "grad_norm": 4.25, + "kl": 0.0, + "learning_rate": 4.829653689920046e-06, + "logits/chosen": 567227904.0, + "logits/rejected": 503700160.0, + "logps/chosen": -328.4978434244792, + "logps/rejected": -540.697021484375, + "loss": 0.0307, + "rewards/chosen": 3.4448811213175454, + "rewards/margins": 12.009538332621256, + "rewards/rejected": -8.564657211303711, + "step": 5602 + }, + { + "epoch": 0.5119232526267702, + "grad_norm": 4.03125, + "kl": 0.0, + "learning_rate": 4.828216731821342e-06, + "logits/chosen": 345670869.3333333, + "logits/rejected": 590598195.2, + "logps/chosen": -250.2568359375, + "logps/rejected": -466.73583984375, + "loss": 0.0301, + "rewards/chosen": 2.6457703908284507, + "rewards/margins": 11.327975591023764, + "rewards/rejected": -8.682205200195312, + "step": 5603 + }, + { + "epoch": 0.5120146185472819, + "grad_norm": 16.125, + "kl": 0.0, + "learning_rate": 4.8267797879275245e-06, + "logits/chosen": 475397632.0, + "logits/rejected": 492389376.0, + "logps/chosen": -294.816650390625, + "logps/rejected": -578.2447509765625, + "loss": 0.1231, + "rewards/chosen": 2.9491159439086916, + "rewards/margins": 12.758461189270019, + "rewards/rejected": -9.809345245361328, + "step": 5604 + }, + { + "epoch": 0.5121059844677935, + "grad_norm": 3.78125, + "kl": 0.0, + "learning_rate": 4.825342858357411e-06, + "logits/chosen": 580366784.0, + "logits/rejected": 799416064.0, + "logps/chosen": -266.6932373046875, + "logps/rejected": -333.2852783203125, + "loss": 0.0366, + "rewards/chosen": 3.2557883262634277, + "rewards/margins": 12.545350551605225, + "rewards/rejected": -9.289562225341797, + "step": 5605 + }, + { + "epoch": 0.5121973503883052, + "grad_norm": 0.99609375, + "kl": 0.0, + "learning_rate": 4.823905943229825e-06, + "logits/chosen": 430294613.3333333, + "logits/rejected": 729788723.2, + "logps/chosen": -273.28167724609375, + "logps/rejected": -576.898388671875, + "loss": 0.0088, + "rewards/chosen": 4.259498914082845, + "rewards/margins": 12.591665967305502, + "rewards/rejected": -8.332167053222657, + "step": 5606 + }, + { + "epoch": 0.5122887163088168, + "grad_norm": 3.96875, + "kl": 0.0, + "learning_rate": 4.822469042663587e-06, + "logits/chosen": 476530688.0, + "logits/rejected": 505239488.0, + "logps/chosen": -253.95005798339844, + "logps/rejected": -486.7991638183594, + "loss": 0.0149, + "rewards/chosen": 4.654170513153076, + "rewards/margins": 13.633677959442139, + "rewards/rejected": -8.979507446289062, + "step": 5607 + }, + { + "epoch": 0.5123800822293285, + "grad_norm": 1.75, + "kl": 0.0, + "learning_rate": 4.821032156777513e-06, + "logits/chosen": 503336544.0, + "logits/rejected": 496050944.0, + "logps/chosen": -436.9957275390625, + "logps/rejected": -589.016357421875, + "loss": 0.0117, + "rewards/chosen": 3.975184440612793, + "rewards/margins": 12.870940208435059, + "rewards/rejected": -8.895755767822266, + "step": 5608 + }, + { + "epoch": 0.5124714481498401, + "grad_norm": 1.6484375, + "kl": 0.0, + "learning_rate": 4.819595285690423e-06, + "logits/chosen": 682720000.0, + "logits/rejected": 712695398.4, + "logps/chosen": -428.4335123697917, + "logps/rejected": -255.758154296875, + "loss": 0.0103, + "rewards/chosen": 3.997983932495117, + "rewards/margins": 11.141225051879882, + "rewards/rejected": -7.143241119384766, + "step": 5609 + }, + { + "epoch": 0.5125628140703518, + "grad_norm": 0.91015625, + "kl": 0.0, + "learning_rate": 4.818158429521129e-06, + "logits/chosen": 956960960.0, + "logits/rejected": 556331434.6666666, + "logps/chosen": -414.1434326171875, + "logps/rejected": -668.5586751302084, + "loss": 0.0034, + "rewards/chosen": 4.639300346374512, + "rewards/margins": 14.258378664652506, + "rewards/rejected": -9.619078318277994, + "step": 5610 + }, + { + "epoch": 0.5126541799908634, + "grad_norm": 0.76171875, + "kl": 0.0, + "learning_rate": 4.81672158838845e-06, + "logits/chosen": 183374112.0, + "logits/rejected": 537981824.0, + "logps/chosen": -126.03352355957031, + "logps/rejected": -544.2494506835938, + "loss": 0.0065, + "rewards/chosen": 4.588944435119629, + "rewards/margins": 13.12486457824707, + "rewards/rejected": -8.535920143127441, + "step": 5611 + }, + { + "epoch": 0.5127455459113751, + "grad_norm": 2.984375, + "kl": 0.0, + "learning_rate": 4.815284762411196e-06, + "logits/chosen": 317866112.0, + "logits/rejected": 479158208.0, + "logps/chosen": -320.97705078125, + "logps/rejected": -854.57421875, + "loss": 0.016, + "rewards/chosen": 4.263975620269775, + "rewards/margins": 14.468233585357666, + "rewards/rejected": -10.20425796508789, + "step": 5612 + }, + { + "epoch": 0.5128369118318867, + "grad_norm": 1.6015625, + "kl": 0.0, + "learning_rate": 4.813847951708183e-06, + "logits/chosen": 408524128.0, + "logits/rejected": 776627797.3333334, + "logps/chosen": -262.5521545410156, + "logps/rejected": -369.98095703125, + "loss": 0.0079, + "rewards/chosen": 4.125129222869873, + "rewards/margins": 11.777922471364338, + "rewards/rejected": -7.652793248494466, + "step": 5613 + }, + { + "epoch": 0.5129282777523984, + "grad_norm": 1.7265625, + "kl": 0.0, + "learning_rate": 4.812411156398218e-06, + "logits/chosen": 589845312.0, + "logits/rejected": 763491968.0, + "logps/chosen": -315.9298095703125, + "logps/rejected": -680.6182861328125, + "loss": 0.0124, + "rewards/chosen": 4.078117370605469, + "rewards/margins": 12.914705276489258, + "rewards/rejected": -8.836587905883789, + "step": 5614 + }, + { + "epoch": 0.51301964367291, + "grad_norm": 3.109375, + "kl": 0.0, + "learning_rate": 4.810974376600113e-06, + "logits/chosen": 384360780.8, + "logits/rejected": 278795477.3333333, + "logps/chosen": -287.11806640625, + "logps/rejected": -447.45263671875, + "loss": 0.0196, + "rewards/chosen": 3.7596817016601562, + "rewards/margins": 16.10222625732422, + "rewards/rejected": -12.342544555664062, + "step": 5615 + }, + { + "epoch": 0.5131110095934217, + "grad_norm": 2.34375, + "kl": 0.0, + "learning_rate": 4.809537612432676e-06, + "logits/chosen": 992144000.0, + "logits/rejected": 595621888.0, + "logps/chosen": -404.418212890625, + "logps/rejected": -563.0252075195312, + "loss": 0.0094, + "rewards/chosen": 5.081179618835449, + "rewards/margins": 15.17611312866211, + "rewards/rejected": -10.09493350982666, + "step": 5616 + }, + { + "epoch": 0.5132023755139333, + "grad_norm": 4.875, + "kl": 0.0, + "learning_rate": 4.8081008640147125e-06, + "logits/chosen": 640079232.0, + "logits/rejected": 510753312.0, + "logps/chosen": -365.50927734375, + "logps/rejected": -498.4993896484375, + "loss": 0.0272, + "rewards/chosen": 3.2505459785461426, + "rewards/margins": 12.780807971954346, + "rewards/rejected": -9.530261993408203, + "step": 5617 + }, + { + "epoch": 0.513293741434445, + "grad_norm": 2.140625, + "kl": 0.0, + "learning_rate": 4.806664131465031e-06, + "logits/chosen": 464451008.0, + "logits/rejected": 336195648.0, + "logps/chosen": -442.126953125, + "logps/rejected": -420.098388671875, + "loss": 0.0099, + "rewards/chosen": 4.326127052307129, + "rewards/margins": 15.47130298614502, + "rewards/rejected": -11.14517593383789, + "step": 5618 + }, + { + "epoch": 0.5133851073549566, + "grad_norm": 8.5, + "kl": 0.0, + "learning_rate": 4.805227414902433e-06, + "logits/chosen": 510671744.0, + "logits/rejected": 455185600.0, + "logps/chosen": -369.24566650390625, + "logps/rejected": -469.1622619628906, + "loss": 0.0426, + "rewards/chosen": 2.735379695892334, + "rewards/margins": 11.051807880401611, + "rewards/rejected": -8.316428184509277, + "step": 5619 + }, + { + "epoch": 0.5134764732754683, + "grad_norm": 2.40625, + "kl": 0.0, + "learning_rate": 4.8037907144457245e-06, + "logits/chosen": 498271692.8, + "logits/rejected": 444051626.6666667, + "logps/chosen": -337.0982421875, + "logps/rejected": -705.36865234375, + "loss": 0.0185, + "rewards/chosen": 3.6073070526123048, + "rewards/margins": 14.132915115356445, + "rewards/rejected": -10.52560806274414, + "step": 5620 + }, + { + "epoch": 0.5135678391959799, + "grad_norm": 24.125, + "kl": 0.0, + "learning_rate": 4.802354030213704e-06, + "logits/chosen": 493498912.0, + "logits/rejected": 414357184.0, + "logps/chosen": -316.4288330078125, + "logps/rejected": -398.44476318359375, + "loss": 0.1144, + "rewards/chosen": 2.256836414337158, + "rewards/margins": 10.550556659698486, + "rewards/rejected": -8.293720245361328, + "step": 5621 + }, + { + "epoch": 0.5136592051164915, + "grad_norm": 4.625, + "kl": 0.0, + "learning_rate": 4.800917362325176e-06, + "logits/chosen": 753708970.6666666, + "logits/rejected": 564806656.0, + "logps/chosen": -310.7655843098958, + "logps/rejected": -670.6431274414062, + "loss": 0.0357, + "rewards/chosen": 3.4582176208496094, + "rewards/margins": 15.293535232543945, + "rewards/rejected": -11.835317611694336, + "step": 5622 + }, + { + "epoch": 0.5137505710370032, + "grad_norm": 6.25, + "kl": 0.0, + "learning_rate": 4.7994807108989375e-06, + "logits/chosen": 627238326.8571428, + "logits/rejected": 624003776.0, + "logps/chosen": -439.06229073660717, + "logps/rejected": -612.7400512695312, + "loss": 0.0262, + "rewards/chosen": 3.9908324650355746, + "rewards/margins": 13.470117705208914, + "rewards/rejected": -9.47928524017334, + "step": 5623 + }, + { + "epoch": 0.5138419369575149, + "grad_norm": 1.4140625, + "kl": 0.0, + "learning_rate": 4.7980440760537864e-06, + "logits/chosen": 511500224.0, + "logits/rejected": 978818901.3333334, + "logps/chosen": -300.9980773925781, + "logps/rejected": -443.2892252604167, + "loss": 0.0075, + "rewards/chosen": 3.575982093811035, + "rewards/margins": 13.391093254089355, + "rewards/rejected": -9.81511116027832, + "step": 5624 + }, + { + "epoch": 0.5139333028780265, + "grad_norm": 2.40625, + "kl": 0.0, + "learning_rate": 4.796607457908518e-06, + "logits/chosen": 668773478.4, + "logits/rejected": 358869077.3333333, + "logps/chosen": -312.405615234375, + "logps/rejected": -417.2337239583333, + "loss": 0.0187, + "rewards/chosen": 3.9254386901855467, + "rewards/margins": 13.640306599934895, + "rewards/rejected": -9.71486790974935, + "step": 5625 + }, + { + "epoch": 0.5140246687985381, + "grad_norm": 2.1875, + "kl": 0.0, + "learning_rate": 4.795170856581929e-06, + "logits/chosen": 415916629.3333333, + "logits/rejected": 603019417.6, + "logps/chosen": -248.4669189453125, + "logps/rejected": -428.43349609375, + "loss": 0.0159, + "rewards/chosen": 3.398789723714193, + "rewards/margins": 13.104565938313803, + "rewards/rejected": -9.70577621459961, + "step": 5626 + }, + { + "epoch": 0.5141160347190498, + "grad_norm": 2.78125, + "kl": 0.0, + "learning_rate": 4.793734272192813e-06, + "logits/chosen": 392468352.0, + "logits/rejected": 385365600.0, + "logps/chosen": -223.85977172851562, + "logps/rejected": -290.152587890625, + "loss": 0.0203, + "rewards/chosen": 3.4638609886169434, + "rewards/margins": 11.158152103424072, + "rewards/rejected": -7.694291114807129, + "step": 5627 + }, + { + "epoch": 0.5142074006395615, + "grad_norm": 2.734375, + "kl": 0.0, + "learning_rate": 4.79229770485996e-06, + "logits/chosen": 493652672.0, + "logits/rejected": 490676160.0, + "logps/chosen": -275.5717468261719, + "logps/rejected": -468.14031982421875, + "loss": 0.0169, + "rewards/chosen": 3.842897891998291, + "rewards/margins": 13.063548564910889, + "rewards/rejected": -9.220650672912598, + "step": 5628 + }, + { + "epoch": 0.5142987665600731, + "grad_norm": 3.59375, + "kl": 0.0, + "learning_rate": 4.790861154702164e-06, + "logits/chosen": 475256256.0, + "logits/rejected": 1090721792.0, + "logps/chosen": -399.29742431640625, + "logps/rejected": -611.2547607421875, + "loss": 0.0198, + "rewards/chosen": 3.697098731994629, + "rewards/margins": 13.553057670593262, + "rewards/rejected": -9.855958938598633, + "step": 5629 + }, + { + "epoch": 0.5143901324805847, + "grad_norm": 55.75, + "kl": 0.0, + "learning_rate": 4.789424621838212e-06, + "logits/chosen": 617522474.6666666, + "logits/rejected": 159257984.0, + "logps/chosen": -406.4145100911458, + "logps/rejected": -404.9793395996094, + "loss": 0.0732, + "rewards/chosen": 3.0954860051472983, + "rewards/margins": 12.234026273091635, + "rewards/rejected": -9.138540267944336, + "step": 5630 + }, + { + "epoch": 0.5144814984010964, + "grad_norm": 5.375, + "kl": 0.0, + "learning_rate": 4.787988106386893e-06, + "logits/chosen": 699834316.8, + "logits/rejected": 676050688.0, + "logps/chosen": -201.43939208984375, + "logps/rejected": -319.8223470052083, + "loss": 0.0374, + "rewards/chosen": 3.262299728393555, + "rewards/margins": 11.340488942464194, + "rewards/rejected": -8.078189214070639, + "step": 5631 + }, + { + "epoch": 0.5145728643216081, + "grad_norm": 2.125, + "kl": 0.0, + "learning_rate": 4.786551608466994e-06, + "logits/chosen": 159072256.0, + "logits/rejected": 579737429.3333334, + "logps/chosen": -439.10791015625, + "logps/rejected": -641.5902099609375, + "loss": 0.0088, + "rewards/chosen": 3.4367880821228027, + "rewards/margins": 12.13801654179891, + "rewards/rejected": -8.701228459676107, + "step": 5632 + }, + { + "epoch": 0.5146642302421197, + "grad_norm": 1.234375, + "kl": 0.0, + "learning_rate": 4.785115128197298e-06, + "logits/chosen": 715171754.6666666, + "logits/rejected": 413496320.0, + "logps/chosen": -277.88905843098956, + "logps/rejected": -527.7896484375, + "loss": 0.0062, + "rewards/chosen": 4.388034502665202, + "rewards/margins": 13.948673693339032, + "rewards/rejected": -9.560639190673829, + "step": 5633 + }, + { + "epoch": 0.5147555961626313, + "grad_norm": 5.75, + "kl": 0.0, + "learning_rate": 4.78367866569659e-06, + "logits/rejected": 614366592.0, + "logps/rejected": -551.0245971679688, + "loss": 0.0206, + "rewards/rejected": -8.97909927368164, + "step": 5634 + }, + { + "epoch": 0.514846962083143, + "grad_norm": 4.4375, + "kl": 0.0, + "learning_rate": 4.7822422210836524e-06, + "logits/chosen": 1345102506.6666667, + "logits/rejected": 440880742.4, + "logps/chosen": -837.01708984375, + "logps/rejected": -332.9255859375, + "loss": 0.0137, + "rewards/chosen": 3.6735626856486, + "rewards/margins": 12.088472811381022, + "rewards/rejected": -8.414910125732423, + "step": 5635 + }, + { + "epoch": 0.5149383280036547, + "grad_norm": 3.46875, + "kl": 0.0, + "learning_rate": 4.780805794477266e-06, + "logits/chosen": 477311296.0, + "logits/rejected": 575103808.0, + "logps/chosen": -286.53643798828125, + "logps/rejected": -543.9973754882812, + "loss": 0.028, + "rewards/chosen": 2.9643263816833496, + "rewards/margins": 12.426637172698975, + "rewards/rejected": -9.462310791015625, + "step": 5636 + }, + { + "epoch": 0.5150296939241663, + "grad_norm": 2.890625, + "kl": 0.0, + "learning_rate": 4.77936938599621e-06, + "logits/chosen": 656839509.3333334, + "logits/rejected": 568086374.4, + "logps/chosen": -323.5812581380208, + "logps/rejected": -394.665087890625, + "loss": 0.0144, + "rewards/chosen": 3.487574259440104, + "rewards/margins": 12.098266092936198, + "rewards/rejected": -8.610691833496094, + "step": 5637 + }, + { + "epoch": 0.5151210598446779, + "grad_norm": 1.96875, + "kl": 0.0, + "learning_rate": 4.77793299575926e-06, + "logits/chosen": 397966176.0, + "logits/rejected": 660751744.0, + "logps/chosen": -319.73272705078125, + "logps/rejected": -596.4312744140625, + "loss": 0.0133, + "rewards/chosen": 3.7391796112060547, + "rewards/margins": 13.1283540725708, + "rewards/rejected": -9.389174461364746, + "step": 5638 + }, + { + "epoch": 0.5152124257651896, + "grad_norm": 1.6015625, + "kl": 0.0, + "learning_rate": 4.776496623885195e-06, + "logits/chosen": 264727936.0, + "logits/rejected": 419750368.0, + "logps/chosen": -257.6494140625, + "logps/rejected": -448.5151672363281, + "loss": 0.0078, + "rewards/chosen": 4.942320823669434, + "rewards/margins": 12.95952320098877, + "rewards/rejected": -8.017202377319336, + "step": 5639 + }, + { + "epoch": 0.5153037916857013, + "grad_norm": 41.5, + "kl": 0.0, + "learning_rate": 4.775060270492787e-06, + "logits/chosen": 579986688.0, + "logits/rejected": 593036544.0, + "logps/chosen": -161.67422485351562, + "logps/rejected": -738.1920572916666, + "loss": 0.0725, + "rewards/chosen": 2.8247854709625244, + "rewards/margins": 13.568751891454061, + "rewards/rejected": -10.743966420491537, + "step": 5640 + }, + { + "epoch": 0.5153951576062129, + "grad_norm": 1.296875, + "kl": 0.0, + "learning_rate": 4.773623935700812e-06, + "logits/chosen": 793436288.0, + "logits/rejected": 267988309.33333334, + "logps/chosen": -288.4097900390625, + "logps/rejected": -308.52939860026044, + "loss": 0.0062, + "rewards/chosen": 3.803227424621582, + "rewards/margins": 12.346247037251791, + "rewards/rejected": -8.543019612630209, + "step": 5641 + }, + { + "epoch": 0.5154865235267245, + "grad_norm": 15.4375, + "kl": 0.0, + "learning_rate": 4.772187619628038e-06, + "logits/chosen": 496541568.0, + "logits/rejected": 365818304.0, + "logps/chosen": -328.66961669921875, + "logps/rejected": -317.57135009765625, + "loss": 0.071, + "rewards/chosen": 3.3465819358825684, + "rewards/margins": 9.878337860107422, + "rewards/rejected": -6.5317559242248535, + "step": 5642 + }, + { + "epoch": 0.5155778894472361, + "grad_norm": 2.171875, + "kl": 0.0, + "learning_rate": 4.77075132239324e-06, + "logits/chosen": 721729536.0, + "logits/rejected": 564748480.0, + "logps/chosen": -490.29937744140625, + "logps/rejected": -505.14324951171875, + "loss": 0.0141, + "rewards/chosen": 3.693227529525757, + "rewards/margins": 14.095227003097534, + "rewards/rejected": -10.401999473571777, + "step": 5643 + }, + { + "epoch": 0.5156692553677479, + "grad_norm": 3.234375, + "kl": 0.0, + "learning_rate": 4.769315044115181e-06, + "logits/chosen": 416266240.0, + "logits/rejected": 659990272.0, + "logps/chosen": -311.97366768973217, + "logps/rejected": -792.98046875, + "loss": 0.0278, + "rewards/chosen": 4.0599773951939175, + "rewards/margins": 13.577860695975168, + "rewards/rejected": -9.51788330078125, + "step": 5644 + }, + { + "epoch": 0.5157606212882595, + "grad_norm": 3.703125, + "kl": 0.0, + "learning_rate": 4.767878784912633e-06, + "logits/chosen": 737826406.4, + "logits/rejected": 455442048.0, + "logps/chosen": -364.4513427734375, + "logps/rejected": -389.0320638020833, + "loss": 0.0244, + "rewards/chosen": 3.844879913330078, + "rewards/margins": 12.745259094238282, + "rewards/rejected": -8.900379180908203, + "step": 5645 + }, + { + "epoch": 0.5158519872087711, + "grad_norm": 4.03125, + "kl": 0.0, + "learning_rate": 4.766442544904357e-06, + "logits/chosen": 400735936.0, + "logits/rejected": 433812800.0, + "logps/chosen": -233.21908569335938, + "logps/rejected": -546.4317626953125, + "loss": 0.0288, + "rewards/chosen": 3.0773062705993652, + "rewards/margins": 13.233662128448486, + "rewards/rejected": -10.156355857849121, + "step": 5646 + }, + { + "epoch": 0.5159433531292827, + "grad_norm": 5.65625, + "kl": 7.748439788818359, + "learning_rate": 4.7650063242091195e-06, + "logits/chosen": 698885973.3333334, + "logits/rejected": 915960384.0, + "logps/chosen": -454.8074544270833, + "logps/rejected": -322.4958801269531, + "loss": 0.0357, + "rewards/chosen": 4.242408752441406, + "rewards/margins": 12.985139846801758, + "rewards/rejected": -8.742731094360352, + "step": 5647 + }, + { + "epoch": 0.5160347190497945, + "grad_norm": 6.375, + "kl": 0.0, + "learning_rate": 4.763570122945681e-06, + "logits/chosen": 942076330.6666666, + "logits/rejected": 1124840192.0, + "logps/chosen": -269.6654052734375, + "logps/rejected": -610.9906616210938, + "loss": 0.0469, + "rewards/chosen": 2.948963165283203, + "rewards/margins": 13.134130477905273, + "rewards/rejected": -10.18516731262207, + "step": 5648 + }, + { + "epoch": 0.5161260849703061, + "grad_norm": 2.203125, + "kl": 0.0, + "learning_rate": 4.7621339412328035e-06, + "logits/chosen": 652315136.0, + "logits/rejected": 619477248.0, + "logps/chosen": -539.4734497070312, + "logps/rejected": -716.1986083984375, + "loss": 0.0083, + "rewards/chosen": 4.633770942687988, + "rewards/margins": 12.886567115783691, + "rewards/rejected": -8.252796173095703, + "step": 5649 + }, + { + "epoch": 0.5162174508908177, + "grad_norm": 10.125, + "kl": 0.0, + "learning_rate": 4.760697779189245e-06, + "logits/chosen": 450150195.2, + "logits/rejected": 740877824.0, + "logps/chosen": -258.3609375, + "logps/rejected": -796.6300455729166, + "loss": 0.0616, + "rewards/chosen": 3.768971252441406, + "rewards/margins": 18.078297424316407, + "rewards/rejected": -14.309326171875, + "step": 5650 + }, + { + "epoch": 0.5163088168113293, + "grad_norm": 2.625, + "kl": 0.0, + "learning_rate": 4.759261636933762e-06, + "logits/chosen": 227134784.0, + "logits/rejected": 365886304.0, + "logps/chosen": -184.31085205078125, + "logps/rejected": -358.51153564453125, + "loss": 0.0142, + "rewards/chosen": 4.396175384521484, + "rewards/margins": 11.51101303100586, + "rewards/rejected": -7.114837646484375, + "step": 5651 + }, + { + "epoch": 0.5164001827318411, + "grad_norm": 0.58984375, + "kl": 0.0, + "learning_rate": 4.757825514585113e-06, + "logits/chosen": 478453674.6666667, + "logits/rejected": 864570572.8, + "logps/chosen": -309.53574625651044, + "logps/rejected": -476.9697265625, + "loss": 0.0033, + "rewards/chosen": 5.021359761555989, + "rewards/margins": 13.846869405110677, + "rewards/rejected": -8.825509643554687, + "step": 5652 + }, + { + "epoch": 0.5164915486523527, + "grad_norm": 2.84375, + "kl": 0.0, + "learning_rate": 4.756389412262048e-06, + "logits/chosen": 522740172.8, + "logits/rejected": 1021506389.3333334, + "logps/chosen": -228.2701416015625, + "logps/rejected": -712.19140625, + "loss": 0.0164, + "rewards/chosen": 4.23901596069336, + "rewards/margins": 14.737001164754233, + "rewards/rejected": -10.497985204060873, + "step": 5653 + }, + { + "epoch": 0.5165829145728643, + "grad_norm": 2.5625, + "kl": 0.0, + "learning_rate": 4.754953330083323e-06, + "logits/chosen": 323023445.3333333, + "logits/rejected": 419515264.0, + "logps/chosen": -142.30673217773438, + "logps/rejected": -587.292138671875, + "loss": 0.0168, + "rewards/chosen": 3.1934407552083335, + "rewards/margins": 13.880405934651693, + "rewards/rejected": -10.686965179443359, + "step": 5654 + }, + { + "epoch": 0.5166742804933759, + "grad_norm": 1.5546875, + "kl": 0.0, + "learning_rate": 4.753517268167686e-06, + "logits/chosen": 623147136.0, + "logits/rejected": 464245717.3333333, + "logps/chosen": -514.053955078125, + "logps/rejected": -544.3677978515625, + "loss": 0.0064, + "rewards/chosen": 3.9253053665161133, + "rewards/margins": 14.907928148905436, + "rewards/rejected": -10.982622782389322, + "step": 5655 + }, + { + "epoch": 0.5167656464138877, + "grad_norm": 1.203125, + "kl": 0.0, + "learning_rate": 4.752081226633888e-06, + "logits/chosen": 563106816.0, + "logits/rejected": 288135978.6666667, + "logps/chosen": -461.4248046875, + "logps/rejected": -443.0233561197917, + "loss": 0.0064, + "rewards/chosen": 3.696582078933716, + "rewards/margins": 13.610774914423624, + "rewards/rejected": -9.914192835489908, + "step": 5656 + }, + { + "epoch": 0.5168570123343993, + "grad_norm": 5.0, + "kl": 0.0, + "learning_rate": 4.750645205600675e-06, + "logits/chosen": 723927424.0, + "logits/rejected": 985844992.0, + "logps/chosen": -251.37167358398438, + "logps/rejected": -210.79330444335938, + "loss": 0.0407, + "rewards/chosen": 2.651766538619995, + "rewards/margins": 9.07816767692566, + "rewards/rejected": -6.426401138305664, + "step": 5657 + }, + { + "epoch": 0.5169483782549109, + "grad_norm": 5.84375, + "kl": 0.0, + "learning_rate": 4.749209205186797e-06, + "logits/chosen": 536828458.6666667, + "logits/rejected": 1007241318.4, + "logps/chosen": -235.68570963541666, + "logps/rejected": -523.19814453125, + "loss": 0.1127, + "rewards/chosen": 3.0707953770955405, + "rewards/margins": 11.538211758931478, + "rewards/rejected": -8.467416381835937, + "step": 5658 + }, + { + "epoch": 0.5170397441754225, + "grad_norm": 22.125, + "kl": 0.0, + "learning_rate": 4.74777322551099e-06, + "logits/chosen": 608636480.0, + "logits/rejected": 483008384.0, + "logps/chosen": -271.73870849609375, + "logps/rejected": -436.99114990234375, + "loss": 0.0359, + "rewards/chosen": 3.553539752960205, + "rewards/margins": 10.8811616897583, + "rewards/rejected": -7.327621936798096, + "step": 5659 + }, + { + "epoch": 0.5171311100959343, + "grad_norm": 2.125, + "kl": 0.0, + "learning_rate": 4.746337266692001e-06, + "logits/chosen": 495318220.8, + "logits/rejected": 426307840.0, + "logps/chosen": -387.0114013671875, + "logps/rejected": -272.0130208333333, + "loss": 0.0157, + "rewards/chosen": 4.096027374267578, + "rewards/margins": 11.130199432373047, + "rewards/rejected": -7.034172058105469, + "step": 5660 + }, + { + "epoch": 0.5172224760164459, + "grad_norm": 3.078125, + "kl": 0.0, + "learning_rate": 4.744901328848569e-06, + "logits/chosen": 530457952.0, + "logits/rejected": 369786048.0, + "logps/chosen": -294.384765625, + "logps/rejected": -404.6531982421875, + "loss": 0.0154, + "rewards/chosen": 4.101555347442627, + "rewards/margins": 12.431734561920166, + "rewards/rejected": -8.330179214477539, + "step": 5661 + }, + { + "epoch": 0.5173138419369575, + "grad_norm": 2.40625, + "kl": 0.0, + "learning_rate": 4.743465412099436e-06, + "logits/chosen": 735087667.2, + "logits/rejected": 743012096.0, + "logps/chosen": -194.810595703125, + "logps/rejected": -346.0623779296875, + "loss": 0.0186, + "rewards/chosen": 3.7293106079101563, + "rewards/margins": 10.898770904541015, + "rewards/rejected": -7.169460296630859, + "step": 5662 + }, + { + "epoch": 0.5174052078574691, + "grad_norm": 1.2578125, + "kl": 0.0, + "learning_rate": 4.742029516563334e-06, + "logits/chosen": 362604885.3333333, + "logits/rejected": 343727296.0, + "logps/chosen": -282.18096923828125, + "logps/rejected": -307.2955627441406, + "loss": 0.0086, + "rewards/chosen": 5.094578742980957, + "rewards/margins": 12.620461463928223, + "rewards/rejected": -7.525882720947266, + "step": 5663 + }, + { + "epoch": 0.5174965737779809, + "grad_norm": 4.1875, + "kl": 0.0, + "learning_rate": 4.740593642359003e-06, + "logits/chosen": 513389792.0, + "logits/rejected": 490519253.3333333, + "logps/chosen": -367.5469970703125, + "logps/rejected": -504.5738932291667, + "loss": 0.0187, + "rewards/chosen": 2.651620626449585, + "rewards/margins": 11.681031306584677, + "rewards/rejected": -9.029410680135092, + "step": 5664 + }, + { + "epoch": 0.5175879396984925, + "grad_norm": 2.375, + "kl": 0.0, + "learning_rate": 4.739157789605172e-06, + "logits/chosen": 453188640.0, + "logits/rejected": 591639296.0, + "logps/chosen": -366.0203857421875, + "logps/rejected": -556.8348388671875, + "loss": 0.0178, + "rewards/chosen": 3.573599338531494, + "rewards/margins": 12.689587116241455, + "rewards/rejected": -9.115987777709961, + "step": 5665 + }, + { + "epoch": 0.5176793056190041, + "grad_norm": 0.1806640625, + "kl": 0.0, + "learning_rate": 4.737721958420577e-06, + "logits/chosen": 524650965.3333333, + "logits/rejected": 428494028.8, + "logps/chosen": -548.1610514322916, + "logps/rejected": -505.10458984375, + "loss": 0.001, + "rewards/chosen": 6.589422225952148, + "rewards/margins": 14.990795516967774, + "rewards/rejected": -8.401373291015625, + "step": 5666 + }, + { + "epoch": 0.5177706715395157, + "grad_norm": 3.1875, + "kl": 0.0, + "learning_rate": 4.736286148923945e-06, + "logits/chosen": 685099818.6666666, + "logits/rejected": 775022976.0, + "logps/chosen": -310.8887939453125, + "logps/rejected": -352.09185791015625, + "loss": 0.0229, + "rewards/chosen": 3.947120030721029, + "rewards/margins": 11.842027505238852, + "rewards/rejected": -7.894907474517822, + "step": 5667 + }, + { + "epoch": 0.5178620374600275, + "grad_norm": 11.4375, + "kl": 0.0, + "learning_rate": 4.7348503612340056e-06, + "logits/chosen": 560676949.3333334, + "logits/rejected": 455093760.0, + "logps/chosen": -297.83770751953125, + "logps/rejected": -337.9249267578125, + "loss": 0.1246, + "rewards/chosen": 3.1390679677327475, + "rewards/margins": 12.537896474202475, + "rewards/rejected": -9.398828506469727, + "step": 5668 + }, + { + "epoch": 0.5179534033805391, + "grad_norm": 3.25, + "kl": 0.0, + "learning_rate": 4.733414595469484e-06, + "logits/chosen": 339551744.0, + "logits/rejected": 438487456.0, + "logps/chosen": -279.7442932128906, + "logps/rejected": -405.35552978515625, + "loss": 0.0167, + "rewards/chosen": 3.7210474014282227, + "rewards/margins": 12.027960777282715, + "rewards/rejected": -8.306913375854492, + "step": 5669 + }, + { + "epoch": 0.5180447693010507, + "grad_norm": 5.90625, + "kl": 0.0, + "learning_rate": 4.731978851749105e-06, + "logits/chosen": 636684800.0, + "logits/rejected": 832185941.3333334, + "logps/chosen": -360.380517578125, + "logps/rejected": -683.1090494791666, + "loss": 0.0344, + "rewards/chosen": 3.6533069610595703, + "rewards/margins": 14.744084040323893, + "rewards/rejected": -11.090777079264322, + "step": 5670 + }, + { + "epoch": 0.5181361352215623, + "grad_norm": 7.0, + "kl": 0.0, + "learning_rate": 4.730543130191594e-06, + "logits/chosen": 350264115.2, + "logits/rejected": 1053186730.6666666, + "logps/chosen": -156.794677734375, + "logps/rejected": -182.42167154947916, + "loss": 0.0257, + "rewards/chosen": 3.761433792114258, + "rewards/margins": 9.46893933614095, + "rewards/rejected": -5.707505544026692, + "step": 5671 + }, + { + "epoch": 0.518227501142074, + "grad_norm": 4.46875, + "kl": 0.0, + "learning_rate": 4.729107430915667e-06, + "logits/chosen": 444824678.4, + "logits/rejected": 437320960.0, + "logps/chosen": -407.574267578125, + "logps/rejected": -511.2410074869792, + "loss": 0.0244, + "rewards/chosen": 4.050424575805664, + "rewards/margins": 12.507220586140951, + "rewards/rejected": -8.456796010335287, + "step": 5672 + }, + { + "epoch": 0.5183188670625857, + "grad_norm": 1.328125, + "kl": 0.0, + "learning_rate": 4.727671754040047e-06, + "logits/chosen": 570898073.6, + "logits/rejected": 477227093.3333333, + "logps/chosen": -225.1815673828125, + "logps/rejected": -400.8172200520833, + "loss": 0.0067, + "rewards/chosen": 4.679173278808594, + "rewards/margins": 13.601199595133462, + "rewards/rejected": -8.92202631632487, + "step": 5673 + }, + { + "epoch": 0.5184102329830973, + "grad_norm": 11.9375, + "kl": 0.0, + "learning_rate": 4.726236099683449e-06, + "logits/chosen": 398934937.6, + "logits/rejected": 753624064.0, + "logps/chosen": -355.7411865234375, + "logps/rejected": -365.876953125, + "loss": 0.0285, + "rewards/chosen": 3.738709259033203, + "rewards/margins": 11.72218271891276, + "rewards/rejected": -7.983473459879558, + "step": 5674 + }, + { + "epoch": 0.5185015989036089, + "grad_norm": 1.3203125, + "kl": 0.0, + "learning_rate": 4.7248004679645894e-06, + "logits/chosen": 1041811328.0, + "logits/rejected": 757106261.3333334, + "logps/chosen": -252.59494018554688, + "logps/rejected": -415.2671305338542, + "loss": 0.0063, + "rewards/chosen": 4.149543762207031, + "rewards/margins": 13.286861419677734, + "rewards/rejected": -9.137317657470703, + "step": 5675 + }, + { + "epoch": 0.5185929648241207, + "grad_norm": 3.4375, + "kl": 0.0, + "learning_rate": 4.723364859002181e-06, + "logits/chosen": 1336897621.3333333, + "logits/rejected": 499054387.2, + "logps/chosen": -455.3724772135417, + "logps/rejected": -405.7844482421875, + "loss": 0.0139, + "rewards/chosen": 3.84636656443278, + "rewards/margins": 13.423371569315592, + "rewards/rejected": -9.577005004882812, + "step": 5676 + }, + { + "epoch": 0.5186843307446323, + "grad_norm": 2.984375, + "kl": 0.0, + "learning_rate": 4.721929272914937e-06, + "logits/chosen": 378626157.71428573, + "logits/rejected": 26777380.0, + "logps/chosen": -227.14317103794642, + "logps/rejected": -686.4705810546875, + "loss": 0.0326, + "rewards/chosen": 3.9773761204310825, + "rewards/margins": 12.470729964120046, + "rewards/rejected": -8.493353843688965, + "step": 5677 + }, + { + "epoch": 0.5187756966651439, + "grad_norm": 51.5, + "kl": 0.0, + "learning_rate": 4.720493709821563e-06, + "logits/chosen": 342248320.0, + "logits/rejected": 529967820.8, + "logps/chosen": -240.2081502278646, + "logps/rejected": -650.74326171875, + "loss": 0.0481, + "rewards/chosen": 3.3341242472330728, + "rewards/margins": 12.783191935221353, + "rewards/rejected": -9.449067687988281, + "step": 5678 + }, + { + "epoch": 0.5188670625856555, + "grad_norm": 36.25, + "kl": 0.0, + "learning_rate": 4.719058169840773e-06, + "logits/chosen": 452491968.0, + "logits/rejected": 417947328.0, + "logps/chosen": -256.3432312011719, + "logps/rejected": -567.020263671875, + "loss": 0.0878, + "rewards/chosen": 2.811507225036621, + "rewards/margins": 13.231450080871582, + "rewards/rejected": -10.419942855834961, + "step": 5679 + }, + { + "epoch": 0.5189584285061672, + "grad_norm": 3.203125, + "kl": 0.0, + "learning_rate": 4.717622653091268e-06, + "logits/chosen": 561529636.5714285, + "logits/rejected": 349699136.0, + "logps/chosen": -344.88595145089283, + "logps/rejected": -260.6634521484375, + "loss": 0.0261, + "rewards/chosen": 3.6480816432407925, + "rewards/margins": 11.978360039847239, + "rewards/rejected": -8.330278396606445, + "step": 5680 + }, + { + "epoch": 0.5190497944266789, + "grad_norm": 3.484375, + "kl": 0.0, + "learning_rate": 4.716187159691753e-06, + "logits/chosen": 457560268.8, + "logits/rejected": 763550037.3333334, + "logps/chosen": -237.819091796875, + "logps/rejected": -546.8642171223959, + "loss": 0.0235, + "rewards/chosen": 3.720317840576172, + "rewards/margins": 13.367192586263021, + "rewards/rejected": -9.64687474568685, + "step": 5681 + }, + { + "epoch": 0.5191411603471905, + "grad_norm": 2.1875, + "kl": 0.0, + "learning_rate": 4.71475168976093e-06, + "logits/chosen": 621406003.2, + "logits/rejected": 469768448.0, + "logps/chosen": -381.0532958984375, + "logps/rejected": -509.9861246744792, + "loss": 0.0148, + "rewards/chosen": 3.867392730712891, + "rewards/margins": 11.260645421346029, + "rewards/rejected": -7.393252690633138, + "step": 5682 + }, + { + "epoch": 0.5192325262677021, + "grad_norm": 4.90625, + "kl": 0.0, + "learning_rate": 4.713316243417499e-06, + "logits/chosen": 485883904.0, + "logits/rejected": 529304661.3333333, + "logps/chosen": -321.81376953125, + "logps/rejected": -312.22898356119794, + "loss": 0.1331, + "rewards/chosen": 2.4997074127197267, + "rewards/margins": 11.177713902791343, + "rewards/rejected": -8.678006490071615, + "step": 5683 + }, + { + "epoch": 0.5193238921882138, + "grad_norm": 2.140625, + "kl": 0.0, + "learning_rate": 4.71188082078016e-06, + "logits/chosen": 299436928.0, + "logits/rejected": 904031829.3333334, + "logps/chosen": -504.82537841796875, + "logps/rejected": -478.0335286458333, + "loss": 0.0069, + "rewards/chosen": 4.110617160797119, + "rewards/margins": 12.502103010813395, + "rewards/rejected": -8.391485850016275, + "step": 5684 + }, + { + "epoch": 0.5194152581087255, + "grad_norm": 2.140625, + "kl": 0.0, + "learning_rate": 4.710445421967608e-06, + "logits/chosen": 473618560.0, + "logits/rejected": 608436121.6, + "logps/chosen": -252.82305908203125, + "logps/rejected": -537.941552734375, + "loss": 0.0147, + "rewards/chosen": 3.3134047190348306, + "rewards/margins": 13.986884943644204, + "rewards/rejected": -10.673480224609374, + "step": 5685 + }, + { + "epoch": 0.5195066240292371, + "grad_norm": 2.078125, + "kl": 0.0, + "learning_rate": 4.709010047098535e-06, + "logits/chosen": 509275872.0, + "logits/rejected": 423716778.6666667, + "logps/chosen": -391.8536682128906, + "logps/rejected": -457.9243977864583, + "loss": 0.0082, + "rewards/chosen": 3.8782944679260254, + "rewards/margins": 11.875733534495037, + "rewards/rejected": -7.997439066569011, + "step": 5686 + }, + { + "epoch": 0.5195979899497487, + "grad_norm": 2.53125, + "kl": 0.0, + "learning_rate": 4.707574696291636e-06, + "logits/chosen": 945730867.2, + "logits/rejected": 541488298.6666666, + "logps/chosen": -510.891748046875, + "logps/rejected": -739.4794921875, + "loss": 0.0124, + "rewards/chosen": 4.235365676879883, + "rewards/margins": 14.737537511189778, + "rewards/rejected": -10.502171834309896, + "step": 5687 + }, + { + "epoch": 0.5196893558702604, + "grad_norm": 3.515625, + "kl": 0.0, + "learning_rate": 4.706139369665601e-06, + "logits/chosen": 638045269.3333334, + "logits/rejected": 390269440.0, + "logps/chosen": -293.35670979817706, + "logps/rejected": -426.48955078125, + "loss": 0.0182, + "rewards/chosen": 3.368776003519694, + "rewards/margins": 12.444860903422038, + "rewards/rejected": -9.076084899902344, + "step": 5688 + }, + { + "epoch": 0.5197807217907721, + "grad_norm": 5.34375, + "kl": 0.0, + "learning_rate": 4.704704067339115e-06, + "logits/chosen": 589397145.6, + "logits/rejected": 1236503040.0, + "logps/chosen": -279.726708984375, + "logps/rejected": -684.0113932291666, + "loss": 0.0354, + "rewards/chosen": 3.192348098754883, + "rewards/margins": 10.592072423299154, + "rewards/rejected": -7.3997243245442705, + "step": 5689 + }, + { + "epoch": 0.5198720877112837, + "grad_norm": 1.9765625, + "kl": 0.0, + "learning_rate": 4.703268789430868e-06, + "logits/chosen": 494962368.0, + "logits/rejected": 411295200.0, + "logps/chosen": -469.1551513671875, + "logps/rejected": -618.5247802734375, + "loss": 0.0109, + "rewards/chosen": 4.054131507873535, + "rewards/margins": 16.765830039978027, + "rewards/rejected": -12.711698532104492, + "step": 5690 + }, + { + "epoch": 0.5199634536317953, + "grad_norm": 0.023681640625, + "kl": 0.0, + "learning_rate": 4.701833536059543e-06, + "logits/rejected": 863180736.0, + "logps/rejected": -533.5278930664062, + "loss": 0.0001, + "rewards/rejected": -10.048439025878906, + "step": 5691 + }, + { + "epoch": 0.520054819552307, + "grad_norm": 5.34375, + "kl": 0.0, + "learning_rate": 4.700398307343822e-06, + "logits/chosen": 663272192.0, + "logits/rejected": 661985536.0, + "logps/chosen": -482.262646484375, + "logps/rejected": -386.6398518880208, + "loss": 0.0312, + "rewards/chosen": 3.3429847717285157, + "rewards/margins": 11.92051912943522, + "rewards/rejected": -8.577534357706705, + "step": 5692 + }, + { + "epoch": 0.5201461854728187, + "grad_norm": 4.46875, + "kl": 0.0, + "learning_rate": 4.698963103402385e-06, + "logits/chosen": 515634112.0, + "logits/rejected": 1162038528.0, + "logps/chosen": -335.12750244140625, + "logps/rejected": -596.8629150390625, + "loss": 0.0174, + "rewards/chosen": 3.576167583465576, + "rewards/margins": 11.61592149734497, + "rewards/rejected": -8.039753913879395, + "step": 5693 + }, + { + "epoch": 0.5202375513933303, + "grad_norm": 71.5, + "kl": 0.0, + "learning_rate": 4.69752792435391e-06, + "logits/chosen": 619031296.0, + "logits/rejected": 463175884.8, + "logps/chosen": -298.6261800130208, + "logps/rejected": -384.9060791015625, + "loss": 0.0442, + "rewards/chosen": 5.191766738891602, + "rewards/margins": 12.557056045532226, + "rewards/rejected": -7.365289306640625, + "step": 5694 + }, + { + "epoch": 0.5203289173138419, + "grad_norm": 3.390625, + "kl": 0.0, + "learning_rate": 4.696092770317073e-06, + "logits/chosen": 563669888.0, + "logits/rejected": 255568128.0, + "logps/chosen": -398.2447102864583, + "logps/rejected": -309.3771667480469, + "loss": 0.0177, + "rewards/chosen": 4.249352137247722, + "rewards/margins": 13.740714708964031, + "rewards/rejected": -9.491362571716309, + "step": 5695 + }, + { + "epoch": 0.5204202832343536, + "grad_norm": 2.78125, + "kl": 0.0, + "learning_rate": 4.6946576414105485e-06, + "logits/chosen": 338768490.6666667, + "logits/rejected": 234724992.0, + "logps/chosen": -226.50948079427084, + "logps/rejected": -371.287841796875, + "loss": 0.0199, + "rewards/chosen": 4.114743550618489, + "rewards/margins": 14.408247311909992, + "rewards/rejected": -10.293503761291504, + "step": 5696 + }, + { + "epoch": 0.5205116491548653, + "grad_norm": 1.1875, + "kl": 0.0, + "learning_rate": 4.693222537753007e-06, + "logits/chosen": 702029888.0, + "logits/rejected": 875590070.8571428, + "logps/chosen": -345.6806640625, + "logps/rejected": -389.5059291294643, + "loss": 0.0057, + "rewards/chosen": 3.52490234375, + "rewards/margins": 12.807965959821429, + "rewards/rejected": -9.283063616071429, + "step": 5697 + }, + { + "epoch": 0.5206030150753769, + "grad_norm": 0.70703125, + "kl": 0.0, + "learning_rate": 4.691787459463121e-06, + "logits/chosen": 648603008.0, + "logits/rejected": 536438930.28571427, + "logps/chosen": -74.1175537109375, + "logps/rejected": -454.87001255580356, + "loss": 0.004, + "rewards/chosen": 3.491290330886841, + "rewards/margins": 12.901736497879028, + "rewards/rejected": -9.410446166992188, + "step": 5698 + }, + { + "epoch": 0.5206943809958885, + "grad_norm": 14.3125, + "kl": 0.0, + "learning_rate": 4.690352406659556e-06, + "logits/chosen": 748582144.0, + "logits/rejected": 577014528.0, + "logps/chosen": -509.5768127441406, + "logps/rejected": -477.227783203125, + "loss": 0.127, + "rewards/chosen": 2.532012939453125, + "rewards/margins": 10.76550547281901, + "rewards/rejected": -8.233492533365885, + "step": 5699 + }, + { + "epoch": 0.5207857469164002, + "grad_norm": 2.9375, + "kl": 0.0, + "learning_rate": 4.688917379460978e-06, + "logits/chosen": 1382100224.0, + "logits/rejected": 650309504.0, + "logps/chosen": -537.5823974609375, + "logps/rejected": -514.82373046875, + "loss": 0.0178, + "rewards/chosen": 3.4482154846191406, + "rewards/margins": 12.60671329498291, + "rewards/rejected": -9.15849781036377, + "step": 5700 + }, + { + "epoch": 0.5208771128369118, + "grad_norm": 7.21875, + "kl": 0.0, + "learning_rate": 4.68748237798605e-06, + "logits/chosen": 420840908.8, + "logits/rejected": 427934208.0, + "logps/chosen": -364.301025390625, + "logps/rejected": -471.376708984375, + "loss": 0.1405, + "rewards/chosen": 2.431306838989258, + "rewards/margins": 11.079394022623697, + "rewards/rejected": -8.64808718363444, + "step": 5701 + }, + { + "epoch": 0.5209684787574235, + "grad_norm": 21.25, + "kl": 0.0, + "learning_rate": 4.686047402353433e-06, + "logits/chosen": 927591014.4, + "logits/rejected": 535929344.0, + "logps/chosen": -307.1505126953125, + "logps/rejected": -510.0823974609375, + "loss": 0.1031, + "rewards/chosen": 2.566383934020996, + "rewards/margins": 13.29776185353597, + "rewards/rejected": -10.731377919514975, + "step": 5702 + }, + { + "epoch": 0.5210598446779351, + "grad_norm": 2.40625, + "kl": 0.0, + "learning_rate": 4.6846124526817885e-06, + "logits/chosen": 481472341.3333333, + "logits/rejected": 630159424.0, + "logps/chosen": -279.05491129557294, + "logps/rejected": -659.4625244140625, + "loss": 0.0175, + "rewards/chosen": 4.208978335062663, + "rewards/margins": 13.684716860453289, + "rewards/rejected": -9.475738525390625, + "step": 5703 + }, + { + "epoch": 0.5211512105984468, + "grad_norm": 1.9453125, + "kl": 0.0, + "learning_rate": 4.6831775290897715e-06, + "logits/chosen": 571368192.0, + "logits/rejected": 594477056.0, + "logps/chosen": -425.86065673828125, + "logps/rejected": -470.8998209635417, + "loss": 0.0092, + "rewards/chosen": 3.340406894683838, + "rewards/margins": 12.507352670033773, + "rewards/rejected": -9.166945775349935, + "step": 5704 + }, + { + "epoch": 0.5212425765189584, + "grad_norm": 3.515625, + "kl": 0.0, + "learning_rate": 4.681742631696037e-06, + "logits/chosen": 545966848.0, + "logits/rejected": 550922547.2, + "logps/chosen": -448.8077799479167, + "logps/rejected": -453.29638671875, + "loss": 0.0193, + "rewards/chosen": 3.163311004638672, + "rewards/margins": 11.865180206298827, + "rewards/rejected": -8.701869201660156, + "step": 5705 + }, + { + "epoch": 0.5213339424394701, + "grad_norm": 2.65625, + "kl": 0.0, + "learning_rate": 4.680307760619238e-06, + "logits/chosen": 431074713.6, + "logits/rejected": 465442730.6666667, + "logps/chosen": -376.469873046875, + "logps/rejected": -562.6373697916666, + "loss": 0.0161, + "rewards/chosen": 4.006452941894532, + "rewards/margins": 12.162351989746094, + "rewards/rejected": -8.155899047851562, + "step": 5706 + }, + { + "epoch": 0.5214253083599817, + "grad_norm": 0.953125, + "kl": 0.0, + "learning_rate": 4.678872915978027e-06, + "logits/chosen": 407083008.0, + "logits/rejected": 599061504.0, + "logps/chosen": -223.33707682291666, + "logps/rejected": -461.9060546875, + "loss": 0.1292, + "rewards/chosen": 1.3199939727783203, + "rewards/margins": 10.1216609954834, + "rewards/rejected": -8.801667022705079, + "step": 5707 + }, + { + "epoch": 0.5215166742804934, + "grad_norm": 4.875, + "kl": 0.0, + "learning_rate": 4.677438097891048e-06, + "logits/chosen": 343001728.0, + "logits/rejected": 354265292.8, + "logps/chosen": -218.2613321940104, + "logps/rejected": -376.4541015625, + "loss": 0.0254, + "rewards/chosen": 3.4215873082478843, + "rewards/margins": 12.279371579488119, + "rewards/rejected": -8.857784271240234, + "step": 5708 + }, + { + "epoch": 0.521608040201005, + "grad_norm": 6.28125, + "kl": 0.0, + "learning_rate": 4.676003306476951e-06, + "logits/chosen": 740059989.3333334, + "logits/rejected": 426382822.4, + "logps/chosen": -430.9206136067708, + "logps/rejected": -447.83525390625, + "loss": 0.0307, + "rewards/chosen": 2.8354196548461914, + "rewards/margins": 12.609397315979004, + "rewards/rejected": -9.773977661132813, + "step": 5709 + }, + { + "epoch": 0.5216994061215167, + "grad_norm": 1.1484375, + "kl": 0.0, + "learning_rate": 4.6745685418543795e-06, + "logits/chosen": 1089005312.0, + "logits/rejected": 706155264.0, + "logps/chosen": -348.2926940917969, + "logps/rejected": -669.0218912760416, + "loss": 0.0057, + "rewards/chosen": 4.2028913497924805, + "rewards/margins": 13.092523256937662, + "rewards/rejected": -8.889631907145182, + "step": 5710 + }, + { + "epoch": 0.5217907720420283, + "grad_norm": 0.984375, + "kl": 0.0, + "learning_rate": 4.673133804141973e-06, + "logits/chosen": 466106624.0, + "logits/rejected": 476890197.3333333, + "logps/chosen": -329.11749267578125, + "logps/rejected": -511.7232259114583, + "loss": 0.0032, + "rewards/chosen": 4.700146675109863, + "rewards/margins": 13.72952938079834, + "rewards/rejected": -9.029382705688477, + "step": 5711 + }, + { + "epoch": 0.52188213796254, + "grad_norm": 4.59375, + "kl": 0.0, + "learning_rate": 4.671699093458374e-06, + "logits/rejected": 634796864.0, + "logps/rejected": -596.4078979492188, + "loss": 0.0041, + "rewards/rejected": -8.843242645263672, + "step": 5712 + }, + { + "epoch": 0.5219735038830516, + "grad_norm": 0.65625, + "kl": 0.0, + "learning_rate": 4.670264409922218e-06, + "logits/chosen": 665042240.0, + "logits/rejected": 499252821.3333333, + "logps/chosen": -378.8841552734375, + "logps/rejected": -403.43212890625, + "loss": 0.0036, + "rewards/chosen": 4.714974880218506, + "rewards/margins": 12.371996720631916, + "rewards/rejected": -7.657021840413411, + "step": 5713 + }, + { + "epoch": 0.5220648698035633, + "grad_norm": 3.640625, + "kl": 0.0, + "learning_rate": 4.668829753652139e-06, + "logits/chosen": 731811669.3333334, + "logits/rejected": 398837376.0, + "logps/chosen": -277.57763671875, + "logps/rejected": -611.307861328125, + "loss": 0.0178, + "rewards/chosen": 4.424081802368164, + "rewards/margins": 16.042627334594727, + "rewards/rejected": -11.618545532226562, + "step": 5714 + }, + { + "epoch": 0.5221562357240749, + "grad_norm": 0.91796875, + "kl": 0.0, + "learning_rate": 4.667395124766772e-06, + "logits/chosen": 344612266.6666667, + "logits/rejected": 464177356.8, + "logps/chosen": -239.031005859375, + "logps/rejected": -372.37998046875, + "loss": 0.0043, + "rewards/chosen": 5.194901784261067, + "rewards/margins": 14.12602055867513, + "rewards/rejected": -8.931118774414063, + "step": 5715 + }, + { + "epoch": 0.5222476016445866, + "grad_norm": 1.7734375, + "kl": 0.0, + "learning_rate": 4.665960523384745e-06, + "logits/chosen": 380664928.0, + "logits/rejected": 468584917.3333333, + "logps/chosen": -373.7220764160156, + "logps/rejected": -603.0377604166666, + "loss": 0.0062, + "rewards/chosen": 4.089118957519531, + "rewards/margins": 14.478080749511719, + "rewards/rejected": -10.388961791992188, + "step": 5716 + }, + { + "epoch": 0.5223389675650982, + "grad_norm": 1.3359375, + "kl": 0.0, + "learning_rate": 4.664525949624689e-06, + "logits/chosen": 779175765.3333334, + "logits/rejected": 669443993.6, + "logps/chosen": -311.18857828776044, + "logps/rejected": -417.2083984375, + "loss": 0.0059, + "rewards/chosen": 4.39817746480306, + "rewards/margins": 14.99131940205892, + "rewards/rejected": -10.59314193725586, + "step": 5717 + }, + { + "epoch": 0.5224303334856099, + "grad_norm": 76.5, + "kl": 0.0, + "learning_rate": 4.663091403605228e-06, + "logits/chosen": 754248192.0, + "logits/rejected": 506416853.3333333, + "logps/chosen": -320.841552734375, + "logps/rejected": -547.3719889322916, + "loss": 0.0748, + "rewards/chosen": 3.744175672531128, + "rewards/margins": 11.045741478602093, + "rewards/rejected": -7.301565806070964, + "step": 5718 + }, + { + "epoch": 0.5225216994061215, + "grad_norm": 1.2578125, + "kl": 0.0, + "learning_rate": 4.661656885444987e-06, + "logits/chosen": 737061546.6666666, + "logits/rejected": 699330048.0, + "logps/chosen": -282.37937418619794, + "logps/rejected": -642.4273071289062, + "loss": 0.0106, + "rewards/chosen": 4.598848978678386, + "rewards/margins": 11.715823809305828, + "rewards/rejected": -7.116974830627441, + "step": 5719 + }, + { + "epoch": 0.5226130653266332, + "grad_norm": 3.375, + "kl": 0.0, + "learning_rate": 4.660222395262588e-06, + "logits/chosen": 356176576.0, + "logits/rejected": 436282752.0, + "logps/chosen": -277.0387268066406, + "logps/rejected": -387.740966796875, + "loss": 0.0227, + "rewards/chosen": 3.52229380607605, + "rewards/margins": 12.663410902023315, + "rewards/rejected": -9.141117095947266, + "step": 5720 + }, + { + "epoch": 0.5227044312471448, + "grad_norm": 2.328125, + "kl": 0.0, + "learning_rate": 4.6587879331766465e-06, + "logits/chosen": 335135539.2, + "logits/rejected": 621147818.6666666, + "logps/chosen": -255.156982421875, + "logps/rejected": -608.6920572916666, + "loss": 0.0152, + "rewards/chosen": 4.193257141113281, + "rewards/margins": 13.365184275309243, + "rewards/rejected": -9.171927134195963, + "step": 5721 + }, + { + "epoch": 0.5227957971676565, + "grad_norm": 2.0, + "kl": 0.0, + "learning_rate": 4.657353499305782e-06, + "logits/chosen": 696052864.0, + "logits/rejected": 438143328.0, + "logps/chosen": -177.930419921875, + "logps/rejected": -384.4970397949219, + "loss": 0.0154, + "rewards/chosen": 3.884850263595581, + "rewards/margins": 13.569174528121948, + "rewards/rejected": -9.684324264526367, + "step": 5722 + }, + { + "epoch": 0.5228871630881681, + "grad_norm": 3.609375, + "kl": 0.0, + "learning_rate": 4.655919093768608e-06, + "logits/chosen": 511823552.0, + "logits/rejected": 238616224.0, + "logps/chosen": -359.0680847167969, + "logps/rejected": -354.6651611328125, + "loss": 0.0241, + "rewards/chosen": 3.001307725906372, + "rewards/margins": 11.616830110549927, + "rewards/rejected": -8.615522384643555, + "step": 5723 + }, + { + "epoch": 0.5229785290086798, + "grad_norm": 2.8125, + "kl": 0.0, + "learning_rate": 4.654484716683739e-06, + "logits/chosen": 500140288.0, + "logits/rejected": 712712704.0, + "logps/chosen": -443.19912109375, + "logps/rejected": -378.3069661458333, + "loss": 0.0177, + "rewards/chosen": 3.7140472412109373, + "rewards/margins": 12.245847829182942, + "rewards/rejected": -8.531800587972006, + "step": 5724 + }, + { + "epoch": 0.5230698949291914, + "grad_norm": 2.4375, + "kl": 0.0, + "learning_rate": 4.65305036816978e-06, + "logits/chosen": 769631402.6666666, + "logits/rejected": 1025741721.6, + "logps/chosen": -365.3037923177083, + "logps/rejected": -676.5041015625, + "loss": 0.0112, + "rewards/chosen": 3.7793992360432944, + "rewards/margins": 13.65922711690267, + "rewards/rejected": -9.879827880859375, + "step": 5725 + }, + { + "epoch": 0.523161260849703, + "grad_norm": 1.0859375, + "kl": 0.0, + "learning_rate": 4.651616048345343e-06, + "logits/chosen": 454364512.0, + "logits/rejected": 288322496.0, + "logps/chosen": -243.863037109375, + "logps/rejected": -409.8579915364583, + "loss": 0.0042, + "rewards/chosen": 4.467663764953613, + "rewards/margins": 12.57957935333252, + "rewards/rejected": -8.111915588378906, + "step": 5726 + }, + { + "epoch": 0.5232526267702147, + "grad_norm": 2.6875, + "kl": 0.0, + "learning_rate": 4.6501817573290305e-06, + "logits/chosen": 531706368.0, + "logits/rejected": 318803776.0, + "logps/chosen": -470.477734375, + "logps/rejected": -315.36362711588544, + "loss": 0.0175, + "rewards/chosen": 3.7956520080566407, + "rewards/margins": 11.18704948425293, + "rewards/rejected": -7.391397476196289, + "step": 5727 + }, + { + "epoch": 0.5233439926907264, + "grad_norm": 2.671875, + "kl": 0.0, + "learning_rate": 4.648747495239445e-06, + "logits/chosen": 413475635.2, + "logits/rejected": 408339114.6666667, + "logps/chosen": -279.7393310546875, + "logps/rejected": -505.1405843098958, + "loss": 0.0192, + "rewards/chosen": 3.768946075439453, + "rewards/margins": 13.30182367960612, + "rewards/rejected": -9.532877604166666, + "step": 5728 + }, + { + "epoch": 0.523435358611238, + "grad_norm": 9.5, + "kl": 0.0, + "learning_rate": 4.647313262195188e-06, + "logits/chosen": 630321152.0, + "logits/rejected": 481331520.0, + "logps/chosen": -539.8316650390625, + "logps/rejected": -570.7146606445312, + "loss": 0.0422, + "rewards/chosen": 2.937605381011963, + "rewards/margins": 12.595932483673096, + "rewards/rejected": -9.658327102661133, + "step": 5729 + }, + { + "epoch": 0.5235267245317496, + "grad_norm": 1.4375, + "kl": 0.0, + "learning_rate": 4.6458790583148565e-06, + "logits/chosen": 879880704.0, + "logits/rejected": 493399040.0, + "logps/chosen": -458.8533630371094, + "logps/rejected": -529.9913940429688, + "loss": 0.0079, + "rewards/chosen": 4.253204345703125, + "rewards/margins": 14.917261123657227, + "rewards/rejected": -10.664056777954102, + "step": 5730 + }, + { + "epoch": 0.5236180904522613, + "grad_norm": 3.828125, + "kl": 0.0, + "learning_rate": 4.644444883717045e-06, + "logits/chosen": 567204915.2, + "logits/rejected": 558285653.3333334, + "logps/chosen": -191.00181884765624, + "logps/rejected": -529.8121744791666, + "loss": 0.0246, + "rewards/chosen": 3.6236690521240233, + "rewards/margins": 12.130927149454752, + "rewards/rejected": -8.507258097330729, + "step": 5731 + }, + { + "epoch": 0.523709456372773, + "grad_norm": 28.0, + "kl": 0.0, + "learning_rate": 4.643010738520348e-06, + "logits/chosen": 519779456.0, + "logits/rejected": 601721728.0, + "logps/chosen": -319.220703125, + "logps/rejected": -587.9239501953125, + "loss": 0.0303, + "rewards/chosen": 2.9476120471954346, + "rewards/margins": 12.830569505691528, + "rewards/rejected": -9.882957458496094, + "step": 5732 + }, + { + "epoch": 0.5238008222932846, + "grad_norm": 1.21875, + "kl": 0.0, + "learning_rate": 4.641576622843355e-06, + "logits/chosen": 583931392.0, + "logits/rejected": 378505984.0, + "logps/chosen": -391.98687744140625, + "logps/rejected": -375.6957702636719, + "loss": 0.0069, + "rewards/chosen": 4.671995162963867, + "rewards/margins": 14.165326118469238, + "rewards/rejected": -9.493330955505371, + "step": 5733 + }, + { + "epoch": 0.5238921882137962, + "grad_norm": 75.5, + "kl": 0.0, + "learning_rate": 4.640142536804654e-06, + "logits/chosen": 833731840.0, + "logits/rejected": 566039552.0, + "logps/chosen": -467.08447265625, + "logps/rejected": -532.0982259114584, + "loss": 0.0738, + "rewards/chosen": 2.383746385574341, + "rewards/margins": 13.353073676427206, + "rewards/rejected": -10.969327290852865, + "step": 5734 + }, + { + "epoch": 0.5239835541343079, + "grad_norm": 6.59375, + "kl": 0.0, + "learning_rate": 4.638708480522832e-06, + "logits/chosen": 840334144.0, + "logits/rejected": 464400384.0, + "logps/chosen": -261.24835205078125, + "logps/rejected": -414.079345703125, + "loss": 0.1322, + "rewards/chosen": 1.719897985458374, + "rewards/margins": 12.232925176620483, + "rewards/rejected": -10.51302719116211, + "step": 5735 + }, + { + "epoch": 0.5240749200548196, + "grad_norm": 3.15625, + "kl": 0.0, + "learning_rate": 4.637274454116471e-06, + "logits/chosen": 610951680.0, + "logits/rejected": 565533184.0, + "logps/chosen": -361.52081298828125, + "logps/rejected": -558.6448364257812, + "loss": 0.0157, + "rewards/chosen": 3.8779964447021484, + "rewards/margins": 11.482875347137451, + "rewards/rejected": -7.604878902435303, + "step": 5736 + }, + { + "epoch": 0.5241662859753312, + "grad_norm": 1.3984375, + "kl": 0.0, + "learning_rate": 4.6358404577041515e-06, + "logits/chosen": 709096256.0, + "logits/rejected": 931484501.3333334, + "logps/chosen": -344.8104248046875, + "logps/rejected": -603.2928873697916, + "loss": 0.0069, + "rewards/chosen": 3.8226470947265625, + "rewards/margins": 12.520314534505209, + "rewards/rejected": -8.697667439778646, + "step": 5737 + }, + { + "epoch": 0.5242576518958428, + "grad_norm": 2.421875, + "kl": 0.0, + "learning_rate": 4.634406491404455e-06, + "logits/chosen": 611567718.4, + "logits/rejected": 366307754.6666667, + "logps/chosen": -447.75361328125, + "logps/rejected": -507.7607421875, + "loss": 0.0148, + "rewards/chosen": 4.127165222167969, + "rewards/margins": 15.421372477213541, + "rewards/rejected": -11.294207255045572, + "step": 5738 + }, + { + "epoch": 0.5243490178163545, + "grad_norm": 19.75, + "kl": 0.0, + "learning_rate": 4.63297255533595e-06, + "logits/chosen": 409010790.4, + "logits/rejected": 267956032.0, + "logps/chosen": -320.176123046875, + "logps/rejected": -410.8058268229167, + "loss": 0.1146, + "rewards/chosen": 2.5373226165771485, + "rewards/margins": 12.286389287312826, + "rewards/rejected": -9.749066670735678, + "step": 5739 + }, + { + "epoch": 0.5244403837368662, + "grad_norm": 1.4609375, + "kl": 0.0, + "learning_rate": 4.631538649617216e-06, + "logits/chosen": 590174566.4, + "logits/rejected": 721166421.3333334, + "logps/chosen": -308.00771484375, + "logps/rejected": -779.675048828125, + "loss": 0.0086, + "rewards/chosen": 4.681604385375977, + "rewards/margins": 15.148688634236654, + "rewards/rejected": -10.467084248860678, + "step": 5740 + }, + { + "epoch": 0.5245317496573778, + "grad_norm": 56.0, + "kl": 0.0, + "learning_rate": 4.630104774366823e-06, + "logits/chosen": 526194688.0, + "logits/rejected": 221961312.0, + "logps/chosen": -267.01381429036456, + "logps/rejected": -368.69769287109375, + "loss": 0.0875, + "rewards/chosen": 2.604832649230957, + "rewards/margins": 12.172703742980957, + "rewards/rejected": -9.56787109375, + "step": 5741 + }, + { + "epoch": 0.5246231155778894, + "grad_norm": 2.28125, + "kl": 0.0, + "learning_rate": 4.6286709297033365e-06, + "logits/chosen": 988626176.0, + "logits/rejected": 370269081.6, + "logps/chosen": -373.8096516927083, + "logps/rejected": -298.607763671875, + "loss": 0.0127, + "rewards/chosen": 3.7068729400634766, + "rewards/margins": 12.41279640197754, + "rewards/rejected": -8.705923461914063, + "step": 5742 + }, + { + "epoch": 0.524714481498401, + "grad_norm": 3.53125, + "kl": 0.0, + "learning_rate": 4.627237115745324e-06, + "logits/chosen": 576999253.3333334, + "logits/rejected": 672784640.0, + "logps/chosen": -378.7970784505208, + "logps/rejected": -761.7698364257812, + "loss": 0.0204, + "rewards/chosen": 4.003574053446452, + "rewards/margins": 12.597754160563152, + "rewards/rejected": -8.5941801071167, + "step": 5743 + }, + { + "epoch": 0.5248058474189128, + "grad_norm": 3.65625, + "kl": 0.0, + "learning_rate": 4.625803332611347e-06, + "logits/chosen": 400283200.0, + "logits/rejected": 381226720.0, + "logps/chosen": -235.2816162109375, + "logps/rejected": -518.3057861328125, + "loss": 0.0217, + "rewards/chosen": 3.3300328254699707, + "rewards/margins": 13.725791454315186, + "rewards/rejected": -10.395758628845215, + "step": 5744 + }, + { + "epoch": 0.5248972133394244, + "grad_norm": 36.25, + "kl": 0.0, + "learning_rate": 4.6243695804199694e-06, + "logits/chosen": 418159974.4, + "logits/rejected": 417864789.3333333, + "logps/chosen": -252.12978515625, + "logps/rejected": -503.7526448567708, + "loss": 0.0895, + "rewards/chosen": 3.276660919189453, + "rewards/margins": 13.46597671508789, + "rewards/rejected": -10.189315795898438, + "step": 5745 + }, + { + "epoch": 0.524988579259936, + "grad_norm": 3.171875, + "kl": 0.0, + "learning_rate": 4.622935859289745e-06, + "logits/chosen": 760281856.0, + "logits/rejected": 487966848.0, + "logps/chosen": -324.1676432291667, + "logps/rejected": -431.95355224609375, + "loss": 0.0264, + "rewards/chosen": 3.5712966918945312, + "rewards/margins": 11.962262153625488, + "rewards/rejected": -8.390965461730957, + "step": 5746 + }, + { + "epoch": 0.5250799451804476, + "grad_norm": 32.25, + "kl": 0.0, + "learning_rate": 4.621502169339234e-06, + "logits/chosen": 554171221.3333334, + "logits/rejected": 841051904.0, + "logps/chosen": -379.3357747395833, + "logps/rejected": -364.968017578125, + "loss": 0.1078, + "rewards/chosen": 2.5751736958821616, + "rewards/margins": 10.888037045796713, + "rewards/rejected": -8.31286334991455, + "step": 5747 + }, + { + "epoch": 0.5251713111009594, + "grad_norm": 1.3828125, + "kl": 0.0, + "learning_rate": 4.620068510686985e-06, + "logits/chosen": 354648512.0, + "logits/rejected": 409277781.3333333, + "logps/chosen": -329.56341552734375, + "logps/rejected": -324.4159342447917, + "loss": 0.0071, + "rewards/chosen": 4.635850429534912, + "rewards/margins": 13.637495835622152, + "rewards/rejected": -9.00164540608724, + "step": 5748 + }, + { + "epoch": 0.525262677021471, + "grad_norm": 5.71875, + "kl": 0.0, + "learning_rate": 4.618634883451551e-06, + "logits/chosen": 572857045.3333334, + "logits/rejected": 749928192.0, + "logps/chosen": -224.37255859375, + "logps/rejected": -583.4923095703125, + "loss": 0.0395, + "rewards/chosen": 3.0704243977864585, + "rewards/margins": 11.142254193623861, + "rewards/rejected": -8.071829795837402, + "step": 5749 + }, + { + "epoch": 0.5253540429419826, + "grad_norm": 10.1875, + "kl": 0.0, + "learning_rate": 4.617201287751478e-06, + "logits/chosen": 375070656.0, + "logits/rejected": 1176443648.0, + "logps/chosen": -184.93492126464844, + "logps/rejected": -1069.105224609375, + "loss": 0.1305, + "rewards/chosen": 2.5002217292785645, + "rewards/margins": 18.60222864151001, + "rewards/rejected": -16.102006912231445, + "step": 5750 + }, + { + "epoch": 0.5254454088624942, + "grad_norm": 2.84375, + "kl": 0.0, + "learning_rate": 4.615767723705312e-06, + "logits/chosen": 516260761.6, + "logits/rejected": 677846314.6666666, + "logps/chosen": -275.09443359375, + "logps/rejected": -736.0743815104166, + "loss": 0.0252, + "rewards/chosen": 3.5785064697265625, + "rewards/margins": 15.01617685953776, + "rewards/rejected": -11.437670389811197, + "step": 5751 + }, + { + "epoch": 0.525536774783006, + "grad_norm": 2.046875, + "kl": 0.0, + "learning_rate": 4.6143341914315955e-06, + "logits/chosen": 744841045.3333334, + "logits/rejected": 940871872.0, + "logps/chosen": -218.04561360677084, + "logps/rejected": -1239.3026123046875, + "loss": 0.0121, + "rewards/chosen": 4.489878018697103, + "rewards/margins": 15.099350293477375, + "rewards/rejected": -10.609472274780273, + "step": 5752 + }, + { + "epoch": 0.5256281407035176, + "grad_norm": 4.15625, + "kl": 0.0, + "learning_rate": 4.6129006910488674e-06, + "logits/chosen": 598410137.6, + "logits/rejected": 781615957.3333334, + "logps/chosen": -375.108447265625, + "logps/rejected": -351.5357259114583, + "loss": 0.0225, + "rewards/chosen": 3.7728126525878904, + "rewards/margins": 14.569555409749348, + "rewards/rejected": -10.796742757161459, + "step": 5753 + }, + { + "epoch": 0.5257195066240292, + "grad_norm": 54.75, + "kl": 0.0, + "learning_rate": 4.611467222675667e-06, + "logits/chosen": 463700800.0, + "logits/rejected": 470862752.0, + "logps/chosen": -302.9632568359375, + "logps/rejected": -666.421630859375, + "loss": 0.0742, + "rewards/chosen": 4.209494113922119, + "rewards/margins": 15.364662647247314, + "rewards/rejected": -11.155168533325195, + "step": 5754 + }, + { + "epoch": 0.5258108725445408, + "grad_norm": 3.609375, + "kl": 0.0, + "learning_rate": 4.610033786430526e-06, + "logits/chosen": 416920612.5714286, + "logits/rejected": 779560704.0, + "logps/chosen": -287.3009556361607, + "logps/rejected": -728.0869140625, + "loss": 0.033, + "rewards/chosen": 3.713987077985491, + "rewards/margins": 15.090733255658831, + "rewards/rejected": -11.37674617767334, + "step": 5755 + }, + { + "epoch": 0.5259022384650526, + "grad_norm": 3.015625, + "kl": 0.0, + "learning_rate": 4.6086003824319785e-06, + "logits/chosen": 706181785.6, + "logits/rejected": 672736938.6666666, + "logps/chosen": -387.2456787109375, + "logps/rejected": -362.9978841145833, + "loss": 0.018, + "rewards/chosen": 4.021646499633789, + "rewards/margins": 11.826428095499676, + "rewards/rejected": -7.804781595865886, + "step": 5756 + }, + { + "epoch": 0.5259936043855642, + "grad_norm": 0.373046875, + "kl": 0.0, + "learning_rate": 4.607167010798552e-06, + "logits/chosen": 775785856.0, + "logits/rejected": 747240228.5714285, + "logps/chosen": -337.28466796875, + "logps/rejected": -387.3052455357143, + "loss": 0.0015, + "rewards/chosen": 4.5526123046875, + "rewards/margins": 13.521235874720983, + "rewards/rejected": -8.968623570033483, + "step": 5757 + }, + { + "epoch": 0.5260849703060758, + "grad_norm": 14.6875, + "kl": 0.0, + "learning_rate": 4.605733671648775e-06, + "logits/chosen": 633754560.0, + "logits/rejected": 790745234.2857143, + "logps/chosen": -312.01806640625, + "logps/rejected": -423.69991629464283, + "loss": 0.0549, + "rewards/chosen": 2.539074659347534, + "rewards/margins": 11.972282579966954, + "rewards/rejected": -9.43320792061942, + "step": 5758 + }, + { + "epoch": 0.5261763362265874, + "grad_norm": 40.5, + "kl": 0.0, + "learning_rate": 4.60430036510117e-06, + "logits/chosen": 448187968.0, + "logits/rejected": 697092032.0, + "logps/chosen": -459.4967956542969, + "logps/rejected": -568.4638671875, + "loss": 0.0455, + "rewards/chosen": 3.2655746936798096, + "rewards/margins": 11.864789247512817, + "rewards/rejected": -8.599214553833008, + "step": 5759 + }, + { + "epoch": 0.5262677021470992, + "grad_norm": 2.65625, + "kl": 0.0, + "learning_rate": 4.602867091274259e-06, + "logits/chosen": 457485525.3333333, + "logits/rejected": 1375739008.0, + "logps/chosen": -299.51800537109375, + "logps/rejected": -437.4975280761719, + "loss": 0.0263, + "rewards/chosen": 3.691857655843099, + "rewards/margins": 14.372846921284994, + "rewards/rejected": -10.680989265441895, + "step": 5760 + }, + { + "epoch": 0.5263590680676108, + "grad_norm": 1.28125, + "kl": 0.0, + "learning_rate": 4.60143385028656e-06, + "logits/chosen": 410446299.4285714, + "logits/rejected": 636886400.0, + "logps/chosen": -246.49729701450892, + "logps/rejected": -539.9319458007812, + "loss": 0.0152, + "rewards/chosen": 4.123437336512974, + "rewards/margins": 12.010070732661656, + "rewards/rejected": -7.886633396148682, + "step": 5761 + }, + { + "epoch": 0.5264504339881224, + "grad_norm": 2.671875, + "kl": 0.0, + "learning_rate": 4.600000642256589e-06, + "logits/chosen": 566144384.0, + "logits/rejected": 609782528.0, + "logps/chosen": -306.09051513671875, + "logps/rejected": -506.246337890625, + "loss": 0.0129, + "rewards/chosen": 2.972093105316162, + "rewards/margins": 12.758011023203531, + "rewards/rejected": -9.78591791788737, + "step": 5762 + }, + { + "epoch": 0.526541799908634, + "grad_norm": 2.0625, + "kl": 0.0, + "learning_rate": 4.5985674673028585e-06, + "logits/chosen": 594077312.0, + "logits/rejected": 446979840.0, + "logps/chosen": -360.74847412109375, + "logps/rejected": -493.7772623697917, + "loss": 0.0071, + "rewards/chosen": 4.3144941329956055, + "rewards/margins": 14.65091355641683, + "rewards/rejected": -10.336419423421225, + "step": 5763 + }, + { + "epoch": 0.5266331658291458, + "grad_norm": 4.125, + "kl": 0.0, + "learning_rate": 4.5971343255438785e-06, + "logits/chosen": 635982464.0, + "logits/rejected": 1012811008.0, + "logps/chosen": -270.173095703125, + "logps/rejected": -617.6429443359375, + "loss": 0.024, + "rewards/chosen": 3.2423815727233887, + "rewards/margins": 13.199742794036865, + "rewards/rejected": -9.957361221313477, + "step": 5764 + }, + { + "epoch": 0.5267245317496574, + "grad_norm": 2.6875, + "kl": 0.0, + "learning_rate": 4.595701217098159e-06, + "logits/chosen": 523294944.0, + "logits/rejected": 608252288.0, + "logps/chosen": -196.4456787109375, + "logps/rejected": -609.123046875, + "loss": 0.0182, + "rewards/chosen": 3.911916732788086, + "rewards/margins": 13.219226837158203, + "rewards/rejected": -9.307310104370117, + "step": 5765 + }, + { + "epoch": 0.526815897670169, + "grad_norm": 4.53125, + "kl": 0.0, + "learning_rate": 4.594268142084202e-06, + "logits/chosen": 667618304.0, + "logits/rejected": 1121151488.0, + "logps/chosen": -237.239453125, + "logps/rejected": -639.8860677083334, + "loss": 0.0278, + "rewards/chosen": 3.9628528594970702, + "rewards/margins": 11.251242701212565, + "rewards/rejected": -7.288389841715495, + "step": 5766 + }, + { + "epoch": 0.5269072635906806, + "grad_norm": 4.3125, + "kl": 0.0, + "learning_rate": 4.592835100620509e-06, + "logits/chosen": 799451136.0, + "logits/rejected": 622680064.0, + "logps/chosen": -564.445556640625, + "logps/rejected": -715.75859375, + "loss": 0.0143, + "rewards/chosen": 4.224673271179199, + "rewards/margins": 13.259874534606933, + "rewards/rejected": -9.035201263427734, + "step": 5767 + }, + { + "epoch": 0.5269986295111924, + "grad_norm": 3.25, + "kl": 0.0, + "learning_rate": 4.591402092825583e-06, + "logits/chosen": 1065563264.0, + "logits/rejected": 763092608.0, + "logps/chosen": -434.7055358886719, + "logps/rejected": -540.8211059570312, + "loss": 0.0148, + "rewards/chosen": 3.7037599086761475, + "rewards/margins": 14.692538976669312, + "rewards/rejected": -10.988779067993164, + "step": 5768 + }, + { + "epoch": 0.527089995431704, + "grad_norm": 0.97265625, + "kl": 0.0, + "learning_rate": 4.5899691188179154e-06, + "logits/chosen": 481086016.0, + "logits/rejected": 261548800.0, + "logps/chosen": -254.6361541748047, + "logps/rejected": -457.7661437988281, + "loss": 0.0067, + "rewards/chosen": 4.807433128356934, + "rewards/margins": 13.170624732971191, + "rewards/rejected": -8.363191604614258, + "step": 5769 + }, + { + "epoch": 0.5271813613522156, + "grad_norm": 3.84375, + "kl": 0.0, + "learning_rate": 4.588536178716005e-06, + "logits/chosen": 531854720.0, + "logits/rejected": 755598080.0, + "logps/chosen": -543.3528442382812, + "logps/rejected": -486.8251139322917, + "loss": 0.0137, + "rewards/chosen": 3.098977565765381, + "rewards/margins": 12.27408234278361, + "rewards/rejected": -9.175104777018229, + "step": 5770 + }, + { + "epoch": 0.5272727272727272, + "grad_norm": 1.2578125, + "kl": 0.0, + "learning_rate": 4.587103272638339e-06, + "logits/chosen": 557007808.0, + "logits/rejected": 299556160.0, + "logps/chosen": -450.4989013671875, + "logps/rejected": -277.42962646484375, + "loss": 0.0087, + "rewards/chosen": 4.5227274894714355, + "rewards/margins": 12.028048515319824, + "rewards/rejected": -7.505321025848389, + "step": 5771 + }, + { + "epoch": 0.527364093193239, + "grad_norm": 5.5625, + "kl": 0.0, + "learning_rate": 4.585670400703408e-06, + "logits/chosen": 784946560.0, + "logits/rejected": 1033220224.0, + "logps/chosen": -321.913330078125, + "logps/rejected": -555.7587890625, + "loss": 0.0188, + "rewards/chosen": 3.5896058082580566, + "rewards/margins": 12.010791301727295, + "rewards/rejected": -8.421185493469238, + "step": 5772 + }, + { + "epoch": 0.5274554591137506, + "grad_norm": 4.0625, + "kl": 0.0, + "learning_rate": 4.584237563029695e-06, + "logits/chosen": 765777408.0, + "logits/rejected": 446735008.0, + "logps/chosen": -458.20770263671875, + "logps/rejected": -505.1011962890625, + "loss": 0.0192, + "rewards/chosen": 3.331648826599121, + "rewards/margins": 12.168370246887207, + "rewards/rejected": -8.836721420288086, + "step": 5773 + }, + { + "epoch": 0.5275468250342622, + "grad_norm": 15.875, + "kl": 0.0, + "learning_rate": 4.582804759735683e-06, + "logits/chosen": 526069376.0, + "logits/rejected": 482627584.0, + "logps/chosen": -319.48927815755206, + "logps/rejected": -647.4609375, + "loss": 0.025, + "rewards/chosen": 3.570772171020508, + "rewards/margins": 13.729634475708007, + "rewards/rejected": -10.1588623046875, + "step": 5774 + }, + { + "epoch": 0.5276381909547738, + "grad_norm": 2.6875, + "kl": 0.7921943664550781, + "learning_rate": 4.5813719909398535e-06, + "logits/chosen": 762095542.8571428, + "logits/rejected": 455381536.0, + "logps/chosen": -372.81361607142856, + "logps/rejected": -308.2616882324219, + "loss": 0.0218, + "rewards/chosen": 4.202636173793247, + "rewards/margins": 12.570402554103307, + "rewards/rejected": -8.367766380310059, + "step": 5775 + }, + { + "epoch": 0.5277295568752856, + "grad_norm": 4.53125, + "kl": 0.0, + "learning_rate": 4.5799392567606806e-06, + "logits/chosen": 445812736.0, + "logits/rejected": 580434901.3333334, + "logps/chosen": -305.577490234375, + "logps/rejected": -640.5135091145834, + "loss": 0.0393, + "rewards/chosen": 2.7711517333984377, + "rewards/margins": 12.36101392110189, + "rewards/rejected": -9.589862187703451, + "step": 5776 + }, + { + "epoch": 0.5278209227957972, + "grad_norm": 4.5, + "kl": 0.0, + "learning_rate": 4.57850655731664e-06, + "logits/chosen": 544017834.6666666, + "logits/rejected": 641817088.0, + "logps/chosen": -387.4789225260417, + "logps/rejected": -451.2025451660156, + "loss": 0.0254, + "rewards/chosen": 3.8559792836507163, + "rewards/margins": 13.242064793904623, + "rewards/rejected": -9.386085510253906, + "step": 5777 + }, + { + "epoch": 0.5279122887163088, + "grad_norm": 2.15625, + "kl": 0.0, + "learning_rate": 4.577073892726201e-06, + "logits/chosen": 336980326.4, + "logits/rejected": 384344746.6666667, + "logps/chosen": -226.652783203125, + "logps/rejected": -317.09474690755206, + "loss": 0.0161, + "rewards/chosen": 4.539949035644531, + "rewards/margins": 11.94588623046875, + "rewards/rejected": -7.405937194824219, + "step": 5778 + }, + { + "epoch": 0.5280036546368204, + "grad_norm": 0.50390625, + "kl": 0.0, + "learning_rate": 4.5756412631078325e-06, + "logits/chosen": 690216106.6666666, + "logits/rejected": 551066521.6, + "logps/chosen": -259.94683837890625, + "logps/rejected": -597.16494140625, + "loss": 0.0024, + "rewards/chosen": 5.693258921305339, + "rewards/margins": 13.90661646525065, + "rewards/rejected": -8.213357543945312, + "step": 5779 + }, + { + "epoch": 0.5280950205573322, + "grad_norm": 1.5078125, + "kl": 0.0, + "learning_rate": 4.574208668580001e-06, + "logits/chosen": 484905216.0, + "logits/rejected": 367795916.8, + "logps/chosen": -252.30354817708334, + "logps/rejected": -510.209716796875, + "loss": 0.0122, + "rewards/chosen": 3.8118158976236978, + "rewards/margins": 14.159929911295572, + "rewards/rejected": -10.348114013671875, + "step": 5780 + }, + { + "epoch": 0.5281863864778438, + "grad_norm": 2.484375, + "kl": 0.0, + "learning_rate": 4.572776109261166e-06, + "logits/chosen": 487135597.71428573, + "logits/rejected": 181925792.0, + "logps/chosen": -340.17529296875, + "logps/rejected": -249.82325744628906, + "loss": 0.0178, + "rewards/chosen": 4.113472529820034, + "rewards/margins": 12.056636878422328, + "rewards/rejected": -7.943164348602295, + "step": 5781 + }, + { + "epoch": 0.5282777523983554, + "grad_norm": 50.0, + "kl": 0.0, + "learning_rate": 4.571343585269789e-06, + "logits/chosen": 490461504.0, + "logits/rejected": 281768064.0, + "logps/chosen": -330.265380859375, + "logps/rejected": -457.7393798828125, + "loss": 0.0404, + "rewards/chosen": 3.5416371822357178, + "rewards/margins": 13.95676302909851, + "rewards/rejected": -10.415125846862793, + "step": 5782 + }, + { + "epoch": 0.528369118318867, + "grad_norm": 3.53125, + "kl": 0.0, + "learning_rate": 4.569911096724326e-06, + "logits/chosen": 888850944.0, + "logits/rejected": 521104201.14285713, + "logps/chosen": -677.098388671875, + "logps/rejected": -616.2785993303571, + "loss": 0.0105, + "rewards/chosen": 2.444256544113159, + "rewards/margins": 11.095371961593628, + "rewards/rejected": -8.651115417480469, + "step": 5783 + }, + { + "epoch": 0.5284604842393787, + "grad_norm": 3.671875, + "kl": 0.0, + "learning_rate": 4.568478643743232e-06, + "logits/chosen": 638604288.0, + "logits/rejected": 406040768.0, + "logps/chosen": -241.59647042410714, + "logps/rejected": -529.346435546875, + "loss": 0.0407, + "rewards/chosen": 3.3377173287527904, + "rewards/margins": 11.818399701799665, + "rewards/rejected": -8.480682373046875, + "step": 5784 + }, + { + "epoch": 0.5285518501598904, + "grad_norm": 2.765625, + "kl": 0.0, + "learning_rate": 4.5670462264449534e-06, + "logits/chosen": 832674508.8, + "logits/rejected": 638558165.3333334, + "logps/chosen": -372.75927734375, + "logps/rejected": -357.9512939453125, + "loss": 0.0106, + "rewards/chosen": 4.318289947509766, + "rewards/margins": 12.637294006347656, + "rewards/rejected": -8.31900405883789, + "step": 5785 + }, + { + "epoch": 0.528643216080402, + "grad_norm": 1.359375, + "kl": 0.0, + "learning_rate": 4.565613844947942e-06, + "logits/chosen": 994233429.3333334, + "logits/rejected": 503235379.2, + "logps/chosen": -414.1891276041667, + "logps/rejected": -356.3878173828125, + "loss": 0.0065, + "rewards/chosen": 4.649354298909505, + "rewards/margins": 12.94291508992513, + "rewards/rejected": -8.293560791015626, + "step": 5786 + }, + { + "epoch": 0.5287345820009136, + "grad_norm": 24.375, + "kl": 0.0, + "learning_rate": 4.56418149937064e-06, + "logits/chosen": 555126784.0, + "logits/rejected": 479037866.6666667, + "logps/chosen": -326.011328125, + "logps/rejected": -468.7295328776042, + "loss": 0.1161, + "rewards/chosen": 2.4759902954101562, + "rewards/margins": 11.829384485880533, + "rewards/rejected": -9.353394190470377, + "step": 5787 + }, + { + "epoch": 0.5288259479214253, + "grad_norm": 0.953125, + "kl": 0.0, + "learning_rate": 4.5627491898314916e-06, + "logits/chosen": 554068608.0, + "logits/rejected": 385269478.4, + "logps/chosen": -309.7388102213542, + "logps/rejected": -441.185009765625, + "loss": 0.0047, + "rewards/chosen": 4.678403854370117, + "rewards/margins": 13.909767532348633, + "rewards/rejected": -9.231363677978516, + "step": 5788 + }, + { + "epoch": 0.528917313841937, + "grad_norm": 3.265625, + "kl": 0.0, + "learning_rate": 4.561316916448932e-06, + "logits/chosen": 360309248.0, + "logits/rejected": 568980352.0, + "logps/chosen": -257.42999267578125, + "logps/rejected": -619.67529296875, + "loss": 0.0157, + "rewards/chosen": 4.500631332397461, + "rewards/margins": 13.475595474243164, + "rewards/rejected": -8.974964141845703, + "step": 5789 + }, + { + "epoch": 0.5290086797624486, + "grad_norm": 1.9609375, + "kl": 0.0, + "learning_rate": 4.5598846793414e-06, + "logits/chosen": 516638464.0, + "logits/rejected": 315374805.3333333, + "logps/chosen": -400.7820556640625, + "logps/rejected": -427.2691243489583, + "loss": 0.0137, + "rewards/chosen": 3.904340362548828, + "rewards/margins": 12.790308888753255, + "rewards/rejected": -8.885968526204428, + "step": 5790 + }, + { + "epoch": 0.5291000456829602, + "grad_norm": 30.125, + "kl": 0.0, + "learning_rate": 4.558452478627327e-06, + "logits/chosen": 570491596.8, + "logits/rejected": 457586602.6666667, + "logps/chosen": -309.934814453125, + "logps/rejected": -766.3846842447916, + "loss": 0.0377, + "rewards/chosen": 3.001066970825195, + "rewards/margins": 16.18442039489746, + "rewards/rejected": -13.183353424072266, + "step": 5791 + }, + { + "epoch": 0.5291914116034719, + "grad_norm": 0.55078125, + "kl": 0.0, + "learning_rate": 4.557020314425145e-06, + "logits/chosen": 294094624.0, + "logits/rejected": 464073130.6666667, + "logps/chosen": -242.03253173828125, + "logps/rejected": -509.7953694661458, + "loss": 0.0028, + "rewards/chosen": 5.342741966247559, + "rewards/margins": 14.494280179341635, + "rewards/rejected": -9.151538213094076, + "step": 5792 + }, + { + "epoch": 0.5292827775239836, + "grad_norm": 0.84375, + "kl": 0.0, + "learning_rate": 4.555588186853277e-06, + "logits/chosen": 841820416.0, + "logits/rejected": 597813906.2857143, + "logps/chosen": -764.4759521484375, + "logps/rejected": -532.9935128348214, + "loss": 0.0027, + "rewards/chosen": 3.963702440261841, + "rewards/margins": 12.753612892968315, + "rewards/rejected": -8.789910452706474, + "step": 5793 + }, + { + "epoch": 0.5293741434444952, + "grad_norm": 7.6875, + "kl": 0.0, + "learning_rate": 4.554156096030149e-06, + "logits/chosen": 481749418.6666667, + "logits/rejected": 415417216.0, + "logps/chosen": -356.4288736979167, + "logps/rejected": -287.2574157714844, + "loss": 0.0329, + "rewards/chosen": 3.9661763509114585, + "rewards/margins": 9.520002683003744, + "rewards/rejected": -5.553826332092285, + "step": 5794 + }, + { + "epoch": 0.5294655093650068, + "grad_norm": 10.0, + "kl": 0.0, + "learning_rate": 4.55272404207418e-06, + "logits/chosen": 326184576.0, + "logits/rejected": 458941235.2, + "logps/chosen": -191.01898193359375, + "logps/rejected": -345.785791015625, + "loss": 0.1166, + "rewards/chosen": 2.3707496325174966, + "rewards/margins": 10.244992033640543, + "rewards/rejected": -7.8742424011230465, + "step": 5795 + }, + { + "epoch": 0.5295568752855185, + "grad_norm": 3.625, + "kl": 0.0, + "learning_rate": 4.551292025103789e-06, + "logits/chosen": 686957098.6666666, + "logits/rejected": 659520320.0, + "logps/chosen": -223.4288330078125, + "logps/rejected": -221.47935485839844, + "loss": 0.0473, + "rewards/chosen": 3.471225102742513, + "rewards/margins": 10.784359773000082, + "rewards/rejected": -7.313134670257568, + "step": 5796 + }, + { + "epoch": 0.5296482412060302, + "grad_norm": 2.515625, + "kl": 0.0, + "learning_rate": 4.5498600452373895e-06, + "logits/chosen": 655605162.6666666, + "logits/rejected": 461499392.0, + "logps/chosen": -432.56982421875, + "logps/rejected": -418.07490234375, + "loss": 0.0128, + "rewards/chosen": 4.347743988037109, + "rewards/margins": 13.669780731201172, + "rewards/rejected": -9.322036743164062, + "step": 5797 + }, + { + "epoch": 0.5297396071265418, + "grad_norm": 2.28125, + "kl": 0.0, + "learning_rate": 4.548428102593395e-06, + "logits/chosen": 552111820.8, + "logits/rejected": 576838442.6666666, + "logps/chosen": -211.2718017578125, + "logps/rejected": -537.8882649739584, + "loss": 0.0154, + "rewards/chosen": 4.375420379638672, + "rewards/margins": 14.415303039550782, + "rewards/rejected": -10.03988265991211, + "step": 5798 + }, + { + "epoch": 0.5298309730470534, + "grad_norm": 3.765625, + "kl": 0.0, + "learning_rate": 4.546996197290211e-06, + "logits/chosen": 648954368.0, + "logits/rejected": 513924576.0, + "logps/chosen": -428.4222106933594, + "logps/rejected": -409.226318359375, + "loss": 0.0128, + "rewards/chosen": 3.6745147705078125, + "rewards/margins": 12.635703086853027, + "rewards/rejected": -8.961188316345215, + "step": 5799 + }, + { + "epoch": 0.5299223389675651, + "grad_norm": 15.375, + "kl": 0.0, + "learning_rate": 4.545564329446246e-06, + "logits/chosen": 413891456.0, + "logits/rejected": 782402816.0, + "logps/chosen": -309.5677490234375, + "logps/rejected": -511.33837890625, + "loss": 0.0312, + "rewards/chosen": 3.5392751693725586, + "rewards/margins": 11.238556861877441, + "rewards/rejected": -7.699281692504883, + "step": 5800 + }, + { + "epoch": 0.5300137048880768, + "grad_norm": 2.640625, + "kl": 0.0, + "learning_rate": 4.544132499179898e-06, + "logits/chosen": 482992960.0, + "logits/rejected": 231066736.0, + "logps/chosen": -385.94781494140625, + "logps/rejected": -319.1158447265625, + "loss": 0.0153, + "rewards/chosen": 4.138789176940918, + "rewards/margins": 12.777983665466309, + "rewards/rejected": -8.63919448852539, + "step": 5801 + }, + { + "epoch": 0.5301050708085884, + "grad_norm": 4.0625, + "kl": 0.0, + "learning_rate": 4.542700706609571e-06, + "logits/chosen": 681644288.0, + "logits/rejected": 380622944.0, + "logps/chosen": -294.0770670572917, + "logps/rejected": -373.70220947265625, + "loss": 0.0308, + "rewards/chosen": 3.351363182067871, + "rewards/margins": 12.710416793823242, + "rewards/rejected": -9.359053611755371, + "step": 5802 + }, + { + "epoch": 0.5301964367291, + "grad_norm": 3.140625, + "kl": 0.0, + "learning_rate": 4.5412689518536565e-06, + "logits/chosen": 390131328.0, + "logits/rejected": 647616128.0, + "logps/chosen": -326.6619873046875, + "logps/rejected": -689.4884643554688, + "loss": 0.0191, + "rewards/chosen": 3.669687271118164, + "rewards/margins": 14.419432640075684, + "rewards/rejected": -10.74974536895752, + "step": 5803 + }, + { + "epoch": 0.5302878026496117, + "grad_norm": 2.96875, + "kl": 0.0, + "learning_rate": 4.539837235030551e-06, + "logits/chosen": 730897834.6666666, + "logits/rejected": 503592448.0, + "logps/chosen": -377.0729573567708, + "logps/rejected": -753.9063720703125, + "loss": 0.0207, + "rewards/chosen": 3.9793831507364907, + "rewards/margins": 18.676887194315594, + "rewards/rejected": -14.697504043579102, + "step": 5804 + }, + { + "epoch": 0.5303791685701233, + "grad_norm": 11.0625, + "kl": 0.0, + "learning_rate": 4.538405556258643e-06, + "logits/chosen": 942354005.3333334, + "logits/rejected": 877371596.8, + "logps/chosen": -209.8453572591146, + "logps/rejected": -523.82705078125, + "loss": 0.0497, + "rewards/chosen": 3.6126839319864907, + "rewards/margins": 12.352086702982584, + "rewards/rejected": -8.739402770996094, + "step": 5805 + }, + { + "epoch": 0.530470534490635, + "grad_norm": 1.9453125, + "kl": 0.0, + "learning_rate": 4.536973915656318e-06, + "logits/chosen": 481752576.0, + "logits/rejected": 555782869.3333334, + "logps/chosen": -340.92547607421875, + "logps/rejected": -448.287841796875, + "loss": 0.0086, + "rewards/chosen": 4.843565464019775, + "rewards/margins": 13.70669666926066, + "rewards/rejected": -8.863131205240885, + "step": 5806 + }, + { + "epoch": 0.5305619004111466, + "grad_norm": 1.2265625, + "kl": 0.0, + "learning_rate": 4.5355423133419604e-06, + "logits/chosen": 1023864917.3333334, + "logits/rejected": 630105804.8, + "logps/chosen": -181.97296142578125, + "logps/rejected": -371.932568359375, + "loss": 0.0132, + "rewards/chosen": 3.4609511693318686, + "rewards/margins": 11.849558957417807, + "rewards/rejected": -8.388607788085938, + "step": 5807 + }, + { + "epoch": 0.5306532663316583, + "grad_norm": 2.6875, + "kl": 0.0, + "learning_rate": 4.53411074943395e-06, + "logits/chosen": 623637504.0, + "logits/rejected": 344643104.0, + "logps/chosen": -302.38262939453125, + "logps/rejected": -264.672607421875, + "loss": 0.0162, + "rewards/chosen": 4.053544044494629, + "rewards/margins": 12.327964782714844, + "rewards/rejected": -8.274420738220215, + "step": 5808 + }, + { + "epoch": 0.53074463225217, + "grad_norm": 3.5625, + "kl": 0.0, + "learning_rate": 4.532679224050664e-06, + "logits/chosen": 878150656.0, + "logits/rejected": 799058944.0, + "logps/chosen": -370.94061279296875, + "logps/rejected": -481.0059407552083, + "loss": 0.0114, + "rewards/chosen": 3.6409683227539062, + "rewards/margins": 11.203175862630207, + "rewards/rejected": -7.562207539876302, + "step": 5809 + }, + { + "epoch": 0.5308359981726816, + "grad_norm": 0.8203125, + "kl": 0.0, + "learning_rate": 4.5312477373104755e-06, + "logits/chosen": 429562432.0, + "logits/rejected": 748494336.0, + "logps/chosen": -419.0771789550781, + "logps/rejected": -671.5821126302084, + "loss": 0.0029, + "rewards/chosen": 4.5640153884887695, + "rewards/margins": 15.301623980204264, + "rewards/rejected": -10.737608591715494, + "step": 5810 + }, + { + "epoch": 0.5309273640931932, + "grad_norm": 3.78125, + "kl": 0.0, + "learning_rate": 4.529816289331758e-06, + "logits/chosen": 998179968.0, + "logits/rejected": 620401152.0, + "logps/chosen": -364.1602478027344, + "logps/rejected": -392.1574401855469, + "loss": 0.0237, + "rewards/chosen": 3.182677745819092, + "rewards/margins": 13.245822429656982, + "rewards/rejected": -10.06314468383789, + "step": 5811 + }, + { + "epoch": 0.5310187300137049, + "grad_norm": 3.28125, + "kl": 0.0, + "learning_rate": 4.528384880232875e-06, + "logits/chosen": 385263616.0, + "logits/rejected": 326450892.8, + "logps/chosen": -273.99399820963544, + "logps/rejected": -420.5439453125, + "loss": 0.0173, + "rewards/chosen": 4.3246720631917315, + "rewards/margins": 13.40189069112142, + "rewards/rejected": -9.077218627929687, + "step": 5812 + }, + { + "epoch": 0.5311100959342165, + "grad_norm": 15.5625, + "kl": 0.0, + "learning_rate": 4.526953510132194e-06, + "logits/chosen": 397161625.6, + "logits/rejected": 337025621.3333333, + "logps/chosen": -250.68564453125, + "logps/rejected": -328.8097330729167, + "loss": 0.032, + "rewards/chosen": 4.132256317138672, + "rewards/margins": 9.648617553710938, + "rewards/rejected": -5.516361236572266, + "step": 5813 + }, + { + "epoch": 0.5312014618547282, + "grad_norm": 2.796875, + "kl": 0.0, + "learning_rate": 4.525522179148074e-06, + "logits/chosen": 687447705.6, + "logits/rejected": 572359082.6666666, + "logps/chosen": -308.85947265625, + "logps/rejected": -497.175537109375, + "loss": 0.0165, + "rewards/chosen": 4.098865127563476, + "rewards/margins": 13.908071772257486, + "rewards/rejected": -9.80920664469401, + "step": 5814 + }, + { + "epoch": 0.5312928277752398, + "grad_norm": 4.53125, + "kl": 0.0, + "learning_rate": 4.524090887398875e-06, + "logits/chosen": 725287475.2, + "logits/rejected": 711034197.3333334, + "logps/chosen": -271.5314208984375, + "logps/rejected": -411.704833984375, + "loss": 0.0301, + "rewards/chosen": 3.491962432861328, + "rewards/margins": 12.106524785359701, + "rewards/rejected": -8.614562352498373, + "step": 5815 + }, + { + "epoch": 0.5313841936957515, + "grad_norm": 2.734375, + "kl": 0.0, + "learning_rate": 4.52265963500295e-06, + "logits/chosen": 686629376.0, + "logits/rejected": 1082779392.0, + "logps/chosen": -331.31768798828125, + "logps/rejected": -663.7059326171875, + "loss": 0.0209, + "rewards/chosen": 3.333118438720703, + "rewards/margins": 12.182429313659668, + "rewards/rejected": -8.849310874938965, + "step": 5816 + }, + { + "epoch": 0.5314755596162631, + "grad_norm": 3.515625, + "kl": 0.0, + "learning_rate": 4.5212284220786495e-06, + "logits/chosen": 666208597.3333334, + "logits/rejected": 358664768.0, + "logps/chosen": -394.2658284505208, + "logps/rejected": -453.7227478027344, + "loss": 0.0215, + "rewards/chosen": 4.047595024108887, + "rewards/margins": 15.278565406799316, + "rewards/rejected": -11.23097038269043, + "step": 5817 + }, + { + "epoch": 0.5315669255367748, + "grad_norm": 3.171875, + "kl": 0.0, + "learning_rate": 4.519797248744323e-06, + "logits/chosen": 428236800.0, + "logits/rejected": 617105493.3333334, + "logps/chosen": -338.40419921875, + "logps/rejected": -731.5897623697916, + "loss": 0.0217, + "rewards/chosen": 3.656339645385742, + "rewards/margins": 13.28904914855957, + "rewards/rejected": -9.632709503173828, + "step": 5818 + }, + { + "epoch": 0.5316582914572864, + "grad_norm": 4.6875, + "kl": 0.0, + "learning_rate": 4.518366115118317e-06, + "logits/chosen": 526802227.2, + "logits/rejected": 806219605.3333334, + "logps/chosen": -423.8380859375, + "logps/rejected": -605.5696614583334, + "loss": 0.0284, + "rewards/chosen": 3.2526271820068358, + "rewards/margins": 10.93738187154134, + "rewards/rejected": -7.684754689534505, + "step": 5819 + }, + { + "epoch": 0.5317496573777981, + "grad_norm": 2.234375, + "kl": 0.0, + "learning_rate": 4.516935021318968e-06, + "logits/chosen": 453022549.3333333, + "logits/rejected": 601183616.0, + "logps/chosen": -307.9248453776042, + "logps/rejected": -532.9273681640625, + "loss": 0.016, + "rewards/chosen": 4.220670700073242, + "rewards/margins": 14.774293899536133, + "rewards/rejected": -10.55362319946289, + "step": 5820 + }, + { + "epoch": 0.5318410232983097, + "grad_norm": 3.40625, + "kl": 0.0, + "learning_rate": 4.515503967464619e-06, + "logits/chosen": 539635840.0, + "logits/rejected": 435700352.0, + "logps/chosen": -363.4712219238281, + "logps/rejected": -521.9340209960938, + "loss": 0.0236, + "rewards/chosen": 3.022966146469116, + "rewards/margins": 14.093507528305054, + "rewards/rejected": -11.070541381835938, + "step": 5821 + }, + { + "epoch": 0.5319323892188214, + "grad_norm": 3.34375, + "kl": 0.0, + "learning_rate": 4.5140729536736006e-06, + "logits/chosen": 473752576.0, + "logits/rejected": 948602816.0, + "logps/chosen": -303.68658447265625, + "logps/rejected": -511.4822082519531, + "loss": 0.0201, + "rewards/chosen": 4.516391754150391, + "rewards/margins": 13.950685501098633, + "rewards/rejected": -9.434293746948242, + "step": 5822 + }, + { + "epoch": 0.532023755139333, + "grad_norm": 3.75, + "kl": 0.0, + "learning_rate": 4.512641980064248e-06, + "logits/chosen": 421712725.3333333, + "logits/rejected": 476177049.6, + "logps/chosen": -286.76503499348956, + "logps/rejected": -541.3076171875, + "loss": 0.0233, + "rewards/chosen": 2.7737998962402344, + "rewards/margins": 12.108409881591797, + "rewards/rejected": -9.334609985351562, + "step": 5823 + }, + { + "epoch": 0.5321151210598447, + "grad_norm": 2.03125, + "kl": 0.0, + "learning_rate": 4.511211046754887e-06, + "logits/chosen": 587368320.0, + "logits/rejected": 1155627520.0, + "logps/chosen": -285.84075927734375, + "logps/rejected": -449.890380859375, + "loss": 0.0119, + "rewards/chosen": 4.0773162841796875, + "rewards/margins": 12.338968276977539, + "rewards/rejected": -8.261651992797852, + "step": 5824 + }, + { + "epoch": 0.5322064869803563, + "grad_norm": 3.8125, + "kl": 0.0, + "learning_rate": 4.509780153863846e-06, + "logits/chosen": 403321632.0, + "logits/rejected": 262899280.0, + "logps/chosen": -314.04766845703125, + "logps/rejected": -470.5929870605469, + "loss": 0.0289, + "rewards/chosen": 2.8073570728302, + "rewards/margins": 11.907206773757935, + "rewards/rejected": -9.099849700927734, + "step": 5825 + }, + { + "epoch": 0.532297852900868, + "grad_norm": 4.5625, + "kl": 0.0, + "learning_rate": 4.508349301509443e-06, + "logits/chosen": 391957333.3333333, + "logits/rejected": 1122075340.8, + "logps/chosen": -434.0304361979167, + "logps/rejected": -262.2671875, + "loss": 0.017, + "rewards/chosen": 4.141197204589844, + "rewards/margins": 11.396803283691407, + "rewards/rejected": -7.255606079101563, + "step": 5826 + }, + { + "epoch": 0.5323892188213796, + "grad_norm": 2.25, + "kl": 0.0, + "learning_rate": 4.506918489809997e-06, + "logits/chosen": 630890496.0, + "logits/rejected": 450673888.0, + "logps/chosen": -377.95941162109375, + "logps/rejected": -515.4151611328125, + "loss": 0.0163, + "rewards/chosen": 3.928636074066162, + "rewards/margins": 13.656367778778076, + "rewards/rejected": -9.727731704711914, + "step": 5827 + }, + { + "epoch": 0.5324805847418913, + "grad_norm": 9.1875, + "kl": 0.0, + "learning_rate": 4.505487718883825e-06, + "logits/chosen": 244483993.6, + "logits/rejected": 206813610.66666666, + "logps/chosen": -223.114697265625, + "logps/rejected": -237.56547037760416, + "loss": 0.0429, + "rewards/chosen": 4.663809967041016, + "rewards/margins": 9.981757990519206, + "rewards/rejected": -5.31794802347819, + "step": 5828 + }, + { + "epoch": 0.5325719506624029, + "grad_norm": 2.140625, + "kl": 0.0, + "learning_rate": 4.5040569888492344e-06, + "logits/chosen": 522180403.2, + "logits/rejected": 552999253.3333334, + "logps/chosen": -391.18076171875, + "logps/rejected": -654.7483317057291, + "loss": 0.0145, + "rewards/chosen": 3.9202476501464845, + "rewards/margins": 13.610478337605795, + "rewards/rejected": -9.69023068745931, + "step": 5829 + }, + { + "epoch": 0.5326633165829145, + "grad_norm": 2.140625, + "kl": 0.0, + "learning_rate": 4.502626299824539e-06, + "logits/chosen": 396672998.4, + "logits/rejected": 263708373.33333334, + "logps/chosen": -400.828076171875, + "logps/rejected": -407.934814453125, + "loss": 0.0109, + "rewards/chosen": 4.689348983764648, + "rewards/margins": 12.046128718058268, + "rewards/rejected": -7.35677973429362, + "step": 5830 + }, + { + "epoch": 0.5327546825034262, + "grad_norm": 9.6875, + "kl": 0.0, + "learning_rate": 4.501195651928037e-06, + "logits/chosen": 585031936.0, + "logits/rejected": 898772160.0, + "logps/chosen": -322.7684631347656, + "logps/rejected": -575.1810913085938, + "loss": 0.0188, + "rewards/chosen": 4.826425552368164, + "rewards/margins": 14.186846733093262, + "rewards/rejected": -9.360421180725098, + "step": 5831 + }, + { + "epoch": 0.5328460484239379, + "grad_norm": 1.0078125, + "kl": 0.0, + "learning_rate": 4.499765045278035e-06, + "logits/chosen": 709220992.0, + "logits/rejected": 870729216.0, + "logps/chosen": -543.9271240234375, + "logps/rejected": -698.7386881510416, + "loss": 0.0032, + "rewards/chosen": 4.845361232757568, + "rewards/margins": 16.1174635887146, + "rewards/rejected": -11.272102355957031, + "step": 5832 + }, + { + "epoch": 0.5329374143444495, + "grad_norm": 2.125, + "kl": 0.0, + "learning_rate": 4.498334479992828e-06, + "logits/chosen": 327963456.0, + "logits/rejected": 833212876.8, + "logps/chosen": -168.95782470703125, + "logps/rejected": -886.63359375, + "loss": 0.0144, + "rewards/chosen": 4.013840675354004, + "rewards/margins": 13.634455299377441, + "rewards/rejected": -9.620614624023437, + "step": 5833 + }, + { + "epoch": 0.5330287802649611, + "grad_norm": 3.328125, + "kl": 0.0, + "learning_rate": 4.496903956190713e-06, + "logits/chosen": 238506026.66666666, + "logits/rejected": 428148352.0, + "logps/chosen": -157.90168253580728, + "logps/rejected": -446.39091796875, + "loss": 0.0287, + "rewards/chosen": 3.117417653401693, + "rewards/margins": 11.777100880940756, + "rewards/rejected": -8.659683227539062, + "step": 5834 + }, + { + "epoch": 0.5331201461854728, + "grad_norm": 4.53125, + "kl": 0.0, + "learning_rate": 4.495473473989978e-06, + "logits/chosen": 1359274752.0, + "logits/rejected": 612468838.4, + "logps/chosen": -284.5659586588542, + "logps/rejected": -536.6091796875, + "loss": 0.0293, + "rewards/chosen": 2.564209302266439, + "rewards/margins": 11.593123181660971, + "rewards/rejected": -9.028913879394532, + "step": 5835 + }, + { + "epoch": 0.5332115121059845, + "grad_norm": 2.515625, + "kl": 0.0, + "learning_rate": 4.494043033508913e-06, + "logits/chosen": 770955861.3333334, + "logits/rejected": 870440128.0, + "logps/chosen": -259.15602620442706, + "logps/rejected": -403.8170166015625, + "loss": 0.0232, + "rewards/chosen": 3.869587262471517, + "rewards/margins": 14.318508466084799, + "rewards/rejected": -10.448921203613281, + "step": 5836 + }, + { + "epoch": 0.5333028780264961, + "grad_norm": 5.875, + "kl": 0.0, + "learning_rate": 4.492612634865801e-06, + "logits/chosen": 578985216.0, + "logits/rejected": 780282752.0, + "logps/chosen": -440.5966491699219, + "logps/rejected": -486.9297790527344, + "loss": 0.0169, + "rewards/chosen": 3.5739262104034424, + "rewards/margins": 13.254531145095825, + "rewards/rejected": -9.680604934692383, + "step": 5837 + }, + { + "epoch": 0.5333942439470077, + "grad_norm": 0.85546875, + "kl": 0.0, + "learning_rate": 4.491182278178924e-06, + "logits/chosen": 442486208.0, + "logits/rejected": 821007506.2857143, + "logps/chosen": -343.0403747558594, + "logps/rejected": -433.7993861607143, + "loss": 0.003, + "rewards/chosen": 4.261114597320557, + "rewards/margins": 12.473199231284005, + "rewards/rejected": -8.212084633963448, + "step": 5838 + }, + { + "epoch": 0.5334856098675194, + "grad_norm": 3.671875, + "kl": 0.0, + "learning_rate": 4.489751963566557e-06, + "logits/chosen": 1167781683.2, + "logits/rejected": 892524629.3333334, + "logps/chosen": -144.1080078125, + "logps/rejected": -287.10272216796875, + "loss": 0.145, + "rewards/chosen": 2.3095623016357423, + "rewards/margins": 8.83675537109375, + "rewards/rejected": -6.527193069458008, + "step": 5839 + }, + { + "epoch": 0.5335769757880311, + "grad_norm": 2.6875, + "kl": 0.0, + "learning_rate": 4.488321691146975e-06, + "logits/chosen": 567955797.3333334, + "logits/rejected": 671508070.4, + "logps/chosen": -467.5391438802083, + "logps/rejected": -651.407568359375, + "loss": 0.011, + "rewards/chosen": 3.8382221857706704, + "rewards/margins": 14.085077349344889, + "rewards/rejected": -10.246855163574219, + "step": 5840 + }, + { + "epoch": 0.5336683417085427, + "grad_norm": 2.03125, + "kl": 0.0, + "learning_rate": 4.48689146103845e-06, + "logits/chosen": 459707989.3333333, + "logits/rejected": 709543065.6, + "logps/chosen": -301.55128987630206, + "logps/rejected": -476.8580078125, + "loss": 0.0086, + "rewards/chosen": 4.345553716023763, + "rewards/margins": 13.44057019551595, + "rewards/rejected": -9.095016479492188, + "step": 5841 + }, + { + "epoch": 0.5337597076290543, + "grad_norm": 1.609375, + "kl": 0.0, + "learning_rate": 4.4854612733592446e-06, + "logits/chosen": 875944448.0, + "logits/rejected": 566216294.4, + "logps/chosen": -282.07838948567706, + "logps/rejected": -550.2033203125, + "loss": 0.0075, + "rewards/chosen": 4.208865483601888, + "rewards/margins": 14.510332616170246, + "rewards/rejected": -10.301467132568359, + "step": 5842 + }, + { + "epoch": 0.533851073549566, + "grad_norm": 9.1875, + "kl": 0.0, + "learning_rate": 4.4840311282276266e-06, + "logits/chosen": 709139754.6666666, + "logits/rejected": 452840000.0, + "logps/chosen": -348.8013509114583, + "logps/rejected": -280.4391174316406, + "loss": 0.0562, + "rewards/chosen": 4.344306945800781, + "rewards/margins": 9.296865940093994, + "rewards/rejected": -4.952558994293213, + "step": 5843 + }, + { + "epoch": 0.5339424394700777, + "grad_norm": 3.78125, + "kl": 0.0, + "learning_rate": 4.482601025761853e-06, + "logits/chosen": 520617216.0, + "logits/rejected": 576245504.0, + "logps/chosen": -323.0054524739583, + "logps/rejected": -776.8570556640625, + "loss": 0.0241, + "rewards/chosen": 4.137977282206218, + "rewards/margins": 14.316360155741375, + "rewards/rejected": -10.178382873535156, + "step": 5844 + }, + { + "epoch": 0.5340338053905893, + "grad_norm": 1.0625, + "kl": 0.0, + "learning_rate": 4.481170966080183e-06, + "logits/chosen": 580280704.0, + "logits/rejected": 634399451.4285715, + "logps/chosen": -304.7265319824219, + "logps/rejected": -535.4208984375, + "loss": 0.004, + "rewards/chosen": 3.4343597888946533, + "rewards/margins": 13.535898310797554, + "rewards/rejected": -10.101538521902901, + "step": 5845 + }, + { + "epoch": 0.5341251713111009, + "grad_norm": 2.234375, + "kl": 0.0, + "learning_rate": 4.479740949300864e-06, + "logits/chosen": 668573184.0, + "logits/rejected": 408450764.8, + "logps/chosen": -452.9346516927083, + "logps/rejected": -549.775537109375, + "loss": 0.014, + "rewards/chosen": 3.390559514363607, + "rewards/margins": 12.93309415181478, + "rewards/rejected": -9.542534637451173, + "step": 5846 + }, + { + "epoch": 0.5342165372316126, + "grad_norm": 0.07470703125, + "kl": 0.0, + "learning_rate": 4.47831097554215e-06, + "logits/chosen": 200102816.0, + "logits/rejected": 472990427.4285714, + "logps/chosen": -115.21141815185547, + "logps/rejected": -555.4935825892857, + "loss": 0.0003, + "rewards/chosen": 6.252124786376953, + "rewards/margins": 17.35269546508789, + "rewards/rejected": -11.100570678710938, + "step": 5847 + }, + { + "epoch": 0.5343079031521243, + "grad_norm": 3.40625, + "kl": 0.0, + "learning_rate": 4.476881044922283e-06, + "logits/chosen": 441142169.6, + "logits/rejected": 554205226.6666666, + "logps/chosen": -325.475146484375, + "logps/rejected": -522.2566731770834, + "loss": 0.0296, + "rewards/chosen": 3.7771156311035154, + "rewards/margins": 13.92924830118815, + "rewards/rejected": -10.152132670084635, + "step": 5848 + }, + { + "epoch": 0.5343992690726359, + "grad_norm": 2.171875, + "kl": 0.0, + "learning_rate": 4.475451157559508e-06, + "logits/chosen": 526797129.14285713, + "logits/rejected": 305458368.0, + "logps/chosen": -317.5540248325893, + "logps/rejected": -423.943603515625, + "loss": 0.0196, + "rewards/chosen": 4.2998537336077005, + "rewards/margins": 13.717737606593541, + "rewards/rejected": -9.41788387298584, + "step": 5849 + }, + { + "epoch": 0.5344906349931475, + "grad_norm": 1.5078125, + "kl": 0.0, + "learning_rate": 4.474021313572061e-06, + "logits/chosen": 699172992.0, + "logits/rejected": 538873088.0, + "logps/chosen": -291.10858154296875, + "logps/rejected": -570.727294921875, + "loss": 0.0149, + "rewards/chosen": 3.6412711143493652, + "rewards/margins": 13.556566715240479, + "rewards/rejected": -9.915295600891113, + "step": 5850 + }, + { + "epoch": 0.5345820009136591, + "grad_norm": 3.828125, + "kl": 0.0, + "learning_rate": 4.47259151307818e-06, + "logits/chosen": 620588032.0, + "logits/rejected": 949843008.0, + "logps/chosen": -424.1356201171875, + "logps/rejected": -426.55413818359375, + "loss": 0.0166, + "rewards/chosen": 4.062395095825195, + "rewards/margins": 12.523290634155273, + "rewards/rejected": -8.460895538330078, + "step": 5851 + }, + { + "epoch": 0.5346733668341709, + "grad_norm": 2.421875, + "kl": 0.0, + "learning_rate": 4.471161756196093e-06, + "logits/chosen": 600550528.0, + "logits/rejected": 380652202.6666667, + "logps/chosen": -348.84222412109375, + "logps/rejected": -349.9621175130208, + "loss": 0.0126, + "rewards/chosen": 3.1576128005981445, + "rewards/margins": 11.176036516825357, + "rewards/rejected": -8.018423716227213, + "step": 5852 + }, + { + "epoch": 0.5347647327546825, + "grad_norm": 5.0, + "kl": 0.0, + "learning_rate": 4.469732043044029e-06, + "logits/chosen": 512960672.0, + "logits/rejected": 490180960.0, + "logps/chosen": -327.69036865234375, + "logps/rejected": -593.5836791992188, + "loss": 0.0246, + "rewards/chosen": 3.724701404571533, + "rewards/margins": 12.694251537322998, + "rewards/rejected": -8.969550132751465, + "step": 5853 + }, + { + "epoch": 0.5348560986751941, + "grad_norm": 4.0625, + "kl": 0.0, + "learning_rate": 4.468302373740211e-06, + "logits/chosen": 852845721.6, + "logits/rejected": 593415082.6666666, + "logps/chosen": -632.888623046875, + "logps/rejected": -359.6693522135417, + "loss": 0.0236, + "rewards/chosen": 3.3293636322021483, + "rewards/margins": 13.186261622111001, + "rewards/rejected": -9.856897989908854, + "step": 5854 + }, + { + "epoch": 0.5349474645957059, + "grad_norm": 2.6875, + "kl": 0.0, + "learning_rate": 4.466872748402863e-06, + "logits/chosen": 676999270.4, + "logits/rejected": 359220608.0, + "logps/chosen": -432.29267578125, + "logps/rejected": -338.7759602864583, + "loss": 0.0168, + "rewards/chosen": 4.014618301391602, + "rewards/margins": 13.21449317932129, + "rewards/rejected": -9.199874877929688, + "step": 5855 + }, + { + "epoch": 0.5350388305162175, + "grad_norm": 1.859375, + "kl": 0.0, + "learning_rate": 4.465443167150198e-06, + "logits/chosen": 582041472.0, + "logits/rejected": 308310528.0, + "logps/chosen": -430.82647705078125, + "logps/rejected": -404.99102783203125, + "loss": 0.0101, + "rewards/chosen": 4.1416778564453125, + "rewards/margins": 12.321778297424316, + "rewards/rejected": -8.180100440979004, + "step": 5856 + }, + { + "epoch": 0.5351301964367291, + "grad_norm": 6.09375, + "kl": 0.0, + "learning_rate": 4.46401363010043e-06, + "logits/chosen": 707613388.8, + "logits/rejected": 518018133.3333333, + "logps/chosen": -615.1677734375, + "logps/rejected": -426.9727376302083, + "loss": 0.0227, + "rewards/chosen": 3.9248313903808594, + "rewards/margins": 12.422046661376953, + "rewards/rejected": -8.497215270996094, + "step": 5857 + }, + { + "epoch": 0.5352215623572407, + "grad_norm": 3.34375, + "kl": 0.0, + "learning_rate": 4.4625841373717685e-06, + "logits/chosen": 635957056.0, + "logits/rejected": 469600160.0, + "logps/chosen": -469.24774169921875, + "logps/rejected": -520.643310546875, + "loss": 0.0177, + "rewards/chosen": 3.673314094543457, + "rewards/margins": 13.151778221130371, + "rewards/rejected": -9.478464126586914, + "step": 5858 + }, + { + "epoch": 0.5353129282777525, + "grad_norm": 2.25, + "kl": 0.0, + "learning_rate": 4.461154689082419e-06, + "logits/chosen": 350868992.0, + "logits/rejected": 532120544.0, + "logps/chosen": -313.59027099609375, + "logps/rejected": -862.8826904296875, + "loss": 0.018, + "rewards/chosen": 3.6125411987304688, + "rewards/margins": 15.544832229614258, + "rewards/rejected": -11.932291030883789, + "step": 5859 + }, + { + "epoch": 0.5354042941982641, + "grad_norm": 2.046875, + "kl": 0.0, + "learning_rate": 4.459725285350586e-06, + "logits/chosen": 389832746.6666667, + "logits/rejected": 548592025.6, + "logps/chosen": -224.34659830729166, + "logps/rejected": -449.842431640625, + "loss": 0.012, + "rewards/chosen": 3.852309544881185, + "rewards/margins": 13.83952267964681, + "rewards/rejected": -9.987213134765625, + "step": 5860 + }, + { + "epoch": 0.5354956601187757, + "grad_norm": 5.40625, + "kl": 0.0, + "learning_rate": 4.458295926294465e-06, + "logits/chosen": 681297715.2, + "logits/rejected": 228058112.0, + "logps/chosen": -214.0028076171875, + "logps/rejected": -348.5766194661458, + "loss": 0.0479, + "rewards/chosen": 2.6919153213500975, + "rewards/margins": 12.466525840759278, + "rewards/rejected": -9.77461051940918, + "step": 5861 + }, + { + "epoch": 0.5355870260392873, + "grad_norm": 1.296875, + "kl": 0.0, + "learning_rate": 4.4568666120322525e-06, + "logits/chosen": 535757824.0, + "logits/rejected": 393500800.0, + "logps/chosen": -346.6667785644531, + "logps/rejected": -302.036865234375, + "loss": 0.0101, + "rewards/chosen": 4.150386810302734, + "rewards/margins": 12.424867630004883, + "rewards/rejected": -8.274480819702148, + "step": 5862 + }, + { + "epoch": 0.535678391959799, + "grad_norm": 2.515625, + "kl": 0.0, + "learning_rate": 4.4554373426821375e-06, + "logits/chosen": 943198400.0, + "logits/rejected": 415102336.0, + "logps/chosen": -365.2348937988281, + "logps/rejected": -414.13671875, + "loss": 0.0125, + "rewards/chosen": 3.9314823150634766, + "rewards/margins": 15.469100952148438, + "rewards/rejected": -11.537618637084961, + "step": 5863 + }, + { + "epoch": 0.5357697578803107, + "grad_norm": 2.171875, + "kl": 0.0, + "learning_rate": 4.454008118362311e-06, + "logits/chosen": 431595818.6666667, + "logits/rejected": 628410777.6, + "logps/chosen": -233.77156575520834, + "logps/rejected": -709.212158203125, + "loss": 0.0163, + "rewards/chosen": 3.1323486963907876, + "rewards/margins": 14.025714937845866, + "rewards/rejected": -10.893366241455078, + "step": 5864 + }, + { + "epoch": 0.5358611238008223, + "grad_norm": 2.25, + "kl": 0.0, + "learning_rate": 4.452578939190952e-06, + "logits/chosen": 442253824.0, + "logits/rejected": 252670080.0, + "logps/chosen": -252.61376953125, + "logps/rejected": -574.1351318359375, + "loss": 0.0164, + "rewards/chosen": 3.8720413208007813, + "rewards/margins": 17.15495071411133, + "rewards/rejected": -13.282909393310547, + "step": 5865 + }, + { + "epoch": 0.5359524897213339, + "grad_norm": 4.375, + "kl": 0.0, + "learning_rate": 4.451149805286244e-06, + "logits/chosen": 1463932074.6666667, + "logits/rejected": 710360576.0, + "logps/chosen": -246.75545247395834, + "logps/rejected": -544.16103515625, + "loss": 0.0177, + "rewards/chosen": 3.5166877110799155, + "rewards/margins": 14.271753629048666, + "rewards/rejected": -10.75506591796875, + "step": 5866 + }, + { + "epoch": 0.5360438556418456, + "grad_norm": 1.3203125, + "kl": 0.0, + "learning_rate": 4.449720716766362e-06, + "logits/chosen": 705226922.6666666, + "logits/rejected": 548135628.8, + "logps/chosen": -492.6319173177083, + "logps/rejected": -371.8564453125, + "loss": 0.0086, + "rewards/chosen": 4.108634312947591, + "rewards/margins": 12.553559239705404, + "rewards/rejected": -8.444924926757812, + "step": 5867 + }, + { + "epoch": 0.5361352215623573, + "grad_norm": 4.59375, + "kl": 0.0, + "learning_rate": 4.4482916737494785e-06, + "logits/chosen": 372154560.0, + "logits/rejected": 234353344.0, + "logps/chosen": -296.94000244140625, + "logps/rejected": -267.1956787109375, + "loss": 0.0305, + "rewards/chosen": 2.797650098800659, + "rewards/margins": 10.430325269699097, + "rewards/rejected": -7.6326751708984375, + "step": 5868 + }, + { + "epoch": 0.5362265874828689, + "grad_norm": 2.796875, + "kl": 0.0, + "learning_rate": 4.446862676353762e-06, + "logits/chosen": 616319436.8, + "logits/rejected": 514292053.3333333, + "logps/chosen": -473.1962890625, + "logps/rejected": -498.4734700520833, + "loss": 0.0267, + "rewards/chosen": 3.968449020385742, + "rewards/margins": 12.313414637247721, + "rewards/rejected": -8.344965616861979, + "step": 5869 + }, + { + "epoch": 0.5363179534033805, + "grad_norm": 2.40625, + "kl": 0.0, + "learning_rate": 4.445433724697376e-06, + "logits/chosen": 768993962.6666666, + "logits/rejected": 277712768.0, + "logps/chosen": -409.3983561197917, + "logps/rejected": -367.119384765625, + "loss": 0.0175, + "rewards/chosen": 4.164649327596028, + "rewards/margins": 13.399356206258137, + "rewards/rejected": -9.23470687866211, + "step": 5870 + }, + { + "epoch": 0.5364093193238922, + "grad_norm": 3.921875, + "kl": 0.0, + "learning_rate": 4.444004818898484e-06, + "logits/chosen": 703897753.6, + "logits/rejected": 411526314.6666667, + "logps/chosen": -498.5041015625, + "logps/rejected": -170.16744995117188, + "loss": 0.0219, + "rewards/chosen": 3.8611656188964845, + "rewards/margins": 10.162587738037109, + "rewards/rejected": -6.301422119140625, + "step": 5871 + }, + { + "epoch": 0.5365006852444039, + "grad_norm": 2.328125, + "kl": 0.0, + "learning_rate": 4.442575959075244e-06, + "logits/chosen": 804250240.0, + "logits/rejected": 584792960.0, + "logps/chosen": -619.509521484375, + "logps/rejected": -512.1612548828125, + "loss": 0.0118, + "rewards/chosen": 3.972566604614258, + "rewards/margins": 12.510194778442383, + "rewards/rejected": -8.537628173828125, + "step": 5872 + }, + { + "epoch": 0.5365920511649155, + "grad_norm": 2.984375, + "kl": 0.0, + "learning_rate": 4.4411471453458045e-06, + "logits/chosen": 476882261.3333333, + "logits/rejected": 592400384.0, + "logps/chosen": -340.50746663411456, + "logps/rejected": -603.569873046875, + "loss": 0.0165, + "rewards/chosen": 3.4225692749023438, + "rewards/margins": 13.196870422363281, + "rewards/rejected": -9.774301147460937, + "step": 5873 + }, + { + "epoch": 0.5366834170854271, + "grad_norm": 2.890625, + "kl": 0.0, + "learning_rate": 4.4397183778283195e-06, + "logits/chosen": 524832358.4, + "logits/rejected": 446361770.6666667, + "logps/chosen": -315.30576171875, + "logps/rejected": -499.0769856770833, + "loss": 0.1322, + "rewards/chosen": 2.5223670959472657, + "rewards/margins": 12.599279022216797, + "rewards/rejected": -10.076911926269531, + "step": 5874 + }, + { + "epoch": 0.5367747830059388, + "grad_norm": 14.75, + "kl": 0.0, + "learning_rate": 4.438289656640931e-06, + "logits/chosen": 843300736.0, + "logits/rejected": 327669312.0, + "logps/chosen": -291.2774353027344, + "logps/rejected": -471.87725830078125, + "loss": 0.0367, + "rewards/chosen": 4.119389533996582, + "rewards/margins": 10.78322982788086, + "rewards/rejected": -6.663840293884277, + "step": 5875 + }, + { + "epoch": 0.5368661489264505, + "grad_norm": 3.15625, + "kl": 0.0, + "learning_rate": 4.4368609819017865e-06, + "logits/chosen": 854218496.0, + "logits/rejected": 428861798.4, + "logps/chosen": -221.2818603515625, + "logps/rejected": -435.741845703125, + "loss": 0.0172, + "rewards/chosen": 3.588949203491211, + "rewards/margins": 13.575832748413086, + "rewards/rejected": -9.986883544921875, + "step": 5876 + }, + { + "epoch": 0.5369575148469621, + "grad_norm": 2.59375, + "kl": 0.0, + "learning_rate": 4.435432353729021e-06, + "logits/chosen": 282377932.8, + "logits/rejected": 225925397.33333334, + "logps/chosen": -167.7990966796875, + "logps/rejected": -332.68841552734375, + "loss": 0.0167, + "rewards/chosen": 4.376992797851562, + "rewards/margins": 12.75249760945638, + "rewards/rejected": -8.375504811604818, + "step": 5877 + }, + { + "epoch": 0.5370488807674737, + "grad_norm": 2.78125, + "kl": 0.0, + "learning_rate": 4.434003772240767e-06, + "logits/chosen": 497942937.6, + "logits/rejected": 701815338.6666666, + "logps/chosen": -397.94384765625, + "logps/rejected": -682.7532552083334, + "loss": 0.0143, + "rewards/chosen": 4.2438232421875, + "rewards/margins": 18.05388946533203, + "rewards/rejected": -13.810066223144531, + "step": 5878 + }, + { + "epoch": 0.5371402466879854, + "grad_norm": 1.921875, + "kl": 0.0, + "learning_rate": 4.432575237555159e-06, + "logits/chosen": 224387968.0, + "logits/rejected": 421944618.6666667, + "logps/chosen": -484.1418762207031, + "logps/rejected": -454.8921712239583, + "loss": 0.0094, + "rewards/chosen": 3.306680202484131, + "rewards/margins": 11.095450242360432, + "rewards/rejected": -7.788770039876302, + "step": 5879 + }, + { + "epoch": 0.5372316126084971, + "grad_norm": 2.140625, + "kl": 0.0, + "learning_rate": 4.43114674979032e-06, + "logits/chosen": 269029632.0, + "logits/rejected": 267041536.0, + "logps/chosen": -267.48553466796875, + "logps/rejected": -307.50897216796875, + "loss": 0.013, + "rewards/chosen": 3.9536943435668945, + "rewards/margins": 11.592707633972168, + "rewards/rejected": -7.639013290405273, + "step": 5880 + }, + { + "epoch": 0.5373229785290087, + "grad_norm": 7.125, + "kl": 0.0, + "learning_rate": 4.429718309064375e-06, + "logits/chosen": 1073565696.0, + "logits/rejected": 730808320.0, + "logps/chosen": -267.15667724609375, + "logps/rejected": -698.2951049804688, + "loss": 0.0304, + "rewards/chosen": 3.217271089553833, + "rewards/margins": 11.577862024307251, + "rewards/rejected": -8.360590934753418, + "step": 5881 + }, + { + "epoch": 0.5374143444495203, + "grad_norm": 2.875, + "kl": 0.0, + "learning_rate": 4.428289915495442e-06, + "logits/chosen": 1003793715.2, + "logits/rejected": 1395895893.3333333, + "logps/chosen": -378.217529296875, + "logps/rejected": -488.952880859375, + "loss": 0.0162, + "rewards/chosen": 3.940806579589844, + "rewards/margins": 13.565010579427085, + "rewards/rejected": -9.62420399983724, + "step": 5882 + }, + { + "epoch": 0.537505710370032, + "grad_norm": 1.8203125, + "kl": 0.0, + "learning_rate": 4.426861569201637e-06, + "logits/chosen": 844562688.0, + "logits/rejected": 435879628.8, + "logps/chosen": -269.8857828776042, + "logps/rejected": -344.2864013671875, + "loss": 0.0076, + "rewards/chosen": 4.3557281494140625, + "rewards/margins": 11.540754699707032, + "rewards/rejected": -7.185026550292969, + "step": 5883 + }, + { + "epoch": 0.5375970762905437, + "grad_norm": 3.453125, + "kl": 0.0, + "learning_rate": 4.4254332703010675e-06, + "logits/chosen": 962272665.6, + "logits/rejected": 568262656.0, + "logps/chosen": -393.6334228515625, + "logps/rejected": -549.5383707682291, + "loss": 0.0199, + "rewards/chosen": 3.805345916748047, + "rewards/margins": 12.405910873413086, + "rewards/rejected": -8.600564956665039, + "step": 5884 + }, + { + "epoch": 0.5376884422110553, + "grad_norm": 51.75, + "kl": 0.0, + "learning_rate": 4.424005018911845e-06, + "logits/chosen": 247290922.66666666, + "logits/rejected": 302738534.4, + "logps/chosen": -91.53843180338542, + "logps/rejected": -413.699755859375, + "loss": 0.0426, + "rewards/chosen": 4.5524476369222, + "rewards/margins": 12.800342686971028, + "rewards/rejected": -8.247895050048829, + "step": 5885 + }, + { + "epoch": 0.5377798081315669, + "grad_norm": 14.3125, + "kl": 0.0, + "learning_rate": 4.42257681515207e-06, + "logits/chosen": 732134502.4, + "logits/rejected": 564050048.0, + "logps/chosen": -331.565380859375, + "logps/rejected": -526.6529134114584, + "loss": 0.0131, + "rewards/chosen": 4.467845916748047, + "rewards/margins": 14.100911458333334, + "rewards/rejected": -9.633065541585287, + "step": 5886 + }, + { + "epoch": 0.5378711740520786, + "grad_norm": 0.62109375, + "kl": 0.0, + "learning_rate": 4.421148659139843e-06, + "logits/chosen": 539751168.0, + "logits/rejected": 607207893.3333334, + "logps/chosen": -366.2157897949219, + "logps/rejected": -587.19873046875, + "loss": 0.0022, + "rewards/chosen": 4.9156389236450195, + "rewards/margins": 16.38346449534098, + "rewards/rejected": -11.467825571695963, + "step": 5887 + }, + { + "epoch": 0.5379625399725902, + "grad_norm": 3.671875, + "kl": 0.0, + "learning_rate": 4.419720550993259e-06, + "logits/chosen": 665125171.2, + "logits/rejected": 642935168.0, + "logps/chosen": -315.436767578125, + "logps/rejected": -576.8456217447916, + "loss": 0.0216, + "rewards/chosen": 3.8544845581054688, + "rewards/margins": 12.879049301147461, + "rewards/rejected": -9.024564743041992, + "step": 5888 + }, + { + "epoch": 0.5380539058931019, + "grad_norm": 2.59375, + "kl": 0.0, + "learning_rate": 4.4182924908304076e-06, + "logits/chosen": 454144597.3333333, + "logits/rejected": 826810944.0, + "logps/chosen": -425.4549153645833, + "logps/rejected": -524.7293090820312, + "loss": 0.016, + "rewards/chosen": 4.104635556538899, + "rewards/margins": 12.559789975484211, + "rewards/rejected": -8.455154418945312, + "step": 5889 + }, + { + "epoch": 0.5381452718136135, + "grad_norm": 3.265625, + "kl": 0.0, + "learning_rate": 4.416864478769379e-06, + "logits/chosen": 523391616.0, + "logits/rejected": 928896576.0, + "logps/chosen": -362.8500671386719, + "logps/rejected": -585.72705078125, + "loss": 0.0192, + "rewards/chosen": 3.5179390907287598, + "rewards/margins": 14.018892765045166, + "rewards/rejected": -10.500953674316406, + "step": 5890 + }, + { + "epoch": 0.5382366377341252, + "grad_norm": 37.75, + "kl": 0.0, + "learning_rate": 4.4154365149282535e-06, + "logits/chosen": 523119808.0, + "logits/rejected": 775398528.0, + "logps/chosen": -324.058837890625, + "logps/rejected": -525.0751342773438, + "loss": 0.0577, + "rewards/chosen": 3.106128692626953, + "rewards/margins": 10.057558536529541, + "rewards/rejected": -6.951429843902588, + "step": 5891 + }, + { + "epoch": 0.5383280036546368, + "grad_norm": 3.21875, + "kl": 0.0, + "learning_rate": 4.414008599425114e-06, + "logits/chosen": 856661162.6666666, + "logits/rejected": 449294080.0, + "logps/chosen": -289.35284423828125, + "logps/rejected": -242.33541870117188, + "loss": 0.0196, + "rewards/chosen": 4.196500778198242, + "rewards/margins": 11.751238822937012, + "rewards/rejected": -7.5547380447387695, + "step": 5892 + }, + { + "epoch": 0.5384193695751485, + "grad_norm": 1.3046875, + "kl": 0.0, + "learning_rate": 4.412580732378032e-06, + "logits/chosen": 541776810.6666666, + "logits/rejected": 880063488.0, + "logps/chosen": -407.4938151041667, + "logps/rejected": -441.5141296386719, + "loss": 0.0086, + "rewards/chosen": 4.576099713643392, + "rewards/margins": 13.840306599934895, + "rewards/rejected": -9.264206886291504, + "step": 5893 + }, + { + "epoch": 0.5385107354956601, + "grad_norm": 7.59375, + "kl": 0.0, + "learning_rate": 4.411152913905083e-06, + "logits/chosen": 443492388.5714286, + "logits/rejected": 552297728.0, + "logps/chosen": -329.2588588169643, + "logps/rejected": -434.2867431640625, + "loss": 0.0412, + "rewards/chosen": 3.9823436737060547, + "rewards/margins": 13.435791969299316, + "rewards/rejected": -9.453448295593262, + "step": 5894 + }, + { + "epoch": 0.5386021014161718, + "grad_norm": 4.5625, + "kl": 0.0, + "learning_rate": 4.40972514412433e-06, + "logits/chosen": 640398080.0, + "logits/rejected": 581667328.0, + "logps/chosen": -357.259765625, + "logps/rejected": -431.48687744140625, + "loss": 0.022, + "rewards/chosen": 3.6576833724975586, + "rewards/margins": 12.949563026428223, + "rewards/rejected": -9.291879653930664, + "step": 5895 + }, + { + "epoch": 0.5386934673366834, + "grad_norm": 2.859375, + "kl": 0.0, + "learning_rate": 4.408297423153841e-06, + "logits/chosen": 658935893.3333334, + "logits/rejected": 453419724.8, + "logps/chosen": -413.2379964192708, + "logps/rejected": -267.794775390625, + "loss": 0.0146, + "rewards/chosen": 3.572983423868815, + "rewards/margins": 11.040976587931315, + "rewards/rejected": -7.4679931640625, + "step": 5896 + }, + { + "epoch": 0.5387848332571951, + "grad_norm": 1.3359375, + "kl": 0.0, + "learning_rate": 4.406869751111671e-06, + "logits/chosen": 231421056.0, + "logits/rejected": 425714761.14285713, + "logps/chosen": -128.96534729003906, + "logps/rejected": -357.2598353794643, + "loss": 0.0092, + "rewards/chosen": 2.5708389282226562, + "rewards/margins": 11.269300733293806, + "rewards/rejected": -8.69846180507115, + "step": 5897 + }, + { + "epoch": 0.5388761991777067, + "grad_norm": 1.125, + "kl": 0.0, + "learning_rate": 4.405442128115878e-06, + "logits/chosen": 1135138816.0, + "logits/rejected": 394714880.0, + "logps/chosen": -385.3634948730469, + "logps/rejected": -361.9751790364583, + "loss": 0.0055, + "rewards/chosen": 3.907473087310791, + "rewards/margins": 11.693089644114178, + "rewards/rejected": -7.785616556803386, + "step": 5898 + }, + { + "epoch": 0.5389675650982184, + "grad_norm": 9.9375, + "kl": 0.0, + "learning_rate": 4.404014554284514e-06, + "logits/chosen": 534967040.0, + "logits/rejected": 720641792.0, + "logps/chosen": -275.363037109375, + "logps/rejected": -579.1756591796875, + "loss": 0.0698, + "rewards/chosen": 3.3179473876953125, + "rewards/margins": 12.093360900878906, + "rewards/rejected": -8.775413513183594, + "step": 5899 + }, + { + "epoch": 0.53905893101873, + "grad_norm": 2.171875, + "kl": 0.0, + "learning_rate": 4.402587029735623e-06, + "logits/chosen": 621665587.2, + "logits/rejected": 455188394.6666667, + "logps/chosen": -286.8263671875, + "logps/rejected": -461.1259358723958, + "loss": 0.0172, + "rewards/chosen": 3.6974441528320314, + "rewards/margins": 14.179616800944011, + "rewards/rejected": -10.482172648111979, + "step": 5900 + }, + { + "epoch": 0.5391502969392417, + "grad_norm": 1.671875, + "kl": 0.0, + "learning_rate": 4.401159554587248e-06, + "logits/chosen": 1272476288.0, + "logits/rejected": 694178880.0, + "logps/chosen": -237.31204223632812, + "logps/rejected": -409.79156494140625, + "loss": 0.0097, + "rewards/chosen": 4.30180025100708, + "rewards/margins": 12.302643299102783, + "rewards/rejected": -8.000843048095703, + "step": 5901 + }, + { + "epoch": 0.5392416628597533, + "grad_norm": 50.0, + "kl": 0.0, + "learning_rate": 4.399732128957431e-06, + "logits/chosen": 516776544.0, + "logits/rejected": 587768234.6666666, + "logps/chosen": -308.335205078125, + "logps/rejected": -306.3926595052083, + "loss": 0.0774, + "rewards/chosen": 2.9998674392700195, + "rewards/margins": 9.531542142232258, + "rewards/rejected": -6.531674702962239, + "step": 5902 + }, + { + "epoch": 0.539333028780265, + "grad_norm": 2.28125, + "kl": 0.0, + "learning_rate": 4.398304752964204e-06, + "logits/chosen": 630591018.6666666, + "logits/rejected": 892301824.0, + "logps/chosen": -349.4759928385417, + "logps/rejected": -449.74371337890625, + "loss": 0.0191, + "rewards/chosen": 4.138291041056315, + "rewards/margins": 11.957878748575848, + "rewards/rejected": -7.819587707519531, + "step": 5903 + }, + { + "epoch": 0.5394243947007766, + "grad_norm": 3.296875, + "kl": 0.0, + "learning_rate": 4.3968774267256e-06, + "logits/chosen": 632971712.0, + "logits/rejected": 621680566.8571428, + "logps/chosen": -739.3394775390625, + "logps/rejected": -433.34092494419644, + "loss": 0.0081, + "rewards/chosen": 2.7254638671875, + "rewards/margins": 11.314971923828125, + "rewards/rejected": -8.589508056640625, + "step": 5904 + }, + { + "epoch": 0.5395157606212883, + "grad_norm": 3.859375, + "kl": 0.0, + "learning_rate": 4.395450150359645e-06, + "logits/chosen": 383199296.0, + "logits/rejected": 349217060.5714286, + "logps/chosen": -163.2249298095703, + "logps/rejected": -477.5472935267857, + "loss": 0.0084, + "rewards/chosen": 2.64111328125, + "rewards/margins": 13.253823416573661, + "rewards/rejected": -10.612710135323661, + "step": 5905 + }, + { + "epoch": 0.5396071265417999, + "grad_norm": 44.25, + "kl": 0.0, + "learning_rate": 4.394022923984361e-06, + "logits/chosen": 657577856.0, + "logits/rejected": 576876992.0, + "logps/chosen": -346.08746337890625, + "logps/rejected": -607.6390380859375, + "loss": 0.0906, + "rewards/chosen": 2.596775770187378, + "rewards/margins": 11.764826536178589, + "rewards/rejected": -9.168050765991211, + "step": 5906 + }, + { + "epoch": 0.5396984924623116, + "grad_norm": 40.0, + "kl": 0.0, + "learning_rate": 4.392595747717766e-06, + "logits/chosen": 758631168.0, + "logits/rejected": 1356990890.6666667, + "logps/chosen": -355.0960205078125, + "logps/rejected": -541.8902180989584, + "loss": 0.0406, + "rewards/chosen": 3.138187599182129, + "rewards/margins": 11.791624895731609, + "rewards/rejected": -8.653437296549479, + "step": 5907 + }, + { + "epoch": 0.5397898583828232, + "grad_norm": 5.0625, + "kl": 0.0, + "learning_rate": 4.391168621677876e-06, + "logits/chosen": 826517504.0, + "logits/rejected": 424148787.2, + "logps/chosen": -249.63525390625, + "logps/rejected": -696.104296875, + "loss": 0.1508, + "rewards/chosen": -0.04449192682902018, + "rewards/margins": 11.260351975758871, + "rewards/rejected": -11.30484390258789, + "step": 5908 + }, + { + "epoch": 0.5398812243033349, + "grad_norm": 0.083984375, + "kl": 0.0, + "learning_rate": 4.3897415459827e-06, + "logits/rejected": 636133632.0, + "logps/rejected": -505.90850830078125, + "loss": 0.0003, + "rewards/rejected": -9.42138671875, + "step": 5909 + }, + { + "epoch": 0.5399725902238465, + "grad_norm": 3.15625, + "kl": 0.0, + "learning_rate": 4.3883145207502435e-06, + "logits/chosen": 476571093.3333333, + "logits/rejected": 473849651.2, + "logps/chosen": -221.76241048177084, + "logps/rejected": -486.58671875, + "loss": 0.0292, + "rewards/chosen": 2.552089055379232, + "rewards/margins": 13.79457105000814, + "rewards/rejected": -11.242481994628907, + "step": 5910 + }, + { + "epoch": 0.5400639561443582, + "grad_norm": 34.0, + "kl": 0.0, + "learning_rate": 4.386887546098509e-06, + "logits/chosen": 697901248.0, + "logits/rejected": 513040064.0, + "logps/chosen": -264.2718200683594, + "logps/rejected": -405.6717224121094, + "loss": 0.0542, + "rewards/chosen": 2.7666354179382324, + "rewards/margins": 11.689160823822021, + "rewards/rejected": -8.922525405883789, + "step": 5911 + }, + { + "epoch": 0.5401553220648698, + "grad_norm": 1.7109375, + "kl": 0.0, + "learning_rate": 4.385460622145493e-06, + "logits/chosen": 468048640.0, + "logits/rejected": 613395507.2, + "logps/chosen": -244.94234212239584, + "logps/rejected": -556.02529296875, + "loss": 0.011, + "rewards/chosen": 3.6389506657918296, + "rewards/margins": 14.621633466084798, + "rewards/rejected": -10.982682800292968, + "step": 5912 + }, + { + "epoch": 0.5402466879853814, + "grad_norm": 3.15625, + "kl": 0.0, + "learning_rate": 4.3840337490091905e-06, + "logits/chosen": 338785280.0, + "logits/rejected": 599133696.0, + "logps/chosen": -189.26852416992188, + "logps/rejected": -469.38751220703125, + "loss": 0.0258, + "rewards/chosen": 3.236738681793213, + "rewards/margins": 11.52018690109253, + "rewards/rejected": -8.283448219299316, + "step": 5913 + }, + { + "epoch": 0.5403380539058931, + "grad_norm": 2.03125, + "kl": 0.0, + "learning_rate": 4.3826069268075884e-06, + "logits/chosen": 647132245.3333334, + "logits/rejected": 621538944.0, + "logps/chosen": -296.44748942057294, + "logps/rejected": -582.4620361328125, + "loss": 0.0187, + "rewards/chosen": 3.8688952128092446, + "rewards/margins": 12.669343630472818, + "rewards/rejected": -8.800448417663574, + "step": 5914 + }, + { + "epoch": 0.5404294198264048, + "grad_norm": 1.90625, + "kl": 0.0, + "learning_rate": 4.3811801556586755e-06, + "logits/chosen": 375062869.3333333, + "logits/rejected": 644574924.8, + "logps/chosen": -242.82731119791666, + "logps/rejected": -540.910009765625, + "loss": 0.0101, + "rewards/chosen": 3.8363771438598633, + "rewards/margins": 13.054283332824706, + "rewards/rejected": -9.217906188964843, + "step": 5915 + }, + { + "epoch": 0.5405207857469164, + "grad_norm": 3.109375, + "kl": 0.0, + "learning_rate": 4.379753435680429e-06, + "logits/chosen": 277836096.0, + "logits/rejected": 189905216.0, + "logps/chosen": -217.5462188720703, + "logps/rejected": -473.10565185546875, + "loss": 0.0161, + "rewards/chosen": 4.266273498535156, + "rewards/margins": 13.423933029174805, + "rewards/rejected": -9.157659530639648, + "step": 5916 + }, + { + "epoch": 0.540612151667428, + "grad_norm": 0.404296875, + "kl": 0.0, + "learning_rate": 4.378326766990827e-06, + "logits/chosen": 310919904.0, + "logits/rejected": 705907882.6666666, + "logps/chosen": -231.85873413085938, + "logps/rejected": -536.3961995442709, + "loss": 0.0023, + "rewards/chosen": 4.980401039123535, + "rewards/margins": 13.842813809712728, + "rewards/rejected": -8.862412770589193, + "step": 5917 + }, + { + "epoch": 0.5407035175879397, + "grad_norm": 32.25, + "kl": 0.0, + "learning_rate": 4.376900149707841e-06, + "logits/chosen": 629207637.3333334, + "logits/rejected": 998989824.0, + "logps/chosen": -301.8460693359375, + "logps/rejected": -427.53447265625, + "loss": 0.0903, + "rewards/chosen": 2.015805244445801, + "rewards/margins": 11.329661750793457, + "rewards/rejected": -9.313856506347657, + "step": 5918 + }, + { + "epoch": 0.5407948835084514, + "grad_norm": 2.71875, + "kl": 0.0, + "learning_rate": 4.37547358394944e-06, + "logits/chosen": 441489578.6666667, + "logits/rejected": 622111334.4, + "logps/chosen": -349.6593831380208, + "logps/rejected": -428.09580078125, + "loss": 0.0143, + "rewards/chosen": 3.3473523457845054, + "rewards/margins": 14.510749562581381, + "rewards/rejected": -11.163397216796875, + "step": 5919 + }, + { + "epoch": 0.540886249428963, + "grad_norm": 2.25, + "kl": 0.0, + "learning_rate": 4.374047069833588e-06, + "logits/chosen": 543277158.4, + "logits/rejected": 528240042.6666667, + "logps/chosen": -317.930908203125, + "logps/rejected": -317.28521728515625, + "loss": 0.0235, + "rewards/chosen": 3.602857208251953, + "rewards/margins": 13.383609390258789, + "rewards/rejected": -9.780752182006836, + "step": 5920 + }, + { + "epoch": 0.5409776153494746, + "grad_norm": 3.28125, + "kl": 0.0, + "learning_rate": 4.372620607478242e-06, + "logits/chosen": 537523008.0, + "logits/rejected": 517164032.0, + "logps/chosen": -468.092529296875, + "logps/rejected": -428.154296875, + "loss": 0.0232, + "rewards/chosen": 3.138080596923828, + "rewards/margins": 11.828008651733398, + "rewards/rejected": -8.68992805480957, + "step": 5921 + }, + { + "epoch": 0.5410689812699863, + "grad_norm": 12.1875, + "kl": 0.0, + "learning_rate": 4.37119419700136e-06, + "logits/chosen": 500050432.0, + "logits/rejected": 624750028.8, + "logps/chosen": -238.29292805989584, + "logps/rejected": -565.9822265625, + "loss": 0.0584, + "rewards/chosen": 2.1801846822102866, + "rewards/margins": 10.824126942952475, + "rewards/rejected": -8.643942260742188, + "step": 5922 + }, + { + "epoch": 0.541160347190498, + "grad_norm": 4.59375, + "kl": 0.0, + "learning_rate": 4.369767838520891e-06, + "logits/chosen": 522367616.0, + "logits/rejected": 244250144.0, + "logps/chosen": -366.3111572265625, + "logps/rejected": -226.41587829589844, + "loss": 0.031, + "rewards/chosen": 3.2889792124430337, + "rewards/margins": 10.440282503763834, + "rewards/rejected": -7.151303291320801, + "step": 5923 + }, + { + "epoch": 0.5412517131110096, + "grad_norm": 2.34375, + "kl": 0.0, + "learning_rate": 4.368341532154784e-06, + "logits/chosen": 567334400.0, + "logits/rejected": 654972992.0, + "logps/chosen": -364.36572265625, + "logps/rejected": -484.16632080078125, + "loss": 0.0143, + "rewards/chosen": 3.554539442062378, + "rewards/margins": 13.987091779708862, + "rewards/rejected": -10.432552337646484, + "step": 5924 + }, + { + "epoch": 0.5413430790315212, + "grad_norm": 2.359375, + "kl": 0.0, + "learning_rate": 4.366915278020979e-06, + "logits/chosen": 909676714.6666666, + "logits/rejected": 382521024.0, + "logps/chosen": -281.7616373697917, + "logps/rejected": -556.4290771484375, + "loss": 0.0213, + "rewards/chosen": 3.6621726353963218, + "rewards/margins": 15.256803830464682, + "rewards/rejected": -11.59463119506836, + "step": 5925 + }, + { + "epoch": 0.5414344449520329, + "grad_norm": 2.875, + "kl": 0.0, + "learning_rate": 4.3654890762374154e-06, + "logits/chosen": 588789504.0, + "logits/rejected": 933676544.0, + "logps/chosen": -340.5072021484375, + "logps/rejected": -510.2611490885417, + "loss": 0.0146, + "rewards/chosen": 4.408537673950195, + "rewards/margins": 11.96571617126465, + "rewards/rejected": -7.557178497314453, + "step": 5926 + }, + { + "epoch": 0.5415258108725446, + "grad_norm": 0.91796875, + "kl": 0.0, + "learning_rate": 4.3640629269220265e-06, + "logits/chosen": 494547456.0, + "logits/rejected": 446127001.6, + "logps/chosen": -454.1465657552083, + "logps/rejected": -480.040283203125, + "loss": 0.0045, + "rewards/chosen": 4.641552607218425, + "rewards/margins": 14.497651545206708, + "rewards/rejected": -9.856098937988282, + "step": 5927 + }, + { + "epoch": 0.5416171767930562, + "grad_norm": 2.171875, + "kl": 0.0, + "learning_rate": 4.36263683019274e-06, + "logits/chosen": 915448064.0, + "logits/rejected": 482358988.8, + "logps/chosen": -261.1273600260417, + "logps/rejected": -380.77783203125, + "loss": 0.0121, + "rewards/chosen": 3.6914450327555337, + "rewards/margins": 13.259129969278971, + "rewards/rejected": -9.567684936523438, + "step": 5928 + }, + { + "epoch": 0.5417085427135678, + "grad_norm": 39.25, + "kl": 0.0, + "learning_rate": 4.3612107861674845e-06, + "logits/chosen": 829821312.0, + "logits/rejected": 517764800.0, + "logps/chosen": -257.30303955078125, + "logps/rejected": -474.0472412109375, + "loss": 0.0399, + "rewards/chosen": 3.0196385383605957, + "rewards/margins": 12.054358005523682, + "rewards/rejected": -9.034719467163086, + "step": 5929 + }, + { + "epoch": 0.5417999086340795, + "grad_norm": 5.03125, + "kl": 0.0, + "learning_rate": 4.359784794964177e-06, + "logits/chosen": 742572416.0, + "logits/rejected": 963927808.0, + "logps/chosen": -452.9139099121094, + "logps/rejected": -487.1575622558594, + "loss": 0.0255, + "rewards/chosen": 3.2665252685546875, + "rewards/margins": 12.867792129516602, + "rewards/rejected": -9.601266860961914, + "step": 5930 + }, + { + "epoch": 0.5418912745545912, + "grad_norm": 5.53125, + "kl": 0.0, + "learning_rate": 4.358358856700735e-06, + "logits/chosen": 574619904.0, + "logits/rejected": 550301568.0, + "logps/chosen": -510.1044616699219, + "logps/rejected": -643.7081909179688, + "loss": 0.0358, + "rewards/chosen": 2.579583168029785, + "rewards/margins": 13.161774635314941, + "rewards/rejected": -10.582191467285156, + "step": 5931 + }, + { + "epoch": 0.5419826404751028, + "grad_norm": 1.625, + "kl": 0.0, + "learning_rate": 4.356932971495071e-06, + "logits/chosen": 479118816.0, + "logits/rejected": 355892288.0, + "logps/chosen": -353.7474060058594, + "logps/rejected": -546.9874267578125, + "loss": 0.0082, + "rewards/chosen": 4.780385494232178, + "rewards/margins": 15.472840785980225, + "rewards/rejected": -10.692455291748047, + "step": 5932 + }, + { + "epoch": 0.5420740063956144, + "grad_norm": 3.53125, + "kl": 0.0, + "learning_rate": 4.355507139465092e-06, + "logits/chosen": 296588320.0, + "logits/rejected": 461377718.85714287, + "logps/chosen": -282.00396728515625, + "logps/rejected": -482.623046875, + "loss": 0.0945, + "rewards/chosen": 5.242785930633545, + "rewards/margins": 13.285991736820765, + "rewards/rejected": -8.04320580618722, + "step": 5933 + }, + { + "epoch": 0.542165372316126, + "grad_norm": 0.84765625, + "kl": 0.0, + "learning_rate": 4.354081360728701e-06, + "logits/chosen": 538437184.0, + "logits/rejected": 399481216.0, + "logps/chosen": -346.9725646972656, + "logps/rejected": -240.68850708007812, + "loss": 0.0044, + "rewards/chosen": 5.006405830383301, + "rewards/margins": 12.891904830932617, + "rewards/rejected": -7.885499000549316, + "step": 5934 + }, + { + "epoch": 0.5422567382366378, + "grad_norm": 4.8125, + "kl": 0.0, + "learning_rate": 4.352655635403797e-06, + "logits/chosen": 487417344.0, + "logits/rejected": 352950549.3333333, + "logps/chosen": -348.7892333984375, + "logps/rejected": -515.4736328125, + "loss": 0.0269, + "rewards/chosen": 3.319994354248047, + "rewards/margins": 14.163629913330078, + "rewards/rejected": -10.843635559082031, + "step": 5935 + }, + { + "epoch": 0.5423481041571494, + "grad_norm": 1.859375, + "kl": 0.0, + "learning_rate": 4.351229963608274e-06, + "logits/chosen": 411251456.0, + "logits/rejected": 444247840.0, + "logps/chosen": -286.7330627441406, + "logps/rejected": -484.04949951171875, + "loss": 0.0106, + "rewards/chosen": 4.638824462890625, + "rewards/margins": 13.62142276763916, + "rewards/rejected": -8.982598304748535, + "step": 5936 + }, + { + "epoch": 0.542439470077661, + "grad_norm": 6.75, + "kl": 0.0, + "learning_rate": 4.349804345460022e-06, + "logits/chosen": 534728192.0, + "logits/rejected": 713793024.0, + "logps/chosen": -382.94287109375, + "logps/rejected": -573.7013671875, + "loss": 0.0326, + "rewards/chosen": 3.181987762451172, + "rewards/margins": 10.91733627319336, + "rewards/rejected": -7.735348510742187, + "step": 5937 + }, + { + "epoch": 0.5425308359981726, + "grad_norm": 1.296875, + "kl": 0.0, + "learning_rate": 4.348378781076927e-06, + "logits/chosen": 701314048.0, + "logits/rejected": 449935018.6666667, + "logps/chosen": -302.5911560058594, + "logps/rejected": -456.7030436197917, + "loss": 0.0072, + "rewards/chosen": 3.7504234313964844, + "rewards/margins": 12.610151290893555, + "rewards/rejected": -8.85972785949707, + "step": 5938 + }, + { + "epoch": 0.5426222019186844, + "grad_norm": 0.7421875, + "kl": 0.0, + "learning_rate": 4.346953270576869e-06, + "logits/chosen": 955985664.0, + "logits/rejected": 611903402.6666666, + "logps/chosen": -321.5574951171875, + "logps/rejected": -718.82958984375, + "loss": 0.003, + "rewards/chosen": 4.850685119628906, + "rewards/margins": 14.437819163004557, + "rewards/rejected": -9.58713404337565, + "step": 5939 + }, + { + "epoch": 0.542713567839196, + "grad_norm": 0.5546875, + "kl": 0.0, + "learning_rate": 4.345527814077726e-06, + "logits/chosen": 816056832.0, + "logits/rejected": 564534857.1428572, + "logps/chosen": -340.6549072265625, + "logps/rejected": -383.0518275669643, + "loss": 0.0018, + "rewards/chosen": 5.492352485656738, + "rewards/margins": 14.370491436549596, + "rewards/rejected": -8.878138950892858, + "step": 5940 + }, + { + "epoch": 0.5428049337597076, + "grad_norm": 22.125, + "kl": 0.0, + "learning_rate": 4.344102411697368e-06, + "logits/chosen": 563125760.0, + "logits/rejected": 664418432.0, + "logps/chosen": -418.4678955078125, + "logps/rejected": -138.1692352294922, + "loss": 0.111, + "rewards/chosen": 4.535857200622559, + "rewards/margins": 8.547821521759033, + "rewards/rejected": -4.011964321136475, + "step": 5941 + }, + { + "epoch": 0.5428962996802192, + "grad_norm": 2.890625, + "kl": 0.0, + "learning_rate": 4.342677063553663e-06, + "logits/chosen": 1304582272.0, + "logits/rejected": 897952448.0, + "logps/chosen": -212.50668334960938, + "logps/rejected": -604.754638671875, + "loss": 0.0257, + "rewards/chosen": 3.0451252460479736, + "rewards/margins": 11.84146523475647, + "rewards/rejected": -8.796339988708496, + "step": 5942 + }, + { + "epoch": 0.542987665600731, + "grad_norm": 2.25, + "kl": 0.0, + "learning_rate": 4.341251769764475e-06, + "logits/chosen": 556805504.0, + "logits/rejected": 354854208.0, + "logps/chosen": -368.4005126953125, + "logps/rejected": -379.0250244140625, + "loss": 0.0131, + "rewards/chosen": 4.361795425415039, + "rewards/margins": 13.576761245727539, + "rewards/rejected": -9.2149658203125, + "step": 5943 + }, + { + "epoch": 0.5430790315212426, + "grad_norm": 6.25, + "kl": 0.0, + "learning_rate": 4.339826530447662e-06, + "logits/chosen": 705727897.6, + "logits/rejected": 853815808.0, + "logps/chosen": -235.0568115234375, + "logps/rejected": -555.4002278645834, + "loss": 0.0411, + "rewards/chosen": 3.0203676223754883, + "rewards/margins": 13.181488990783691, + "rewards/rejected": -10.161121368408203, + "step": 5944 + }, + { + "epoch": 0.5431703974417542, + "grad_norm": 0.79296875, + "kl": 0.0, + "learning_rate": 4.338401345721079e-06, + "logits/chosen": 220742768.0, + "logits/rejected": 951918372.5714285, + "logps/chosen": -403.8580322265625, + "logps/rejected": -505.62667410714283, + "loss": 0.0028, + "rewards/chosen": 3.855480909347534, + "rewards/margins": 12.679996115820748, + "rewards/rejected": -8.824515206473214, + "step": 5945 + }, + { + "epoch": 0.5432617633622658, + "grad_norm": 4.71875, + "kl": 0.0, + "learning_rate": 4.336976215702574e-06, + "logits/chosen": 687435136.0, + "logits/rejected": 562849024.0, + "logps/chosen": -549.6511637369791, + "logps/rejected": -355.54189453125, + "loss": 0.0155, + "rewards/chosen": 3.506381352742513, + "rewards/margins": 11.460644658406576, + "rewards/rejected": -7.954263305664062, + "step": 5946 + }, + { + "epoch": 0.5433531292827776, + "grad_norm": 8.375, + "kl": 0.0, + "learning_rate": 4.335551140509994e-06, + "logits/chosen": 558631424.0, + "logits/rejected": 760679104.0, + "logps/chosen": -295.98988560267856, + "logps/rejected": -878.777099609375, + "loss": 0.0709, + "rewards/chosen": 2.8712071010044644, + "rewards/margins": 13.817422730582102, + "rewards/rejected": -10.946215629577637, + "step": 5947 + }, + { + "epoch": 0.5434444952032892, + "grad_norm": 1.03125, + "kl": 0.0, + "learning_rate": 4.334126120261177e-06, + "logits/chosen": 652718592.0, + "logits/rejected": 1057763942.4, + "logps/chosen": -348.5569254557292, + "logps/rejected": -516.6166015625, + "loss": 0.0047, + "rewards/chosen": 4.526662826538086, + "rewards/margins": 15.047084426879882, + "rewards/rejected": -10.520421600341797, + "step": 5948 + }, + { + "epoch": 0.5435358611238008, + "grad_norm": 2.21875, + "kl": 0.0, + "learning_rate": 4.332701155073962e-06, + "logits/chosen": 520423936.0, + "logits/rejected": 457369952.0, + "logps/chosen": -280.27850341796875, + "logps/rejected": -499.57275390625, + "loss": 0.0118, + "rewards/chosen": 4.618365287780762, + "rewards/margins": 13.812719345092773, + "rewards/rejected": -9.194354057312012, + "step": 5949 + }, + { + "epoch": 0.5436272270443124, + "grad_norm": 3.234375, + "kl": 0.0, + "learning_rate": 4.3312762450661775e-06, + "logits/chosen": 894429440.0, + "logits/rejected": 622106496.0, + "logps/chosen": -694.0830078125, + "logps/rejected": -517.8890380859375, + "loss": 0.0204, + "rewards/chosen": 4.5811567306518555, + "rewards/margins": 13.521641731262207, + "rewards/rejected": -8.940485000610352, + "step": 5950 + }, + { + "epoch": 0.5437185929648242, + "grad_norm": 2.21875, + "kl": 0.0, + "learning_rate": 4.329851390355653e-06, + "logits/chosen": 921112678.4, + "logits/rejected": 1110221653.3333333, + "logps/chosen": -314.2107421875, + "logps/rejected": -336.9904378255208, + "loss": 0.0154, + "rewards/chosen": 4.032732391357422, + "rewards/margins": 12.552546564737955, + "rewards/rejected": -8.519814173380533, + "step": 5951 + }, + { + "epoch": 0.5438099588853358, + "grad_norm": 2.09375, + "kl": 0.0, + "learning_rate": 4.328426591060208e-06, + "logits/chosen": 742696618.6666666, + "logits/rejected": 479246643.2, + "logps/chosen": -546.6129557291666, + "logps/rejected": -509.7826171875, + "loss": 0.0118, + "rewards/chosen": 3.4696340560913086, + "rewards/margins": 14.183127403259277, + "rewards/rejected": -10.713493347167969, + "step": 5952 + }, + { + "epoch": 0.5439013248058474, + "grad_norm": 1.8125, + "kl": 0.0, + "learning_rate": 4.327001847297665e-06, + "logits/chosen": 480557120.0, + "logits/rejected": 483357888.0, + "logps/chosen": -376.32244873046875, + "logps/rejected": -544.900634765625, + "loss": 0.0101, + "rewards/chosen": 4.016858100891113, + "rewards/margins": 14.310656547546387, + "rewards/rejected": -10.293798446655273, + "step": 5953 + }, + { + "epoch": 0.543992690726359, + "grad_norm": 0.67578125, + "kl": 0.0, + "learning_rate": 4.32557715918583e-06, + "logits/chosen": 585101696.0, + "logits/rejected": 509318485.3333333, + "logps/chosen": -408.9086608886719, + "logps/rejected": -526.337646484375, + "loss": 0.0026, + "rewards/chosen": 5.264422416687012, + "rewards/margins": 15.095706621805826, + "rewards/rejected": -9.831284205118815, + "step": 5954 + }, + { + "epoch": 0.5440840566468708, + "grad_norm": 3.046875, + "kl": 0.0, + "learning_rate": 4.324152526842517e-06, + "logits/chosen": 390814677.3333333, + "logits/rejected": 208405824.0, + "logps/chosen": -259.1453857421875, + "logps/rejected": -423.26361083984375, + "loss": 0.1383, + "rewards/chosen": 2.939103444417318, + "rewards/margins": 16.555735905965168, + "rewards/rejected": -13.616632461547852, + "step": 5955 + }, + { + "epoch": 0.5441754225673824, + "grad_norm": 5.625, + "kl": 0.0, + "learning_rate": 4.322727950385527e-06, + "logits/chosen": 496249088.0, + "logits/rejected": 959099200.0, + "logps/chosen": -415.1767578125, + "logps/rejected": -1105.156005859375, + "loss": 0.0316, + "rewards/chosen": 3.3405815760294595, + "rewards/margins": 13.780352274576822, + "rewards/rejected": -10.439770698547363, + "step": 5956 + }, + { + "epoch": 0.544266788487894, + "grad_norm": 1.375, + "kl": 0.0, + "learning_rate": 4.321303429932662e-06, + "logits/chosen": 536527718.4, + "logits/rejected": 267563392.0, + "logps/chosen": -227.150634765625, + "logps/rejected": -290.8104248046875, + "loss": 0.0074, + "rewards/chosen": 5.114927673339844, + "rewards/margins": 15.132737223307291, + "rewards/rejected": -10.017809549967447, + "step": 5957 + }, + { + "epoch": 0.5443581544084056, + "grad_norm": 29.875, + "kl": 0.0, + "learning_rate": 4.3198789656017125e-06, + "logits/chosen": 950863530.6666666, + "logits/rejected": 431671168.0, + "logps/chosen": -306.7972412109375, + "logps/rejected": -358.03875732421875, + "loss": 0.1068, + "rewards/chosen": 3.2671632766723633, + "rewards/margins": 11.305145263671875, + "rewards/rejected": -8.037981986999512, + "step": 5958 + }, + { + "epoch": 0.5444495203289174, + "grad_norm": 1.8359375, + "kl": 0.0, + "learning_rate": 4.318454557510473e-06, + "logits/chosen": 857875904.0, + "logits/rejected": 953451434.6666666, + "logps/chosen": -373.4454650878906, + "logps/rejected": -560.5008138020834, + "loss": 0.0068, + "rewards/chosen": 3.6446352005004883, + "rewards/margins": 13.979738553365072, + "rewards/rejected": -10.335103352864584, + "step": 5959 + }, + { + "epoch": 0.544540886249429, + "grad_norm": 7.34375, + "kl": 0.0, + "learning_rate": 4.317030205776726e-06, + "logits/chosen": 561823232.0, + "logits/rejected": 490685824.0, + "logps/chosen": -428.02518136160717, + "logps/rejected": -334.69525146484375, + "loss": 0.067, + "rewards/chosen": 2.5843838282993863, + "rewards/margins": 12.069775445120676, + "rewards/rejected": -9.485391616821289, + "step": 5960 + }, + { + "epoch": 0.5446322521699406, + "grad_norm": 2.828125, + "kl": 0.0, + "learning_rate": 4.315605910518255e-06, + "logits/chosen": 519963946.6666667, + "logits/rejected": 1129907097.6, + "logps/chosen": -274.4378662109375, + "logps/rejected": -492.90830078125, + "loss": 0.0195, + "rewards/chosen": 3.146393140157064, + "rewards/margins": 11.815745862325032, + "rewards/rejected": -8.669352722167968, + "step": 5961 + }, + { + "epoch": 0.5447236180904522, + "grad_norm": 34.25, + "kl": 0.0, + "learning_rate": 4.314181671852833e-06, + "logits/chosen": 357703978.6666667, + "logits/rejected": 465109664.0, + "logps/chosen": -242.213623046875, + "logps/rejected": -408.48583984375, + "loss": 0.0961, + "rewards/chosen": 3.2527554829915366, + "rewards/margins": 13.694911321004232, + "rewards/rejected": -10.442155838012695, + "step": 5962 + }, + { + "epoch": 0.544814984010964, + "grad_norm": 2.1875, + "kl": 0.0, + "learning_rate": 4.312757489898232e-06, + "logits/chosen": 303321376.0, + "logits/rejected": 518292699.4285714, + "logps/chosen": -308.20001220703125, + "logps/rejected": -496.11983816964283, + "loss": 0.0084, + "rewards/chosen": 2.656381368637085, + "rewards/margins": 12.653759036745344, + "rewards/rejected": -9.997377668108259, + "step": 5963 + }, + { + "epoch": 0.5449063499314756, + "grad_norm": 3.96875, + "kl": 0.0, + "learning_rate": 4.31133336477222e-06, + "logits/chosen": 410074048.0, + "logits/rejected": 435255488.0, + "logps/chosen": -394.1266174316406, + "logps/rejected": -393.892578125, + "loss": 0.0244, + "rewards/chosen": 3.2750916481018066, + "rewards/margins": 12.708487033843994, + "rewards/rejected": -9.433395385742188, + "step": 5964 + }, + { + "epoch": 0.5449977158519872, + "grad_norm": 51.0, + "kl": 0.0, + "learning_rate": 4.309909296592556e-06, + "logits/chosen": 596202154.6666666, + "logits/rejected": 701246771.2, + "logps/chosen": -174.26214599609375, + "logps/rejected": -771.413037109375, + "loss": 0.0537, + "rewards/chosen": 2.687256177266439, + "rewards/margins": 17.665817578633625, + "rewards/rejected": -14.978561401367188, + "step": 5965 + }, + { + "epoch": 0.5450890817724988, + "grad_norm": 4.15625, + "kl": 0.0, + "learning_rate": 4.308485285477002e-06, + "logits/chosen": 727961907.2, + "logits/rejected": 1211170645.3333333, + "logps/chosen": -403.842333984375, + "logps/rejected": -540.8079427083334, + "loss": 0.0328, + "rewards/chosen": 3.358021926879883, + "rewards/margins": 12.698275756835937, + "rewards/rejected": -9.340253829956055, + "step": 5966 + }, + { + "epoch": 0.5451804476930106, + "grad_norm": 27.125, + "kl": 0.0, + "learning_rate": 4.307061331543307e-06, + "logits/chosen": 924762282.6666666, + "logits/rejected": 718738688.0, + "logps/chosen": -298.68701171875, + "logps/rejected": -343.040283203125, + "loss": 0.1084, + "rewards/chosen": 2.918584187825521, + "rewards/margins": 10.254168351491293, + "rewards/rejected": -7.3355841636657715, + "step": 5967 + }, + { + "epoch": 0.5452718136135222, + "grad_norm": 2.484375, + "kl": 0.0, + "learning_rate": 4.305637434909219e-06, + "logits/chosen": 375196134.4, + "logits/rejected": 584054698.6666666, + "logps/chosen": -322.565869140625, + "logps/rejected": -584.1968587239584, + "loss": 0.0136, + "rewards/chosen": 4.665470886230469, + "rewards/margins": 13.102349090576173, + "rewards/rejected": -8.436878204345703, + "step": 5968 + }, + { + "epoch": 0.5453631795340338, + "grad_norm": 3.234375, + "kl": 0.0, + "learning_rate": 4.304213595692481e-06, + "logits/chosen": 719394944.0, + "logits/rejected": 377401664.0, + "logps/chosen": -311.134521484375, + "logps/rejected": -406.16412353515625, + "loss": 0.0201, + "rewards/chosen": 3.4707584381103516, + "rewards/margins": 12.848081588745117, + "rewards/rejected": -9.377323150634766, + "step": 5969 + }, + { + "epoch": 0.5454545454545454, + "grad_norm": 1.6171875, + "kl": 0.0, + "learning_rate": 4.302789814010835e-06, + "logits/chosen": 352788373.3333333, + "logits/rejected": 517581977.6, + "logps/chosen": -269.61163330078125, + "logps/rejected": -618.164013671875, + "loss": 0.0108, + "rewards/chosen": 3.77339235941569, + "rewards/margins": 13.386521021525065, + "rewards/rejected": -9.613128662109375, + "step": 5970 + }, + { + "epoch": 0.5455459113750571, + "grad_norm": 0.68359375, + "kl": 0.0, + "learning_rate": 4.301366089982009e-06, + "logits/chosen": 609363456.0, + "logits/rejected": 430901043.2, + "logps/chosen": -236.69171142578125, + "logps/rejected": -377.5529296875, + "loss": 0.0049, + "rewards/chosen": 4.560863494873047, + "rewards/margins": 14.005413055419922, + "rewards/rejected": -9.444549560546875, + "step": 5971 + }, + { + "epoch": 0.5456372772955688, + "grad_norm": 4.3125, + "kl": 0.0, + "learning_rate": 4.299942423723735e-06, + "logits/chosen": 567329587.2, + "logits/rejected": 668958464.0, + "logps/chosen": -405.955908203125, + "logps/rejected": -805.7716471354166, + "loss": 0.0326, + "rewards/chosen": 2.947550964355469, + "rewards/margins": 16.09911651611328, + "rewards/rejected": -13.151565551757812, + "step": 5972 + }, + { + "epoch": 0.5457286432160804, + "grad_norm": 3.28125, + "kl": 0.0, + "learning_rate": 4.298518815353737e-06, + "logits/chosen": 594056704.0, + "logits/rejected": 808830464.0, + "logps/chosen": -298.4634033203125, + "logps/rejected": -638.482177734375, + "loss": 0.0211, + "rewards/chosen": 3.883517837524414, + "rewards/margins": 15.246211624145507, + "rewards/rejected": -11.362693786621094, + "step": 5973 + }, + { + "epoch": 0.545820009136592, + "grad_norm": 2.59375, + "kl": 0.0, + "learning_rate": 4.297095264989732e-06, + "logits/chosen": 452794112.0, + "logits/rejected": 415943628.8, + "logps/chosen": -257.10845947265625, + "logps/rejected": -532.51416015625, + "loss": 0.0179, + "rewards/chosen": 3.079814592997233, + "rewards/margins": 13.898909441630044, + "rewards/rejected": -10.819094848632812, + "step": 5974 + }, + { + "epoch": 0.5459113750571037, + "grad_norm": 1.03125, + "kl": 0.0, + "learning_rate": 4.295671772749438e-06, + "logits/chosen": 456336981.3333333, + "logits/rejected": 641238528.0, + "logps/chosen": -275.40488688151044, + "logps/rejected": -462.64033203125, + "loss": 0.0044, + "rewards/chosen": 4.871402422587077, + "rewards/margins": 13.146034685770672, + "rewards/rejected": -8.274632263183594, + "step": 5975 + }, + { + "epoch": 0.5460027409776154, + "grad_norm": 2.046875, + "kl": 0.0, + "learning_rate": 4.294248338750561e-06, + "logits/chosen": 702161621.3333334, + "logits/rejected": 341166272.0, + "logps/chosen": -271.16823323567706, + "logps/rejected": -322.94317626953125, + "loss": 0.0158, + "rewards/chosen": 4.277035395304362, + "rewards/margins": 13.837459246317547, + "rewards/rejected": -9.560423851013184, + "step": 5976 + }, + { + "epoch": 0.546094106898127, + "grad_norm": 1.5234375, + "kl": 0.0, + "learning_rate": 4.292824963110809e-06, + "logits/chosen": 666606694.4, + "logits/rejected": 1032624554.6666666, + "logps/chosen": -326.09599609375, + "logps/rejected": -669.0556640625, + "loss": 0.0124, + "rewards/chosen": 4.102029418945312, + "rewards/margins": 14.822293853759765, + "rewards/rejected": -10.720264434814453, + "step": 5977 + }, + { + "epoch": 0.5461854728186386, + "grad_norm": 4.03125, + "kl": 0.0, + "learning_rate": 4.291401645947879e-06, + "logits/chosen": 437018470.4, + "logits/rejected": 366194688.0, + "logps/chosen": -240.213916015625, + "logps/rejected": -435.0696207682292, + "loss": 0.0231, + "rewards/chosen": 4.167559051513672, + "rewards/margins": 14.001250966389975, + "rewards/rejected": -9.833691914876303, + "step": 5978 + }, + { + "epoch": 0.5462768387391503, + "grad_norm": 3.75, + "kl": 0.0, + "learning_rate": 4.28997838737947e-06, + "logits/chosen": 552512000.0, + "logits/rejected": 340088448.0, + "logps/chosen": -365.6736653645833, + "logps/rejected": -338.34515380859375, + "loss": 0.0313, + "rewards/chosen": 3.4006945292154946, + "rewards/margins": 12.939401308695475, + "rewards/rejected": -9.53870677947998, + "step": 5979 + }, + { + "epoch": 0.546368204659662, + "grad_norm": 3.46875, + "kl": 0.0, + "learning_rate": 4.288555187523269e-06, + "logits/chosen": 562975808.0, + "logits/rejected": 504535552.0, + "logps/chosen": -229.0545654296875, + "logps/rejected": -417.72247314453125, + "loss": 0.0136, + "rewards/chosen": 4.359277725219727, + "rewards/margins": 11.760375022888184, + "rewards/rejected": -7.401097297668457, + "step": 5980 + }, + { + "epoch": 0.5464595705801736, + "grad_norm": 1.609375, + "kl": 0.0, + "learning_rate": 4.28713204649696e-06, + "logits/chosen": 362668501.3333333, + "logits/rejected": 602358118.4, + "logps/chosen": -216.8254191080729, + "logps/rejected": -438.3244140625, + "loss": 0.0118, + "rewards/chosen": 3.909382184346517, + "rewards/margins": 13.893208630879721, + "rewards/rejected": -9.983826446533204, + "step": 5981 + }, + { + "epoch": 0.5465509365006852, + "grad_norm": 4.46875, + "kl": 0.0, + "learning_rate": 4.285708964418228e-06, + "logits/chosen": 467024320.0, + "logits/rejected": 551118976.0, + "logps/chosen": -340.00872802734375, + "logps/rejected": -409.7880859375, + "loss": 0.0275, + "rewards/chosen": 3.039600372314453, + "rewards/margins": 13.82733154296875, + "rewards/rejected": -10.787731170654297, + "step": 5982 + }, + { + "epoch": 0.5466423024211969, + "grad_norm": 1.5859375, + "kl": 0.0, + "learning_rate": 4.284285941404747e-06, + "logits/chosen": 404071392.0, + "logits/rejected": 546681216.0, + "logps/chosen": -307.65045166015625, + "logps/rejected": -525.0380859375, + "loss": 0.0106, + "rewards/chosen": 4.238534450531006, + "rewards/margins": 12.417033672332764, + "rewards/rejected": -8.178499221801758, + "step": 5983 + }, + { + "epoch": 0.5467336683417086, + "grad_norm": 2.671875, + "kl": 0.0, + "learning_rate": 4.282862977574185e-06, + "logits/chosen": 422701568.0, + "logits/rejected": 862618432.0, + "logps/chosen": -155.05661010742188, + "logps/rejected": -818.38671875, + "loss": 0.0328, + "rewards/chosen": 3.4249890645345054, + "rewards/margins": 13.619101842244467, + "rewards/rejected": -10.194112777709961, + "step": 5984 + }, + { + "epoch": 0.5468250342622202, + "grad_norm": 2.765625, + "kl": 0.0, + "learning_rate": 4.281440073044213e-06, + "logits/chosen": 540656332.8, + "logits/rejected": 490480128.0, + "logps/chosen": -225.66875, + "logps/rejected": -716.6214192708334, + "loss": 0.0248, + "rewards/chosen": 3.441412353515625, + "rewards/margins": 15.34900131225586, + "rewards/rejected": -11.907588958740234, + "step": 5985 + }, + { + "epoch": 0.5469164001827318, + "grad_norm": 55.25, + "kl": 0.0, + "learning_rate": 4.280017227932487e-06, + "logits/chosen": 616792268.8, + "logits/rejected": 522991232.0, + "logps/chosen": -322.537548828125, + "logps/rejected": -412.3772786458333, + "loss": 0.1401, + "rewards/chosen": 2.9591001510620116, + "rewards/margins": 8.146664365132649, + "rewards/rejected": -5.187564214070638, + "step": 5986 + }, + { + "epoch": 0.5470077661032435, + "grad_norm": 3.03125, + "kl": 0.0, + "learning_rate": 4.278594442356667e-06, + "logits/chosen": 528275029.3333333, + "logits/rejected": 388178464.0, + "logps/chosen": -325.8352457682292, + "logps/rejected": -419.83355712890625, + "loss": 0.0163, + "rewards/chosen": 4.150912602742513, + "rewards/margins": 11.440012296040852, + "rewards/rejected": -7.28909969329834, + "step": 5987 + }, + { + "epoch": 0.5470991320237552, + "grad_norm": 39.75, + "kl": 0.0, + "learning_rate": 4.2771717164344015e-06, + "logits/chosen": 633459498.6666666, + "logits/rejected": 431883008.0, + "logps/chosen": -200.43497721354166, + "logps/rejected": -603.1425170898438, + "loss": 0.0552, + "rewards/chosen": 3.4980014165242515, + "rewards/margins": 15.91980012257894, + "rewards/rejected": -12.421798706054688, + "step": 5988 + }, + { + "epoch": 0.5471904979442668, + "grad_norm": 0.76953125, + "kl": 0.0, + "learning_rate": 4.275749050283339e-06, + "logits/chosen": 532689184.0, + "logits/rejected": 425402208.0, + "logps/chosen": -352.7059631347656, + "logps/rejected": -580.9790649414062, + "loss": 0.0043, + "rewards/chosen": 4.99851131439209, + "rewards/margins": 14.147822380065918, + "rewards/rejected": -9.149311065673828, + "step": 5989 + }, + { + "epoch": 0.5472818638647784, + "grad_norm": 1.78125, + "kl": 0.0, + "learning_rate": 4.274326444021119e-06, + "logits/chosen": 441391052.8, + "logits/rejected": 238248192.0, + "logps/chosen": -257.073828125, + "logps/rejected": -295.1868896484375, + "loss": 0.0112, + "rewards/chosen": 4.1373340606689455, + "rewards/margins": 13.495586903889976, + "rewards/rejected": -9.35825284322103, + "step": 5990 + }, + { + "epoch": 0.5473732297852901, + "grad_norm": 6.125, + "kl": 0.0, + "learning_rate": 4.272903897765378e-06, + "logits/chosen": 566734634.6666666, + "logits/rejected": 713830208.0, + "logps/chosen": -390.8510335286458, + "logps/rejected": -922.3480224609375, + "loss": 0.0447, + "rewards/chosen": 2.998080253601074, + "rewards/margins": 13.305535316467285, + "rewards/rejected": -10.307455062866211, + "step": 5991 + }, + { + "epoch": 0.5474645957058017, + "grad_norm": 0.490234375, + "kl": 0.0, + "learning_rate": 4.2714814116337485e-06, + "logits/chosen": 783857280.0, + "logits/rejected": 420030890.6666667, + "logps/chosen": -228.39584350585938, + "logps/rejected": -408.9735107421875, + "loss": 0.0029, + "rewards/chosen": 4.545154571533203, + "rewards/margins": 14.41397476196289, + "rewards/rejected": -9.868820190429688, + "step": 5992 + }, + { + "epoch": 0.5475559616263134, + "grad_norm": 0.353515625, + "kl": 0.0, + "learning_rate": 4.270058985743857e-06, + "logits/chosen": 148696272.0, + "logits/rejected": 421685101.71428573, + "logps/chosen": -62.40424346923828, + "logps/rejected": -451.95103236607144, + "loss": 0.0021, + "rewards/chosen": 4.278844356536865, + "rewards/margins": 12.802432809557233, + "rewards/rejected": -8.523588453020368, + "step": 5993 + }, + { + "epoch": 0.547647327546825, + "grad_norm": 1.78125, + "kl": 0.0, + "learning_rate": 4.268636620213325e-06, + "logits/chosen": 634333525.3333334, + "logits/rejected": 867134668.8, + "logps/chosen": -417.5926106770833, + "logps/rejected": -410.2822265625, + "loss": 0.0087, + "rewards/chosen": 4.244919776916504, + "rewards/margins": 12.19344882965088, + "rewards/rejected": -7.948529052734375, + "step": 5994 + }, + { + "epoch": 0.5477386934673367, + "grad_norm": 3.546875, + "kl": 0.0, + "learning_rate": 4.2672143151597664e-06, + "logits/chosen": 502439850.6666667, + "logits/rejected": 647257856.0, + "logps/chosen": -228.25333658854166, + "logps/rejected": -553.624755859375, + "loss": 0.037, + "rewards/chosen": 3.5230159759521484, + "rewards/margins": 11.773553848266602, + "rewards/rejected": -8.250537872314453, + "step": 5995 + }, + { + "epoch": 0.5478300593878483, + "grad_norm": 2.1875, + "kl": 0.0, + "learning_rate": 4.265792070700796e-06, + "logits/chosen": 442384810.6666667, + "logits/rejected": 381618176.0, + "logps/chosen": -341.4521484375, + "logps/rejected": -509.18486328125, + "loss": 0.0109, + "rewards/chosen": 4.239367167154948, + "rewards/margins": 13.578637186686198, + "rewards/rejected": -9.33927001953125, + "step": 5996 + }, + { + "epoch": 0.54792142530836, + "grad_norm": 2.828125, + "kl": 0.0, + "learning_rate": 4.264369886954017e-06, + "logits/chosen": 686701129.1428572, + "logits/rejected": 226932544.0, + "logps/chosen": -279.572265625, + "logps/rejected": -200.70230102539062, + "loss": 0.026, + "rewards/chosen": 3.9445719037737166, + "rewards/margins": 9.614292553492955, + "rewards/rejected": -5.669720649719238, + "step": 5997 + }, + { + "epoch": 0.5480127912288716, + "grad_norm": 2.078125, + "kl": 0.0, + "learning_rate": 4.262947764037034e-06, + "logits/chosen": 736606208.0, + "logits/rejected": 351687200.0, + "logps/chosen": -283.0257568359375, + "logps/rejected": -403.4321594238281, + "loss": 0.0158, + "rewards/chosen": 3.500710964202881, + "rewards/margins": 13.661749362945557, + "rewards/rejected": -10.161038398742676, + "step": 5998 + }, + { + "epoch": 0.5481041571493833, + "grad_norm": 3.171875, + "kl": 0.0, + "learning_rate": 4.261525702067442e-06, + "logits/chosen": 684282282.6666666, + "logits/rejected": 657706956.8, + "logps/chosen": -367.5849202473958, + "logps/rejected": -418.9716796875, + "loss": 0.0139, + "rewards/chosen": 3.456363042195638, + "rewards/margins": 12.574135716756185, + "rewards/rejected": -9.117772674560547, + "step": 5999 + }, + { + "epoch": 0.5481955230698949, + "grad_norm": 1.6328125, + "kl": 0.0, + "learning_rate": 4.260103701162832e-06, + "logits/chosen": 444401152.0, + "logits/rejected": 310948896.0, + "logps/chosen": -355.794189453125, + "logps/rejected": -295.79559326171875, + "loss": 0.0173, + "rewards/chosen": 4.277621587117513, + "rewards/margins": 14.245826085408527, + "rewards/rejected": -9.968204498291016, + "step": 6000 + }, + { + "epoch": 0.5482868889904066, + "grad_norm": 3.484375, + "kl": 0.0, + "learning_rate": 4.25868176144079e-06, + "logits/chosen": 1090216140.8, + "logits/rejected": 734095018.6666666, + "logps/chosen": -562.273681640625, + "logps/rejected": -732.69287109375, + "loss": 0.0234, + "rewards/chosen": 3.4707305908203123, + "rewards/margins": 16.220480092366536, + "rewards/rejected": -12.749749501546225, + "step": 6001 + }, + { + "epoch": 0.5483782549109182, + "grad_norm": 2.734375, + "kl": 0.0, + "learning_rate": 4.257259883018899e-06, + "logits/chosen": 462743296.0, + "logits/rejected": 589152768.0, + "logps/chosen": -431.015380859375, + "logps/rejected": -359.07354736328125, + "loss": 0.0157, + "rewards/chosen": 3.733384132385254, + "rewards/margins": 11.983783721923828, + "rewards/rejected": -8.250399589538574, + "step": 6002 + }, + { + "epoch": 0.5484696208314299, + "grad_norm": 2.65625, + "kl": 0.0, + "learning_rate": 4.255838066014732e-06, + "logits/chosen": 503772736.0, + "logits/rejected": 793784320.0, + "logps/chosen": -244.0447998046875, + "logps/rejected": -448.104248046875, + "loss": 0.0104, + "rewards/chosen": 3.5073676109313965, + "rewards/margins": 11.856351375579834, + "rewards/rejected": -8.348983764648438, + "step": 6003 + }, + { + "epoch": 0.5485609867519415, + "grad_norm": 3.453125, + "kl": 0.0, + "learning_rate": 4.254416310545865e-06, + "logits/chosen": 1387138304.0, + "logits/rejected": 619807936.0, + "logps/chosen": -253.76251220703125, + "logps/rejected": -674.6770629882812, + "loss": 0.023, + "rewards/chosen": 3.924004554748535, + "rewards/margins": 15.384865760803223, + "rewards/rejected": -11.460861206054688, + "step": 6004 + }, + { + "epoch": 0.5486523526724532, + "grad_norm": 3.203125, + "kl": 1.7618370056152344, + "learning_rate": 4.25299461672986e-06, + "logits/chosen": 560819029.3333334, + "logits/rejected": 727893888.0, + "logps/chosen": -463.6273600260417, + "logps/rejected": -768.4197998046875, + "loss": 0.0251, + "rewards/chosen": 3.6797666549682617, + "rewards/margins": 12.93741512298584, + "rewards/rejected": -9.257648468017578, + "step": 6005 + }, + { + "epoch": 0.5487437185929648, + "grad_norm": 2.4375, + "kl": 0.0, + "learning_rate": 4.251572984684281e-06, + "logits/chosen": 522199756.8, + "logits/rejected": 354309205.3333333, + "logps/chosen": -235.91845703125, + "logps/rejected": -481.7554117838542, + "loss": 0.0153, + "rewards/chosen": 3.9229354858398438, + "rewards/margins": 13.431906382242838, + "rewards/rejected": -9.508970896402994, + "step": 6006 + }, + { + "epoch": 0.5488350845134765, + "grad_norm": 3.03125, + "kl": 0.0, + "learning_rate": 4.250151414526679e-06, + "logits/chosen": 663467008.0, + "logits/rejected": 526060595.2, + "logps/chosen": -618.9105631510416, + "logps/rejected": -254.8589599609375, + "loss": 0.0121, + "rewards/chosen": 3.691819190979004, + "rewards/margins": 11.513179969787597, + "rewards/rejected": -7.821360778808594, + "step": 6007 + }, + { + "epoch": 0.5489264504339881, + "grad_norm": 4.90625, + "kl": 8.52457046508789, + "learning_rate": 4.248729906374608e-06, + "logits/chosen": 410208768.0, + "logps/chosen": -290.0411071777344, + "loss": 0.0399, + "rewards/chosen": 4.319212436676025, + "step": 6008 + }, + { + "epoch": 0.5490178163544998, + "grad_norm": 36.0, + "kl": 0.0, + "learning_rate": 4.247308460345613e-06, + "logits/chosen": 561782732.8, + "logits/rejected": 488702592.0, + "logps/chosen": -375.256201171875, + "logps/rejected": -410.8618570963542, + "loss": 0.0856, + "rewards/chosen": 3.9407981872558593, + "rewards/margins": 13.305502955118815, + "rewards/rejected": -9.364704767862955, + "step": 6009 + }, + { + "epoch": 0.5491091822750114, + "grad_norm": 1.7421875, + "kl": 0.0, + "learning_rate": 4.2458870765572354e-06, + "logits/chosen": 430345472.0, + "logits/rejected": 428201152.0, + "logps/chosen": -362.9757995605469, + "logps/rejected": -543.8247680664062, + "loss": 0.0092, + "rewards/chosen": 4.174671173095703, + "rewards/margins": 13.903646469116211, + "rewards/rejected": -9.728975296020508, + "step": 6010 + }, + { + "epoch": 0.5492005481955231, + "grad_norm": 3.234375, + "kl": 0.0, + "learning_rate": 4.244465755127008e-06, + "logits/chosen": 826472192.0, + "logits/rejected": 894833664.0, + "logps/chosen": -308.1608581542969, + "logps/rejected": -434.05633544921875, + "loss": 0.0144, + "rewards/chosen": 4.559115409851074, + "rewards/margins": 16.9660587310791, + "rewards/rejected": -12.406943321228027, + "step": 6011 + }, + { + "epoch": 0.5492919141160347, + "grad_norm": 1.3984375, + "kl": 0.0, + "learning_rate": 4.243044496172464e-06, + "logits/chosen": 441352448.0, + "logits/rejected": 453439786.6666667, + "logps/chosen": -353.337451171875, + "logps/rejected": -379.9579264322917, + "loss": 0.0098, + "rewards/chosen": 4.30396728515625, + "rewards/margins": 13.169976043701173, + "rewards/rejected": -8.866008758544922, + "step": 6012 + }, + { + "epoch": 0.5493832800365464, + "grad_norm": 16.375, + "kl": 0.0, + "learning_rate": 4.241623299811125e-06, + "logits/chosen": 328185280.0, + "logits/rejected": 751471104.0, + "logps/chosen": -172.19168090820312, + "logps/rejected": -507.0628662109375, + "loss": 0.0867, + "rewards/chosen": 3.8999671936035156, + "rewards/margins": 11.012912273406982, + "rewards/rejected": -7.112945079803467, + "step": 6013 + }, + { + "epoch": 0.549474645957058, + "grad_norm": 2.171875, + "kl": 0.0, + "learning_rate": 4.2402021661605145e-06, + "logits/chosen": 475444053.3333333, + "logits/rejected": 302386368.0, + "logps/chosen": -281.9683430989583, + "logps/rejected": -420.49273681640625, + "loss": 0.0201, + "rewards/chosen": 3.7274039586385093, + "rewards/margins": 15.250733693440756, + "rewards/rejected": -11.523329734802246, + "step": 6014 + }, + { + "epoch": 0.5495660118775697, + "grad_norm": 7.71875, + "kl": 0.0, + "learning_rate": 4.2387810953381446e-06, + "logits/chosen": 451271680.0, + "logits/rejected": 484474572.8, + "logps/chosen": -173.3985392252604, + "logps/rejected": -475.11591796875, + "loss": 0.0425, + "rewards/chosen": 3.0895652770996094, + "rewards/margins": 10.95414047241211, + "rewards/rejected": -7.8645751953125, + "step": 6015 + }, + { + "epoch": 0.5496573777980813, + "grad_norm": 0.890625, + "kl": 0.0, + "learning_rate": 4.237360087461525e-06, + "logits/chosen": 853849941.3333334, + "logits/rejected": 411413555.2, + "logps/chosen": -296.9915771484375, + "logps/rejected": -237.424560546875, + "loss": 0.0051, + "rewards/chosen": 4.517781575520833, + "rewards/margins": 11.58760274251302, + "rewards/rejected": -7.069821166992187, + "step": 6016 + }, + { + "epoch": 0.549748743718593, + "grad_norm": 2.234375, + "kl": 0.0, + "learning_rate": 4.235939142648161e-06, + "logits/chosen": 372956288.0, + "logits/rejected": 633275200.0, + "logps/chosen": -150.86398315429688, + "logps/rejected": -442.9893798828125, + "loss": 0.0287, + "rewards/chosen": 3.299598217010498, + "rewards/margins": 13.077165126800537, + "rewards/rejected": -9.777566909790039, + "step": 6017 + }, + { + "epoch": 0.5498401096391046, + "grad_norm": 1.5078125, + "kl": 0.0, + "learning_rate": 4.234518261015549e-06, + "logits/chosen": 472530624.0, + "logits/rejected": 484092352.0, + "logps/chosen": -248.03823852539062, + "logps/rejected": -390.95672607421875, + "loss": 0.0083, + "rewards/chosen": 4.6426897048950195, + "rewards/margins": 14.594622611999512, + "rewards/rejected": -9.951932907104492, + "step": 6018 + }, + { + "epoch": 0.5499314755596163, + "grad_norm": 22.25, + "kl": 0.0, + "learning_rate": 4.2330974426811864e-06, + "logits/chosen": 479118880.0, + "logits/rejected": 486320960.0, + "logps/chosen": -259.0889892578125, + "logps/rejected": -375.11541748046875, + "loss": 0.0368, + "rewards/chosen": 4.313937664031982, + "rewards/margins": 11.574995517730713, + "rewards/rejected": -7.2610578536987305, + "step": 6019 + }, + { + "epoch": 0.5500228414801279, + "grad_norm": 3.0625, + "kl": 0.0, + "learning_rate": 4.231676687762558e-06, + "logits/chosen": 508831360.0, + "logits/rejected": 340765081.6, + "logps/chosen": -412.377685546875, + "logps/rejected": -308.4296875, + "loss": 0.02, + "rewards/chosen": 3.64276123046875, + "rewards/margins": 10.610877990722656, + "rewards/rejected": -6.968116760253906, + "step": 6020 + }, + { + "epoch": 0.5501142074006395, + "grad_norm": 2.515625, + "kl": 0.0, + "learning_rate": 4.23025599637715e-06, + "logits/chosen": 525012172.8, + "logits/rejected": 379390805.3333333, + "logps/chosen": -308.8727294921875, + "logps/rejected": -470.953125, + "loss": 0.0198, + "rewards/chosen": 3.532665252685547, + "rewards/margins": 13.751791636149088, + "rewards/rejected": -10.219126383463541, + "step": 6021 + }, + { + "epoch": 0.5502055733211512, + "grad_norm": 4.15625, + "kl": 0.0, + "learning_rate": 4.2288353686424385e-06, + "logits/chosen": 401785301.3333333, + "logits/rejected": 728497216.0, + "logps/chosen": -230.5870361328125, + "logps/rejected": -584.4064331054688, + "loss": 0.0401, + "rewards/chosen": 3.1582132975260415, + "rewards/margins": 12.247535387674967, + "rewards/rejected": -9.089322090148926, + "step": 6022 + }, + { + "epoch": 0.5502969392416629, + "grad_norm": 3.140625, + "kl": 0.0, + "learning_rate": 4.227414804675898e-06, + "logits/chosen": 463529856.0, + "logits/rejected": 566831820.8, + "logps/chosen": -233.37325032552084, + "logps/rejected": -556.910546875, + "loss": 0.018, + "rewards/chosen": 3.1522305806477866, + "rewards/margins": 13.587264760335287, + "rewards/rejected": -10.4350341796875, + "step": 6023 + }, + { + "epoch": 0.5503883051621745, + "grad_norm": 1.3671875, + "kl": 0.0, + "learning_rate": 4.225994304594994e-06, + "logits/chosen": 360262442.6666667, + "logits/rejected": 511654092.8, + "logps/chosen": -236.55912272135416, + "logps/rejected": -598.6166015625, + "loss": 0.009, + "rewards/chosen": 3.9977747599283853, + "rewards/margins": 13.306405893961587, + "rewards/rejected": -9.308631134033202, + "step": 6024 + }, + { + "epoch": 0.5504796710826861, + "grad_norm": 2.859375, + "kl": 0.0, + "learning_rate": 4.22457386851719e-06, + "logits/chosen": 471931289.6, + "logits/rejected": 254338240.0, + "logps/chosen": -330.151611328125, + "logps/rejected": -547.66357421875, + "loss": 0.0166, + "rewards/chosen": 4.130172348022461, + "rewards/margins": 17.101703262329103, + "rewards/rejected": -12.97153091430664, + "step": 6025 + }, + { + "epoch": 0.5505710370031978, + "grad_norm": 2.9375, + "kl": 0.0, + "learning_rate": 4.223153496559943e-06, + "logits/chosen": 355019434.6666667, + "logits/rejected": 529691955.2, + "logps/chosen": -532.94873046875, + "logps/rejected": -483.15478515625, + "loss": 0.0121, + "rewards/chosen": 3.5175882975260415, + "rewards/margins": 12.340753428141275, + "rewards/rejected": -8.823165130615234, + "step": 6026 + }, + { + "epoch": 0.5506624029237095, + "grad_norm": 5.90625, + "kl": 0.0, + "learning_rate": 4.221733188840703e-06, + "logits/chosen": 672616960.0, + "logits/rejected": 358788192.0, + "logps/chosen": -237.37208557128906, + "logps/rejected": -552.5997314453125, + "loss": 0.0565, + "rewards/chosen": 3.043247699737549, + "rewards/margins": 14.003339290618896, + "rewards/rejected": -10.960091590881348, + "step": 6027 + }, + { + "epoch": 0.5507537688442211, + "grad_norm": 3.1875, + "kl": 0.0, + "learning_rate": 4.220312945476919e-06, + "logits/chosen": 576251392.0, + "logps/chosen": -307.0202331542969, + "loss": 0.0364, + "rewards/chosen": 3.4731831550598145, + "step": 6028 + }, + { + "epoch": 0.5508451347647327, + "grad_norm": 2.296875, + "kl": 0.0, + "learning_rate": 4.21889276658603e-06, + "logits/chosen": 428584704.0, + "logits/rejected": 532682547.2, + "logps/chosen": -267.4833577473958, + "logps/rejected": -555.94638671875, + "loss": 0.0134, + "rewards/chosen": 3.511167526245117, + "rewards/margins": 13.86254005432129, + "rewards/rejected": -10.351372528076173, + "step": 6029 + }, + { + "epoch": 0.5509365006852444, + "grad_norm": 16.375, + "kl": 0.0, + "learning_rate": 4.217472652285472e-06, + "logits/chosen": 951754854.4, + "logits/rejected": 581316864.0, + "logps/chosen": -391.6308349609375, + "logps/rejected": -369.8894856770833, + "loss": 0.0411, + "rewards/chosen": 3.8882877349853517, + "rewards/margins": 12.27310167948405, + "rewards/rejected": -8.384813944498697, + "step": 6030 + }, + { + "epoch": 0.5510278666057561, + "grad_norm": 1.296875, + "kl": 0.0, + "learning_rate": 4.216052602692675e-06, + "logits/chosen": 754643285.3333334, + "logits/rejected": 604683571.2, + "logps/chosen": -596.0262044270834, + "logps/rejected": -580.520703125, + "loss": 0.006, + "rewards/chosen": 4.4259688059488935, + "rewards/margins": 13.173581568400067, + "rewards/rejected": -8.747612762451173, + "step": 6031 + }, + { + "epoch": 0.5511192325262677, + "grad_norm": 4.25, + "kl": 0.0, + "learning_rate": 4.2146326179250645e-06, + "logits/chosen": 515696844.8, + "logits/rejected": 714560896.0, + "logps/chosen": -302.7919921875, + "logps/rejected": -689.3877766927084, + "loss": 0.0301, + "rewards/chosen": 3.2521041870117187, + "rewards/margins": 13.873734537760416, + "rewards/rejected": -10.621630350748697, + "step": 6032 + }, + { + "epoch": 0.5512105984467793, + "grad_norm": 4.8125, + "kl": 0.0, + "learning_rate": 4.213212698100062e-06, + "logits/chosen": 366960320.0, + "logits/rejected": 671851136.0, + "logps/chosen": -304.4796142578125, + "logps/rejected": -380.45574951171875, + "loss": 0.1323, + "rewards/chosen": 2.2541518211364746, + "rewards/margins": 12.465363025665283, + "rewards/rejected": -10.211211204528809, + "step": 6033 + }, + { + "epoch": 0.551301964367291, + "grad_norm": 2.875, + "kl": 0.0, + "learning_rate": 4.211792843335078e-06, + "logits/chosen": 415979690.6666667, + "logits/rejected": 635241216.0, + "logps/chosen": -252.10074869791666, + "logps/rejected": -308.2679931640625, + "loss": 0.0123, + "rewards/chosen": 4.112328211466472, + "rewards/margins": 11.407240931193034, + "rewards/rejected": -7.294912719726563, + "step": 6034 + }, + { + "epoch": 0.5513933302878027, + "grad_norm": 4.5625, + "kl": 0.0, + "learning_rate": 4.210373053747522e-06, + "logits/chosen": 493229056.0, + "logits/rejected": 238100736.0, + "logps/chosen": -323.852685546875, + "logps/rejected": -255.5625, + "loss": 0.0244, + "rewards/chosen": 3.7746341705322264, + "rewards/margins": 10.067538070678712, + "rewards/rejected": -6.292903900146484, + "step": 6035 + }, + { + "epoch": 0.5514846962083143, + "grad_norm": 1.34375, + "kl": 0.0, + "learning_rate": 4.208953329454799e-06, + "logits/chosen": 771460352.0, + "logits/rejected": 730040115.2, + "logps/chosen": -240.80232747395834, + "logps/rejected": -666.59501953125, + "loss": 0.0077, + "rewards/chosen": 3.9268805185953775, + "rewards/margins": 16.024892298380536, + "rewards/rejected": -12.098011779785157, + "step": 6036 + }, + { + "epoch": 0.5515760621288259, + "grad_norm": 1.53125, + "kl": 0.0, + "learning_rate": 4.2075336705743065e-06, + "logits/chosen": 602150229.3333334, + "logits/rejected": 193681881.6, + "logps/chosen": -453.4582112630208, + "logps/rejected": -391.2147216796875, + "loss": 0.0078, + "rewards/chosen": 3.8857138951619468, + "rewards/margins": 14.597186215718589, + "rewards/rejected": -10.711472320556641, + "step": 6037 + }, + { + "epoch": 0.5516674280493375, + "grad_norm": 11.1875, + "kl": 0.0, + "learning_rate": 4.2061140772234375e-06, + "logits/chosen": 485730816.0, + "logits/rejected": 587585706.6666666, + "logps/chosen": -284.637060546875, + "logps/rejected": -748.6754557291666, + "loss": 0.0444, + "rewards/chosen": 3.4035923004150392, + "rewards/margins": 14.170682144165038, + "rewards/rejected": -10.76708984375, + "step": 6038 + }, + { + "epoch": 0.5517587939698493, + "grad_norm": 4.1875, + "kl": 0.0, + "learning_rate": 4.204694549519578e-06, + "logits/chosen": 263776992.0, + "logits/rejected": 1007815552.0, + "logps/chosen": -176.72996520996094, + "logps/rejected": -726.5292358398438, + "loss": 0.0169, + "rewards/chosen": 3.8815386295318604, + "rewards/margins": 14.75914978981018, + "rewards/rejected": -10.87761116027832, + "step": 6039 + }, + { + "epoch": 0.5518501598903609, + "grad_norm": 1.59375, + "kl": 0.0, + "learning_rate": 4.203275087580111e-06, + "logits/chosen": 567085525.3333334, + "logits/rejected": 1124269952.0, + "logps/chosen": -260.61277262369794, + "logps/rejected": -530.7778930664062, + "loss": 0.1354, + "rewards/chosen": 2.643623193105062, + "rewards/margins": 12.23006041844686, + "rewards/rejected": -9.586437225341797, + "step": 6040 + }, + { + "epoch": 0.5519415258108725, + "grad_norm": 2.609375, + "kl": 0.0, + "learning_rate": 4.201855691522412e-06, + "logits/chosen": 661033779.2, + "logits/rejected": 1131077461.3333333, + "logps/chosen": -446.399951171875, + "logps/rejected": -795.2279459635416, + "loss": 0.0135, + "rewards/chosen": 4.099480819702149, + "rewards/margins": 11.330404790242513, + "rewards/rejected": -7.230923970540364, + "step": 6041 + }, + { + "epoch": 0.5520328917313841, + "grad_norm": 8.8125, + "kl": 0.0, + "learning_rate": 4.200436361463853e-06, + "logits/chosen": 448549427.2, + "logits/rejected": 407874133.3333333, + "logps/chosen": -254.858447265625, + "logps/rejected": -409.1441243489583, + "loss": 0.0295, + "rewards/chosen": 3.746966552734375, + "rewards/margins": 12.94187660217285, + "rewards/rejected": -9.194910049438477, + "step": 6042 + }, + { + "epoch": 0.5521242576518959, + "grad_norm": 1.734375, + "kl": 0.0, + "learning_rate": 4.199017097521797e-06, + "logits/chosen": 716197068.8, + "logits/rejected": 329094378.6666667, + "logps/chosen": -336.0609375, + "logps/rejected": -428.4668782552083, + "loss": 0.0114, + "rewards/chosen": 4.207415390014648, + "rewards/margins": 12.655321884155274, + "rewards/rejected": -8.447906494140625, + "step": 6043 + }, + { + "epoch": 0.5522156235724075, + "grad_norm": 2.125, + "kl": 0.0, + "learning_rate": 4.1975978998136086e-06, + "logits/chosen": 300791168.0, + "logits/rejected": 537006464.0, + "logps/chosen": -279.5901184082031, + "logps/rejected": -426.64373779296875, + "loss": 0.01, + "rewards/chosen": 4.557299613952637, + "rewards/margins": 12.513392448425293, + "rewards/rejected": -7.956092834472656, + "step": 6044 + }, + { + "epoch": 0.5523069894929191, + "grad_norm": 3.140625, + "kl": 0.0, + "learning_rate": 4.196178768456639e-06, + "logits/chosen": 620177536.0, + "logits/rejected": 555325760.0, + "logps/chosen": -305.7934163411458, + "logps/rejected": -570.0223388671875, + "loss": 0.0268, + "rewards/chosen": 3.843866984049479, + "rewards/margins": 13.46761957804362, + "rewards/rejected": -9.62375259399414, + "step": 6045 + }, + { + "epoch": 0.5523983554134307, + "grad_norm": 3.09375, + "kl": 0.0, + "learning_rate": 4.1947597035682355e-06, + "logits/chosen": 573664704.0, + "logits/rejected": 343501952.0, + "logps/chosen": -299.3534851074219, + "logps/rejected": -361.03424072265625, + "loss": 0.0239, + "rewards/chosen": 3.157451629638672, + "rewards/margins": 10.263011455535889, + "rewards/rejected": -7.105559825897217, + "step": 6046 + }, + { + "epoch": 0.5524897213339425, + "grad_norm": 11.6875, + "kl": 0.0, + "learning_rate": 4.193340705265746e-06, + "logits/chosen": 1121489152.0, + "logits/rejected": 738890112.0, + "logps/chosen": -457.75360107421875, + "logps/rejected": -369.3412780761719, + "loss": 0.0207, + "rewards/chosen": 3.6575300693511963, + "rewards/margins": 11.92801308631897, + "rewards/rejected": -8.270483016967773, + "step": 6047 + }, + { + "epoch": 0.5525810872544541, + "grad_norm": 6.84375, + "kl": 0.0, + "learning_rate": 4.191921773666504e-06, + "logits/chosen": 728558643.2, + "logits/rejected": 429467946.6666667, + "logps/chosen": -371.30078125, + "logps/rejected": -551.0782470703125, + "loss": 0.0477, + "rewards/chosen": 3.1902032852172852, + "rewards/margins": 11.188575681050619, + "rewards/rejected": -7.998372395833333, + "step": 6048 + }, + { + "epoch": 0.5526724531749657, + "grad_norm": 3.734375, + "kl": 0.0, + "learning_rate": 4.190502908887847e-06, + "logits/chosen": 366723648.0, + "logits/rejected": 600572544.0, + "logps/chosen": -265.9818420410156, + "logps/rejected": -575.8699951171875, + "loss": 0.0219, + "rewards/chosen": 3.4003958702087402, + "rewards/margins": 12.120331287384033, + "rewards/rejected": -8.719935417175293, + "step": 6049 + }, + { + "epoch": 0.5527638190954773, + "grad_norm": 4.09375, + "kl": 0.0, + "learning_rate": 4.189084111047097e-06, + "logits/chosen": 599810304.0, + "logits/rejected": 753500330.6666666, + "logps/chosen": -271.0534912109375, + "logps/rejected": -633.7938639322916, + "loss": 0.0271, + "rewards/chosen": 3.489306640625, + "rewards/margins": 12.877347183227538, + "rewards/rejected": -9.388040542602539, + "step": 6050 + }, + { + "epoch": 0.5528551850159891, + "grad_norm": 2.421875, + "kl": 2.2237319946289062, + "learning_rate": 4.187665380261579e-06, + "logits/chosen": 466278954.6666667, + "logits/rejected": 548779200.0, + "logps/chosen": -416.5543619791667, + "logps/rejected": -613.2427978515625, + "loss": 0.0188, + "rewards/chosen": 4.567966461181641, + "rewards/margins": 12.152982711791992, + "rewards/rejected": -7.585016250610352, + "step": 6051 + }, + { + "epoch": 0.5529465509365007, + "grad_norm": 1.515625, + "kl": 0.0, + "learning_rate": 4.1862467166486065e-06, + "logits/chosen": 1155554918.4, + "logits/rejected": 823041109.3333334, + "logps/chosen": -402.457861328125, + "logps/rejected": -290.00795491536456, + "loss": 0.0073, + "rewards/chosen": 4.9514610290527346, + "rewards/margins": 14.300150426228843, + "rewards/rejected": -9.348689397176107, + "step": 6052 + }, + { + "epoch": 0.5530379168570123, + "grad_norm": 3.734375, + "kl": 0.0, + "learning_rate": 4.184828120325492e-06, + "logits/chosen": 324835584.0, + "logits/rejected": 382621568.0, + "logps/chosen": -265.8482666015625, + "logps/rejected": -467.35516357421875, + "loss": 0.0263, + "rewards/chosen": 3.3778958320617676, + "rewards/margins": 13.232424259185791, + "rewards/rejected": -9.854528427124023, + "step": 6053 + }, + { + "epoch": 0.5531292827775239, + "grad_norm": 11.5625, + "kl": 0.0, + "learning_rate": 4.183409591409539e-06, + "logits/chosen": 301686988.8, + "logits/rejected": 333448874.6666667, + "logps/chosen": -241.9169677734375, + "logps/rejected": -324.689697265625, + "loss": 0.1318, + "rewards/chosen": 3.003057861328125, + "rewards/margins": 11.174360656738282, + "rewards/rejected": -8.171302795410156, + "step": 6054 + }, + { + "epoch": 0.5532206486980357, + "grad_norm": 2.8125, + "kl": 0.0, + "learning_rate": 4.1819911300180475e-06, + "logits/chosen": 321764320.0, + "logits/rejected": 385339690.6666667, + "logps/chosen": -328.0895080566406, + "logps/rejected": -502.7329915364583, + "loss": 0.0129, + "rewards/chosen": 2.997023105621338, + "rewards/margins": 13.267069339752197, + "rewards/rejected": -10.27004623413086, + "step": 6055 + }, + { + "epoch": 0.5533120146185473, + "grad_norm": 1.8671875, + "kl": 0.0, + "learning_rate": 4.180572736268309e-06, + "logits/chosen": 781142848.0, + "logits/rejected": 516284160.0, + "logps/chosen": -291.2999267578125, + "logps/rejected": -451.2203776041667, + "loss": 0.0079, + "rewards/chosen": 3.4773406982421875, + "rewards/margins": 12.863999684651693, + "rewards/rejected": -9.386658986409506, + "step": 6056 + }, + { + "epoch": 0.5534033805390589, + "grad_norm": 3.046875, + "kl": 0.0, + "learning_rate": 4.179154410277614e-06, + "logits/chosen": 584305280.0, + "logits/rejected": 497867136.0, + "logps/chosen": -376.13934326171875, + "logps/rejected": -715.8182983398438, + "loss": 0.0162, + "rewards/chosen": 3.5869834423065186, + "rewards/margins": 13.557535886764526, + "rewards/rejected": -9.970552444458008, + "step": 6057 + }, + { + "epoch": 0.5534947464595705, + "grad_norm": 2.703125, + "kl": 0.0, + "learning_rate": 4.177736152163245e-06, + "logits/chosen": 716689237.3333334, + "logits/rejected": 500055859.2, + "logps/chosen": -353.5664876302083, + "logps/rejected": -387.5875732421875, + "loss": 0.0156, + "rewards/chosen": 3.258639653523763, + "rewards/margins": 11.922109349568686, + "rewards/rejected": -8.663469696044922, + "step": 6058 + }, + { + "epoch": 0.5535861123800823, + "grad_norm": 42.75, + "kl": 0.0, + "learning_rate": 4.176317962042476e-06, + "logits/chosen": 400777941.3333333, + "logits/rejected": 691531264.0, + "logps/chosen": -316.7499186197917, + "logps/rejected": -555.97099609375, + "loss": 0.0626, + "rewards/chosen": 5.03231684366862, + "rewards/margins": 13.123158518473307, + "rewards/rejected": -8.090841674804688, + "step": 6059 + }, + { + "epoch": 0.5536774783005939, + "grad_norm": 1.1171875, + "kl": 0.0, + "learning_rate": 4.174899840032583e-06, + "logits/chosen": 519625952.0, + "logits/rejected": 748888128.0, + "logps/chosen": -169.94253540039062, + "logps/rejected": -594.7257080078125, + "loss": 0.0074, + "rewards/chosen": 4.702882289886475, + "rewards/margins": 14.936252117156982, + "rewards/rejected": -10.233369827270508, + "step": 6060 + }, + { + "epoch": 0.5537688442211055, + "grad_norm": 14.125, + "kl": 0.0, + "learning_rate": 4.173481786250826e-06, + "logits/chosen": 476402602.6666667, + "logits/rejected": 711204352.0, + "logps/chosen": -306.5548502604167, + "logps/rejected": -426.175927734375, + "loss": 0.084, + "rewards/chosen": 4.097478866577148, + "rewards/margins": 11.355173110961914, + "rewards/rejected": -7.257694244384766, + "step": 6061 + }, + { + "epoch": 0.5538602101416171, + "grad_norm": 4.375, + "kl": 0.0, + "learning_rate": 4.1720638008144665e-06, + "logits/chosen": 437884928.0, + "logits/rejected": 414684416.0, + "logps/chosen": -284.28948974609375, + "logps/rejected": -450.3609619140625, + "loss": 0.0329, + "rewards/chosen": 2.8147385120391846, + "rewards/margins": 11.07687497138977, + "rewards/rejected": -8.262136459350586, + "step": 6062 + }, + { + "epoch": 0.5539515760621289, + "grad_norm": 3.671875, + "kl": 0.0, + "learning_rate": 4.1706458838407616e-06, + "logits/chosen": 759373619.2, + "logits/rejected": 223445333.33333334, + "logps/chosen": -417.27763671875, + "logps/rejected": -301.5011393229167, + "loss": 0.0235, + "rewards/chosen": 3.714877700805664, + "rewards/margins": 11.335874811808269, + "rewards/rejected": -7.6209971110026045, + "step": 6063 + }, + { + "epoch": 0.5540429419826405, + "grad_norm": 1.6640625, + "kl": 0.0, + "learning_rate": 4.169228035446956e-06, + "logits/chosen": 612002773.3333334, + "logits/rejected": 506145894.4, + "logps/chosen": -424.350341796875, + "logps/rejected": -355.5441650390625, + "loss": 0.0098, + "rewards/chosen": 3.691273053487142, + "rewards/margins": 11.602296002705891, + "rewards/rejected": -7.91102294921875, + "step": 6064 + }, + { + "epoch": 0.5541343079031521, + "grad_norm": 1.8125, + "kl": 0.0, + "learning_rate": 4.1678102557502965e-06, + "logits/chosen": 584724544.0, + "logits/rejected": 527456085.3333333, + "logps/chosen": -154.4522705078125, + "logps/rejected": -422.3111165364583, + "loss": 0.0112, + "rewards/chosen": 3.153959274291992, + "rewards/margins": 11.278774897257486, + "rewards/rejected": -8.124815622965494, + "step": 6065 + }, + { + "epoch": 0.5542256738236637, + "grad_norm": 3.875, + "kl": 0.0, + "learning_rate": 4.166392544868018e-06, + "logits/chosen": 430368204.8, + "logits/rejected": 465923797.3333333, + "logps/chosen": -275.668017578125, + "logps/rejected": -605.2466634114584, + "loss": 0.0248, + "rewards/chosen": 3.5902755737304686, + "rewards/margins": 12.469698715209962, + "rewards/rejected": -8.879423141479492, + "step": 6066 + }, + { + "epoch": 0.5543170397441755, + "grad_norm": 0.275390625, + "kl": 0.0, + "learning_rate": 4.164974902917352e-06, + "logits/chosen": 341234688.0, + "logits/rejected": 572437284.5714285, + "logps/chosen": -267.2705078125, + "logps/rejected": -361.05496651785717, + "loss": 0.0012, + "rewards/chosen": 5.244250774383545, + "rewards/margins": 13.524509498051234, + "rewards/rejected": -8.28025872366769, + "step": 6067 + }, + { + "epoch": 0.5544084056646871, + "grad_norm": 6.59375, + "kl": 0.0, + "learning_rate": 4.1635573300155255e-06, + "logits/chosen": 1010139221.3333334, + "logits/rejected": 607325568.0, + "logps/chosen": -296.95977783203125, + "logps/rejected": -575.4141845703125, + "loss": 0.0378, + "rewards/chosen": 3.534888585408529, + "rewards/margins": 14.27559502919515, + "rewards/rejected": -10.740706443786621, + "step": 6068 + }, + { + "epoch": 0.5544997715851987, + "grad_norm": 10.4375, + "kl": 0.0, + "learning_rate": 4.162139826279756e-06, + "logits/chosen": 1514449578.6666667, + "logits/rejected": 731268761.6, + "logps/chosen": -423.112060546875, + "logps/rejected": -617.16005859375, + "loss": 0.0225, + "rewards/chosen": 3.8302332560221353, + "rewards/margins": 13.150176493326823, + "rewards/rejected": -9.319943237304688, + "step": 6069 + }, + { + "epoch": 0.5545911375057103, + "grad_norm": 1.84375, + "kl": 0.0, + "learning_rate": 4.160722391827262e-06, + "logits/chosen": 649715712.0, + "logits/rejected": 710259840.0, + "logps/chosen": -235.38818359375, + "logps/rejected": -534.1715087890625, + "loss": 0.0117, + "rewards/chosen": 3.995340585708618, + "rewards/margins": 12.487805604934692, + "rewards/rejected": -8.492465019226074, + "step": 6070 + }, + { + "epoch": 0.554682503426222, + "grad_norm": 3.46875, + "kl": 0.0, + "learning_rate": 4.159305026775249e-06, + "logits/chosen": 659260842.6666666, + "logits/rejected": 560155443.2, + "logps/chosen": -357.7841389973958, + "logps/rejected": -540.720556640625, + "loss": 0.0189, + "rewards/chosen": 3.027663230895996, + "rewards/margins": 13.462589836120605, + "rewards/rejected": -10.434926605224609, + "step": 6071 + }, + { + "epoch": 0.5547738693467337, + "grad_norm": 1.734375, + "kl": 0.0, + "learning_rate": 4.1578877312409225e-06, + "logits/chosen": 751572650.6666666, + "logits/rejected": 596141107.2, + "logps/chosen": -316.7236735026042, + "logps/rejected": -570.13740234375, + "loss": 0.0102, + "rewards/chosen": 4.1670481363932295, + "rewards/margins": 15.42876688639323, + "rewards/rejected": -11.26171875, + "step": 6072 + }, + { + "epoch": 0.5548652352672453, + "grad_norm": 1.7890625, + "kl": 0.0, + "learning_rate": 4.156470505341477e-06, + "logits/chosen": 535208096.0, + "logits/rejected": 389673472.0, + "logps/chosen": -398.8943786621094, + "logps/rejected": -493.6764322916667, + "loss": 0.0071, + "rewards/chosen": 4.12832498550415, + "rewards/margins": 14.940290927886963, + "rewards/rejected": -10.811965942382812, + "step": 6073 + }, + { + "epoch": 0.5549566011877569, + "grad_norm": 4.78125, + "kl": 0.0, + "learning_rate": 4.1550533491941055e-06, + "logits/chosen": 344733312.0, + "logits/rejected": 174264608.0, + "logps/chosen": -351.8614196777344, + "logps/rejected": -380.90875244140625, + "loss": 0.0276, + "rewards/chosen": 3.407501459121704, + "rewards/margins": 14.746196985244751, + "rewards/rejected": -11.338695526123047, + "step": 6074 + }, + { + "epoch": 0.5550479671082686, + "grad_norm": 2.0, + "kl": 0.0, + "learning_rate": 4.153636262915993e-06, + "logits/chosen": 511210410.6666667, + "logits/rejected": 390713344.0, + "logps/chosen": -233.0515340169271, + "logps/rejected": -259.80767822265625, + "loss": 0.0167, + "rewards/chosen": 3.9425315856933594, + "rewards/margins": 10.854338645935059, + "rewards/rejected": -6.911807060241699, + "step": 6075 + }, + { + "epoch": 0.5551393330287803, + "grad_norm": 1.984375, + "kl": 0.0, + "learning_rate": 4.15221924662432e-06, + "logits/chosen": 275523424.0, + "logits/rejected": 482038186.6666667, + "logps/chosen": -239.21243286132812, + "logps/rejected": -587.4706217447916, + "loss": 0.0112, + "rewards/chosen": 3.245732307434082, + "rewards/margins": 12.752004941304525, + "rewards/rejected": -9.506272633870443, + "step": 6076 + }, + { + "epoch": 0.5552306989492919, + "grad_norm": 2.265625, + "kl": 0.0, + "learning_rate": 4.150802300436262e-06, + "logits/chosen": 413979648.0, + "logits/rejected": 526523562.6666667, + "logps/chosen": -245.95751953125, + "logps/rejected": -454.998046875, + "loss": 0.0158, + "rewards/chosen": 4.056887817382813, + "rewards/margins": 13.319145711263022, + "rewards/rejected": -9.262257893880209, + "step": 6077 + }, + { + "epoch": 0.5553220648698035, + "grad_norm": 4.25, + "kl": 0.0, + "learning_rate": 4.1493854244689845e-06, + "logits/chosen": 394729301.3333333, + "logits/rejected": 353748377.6, + "logps/chosen": -302.012939453125, + "logps/rejected": -407.953955078125, + "loss": 0.0214, + "rewards/chosen": 3.2300135294596353, + "rewards/margins": 12.98301035563151, + "rewards/rejected": -9.752996826171875, + "step": 6078 + }, + { + "epoch": 0.5554134307903152, + "grad_norm": 1.46875, + "kl": 0.0, + "learning_rate": 4.147968618839651e-06, + "logits/chosen": 461461376.0, + "logits/rejected": 628668672.0, + "logps/chosen": -346.48187255859375, + "logps/rejected": -406.74615478515625, + "loss": 0.009, + "rewards/chosen": 4.191170692443848, + "rewards/margins": 13.61446475982666, + "rewards/rejected": -9.423294067382812, + "step": 6079 + }, + { + "epoch": 0.5555047967108269, + "grad_norm": 5.125, + "kl": 0.0, + "learning_rate": 4.146551883665418e-06, + "logits/chosen": 711504896.0, + "logits/rejected": 846139264.0, + "logps/chosen": -246.88882446289062, + "logps/rejected": -722.5025634765625, + "loss": 0.0261, + "rewards/chosen": 3.495333194732666, + "rewards/margins": 16.749087810516357, + "rewards/rejected": -13.253754615783691, + "step": 6080 + }, + { + "epoch": 0.5555961626313385, + "grad_norm": 3.796875, + "kl": 0.0, + "learning_rate": 4.145135219063439e-06, + "logits/chosen": 751808682.6666666, + "logits/rejected": 807920435.2, + "logps/chosen": -153.21366373697916, + "logps/rejected": -584.36064453125, + "loss": 0.0197, + "rewards/chosen": 3.81197198232015, + "rewards/margins": 14.029775937398275, + "rewards/rejected": -10.217803955078125, + "step": 6081 + }, + { + "epoch": 0.5556875285518501, + "grad_norm": 24.875, + "kl": 0.0, + "learning_rate": 4.143718625150854e-06, + "logits/chosen": 440768128.0, + "logits/rejected": 506838016.0, + "logps/chosen": -203.75653076171875, + "logps/rejected": -556.590625, + "loss": 0.0504, + "rewards/chosen": 2.353549321492513, + "rewards/margins": 10.836422856648763, + "rewards/rejected": -8.48287353515625, + "step": 6082 + }, + { + "epoch": 0.5557788944723618, + "grad_norm": 2.234375, + "kl": 0.0, + "learning_rate": 4.1423021020448075e-06, + "logits/chosen": 594023168.0, + "logits/rejected": 1100670156.8, + "logps/chosen": -310.6351725260417, + "logps/rejected": -494.42822265625, + "loss": 0.0099, + "rewards/chosen": 4.084510485331218, + "rewards/margins": 14.723418871561687, + "rewards/rejected": -10.638908386230469, + "step": 6083 + }, + { + "epoch": 0.5558702603928735, + "grad_norm": 2.09375, + "kl": 0.0, + "learning_rate": 4.140885649862429e-06, + "logits/chosen": 512871744.0, + "logits/rejected": 501007808.0, + "logps/chosen": -303.5543212890625, + "logps/rejected": -571.132568359375, + "loss": 0.0086, + "rewards/chosen": 4.7141008377075195, + "rewards/margins": 14.7946195602417, + "rewards/rejected": -10.08051872253418, + "step": 6084 + }, + { + "epoch": 0.5559616263133851, + "grad_norm": 19.75, + "kl": 0.0, + "learning_rate": 4.139469268720849e-06, + "logits/chosen": 543403136.0, + "logits/rejected": 464632192.0, + "logps/chosen": -227.37306213378906, + "logps/rejected": -637.255859375, + "loss": 0.0398, + "rewards/chosen": 4.052014350891113, + "rewards/margins": 13.358437538146973, + "rewards/rejected": -9.30642318725586, + "step": 6085 + }, + { + "epoch": 0.5560529922338967, + "grad_norm": 3.921875, + "kl": 0.0, + "learning_rate": 4.138052958737187e-06, + "logits/chosen": 618202931.2, + "logits/rejected": 362794965.3333333, + "logps/chosen": -423.10576171875, + "logps/rejected": -467.7728678385417, + "loss": 0.0256, + "rewards/chosen": 3.2647594451904296, + "rewards/margins": 11.943301264444987, + "rewards/rejected": -8.678541819254557, + "step": 6086 + }, + { + "epoch": 0.5561443581544084, + "grad_norm": 1.765625, + "kl": 0.0, + "learning_rate": 4.136636720028559e-06, + "logits/chosen": 560142464.0, + "logits/rejected": 410900992.0, + "logps/chosen": -391.820556640625, + "logps/rejected": -517.1959228515625, + "loss": 0.0112, + "rewards/chosen": 4.460978825887044, + "rewards/margins": 14.180517514546711, + "rewards/rejected": -9.719538688659668, + "step": 6087 + }, + { + "epoch": 0.5562357240749201, + "grad_norm": 0.15625, + "kl": 0.0, + "learning_rate": 4.135220552712074e-06, + "logits/chosen": 295572224.0, + "logits/rejected": 658973403.4285715, + "logps/chosen": -246.104736328125, + "logps/rejected": -589.1990094866071, + "loss": 0.0007, + "rewards/chosen": 6.556603908538818, + "rewards/margins": 16.885370867592947, + "rewards/rejected": -10.32876695905413, + "step": 6088 + }, + { + "epoch": 0.5563270899954317, + "grad_norm": 1.296875, + "kl": 0.0, + "learning_rate": 4.133804456904839e-06, + "logits/chosen": 731426252.8, + "logits/rejected": 592168576.0, + "logps/chosen": -277.65693359375, + "logps/rejected": -608.1023763020834, + "loss": 0.0134, + "rewards/chosen": 4.358377456665039, + "rewards/margins": 15.244616826375326, + "rewards/rejected": -10.886239369710287, + "step": 6089 + }, + { + "epoch": 0.5564184559159433, + "grad_norm": 1.6953125, + "kl": 0.0, + "learning_rate": 4.132388432723947e-06, + "logits/chosen": 705341397.3333334, + "logits/rejected": 378067302.4, + "logps/chosen": -335.1753743489583, + "logps/rejected": -413.494482421875, + "loss": 0.0097, + "rewards/chosen": 3.684530258178711, + "rewards/margins": 11.758849716186523, + "rewards/rejected": -8.074319458007812, + "step": 6090 + }, + { + "epoch": 0.556509821836455, + "grad_norm": 8.3125, + "kl": 0.0, + "learning_rate": 4.130972480286495e-06, + "logits/chosen": 573776298.6666666, + "logits/rejected": 544094003.2, + "logps/chosen": -372.21435546875, + "logps/rejected": -544.20888671875, + "loss": 0.1096, + "rewards/chosen": 3.461529095967611, + "rewards/margins": 11.5882594426473, + "rewards/rejected": -8.126730346679688, + "step": 6091 + }, + { + "epoch": 0.5566011877569667, + "grad_norm": 0.185546875, + "kl": 0.0, + "learning_rate": 4.129556599709566e-06, + "logits/chosen": 406682944.0, + "logits/rejected": 705249499.4285715, + "logps/chosen": -386.9346008300781, + "logps/rejected": -560.4174107142857, + "loss": 0.0008, + "rewards/chosen": 6.751016139984131, + "rewards/margins": 17.48498351233346, + "rewards/rejected": -10.73396737234933, + "step": 6092 + }, + { + "epoch": 0.5566925536774783, + "grad_norm": 2.75, + "kl": 0.0, + "learning_rate": 4.128140791110243e-06, + "logits/chosen": 511350784.0, + "logits/rejected": 376549145.6, + "logps/chosen": -323.6694742838542, + "logps/rejected": -518.184423828125, + "loss": 0.0165, + "rewards/chosen": 3.3619457880655923, + "rewards/margins": 12.430012957255045, + "rewards/rejected": -9.068067169189453, + "step": 6093 + }, + { + "epoch": 0.5567839195979899, + "grad_norm": 1.5703125, + "kl": 0.0, + "learning_rate": 4.126725054605597e-06, + "logits/chosen": 563088256.0, + "logits/rejected": 306694048.0, + "logps/chosen": -348.1767578125, + "logps/rejected": -384.1537170410156, + "loss": 0.0062, + "rewards/chosen": 5.497218132019043, + "rewards/margins": 14.934065818786621, + "rewards/rejected": -9.436847686767578, + "step": 6094 + }, + { + "epoch": 0.5568752855185016, + "grad_norm": 3.3125, + "kl": 0.0, + "learning_rate": 4.125309390312698e-06, + "logits/chosen": 1050743232.0, + "logits/rejected": 727001920.0, + "logps/chosen": -205.7443389892578, + "logps/rejected": -408.6326904296875, + "loss": 0.0239, + "rewards/chosen": 3.2863121032714844, + "rewards/margins": 11.751224517822266, + "rewards/rejected": -8.464912414550781, + "step": 6095 + }, + { + "epoch": 0.5569666514390132, + "grad_norm": 2.734375, + "kl": 0.0, + "learning_rate": 4.1238937983486085e-06, + "logits/chosen": 344821632.0, + "logits/rejected": 646277248.0, + "logps/chosen": -268.96262613932294, + "logps/rejected": -390.676513671875, + "loss": 0.0209, + "rewards/chosen": 4.045745213826497, + "rewards/margins": 13.761165936787922, + "rewards/rejected": -9.715420722961426, + "step": 6096 + }, + { + "epoch": 0.5570580173595249, + "grad_norm": 5.46875, + "kl": 0.0, + "learning_rate": 4.1224782788303855e-06, + "logits/chosen": 294094528.0, + "logits/rejected": 643737792.0, + "logps/chosen": -310.494873046875, + "logps/rejected": -550.4689331054688, + "loss": 0.0382, + "rewards/chosen": 2.66721773147583, + "rewards/margins": 12.508938312530518, + "rewards/rejected": -9.841720581054688, + "step": 6097 + }, + { + "epoch": 0.5571493832800365, + "grad_norm": 2.6875, + "kl": 0.0, + "learning_rate": 4.121062831875078e-06, + "logits/chosen": 861427785.1428572, + "logits/rejected": 292684096.0, + "logps/chosen": -308.53480747767856, + "logps/rejected": -218.6708984375, + "loss": 0.0212, + "rewards/chosen": 3.9404280526297435, + "rewards/margins": 11.987820897783552, + "rewards/rejected": -8.047392845153809, + "step": 6098 + }, + { + "epoch": 0.5572407492005482, + "grad_norm": 3.3125, + "kl": 0.0, + "learning_rate": 4.119647457599731e-06, + "logits/chosen": 879142464.0, + "logits/rejected": 560651264.0, + "logps/chosen": -615.558837890625, + "logps/rejected": -627.4578247070312, + "loss": 0.0175, + "rewards/chosen": 3.5299911499023438, + "rewards/margins": 12.607425689697266, + "rewards/rejected": -9.077434539794922, + "step": 6099 + }, + { + "epoch": 0.5573321151210598, + "grad_norm": 6.59375, + "kl": 0.0, + "learning_rate": 4.118232156121382e-06, + "logits/chosen": 624269952.0, + "logits/rejected": 541794048.0, + "logps/chosen": -258.01861572265625, + "logps/rejected": -462.72125244140625, + "loss": 0.0223, + "rewards/chosen": 4.287582874298096, + "rewards/margins": 9.628495216369629, + "rewards/rejected": -5.340912342071533, + "step": 6100 + }, + { + "epoch": 0.5574234810415715, + "grad_norm": 3.703125, + "kl": 0.0, + "learning_rate": 4.116816927557063e-06, + "logits/chosen": 624665804.8, + "logits/rejected": 713242538.6666666, + "logps/chosen": -425.62763671875, + "logps/rejected": -468.282470703125, + "loss": 0.0199, + "rewards/chosen": 3.7285717010498045, + "rewards/margins": 13.207976786295571, + "rewards/rejected": -9.479405085245768, + "step": 6101 + }, + { + "epoch": 0.5575148469620831, + "grad_norm": 2.046875, + "kl": 0.0, + "learning_rate": 4.115401772023803e-06, + "logits/chosen": 529249728.0, + "logits/rejected": 449648160.0, + "logps/chosen": -373.5620422363281, + "logps/rejected": -495.25860595703125, + "loss": 0.0133, + "rewards/chosen": 3.875706195831299, + "rewards/margins": 14.486155033111572, + "rewards/rejected": -10.610448837280273, + "step": 6102 + }, + { + "epoch": 0.5576062128825948, + "grad_norm": 3.515625, + "kl": 0.0, + "learning_rate": 4.113986689638619e-06, + "logits/chosen": 472858521.6, + "logits/rejected": 683483690.6666666, + "logps/chosen": -241.99404296875, + "logps/rejected": -570.6510009765625, + "loss": 0.028, + "rewards/chosen": 3.1445526123046874, + "rewards/margins": 12.98627192179362, + "rewards/rejected": -9.841719309488932, + "step": 6103 + }, + { + "epoch": 0.5576975788031064, + "grad_norm": 0.10498046875, + "kl": 0.0, + "learning_rate": 4.112571680518528e-06, + "logits/rejected": 374277568.0, + "logps/rejected": -386.5256042480469, + "loss": 0.0005, + "rewards/rejected": -8.361907958984375, + "step": 6104 + }, + { + "epoch": 0.5577889447236181, + "grad_norm": 1.6875, + "kl": 0.0, + "learning_rate": 4.111156744780535e-06, + "logits/chosen": 237240917.33333334, + "logits/rejected": 374909132.8, + "logps/chosen": -209.94873046875, + "logps/rejected": -356.5636474609375, + "loss": 0.0178, + "rewards/chosen": 3.8039232889811196, + "rewards/margins": 13.281192270914714, + "rewards/rejected": -9.477268981933594, + "step": 6105 + }, + { + "epoch": 0.5578803106441297, + "grad_norm": 2.375, + "kl": 0.0, + "learning_rate": 4.109741882541647e-06, + "logits/chosen": 561021132.8, + "logits/rejected": 595641301.3333334, + "logps/chosen": -311.58466796875, + "logps/rejected": -573.0421549479166, + "loss": 0.0147, + "rewards/chosen": 4.289640045166015, + "rewards/margins": 14.429052607218424, + "rewards/rejected": -10.139412562052408, + "step": 6106 + }, + { + "epoch": 0.5579716765646414, + "grad_norm": 4.6875, + "kl": 0.0, + "learning_rate": 4.108327093918854e-06, + "logits/chosen": 457070176.0, + "logps/chosen": -370.9925537109375, + "loss": 0.0335, + "rewards/chosen": 3.8533875942230225, + "step": 6107 + }, + { + "epoch": 0.558063042485153, + "grad_norm": 3.4375, + "kl": 0.0, + "learning_rate": 4.10691237902915e-06, + "logits/chosen": 786073958.4, + "logits/rejected": 319677482.6666667, + "logps/chosen": -358.2347900390625, + "logps/rejected": -233.7629597981771, + "loss": 0.0171, + "rewards/chosen": 4.13000259399414, + "rewards/margins": 12.379118220011392, + "rewards/rejected": -8.249115626017252, + "step": 6108 + }, + { + "epoch": 0.5581544084056647, + "grad_norm": 10.5, + "kl": 0.0, + "learning_rate": 4.105497737989518e-06, + "logits/chosen": 787655552.0, + "logits/rejected": 318266075.4285714, + "logps/chosen": -604.4984130859375, + "logps/rejected": -326.79527064732144, + "loss": 0.0561, + "rewards/chosen": 4.946789741516113, + "rewards/margins": 12.905204091753278, + "rewards/rejected": -7.958414350237165, + "step": 6109 + }, + { + "epoch": 0.5582457743261764, + "grad_norm": 3.921875, + "kl": 0.0, + "learning_rate": 4.104083170916934e-06, + "logits/chosen": 375260629.3333333, + "logits/rejected": 579615232.0, + "logps/chosen": -304.16062418619794, + "logps/rejected": -384.27236328125, + "loss": 0.028, + "rewards/chosen": 3.0978736877441406, + "rewards/margins": 12.818795776367187, + "rewards/rejected": -9.720922088623047, + "step": 6110 + }, + { + "epoch": 0.558337140246688, + "grad_norm": 5.375, + "kl": 0.0, + "learning_rate": 4.102668677928372e-06, + "logits/chosen": 306343509.3333333, + "logits/rejected": 305630566.4, + "logps/chosen": -182.741455078125, + "logps/rejected": -413.77705078125, + "loss": 0.0219, + "rewards/chosen": 4.554653803507487, + "rewards/margins": 14.086035792032877, + "rewards/rejected": -9.531381988525391, + "step": 6111 + }, + { + "epoch": 0.5584285061671996, + "grad_norm": 3.859375, + "kl": 0.0, + "learning_rate": 4.101254259140796e-06, + "logits/chosen": 366675968.0, + "logits/rejected": 239890368.0, + "logps/chosen": -343.0082194010417, + "logps/rejected": -600.7479248046875, + "loss": 0.0278, + "rewards/chosen": 3.6263205210367837, + "rewards/margins": 13.211880366007486, + "rewards/rejected": -9.585559844970703, + "step": 6112 + }, + { + "epoch": 0.5585198720877113, + "grad_norm": 3.234375, + "kl": 0.0, + "learning_rate": 4.099839914671168e-06, + "logits/chosen": 637507328.0, + "logits/rejected": 1378138496.0, + "logps/chosen": -353.6104329427083, + "logps/rejected": -1555.56298828125, + "loss": 0.0275, + "rewards/chosen": 3.605915387471517, + "rewards/margins": 19.07791550954183, + "rewards/rejected": -15.472000122070312, + "step": 6113 + }, + { + "epoch": 0.558611238008223, + "grad_norm": 0.392578125, + "kl": 0.0, + "learning_rate": 4.098425644636436e-06, + "logits/rejected": 616905920.0, + "logps/rejected": -499.3514099121094, + "loss": 0.0011, + "rewards/rejected": -9.307361602783203, + "step": 6114 + }, + { + "epoch": 0.5587026039287346, + "grad_norm": 3.5625, + "kl": 0.0, + "learning_rate": 4.09701144915355e-06, + "logits/chosen": 510757728.0, + "logits/rejected": 887077568.0, + "logps/chosen": -309.4300537109375, + "logps/rejected": -1146.0435791015625, + "loss": 0.0214, + "rewards/chosen": 3.4985880851745605, + "rewards/margins": 19.178697109222412, + "rewards/rejected": -15.680109024047852, + "step": 6115 + }, + { + "epoch": 0.5587939698492462, + "grad_norm": 1.8515625, + "kl": 0.0, + "learning_rate": 4.0955973283394525e-06, + "logits/chosen": 798587392.0, + "logits/rejected": 640656486.4, + "logps/chosen": -430.6385091145833, + "logps/rejected": -416.54228515625, + "loss": 0.0091, + "rewards/chosen": 4.089783668518066, + "rewards/margins": 13.149590492248535, + "rewards/rejected": -9.059806823730469, + "step": 6116 + }, + { + "epoch": 0.5588853357697579, + "grad_norm": 2.578125, + "kl": 0.0, + "learning_rate": 4.094183282311074e-06, + "logits/chosen": 809473024.0, + "logits/rejected": 461865420.8, + "logps/chosen": -688.6339518229166, + "logps/rejected": -525.151953125, + "loss": 0.0132, + "rewards/chosen": 3.493776003519694, + "rewards/margins": 13.019030825297037, + "rewards/rejected": -9.525254821777343, + "step": 6117 + }, + { + "epoch": 0.5589767016902696, + "grad_norm": 0.470703125, + "kl": 0.0, + "learning_rate": 4.092769311185348e-06, + "logits/chosen": 776367744.0, + "logits/rejected": 664265728.0, + "logps/chosen": -312.378173828125, + "logps/rejected": -343.603515625, + "loss": 0.0027, + "rewards/chosen": 4.678903579711914, + "rewards/margins": 13.392102559407553, + "rewards/rejected": -8.713198979695639, + "step": 6118 + }, + { + "epoch": 0.5590680676107812, + "grad_norm": 2.96875, + "kl": 0.0, + "learning_rate": 4.091355415079193e-06, + "logits/chosen": 969430400.0, + "logits/rejected": 864608085.3333334, + "logps/chosen": -650.8578491210938, + "logps/rejected": -687.6793619791666, + "loss": 0.0077, + "rewards/chosen": 3.685757637023926, + "rewards/margins": 14.46251138051351, + "rewards/rejected": -10.776753743489584, + "step": 6119 + }, + { + "epoch": 0.5591594335312928, + "grad_norm": 2.453125, + "kl": 0.0, + "learning_rate": 4.089941594109526e-06, + "logits/chosen": 625987904.0, + "logits/rejected": 547034112.0, + "logps/chosen": -395.95849609375, + "logps/rejected": -510.6476643880208, + "loss": 0.0123, + "rewards/chosen": 3.0310440063476562, + "rewards/margins": 13.793141682942709, + "rewards/rejected": -10.762097676595053, + "step": 6120 + }, + { + "epoch": 0.5592507994518044, + "grad_norm": 0.9609375, + "kl": 0.0, + "learning_rate": 4.088527848393258e-06, + "logits/chosen": 856003136.0, + "logits/rejected": 667731821.7142857, + "logps/chosen": -669.4051513671875, + "logps/rejected": -450.93798828125, + "loss": 0.0036, + "rewards/chosen": 3.687213182449341, + "rewards/margins": 12.65082587514605, + "rewards/rejected": -8.963612692696708, + "step": 6121 + }, + { + "epoch": 0.5593421653723162, + "grad_norm": 3.125, + "kl": 0.0, + "learning_rate": 4.087114178047291e-06, + "logits/chosen": 454287001.6, + "logits/rejected": 216855637.33333334, + "logps/chosen": -425.3212890625, + "logps/rejected": -347.6824137369792, + "loss": 0.0205, + "rewards/chosen": 3.522988128662109, + "rewards/margins": 12.782764561971028, + "rewards/rejected": -9.25977643330892, + "step": 6122 + }, + { + "epoch": 0.5594335312928278, + "grad_norm": 2.46875, + "kl": 0.0, + "learning_rate": 4.085700583188525e-06, + "logits/chosen": 260263024.0, + "logits/rejected": 359741952.0, + "logps/chosen": -216.31597900390625, + "logps/rejected": -363.487060546875, + "loss": 0.0192, + "rewards/chosen": 3.72637939453125, + "rewards/margins": 13.825920104980469, + "rewards/rejected": -10.099540710449219, + "step": 6123 + }, + { + "epoch": 0.5595248972133394, + "grad_norm": 3.125, + "kl": 0.0, + "learning_rate": 4.084287063933848e-06, + "logits/chosen": 576640614.4, + "logits/rejected": 449604565.3333333, + "logps/chosen": -371.191796875, + "logps/rejected": -410.9298502604167, + "loss": 0.0133, + "rewards/chosen": 4.500306701660156, + "rewards/margins": 13.369374847412109, + "rewards/rejected": -8.869068145751953, + "step": 6124 + }, + { + "epoch": 0.559616263133851, + "grad_norm": 0.89453125, + "kl": 0.0, + "learning_rate": 4.082873620400148e-06, + "logits/chosen": 777545472.0, + "logits/rejected": 804834252.8, + "logps/chosen": -524.7078450520834, + "logps/rejected": -534.084765625, + "loss": 0.0048, + "rewards/chosen": 4.725093841552734, + "rewards/margins": 15.097673797607422, + "rewards/rejected": -10.372579956054688, + "step": 6125 + }, + { + "epoch": 0.5597076290543628, + "grad_norm": 3.828125, + "kl": 0.0, + "learning_rate": 4.0814602527043e-06, + "logits/chosen": 638255040.0, + "logits/rejected": 440229120.0, + "logps/chosen": -505.91741943359375, + "logps/rejected": -584.702880859375, + "loss": 0.0167, + "rewards/chosen": 3.534106492996216, + "rewards/margins": 16.88281559944153, + "rewards/rejected": -13.348709106445312, + "step": 6126 + }, + { + "epoch": 0.5597989949748744, + "grad_norm": 7.78125, + "kl": 0.0, + "learning_rate": 4.080046960963181e-06, + "logits/chosen": 464609920.0, + "logits/rejected": 474786368.0, + "logps/chosen": -422.80908203125, + "logps/rejected": -639.4869384765625, + "loss": 0.0415, + "rewards/chosen": 2.5316781997680664, + "rewards/margins": 13.506369590759277, + "rewards/rejected": -10.974691390991211, + "step": 6127 + }, + { + "epoch": 0.559890360895386, + "grad_norm": 2.984375, + "kl": 0.0, + "learning_rate": 4.078633745293654e-06, + "logits/chosen": 878478131.2, + "logits/rejected": 840628053.3333334, + "logps/chosen": -333.9302001953125, + "logps/rejected": -876.0642903645834, + "loss": 0.0176, + "rewards/chosen": 4.101337051391601, + "rewards/margins": 17.300230280558267, + "rewards/rejected": -13.198893229166666, + "step": 6128 + }, + { + "epoch": 0.5599817268158976, + "grad_norm": 2.875, + "kl": 0.0, + "learning_rate": 4.07722060581258e-06, + "logits/chosen": 413459626.6666667, + "logits/rejected": 815055104.0, + "logps/chosen": -274.6380615234375, + "logps/rejected": -458.96453857421875, + "loss": 0.0225, + "rewards/chosen": 4.087753931681315, + "rewards/margins": 14.211365381876629, + "rewards/rejected": -10.123611450195312, + "step": 6129 + }, + { + "epoch": 0.5600730927364094, + "grad_norm": 5.75, + "kl": 0.0, + "learning_rate": 4.075807542636812e-06, + "logits/chosen": 498275737.6, + "logits/rejected": 220563626.66666666, + "logps/chosen": -265.44638671875, + "logps/rejected": -294.3710123697917, + "loss": 0.0446, + "rewards/chosen": 2.741699981689453, + "rewards/margins": 12.0062136332194, + "rewards/rejected": -9.264513651529947, + "step": 6130 + }, + { + "epoch": 0.560164458656921, + "grad_norm": 3.359375, + "kl": 0.0, + "learning_rate": 4.074394555883196e-06, + "logits/chosen": 301103296.0, + "logits/rejected": 378618922.6666667, + "logps/chosen": -199.173828125, + "logps/rejected": -447.1124674479167, + "loss": 0.0125, + "rewards/chosen": 3.6461730003356934, + "rewards/margins": 13.878638108571371, + "rewards/rejected": -10.232465108235678, + "step": 6131 + }, + { + "epoch": 0.5602558245774326, + "grad_norm": 4.8125, + "kl": 0.0, + "learning_rate": 4.072981645668576e-06, + "logits/chosen": 967945792.0, + "logits/rejected": 582302646.8571428, + "logps/chosen": -484.13818359375, + "logps/rejected": -461.86038643973217, + "loss": 0.0056, + "rewards/chosen": 5.763440132141113, + "rewards/margins": 13.963176046098981, + "rewards/rejected": -8.199735913957868, + "step": 6132 + }, + { + "epoch": 0.5603471904979442, + "grad_norm": 4.09375, + "kl": 0.0, + "learning_rate": 4.071568812109783e-06, + "logits/chosen": 458603349.3333333, + "logits/rejected": 514494560.0, + "logps/chosen": -288.6081949869792, + "logps/rejected": -574.824951171875, + "loss": 0.0431, + "rewards/chosen": 2.929196357727051, + "rewards/margins": 10.821768760681152, + "rewards/rejected": -7.892572402954102, + "step": 6133 + }, + { + "epoch": 0.560438556418456, + "grad_norm": 10.625, + "kl": 0.0, + "learning_rate": 4.0701560553236495e-06, + "logits/chosen": 679314534.4, + "logits/rejected": 924365226.6666666, + "logps/chosen": -338.3417236328125, + "logps/rejected": -645.3275960286459, + "loss": 0.0344, + "rewards/chosen": 3.1072269439697267, + "rewards/margins": 11.471150334676107, + "rewards/rejected": -8.36392339070638, + "step": 6134 + }, + { + "epoch": 0.5605299223389676, + "grad_norm": 3.0625, + "kl": 0.0, + "learning_rate": 4.0687433754269936e-06, + "logits/chosen": 361976746.6666667, + "logits/rejected": 481998540.8, + "logps/chosen": -225.71724446614584, + "logps/rejected": -438.0314453125, + "loss": 0.0198, + "rewards/chosen": 3.43182373046875, + "rewards/margins": 12.598837280273438, + "rewards/rejected": -9.167013549804688, + "step": 6135 + }, + { + "epoch": 0.5606212882594792, + "grad_norm": 1.78125, + "kl": 0.0, + "learning_rate": 4.067330772536633e-06, + "logits/chosen": 474173354.6666667, + "logits/rejected": 772422348.8, + "logps/chosen": -253.77274576822916, + "logps/rejected": -599.7021484375, + "loss": 0.0106, + "rewards/chosen": 3.9011497497558594, + "rewards/margins": 15.07662582397461, + "rewards/rejected": -11.17547607421875, + "step": 6136 + }, + { + "epoch": 0.5607126541799908, + "grad_norm": 1.671875, + "kl": 0.0, + "learning_rate": 4.065918246769375e-06, + "logits/chosen": 671798720.0, + "logits/rejected": 338095808.0, + "logps/chosen": -607.8370361328125, + "logps/rejected": -496.4712727864583, + "loss": 0.006, + "rewards/chosen": 3.8019866943359375, + "rewards/margins": 14.065142313639322, + "rewards/rejected": -10.263155619303385, + "step": 6137 + }, + { + "epoch": 0.5608040201005026, + "grad_norm": 6.375, + "kl": 0.0, + "learning_rate": 4.064505798242024e-06, + "logits/chosen": 561935936.0, + "logits/rejected": 343140800.0, + "logps/chosen": -434.9522399902344, + "logps/rejected": -434.97900390625, + "loss": 0.0305, + "rewards/chosen": 2.8619346618652344, + "rewards/margins": 11.241458892822266, + "rewards/rejected": -8.379524230957031, + "step": 6138 + }, + { + "epoch": 0.5608953860210142, + "grad_norm": 1.40625, + "kl": 0.0, + "learning_rate": 4.063093427071376e-06, + "logits/chosen": 556554240.0, + "logits/rejected": 442682848.0, + "logps/chosen": -371.02191162109375, + "logps/rejected": -490.9311828613281, + "loss": 0.0084, + "rewards/chosen": 4.308663368225098, + "rewards/margins": 14.599085807800293, + "rewards/rejected": -10.290422439575195, + "step": 6139 + }, + { + "epoch": 0.5609867519415258, + "grad_norm": 3.046875, + "kl": 0.0, + "learning_rate": 4.061681133374224e-06, + "logits/chosen": 412625280.0, + "logits/rejected": 1602384554.6666667, + "logps/chosen": -216.588232421875, + "logps/rejected": -678.7163899739584, + "loss": 0.0174, + "rewards/chosen": 4.283568572998047, + "rewards/margins": 15.123208872477214, + "rewards/rejected": -10.839640299479166, + "step": 6140 + }, + { + "epoch": 0.5610781178620374, + "grad_norm": 0.89453125, + "kl": 0.0, + "learning_rate": 4.0602689172673445e-06, + "logits/chosen": 433199456.0, + "logits/rejected": 255103552.0, + "logps/chosen": -327.6144104003906, + "logps/rejected": -304.442626953125, + "loss": 0.0061, + "rewards/chosen": 4.653436183929443, + "rewards/margins": 13.490931987762451, + "rewards/rejected": -8.837495803833008, + "step": 6141 + }, + { + "epoch": 0.5611694837825492, + "grad_norm": 2.515625, + "kl": 0.0, + "learning_rate": 4.058856778867521e-06, + "logits/chosen": 502042368.0, + "logits/rejected": 655432652.8, + "logps/chosen": -240.81742350260416, + "logps/rejected": -572.7693359375, + "loss": 0.0136, + "rewards/chosen": 4.015449206034343, + "rewards/margins": 13.176003710428873, + "rewards/rejected": -9.160554504394531, + "step": 6142 + }, + { + "epoch": 0.5612608497030608, + "grad_norm": 3.25, + "kl": 0.0, + "learning_rate": 4.057444718291521e-06, + "logits/chosen": 510218700.8, + "logits/rejected": 579133184.0, + "logps/chosen": -365.820947265625, + "logps/rejected": -334.80674235026044, + "loss": 0.0266, + "rewards/chosen": 3.4175209045410155, + "rewards/margins": 10.91993687947591, + "rewards/rejected": -7.5024159749348955, + "step": 6143 + }, + { + "epoch": 0.5613522156235724, + "grad_norm": 3.296875, + "kl": 0.0, + "learning_rate": 4.05603273565611e-06, + "logits/chosen": 783418521.6, + "logits/rejected": 929758890.6666666, + "logps/chosen": -401.3088134765625, + "logps/rejected": -525.88671875, + "loss": 0.0185, + "rewards/chosen": 3.7155807495117186, + "rewards/margins": 14.122041829427083, + "rewards/rejected": -10.406461079915365, + "step": 6144 + }, + { + "epoch": 0.561443581544084, + "grad_norm": 2.859375, + "kl": 0.0, + "learning_rate": 4.054620831078045e-06, + "logits/chosen": 398197504.0, + "logits/rejected": 407689804.8, + "logps/chosen": -210.99676513671875, + "logps/rejected": -421.311328125, + "loss": 0.0212, + "rewards/chosen": 3.0276292165120444, + "rewards/margins": 11.333856328328451, + "rewards/rejected": -8.306227111816407, + "step": 6145 + }, + { + "epoch": 0.5615349474645958, + "grad_norm": 2.671875, + "kl": 0.0, + "learning_rate": 4.053209004674079e-06, + "logits/chosen": 658377600.0, + "logits/rejected": 364029269.3333333, + "logps/chosen": -543.399658203125, + "logps/rejected": -496.8997395833333, + "loss": 0.0105, + "rewards/chosen": 3.209622383117676, + "rewards/margins": 12.286871910095215, + "rewards/rejected": -9.077249526977539, + "step": 6146 + }, + { + "epoch": 0.5616263133851074, + "grad_norm": 3.5, + "kl": 0.0, + "learning_rate": 4.0517972565609545e-06, + "logits/chosen": 467097152.0, + "logits/rejected": 456734848.0, + "logps/chosen": -257.5989990234375, + "logps/rejected": -535.1442260742188, + "loss": 0.0215, + "rewards/chosen": 3.825547456741333, + "rewards/margins": 13.86512303352356, + "rewards/rejected": -10.039575576782227, + "step": 6147 + }, + { + "epoch": 0.561717679305619, + "grad_norm": 3.046875, + "kl": 0.0, + "learning_rate": 4.050385586855413e-06, + "logits/chosen": 789234944.0, + "logits/rejected": 620014464.0, + "logps/chosen": -336.767578125, + "logps/rejected": -615.0682373046875, + "loss": 0.0124, + "rewards/chosen": 4.147189140319824, + "rewards/margins": 12.758949279785156, + "rewards/rejected": -8.611760139465332, + "step": 6148 + }, + { + "epoch": 0.5618090452261306, + "grad_norm": 1.34375, + "kl": 0.0, + "learning_rate": 4.048973995674184e-06, + "logits/chosen": 1541277952.0, + "logits/rejected": 597709525.3333334, + "logps/chosen": -390.94671630859375, + "logps/rejected": -700.0540364583334, + "loss": 0.0057, + "rewards/chosen": 4.262718200683594, + "rewards/margins": 15.331570943196615, + "rewards/rejected": -11.068852742513021, + "step": 6149 + }, + { + "epoch": 0.5619004111466424, + "grad_norm": 2.0, + "kl": 0.0, + "learning_rate": 4.047562483133996e-06, + "logits/chosen": 474802304.0, + "logits/rejected": 379976045.71428573, + "logps/chosen": -571.942138671875, + "logps/rejected": -399.2827845982143, + "loss": 0.0076, + "rewards/chosen": 2.906262159347534, + "rewards/margins": 11.249821969441005, + "rewards/rejected": -8.34355981009347, + "step": 6150 + }, + { + "epoch": 0.561991777067154, + "grad_norm": 5.6875, + "kl": 1.2215957641601562, + "learning_rate": 4.046151049351566e-06, + "logits/chosen": 605477888.0, + "logps/chosen": -377.3507995605469, + "loss": 0.0527, + "rewards/chosen": 3.200648784637451, + "step": 6151 + }, + { + "epoch": 0.5620831429876656, + "grad_norm": 1.9609375, + "kl": 0.0, + "learning_rate": 4.044739694443605e-06, + "logits/chosen": 629271808.0, + "logits/rejected": 973444608.0, + "logps/chosen": -432.04931640625, + "logps/rejected": -455.7002258300781, + "loss": 0.0056, + "rewards/chosen": 5.159236907958984, + "rewards/margins": 12.861907005310059, + "rewards/rejected": -7.702670097351074, + "step": 6152 + }, + { + "epoch": 0.5621745089081772, + "grad_norm": 0.392578125, + "kl": 0.0, + "learning_rate": 4.043328418526822e-06, + "logits/chosen": 207893408.0, + "logits/rejected": 427434922.6666667, + "logps/chosen": -103.79673767089844, + "logps/rejected": -443.092041015625, + "loss": 0.0018, + "rewards/chosen": 5.074620246887207, + "rewards/margins": 15.373464902242025, + "rewards/rejected": -10.298844655354818, + "step": 6153 + }, + { + "epoch": 0.562265874828689, + "grad_norm": 2.984375, + "kl": 0.0, + "learning_rate": 4.041917221717914e-06, + "logits/chosen": 607821269.3333334, + "logits/rejected": 426919744.0, + "logps/chosen": -447.7963460286458, + "logps/rejected": -369.42742919921875, + "loss": 0.022, + "rewards/chosen": 3.6152588526407876, + "rewards/margins": 15.039167086283365, + "rewards/rejected": -11.423908233642578, + "step": 6154 + }, + { + "epoch": 0.5623572407492006, + "grad_norm": 19.125, + "kl": 0.0, + "learning_rate": 4.040506104133575e-06, + "logits/chosen": 374482773.3333333, + "logits/rejected": 619520640.0, + "logps/chosen": -326.5137125651042, + "logps/rejected": -668.1544799804688, + "loss": 0.0168, + "rewards/chosen": 4.808621724446614, + "rewards/margins": 13.010470708211262, + "rewards/rejected": -8.201848983764648, + "step": 6155 + }, + { + "epoch": 0.5624486066697122, + "grad_norm": 2.65625, + "kl": 0.0, + "learning_rate": 4.039095065890492e-06, + "logits/chosen": 383003328.0, + "logits/rejected": 484699872.0, + "logps/chosen": -142.3463134765625, + "logps/rejected": -479.0118408203125, + "loss": 0.0281, + "rewards/chosen": 3.074288845062256, + "rewards/margins": 12.72960615158081, + "rewards/rejected": -9.655317306518555, + "step": 6156 + }, + { + "epoch": 0.5625399725902238, + "grad_norm": 2.453125, + "kl": 0.0, + "learning_rate": 4.037684107105343e-06, + "logits/chosen": 616648038.4, + "logits/rejected": 1034764288.0, + "logps/chosen": -335.25244140625, + "logps/rejected": -506.64501953125, + "loss": 0.0191, + "rewards/chosen": 3.634804916381836, + "rewards/margins": 12.625881067911784, + "rewards/rejected": -8.991076151529947, + "step": 6157 + }, + { + "epoch": 0.5626313385107355, + "grad_norm": 4.09375, + "kl": 0.0, + "learning_rate": 4.036273227894802e-06, + "logits/chosen": 435131744.0, + "logits/rejected": 276622688.0, + "logps/chosen": -330.55889892578125, + "logps/rejected": -412.7264709472656, + "loss": 0.0278, + "rewards/chosen": 3.172401189804077, + "rewards/margins": 13.14797568321228, + "rewards/rejected": -9.975574493408203, + "step": 6158 + }, + { + "epoch": 0.5627227044312472, + "grad_norm": 37.25, + "kl": 0.0, + "learning_rate": 4.034862428375538e-06, + "logits/chosen": 693025322.6666666, + "logits/rejected": 985151744.0, + "logps/chosen": -236.4981689453125, + "logps/rejected": -398.2470703125, + "loss": 0.0664, + "rewards/chosen": 2.5338425636291504, + "rewards/margins": 11.9017014503479, + "rewards/rejected": -9.36785888671875, + "step": 6159 + }, + { + "epoch": 0.5628140703517588, + "grad_norm": 2.0625, + "kl": 0.0, + "learning_rate": 4.033451708664207e-06, + "logits/chosen": 1212612992.0, + "logits/rejected": 669717632.0, + "logps/chosen": -437.6256408691406, + "logps/rejected": -536.1617431640625, + "loss": 0.0124, + "rewards/chosen": 4.139430522918701, + "rewards/margins": 12.949034214019775, + "rewards/rejected": -8.809603691101074, + "step": 6160 + }, + { + "epoch": 0.5629054362722704, + "grad_norm": 3.1875, + "kl": 0.0, + "learning_rate": 4.032041068877466e-06, + "logits/chosen": 557233024.0, + "logits/rejected": 1003607381.3333334, + "logps/chosen": -201.55191040039062, + "logps/rejected": -432.9601236979167, + "loss": 0.016, + "rewards/chosen": 3.9390370845794678, + "rewards/margins": 13.460911194483439, + "rewards/rejected": -9.52187410990397, + "step": 6161 + }, + { + "epoch": 0.5629968021927821, + "grad_norm": 5.1875, + "kl": 0.0, + "learning_rate": 4.03063050913196e-06, + "logits/chosen": 590762393.6, + "logits/rejected": 487058773.3333333, + "logps/chosen": -382.877734375, + "logps/rejected": -318.8788248697917, + "loss": 0.0398, + "rewards/chosen": 2.90958251953125, + "rewards/margins": 12.718708419799805, + "rewards/rejected": -9.809125900268555, + "step": 6162 + }, + { + "epoch": 0.5630881681132938, + "grad_norm": 24.75, + "kl": 0.0, + "learning_rate": 4.029220029544329e-06, + "logits/chosen": 982974464.0, + "logits/rejected": 835154560.0, + "logps/chosen": -467.39711216517856, + "logps/rejected": -501.82794189453125, + "loss": 0.129, + "rewards/chosen": 2.819904054914202, + "rewards/margins": 15.271136965070452, + "rewards/rejected": -12.45123291015625, + "step": 6163 + }, + { + "epoch": 0.5631795340338054, + "grad_norm": 0.9140625, + "kl": 0.0, + "learning_rate": 4.027809630231207e-06, + "logits/chosen": 827383125.3333334, + "logits/rejected": 913244569.6, + "logps/chosen": -348.59912109375, + "logps/rejected": -614.842724609375, + "loss": 0.0047, + "rewards/chosen": 4.417086283365886, + "rewards/margins": 14.024610392252605, + "rewards/rejected": -9.607524108886718, + "step": 6164 + }, + { + "epoch": 0.563270899954317, + "grad_norm": 3.390625, + "kl": 0.0, + "learning_rate": 4.02639931130922e-06, + "logits/chosen": 837297371.4285715, + "logits/rejected": 264533328.0, + "logps/chosen": -443.7343052455357, + "logps/rejected": -297.2422790527344, + "loss": 0.0234, + "rewards/chosen": 4.156211853027344, + "rewards/margins": 15.290565490722656, + "rewards/rejected": -11.134353637695312, + "step": 6165 + }, + { + "epoch": 0.5633622658748287, + "grad_norm": 1.484375, + "kl": 0.0, + "learning_rate": 4.024989072894992e-06, + "logits/chosen": 277300736.0, + "logits/rejected": 515667840.0, + "logps/chosen": -268.3027038574219, + "logps/rejected": -614.9765625, + "loss": 0.0092, + "rewards/chosen": 3.4148483276367188, + "rewards/margins": 13.401409149169922, + "rewards/rejected": -9.986560821533203, + "step": 6166 + }, + { + "epoch": 0.5634536317953404, + "grad_norm": 35.25, + "kl": 0.0, + "learning_rate": 4.023578915105133e-06, + "logits/chosen": 820685107.2, + "logits/rejected": 384070826.6666667, + "logps/chosen": -384.1203857421875, + "logps/rejected": -200.61027018229166, + "loss": 0.0804, + "rewards/chosen": 4.724524688720703, + "rewards/margins": 10.336817932128906, + "rewards/rejected": -5.612293243408203, + "step": 6167 + }, + { + "epoch": 0.563544997715852, + "grad_norm": 2.125, + "kl": 0.0, + "learning_rate": 4.02216883805625e-06, + "logits/chosen": 819759872.0, + "logits/rejected": 476341248.0, + "logps/chosen": -414.461669921875, + "logps/rejected": -409.66497802734375, + "loss": 0.008, + "rewards/chosen": 4.6420698165893555, + "rewards/margins": 14.105402946472168, + "rewards/rejected": -9.463333129882812, + "step": 6168 + }, + { + "epoch": 0.5636363636363636, + "grad_norm": 1.875, + "kl": 0.0, + "learning_rate": 4.020758841864945e-06, + "logits/chosen": 393749376.0, + "logits/rejected": 279228480.0, + "logps/chosen": -328.71282958984375, + "logps/rejected": -457.5140075683594, + "loss": 0.0136, + "rewards/chosen": 3.769535779953003, + "rewards/margins": 13.386110067367554, + "rewards/rejected": -9.61657428741455, + "step": 6169 + }, + { + "epoch": 0.5637277295568753, + "grad_norm": 2.140625, + "kl": 0.0, + "learning_rate": 4.01934892664781e-06, + "logits/chosen": 564800512.0, + "logits/rejected": 562536448.0, + "logps/chosen": -294.2250162760417, + "logps/rejected": -357.89921875, + "loss": 0.0117, + "rewards/chosen": 3.9447244008382163, + "rewards/margins": 11.976378758748373, + "rewards/rejected": -8.031654357910156, + "step": 6170 + }, + { + "epoch": 0.563819095477387, + "grad_norm": 2.546875, + "kl": 0.0, + "learning_rate": 4.017939092521434e-06, + "logits/chosen": 376298154.6666667, + "logits/rejected": 514408096.0, + "logps/chosen": -265.60044352213544, + "logps/rejected": -365.07708740234375, + "loss": 0.0206, + "rewards/chosen": 3.9595864613850913, + "rewards/margins": 12.932682355244955, + "rewards/rejected": -8.973095893859863, + "step": 6171 + }, + { + "epoch": 0.5639104613978986, + "grad_norm": 60.25, + "kl": 0.0, + "learning_rate": 4.0165293396023965e-06, + "logits/chosen": 407540309.3333333, + "logits/rejected": 450502195.2, + "logps/chosen": -268.5426432291667, + "logps/rejected": -559.001513671875, + "loss": 0.0664, + "rewards/chosen": 2.187065283457438, + "rewards/margins": 11.468112341562906, + "rewards/rejected": -9.281047058105468, + "step": 6172 + }, + { + "epoch": 0.5640018273184102, + "grad_norm": 0.87109375, + "kl": 0.0, + "learning_rate": 4.015119668007269e-06, + "logits/chosen": 426165312.0, + "logits/rejected": 573515190.8571428, + "logps/chosen": -351.73272705078125, + "logps/rejected": -582.0174734933036, + "loss": 0.0029, + "rewards/chosen": 3.7941224575042725, + "rewards/margins": 13.558075530188423, + "rewards/rejected": -9.763953072684151, + "step": 6173 + }, + { + "epoch": 0.5640931932389219, + "grad_norm": 1.203125, + "kl": 0.0, + "learning_rate": 4.01371007785262e-06, + "logits/chosen": 506561843.2, + "logits/rejected": 291880597.3333333, + "logps/chosen": -342.62744140625, + "logps/rejected": -419.3080240885417, + "loss": 0.0096, + "rewards/chosen": 4.230581665039063, + "rewards/margins": 12.974165344238282, + "rewards/rejected": -8.743583679199219, + "step": 6174 + }, + { + "epoch": 0.5641845591594336, + "grad_norm": 60.5, + "kl": 0.0, + "learning_rate": 4.01230056925501e-06, + "logits/chosen": 812738944.0, + "logits/rejected": 390575829.3333333, + "logps/chosen": -365.76593017578125, + "logps/rejected": -396.6080729166667, + "loss": 0.0625, + "rewards/chosen": 4.468388557434082, + "rewards/margins": 11.779410998026531, + "rewards/rejected": -7.311022440592448, + "step": 6175 + }, + { + "epoch": 0.5642759250799452, + "grad_norm": 1.765625, + "kl": 0.0, + "learning_rate": 4.010891142330992e-06, + "logits/chosen": 629931968.0, + "logits/rejected": 316724800.0, + "logps/chosen": -486.5543212890625, + "logps/rejected": -407.1640625, + "loss": 0.0086, + "rewards/chosen": 4.2620368003845215, + "rewards/margins": 15.542935848236084, + "rewards/rejected": -11.280899047851562, + "step": 6176 + }, + { + "epoch": 0.5643672910004568, + "grad_norm": 0.71875, + "kl": 0.0, + "learning_rate": 4.009481797197111e-06, + "logits/chosen": 399835545.6, + "logits/rejected": 601738709.3333334, + "logps/chosen": -276.6587890625, + "logps/rejected": -775.781982421875, + "loss": 0.0047, + "rewards/chosen": 5.082231521606445, + "rewards/margins": 14.30256207784017, + "rewards/rejected": -9.220330556233725, + "step": 6177 + }, + { + "epoch": 0.5644586569209685, + "grad_norm": 2.4375, + "kl": 0.0, + "learning_rate": 4.008072533969909e-06, + "logits/chosen": 236912112.0, + "logits/rejected": 538215680.0, + "logps/chosen": -294.64581298828125, + "logps/rejected": -473.971923828125, + "loss": 0.0137, + "rewards/chosen": 4.457956314086914, + "rewards/margins": 11.595736026763916, + "rewards/rejected": -7.137779712677002, + "step": 6178 + }, + { + "epoch": 0.5645500228414801, + "grad_norm": 2.875, + "kl": 0.0, + "learning_rate": 4.006663352765917e-06, + "logits/chosen": 417624064.0, + "logits/rejected": 545463552.0, + "logps/chosen": -301.1775309244792, + "logps/rejected": -604.2546875, + "loss": 0.0211, + "rewards/chosen": 2.9747419357299805, + "rewards/margins": 13.487351036071777, + "rewards/rejected": -10.512609100341797, + "step": 6179 + }, + { + "epoch": 0.5646413887619918, + "grad_norm": 3.21875, + "kl": 0.0, + "learning_rate": 4.005254253701664e-06, + "logits/chosen": 578678579.2, + "logits/rejected": 453825152.0, + "logps/chosen": -414.57060546875, + "logps/rejected": -520.4794108072916, + "loss": 0.023, + "rewards/chosen": 3.594183349609375, + "rewards/margins": 12.08383534749349, + "rewards/rejected": -8.489651997884115, + "step": 6180 + }, + { + "epoch": 0.5647327546825034, + "grad_norm": 35.25, + "kl": 0.0, + "learning_rate": 4.003845236893665e-06, + "logits/chosen": 688582041.6, + "logits/rejected": 986508800.0, + "logps/chosen": -300.500146484375, + "logps/rejected": -443.3585611979167, + "loss": 0.0673, + "rewards/chosen": 3.780923843383789, + "rewards/margins": 12.10017458597819, + "rewards/rejected": -8.3192507425944, + "step": 6181 + }, + { + "epoch": 0.5648241206030151, + "grad_norm": 4.8125, + "kl": 0.0, + "learning_rate": 4.002436302458438e-06, + "logits/chosen": 513597030.4, + "logits/rejected": 348200490.6666667, + "logps/chosen": -408.3270751953125, + "logps/rejected": -298.83937581380206, + "loss": 0.0295, + "rewards/chosen": 3.3315940856933595, + "rewards/margins": 11.364029184977213, + "rewards/rejected": -8.032435099283854, + "step": 6182 + }, + { + "epoch": 0.5649154865235267, + "grad_norm": 2.984375, + "kl": 0.0, + "learning_rate": 4.001027450512486e-06, + "logits/chosen": 387726293.3333333, + "logits/rejected": 391020608.0, + "logps/chosen": -205.78108723958334, + "logps/rejected": -541.5850830078125, + "loss": 0.0225, + "rewards/chosen": 4.109164873758952, + "rewards/margins": 12.922004381815594, + "rewards/rejected": -8.81283950805664, + "step": 6183 + }, + { + "epoch": 0.5650068524440384, + "grad_norm": 1.7265625, + "kl": 0.0, + "learning_rate": 3.999618681172306e-06, + "logits/chosen": 328684864.0, + "logits/rejected": 295901226.6666667, + "logps/chosen": -183.49334716796875, + "logps/rejected": -463.1661376953125, + "loss": 0.0129, + "rewards/chosen": 2.9563608169555664, + "rewards/margins": 12.12134075164795, + "rewards/rejected": -9.164979934692383, + "step": 6184 + }, + { + "epoch": 0.56509821836455, + "grad_norm": 3.421875, + "kl": 0.0, + "learning_rate": 3.998209994554395e-06, + "logits/chosen": 602013312.0, + "logits/rejected": 474379264.0, + "logps/chosen": -298.5946350097656, + "logps/rejected": -391.345703125, + "loss": 0.0148, + "rewards/chosen": 4.058041572570801, + "rewards/margins": 11.883830070495605, + "rewards/rejected": -7.825788497924805, + "step": 6185 + }, + { + "epoch": 0.5651895842850617, + "grad_norm": 4.15625, + "kl": 0.0, + "learning_rate": 3.996801390775233e-06, + "logits/chosen": 1162948693.3333333, + "logits/rejected": 621925171.2, + "logps/chosen": -368.1063639322917, + "logps/rejected": -502.99072265625, + "loss": 0.0242, + "rewards/chosen": 2.697098731994629, + "rewards/margins": 12.833377265930176, + "rewards/rejected": -10.136278533935547, + "step": 6186 + }, + { + "epoch": 0.5652809502055733, + "grad_norm": 4.9375, + "kl": 0.0, + "learning_rate": 3.995392869951304e-06, + "logits/chosen": 488479232.0, + "logits/rejected": 557625600.0, + "logps/chosen": -349.9138997395833, + "logps/rejected": -785.2343139648438, + "loss": 0.038, + "rewards/chosen": 3.433558781941732, + "rewards/margins": 11.840787251790365, + "rewards/rejected": -8.407228469848633, + "step": 6187 + }, + { + "epoch": 0.565372316126085, + "grad_norm": 5.4375, + "kl": 0.0, + "learning_rate": 3.9939844321990736e-06, + "logits/chosen": 680510080.0, + "logits/rejected": 1018759744.0, + "logps/chosen": -344.36846923828125, + "logps/rejected": -437.06982421875, + "loss": 0.0366, + "rewards/chosen": 2.7811501026153564, + "rewards/margins": 11.809878587722778, + "rewards/rejected": -9.028728485107422, + "step": 6188 + }, + { + "epoch": 0.5654636820465966, + "grad_norm": 1.5, + "kl": 0.0, + "learning_rate": 3.992576077635012e-06, + "logits/chosen": 513057920.0, + "logits/rejected": 535218272.0, + "logps/chosen": -220.13553873697916, + "logps/rejected": -377.61859130859375, + "loss": 0.134, + "rewards/chosen": 2.9909998575846353, + "rewards/margins": 12.07643953959147, + "rewards/rejected": -9.085439682006836, + "step": 6189 + }, + { + "epoch": 0.5655550479671083, + "grad_norm": 4.09375, + "kl": 0.0, + "learning_rate": 3.991167806375574e-06, + "logits/chosen": 503728938.6666667, + "logits/rejected": 409756160.0, + "logps/chosen": -314.51218668619794, + "logps/rejected": -526.7890625, + "loss": 0.0305, + "rewards/chosen": 3.3735262552897134, + "rewards/margins": 14.164761225382486, + "rewards/rejected": -10.791234970092773, + "step": 6190 + }, + { + "epoch": 0.5656464138876199, + "grad_norm": 5.28125, + "kl": 0.0, + "learning_rate": 3.989759618537211e-06, + "logits/chosen": 652185664.0, + "logits/rejected": 448754080.0, + "logps/chosen": -161.25701904296875, + "logps/rejected": -268.2301025390625, + "loss": 0.1105, + "rewards/chosen": 3.371525287628174, + "rewards/margins": 8.519092082977295, + "rewards/rejected": -5.147566795349121, + "step": 6191 + }, + { + "epoch": 0.5657377798081316, + "grad_norm": 7.46875, + "kl": 0.0, + "learning_rate": 3.988351514236368e-06, + "logits/chosen": 657597525.3333334, + "logits/rejected": 374648448.0, + "logps/chosen": -283.94354248046875, + "logps/rejected": -525.8701782226562, + "loss": 0.029, + "rewards/chosen": 3.5572338104248047, + "rewards/margins": 14.197874069213867, + "rewards/rejected": -10.640640258789062, + "step": 6192 + }, + { + "epoch": 0.5658291457286432, + "grad_norm": 3.734375, + "kl": 0.0, + "learning_rate": 3.986943493589482e-06, + "logits/chosen": 868216173.7142857, + "logits/rejected": 971496128.0, + "logps/chosen": -372.63134765625, + "logps/rejected": -391.8448486328125, + "loss": 0.0227, + "rewards/chosen": 4.209577287946429, + "rewards/margins": 12.995377268110003, + "rewards/rejected": -8.785799980163574, + "step": 6193 + }, + { + "epoch": 0.5659205116491549, + "grad_norm": 2.859375, + "kl": 0.0, + "learning_rate": 3.985535556712984e-06, + "logits/chosen": 654421248.0, + "logits/rejected": 373700992.0, + "logps/chosen": -271.8733642578125, + "logps/rejected": -460.6088053385417, + "loss": 0.1412, + "rewards/chosen": 1.862986946105957, + "rewards/margins": 9.896751467386881, + "rewards/rejected": -8.033764521280924, + "step": 6194 + }, + { + "epoch": 0.5660118775696665, + "grad_norm": 4.6875, + "kl": 0.0, + "learning_rate": 3.984127703723295e-06, + "logits/chosen": 636875818.6666666, + "logits/rejected": 1026108825.6, + "logps/chosen": -363.0160319010417, + "logps/rejected": -499.31435546875, + "loss": 0.0282, + "rewards/chosen": 2.590928077697754, + "rewards/margins": 11.9301118850708, + "rewards/rejected": -9.339183807373047, + "step": 6195 + }, + { + "epoch": 0.5661032434901782, + "grad_norm": 5.5, + "kl": 0.0, + "learning_rate": 3.982719934736832e-06, + "logits/chosen": 401270229.3333333, + "logits/rejected": 476591206.4, + "logps/chosen": -287.7975667317708, + "logps/rejected": -369.7895751953125, + "loss": 0.027, + "rewards/chosen": 3.9317429860432944, + "rewards/margins": 11.27962277730306, + "rewards/rejected": -7.347879791259766, + "step": 6196 + }, + { + "epoch": 0.5661946094106898, + "grad_norm": 1.9921875, + "kl": 0.0, + "learning_rate": 3.981312249870006e-06, + "logits/chosen": 356786976.0, + "logits/rejected": 382420544.0, + "logps/chosen": -205.98651123046875, + "logps/rejected": -518.1630859375, + "loss": 0.0193, + "rewards/chosen": 3.983309268951416, + "rewards/margins": 15.33683729171753, + "rewards/rejected": -11.353528022766113, + "step": 6197 + }, + { + "epoch": 0.5662859753312015, + "grad_norm": 21.875, + "kl": 0.0, + "learning_rate": 3.979904649239217e-06, + "logits/chosen": 504901324.8, + "logits/rejected": 458562048.0, + "logps/chosen": -177.7876953125, + "logps/rejected": -497.3055013020833, + "loss": 0.1182, + "rewards/chosen": 2.0457923889160154, + "rewards/margins": 13.342160034179688, + "rewards/rejected": -11.296367645263672, + "step": 6198 + }, + { + "epoch": 0.5663773412517131, + "grad_norm": 2.6875, + "kl": 0.0, + "learning_rate": 3.978497132960864e-06, + "logits/chosen": 849136844.8, + "logits/rejected": 1303651072.0, + "logps/chosen": -461.369921875, + "logps/rejected": -578.3142903645834, + "loss": 0.018, + "rewards/chosen": 3.8368862152099608, + "rewards/margins": 15.4507625579834, + "rewards/rejected": -11.613876342773438, + "step": 6199 + }, + { + "epoch": 0.5664687071722248, + "grad_norm": 4.21875, + "kl": 2.5026702880859375, + "learning_rate": 3.977089701151333e-06, + "logits/chosen": 625164501.3333334, + "logits/rejected": 683002624.0, + "logps/chosen": -356.567138671875, + "logps/rejected": -588.0936889648438, + "loss": 0.0317, + "rewards/chosen": 4.014294624328613, + "rewards/margins": 13.292438507080078, + "rewards/rejected": -9.278143882751465, + "step": 6200 + }, + { + "epoch": 0.5665600730927364, + "grad_norm": 1.6796875, + "kl": 0.0, + "learning_rate": 3.975682353927006e-06, + "logits/chosen": 732812697.6, + "logits/rejected": 618815317.3333334, + "logps/chosen": -268.42734375, + "logps/rejected": -561.69091796875, + "loss": 0.0107, + "rewards/chosen": 4.511864471435547, + "rewards/margins": 15.342217254638673, + "rewards/rejected": -10.830352783203125, + "step": 6201 + }, + { + "epoch": 0.5666514390132481, + "grad_norm": 3.171875, + "kl": 0.0, + "learning_rate": 3.974275091404257e-06, + "logits/chosen": 459191500.8, + "logits/rejected": 466241792.0, + "logps/chosen": -332.51748046875, + "logps/rejected": -313.5614420572917, + "loss": 0.0197, + "rewards/chosen": 4.134413146972657, + "rewards/margins": 12.915024820963541, + "rewards/rejected": -8.780611673990885, + "step": 6202 + }, + { + "epoch": 0.5667428049337597, + "grad_norm": 19.625, + "kl": 0.0, + "learning_rate": 3.972867913699457e-06, + "logits/chosen": 497986602.6666667, + "logits/rejected": 386466713.6, + "logps/chosen": -334.7034505208333, + "logps/rejected": -514.29287109375, + "loss": 0.0223, + "rewards/chosen": 3.179403305053711, + "rewards/margins": 12.453438186645508, + "rewards/rejected": -9.274034881591797, + "step": 6203 + }, + { + "epoch": 0.5668341708542713, + "grad_norm": 3.921875, + "kl": 0.0, + "learning_rate": 3.971460820928962e-06, + "logits/chosen": 702221056.0, + "logits/rejected": 536408896.0, + "logps/chosen": -449.82830810546875, + "logps/rejected": -400.6640625, + "loss": 0.1177, + "rewards/chosen": 4.559063911437988, + "rewards/margins": 9.44145393371582, + "rewards/rejected": -4.882390022277832, + "step": 6204 + }, + { + "epoch": 0.566925536774783, + "grad_norm": 3.78125, + "kl": 0.0, + "learning_rate": 3.970053813209127e-06, + "logits/chosen": 672222464.0, + "logits/rejected": 551433088.0, + "logps/chosen": -325.54290771484375, + "logps/rejected": -457.2800598144531, + "loss": 0.0225, + "rewards/chosen": 3.2487473487854004, + "rewards/margins": 11.887463092803955, + "rewards/rejected": -8.638715744018555, + "step": 6205 + }, + { + "epoch": 0.5670169026952947, + "grad_norm": 27.25, + "kl": 0.0, + "learning_rate": 3.968646890656301e-06, + "logits/chosen": 354167594.6666667, + "logits/rejected": 472931008.0, + "logps/chosen": -277.3175048828125, + "logps/rejected": -559.252685546875, + "loss": 0.0488, + "rewards/chosen": 3.3912766774495444, + "rewards/margins": 12.192628224690756, + "rewards/rejected": -8.801351547241211, + "step": 6206 + }, + { + "epoch": 0.5671082686158063, + "grad_norm": 3.078125, + "kl": 0.0, + "learning_rate": 3.967240053386818e-06, + "logits/chosen": 553882624.0, + "logits/rejected": 288135705.6, + "logps/chosen": -315.09670003255206, + "logps/rejected": -335.648388671875, + "loss": 0.0155, + "rewards/chosen": 3.15461794535319, + "rewards/margins": 12.880988947550454, + "rewards/rejected": -9.726371002197265, + "step": 6207 + }, + { + "epoch": 0.5671996345363179, + "grad_norm": 1.9453125, + "kl": 0.0, + "learning_rate": 3.965833301517017e-06, + "logits/chosen": 504568640.0, + "logits/rejected": 569820480.0, + "logps/chosen": -294.79296875, + "logps/rejected": -359.0005187988281, + "loss": 0.0141, + "rewards/chosen": 3.703205108642578, + "rewards/margins": 11.9910888671875, + "rewards/rejected": -8.287883758544922, + "step": 6208 + }, + { + "epoch": 0.5672910004568296, + "grad_norm": 17.125, + "kl": 0.0, + "learning_rate": 3.964426635163219e-06, + "logits/chosen": 613595221.3333334, + "logits/rejected": 617440448.0, + "logps/chosen": -377.4024251302083, + "logps/rejected": -535.595458984375, + "loss": 0.0415, + "rewards/chosen": 3.6873722076416016, + "rewards/margins": 13.162612915039062, + "rewards/rejected": -9.475240707397461, + "step": 6209 + }, + { + "epoch": 0.5673823663773413, + "grad_norm": 1.1171875, + "kl": 0.0, + "learning_rate": 3.963020054441744e-06, + "logits/chosen": 657401216.0, + "logits/rejected": 300518080.0, + "logps/chosen": -350.135009765625, + "logps/rejected": -436.5501708984375, + "loss": 0.0082, + "rewards/chosen": 4.227832794189453, + "rewards/margins": 15.811988830566406, + "rewards/rejected": -11.584156036376953, + "step": 6210 + }, + { + "epoch": 0.5674737322978529, + "grad_norm": 4.09375, + "kl": 0.0, + "learning_rate": 3.961613559468901e-06, + "logits/chosen": 446606506.6666667, + "logits/rejected": 406655808.0, + "logps/chosen": -441.3463948567708, + "logps/rejected": -603.708984375, + "loss": 0.0215, + "rewards/chosen": 4.346386909484863, + "rewards/margins": 14.949322700500488, + "rewards/rejected": -10.602935791015625, + "step": 6211 + }, + { + "epoch": 0.5675650982183645, + "grad_norm": 1.4921875, + "kl": 0.0, + "learning_rate": 3.960207150360998e-06, + "logits/chosen": 464959520.0, + "logits/rejected": 690870336.0, + "logps/chosen": -301.5826721191406, + "logps/rejected": -396.123291015625, + "loss": 0.0081, + "rewards/chosen": 4.2328290939331055, + "rewards/margins": 12.2185640335083, + "rewards/rejected": -7.985734939575195, + "step": 6212 + }, + { + "epoch": 0.5676564641388762, + "grad_norm": 2.234375, + "kl": 0.0, + "learning_rate": 3.958800827234328e-06, + "logits/chosen": 564740505.6, + "logits/rejected": 846968405.3333334, + "logps/chosen": -248.7800537109375, + "logps/rejected": -689.2976888020834, + "loss": 0.0177, + "rewards/chosen": 3.9300426483154296, + "rewards/margins": 16.233407719930014, + "rewards/rejected": -12.303365071614584, + "step": 6213 + }, + { + "epoch": 0.5677478300593879, + "grad_norm": 1.515625, + "kl": 0.0, + "learning_rate": 3.957394590205185e-06, + "logits/chosen": 412310997.3333333, + "logits/rejected": 384134016.0, + "logps/chosen": -334.489013671875, + "logps/rejected": -273.63238525390625, + "loss": 0.0093, + "rewards/chosen": 4.749448776245117, + "rewards/margins": 12.801105499267578, + "rewards/rejected": -8.051656723022461, + "step": 6214 + }, + { + "epoch": 0.5678391959798995, + "grad_norm": 0.88671875, + "kl": 0.0, + "learning_rate": 3.955988439389848e-06, + "logits/chosen": 662257868.8, + "logits/rejected": 342799680.0, + "logps/chosen": -328.3533935546875, + "logps/rejected": -466.5787760416667, + "loss": 0.0057, + "rewards/chosen": 4.880982971191406, + "rewards/margins": 14.40679702758789, + "rewards/rejected": -9.525814056396484, + "step": 6215 + }, + { + "epoch": 0.5679305619004111, + "grad_norm": 4.03125, + "kl": 0.0, + "learning_rate": 3.954582374904593e-06, + "logits/chosen": 676254310.4, + "logits/rejected": 434622293.3333333, + "logps/chosen": -644.8091796875, + "logps/rejected": -612.923583984375, + "loss": 0.0229, + "rewards/chosen": 3.303740310668945, + "rewards/margins": 14.391872533162434, + "rewards/rejected": -11.08813222249349, + "step": 6216 + }, + { + "epoch": 0.5680219278209228, + "grad_norm": 2.234375, + "kl": 0.0, + "learning_rate": 3.9531763968656914e-06, + "logits/chosen": 411141845.3333333, + "logits/rejected": 509236326.4, + "logps/chosen": -283.61936442057294, + "logps/rejected": -540.02158203125, + "loss": 0.0123, + "rewards/chosen": 3.8318894704182944, + "rewards/margins": 12.343414433797202, + "rewards/rejected": -8.511524963378907, + "step": 6217 + }, + { + "epoch": 0.5681132937414345, + "grad_norm": 1.15625, + "kl": 0.0, + "learning_rate": 3.9517705053894014e-06, + "logits/chosen": 483125376.0, + "logits/rejected": 832047104.0, + "logps/chosen": -311.49237060546875, + "logps/rejected": -551.7738037109375, + "loss": 0.0048, + "rewards/chosen": 4.509889602661133, + "rewards/margins": 13.991254806518555, + "rewards/rejected": -9.481365203857422, + "step": 6218 + }, + { + "epoch": 0.5682046596619461, + "grad_norm": 94.0, + "kl": 0.0, + "learning_rate": 3.95036470059198e-06, + "logits/chosen": 888418133.3333334, + "logits/rejected": 545808512.0, + "logps/chosen": -344.2843424479167, + "logps/rejected": -593.2159423828125, + "loss": 0.106, + "rewards/chosen": 2.9764617284139, + "rewards/margins": 13.333061536153158, + "rewards/rejected": -10.356599807739258, + "step": 6219 + }, + { + "epoch": 0.5682960255824577, + "grad_norm": 3.375, + "kl": 0.0, + "learning_rate": 3.948958982589673e-06, + "logits/chosen": 639989504.0, + "logits/rejected": 545974442.6666666, + "logps/chosen": -409.3158203125, + "logps/rejected": -544.640625, + "loss": 0.0152, + "rewards/chosen": 4.208251190185547, + "rewards/margins": 13.375867716471355, + "rewards/rejected": -9.167616526285807, + "step": 6220 + }, + { + "epoch": 0.5683873915029694, + "grad_norm": 2.125, + "kl": 0.0, + "learning_rate": 3.947553351498719e-06, + "logits/chosen": 771163050.6666666, + "logits/rejected": 477135411.2, + "logps/chosen": -310.50958251953125, + "logps/rejected": -479.00087890625, + "loss": 0.011, + "rewards/chosen": 4.531556447347005, + "rewards/margins": 13.625632985432944, + "rewards/rejected": -9.094076538085938, + "step": 6221 + }, + { + "epoch": 0.5684787574234811, + "grad_norm": 0.7578125, + "kl": 0.0, + "learning_rate": 3.946147807435352e-06, + "logits/chosen": 515432960.0, + "logits/rejected": 316398848.0, + "logps/chosen": -444.57318115234375, + "logps/rejected": -426.15987723214283, + "loss": 0.0029, + "rewards/chosen": 3.8214111328125, + "rewards/margins": 13.383595057896205, + "rewards/rejected": -9.562183925083705, + "step": 6222 + }, + { + "epoch": 0.5685701233439927, + "grad_norm": 2.203125, + "kl": 0.9659252166748047, + "learning_rate": 3.944742350515797e-06, + "logits/chosen": 623497216.0, + "logits/rejected": 467945984.0, + "logps/chosen": -368.3568638392857, + "logps/rejected": -472.5275573730469, + "loss": 0.0182, + "rewards/chosen": 4.236727850777762, + "rewards/margins": 13.551386015755789, + "rewards/rejected": -9.314658164978027, + "step": 6223 + }, + { + "epoch": 0.5686614892645043, + "grad_norm": 3.390625, + "kl": 0.0, + "learning_rate": 3.9433369808562714e-06, + "logits/chosen": 575029452.8, + "logits/rejected": 906443946.6666666, + "logps/chosen": -279.797119140625, + "logps/rejected": -567.0040690104166, + "loss": 0.02, + "rewards/chosen": 4.048048400878907, + "rewards/margins": 15.553190612792969, + "rewards/rejected": -11.505142211914062, + "step": 6224 + }, + { + "epoch": 0.568752855185016, + "grad_norm": 2.15625, + "kl": 0.0, + "learning_rate": 3.941931698572988e-06, + "logits/chosen": 545510016.0, + "logits/rejected": 297887232.0, + "logps/chosen": -388.64996337890625, + "logps/rejected": -304.95001220703125, + "loss": 0.0161, + "rewards/chosen": 3.5455238819122314, + "rewards/margins": 11.638404130935669, + "rewards/rejected": -8.092880249023438, + "step": 6225 + }, + { + "epoch": 0.5688442211055277, + "grad_norm": 2.3125, + "kl": 0.0, + "learning_rate": 3.94052650378215e-06, + "logits/chosen": 470981802.6666667, + "logits/rejected": 240374323.2, + "logps/chosen": -252.59649658203125, + "logps/rejected": -345.5099853515625, + "loss": 0.0106, + "rewards/chosen": 4.380245844523112, + "rewards/margins": 12.335645167032878, + "rewards/rejected": -7.9553993225097654, + "step": 6226 + }, + { + "epoch": 0.5689355870260393, + "grad_norm": 2.296875, + "kl": 0.0, + "learning_rate": 3.9391213965999545e-06, + "logits/chosen": 789487257.6, + "logits/rejected": 647110144.0, + "logps/chosen": -229.355029296875, + "logps/rejected": -320.9017740885417, + "loss": 0.0189, + "rewards/chosen": 3.8120540618896483, + "rewards/margins": 11.867254002888998, + "rewards/rejected": -8.05519994099935, + "step": 6227 + }, + { + "epoch": 0.5690269529465509, + "grad_norm": 4.0, + "kl": 0.0, + "learning_rate": 3.937716377142589e-06, + "logits/chosen": 389403264.0, + "logits/rejected": 360057856.0, + "logps/chosen": -201.11656494140624, + "logps/rejected": -438.3313802083333, + "loss": 0.0357, + "rewards/chosen": 3.225579833984375, + "rewards/margins": 12.60957997639974, + "rewards/rejected": -9.384000142415365, + "step": 6228 + }, + { + "epoch": 0.5691183188670625, + "grad_norm": 19.75, + "kl": 0.0, + "learning_rate": 3.936311445526238e-06, + "logits/chosen": 531733248.0, + "logits/rejected": 307458453.3333333, + "logps/chosen": -308.3669921875, + "logps/rejected": -353.4141031901042, + "loss": 0.037, + "rewards/chosen": 3.2467578887939452, + "rewards/margins": 11.122313563028971, + "rewards/rejected": -7.875555674235026, + "step": 6229 + }, + { + "epoch": 0.5692096847875743, + "grad_norm": 3.4375, + "kl": 0.0, + "learning_rate": 3.934906601867074e-06, + "logits/chosen": 629556160.0, + "logits/rejected": 687267968.0, + "logps/chosen": -332.4442138671875, + "logps/rejected": -461.837158203125, + "loss": 0.0218, + "rewards/chosen": 3.5047812461853027, + "rewards/margins": 11.74474573135376, + "rewards/rejected": -8.239964485168457, + "step": 6230 + }, + { + "epoch": 0.5693010507080859, + "grad_norm": 4.25, + "kl": 0.0, + "learning_rate": 3.9335018462812664e-06, + "logits/chosen": 365496320.0, + "logits/rejected": 468727193.6, + "logps/chosen": -238.53243001302084, + "logps/rejected": -503.859130859375, + "loss": 0.024, + "rewards/chosen": 3.609644571940104, + "rewards/margins": 13.705465189615884, + "rewards/rejected": -10.09582061767578, + "step": 6231 + }, + { + "epoch": 0.5693924166285975, + "grad_norm": 5.59375, + "kl": 0.0, + "learning_rate": 3.932097178884975e-06, + "logits/chosen": 569093952.0, + "logits/rejected": 490376000.0, + "logps/chosen": -281.34271240234375, + "logps/rejected": -379.6197509765625, + "loss": 0.0299, + "rewards/chosen": 3.277576446533203, + "rewards/margins": 13.170243263244629, + "rewards/rejected": -9.892666816711426, + "step": 6232 + }, + { + "epoch": 0.5694837825491091, + "grad_norm": 2.984375, + "kl": 0.0, + "learning_rate": 3.930692599794352e-06, + "logits/chosen": 437982412.8, + "logits/rejected": 343371562.6666667, + "logps/chosen": -213.298974609375, + "logps/rejected": -439.076171875, + "loss": 0.0266, + "rewards/chosen": 3.2784988403320314, + "rewards/margins": 12.2179386138916, + "rewards/rejected": -8.93943977355957, + "step": 6233 + }, + { + "epoch": 0.5695751484696209, + "grad_norm": 13.5625, + "kl": 0.0, + "learning_rate": 3.9292881091255444e-06, + "logits/chosen": 939685312.0, + "logits/rejected": 554928810.6666666, + "logps/chosen": -421.5832214355469, + "logps/rejected": -569.4937337239584, + "loss": 0.0128, + "rewards/chosen": 4.271255016326904, + "rewards/margins": 12.997217337290445, + "rewards/rejected": -8.725962320963541, + "step": 6234 + }, + { + "epoch": 0.5696665143901325, + "grad_norm": 7.46875, + "kl": 0.0, + "learning_rate": 3.927883706994689e-06, + "logits/chosen": 436467404.8, + "logits/rejected": 424989952.0, + "logps/chosen": -288.032275390625, + "logps/rejected": -585.5120442708334, + "loss": 0.0622, + "rewards/chosen": 2.443172836303711, + "rewards/margins": 12.277073287963868, + "rewards/rejected": -9.833900451660156, + "step": 6235 + }, + { + "epoch": 0.5697578803106441, + "grad_norm": 1.8125, + "kl": 0.0, + "learning_rate": 3.926479393517919e-06, + "logits/chosen": 2906519296.0, + "logits/rejected": 642809929.1428572, + "logps/chosen": -488.33245849609375, + "logps/rejected": -531.8991001674107, + "loss": 0.0056, + "rewards/chosen": 3.085772752761841, + "rewards/margins": 12.586628334862846, + "rewards/rejected": -9.500855582101005, + "step": 6236 + }, + { + "epoch": 0.5698492462311557, + "grad_norm": 1.234375, + "kl": 0.0, + "learning_rate": 3.925075168811355e-06, + "logits/chosen": 702669098.6666666, + "logits/rejected": 395512268.8, + "logps/chosen": -348.4892578125, + "logps/rejected": -445.5875, + "loss": 0.0052, + "rewards/chosen": 4.869256973266602, + "rewards/margins": 13.48789710998535, + "rewards/rejected": -8.61864013671875, + "step": 6237 + }, + { + "epoch": 0.5699406121516675, + "grad_norm": 4.40625, + "kl": 0.0, + "learning_rate": 3.923671032991117e-06, + "logits/chosen": 892092672.0, + "logits/rejected": 1220104601.6, + "logps/chosen": -473.9144694010417, + "logps/rejected": -664.5595703125, + "loss": 0.018, + "rewards/chosen": 3.8924783070882163, + "rewards/margins": 13.130509312947591, + "rewards/rejected": -9.238031005859375, + "step": 6238 + }, + { + "epoch": 0.5700319780721791, + "grad_norm": 2.234375, + "kl": 0.0, + "learning_rate": 3.92226698617331e-06, + "logits/chosen": 732759424.0, + "logits/rejected": 904061568.0, + "logps/chosen": -238.96986389160156, + "logps/rejected": -483.832763671875, + "loss": 0.0135, + "rewards/chosen": 4.162359237670898, + "rewards/margins": 12.929940223693848, + "rewards/rejected": -8.76758098602295, + "step": 6239 + }, + { + "epoch": 0.5701233439926907, + "grad_norm": 2.6875, + "kl": 0.0, + "learning_rate": 3.920863028474039e-06, + "logits/chosen": 370365376.0, + "logits/rejected": 874766592.0, + "logps/chosen": -205.1962890625, + "logps/rejected": -469.58929443359375, + "loss": 0.0159, + "rewards/chosen": 3.63234281539917, + "rewards/margins": 13.758091449737549, + "rewards/rejected": -10.125748634338379, + "step": 6240 + }, + { + "epoch": 0.5702147099132023, + "grad_norm": 3.28125, + "kl": 0.0, + "learning_rate": 3.9194591600093966e-06, + "logits/chosen": 403742464.0, + "logits/rejected": 372615360.0, + "logps/chosen": -396.2331949869792, + "logps/rejected": -518.1109619140625, + "loss": 0.0189, + "rewards/chosen": 4.15744145711263, + "rewards/margins": 15.263337453206379, + "rewards/rejected": -11.10589599609375, + "step": 6241 + }, + { + "epoch": 0.5703060758337141, + "grad_norm": 5.4375, + "kl": 0.0, + "learning_rate": 3.9180553808954715e-06, + "logits/chosen": 1095112704.0, + "logits/rejected": 598417664.0, + "logps/chosen": -254.27926635742188, + "logps/rejected": -509.93951416015625, + "loss": 0.0392, + "rewards/chosen": 2.7182087898254395, + "rewards/margins": 11.870906352996826, + "rewards/rejected": -9.152697563171387, + "step": 6242 + }, + { + "epoch": 0.5703974417542257, + "grad_norm": 6.71875, + "kl": 0.0, + "learning_rate": 3.91665169124834e-06, + "logits/chosen": 480259712.0, + "logits/rejected": 731555328.0, + "logps/chosen": -314.66961669921875, + "logps/rejected": -685.4136962890625, + "loss": 0.1022, + "rewards/chosen": 4.106610298156738, + "rewards/margins": 14.897050857543945, + "rewards/rejected": -10.790440559387207, + "step": 6243 + }, + { + "epoch": 0.5704888076747373, + "grad_norm": 1.9296875, + "kl": 0.0, + "learning_rate": 3.9152480911840775e-06, + "logits/chosen": 630260309.3333334, + "logits/rejected": 716205004.8, + "logps/chosen": -227.93697102864584, + "logps/rejected": -484.188427734375, + "loss": 0.0087, + "rewards/chosen": 4.1126454671223955, + "rewards/margins": 12.627696736653647, + "rewards/rejected": -8.51505126953125, + "step": 6244 + }, + { + "epoch": 0.5705801735952489, + "grad_norm": 6.09375, + "kl": 0.0, + "learning_rate": 3.913844580818747e-06, + "logits/chosen": 363661653.3333333, + "logits/rejected": 482604352.0, + "logps/chosen": -298.6663411458333, + "logps/rejected": -581.7671508789062, + "loss": 0.0399, + "rewards/chosen": 3.4160989125569663, + "rewards/margins": 10.351266702016195, + "rewards/rejected": -6.9351677894592285, + "step": 6245 + }, + { + "epoch": 0.5706715395157607, + "grad_norm": 5.15625, + "kl": 0.0, + "learning_rate": 3.912441160268407e-06, + "logits/chosen": 465429452.8, + "logits/rejected": 398498517.3333333, + "logps/chosen": -254.883251953125, + "logps/rejected": -240.36433919270834, + "loss": 0.0283, + "rewards/chosen": 3.940041351318359, + "rewards/margins": 11.336515553792317, + "rewards/rejected": -7.396474202473958, + "step": 6246 + }, + { + "epoch": 0.5707629054362723, + "grad_norm": 4.1875, + "kl": 0.0, + "learning_rate": 3.911037829649108e-06, + "logits/chosen": 403133376.0, + "logits/rejected": 493242752.0, + "logps/chosen": -224.8973388671875, + "logps/rejected": -475.22021484375, + "loss": 0.1278, + "rewards/chosen": 4.467333793640137, + "rewards/margins": 10.55690050125122, + "rewards/rejected": -6.089566707611084, + "step": 6247 + }, + { + "epoch": 0.5708542713567839, + "grad_norm": 0.40625, + "kl": 0.0, + "learning_rate": 3.909634589076889e-06, + "logits/chosen": 583213781.3333334, + "logits/rejected": 435240038.4, + "logps/chosen": -214.9763387044271, + "logps/rejected": -382.243310546875, + "loss": 0.0027, + "rewards/chosen": 4.990744272867839, + "rewards/margins": 13.729779307047526, + "rewards/rejected": -8.739035034179688, + "step": 6248 + }, + { + "epoch": 0.5709456372772955, + "grad_norm": 0.1953125, + "kl": 0.0, + "learning_rate": 3.908231438667786e-06, + "logits/chosen": 301746346.6666667, + "logits/rejected": 374091724.8, + "logps/chosen": -308.8314208984375, + "logps/rejected": -362.969091796875, + "loss": 0.001, + "rewards/chosen": 6.088327407836914, + "rewards/margins": 15.193098068237305, + "rewards/rejected": -9.10477066040039, + "step": 6249 + }, + { + "epoch": 0.5710370031978073, + "grad_norm": 0.90625, + "kl": 0.0, + "learning_rate": 3.906828378537829e-06, + "logits/chosen": 742807210.6666666, + "logits/rejected": 385031475.2, + "logps/chosen": -447.6344807942708, + "logps/rejected": -431.252734375, + "loss": 0.0046, + "rewards/chosen": 4.553198496500651, + "rewards/margins": 14.1978754679362, + "rewards/rejected": -9.644676971435548, + "step": 6250 + }, + { + "epoch": 0.5711283691183189, + "grad_norm": 2.875, + "kl": 0.0, + "learning_rate": 3.905425408803036e-06, + "logits/chosen": 612776960.0, + "logits/rejected": 379618240.0, + "logps/chosen": -401.5544738769531, + "logps/rejected": -384.0733947753906, + "loss": 0.0185, + "rewards/chosen": 3.407578706741333, + "rewards/margins": 13.100987672805786, + "rewards/rejected": -9.693408966064453, + "step": 6251 + }, + { + "epoch": 0.5712197350388305, + "grad_norm": 2.234375, + "kl": 0.0, + "learning_rate": 3.9040225295794215e-06, + "logits/chosen": 318944800.0, + "logits/rejected": 410523264.0, + "logps/chosen": -194.04220581054688, + "logps/rejected": -472.88861083984375, + "loss": 0.0173, + "rewards/chosen": 3.5409960746765137, + "rewards/margins": 13.971644878387451, + "rewards/rejected": -10.430648803710938, + "step": 6252 + }, + { + "epoch": 0.5713111009593421, + "grad_norm": 3.453125, + "kl": 0.0, + "learning_rate": 3.902619740982988e-06, + "logits/chosen": 515154432.0, + "logits/rejected": 265512000.0, + "logps/chosen": -298.078857421875, + "logps/rejected": -541.453857421875, + "loss": 0.0319, + "rewards/chosen": 3.580500284830729, + "rewards/margins": 13.043504397074381, + "rewards/rejected": -9.463004112243652, + "step": 6253 + }, + { + "epoch": 0.5714024668798539, + "grad_norm": 0.7578125, + "kl": 0.0, + "learning_rate": 3.901217043129735e-06, + "logits/chosen": 404984149.3333333, + "logits/rejected": 419563238.4, + "logps/chosen": -290.21923828125, + "logps/rejected": -497.3818359375, + "loss": 0.0034, + "rewards/chosen": 4.90141232808431, + "rewards/margins": 13.572882207234699, + "rewards/rejected": -8.67146987915039, + "step": 6254 + }, + { + "epoch": 0.5714938328003655, + "grad_norm": 2.34375, + "kl": 0.0, + "learning_rate": 3.899814436135653e-06, + "logits/chosen": 539103129.6, + "logits/rejected": 778385066.6666666, + "logps/chosen": -333.5329833984375, + "logps/rejected": -582.52001953125, + "loss": 0.0155, + "rewards/chosen": 4.164092636108398, + "rewards/margins": 14.93643913269043, + "rewards/rejected": -10.772346496582031, + "step": 6255 + }, + { + "epoch": 0.5715851987208771, + "grad_norm": 3.984375, + "kl": 0.0, + "learning_rate": 3.898411920116722e-06, + "logits/chosen": 279188070.4, + "logits/rejected": 404384426.6666667, + "logps/chosen": -285.0154296875, + "logps/rejected": -272.3251546223958, + "loss": 0.0235, + "rewards/chosen": 4.111320495605469, + "rewards/margins": 11.71991793314616, + "rewards/rejected": -7.60859743754069, + "step": 6256 + }, + { + "epoch": 0.5716765646413887, + "grad_norm": 2.34375, + "kl": 0.0, + "learning_rate": 3.897009495188919e-06, + "logits/chosen": 556428928.0, + "logits/rejected": 475034752.0, + "logps/chosen": -300.5391845703125, + "logps/rejected": -478.976806640625, + "loss": 0.0128, + "rewards/chosen": 3.9343180656433105, + "rewards/margins": 12.741078853607178, + "rewards/rejected": -8.806760787963867, + "step": 6257 + }, + { + "epoch": 0.5717679305619005, + "grad_norm": 30.0, + "kl": 0.0, + "learning_rate": 3.8956071614682115e-06, + "logits/chosen": 336125900.8, + "logits/rejected": 346011264.0, + "logps/chosen": -266.8949951171875, + "logps/rejected": -406.9586588541667, + "loss": 0.0621, + "rewards/chosen": 4.8149150848388675, + "rewards/margins": 11.48314069112142, + "rewards/rejected": -6.668225606282552, + "step": 6258 + }, + { + "epoch": 0.5718592964824121, + "grad_norm": 1.1484375, + "kl": 0.0, + "learning_rate": 3.89420491907056e-06, + "logits/chosen": 466264096.0, + "logits/rejected": 481950061.71428573, + "logps/chosen": -213.60205078125, + "logps/rejected": -520.974609375, + "loss": 0.0051, + "rewards/chosen": 3.2262344360351562, + "rewards/margins": 11.924320765904017, + "rewards/rejected": -8.698086329868861, + "step": 6259 + }, + { + "epoch": 0.5719506624029237, + "grad_norm": 6.09375, + "kl": 0.0, + "learning_rate": 3.892802768111915e-06, + "logits/chosen": 450367897.6, + "logits/rejected": 649634560.0, + "logps/chosen": -237.0451416015625, + "logps/rejected": -552.3138834635416, + "loss": 0.0505, + "rewards/chosen": 2.7478302001953123, + "rewards/margins": 13.275807189941407, + "rewards/rejected": -10.527976989746094, + "step": 6260 + }, + { + "epoch": 0.5720420283234353, + "grad_norm": 3.453125, + "kl": 0.0, + "learning_rate": 3.891400708708224e-06, + "logits/chosen": 466756650.6666667, + "logits/rejected": 367201056.0, + "logps/chosen": -448.3367919921875, + "logps/rejected": -552.6834716796875, + "loss": 0.0224, + "rewards/chosen": 3.8657522201538086, + "rewards/margins": 15.880860328674316, + "rewards/rejected": -12.015108108520508, + "step": 6261 + }, + { + "epoch": 0.572133394243947, + "grad_norm": 2.140625, + "kl": 0.0, + "learning_rate": 3.889998740975421e-06, + "logits/chosen": 519298880.0, + "logits/rejected": 643359488.0, + "logps/chosen": -354.21795654296875, + "logps/rejected": -634.1757202148438, + "loss": 0.0146, + "rewards/chosen": 3.545179843902588, + "rewards/margins": 14.434113025665283, + "rewards/rejected": -10.888933181762695, + "step": 6262 + }, + { + "epoch": 0.5722247601644587, + "grad_norm": 4.3125, + "kl": 0.0, + "learning_rate": 3.888596865029441e-06, + "logits/chosen": 701653760.0, + "logits/rejected": 557261888.0, + "logps/chosen": -362.6921081542969, + "logps/rejected": -495.6624755859375, + "loss": 0.0241, + "rewards/chosen": 3.547280788421631, + "rewards/margins": 12.904284954071045, + "rewards/rejected": -9.357004165649414, + "step": 6263 + }, + { + "epoch": 0.5723161260849703, + "grad_norm": 0.90234375, + "kl": 0.0, + "learning_rate": 3.887195080986199e-06, + "logits/chosen": 599833344.0, + "logits/rejected": 461717290.6666667, + "logps/chosen": -340.47021484375, + "logps/rejected": -538.2052001953125, + "loss": 0.0037, + "rewards/chosen": 4.323707580566406, + "rewards/margins": 13.623463948567709, + "rewards/rejected": -9.299756368001303, + "step": 6264 + }, + { + "epoch": 0.5724074920054819, + "grad_norm": 1.6484375, + "kl": 0.0, + "learning_rate": 3.8857933889616165e-06, + "logits/chosen": 479632448.0, + "logits/rejected": 662295680.0, + "logps/chosen": -225.4461212158203, + "logps/rejected": -413.7200927734375, + "loss": 0.0121, + "rewards/chosen": 3.9854683876037598, + "rewards/margins": 12.84070348739624, + "rewards/rejected": -8.85523509979248, + "step": 6265 + }, + { + "epoch": 0.5724988579259936, + "grad_norm": 4.09375, + "kl": 0.0, + "learning_rate": 3.884391789071597e-06, + "logits/chosen": 506053205.3333333, + "logits/rejected": 383298457.6, + "logps/chosen": -382.4024251302083, + "logps/rejected": -503.9474609375, + "loss": 0.0239, + "rewards/chosen": 2.8835347493489585, + "rewards/margins": 11.98082021077474, + "rewards/rejected": -9.09728546142578, + "step": 6266 + }, + { + "epoch": 0.5725902238465053, + "grad_norm": 2.0625, + "kl": 0.0, + "learning_rate": 3.882990281432039e-06, + "logits/chosen": 579462336.0, + "logits/rejected": 632344832.0, + "logps/chosen": -351.5111389160156, + "logps/rejected": -594.5419311523438, + "loss": 0.015, + "rewards/chosen": 3.53228497505188, + "rewards/margins": 11.65662407875061, + "rewards/rejected": -8.12433910369873, + "step": 6267 + }, + { + "epoch": 0.5726815897670169, + "grad_norm": 1.8828125, + "kl": 0.0, + "learning_rate": 3.881588866158837e-06, + "logits/chosen": 902751424.0, + "logits/rejected": 358261120.0, + "logps/chosen": -743.826171875, + "logps/rejected": -408.1161295572917, + "loss": 0.0071, + "rewards/chosen": 3.563015937805176, + "rewards/margins": 13.3712797164917, + "rewards/rejected": -9.808263778686523, + "step": 6268 + }, + { + "epoch": 0.5727729556875285, + "grad_norm": 0.54296875, + "kl": 0.0, + "learning_rate": 3.8801875433678714e-06, + "logits/chosen": 617146794.6666666, + "logits/rejected": 854556364.8, + "logps/chosen": -428.6056315104167, + "logps/rejected": -388.540869140625, + "loss": 0.0031, + "rewards/chosen": 4.856921513875325, + "rewards/margins": 14.083181126912436, + "rewards/rejected": -9.22625961303711, + "step": 6269 + }, + { + "epoch": 0.5728643216080402, + "grad_norm": 14.4375, + "kl": 0.0, + "learning_rate": 3.878786313175022e-06, + "logits/chosen": 483692160.0, + "logits/rejected": 365170517.3333333, + "logps/chosen": -429.946044921875, + "logps/rejected": -395.3601888020833, + "loss": 0.0232, + "rewards/chosen": 2.814596652984619, + "rewards/margins": 10.763317584991455, + "rewards/rejected": -7.948720932006836, + "step": 6270 + }, + { + "epoch": 0.5729556875285519, + "grad_norm": 2.96875, + "kl": 0.0, + "learning_rate": 3.877385175696156e-06, + "logits/chosen": 773304627.2, + "logits/rejected": 731106133.3333334, + "logps/chosen": -243.2325439453125, + "logps/rejected": -624.5579427083334, + "loss": 0.0172, + "rewards/chosen": 3.999886322021484, + "rewards/margins": 14.126359430948892, + "rewards/rejected": -10.126473108927408, + "step": 6271 + }, + { + "epoch": 0.5730470534490635, + "grad_norm": 1.0078125, + "kl": 0.0, + "learning_rate": 3.875984131047135e-06, + "logits/chosen": 795609344.0, + "logits/rejected": 560170368.0, + "logps/chosen": -446.1300964355469, + "logps/rejected": -618.3009847005209, + "loss": 0.0045, + "rewards/chosen": 4.212039470672607, + "rewards/margins": 13.089257081349691, + "rewards/rejected": -8.877217610677084, + "step": 6272 + }, + { + "epoch": 0.5731384193695751, + "grad_norm": 25.125, + "kl": 0.0, + "learning_rate": 3.87458317934381e-06, + "logits/chosen": 935353429.3333334, + "logits/rejected": 540894003.2, + "logps/chosen": -182.95161946614584, + "logps/rejected": -477.34052734375, + "loss": 0.023, + "rewards/chosen": 4.817087809244792, + "rewards/margins": 11.956791178385416, + "rewards/rejected": -7.139703369140625, + "step": 6273 + }, + { + "epoch": 0.5732297852900868, + "grad_norm": 2.28125, + "kl": 0.0, + "learning_rate": 3.873182320702033e-06, + "logits/chosen": 536894208.0, + "logits/rejected": 593675366.4, + "logps/chosen": -386.3465576171875, + "logps/rejected": -603.38115234375, + "loss": 0.0128, + "rewards/chosen": 3.5287863413492837, + "rewards/margins": 13.29328753153483, + "rewards/rejected": -9.764501190185547, + "step": 6274 + }, + { + "epoch": 0.5733211512105985, + "grad_norm": 1.640625, + "kl": 0.0, + "learning_rate": 3.871781555237636e-06, + "logits/chosen": 459690410.6666667, + "logits/rejected": 383757260.8, + "logps/chosen": -404.3319498697917, + "logps/rejected": -450.2044921875, + "loss": 0.0077, + "rewards/chosen": 4.0049082438151045, + "rewards/margins": 14.012369791666668, + "rewards/rejected": -10.007461547851562, + "step": 6275 + }, + { + "epoch": 0.5734125171311101, + "grad_norm": 1.46875, + "kl": 0.0, + "learning_rate": 3.870380883066451e-06, + "logits/chosen": 581907392.0, + "logits/rejected": 359799392.0, + "logps/chosen": -303.7216491699219, + "logps/rejected": -413.92730712890625, + "loss": 0.1139, + "rewards/chosen": 4.562639236450195, + "rewards/margins": 14.382499694824219, + "rewards/rejected": -9.819860458374023, + "step": 6276 + }, + { + "epoch": 0.5735038830516217, + "grad_norm": 3.578125, + "kl": 0.0, + "learning_rate": 3.8689803043043e-06, + "logits/chosen": 423510912.0, + "logits/rejected": 534148288.0, + "logps/chosen": -315.5509948730469, + "logps/rejected": -615.457763671875, + "loss": 0.0203, + "rewards/chosen": 4.1758880615234375, + "rewards/margins": 14.718403816223145, + "rewards/rejected": -10.542515754699707, + "step": 6277 + }, + { + "epoch": 0.5735952489721334, + "grad_norm": 3.515625, + "kl": 0.0, + "learning_rate": 3.867579819066999e-06, + "logits/chosen": 669219669.3333334, + "logits/rejected": 886266265.6, + "logps/chosen": -387.9241536458333, + "logps/rejected": -551.6712890625, + "loss": 0.0182, + "rewards/chosen": 3.764397303263346, + "rewards/margins": 12.990955225626626, + "rewards/rejected": -9.22655792236328, + "step": 6278 + }, + { + "epoch": 0.573686614892645, + "grad_norm": 3.375, + "kl": 0.0, + "learning_rate": 3.866179427470355e-06, + "logits/chosen": 1065978880.0, + "logits/rejected": 533351808.0, + "logps/chosen": -428.998583984375, + "logps/rejected": -678.1811930338541, + "loss": 0.0245, + "rewards/chosen": 3.3071449279785154, + "rewards/margins": 14.31319376627604, + "rewards/rejected": -11.006048838297525, + "step": 6279 + }, + { + "epoch": 0.5737779808131567, + "grad_norm": 2.078125, + "kl": 0.0, + "learning_rate": 3.864779129630168e-06, + "logits/chosen": 721597312.0, + "logits/rejected": 781818496.0, + "logps/chosen": -405.3447265625, + "logps/rejected": -476.307861328125, + "loss": 0.0126, + "rewards/chosen": 3.9775586128234863, + "rewards/margins": 12.559595584869385, + "rewards/rejected": -8.582036972045898, + "step": 6280 + }, + { + "epoch": 0.5738693467336683, + "grad_norm": 1.1015625, + "kl": 0.0, + "learning_rate": 3.863378925662228e-06, + "logits/chosen": 358405920.0, + "logits/rejected": 516829269.3333333, + "logps/chosen": -100.38661193847656, + "logps/rejected": -475.9195963541667, + "loss": 0.0114, + "rewards/chosen": 3.1392934322357178, + "rewards/margins": 14.271643082300821, + "rewards/rejected": -11.132349650065104, + "step": 6281 + }, + { + "epoch": 0.57396071265418, + "grad_norm": 5.96875, + "kl": 0.0, + "learning_rate": 3.86197881568232e-06, + "logits/chosen": 760649625.6, + "logits/rejected": 421685802.6666667, + "logps/chosen": -326.518701171875, + "logps/rejected": -368.5579020182292, + "loss": 0.046, + "rewards/chosen": 3.012096405029297, + "rewards/margins": 10.705935160319012, + "rewards/rejected": -7.693838755289714, + "step": 6282 + }, + { + "epoch": 0.5740520785746916, + "grad_norm": 4.09375, + "kl": 0.0, + "learning_rate": 3.860578799806219e-06, + "logits/chosen": 520188160.0, + "logits/rejected": 525451584.0, + "logps/chosen": -369.179931640625, + "logps/rejected": -472.6813659667969, + "loss": 0.0213, + "rewards/chosen": 3.5874452590942383, + "rewards/margins": 11.75444507598877, + "rewards/rejected": -8.166999816894531, + "step": 6283 + }, + { + "epoch": 0.5741434444952033, + "grad_norm": 3.171875, + "kl": 0.0, + "learning_rate": 3.859178878149696e-06, + "logits/chosen": 285985770.6666667, + "logits/rejected": 579961984.0, + "logps/chosen": -250.88826497395834, + "logps/rejected": -435.43829345703125, + "loss": 0.0423, + "rewards/chosen": 3.555262565612793, + "rewards/margins": 12.174273490905762, + "rewards/rejected": -8.619010925292969, + "step": 6284 + }, + { + "epoch": 0.5742348104157149, + "grad_norm": 6.0, + "kl": 0.0, + "learning_rate": 3.857779050828509e-06, + "logits/chosen": 342095323.4285714, + "logits/rejected": 165501136.0, + "logps/chosen": -249.246826171875, + "logps/rejected": -426.12506103515625, + "loss": 0.165, + "rewards/chosen": 2.2599735260009766, + "rewards/margins": 16.757554054260254, + "rewards/rejected": -14.497580528259277, + "step": 6285 + }, + { + "epoch": 0.5743261763362266, + "grad_norm": 2.328125, + "kl": 0.0, + "learning_rate": 3.856379317958412e-06, + "logits/chosen": 926017024.0, + "logits/rejected": 331745248.0, + "logps/chosen": -305.9638671875, + "logps/rejected": -375.0209655761719, + "loss": 0.015, + "rewards/chosen": 3.8005051612854004, + "rewards/margins": 12.380007266998291, + "rewards/rejected": -8.57950210571289, + "step": 6286 + }, + { + "epoch": 0.5744175422567382, + "grad_norm": 3.859375, + "kl": 0.0, + "learning_rate": 3.8549796796551495e-06, + "logits/chosen": 614266112.0, + "logits/rejected": 496833408.0, + "logps/chosen": -224.72391764322916, + "logps/rejected": -602.28173828125, + "loss": 0.0341, + "rewards/chosen": 3.555147171020508, + "rewards/margins": 13.147154808044434, + "rewards/rejected": -9.592007637023926, + "step": 6287 + }, + { + "epoch": 0.5745089081772499, + "grad_norm": 1.4296875, + "kl": 0.0, + "learning_rate": 3.853580136034458e-06, + "logits/chosen": 516994112.0, + "logits/rejected": 530527573.3333333, + "logps/chosen": -220.70361328125, + "logps/rejected": -567.3730875651041, + "loss": 0.008, + "rewards/chosen": 3.480693817138672, + "rewards/margins": 14.345696767171225, + "rewards/rejected": -10.865002950032553, + "step": 6288 + }, + { + "epoch": 0.5746002740977615, + "grad_norm": 1.484375, + "kl": 0.0, + "learning_rate": 3.852180687212068e-06, + "logits/chosen": 532087840.0, + "logits/rejected": 375100768.0, + "logps/chosen": -220.7096405029297, + "logps/rejected": -337.02655029296875, + "loss": 0.0105, + "rewards/chosen": 4.43044900894165, + "rewards/margins": 12.716506481170654, + "rewards/rejected": -8.286057472229004, + "step": 6289 + }, + { + "epoch": 0.5746916400182732, + "grad_norm": 1.609375, + "kl": 0.0, + "learning_rate": 3.850781333303701e-06, + "logits/chosen": 520360640.0, + "logits/rejected": 448924586.6666667, + "logps/chosen": -365.87518310546875, + "logps/rejected": -643.646240234375, + "loss": 0.0078, + "rewards/chosen": 3.4432740211486816, + "rewards/margins": 15.801708062489828, + "rewards/rejected": -12.358434041341146, + "step": 6290 + }, + { + "epoch": 0.5747830059387848, + "grad_norm": 2.359375, + "kl": 0.0, + "learning_rate": 3.849382074425069e-06, + "logits/chosen": 532622592.0, + "logits/rejected": 531175680.0, + "logps/chosen": -421.51552734375, + "logps/rejected": -488.9473876953125, + "loss": 0.0222, + "rewards/chosen": 4.353214263916016, + "rewards/margins": 14.201653798421225, + "rewards/rejected": -9.848439534505209, + "step": 6291 + }, + { + "epoch": 0.5748743718592965, + "grad_norm": 2.125, + "kl": 0.0, + "learning_rate": 3.847982910691878e-06, + "logits/chosen": 712771584.0, + "logits/rejected": 534564761.6, + "logps/chosen": -535.123291015625, + "logps/rejected": -563.428125, + "loss": 0.0105, + "rewards/chosen": 3.844493865966797, + "rewards/margins": 12.819878387451173, + "rewards/rejected": -8.975384521484376, + "step": 6292 + }, + { + "epoch": 0.5749657377798081, + "grad_norm": 3.625, + "kl": 0.0, + "learning_rate": 3.846583842219828e-06, + "logits/chosen": 463070762.6666667, + "logits/rejected": 400897740.8, + "logps/chosen": -221.74922688802084, + "logps/rejected": -408.8454345703125, + "loss": 0.0239, + "rewards/chosen": 2.773513158162435, + "rewards/margins": 11.777860387166342, + "rewards/rejected": -9.004347229003907, + "step": 6293 + }, + { + "epoch": 0.5750571037003198, + "grad_norm": 1.6015625, + "kl": 0.0, + "learning_rate": 3.845184869124606e-06, + "logits/rejected": 399969536.0, + "logps/rejected": -523.2105102539062, + "loss": 0.002, + "rewards/rejected": -9.207281112670898, + "step": 6294 + }, + { + "epoch": 0.5751484696208314, + "grad_norm": 4.78125, + "kl": 0.0, + "learning_rate": 3.843785991521898e-06, + "logits/chosen": 1097798656.0, + "logits/rejected": 1352748288.0, + "logps/chosen": -451.53388671875, + "logps/rejected": -765.3740234375, + "loss": 0.0195, + "rewards/chosen": 3.8520103454589845, + "rewards/margins": 15.363896942138672, + "rewards/rejected": -11.511886596679688, + "step": 6295 + }, + { + "epoch": 0.5752398355413431, + "grad_norm": 9.75, + "kl": 0.0, + "learning_rate": 3.842387209527374e-06, + "logits/chosen": 468552832.0, + "logits/rejected": 330288384.0, + "logps/chosen": -184.6656494140625, + "logps/rejected": -507.4017639160156, + "loss": 0.1332, + "rewards/chosen": 3.4769585927327475, + "rewards/margins": 16.57095177968343, + "rewards/rejected": -13.093993186950684, + "step": 6296 + }, + { + "epoch": 0.5753312014618547, + "grad_norm": 1.8203125, + "kl": 0.0, + "learning_rate": 3.840988523256704e-06, + "logits/chosen": 617753130.6666666, + "logits/rejected": 488227840.0, + "logps/chosen": -377.409423828125, + "logps/rejected": -503.471923828125, + "loss": 0.0108, + "rewards/chosen": 3.6078786849975586, + "rewards/margins": 13.425162315368652, + "rewards/rejected": -9.817283630371094, + "step": 6297 + }, + { + "epoch": 0.5754225673823664, + "grad_norm": 5.21875, + "kl": 0.0, + "learning_rate": 3.839589932825544e-06, + "logits/chosen": 834110378.6666666, + "logits/rejected": 769942528.0, + "logps/chosen": -380.9635416666667, + "logps/rejected": -546.78466796875, + "loss": 0.0334, + "rewards/chosen": 3.700838088989258, + "rewards/margins": 13.550874710083008, + "rewards/rejected": -9.85003662109375, + "step": 6298 + }, + { + "epoch": 0.575513933302878, + "grad_norm": 1.8515625, + "kl": 0.0, + "learning_rate": 3.838191438349545e-06, + "logits/chosen": 408596736.0, + "logits/rejected": 371641088.0, + "logps/chosen": -306.7281087239583, + "logps/rejected": -513.099853515625, + "loss": 0.0157, + "rewards/chosen": 4.19940185546875, + "rewards/margins": 14.255012512207031, + "rewards/rejected": -10.055610656738281, + "step": 6299 + }, + { + "epoch": 0.5756052992233897, + "grad_norm": 3.140625, + "kl": 0.0, + "learning_rate": 3.8367930399443495e-06, + "logits/chosen": 627764650.6666666, + "logits/rejected": 406848204.8, + "logps/chosen": -205.1163126627604, + "logps/rejected": -478.81728515625, + "loss": 0.0224, + "rewards/chosen": 2.7936318715413413, + "rewards/margins": 12.835859807332357, + "rewards/rejected": -10.042227935791015, + "step": 6300 + }, + { + "epoch": 0.5756966651439013, + "grad_norm": 2.046875, + "kl": 0.0, + "learning_rate": 3.8353947377255934e-06, + "logits/chosen": 410194432.0, + "logits/rejected": 688581120.0, + "logps/chosen": -287.6941324869792, + "logps/rejected": -347.715234375, + "loss": 0.017, + "rewards/chosen": 3.374261220296224, + "rewards/margins": 10.602271779378256, + "rewards/rejected": -7.228010559082032, + "step": 6301 + }, + { + "epoch": 0.575788031064413, + "grad_norm": 4.21875, + "kl": 0.0, + "learning_rate": 3.8339965318089e-06, + "logits/chosen": 413688524.8, + "logits/rejected": 377050794.6666667, + "logps/chosen": -324.216455078125, + "logps/rejected": -348.8510335286458, + "loss": 0.0321, + "rewards/chosen": 3.720506286621094, + "rewards/margins": 12.250993347167968, + "rewards/rejected": -8.530487060546875, + "step": 6302 + }, + { + "epoch": 0.5758793969849246, + "grad_norm": 2.484375, + "kl": 0.0, + "learning_rate": 3.832598422309892e-06, + "logits/chosen": 524783257.6, + "logits/rejected": 823412138.6666666, + "logps/chosen": -236.8458740234375, + "logps/rejected": -619.2978515625, + "loss": 0.0182, + "rewards/chosen": 3.9414947509765623, + "rewards/margins": 14.322925694783528, + "rewards/rejected": -10.381430943806967, + "step": 6303 + }, + { + "epoch": 0.5759707629054363, + "grad_norm": 1.265625, + "kl": 0.0, + "learning_rate": 3.8312004093441765e-06, + "logits/chosen": 973615957.3333334, + "logits/rejected": 848107929.6, + "logps/chosen": -215.42791748046875, + "logps/rejected": -445.59990234375, + "loss": 0.0091, + "rewards/chosen": 3.7955395380655923, + "rewards/margins": 13.326987902323404, + "rewards/rejected": -9.531448364257812, + "step": 6304 + }, + { + "epoch": 0.5760621288259479, + "grad_norm": 0.41015625, + "kl": 0.0, + "learning_rate": 3.82980249302736e-06, + "logits/chosen": 252865536.0, + "logits/rejected": 424312832.0, + "logps/chosen": -167.35092163085938, + "logps/rejected": -545.4467366536459, + "loss": 0.0028, + "rewards/chosen": 4.920109748840332, + "rewards/margins": 13.36919116973877, + "rewards/rejected": -8.449081420898438, + "step": 6305 + }, + { + "epoch": 0.5761534947464596, + "grad_norm": 1.9296875, + "kl": 0.0, + "learning_rate": 3.828404673475034e-06, + "logits/chosen": 1133912473.6, + "logits/rejected": 662696021.3333334, + "logps/chosen": -312.99990234375, + "logps/rejected": -281.6093343098958, + "loss": 0.0136, + "rewards/chosen": 4.496479034423828, + "rewards/margins": 11.458175659179688, + "rewards/rejected": -6.961696624755859, + "step": 6306 + }, + { + "epoch": 0.5762448606669712, + "grad_norm": 3.59375, + "kl": 0.0, + "learning_rate": 3.8270069508027864e-06, + "logits/chosen": 642275712.0, + "logits/rejected": 420649280.0, + "logps/chosen": -335.78521728515625, + "logps/rejected": -632.032470703125, + "loss": 0.026, + "rewards/chosen": 3.3704147338867188, + "rewards/margins": 12.570269584655762, + "rewards/rejected": -9.199854850769043, + "step": 6307 + }, + { + "epoch": 0.5763362265874828, + "grad_norm": 2.015625, + "kl": 0.0, + "learning_rate": 3.825609325126197e-06, + "logits/chosen": 723536640.0, + "logits/rejected": 562607040.0, + "logps/chosen": -641.6087646484375, + "logps/rejected": -544.0117797851562, + "loss": 0.0101, + "rewards/chosen": 4.023919582366943, + "rewards/margins": 12.47502851486206, + "rewards/rejected": -8.451108932495117, + "step": 6308 + }, + { + "epoch": 0.5764275925079945, + "grad_norm": 1.6640625, + "kl": 0.0, + "learning_rate": 3.824211796560833e-06, + "logits/chosen": 554504064.0, + "logits/rejected": 449896960.0, + "logps/chosen": -244.93223571777344, + "logps/rejected": -587.8946533203125, + "loss": 0.0052, + "rewards/chosen": 4.2112555503845215, + "rewards/margins": 13.711671352386475, + "rewards/rejected": -9.500415802001953, + "step": 6309 + }, + { + "epoch": 0.5765189584285062, + "grad_norm": 3.703125, + "kl": 0.0, + "learning_rate": 3.822814365222262e-06, + "logits/chosen": 609670604.8, + "logits/rejected": 397645098.6666667, + "logps/chosen": -408.854833984375, + "logps/rejected": -378.9022623697917, + "loss": 0.025, + "rewards/chosen": 3.440791702270508, + "rewards/margins": 11.301843897501628, + "rewards/rejected": -7.86105219523112, + "step": 6310 + }, + { + "epoch": 0.5766103243490178, + "grad_norm": 2.0, + "kl": 0.0, + "learning_rate": 3.821417031226035e-06, + "logits/chosen": 538184908.8, + "logits/rejected": 469348608.0, + "logps/chosen": -314.8494140625, + "logps/rejected": -402.7001953125, + "loss": 0.0147, + "rewards/chosen": 3.9089279174804688, + "rewards/margins": 13.615268071492514, + "rewards/rejected": -9.706340154012045, + "step": 6311 + }, + { + "epoch": 0.5767016902695294, + "grad_norm": 3.390625, + "kl": 0.0, + "learning_rate": 3.8200197946876996e-06, + "logits/chosen": 624290304.0, + "logits/rejected": 517698624.0, + "logps/chosen": -339.4330240885417, + "logps/rejected": -379.114501953125, + "loss": 0.0321, + "rewards/chosen": 3.609062830607096, + "rewards/margins": 12.819292704264322, + "rewards/rejected": -9.210229873657227, + "step": 6312 + }, + { + "epoch": 0.5767930561900411, + "grad_norm": 1.6953125, + "kl": 0.0, + "learning_rate": 3.818622655722794e-06, + "logits/chosen": 677065984.0, + "logits/rejected": 403639661.71428573, + "logps/chosen": -368.7024230957031, + "logps/rejected": -385.31180245535717, + "loss": 0.0055, + "rewards/chosen": 3.165924072265625, + "rewards/margins": 11.198306492396764, + "rewards/rejected": -8.032382420131139, + "step": 6313 + }, + { + "epoch": 0.5768844221105528, + "grad_norm": 4.125, + "kl": 0.0, + "learning_rate": 3.817225614446851e-06, + "logits/chosen": 326007253.3333333, + "logits/rejected": 743207296.0, + "logps/chosen": -444.5548502604167, + "logps/rejected": -298.1652526855469, + "loss": 0.0263, + "rewards/chosen": 3.7256151835123696, + "rewards/margins": 10.962840716044107, + "rewards/rejected": -7.237225532531738, + "step": 6314 + }, + { + "epoch": 0.5769757880310644, + "grad_norm": 4.125, + "kl": 0.0, + "learning_rate": 3.815828670975389e-06, + "logits/chosen": 641922816.0, + "logits/rejected": 933157888.0, + "logps/chosen": -203.76133728027344, + "logps/rejected": -1072.27294921875, + "loss": 0.0278, + "rewards/chosen": 2.9032931327819824, + "rewards/margins": 19.569575786590576, + "rewards/rejected": -16.666282653808594, + "step": 6315 + }, + { + "epoch": 0.577067153951576, + "grad_norm": 4.6875, + "kl": 0.0, + "learning_rate": 3.8144318254239256e-06, + "logits/chosen": 532965717.3333333, + "logits/rejected": 657676902.4, + "logps/chosen": -363.1846923828125, + "logps/rejected": -339.200634765625, + "loss": 0.1122, + "rewards/chosen": 3.786654790242513, + "rewards/margins": 10.327966435750326, + "rewards/rejected": -6.541311645507813, + "step": 6316 + }, + { + "epoch": 0.5771585198720877, + "grad_norm": 9.9375, + "kl": 0.0, + "learning_rate": 3.8130350779079645e-06, + "logits/chosen": 470924608.0, + "logits/rejected": 721450944.0, + "logps/chosen": -293.89501953125, + "logps/rejected": -454.669921875, + "loss": 0.0318, + "rewards/chosen": 3.7539596557617188, + "rewards/margins": 10.171923160552979, + "rewards/rejected": -6.41796350479126, + "step": 6317 + }, + { + "epoch": 0.5772498857925994, + "grad_norm": 1.40625, + "kl": 0.0, + "learning_rate": 3.8116384285430063e-06, + "logits/chosen": 637440298.6666666, + "logits/rejected": 514573056.0, + "logps/chosen": -571.5548502604166, + "logps/rejected": -709.4703125, + "loss": 0.0058, + "rewards/chosen": 4.446428616841634, + "rewards/margins": 16.3035987218221, + "rewards/rejected": -11.857170104980469, + "step": 6318 + }, + { + "epoch": 0.577341251713111, + "grad_norm": 48.75, + "kl": 0.0, + "learning_rate": 3.810241877444539e-06, + "logits/chosen": 793632768.0, + "logits/rejected": 473706272.0, + "logps/chosen": -287.1975911458333, + "logps/rejected": -649.1702880859375, + "loss": 0.1202, + "rewards/chosen": 3.21826966603597, + "rewards/margins": 14.718746503194174, + "rewards/rejected": -11.500476837158203, + "step": 6319 + }, + { + "epoch": 0.5774326176336226, + "grad_norm": 3.78125, + "kl": 0.0, + "learning_rate": 3.808845424728045e-06, + "logits/chosen": 1185701376.0, + "logits/rejected": 407025120.0, + "logps/chosen": -370.4940185546875, + "logps/rejected": -358.8838195800781, + "loss": 0.0171, + "rewards/chosen": 3.613345146179199, + "rewards/margins": 12.544390678405762, + "rewards/rejected": -8.931045532226562, + "step": 6320 + }, + { + "epoch": 0.5775239835541343, + "grad_norm": 2.5, + "kl": 0.0, + "learning_rate": 3.8074490705089983e-06, + "logits/chosen": 712345280.0, + "logits/rejected": 257321776.0, + "logps/chosen": -410.7939147949219, + "logps/rejected": -353.22998046875, + "loss": 0.0127, + "rewards/chosen": 4.491391181945801, + "rewards/margins": 13.509764671325684, + "rewards/rejected": -9.018373489379883, + "step": 6321 + }, + { + "epoch": 0.577615349474646, + "grad_norm": 1.0390625, + "kl": 0.0, + "learning_rate": 3.8060528149028643e-06, + "logits/chosen": 1197717504.0, + "logits/rejected": 521001764.5714286, + "logps/chosen": -152.9258575439453, + "logps/rejected": -510.2066127232143, + "loss": 0.0046, + "rewards/chosen": 3.4030213356018066, + "rewards/margins": 13.71210663659232, + "rewards/rejected": -10.309085300990514, + "step": 6322 + }, + { + "epoch": 0.5777067153951576, + "grad_norm": 2.421875, + "kl": 0.0, + "learning_rate": 3.8046566580251e-06, + "logits/chosen": 534983424.0, + "logits/rejected": 374843072.0, + "logps/chosen": -218.8965301513672, + "logps/rejected": -368.7685546875, + "loss": 0.0181, + "rewards/chosen": 4.245400428771973, + "rewards/margins": 13.424904823303223, + "rewards/rejected": -9.17950439453125, + "step": 6323 + }, + { + "epoch": 0.5777980813156692, + "grad_norm": 2.640625, + "kl": 0.0, + "learning_rate": 3.8032605999911547e-06, + "logits/chosen": 823517696.0, + "logits/rejected": 634190131.2, + "logps/chosen": -134.03043619791666, + "logps/rejected": -749.591357421875, + "loss": 0.0137, + "rewards/chosen": 3.7863890329996743, + "rewards/margins": 14.687068049112955, + "rewards/rejected": -10.900679016113282, + "step": 6324 + }, + { + "epoch": 0.5778894472361809, + "grad_norm": 5.1875, + "kl": 0.0, + "learning_rate": 3.8018646409164706e-06, + "logits/chosen": 837019904.0, + "logits/rejected": 402798080.0, + "logps/chosen": -406.1734619140625, + "logps/rejected": -404.95440673828125, + "loss": 0.0384, + "rewards/chosen": 2.514160633087158, + "rewards/margins": 11.250195980072021, + "rewards/rejected": -8.736035346984863, + "step": 6325 + }, + { + "epoch": 0.5779808131566926, + "grad_norm": 2.46875, + "kl": 0.0, + "learning_rate": 3.8004687809164788e-06, + "logits/chosen": 585352021.3333334, + "logits/rejected": 406585696.0, + "logps/chosen": -366.1901448567708, + "logps/rejected": -458.7687072753906, + "loss": 0.0144, + "rewards/chosen": 4.220145543416341, + "rewards/margins": 13.73026688893636, + "rewards/rejected": -9.51012134552002, + "step": 6326 + }, + { + "epoch": 0.5780721790772042, + "grad_norm": 3.359375, + "kl": 0.0, + "learning_rate": 3.7990730201066073e-06, + "logits/chosen": 832512292.5714285, + "logits/rejected": 121523384.0, + "logps/chosen": -346.1820591517857, + "logps/rejected": -369.59893798828125, + "loss": 0.0245, + "rewards/chosen": 3.756099155970982, + "rewards/margins": 10.576512268611364, + "rewards/rejected": -6.820413112640381, + "step": 6327 + }, + { + "epoch": 0.5781635449977158, + "grad_norm": 2.890625, + "kl": 0.0, + "learning_rate": 3.7976773586022687e-06, + "logits/chosen": 588613632.0, + "logits/rejected": 464814976.0, + "logps/chosen": -386.2802327473958, + "logps/rejected": -447.1759338378906, + "loss": 0.0206, + "rewards/chosen": 3.916199048360189, + "rewards/margins": 15.653152783711752, + "rewards/rejected": -11.736953735351562, + "step": 6328 + }, + { + "epoch": 0.5782549109182274, + "grad_norm": 2.234375, + "kl": 0.0, + "learning_rate": 3.796281796518873e-06, + "logits/chosen": 478841301.3333333, + "logits/rejected": 289583001.6, + "logps/chosen": -258.74025472005206, + "logps/rejected": -335.06337890625, + "loss": 0.0109, + "rewards/chosen": 4.689293543497722, + "rewards/margins": 12.29269167582194, + "rewards/rejected": -7.603398132324219, + "step": 6329 + }, + { + "epoch": 0.5783462768387392, + "grad_norm": 2.65625, + "kl": 0.0, + "learning_rate": 3.794886333971819e-06, + "logits/chosen": 540891852.8, + "logits/rejected": 480944085.3333333, + "logps/chosen": -369.3084228515625, + "logps/rejected": -504.8182779947917, + "loss": 0.019, + "rewards/chosen": 3.589044189453125, + "rewards/margins": 14.190494791666666, + "rewards/rejected": -10.601450602213541, + "step": 6330 + }, + { + "epoch": 0.5784376427592508, + "grad_norm": 1.9609375, + "kl": 0.0, + "learning_rate": 3.7934909710765e-06, + "logits/chosen": 638959155.2, + "logits/rejected": 516106410.6666667, + "logps/chosen": -261.11728515625, + "logps/rejected": -596.18408203125, + "loss": 0.0104, + "rewards/chosen": 4.861796188354492, + "rewards/margins": 13.170705286661782, + "rewards/rejected": -8.308909098307291, + "step": 6331 + }, + { + "epoch": 0.5785290086797624, + "grad_norm": 1.90625, + "kl": 0.0, + "learning_rate": 3.792095707948299e-06, + "logits/chosen": 533302144.0, + "logits/rejected": 435141939.2, + "logps/chosen": -495.4392496744792, + "logps/rejected": -413.333056640625, + "loss": 0.0094, + "rewards/chosen": 3.9875497817993164, + "rewards/margins": 12.979961585998534, + "rewards/rejected": -8.992411804199218, + "step": 6332 + }, + { + "epoch": 0.578620374600274, + "grad_norm": 2.375, + "kl": 0.0, + "learning_rate": 3.790700544702592e-06, + "logits/chosen": 437841184.0, + "logits/rejected": 741014848.0, + "logps/chosen": -243.2076873779297, + "logps/rejected": -509.9727783203125, + "loss": 0.0146, + "rewards/chosen": 4.107102870941162, + "rewards/margins": 13.543225765228271, + "rewards/rejected": -9.43612289428711, + "step": 6333 + }, + { + "epoch": 0.5787117405207858, + "grad_norm": 3.046875, + "kl": 0.0, + "learning_rate": 3.7893054814547446e-06, + "logits/chosen": 514932181.3333333, + "logits/rejected": 717822566.4, + "logps/chosen": -520.3853759765625, + "logps/rejected": -426.496728515625, + "loss": 0.0119, + "rewards/chosen": 3.773540496826172, + "rewards/margins": 12.481842803955079, + "rewards/rejected": -8.708302307128907, + "step": 6334 + }, + { + "epoch": 0.5788031064412974, + "grad_norm": 2.359375, + "kl": 0.0, + "learning_rate": 3.7879105183201175e-06, + "logits/chosen": 625725696.0, + "logits/rejected": 415063449.6, + "logps/chosen": -481.4000651041667, + "logps/rejected": -406.0965087890625, + "loss": 0.0124, + "rewards/chosen": 3.633661905924479, + "rewards/margins": 11.37324244181315, + "rewards/rejected": -7.739580535888672, + "step": 6335 + }, + { + "epoch": 0.578894472361809, + "grad_norm": 0.69140625, + "kl": 0.0, + "learning_rate": 3.786515655414059e-06, + "logits/chosen": 753832448.0, + "logits/rejected": 1069816832.0, + "logps/chosen": -256.9170837402344, + "logps/rejected": -629.7274169921875, + "loss": 0.0049, + "rewards/chosen": 3.9573659896850586, + "rewards/margins": 14.13581371307373, + "rewards/rejected": -10.178447723388672, + "step": 6336 + }, + { + "epoch": 0.5789858382823206, + "grad_norm": 2.046875, + "kl": 0.0, + "learning_rate": 3.7851208928519144e-06, + "logits/chosen": 525834624.0, + "logits/rejected": 732218112.0, + "logps/chosen": -79.60179138183594, + "logps/rejected": -442.2827962239583, + "loss": 0.0157, + "rewards/chosen": 2.7305500507354736, + "rewards/margins": 11.48453132311503, + "rewards/rejected": -8.753981272379557, + "step": 6337 + }, + { + "epoch": 0.5790772042028324, + "grad_norm": 1.3359375, + "kl": 0.0, + "learning_rate": 3.7837262307490142e-06, + "logits/chosen": 754794880.0, + "logits/rejected": 436184661.3333333, + "logps/chosen": -226.747802734375, + "logps/rejected": -559.03662109375, + "loss": 0.0081, + "rewards/chosen": 3.6905746459960938, + "rewards/margins": 12.354461034138998, + "rewards/rejected": -8.663886388142904, + "step": 6338 + }, + { + "epoch": 0.579168570123344, + "grad_norm": 1.984375, + "kl": 0.0, + "learning_rate": 3.782331669220687e-06, + "logits/chosen": 600255402.6666666, + "logits/rejected": 540256204.8, + "logps/chosen": -232.93294270833334, + "logps/rejected": -484.549169921875, + "loss": 0.0117, + "rewards/chosen": 3.765148162841797, + "rewards/margins": 11.704335021972657, + "rewards/rejected": -7.939186859130859, + "step": 6339 + }, + { + "epoch": 0.5792599360438556, + "grad_norm": 3.015625, + "kl": 0.0, + "learning_rate": 3.7809372083822486e-06, + "logits/chosen": 478314188.8, + "logits/rejected": 452825770.6666667, + "logps/chosen": -325.792138671875, + "logps/rejected": -407.3117268880208, + "loss": 0.0237, + "rewards/chosen": 3.8350288391113283, + "rewards/margins": 13.161084365844726, + "rewards/rejected": -9.326055526733398, + "step": 6340 + }, + { + "epoch": 0.5793513019643672, + "grad_norm": 3.984375, + "kl": 0.0, + "learning_rate": 3.7795428483490076e-06, + "logits/chosen": 634657664.0, + "logits/rejected": 659081984.0, + "logps/chosen": -265.656005859375, + "logps/rejected": -463.3489583333333, + "loss": 0.0131, + "rewards/chosen": 3.187211036682129, + "rewards/margins": 13.047465960184732, + "rewards/rejected": -9.860254923502604, + "step": 6341 + }, + { + "epoch": 0.579442667884879, + "grad_norm": 86.0, + "kl": 0.0, + "learning_rate": 3.7781485892362657e-06, + "logits/chosen": 541887360.0, + "logits/rejected": 462424576.0, + "logps/chosen": -256.0745544433594, + "logps/rejected": -345.1761067708333, + "loss": 0.0719, + "rewards/chosen": 2.1561243534088135, + "rewards/margins": 12.578457117080688, + "rewards/rejected": -10.422332763671875, + "step": 6342 + }, + { + "epoch": 0.5795340338053906, + "grad_norm": 2.375, + "kl": 0.0, + "learning_rate": 3.776754431159314e-06, + "logits/chosen": 543447552.0, + "logits/rejected": 406584384.0, + "logps/chosen": -333.7087707519531, + "logps/rejected": -368.531494140625, + "loss": 0.0128, + "rewards/chosen": 4.169855117797852, + "rewards/margins": 14.836463928222656, + "rewards/rejected": -10.666608810424805, + "step": 6343 + }, + { + "epoch": 0.5796253997259022, + "grad_norm": 41.0, + "kl": 0.0, + "learning_rate": 3.775360374233439e-06, + "logits/chosen": 450984160.0, + "logits/rejected": 530580096.0, + "logps/chosen": -231.45147705078125, + "logps/rejected": -361.47393798828125, + "loss": 0.2003, + "rewards/chosen": 1.9573309421539307, + "rewards/margins": 8.671496152877808, + "rewards/rejected": -6.714165210723877, + "step": 6344 + }, + { + "epoch": 0.5797167656464138, + "grad_norm": 2.59375, + "kl": 0.0, + "learning_rate": 3.7739664185739124e-06, + "logits/chosen": 531546214.4, + "logits/rejected": 380165717.3333333, + "logps/chosen": -438.564111328125, + "logps/rejected": -813.4303385416666, + "loss": 0.0126, + "rewards/chosen": 4.553234100341797, + "rewards/margins": 15.547898610432943, + "rewards/rejected": -10.994664510091146, + "step": 6345 + }, + { + "epoch": 0.5798081315669256, + "grad_norm": 1.578125, + "kl": 0.0, + "learning_rate": 3.7725725642960047e-06, + "logits/chosen": 738145877.3333334, + "logits/rejected": 717766604.8, + "logps/chosen": -248.3604939778646, + "logps/rejected": -408.4658203125, + "loss": 0.0106, + "rewards/chosen": 3.736168543497721, + "rewards/margins": 12.731830469767251, + "rewards/rejected": -8.99566192626953, + "step": 6346 + }, + { + "epoch": 0.5798994974874372, + "grad_norm": 1.9921875, + "kl": 0.0, + "learning_rate": 3.7711788115149726e-06, + "logits/chosen": 317880106.6666667, + "logits/rejected": 402462054.4, + "logps/chosen": -233.4027099609375, + "logps/rejected": -610.853369140625, + "loss": 0.0079, + "rewards/chosen": 4.5614010492960615, + "rewards/margins": 15.402558581034342, + "rewards/rejected": -10.841157531738281, + "step": 6347 + }, + { + "epoch": 0.5799908634079488, + "grad_norm": 1.6640625, + "kl": 0.0, + "learning_rate": 3.7697851603460687e-06, + "logits/chosen": 569556821.3333334, + "logits/rejected": 569892352.0, + "logps/chosen": -423.4847005208333, + "logps/rejected": -434.731103515625, + "loss": 0.011, + "rewards/chosen": 3.702974319458008, + "rewards/margins": 11.395358657836914, + "rewards/rejected": -7.692384338378906, + "step": 6348 + }, + { + "epoch": 0.5800822293284604, + "grad_norm": 3.65625, + "kl": 0.0, + "learning_rate": 3.7683916109045326e-06, + "logits/chosen": 462744917.3333333, + "logits/rejected": 295088281.6, + "logps/chosen": -377.2145589192708, + "logps/rejected": -384.631396484375, + "loss": 0.0166, + "rewards/chosen": 3.5639212926228843, + "rewards/margins": 13.420130093892416, + "rewards/rejected": -9.856208801269531, + "step": 6349 + }, + { + "epoch": 0.5801735952489722, + "grad_norm": 2.0, + "kl": 0.0, + "learning_rate": 3.766998163305601e-06, + "logits/chosen": 884369472.0, + "logits/rejected": 776825472.0, + "logps/chosen": -481.5689392089844, + "logps/rejected": -551.1841430664062, + "loss": 0.0093, + "rewards/chosen": 4.63149356842041, + "rewards/margins": 13.815895080566406, + "rewards/rejected": -9.184401512145996, + "step": 6350 + }, + { + "epoch": 0.5802649611694838, + "grad_norm": 2.46875, + "kl": 0.0, + "learning_rate": 3.7656048176644965e-06, + "logits/chosen": 413488096.0, + "logits/rejected": 651581781.3333334, + "logps/chosen": -246.31381225585938, + "logps/rejected": -551.8350016276041, + "loss": 0.0108, + "rewards/chosen": 4.206182479858398, + "rewards/margins": 13.233945846557617, + "rewards/rejected": -9.027763366699219, + "step": 6351 + }, + { + "epoch": 0.5803563270899954, + "grad_norm": 3.953125, + "kl": 0.0, + "learning_rate": 3.7642115740964357e-06, + "logits/chosen": 809669478.4, + "logits/rejected": 505089792.0, + "logps/chosen": -281.0461669921875, + "logps/rejected": -477.17138671875, + "loss": 0.0275, + "rewards/chosen": 3.9213233947753907, + "rewards/margins": 13.003593063354492, + "rewards/rejected": -9.082269668579102, + "step": 6352 + }, + { + "epoch": 0.580447693010507, + "grad_norm": 0.58984375, + "kl": 0.0, + "learning_rate": 3.762818432716629e-06, + "logits/chosen": 202141141.33333334, + "logits/rejected": 377674700.8, + "logps/chosen": -179.67537434895834, + "logps/rejected": -416.880078125, + "loss": 0.0047, + "rewards/chosen": 4.9905039469401045, + "rewards/margins": 12.830320231119792, + "rewards/rejected": -7.839816284179688, + "step": 6353 + }, + { + "epoch": 0.5805390589310188, + "grad_norm": 5.46875, + "kl": 0.0, + "learning_rate": 3.7614253936402766e-06, + "logits/chosen": 293485248.0, + "logits/rejected": 373159466.6666667, + "logps/chosen": -130.1160888671875, + "logps/rejected": -530.4967447916666, + "loss": 0.1235, + "rewards/chosen": 0.1116933822631836, + "rewards/margins": 10.124055544535318, + "rewards/rejected": -10.012362162272135, + "step": 6354 + }, + { + "epoch": 0.5806304248515304, + "grad_norm": 2.390625, + "kl": 0.0, + "learning_rate": 3.760032456982566e-06, + "logits/chosen": 425837440.0, + "logits/rejected": 387672149.3333333, + "logps/chosen": -242.19546508789062, + "logps/rejected": -412.1674397786458, + "loss": 0.0206, + "rewards/chosen": 3.2315921783447266, + "rewards/margins": 11.818777084350586, + "rewards/rejected": -8.58718490600586, + "step": 6355 + }, + { + "epoch": 0.580721790772042, + "grad_norm": 1.4921875, + "kl": 0.0, + "learning_rate": 3.7586396228586837e-06, + "logits/chosen": 376761344.0, + "logits/rejected": 677803861.3333334, + "logps/chosen": -182.2015838623047, + "logps/rejected": -630.4503173828125, + "loss": 0.0075, + "rewards/chosen": 3.664278030395508, + "rewards/margins": 14.364938735961914, + "rewards/rejected": -10.700660705566406, + "step": 6356 + }, + { + "epoch": 0.5808131566925536, + "grad_norm": 0.61328125, + "kl": 0.0, + "learning_rate": 3.7572468913838022e-06, + "logits/chosen": 365386453.3333333, + "logits/rejected": 295017779.2, + "logps/chosen": -293.09739176432294, + "logps/rejected": -477.703662109375, + "loss": 0.0028, + "rewards/chosen": 5.570699055989583, + "rewards/margins": 15.906129964192708, + "rewards/rejected": -10.335430908203126, + "step": 6357 + }, + { + "epoch": 0.5809045226130654, + "grad_norm": 2.15625, + "kl": 0.0, + "learning_rate": 3.7558542626730894e-06, + "logits/chosen": 726278720.0, + "logits/rejected": 761387008.0, + "logps/chosen": -358.2761535644531, + "logps/rejected": -594.4398193359375, + "loss": 0.0143, + "rewards/chosen": 3.8583405017852783, + "rewards/margins": 14.450634241104126, + "rewards/rejected": -10.592293739318848, + "step": 6358 + }, + { + "epoch": 0.580995888533577, + "grad_norm": 9.4375, + "kl": 0.0, + "learning_rate": 3.7544617368417008e-06, + "logits/chosen": 398850764.8, + "logits/rejected": 322789589.3333333, + "logps/chosen": -295.1275390625, + "logps/rejected": -362.2999674479167, + "loss": 0.1358, + "rewards/chosen": 2.7703948974609376, + "rewards/margins": 11.597568130493164, + "rewards/rejected": -8.827173233032227, + "step": 6359 + }, + { + "epoch": 0.5810872544540886, + "grad_norm": 3.015625, + "kl": 0.0, + "learning_rate": 3.753069314004787e-06, + "logits/chosen": 320990784.0, + "logits/rejected": 360942624.0, + "logps/chosen": -307.2261962890625, + "logps/rejected": -473.780029296875, + "loss": 0.0198, + "rewards/chosen": 3.3822579383850098, + "rewards/margins": 14.014418125152588, + "rewards/rejected": -10.632160186767578, + "step": 6360 + }, + { + "epoch": 0.5811786203746002, + "grad_norm": 2.84375, + "kl": 0.0, + "learning_rate": 3.751676994277488e-06, + "logits/chosen": 818363904.0, + "logits/rejected": 748415040.0, + "logps/chosen": -444.5975748697917, + "logps/rejected": -584.5838012695312, + "loss": 0.0168, + "rewards/chosen": 3.963648796081543, + "rewards/margins": 11.558799743652344, + "rewards/rejected": -7.595150947570801, + "step": 6361 + }, + { + "epoch": 0.581269986295112, + "grad_norm": 2.75, + "kl": 0.0, + "learning_rate": 3.7502847777749353e-06, + "logits/chosen": 854299699.2, + "logits/rejected": 510519978.6666667, + "logps/chosen": -365.0571044921875, + "logps/rejected": -391.6304117838542, + "loss": 0.0173, + "rewards/chosen": 4.113406372070313, + "rewards/margins": 12.870859018961589, + "rewards/rejected": -8.757452646891275, + "step": 6362 + }, + { + "epoch": 0.5813613522156236, + "grad_norm": 1.21875, + "kl": 0.0, + "learning_rate": 3.7488926646122535e-06, + "logits/chosen": 360593664.0, + "logits/rejected": 405354592.0, + "logps/chosen": -225.63380432128906, + "logps/rejected": -584.00048828125, + "loss": 0.0083, + "rewards/chosen": 4.610678195953369, + "rewards/margins": 13.785418033599854, + "rewards/rejected": -9.174739837646484, + "step": 6363 + }, + { + "epoch": 0.5814527181361352, + "grad_norm": 8.5625, + "kl": 7.199497222900391, + "learning_rate": 3.747500654904555e-06, + "logits/chosen": 414332096.0, + "logps/chosen": -294.04840087890625, + "loss": 0.0913, + "rewards/chosen": 3.3608012199401855, + "step": 6364 + }, + { + "epoch": 0.5815440840566469, + "grad_norm": 1.96875, + "kl": 0.0, + "learning_rate": 3.746108748766949e-06, + "logits/chosen": 398293216.0, + "logits/rejected": 824173824.0, + "logps/chosen": -123.91523742675781, + "logps/rejected": -653.3565266927084, + "loss": 0.0105, + "rewards/chosen": 3.251565456390381, + "rewards/margins": 12.834483623504639, + "rewards/rejected": -9.582918167114258, + "step": 6365 + }, + { + "epoch": 0.5816354499771585, + "grad_norm": 1.21875, + "kl": 0.0, + "learning_rate": 3.7447169463145306e-06, + "logits/chosen": 260968448.0, + "logits/rejected": 835238546.2857143, + "logps/chosen": -50.052223205566406, + "logps/rejected": -550.0893903459821, + "loss": 0.0028, + "rewards/chosen": 5.0255937576293945, + "rewards/margins": 14.620125225612096, + "rewards/rejected": -9.594531467982701, + "step": 6366 + }, + { + "epoch": 0.5817268158976702, + "grad_norm": 1.3984375, + "kl": 0.0, + "learning_rate": 3.7433252476623916e-06, + "logits/chosen": 842527061.3333334, + "logits/rejected": 475970406.4, + "logps/chosen": -469.2400716145833, + "logps/rejected": -526.8541015625, + "loss": 0.0082, + "rewards/chosen": 3.897528966267904, + "rewards/margins": 12.900229008992513, + "rewards/rejected": -9.002700042724609, + "step": 6367 + }, + { + "epoch": 0.5818181818181818, + "grad_norm": 7.0625, + "kl": 0.0, + "learning_rate": 3.74193365292561e-06, + "logits/chosen": 434748330.6666667, + "logits/rejected": 780827776.0, + "logps/chosen": -290.68544514973956, + "logps/rejected": -347.30218505859375, + "loss": 0.0389, + "rewards/chosen": 3.4304720560709634, + "rewards/margins": 9.773977915445963, + "rewards/rejected": -6.343505859375, + "step": 6368 + }, + { + "epoch": 0.5819095477386935, + "grad_norm": 3.140625, + "kl": 0.0, + "learning_rate": 3.7405421622192607e-06, + "logits/chosen": 291114453.3333333, + "logits/rejected": 323255552.0, + "logps/chosen": -276.464111328125, + "logps/rejected": -451.15313720703125, + "loss": 0.0196, + "rewards/chosen": 4.367071151733398, + "rewards/margins": 15.737066268920898, + "rewards/rejected": -11.3699951171875, + "step": 6369 + }, + { + "epoch": 0.5820009136592051, + "grad_norm": 8.25, + "kl": 0.0, + "learning_rate": 3.7391507756584033e-06, + "logits/chosen": 936673088.0, + "logits/rejected": 759911424.0, + "logps/chosen": -456.7729797363281, + "logps/rejected": -424.3484293619792, + "loss": 0.0232, + "rewards/chosen": 3.2304275035858154, + "rewards/margins": 11.863734006881714, + "rewards/rejected": -8.633306503295898, + "step": 6370 + }, + { + "epoch": 0.5820922795797168, + "grad_norm": 0.78515625, + "kl": 0.0, + "learning_rate": 3.7377594933580967e-06, + "logits/chosen": 479882304.0, + "logits/rejected": 482770133.3333333, + "logps/chosen": -223.8868408203125, + "logps/rejected": -378.3384195963542, + "loss": 0.0044, + "rewards/chosen": 4.166617393493652, + "rewards/margins": 13.55446974436442, + "rewards/rejected": -9.387852350870768, + "step": 6371 + }, + { + "epoch": 0.5821836455002284, + "grad_norm": 0.5625, + "kl": 0.0, + "learning_rate": 3.736368315433385e-06, + "logits/chosen": 459878208.0, + "logits/rejected": 484904608.0, + "logps/chosen": -290.3773193359375, + "logps/rejected": -555.53271484375, + "loss": 0.003, + "rewards/chosen": 5.3874077796936035, + "rewards/margins": 16.262959957122803, + "rewards/rejected": -10.8755521774292, + "step": 6372 + }, + { + "epoch": 0.5822750114207401, + "grad_norm": 2.609375, + "kl": 0.0, + "learning_rate": 3.7349772419993046e-06, + "logits/chosen": 473295328.0, + "logits/rejected": 478513408.0, + "logps/chosen": -287.4658508300781, + "logps/rejected": -371.24530029296875, + "loss": 0.0173, + "rewards/chosen": 3.731778860092163, + "rewards/margins": 11.964383840560913, + "rewards/rejected": -8.23260498046875, + "step": 6373 + }, + { + "epoch": 0.5823663773412517, + "grad_norm": 12.1875, + "kl": 0.0, + "learning_rate": 3.7335862731708866e-06, + "logits/chosen": 502193203.2, + "logits/rejected": 335971626.6666667, + "logps/chosen": -281.992431640625, + "logps/rejected": -241.4810994466146, + "loss": 0.0287, + "rewards/chosen": 3.845838165283203, + "rewards/margins": 10.497537231445312, + "rewards/rejected": -6.651699066162109, + "step": 6374 + }, + { + "epoch": 0.5824577432617634, + "grad_norm": 1.3671875, + "kl": 0.0, + "learning_rate": 3.732195409063149e-06, + "logits/chosen": 860490069.3333334, + "logits/rejected": 1607011737.6, + "logps/chosen": -360.9982096354167, + "logps/rejected": -619.193994140625, + "loss": 0.0074, + "rewards/chosen": 3.9845892588297525, + "rewards/margins": 13.436134974161783, + "rewards/rejected": -9.451545715332031, + "step": 6375 + }, + { + "epoch": 0.582549109182275, + "grad_norm": 3.359375, + "kl": 0.0, + "learning_rate": 3.730804649791106e-06, + "logits/chosen": 338970069.3333333, + "logits/rejected": 381893120.0, + "logps/chosen": -407.9075520833333, + "logps/rejected": -453.4623046875, + "loss": 0.0145, + "rewards/chosen": 3.615410486857096, + "rewards/margins": 13.23398577372233, + "rewards/rejected": -9.618575286865234, + "step": 6376 + }, + { + "epoch": 0.5826404751027867, + "grad_norm": 2.453125, + "kl": 0.0, + "learning_rate": 3.729413995469758e-06, + "logits/chosen": 791349120.0, + "logits/rejected": 1050132352.0, + "logps/chosen": -379.3592224121094, + "logps/rejected": -778.05712890625, + "loss": 0.0136, + "rewards/chosen": 4.1165289878845215, + "rewards/margins": 16.07247495651245, + "rewards/rejected": -11.95594596862793, + "step": 6377 + }, + { + "epoch": 0.5827318410232983, + "grad_norm": 5.375, + "kl": 0.0, + "learning_rate": 3.7280234462141006e-06, + "logits/chosen": 744592896.0, + "logits/rejected": 283119744.0, + "logps/chosen": -302.37595621744794, + "logps/rejected": -395.2119140625, + "loss": 0.0327, + "rewards/chosen": 3.366154670715332, + "rewards/margins": 12.293200492858887, + "rewards/rejected": -8.927045822143555, + "step": 6378 + }, + { + "epoch": 0.58282320694381, + "grad_norm": 1.78125, + "kl": 0.0, + "learning_rate": 3.7266330021391193e-06, + "logits/chosen": 525652192.0, + "logits/rejected": 556184256.0, + "logps/chosen": -332.66766357421875, + "logps/rejected": -383.69482421875, + "loss": 0.0121, + "rewards/chosen": 3.7486343383789062, + "rewards/margins": 13.061424255371094, + "rewards/rejected": -9.312789916992188, + "step": 6379 + }, + { + "epoch": 0.5829145728643216, + "grad_norm": 1.4296875, + "kl": 0.0, + "learning_rate": 3.725242663359791e-06, + "logits/chosen": 534152640.0, + "logits/rejected": 377884032.0, + "logps/chosen": -175.05502319335938, + "logps/rejected": -471.3006998697917, + "loss": 0.0093, + "rewards/chosen": 3.4559853076934814, + "rewards/margins": 13.72221302986145, + "rewards/rejected": -10.266227722167969, + "step": 6380 + }, + { + "epoch": 0.5830059387848333, + "grad_norm": 3.15625, + "kl": 0.0, + "learning_rate": 3.723852429991085e-06, + "logits/chosen": 490864981.3333333, + "logits/rejected": 334626368.0, + "logps/chosen": -328.0312906901042, + "logps/rejected": -241.7498779296875, + "loss": 0.0233, + "rewards/chosen": 4.383193651835124, + "rewards/margins": 12.006076494852703, + "rewards/rejected": -7.622882843017578, + "step": 6381 + }, + { + "epoch": 0.5830973047053449, + "grad_norm": 2.65625, + "kl": 0.0, + "learning_rate": 3.722462302147958e-06, + "logits/chosen": 566845312.0, + "logits/rejected": 516582080.0, + "logps/chosen": -291.2496032714844, + "logps/rejected": -470.79296875, + "loss": 0.0175, + "rewards/chosen": 3.412564516067505, + "rewards/margins": 12.606159448623657, + "rewards/rejected": -9.193594932556152, + "step": 6382 + }, + { + "epoch": 0.5831886706258566, + "grad_norm": 3.5, + "kl": 0.0, + "learning_rate": 3.721072279945361e-06, + "logits/chosen": 571794048.0, + "logits/rejected": 385025088.0, + "logps/chosen": -447.85650634765625, + "logps/rejected": -486.6988525390625, + "loss": 0.017, + "rewards/chosen": 4.038760185241699, + "rewards/margins": 13.514851570129395, + "rewards/rejected": -9.476091384887695, + "step": 6383 + }, + { + "epoch": 0.5832800365463682, + "grad_norm": 6.0625, + "kl": 0.0, + "learning_rate": 3.7196823634982377e-06, + "logits/chosen": 732347968.0, + "logits/rejected": 476953792.0, + "logps/chosen": -440.94244384765625, + "logps/rejected": -482.178466796875, + "loss": 0.0337, + "rewards/chosen": 2.759892463684082, + "rewards/margins": 12.064352989196777, + "rewards/rejected": -9.304460525512695, + "step": 6384 + }, + { + "epoch": 0.5833714024668799, + "grad_norm": 23.625, + "kl": 0.0, + "learning_rate": 3.71829255292152e-06, + "logits/chosen": 762431680.0, + "logits/rejected": 359821248.0, + "logps/chosen": -432.12603759765625, + "logps/rejected": -282.611083984375, + "loss": 0.0349, + "rewards/chosen": 3.4956910610198975, + "rewards/margins": 11.09611964225769, + "rewards/rejected": -7.600428581237793, + "step": 6385 + }, + { + "epoch": 0.5834627683873915, + "grad_norm": 1.6640625, + "kl": 0.0, + "learning_rate": 3.7169028483301333e-06, + "logits/chosen": 536015872.0, + "logits/rejected": 339174272.0, + "logps/chosen": -375.3038330078125, + "logps/rejected": -354.1764221191406, + "loss": 0.0134, + "rewards/chosen": 4.217135111490886, + "rewards/margins": 11.007363001505535, + "rewards/rejected": -6.790227890014648, + "step": 6386 + }, + { + "epoch": 0.5835541343079031, + "grad_norm": 1.609375, + "kl": 0.0, + "learning_rate": 3.7155132498389924e-06, + "logits/chosen": 469344320.0, + "logits/rejected": 402540352.0, + "logps/chosen": -349.27862548828125, + "logps/rejected": -356.61419677734375, + "loss": 0.0097, + "rewards/chosen": 4.3533935546875, + "rewards/margins": 13.267492294311523, + "rewards/rejected": -8.914098739624023, + "step": 6387 + }, + { + "epoch": 0.5836455002284148, + "grad_norm": 3.5, + "kl": 0.0, + "learning_rate": 3.7141237575630053e-06, + "logits/chosen": 1014616832.0, + "logits/rejected": 1128347392.0, + "logps/chosen": -205.59542846679688, + "logps/rejected": -613.202392578125, + "loss": 0.0187, + "rewards/chosen": 3.750351667404175, + "rewards/margins": 13.765002012252808, + "rewards/rejected": -10.014650344848633, + "step": 6388 + }, + { + "epoch": 0.5837368661489265, + "grad_norm": 3.84375, + "kl": 0.0, + "learning_rate": 3.7127343716170685e-06, + "logits/chosen": 939881164.8, + "logits/rejected": 510752469.3333333, + "logps/chosen": -388.67236328125, + "logps/rejected": -382.957275390625, + "loss": 0.0217, + "rewards/chosen": 4.026282119750976, + "rewards/margins": 13.041299311319985, + "rewards/rejected": -9.01501719156901, + "step": 6389 + }, + { + "epoch": 0.5838282320694381, + "grad_norm": 2.4375, + "kl": 0.0, + "learning_rate": 3.7113450921160736e-06, + "logits/chosen": 788529408.0, + "logits/rejected": 327035861.3333333, + "logps/chosen": -476.5125427246094, + "logps/rejected": -360.2582600911458, + "loss": 0.0077, + "rewards/chosen": 4.4655609130859375, + "rewards/margins": 12.089488983154297, + "rewards/rejected": -7.623928070068359, + "step": 6390 + }, + { + "epoch": 0.5839195979899497, + "grad_norm": 1.625, + "kl": 0.0, + "learning_rate": 3.709955919174899e-06, + "logits/chosen": 623424000.0, + "logits/rejected": 715878144.0, + "logps/chosen": -204.29812622070312, + "logps/rejected": -500.9353332519531, + "loss": 0.129, + "rewards/chosen": 2.538008213043213, + "rewards/margins": 11.985573291778564, + "rewards/rejected": -9.447565078735352, + "step": 6391 + }, + { + "epoch": 0.5840109639104614, + "grad_norm": 3.375, + "kl": 0.0, + "learning_rate": 3.7085668529084183e-06, + "logits/chosen": 583283584.0, + "logits/rejected": 736328832.0, + "logps/chosen": -278.372314453125, + "logps/rejected": -576.895263671875, + "loss": 0.1335, + "rewards/chosen": 1.8187925815582275, + "rewards/margins": 10.392970323562622, + "rewards/rejected": -8.574177742004395, + "step": 6392 + }, + { + "epoch": 0.5841023298309731, + "grad_norm": 4.34375, + "kl": 0.0, + "learning_rate": 3.7071778934314934e-06, + "logits/chosen": 369858730.6666667, + "logits/rejected": 483027865.6, + "logps/chosen": -260.9268391927083, + "logps/rejected": -385.97646484375, + "loss": 0.0262, + "rewards/chosen": 3.024799346923828, + "rewards/margins": 10.577384185791015, + "rewards/rejected": -7.552584838867188, + "step": 6393 + }, + { + "epoch": 0.5841936957514847, + "grad_norm": 4.875, + "kl": 0.0, + "learning_rate": 3.7057890408589776e-06, + "logits/chosen": 359587264.0, + "logits/rejected": 286348992.0, + "logps/chosen": -242.93682861328125, + "logps/rejected": -469.59869384765625, + "loss": 0.023, + "rewards/chosen": 4.0378923416137695, + "rewards/margins": 16.301998138427734, + "rewards/rejected": -12.264105796813965, + "step": 6394 + }, + { + "epoch": 0.5842850616719963, + "grad_norm": 2.03125, + "kl": 0.0, + "learning_rate": 3.704400295305718e-06, + "logits/chosen": 614602752.0, + "logits/rejected": 348976981.3333333, + "logps/chosen": -500.77191162109375, + "logps/rejected": -434.8382975260417, + "loss": 0.009, + "rewards/chosen": 3.363272190093994, + "rewards/margins": 11.110889911651611, + "rewards/rejected": -7.747617721557617, + "step": 6395 + }, + { + "epoch": 0.584376427592508, + "grad_norm": 3.625, + "kl": 0.0, + "learning_rate": 3.7030116568865486e-06, + "logits/chosen": 453327360.0, + "logits/rejected": 638944704.0, + "logps/chosen": -331.9738071986607, + "logps/rejected": -684.9100341796875, + "loss": 0.03, + "rewards/chosen": 3.939256123134068, + "rewards/margins": 15.97717707497733, + "rewards/rejected": -12.037920951843262, + "step": 6396 + }, + { + "epoch": 0.5844677935130197, + "grad_norm": 3.140625, + "kl": 0.0, + "learning_rate": 3.7016231257162995e-06, + "logits/chosen": 706255658.6666666, + "logits/rejected": 403926592.0, + "logps/chosen": -457.5039876302083, + "logps/rejected": -452.1947937011719, + "loss": 0.02, + "rewards/chosen": 3.8446839650472007, + "rewards/margins": 13.512873967488607, + "rewards/rejected": -9.668190002441406, + "step": 6397 + }, + { + "epoch": 0.5845591594335313, + "grad_norm": 3.765625, + "kl": 0.0, + "learning_rate": 3.7002347019097872e-06, + "logits/chosen": 767590997.3333334, + "logits/rejected": 354467635.2, + "logps/chosen": -575.7471110026041, + "logps/rejected": -419.6818359375, + "loss": 0.0185, + "rewards/chosen": 3.131594975789388, + "rewards/margins": 12.001706059773763, + "rewards/rejected": -8.870111083984375, + "step": 6398 + }, + { + "epoch": 0.5846505253540429, + "grad_norm": 2.59375, + "kl": 0.0, + "learning_rate": 3.6988463855818236e-06, + "logits/chosen": 353078144.0, + "logits/rejected": 386230912.0, + "logps/chosen": -287.53289794921875, + "logps/rejected": -424.5926513671875, + "loss": 0.0149, + "rewards/chosen": 3.789754629135132, + "rewards/margins": 14.286750555038452, + "rewards/rejected": -10.49699592590332, + "step": 6399 + }, + { + "epoch": 0.5847418912745546, + "grad_norm": 3.84375, + "kl": 0.0, + "learning_rate": 3.6974581768472075e-06, + "logits/chosen": 591373260.8, + "logits/rejected": 497147946.6666667, + "logps/chosen": -348.493310546875, + "logps/rejected": -436.0509847005208, + "loss": 0.0225, + "rewards/chosen": 3.8554050445556642, + "rewards/margins": 11.92325642903646, + "rewards/rejected": -8.067851384480795, + "step": 6400 + }, + { + "epoch": 0.5848332571950663, + "grad_norm": 4.375, + "kl": 0.0, + "learning_rate": 3.6960700758207326e-06, + "logits/chosen": 693583462.4, + "logits/rejected": 303824682.6666667, + "logps/chosen": -357.3255859375, + "logps/rejected": -482.0503743489583, + "loss": 0.0265, + "rewards/chosen": 3.4618766784667967, + "rewards/margins": 12.96981824239095, + "rewards/rejected": -9.507941563924154, + "step": 6401 + }, + { + "epoch": 0.5849246231155779, + "grad_norm": 1.65625, + "kl": 0.0, + "learning_rate": 3.6946820826171804e-06, + "logits/chosen": 928637781.3333334, + "logits/rejected": 780839833.6, + "logps/chosen": -405.37744140625, + "logps/rejected": -478.67255859375, + "loss": 0.0045, + "rewards/chosen": 5.290015538533528, + "rewards/margins": 13.869141515096029, + "rewards/rejected": -8.5791259765625, + "step": 6402 + }, + { + "epoch": 0.5850159890360895, + "grad_norm": 1.625, + "kl": 0.0, + "learning_rate": 3.693294197351327e-06, + "logits/chosen": 623787456.0, + "logits/rejected": 288730922.6666667, + "logps/chosen": -317.50970458984375, + "logps/rejected": -494.74267578125, + "loss": 0.0079, + "rewards/chosen": 3.555255174636841, + "rewards/margins": 14.188124895095825, + "rewards/rejected": -10.632869720458984, + "step": 6403 + }, + { + "epoch": 0.5851073549566012, + "grad_norm": 9.375, + "kl": 9.479194641113281, + "learning_rate": 3.691906420137936e-06, + "logits/chosen": 496856832.0, + "logps/chosen": -422.6373291015625, + "loss": 0.0863, + "rewards/chosen": 3.5423731803894043, + "step": 6404 + }, + { + "epoch": 0.5851987208771129, + "grad_norm": 3.953125, + "kl": 0.0, + "learning_rate": 3.6905187510917637e-06, + "logits/chosen": 471926357.3333333, + "logits/rejected": 460383488.0, + "logps/chosen": -294.44276936848956, + "logps/rejected": -514.96142578125, + "loss": 0.0247, + "rewards/chosen": 3.646815617879232, + "rewards/margins": 14.71674378712972, + "rewards/rejected": -11.069928169250488, + "step": 6405 + }, + { + "epoch": 0.5852900867976245, + "grad_norm": 2.71875, + "kl": 0.0, + "learning_rate": 3.6891311903275593e-06, + "logits/chosen": 520487219.2, + "logits/rejected": 360237013.3333333, + "logps/chosen": -237.20283203125, + "logps/rejected": -340.43511962890625, + "loss": 0.0196, + "rewards/chosen": 3.8185577392578125, + "rewards/margins": 11.61159833272298, + "rewards/rejected": -7.793040593465169, + "step": 6406 + }, + { + "epoch": 0.5853814527181361, + "grad_norm": 3.96875, + "kl": 0.0, + "learning_rate": 3.687743737960059e-06, + "logits/chosen": 895090688.0, + "logits/rejected": 652176588.8, + "logps/chosen": -287.452880859375, + "logps/rejected": -305.61748046875, + "loss": 0.0289, + "rewards/chosen": 2.8211441040039062, + "rewards/margins": 9.953466796875, + "rewards/rejected": -7.132322692871094, + "step": 6407 + }, + { + "epoch": 0.5854728186386478, + "grad_norm": 4.375, + "kl": 0.0, + "learning_rate": 3.6863563941039953e-06, + "logits/chosen": 479524608.0, + "logits/rejected": 458689536.0, + "logps/chosen": -476.6053466796875, + "logps/rejected": -447.962646484375, + "loss": 0.0136, + "rewards/chosen": 3.9562668800354004, + "rewards/margins": 12.238033771514893, + "rewards/rejected": -8.281766891479492, + "step": 6408 + }, + { + "epoch": 0.5855641845591595, + "grad_norm": 4.15625, + "kl": 0.0, + "learning_rate": 3.6849691588740855e-06, + "logits/chosen": 575022592.0, + "logits/rejected": 472069824.0, + "logps/chosen": -432.9867757161458, + "logps/rejected": -563.9322509765625, + "loss": 0.0278, + "rewards/chosen": 3.4766181310017905, + "rewards/margins": 13.194510777791342, + "rewards/rejected": -9.71789264678955, + "step": 6409 + }, + { + "epoch": 0.5856555504796711, + "grad_norm": 4.09375, + "kl": 0.0, + "learning_rate": 3.6835820323850415e-06, + "logits/chosen": 546453589.3333334, + "logits/rejected": 462532147.2, + "logps/chosen": -242.46537272135416, + "logps/rejected": -542.0357421875, + "loss": 0.0259, + "rewards/chosen": 3.488539695739746, + "rewards/margins": 13.498063468933106, + "rewards/rejected": -10.00952377319336, + "step": 6410 + }, + { + "epoch": 0.5857469164001827, + "grad_norm": 6.53125, + "kl": 0.0, + "learning_rate": 3.6821950147515672e-06, + "logits/chosen": 782654208.0, + "logits/rejected": 1016584448.0, + "logps/chosen": -308.6545104980469, + "logps/rejected": -673.3070678710938, + "loss": 0.0905, + "rewards/chosen": 4.203269004821777, + "rewards/margins": 12.128807067871094, + "rewards/rejected": -7.925538063049316, + "step": 6411 + }, + { + "epoch": 0.5858382823206943, + "grad_norm": 4.28125, + "kl": 0.0, + "learning_rate": 3.680808106088355e-06, + "logits/chosen": 558439116.8, + "logits/rejected": 295778730.6666667, + "logps/chosen": -343.641650390625, + "logps/rejected": -351.9581298828125, + "loss": 0.0236, + "rewards/chosen": 3.9799407958984374, + "rewards/margins": 14.841763559977213, + "rewards/rejected": -10.861822764078775, + "step": 6412 + }, + { + "epoch": 0.5859296482412061, + "grad_norm": 2.078125, + "kl": 0.0, + "learning_rate": 3.679421306510089e-06, + "logits/chosen": 686960384.0, + "logits/rejected": 591297408.0, + "logps/chosen": -349.5414123535156, + "logps/rejected": -482.8707275390625, + "loss": 0.0111, + "rewards/chosen": 3.9482173919677734, + "rewards/margins": 13.746389389038086, + "rewards/rejected": -9.798171997070312, + "step": 6413 + }, + { + "epoch": 0.5860210141617177, + "grad_norm": 3.03125, + "kl": 0.0, + "learning_rate": 3.678034616131447e-06, + "logits/chosen": 876840512.0, + "logits/rejected": 359106912.0, + "logps/chosen": -221.42959594726562, + "logps/rejected": -442.01300048828125, + "loss": 0.0266, + "rewards/chosen": 3.4166572093963623, + "rewards/margins": 12.718010663986206, + "rewards/rejected": -9.301353454589844, + "step": 6414 + }, + { + "epoch": 0.5861123800822293, + "grad_norm": 2.984375, + "kl": 0.0, + "learning_rate": 3.676648035067093e-06, + "logits/chosen": 559385600.0, + "logits/rejected": 531327146.6666667, + "logps/chosen": -335.63076171875, + "logps/rejected": -314.28955078125, + "loss": 0.0184, + "rewards/chosen": 3.8962512969970704, + "rewards/margins": 11.953030522664388, + "rewards/rejected": -8.056779225667318, + "step": 6415 + }, + { + "epoch": 0.5862037460027409, + "grad_norm": 1.34375, + "kl": 0.0, + "learning_rate": 3.6752615634316863e-06, + "logits/chosen": 396483754.6666667, + "logits/rejected": 386159974.4, + "logps/chosen": -234.44498697916666, + "logps/rejected": -485.554248046875, + "loss": 0.0062, + "rewards/chosen": 4.85487683614095, + "rewards/margins": 14.732411321004232, + "rewards/rejected": -9.877534484863281, + "step": 6416 + }, + { + "epoch": 0.5862951119232527, + "grad_norm": 3.328125, + "kl": 0.0, + "learning_rate": 3.6738752013398726e-06, + "logits/chosen": 819111509.3333334, + "logits/rejected": 611004928.0, + "logps/chosen": -353.7115885416667, + "logps/rejected": -703.6916015625, + "loss": 0.0173, + "rewards/chosen": 3.1608988444010415, + "rewards/margins": 13.616092173258464, + "rewards/rejected": -10.455193328857423, + "step": 6417 + }, + { + "epoch": 0.5863864778437643, + "grad_norm": 1.7421875, + "kl": 0.0, + "learning_rate": 3.6724889489062953e-06, + "logits/chosen": 312084160.0, + "logits/rejected": 436187172.5714286, + "logps/chosen": -415.4478759765625, + "logps/rejected": -492.87339564732144, + "loss": 0.006, + "rewards/chosen": 3.007354736328125, + "rewards/margins": 13.09116690499442, + "rewards/rejected": -10.083812168666295, + "step": 6418 + }, + { + "epoch": 0.5864778437642759, + "grad_norm": 1.1015625, + "kl": 0.0, + "learning_rate": 3.67110280624558e-06, + "logits/chosen": 517680064.0, + "logits/rejected": 468197632.0, + "logps/chosen": -221.8445281982422, + "logps/rejected": -454.2176208496094, + "loss": 0.1257, + "rewards/chosen": 2.3944668769836426, + "rewards/margins": 14.101891994476318, + "rewards/rejected": -11.707425117492676, + "step": 6419 + }, + { + "epoch": 0.5865692096847875, + "grad_norm": 2.921875, + "kl": 0.0, + "learning_rate": 3.6697167734723517e-06, + "logits/chosen": 315322176.0, + "logits/rejected": 402373760.0, + "logps/chosen": -351.1789855957031, + "logps/rejected": -460.3904622395833, + "loss": 0.0123, + "rewards/chosen": 3.915719509124756, + "rewards/margins": 12.43599017461141, + "rewards/rejected": -8.520270665486654, + "step": 6420 + }, + { + "epoch": 0.5866605756052993, + "grad_norm": 2.359375, + "kl": 0.0, + "learning_rate": 3.6683308507012196e-06, + "logits/chosen": 833196416.0, + "logits/rejected": 594862656.0, + "logps/chosen": -396.55029296875, + "logps/rejected": -451.8285827636719, + "loss": 0.0147, + "rewards/chosen": 4.613907337188721, + "rewards/margins": 12.834936618804932, + "rewards/rejected": -8.221029281616211, + "step": 6421 + }, + { + "epoch": 0.5867519415258109, + "grad_norm": 3.234375, + "kl": 0.0, + "learning_rate": 3.6669450380467898e-06, + "logits/chosen": 504306858.6666667, + "logits/rejected": 478133760.0, + "logps/chosen": -375.7514241536458, + "logps/rejected": -534.045263671875, + "loss": 0.0135, + "rewards/chosen": 4.351357777913411, + "rewards/margins": 12.907188161214194, + "rewards/rejected": -8.555830383300782, + "step": 6422 + }, + { + "epoch": 0.5868433074463225, + "grad_norm": 2.109375, + "kl": 0.0, + "learning_rate": 3.665559335623653e-06, + "logits/chosen": 770979328.0, + "logits/rejected": 565008076.8, + "logps/chosen": -223.8418172200521, + "logps/rejected": -461.789892578125, + "loss": 0.0136, + "rewards/chosen": 4.229455629984538, + "rewards/margins": 13.582710329691569, + "rewards/rejected": -9.353254699707032, + "step": 6423 + }, + { + "epoch": 0.5869346733668341, + "grad_norm": 1.40625, + "kl": 0.0, + "learning_rate": 3.6641737435463953e-06, + "logits/chosen": 702848448.0, + "logits/rejected": 410944448.0, + "logps/chosen": -434.9144287109375, + "logps/rejected": -598.2420654296875, + "loss": 0.0099, + "rewards/chosen": 4.170872688293457, + "rewards/margins": 14.165095329284668, + "rewards/rejected": -9.994222640991211, + "step": 6424 + }, + { + "epoch": 0.5870260392873459, + "grad_norm": 3.9375, + "kl": 0.0, + "learning_rate": 3.662788261929593e-06, + "logits/chosen": 413736490.6666667, + "logits/rejected": 752744768.0, + "logps/chosen": -342.1005859375, + "logps/rejected": -502.04498291015625, + "loss": 0.0254, + "rewards/chosen": 3.691549301147461, + "rewards/margins": 13.084735870361328, + "rewards/rejected": -9.393186569213867, + "step": 6425 + }, + { + "epoch": 0.5871174052078575, + "grad_norm": 1.734375, + "kl": 0.0, + "learning_rate": 3.6614028908878107e-06, + "logits/chosen": 678372928.0, + "logits/rejected": 444776832.0, + "logps/chosen": -444.9478759765625, + "logps/rejected": -351.96905517578125, + "loss": 0.0094, + "rewards/chosen": 4.298836708068848, + "rewards/margins": 13.292880058288574, + "rewards/rejected": -8.994043350219727, + "step": 6426 + }, + { + "epoch": 0.5872087711283691, + "grad_norm": 3.640625, + "kl": 0.0, + "learning_rate": 3.6600176305356076e-06, + "logits/chosen": 550691072.0, + "logits/rejected": 1118680704.0, + "logps/chosen": -339.45330810546875, + "logps/rejected": -584.8634643554688, + "loss": 0.0232, + "rewards/chosen": 3.4489359855651855, + "rewards/margins": 11.638582706451416, + "rewards/rejected": -8.18964672088623, + "step": 6427 + }, + { + "epoch": 0.5873001370488807, + "grad_norm": 20.0, + "kl": 0.0, + "learning_rate": 3.658632480987531e-06, + "logits/chosen": 346094957.71428573, + "logits/rejected": 406510208.0, + "logps/chosen": -278.7021484375, + "logps/rejected": -257.50897216796875, + "loss": 0.1125, + "rewards/chosen": 3.4020674569266185, + "rewards/margins": 10.476184163774763, + "rewards/rejected": -7.0741167068481445, + "step": 6428 + }, + { + "epoch": 0.5873915029693925, + "grad_norm": 2.75, + "kl": 0.0, + "learning_rate": 3.6572474423581207e-06, + "logits/chosen": 452596266.6666667, + "logits/rejected": 359407808.0, + "logps/chosen": -267.037841796875, + "logps/rejected": -287.07763671875, + "loss": 0.0201, + "rewards/chosen": 3.8979361852010093, + "rewards/margins": 11.413810094197592, + "rewards/rejected": -7.515873908996582, + "step": 6429 + }, + { + "epoch": 0.5874828688899041, + "grad_norm": 3.5, + "kl": 0.0, + "learning_rate": 3.6558625147619055e-06, + "logits/chosen": 433850688.0, + "logits/rejected": 313964640.0, + "logps/chosen": -340.6041259765625, + "logps/rejected": -475.61962890625, + "loss": 0.0287, + "rewards/chosen": 2.8474931716918945, + "rewards/margins": 14.80500602722168, + "rewards/rejected": -11.957512855529785, + "step": 6430 + }, + { + "epoch": 0.5875742348104157, + "grad_norm": 2.59375, + "kl": 0.0, + "learning_rate": 3.6544776983134074e-06, + "logits/chosen": 558209689.6, + "logits/rejected": 419949738.6666667, + "logps/chosen": -257.7849853515625, + "logps/rejected": -488.7855224609375, + "loss": 0.0221, + "rewards/chosen": 3.635201263427734, + "rewards/margins": 13.879610188802083, + "rewards/rejected": -10.24440892537435, + "step": 6431 + }, + { + "epoch": 0.5876656007309273, + "grad_norm": 3.875, + "kl": 0.0, + "learning_rate": 3.653092993127136e-06, + "logits/chosen": 564274892.8, + "logits/rejected": 748909141.3333334, + "logps/chosen": -280.0609130859375, + "logps/rejected": -657.7549641927084, + "loss": 0.0304, + "rewards/chosen": 4.181985473632812, + "rewards/margins": 14.911515808105468, + "rewards/rejected": -10.729530334472656, + "step": 6432 + }, + { + "epoch": 0.5877569666514391, + "grad_norm": 3.578125, + "kl": 0.0, + "learning_rate": 3.6517083993175956e-06, + "logits/chosen": 411851008.0, + "logits/rejected": 392329045.3333333, + "logps/chosen": -360.1727294921875, + "logps/rejected": -353.8076578776042, + "loss": 0.0103, + "rewards/chosen": 4.257976531982422, + "rewards/margins": 12.870762507120768, + "rewards/rejected": -8.612785975138346, + "step": 6433 + }, + { + "epoch": 0.5878483325719507, + "grad_norm": 4.6875, + "kl": 0.0, + "learning_rate": 3.6503239169992778e-06, + "logits/chosen": 650349209.6, + "logits/rejected": 342335338.6666667, + "logps/chosen": -423.304052734375, + "logps/rejected": -492.8387858072917, + "loss": 0.0414, + "rewards/chosen": 2.9914714813232424, + "rewards/margins": 13.456788253784179, + "rewards/rejected": -10.465316772460938, + "step": 6434 + }, + { + "epoch": 0.5879396984924623, + "grad_norm": 3.796875, + "kl": 0.0, + "learning_rate": 3.6489395462866694e-06, + "logits/chosen": 409322272.0, + "logits/rejected": 609433770.6666666, + "logps/chosen": -384.91729736328125, + "logps/rejected": -771.9627278645834, + "loss": 0.0103, + "rewards/chosen": 4.1192474365234375, + "rewards/margins": 13.460933049519857, + "rewards/rejected": -9.34168561299642, + "step": 6435 + }, + { + "epoch": 0.5880310644129739, + "grad_norm": 3.296875, + "kl": 0.0, + "learning_rate": 3.64755528729424e-06, + "logits/chosen": 624264448.0, + "logits/rejected": 761556377.6, + "logps/chosen": -445.2286783854167, + "logps/rejected": -507.985009765625, + "loss": 0.0124, + "rewards/chosen": 4.744001388549805, + "rewards/margins": 15.673664474487305, + "rewards/rejected": -10.9296630859375, + "step": 6436 + }, + { + "epoch": 0.5881224303334857, + "grad_norm": 10.1875, + "kl": 0.0, + "learning_rate": 3.646171140136459e-06, + "logits/chosen": 788882432.0, + "logits/rejected": 885959680.0, + "logps/chosen": -334.0200500488281, + "logps/rejected": -680.1143188476562, + "loss": 0.2214, + "rewards/chosen": 2.544501781463623, + "rewards/margins": 10.692519664764404, + "rewards/rejected": -8.148017883300781, + "step": 6437 + }, + { + "epoch": 0.5882137962539973, + "grad_norm": 1.90625, + "kl": 0.0, + "learning_rate": 3.64478710492778e-06, + "logits/chosen": 346640457.14285713, + "logits/rejected": 906628352.0, + "logps/chosen": -268.37098911830356, + "logps/rejected": -247.23806762695312, + "loss": 0.013, + "rewards/chosen": 4.5864377702985495, + "rewards/margins": 14.867721148899623, + "rewards/rejected": -10.281283378601074, + "step": 6438 + }, + { + "epoch": 0.5883051621745089, + "grad_norm": 2.640625, + "kl": 0.0, + "learning_rate": 3.6434031817826522e-06, + "logits/chosen": 556831360.0, + "logits/rejected": 308025280.0, + "logps/chosen": -249.95294189453125, + "logps/rejected": -351.9957275390625, + "loss": 0.016, + "rewards/chosen": 3.716856002807617, + "rewards/margins": 13.046939849853516, + "rewards/rejected": -9.330083847045898, + "step": 6439 + }, + { + "epoch": 0.5883965280950205, + "grad_norm": 4.09375, + "kl": 0.0, + "learning_rate": 3.6420193708155104e-06, + "logits/chosen": 439747993.6, + "logits/rejected": 227182720.0, + "logps/chosen": -425.0283203125, + "logps/rejected": -619.9224446614584, + "loss": 0.027, + "rewards/chosen": 3.2085346221923827, + "rewards/margins": 12.805102411905924, + "rewards/rejected": -9.596567789713541, + "step": 6440 + }, + { + "epoch": 0.5884878940155323, + "grad_norm": 2.5, + "kl": 0.0, + "learning_rate": 3.6406356721407863e-06, + "logits/chosen": 900593536.0, + "logits/rejected": 387996224.0, + "logps/chosen": -353.4693908691406, + "logps/rejected": -543.4217529296875, + "loss": 0.0151, + "rewards/chosen": 3.651175022125244, + "rewards/margins": 14.473567485809326, + "rewards/rejected": -10.822392463684082, + "step": 6441 + }, + { + "epoch": 0.5885792599360439, + "grad_norm": 1.3125, + "kl": 0.0, + "learning_rate": 3.6392520858728964e-06, + "logits/chosen": 1430758058.6666667, + "logits/rejected": 671780249.6, + "logps/chosen": -522.354736328125, + "logps/rejected": -587.4955078125, + "loss": 0.0052, + "rewards/chosen": 4.402156511942546, + "rewards/margins": 15.704686419169107, + "rewards/rejected": -11.302529907226562, + "step": 6442 + }, + { + "epoch": 0.5886706258565555, + "grad_norm": 41.5, + "kl": 0.0, + "learning_rate": 3.637868612126252e-06, + "logits/chosen": 390906624.0, + "logits/rejected": 513801760.0, + "logps/chosen": -266.54616292317706, + "logps/rejected": -539.0838623046875, + "loss": 0.0697, + "rewards/chosen": 2.557612737019857, + "rewards/margins": 10.888352711995443, + "rewards/rejected": -8.330739974975586, + "step": 6443 + }, + { + "epoch": 0.5887619917770671, + "grad_norm": 2.109375, + "kl": 0.0, + "learning_rate": 3.636485251015253e-06, + "logits/chosen": 433190208.0, + "logits/rejected": 411257600.0, + "logps/chosen": -254.73028564453125, + "logps/rejected": -476.21771240234375, + "loss": 0.0149, + "rewards/chosen": 4.058935165405273, + "rewards/margins": 12.973782539367676, + "rewards/rejected": -8.914847373962402, + "step": 6444 + }, + { + "epoch": 0.5888533576975788, + "grad_norm": 0.2431640625, + "kl": 0.0, + "learning_rate": 3.6351020026542897e-06, + "logits/chosen": 192563824.0, + "logits/rejected": 533386313.14285713, + "logps/chosen": -161.9395294189453, + "logps/rejected": -438.996826171875, + "loss": 0.0012, + "rewards/chosen": 4.941230773925781, + "rewards/margins": 14.807474408830915, + "rewards/rejected": -9.866243634905134, + "step": 6445 + }, + { + "epoch": 0.5889447236180905, + "grad_norm": 0.890625, + "kl": 0.0, + "learning_rate": 3.6337188671577463e-06, + "logits/chosen": 333403093.3333333, + "logits/rejected": 261586278.4, + "logps/chosen": -189.578125, + "logps/rejected": -363.8941650390625, + "loss": 0.0061, + "rewards/chosen": 4.424750328063965, + "rewards/margins": 14.161849784851075, + "rewards/rejected": -9.73709945678711, + "step": 6446 + }, + { + "epoch": 0.5890360895386021, + "grad_norm": 5.71875, + "kl": 0.0, + "learning_rate": 3.6323358446399914e-06, + "logits/chosen": 219946224.0, + "logits/rejected": 384010880.0, + "logps/chosen": -209.7795867919922, + "logps/rejected": -324.3634847005208, + "loss": 0.008, + "rewards/chosen": 5.055849075317383, + "rewards/margins": 13.260327021280924, + "rewards/rejected": -8.204477945963541, + "step": 6447 + }, + { + "epoch": 0.5891274554591137, + "grad_norm": 2.046875, + "kl": 0.0, + "learning_rate": 3.6309529352153925e-06, + "logits/chosen": 249370645.33333334, + "logits/rejected": 271826688.0, + "logps/chosen": -360.1490478515625, + "logps/rejected": -422.6267578125, + "loss": 0.0113, + "rewards/chosen": 4.561463673909505, + "rewards/margins": 15.640182240804037, + "rewards/rejected": -11.078718566894532, + "step": 6448 + }, + { + "epoch": 0.5892188213796254, + "grad_norm": 3.125, + "kl": 0.0, + "learning_rate": 3.6295701389982997e-06, + "logits/chosen": 707676288.0, + "logits/rejected": 532972896.0, + "logps/chosen": -318.4991455078125, + "logps/rejected": -433.02655029296875, + "loss": 0.0232, + "rewards/chosen": 3.2656822204589844, + "rewards/margins": 9.355636596679688, + "rewards/rejected": -6.089954376220703, + "step": 6449 + }, + { + "epoch": 0.5893101873001371, + "grad_norm": 3.203125, + "kl": 0.0, + "learning_rate": 3.6281874561030607e-06, + "logits/chosen": 365932105.14285713, + "logits/rejected": 257177728.0, + "logps/chosen": -251.53424944196428, + "logps/rejected": -443.82269287109375, + "loss": 0.031, + "rewards/chosen": 3.8978233337402344, + "rewards/margins": 13.852193832397461, + "rewards/rejected": -9.954370498657227, + "step": 6450 + }, + { + "epoch": 0.5894015532206487, + "grad_norm": 4.84375, + "kl": 0.0, + "learning_rate": 3.6268048866440075e-06, + "logits/chosen": 487143104.0, + "logits/rejected": 596677290.6666666, + "logps/chosen": -393.15850830078125, + "logps/rejected": -414.0946044921875, + "loss": 0.0129, + "rewards/chosen": 3.954876661300659, + "rewards/margins": 11.689297437667847, + "rewards/rejected": -7.7344207763671875, + "step": 6451 + }, + { + "epoch": 0.5894929191411603, + "grad_norm": 3.109375, + "kl": 0.0, + "learning_rate": 3.625422430735468e-06, + "logits/chosen": 528914688.0, + "logits/rejected": 628507776.0, + "logps/chosen": -304.42885335286456, + "logps/rejected": -778.439208984375, + "loss": 0.0242, + "rewards/chosen": 3.9416586558024087, + "rewards/margins": 12.132589022318522, + "rewards/rejected": -8.190930366516113, + "step": 6452 + }, + { + "epoch": 0.589584285061672, + "grad_norm": 1.84375, + "kl": 0.0, + "learning_rate": 3.624040088491757e-06, + "logits/chosen": 467636326.4, + "logits/rejected": 477189589.3333333, + "logps/chosen": -322.875341796875, + "logps/rejected": -568.6516927083334, + "loss": 0.0116, + "rewards/chosen": 4.170708847045899, + "rewards/margins": 14.8452579498291, + "rewards/rejected": -10.674549102783203, + "step": 6453 + }, + { + "epoch": 0.5896756509821837, + "grad_norm": 0.515625, + "kl": 0.0, + "learning_rate": 3.6226578600271835e-06, + "logits/chosen": 243600768.0, + "logits/rejected": 424677973.3333333, + "logps/chosen": -118.14930725097656, + "logps/rejected": -385.5734049479167, + "loss": 0.0029, + "rewards/chosen": 4.5855278968811035, + "rewards/margins": 13.202402591705322, + "rewards/rejected": -8.616874694824219, + "step": 6454 + }, + { + "epoch": 0.5897670169026953, + "grad_norm": 0.453125, + "kl": 0.0, + "learning_rate": 3.6212757454560433e-06, + "logits/chosen": 324677280.0, + "logits/rejected": 478246582.85714287, + "logps/chosen": -170.97140502929688, + "logps/rejected": -579.0789271763393, + "loss": 0.0022, + "rewards/chosen": 4.3671112060546875, + "rewards/margins": 13.486930847167969, + "rewards/rejected": -9.119819641113281, + "step": 6455 + }, + { + "epoch": 0.5898583828232069, + "grad_norm": 1.921875, + "kl": 0.0, + "learning_rate": 3.619893744892624e-06, + "logits/chosen": 458622549.3333333, + "logits/rejected": 494476492.8, + "logps/chosen": -227.773681640625, + "logps/rejected": -386.6318359375, + "loss": 0.0115, + "rewards/chosen": 4.2512868245442705, + "rewards/margins": 12.789119466145834, + "rewards/rejected": -8.537832641601563, + "step": 6456 + }, + { + "epoch": 0.5899497487437186, + "grad_norm": 2.640625, + "kl": 0.0, + "learning_rate": 3.6185118584512058e-06, + "logits/chosen": 747909376.0, + "logits/rejected": 554480128.0, + "logps/chosen": -494.1628011067708, + "logps/rejected": -546.6732421875, + "loss": 0.0109, + "rewards/chosen": 3.883753458658854, + "rewards/margins": 12.636380259195963, + "rewards/rejected": -8.75262680053711, + "step": 6457 + }, + { + "epoch": 0.5900411146642303, + "grad_norm": 1.6796875, + "kl": 0.0, + "learning_rate": 3.617130086246056e-06, + "logits/chosen": 591228518.4, + "logits/rejected": 596222805.3333334, + "logps/chosen": -229.741064453125, + "logps/rejected": -649.6824137369791, + "loss": 0.0094, + "rewards/chosen": 4.685481262207031, + "rewards/margins": 13.803752136230468, + "rewards/rejected": -9.118270874023438, + "step": 6458 + }, + { + "epoch": 0.5901324805847419, + "grad_norm": 3.046875, + "kl": 0.0, + "learning_rate": 3.6157484283914367e-06, + "logits/chosen": 489336149.3333333, + "logits/rejected": 222144864.0, + "logps/chosen": -217.73372395833334, + "logps/rejected": -369.26605224609375, + "loss": 0.0347, + "rewards/chosen": 3.3157386779785156, + "rewards/margins": 14.86440658569336, + "rewards/rejected": -11.548667907714844, + "step": 6459 + }, + { + "epoch": 0.5902238465052535, + "grad_norm": 3.25, + "kl": 0.0, + "learning_rate": 3.6143668850015964e-06, + "logits/chosen": 445988812.8, + "logits/rejected": 465778474.6666667, + "logps/chosen": -296.3728759765625, + "logps/rejected": -593.238525390625, + "loss": 0.0183, + "rewards/chosen": 3.852194595336914, + "rewards/margins": 17.1670711517334, + "rewards/rejected": -13.314876556396484, + "step": 6460 + }, + { + "epoch": 0.5903152124257652, + "grad_norm": 3.421875, + "kl": 0.0, + "learning_rate": 3.6129854561907786e-06, + "logits/chosen": 362847667.2, + "logits/rejected": 341881642.6666667, + "logps/chosen": -295.7353271484375, + "logps/rejected": -227.616455078125, + "loss": 0.0316, + "rewards/chosen": 3.777201461791992, + "rewards/margins": 11.405429967244466, + "rewards/rejected": -7.628228505452474, + "step": 6461 + }, + { + "epoch": 0.5904065783462769, + "grad_norm": 2.0, + "kl": 0.0, + "learning_rate": 3.6116041420732106e-06, + "logits/chosen": 372397107.2, + "logits/rejected": 156068906.66666666, + "logps/chosen": -196.989013671875, + "logps/rejected": -277.74835205078125, + "loss": 0.1351, + "rewards/chosen": 2.6172159194946287, + "rewards/margins": 10.52590986887614, + "rewards/rejected": -7.908693949381511, + "step": 6462 + }, + { + "epoch": 0.5904979442667885, + "grad_norm": 5.15625, + "kl": 0.0, + "learning_rate": 3.610222942763116e-06, + "logits/chosen": 567108480.0, + "logits/rejected": 574503872.0, + "logps/chosen": -409.9151204427083, + "logps/rejected": -814.648681640625, + "loss": 0.0381, + "rewards/chosen": 3.271498998006185, + "rewards/margins": 14.6278928120931, + "rewards/rejected": -11.356393814086914, + "step": 6463 + }, + { + "epoch": 0.5905893101873001, + "grad_norm": 2.15625, + "kl": 0.0, + "learning_rate": 3.608841858374708e-06, + "logits/chosen": 663806549.3333334, + "logits/rejected": 575478579.2, + "logps/chosen": -409.5125732421875, + "logps/rejected": -580.782421875, + "loss": 0.0123, + "rewards/chosen": 3.6162405014038086, + "rewards/margins": 13.111446952819824, + "rewards/rejected": -9.495206451416015, + "step": 6464 + }, + { + "epoch": 0.5906806761078118, + "grad_norm": 2.46875, + "kl": 0.0, + "learning_rate": 3.60746088902219e-06, + "logits/chosen": 671129429.3333334, + "logits/rejected": 275310924.8, + "logps/chosen": -468.3810628255208, + "logps/rejected": -418.13349609375, + "loss": 0.0142, + "rewards/chosen": 3.332465489705404, + "rewards/margins": 11.27946408589681, + "rewards/rejected": -7.946998596191406, + "step": 6465 + }, + { + "epoch": 0.5907720420283235, + "grad_norm": 65.5, + "kl": 0.0, + "learning_rate": 3.6060800348197524e-06, + "logits/chosen": 470864054.85714287, + "logits/rejected": 390761408.0, + "logps/chosen": -330.68143136160717, + "logps/rejected": -676.953857421875, + "loss": 0.0862, + "rewards/chosen": 3.4234275817871094, + "rewards/margins": 13.368258476257324, + "rewards/rejected": -9.944830894470215, + "step": 6466 + }, + { + "epoch": 0.5908634079488351, + "grad_norm": 2.515625, + "kl": 0.0, + "learning_rate": 3.604699295881582e-06, + "logits/chosen": 363717632.0, + "logits/rejected": 503726400.0, + "logps/chosen": -239.23758370535714, + "logps/rejected": -361.3750915527344, + "loss": 0.0272, + "rewards/chosen": 3.800614765712193, + "rewards/margins": 10.547639301845006, + "rewards/rejected": -6.7470245361328125, + "step": 6467 + }, + { + "epoch": 0.5909547738693467, + "grad_norm": 2.046875, + "kl": 0.0, + "learning_rate": 3.6033186723218503e-06, + "logits/chosen": 363414016.0, + "logits/rejected": 159804576.0, + "logps/chosen": -279.7415771484375, + "logps/rejected": -328.8769226074219, + "loss": 0.0166, + "rewards/chosen": 4.048236846923828, + "rewards/margins": 14.064680099487305, + "rewards/rejected": -10.016443252563477, + "step": 6468 + }, + { + "epoch": 0.5910461397898584, + "grad_norm": 2.59375, + "kl": 0.0, + "learning_rate": 3.6019381642547262e-06, + "logits/chosen": 402517216.0, + "logits/rejected": 760923584.0, + "logps/chosen": -248.9547119140625, + "logps/rejected": -682.270751953125, + "loss": 0.0167, + "rewards/chosen": 4.035528182983398, + "rewards/margins": 13.9315185546875, + "rewards/rejected": -9.895990371704102, + "step": 6469 + }, + { + "epoch": 0.59113750571037, + "grad_norm": 0.88671875, + "kl": 0.0, + "learning_rate": 3.6005577717943607e-06, + "logits/chosen": 277878464.0, + "logits/rejected": 442994218.6666667, + "logps/chosen": -224.25946044921875, + "logps/rejected": -512.8170572916666, + "loss": 0.0044, + "rewards/chosen": 4.187989711761475, + "rewards/margins": 13.027055263519287, + "rewards/rejected": -8.839065551757812, + "step": 6470 + }, + { + "epoch": 0.5912288716308817, + "grad_norm": 0.89453125, + "kl": 0.0, + "learning_rate": 3.599177495054903e-06, + "logits/chosen": 521618432.0, + "logits/rejected": 585943722.6666666, + "logps/chosen": -246.5442138671875, + "logps/rejected": -443.4871012369792, + "loss": 0.0074, + "rewards/chosen": 4.624777984619141, + "rewards/margins": 13.432449849446616, + "rewards/rejected": -8.807671864827475, + "step": 6471 + }, + { + "epoch": 0.5913202375513933, + "grad_norm": 2.40625, + "kl": 0.0, + "learning_rate": 3.597797334150487e-06, + "logits/chosen": 541454482.2857143, + "logits/rejected": 504975584.0, + "logps/chosen": -346.0126255580357, + "logps/rejected": -433.251708984375, + "loss": 0.023, + "rewards/chosen": 3.8128623962402344, + "rewards/margins": 12.980364799499512, + "rewards/rejected": -9.167502403259277, + "step": 6472 + }, + { + "epoch": 0.591411603471905, + "grad_norm": 7.65625, + "kl": 0.0, + "learning_rate": 3.5964172891952397e-06, + "logits/chosen": 735773952.0, + "logits/rejected": 647024960.0, + "logps/chosen": -270.0640869140625, + "logps/rejected": -716.4237060546875, + "loss": 0.1486, + "rewards/chosen": 2.360378901163737, + "rewards/margins": 13.003586451212565, + "rewards/rejected": -10.643207550048828, + "step": 6473 + }, + { + "epoch": 0.5915029693924166, + "grad_norm": 2.578125, + "kl": 0.0, + "learning_rate": 3.5950373603032775e-06, + "logits/chosen": 1014087680.0, + "logits/rejected": 678068437.3333334, + "logps/chosen": -270.90771484375, + "logps/rejected": -724.470703125, + "loss": 0.0238, + "rewards/chosen": 3.4037635803222654, + "rewards/margins": 11.873148091634114, + "rewards/rejected": -8.46938451131185, + "step": 6474 + }, + { + "epoch": 0.5915943353129283, + "grad_norm": 2.03125, + "kl": 0.0, + "learning_rate": 3.5936575475887093e-06, + "logits/chosen": 419870933.3333333, + "logits/rejected": 567045120.0, + "logps/chosen": -273.330078125, + "logps/rejected": -586.09482421875, + "loss": 0.01, + "rewards/chosen": 4.245523134867351, + "rewards/margins": 14.649198977152508, + "rewards/rejected": -10.403675842285157, + "step": 6475 + }, + { + "epoch": 0.5916857012334399, + "grad_norm": 2.453125, + "kl": 0.0, + "learning_rate": 3.592277851165632e-06, + "logits/chosen": 565801088.0, + "logits/rejected": 357377312.0, + "logps/chosen": -433.66796875, + "logps/rejected": -361.3568115234375, + "loss": 0.0104, + "rewards/chosen": 4.297240257263184, + "rewards/margins": 12.244007587432861, + "rewards/rejected": -7.946767330169678, + "step": 6476 + }, + { + "epoch": 0.5917770671539516, + "grad_norm": 2.21875, + "kl": 0.0, + "learning_rate": 3.590898271148132e-06, + "logits/chosen": 559668428.8, + "logits/rejected": 419795328.0, + "logps/chosen": -370.1659912109375, + "logps/rejected": -498.5630289713542, + "loss": 0.0143, + "rewards/chosen": 4.042163848876953, + "rewards/margins": 11.645917256673176, + "rewards/rejected": -7.603753407796224, + "step": 6477 + }, + { + "epoch": 0.5918684330744632, + "grad_norm": 2.921875, + "kl": 0.0, + "learning_rate": 3.5895188076502907e-06, + "logits/chosen": 530419754.6666667, + "logits/rejected": 455241472.0, + "logps/chosen": -302.09320068359375, + "logps/rejected": -476.79501953125, + "loss": 0.0146, + "rewards/chosen": 3.945488611857096, + "rewards/margins": 12.980742518107096, + "rewards/rejected": -9.03525390625, + "step": 6478 + }, + { + "epoch": 0.5919597989949749, + "grad_norm": 3.140625, + "kl": 0.0, + "learning_rate": 3.5881394607861753e-06, + "logits/chosen": 287883878.4, + "logits/rejected": 369125802.6666667, + "logps/chosen": -237.944580078125, + "logps/rejected": -468.5394694010417, + "loss": 0.0254, + "rewards/chosen": 4.077901840209961, + "rewards/margins": 14.582279841105143, + "rewards/rejected": -10.504378000895182, + "step": 6479 + }, + { + "epoch": 0.5920511649154865, + "grad_norm": 3.09375, + "kl": 0.0, + "learning_rate": 3.5867602306698456e-06, + "logits/chosen": 608122709.3333334, + "logits/rejected": 339973888.0, + "logps/chosen": -323.10019938151044, + "logps/rejected": -260.1859130859375, + "loss": 0.0178, + "rewards/chosen": 4.445828437805176, + "rewards/margins": 12.932156562805176, + "rewards/rejected": -8.486328125, + "step": 6480 + }, + { + "epoch": 0.5921425308359982, + "grad_norm": 6.125, + "kl": 1.4871902465820312, + "learning_rate": 3.58538111741535e-06, + "logits/chosen": 763205546.6666666, + "logits/rejected": 1821746048.0, + "logps/chosen": -317.2095947265625, + "logps/rejected": -660.865234375, + "loss": 0.0337, + "rewards/chosen": 3.8703810373942056, + "rewards/margins": 15.752171198527018, + "rewards/rejected": -11.881790161132812, + "step": 6481 + }, + { + "epoch": 0.5922338967565098, + "grad_norm": 2.15625, + "kl": 0.0, + "learning_rate": 3.5840021211367305e-06, + "logits/chosen": 657531545.6, + "logits/rejected": 1405861717.3333333, + "logps/chosen": -189.27099609375, + "logps/rejected": -547.9638264973959, + "loss": 0.0247, + "rewards/chosen": 3.7408294677734375, + "rewards/margins": 14.29365348815918, + "rewards/rejected": -10.552824020385742, + "step": 6482 + }, + { + "epoch": 0.5923252626770215, + "grad_norm": 2.203125, + "kl": 0.0, + "learning_rate": 3.5826232419480147e-06, + "logits/chosen": 719496064.0, + "logits/rejected": 417000928.0, + "logps/chosen": -414.8887634277344, + "logps/rejected": -394.2066345214844, + "loss": 0.015, + "rewards/chosen": 3.5801291465759277, + "rewards/margins": 13.983189105987549, + "rewards/rejected": -10.403059959411621, + "step": 6483 + }, + { + "epoch": 0.5924166285975331, + "grad_norm": 1.7421875, + "kl": 0.0, + "learning_rate": 3.581244479963225e-06, + "logits/chosen": 645975936.0, + "logits/rejected": 469907382.85714287, + "logps/chosen": -414.1658935546875, + "logps/rejected": -614.5676618303571, + "loss": 0.0067, + "rewards/chosen": 2.882830858230591, + "rewards/margins": 14.390847172055926, + "rewards/rejected": -11.508016313825335, + "step": 6484 + }, + { + "epoch": 0.5925079945180448, + "grad_norm": 1.6875, + "kl": 0.0, + "learning_rate": 3.5798658352963722e-06, + "logits/chosen": 493502549.3333333, + "logits/rejected": 482277785.6, + "logps/chosen": -473.5531412760417, + "logps/rejected": -792.38916015625, + "loss": 0.0075, + "rewards/chosen": 4.368653933207194, + "rewards/margins": 16.44703222910563, + "rewards/rejected": -12.078378295898437, + "step": 6485 + }, + { + "epoch": 0.5925993604385564, + "grad_norm": 3.203125, + "kl": 0.0, + "learning_rate": 3.5784873080614567e-06, + "logits/chosen": 340717162.6666667, + "logits/rejected": 268380569.6, + "logps/chosen": -264.665771484375, + "logps/rejected": -454.89228515625, + "loss": 0.0153, + "rewards/chosen": 4.027232487996419, + "rewards/margins": 14.988569768269855, + "rewards/rejected": -10.961337280273437, + "step": 6486 + }, + { + "epoch": 0.592690726359068, + "grad_norm": 3.125, + "kl": 0.0, + "learning_rate": 3.577108898372471e-06, + "logits/chosen": 399512627.2, + "logits/rejected": 410273024.0, + "logps/chosen": -290.920458984375, + "logps/rejected": -435.01513671875, + "loss": 0.0225, + "rewards/chosen": 3.929241180419922, + "rewards/margins": 14.554914474487305, + "rewards/rejected": -10.625673294067383, + "step": 6487 + }, + { + "epoch": 0.5927820922795797, + "grad_norm": 19.75, + "kl": 0.0, + "learning_rate": 3.5757306063433967e-06, + "logits/chosen": 566158378.6666666, + "logits/rejected": 314796928.0, + "logps/chosen": -300.498046875, + "logps/rejected": -355.253466796875, + "loss": 0.0314, + "rewards/chosen": 3.421205202738444, + "rewards/margins": 11.877563540140788, + "rewards/rejected": -8.456358337402344, + "step": 6488 + }, + { + "epoch": 0.5928734582000914, + "grad_norm": 1.65625, + "kl": 0.0, + "learning_rate": 3.5743524320882027e-06, + "logits/chosen": 780612198.4, + "logits/rejected": 471279104.0, + "logps/chosen": -541.420703125, + "logps/rejected": -209.06742350260416, + "loss": 0.0109, + "rewards/chosen": 4.610704040527343, + "rewards/margins": 12.502494430541992, + "rewards/rejected": -7.891790390014648, + "step": 6489 + }, + { + "epoch": 0.592964824120603, + "grad_norm": 4.71875, + "kl": 0.0, + "learning_rate": 3.5729743757208558e-06, + "logits/chosen": 568674688.0, + "logits/rejected": 800592320.0, + "logps/chosen": -321.350830078125, + "logps/rejected": -410.8978271484375, + "loss": 0.0334, + "rewards/chosen": 2.676823854446411, + "rewards/margins": 11.506377458572388, + "rewards/rejected": -8.829553604125977, + "step": 6490 + }, + { + "epoch": 0.5930561900411147, + "grad_norm": 2.65625, + "kl": 0.0, + "learning_rate": 3.571596437355304e-06, + "logits/chosen": 593751125.3333334, + "logits/rejected": 1060020838.4, + "logps/chosen": -367.392822265625, + "logps/rejected": -602.178515625, + "loss": 0.015, + "rewards/chosen": 3.2981897989908853, + "rewards/margins": 12.0948122660319, + "rewards/rejected": -8.796622467041015, + "step": 6491 + }, + { + "epoch": 0.5931475559616263, + "grad_norm": 0.8203125, + "kl": 0.0, + "learning_rate": 3.5702186171054944e-06, + "logits/chosen": 351301792.0, + "logits/rejected": 330791488.0, + "logps/chosen": -329.27581787109375, + "logps/rejected": -499.446044921875, + "loss": 0.0045, + "rewards/chosen": 4.894547462463379, + "rewards/margins": 13.619399070739746, + "rewards/rejected": -8.724851608276367, + "step": 6492 + }, + { + "epoch": 0.593238921882138, + "grad_norm": 2.578125, + "kl": 0.0, + "learning_rate": 3.5688409150853563e-06, + "logits/chosen": 476255232.0, + "logits/rejected": 704884821.3333334, + "logps/chosen": -319.311865234375, + "logps/rejected": -235.53365071614584, + "loss": 0.0165, + "rewards/chosen": 4.069551086425781, + "rewards/margins": 10.676071421305338, + "rewards/rejected": -6.606520334879558, + "step": 6493 + }, + { + "epoch": 0.5933302878026496, + "grad_norm": 5.0625, + "kl": 0.0, + "learning_rate": 3.567463331408816e-06, + "logits/chosen": 598163840.0, + "logits/rejected": 1096095232.0, + "logps/chosen": -337.3159993489583, + "logps/rejected": -606.19404296875, + "loss": 0.019, + "rewards/chosen": 4.347728729248047, + "rewards/margins": 14.198473358154297, + "rewards/rejected": -9.85074462890625, + "step": 6494 + }, + { + "epoch": 0.5934216537231612, + "grad_norm": 2.625, + "kl": 0.0, + "learning_rate": 3.566085866189784e-06, + "logits/chosen": 775870336.0, + "logits/rejected": 654750037.3333334, + "logps/chosen": -439.6241149902344, + "logps/rejected": -483.4591064453125, + "loss": 0.016, + "rewards/chosen": 2.852633476257324, + "rewards/margins": 12.425320625305176, + "rewards/rejected": -9.572687149047852, + "step": 6495 + }, + { + "epoch": 0.5935130196436729, + "grad_norm": 6.90625, + "kl": 0.0, + "learning_rate": 3.5647085195421668e-06, + "logits/chosen": 756007936.0, + "logits/rejected": 583417856.0, + "logps/chosen": -587.4163818359375, + "logps/rejected": -369.3468933105469, + "loss": 0.0218, + "rewards/chosen": 4.312029838562012, + "rewards/margins": 11.593716621398926, + "rewards/rejected": -7.281686782836914, + "step": 6496 + }, + { + "epoch": 0.5936043855641846, + "grad_norm": 0.8359375, + "kl": 0.0, + "learning_rate": 3.5633312915798567e-06, + "logits/chosen": 438430080.0, + "logits/rejected": 859337536.0, + "logps/chosen": -253.79080200195312, + "logps/rejected": -690.8395385742188, + "loss": 0.0215, + "rewards/chosen": 3.8422582149505615, + "rewards/margins": 15.523786306381226, + "rewards/rejected": -11.681528091430664, + "step": 6497 + }, + { + "epoch": 0.5936957514846962, + "grad_norm": 4.1875, + "kl": 0.0, + "learning_rate": 3.5619541824167364e-06, + "logits/chosen": 836323993.6, + "logits/rejected": 1086814208.0, + "logps/chosen": -366.57373046875, + "logps/rejected": -539.4690755208334, + "loss": 0.0235, + "rewards/chosen": 4.313738250732422, + "rewards/margins": 13.309776051839194, + "rewards/rejected": -8.996037801106771, + "step": 6498 + }, + { + "epoch": 0.5937871174052078, + "grad_norm": 4.09375, + "kl": 0.0, + "learning_rate": 3.5605771921666826e-06, + "logits/chosen": 740837734.4, + "logits/rejected": 691153322.6666666, + "logps/chosen": -403.426953125, + "logps/rejected": -597.6410319010416, + "loss": 0.0306, + "rewards/chosen": 2.986968421936035, + "rewards/margins": 14.624663734436036, + "rewards/rejected": -11.6376953125, + "step": 6499 + }, + { + "epoch": 0.5938784833257195, + "grad_norm": 4.75, + "kl": 0.0, + "learning_rate": 3.559200320943558e-06, + "logits/chosen": 380443443.2, + "logits/rejected": 873722538.6666666, + "logps/chosen": -234.710986328125, + "logps/rejected": -712.9143880208334, + "loss": 0.0252, + "rewards/chosen": 3.508835220336914, + "rewards/margins": 14.183110936482748, + "rewards/rejected": -10.674275716145834, + "step": 6500 + }, + { + "epoch": 0.5939698492462312, + "grad_norm": 3.875, + "kl": 0.0, + "learning_rate": 3.5578235688612174e-06, + "logits/chosen": 480001433.6, + "logits/rejected": 409953408.0, + "logps/chosen": -445.707080078125, + "logps/rejected": -407.2766927083333, + "loss": 0.0218, + "rewards/chosen": 3.629913330078125, + "rewards/margins": 13.014788309733072, + "rewards/rejected": -9.384874979654947, + "step": 6501 + }, + { + "epoch": 0.5940612151667428, + "grad_norm": 1.8515625, + "kl": 0.0, + "learning_rate": 3.556446936033505e-06, + "logits/chosen": 793449813.3333334, + "logits/rejected": 374413107.2, + "logps/chosen": -407.1788330078125, + "logps/rejected": -420.70810546875, + "loss": 0.0088, + "rewards/chosen": 4.150066057840983, + "rewards/margins": 12.90873228708903, + "rewards/rejected": -8.758666229248046, + "step": 6502 + }, + { + "epoch": 0.5941525810872544, + "grad_norm": 1.890625, + "kl": 0.0, + "learning_rate": 3.5550704225742576e-06, + "logits/chosen": 905882709.3333334, + "logits/rejected": 727356160.0, + "logps/chosen": -398.6691487630208, + "logps/rejected": -525.40771484375, + "loss": 0.0127, + "rewards/chosen": 4.676512082417806, + "rewards/margins": 15.443411191304524, + "rewards/rejected": -10.766899108886719, + "step": 6503 + }, + { + "epoch": 0.5942439470077661, + "grad_norm": 1.796875, + "kl": 0.0, + "learning_rate": 3.5536940285972975e-06, + "logits/chosen": 638698700.8, + "logits/rejected": 513320874.6666667, + "logps/chosen": -373.1505126953125, + "logps/rejected": -724.9606119791666, + "loss": 0.0101, + "rewards/chosen": 4.3572437286376955, + "rewards/margins": 13.827691268920898, + "rewards/rejected": -9.470447540283203, + "step": 6504 + }, + { + "epoch": 0.5943353129282778, + "grad_norm": 2.5, + "kl": 0.0, + "learning_rate": 3.552317754216442e-06, + "logits/chosen": 574293930.6666666, + "logits/rejected": 296376704.0, + "logps/chosen": -444.8307291666667, + "logps/rejected": -420.74090576171875, + "loss": 0.015, + "rewards/chosen": 4.613674799601237, + "rewards/margins": 15.936466852823894, + "rewards/rejected": -11.322792053222656, + "step": 6505 + }, + { + "epoch": 0.5944266788487894, + "grad_norm": 5.96875, + "kl": 0.0, + "learning_rate": 3.5509415995454943e-06, + "logits/chosen": 828462592.0, + "logits/rejected": 495383840.0, + "logps/chosen": -478.6307779947917, + "logps/rejected": -294.1138610839844, + "loss": 0.0242, + "rewards/chosen": 4.4191239674886065, + "rewards/margins": 12.52549107869466, + "rewards/rejected": -8.106367111206055, + "step": 6506 + }, + { + "epoch": 0.594518044769301, + "grad_norm": 3.953125, + "kl": 0.0, + "learning_rate": 3.5495655646982506e-06, + "logits/chosen": 772518604.8, + "logits/rejected": 1014044757.3333334, + "logps/chosen": -290.094970703125, + "logps/rejected": -357.0922037760417, + "loss": 0.0252, + "rewards/chosen": 3.6813674926757813, + "rewards/margins": 11.158875401814779, + "rewards/rejected": -7.477507909138997, + "step": 6507 + }, + { + "epoch": 0.5946094106898127, + "grad_norm": 0.3671875, + "kl": 0.0, + "learning_rate": 3.5481896497884973e-06, + "logits/chosen": 352400000.0, + "logits/rejected": 385602272.0, + "logps/chosen": -224.76535034179688, + "logps/rejected": -451.26495361328125, + "loss": 0.002, + "rewards/chosen": 5.895485877990723, + "rewards/margins": 15.672038078308105, + "rewards/rejected": -9.776552200317383, + "step": 6508 + }, + { + "epoch": 0.5947007766103244, + "grad_norm": 1.7109375, + "kl": 0.0, + "learning_rate": 3.5468138549300067e-06, + "logits/chosen": 359769600.0, + "logits/rejected": 373739417.6, + "logps/chosen": -212.97705078125, + "logps/rejected": -546.1001953125, + "loss": 0.0096, + "rewards/chosen": 3.7351096471150718, + "rewards/margins": 15.02226022084554, + "rewards/rejected": -11.287150573730468, + "step": 6509 + }, + { + "epoch": 0.594792142530836, + "grad_norm": 48.75, + "kl": 0.0, + "learning_rate": 3.545438180236548e-06, + "logits/chosen": 414550314.6666667, + "logits/rejected": 614954035.2, + "logps/chosen": -335.09641520182294, + "logps/rejected": -386.4289306640625, + "loss": 0.0462, + "rewards/chosen": 3.4054126739501953, + "rewards/margins": 11.67231788635254, + "rewards/rejected": -8.266905212402344, + "step": 6510 + }, + { + "epoch": 0.5948835084513476, + "grad_norm": 4.1875, + "kl": 0.0, + "learning_rate": 3.5440626258218735e-06, + "logits/chosen": 708916940.8, + "logits/rejected": 404622165.3333333, + "logps/chosen": -302.528759765625, + "logps/rejected": -440.7479654947917, + "loss": 0.0274, + "rewards/chosen": 3.4193698883056642, + "rewards/margins": 13.504011662801108, + "rewards/rejected": -10.084641774495443, + "step": 6511 + }, + { + "epoch": 0.5949748743718593, + "grad_norm": 1.3046875, + "kl": 0.0, + "learning_rate": 3.5426871917997314e-06, + "logits/chosen": 462084992.0, + "logits/rejected": 387978240.0, + "logps/chosen": -429.1147766113281, + "logps/rejected": -466.64013671875, + "loss": 0.0044, + "rewards/chosen": 3.3927948474884033, + "rewards/margins": 12.57047472681318, + "rewards/rejected": -9.177679879324776, + "step": 6512 + }, + { + "epoch": 0.595066240292371, + "grad_norm": 3.046875, + "kl": 0.0, + "learning_rate": 3.541311878283855e-06, + "logits/chosen": 432773973.3333333, + "logits/rejected": 404698624.0, + "logps/chosen": -316.11037190755206, + "logps/rejected": -481.37958984375, + "loss": 0.0219, + "rewards/chosen": 3.406325658162435, + "rewards/margins": 12.072523625691732, + "rewards/rejected": -8.666197967529296, + "step": 6513 + }, + { + "epoch": 0.5951576062128826, + "grad_norm": 6.5625, + "kl": 0.0, + "learning_rate": 3.5399366853879726e-06, + "logits/chosen": 916798464.0, + "logits/rejected": 563958101.3333334, + "logps/chosen": -345.4234375, + "logps/rejected": -579.1626790364584, + "loss": 0.0314, + "rewards/chosen": 3.3722442626953124, + "rewards/margins": 12.6086550394694, + "rewards/rejected": -9.236410776774088, + "step": 6514 + }, + { + "epoch": 0.5952489721333942, + "grad_norm": 3.84375, + "kl": 0.0, + "learning_rate": 3.5385616132257994e-06, + "logits/chosen": 449835264.0, + "logits/rejected": 471935424.0, + "logps/chosen": -298.7887369791667, + "logps/rejected": -604.6180419921875, + "loss": 0.0282, + "rewards/chosen": 3.6174755096435547, + "rewards/margins": 15.478404998779297, + "rewards/rejected": -11.860929489135742, + "step": 6515 + }, + { + "epoch": 0.5953403380539058, + "grad_norm": 3.875, + "kl": 0.0, + "learning_rate": 3.5371866619110386e-06, + "logits/chosen": 491224320.0, + "logits/rejected": 579944448.0, + "logps/chosen": -277.5635986328125, + "logps/rejected": -396.46142578125, + "loss": 0.0362, + "rewards/chosen": 2.6550402641296387, + "rewards/margins": 11.827270030975342, + "rewards/rejected": -9.172229766845703, + "step": 6516 + }, + { + "epoch": 0.5954317039744176, + "grad_norm": 2.09375, + "kl": 0.0, + "learning_rate": 3.5358118315573883e-06, + "logits/chosen": 757148416.0, + "logits/rejected": 685739904.0, + "logps/chosen": -396.7776184082031, + "logps/rejected": -621.20068359375, + "loss": 0.0128, + "rewards/chosen": 3.8296995162963867, + "rewards/margins": 12.876241683959961, + "rewards/rejected": -9.046542167663574, + "step": 6517 + }, + { + "epoch": 0.5955230698949292, + "grad_norm": 1.8046875, + "kl": 0.0, + "learning_rate": 3.5344371222785334e-06, + "logits/chosen": 461290720.0, + "logits/rejected": 314980896.0, + "logps/chosen": -284.97430419921875, + "logps/rejected": -432.17340087890625, + "loss": 0.0096, + "rewards/chosen": 4.521724224090576, + "rewards/margins": 15.177493572235107, + "rewards/rejected": -10.655769348144531, + "step": 6518 + }, + { + "epoch": 0.5956144358154408, + "grad_norm": 3.4375, + "kl": 0.0, + "learning_rate": 3.5330625341881484e-06, + "logits/chosen": 527280064.0, + "logits/rejected": 530281216.0, + "logps/chosen": -358.57525634765625, + "logps/rejected": -686.058837890625, + "loss": 0.0217, + "rewards/chosen": 3.3020291328430176, + "rewards/margins": 15.43793535232544, + "rewards/rejected": -12.135906219482422, + "step": 6519 + }, + { + "epoch": 0.5957058017359524, + "grad_norm": 2.078125, + "kl": 0.0, + "learning_rate": 3.531688067399902e-06, + "logits/chosen": 607086208.0, + "logits/rejected": 800083626.6666666, + "logps/chosen": -295.3685607910156, + "logps/rejected": -421.9192301432292, + "loss": 0.0074, + "rewards/chosen": 4.02474308013916, + "rewards/margins": 13.132572491963705, + "rewards/rejected": -9.107829411824545, + "step": 6520 + }, + { + "epoch": 0.5957971676564642, + "grad_norm": 1.109375, + "kl": 0.0, + "learning_rate": 3.5303137220274467e-06, + "logits/chosen": 253145514.66666666, + "logits/rejected": 411411865.6, + "logps/chosen": -219.85408528645834, + "logps/rejected": -558.005859375, + "loss": 0.0064, + "rewards/chosen": 4.349978764851888, + "rewards/margins": 12.1098695119222, + "rewards/rejected": -7.759890747070313, + "step": 6521 + }, + { + "epoch": 0.5958885335769758, + "grad_norm": 1.8125, + "kl": 0.0, + "learning_rate": 3.528939498184431e-06, + "logits/chosen": 541133824.0, + "logits/rejected": 841743018.6666666, + "logps/chosen": -289.81123046875, + "logps/rejected": -508.218994140625, + "loss": 0.1297, + "rewards/chosen": 3.191420555114746, + "rewards/margins": 13.513683001200357, + "rewards/rejected": -10.322262446085611, + "step": 6522 + }, + { + "epoch": 0.5959798994974874, + "grad_norm": 4.46875, + "kl": 0.0, + "learning_rate": 3.527565395984488e-06, + "logits/chosen": 521534944.0, + "logits/rejected": 1306901504.0, + "logps/chosen": -206.94171142578125, + "logps/rejected": -407.3901062011719, + "loss": 0.0295, + "rewards/chosen": 3.6951675415039062, + "rewards/margins": 11.515318870544434, + "rewards/rejected": -7.820151329040527, + "step": 6523 + }, + { + "epoch": 0.596071265417999, + "grad_norm": 2.625, + "kl": 0.0, + "learning_rate": 3.5261914155412457e-06, + "logits/chosen": 293433664.0, + "logits/rejected": 488466688.0, + "logps/chosen": -173.42340087890625, + "logps/rejected": -507.5684814453125, + "loss": 0.0222, + "rewards/chosen": 2.4941468238830566, + "rewards/margins": 13.066968123118082, + "rewards/rejected": -10.572821299235025, + "step": 6524 + }, + { + "epoch": 0.5961626313385108, + "grad_norm": 2.84375, + "kl": 0.0, + "learning_rate": 3.5248175569683165e-06, + "logits/chosen": 698303445.3333334, + "logits/rejected": 427898316.8, + "logps/chosen": -310.45888264973956, + "logps/rejected": -464.28935546875, + "loss": 0.0194, + "rewards/chosen": 3.543972969055176, + "rewards/margins": 12.998420143127442, + "rewards/rejected": -9.454447174072266, + "step": 6525 + }, + { + "epoch": 0.5962539972590224, + "grad_norm": 1.3984375, + "kl": 0.0, + "learning_rate": 3.5234438203793087e-06, + "logits/chosen": 631989674.6666666, + "logits/rejected": 288129536.0, + "logps/chosen": -430.7130940755208, + "logps/rejected": -392.742822265625, + "loss": 0.0086, + "rewards/chosen": 3.808516502380371, + "rewards/margins": 12.973261070251464, + "rewards/rejected": -9.164744567871093, + "step": 6526 + }, + { + "epoch": 0.596345363179534, + "grad_norm": 2.265625, + "kl": 0.0, + "learning_rate": 3.522070205887816e-06, + "logits/chosen": 714342976.0, + "logits/rejected": 639068032.0, + "logps/chosen": -280.4361267089844, + "logps/rejected": -436.87213134765625, + "loss": 0.0154, + "rewards/chosen": 3.762071371078491, + "rewards/margins": 12.371026754379272, + "rewards/rejected": -8.608955383300781, + "step": 6527 + }, + { + "epoch": 0.5964367291000456, + "grad_norm": 16.5, + "kl": 0.0, + "learning_rate": 3.5206967136074245e-06, + "logits/chosen": 339961088.0, + "logits/rejected": 390285354.6666667, + "logps/chosen": -374.07373046875, + "logps/rejected": -355.2801106770833, + "loss": 0.0186, + "rewards/chosen": 3.527604579925537, + "rewards/margins": 11.086483160654705, + "rewards/rejected": -7.558878580729167, + "step": 6528 + }, + { + "epoch": 0.5965280950205574, + "grad_norm": 5.09375, + "kl": 0.0, + "learning_rate": 3.5193233436517084e-06, + "logits/chosen": 1045058645.3333334, + "logits/rejected": 929494208.0, + "logps/chosen": -394.9970296223958, + "logps/rejected": -536.2113037109375, + "loss": 0.0304, + "rewards/chosen": 3.6405105590820312, + "rewards/margins": 13.338299751281738, + "rewards/rejected": -9.697789192199707, + "step": 6529 + }, + { + "epoch": 0.596619460941069, + "grad_norm": 3.015625, + "kl": 0.0, + "learning_rate": 3.517950096134232e-06, + "logits/chosen": 399234752.0, + "logits/rejected": 423845504.0, + "logps/chosen": -328.60052490234375, + "logps/rejected": -470.57281494140625, + "loss": 0.0136, + "rewards/chosen": 4.5828752517700195, + "rewards/margins": 14.053404808044434, + "rewards/rejected": -9.470529556274414, + "step": 6530 + }, + { + "epoch": 0.5967108268615806, + "grad_norm": 3.03125, + "kl": 0.0, + "learning_rate": 3.5165769711685527e-06, + "logits/chosen": 437569536.0, + "logits/rejected": 570323251.2, + "logps/chosen": -282.8437906901042, + "logps/rejected": -436.54873046875, + "loss": 0.0184, + "rewards/chosen": 3.1515401204427085, + "rewards/margins": 11.888475545247397, + "rewards/rejected": -8.736935424804688, + "step": 6531 + }, + { + "epoch": 0.5968021927820922, + "grad_norm": 8.75, + "kl": 0.0, + "learning_rate": 3.515203968868213e-06, + "logits/chosen": 307696320.0, + "logits/rejected": 314016618.6666667, + "logps/chosen": -169.96446228027344, + "logps/rejected": -342.3411865234375, + "loss": 0.104, + "rewards/chosen": 3.3362841606140137, + "rewards/margins": 9.691479523976643, + "rewards/rejected": -6.35519536336263, + "step": 6532 + }, + { + "epoch": 0.596893558702604, + "grad_norm": 0.703125, + "kl": 0.0, + "learning_rate": 3.513831089346748e-06, + "logits/chosen": 526465843.2, + "logits/rejected": 352166165.3333333, + "logps/chosen": -299.0646484375, + "logps/rejected": -341.7123209635417, + "loss": 0.0072, + "rewards/chosen": 4.59594497680664, + "rewards/margins": 12.269420878092447, + "rewards/rejected": -7.673475901285808, + "step": 6533 + }, + { + "epoch": 0.5969849246231156, + "grad_norm": 3.15625, + "kl": 0.0, + "learning_rate": 3.512458332717682e-06, + "logits/chosen": 694730368.0, + "logits/rejected": 456338261.3333333, + "logps/chosen": -285.383056640625, + "logps/rejected": -483.5065104166667, + "loss": 0.1169, + "rewards/chosen": 3.4763846397399902, + "rewards/margins": 11.887728532155355, + "rewards/rejected": -8.411343892415365, + "step": 6534 + }, + { + "epoch": 0.5970762905436272, + "grad_norm": 1.015625, + "kl": 0.0, + "learning_rate": 3.51108569909453e-06, + "logits/chosen": 799366400.0, + "logits/rejected": 470670080.0, + "logps/chosen": -239.78643798828125, + "logps/rejected": -430.6938883463542, + "loss": 0.0052, + "rewards/chosen": 4.200070381164551, + "rewards/margins": 12.340615272521973, + "rewards/rejected": -8.140544891357422, + "step": 6535 + }, + { + "epoch": 0.5971676564641388, + "grad_norm": 2.4375, + "kl": 0.0, + "learning_rate": 3.5097131885907953e-06, + "logits/chosen": 617628352.0, + "logits/rejected": 613032960.0, + "logps/chosen": -416.91754150390625, + "logps/rejected": -368.197265625, + "loss": 0.0139, + "rewards/chosen": 4.27388858795166, + "rewards/margins": 11.882874011993408, + "rewards/rejected": -7.608985424041748, + "step": 6536 + }, + { + "epoch": 0.5972590223846506, + "grad_norm": 1.4140625, + "kl": 0.0, + "learning_rate": 3.5083408013199727e-06, + "logits/chosen": 408067712.0, + "logits/rejected": 514341184.0, + "logps/chosen": -272.0536804199219, + "logps/rejected": -588.3298950195312, + "loss": 0.0126, + "rewards/chosen": 3.764295816421509, + "rewards/margins": 13.713323831558228, + "rewards/rejected": -9.949028015136719, + "step": 6537 + }, + { + "epoch": 0.5973503883051622, + "grad_norm": 1.71875, + "kl": 0.0, + "learning_rate": 3.5069685373955442e-06, + "logits/chosen": 674265344.0, + "logits/rejected": 452950630.4, + "logps/chosen": -484.2715657552083, + "logps/rejected": -453.44833984375, + "loss": 0.0093, + "rewards/chosen": 3.9654744466145835, + "rewards/margins": 12.268540700276693, + "rewards/rejected": -8.30306625366211, + "step": 6538 + }, + { + "epoch": 0.5974417542256738, + "grad_norm": 2.640625, + "kl": 0.0, + "learning_rate": 3.5055963969309853e-06, + "logits/chosen": 699151232.0, + "logits/rejected": 660678016.0, + "logps/chosen": -316.42421468098956, + "logps/rejected": -491.53558349609375, + "loss": 0.0205, + "rewards/chosen": 3.6998910903930664, + "rewards/margins": 12.560546875, + "rewards/rejected": -8.860655784606934, + "step": 6539 + }, + { + "epoch": 0.5975331201461854, + "grad_norm": 3.828125, + "kl": 0.0, + "learning_rate": 3.5042243800397586e-06, + "logits/chosen": 449163605.3333333, + "logits/rejected": 464860352.0, + "logps/chosen": -361.4388427734375, + "logps/rejected": -560.875, + "loss": 0.0266, + "rewards/chosen": 3.969135602315267, + "rewards/margins": 15.217337926228842, + "rewards/rejected": -11.248202323913574, + "step": 6540 + }, + { + "epoch": 0.5976244860666972, + "grad_norm": 1.9765625, + "kl": 0.0, + "learning_rate": 3.5028524868353157e-06, + "logits/chosen": 1057298090.6666666, + "logits/rejected": 539792076.8, + "logps/chosen": -450.4019368489583, + "logps/rejected": -366.04365234375, + "loss": 0.0109, + "rewards/chosen": 3.7709197998046875, + "rewards/margins": 10.818862915039062, + "rewards/rejected": -7.047943115234375, + "step": 6541 + }, + { + "epoch": 0.5977158519872088, + "grad_norm": 3.515625, + "kl": 0.0, + "learning_rate": 3.5014807174311034e-06, + "logits/chosen": 423883187.2, + "logits/rejected": 329055658.6666667, + "logps/chosen": -239.8230712890625, + "logps/rejected": -311.50439453125, + "loss": 0.0226, + "rewards/chosen": 4.004875946044922, + "rewards/margins": 12.466388193766274, + "rewards/rejected": -8.461512247721354, + "step": 6542 + }, + { + "epoch": 0.5978072179077204, + "grad_norm": 5.21875, + "kl": 0.0, + "learning_rate": 3.5001090719405506e-06, + "logits/chosen": 435457600.0, + "logits/rejected": 486428992.0, + "logps/chosen": -341.0994873046875, + "logps/rejected": -356.2312927246094, + "loss": 0.125, + "rewards/chosen": 3.3498971462249756, + "rewards/margins": 8.59131121635437, + "rewards/rejected": -5.2414140701293945, + "step": 6543 + }, + { + "epoch": 0.597898583828232, + "grad_norm": 7.0625, + "kl": 0.0, + "learning_rate": 3.498737550477079e-06, + "logits/chosen": 793000667.4285715, + "logits/rejected": 957404160.0, + "logps/chosen": -316.01485770089283, + "logps/rejected": -784.3202514648438, + "loss": 0.0643, + "rewards/chosen": 3.1820618765694753, + "rewards/margins": 15.818487848554339, + "rewards/rejected": -12.636425971984863, + "step": 6544 + }, + { + "epoch": 0.5979899497487438, + "grad_norm": 2.0, + "kl": 0.0, + "learning_rate": 3.497366153154104e-06, + "logits/chosen": 439812181.3333333, + "logits/rejected": 558695372.8, + "logps/chosen": -332.140625, + "logps/rejected": -549.761376953125, + "loss": 0.0116, + "rewards/chosen": 3.5994040171305337, + "rewards/margins": 13.814943567911783, + "rewards/rejected": -10.21553955078125, + "step": 6545 + }, + { + "epoch": 0.5980813156692554, + "grad_norm": 14.5, + "kl": 0.0, + "learning_rate": 3.4959948800850253e-06, + "logits/chosen": 469103584.0, + "logits/rejected": 587334144.0, + "logps/chosen": -234.32369995117188, + "logps/rejected": -538.7400716145834, + "loss": 0.0202, + "rewards/chosen": 3.906291961669922, + "rewards/margins": 12.246091842651367, + "rewards/rejected": -8.339799880981445, + "step": 6546 + }, + { + "epoch": 0.598172681589767, + "grad_norm": 3.8125, + "kl": 0.0, + "learning_rate": 3.4946237313832356e-06, + "logits/chosen": 346579488.0, + "logits/rejected": 390188640.0, + "logps/chosen": -234.88998413085938, + "logps/rejected": -364.9444274902344, + "loss": 0.0144, + "rewards/chosen": 4.540419101715088, + "rewards/margins": 10.951269626617432, + "rewards/rejected": -6.410850524902344, + "step": 6547 + }, + { + "epoch": 0.5982640475102786, + "grad_norm": 5.59375, + "kl": 0.0, + "learning_rate": 3.4932527071621146e-06, + "logits/chosen": 696984502.8571428, + "logits/rejected": 240403552.0, + "logps/chosen": -290.66060965401783, + "logps/rejected": -184.7123260498047, + "loss": 0.0412, + "rewards/chosen": 3.541163308279855, + "rewards/margins": 11.526403767721995, + "rewards/rejected": -7.985240459442139, + "step": 6548 + }, + { + "epoch": 0.5983554134307904, + "grad_norm": 5.5, + "kl": 0.0, + "learning_rate": 3.4918818075350357e-06, + "logits/chosen": 435657312.0, + "logits/rejected": 1088018944.0, + "logps/chosen": -405.9953308105469, + "logps/rejected": -735.3159790039062, + "loss": 0.0257, + "rewards/chosen": 3.115915298461914, + "rewards/margins": 12.058614730834961, + "rewards/rejected": -8.942699432373047, + "step": 6549 + }, + { + "epoch": 0.598446779351302, + "grad_norm": 4.78125, + "kl": 0.0, + "learning_rate": 3.490511032615358e-06, + "logits/chosen": 672463189.3333334, + "logits/rejected": 734851200.0, + "logps/chosen": -412.8340250651042, + "logps/rejected": -747.552490234375, + "loss": 0.0389, + "rewards/chosen": 3.1820170084635415, + "rewards/margins": 12.472610155741373, + "rewards/rejected": -9.290593147277832, + "step": 6550 + }, + { + "epoch": 0.5985381452718136, + "grad_norm": 5.53125, + "kl": 0.0, + "learning_rate": 3.4891403825164317e-06, + "logits/chosen": 330243925.3333333, + "logits/rejected": 218031664.0, + "logps/chosen": -318.53875732421875, + "logps/rejected": -481.56390380859375, + "loss": 0.0252, + "rewards/chosen": 4.029171943664551, + "rewards/margins": 16.33335304260254, + "rewards/rejected": -12.304181098937988, + "step": 6551 + }, + { + "epoch": 0.5986295111923252, + "grad_norm": 1.4296875, + "kl": 0.0, + "learning_rate": 3.487769857351598e-06, + "logits/chosen": 495619456.0, + "logits/rejected": 388678741.3333333, + "logps/chosen": -154.69277954101562, + "logps/rejected": -393.7935791015625, + "loss": 0.0059, + "rewards/chosen": 4.346195220947266, + "rewards/margins": 13.029805501302084, + "rewards/rejected": -8.683610280354818, + "step": 6552 + }, + { + "epoch": 0.598720877112837, + "grad_norm": 2.71875, + "kl": 0.0, + "learning_rate": 3.4863994572341845e-06, + "logits/chosen": 747658816.0, + "logits/rejected": 515636704.0, + "logps/chosen": -444.71014404296875, + "logps/rejected": -393.85791015625, + "loss": 0.0138, + "rewards/chosen": 3.9650697708129883, + "rewards/margins": 13.473380088806152, + "rewards/rejected": -9.508310317993164, + "step": 6553 + }, + { + "epoch": 0.5988122430333486, + "grad_norm": 2.6875, + "kl": 0.0, + "learning_rate": 3.485029182277514e-06, + "logits/chosen": 592329600.0, + "logits/rejected": 482137709.71428573, + "logps/chosen": -315.7622375488281, + "logps/rejected": -504.24197823660717, + "loss": 0.0079, + "rewards/chosen": 2.725454807281494, + "rewards/margins": 11.754314082009452, + "rewards/rejected": -9.028859274727958, + "step": 6554 + }, + { + "epoch": 0.5989036089538602, + "grad_norm": 36.5, + "kl": 0.0, + "learning_rate": 3.4836590325948914e-06, + "logits/chosen": 610013257.1428572, + "logits/rejected": 656077184.0, + "logps/chosen": -401.474365234375, + "logps/rejected": -470.52069091796875, + "loss": 0.0499, + "rewards/chosen": 3.616954803466797, + "rewards/margins": 11.650609970092773, + "rewards/rejected": -8.033655166625977, + "step": 6555 + }, + { + "epoch": 0.5989949748743718, + "grad_norm": 1.0078125, + "kl": 0.0, + "learning_rate": 3.4822890082996197e-06, + "logits/chosen": 527033984.0, + "logits/rejected": 571838634.6666666, + "logps/chosen": -413.9364013671875, + "logps/rejected": -334.90346272786456, + "loss": 0.0048, + "rewards/chosen": 4.482966423034668, + "rewards/margins": 12.38789717356364, + "rewards/rejected": -7.904930750528972, + "step": 6556 + }, + { + "epoch": 0.5990863407948835, + "grad_norm": 1.515625, + "kl": 0.0, + "learning_rate": 3.480919109504984e-06, + "logits/chosen": 345536170.6666667, + "logits/rejected": 223625267.2, + "logps/chosen": -246.7543741861979, + "logps/rejected": -413.7638671875, + "loss": 0.0111, + "rewards/chosen": 4.181485493977864, + "rewards/margins": 11.783301289876302, + "rewards/rejected": -7.601815795898437, + "step": 6557 + }, + { + "epoch": 0.5991777067153952, + "grad_norm": 3.828125, + "kl": 0.0, + "learning_rate": 3.479549336324264e-06, + "logits/chosen": 1027824025.6, + "logits/rejected": 943853994.6666666, + "logps/chosen": -340.2006103515625, + "logps/rejected": -412.033203125, + "loss": 0.0319, + "rewards/chosen": 3.4056671142578123, + "rewards/margins": 12.475676981608071, + "rewards/rejected": -9.07000986735026, + "step": 6558 + }, + { + "epoch": 0.5992690726359068, + "grad_norm": 21.875, + "kl": 0.0, + "learning_rate": 3.4781796888707253e-06, + "logits/chosen": 579781427.2, + "logits/rejected": 548082090.6666666, + "logps/chosen": -304.751904296875, + "logps/rejected": -487.2014973958333, + "loss": 0.1014, + "rewards/chosen": 3.528901290893555, + "rewards/margins": 11.495344670613607, + "rewards/rejected": -7.966443379720052, + "step": 6559 + }, + { + "epoch": 0.5993604385564184, + "grad_norm": 2.59375, + "kl": 0.0, + "learning_rate": 3.476810167257628e-06, + "logits/chosen": 300175744.0, + "logits/rejected": 390167910.4, + "logps/chosen": -302.171875, + "logps/rejected": -458.95205078125, + "loss": 0.0104, + "rewards/chosen": 3.982025146484375, + "rewards/margins": 14.354741668701172, + "rewards/rejected": -10.372716522216797, + "step": 6560 + }, + { + "epoch": 0.5994518044769301, + "grad_norm": 3.46875, + "kl": 0.0, + "learning_rate": 3.4754407715982174e-06, + "logits/chosen": 998430336.0, + "logits/rejected": 663641664.0, + "logps/chosen": -259.725341796875, + "logps/rejected": -559.6864013671875, + "loss": 0.0201, + "rewards/chosen": 4.627801895141602, + "rewards/margins": 13.77497673034668, + "rewards/rejected": -9.147174835205078, + "step": 6561 + }, + { + "epoch": 0.5995431703974418, + "grad_norm": 2.203125, + "kl": 0.0, + "learning_rate": 3.474071502005728e-06, + "logits/chosen": 369870624.0, + "logits/rejected": 541047296.0, + "logps/chosen": -341.65765380859375, + "logps/rejected": -649.09716796875, + "loss": 0.0103, + "rewards/chosen": 4.564950942993164, + "rewards/margins": 12.45754623413086, + "rewards/rejected": -7.892595291137695, + "step": 6562 + }, + { + "epoch": 0.5996345363179534, + "grad_norm": 5.03125, + "kl": 0.0, + "learning_rate": 3.472702358593388e-06, + "logits/chosen": 996236352.0, + "logits/rejected": 681152554.6666666, + "logps/chosen": -386.24810791015625, + "logps/rejected": -586.5470377604166, + "loss": 0.0062, + "rewards/chosen": 4.896310806274414, + "rewards/margins": 12.366551717122395, + "rewards/rejected": -7.4702409108479815, + "step": 6563 + }, + { + "epoch": 0.599725902238465, + "grad_norm": 65.5, + "kl": 0.0, + "learning_rate": 3.471333341474412e-06, + "logits/chosen": 394223308.8, + "logits/rejected": 312141525.3333333, + "logps/chosen": -192.25316162109374, + "logps/rejected": -456.7921956380208, + "loss": 0.0773, + "rewards/chosen": 3.322816848754883, + "rewards/margins": 12.288052876790365, + "rewards/rejected": -8.965236028035482, + "step": 6564 + }, + { + "epoch": 0.5998172681589767, + "grad_norm": 3.3125, + "kl": 0.0, + "learning_rate": 3.4699644507620063e-06, + "logits/chosen": 963701248.0, + "logits/rejected": 878692778.6666666, + "logps/chosen": -263.16181640625, + "logps/rejected": -439.7669677734375, + "loss": 0.0186, + "rewards/chosen": 4.117818450927734, + "rewards/margins": 12.334977086385091, + "rewards/rejected": -8.217158635457357, + "step": 6565 + }, + { + "epoch": 0.5999086340794884, + "grad_norm": 4.75, + "kl": 0.0, + "learning_rate": 3.4685956865693628e-06, + "logits/chosen": 569898965.3333334, + "logits/rejected": 588504883.2, + "logps/chosen": -391.054443359375, + "logps/rejected": -281.626708984375, + "loss": 0.0179, + "rewards/chosen": 3.3263937632242837, + "rewards/margins": 11.426338068644204, + "rewards/rejected": -8.099944305419921, + "step": 6566 + }, + { + "epoch": 0.6, + "grad_norm": 7.40625, + "kl": 0.0, + "learning_rate": 3.4672270490096682e-06, + "logits/chosen": 859666261.3333334, + "logits/rejected": 1248246144.0, + "logps/chosen": -253.9842325846354, + "logps/rejected": -609.7857666015625, + "loss": 0.123, + "rewards/chosen": 3.5597333908081055, + "rewards/margins": 10.121277809143066, + "rewards/rejected": -6.561544418334961, + "step": 6567 + }, + { + "epoch": 0.6000913659205116, + "grad_norm": 2.234375, + "kl": 0.0, + "learning_rate": 3.4658585381960973e-06, + "logits/chosen": 264151680.0, + "logits/rejected": 456386867.2, + "logps/chosen": -181.4302978515625, + "logps/rejected": -398.477587890625, + "loss": 0.0181, + "rewards/chosen": 3.054492632548014, + "rewards/margins": 11.823274676005045, + "rewards/rejected": -8.768782043457032, + "step": 6568 + }, + { + "epoch": 0.6001827318410233, + "grad_norm": 3.65625, + "kl": 0.0, + "learning_rate": 3.464490154241808e-06, + "logits/chosen": 626240448.0, + "logits/rejected": 471878144.0, + "logps/chosen": -336.5247802734375, + "logps/rejected": -440.1331481933594, + "loss": 0.0221, + "rewards/chosen": 3.773338794708252, + "rewards/margins": 14.234022617340088, + "rewards/rejected": -10.460683822631836, + "step": 6569 + }, + { + "epoch": 0.600274097761535, + "grad_norm": 3.03125, + "kl": 0.0, + "learning_rate": 3.463121897259958e-06, + "logits/chosen": 615562752.0, + "logits/rejected": 441893952.0, + "logps/chosen": -399.0658874511719, + "logps/rejected": -365.7901611328125, + "loss": 0.0225, + "rewards/chosen": 3.1062850952148438, + "rewards/margins": 11.993943214416504, + "rewards/rejected": -8.88765811920166, + "step": 6570 + }, + { + "epoch": 0.6003654636820466, + "grad_norm": 1.359375, + "kl": 0.0, + "learning_rate": 3.461753767363687e-06, + "logits/chosen": 526045376.0, + "logits/rejected": 823506304.0, + "logps/chosen": -225.8740692138672, + "logps/rejected": -753.6881713867188, + "loss": 0.0057, + "rewards/chosen": 4.828526496887207, + "rewards/margins": 16.364359855651855, + "rewards/rejected": -11.535833358764648, + "step": 6571 + }, + { + "epoch": 0.6004568296025582, + "grad_norm": 43.25, + "kl": 0.0, + "learning_rate": 3.4603857646661266e-06, + "logits/chosen": 474029107.2, + "logits/rejected": 737954986.6666666, + "logps/chosen": -269.158154296875, + "logps/rejected": -504.1062418619792, + "loss": 0.0539, + "rewards/chosen": 3.34134521484375, + "rewards/margins": 9.986480458577473, + "rewards/rejected": -6.645135243733724, + "step": 6572 + }, + { + "epoch": 0.6005481955230699, + "grad_norm": 2.125, + "kl": 0.0, + "learning_rate": 3.4590178892803995e-06, + "logits/chosen": 482363136.0, + "logits/rejected": 597937834.6666666, + "logps/chosen": -232.12384033203125, + "logps/rejected": -507.271484375, + "loss": 0.0103, + "rewards/chosen": 4.643022537231445, + "rewards/margins": 12.174421310424805, + "rewards/rejected": -7.531398773193359, + "step": 6573 + }, + { + "epoch": 0.6006395614435815, + "grad_norm": 4.0625, + "kl": 0.0, + "learning_rate": 3.457650141319614e-06, + "logits/chosen": 727988992.0, + "logits/rejected": 1268740736.0, + "logps/chosen": -460.5355631510417, + "logps/rejected": -414.7912292480469, + "loss": 0.0344, + "rewards/chosen": 3.1090262730916343, + "rewards/margins": 11.126251538594564, + "rewards/rejected": -8.01722526550293, + "step": 6574 + }, + { + "epoch": 0.6007309273640932, + "grad_norm": 2.53125, + "kl": 0.0, + "learning_rate": 3.456282520896873e-06, + "logits/chosen": 600177877.3333334, + "logits/rejected": 377883340.8, + "logps/chosen": -339.58880615234375, + "logps/rejected": -440.48935546875, + "loss": 0.0158, + "rewards/chosen": 3.198514938354492, + "rewards/margins": 11.525683212280274, + "rewards/rejected": -8.327168273925782, + "step": 6575 + }, + { + "epoch": 0.6008222932846048, + "grad_norm": 2.109375, + "kl": 0.0, + "learning_rate": 3.4549150281252635e-06, + "logits/chosen": 175576960.0, + "logits/rejected": 540111530.6666666, + "logps/chosen": -651.8539428710938, + "logps/rejected": -670.3017578125, + "loss": 0.0041, + "rewards/chosen": 4.450131416320801, + "rewards/margins": 14.776480038960775, + "rewards/rejected": -10.326348622639975, + "step": 6576 + }, + { + "epoch": 0.6009136592051165, + "grad_norm": 5.3125, + "kl": 0.0, + "learning_rate": 3.4535476631178666e-06, + "logits/chosen": 1174974549.3333333, + "logits/rejected": 813838592.0, + "logps/chosen": -601.3563639322916, + "logps/rejected": -355.194091796875, + "loss": 0.0302, + "rewards/chosen": 3.641918182373047, + "rewards/margins": 11.55559492111206, + "rewards/rejected": -7.913676738739014, + "step": 6577 + }, + { + "epoch": 0.6010050251256281, + "grad_norm": 5.78125, + "kl": 0.0, + "learning_rate": 3.452180425987749e-06, + "logits/chosen": 638655829.3333334, + "logits/rejected": 267219264.0, + "logps/chosen": -398.7407633463542, + "logps/rejected": -201.50245666503906, + "loss": 0.0401, + "rewards/chosen": 3.0094213485717773, + "rewards/margins": 10.184595108032227, + "rewards/rejected": -7.175173759460449, + "step": 6578 + }, + { + "epoch": 0.6010963910461398, + "grad_norm": 2.671875, + "kl": 0.0, + "learning_rate": 3.4508133168479707e-06, + "logits/chosen": 852590976.0, + "logits/rejected": 1062553088.0, + "logps/chosen": -364.8349609375, + "logps/rejected": -493.3948669433594, + "loss": 0.0153, + "rewards/chosen": 4.401618480682373, + "rewards/margins": 13.773474216461182, + "rewards/rejected": -9.371855735778809, + "step": 6579 + }, + { + "epoch": 0.6011877569666514, + "grad_norm": 3.96875, + "kl": 0.0, + "learning_rate": 3.449446335811576e-06, + "logits/chosen": 517348736.0, + "logits/rejected": 504079360.0, + "logps/chosen": -331.435302734375, + "logps/rejected": -428.1036376953125, + "loss": 0.0215, + "rewards/chosen": 3.4779021739959717, + "rewards/margins": 12.17029356956482, + "rewards/rejected": -8.692391395568848, + "step": 6580 + }, + { + "epoch": 0.6012791228871631, + "grad_norm": 0.87890625, + "kl": 0.0, + "learning_rate": 3.448079482991604e-06, + "logits/chosen": 573616512.0, + "logits/rejected": 325338624.0, + "logps/chosen": -295.1473083496094, + "logps/rejected": -498.134765625, + "loss": 0.0057, + "rewards/chosen": 4.024409770965576, + "rewards/margins": 14.562998612721762, + "rewards/rejected": -10.538588841756185, + "step": 6581 + }, + { + "epoch": 0.6013704888076747, + "grad_norm": 0.90234375, + "kl": 0.0, + "learning_rate": 3.446712758501081e-06, + "logits/chosen": 340007168.0, + "logits/rejected": 310531626.6666667, + "logps/chosen": -123.59059143066406, + "logps/rejected": -405.27197265625, + "loss": 0.0045, + "rewards/chosen": 4.373115062713623, + "rewards/margins": 13.641121705373129, + "rewards/rejected": -9.268006642659506, + "step": 6582 + }, + { + "epoch": 0.6014618547281864, + "grad_norm": 3.734375, + "kl": 0.0, + "learning_rate": 3.445346162453019e-06, + "logits/chosen": 507284224.0, + "logits/rejected": 692485120.0, + "logps/chosen": -294.979736328125, + "logps/rejected": -661.23955078125, + "loss": 0.0197, + "rewards/chosen": 3.0327981313069663, + "rewards/margins": 12.85041135152181, + "rewards/rejected": -9.817613220214843, + "step": 6583 + }, + { + "epoch": 0.601553220648698, + "grad_norm": 1.265625, + "kl": 0.0, + "learning_rate": 3.4439796949604263e-06, + "logits/chosen": 354246208.0, + "logits/rejected": 278651744.0, + "logps/chosen": -228.26974487304688, + "logps/rejected": -390.91912841796875, + "loss": 0.0135, + "rewards/chosen": 4.081622123718262, + "rewards/margins": 12.795084953308105, + "rewards/rejected": -8.713462829589844, + "step": 6584 + }, + { + "epoch": 0.6016445865692097, + "grad_norm": 0.703125, + "kl": 0.0, + "learning_rate": 3.4426133561362953e-06, + "logits/chosen": 373257386.6666667, + "logits/rejected": 632906035.2, + "logps/chosen": -199.487060546875, + "logps/rejected": -381.3938232421875, + "loss": 0.0054, + "rewards/chosen": 4.840267817179362, + "rewards/margins": 12.786624018351237, + "rewards/rejected": -7.946356201171875, + "step": 6585 + }, + { + "epoch": 0.6017359524897213, + "grad_norm": 2.78125, + "kl": 0.0, + "learning_rate": 3.4412471460936114e-06, + "logits/chosen": 433468313.6, + "logits/rejected": 281106218.6666667, + "logps/chosen": -349.973388671875, + "logps/rejected": -327.5933837890625, + "loss": 0.0179, + "rewards/chosen": 4.048524475097656, + "rewards/margins": 13.837404505411783, + "rewards/rejected": -9.788880030314127, + "step": 6586 + }, + { + "epoch": 0.601827318410233, + "grad_norm": 5.3125, + "kl": 0.0, + "learning_rate": 3.4398810649453453e-06, + "logits/chosen": 498081749.3333333, + "logits/rejected": 331313280.0, + "logps/chosen": -340.2295328776042, + "logps/rejected": -464.8683776855469, + "loss": 0.0382, + "rewards/chosen": 3.3089234034220376, + "rewards/margins": 12.223121325174967, + "rewards/rejected": -8.91419792175293, + "step": 6587 + }, + { + "epoch": 0.6019186843307446, + "grad_norm": 2.09375, + "kl": 0.0, + "learning_rate": 3.4385151128044615e-06, + "logits/chosen": 672703680.0, + "logits/rejected": 507552448.0, + "logps/chosen": -299.49664306640625, + "logps/rejected": -481.3198547363281, + "loss": 0.0104, + "rewards/chosen": 4.4134063720703125, + "rewards/margins": 13.640932083129883, + "rewards/rejected": -9.22752571105957, + "step": 6588 + }, + { + "epoch": 0.6020100502512563, + "grad_norm": 1.890625, + "kl": 0.0, + "learning_rate": 3.4371492897839087e-06, + "logits/chosen": 579593344.0, + "logits/rejected": 577363797.3333334, + "logps/chosen": -451.3978271484375, + "logps/rejected": -498.6234130859375, + "loss": 0.0089, + "rewards/chosen": 3.3413283824920654, + "rewards/margins": 11.974068403244019, + "rewards/rejected": -8.632740020751953, + "step": 6589 + }, + { + "epoch": 0.6021014161717679, + "grad_norm": 40.5, + "kl": 0.0, + "learning_rate": 3.4357835959966313e-06, + "logits/chosen": 382652629.3333333, + "logits/rejected": 441462067.2, + "logps/chosen": -451.0989990234375, + "logps/rejected": -626.3455078125, + "loss": 0.0217, + "rewards/chosen": 4.665802001953125, + "rewards/margins": 15.356714630126953, + "rewards/rejected": -10.690912628173828, + "step": 6590 + }, + { + "epoch": 0.6021927820922796, + "grad_norm": 2.953125, + "kl": 0.0, + "learning_rate": 3.4344180315555566e-06, + "logits/chosen": 692955989.3333334, + "logits/rejected": 379261388.8, + "logps/chosen": -405.0445556640625, + "logps/rejected": -555.918994140625, + "loss": 0.014, + "rewards/chosen": 3.7488937377929688, + "rewards/margins": 13.903649139404298, + "rewards/rejected": -10.154755401611329, + "step": 6591 + }, + { + "epoch": 0.6022841480127912, + "grad_norm": 7.9375, + "kl": 0.0, + "learning_rate": 3.4330525965736054e-06, + "logits/chosen": 1071408742.4, + "logits/rejected": 749498709.3333334, + "logps/chosen": -442.68876953125, + "logps/rejected": -405.5185546875, + "loss": 0.0175, + "rewards/chosen": 4.39813003540039, + "rewards/margins": 11.72525494893392, + "rewards/rejected": -7.327124913533528, + "step": 6592 + }, + { + "epoch": 0.6023755139333029, + "grad_norm": 3.5625, + "kl": 0.0, + "learning_rate": 3.4316872911636873e-06, + "logits/chosen": 1169261738.6666667, + "logits/rejected": 683385088.0, + "logps/chosen": -461.5313313802083, + "logps/rejected": -809.0703125, + "loss": 0.031, + "rewards/chosen": 3.4431753158569336, + "rewards/margins": 14.686217308044434, + "rewards/rejected": -11.2430419921875, + "step": 6593 + }, + { + "epoch": 0.6024668798538145, + "grad_norm": 19.0, + "kl": 0.0, + "learning_rate": 3.4303221154386977e-06, + "logits/chosen": 801063765.3333334, + "logits/rejected": 1121008742.4, + "logps/chosen": -301.6219482421875, + "logps/rejected": -460.60419921875, + "loss": 0.0252, + "rewards/chosen": 2.7778809865315757, + "rewards/margins": 11.512652715047201, + "rewards/rejected": -8.734771728515625, + "step": 6594 + }, + { + "epoch": 0.6025582457743262, + "grad_norm": 3.53125, + "kl": 0.0, + "learning_rate": 3.428957069511529e-06, + "logits/chosen": 581205376.0, + "logits/rejected": 328868096.0, + "logps/chosen": -326.622802734375, + "logps/rejected": -303.3974609375, + "loss": 0.0301, + "rewards/chosen": 3.1679635047912598, + "rewards/margins": 10.925475597381592, + "rewards/rejected": -7.757512092590332, + "step": 6595 + }, + { + "epoch": 0.6026496116948378, + "grad_norm": 1.15625, + "kl": 0.0, + "learning_rate": 3.427592153495053e-06, + "logits/chosen": 293477280.0, + "logits/rejected": 423812800.0, + "logps/chosen": -248.68331909179688, + "logps/rejected": -567.856201171875, + "loss": 0.006, + "rewards/chosen": 4.624765872955322, + "rewards/margins": 15.965185642242432, + "rewards/rejected": -11.34041976928711, + "step": 6596 + }, + { + "epoch": 0.6027409776153495, + "grad_norm": 12.9375, + "kl": 0.0, + "learning_rate": 3.4262273675021357e-06, + "logits/chosen": 1064966741.3333334, + "logits/rejected": 540571238.4, + "logps/chosen": -368.899658203125, + "logps/rejected": -466.4685546875, + "loss": 0.0117, + "rewards/chosen": 4.496301651000977, + "rewards/margins": 12.293638229370117, + "rewards/rejected": -7.797336578369141, + "step": 6597 + }, + { + "epoch": 0.6028323435358611, + "grad_norm": 2.171875, + "kl": 0.0, + "learning_rate": 3.424862711645635e-06, + "logits/chosen": 473645209.6, + "logits/rejected": 434659370.6666667, + "logps/chosen": -174.178564453125, + "logps/rejected": -556.3524576822916, + "loss": 0.0245, + "rewards/chosen": 3.728498077392578, + "rewards/margins": 13.709942118326822, + "rewards/rejected": -9.981444040934244, + "step": 6598 + }, + { + "epoch": 0.6029237094563727, + "grad_norm": 1.1328125, + "kl": 0.0, + "learning_rate": 3.423498186038393e-06, + "logits/chosen": 622276693.3333334, + "logits/rejected": 1055352627.2, + "logps/chosen": -223.62630208333334, + "logps/rejected": -453.058984375, + "loss": 0.0066, + "rewards/chosen": 4.191929181416829, + "rewards/margins": 12.756727155049642, + "rewards/rejected": -8.564797973632812, + "step": 6599 + }, + { + "epoch": 0.6030150753768844, + "grad_norm": 2.734375, + "kl": 0.0, + "learning_rate": 3.422133790793246e-06, + "logits/chosen": 412044576.0, + "logits/rejected": 490057770.6666667, + "logps/chosen": -313.08447265625, + "logps/rejected": -527.1369222005209, + "loss": 0.0094, + "rewards/chosen": 3.8191747665405273, + "rewards/margins": 13.480932553609213, + "rewards/rejected": -9.661757787068685, + "step": 6600 + }, + { + "epoch": 0.6031064412973961, + "grad_norm": 0.765625, + "kl": 0.0, + "learning_rate": 3.4207695260230144e-06, + "logits/chosen": 362298752.0, + "logits/rejected": 301734570.6666667, + "logps/chosen": -232.22840881347656, + "logps/rejected": -478.596435546875, + "loss": 0.0045, + "rewards/chosen": 4.2851176261901855, + "rewards/margins": 12.875010967254639, + "rewards/rejected": -8.589893341064453, + "step": 6601 + }, + { + "epoch": 0.6031978072179077, + "grad_norm": 3.953125, + "kl": 0.0, + "learning_rate": 3.4194053918405112e-06, + "logits/chosen": 517788544.0, + "logits/rejected": 287068768.0, + "logps/chosen": -273.5315246582031, + "logps/rejected": -448.29547119140625, + "loss": 0.0232, + "rewards/chosen": 3.28802490234375, + "rewards/margins": 12.232115745544434, + "rewards/rejected": -8.944090843200684, + "step": 6602 + }, + { + "epoch": 0.6032891731384193, + "grad_norm": 3.328125, + "kl": 0.0, + "learning_rate": 3.418041388358538e-06, + "logits/chosen": 564596992.0, + "logits/rejected": 582390954.6666666, + "logps/chosen": -319.01263427734375, + "logps/rejected": -747.5817057291666, + "loss": 0.0166, + "rewards/chosen": 2.6580328941345215, + "rewards/margins": 12.019100983937582, + "rewards/rejected": -9.36106808980306, + "step": 6603 + }, + { + "epoch": 0.603380539058931, + "grad_norm": 1.96875, + "kl": 0.0, + "learning_rate": 3.4166775156898835e-06, + "logits/chosen": 707837235.2, + "logits/rejected": 456008149.3333333, + "logps/chosen": -270.370947265625, + "logps/rejected": -452.1737467447917, + "loss": 0.0264, + "rewards/chosen": 3.829562759399414, + "rewards/margins": 11.887294387817382, + "rewards/rejected": -8.057731628417969, + "step": 6604 + }, + { + "epoch": 0.6034719049794427, + "grad_norm": 2.765625, + "kl": 0.0, + "learning_rate": 3.4153137739473297e-06, + "logits/chosen": 516030720.0, + "logits/rejected": 1140005120.0, + "logps/chosen": -259.5664876302083, + "logps/rejected": -1235.10107421875, + "loss": 0.0235, + "rewards/chosen": 3.7353623708089194, + "rewards/margins": 15.499343236287435, + "rewards/rejected": -11.763980865478516, + "step": 6605 + }, + { + "epoch": 0.6035632708999543, + "grad_norm": 2.6875, + "kl": 0.0, + "learning_rate": 3.4139501632436435e-06, + "logits/chosen": 770160179.2, + "logits/rejected": 416451413.3333333, + "logps/chosen": -429.080126953125, + "logps/rejected": -521.9268798828125, + "loss": 0.0188, + "rewards/chosen": 3.78782958984375, + "rewards/margins": 13.870132954915366, + "rewards/rejected": -10.082303365071615, + "step": 6606 + }, + { + "epoch": 0.6036546368204659, + "grad_norm": 3.765625, + "kl": 0.0, + "learning_rate": 3.412586683691585e-06, + "logits/chosen": 401179946.6666667, + "logits/rejected": 509887648.0, + "logps/chosen": -256.5734049479167, + "logps/rejected": -362.6886901855469, + "loss": 0.0237, + "rewards/chosen": 3.936288833618164, + "rewards/margins": 13.524508476257324, + "rewards/rejected": -9.58821964263916, + "step": 6607 + }, + { + "epoch": 0.6037460027409776, + "grad_norm": 4.25, + "kl": 0.0, + "learning_rate": 3.411223335403899e-06, + "logits/chosen": 496050227.2, + "logits/rejected": 534125226.6666667, + "logps/chosen": -276.4286376953125, + "logps/rejected": -535.7110188802084, + "loss": 0.02, + "rewards/chosen": 3.9488014221191405, + "rewards/margins": 12.320499165852866, + "rewards/rejected": -8.371697743733725, + "step": 6608 + }, + { + "epoch": 0.6038373686614893, + "grad_norm": 1.7109375, + "kl": 0.0, + "learning_rate": 3.409860118493323e-06, + "logits/chosen": 268467520.0, + "logits/rejected": 445484608.0, + "logps/chosen": -363.84954833984375, + "logps/rejected": -414.29034423828125, + "loss": 0.0082, + "rewards/chosen": 4.4822611808776855, + "rewards/margins": 15.137755870819092, + "rewards/rejected": -10.655494689941406, + "step": 6609 + }, + { + "epoch": 0.6039287345820009, + "grad_norm": 2.390625, + "kl": 0.0, + "learning_rate": 3.408497033072582e-06, + "logits/chosen": 536710048.0, + "logits/rejected": 607247744.0, + "logps/chosen": -377.5984802246094, + "logps/rejected": -545.4523111979166, + "loss": 0.0084, + "rewards/chosen": 3.7202484607696533, + "rewards/margins": 13.149661302566528, + "rewards/rejected": -9.429412841796875, + "step": 6610 + }, + { + "epoch": 0.6040201005025125, + "grad_norm": 4.09375, + "kl": 0.0, + "learning_rate": 3.4071340792543915e-06, + "logits/chosen": 700647765.3333334, + "logits/rejected": 682421760.0, + "logps/chosen": -205.3113810221354, + "logps/rejected": -354.554345703125, + "loss": 0.0246, + "rewards/chosen": 2.8496551513671875, + "rewards/margins": 12.680166625976563, + "rewards/rejected": -9.830511474609375, + "step": 6611 + }, + { + "epoch": 0.6041114664230242, + "grad_norm": 2.0, + "kl": 0.0, + "learning_rate": 3.405771257151455e-06, + "logits/chosen": 655732352.0, + "logits/rejected": 525615168.0, + "logps/chosen": -480.72821044921875, + "logps/rejected": -358.9080810546875, + "loss": 0.011, + "rewards/chosen": 4.144024848937988, + "rewards/margins": 12.033790111541748, + "rewards/rejected": -7.88976526260376, + "step": 6612 + }, + { + "epoch": 0.6042028323435359, + "grad_norm": 3.296875, + "kl": 0.0, + "learning_rate": 3.404408566876463e-06, + "logits/chosen": 528889088.0, + "logits/rejected": 628895104.0, + "logps/chosen": -384.15155029296875, + "logps/rejected": -524.9019775390625, + "loss": 0.0172, + "rewards/chosen": 3.7308509349823, + "rewards/margins": 13.875411748886108, + "rewards/rejected": -10.144560813903809, + "step": 6613 + }, + { + "epoch": 0.6042941982640475, + "grad_norm": 1.4140625, + "kl": 0.0, + "learning_rate": 3.4030460085421e-06, + "logits/chosen": 523482538.6666667, + "logits/rejected": 517199769.6, + "logps/chosen": -233.97212727864584, + "logps/rejected": -509.99951171875, + "loss": 0.1299, + "rewards/chosen": 1.1086608568827312, + "rewards/margins": 9.850270048777261, + "rewards/rejected": -8.741609191894531, + "step": 6614 + }, + { + "epoch": 0.6043855641845591, + "grad_norm": 2.5, + "kl": 0.0, + "learning_rate": 3.4016835822610356e-06, + "logits/chosen": 405521792.0, + "logits/rejected": 404598912.0, + "logps/chosen": -198.04962158203125, + "logps/rejected": -378.9271647135417, + "loss": 0.0091, + "rewards/chosen": 4.40272331237793, + "rewards/margins": 12.185771306355793, + "rewards/rejected": -7.783047993977864, + "step": 6615 + }, + { + "epoch": 0.6044769301050708, + "grad_norm": 5.78125, + "kl": 0.0, + "learning_rate": 3.4003212881459315e-06, + "logits/chosen": 669805158.4, + "logits/rejected": 473901696.0, + "logps/chosen": -445.394775390625, + "logps/rejected": -525.5220540364584, + "loss": 0.0348, + "rewards/chosen": 3.246977615356445, + "rewards/margins": 11.225175348917643, + "rewards/rejected": -7.978197733561198, + "step": 6616 + }, + { + "epoch": 0.6045682960255825, + "grad_norm": 3.609375, + "kl": 0.0, + "learning_rate": 3.3989591263094336e-06, + "logits/chosen": 559239270.4, + "logits/rejected": 274584896.0, + "logps/chosen": -359.1581787109375, + "logps/rejected": -557.6072591145834, + "loss": 0.0283, + "rewards/chosen": 3.1798883438110352, + "rewards/margins": 14.860059038798013, + "rewards/rejected": -11.680170694986979, + "step": 6617 + }, + { + "epoch": 0.6046596619460941, + "grad_norm": 3.703125, + "kl": 0.0, + "learning_rate": 3.397597096864183e-06, + "logits/chosen": 531847065.6, + "logits/rejected": 523811242.6666667, + "logps/chosen": -272.941259765625, + "logps/rejected": -790.8890787760416, + "loss": 0.0251, + "rewards/chosen": 3.6546867370605467, + "rewards/margins": 14.826823679606118, + "rewards/rejected": -11.172136942545572, + "step": 6618 + }, + { + "epoch": 0.6047510278666057, + "grad_norm": 2.65625, + "kl": 0.0, + "learning_rate": 3.3962351999228048e-06, + "logits/chosen": 350380480.0, + "logits/rejected": 311425408.0, + "logps/chosen": -294.614501953125, + "logps/rejected": -560.938720703125, + "loss": 0.0253, + "rewards/chosen": 3.3546090126037598, + "rewards/margins": 14.28142786026001, + "rewards/rejected": -10.92681884765625, + "step": 6619 + }, + { + "epoch": 0.6048423937871175, + "grad_norm": 34.0, + "kl": 0.0, + "learning_rate": 3.3948734355979178e-06, + "logits/chosen": 780078796.8, + "logits/rejected": 498798677.3333333, + "logps/chosen": -222.514794921875, + "logps/rejected": -493.1331380208333, + "loss": 0.0654, + "rewards/chosen": 3.3735443115234376, + "rewards/margins": 13.40191421508789, + "rewards/rejected": -10.028369903564453, + "step": 6620 + }, + { + "epoch": 0.6049337597076291, + "grad_norm": 1.234375, + "kl": 0.0, + "learning_rate": 3.3935118040021255e-06, + "logits/chosen": 328127904.0, + "logits/rejected": 600478378.6666666, + "logps/chosen": -157.54689025878906, + "logps/rejected": -801.95263671875, + "loss": 0.0054, + "rewards/chosen": 4.208216667175293, + "rewards/margins": 14.734291394551596, + "rewards/rejected": -10.526074727376303, + "step": 6621 + }, + { + "epoch": 0.6050251256281407, + "grad_norm": 1.7421875, + "kl": 0.0, + "learning_rate": 3.3921503052480243e-06, + "logits/chosen": 671437482.6666666, + "logits/rejected": 496371712.0, + "logps/chosen": -451.3907877604167, + "logps/rejected": -469.58046875, + "loss": 0.0081, + "rewards/chosen": 4.126071294148763, + "rewards/margins": 13.529684575398761, + "rewards/rejected": -9.40361328125, + "step": 6622 + }, + { + "epoch": 0.6051164915486523, + "grad_norm": 3.46875, + "kl": 0.0, + "learning_rate": 3.390788939448193e-06, + "logits/chosen": 570100004.5714285, + "logits/rejected": 133676272.0, + "logps/chosen": -421.2776576450893, + "logps/rejected": -216.02284240722656, + "loss": 0.0235, + "rewards/chosen": 4.188077109200614, + "rewards/margins": 14.411262648446218, + "rewards/rejected": -10.223185539245605, + "step": 6623 + }, + { + "epoch": 0.6052078574691641, + "grad_norm": 18.0, + "kl": 0.0, + "learning_rate": 3.389427706715208e-06, + "logits/chosen": 511619488.0, + "logits/rejected": 489925504.0, + "logps/chosen": -213.89764404296875, + "logps/rejected": -507.2078857421875, + "loss": 0.0296, + "rewards/chosen": 4.335233688354492, + "rewards/margins": 13.127588272094727, + "rewards/rejected": -8.792354583740234, + "step": 6624 + }, + { + "epoch": 0.6052992233896757, + "grad_norm": 3.484375, + "kl": 0.0, + "learning_rate": 3.3880666071616286e-06, + "logits/chosen": 633186560.0, + "logits/rejected": 553855616.0, + "logps/chosen": -389.0738932291667, + "logps/rejected": -576.59814453125, + "loss": 0.0246, + "rewards/chosen": 3.578012466430664, + "rewards/margins": 13.413580894470215, + "rewards/rejected": -9.83556842803955, + "step": 6625 + }, + { + "epoch": 0.6053905893101873, + "grad_norm": 36.25, + "kl": 0.0, + "learning_rate": 3.386705640900007e-06, + "logits/chosen": 535057984.0, + "logits/rejected": 669456384.0, + "logps/chosen": -316.2642822265625, + "logps/rejected": -596.4443969726562, + "loss": 0.0957, + "rewards/chosen": 3.02645206451416, + "rewards/margins": 13.588577270507812, + "rewards/rejected": -10.562125205993652, + "step": 6626 + }, + { + "epoch": 0.6054819552306989, + "grad_norm": 1.84375, + "kl": 0.0, + "learning_rate": 3.3853448080428797e-06, + "logits/chosen": 534262826.6666667, + "logits/rejected": 633338828.8, + "logps/chosen": -347.4296468098958, + "logps/rejected": -514.11357421875, + "loss": 0.0109, + "rewards/chosen": 4.479124704996745, + "rewards/margins": 14.217107645670573, + "rewards/rejected": -9.737982940673827, + "step": 6627 + }, + { + "epoch": 0.6055733211512107, + "grad_norm": 7.1875, + "kl": 0.0, + "learning_rate": 3.3839841087027782e-06, + "logits/chosen": 371670016.0, + "logits/rejected": 498504960.0, + "logps/chosen": -203.6054931640625, + "logps/rejected": -625.955322265625, + "loss": 0.141, + "rewards/chosen": 2.1379106521606444, + "rewards/margins": 12.197812843322755, + "rewards/rejected": -10.05990219116211, + "step": 6628 + }, + { + "epoch": 0.6056646870717223, + "grad_norm": 3.828125, + "kl": 0.0, + "learning_rate": 3.382623542992217e-06, + "logits/chosen": 511227424.0, + "logits/rejected": 479211008.0, + "logps/chosen": -286.8712158203125, + "logps/rejected": -575.6029052734375, + "loss": 0.0173, + "rewards/chosen": 2.6160430908203125, + "rewards/margins": 12.208868662516275, + "rewards/rejected": -9.592825571695963, + "step": 6629 + }, + { + "epoch": 0.6057560529922339, + "grad_norm": 3.46875, + "kl": 0.0, + "learning_rate": 3.381263111023705e-06, + "logits/chosen": 341462720.0, + "logits/rejected": 361567264.0, + "logps/chosen": -257.3863220214844, + "logps/rejected": -279.2779541015625, + "loss": 0.0155, + "rewards/chosen": 4.595478057861328, + "rewards/margins": 12.355856895446777, + "rewards/rejected": -7.760378837585449, + "step": 6630 + }, + { + "epoch": 0.6058474189127455, + "grad_norm": 3.6875, + "kl": 0.0, + "learning_rate": 3.3799028129097346e-06, + "logits/chosen": 592866048.0, + "logits/rejected": 465104981.3333333, + "logps/chosen": -322.229638671875, + "logps/rejected": -564.7077229817709, + "loss": 0.0258, + "rewards/chosen": 3.216253662109375, + "rewards/margins": 11.914122009277344, + "rewards/rejected": -8.697868347167969, + "step": 6631 + }, + { + "epoch": 0.6059387848332572, + "grad_norm": 3.140625, + "kl": 0.0, + "learning_rate": 3.3785426487627913e-06, + "logits/chosen": 607046442.6666666, + "logits/rejected": 714948480.0, + "logps/chosen": -262.65639241536456, + "logps/rejected": -456.7892150878906, + "loss": 0.021, + "rewards/chosen": 3.916337331136068, + "rewards/margins": 11.959098180135092, + "rewards/rejected": -8.042760848999023, + "step": 6632 + }, + { + "epoch": 0.6060301507537689, + "grad_norm": 0.84375, + "kl": 0.0, + "learning_rate": 3.3771826186953483e-06, + "logits/chosen": 717868288.0, + "logits/rejected": 493104932.5714286, + "logps/chosen": -587.695068359375, + "logps/rejected": -409.8181849888393, + "loss": 0.004, + "rewards/chosen": 3.8341310024261475, + "rewards/margins": 11.749525376728602, + "rewards/rejected": -7.915394374302456, + "step": 6633 + }, + { + "epoch": 0.6061215166742805, + "grad_norm": 2.0625, + "kl": 0.0, + "learning_rate": 3.375822722819866e-06, + "logits/chosen": 344073312.0, + "logits/rejected": 508626517.3333333, + "logps/chosen": -372.26995849609375, + "logps/rejected": -573.1868489583334, + "loss": 0.0051, + "rewards/chosen": 4.356409072875977, + "rewards/margins": 14.198264439900717, + "rewards/rejected": -9.84185536702474, + "step": 6634 + }, + { + "epoch": 0.6062128825947921, + "grad_norm": 31.375, + "kl": 0.0, + "learning_rate": 3.374462961248797e-06, + "logits/chosen": 811085977.6, + "logits/rejected": 358137941.3333333, + "logps/chosen": -322.311328125, + "logps/rejected": -501.9943033854167, + "loss": 0.0884, + "rewards/chosen": 2.879517364501953, + "rewards/margins": 15.54811808268229, + "rewards/rejected": -12.668600718180338, + "step": 6635 + }, + { + "epoch": 0.6063042485153038, + "grad_norm": 1.875, + "kl": 0.0, + "learning_rate": 3.3731033340945797e-06, + "logits/chosen": 761195434.6666666, + "logits/rejected": 472661440.0, + "logps/chosen": -461.5760498046875, + "logps/rejected": -370.39617919921875, + "loss": 0.0113, + "rewards/chosen": 4.392388661702474, + "rewards/margins": 12.677779515584309, + "rewards/rejected": -8.285390853881836, + "step": 6636 + }, + { + "epoch": 0.6063956144358155, + "grad_norm": 2.359375, + "kl": 0.0, + "learning_rate": 3.3717438414696437e-06, + "logits/chosen": 558650709.3333334, + "logits/rejected": 521637427.2, + "logps/chosen": -322.52541097005206, + "logps/rejected": -516.526171875, + "loss": 0.0117, + "rewards/chosen": 3.833393414815267, + "rewards/margins": 12.73333314259847, + "rewards/rejected": -8.899939727783202, + "step": 6637 + }, + { + "epoch": 0.6064869803563271, + "grad_norm": 3.234375, + "kl": 0.0, + "learning_rate": 3.3703844834864045e-06, + "logits/chosen": 802693734.4, + "logits/rejected": 362176938.6666667, + "logps/chosen": -510.6771484375, + "logps/rejected": -561.5814615885416, + "loss": 0.023, + "rewards/chosen": 3.3235332489013674, + "rewards/margins": 13.56306635538737, + "rewards/rejected": -10.239533106486002, + "step": 6638 + }, + { + "epoch": 0.6065783462768387, + "grad_norm": 43.75, + "kl": 0.0, + "learning_rate": 3.3690252602572714e-06, + "logits/chosen": 540063424.0, + "logits/rejected": 430866688.0, + "logps/chosen": -318.31536865234375, + "logps/rejected": -603.4636840820312, + "loss": 0.0943, + "rewards/chosen": 2.9384796619415283, + "rewards/margins": 11.858065843582153, + "rewards/rejected": -8.919586181640625, + "step": 6639 + }, + { + "epoch": 0.6066697121973504, + "grad_norm": 2.296875, + "kl": 0.0, + "learning_rate": 3.367666171894637e-06, + "logits/chosen": 411132704.0, + "logits/rejected": 519681504.0, + "logps/chosen": -216.04080200195312, + "logps/rejected": -655.9232788085938, + "loss": 0.0147, + "rewards/chosen": 4.496868133544922, + "rewards/margins": 14.81182861328125, + "rewards/rejected": -10.314960479736328, + "step": 6640 + }, + { + "epoch": 0.6067610781178621, + "grad_norm": 3.125, + "kl": 0.0, + "learning_rate": 3.366307218510887e-06, + "logits/chosen": 437145920.0, + "logps/chosen": -389.4431457519531, + "loss": 0.0204, + "rewards/chosen": 4.458163261413574, + "step": 6641 + }, + { + "epoch": 0.6068524440383737, + "grad_norm": 2.34375, + "kl": 0.0, + "learning_rate": 3.3649484002183926e-06, + "logits/chosen": 419770848.0, + "logits/rejected": 295207232.0, + "logps/chosen": -222.0227813720703, + "logps/rejected": -515.7023315429688, + "loss": 0.019, + "rewards/chosen": 3.6778223514556885, + "rewards/margins": 11.355014085769653, + "rewards/rejected": -7.677191734313965, + "step": 6642 + }, + { + "epoch": 0.6069438099588853, + "grad_norm": 3.21875, + "kl": 0.0, + "learning_rate": 3.3635897171295174e-06, + "logits/chosen": 722474112.0, + "logits/rejected": 522983296.0, + "logps/chosen": -527.5004272460938, + "logps/rejected": -423.5655822753906, + "loss": 0.018, + "rewards/chosen": 3.4860010147094727, + "rewards/margins": 13.656134605407715, + "rewards/rejected": -10.170133590698242, + "step": 6643 + }, + { + "epoch": 0.607035175879397, + "grad_norm": 2.390625, + "kl": 0.0, + "learning_rate": 3.362231169356611e-06, + "logits/chosen": 551534299.4285715, + "logits/rejected": 499823136.0, + "logps/chosen": -244.01231166294642, + "logps/rejected": -598.4472045898438, + "loss": 0.0211, + "rewards/chosen": 4.059935433523996, + "rewards/margins": 15.601457459586008, + "rewards/rejected": -11.541522026062012, + "step": 6644 + }, + { + "epoch": 0.6071265417999087, + "grad_norm": 21.125, + "kl": 0.0, + "learning_rate": 3.3608727570120114e-06, + "logits/chosen": 801425472.0, + "logits/rejected": 427607722.6666667, + "logps/chosen": -172.32595825195312, + "logps/rejected": -467.4385579427083, + "loss": 0.1335, + "rewards/chosen": 1.3137198686599731, + "rewards/margins": 10.765339970588684, + "rewards/rejected": -9.451620101928711, + "step": 6645 + }, + { + "epoch": 0.6072179077204203, + "grad_norm": 4.5, + "kl": 0.0, + "learning_rate": 3.3595144802080493e-06, + "logits/chosen": 469007923.2, + "logits/rejected": 166655317.33333334, + "logps/chosen": -434.121875, + "logps/rejected": -361.1881510416667, + "loss": 0.0283, + "rewards/chosen": 3.940899658203125, + "rewards/margins": 15.10508041381836, + "rewards/rejected": -11.164180755615234, + "step": 6646 + }, + { + "epoch": 0.6073092736409319, + "grad_norm": 5.8125, + "kl": 0.0, + "learning_rate": 3.358156339057039e-06, + "logits/chosen": 479425638.4, + "logits/rejected": 390528341.3333333, + "logps/chosen": -263.55966796875, + "logps/rejected": -520.4380696614584, + "loss": 0.1408, + "rewards/chosen": 2.2409542083740233, + "rewards/margins": 12.490684636433919, + "rewards/rejected": -10.249730428059896, + "step": 6647 + }, + { + "epoch": 0.6074006395614436, + "grad_norm": 5.09375, + "kl": 0.0, + "learning_rate": 3.3567983336712884e-06, + "logits/chosen": 549420032.0, + "logits/rejected": 362991616.0, + "logps/chosen": -318.9159240722656, + "logps/rejected": -364.1001892089844, + "loss": 0.0254, + "rewards/chosen": 4.117502212524414, + "rewards/margins": 12.838706016540527, + "rewards/rejected": -8.721203804016113, + "step": 6648 + }, + { + "epoch": 0.6074920054819553, + "grad_norm": 3.328125, + "kl": 0.0, + "learning_rate": 3.3554404641630923e-06, + "logits/chosen": 536039338.6666667, + "logits/rejected": 400387520.0, + "logps/chosen": -278.28611246744794, + "logps/rejected": -530.3980102539062, + "loss": 0.0195, + "rewards/chosen": 3.8555100758870444, + "rewards/margins": 12.931948979695639, + "rewards/rejected": -9.076438903808594, + "step": 6649 + }, + { + "epoch": 0.6075833714024669, + "grad_norm": 3.25, + "kl": 0.0, + "learning_rate": 3.3540827306447306e-06, + "logits/chosen": 921996202.6666666, + "logits/rejected": 1867208576.0, + "logps/chosen": -549.0562337239584, + "logps/rejected": -647.7924194335938, + "loss": 0.0246, + "rewards/chosen": 3.542170524597168, + "rewards/margins": 14.361207008361816, + "rewards/rejected": -10.819036483764648, + "step": 6650 + }, + { + "epoch": 0.6076747373229785, + "grad_norm": 0.70703125, + "kl": 0.0, + "learning_rate": 3.3527251332284784e-06, + "logits/chosen": 359824128.0, + "logits/rejected": 577877760.0, + "logps/chosen": -256.3531188964844, + "logps/rejected": -519.7093912760416, + "loss": 0.0039, + "rewards/chosen": 4.219176292419434, + "rewards/margins": 13.500416119893393, + "rewards/rejected": -9.281239827473959, + "step": 6651 + }, + { + "epoch": 0.6077661032434902, + "grad_norm": 1.78125, + "kl": 0.0, + "learning_rate": 3.3513676720265937e-06, + "logits/chosen": 479783936.0, + "logits/rejected": 948158390.8571428, + "logps/chosen": -241.43801879882812, + "logps/rejected": -471.21142578125, + "loss": 0.0047, + "rewards/chosen": 5.162008762359619, + "rewards/margins": 14.129536969321114, + "rewards/rejected": -8.967528206961495, + "step": 6652 + }, + { + "epoch": 0.6078574691640019, + "grad_norm": 3.015625, + "kl": 0.0, + "learning_rate": 3.350010347151329e-06, + "logits/chosen": 813207978.6666666, + "logits/rejected": 1335154278.4, + "logps/chosen": -264.9363606770833, + "logps/rejected": -607.41728515625, + "loss": 0.025, + "rewards/chosen": 2.7274297078450522, + "rewards/margins": 13.006490071614584, + "rewards/rejected": -10.279060363769531, + "step": 6653 + }, + { + "epoch": 0.6079488350845135, + "grad_norm": 1.59375, + "kl": 0.0, + "learning_rate": 3.3486531587149207e-06, + "logits/chosen": 454114816.0, + "logits/rejected": 634290560.0, + "logps/chosen": -334.3435872395833, + "logps/rejected": -724.28759765625, + "loss": 0.0074, + "rewards/chosen": 5.208307266235352, + "rewards/margins": 15.44638729095459, + "rewards/rejected": -10.238080024719238, + "step": 6654 + }, + { + "epoch": 0.6080402010050251, + "grad_norm": 1.03125, + "kl": 0.0, + "learning_rate": 3.3472961068295957e-06, + "logits/chosen": 977676629.3333334, + "logits/rejected": 544031539.2, + "logps/chosen": -306.48040771484375, + "logps/rejected": -488.73203125, + "loss": 0.0057, + "rewards/chosen": 4.269158681233724, + "rewards/margins": 12.897062428792317, + "rewards/rejected": -8.627903747558594, + "step": 6655 + }, + { + "epoch": 0.6081315669255368, + "grad_norm": 0.29296875, + "kl": 0.0, + "learning_rate": 3.3459391916075712e-06, + "logits/chosen": 254934464.0, + "logits/rejected": 463493851.4285714, + "logps/chosen": -265.30804443359375, + "logps/rejected": -452.2096470424107, + "loss": 0.0013, + "rewards/chosen": 4.83424711227417, + "rewards/margins": 14.030265876225062, + "rewards/rejected": -9.196018763950892, + "step": 6656 + }, + { + "epoch": 0.6082229328460484, + "grad_norm": 3.671875, + "kl": 0.0, + "learning_rate": 3.3445824131610482e-06, + "logits/chosen": 413092864.0, + "logits/rejected": 392021632.0, + "logps/chosen": -306.2547200520833, + "logps/rejected": -560.3311767578125, + "loss": 0.0296, + "rewards/chosen": 3.4611644744873047, + "rewards/margins": 10.549378395080566, + "rewards/rejected": -7.088213920593262, + "step": 6657 + }, + { + "epoch": 0.6083142987665601, + "grad_norm": 2.765625, + "kl": 0.0, + "learning_rate": 3.3432257716022244e-06, + "logits/chosen": 592484505.6, + "logits/rejected": 1086201856.0, + "logps/chosen": -191.935595703125, + "logps/rejected": -894.622314453125, + "loss": 0.0231, + "rewards/chosen": 3.5496967315673826, + "rewards/margins": 15.866102981567384, + "rewards/rejected": -12.31640625, + "step": 6658 + }, + { + "epoch": 0.6084056646870717, + "grad_norm": 17.375, + "kl": 1.6774921417236328, + "learning_rate": 3.3418692670432768e-06, + "logits/chosen": 610482505.1428572, + "logits/rejected": 238851328.0, + "logps/chosen": -448.46058872767856, + "logps/rejected": -463.23291015625, + "loss": 0.043, + "rewards/chosen": 3.8252792358398438, + "rewards/margins": 16.022541999816895, + "rewards/rejected": -12.19726276397705, + "step": 6659 + }, + { + "epoch": 0.6084970306075834, + "grad_norm": 0.9921875, + "kl": 0.0, + "learning_rate": 3.3405128995963793e-06, + "logits/chosen": 578616640.0, + "logits/rejected": 743451648.0, + "logps/chosen": -225.50808715820312, + "logps/rejected": -423.585205078125, + "loss": 0.0058, + "rewards/chosen": 3.8533196449279785, + "rewards/margins": 11.69065014521281, + "rewards/rejected": -7.837330500284831, + "step": 6660 + }, + { + "epoch": 0.608588396528095, + "grad_norm": 1.2265625, + "kl": 0.0, + "learning_rate": 3.339156669373688e-06, + "logits/chosen": 401174848.0, + "logits/rejected": 525004970.6666667, + "logps/chosen": -383.27520751953125, + "logps/rejected": -630.8395182291666, + "loss": 0.0054, + "rewards/chosen": 4.55978536605835, + "rewards/margins": 13.180779933929443, + "rewards/rejected": -8.620994567871094, + "step": 6661 + }, + { + "epoch": 0.6086797624486067, + "grad_norm": 22.125, + "kl": 0.0, + "learning_rate": 3.337800576487354e-06, + "logits/chosen": 609788352.0, + "logits/rejected": 634206080.0, + "logps/chosen": -194.90765380859375, + "logps/rejected": -270.1257629394531, + "loss": 0.113, + "rewards/chosen": 3.0568761825561523, + "rewards/margins": 10.826654434204102, + "rewards/rejected": -7.769778251647949, + "step": 6662 + }, + { + "epoch": 0.6087711283691183, + "grad_norm": 2.171875, + "kl": 0.0, + "learning_rate": 3.33644462104951e-06, + "logits/chosen": 612931635.2, + "logits/rejected": 423954005.3333333, + "logps/chosen": -354.0334228515625, + "logps/rejected": -882.1318359375, + "loss": 0.1349, + "rewards/chosen": 2.3060070037841798, + "rewards/margins": 11.523226038614908, + "rewards/rejected": -9.217219034830729, + "step": 6663 + }, + { + "epoch": 0.60886249428963, + "grad_norm": 1.59375, + "kl": 0.0, + "learning_rate": 3.3350888031722837e-06, + "logits/chosen": 328933312.0, + "logits/rejected": 302175616.0, + "logps/chosen": -125.29263305664062, + "logps/rejected": -285.9346516927083, + "loss": 0.0141, + "rewards/chosen": 3.6113643646240234, + "rewards/margins": 11.33437220255534, + "rewards/rejected": -7.723007837931315, + "step": 6664 + }, + { + "epoch": 0.6089538602101416, + "grad_norm": 3.03125, + "kl": 0.0, + "learning_rate": 3.333733122967788e-06, + "logits/chosen": 576973994.6666666, + "logits/rejected": 620845465.6, + "logps/chosen": -333.6504313151042, + "logps/rejected": -645.38876953125, + "loss": 0.0236, + "rewards/chosen": 2.822652816772461, + "rewards/margins": 12.808207321166993, + "rewards/rejected": -9.985554504394532, + "step": 6665 + }, + { + "epoch": 0.6090452261306533, + "grad_norm": 4.0625, + "kl": 0.0, + "learning_rate": 3.332377580548124e-06, + "logits/chosen": 486292684.8, + "logits/rejected": 315194069.3333333, + "logps/chosen": -349.6265380859375, + "logps/rejected": -472.8500162760417, + "loss": 0.0292, + "rewards/chosen": 3.0912498474121093, + "rewards/margins": 14.551261647542319, + "rewards/rejected": -11.460011800130209, + "step": 6666 + }, + { + "epoch": 0.6091365920511649, + "grad_norm": 3.3125, + "kl": 0.0, + "learning_rate": 3.3310221760253834e-06, + "logits/chosen": 576250581.3333334, + "logits/rejected": 999601984.0, + "logps/chosen": -266.6703694661458, + "logps/rejected": -675.7557983398438, + "loss": 0.0294, + "rewards/chosen": 3.319779396057129, + "rewards/margins": 13.473307609558105, + "rewards/rejected": -10.153528213500977, + "step": 6667 + }, + { + "epoch": 0.6092279579716766, + "grad_norm": 5.46875, + "kl": 0.0, + "learning_rate": 3.3296669095116454e-06, + "logits/chosen": 918900352.0, + "logits/rejected": 1319158912.0, + "logps/chosen": -452.8284912109375, + "logps/rejected": -961.765869140625, + "loss": 0.0285, + "rewards/chosen": 2.992971420288086, + "rewards/margins": 13.931558609008789, + "rewards/rejected": -10.938587188720703, + "step": 6668 + }, + { + "epoch": 0.6093193238921882, + "grad_norm": 3.03125, + "kl": 0.0, + "learning_rate": 3.328311781118978e-06, + "logits/chosen": 977582208.0, + "logits/rejected": 490856576.0, + "logps/chosen": -443.4272766113281, + "logps/rejected": -564.6066080729166, + "loss": 0.0142, + "rewards/chosen": 2.824070930480957, + "rewards/margins": 13.954932848612467, + "rewards/rejected": -11.13086191813151, + "step": 6669 + }, + { + "epoch": 0.6094106898126999, + "grad_norm": 3.21875, + "kl": 0.0, + "learning_rate": 3.326956790959437e-06, + "logits/chosen": 822258944.0, + "logits/rejected": 527197269.3333333, + "logps/chosen": -617.8665771484375, + "logps/rejected": -612.7374674479166, + "loss": 0.0143, + "rewards/chosen": 2.8734068870544434, + "rewards/margins": 12.279290676116943, + "rewards/rejected": -9.4058837890625, + "step": 6670 + }, + { + "epoch": 0.6095020557332115, + "grad_norm": 3.203125, + "kl": 0.0, + "learning_rate": 3.3256019391450696e-06, + "logits/chosen": 415876480.0, + "logits/rejected": 410617830.4, + "logps/chosen": -232.63704427083334, + "logps/rejected": -502.054443359375, + "loss": 0.1297, + "rewards/chosen": 1.5539614359537761, + "rewards/margins": 10.387062327067056, + "rewards/rejected": -8.833100891113281, + "step": 6671 + }, + { + "epoch": 0.6095934216537232, + "grad_norm": 7.375, + "kl": 0.0, + "learning_rate": 3.3242472257879066e-06, + "logits/chosen": 423953706.6666667, + "logits/rejected": 542728396.8, + "logps/chosen": -386.5862223307292, + "logps/rejected": -489.57119140625, + "loss": 0.0175, + "rewards/chosen": 3.9749272664388022, + "rewards/margins": 13.199295552571614, + "rewards/rejected": -9.224368286132812, + "step": 6672 + }, + { + "epoch": 0.6096847875742348, + "grad_norm": 1.9609375, + "kl": 0.0, + "learning_rate": 3.322892650999974e-06, + "logits/chosen": 550453120.0, + "logits/rejected": 437381529.6, + "logps/chosen": -350.8221435546875, + "logps/rejected": -504.335693359375, + "loss": 0.0152, + "rewards/chosen": 3.246049245198568, + "rewards/margins": 14.67360356648763, + "rewards/rejected": -11.427554321289062, + "step": 6673 + }, + { + "epoch": 0.6097761534947465, + "grad_norm": 1.078125, + "kl": 0.0, + "learning_rate": 3.3215382148932786e-06, + "logits/chosen": 354903756.8, + "logits/rejected": 382153856.0, + "logps/chosen": -149.2926513671875, + "logps/rejected": -595.3264973958334, + "loss": 0.008, + "rewards/chosen": 4.54515380859375, + "rewards/margins": 14.998072814941406, + "rewards/rejected": -10.452919006347656, + "step": 6674 + }, + { + "epoch": 0.6098675194152581, + "grad_norm": 2.046875, + "kl": 0.0, + "learning_rate": 3.3201839175798224e-06, + "logits/chosen": 598891178.6666666, + "logits/rejected": 551788032.0, + "logps/chosen": -293.04543050130206, + "logps/rejected": -676.46484375, + "loss": 0.0178, + "rewards/chosen": 3.8659776051839194, + "rewards/margins": 15.884496053059896, + "rewards/rejected": -12.018518447875977, + "step": 6675 + }, + { + "epoch": 0.6099588853357698, + "grad_norm": 4.375, + "kl": 0.0, + "learning_rate": 3.3188297591715936e-06, + "logits/chosen": 808887125.3333334, + "logits/rejected": 459798425.6, + "logps/chosen": -411.4949951171875, + "logps/rejected": -457.71162109375, + "loss": 0.0259, + "rewards/chosen": 2.608491897583008, + "rewards/margins": 12.176718521118165, + "rewards/rejected": -9.568226623535157, + "step": 6676 + }, + { + "epoch": 0.6100502512562814, + "grad_norm": 2.859375, + "kl": 0.0, + "learning_rate": 3.317475739780567e-06, + "logits/chosen": 695305728.0, + "logits/rejected": 342942208.0, + "logps/chosen": -380.4451416015625, + "logps/rejected": -447.569580078125, + "loss": 0.0207, + "rewards/chosen": 3.516804504394531, + "rewards/margins": 13.219641494750977, + "rewards/rejected": -9.702836990356445, + "step": 6677 + }, + { + "epoch": 0.610141617176793, + "grad_norm": 2.6875, + "kl": 0.0, + "learning_rate": 3.3161218595187057e-06, + "logits/chosen": 1017072192.0, + "logits/rejected": 496450144.0, + "logps/chosen": -332.802001953125, + "logps/rejected": -484.42132568359375, + "loss": 0.0347, + "rewards/chosen": 3.619840383529663, + "rewards/margins": 9.773276090621948, + "rewards/rejected": -6.153435707092285, + "step": 6678 + }, + { + "epoch": 0.6102329830973047, + "grad_norm": 8.5, + "kl": 0.0, + "learning_rate": 3.314768118497966e-06, + "logits/chosen": 530693056.0, + "logits/rejected": 436338016.0, + "logps/chosen": -401.99554443359375, + "logps/rejected": -528.528076171875, + "loss": 0.018, + "rewards/chosen": 4.156134128570557, + "rewards/margins": 10.543023586273193, + "rewards/rejected": -6.386889457702637, + "step": 6679 + }, + { + "epoch": 0.6103243490178164, + "grad_norm": 2.84375, + "kl": 0.9230117797851562, + "learning_rate": 3.313414516830289e-06, + "logits/chosen": 437848149.3333333, + "logits/rejected": 315543712.0, + "logps/chosen": -339.36887613932294, + "logps/rejected": -444.832763671875, + "loss": 0.0181, + "rewards/chosen": 4.355284690856934, + "rewards/margins": 12.39596176147461, + "rewards/rejected": -8.040677070617676, + "step": 6680 + }, + { + "epoch": 0.610415714938328, + "grad_norm": 4.78125, + "kl": 0.0, + "learning_rate": 3.3120610546276057e-06, + "logits/chosen": 545759616.0, + "logits/rejected": 833567914.6666666, + "logps/chosen": -344.13134765625, + "logps/rejected": -531.2872721354166, + "loss": 0.0116, + "rewards/chosen": 3.587780714035034, + "rewards/margins": 13.1567542552948, + "rewards/rejected": -9.568973541259766, + "step": 6681 + }, + { + "epoch": 0.6105070808588396, + "grad_norm": 0.9609375, + "kl": 0.0, + "learning_rate": 3.310707732001832e-06, + "logits/chosen": 435846400.0, + "logits/rejected": 552869504.0, + "logps/chosen": -258.3304443359375, + "logps/rejected": -524.7774658203125, + "loss": 0.0052, + "rewards/chosen": 5.095067024230957, + "rewards/margins": 13.051856994628906, + "rewards/rejected": -7.956789970397949, + "step": 6682 + }, + { + "epoch": 0.6105984467793513, + "grad_norm": 1.28125, + "kl": 0.0, + "learning_rate": 3.309354549064879e-06, + "logits/chosen": 766595379.2, + "logits/rejected": 526036906.6666667, + "logps/chosen": -419.829150390625, + "logps/rejected": -286.11181640625, + "loss": 0.008, + "rewards/chosen": 4.502835845947265, + "rewards/margins": 12.398912048339843, + "rewards/rejected": -7.896076202392578, + "step": 6683 + }, + { + "epoch": 0.610689812699863, + "grad_norm": 4.0625, + "kl": 0.0, + "learning_rate": 3.3080015059286397e-06, + "logits/chosen": 501061427.2, + "logits/rejected": 328276992.0, + "logps/chosen": -294.768896484375, + "logps/rejected": -230.62640380859375, + "loss": 0.0229, + "rewards/chosen": 3.851848602294922, + "rewards/margins": 10.660079956054688, + "rewards/rejected": -6.808231353759766, + "step": 6684 + }, + { + "epoch": 0.6107811786203746, + "grad_norm": 3.90625, + "kl": 0.0, + "learning_rate": 3.3066486027050005e-06, + "logits/chosen": 470768896.0, + "logits/rejected": 452199552.0, + "logps/chosen": -253.8997802734375, + "logps/rejected": -457.3600667317708, + "loss": 0.0359, + "rewards/chosen": 2.8355695724487306, + "rewards/margins": 12.076938184102378, + "rewards/rejected": -9.241368611653646, + "step": 6685 + }, + { + "epoch": 0.6108725445408862, + "grad_norm": 3.5625, + "kl": 0.0, + "learning_rate": 3.305295839505832e-06, + "logits/chosen": 860719513.6, + "logits/rejected": 484477184.0, + "logps/chosen": -724.148681640625, + "logps/rejected": -428.0225423177083, + "loss": 0.0161, + "rewards/chosen": 4.1939849853515625, + "rewards/margins": 11.13659159342448, + "rewards/rejected": -6.942606608072917, + "step": 6686 + }, + { + "epoch": 0.6109639104613979, + "grad_norm": 62.0, + "kl": 0.0, + "learning_rate": 3.3039432164429947e-06, + "logits/chosen": 268862656.0, + "logits/rejected": 520947748.5714286, + "logps/chosen": -102.80908966064453, + "logps/rejected": -595.4851422991071, + "loss": 0.0376, + "rewards/chosen": 0.8485817313194275, + "rewards/margins": 10.836686415331704, + "rewards/rejected": -9.988104684012276, + "step": 6687 + }, + { + "epoch": 0.6110552763819096, + "grad_norm": 3.453125, + "kl": 0.0, + "learning_rate": 3.30259073362834e-06, + "logits/chosen": 552248405.3333334, + "logits/rejected": 372095168.0, + "logps/chosen": -295.2454833984375, + "logps/rejected": -483.62200927734375, + "loss": 0.0352, + "rewards/chosen": 3.5843044916788735, + "rewards/margins": 15.004406611124674, + "rewards/rejected": -11.4201021194458, + "step": 6688 + }, + { + "epoch": 0.6111466423024212, + "grad_norm": 2.734375, + "kl": 0.0, + "learning_rate": 3.3012383911737043e-06, + "logits/chosen": 667612800.0, + "logits/rejected": 429365205.3333333, + "logps/chosen": -530.0064697265625, + "logps/rejected": -483.27734375, + "loss": 0.0092, + "rewards/chosen": 3.7045562267303467, + "rewards/margins": 13.630475441614786, + "rewards/rejected": -9.92591921488444, + "step": 6689 + }, + { + "epoch": 0.6112380082229328, + "grad_norm": 2.890625, + "kl": 0.0, + "learning_rate": 3.299886189190915e-06, + "logits/chosen": 487850176.0, + "logits/rejected": 537798336.0, + "logps/chosen": -296.1246337890625, + "logps/rejected": -507.07672119140625, + "loss": 0.0213, + "rewards/chosen": 3.358501434326172, + "rewards/margins": 13.259592056274414, + "rewards/rejected": -9.901090621948242, + "step": 6690 + }, + { + "epoch": 0.6113293741434445, + "grad_norm": 3.828125, + "kl": 0.0, + "learning_rate": 3.298534127791785e-06, + "logits/chosen": 458439776.0, + "logits/rejected": 439066880.0, + "logps/chosen": -280.3526306152344, + "logps/rejected": -549.549560546875, + "loss": 0.0243, + "rewards/chosen": 3.564596176147461, + "rewards/margins": 14.193964004516602, + "rewards/rejected": -10.62936782836914, + "step": 6691 + }, + { + "epoch": 0.6114207400639562, + "grad_norm": 2.484375, + "kl": 0.0, + "learning_rate": 3.2971822070881193e-06, + "logits/chosen": 310055884.8, + "logits/rejected": 399353600.0, + "logps/chosen": -171.632958984375, + "logps/rejected": -452.7274169921875, + "loss": 0.1323, + "rewards/chosen": 2.8950069427490233, + "rewards/margins": 11.179112243652344, + "rewards/rejected": -8.28410530090332, + "step": 6692 + }, + { + "epoch": 0.6115121059844678, + "grad_norm": 3.09375, + "kl": 0.0, + "learning_rate": 3.295830427191707e-06, + "logits/chosen": 716442794.6666666, + "logits/rejected": 617434880.0, + "logps/chosen": -364.3721516927083, + "logps/rejected": -531.816455078125, + "loss": 0.1098, + "rewards/chosen": 4.399537404378255, + "rewards/margins": 11.206583913167318, + "rewards/rejected": -6.807046508789062, + "step": 6693 + }, + { + "epoch": 0.6116034719049794, + "grad_norm": 5.1875, + "kl": 0.0, + "learning_rate": 3.29447878821433e-06, + "logits/chosen": 574654592.0, + "logits/rejected": 350969446.4, + "logps/chosen": -310.298583984375, + "logps/rejected": -472.9572265625, + "loss": 0.0324, + "rewards/chosen": 2.5696517626444497, + "rewards/margins": 14.10050884882609, + "rewards/rejected": -11.53085708618164, + "step": 6694 + }, + { + "epoch": 0.6116948378254911, + "grad_norm": 6.84375, + "kl": 0.0, + "learning_rate": 3.2931272902677534e-06, + "logits/chosen": 1051122380.8, + "logits/rejected": 627444608.0, + "logps/chosen": -280.931201171875, + "logps/rejected": -461.1832682291667, + "loss": 0.1466, + "rewards/chosen": 1.75134334564209, + "rewards/margins": 9.933308219909668, + "rewards/rejected": -8.181964874267578, + "step": 6695 + }, + { + "epoch": 0.6117862037460028, + "grad_norm": 43.25, + "kl": 0.0, + "learning_rate": 3.2917759334637376e-06, + "logits/chosen": 431755136.0, + "logits/rejected": 679027008.0, + "logps/chosen": -243.00794982910156, + "logps/rejected": -527.800048828125, + "loss": 0.1102, + "rewards/chosen": 2.069735527038574, + "rewards/margins": 10.99817180633545, + "rewards/rejected": -8.928436279296875, + "step": 6696 + }, + { + "epoch": 0.6118775696665144, + "grad_norm": 1.765625, + "kl": 0.0, + "learning_rate": 3.290424717914025e-06, + "logits/chosen": 669951360.0, + "logits/rejected": 963055424.0, + "logps/chosen": -253.8963623046875, + "logps/rejected": -377.2945556640625, + "loss": 0.0113, + "rewards/chosen": 4.278436183929443, + "rewards/margins": 12.878098964691162, + "rewards/rejected": -8.599662780761719, + "step": 6697 + }, + { + "epoch": 0.611968935587026, + "grad_norm": 0.90234375, + "kl": 0.0, + "learning_rate": 3.2890736437303467e-06, + "logits/chosen": 447105472.0, + "logits/rejected": 745997994.6666666, + "logps/chosen": -305.9853515625, + "logps/rejected": -663.2178548177084, + "loss": 0.003, + "rewards/chosen": 5.042546272277832, + "rewards/margins": 14.940361976623535, + "rewards/rejected": -9.897815704345703, + "step": 6698 + }, + { + "epoch": 0.6120603015075377, + "grad_norm": 1.7109375, + "kl": 0.0, + "learning_rate": 3.2877227110244274e-06, + "logits/chosen": 1473129301.3333333, + "logits/rejected": 1556390502.4, + "logps/chosen": -273.83681233723956, + "logps/rejected": -490.960400390625, + "loss": 0.0119, + "rewards/chosen": 3.491461435953776, + "rewards/margins": 13.108526102701822, + "rewards/rejected": -9.617064666748046, + "step": 6699 + }, + { + "epoch": 0.6121516674280494, + "grad_norm": 3.5625, + "kl": 0.0, + "learning_rate": 3.2863719199079747e-06, + "logits/chosen": 626310348.8, + "logits/rejected": 1226278826.6666667, + "logps/chosen": -322.1966064453125, + "logps/rejected": -594.6737467447916, + "loss": 0.0227, + "rewards/chosen": 3.986041259765625, + "rewards/margins": 13.16330769856771, + "rewards/rejected": -9.177266438802084, + "step": 6700 + }, + { + "epoch": 0.612243033348561, + "grad_norm": 2.1875, + "kl": 0.0, + "learning_rate": 3.2850212704926875e-06, + "logits/chosen": 923440230.4, + "logits/rejected": 823878826.6666666, + "logps/chosen": -283.4786376953125, + "logps/rejected": -768.3328450520834, + "loss": 0.0099, + "rewards/chosen": 4.74365005493164, + "rewards/margins": 13.123798624674478, + "rewards/rejected": -8.380148569742838, + "step": 6701 + }, + { + "epoch": 0.6123343992690726, + "grad_norm": 1.234375, + "kl": 0.0, + "learning_rate": 3.2836707628902533e-06, + "logits/chosen": 422735317.3333333, + "logits/rejected": 641371852.8, + "logps/chosen": -203.68756103515625, + "logps/rejected": -666.325341796875, + "loss": 0.0072, + "rewards/chosen": 4.20439338684082, + "rewards/margins": 14.395209121704102, + "rewards/rejected": -10.190815734863282, + "step": 6702 + }, + { + "epoch": 0.6124257651895842, + "grad_norm": 3.53125, + "kl": 0.0, + "learning_rate": 3.282320397212342e-06, + "logits/chosen": 542288384.0, + "logits/rejected": 269197482.6666667, + "logps/chosen": -355.355908203125, + "logps/rejected": -436.901611328125, + "loss": 0.0206, + "rewards/chosen": 3.99290771484375, + "rewards/margins": 13.228706868489585, + "rewards/rejected": -9.235799153645834, + "step": 6703 + }, + { + "epoch": 0.612517131110096, + "grad_norm": 2.140625, + "kl": 0.0, + "learning_rate": 3.2809701735706212e-06, + "logits/chosen": 730540544.0, + "logits/rejected": 596817322.6666666, + "logps/chosen": -363.8606689453125, + "logps/rejected": -560.9930419921875, + "loss": 0.0195, + "rewards/chosen": 3.961956024169922, + "rewards/margins": 12.989279429117838, + "rewards/rejected": -9.027323404947916, + "step": 6704 + }, + { + "epoch": 0.6126084970306076, + "grad_norm": 0.50390625, + "kl": 0.0, + "learning_rate": 3.2796200920767377e-06, + "logits/chosen": 991133696.0, + "logits/rejected": 444176091.4285714, + "logps/chosen": -903.49951171875, + "logps/rejected": -408.23793247767856, + "loss": 0.0021, + "rewards/chosen": 4.675586223602295, + "rewards/margins": 12.61841971533639, + "rewards/rejected": -7.942833491734096, + "step": 6705 + }, + { + "epoch": 0.6126998629511192, + "grad_norm": 3.1875, + "kl": 0.0, + "learning_rate": 3.278270152842335e-06, + "logits/chosen": 374168294.4, + "logits/rejected": 295707968.0, + "logps/chosen": -263.6655517578125, + "logps/rejected": -408.7963460286458, + "loss": 0.0253, + "rewards/chosen": 3.531149673461914, + "rewards/margins": 13.196689732869466, + "rewards/rejected": -9.665540059407553, + "step": 6706 + }, + { + "epoch": 0.6127912288716308, + "grad_norm": 2.859375, + "kl": 0.0, + "learning_rate": 3.276920355979038e-06, + "logits/chosen": 553656917.3333334, + "logits/rejected": 509052057.6, + "logps/chosen": -189.27766927083334, + "logps/rejected": -382.9215087890625, + "loss": 0.0144, + "rewards/chosen": 3.594493548075358, + "rewards/margins": 11.90412794748942, + "rewards/rejected": -8.309634399414062, + "step": 6707 + }, + { + "epoch": 0.6128825947921426, + "grad_norm": 2.09375, + "kl": 0.0, + "learning_rate": 3.2755707015984618e-06, + "logits/chosen": 623853141.3333334, + "logits/rejected": 560596172.8, + "logps/chosen": -109.7009989420573, + "logps/rejected": -571.873486328125, + "loss": 0.0153, + "rewards/chosen": 3.6244468688964844, + "rewards/margins": 13.212006378173829, + "rewards/rejected": -9.587559509277344, + "step": 6708 + }, + { + "epoch": 0.6129739607126542, + "grad_norm": 1.859375, + "kl": 0.0, + "learning_rate": 3.2742211898122123e-06, + "logits/chosen": 599009740.8, + "logits/rejected": 461430400.0, + "logps/chosen": -291.94990234375, + "logps/rejected": -207.88789876302084, + "loss": 0.0148, + "rewards/chosen": 4.043598937988281, + "rewards/margins": 11.856561787923177, + "rewards/rejected": -7.8129628499348955, + "step": 6709 + }, + { + "epoch": 0.6130653266331658, + "grad_norm": 1.3671875, + "kl": 0.0, + "learning_rate": 3.27287182073188e-06, + "logits/chosen": 879829674.6666666, + "logits/rejected": 590270771.2, + "logps/chosen": -200.52461751302084, + "logps/rejected": -443.54169921875, + "loss": 0.0059, + "rewards/chosen": 4.602078119913737, + "rewards/margins": 12.89093844095866, + "rewards/rejected": -8.288860321044922, + "step": 6710 + }, + { + "epoch": 0.6131566925536774, + "grad_norm": 2.375, + "kl": 0.0, + "learning_rate": 3.2715225944690466e-06, + "logits/chosen": 622779477.3333334, + "logits/rejected": 993905459.2, + "logps/chosen": -438.6297607421875, + "logps/rejected": -633.000732421875, + "loss": 0.0126, + "rewards/chosen": 3.3771082560221353, + "rewards/margins": 14.535250345865885, + "rewards/rejected": -11.15814208984375, + "step": 6711 + }, + { + "epoch": 0.6132480584741892, + "grad_norm": 2.390625, + "kl": 0.0, + "learning_rate": 3.2701735111352794e-06, + "logits/chosen": 677165141.3333334, + "logits/rejected": 607978240.0, + "logps/chosen": -291.22381591796875, + "logps/rejected": -463.51103515625, + "loss": 0.0143, + "rewards/chosen": 3.4508612950642905, + "rewards/margins": 12.51343911488851, + "rewards/rejected": -9.062577819824218, + "step": 6712 + }, + { + "epoch": 0.6133394243947008, + "grad_norm": 3.109375, + "kl": 0.0, + "learning_rate": 3.268824570842136e-06, + "logits/chosen": 1301243562.6666667, + "logits/rejected": 587240038.4, + "logps/chosen": -425.9496663411458, + "logps/rejected": -413.615625, + "loss": 0.0196, + "rewards/chosen": 4.0349915822347, + "rewards/margins": 13.055919011433918, + "rewards/rejected": -9.020927429199219, + "step": 6713 + }, + { + "epoch": 0.6134307903152124, + "grad_norm": 2.59375, + "kl": 0.0, + "learning_rate": 3.267475773701161e-06, + "logits/chosen": 616215466.6666666, + "logits/rejected": 493744576.0, + "logps/chosen": -409.7069091796875, + "logps/rejected": -387.5353088378906, + "loss": 0.0164, + "rewards/chosen": 4.306569417317708, + "rewards/margins": 12.40108807881673, + "rewards/rejected": -8.094518661499023, + "step": 6714 + }, + { + "epoch": 0.613522156235724, + "grad_norm": 75.5, + "kl": 0.0, + "learning_rate": 3.2661271198238875e-06, + "logits/chosen": 617697066.6666666, + "logits/rejected": 622203968.0, + "logps/chosen": -380.928466796875, + "logps/rejected": -549.2052612304688, + "loss": 0.0881, + "rewards/chosen": 3.2570699055989585, + "rewards/margins": 10.689214070638021, + "rewards/rejected": -7.4321441650390625, + "step": 6715 + }, + { + "epoch": 0.6136135221562358, + "grad_norm": 2.359375, + "kl": 0.0, + "learning_rate": 3.264778609321835e-06, + "logits/chosen": 607585536.0, + "logits/rejected": 556612800.0, + "logps/chosen": -348.63525390625, + "logps/rejected": -305.639404296875, + "loss": 0.0147, + "rewards/chosen": 3.6483983993530273, + "rewards/margins": 11.05761432647705, + "rewards/rejected": -7.409215927124023, + "step": 6716 + }, + { + "epoch": 0.6137048880767474, + "grad_norm": 1.3359375, + "kl": 0.0, + "learning_rate": 3.263430242306516e-06, + "logits/chosen": 791350937.6, + "logits/rejected": 480685141.3333333, + "logps/chosen": -288.869140625, + "logps/rejected": -436.9556477864583, + "loss": 0.0088, + "rewards/chosen": 4.706594848632813, + "rewards/margins": 12.324523035685221, + "rewards/rejected": -7.617928187052409, + "step": 6717 + }, + { + "epoch": 0.613796253997259, + "grad_norm": 4.21875, + "kl": 0.0, + "learning_rate": 3.2620820188894264e-06, + "logits/chosen": 440505270.85714287, + "logits/rejected": 516204896.0, + "logps/chosen": -245.887451171875, + "logps/rejected": -387.52783203125, + "loss": 0.03, + "rewards/chosen": 4.157765252249582, + "rewards/margins": 13.518394333975657, + "rewards/rejected": -9.360629081726074, + "step": 6718 + }, + { + "epoch": 0.6138876199177706, + "grad_norm": 2.421875, + "kl": 0.0, + "learning_rate": 3.26073393918205e-06, + "logits/chosen": 519473152.0, + "logits/rejected": 645829017.6, + "logps/chosen": -347.2146809895833, + "logps/rejected": -309.0753662109375, + "loss": 0.0178, + "rewards/chosen": 3.2306086222330728, + "rewards/margins": 11.125862375895181, + "rewards/rejected": -7.895253753662109, + "step": 6719 + }, + { + "epoch": 0.6139789858382824, + "grad_norm": 3.84375, + "kl": 0.0, + "learning_rate": 3.259386003295864e-06, + "logits/chosen": 532105386.6666667, + "logits/rejected": 305795379.2, + "logps/chosen": -200.67230224609375, + "logps/rejected": -406.6138916015625, + "loss": 0.0215, + "rewards/chosen": 3.5431346893310547, + "rewards/margins": 12.531877517700195, + "rewards/rejected": -8.98874282836914, + "step": 6720 + }, + { + "epoch": 0.614070351758794, + "grad_norm": 3.296875, + "kl": 0.0, + "learning_rate": 3.258038211342327e-06, + "logits/chosen": 581195673.6, + "logits/rejected": 409122773.3333333, + "logps/chosen": -320.98056640625, + "logps/rejected": -311.197998046875, + "loss": 0.0197, + "rewards/chosen": 4.024745178222656, + "rewards/margins": 11.84413324991862, + "rewards/rejected": -7.819388071695964, + "step": 6721 + }, + { + "epoch": 0.6141617176793056, + "grad_norm": 4.4375, + "kl": 0.0, + "learning_rate": 3.2566905634328917e-06, + "logits/chosen": 623175314.2857143, + "logits/rejected": 243257632.0, + "logps/chosen": -592.9188058035714, + "logps/rejected": -81.19755554199219, + "loss": 0.0319, + "rewards/chosen": 3.6185095650809154, + "rewards/margins": 6.679456268038068, + "rewards/rejected": -3.0609467029571533, + "step": 6722 + }, + { + "epoch": 0.6142530835998172, + "grad_norm": 0.7109375, + "kl": 0.0, + "learning_rate": 3.255343059678993e-06, + "logits/chosen": 496950368.0, + "logits/rejected": 763938596.5714285, + "logps/chosen": -586.276611328125, + "logps/rejected": -468.19778878348217, + "loss": 0.0024, + "rewards/chosen": 4.129846096038818, + "rewards/margins": 11.952685015542166, + "rewards/rejected": -7.822838919503348, + "step": 6723 + }, + { + "epoch": 0.614344449520329, + "grad_norm": 1.3828125, + "kl": 0.0, + "learning_rate": 3.25399570019206e-06, + "logits/chosen": 235138752.0, + "logits/rejected": 311103317.3333333, + "logps/chosen": -205.0502166748047, + "logps/rejected": -611.6640625, + "loss": 0.0092, + "rewards/chosen": 3.9369077682495117, + "rewards/margins": 13.573689460754395, + "rewards/rejected": -9.636781692504883, + "step": 6724 + }, + { + "epoch": 0.6144358154408406, + "grad_norm": 2.25, + "kl": 0.0, + "learning_rate": 3.2526484850835048e-06, + "logits/chosen": 99595016.0, + "logits/rejected": 362406326.85714287, + "logps/chosen": -5.135448455810547, + "logps/rejected": -425.40004185267856, + "loss": 0.0263, + "rewards/chosen": 1.506809115409851, + "rewards/margins": 10.4286470583507, + "rewards/rejected": -8.921837942940849, + "step": 6725 + }, + { + "epoch": 0.6145271813613522, + "grad_norm": 3.3125, + "kl": 0.0, + "learning_rate": 3.2513014144647305e-06, + "logits/chosen": 436701081.6, + "logits/rejected": 468805717.3333333, + "logps/chosen": -410.196044921875, + "logps/rejected": -449.2660319010417, + "loss": 0.0234, + "rewards/chosen": 3.521987533569336, + "rewards/margins": 12.169340387980142, + "rewards/rejected": -8.647352854410807, + "step": 6726 + }, + { + "epoch": 0.6146185472818638, + "grad_norm": 2.703125, + "kl": 0.0, + "learning_rate": 3.249954488447127e-06, + "logits/chosen": 592059545.6, + "logits/rejected": 1679005696.0, + "logps/chosen": -281.1804931640625, + "logps/rejected": -946.483154296875, + "loss": 0.0213, + "rewards/chosen": 3.5055110931396483, + "rewards/margins": 14.831476974487305, + "rewards/rejected": -11.325965881347656, + "step": 6727 + }, + { + "epoch": 0.6147099132023756, + "grad_norm": 1.0234375, + "kl": 0.0, + "learning_rate": 3.248607707142073e-06, + "logits/chosen": 207210905.6, + "logits/rejected": 271313493.3333333, + "logps/chosen": -207.5801513671875, + "logps/rejected": -609.0503336588541, + "loss": 0.0066, + "rewards/chosen": 4.930657196044922, + "rewards/margins": 14.988575744628907, + "rewards/rejected": -10.057918548583984, + "step": 6728 + }, + { + "epoch": 0.6148012791228872, + "grad_norm": 3.625, + "kl": 0.0, + "learning_rate": 3.2472610706609363e-06, + "logits/chosen": 484053248.0, + "logits/rejected": 652962112.0, + "logps/chosen": -287.7747802734375, + "logps/rejected": -365.2024230957031, + "loss": 0.026, + "rewards/chosen": 3.6823577880859375, + "rewards/margins": 13.414119720458984, + "rewards/rejected": -9.731761932373047, + "step": 6729 + }, + { + "epoch": 0.6148926450433988, + "grad_norm": 2.703125, + "kl": 0.0, + "learning_rate": 3.2459145791150683e-06, + "logits/chosen": 786292650.6666666, + "logits/rejected": 389470912.0, + "logps/chosen": -302.09747314453125, + "logps/rejected": -379.91314697265625, + "loss": 0.019, + "rewards/chosen": 4.034958203633626, + "rewards/margins": 12.755335172017414, + "rewards/rejected": -8.720376968383789, + "step": 6730 + }, + { + "epoch": 0.6149840109639104, + "grad_norm": 2.953125, + "kl": 0.0, + "learning_rate": 3.2445682326158118e-06, + "logits/chosen": 401223680.0, + "logits/rejected": 12113448.0, + "logps/chosen": -286.74265834263394, + "logps/rejected": -708.468505859375, + "loss": 0.0235, + "rewards/chosen": 3.7984856196812222, + "rewards/margins": 12.604858262198313, + "rewards/rejected": -8.80637264251709, + "step": 6731 + }, + { + "epoch": 0.6150753768844222, + "grad_norm": 3.734375, + "kl": 0.0, + "learning_rate": 3.243222031274499e-06, + "logits/chosen": 917414848.0, + "logits/rejected": 783881728.0, + "logps/chosen": -507.3649597167969, + "logps/rejected": -428.329345703125, + "loss": 0.0141, + "rewards/chosen": 3.665602922439575, + "rewards/margins": 11.95401120185852, + "rewards/rejected": -8.288408279418945, + "step": 6732 + }, + { + "epoch": 0.6151667428049338, + "grad_norm": 38.0, + "kl": 0.0, + "learning_rate": 3.241875975202446e-06, + "logits/chosen": 587736490.6666666, + "logits/rejected": 274735936.0, + "logps/chosen": -422.9066569010417, + "logps/rejected": -493.5325012207031, + "loss": 0.1193, + "rewards/chosen": 2.5181872049967446, + "rewards/margins": 13.293078104654947, + "rewards/rejected": -10.774890899658203, + "step": 6733 + }, + { + "epoch": 0.6152581087254454, + "grad_norm": 3.03125, + "kl": 0.0, + "learning_rate": 3.2405300645109615e-06, + "logits/chosen": 590822809.6, + "logits/rejected": 572229546.6666666, + "logps/chosen": -295.1513671875, + "logps/rejected": -644.2720133463541, + "loss": 0.0225, + "rewards/chosen": 3.6314659118652344, + "rewards/margins": 13.597892761230469, + "rewards/rejected": -9.966426849365234, + "step": 6734 + }, + { + "epoch": 0.615349474645957, + "grad_norm": 3.875, + "kl": 0.0, + "learning_rate": 3.239184299311338e-06, + "logits/chosen": 732362880.0, + "logits/rejected": 712007168.0, + "logps/chosen": -468.662841796875, + "logps/rejected": -404.716552734375, + "loss": 0.0182, + "rewards/chosen": 3.5527467727661133, + "rewards/margins": 11.294252395629883, + "rewards/rejected": -7.7415056228637695, + "step": 6735 + }, + { + "epoch": 0.6154408405664687, + "grad_norm": 17.5, + "kl": 0.0, + "learning_rate": 3.2378386797148598e-06, + "logits/chosen": 353741977.6, + "logits/rejected": 597489749.3333334, + "logps/chosen": -144.267919921875, + "logps/rejected": -656.6691487630209, + "loss": 0.1301, + "rewards/chosen": 3.155913543701172, + "rewards/margins": 12.169181696573894, + "rewards/rejected": -9.01326815287272, + "step": 6736 + }, + { + "epoch": 0.6155322064869804, + "grad_norm": 2.671875, + "kl": 0.0, + "learning_rate": 3.236493205832795e-06, + "logits/chosen": 799021824.0, + "logits/rejected": 842345472.0, + "logps/chosen": -183.5605265299479, + "logps/rejected": -552.89208984375, + "loss": 0.016, + "rewards/chosen": 3.2969703674316406, + "rewards/margins": 10.592713165283204, + "rewards/rejected": -7.295742797851562, + "step": 6737 + }, + { + "epoch": 0.615623572407492, + "grad_norm": 1.640625, + "kl": 0.0, + "learning_rate": 3.235147877776403e-06, + "logits/chosen": 1005512128.0, + "logits/rejected": 715613760.0, + "logps/chosen": -217.36849975585938, + "logps/rejected": -590.63037109375, + "loss": 0.0113, + "rewards/chosen": 4.299965858459473, + "rewards/margins": 14.491899490356445, + "rewards/rejected": -10.191933631896973, + "step": 6738 + }, + { + "epoch": 0.6157149383280036, + "grad_norm": 6.0, + "kl": 0.0, + "learning_rate": 3.233802695656931e-06, + "logits/chosen": 531966506.6666667, + "logits/rejected": 382694848.0, + "logps/chosen": -198.3477579752604, + "logps/rejected": -404.9312744140625, + "loss": 0.0568, + "rewards/chosen": 3.083354632059733, + "rewards/margins": 13.668189684549967, + "rewards/rejected": -10.584835052490234, + "step": 6739 + }, + { + "epoch": 0.6158063042485153, + "grad_norm": 2.125, + "kl": 0.0, + "learning_rate": 3.23245765958561e-06, + "logits/chosen": 376094003.2, + "logits/rejected": 433014528.0, + "logps/chosen": -194.8909423828125, + "logps/rejected": -496.699462890625, + "loss": 0.0132, + "rewards/chosen": 4.270140838623047, + "rewards/margins": 14.091248067220054, + "rewards/rejected": -9.821107228597006, + "step": 6740 + }, + { + "epoch": 0.615897670169027, + "grad_norm": 2.484375, + "kl": 0.0, + "learning_rate": 3.2311127696736655e-06, + "logits/chosen": 358664490.6666667, + "logits/rejected": 245531648.0, + "logps/chosen": -289.810546875, + "logps/rejected": -417.443701171875, + "loss": 0.0116, + "rewards/chosen": 3.963127772013346, + "rewards/margins": 13.295444361368814, + "rewards/rejected": -9.332316589355468, + "step": 6741 + }, + { + "epoch": 0.6159890360895386, + "grad_norm": 3.53125, + "kl": 0.0, + "learning_rate": 3.229768026032304e-06, + "logits/chosen": 576716074.6666666, + "logits/rejected": 294503488.0, + "logps/chosen": -461.4136555989583, + "logps/rejected": -491.7024841308594, + "loss": 0.0222, + "rewards/chosen": 3.9301846822102866, + "rewards/margins": 15.652395566304525, + "rewards/rejected": -11.722210884094238, + "step": 6742 + }, + { + "epoch": 0.6160804020100502, + "grad_norm": 1.8203125, + "kl": 0.0, + "learning_rate": 3.2284234287727267e-06, + "logits/rejected": 542684544.0, + "logps/rejected": -628.6541748046875, + "loss": 0.0018, + "rewards/rejected": -11.24472427368164, + "step": 6743 + }, + { + "epoch": 0.6161717679305619, + "grad_norm": 1.6015625, + "kl": 0.0, + "learning_rate": 3.2270789780061174e-06, + "logits/chosen": 491642944.0, + "logits/rejected": 531155346.28571427, + "logps/chosen": -365.8507995605469, + "logps/rejected": -588.3957868303571, + "loss": 0.0057, + "rewards/chosen": 3.0826447010040283, + "rewards/margins": 12.588692631040301, + "rewards/rejected": -9.506047930036273, + "step": 6744 + }, + { + "epoch": 0.6162631338510736, + "grad_norm": 2.6875, + "kl": 0.0, + "learning_rate": 3.22573467384365e-06, + "logits/chosen": 967940608.0, + "logits/rejected": 555136640.0, + "logps/chosen": -226.74308268229166, + "logps/rejected": -399.7332458496094, + "loss": 0.0252, + "rewards/chosen": 3.9003613789876304, + "rewards/margins": 12.483664830525717, + "rewards/rejected": -8.583303451538086, + "step": 6745 + }, + { + "epoch": 0.6163544997715852, + "grad_norm": 11.5625, + "kl": 0.0, + "learning_rate": 3.2243905163964863e-06, + "logits/chosen": 747797162.6666666, + "logits/rejected": 652340838.4, + "logps/chosen": -199.10660807291666, + "logps/rejected": -540.549462890625, + "loss": 0.0819, + "rewards/chosen": 3.543354352315267, + "rewards/margins": 11.123822339375813, + "rewards/rejected": -7.580467987060547, + "step": 6746 + }, + { + "epoch": 0.6164458656920968, + "grad_norm": 1.3515625, + "kl": 0.0, + "learning_rate": 3.2230465057757754e-06, + "logits/chosen": 262801024.0, + "logits/rejected": 597331404.8, + "logps/chosen": -173.6655476888021, + "logps/rejected": -325.4925048828125, + "loss": 0.0041, + "rewards/chosen": 5.745293935139974, + "rewards/margins": 12.836637624104817, + "rewards/rejected": -7.091343688964844, + "step": 6747 + }, + { + "epoch": 0.6165372316126085, + "grad_norm": 7.0625, + "kl": 0.0, + "learning_rate": 3.2217026420926545e-06, + "logits/chosen": 425187008.0, + "logits/rejected": 582425024.0, + "logps/chosen": -307.27545166015625, + "logps/rejected": -310.8365478515625, + "loss": 0.1286, + "rewards/chosen": 3.669259786605835, + "rewards/margins": 8.903826475143433, + "rewards/rejected": -5.234566688537598, + "step": 6748 + }, + { + "epoch": 0.6166285975331202, + "grad_norm": 2.15625, + "kl": 0.0, + "learning_rate": 3.220358925458249e-06, + "logits/chosen": 675439616.0, + "logits/rejected": 670908467.2, + "logps/chosen": -280.2123616536458, + "logps/rejected": -608.93681640625, + "loss": 0.0105, + "rewards/chosen": 3.881594657897949, + "rewards/margins": 12.018365287780762, + "rewards/rejected": -8.136770629882813, + "step": 6749 + }, + { + "epoch": 0.6167199634536318, + "grad_norm": 0.9375, + "kl": 0.0, + "learning_rate": 3.2190153559836724e-06, + "logits/chosen": 537433792.0, + "logits/rejected": 822152277.3333334, + "logps/chosen": -382.37078857421875, + "logps/rejected": -483.9226888020833, + "loss": 0.0035, + "rewards/chosen": 4.956973552703857, + "rewards/margins": 14.864379405975342, + "rewards/rejected": -9.907405853271484, + "step": 6750 + }, + { + "epoch": 0.6168113293741434, + "grad_norm": 35.5, + "kl": 0.0, + "learning_rate": 3.217671933780023e-06, + "logits/chosen": 493787562.6666667, + "logits/rejected": 419840742.4, + "logps/chosen": -356.04443359375, + "logps/rejected": -479.636474609375, + "loss": 0.0348, + "rewards/chosen": 4.529558817545573, + "rewards/margins": 13.650278727213543, + "rewards/rejected": -9.120719909667969, + "step": 6751 + }, + { + "epoch": 0.6169026952946551, + "grad_norm": 2.28125, + "kl": 0.0, + "learning_rate": 3.2163286589583932e-06, + "logits/chosen": 574432320.0, + "logits/rejected": 1180278656.0, + "logps/chosen": -283.22406005859375, + "logps/rejected": -438.8727111816406, + "loss": 0.0258, + "rewards/chosen": 3.0624372959136963, + "rewards/margins": 14.490880250930786, + "rewards/rejected": -11.42844295501709, + "step": 6752 + }, + { + "epoch": 0.6169940612151668, + "grad_norm": 41.5, + "kl": 0.0, + "learning_rate": 3.2149855316298552e-06, + "logits/chosen": 465191168.0, + "logits/rejected": 482690880.0, + "logps/chosen": -327.80515543619794, + "logps/rejected": -362.5893249511719, + "loss": 0.0688, + "rewards/chosen": 2.8236865997314453, + "rewards/margins": 12.254837989807129, + "rewards/rejected": -9.431151390075684, + "step": 6753 + }, + { + "epoch": 0.6170854271356784, + "grad_norm": 1.953125, + "kl": 0.0, + "learning_rate": 3.2136425519054764e-06, + "logits/chosen": 401545728.0, + "logits/rejected": 281687808.0, + "logps/chosen": -261.51519775390625, + "logps/rejected": -434.45831298828125, + "loss": 0.0113, + "rewards/chosen": 4.067324161529541, + "rewards/margins": 12.74536943435669, + "rewards/rejected": -8.678045272827148, + "step": 6754 + }, + { + "epoch": 0.61717679305619, + "grad_norm": 3.03125, + "kl": 0.0, + "learning_rate": 3.212299719896307e-06, + "logits/chosen": 689839744.0, + "logits/rejected": 621557376.0, + "logps/chosen": -450.7878112792969, + "logps/rejected": -506.1287841796875, + "loss": 0.0148, + "rewards/chosen": 3.7134430408477783, + "rewards/margins": 13.764653444290161, + "rewards/rejected": -10.051210403442383, + "step": 6755 + }, + { + "epoch": 0.6172681589767017, + "grad_norm": 0.76953125, + "kl": 0.0, + "learning_rate": 3.210957035713389e-06, + "logits/chosen": 257257472.0, + "logits/rejected": 557005824.0, + "logps/chosen": -274.8395589192708, + "logps/rejected": -676.393115234375, + "loss": 0.0049, + "rewards/chosen": 4.475268046061198, + "rewards/margins": 15.284475199381511, + "rewards/rejected": -10.809207153320312, + "step": 6756 + }, + { + "epoch": 0.6173595248972134, + "grad_norm": 1.015625, + "kl": 0.0, + "learning_rate": 3.2096144994677468e-06, + "logits/chosen": 696435328.0, + "logits/rejected": 1242977194.6666667, + "logps/chosen": -349.4154357910156, + "logps/rejected": -496.22607421875, + "loss": 0.0055, + "rewards/chosen": 3.8838930130004883, + "rewards/margins": 12.652750333150228, + "rewards/rejected": -8.76885732014974, + "step": 6757 + }, + { + "epoch": 0.617450890817725, + "grad_norm": 2.390625, + "kl": 0.0, + "learning_rate": 3.2082721112703966e-06, + "logits/chosen": 575611648.0, + "logits/rejected": 203491936.0, + "logps/chosen": -448.38214111328125, + "logps/rejected": -417.2838134765625, + "loss": 0.0136, + "rewards/chosen": 3.7947328090667725, + "rewards/margins": 12.882783651351929, + "rewards/rejected": -9.088050842285156, + "step": 6758 + }, + { + "epoch": 0.6175422567382366, + "grad_norm": 5.5625, + "kl": 7.93022346496582, + "learning_rate": 3.206929871232343e-06, + "logits/chosen": 404019136.0, + "logps/chosen": -271.87152099609375, + "loss": 0.0285, + "rewards/chosen": 4.901526927947998, + "step": 6759 + }, + { + "epoch": 0.6176336226587483, + "grad_norm": 4.875, + "kl": 0.0, + "learning_rate": 3.205587779464576e-06, + "logits/chosen": 476227520.0, + "logits/rejected": 540212672.0, + "logps/chosen": -182.81494140625, + "logps/rejected": -631.1951904296875, + "loss": 0.0201, + "rewards/chosen": 3.6104540824890137, + "rewards/margins": 12.931498050689697, + "rewards/rejected": -9.321043968200684, + "step": 6760 + }, + { + "epoch": 0.61772498857926, + "grad_norm": 5.65625, + "kl": 0.0, + "learning_rate": 3.2042458360780736e-06, + "logits/chosen": 506663872.0, + "logits/rejected": 606684096.0, + "logps/chosen": -297.61181640625, + "logps/rejected": -513.850341796875, + "loss": 0.0404, + "rewards/chosen": 2.6052589416503906, + "rewards/margins": 12.126623153686523, + "rewards/rejected": -9.521364212036133, + "step": 6761 + }, + { + "epoch": 0.6178163544997716, + "grad_norm": 10.4375, + "kl": 0.0, + "learning_rate": 3.202904041183803e-06, + "logits/chosen": 628890240.0, + "logits/rejected": 573113497.6, + "logps/chosen": -512.9546305338541, + "logps/rejected": -483.24326171875, + "loss": 0.0127, + "rewards/chosen": 4.152097702026367, + "rewards/margins": 12.328709030151368, + "rewards/rejected": -8.176611328125, + "step": 6762 + }, + { + "epoch": 0.6179077204202832, + "grad_norm": 1.609375, + "kl": 0.0, + "learning_rate": 3.2015623948927165e-06, + "logits/chosen": 373072128.0, + "logits/rejected": 755030976.0, + "logps/chosen": -295.4923095703125, + "logps/rejected": -984.9624633789062, + "loss": 0.0085, + "rewards/chosen": 4.6953935623168945, + "rewards/margins": 19.690373420715332, + "rewards/rejected": -14.994979858398438, + "step": 6763 + }, + { + "epoch": 0.6179990863407949, + "grad_norm": 8.5, + "kl": 0.0, + "learning_rate": 3.2002208973157584e-06, + "logits/chosen": 481432371.2, + "logits/rejected": 637501440.0, + "logps/chosen": -274.23955078125, + "logps/rejected": -584.30810546875, + "loss": 0.0441, + "rewards/chosen": 4.203433609008789, + "rewards/margins": 12.804465866088867, + "rewards/rejected": -8.601032257080078, + "step": 6764 + }, + { + "epoch": 0.6180904522613065, + "grad_norm": 1.8515625, + "kl": 0.0, + "learning_rate": 3.1988795485638558e-06, + "logits/chosen": 640442828.8, + "logits/rejected": 417209941.3333333, + "logps/chosen": -276.95947265625, + "logps/rejected": -411.4336751302083, + "loss": 0.0128, + "rewards/chosen": 4.25439453125, + "rewards/margins": 14.451075236002604, + "rewards/rejected": -10.196680704752604, + "step": 6765 + }, + { + "epoch": 0.6181818181818182, + "grad_norm": 6.375, + "kl": 0.0, + "learning_rate": 3.197538348747927e-06, + "logits/chosen": 576186496.0, + "logits/rejected": 506799820.8, + "logps/chosen": -396.4703369140625, + "logps/rejected": -581.761083984375, + "loss": 0.0912, + "rewards/chosen": 4.869953155517578, + "rewards/margins": 13.461431884765625, + "rewards/rejected": -8.591478729248047, + "step": 6766 + }, + { + "epoch": 0.6182731841023298, + "grad_norm": 2.21875, + "kl": 0.0, + "learning_rate": 3.196197297978877e-06, + "logits/chosen": 746160640.0, + "logits/rejected": 365842944.0, + "logps/chosen": -545.0978393554688, + "logps/rejected": -435.7639973958333, + "loss": 0.0085, + "rewards/chosen": 3.6665468215942383, + "rewards/margins": 12.9132293065389, + "rewards/rejected": -9.246682484944662, + "step": 6767 + }, + { + "epoch": 0.6183645500228415, + "grad_norm": 3.046875, + "kl": 0.0, + "learning_rate": 3.194856396367598e-06, + "logits/chosen": 473969408.0, + "logits/rejected": 234490976.0, + "logps/chosen": -249.7679443359375, + "logps/rejected": -306.38775634765625, + "loss": 0.0282, + "rewards/chosen": 3.5078159968058267, + "rewards/margins": 11.256318728129068, + "rewards/rejected": -7.748502731323242, + "step": 6768 + }, + { + "epoch": 0.6184559159433531, + "grad_norm": 47.25, + "kl": 0.0, + "learning_rate": 3.193515644024969e-06, + "logits/chosen": 494243242.6666667, + "logits/rejected": 341578528.0, + "logps/chosen": -201.80277506510416, + "logps/rejected": -443.9060974121094, + "loss": 0.0721, + "rewards/chosen": 3.4092400868733725, + "rewards/margins": 11.31473191579183, + "rewards/rejected": -7.905491828918457, + "step": 6769 + }, + { + "epoch": 0.6185472818638648, + "grad_norm": 2.375, + "kl": 0.0, + "learning_rate": 3.1921750410618598e-06, + "logits/chosen": 704821964.8, + "logits/rejected": 1321945258.6666667, + "logps/chosen": -394.8595703125, + "logps/rejected": -453.7078450520833, + "loss": 0.0178, + "rewards/chosen": 3.6529197692871094, + "rewards/margins": 14.235702514648438, + "rewards/rejected": -10.582782745361328, + "step": 6770 + }, + { + "epoch": 0.6186386477843764, + "grad_norm": 3.796875, + "kl": 0.0, + "learning_rate": 3.1908345875891243e-06, + "logits/chosen": 531449514.6666667, + "logits/rejected": 314879040.0, + "logps/chosen": -309.41684977213544, + "logps/rejected": -359.82391357421875, + "loss": 0.0248, + "rewards/chosen": 3.7165307998657227, + "rewards/margins": 13.919716835021973, + "rewards/rejected": -10.20318603515625, + "step": 6771 + }, + { + "epoch": 0.6187300137048881, + "grad_norm": 3.4375, + "kl": 0.0, + "learning_rate": 3.1894942837176057e-06, + "logits/chosen": 552247347.2, + "logits/rejected": 713598890.6666666, + "logps/chosen": -396.5843994140625, + "logps/rejected": -581.9553629557291, + "loss": 0.0243, + "rewards/chosen": 3.392496109008789, + "rewards/margins": 13.052180608113607, + "rewards/rejected": -9.659684499104818, + "step": 6772 + }, + { + "epoch": 0.6188213796253997, + "grad_norm": 2.015625, + "kl": 0.0, + "learning_rate": 3.1881541295581364e-06, + "logits/chosen": 312958336.0, + "logits/rejected": 314424170.6666667, + "logps/chosen": -283.2099853515625, + "logps/rejected": -512.7572021484375, + "loss": 0.0129, + "rewards/chosen": 4.3027690887451175, + "rewards/margins": 15.59373639424642, + "rewards/rejected": -11.290967305501303, + "step": 6773 + }, + { + "epoch": 0.6189127455459114, + "grad_norm": 2.5, + "kl": 0.0, + "learning_rate": 3.186814125221532e-06, + "logits/chosen": 435542784.0, + "logits/rejected": 483503200.0, + "logps/chosen": -361.6985677083333, + "logps/rejected": -464.90966796875, + "loss": 0.016, + "rewards/chosen": 4.156037330627441, + "rewards/margins": 12.855720520019531, + "rewards/rejected": -8.69968318939209, + "step": 6774 + }, + { + "epoch": 0.619004111466423, + "grad_norm": 3.625, + "kl": 0.0, + "learning_rate": 3.1854742708186013e-06, + "logits/chosen": 432919808.0, + "logits/rejected": 432888704.0, + "logps/chosen": -303.5519104003906, + "logps/rejected": -390.54736328125, + "loss": 0.0127, + "rewards/chosen": 4.407562255859375, + "rewards/margins": 12.649659156799316, + "rewards/rejected": -8.242096900939941, + "step": 6775 + }, + { + "epoch": 0.6190954773869347, + "grad_norm": 7.125, + "kl": 0.0, + "learning_rate": 3.1841345664601354e-06, + "logits/chosen": 1148792627.2, + "logits/rejected": 535916373.3333333, + "logps/chosen": -356.6544921875, + "logps/rejected": -593.4136555989584, + "loss": 0.0483, + "rewards/chosen": 2.6416259765625, + "rewards/margins": 13.10357182820638, + "rewards/rejected": -10.46194585164388, + "step": 6776 + }, + { + "epoch": 0.6191868433074463, + "grad_norm": 2.328125, + "kl": 0.0, + "learning_rate": 3.1827950122569173e-06, + "logits/chosen": 563725312.0, + "logits/rejected": 437788160.0, + "logps/chosen": -297.21669921875, + "logps/rejected": -613.6682942708334, + "loss": 0.1321, + "rewards/chosen": 3.0809394836425783, + "rewards/margins": 15.599520874023437, + "rewards/rejected": -12.51858139038086, + "step": 6777 + }, + { + "epoch": 0.619278209227958, + "grad_norm": 3.671875, + "kl": 0.0, + "learning_rate": 3.181455608319714e-06, + "logits/chosen": 402362752.0, + "logits/rejected": 511635168.0, + "logps/chosen": -262.1754150390625, + "logps/rejected": -611.5182495117188, + "loss": 0.0164, + "rewards/chosen": 4.150707721710205, + "rewards/margins": 12.927735805511475, + "rewards/rejected": -8.77702808380127, + "step": 6778 + }, + { + "epoch": 0.6193695751484696, + "grad_norm": 3.375, + "kl": 0.0, + "learning_rate": 3.180116354759284e-06, + "logits/chosen": 618243993.6, + "logits/rejected": 500159829.3333333, + "logps/chosen": -324.42568359375, + "logps/rejected": -604.079833984375, + "loss": 0.0225, + "rewards/chosen": 3.701377105712891, + "rewards/margins": 14.55796101888021, + "rewards/rejected": -10.856583913167318, + "step": 6779 + }, + { + "epoch": 0.6194609410689813, + "grad_norm": 11.75, + "kl": 0.0, + "learning_rate": 3.178777251686368e-06, + "logits/chosen": 333090389.3333333, + "logits/rejected": 384023500.8, + "logps/chosen": -176.8595987955729, + "logps/rejected": -525.67119140625, + "loss": 0.0395, + "rewards/chosen": 2.4236251513163247, + "rewards/margins": 13.515016142527262, + "rewards/rejected": -11.091390991210938, + "step": 6780 + }, + { + "epoch": 0.6195523069894929, + "grad_norm": 3.0, + "kl": 0.0, + "learning_rate": 3.177438299211701e-06, + "logits/chosen": 877215451.4285715, + "logits/rejected": 610924160.0, + "logps/chosen": -379.41517857142856, + "logps/rejected": -462.9149475097656, + "loss": 0.0222, + "rewards/chosen": 4.042873382568359, + "rewards/margins": 13.605034828186035, + "rewards/rejected": -9.562161445617676, + "step": 6781 + }, + { + "epoch": 0.6196436729100046, + "grad_norm": 1.2890625, + "kl": 0.0, + "learning_rate": 3.1760994974460003e-06, + "logits/chosen": 573180032.0, + "logits/rejected": 377418837.3333333, + "logps/chosen": -335.50946044921875, + "logps/rejected": -447.224853515625, + "loss": 0.005, + "rewards/chosen": 4.44997501373291, + "rewards/margins": 13.197164217631022, + "rewards/rejected": -8.747189203898111, + "step": 6782 + }, + { + "epoch": 0.6197350388305162, + "grad_norm": 4.40625, + "kl": 0.0, + "learning_rate": 3.1747608464999723e-06, + "logits/chosen": 410711808.0, + "logits/rejected": 627228544.0, + "logps/chosen": -374.5255533854167, + "logps/rejected": -693.4225463867188, + "loss": 0.0273, + "rewards/chosen": 3.8435452779134116, + "rewards/margins": 13.69773801167806, + "rewards/rejected": -9.854192733764648, + "step": 6783 + }, + { + "epoch": 0.6198264047510279, + "grad_norm": 30.625, + "kl": 0.0, + "learning_rate": 3.1734223464843105e-06, + "logits/chosen": 403392736.0, + "logits/rejected": 430424768.0, + "logps/chosen": -402.5934143066406, + "logps/rejected": -542.5201416015625, + "loss": 0.0443, + "rewards/chosen": 3.2962355613708496, + "rewards/margins": 13.326472759246826, + "rewards/rejected": -10.030237197875977, + "step": 6784 + }, + { + "epoch": 0.6199177706715395, + "grad_norm": 1.7578125, + "kl": 0.0, + "learning_rate": 3.1720839975096974e-06, + "logits/chosen": 535979296.0, + "logits/rejected": 482857344.0, + "logps/chosen": -359.60015869140625, + "logps/rejected": -242.0348663330078, + "loss": 0.0089, + "rewards/chosen": 4.555699348449707, + "rewards/margins": 12.441735744476318, + "rewards/rejected": -7.886036396026611, + "step": 6785 + }, + { + "epoch": 0.6200091365920511, + "grad_norm": 35.0, + "kl": 0.0, + "learning_rate": 3.1707457996868008e-06, + "logits/chosen": 426729642.6666667, + "logits/rejected": 453485536.0, + "logps/chosen": -222.7432861328125, + "logps/rejected": -666.154296875, + "loss": 0.0988, + "rewards/chosen": 3.376763661702474, + "rewards/margins": 15.778912862141928, + "rewards/rejected": -12.402149200439453, + "step": 6786 + }, + { + "epoch": 0.6201005025125628, + "grad_norm": 0.39453125, + "kl": 0.0, + "learning_rate": 3.1694077531262792e-06, + "logits/chosen": 227259696.0, + "logits/rejected": 419244330.6666667, + "logps/chosen": -203.07534790039062, + "logps/rejected": -326.68359375, + "loss": 0.0022, + "rewards/chosen": 5.456360816955566, + "rewards/margins": 13.817004839579264, + "rewards/rejected": -8.360644022623697, + "step": 6787 + }, + { + "epoch": 0.6201918684330745, + "grad_norm": 2.78125, + "kl": 0.0, + "learning_rate": 3.168069857938774e-06, + "logits/chosen": 322418944.0, + "logits/rejected": 445934250.6666667, + "logps/chosen": -269.2089599609375, + "logps/rejected": -421.1644694010417, + "loss": 0.0269, + "rewards/chosen": 3.2231204986572264, + "rewards/margins": 13.38180046081543, + "rewards/rejected": -10.158679962158203, + "step": 6788 + }, + { + "epoch": 0.6202832343535861, + "grad_norm": 1.71875, + "kl": 0.0, + "learning_rate": 3.1667321142349194e-06, + "logits/chosen": 700312405.3333334, + "logits/rejected": 597931724.8, + "logps/chosen": -486.4245198567708, + "logps/rejected": -434.514453125, + "loss": 0.008, + "rewards/chosen": 4.249342918395996, + "rewards/margins": 13.381242942810058, + "rewards/rejected": -9.131900024414062, + "step": 6789 + }, + { + "epoch": 0.6203746002740977, + "grad_norm": 0.97265625, + "kl": 0.0, + "learning_rate": 3.1653945221253342e-06, + "logits/chosen": 604847680.0, + "logits/rejected": 480138336.0, + "logps/chosen": -203.39646911621094, + "logps/rejected": -488.6553955078125, + "loss": 0.0066, + "rewards/chosen": 4.532106399536133, + "rewards/margins": 14.725373268127441, + "rewards/rejected": -10.193266868591309, + "step": 6790 + }, + { + "epoch": 0.6204659661946094, + "grad_norm": 3.890625, + "kl": 0.0, + "learning_rate": 3.164057081720622e-06, + "logits/chosen": 607407018.6666666, + "logits/rejected": 485847232.0, + "logps/chosen": -280.627685546875, + "logps/rejected": -712.2716064453125, + "loss": 0.0347, + "rewards/chosen": 3.062859853108724, + "rewards/margins": 14.401556332906088, + "rewards/rejected": -11.338696479797363, + "step": 6791 + }, + { + "epoch": 0.6205573321151211, + "grad_norm": 3.046875, + "kl": 0.0, + "learning_rate": 3.1627197931313807e-06, + "logits/chosen": 340544032.0, + "logits/rejected": 429482656.0, + "logps/chosen": -273.6439208984375, + "logps/rejected": -530.831787109375, + "loss": 0.0264, + "rewards/chosen": 3.731334686279297, + "rewards/margins": 13.159692764282227, + "rewards/rejected": -9.42835807800293, + "step": 6792 + }, + { + "epoch": 0.6206486980356327, + "grad_norm": 4.0625, + "kl": 0.0, + "learning_rate": 3.1613826564681883e-06, + "logits/chosen": 456706656.0, + "logits/rejected": 384200064.0, + "logps/chosen": -286.5186767578125, + "logps/rejected": -423.8654479980469, + "loss": 0.0171, + "rewards/chosen": 4.231471061706543, + "rewards/margins": 14.004213333129883, + "rewards/rejected": -9.77274227142334, + "step": 6793 + }, + { + "epoch": 0.6207400639561443, + "grad_norm": 12.9375, + "kl": 0.0, + "learning_rate": 3.160045671841616e-06, + "logits/chosen": 510949216.0, + "logits/rejected": 686318272.0, + "logps/chosen": -263.6556091308594, + "logps/rejected": -378.2745361328125, + "loss": 0.0522, + "rewards/chosen": 2.5011682510375977, + "rewards/margins": 10.390294551849365, + "rewards/rejected": -7.889126300811768, + "step": 6794 + }, + { + "epoch": 0.620831429876656, + "grad_norm": 3.296875, + "kl": 0.0, + "learning_rate": 3.158708839362218e-06, + "logits/chosen": 239896880.0, + "logits/rejected": 401512352.0, + "logps/chosen": -189.6583251953125, + "logps/rejected": -419.4598083496094, + "loss": 0.142, + "rewards/chosen": 1.7947807312011719, + "rewards/margins": 10.9209566116333, + "rewards/rejected": -9.126175880432129, + "step": 6795 + }, + { + "epoch": 0.6209227957971677, + "grad_norm": 1.8828125, + "kl": 0.0, + "learning_rate": 3.1573721591405405e-06, + "logits/chosen": 723759360.0, + "logits/rejected": 965633344.0, + "logps/chosen": -485.0001220703125, + "logps/rejected": -297.0074462890625, + "loss": 0.0081, + "rewards/chosen": 4.523011207580566, + "rewards/margins": 11.972792625427246, + "rewards/rejected": -7.44978141784668, + "step": 6796 + }, + { + "epoch": 0.6210141617176793, + "grad_norm": 2.3125, + "kl": 0.0, + "learning_rate": 3.1560356312871117e-06, + "logits/chosen": 477407232.0, + "logits/rejected": 364165024.0, + "logps/chosen": -356.9322509765625, + "logps/rejected": -484.3886413574219, + "loss": 0.0115, + "rewards/chosen": 4.515539169311523, + "rewards/margins": 14.142803192138672, + "rewards/rejected": -9.627264022827148, + "step": 6797 + }, + { + "epoch": 0.6211055276381909, + "grad_norm": 2.484375, + "kl": 0.0, + "learning_rate": 3.1546992559124523e-06, + "logits/chosen": 358848170.6666667, + "logits/rejected": 439875072.0, + "logps/chosen": -287.5905354817708, + "logps/rejected": -394.3928466796875, + "loss": 0.0117, + "rewards/chosen": 3.7623558044433594, + "rewards/margins": 12.197020721435546, + "rewards/rejected": -8.434664916992187, + "step": 6798 + }, + { + "epoch": 0.6211968935587026, + "grad_norm": 1.6875, + "kl": 0.0, + "learning_rate": 3.153363033127067e-06, + "logits/chosen": 926856576.0, + "logits/rejected": 535941952.0, + "logps/chosen": -380.15057373046875, + "logps/rejected": -475.16156005859375, + "loss": 0.011, + "rewards/chosen": 4.160482883453369, + "rewards/margins": 14.625096797943115, + "rewards/rejected": -10.464613914489746, + "step": 6799 + }, + { + "epoch": 0.6212882594792143, + "grad_norm": 2.921875, + "kl": 0.0, + "learning_rate": 3.15202696304145e-06, + "logits/chosen": 321150208.0, + "logits/rejected": 241327232.0, + "logps/chosen": -291.108154296875, + "logps/rejected": -343.250732421875, + "loss": 0.0163, + "rewards/chosen": 4.207958221435547, + "rewards/margins": 13.955615043640137, + "rewards/rejected": -9.74765682220459, + "step": 6800 + }, + { + "epoch": 0.6213796253997259, + "grad_norm": 4.21875, + "kl": 0.0, + "learning_rate": 3.150691045766081e-06, + "logits/chosen": 590837555.2, + "logits/rejected": 578984618.6666666, + "logps/chosen": -302.7234375, + "logps/rejected": -431.3715413411458, + "loss": 0.0393, + "rewards/chosen": 3.233807373046875, + "rewards/margins": 13.623861440022786, + "rewards/rejected": -10.390054066975912, + "step": 6801 + }, + { + "epoch": 0.6214709913202375, + "grad_norm": 5.65625, + "kl": 0.0, + "learning_rate": 3.1493552814114282e-06, + "logits/chosen": 551437056.0, + "logits/rejected": 354960320.0, + "logps/chosen": -284.599267578125, + "logps/rejected": -446.6558430989583, + "loss": 0.0499, + "rewards/chosen": 2.695738983154297, + "rewards/margins": 13.8392453511556, + "rewards/rejected": -11.143506368001303, + "step": 6802 + }, + { + "epoch": 0.6215623572407492, + "grad_norm": 2.9375, + "kl": 0.0, + "learning_rate": 3.148019670087947e-06, + "logits/chosen": 513369216.0, + "logits/rejected": 450791808.0, + "logps/chosen": -263.6336364746094, + "logps/rejected": -462.35205078125, + "loss": 0.0266, + "rewards/chosen": 2.9770143032073975, + "rewards/margins": 11.229892492294312, + "rewards/rejected": -8.252878189086914, + "step": 6803 + }, + { + "epoch": 0.6216537231612609, + "grad_norm": 0.83984375, + "kl": 0.0, + "learning_rate": 3.14668421190608e-06, + "logits/chosen": 437302169.6, + "logits/rejected": 383208448.0, + "logps/chosen": -361.354296875, + "logps/rejected": -423.1500244140625, + "loss": 0.0054, + "rewards/chosen": 4.8889915466308596, + "rewards/margins": 12.501363627115886, + "rewards/rejected": -7.612372080485026, + "step": 6804 + }, + { + "epoch": 0.6217450890817725, + "grad_norm": 0.86328125, + "kl": 0.0, + "learning_rate": 3.145348906976258e-06, + "logits/chosen": 852016448.0, + "logits/rejected": 500426313.14285713, + "logps/chosen": -513.3751220703125, + "logps/rejected": -408.7177734375, + "loss": 0.0028, + "rewards/chosen": 3.8468873500823975, + "rewards/margins": 13.654711621148246, + "rewards/rejected": -9.807824271065849, + "step": 6805 + }, + { + "epoch": 0.6218364550022841, + "grad_norm": 2.0625, + "kl": 0.0, + "learning_rate": 3.1440137554088957e-06, + "logits/chosen": 678049484.8, + "logits/rejected": 401926101.3333333, + "logps/chosen": -448.30048828125, + "logps/rejected": -549.3505859375, + "loss": 0.0132, + "rewards/chosen": 4.039834594726562, + "rewards/margins": 15.372221883138021, + "rewards/rejected": -11.332387288411459, + "step": 6806 + }, + { + "epoch": 0.6219278209227957, + "grad_norm": 2.1875, + "kl": 0.0, + "learning_rate": 3.142678757314402e-06, + "logits/chosen": 309210048.0, + "logits/rejected": 359256480.0, + "logps/chosen": -194.5031534830729, + "logps/rejected": -465.799560546875, + "loss": 0.0234, + "rewards/chosen": 3.7104438145955405, + "rewards/margins": 13.850239117940268, + "rewards/rejected": -10.139795303344727, + "step": 6807 + }, + { + "epoch": 0.6220191868433075, + "grad_norm": 0.72265625, + "kl": 0.0, + "learning_rate": 3.1413439128031655e-06, + "logits/chosen": 706797952.0, + "logits/rejected": 461369753.6, + "logps/chosen": -275.34055582682294, + "logps/rejected": -449.5580078125, + "loss": 0.0033, + "rewards/chosen": 5.2683900197347, + "rewards/margins": 14.660991795857747, + "rewards/rejected": -9.392601776123048, + "step": 6808 + }, + { + "epoch": 0.6221105527638191, + "grad_norm": 0.408203125, + "kl": 0.0, + "learning_rate": 3.140009221985568e-06, + "logits/chosen": 440860586.6666667, + "logits/rejected": 1042437632.0, + "logps/chosen": -294.8935953776042, + "logps/rejected": -401.708935546875, + "loss": 0.002, + "rewards/chosen": 5.389997482299805, + "rewards/margins": 14.516823959350585, + "rewards/rejected": -9.12682647705078, + "step": 6809 + }, + { + "epoch": 0.6222019186843307, + "grad_norm": 1.6015625, + "kl": 0.0, + "learning_rate": 3.138674684971974e-06, + "logits/chosen": 190035136.0, + "logits/rejected": 256629600.0, + "logps/chosen": -244.2898966471354, + "logps/rejected": -262.8350830078125, + "loss": 0.0099, + "rewards/chosen": 4.764990488688151, + "rewards/margins": 13.566317240397137, + "rewards/rejected": -8.801326751708984, + "step": 6810 + }, + { + "epoch": 0.6222932846048423, + "grad_norm": 3.0, + "kl": 0.0, + "learning_rate": 3.137340301872738e-06, + "logits/chosen": 395408896.0, + "logits/rejected": 870516224.0, + "logps/chosen": -274.9210611979167, + "logps/rejected": -679.08251953125, + "loss": 0.0232, + "rewards/chosen": 3.97908083597819, + "rewards/margins": 14.370930353800455, + "rewards/rejected": -10.391849517822266, + "step": 6811 + }, + { + "epoch": 0.6223846505253541, + "grad_norm": 55.75, + "kl": 0.0, + "learning_rate": 3.1360060727982e-06, + "logits/chosen": 639708992.0, + "logits/rejected": 666003584.0, + "logps/chosen": -394.4507141113281, + "logps/rejected": -752.4940185546875, + "loss": 0.1058, + "rewards/chosen": 2.0305886268615723, + "rewards/margins": 11.142357349395752, + "rewards/rejected": -9.11176872253418, + "step": 6812 + }, + { + "epoch": 0.6224760164458657, + "grad_norm": 0.40625, + "kl": 0.0, + "learning_rate": 3.1346719978586904e-06, + "logits/chosen": 194285504.0, + "logits/rejected": 394751061.3333333, + "logps/chosen": -217.73123168945312, + "logps/rejected": -549.8330891927084, + "loss": 0.0015, + "rewards/chosen": 5.630695343017578, + "rewards/margins": 15.592385609944662, + "rewards/rejected": -9.961690266927084, + "step": 6813 + }, + { + "epoch": 0.6225673823663773, + "grad_norm": 24.0, + "kl": 0.0, + "learning_rate": 3.1333380771645227e-06, + "logits/chosen": 560827289.6, + "logits/rejected": 754204842.6666666, + "logps/chosen": -366.33515625, + "logps/rejected": -1022.8433430989584, + "loss": 0.0642, + "rewards/chosen": 3.731650543212891, + "rewards/margins": 13.567581049601237, + "rewards/rejected": -9.835930506388346, + "step": 6814 + }, + { + "epoch": 0.6226587482868889, + "grad_norm": 1.640625, + "kl": 0.0, + "learning_rate": 3.1320043108260024e-06, + "logits/chosen": 508373504.0, + "logits/rejected": 640048810.6666666, + "logps/chosen": -396.34580078125, + "logps/rejected": -643.8339029947916, + "loss": 0.0097, + "rewards/chosen": 4.548727035522461, + "rewards/margins": 14.752372105916342, + "rewards/rejected": -10.20364507039388, + "step": 6815 + }, + { + "epoch": 0.6227501142074007, + "grad_norm": 2.53125, + "kl": 0.0, + "learning_rate": 3.1306706989534175e-06, + "logits/chosen": 779560140.8, + "logits/rejected": 384281216.0, + "logps/chosen": -420.68935546875, + "logps/rejected": -497.257080078125, + "loss": 0.0164, + "rewards/chosen": 3.8941604614257814, + "rewards/margins": 13.461236826578777, + "rewards/rejected": -9.567076365152994, + "step": 6816 + }, + { + "epoch": 0.6228414801279123, + "grad_norm": 2.328125, + "kl": 0.0, + "learning_rate": 3.1293372416570466e-06, + "logits/chosen": 555502796.8, + "logits/rejected": 811446869.3333334, + "logps/chosen": -295.38408203125, + "logps/rejected": -690.1484375, + "loss": 0.0206, + "rewards/chosen": 3.681346893310547, + "rewards/margins": 13.42047373453776, + "rewards/rejected": -9.739126841227213, + "step": 6817 + }, + { + "epoch": 0.6229328460484239, + "grad_norm": 1.5, + "kl": 0.0, + "learning_rate": 3.128003939047153e-06, + "logits/chosen": 371659776.0, + "logits/rejected": 605331072.0, + "logps/chosen": -317.8081970214844, + "logps/rejected": -417.3050231933594, + "loss": 0.0092, + "rewards/chosen": 4.620214939117432, + "rewards/margins": 14.519154071807861, + "rewards/rejected": -9.89893913269043, + "step": 6818 + }, + { + "epoch": 0.6230242119689355, + "grad_norm": 6.75, + "kl": 0.0, + "learning_rate": 3.12667079123399e-06, + "logits/chosen": 813819648.0, + "logits/rejected": 487769120.0, + "logps/chosen": -368.4867350260417, + "logps/rejected": -359.0, + "loss": 0.033, + "rewards/chosen": 3.617201805114746, + "rewards/margins": 13.600286483764648, + "rewards/rejected": -9.983084678649902, + "step": 6819 + }, + { + "epoch": 0.6231155778894473, + "grad_norm": 3.046875, + "kl": 0.0, + "learning_rate": 3.125337798327794e-06, + "logits/chosen": 924110720.0, + "logits/rejected": 418889792.0, + "logps/chosen": -389.9007568359375, + "logps/rejected": -404.37542724609375, + "loss": 0.0177, + "rewards/chosen": 3.3932442665100098, + "rewards/margins": 11.65834093093872, + "rewards/rejected": -8.265096664428711, + "step": 6820 + }, + { + "epoch": 0.6232069438099589, + "grad_norm": 0.482421875, + "kl": 0.0, + "learning_rate": 3.1240049604387955e-06, + "logits/chosen": 563089024.0, + "logits/rejected": 394421222.4, + "logps/chosen": -239.6473185221354, + "logps/rejected": -457.511328125, + "loss": 0.003, + "rewards/chosen": 5.166553815205892, + "rewards/margins": 14.412465985616048, + "rewards/rejected": -9.245912170410156, + "step": 6821 + }, + { + "epoch": 0.6232983097304705, + "grad_norm": 1.296875, + "kl": 0.0, + "learning_rate": 3.1226722776772044e-06, + "logits/chosen": 706105600.0, + "logits/rejected": 738868032.0, + "logps/chosen": -248.38644409179688, + "logps/rejected": -527.2069091796875, + "loss": 0.0089, + "rewards/chosen": 4.249964714050293, + "rewards/margins": 13.414101600646973, + "rewards/rejected": -9.16413688659668, + "step": 6822 + }, + { + "epoch": 0.6233896756509821, + "grad_norm": 1.5, + "kl": 0.0, + "learning_rate": 3.1213397501532216e-06, + "logits/chosen": 592584490.6666666, + "logits/rejected": 371896166.4, + "logps/chosen": -321.9009602864583, + "logps/rejected": -453.2302734375, + "loss": 0.0163, + "rewards/chosen": 3.418600082397461, + "rewards/margins": 12.173463058471679, + "rewards/rejected": -8.754862976074218, + "step": 6823 + }, + { + "epoch": 0.6234810415714939, + "grad_norm": 27.25, + "kl": 0.0, + "learning_rate": 3.1200073779770357e-06, + "logits/chosen": 455810048.0, + "logits/rejected": 861889856.0, + "logps/chosen": -242.13034057617188, + "logps/rejected": -397.0141906738281, + "loss": 0.0177, + "rewards/chosen": 4.121151924133301, + "rewards/margins": 13.805768013000488, + "rewards/rejected": -9.684616088867188, + "step": 6824 + }, + { + "epoch": 0.6235724074920055, + "grad_norm": 3.875, + "kl": 0.0, + "learning_rate": 3.1186751612588205e-06, + "logits/chosen": 998483285.3333334, + "logits/rejected": 573042880.0, + "logps/chosen": -369.5418701171875, + "logps/rejected": -352.9573669433594, + "loss": 0.0268, + "rewards/chosen": 3.7568251291910806, + "rewards/margins": 10.767391840616861, + "rewards/rejected": -7.010566711425781, + "step": 6825 + }, + { + "epoch": 0.6236637734125171, + "grad_norm": 1.1953125, + "kl": 0.0, + "learning_rate": 3.1173431001087394e-06, + "logits/chosen": 239143792.0, + "logits/rejected": 510996699.4285714, + "logps/chosen": -48.11468505859375, + "logps/rejected": -509.11617606026783, + "loss": 0.0114, + "rewards/chosen": 2.360011339187622, + "rewards/margins": 11.357865299497332, + "rewards/rejected": -8.99785396030971, + "step": 6826 + }, + { + "epoch": 0.6237551393330287, + "grad_norm": 3.984375, + "kl": 0.0, + "learning_rate": 3.1160111946369396e-06, + "logits/chosen": 398005248.0, + "logits/rejected": 578759168.0, + "logps/chosen": -214.19317626953125, + "logps/rejected": -585.7850341796875, + "loss": 0.031, + "rewards/chosen": 2.925014019012451, + "rewards/margins": 14.567972660064697, + "rewards/rejected": -11.642958641052246, + "step": 6827 + }, + { + "epoch": 0.6238465052535405, + "grad_norm": 2.96875, + "kl": 0.0, + "learning_rate": 3.11467944495356e-06, + "logits/chosen": 495903027.2, + "logits/rejected": 607396522.6666666, + "logps/chosen": -305.289990234375, + "logps/rejected": -400.716552734375, + "loss": 0.02, + "rewards/chosen": 3.6215797424316407, + "rewards/margins": 12.081746037801107, + "rewards/rejected": -8.460166295369467, + "step": 6828 + }, + { + "epoch": 0.6239378711740521, + "grad_norm": 3.0625, + "kl": 0.0, + "learning_rate": 3.1133478511687217e-06, + "logits/chosen": 549352234.6666666, + "logits/rejected": 610601152.0, + "logps/chosen": -370.5207926432292, + "logps/rejected": -431.17694091796875, + "loss": 0.0144, + "rewards/chosen": 4.525762557983398, + "rewards/margins": 11.164405822753906, + "rewards/rejected": -6.638643264770508, + "step": 6829 + }, + { + "epoch": 0.6240292370945637, + "grad_norm": 47.0, + "kl": 0.0, + "learning_rate": 3.1120164133925356e-06, + "logits/chosen": 919027712.0, + "logits/rejected": 600191701.3333334, + "logps/chosen": -373.6435791015625, + "logps/rejected": -453.93310546875, + "loss": 0.1132, + "rewards/chosen": 2.1260509490966797, + "rewards/margins": 12.341981887817383, + "rewards/rejected": -10.215930938720703, + "step": 6830 + }, + { + "epoch": 0.6241206030150753, + "grad_norm": 3.078125, + "kl": 0.0, + "learning_rate": 3.1106851317350993e-06, + "logits/chosen": 684442240.0, + "logits/rejected": 545865113.6, + "logps/chosen": -451.630126953125, + "logps/rejected": -543.31796875, + "loss": 0.017, + "rewards/chosen": 3.2114807764689126, + "rewards/margins": 12.341058413187662, + "rewards/rejected": -9.12957763671875, + "step": 6831 + }, + { + "epoch": 0.6242119689355871, + "grad_norm": 0.61328125, + "kl": 0.0, + "learning_rate": 3.1093540063064994e-06, + "logits/chosen": 876328000.0, + "logits/rejected": 805193898.6666666, + "logps/chosen": -311.40234375, + "logps/rejected": -385.6592610677083, + "loss": 0.0025, + "rewards/chosen": 6.0158586502075195, + "rewards/margins": 13.639871915181477, + "rewards/rejected": -7.624013264973958, + "step": 6832 + }, + { + "epoch": 0.6243033348560987, + "grad_norm": 3.65625, + "kl": 0.0, + "learning_rate": 3.108023037216805e-06, + "logits/chosen": 369585024.0, + "logits/rejected": 342283968.0, + "logps/chosen": -329.3603820800781, + "logps/rejected": -306.30859375, + "loss": 0.0205, + "rewards/chosen": 3.3298287391662598, + "rewards/margins": 10.662067890167236, + "rewards/rejected": -7.332239151000977, + "step": 6833 + }, + { + "epoch": 0.6243947007766103, + "grad_norm": 0.447265625, + "kl": 0.0, + "learning_rate": 3.106692224576075e-06, + "logits/chosen": 274484512.0, + "logits/rejected": 410840746.6666667, + "logps/chosen": -270.3708190917969, + "logps/rejected": -522.8379720052084, + "loss": 0.0023, + "rewards/chosen": 4.80986213684082, + "rewards/margins": 14.506135940551758, + "rewards/rejected": -9.696273803710938, + "step": 6834 + }, + { + "epoch": 0.6244860666971219, + "grad_norm": 4.09375, + "kl": 0.0, + "learning_rate": 3.105361568494357e-06, + "logits/chosen": 364757401.6, + "logits/rejected": 499715584.0, + "logps/chosen": -281.49033203125, + "logps/rejected": -532.6590983072916, + "loss": 0.0281, + "rewards/chosen": 3.669446563720703, + "rewards/margins": 12.222602462768554, + "rewards/rejected": -8.553155899047852, + "step": 6835 + }, + { + "epoch": 0.6245774326176337, + "grad_norm": 6.03125, + "kl": 0.0, + "learning_rate": 3.1040310690816843e-06, + "logits/chosen": 624079667.2, + "logits/rejected": 647523498.6666666, + "logps/chosen": -282.221923828125, + "logps/rejected": -646.1532389322916, + "loss": 0.0427, + "rewards/chosen": 3.8210323333740233, + "rewards/margins": 15.385397720336915, + "rewards/rejected": -11.56436538696289, + "step": 6836 + }, + { + "epoch": 0.6246687985381453, + "grad_norm": 2.046875, + "kl": 0.0, + "learning_rate": 3.102700726448074e-06, + "logits/chosen": 543798997.3333334, + "logits/rejected": 694612377.6, + "logps/chosen": -179.80255126953125, + "logps/rejected": -440.662060546875, + "loss": 0.0115, + "rewards/chosen": 3.746474266052246, + "rewards/margins": 13.210612297058105, + "rewards/rejected": -9.46413803100586, + "step": 6837 + }, + { + "epoch": 0.6247601644586569, + "grad_norm": 8.5, + "kl": 0.0, + "learning_rate": 3.1013705407035353e-06, + "logits/chosen": 564455296.0, + "logits/rejected": 363568544.0, + "logps/chosen": -354.713623046875, + "logps/rejected": -610.7809448242188, + "loss": 0.0236, + "rewards/chosen": 4.092768669128418, + "rewards/margins": 13.585177421569824, + "rewards/rejected": -9.492408752441406, + "step": 6838 + }, + { + "epoch": 0.6248515303791685, + "grad_norm": 1.3515625, + "kl": 0.0, + "learning_rate": 3.1000405119580612e-06, + "logits/chosen": 383710156.8, + "logits/rejected": 312284586.6666667, + "logps/chosen": -253.4024169921875, + "logps/rejected": -366.5420735677083, + "loss": 0.01, + "rewards/chosen": 4.222465133666992, + "rewards/margins": 13.579674657185873, + "rewards/rejected": -9.35720952351888, + "step": 6839 + }, + { + "epoch": 0.6249428962996803, + "grad_norm": 4.5, + "kl": 0.0, + "learning_rate": 3.098710640321634e-06, + "logits/chosen": 483251797.3333333, + "logits/rejected": 248458624.0, + "logps/chosen": -272.91782633463544, + "logps/rejected": -308.32769775390625, + "loss": 0.0311, + "rewards/chosen": 3.8194125493367515, + "rewards/margins": 12.084159215291342, + "rewards/rejected": -8.26474666595459, + "step": 6840 + }, + { + "epoch": 0.6250342622201919, + "grad_norm": 3.078125, + "kl": 0.0, + "learning_rate": 3.09738092590422e-06, + "logits/chosen": 320253747.2, + "logits/rejected": 490950869.3333333, + "logps/chosen": -217.7193115234375, + "logps/rejected": -684.704345703125, + "loss": 0.0226, + "rewards/chosen": 3.9519962310791015, + "rewards/margins": 12.706914647420248, + "rewards/rejected": -8.754918416341146, + "step": 6841 + }, + { + "epoch": 0.6251256281407035, + "grad_norm": 2.375, + "kl": 0.0, + "learning_rate": 3.096051368815776e-06, + "logits/chosen": 585998976.0, + "logits/rejected": 415849472.0, + "logps/chosen": -332.7812805175781, + "logps/rejected": -504.87017822265625, + "loss": 0.0159, + "rewards/chosen": 3.866211414337158, + "rewards/margins": 14.886377811431885, + "rewards/rejected": -11.020166397094727, + "step": 6842 + }, + { + "epoch": 0.6252169940612151, + "grad_norm": 10.9375, + "kl": 0.0, + "learning_rate": 3.0947219691662435e-06, + "logits/chosen": 756738816.0, + "logits/rejected": 570104832.0, + "logps/chosen": -361.3612060546875, + "logps/rejected": -624.4320068359375, + "loss": 0.138, + "rewards/chosen": 1.5181312561035156, + "rewards/margins": 12.192331314086914, + "rewards/rejected": -10.674200057983398, + "step": 6843 + }, + { + "epoch": 0.6253083599817268, + "grad_norm": 4.21875, + "kl": 0.0, + "learning_rate": 3.09339272706555e-06, + "logits/chosen": 590589235.2, + "logits/rejected": 1212847104.0, + "logps/chosen": -417.228662109375, + "logps/rejected": -526.518310546875, + "loss": 0.0302, + "rewards/chosen": 3.1933141708374024, + "rewards/margins": 12.68389631907145, + "rewards/rejected": -9.490582148234049, + "step": 6844 + }, + { + "epoch": 0.6253997259022385, + "grad_norm": 2.59375, + "kl": 0.0, + "learning_rate": 3.092063642623614e-06, + "logits/chosen": 948108492.8, + "logits/rejected": 377189248.0, + "logps/chosen": -362.2861083984375, + "logps/rejected": -333.07806396484375, + "loss": 0.0156, + "rewards/chosen": 4.299347686767578, + "rewards/margins": 13.299844868977864, + "rewards/rejected": -9.000497182210287, + "step": 6845 + }, + { + "epoch": 0.6254910918227501, + "grad_norm": 0.58203125, + "kl": 0.0, + "learning_rate": 3.0907347159503364e-06, + "logits/chosen": 738025813.3333334, + "logits/rejected": 466054604.8, + "logps/chosen": -254.52986653645834, + "logps/rejected": -498.935986328125, + "loss": 0.0032, + "rewards/chosen": 4.887726465861003, + "rewards/margins": 13.590058008829754, + "rewards/rejected": -8.70233154296875, + "step": 6846 + }, + { + "epoch": 0.6255824577432617, + "grad_norm": 1.640625, + "kl": 0.0, + "learning_rate": 3.0894059471556094e-06, + "logits/chosen": 427480032.0, + "logits/rejected": 343642837.3333333, + "logps/chosen": -268.01312255859375, + "logps/rejected": -276.9864908854167, + "loss": 0.009, + "rewards/chosen": 3.817574977874756, + "rewards/margins": 12.264097372690836, + "rewards/rejected": -8.44652239481608, + "step": 6847 + }, + { + "epoch": 0.6256738236637734, + "grad_norm": 1.5859375, + "kl": 0.0, + "learning_rate": 3.0880773363493076e-06, + "logits/chosen": 1073773568.0, + "logits/rejected": 676242790.4, + "logps/chosen": -485.9682210286458, + "logps/rejected": -453.955419921875, + "loss": 0.0088, + "rewards/chosen": 3.900670369466146, + "rewards/margins": 12.901178487141928, + "rewards/rejected": -9.000508117675782, + "step": 6848 + }, + { + "epoch": 0.6257651895842851, + "grad_norm": 41.5, + "kl": 0.0, + "learning_rate": 3.0867488836412963e-06, + "logits/chosen": 871031296.0, + "logits/rejected": 1274305706.6666667, + "logps/chosen": -406.28310546875, + "logps/rejected": -341.7045084635417, + "loss": 0.1053, + "rewards/chosen": 2.2662471771240233, + "rewards/margins": 9.963189188639323, + "rewards/rejected": -7.6969420115153, + "step": 6849 + }, + { + "epoch": 0.6258565555047967, + "grad_norm": 2.375, + "kl": 0.0, + "learning_rate": 3.0854205891414246e-06, + "logits/chosen": 532211498.6666667, + "logits/rejected": 746775654.4, + "logps/chosen": -193.71785481770834, + "logps/rejected": -420.3345703125, + "loss": 0.0148, + "rewards/chosen": 3.8977444966634116, + "rewards/margins": 11.091743214925131, + "rewards/rejected": -7.193998718261719, + "step": 6850 + }, + { + "epoch": 0.6259479214253083, + "grad_norm": 4.09375, + "kl": 0.0, + "learning_rate": 3.0840924529595325e-06, + "logits/chosen": 399291584.0, + "logits/rejected": 471440768.0, + "logps/chosen": -315.3033447265625, + "logps/rejected": -398.38555908203125, + "loss": 0.0273, + "rewards/chosen": 3.0720481872558594, + "rewards/margins": 12.490070343017578, + "rewards/rejected": -9.418022155761719, + "step": 6851 + }, + { + "epoch": 0.62603928734582, + "grad_norm": 5.5, + "kl": 8.053665161132812, + "learning_rate": 3.082764475205442e-06, + "logits/chosen": 687222400.0, + "logps/chosen": -357.62945556640625, + "loss": 0.0503, + "rewards/chosen": 4.0424394607543945, + "step": 6852 + }, + { + "epoch": 0.6261306532663317, + "grad_norm": 2.578125, + "kl": 0.0, + "learning_rate": 3.081436655988967e-06, + "logits/chosen": 703144618.6666666, + "logits/rejected": 532156723.2, + "logps/chosen": -377.4917805989583, + "logps/rejected": -529.532958984375, + "loss": 0.0154, + "rewards/chosen": 3.223367691040039, + "rewards/margins": 11.790561294555664, + "rewards/rejected": -8.567193603515625, + "step": 6853 + }, + { + "epoch": 0.6262220191868433, + "grad_norm": 1.5078125, + "kl": 0.0, + "learning_rate": 3.080108995419905e-06, + "logits/chosen": 342780448.0, + "logits/rejected": 694852224.0, + "logps/chosen": -122.5326156616211, + "logps/rejected": -541.2783203125, + "loss": 0.0095, + "rewards/chosen": 3.8145318031311035, + "rewards/margins": 12.377771218617758, + "rewards/rejected": -8.563239415486654, + "step": 6854 + }, + { + "epoch": 0.6263133851073549, + "grad_norm": 3.90625, + "kl": 0.0, + "learning_rate": 3.0787814936080394e-06, + "logits/chosen": 449799466.6666667, + "logits/rejected": 956154240.0, + "logps/chosen": -313.60207112630206, + "logps/rejected": -406.72210693359375, + "loss": 0.0235, + "rewards/chosen": 4.402217864990234, + "rewards/margins": 11.737334728240967, + "rewards/rejected": -7.335116863250732, + "step": 6855 + }, + { + "epoch": 0.6264047510278666, + "grad_norm": 4.8125, + "kl": 0.0, + "learning_rate": 3.077454150663145e-06, + "logits/chosen": 505433819.4285714, + "logits/rejected": 631805248.0, + "logps/chosen": -285.34598214285717, + "logps/rejected": -616.5556640625, + "loss": 0.0254, + "rewards/chosen": 4.106856754847935, + "rewards/margins": 10.47318356377738, + "rewards/rejected": -6.366326808929443, + "step": 6856 + }, + { + "epoch": 0.6264961169483783, + "grad_norm": 2.109375, + "kl": 0.0, + "learning_rate": 3.0761269666949777e-06, + "logits/rejected": 729995264.0, + "logps/rejected": -428.4786071777344, + "loss": 0.0036, + "rewards/rejected": -7.789706230163574, + "step": 6857 + }, + { + "epoch": 0.6265874828688899, + "grad_norm": 0.1796875, + "kl": 0.0, + "learning_rate": 3.0747999418132874e-06, + "logits/chosen": 303218261.3333333, + "logits/rejected": 638952652.8, + "logps/chosen": -211.29443359375, + "logps/rejected": -533.338037109375, + "loss": 0.001, + "rewards/chosen": 6.102884292602539, + "rewards/margins": 15.452592849731445, + "rewards/rejected": -9.349708557128906, + "step": 6858 + }, + { + "epoch": 0.6266788487894015, + "grad_norm": 36.75, + "kl": 0.0, + "learning_rate": 3.073473076127803e-06, + "logits/chosen": 506177177.6, + "logits/rejected": 621209344.0, + "logps/chosen": -425.339013671875, + "logps/rejected": -539.091064453125, + "loss": 0.0274, + "rewards/chosen": 4.436240005493164, + "rewards/margins": 13.56276206970215, + "rewards/rejected": -9.126522064208984, + "step": 6859 + }, + { + "epoch": 0.6267702147099132, + "grad_norm": 2.953125, + "kl": 0.0, + "learning_rate": 3.0721463697482466e-06, + "logits/chosen": 803329962.6666666, + "logits/rejected": 600099686.4, + "logps/chosen": -430.428466796875, + "logps/rejected": -332.984765625, + "loss": 0.0181, + "rewards/chosen": 3.0688082377115884, + "rewards/margins": 11.63256861368815, + "rewards/rejected": -8.563760375976562, + "step": 6860 + }, + { + "epoch": 0.6268615806304249, + "grad_norm": 2.015625, + "kl": 0.0, + "learning_rate": 3.070819822784323e-06, + "logits/chosen": 619438144.0, + "logits/rejected": 924012458.6666666, + "logps/chosen": -336.5603942871094, + "logps/rejected": -602.5116373697916, + "loss": 0.0083, + "rewards/chosen": 3.5612640380859375, + "rewards/margins": 13.385144551595053, + "rewards/rejected": -9.823880513509115, + "step": 6861 + }, + { + "epoch": 0.6269529465509365, + "grad_norm": 5.28125, + "kl": 0.0, + "learning_rate": 3.0694934353457263e-06, + "logits/chosen": 588619776.0, + "logits/rejected": 600477354.6666666, + "logps/chosen": -305.3080810546875, + "logps/rejected": -557.8888346354166, + "loss": 0.0331, + "rewards/chosen": 3.556257629394531, + "rewards/margins": 13.749839528401694, + "rewards/rejected": -10.193581899007162, + "step": 6862 + }, + { + "epoch": 0.6270443124714481, + "grad_norm": 4.46875, + "kl": 0.0, + "learning_rate": 3.0681672075421365e-06, + "logits/chosen": 526927658.6666667, + "logits/rejected": 539898572.8, + "logps/chosen": -515.7667236328125, + "logps/rejected": -483.05625, + "loss": 0.0217, + "rewards/chosen": 2.8365987141927085, + "rewards/margins": 12.266834004720053, + "rewards/rejected": -9.430235290527344, + "step": 6863 + }, + { + "epoch": 0.6271356783919598, + "grad_norm": 2.359375, + "kl": 0.0, + "learning_rate": 3.0668411394832194e-06, + "logits/chosen": 512913868.8, + "logits/rejected": 663693397.3333334, + "logps/chosen": -423.4314453125, + "logps/rejected": -591.1887613932291, + "loss": 0.0184, + "rewards/chosen": 3.655190277099609, + "rewards/margins": 12.394563166300454, + "rewards/rejected": -8.739372889200846, + "step": 6864 + }, + { + "epoch": 0.6272270443124714, + "grad_norm": 2.1875, + "kl": 0.0, + "learning_rate": 3.0655152312786273e-06, + "logits/chosen": 674014080.0, + "logits/rejected": 374882272.0, + "logps/chosen": -292.016845703125, + "logps/rejected": -436.7757568359375, + "loss": 0.0137, + "rewards/chosen": 3.7665419578552246, + "rewards/margins": 13.723887920379639, + "rewards/rejected": -9.957345962524414, + "step": 6865 + }, + { + "epoch": 0.6273184102329831, + "grad_norm": 3.484375, + "kl": 0.0, + "learning_rate": 3.064189483038004e-06, + "logits/chosen": 579856000.0, + "logits/rejected": 827239936.0, + "logps/chosen": -333.3402913411458, + "logps/rejected": -597.2459106445312, + "loss": 0.0226, + "rewards/chosen": 3.9644130071004233, + "rewards/margins": 13.244398434956869, + "rewards/rejected": -9.279985427856445, + "step": 6866 + }, + { + "epoch": 0.6274097761534947, + "grad_norm": 0.921875, + "kl": 0.0, + "learning_rate": 3.062863894870973e-06, + "logits/chosen": 395456341.3333333, + "logits/rejected": 497719142.4, + "logps/chosen": -391.892578125, + "logps/rejected": -501.8751953125, + "loss": 0.0044, + "rewards/chosen": 4.962320963541667, + "rewards/margins": 13.501146952311199, + "rewards/rejected": -8.538825988769531, + "step": 6867 + }, + { + "epoch": 0.6275011420740064, + "grad_norm": 3.015625, + "kl": 0.0, + "learning_rate": 3.0615384668871516e-06, + "logits/chosen": 481221478.4, + "logits/rejected": 482941781.3333333, + "logps/chosen": -348.59765625, + "logps/rejected": -344.0898844401042, + "loss": 0.0188, + "rewards/chosen": 3.7228065490722657, + "rewards/margins": 10.628240203857422, + "rewards/rejected": -6.905433654785156, + "step": 6868 + }, + { + "epoch": 0.627592507994518, + "grad_norm": 12.8125, + "kl": 0.0, + "learning_rate": 3.060213199196138e-06, + "logits/chosen": 525418496.0, + "logits/rejected": 494375424.0, + "logps/chosen": -369.3424072265625, + "logps/rejected": -497.36357421875, + "loss": 0.0307, + "rewards/chosen": 2.8137995402018228, + "rewards/margins": 12.457834116617837, + "rewards/rejected": -9.644034576416015, + "step": 6869 + }, + { + "epoch": 0.6276838739150297, + "grad_norm": 2.125, + "kl": 0.0, + "learning_rate": 3.05888809190752e-06, + "logits/chosen": 431467690.6666667, + "logits/rejected": 609592422.4, + "logps/chosen": -269.4617919921875, + "logps/rejected": -675.37724609375, + "loss": 0.014, + "rewards/chosen": 3.2782872517903647, + "rewards/margins": 13.94350331624349, + "rewards/rejected": -10.665216064453125, + "step": 6870 + }, + { + "epoch": 0.6277752398355413, + "grad_norm": 4.96875, + "kl": 0.0, + "learning_rate": 3.057563145130873e-06, + "logits/chosen": 986359936.0, + "logits/rejected": 734919168.0, + "logps/chosen": -379.21514892578125, + "logps/rejected": -365.3936767578125, + "loss": 0.1282, + "rewards/chosen": 4.198646545410156, + "rewards/margins": 10.55147409439087, + "rewards/rejected": -6.352827548980713, + "step": 6871 + }, + { + "epoch": 0.627866605756053, + "grad_norm": 2.5, + "kl": 0.0, + "learning_rate": 3.056238358975756e-06, + "logits/chosen": 422380544.0, + "logits/rejected": 722936746.6666666, + "logps/chosen": -299.4144775390625, + "logps/rejected": -457.8490397135417, + "loss": 0.0183, + "rewards/chosen": 3.811318206787109, + "rewards/margins": 12.424388376871743, + "rewards/rejected": -8.613070170084635, + "step": 6872 + }, + { + "epoch": 0.6279579716765646, + "grad_norm": 4.375, + "kl": 0.0, + "learning_rate": 3.0549137335517178e-06, + "logits/chosen": 442343424.0, + "logits/rejected": 587136896.0, + "logps/chosen": -274.1199645996094, + "logps/rejected": -704.3494873046875, + "loss": 0.0283, + "rewards/chosen": 2.885891914367676, + "rewards/margins": 13.37779426574707, + "rewards/rejected": -10.491902351379395, + "step": 6873 + }, + { + "epoch": 0.6280493375970763, + "grad_norm": 2.171875, + "kl": 0.0, + "learning_rate": 3.0535892689682924e-06, + "logits/chosen": 466094592.0, + "logits/rejected": 875036501.3333334, + "logps/chosen": -221.6833984375, + "logps/rejected": -404.93994140625, + "loss": 0.0155, + "rewards/chosen": 4.399831390380859, + "rewards/margins": 13.033308283487955, + "rewards/rejected": -8.633476893107096, + "step": 6874 + }, + { + "epoch": 0.628140703517588, + "grad_norm": 2.9375, + "kl": 0.0, + "learning_rate": 3.052264965335e-06, + "logits/chosen": 573251242.6666666, + "logits/rejected": 389580544.0, + "logps/chosen": -315.54986572265625, + "logps/rejected": -412.85283203125, + "loss": 0.0124, + "rewards/chosen": 3.4912503560384116, + "rewards/margins": 12.789341481526693, + "rewards/rejected": -9.29809112548828, + "step": 6875 + }, + { + "epoch": 0.6282320694380996, + "grad_norm": 3.40625, + "kl": 0.0, + "learning_rate": 3.0509408227613483e-06, + "logits/chosen": 768359253.3333334, + "logits/rejected": 645605990.4, + "logps/chosen": -169.63482666015625, + "logps/rejected": -581.91796875, + "loss": 0.02, + "rewards/chosen": 3.07843812306722, + "rewards/margins": 14.325181897481283, + "rewards/rejected": -11.246743774414062, + "step": 6876 + }, + { + "epoch": 0.6283234353586112, + "grad_norm": 3.21875, + "kl": 0.0, + "learning_rate": 3.0496168413568334e-06, + "logits/chosen": 280931456.0, + "logits/rejected": 592252202.6666666, + "logps/chosen": -276.05950927734375, + "logps/rejected": -631.1667073567709, + "loss": 0.006, + "rewards/chosen": 4.531993389129639, + "rewards/margins": 14.788184642791748, + "rewards/rejected": -10.25619125366211, + "step": 6877 + }, + { + "epoch": 0.6284148012791229, + "grad_norm": 2.5625, + "kl": 0.0, + "learning_rate": 3.0482930212309335e-06, + "logits/chosen": 394489830.4, + "logits/rejected": 463069866.6666667, + "logps/chosen": -182.2694091796875, + "logps/rejected": -437.8498942057292, + "loss": 0.024, + "rewards/chosen": 3.6389808654785156, + "rewards/margins": 11.851290384928385, + "rewards/rejected": -8.21230951944987, + "step": 6878 + }, + { + "epoch": 0.6285061671996346, + "grad_norm": 2.25, + "kl": 0.0, + "learning_rate": 3.0469693624931195e-06, + "logits/chosen": 679630250.6666666, + "logits/rejected": 515789875.2, + "logps/chosen": -420.0048828125, + "logps/rejected": -528.48505859375, + "loss": 0.0163, + "rewards/chosen": 3.252072334289551, + "rewards/margins": 13.274066352844239, + "rewards/rejected": -10.021994018554688, + "step": 6879 + }, + { + "epoch": 0.6285975331201462, + "grad_norm": 2.109375, + "kl": 0.0, + "learning_rate": 3.045645865252842e-06, + "logits/chosen": 719520853.3333334, + "logits/rejected": 596827238.4, + "logps/chosen": -450.0893147786458, + "logps/rejected": -467.82177734375, + "loss": 0.0087, + "rewards/chosen": 4.428653717041016, + "rewards/margins": 13.648332977294922, + "rewards/rejected": -9.219679260253907, + "step": 6880 + }, + { + "epoch": 0.6286888990406578, + "grad_norm": 2.3125, + "kl": 0.0, + "learning_rate": 3.0443225296195454e-06, + "logits/chosen": 822645760.0, + "logits/rejected": 585623168.0, + "logps/chosen": -569.1746215820312, + "logps/rejected": -519.0418701171875, + "loss": 0.013, + "rewards/chosen": 3.9364302158355713, + "rewards/margins": 14.185844659805298, + "rewards/rejected": -10.249414443969727, + "step": 6881 + }, + { + "epoch": 0.6287802649611695, + "grad_norm": 3.921875, + "kl": 0.0, + "learning_rate": 3.0429993557026556e-06, + "logits/chosen": 579935436.8, + "logits/rejected": 745462954.6666666, + "logps/chosen": -290.8052490234375, + "logps/rejected": -678.7227376302084, + "loss": 0.0293, + "rewards/chosen": 3.4271141052246095, + "rewards/margins": 13.698504384358724, + "rewards/rejected": -10.271390279134115, + "step": 6882 + }, + { + "epoch": 0.6288716308816812, + "grad_norm": 2.09375, + "kl": 0.0, + "learning_rate": 3.041676343611587e-06, + "logits/chosen": 290245930.6666667, + "logits/rejected": 419547750.4, + "logps/chosen": -201.1260986328125, + "logps/rejected": -579.2896484375, + "loss": 0.1386, + "rewards/chosen": 0.39862060546875, + "rewards/margins": 10.873048400878906, + "rewards/rejected": -10.474427795410156, + "step": 6883 + }, + { + "epoch": 0.6289629968021928, + "grad_norm": 6.5, + "kl": 0.0, + "learning_rate": 3.04035349345574e-06, + "logits/chosen": 637677056.0, + "logits/rejected": 642721024.0, + "logps/chosen": -478.29058837890625, + "logps/rejected": -397.719970703125, + "loss": 0.0195, + "rewards/chosen": 2.948183536529541, + "rewards/margins": 10.34970458348592, + "rewards/rejected": -7.40152104695638, + "step": 6884 + }, + { + "epoch": 0.6290543627227044, + "grad_norm": 2.53125, + "kl": 0.0, + "learning_rate": 3.039030805344504e-06, + "logits/chosen": 468093760.0, + "logits/rejected": 425287744.0, + "logps/chosen": -246.30751037597656, + "logps/rejected": -492.08245849609375, + "loss": 0.0113, + "rewards/chosen": 4.671756744384766, + "rewards/margins": 14.367499351501465, + "rewards/rejected": -9.6957426071167, + "step": 6885 + }, + { + "epoch": 0.629145728643216, + "grad_norm": 2.625, + "kl": 0.0, + "learning_rate": 3.0377082793872514e-06, + "logits/chosen": 391641952.0, + "logits/rejected": 675886250.6666666, + "logps/chosen": -272.8961486816406, + "logps/rejected": -697.92333984375, + "loss": 0.0117, + "rewards/chosen": 3.0218329429626465, + "rewards/margins": 15.590990861256918, + "rewards/rejected": -12.569157918294271, + "step": 6886 + }, + { + "epoch": 0.6292370945637278, + "grad_norm": 3.53125, + "kl": 0.0, + "learning_rate": 3.036385915693342e-06, + "logits/chosen": 605055061.3333334, + "logits/rejected": 466104268.8, + "logps/chosen": -322.2608642578125, + "logps/rejected": -566.971533203125, + "loss": 0.0184, + "rewards/chosen": 3.226060231526693, + "rewards/margins": 13.158698781331381, + "rewards/rejected": -9.932638549804688, + "step": 6887 + }, + { + "epoch": 0.6293284604842394, + "grad_norm": 2.53125, + "kl": 0.0, + "learning_rate": 3.035063714372124e-06, + "logits/chosen": 1142814105.6, + "logits/rejected": 593313493.3333334, + "logps/chosen": -398.482177734375, + "logps/rejected": -318.1929931640625, + "loss": 0.0137, + "rewards/chosen": 4.467759323120117, + "rewards/margins": 12.751654179890952, + "rewards/rejected": -8.283894856770834, + "step": 6888 + }, + { + "epoch": 0.629419826404751, + "grad_norm": 3.0625, + "kl": 0.0, + "learning_rate": 3.033741675532932e-06, + "logits/chosen": 882008149.3333334, + "logits/rejected": 739282073.6, + "logps/chosen": -334.3590494791667, + "logps/rejected": -491.13193359375, + "loss": 0.0189, + "rewards/chosen": 3.4087886810302734, + "rewards/margins": 14.149241256713868, + "rewards/rejected": -10.740452575683594, + "step": 6889 + }, + { + "epoch": 0.6295111923252626, + "grad_norm": 0.8125, + "kl": 0.0, + "learning_rate": 3.032419799285086e-06, + "logits/chosen": 291593450.6666667, + "logits/rejected": 472774041.6, + "logps/chosen": -173.59037272135416, + "logps/rejected": -543.6716796875, + "loss": 0.0055, + "rewards/chosen": 4.270600954691569, + "rewards/margins": 13.272658602396646, + "rewards/rejected": -9.002057647705078, + "step": 6890 + }, + { + "epoch": 0.6296025582457744, + "grad_norm": 2.78125, + "kl": 0.0, + "learning_rate": 3.0310980857378926e-06, + "logits/chosen": 572132693.3333334, + "logits/rejected": 599292876.8, + "logps/chosen": -445.0024820963542, + "logps/rejected": -559.8171875, + "loss": 0.014, + "rewards/chosen": 3.8893537521362305, + "rewards/margins": 13.601467704772949, + "rewards/rejected": -9.712113952636718, + "step": 6891 + }, + { + "epoch": 0.629693924166286, + "grad_norm": 14.5625, + "kl": 0.0, + "learning_rate": 3.029776535000644e-06, + "logits/chosen": 608821504.0, + "logits/rejected": 284759808.0, + "logps/chosen": -236.84449768066406, + "logps/rejected": -400.35986328125, + "loss": 0.0266, + "rewards/chosen": 3.070384979248047, + "rewards/margins": 13.189422607421875, + "rewards/rejected": -10.119037628173828, + "step": 6892 + }, + { + "epoch": 0.6297852900867976, + "grad_norm": 1.0234375, + "kl": 0.0, + "learning_rate": 3.0284551471826228e-06, + "logits/chosen": 650691840.0, + "logits/rejected": 444536934.4, + "logps/chosen": -523.4301350911459, + "logps/rejected": -447.24404296875, + "loss": 0.0047, + "rewards/chosen": 4.481389999389648, + "rewards/margins": 13.88929557800293, + "rewards/rejected": -9.407905578613281, + "step": 6893 + }, + { + "epoch": 0.6298766560073092, + "grad_norm": 2.140625, + "kl": 0.0, + "learning_rate": 3.027133922393093e-06, + "logits/chosen": 484388454.4, + "logits/rejected": 348401152.0, + "logps/chosen": -280.605859375, + "logps/rejected": -351.0499674479167, + "loss": 0.0166, + "rewards/chosen": 3.7720584869384766, + "rewards/margins": 12.99928347269694, + "rewards/rejected": -9.227224985758463, + "step": 6894 + }, + { + "epoch": 0.629968021927821, + "grad_norm": 2.109375, + "kl": 0.0, + "learning_rate": 3.0258128607413096e-06, + "logits/chosen": 668808896.0, + "logits/rejected": 532100266.6666667, + "logps/chosen": -449.5263977050781, + "logps/rejected": -407.1295979817708, + "loss": 0.0101, + "rewards/chosen": 3.244847297668457, + "rewards/margins": 12.43621031443278, + "rewards/rejected": -9.191363016764322, + "step": 6895 + }, + { + "epoch": 0.6300593878483326, + "grad_norm": 3.671875, + "kl": 0.0, + "learning_rate": 3.024491962336511e-06, + "logits/chosen": 508827648.0, + "logits/rejected": 615004774.4, + "logps/chosen": -541.9803059895834, + "logps/rejected": -524.100927734375, + "loss": 0.0187, + "rewards/chosen": 3.0450929005940757, + "rewards/margins": 11.956779607137046, + "rewards/rejected": -8.91168670654297, + "step": 6896 + }, + { + "epoch": 0.6301507537688442, + "grad_norm": 2.40625, + "kl": 0.0, + "learning_rate": 3.023171227287923e-06, + "logits/chosen": 640133546.6666666, + "logits/rejected": 374465971.2, + "logps/chosen": -351.8671061197917, + "logps/rejected": -283.5263671875, + "loss": 0.0237, + "rewards/chosen": 3.9735565185546875, + "rewards/margins": 12.50011978149414, + "rewards/rejected": -8.526563262939453, + "step": 6897 + }, + { + "epoch": 0.6302421196893558, + "grad_norm": 2.609375, + "kl": 0.0, + "learning_rate": 3.02185065570476e-06, + "logits/chosen": 549107370.6666666, + "logits/rejected": 482818368.0, + "logps/chosen": -301.9488525390625, + "logps/rejected": -287.05780029296875, + "loss": 0.0196, + "rewards/chosen": 4.014670054117839, + "rewards/margins": 11.017162958780926, + "rewards/rejected": -7.002492904663086, + "step": 6898 + }, + { + "epoch": 0.6303334856098676, + "grad_norm": 2.84375, + "kl": 0.0, + "learning_rate": 3.0205302476962186e-06, + "logits/chosen": 524304032.0, + "logits/rejected": 351860992.0, + "logps/chosen": -391.7730712890625, + "logps/rejected": -354.9501037597656, + "loss": 0.0156, + "rewards/chosen": 3.6595189571380615, + "rewards/margins": 12.29542326927185, + "rewards/rejected": -8.635904312133789, + "step": 6899 + }, + { + "epoch": 0.6304248515303792, + "grad_norm": 1.1796875, + "kl": 0.0, + "learning_rate": 3.0192100033714864e-06, + "logits/chosen": 444787353.6, + "logits/rejected": 567354965.3333334, + "logps/chosen": -308.0044921875, + "logps/rejected": -533.621826171875, + "loss": 0.129, + "rewards/chosen": 3.4715950012207033, + "rewards/margins": 12.908435694376628, + "rewards/rejected": -9.436840693155924, + "step": 6900 + }, + { + "epoch": 0.6305162174508908, + "grad_norm": 2.71875, + "kl": 0.0, + "learning_rate": 3.0178899228397337e-06, + "logits/chosen": 497125824.0, + "logits/rejected": 426449152.0, + "logps/chosen": -348.8669738769531, + "logps/rejected": -468.5572509765625, + "loss": 0.0155, + "rewards/chosen": 3.7292189598083496, + "rewards/margins": 15.535594463348389, + "rewards/rejected": -11.806375503540039, + "step": 6901 + }, + { + "epoch": 0.6306075833714024, + "grad_norm": 2.703125, + "kl": 0.8434467315673828, + "learning_rate": 3.01657000621012e-06, + "logits/chosen": 413272000.0, + "logps/chosen": -283.70733642578125, + "loss": 0.0329, + "rewards/chosen": 3.5844922065734863, + "step": 6902 + }, + { + "epoch": 0.6306989492919142, + "grad_norm": 1.1171875, + "kl": 0.0, + "learning_rate": 3.0152502535917898e-06, + "logits/chosen": 402040320.0, + "logits/rejected": 434607680.0, + "logps/chosen": -277.73561604817706, + "logps/rejected": -531.30029296875, + "loss": 0.0087, + "rewards/chosen": 4.580652236938477, + "rewards/margins": 15.507495880126953, + "rewards/rejected": -10.926843643188477, + "step": 6903 + }, + { + "epoch": 0.6307903152124258, + "grad_norm": 3.671875, + "kl": 0.0, + "learning_rate": 3.0139306650938748e-06, + "logits/chosen": 597855658.6666666, + "logits/rejected": 387660192.0, + "logps/chosen": -245.26802571614584, + "logps/rejected": -494.610595703125, + "loss": 0.0345, + "rewards/chosen": 3.132129669189453, + "rewards/margins": 11.910022735595703, + "rewards/rejected": -8.77789306640625, + "step": 6904 + }, + { + "epoch": 0.6308816811329374, + "grad_norm": 2.28125, + "kl": 0.0, + "learning_rate": 3.0126112408254928e-06, + "logits/chosen": 495297621.3333333, + "logits/rejected": 616315904.0, + "logps/chosen": -378.8964029947917, + "logps/rejected": -707.1896484375, + "loss": 0.0096, + "rewards/chosen": 4.377577781677246, + "rewards/margins": 14.650490379333496, + "rewards/rejected": -10.27291259765625, + "step": 6905 + }, + { + "epoch": 0.630973047053449, + "grad_norm": 3.359375, + "kl": 0.0, + "learning_rate": 3.0112919808957475e-06, + "logits/chosen": 462273408.0, + "logits/rejected": 440581939.2, + "logps/chosen": -312.92578125, + "logps/rejected": -425.82763671875, + "loss": 0.0225, + "rewards/chosen": 3.006018320719401, + "rewards/margins": 11.305926005045572, + "rewards/rejected": -8.299907684326172, + "step": 6906 + }, + { + "epoch": 0.6310644129739608, + "grad_norm": 2.328125, + "kl": 0.0, + "learning_rate": 3.00997288541373e-06, + "logits/chosen": 554346752.0, + "logits/rejected": 596022528.0, + "logps/chosen": -282.3499755859375, + "logps/rejected": -699.967529296875, + "loss": 0.0179, + "rewards/chosen": 3.3628125190734863, + "rewards/margins": 13.716522693634033, + "rewards/rejected": -10.353710174560547, + "step": 6907 + }, + { + "epoch": 0.6311557788944724, + "grad_norm": 2.015625, + "kl": 0.0, + "learning_rate": 3.008653954488515e-06, + "logits/chosen": 884805802.6666666, + "logits/rejected": 672856832.0, + "logps/chosen": -348.5888671875, + "logps/rejected": -670.872265625, + "loss": 0.0145, + "rewards/chosen": 3.4944820404052734, + "rewards/margins": 13.912095260620116, + "rewards/rejected": -10.417613220214843, + "step": 6908 + }, + { + "epoch": 0.631247144814984, + "grad_norm": 4.09375, + "kl": 0.0, + "learning_rate": 3.0073351882291696e-06, + "logits/chosen": 903225472.0, + "logits/rejected": 683345344.0, + "logps/chosen": -389.2343444824219, + "logps/rejected": -511.0370788574219, + "loss": 0.018, + "rewards/chosen": 3.8392961025238037, + "rewards/margins": 14.58807635307312, + "rewards/rejected": -10.748780250549316, + "step": 6909 + }, + { + "epoch": 0.6313385107354956, + "grad_norm": 2.046875, + "kl": 0.0, + "learning_rate": 3.0060165867447405e-06, + "logits/chosen": 968209280.0, + "logits/rejected": 1311679616.0, + "logps/chosen": -186.90652465820312, + "logps/rejected": -418.0120544433594, + "loss": 0.013, + "rewards/chosen": 3.9619414806365967, + "rewards/margins": 12.686846494674683, + "rewards/rejected": -8.724905014038086, + "step": 6910 + }, + { + "epoch": 0.6314298766560074, + "grad_norm": 1.5, + "kl": 0.0, + "learning_rate": 3.0046981501442663e-06, + "logits/chosen": 223031200.0, + "logits/rejected": 409414144.0, + "logps/chosen": -238.08819580078125, + "logps/rejected": -473.2325439453125, + "loss": 0.0057, + "rewards/chosen": 4.427556037902832, + "rewards/margins": 14.276533444722494, + "rewards/rejected": -9.848977406819662, + "step": 6911 + }, + { + "epoch": 0.631521242576519, + "grad_norm": 3.453125, + "kl": 0.0, + "learning_rate": 3.0033798785367664e-06, + "logits/chosen": 522167142.4, + "logits/rejected": 388122453.3333333, + "logps/chosen": -406.96630859375, + "logps/rejected": -278.3832600911458, + "loss": 0.0185, + "rewards/chosen": 3.933919906616211, + "rewards/margins": 11.685713450113933, + "rewards/rejected": -7.751793543497722, + "step": 6912 + }, + { + "epoch": 0.6316126084970306, + "grad_norm": 5.15625, + "kl": 0.0, + "learning_rate": 3.002061772031253e-06, + "logits/chosen": 420336076.8, + "logits/rejected": 194551829.33333334, + "logps/chosen": -329.513720703125, + "logps/rejected": -275.33258056640625, + "loss": 0.0341, + "rewards/chosen": 3.5591033935546874, + "rewards/margins": 12.120961252848307, + "rewards/rejected": -8.56185785929362, + "step": 6913 + }, + { + "epoch": 0.6317039744175422, + "grad_norm": 2.140625, + "kl": 0.0, + "learning_rate": 3.0007438307367186e-06, + "logits/chosen": 785412352.0, + "logits/rejected": 689187584.0, + "logps/chosen": -210.78797912597656, + "logps/rejected": -475.6596984863281, + "loss": 0.013, + "rewards/chosen": 4.220928192138672, + "rewards/margins": 13.637341499328613, + "rewards/rejected": -9.416413307189941, + "step": 6914 + }, + { + "epoch": 0.631795340338054, + "grad_norm": 2.53125, + "kl": 0.0, + "learning_rate": 2.9994260547621474e-06, + "logits/chosen": 938673868.8, + "logits/rejected": 796881920.0, + "logps/chosen": -311.1416259765625, + "logps/rejected": -657.5552571614584, + "loss": 0.0187, + "rewards/chosen": 3.6039539337158204, + "rewards/margins": 14.111765162150064, + "rewards/rejected": -10.507811228434244, + "step": 6915 + }, + { + "epoch": 0.6318867062585656, + "grad_norm": 0.76171875, + "kl": 0.0, + "learning_rate": 2.9981084442165044e-06, + "logits/chosen": 206393002.66666666, + "logits/rejected": 366233497.6, + "logps/chosen": -148.55754597981772, + "logps/rejected": -411.546044921875, + "loss": 0.0047, + "rewards/chosen": 4.882155100504558, + "rewards/margins": 11.437610117594401, + "rewards/rejected": -6.555455017089844, + "step": 6916 + }, + { + "epoch": 0.6319780721790772, + "grad_norm": 1.890625, + "kl": 0.0, + "learning_rate": 2.9967909992087478e-06, + "logits/chosen": 566916147.2, + "logits/rejected": 952620032.0, + "logps/chosen": -303.833251953125, + "logps/rejected": -542.6673990885416, + "loss": 0.0187, + "rewards/chosen": 4.155953216552734, + "rewards/margins": 12.588089752197266, + "rewards/rejected": -8.432136535644531, + "step": 6917 + }, + { + "epoch": 0.6320694380995888, + "grad_norm": 20.375, + "kl": 0.0, + "learning_rate": 2.9954737198478122e-06, + "logits/chosen": 326747340.8, + "logits/rejected": 500085674.6666667, + "logps/chosen": -271.586865234375, + "logps/rejected": -440.4012858072917, + "loss": 0.0161, + "rewards/chosen": 4.425690841674805, + "rewards/margins": 12.402006657918294, + "rewards/rejected": -7.976315816243489, + "step": 6918 + }, + { + "epoch": 0.6321608040201006, + "grad_norm": 23.625, + "kl": 0.0, + "learning_rate": 2.9941566062426296e-06, + "logits/chosen": 335994368.0, + "logits/rejected": 893317248.0, + "logps/chosen": -176.63603864397322, + "logps/rejected": -1017.04296875, + "loss": 0.1218, + "rewards/chosen": 3.2881867544991628, + "rewards/margins": 13.218179430280413, + "rewards/rejected": -9.92999267578125, + "step": 6919 + }, + { + "epoch": 0.6322521699406122, + "grad_norm": 7.5625, + "kl": 0.0, + "learning_rate": 2.9928396585021097e-06, + "logits/chosen": 500842700.8, + "logits/rejected": 528839424.0, + "logps/chosen": -427.18515625, + "logps/rejected": -587.1944986979166, + "loss": 0.0363, + "rewards/chosen": 3.0110841751098634, + "rewards/margins": 12.491271909077962, + "rewards/rejected": -9.4801877339681, + "step": 6920 + }, + { + "epoch": 0.6323435358611238, + "grad_norm": 2.84375, + "kl": 0.0, + "learning_rate": 2.991522876735154e-06, + "logits/chosen": 676831744.0, + "logits/rejected": 377351072.0, + "logps/chosen": -366.3798828125, + "logps/rejected": -375.53900146484375, + "loss": 0.0119, + "rewards/chosen": 4.209861755371094, + "rewards/margins": 12.764450073242188, + "rewards/rejected": -8.554588317871094, + "step": 6921 + }, + { + "epoch": 0.6324349017816354, + "grad_norm": 7.34375, + "kl": 4.368705749511719, + "learning_rate": 2.9902062610506466e-06, + "logits/chosen": 554892800.0, + "logits/rejected": 178673536.0, + "logps/chosen": -384.33018275669644, + "logps/rejected": -264.5283203125, + "loss": 0.0569, + "rewards/chosen": 3.528917040143694, + "rewards/margins": 10.410777296338763, + "rewards/rejected": -6.881860256195068, + "step": 6922 + }, + { + "epoch": 0.6325262677021471, + "grad_norm": 0.640625, + "kl": 0.0, + "learning_rate": 2.9888898115574615e-06, + "logits/chosen": 735380288.0, + "logits/rejected": 1003231402.6666666, + "logps/chosen": -291.85272216796875, + "logps/rejected": -502.4762776692708, + "loss": 0.0033, + "rewards/chosen": 4.405344009399414, + "rewards/margins": 13.88087018330892, + "rewards/rejected": -9.475526173909506, + "step": 6923 + }, + { + "epoch": 0.6326176336226588, + "grad_norm": 5.4375, + "kl": 0.0, + "learning_rate": 2.9875735283644542e-06, + "logits/chosen": 866451456.0, + "logits/rejected": 593522048.0, + "logps/chosen": -240.64449055989584, + "logps/rejected": -585.69189453125, + "loss": 0.0475, + "rewards/chosen": 3.0770365397135415, + "rewards/margins": 17.637808481852215, + "rewards/rejected": -14.560771942138672, + "step": 6924 + }, + { + "epoch": 0.6327089995431704, + "grad_norm": 33.25, + "kl": 0.0, + "learning_rate": 2.986257411580472e-06, + "logits/chosen": 1021666560.0, + "logits/rejected": 539119981.7142857, + "logps/chosen": -205.83856201171875, + "logps/rejected": -522.1812918526786, + "loss": 0.0282, + "rewards/chosen": 4.916714668273926, + "rewards/margins": 14.861187934875488, + "rewards/rejected": -9.944473266601562, + "step": 6925 + }, + { + "epoch": 0.632800365463682, + "grad_norm": 2.359375, + "kl": 0.0, + "learning_rate": 2.9849414613143423e-06, + "logits/chosen": 365669632.0, + "logits/rejected": 501599488.0, + "logps/chosen": -273.376953125, + "logps/rejected": -651.759521484375, + "loss": 0.0068, + "rewards/chosen": 4.460963249206543, + "rewards/margins": 14.097208976745605, + "rewards/rejected": -9.636245727539062, + "step": 6926 + }, + { + "epoch": 0.6328917313841937, + "grad_norm": 3.59375, + "kl": 0.0, + "learning_rate": 2.983625677674886e-06, + "logits/chosen": 1448417450.6666667, + "logits/rejected": 718802636.8, + "logps/chosen": -457.1066487630208, + "logps/rejected": -649.81435546875, + "loss": 0.016, + "rewards/chosen": 3.793349266052246, + "rewards/margins": 14.373159599304199, + "rewards/rejected": -10.579810333251952, + "step": 6927 + }, + { + "epoch": 0.6329830973047054, + "grad_norm": 1.7421875, + "kl": 0.0, + "learning_rate": 2.982310060770903e-06, + "logits/chosen": 1039866368.0, + "logits/rejected": 764821196.8, + "logps/chosen": -192.1598917643229, + "logps/rejected": -558.933984375, + "loss": 0.0148, + "rewards/chosen": 3.8171812693277993, + "rewards/margins": 13.124737421671549, + "rewards/rejected": -9.30755615234375, + "step": 6928 + }, + { + "epoch": 0.633074463225217, + "grad_norm": 2.984375, + "kl": 0.0, + "learning_rate": 2.9809946107111833e-06, + "logits/chosen": 393917504.0, + "logits/rejected": 645209216.0, + "logps/chosen": -312.5536193847656, + "logps/rejected": -503.65020751953125, + "loss": 0.0211, + "rewards/chosen": 3.37497615814209, + "rewards/margins": 12.399495124816895, + "rewards/rejected": -9.024518966674805, + "step": 6929 + }, + { + "epoch": 0.6331658291457286, + "grad_norm": 1.578125, + "kl": 0.0, + "learning_rate": 2.979679327604504e-06, + "logits/chosen": 1061019328.0, + "logits/rejected": 706708565.3333334, + "logps/chosen": -293.2178955078125, + "logps/rejected": -412.2372233072917, + "loss": 0.0076, + "rewards/chosen": 3.982461452484131, + "rewards/margins": 13.40073792139689, + "rewards/rejected": -9.41827646891276, + "step": 6930 + }, + { + "epoch": 0.6332571950662403, + "grad_norm": 2.84375, + "kl": 0.0, + "learning_rate": 2.978364211559624e-06, + "logits/chosen": 598372608.0, + "logits/rejected": 778450432.0, + "logps/chosen": -410.18438720703125, + "logps/rejected": -767.7217407226562, + "loss": 0.0138, + "rewards/chosen": 4.733846664428711, + "rewards/margins": 15.356222152709961, + "rewards/rejected": -10.62237548828125, + "step": 6931 + }, + { + "epoch": 0.633348560986752, + "grad_norm": 1.5078125, + "kl": 0.0, + "learning_rate": 2.9770492626852944e-06, + "logits/chosen": 691246720.0, + "logits/rejected": 1056988352.0, + "logps/chosen": -355.3762512207031, + "logps/rejected": -540.9931640625, + "loss": 0.0112, + "rewards/chosen": 4.244386672973633, + "rewards/margins": 14.546915054321289, + "rewards/rejected": -10.302528381347656, + "step": 6932 + }, + { + "epoch": 0.6334399269072636, + "grad_norm": 1.4140625, + "kl": 0.0, + "learning_rate": 2.9757344810902456e-06, + "logits/chosen": 402630144.0, + "logits/rejected": 825751296.0, + "logps/chosen": -264.4637451171875, + "logps/rejected": -649.6461181640625, + "loss": 0.0109, + "rewards/chosen": 4.281485557556152, + "rewards/margins": 13.841065406799316, + "rewards/rejected": -9.559579849243164, + "step": 6933 + }, + { + "epoch": 0.6335312928277752, + "grad_norm": 1.1953125, + "kl": 0.0, + "learning_rate": 2.9744198668832026e-06, + "logits/chosen": 187417424.0, + "logits/rejected": 456319488.0, + "logps/chosen": -100.18994903564453, + "logps/rejected": -550.7251790364584, + "loss": 0.0109, + "rewards/chosen": 3.8018078804016113, + "rewards/margins": 14.8718368212382, + "rewards/rejected": -11.070028940836588, + "step": 6934 + }, + { + "epoch": 0.6336226587482869, + "grad_norm": 52.0, + "kl": 0.0, + "learning_rate": 2.9731054201728673e-06, + "logits/chosen": 315991232.0, + "logits/rejected": 535891370.6666667, + "logps/chosen": -120.86961364746094, + "logps/rejected": -495.1373697916667, + "loss": 0.0632, + "rewards/chosen": 1.350789189338684, + "rewards/margins": 12.378759821256002, + "rewards/rejected": -11.027970631917318, + "step": 6935 + }, + { + "epoch": 0.6337140246687986, + "grad_norm": 0.8984375, + "kl": 0.0, + "learning_rate": 2.9717911410679346e-06, + "logits/chosen": 460271411.2, + "logits/rejected": 719990954.6666666, + "logps/chosen": -211.6211181640625, + "logps/rejected": -851.5494791666666, + "loss": 0.0048, + "rewards/chosen": 5.236260223388672, + "rewards/margins": 15.048509724934895, + "rewards/rejected": -9.812249501546225, + "step": 6936 + }, + { + "epoch": 0.6338053905893102, + "grad_norm": 2.5, + "kl": 0.0, + "learning_rate": 2.9704770296770823e-06, + "logits/chosen": 663499072.0, + "logits/rejected": 539372617.1428572, + "logps/chosen": -448.5750732421875, + "logps/rejected": -573.0708356584821, + "loss": 0.0078, + "rewards/chosen": 2.7470948696136475, + "rewards/margins": 11.485575028828212, + "rewards/rejected": -8.738480159214564, + "step": 6937 + }, + { + "epoch": 0.6338967565098218, + "grad_norm": 2.78125, + "kl": 0.0, + "learning_rate": 2.969163086108977e-06, + "logits/chosen": 528585429.3333333, + "logits/rejected": 429541888.0, + "logps/chosen": -420.8430989583333, + "logps/rejected": -586.1804809570312, + "loss": 0.0154, + "rewards/chosen": 4.243897438049316, + "rewards/margins": 13.712654113769531, + "rewards/rejected": -9.468756675720215, + "step": 6938 + }, + { + "epoch": 0.6339881224303335, + "grad_norm": 3.03125, + "kl": 0.0, + "learning_rate": 2.9678493104722684e-06, + "logits/chosen": 649868544.0, + "logits/rejected": 844255914.6666666, + "logps/chosen": -275.66650390625, + "logps/rejected": -503.110595703125, + "loss": 0.022, + "rewards/chosen": 3.7895248413085936, + "rewards/margins": 15.994647471110024, + "rewards/rejected": -12.205122629801432, + "step": 6939 + }, + { + "epoch": 0.6340794883508452, + "grad_norm": 0.8203125, + "kl": 0.0, + "learning_rate": 2.966535702875592e-06, + "logits/chosen": 867871744.0, + "logits/rejected": 720342016.0, + "logps/chosen": -246.3292032877604, + "logps/rejected": -556.1431640625, + "loss": 0.0058, + "rewards/chosen": 4.376593907674153, + "rewards/margins": 13.726809056599933, + "rewards/rejected": -9.35021514892578, + "step": 6940 + }, + { + "epoch": 0.6341708542713568, + "grad_norm": 2.0, + "kl": 0.0, + "learning_rate": 2.965222263427574e-06, + "logits/chosen": 501590080.0, + "logits/rejected": 560794240.0, + "logps/chosen": -185.91973876953125, + "logps/rejected": -680.2720947265625, + "loss": 0.0103, + "rewards/chosen": 4.627324104309082, + "rewards/margins": 15.591588020324707, + "rewards/rejected": -10.964263916015625, + "step": 6941 + }, + { + "epoch": 0.6342622201918684, + "grad_norm": 6.1875, + "kl": 0.0, + "learning_rate": 2.9639089922368204e-06, + "logits/chosen": 375037760.0, + "logits/rejected": 393797120.0, + "logps/chosen": -298.03375244140625, + "logps/rejected": -311.1629638671875, + "loss": 0.0397, + "rewards/chosen": 3.375918388366699, + "rewards/margins": 11.635778427124023, + "rewards/rejected": -8.259860038757324, + "step": 6942 + }, + { + "epoch": 0.6343535861123801, + "grad_norm": 1.4453125, + "kl": 0.0, + "learning_rate": 2.9625958894119312e-06, + "logits/chosen": 362656614.4, + "logits/rejected": 428778581.3333333, + "logps/chosen": -287.44912109375, + "logps/rejected": -513.326416015625, + "loss": 0.0099, + "rewards/chosen": 4.513019561767578, + "rewards/margins": 14.032028834025065, + "rewards/rejected": -9.519009272257486, + "step": 6943 + }, + { + "epoch": 0.6344449520328918, + "grad_norm": 66.0, + "kl": 0.0, + "learning_rate": 2.9612829550614836e-06, + "logits/chosen": 488521898.6666667, + "logits/rejected": 527441408.0, + "logps/chosen": -360.2418212890625, + "logps/rejected": -542.850537109375, + "loss": 0.0421, + "rewards/chosen": 4.119459788004558, + "rewards/margins": 12.177787272135419, + "rewards/rejected": -8.05832748413086, + "step": 6944 + }, + { + "epoch": 0.6345363179534034, + "grad_norm": 35.5, + "kl": 0.0, + "learning_rate": 2.9599701892940456e-06, + "logits/chosen": 456367744.0, + "logits/rejected": 594626944.0, + "logps/chosen": -335.22686767578125, + "logps/rejected": -355.4921875, + "loss": 0.0759, + "rewards/chosen": 3.1220345497131348, + "rewards/margins": 10.020749568939209, + "rewards/rejected": -6.898715019226074, + "step": 6945 + }, + { + "epoch": 0.634627683873915, + "grad_norm": 0.248046875, + "kl": 0.0, + "learning_rate": 2.9586575922181724e-06, + "logits/chosen": 244279765.33333334, + "logits/rejected": 439342796.8, + "logps/chosen": -286.1756184895833, + "logps/rejected": -576.5544921875, + "loss": 0.0014, + "rewards/chosen": 5.758598327636719, + "rewards/margins": 14.889924621582031, + "rewards/rejected": -9.131326293945312, + "step": 6946 + }, + { + "epoch": 0.6347190497944267, + "grad_norm": 0.2177734375, + "kl": 0.0, + "learning_rate": 2.957345163942402e-06, + "logits/chosen": 269128832.0, + "logits/rejected": 737486262.8571428, + "logps/chosen": -156.88429260253906, + "logps/rejected": -497.2781459263393, + "loss": 0.0009, + "rewards/chosen": 6.0336456298828125, + "rewards/margins": 15.093597412109375, + "rewards/rejected": -9.059951782226562, + "step": 6947 + }, + { + "epoch": 0.6348104157149383, + "grad_norm": 2.03125, + "kl": 0.0, + "learning_rate": 2.956032904575262e-06, + "logits/chosen": 720671317.3333334, + "logits/rejected": 416672102.4, + "logps/chosen": -287.9299723307292, + "logps/rejected": -619.44453125, + "loss": 0.0114, + "rewards/chosen": 3.940655072530111, + "rewards/margins": 13.80433381398519, + "rewards/rejected": -9.863678741455079, + "step": 6948 + }, + { + "epoch": 0.63490178163545, + "grad_norm": 3.34375, + "kl": 0.0, + "learning_rate": 2.954720814225263e-06, + "logits/chosen": 304895296.0, + "logits/rejected": 343463456.0, + "logps/chosen": -235.81277465820312, + "logps/rejected": -359.3511657714844, + "loss": 0.0197, + "rewards/chosen": 3.6700258255004883, + "rewards/margins": 11.883557319641113, + "rewards/rejected": -8.213531494140625, + "step": 6949 + }, + { + "epoch": 0.6349931475559616, + "grad_norm": 3.046875, + "kl": 0.0, + "learning_rate": 2.953408893000902e-06, + "logits/chosen": 650703411.2, + "logits/rejected": 386802090.6666667, + "logps/chosen": -293.0904296875, + "logps/rejected": -610.1457112630209, + "loss": 0.0255, + "rewards/chosen": 3.464122772216797, + "rewards/margins": 16.405335744222008, + "rewards/rejected": -12.941212972005209, + "step": 6950 + }, + { + "epoch": 0.6350845134764733, + "grad_norm": 2.078125, + "kl": 0.0, + "learning_rate": 2.9520971410106643e-06, + "logits/chosen": 502266726.4, + "logits/rejected": 311305600.0, + "logps/chosen": -405.053564453125, + "logps/rejected": -345.36865234375, + "loss": 0.0145, + "rewards/chosen": 3.786956787109375, + "rewards/margins": 13.682514190673828, + "rewards/rejected": -9.895557403564453, + "step": 6951 + }, + { + "epoch": 0.6351758793969849, + "grad_norm": 3.0, + "kl": 0.0, + "learning_rate": 2.9507855583630173e-06, + "logits/chosen": 566487680.0, + "logits/rejected": 499827264.0, + "logps/chosen": -288.27203369140625, + "logps/rejected": -553.424072265625, + "loss": 0.0197, + "rewards/chosen": 3.5525283813476562, + "rewards/margins": 13.81346321105957, + "rewards/rejected": -10.260934829711914, + "step": 6952 + }, + { + "epoch": 0.6352672453174966, + "grad_norm": 3.390625, + "kl": 0.0, + "learning_rate": 2.94947414516642e-06, + "logits/chosen": 469100032.0, + "logits/rejected": 430256640.0, + "logps/chosen": -255.4630126953125, + "logps/rejected": -432.5447265625, + "loss": 0.0213, + "rewards/chosen": 2.910832722981771, + "rewards/margins": 10.167152150472006, + "rewards/rejected": -7.256319427490235, + "step": 6953 + }, + { + "epoch": 0.6353586112380082, + "grad_norm": 4.25, + "kl": 0.0, + "learning_rate": 2.9481629015293105e-06, + "logits/chosen": 848938154.6666666, + "logits/rejected": 601342848.0, + "logps/chosen": -289.9012044270833, + "logps/rejected": -511.80352783203125, + "loss": 0.0329, + "rewards/chosen": 3.63411553700765, + "rewards/margins": 12.581124623616537, + "rewards/rejected": -8.947009086608887, + "step": 6954 + }, + { + "epoch": 0.6354499771585199, + "grad_norm": 1.6328125, + "kl": 0.0, + "learning_rate": 2.94685182756012e-06, + "logits/chosen": 785256960.0, + "logits/rejected": 576741939.2, + "logps/chosen": -225.71038818359375, + "logps/rejected": -416.957421875, + "loss": 0.0117, + "rewards/chosen": 3.929175059000651, + "rewards/margins": 11.20267817179362, + "rewards/rejected": -7.273503112792969, + "step": 6955 + }, + { + "epoch": 0.6355413430790315, + "grad_norm": 4.1875, + "kl": 0.0, + "learning_rate": 2.9455409233672594e-06, + "logits/chosen": 594567552.0, + "logits/rejected": 625522688.0, + "logps/chosen": -335.2304992675781, + "logps/rejected": -793.385986328125, + "loss": 0.0248, + "rewards/chosen": 3.0688273906707764, + "rewards/margins": 15.35221552848816, + "rewards/rejected": -12.283388137817383, + "step": 6956 + }, + { + "epoch": 0.6356327089995432, + "grad_norm": 3.078125, + "kl": 0.0, + "learning_rate": 2.9442301890591308e-06, + "logits/chosen": 582113728.0, + "logits/rejected": 740077760.0, + "logps/chosen": -385.4934387207031, + "logps/rejected": -439.9463806152344, + "loss": 0.0136, + "rewards/chosen": 4.345122337341309, + "rewards/margins": 13.922894477844238, + "rewards/rejected": -9.57777214050293, + "step": 6957 + }, + { + "epoch": 0.6357240749200548, + "grad_norm": 1.8984375, + "kl": 0.0, + "learning_rate": 2.9429196247441166e-06, + "logits/chosen": 578268672.0, + "logits/rejected": 600009600.0, + "logps/chosen": -291.0002136230469, + "logps/rejected": -619.4656372070312, + "loss": 0.0125, + "rewards/chosen": 3.8148436546325684, + "rewards/margins": 14.066089153289795, + "rewards/rejected": -10.251245498657227, + "step": 6958 + }, + { + "epoch": 0.6358154408405665, + "grad_norm": 2.71875, + "kl": 0.0, + "learning_rate": 2.9416092305305918e-06, + "logits/chosen": 934180672.0, + "logits/rejected": 419505024.0, + "logps/chosen": -474.777587890625, + "logps/rejected": -320.094970703125, + "loss": 0.0154, + "rewards/chosen": 4.180947303771973, + "rewards/margins": 12.52461051940918, + "rewards/rejected": -8.343663215637207, + "step": 6959 + }, + { + "epoch": 0.6359068067610781, + "grad_norm": 3.375, + "kl": 0.0, + "learning_rate": 2.940299006526912e-06, + "logits/chosen": 563850197.3333334, + "logits/rejected": 675057728.0, + "logps/chosen": -609.705322265625, + "logps/rejected": -672.2564697265625, + "loss": 0.0232, + "rewards/chosen": 3.737293561299642, + "rewards/margins": 12.764530499776205, + "rewards/rejected": -9.027236938476562, + "step": 6960 + }, + { + "epoch": 0.6359981726815898, + "grad_norm": 0.251953125, + "kl": 0.0, + "learning_rate": 2.9389889528414195e-06, + "logits/chosen": 311743776.0, + "logits/rejected": 671077205.3333334, + "logps/chosen": -330.013427734375, + "logps/rejected": -538.3291829427084, + "loss": 0.0012, + "rewards/chosen": 5.469592094421387, + "rewards/margins": 15.293072064717611, + "rewards/rejected": -9.823479970296225, + "step": 6961 + }, + { + "epoch": 0.6360895386021014, + "grad_norm": 0.6953125, + "kl": 0.0, + "learning_rate": 2.9376790695824466e-06, + "logits/chosen": 345932842.6666667, + "logits/rejected": 553842688.0, + "logps/chosen": -313.1724446614583, + "logps/rejected": -661.2763671875, + "loss": 0.0031, + "rewards/chosen": 5.084828058878581, + "rewards/margins": 14.805287043253582, + "rewards/rejected": -9.720458984375, + "step": 6962 + }, + { + "epoch": 0.6361809045226131, + "grad_norm": 2.34375, + "kl": 0.0, + "learning_rate": 2.936369356858305e-06, + "logits/chosen": 490477363.2, + "logits/rejected": 420885504.0, + "logps/chosen": -271.4376953125, + "logps/rejected": -842.7954915364584, + "loss": 0.017, + "rewards/chosen": 3.9206779479980467, + "rewards/margins": 15.858980305989583, + "rewards/rejected": -11.938302357991537, + "step": 6963 + }, + { + "epoch": 0.6362722704431247, + "grad_norm": 4.125, + "kl": 0.0, + "learning_rate": 2.935059814777299e-06, + "logits/chosen": 716258176.0, + "logits/rejected": 649930816.0, + "logps/chosen": -323.75079345703125, + "logps/rejected": -441.28961181640625, + "loss": 0.0262, + "rewards/chosen": 3.24961519241333, + "rewards/margins": 12.816936016082764, + "rewards/rejected": -9.567320823669434, + "step": 6964 + }, + { + "epoch": 0.6363636363636364, + "grad_norm": 0.56640625, + "kl": 0.0, + "learning_rate": 2.9337504434477138e-06, + "logits/chosen": 361975040.0, + "logits/rejected": 397026764.8, + "logps/chosen": -379.9214680989583, + "logps/rejected": -467.896484375, + "loss": 0.0027, + "rewards/chosen": 5.372843424479167, + "rewards/margins": 14.482096354166668, + "rewards/rejected": -9.1092529296875, + "step": 6965 + }, + { + "epoch": 0.636455002284148, + "grad_norm": 0.32421875, + "kl": 0.0, + "learning_rate": 2.9324412429778224e-06, + "logits/chosen": 343744160.0, + "logits/rejected": 563936365.7142857, + "logps/chosen": -163.16122436523438, + "logps/rejected": -620.1729213169643, + "loss": 0.0012, + "rewards/chosen": 4.8219146728515625, + "rewards/margins": 14.565758841378349, + "rewards/rejected": -9.743844168526786, + "step": 6966 + }, + { + "epoch": 0.6365463682046597, + "grad_norm": 1.28125, + "kl": 0.0, + "learning_rate": 2.931132213475884e-06, + "logits/chosen": 533068117.3333333, + "logits/rejected": 516345907.2, + "logps/chosen": -323.8412679036458, + "logps/rejected": -604.515380859375, + "loss": 0.0091, + "rewards/chosen": 3.958347956339518, + "rewards/margins": 13.755371729532877, + "rewards/rejected": -9.79702377319336, + "step": 6967 + }, + { + "epoch": 0.6366377341251713, + "grad_norm": 24.25, + "kl": 0.0, + "learning_rate": 2.9298233550501435e-06, + "logits/chosen": 666864192.0, + "logits/rejected": 613431210.6666666, + "logps/chosen": -425.64984130859375, + "logps/rejected": -474.7485758463542, + "loss": 0.0359, + "rewards/chosen": 2.4689531326293945, + "rewards/margins": 11.035660107930502, + "rewards/rejected": -8.566706975301107, + "step": 6968 + }, + { + "epoch": 0.636729100045683, + "grad_norm": 1.375, + "kl": 0.0, + "learning_rate": 2.9285146678088304e-06, + "logits/chosen": 439552800.0, + "logits/rejected": 468557440.0, + "logps/chosen": -402.70477294921875, + "logps/rejected": -548.670166015625, + "loss": 0.0088, + "rewards/chosen": 4.1238579750061035, + "rewards/margins": 14.464787006378174, + "rewards/rejected": -10.34092903137207, + "step": 6969 + }, + { + "epoch": 0.6368204659661946, + "grad_norm": 1.3515625, + "kl": 0.0, + "learning_rate": 2.927206151860164e-06, + "logits/chosen": 479310016.0, + "logits/rejected": 649361216.0, + "logps/chosen": -283.5713806152344, + "logps/rejected": -454.774658203125, + "loss": 0.0119, + "rewards/chosen": 3.8962881565093994, + "rewards/margins": 11.859846353530884, + "rewards/rejected": -7.963558197021484, + "step": 6970 + }, + { + "epoch": 0.6369118318867063, + "grad_norm": 2.15625, + "kl": 0.0, + "learning_rate": 2.9258978073123413e-06, + "logits/chosen": 432397397.3333333, + "logits/rejected": 451070361.6, + "logps/chosen": -340.662353515625, + "logps/rejected": -473.7513671875, + "loss": 0.0161, + "rewards/chosen": 3.3793557484944663, + "rewards/margins": 12.021502049763997, + "rewards/rejected": -8.64214630126953, + "step": 6971 + }, + { + "epoch": 0.6370031978072179, + "grad_norm": 4.46875, + "kl": 0.0, + "learning_rate": 2.924589634273554e-06, + "logits/chosen": 980239616.0, + "logits/rejected": 320072192.0, + "logps/chosen": -333.41404215494794, + "logps/rejected": -283.44793701171875, + "loss": 0.0334, + "rewards/chosen": 3.3231945037841797, + "rewards/margins": 10.726526260375977, + "rewards/rejected": -7.403331756591797, + "step": 6972 + }, + { + "epoch": 0.6370945637277295, + "grad_norm": 2.53125, + "kl": 0.0, + "learning_rate": 2.9232816328519736e-06, + "logits/chosen": 560330965.3333334, + "logits/rejected": 588810752.0, + "logps/chosen": -469.5496419270833, + "logps/rejected": -409.202490234375, + "loss": 0.0132, + "rewards/chosen": 3.5071239471435547, + "rewards/margins": 13.727602767944337, + "rewards/rejected": -10.220478820800782, + "step": 6973 + }, + { + "epoch": 0.6371859296482412, + "grad_norm": 2.1875, + "kl": 0.0, + "learning_rate": 2.9219738031557622e-06, + "logits/chosen": 305365824.0, + "logits/rejected": 286605158.4, + "logps/chosen": -259.7086588541667, + "logps/rejected": -422.1900390625, + "loss": 0.0104, + "rewards/chosen": 4.438002268473308, + "rewards/margins": 14.813939921061198, + "rewards/rejected": -10.37593765258789, + "step": 6974 + }, + { + "epoch": 0.6372772955687529, + "grad_norm": 1.65625, + "kl": 0.0, + "learning_rate": 2.920666145293063e-06, + "logits/chosen": 901742080.0, + "logits/rejected": 575189418.6666666, + "logps/chosen": -395.443994140625, + "logps/rejected": -821.0808919270834, + "loss": 0.0108, + "rewards/chosen": 4.302595520019532, + "rewards/margins": 19.143568420410155, + "rewards/rejected": -14.840972900390625, + "step": 6975 + }, + { + "epoch": 0.6373686614892645, + "grad_norm": 3.796875, + "kl": 0.0, + "learning_rate": 2.9193586593720093e-06, + "logits/chosen": 944296960.0, + "logits/rejected": 529259520.0, + "logps/chosen": -352.49261474609375, + "logps/rejected": -461.90155029296875, + "loss": 0.0187, + "rewards/chosen": 3.7466468811035156, + "rewards/margins": 13.255024909973145, + "rewards/rejected": -9.508378028869629, + "step": 6976 + }, + { + "epoch": 0.6374600274097761, + "grad_norm": 3.09375, + "kl": 0.0, + "learning_rate": 2.918051345500716e-06, + "logits/chosen": 664400832.0, + "logits/rejected": 528014880.0, + "logps/chosen": -342.70501708984375, + "logps/rejected": -463.02294921875, + "loss": 0.0198, + "rewards/chosen": 3.4260029792785645, + "rewards/margins": 10.03955364227295, + "rewards/rejected": -6.613550662994385, + "step": 6977 + }, + { + "epoch": 0.6375513933302878, + "grad_norm": 1.15625, + "kl": 0.0, + "learning_rate": 2.9167442037872853e-06, + "logits/chosen": 421033472.0, + "logits/rejected": 632784469.3333334, + "logps/chosen": -238.0616943359375, + "logps/rejected": -554.3590494791666, + "loss": 0.0092, + "rewards/chosen": 4.600603485107422, + "rewards/margins": 12.706400934855143, + "rewards/rejected": -8.10579744974772, + "step": 6978 + }, + { + "epoch": 0.6376427592507995, + "grad_norm": 2.03125, + "kl": 0.0, + "learning_rate": 2.915437234339809e-06, + "logits/chosen": 705386598.4, + "logits/rejected": 238352512.0, + "logps/chosen": -236.824267578125, + "logps/rejected": -302.0129801432292, + "loss": 0.1112, + "rewards/chosen": 3.736310577392578, + "rewards/margins": 12.10327173868815, + "rewards/rejected": -8.366961161295572, + "step": 6979 + }, + { + "epoch": 0.6377341251713111, + "grad_norm": 2.296875, + "kl": 0.0, + "learning_rate": 2.9141304372663596e-06, + "logits/chosen": 518546380.8, + "logits/rejected": 581760000.0, + "logps/chosen": -303.7774658203125, + "logps/rejected": -521.27001953125, + "loss": 0.0147, + "rewards/chosen": 4.388481140136719, + "rewards/margins": 13.551953633626303, + "rewards/rejected": -9.163472493489584, + "step": 6980 + }, + { + "epoch": 0.6378254910918227, + "grad_norm": 48.5, + "kl": 0.0, + "learning_rate": 2.9128238126749974e-06, + "logits/chosen": 759760320.0, + "logits/rejected": 514700653.71428573, + "logps/chosen": -303.0213623046875, + "logps/rejected": -638.8182198660714, + "loss": 0.0903, + "rewards/chosen": -0.944866955280304, + "rewards/margins": 7.443915545940399, + "rewards/rejected": -8.388782501220703, + "step": 6981 + }, + { + "epoch": 0.6379168570123344, + "grad_norm": 2.046875, + "kl": 0.0, + "learning_rate": 2.911517360673767e-06, + "logits/chosen": 459956121.6, + "logits/rejected": 503674880.0, + "logps/chosen": -353.02158203125, + "logps/rejected": -380.3446451822917, + "loss": 0.0154, + "rewards/chosen": 4.1262565612792965, + "rewards/margins": 10.778038914998373, + "rewards/rejected": -6.651782353719075, + "step": 6982 + }, + { + "epoch": 0.6380082229328461, + "grad_norm": 1.1640625, + "kl": 0.0, + "learning_rate": 2.9102110813706997e-06, + "logits/chosen": 447599488.0, + "logits/rejected": 566331093.3333334, + "logps/chosen": -423.3115539550781, + "logps/rejected": -506.1698811848958, + "loss": 0.0051, + "rewards/chosen": 3.9465227127075195, + "rewards/margins": 14.54191811879476, + "rewards/rejected": -10.59539540608724, + "step": 6983 + }, + { + "epoch": 0.6380995888533577, + "grad_norm": 4.78125, + "kl": 0.0, + "learning_rate": 2.9089049748738147e-06, + "logits/chosen": 401661363.2, + "logits/rejected": 346065450.6666667, + "logps/chosen": -237.869775390625, + "logps/rejected": -443.6813151041667, + "loss": 0.0348, + "rewards/chosen": 3.3790924072265627, + "rewards/margins": 14.342578125, + "rewards/rejected": -10.963485717773438, + "step": 6984 + }, + { + "epoch": 0.6381909547738693, + "grad_norm": 3.984375, + "kl": 0.0, + "learning_rate": 2.9075990412911143e-06, + "logits/chosen": 670938752.0, + "logits/rejected": 560266496.0, + "logps/chosen": -470.5234680175781, + "logps/rejected": -569.400390625, + "loss": 0.0214, + "rewards/chosen": 3.2568490505218506, + "rewards/margins": 13.525272130966187, + "rewards/rejected": -10.268423080444336, + "step": 6985 + }, + { + "epoch": 0.638282320694381, + "grad_norm": 4.96875, + "kl": 0.0, + "learning_rate": 2.9062932807305865e-06, + "logits/chosen": 597330432.0, + "logits/rejected": 481892288.0, + "logps/chosen": -381.7991943359375, + "logps/rejected": -463.6534423828125, + "loss": 0.0213, + "rewards/chosen": 3.5072829723358154, + "rewards/margins": 12.512160062789917, + "rewards/rejected": -9.004877090454102, + "step": 6986 + }, + { + "epoch": 0.6383736866148927, + "grad_norm": 3.546875, + "kl": 0.0, + "learning_rate": 2.904987693300204e-06, + "logits/chosen": 397741670.4, + "logits/rejected": 352641621.3333333, + "logps/chosen": -206.5726318359375, + "logps/rejected": -415.6311442057292, + "loss": 0.0276, + "rewards/chosen": 3.5651695251464846, + "rewards/margins": 12.462818654378257, + "rewards/rejected": -8.897649129231771, + "step": 6987 + }, + { + "epoch": 0.6384650525354043, + "grad_norm": 20.0, + "kl": 0.0, + "learning_rate": 2.90368227910793e-06, + "logits/chosen": 497446304.0, + "logits/rejected": 640938944.0, + "logps/chosen": -225.24957275390625, + "logps/rejected": -469.597412109375, + "loss": 0.2427, + "rewards/chosen": 1.3306671380996704, + "rewards/margins": 7.5161052942276, + "rewards/rejected": -6.18543815612793, + "step": 6988 + }, + { + "epoch": 0.6385564184559159, + "grad_norm": 0.142578125, + "kl": 0.0, + "learning_rate": 2.902377038261709e-06, + "logits/chosen": 367356672.0, + "logits/rejected": 630735945.1428572, + "logps/chosen": -260.0584411621094, + "logps/rejected": -520.6290806361607, + "loss": 0.0006, + "rewards/chosen": 5.707126140594482, + "rewards/margins": 15.480948380061559, + "rewards/rejected": -9.773822239467076, + "step": 6989 + }, + { + "epoch": 0.6386477843764276, + "grad_norm": 2.296875, + "kl": 0.0, + "learning_rate": 2.9010719708694724e-06, + "logits/chosen": 588605610.6666666, + "logits/rejected": 967769024.0, + "logps/chosen": -325.74900309244794, + "logps/rejected": -544.9119873046875, + "loss": 0.0208, + "rewards/chosen": 3.7039337158203125, + "rewards/margins": 13.15953540802002, + "rewards/rejected": -9.455601692199707, + "step": 6990 + }, + { + "epoch": 0.6387391502969393, + "grad_norm": 2.125, + "kl": 0.0, + "learning_rate": 2.8997670770391364e-06, + "logits/chosen": 801000384.0, + "logits/rejected": 566657664.0, + "logps/chosen": -321.76727294921875, + "logps/rejected": -752.229248046875, + "loss": 0.0143, + "rewards/chosen": 3.121487617492676, + "rewards/margins": 12.677739779154459, + "rewards/rejected": -9.556252161661783, + "step": 6991 + }, + { + "epoch": 0.6388305162174509, + "grad_norm": 6.5625, + "kl": 0.0, + "learning_rate": 2.898462356878602e-06, + "logits/chosen": 1078689536.0, + "logits/rejected": 506135872.0, + "logps/chosen": -297.6502685546875, + "logps/rejected": -340.2413635253906, + "loss": 0.1328, + "rewards/chosen": 1.885239839553833, + "rewards/margins": 9.920043230056763, + "rewards/rejected": -8.03480339050293, + "step": 6992 + }, + { + "epoch": 0.6389218821379625, + "grad_norm": 4.09375, + "kl": 0.0, + "learning_rate": 2.897157810495762e-06, + "logits/chosen": 524219712.0, + "logits/rejected": 784500416.0, + "logps/chosen": -305.90167236328125, + "logps/rejected": -480.1982116699219, + "loss": 0.0232, + "rewards/chosen": 3.798828601837158, + "rewards/margins": 13.187724590301514, + "rewards/rejected": -9.388895988464355, + "step": 6993 + }, + { + "epoch": 0.6390132480584741, + "grad_norm": 0.859375, + "kl": 0.0, + "learning_rate": 2.895853437998487e-06, + "logits/chosen": 504015786.6666667, + "logits/rejected": 477546547.2, + "logps/chosen": -337.2021484375, + "logps/rejected": -596.27685546875, + "loss": 0.0049, + "rewards/chosen": 4.866240501403809, + "rewards/margins": 15.613712882995605, + "rewards/rejected": -10.747472381591797, + "step": 6994 + }, + { + "epoch": 0.6391046139789859, + "grad_norm": 3.09375, + "kl": 0.0, + "learning_rate": 2.8945492394946384e-06, + "logits/chosen": 448210944.0, + "logits/rejected": 435195584.0, + "logps/chosen": -233.7318115234375, + "logps/rejected": -619.628662109375, + "loss": 0.0238, + "rewards/chosen": 3.042921543121338, + "rewards/margins": 14.667728900909424, + "rewards/rejected": -11.624807357788086, + "step": 6995 + }, + { + "epoch": 0.6391959798994975, + "grad_norm": 1.9296875, + "kl": 0.0, + "learning_rate": 2.8932452150920576e-06, + "logits/chosen": 856549290.6666666, + "logits/rejected": 822976051.2, + "logps/chosen": -485.6385498046875, + "logps/rejected": -748.3353515625, + "loss": 0.0073, + "rewards/chosen": 4.243149757385254, + "rewards/margins": 17.34983310699463, + "rewards/rejected": -13.106683349609375, + "step": 6996 + }, + { + "epoch": 0.6392873458200091, + "grad_norm": 2.921875, + "kl": 0.0, + "learning_rate": 2.8919413648985816e-06, + "logits/chosen": 566862464.0, + "logits/rejected": 567311232.0, + "logps/chosen": -373.17645263671875, + "logps/rejected": -787.54833984375, + "loss": 0.027, + "rewards/chosen": 3.688490867614746, + "rewards/margins": 19.205248832702637, + "rewards/rejected": -15.51675796508789, + "step": 6997 + }, + { + "epoch": 0.6393787117405207, + "grad_norm": 3.75, + "kl": 0.0, + "learning_rate": 2.8906376890220217e-06, + "logits/chosen": 439956224.0, + "logits/rejected": 729763264.0, + "logps/chosen": -400.5897216796875, + "logps/rejected": -743.2839965820312, + "loss": 0.1364, + "rewards/chosen": 2.840948740641276, + "rewards/margins": 13.649663607279459, + "rewards/rejected": -10.808714866638184, + "step": 6998 + }, + { + "epoch": 0.6394700776610325, + "grad_norm": 1.53125, + "kl": 0.0, + "learning_rate": 2.889334187570179e-06, + "logits/chosen": 619184853.3333334, + "logits/rejected": 710011340.8, + "logps/chosen": -278.13938395182294, + "logps/rejected": -326.4183837890625, + "loss": 0.0077, + "rewards/chosen": 4.394898096720378, + "rewards/margins": 12.93413912455241, + "rewards/rejected": -8.53924102783203, + "step": 6999 + }, + { + "epoch": 0.6395614435815441, + "grad_norm": 4.28125, + "kl": 0.0, + "learning_rate": 2.8880308606508456e-06, + "logits/chosen": 278133632.0, + "logits/rejected": 448293312.0, + "logps/chosen": -311.6865234375, + "logps/rejected": -519.896484375, + "loss": 0.0242, + "rewards/chosen": 3.503757953643799, + "rewards/margins": 13.606700420379639, + "rewards/rejected": -10.10294246673584, + "step": 7000 + }, + { + "epoch": 0.6396528095020557, + "grad_norm": 2.171875, + "kl": 0.0, + "learning_rate": 2.8867277083717915e-06, + "logits/chosen": 458633113.6, + "logits/rejected": 467637802.6666667, + "logps/chosen": -186.2625732421875, + "logps/rejected": -597.6458740234375, + "loss": 0.0221, + "rewards/chosen": 4.25854377746582, + "rewards/margins": 15.948776372273763, + "rewards/rejected": -11.690232594807943, + "step": 7001 + }, + { + "epoch": 0.6397441754225673, + "grad_norm": 3.96875, + "kl": 0.0, + "learning_rate": 2.885424730840777e-06, + "logits/chosen": 492723419.4285714, + "logits/rejected": 434366656.0, + "logps/chosen": -340.12765066964283, + "logps/rejected": -492.54095458984375, + "loss": 0.031, + "rewards/chosen": 3.6891070774623325, + "rewards/margins": 12.626250403267996, + "rewards/rejected": -8.937143325805664, + "step": 7002 + }, + { + "epoch": 0.6398355413430791, + "grad_norm": 4.1875, + "kl": 0.0, + "learning_rate": 2.884121928165544e-06, + "logits/chosen": 514722336.0, + "logits/rejected": 618371072.0, + "logps/chosen": -327.3511657714844, + "logps/rejected": -619.69482421875, + "loss": 0.0228, + "rewards/chosen": 3.3503317832946777, + "rewards/margins": 12.300567150115967, + "rewards/rejected": -8.950235366821289, + "step": 7003 + }, + { + "epoch": 0.6399269072635907, + "grad_norm": 1.875, + "kl": 0.0, + "learning_rate": 2.8828193004538228e-06, + "logits/chosen": 524490581.3333333, + "logits/rejected": 462067302.4, + "logps/chosen": -439.7828776041667, + "logps/rejected": -500.14541015625, + "loss": 0.0105, + "rewards/chosen": 3.604466756184896, + "rewards/margins": 14.70906728108724, + "rewards/rejected": -11.104600524902343, + "step": 7004 + }, + { + "epoch": 0.6400182731841023, + "grad_norm": 2.5625, + "kl": 0.0, + "learning_rate": 2.8815168478133304e-06, + "logits/chosen": 892288819.2, + "logits/rejected": 584855850.6666666, + "logps/chosen": -350.55126953125, + "logps/rejected": -752.2125651041666, + "loss": 0.018, + "rewards/chosen": 3.5709083557128904, + "rewards/margins": 18.46167449951172, + "rewards/rejected": -14.890766143798828, + "step": 7005 + }, + { + "epoch": 0.6401096391046139, + "grad_norm": 6.875, + "kl": 0.0, + "learning_rate": 2.880214570351767e-06, + "logits/chosen": 483678592.0, + "logits/rejected": 566884966.4, + "logps/chosen": -202.39945475260416, + "logps/rejected": -464.596484375, + "loss": 0.0301, + "rewards/chosen": 2.534844398498535, + "rewards/margins": 11.800693702697753, + "rewards/rejected": -9.265849304199218, + "step": 7006 + }, + { + "epoch": 0.6402010050251257, + "grad_norm": 3.828125, + "kl": 0.0, + "learning_rate": 2.878912468176818e-06, + "logits/chosen": 623725248.0, + "logits/rejected": 598171648.0, + "logps/chosen": -314.72296142578125, + "logps/rejected": -359.01043701171875, + "loss": 0.0264, + "rewards/chosen": 2.9212288856506348, + "rewards/margins": 11.036348819732666, + "rewards/rejected": -8.115119934082031, + "step": 7007 + }, + { + "epoch": 0.6402923709456373, + "grad_norm": 2.828125, + "kl": 0.0, + "learning_rate": 2.877610541396154e-06, + "logits/chosen": 580092672.0, + "logits/rejected": 444268748.8, + "logps/chosen": -358.9051106770833, + "logps/rejected": -445.39873046875, + "loss": 0.0141, + "rewards/chosen": 3.6259237925211587, + "rewards/margins": 13.041422907511393, + "rewards/rejected": -9.415499114990235, + "step": 7008 + }, + { + "epoch": 0.6403837368661489, + "grad_norm": 15.1875, + "kl": 0.0, + "learning_rate": 2.876308790117436e-06, + "logits/chosen": 600670400.0, + "logits/rejected": 767579328.0, + "logps/chosen": -209.02105712890625, + "logps/rejected": -468.9075012207031, + "loss": 0.0803, + "rewards/chosen": 3.198202610015869, + "rewards/margins": 9.98521089553833, + "rewards/rejected": -6.787008285522461, + "step": 7009 + }, + { + "epoch": 0.6404751027866605, + "grad_norm": 3.796875, + "kl": 0.0, + "learning_rate": 2.875007214448304e-06, + "logits/chosen": 429076650.6666667, + "logits/rejected": 350859366.4, + "logps/chosen": -196.7957560221354, + "logps/rejected": -428.50712890625, + "loss": 0.0313, + "rewards/chosen": 2.6787856419881186, + "rewards/margins": 12.541567166646322, + "rewards/rejected": -9.862781524658203, + "step": 7010 + }, + { + "epoch": 0.6405664687071723, + "grad_norm": 2.296875, + "kl": 0.0, + "learning_rate": 2.873705814496387e-06, + "logits/chosen": 418568000.0, + "logits/rejected": 433748949.3333333, + "logps/chosen": -309.5379333496094, + "logps/rejected": -490.8784993489583, + "loss": 0.0115, + "rewards/chosen": 3.092771053314209, + "rewards/margins": 13.234532197316488, + "rewards/rejected": -10.14176114400228, + "step": 7011 + }, + { + "epoch": 0.6406578346276839, + "grad_norm": 2.546875, + "kl": 0.0, + "learning_rate": 2.872404590369299e-06, + "logits/chosen": 631491754.6666666, + "logits/rejected": 300781568.0, + "logps/chosen": -364.8311360677083, + "logps/rejected": -324.535546875, + "loss": 0.0133, + "rewards/chosen": 3.406989415486654, + "rewards/margins": 11.843042882283529, + "rewards/rejected": -8.436053466796874, + "step": 7012 + }, + { + "epoch": 0.6407492005481955, + "grad_norm": 4.46875, + "kl": 0.0, + "learning_rate": 2.871103542174637e-06, + "logits/chosen": 506792320.0, + "logits/rejected": 386374016.0, + "logps/chosen": -243.832275390625, + "logps/rejected": -563.5751953125, + "loss": 0.0409, + "rewards/chosen": 3.251471201578776, + "rewards/margins": 13.554448763529459, + "rewards/rejected": -10.302977561950684, + "step": 7013 + }, + { + "epoch": 0.6408405664687071, + "grad_norm": 4.40625, + "kl": 0.0, + "learning_rate": 2.869802670019989e-06, + "logits/chosen": 613323059.2, + "logits/rejected": 515321173.3333333, + "logps/chosen": -301.190869140625, + "logps/rejected": -492.6529947916667, + "loss": 0.0161, + "rewards/chosen": 4.67407341003418, + "rewards/margins": 13.942759068806968, + "rewards/rejected": -9.268685658772787, + "step": 7014 + }, + { + "epoch": 0.6409319323892189, + "grad_norm": 3.46875, + "kl": 0.0, + "learning_rate": 2.8685019740129227e-06, + "logits/chosen": 613242953.1428572, + "logits/rejected": 477038144.0, + "logps/chosen": -305.47279575892856, + "logps/rejected": -658.11181640625, + "loss": 0.0314, + "rewards/chosen": 3.868194035121373, + "rewards/margins": 11.097564152308873, + "rewards/rejected": -7.2293701171875, + "step": 7015 + }, + { + "epoch": 0.6410232983097305, + "grad_norm": 3.578125, + "kl": 0.0, + "learning_rate": 2.867201454260995e-06, + "logits/chosen": 543660458.6666666, + "logits/rejected": 229533088.0, + "logps/chosen": -364.5859375, + "logps/rejected": -363.1269836425781, + "loss": 0.0273, + "rewards/chosen": 3.738765080769857, + "rewards/margins": 13.516249974568685, + "rewards/rejected": -9.777484893798828, + "step": 7016 + }, + { + "epoch": 0.6411146642302421, + "grad_norm": 1.296875, + "kl": 0.0, + "learning_rate": 2.8659011108717442e-06, + "logits/chosen": 694142400.0, + "logits/rejected": 530941952.0, + "logps/chosen": -406.00628662109375, + "logps/rejected": -636.5860595703125, + "loss": 0.0049, + "rewards/chosen": 4.270555019378662, + "rewards/margins": 14.261402289072672, + "rewards/rejected": -9.99084726969401, + "step": 7017 + }, + { + "epoch": 0.6412060301507537, + "grad_norm": 0.4609375, + "kl": 0.0, + "learning_rate": 2.8646009439527014e-06, + "logits/chosen": 282095018.6666667, + "logits/rejected": 600320870.4, + "logps/chosen": -150.8864542643229, + "logps/rejected": -515.890625, + "loss": 0.0025, + "rewards/chosen": 5.186315536499023, + "rewards/margins": 14.439072799682616, + "rewards/rejected": -9.252757263183593, + "step": 7018 + }, + { + "epoch": 0.6412973960712655, + "grad_norm": 4.625, + "kl": 0.0, + "learning_rate": 2.863300953611374e-06, + "logits/chosen": 561234048.0, + "logits/rejected": 397247402.6666667, + "logps/chosen": -382.9140625, + "logps/rejected": -416.0475260416667, + "loss": 0.0202, + "rewards/chosen": 2.701373338699341, + "rewards/margins": 12.424181540807089, + "rewards/rejected": -9.722808202107748, + "step": 7019 + }, + { + "epoch": 0.6413887619917771, + "grad_norm": 1.453125, + "kl": 0.0, + "learning_rate": 2.862001139955263e-06, + "logits/chosen": 1586910336.0, + "logits/rejected": 464969545.14285713, + "logps/chosen": -361.20892333984375, + "logps/rejected": -451.19566127232144, + "loss": 0.0055, + "rewards/chosen": 3.1066651344299316, + "rewards/margins": 13.534990242549352, + "rewards/rejected": -10.42832510811942, + "step": 7020 + }, + { + "epoch": 0.6414801279122887, + "grad_norm": 2.46875, + "kl": 0.0, + "learning_rate": 2.860701503091845e-06, + "logits/chosen": 371129557.3333333, + "logits/rejected": 400906956.8, + "logps/chosen": -253.58011881510416, + "logps/rejected": -524.620166015625, + "loss": 0.0139, + "rewards/chosen": 3.755357106526693, + "rewards/margins": 13.806983693440756, + "rewards/rejected": -10.051626586914063, + "step": 7021 + }, + { + "epoch": 0.6415714938328003, + "grad_norm": 30.5, + "kl": 0.0, + "learning_rate": 2.859402043128594e-06, + "logits/chosen": 486155110.4, + "logits/rejected": 703597269.3333334, + "logps/chosen": -381.044091796875, + "logps/rejected": -594.5907796223959, + "loss": 0.1264, + "rewards/chosen": 2.4311601638793947, + "rewards/margins": 12.035506757100425, + "rewards/rejected": -9.60434659322103, + "step": 7022 + }, + { + "epoch": 0.641662859753312, + "grad_norm": 1.2109375, + "kl": 0.0, + "learning_rate": 2.8581027601729612e-06, + "logits/chosen": 821893461.3333334, + "logits/rejected": 604110796.8, + "logps/chosen": -375.308349609375, + "logps/rejected": -620.555224609375, + "loss": 0.1289, + "rewards/chosen": 1.0752838452657063, + "rewards/margins": 10.238381989796958, + "rewards/rejected": -9.16309814453125, + "step": 7023 + }, + { + "epoch": 0.6417542256738237, + "grad_norm": 2.640625, + "kl": 0.0, + "learning_rate": 2.856803654332387e-06, + "logits/chosen": 222261299.2, + "logits/rejected": 453465216.0, + "logps/chosen": -171.96971435546874, + "logps/rejected": -443.9086100260417, + "loss": 0.0139, + "rewards/chosen": 4.726446533203125, + "rewards/margins": 13.805945460001627, + "rewards/rejected": -9.079498926798502, + "step": 7024 + }, + { + "epoch": 0.6418455915943353, + "grad_norm": 1.3359375, + "kl": 0.0, + "learning_rate": 2.8555047257142894e-06, + "logits/chosen": 810904448.0, + "logits/rejected": 407892032.0, + "logps/chosen": -359.8282165527344, + "logps/rejected": -425.0364074707031, + "loss": 0.0089, + "rewards/chosen": 4.306741714477539, + "rewards/margins": 13.833064079284668, + "rewards/rejected": -9.526322364807129, + "step": 7025 + }, + { + "epoch": 0.6419369575148469, + "grad_norm": 2.546875, + "kl": 0.0, + "learning_rate": 2.8542059744260834e-06, + "logits/chosen": 882870101.3333334, + "logits/rejected": 604025344.0, + "logps/chosen": -424.9562174479167, + "logps/rejected": -235.1654815673828, + "loss": 0.0148, + "rewards/chosen": 4.336846987406413, + "rewards/margins": 12.73616377512614, + "rewards/rejected": -8.399316787719727, + "step": 7026 + }, + { + "epoch": 0.6420283234353586, + "grad_norm": 3.96875, + "kl": 0.0, + "learning_rate": 2.852907400575162e-06, + "logits/chosen": 526530560.0, + "logits/rejected": 579916629.3333334, + "logps/chosen": -273.21845703125, + "logps/rejected": -445.8915608723958, + "loss": 0.0279, + "rewards/chosen": 3.3018413543701173, + "rewards/margins": 12.772366460164388, + "rewards/rejected": -9.470525105794271, + "step": 7027 + }, + { + "epoch": 0.6421196893558703, + "grad_norm": 3.546875, + "kl": 0.0, + "learning_rate": 2.851609004268906e-06, + "logits/chosen": 359649382.4, + "logits/rejected": 227925930.66666666, + "logps/chosen": -219.692578125, + "logps/rejected": -324.5257975260417, + "loss": 0.0326, + "rewards/chosen": 3.1517791748046875, + "rewards/margins": 9.690147399902344, + "rewards/rejected": -6.538368225097656, + "step": 7028 + }, + { + "epoch": 0.6422110552763819, + "grad_norm": 3.171875, + "kl": 0.0, + "learning_rate": 2.8503107856146784e-06, + "logits/chosen": 745679564.8, + "logits/rejected": 590702250.6666666, + "logps/chosen": -531.753125, + "logps/rejected": -557.9447835286459, + "loss": 0.016, + "rewards/chosen": 4.04603271484375, + "rewards/margins": 13.522263336181641, + "rewards/rejected": -9.47623062133789, + "step": 7029 + }, + { + "epoch": 0.6423024211968935, + "grad_norm": 5.78125, + "kl": 0.0, + "learning_rate": 2.8490127447198325e-06, + "logits/chosen": 540981162.6666666, + "logits/rejected": 705630272.0, + "logps/chosen": -144.482177734375, + "logps/rejected": -496.22833251953125, + "loss": 0.1571, + "rewards/chosen": 2.1722338994344077, + "rewards/margins": 8.699518521626791, + "rewards/rejected": -6.527284622192383, + "step": 7030 + }, + { + "epoch": 0.6423937871174052, + "grad_norm": 45.25, + "kl": 0.0, + "learning_rate": 2.847714881691704e-06, + "logits/chosen": 421074474.6666667, + "logits/rejected": 325966438.4, + "logps/chosen": -126.53511555989583, + "logps/rejected": -347.7170166015625, + "loss": 0.0587, + "rewards/chosen": 2.539704958597819, + "rewards/margins": 10.048984591166178, + "rewards/rejected": -7.50927963256836, + "step": 7031 + }, + { + "epoch": 0.6424851530379169, + "grad_norm": 11.375, + "kl": 0.0, + "learning_rate": 2.846417196637613e-06, + "logits/chosen": 301579328.0, + "logits/rejected": 578624768.0, + "logps/chosen": -213.79595947265625, + "logps/rejected": -538.0381469726562, + "loss": 0.0202, + "rewards/chosen": 4.301332473754883, + "rewards/margins": 12.838915824890137, + "rewards/rejected": -8.537583351135254, + "step": 7032 + }, + { + "epoch": 0.6425765189584285, + "grad_norm": 1.6171875, + "kl": 0.0, + "learning_rate": 2.8451196896648665e-06, + "logits/chosen": 481536810.6666667, + "logits/rejected": 591328972.8, + "logps/chosen": -208.25445556640625, + "logps/rejected": -523.12998046875, + "loss": 0.0096, + "rewards/chosen": 3.9349854787190757, + "rewards/margins": 13.028054936726889, + "rewards/rejected": -9.093069458007813, + "step": 7033 + }, + { + "epoch": 0.6426678848789401, + "grad_norm": 2.03125, + "kl": 0.0, + "learning_rate": 2.843822360880754e-06, + "logits/chosen": 727296170.6666666, + "logits/rejected": 930825625.6, + "logps/chosen": -409.5377604166667, + "logps/rejected": -456.94892578125, + "loss": 0.012, + "rewards/chosen": 3.92213503519694, + "rewards/margins": 13.77766024271647, + "rewards/rejected": -9.85552520751953, + "step": 7034 + }, + { + "epoch": 0.6427592507994518, + "grad_norm": 12.375, + "kl": 0.0, + "learning_rate": 2.842525210392557e-06, + "logits/chosen": 754188458.6666666, + "logits/rejected": 1081944320.0, + "logps/chosen": -345.7390950520833, + "logps/rejected": -464.70855712890625, + "loss": 0.0313, + "rewards/chosen": 3.287173589070638, + "rewards/margins": 12.500941594441732, + "rewards/rejected": -9.213768005371094, + "step": 7035 + }, + { + "epoch": 0.6428506167199635, + "grad_norm": 2.84375, + "kl": 0.0, + "learning_rate": 2.8412282383075362e-06, + "logits/chosen": 585574502.4, + "logits/rejected": 416460288.0, + "logps/chosen": -467.8353515625, + "logps/rejected": -419.0818684895833, + "loss": 0.017, + "rewards/chosen": 3.8249465942382814, + "rewards/margins": 12.259099833170573, + "rewards/rejected": -8.434153238932291, + "step": 7036 + }, + { + "epoch": 0.6429419826404751, + "grad_norm": 2.8125, + "kl": 0.0, + "learning_rate": 2.8399314447329383e-06, + "logits/chosen": 608341930.6666666, + "logits/rejected": 629253222.4, + "logps/chosen": -350.6451416015625, + "logps/rejected": -586.742236328125, + "loss": 0.0165, + "rewards/chosen": 3.1888144810994468, + "rewards/margins": 11.673093350728353, + "rewards/rejected": -8.484278869628906, + "step": 7037 + }, + { + "epoch": 0.6430333485609867, + "grad_norm": 3.90625, + "kl": 0.0, + "learning_rate": 2.8386348297759947e-06, + "logits/chosen": 403684505.6, + "logits/rejected": 211345749.33333334, + "logps/chosen": -274.1852294921875, + "logps/rejected": -323.63291422526044, + "loss": 0.0329, + "rewards/chosen": 3.083385467529297, + "rewards/margins": 12.227086385091146, + "rewards/rejected": -9.14370091756185, + "step": 7038 + }, + { + "epoch": 0.6431247144814984, + "grad_norm": 6.65625, + "kl": 0.0, + "learning_rate": 2.837338393543926e-06, + "logits/chosen": 431882688.0, + "logits/rejected": 651766528.0, + "logps/chosen": -186.47909545898438, + "logps/rejected": -464.0516357421875, + "loss": 0.0488, + "rewards/chosen": 2.5774221420288086, + "rewards/margins": 12.451476097106934, + "rewards/rejected": -9.874053955078125, + "step": 7039 + }, + { + "epoch": 0.6432160804020101, + "grad_norm": 26.5, + "kl": 0.0, + "learning_rate": 2.836042136143936e-06, + "logits/chosen": 919824384.0, + "logits/rejected": 510528256.0, + "logps/chosen": -317.7130940755208, + "logps/rejected": -435.89736328125, + "loss": 0.041, + "rewards/chosen": 2.7222251892089844, + "rewards/margins": 12.193831634521484, + "rewards/rejected": -9.4716064453125, + "step": 7040 + }, + { + "epoch": 0.6433074463225217, + "grad_norm": 3.46875, + "kl": 0.0, + "learning_rate": 2.8347460576832107e-06, + "logits/chosen": 735481002.6666666, + "logits/rejected": 590273792.0, + "logps/chosen": -707.4742838541666, + "logps/rejected": -419.20810546875, + "loss": 0.0143, + "rewards/chosen": 3.274826685587565, + "rewards/margins": 11.870001856486002, + "rewards/rejected": -8.595175170898438, + "step": 7041 + }, + { + "epoch": 0.6433988122430333, + "grad_norm": 0.984375, + "kl": 0.0, + "learning_rate": 2.8334501582689244e-06, + "logits/chosen": 236807024.0, + "logits/rejected": 366706496.0, + "logps/chosen": -202.31631469726562, + "logps/rejected": -438.1922607421875, + "loss": 0.0076, + "rewards/chosen": 4.562943935394287, + "rewards/margins": 11.994727611541748, + "rewards/rejected": -7.431783676147461, + "step": 7042 + }, + { + "epoch": 0.643490178163545, + "grad_norm": 1.703125, + "kl": 0.0, + "learning_rate": 2.8321544380082373e-06, + "logits/chosen": 645727317.3333334, + "logits/rejected": 434153920.0, + "logps/chosen": -318.4888509114583, + "logps/rejected": -445.42822265625, + "loss": 0.0127, + "rewards/chosen": 4.260093053181966, + "rewards/margins": 14.962151845296223, + "rewards/rejected": -10.702058792114258, + "step": 7043 + }, + { + "epoch": 0.6435815440840567, + "grad_norm": 1.5703125, + "kl": 0.0, + "learning_rate": 2.830858897008293e-06, + "logits/chosen": 301623142.4, + "logits/rejected": 400384725.3333333, + "logps/chosen": -296.2630615234375, + "logps/rejected": -512.2976888020834, + "loss": 0.0075, + "rewards/chosen": 5.165659332275391, + "rewards/margins": 16.03639450073242, + "rewards/rejected": -10.870735168457031, + "step": 7044 + }, + { + "epoch": 0.6436729100045683, + "grad_norm": 2.078125, + "kl": 0.0, + "learning_rate": 2.8295635353762202e-06, + "logits/chosen": 573566976.0, + "logits/rejected": 754486272.0, + "logps/chosen": -259.15704345703125, + "logps/rejected": -795.6427001953125, + "loss": 0.0139, + "rewards/chosen": 3.7956337928771973, + "rewards/margins": 13.19633436203003, + "rewards/rejected": -9.400700569152832, + "step": 7045 + }, + { + "epoch": 0.6437642759250799, + "grad_norm": 1.8125, + "kl": 0.0, + "learning_rate": 2.8282683532191333e-06, + "logits/chosen": 1075811840.0, + "logits/rejected": 589168213.3333334, + "logps/chosen": -287.9717102050781, + "logps/rejected": -385.096435546875, + "loss": 0.0091, + "rewards/chosen": 3.310080051422119, + "rewards/margins": 12.821943759918213, + "rewards/rejected": -9.511863708496094, + "step": 7046 + }, + { + "epoch": 0.6438556418455916, + "grad_norm": 1.671875, + "kl": 0.0, + "learning_rate": 2.826973350644131e-06, + "logits/chosen": 442260224.0, + "logits/rejected": 272099225.6, + "logps/chosen": -369.3822428385417, + "logps/rejected": -367.9518310546875, + "loss": 0.0069, + "rewards/chosen": 4.487905820210774, + "rewards/margins": 12.695518430074056, + "rewards/rejected": -8.207612609863281, + "step": 7047 + }, + { + "epoch": 0.6439470077661033, + "grad_norm": 0.9765625, + "kl": 0.0, + "learning_rate": 2.8256785277583e-06, + "logits/chosen": 479827712.0, + "logits/rejected": 464157610.6666667, + "logps/chosen": -381.422314453125, + "logps/rejected": -518.4537760416666, + "loss": 0.0074, + "rewards/chosen": 4.813017272949219, + "rewards/margins": 12.889317576090495, + "rewards/rejected": -8.076300303141275, + "step": 7048 + }, + { + "epoch": 0.6440383736866149, + "grad_norm": 1.6953125, + "kl": 0.0, + "learning_rate": 2.82438388466871e-06, + "logits/chosen": 620287402.6666666, + "logits/rejected": 1093457920.0, + "logps/chosen": -313.0037434895833, + "logps/rejected": -583.89638671875, + "loss": 0.0125, + "rewards/chosen": 3.6026878356933594, + "rewards/margins": 13.10065155029297, + "rewards/rejected": -9.49796371459961, + "step": 7049 + }, + { + "epoch": 0.6441297396071265, + "grad_norm": 1.4453125, + "kl": 0.0, + "learning_rate": 2.8230894214824147e-06, + "logits/chosen": 646586752.0, + "logits/rejected": 1062527552.0, + "logps/chosen": -404.817138671875, + "logps/rejected": -620.9042358398438, + "loss": 0.0094, + "rewards/chosen": 4.092414379119873, + "rewards/margins": 14.438004970550537, + "rewards/rejected": -10.345590591430664, + "step": 7050 + }, + { + "epoch": 0.6442211055276382, + "grad_norm": 1.0703125, + "kl": 0.0, + "learning_rate": 2.8217951383064546e-06, + "logits/chosen": 457881088.0, + "logits/rejected": 499945952.0, + "logps/chosen": -299.25689697265625, + "logps/rejected": -443.6124267578125, + "loss": 0.0057, + "rewards/chosen": 4.894859313964844, + "rewards/margins": 13.664632797241211, + "rewards/rejected": -8.769773483276367, + "step": 7051 + }, + { + "epoch": 0.6443124714481498, + "grad_norm": 3.421875, + "kl": 0.0, + "learning_rate": 2.820501035247856e-06, + "logits/chosen": 463480384.0, + "logits/rejected": 704199744.0, + "logps/chosen": -336.54425048828125, + "logps/rejected": -398.5201110839844, + "loss": 0.0191, + "rewards/chosen": 3.6177878379821777, + "rewards/margins": 11.783505916595459, + "rewards/rejected": -8.165718078613281, + "step": 7052 + }, + { + "epoch": 0.6444038373686615, + "grad_norm": 2.484375, + "kl": 0.0, + "learning_rate": 2.8192071124136274e-06, + "logits/chosen": 904896409.6, + "logits/rejected": 1168698197.3333333, + "logps/chosen": -344.09921875, + "logps/rejected": -740.6337890625, + "loss": 0.0196, + "rewards/chosen": 3.8217811584472656, + "rewards/margins": 13.693663279215494, + "rewards/rejected": -9.871882120768229, + "step": 7053 + }, + { + "epoch": 0.6444952032891731, + "grad_norm": 1.2578125, + "kl": 0.0, + "learning_rate": 2.8179133699107652e-06, + "logits/chosen": 354089472.0, + "logits/rejected": 519760597.3333333, + "logps/chosen": -258.2062683105469, + "logps/rejected": -669.5084635416666, + "loss": 0.0044, + "rewards/chosen": 4.693099498748779, + "rewards/margins": 15.285111904144287, + "rewards/rejected": -10.592012405395508, + "step": 7054 + }, + { + "epoch": 0.6445865692096848, + "grad_norm": 5.9375, + "kl": 0.0, + "learning_rate": 2.816619807846248e-06, + "logits/chosen": 570376704.0, + "logits/rejected": 308631296.0, + "logps/chosen": -327.6254069010417, + "logps/rejected": -441.8319091796875, + "loss": 0.0359, + "rewards/chosen": 3.385727564493815, + "rewards/margins": 12.870137850443522, + "rewards/rejected": -9.484410285949707, + "step": 7055 + }, + { + "epoch": 0.6446779351301964, + "grad_norm": 8.8125, + "kl": 0.0, + "learning_rate": 2.8153264263270452e-06, + "logits/chosen": 439845248.0, + "logits/rejected": 435905792.0, + "logps/chosen": -309.351318359375, + "logps/rejected": -518.55, + "loss": 0.0994, + "rewards/chosen": 3.588651657104492, + "rewards/margins": 10.982363510131837, + "rewards/rejected": -7.393711853027344, + "step": 7056 + }, + { + "epoch": 0.6447693010507081, + "grad_norm": 3.796875, + "kl": 0.0, + "learning_rate": 2.8140332254601055e-06, + "logits/chosen": 417878016.0, + "logits/rejected": 689914538.6666666, + "logps/chosen": -274.10146484375, + "logps/rejected": -478.5523274739583, + "loss": 0.0248, + "rewards/chosen": 3.6004390716552734, + "rewards/margins": 12.203123728434244, + "rewards/rejected": -8.60268465677897, + "step": 7057 + }, + { + "epoch": 0.6448606669712197, + "grad_norm": 0.4453125, + "kl": 0.0, + "learning_rate": 2.8127402053523647e-06, + "logits/chosen": 177983328.0, + "logits/rejected": 517386788.5714286, + "logps/chosen": -79.73995971679688, + "logps/rejected": -631.5986328125, + "loss": 0.002, + "rewards/chosen": 4.145421504974365, + "rewards/margins": 15.055121081215995, + "rewards/rejected": -10.90969957624163, + "step": 7058 + }, + { + "epoch": 0.6449520328917314, + "grad_norm": 0.5625, + "kl": 0.0, + "learning_rate": 2.811447366110741e-06, + "logits/chosen": 570804160.0, + "logits/rejected": 338816853.3333333, + "logps/chosen": -497.1296081542969, + "logps/rejected": -385.2472330729167, + "loss": 0.0028, + "rewards/chosen": 4.702237129211426, + "rewards/margins": 14.31795851389567, + "rewards/rejected": -9.615721384684244, + "step": 7059 + }, + { + "epoch": 0.645043398812243, + "grad_norm": 3.296875, + "kl": 0.0, + "learning_rate": 2.8101547078421455e-06, + "logits/chosen": 877871786.6666666, + "logits/rejected": 496121651.2, + "logps/chosen": -499.0256754557292, + "logps/rejected": -501.7162109375, + "loss": 0.0178, + "rewards/chosen": 3.019033749898275, + "rewards/margins": 14.053568967183432, + "rewards/rejected": -11.034535217285157, + "step": 7060 + }, + { + "epoch": 0.6451347647327547, + "grad_norm": 3.53125, + "kl": 0.0, + "learning_rate": 2.808862230653465e-06, + "logits/chosen": 118183216.0, + "logits/rejected": 452718976.0, + "logps/chosen": -219.10055541992188, + "logps/rejected": -655.1907958984375, + "loss": 0.0103, + "rewards/chosen": 3.497551918029785, + "rewards/margins": 13.699042320251465, + "rewards/rejected": -10.20149040222168, + "step": 7061 + }, + { + "epoch": 0.6452261306532663, + "grad_norm": 3.421875, + "kl": 0.0, + "learning_rate": 2.8075699346515784e-06, + "logits/chosen": 475124480.0, + "logits/rejected": 498652160.0, + "logps/chosen": -371.042626953125, + "logps/rejected": -669.1300455729166, + "loss": 0.0212, + "rewards/chosen": 3.866595458984375, + "rewards/margins": 13.074155044555663, + "rewards/rejected": -9.207559585571289, + "step": 7062 + }, + { + "epoch": 0.645317496573778, + "grad_norm": 52.75, + "kl": 0.0, + "learning_rate": 2.806277819943344e-06, + "logits/chosen": 351048294.4, + "logits/rejected": 377630805.3333333, + "logps/chosen": -228.6732666015625, + "logps/rejected": -319.1188557942708, + "loss": 0.1024, + "rewards/chosen": 2.1853759765625, + "rewards/margins": 9.937676239013673, + "rewards/rejected": -7.752300262451172, + "step": 7063 + }, + { + "epoch": 0.6454088624942896, + "grad_norm": 2.953125, + "kl": 0.0, + "learning_rate": 2.804985886635606e-06, + "logits/chosen": 332697088.0, + "logits/rejected": 604858944.0, + "logps/chosen": -180.15279134114584, + "logps/rejected": -449.837646484375, + "loss": 0.0233, + "rewards/chosen": 4.315783182779948, + "rewards/margins": 13.041561762491863, + "rewards/rejected": -8.725778579711914, + "step": 7064 + }, + { + "epoch": 0.6455002284148013, + "grad_norm": 1.703125, + "kl": 0.0, + "learning_rate": 2.803694134835201e-06, + "logits/chosen": 390714112.0, + "logits/rejected": 679516416.0, + "logps/chosen": -199.74961853027344, + "logps/rejected": -413.6149495442708, + "loss": 0.0072, + "rewards/chosen": 4.300036907196045, + "rewards/margins": 12.501279354095459, + "rewards/rejected": -8.201242446899414, + "step": 7065 + }, + { + "epoch": 0.6455915943353129, + "grad_norm": 2.359375, + "kl": 0.0, + "learning_rate": 2.8024025646489416e-06, + "logits/chosen": 859058176.0, + "logits/rejected": 516185376.0, + "logps/chosen": -441.1773986816406, + "logps/rejected": -511.5408020019531, + "loss": 0.0133, + "rewards/chosen": 3.759572744369507, + "rewards/margins": 12.66079592704773, + "rewards/rejected": -8.901223182678223, + "step": 7066 + }, + { + "epoch": 0.6456829602558246, + "grad_norm": 2.0625, + "kl": 0.0, + "learning_rate": 2.8011111761836283e-06, + "logits/chosen": 604063232.0, + "logits/rejected": 453507328.0, + "logps/chosen": -333.5738525390625, + "logps/rejected": -278.57989501953125, + "loss": 0.0115, + "rewards/chosen": 4.135955810546875, + "rewards/margins": 10.888696670532227, + "rewards/rejected": -6.752740859985352, + "step": 7067 + }, + { + "epoch": 0.6457743261763362, + "grad_norm": 2.078125, + "kl": 0.0, + "learning_rate": 2.799819969546046e-06, + "logits/chosen": 298513248.0, + "logits/rejected": 224443562.66666666, + "logps/chosen": -175.54090881347656, + "logps/rejected": -469.1953938802083, + "loss": 0.029, + "rewards/chosen": 2.1835689544677734, + "rewards/margins": 12.66146151224772, + "rewards/rejected": -10.477892557779947, + "step": 7068 + }, + { + "epoch": 0.6458656920968479, + "grad_norm": 2.921875, + "kl": 0.0, + "learning_rate": 2.7985289448429697e-06, + "logits/chosen": 508883072.0, + "logits/rejected": 1024297779.2, + "logps/chosen": -355.9842122395833, + "logps/rejected": -445.3857421875, + "loss": 0.0178, + "rewards/chosen": 3.1883150736490884, + "rewards/margins": 13.029137674967448, + "rewards/rejected": -9.84082260131836, + "step": 7069 + }, + { + "epoch": 0.6459570580173595, + "grad_norm": 1.171875, + "kl": 0.0, + "learning_rate": 2.7972381021811516e-06, + "logits/chosen": 796488192.0, + "logits/rejected": 348255136.0, + "logps/chosen": -348.56170654296875, + "logps/rejected": -386.05743408203125, + "loss": 0.0066, + "rewards/chosen": 4.983607292175293, + "rewards/margins": 14.641472816467285, + "rewards/rejected": -9.657865524291992, + "step": 7070 + }, + { + "epoch": 0.6460484239378712, + "grad_norm": 0.91015625, + "kl": 0.0, + "learning_rate": 2.795947441667334e-06, + "logits/chosen": 374331392.0, + "logits/rejected": 386393472.0, + "logps/chosen": -283.5012512207031, + "logps/rejected": -623.1030883789062, + "loss": 0.0056, + "rewards/chosen": 4.537670612335205, + "rewards/margins": 17.16109609603882, + "rewards/rejected": -12.623425483703613, + "step": 7071 + }, + { + "epoch": 0.6461397898583828, + "grad_norm": 1.8359375, + "kl": 0.0, + "learning_rate": 2.7946569634082398e-06, + "logits/chosen": 852460672.0, + "logits/rejected": 299815957.3333333, + "logps/chosen": -673.650634765625, + "logps/rejected": -451.2288411458333, + "loss": 0.0086, + "rewards/chosen": 3.558743476867676, + "rewards/margins": 10.87214247385661, + "rewards/rejected": -7.313398996988933, + "step": 7072 + }, + { + "epoch": 0.6462311557788945, + "grad_norm": 10.9375, + "kl": 0.0, + "learning_rate": 2.7933666675105835e-06, + "logits/chosen": 823551744.0, + "logits/rejected": 280402688.0, + "logps/chosen": -390.3056335449219, + "logps/rejected": -259.4635009765625, + "loss": 0.0219, + "rewards/chosen": 3.8824455738067627, + "rewards/margins": 11.106325387954712, + "rewards/rejected": -7.223879814147949, + "step": 7073 + }, + { + "epoch": 0.6463225216994061, + "grad_norm": 1.265625, + "kl": 0.0, + "learning_rate": 2.792076554081059e-06, + "logits/chosen": 608458649.6, + "logits/rejected": 410041002.6666667, + "logps/chosen": -554.74580078125, + "logps/rejected": -368.073974609375, + "loss": 0.0094, + "rewards/chosen": 4.4444538116455075, + "rewards/margins": 12.821669514973959, + "rewards/rejected": -8.377215703328451, + "step": 7074 + }, + { + "epoch": 0.6464138876199178, + "grad_norm": 2.796875, + "kl": 0.0, + "learning_rate": 2.790786623226347e-06, + "logits/chosen": 657256192.0, + "logits/rejected": 675833941.3333334, + "logps/chosen": -288.81220703125, + "logps/rejected": -795.4026692708334, + "loss": 0.0165, + "rewards/chosen": 3.7673316955566407, + "rewards/margins": 14.75712865193685, + "rewards/rejected": -10.989796956380209, + "step": 7075 + }, + { + "epoch": 0.6465052535404294, + "grad_norm": 2.90625, + "kl": 0.0, + "learning_rate": 2.789496875053112e-06, + "logits/chosen": 434679125.3333333, + "logits/rejected": 467992064.0, + "logps/chosen": -299.67384847005206, + "logps/rejected": -411.6201171875, + "loss": 0.0182, + "rewards/chosen": 3.904686609903971, + "rewards/margins": 11.872897974650066, + "rewards/rejected": -7.968211364746094, + "step": 7076 + }, + { + "epoch": 0.646596619460941, + "grad_norm": 2.296875, + "kl": 0.0, + "learning_rate": 2.7882073096680053e-06, + "logits/chosen": 793602304.0, + "logits/rejected": 580036352.0, + "logps/chosen": -601.2333984375, + "logps/rejected": -533.8948974609375, + "loss": 0.0132, + "rewards/chosen": 3.640690326690674, + "rewards/margins": 13.416589260101318, + "rewards/rejected": -9.775898933410645, + "step": 7077 + }, + { + "epoch": 0.6466879853814527, + "grad_norm": 2.75, + "kl": 0.0, + "learning_rate": 2.7869179271776604e-06, + "logits/chosen": 637882368.0, + "logits/rejected": 835935296.0, + "logps/chosen": -427.203857421875, + "logps/rejected": -594.41943359375, + "loss": 0.0236, + "rewards/chosen": 3.5591138203938804, + "rewards/margins": 12.518583615620932, + "rewards/rejected": -8.95946979522705, + "step": 7078 + }, + { + "epoch": 0.6467793513019644, + "grad_norm": 1.359375, + "kl": 0.0, + "learning_rate": 2.785628727688699e-06, + "logits/chosen": 743813824.0, + "logits/rejected": 667490005.3333334, + "logps/chosen": -424.08544921875, + "logps/rejected": -387.1266682942708, + "loss": 0.0057, + "rewards/chosen": 3.90397047996521, + "rewards/margins": 12.499564250310263, + "rewards/rejected": -8.595593770345053, + "step": 7079 + }, + { + "epoch": 0.646870717222476, + "grad_norm": 4.59375, + "kl": 0.0, + "learning_rate": 2.784339711307723e-06, + "logits/chosen": 426309034.6666667, + "logits/rejected": 517200230.4, + "logps/chosen": -307.1150716145833, + "logps/rejected": -585.716748046875, + "loss": 0.01, + "rewards/chosen": 4.627896626790364, + "rewards/margins": 13.467104085286458, + "rewards/rejected": -8.839207458496094, + "step": 7080 + }, + { + "epoch": 0.6469620831429876, + "grad_norm": 1.625, + "kl": 0.0, + "learning_rate": 2.783050878141326e-06, + "logits/chosen": 292080192.0, + "logits/rejected": 310793568.0, + "logps/chosen": -235.9921112060547, + "logps/rejected": -355.80023193359375, + "loss": 0.0135, + "rewards/chosen": 4.42258882522583, + "rewards/margins": 12.474236011505127, + "rewards/rejected": -8.051647186279297, + "step": 7081 + }, + { + "epoch": 0.6470534490634993, + "grad_norm": 0.388671875, + "kl": 0.0, + "learning_rate": 2.7817622282960816e-06, + "logits/chosen": 455638656.0, + "logits/rejected": 462264166.4, + "logps/chosen": -316.62021891276044, + "logps/rejected": -661.2896484375, + "loss": 0.0026, + "rewards/chosen": 5.24759038289388, + "rewards/margins": 14.367382558186847, + "rewards/rejected": -9.119792175292968, + "step": 7082 + }, + { + "epoch": 0.647144814984011, + "grad_norm": 16.875, + "kl": 0.0, + "learning_rate": 2.7804737618785484e-06, + "logits/chosen": 448814592.0, + "logits/rejected": 265861216.0, + "logps/chosen": -268.48248291015625, + "logps/rejected": -552.6554565429688, + "loss": 0.1157, + "rewards/chosen": 2.255925178527832, + "rewards/margins": 14.664071083068848, + "rewards/rejected": -12.408145904541016, + "step": 7083 + }, + { + "epoch": 0.6472361809045226, + "grad_norm": 2.734375, + "kl": 0.0, + "learning_rate": 2.77918547899527e-06, + "logits/chosen": 695942528.0, + "logits/rejected": 574332544.0, + "logps/chosen": -472.479248046875, + "logps/rejected": -387.49273681640625, + "loss": 0.017, + "rewards/chosen": 3.534939765930176, + "rewards/margins": 10.42184066772461, + "rewards/rejected": -6.886900901794434, + "step": 7084 + }, + { + "epoch": 0.6473275468250342, + "grad_norm": 2.765625, + "kl": 0.0, + "learning_rate": 2.7778973797527743e-06, + "logits/chosen": 573112473.6, + "logits/rejected": 1048860672.0, + "logps/chosen": -260.5447265625, + "logps/rejected": -597.66357421875, + "loss": 0.0191, + "rewards/chosen": 3.877741241455078, + "rewards/margins": 13.031490071614584, + "rewards/rejected": -9.153748830159506, + "step": 7085 + }, + { + "epoch": 0.6474189127455459, + "grad_norm": 5.09375, + "kl": 0.0, + "learning_rate": 2.7766094642575796e-06, + "logits/chosen": 423839189.3333333, + "logits/rejected": 378662041.6, + "logps/chosen": -328.9000244140625, + "logps/rejected": -476.222998046875, + "loss": 0.1227, + "rewards/chosen": 2.0197280248006186, + "rewards/margins": 10.907095273335775, + "rewards/rejected": -8.887367248535156, + "step": 7086 + }, + { + "epoch": 0.6475102786660576, + "grad_norm": 3.640625, + "kl": 0.0, + "learning_rate": 2.7753217326161817e-06, + "logits/chosen": 685003008.0, + "logits/rejected": 561627200.0, + "logps/chosen": -357.2815755208333, + "logps/rejected": -414.8122863769531, + "loss": 0.022, + "rewards/chosen": 3.8572800954182944, + "rewards/margins": 13.487983067830404, + "rewards/rejected": -9.63070297241211, + "step": 7087 + }, + { + "epoch": 0.6476016445865692, + "grad_norm": 2.453125, + "kl": 0.0, + "learning_rate": 2.774034184935065e-06, + "logits/chosen": 492254016.0, + "logits/rejected": 663125760.0, + "logps/chosen": -315.82806396484375, + "logps/rejected": -600.5679321289062, + "loss": 0.0144, + "rewards/chosen": 4.057362079620361, + "rewards/margins": 14.105740070343018, + "rewards/rejected": -10.048377990722656, + "step": 7088 + }, + { + "epoch": 0.6476930105070808, + "grad_norm": 4.21875, + "kl": 0.0, + "learning_rate": 2.7727468213206944e-06, + "logits/chosen": 590393856.0, + "logits/rejected": 417520213.3333333, + "logps/chosen": -334.8360595703125, + "logps/rejected": -288.787353515625, + "loss": 0.0313, + "rewards/chosen": 3.0404216766357424, + "rewards/margins": 10.695580673217773, + "rewards/rejected": -7.655158996582031, + "step": 7089 + }, + { + "epoch": 0.6477843764275925, + "grad_norm": 3.90625, + "kl": 0.0, + "learning_rate": 2.771459641879528e-06, + "logits/chosen": 501783449.6, + "logits/rejected": 402653696.0, + "logps/chosen": -322.743212890625, + "logps/rejected": -439.1741943359375, + "loss": 0.0232, + "rewards/chosen": 3.8177452087402344, + "rewards/margins": 12.795063018798828, + "rewards/rejected": -8.977317810058594, + "step": 7090 + }, + { + "epoch": 0.6478757423481042, + "grad_norm": 4.375, + "kl": 0.0, + "learning_rate": 2.7701726467180013e-06, + "logits/chosen": 453133312.0, + "logits/rejected": 399590848.0, + "logps/chosen": -294.6195373535156, + "logps/rejected": -426.07794189453125, + "loss": 0.0257, + "rewards/chosen": 3.2560973167419434, + "rewards/margins": 12.172380924224854, + "rewards/rejected": -8.91628360748291, + "step": 7091 + }, + { + "epoch": 0.6479671082686158, + "grad_norm": 4.78125, + "kl": 0.0, + "learning_rate": 2.768885835942537e-06, + "logits/chosen": 577580160.0, + "logits/rejected": 758136473.6, + "logps/chosen": -292.88584391276044, + "logps/rejected": -597.9927734375, + "loss": 0.0185, + "rewards/chosen": 3.2462778091430664, + "rewards/margins": 10.707232093811035, + "rewards/rejected": -7.460954284667968, + "step": 7092 + }, + { + "epoch": 0.6480584741891274, + "grad_norm": 1.71875, + "kl": 0.0, + "learning_rate": 2.7675992096595405e-06, + "logits/chosen": 496967552.0, + "logits/rejected": 1059760320.0, + "logps/chosen": -282.0806884765625, + "logps/rejected": -372.8723449707031, + "loss": 0.0134, + "rewards/chosen": 3.7658920288085938, + "rewards/margins": 12.442353248596191, + "rewards/rejected": -8.676461219787598, + "step": 7093 + }, + { + "epoch": 0.648149840109639, + "grad_norm": 4.0625, + "kl": 0.0, + "learning_rate": 2.766312767975407e-06, + "logits/chosen": 594943590.4, + "logits/rejected": 216034602.66666666, + "logps/chosen": -415.27763671875, + "logps/rejected": -244.0293172200521, + "loss": 0.0302, + "rewards/chosen": 3.2733131408691407, + "rewards/margins": 11.259565607706707, + "rewards/rejected": -7.986252466837565, + "step": 7094 + }, + { + "epoch": 0.6482412060301508, + "grad_norm": 29.25, + "kl": 0.0, + "learning_rate": 2.7650265109965123e-06, + "logits/chosen": 446301593.6, + "logits/rejected": 449871957.3333333, + "logps/chosen": -265.042919921875, + "logps/rejected": -533.3212076822916, + "loss": 0.1254, + "rewards/chosen": 3.3487945556640626, + "rewards/margins": 11.921727752685547, + "rewards/rejected": -8.572933197021484, + "step": 7095 + }, + { + "epoch": 0.6483325719506624, + "grad_norm": 1.4296875, + "kl": 0.0, + "learning_rate": 2.7637404388292184e-06, + "logits/chosen": 319514912.0, + "logits/rejected": 469725504.0, + "logps/chosen": -306.59967041015625, + "logps/rejected": -424.0068359375, + "loss": 0.0081, + "rewards/chosen": 5.004533767700195, + "rewards/margins": 13.201310157775879, + "rewards/rejected": -8.196776390075684, + "step": 7096 + }, + { + "epoch": 0.648423937871174, + "grad_norm": 2.625, + "kl": 0.0, + "learning_rate": 2.76245455157987e-06, + "logits/chosen": 660582954.6666666, + "logits/rejected": 262792672.0, + "logps/chosen": -365.5998128255208, + "logps/rejected": -382.26556396484375, + "loss": 0.0204, + "rewards/chosen": 3.60096804300944, + "rewards/margins": 13.416880289713541, + "rewards/rejected": -9.815912246704102, + "step": 7097 + }, + { + "epoch": 0.6485153037916856, + "grad_norm": 1.4140625, + "kl": 0.0, + "learning_rate": 2.7611688493547974e-06, + "logits/chosen": 537145514.6666666, + "logits/rejected": 436912076.8, + "logps/chosen": -409.138427734375, + "logps/rejected": -534.124658203125, + "loss": 0.0065, + "rewards/chosen": 4.276922861735026, + "rewards/margins": 14.178003946940105, + "rewards/rejected": -9.901081085205078, + "step": 7098 + }, + { + "epoch": 0.6486066697121974, + "grad_norm": 2.359375, + "kl": 0.0, + "learning_rate": 2.759883332260319e-06, + "logits/chosen": 650527283.2, + "logits/rejected": 893612288.0, + "logps/chosen": -477.37646484375, + "logps/rejected": -654.1040445963541, + "loss": 0.0153, + "rewards/chosen": 4.11810073852539, + "rewards/margins": 16.828981018066408, + "rewards/rejected": -12.710880279541016, + "step": 7099 + }, + { + "epoch": 0.648698035632709, + "grad_norm": 1.078125, + "kl": 0.0, + "learning_rate": 2.7585980004027346e-06, + "logits/chosen": 422325589.3333333, + "logits/rejected": 405674265.6, + "logps/chosen": -304.95379638671875, + "logps/rejected": -484.034228515625, + "loss": 0.0064, + "rewards/chosen": 4.117708524068196, + "rewards/margins": 14.045330746968588, + "rewards/rejected": -9.927622222900391, + "step": 7100 + }, + { + "epoch": 0.6487894015532206, + "grad_norm": 2.78125, + "kl": 0.0, + "learning_rate": 2.757312853888328e-06, + "logits/chosen": 314488576.0, + "logits/rejected": 278965984.0, + "logps/chosen": -246.21968587239584, + "logps/rejected": -346.3255615234375, + "loss": 0.0213, + "rewards/chosen": 4.2779130935668945, + "rewards/margins": 13.172799110412598, + "rewards/rejected": -8.894886016845703, + "step": 7101 + }, + { + "epoch": 0.6488807674737322, + "grad_norm": 3.484375, + "kl": 0.0, + "learning_rate": 2.7560278928233676e-06, + "logits/chosen": 455634517.3333333, + "logits/rejected": 347449952.0, + "logps/chosen": -289.7157389322917, + "logps/rejected": -384.0478515625, + "loss": 0.0294, + "rewards/chosen": 4.274763107299805, + "rewards/margins": 13.097285270690918, + "rewards/rejected": -8.822522163391113, + "step": 7102 + }, + { + "epoch": 0.648972133394244, + "grad_norm": 3.125, + "kl": 0.0, + "learning_rate": 2.754743117314112e-06, + "logits/chosen": 503392109.71428573, + "logits/rejected": 601537472.0, + "logps/chosen": -329.65066964285717, + "logps/rejected": -764.593505859375, + "loss": 0.0363, + "rewards/chosen": 3.372720173427037, + "rewards/margins": 15.142831257411412, + "rewards/rejected": -11.770111083984375, + "step": 7103 + }, + { + "epoch": 0.6490634993147556, + "grad_norm": 4.0, + "kl": 0.0, + "learning_rate": 2.753458527466799e-06, + "logits/chosen": 477709670.4, + "logits/rejected": 341854378.6666667, + "logps/chosen": -261.844873046875, + "logps/rejected": -324.2097574869792, + "loss": 0.027, + "rewards/chosen": 3.4628692626953126, + "rewards/margins": 12.254178619384765, + "rewards/rejected": -8.791309356689453, + "step": 7104 + }, + { + "epoch": 0.6491548652352672, + "grad_norm": 3.515625, + "kl": 0.0, + "learning_rate": 2.7521741233876496e-06, + "logits/chosen": 437592928.0, + "logits/rejected": 516328640.0, + "logps/chosen": -299.0296630859375, + "logps/rejected": -698.6998291015625, + "loss": 0.0252, + "rewards/chosen": 3.901207447052002, + "rewards/margins": 16.02320909500122, + "rewards/rejected": -12.122001647949219, + "step": 7105 + }, + { + "epoch": 0.6492462311557788, + "grad_norm": 3.234375, + "kl": 0.0, + "learning_rate": 2.7508899051828718e-06, + "logits/chosen": 383613269.3333333, + "logits/rejected": 467631744.0, + "logps/chosen": -295.2793782552083, + "logps/rejected": -415.0552978515625, + "loss": 0.0237, + "rewards/chosen": 4.070350011189778, + "rewards/margins": 11.949739774068195, + "rewards/rejected": -7.879389762878418, + "step": 7106 + }, + { + "epoch": 0.6493375970762906, + "grad_norm": 2.96875, + "kl": 0.0, + "learning_rate": 2.7496058729586626e-06, + "logits/chosen": 515423317.3333333, + "logits/rejected": 823075840.0, + "logps/chosen": -242.88423665364584, + "logps/rejected": -746.6259155273438, + "loss": 0.0308, + "rewards/chosen": 3.8339754740397134, + "rewards/margins": 12.393619219462076, + "rewards/rejected": -8.559643745422363, + "step": 7107 + }, + { + "epoch": 0.6494289629968022, + "grad_norm": 9.5, + "kl": 0.0, + "learning_rate": 2.748322026821197e-06, + "logits/chosen": 611195520.0, + "logits/rejected": 347920170.6666667, + "logps/chosen": -162.0042724609375, + "logps/rejected": -555.5241292317709, + "loss": 0.1127, + "rewards/chosen": 1.816036581993103, + "rewards/margins": 14.984577536582947, + "rewards/rejected": -13.168540954589844, + "step": 7108 + }, + { + "epoch": 0.6495203289173138, + "grad_norm": 13.5625, + "kl": 0.0, + "learning_rate": 2.7470383668766378e-06, + "logits/chosen": 348875468.8, + "logits/rejected": 337751552.0, + "logps/chosen": -267.3447021484375, + "logps/rejected": -448.3336588541667, + "loss": 0.0316, + "rewards/chosen": 3.757373809814453, + "rewards/margins": 14.357027053833008, + "rewards/rejected": -10.599653244018555, + "step": 7109 + }, + { + "epoch": 0.6496116948378254, + "grad_norm": 1.1015625, + "kl": 0.0, + "learning_rate": 2.7457548932311284e-06, + "logits/chosen": 759294464.0, + "logits/rejected": 426574515.2, + "logps/chosen": -540.9764404296875, + "logps/rejected": -445.625244140625, + "loss": 0.0063, + "rewards/chosen": 4.197883288065593, + "rewards/margins": 12.384418932596844, + "rewards/rejected": -8.18653564453125, + "step": 7110 + }, + { + "epoch": 0.6497030607583372, + "grad_norm": 54.0, + "kl": 0.0, + "learning_rate": 2.744471605990806e-06, + "logits/chosen": 546835626.6666666, + "logits/rejected": 340310656.0, + "logps/chosen": -306.6667887369792, + "logps/rejected": -565.284423828125, + "loss": 0.0786, + "rewards/chosen": 3.591850916544596, + "rewards/margins": 13.736703364054362, + "rewards/rejected": -10.144852447509766, + "step": 7111 + }, + { + "epoch": 0.6497944266788488, + "grad_norm": 1.703125, + "kl": 0.0, + "learning_rate": 2.7431885052617844e-06, + "logits/chosen": 407504864.0, + "logits/rejected": 453158368.0, + "logps/chosen": -321.1620178222656, + "logps/rejected": -508.278564453125, + "loss": 0.0102, + "rewards/chosen": 4.470925331115723, + "rewards/margins": 14.459185600280762, + "rewards/rejected": -9.988260269165039, + "step": 7112 + }, + { + "epoch": 0.6498857925993604, + "grad_norm": 1.171875, + "kl": 0.0, + "learning_rate": 2.741905591150163e-06, + "logits/chosen": 631388074.6666666, + "logits/rejected": 486781542.4, + "logps/chosen": -309.48288981119794, + "logps/rejected": -470.25556640625, + "loss": 0.0049, + "rewards/chosen": 4.822238286336263, + "rewards/margins": 14.42011349995931, + "rewards/rejected": -9.597875213623047, + "step": 7113 + }, + { + "epoch": 0.649977158519872, + "grad_norm": 1.8984375, + "kl": 0.0, + "learning_rate": 2.7406228637620257e-06, + "logits/chosen": 314712576.0, + "logits/rejected": 492449894.4, + "logps/chosen": -314.493408203125, + "logps/rejected": -525.647314453125, + "loss": 0.0071, + "rewards/chosen": 4.687221527099609, + "rewards/margins": 14.870903778076173, + "rewards/rejected": -10.183682250976563, + "step": 7114 + }, + { + "epoch": 0.6500685244403838, + "grad_norm": 4.09375, + "kl": 0.0, + "learning_rate": 2.7393403232034455e-06, + "logits/chosen": 561079347.2, + "logits/rejected": 639495509.3333334, + "logps/chosen": -381.297216796875, + "logps/rejected": -290.15869140625, + "loss": 0.0301, + "rewards/chosen": 3.164508819580078, + "rewards/margins": 10.595839182535808, + "rewards/rejected": -7.4313303629557295, + "step": 7115 + }, + { + "epoch": 0.6501598903608954, + "grad_norm": 3.9375, + "kl": 0.0, + "learning_rate": 2.7380579695804755e-06, + "logits/chosen": 520568106.6666667, + "logits/rejected": 482079334.4, + "logps/chosen": -346.7757161458333, + "logps/rejected": -505.195068359375, + "loss": 0.0197, + "rewards/chosen": 3.7615038553873696, + "rewards/margins": 12.50543696085612, + "rewards/rejected": -8.74393310546875, + "step": 7116 + }, + { + "epoch": 0.650251256281407, + "grad_norm": 3.140625, + "kl": 4.011236190795898, + "learning_rate": 2.7367758029991544e-06, + "logits/chosen": 447312310.85714287, + "logits/rejected": 131773296.0, + "logps/chosen": -281.5120152064732, + "logps/rejected": -147.30902099609375, + "loss": 0.0252, + "rewards/chosen": 4.101222991943359, + "rewards/margins": 12.545797348022461, + "rewards/rejected": -8.444574356079102, + "step": 7117 + }, + { + "epoch": 0.6503426222019186, + "grad_norm": 3.984375, + "kl": 0.0, + "learning_rate": 2.7354938235655046e-06, + "logits/chosen": 611525778.2857143, + "logits/rejected": 753899776.0, + "logps/chosen": -386.50223214285717, + "logps/rejected": -665.273681640625, + "loss": 0.0349, + "rewards/chosen": 3.428223201206752, + "rewards/margins": 13.872919627598353, + "rewards/rejected": -10.444696426391602, + "step": 7118 + }, + { + "epoch": 0.6504339881224304, + "grad_norm": 3.046875, + "kl": 0.0, + "learning_rate": 2.7342120313855327e-06, + "logits/chosen": 541284300.8, + "logits/rejected": 782849194.6666666, + "logps/chosen": -405.358447265625, + "logps/rejected": -481.3018798828125, + "loss": 0.0178, + "rewards/chosen": 3.6285869598388674, + "rewards/margins": 13.268999608357749, + "rewards/rejected": -9.64041264851888, + "step": 7119 + }, + { + "epoch": 0.650525354042942, + "grad_norm": 4.0625, + "kl": 0.0, + "learning_rate": 2.7329304265652346e-06, + "logits/chosen": 598376320.0, + "logits/rejected": 401407232.0, + "logps/chosen": -399.5698547363281, + "logps/rejected": -488.72869873046875, + "loss": 0.0294, + "rewards/chosen": 2.924166679382324, + "rewards/margins": 15.305742263793945, + "rewards/rejected": -12.381575584411621, + "step": 7120 + }, + { + "epoch": 0.6506167199634536, + "grad_norm": 1.078125, + "kl": 0.0, + "learning_rate": 2.7316490092105856e-06, + "logits/chosen": 620225097.1428572, + "logits/rejected": 663137344.0, + "logps/chosen": -400.9929896763393, + "logps/rejected": -851.90380859375, + "loss": 0.0102, + "rewards/chosen": 4.9664731706891745, + "rewards/margins": 14.940136500767299, + "rewards/rejected": -9.973663330078125, + "step": 7121 + }, + { + "epoch": 0.6507080858839652, + "grad_norm": 0.95703125, + "kl": 0.0, + "learning_rate": 2.730367779427547e-06, + "logits/chosen": 424843008.0, + "logits/rejected": 524364083.2, + "logps/chosen": -367.2156575520833, + "logps/rejected": -509.96591796875, + "loss": 0.0046, + "rewards/chosen": 4.8219788869222, + "rewards/margins": 14.486003239949543, + "rewards/rejected": -9.664024353027344, + "step": 7122 + }, + { + "epoch": 0.650799451804477, + "grad_norm": 4.96875, + "kl": 0.0, + "learning_rate": 2.729086737322062e-06, + "logits/chosen": 713932373.3333334, + "logits/rejected": 557701222.4, + "logps/chosen": -565.1722005208334, + "logps/rejected": -342.202880859375, + "loss": 0.0947, + "rewards/chosen": 3.908543268839518, + "rewards/margins": 10.368725458780924, + "rewards/rejected": -6.460182189941406, + "step": 7123 + }, + { + "epoch": 0.6508908177249886, + "grad_norm": 5.28125, + "kl": 0.0, + "learning_rate": 2.7278058830000654e-06, + "logits/chosen": 697753152.0, + "logits/rejected": 444217920.0, + "logps/chosen": -324.09417724609375, + "logps/rejected": -445.955322265625, + "loss": 0.041, + "rewards/chosen": 3.130013942718506, + "rewards/margins": 12.08751630783081, + "rewards/rejected": -8.957502365112305, + "step": 7124 + }, + { + "epoch": 0.6509821836455002, + "grad_norm": 68.5, + "kl": 0.0, + "learning_rate": 2.7265252165674706e-06, + "logits/chosen": 420695987.2, + "logits/rejected": 383115946.6666667, + "logps/chosen": -288.9376953125, + "logps/rejected": -456.9430338541667, + "loss": 0.0663, + "rewards/chosen": 2.8186878204345702, + "rewards/margins": 12.677580897013346, + "rewards/rejected": -9.858893076578775, + "step": 7125 + }, + { + "epoch": 0.6510735495660118, + "grad_norm": 2.78125, + "kl": 0.0, + "learning_rate": 2.725244738130175e-06, + "logits/chosen": 549377749.3333334, + "logits/rejected": 531493600.0, + "logps/chosen": -189.90775553385416, + "logps/rejected": -324.0525207519531, + "loss": 0.0184, + "rewards/chosen": 4.313131014506022, + "rewards/margins": 12.203788439432781, + "rewards/rejected": -7.890657424926758, + "step": 7126 + }, + { + "epoch": 0.6511649154865236, + "grad_norm": 1.2265625, + "kl": 0.0, + "learning_rate": 2.723964447794064e-06, + "logits/chosen": 417764032.0, + "logits/rejected": 521189504.0, + "logps/chosen": -427.2914733886719, + "logps/rejected": -517.5474853515625, + "loss": 0.0068, + "rewards/chosen": 5.1408891677856445, + "rewards/margins": 14.631464958190918, + "rewards/rejected": -9.490575790405273, + "step": 7127 + }, + { + "epoch": 0.6512562814070352, + "grad_norm": 5.09375, + "kl": 0.0, + "learning_rate": 2.722684345665004e-06, + "logits/chosen": 427418624.0, + "logits/rejected": 488581120.0, + "logps/chosen": -315.4998291015625, + "logps/rejected": -556.5304361979166, + "loss": 0.0398, + "rewards/chosen": 2.8619409561157227, + "rewards/margins": 11.03631331125895, + "rewards/rejected": -8.174372355143229, + "step": 7128 + }, + { + "epoch": 0.6513476473275468, + "grad_norm": 3.46875, + "kl": 0.0, + "learning_rate": 2.72140443184885e-06, + "logits/chosen": 292319974.4, + "logits/rejected": 356637141.3333333, + "logps/chosen": -229.920458984375, + "logps/rejected": -515.3560384114584, + "loss": 0.0231, + "rewards/chosen": 3.7017791748046873, + "rewards/margins": 15.632237243652344, + "rewards/rejected": -11.930458068847656, + "step": 7129 + }, + { + "epoch": 0.6514390132480585, + "grad_norm": 4.65625, + "kl": 0.0, + "learning_rate": 2.720124706451438e-06, + "logits/chosen": 627952042.6666666, + "logits/rejected": 518706944.0, + "logps/chosen": -301.8106689453125, + "logps/rejected": -547.777734375, + "loss": 0.0289, + "rewards/chosen": 2.6206370989481607, + "rewards/margins": 12.771528212229411, + "rewards/rejected": -10.15089111328125, + "step": 7130 + }, + { + "epoch": 0.6515303791685702, + "grad_norm": 2.875, + "kl": 0.0, + "learning_rate": 2.718845169578589e-06, + "logits/chosen": 1399419520.0, + "logits/rejected": 613139669.3333334, + "logps/chosen": -353.1751403808594, + "logps/rejected": -629.2619222005209, + "loss": 0.0132, + "rewards/chosen": 2.899184465408325, + "rewards/margins": 13.379215796788534, + "rewards/rejected": -10.480031331380209, + "step": 7131 + }, + { + "epoch": 0.6516217450890818, + "grad_norm": 4.96875, + "kl": 0.0, + "learning_rate": 2.7175658213361095e-06, + "logits/chosen": 396779690.6666667, + "logits/rejected": 552875520.0, + "logps/chosen": -309.66546630859375, + "logps/rejected": -854.3743896484375, + "loss": 0.0365, + "rewards/chosen": 3.2974583307902017, + "rewards/margins": 17.29968325297038, + "rewards/rejected": -14.002224922180176, + "step": 7132 + }, + { + "epoch": 0.6517131110095934, + "grad_norm": 3.21875, + "kl": 0.0, + "learning_rate": 2.716286661829789e-06, + "logits/chosen": 482391136.0, + "logits/rejected": 433764896.0, + "logps/chosen": -191.60328674316406, + "logps/rejected": -541.177734375, + "loss": 0.0233, + "rewards/chosen": 3.1541295051574707, + "rewards/margins": 13.074608325958252, + "rewards/rejected": -9.920478820800781, + "step": 7133 + }, + { + "epoch": 0.6518044769301051, + "grad_norm": 1.8203125, + "kl": 0.0, + "learning_rate": 2.7150076911654026e-06, + "logits/chosen": 382391296.0, + "logits/rejected": 386566784.0, + "logps/chosen": -285.65252685546875, + "logps/rejected": -496.53497314453125, + "loss": 0.0158, + "rewards/chosen": 4.102874755859375, + "rewards/margins": 14.255910873413086, + "rewards/rejected": -10.153036117553711, + "step": 7134 + }, + { + "epoch": 0.6518958428506167, + "grad_norm": 2.65625, + "kl": 0.0, + "learning_rate": 2.7137289094487073e-06, + "logits/chosen": 610638182.4, + "logits/rejected": 532732416.0, + "logps/chosen": -318.333544921875, + "logps/rejected": -497.215576171875, + "loss": 0.0224, + "rewards/chosen": 3.462404251098633, + "rewards/margins": 10.209187189737957, + "rewards/rejected": -6.746782938639323, + "step": 7135 + }, + { + "epoch": 0.6519872087711284, + "grad_norm": 3.203125, + "kl": 0.0, + "learning_rate": 2.7124503167854503e-06, + "logits/chosen": 382746944.0, + "logits/rejected": 606280960.0, + "logps/chosen": -233.343505859375, + "logps/rejected": -443.40045166015625, + "loss": 0.0349, + "rewards/chosen": 3.1843762397766113, + "rewards/margins": 11.80714464187622, + "rewards/rejected": -8.62276840209961, + "step": 7136 + }, + { + "epoch": 0.65207857469164, + "grad_norm": 1.953125, + "kl": 0.0, + "learning_rate": 2.711171913281358e-06, + "logits/chosen": 551628117.3333334, + "logits/rejected": 645563596.8, + "logps/chosen": -296.67030843098956, + "logps/rejected": -408.1916015625, + "loss": 0.0108, + "rewards/chosen": 4.172069549560547, + "rewards/margins": 12.352703857421876, + "rewards/rejected": -8.180634307861329, + "step": 7137 + }, + { + "epoch": 0.6521699406121517, + "grad_norm": 1.484375, + "kl": 0.0, + "learning_rate": 2.7098936990421414e-06, + "logits/chosen": 617451861.3333334, + "logits/rejected": 598958182.4, + "logps/chosen": -436.025390625, + "logps/rejected": -527.6501953125, + "loss": 0.0061, + "rewards/chosen": 4.443599383036296, + "rewards/margins": 14.042575518290203, + "rewards/rejected": -9.598976135253906, + "step": 7138 + }, + { + "epoch": 0.6522613065326633, + "grad_norm": 5.1875, + "kl": 0.0, + "learning_rate": 2.7086156741734983e-06, + "logits/chosen": 904712874.6666666, + "logits/rejected": 568953702.4, + "logps/chosen": -289.5927327473958, + "logps/rejected": -402.66650390625, + "loss": 0.0985, + "rewards/chosen": 4.67192014058431, + "rewards/margins": 14.063032658894855, + "rewards/rejected": -9.391112518310546, + "step": 7139 + }, + { + "epoch": 0.652352672453175, + "grad_norm": 1.796875, + "kl": 0.0, + "learning_rate": 2.7073378387811056e-06, + "logits/chosen": 657671296.0, + "logits/rejected": 606949478.4, + "logps/chosen": -430.9842936197917, + "logps/rejected": -478.16806640625, + "loss": 0.0109, + "rewards/chosen": 3.9070612589518228, + "rewards/margins": 13.394893900553384, + "rewards/rejected": -9.487832641601562, + "step": 7140 + }, + { + "epoch": 0.6524440383736866, + "grad_norm": 49.5, + "kl": 0.0, + "learning_rate": 2.7060601929706344e-06, + "logits/chosen": 324445280.0, + "logits/rejected": 460391058.28571427, + "logps/chosen": -127.00017547607422, + "logps/rejected": -579.0198102678571, + "loss": 0.0636, + "rewards/chosen": -0.01364898681640625, + "rewards/margins": 8.187816074916295, + "rewards/rejected": -8.201465061732701, + "step": 7141 + }, + { + "epoch": 0.6525354042941983, + "grad_norm": 1.0546875, + "kl": 0.0, + "learning_rate": 2.704782736847732e-06, + "logits/chosen": 609611776.0, + "logits/rejected": 813753472.0, + "logps/chosen": -361.4007873535156, + "logps/rejected": -526.9881591796875, + "loss": 0.007, + "rewards/chosen": 4.340778350830078, + "rewards/margins": 14.728116035461426, + "rewards/rejected": -10.387337684631348, + "step": 7142 + }, + { + "epoch": 0.6526267702147099, + "grad_norm": 42.0, + "kl": 0.0, + "learning_rate": 2.7035054705180307e-06, + "logits/chosen": 713554496.0, + "logits/rejected": 487271552.0, + "logps/chosen": -430.7879638671875, + "logps/rejected": -454.02801513671875, + "loss": 0.0658, + "rewards/chosen": 3.0496597290039062, + "rewards/margins": 11.835205078125, + "rewards/rejected": -8.785545349121094, + "step": 7143 + }, + { + "epoch": 0.6527181361352216, + "grad_norm": 0.79296875, + "kl": 0.0, + "learning_rate": 2.7022283940871474e-06, + "logits/chosen": 380660906.6666667, + "logits/rejected": 406373785.6, + "logps/chosen": -265.93719482421875, + "logps/rejected": -538.89521484375, + "loss": 0.0033, + "rewards/chosen": 5.460273106892903, + "rewards/margins": 14.061132176717123, + "rewards/rejected": -8.600859069824219, + "step": 7144 + }, + { + "epoch": 0.6528095020557332, + "grad_norm": 0.79296875, + "kl": 0.0, + "learning_rate": 2.700951507660689e-06, + "logits/chosen": 828690496.0, + "logits/rejected": 529206485.3333333, + "logps/chosen": -610.008056640625, + "logps/rejected": -558.618408203125, + "loss": 0.0037, + "rewards/chosen": 4.443939208984375, + "rewards/margins": 13.47625986735026, + "rewards/rejected": -9.032320658365885, + "step": 7145 + }, + { + "epoch": 0.6529008679762449, + "grad_norm": 2.890625, + "kl": 0.0, + "learning_rate": 2.6996748113442397e-06, + "logits/chosen": 601966284.8, + "logits/rejected": 464535808.0, + "logps/chosen": -296.2320556640625, + "logps/rejected": -434.5511474609375, + "loss": 0.0232, + "rewards/chosen": 3.983805847167969, + "rewards/margins": 11.100218963623046, + "rewards/rejected": -7.116413116455078, + "step": 7146 + }, + { + "epoch": 0.6529922338967565, + "grad_norm": 1.8515625, + "kl": 0.0, + "learning_rate": 2.6983983052433703e-06, + "logits/chosen": 563942400.0, + "logits/rejected": 684122112.0, + "logps/chosen": -347.3949381510417, + "logps/rejected": -646.7681640625, + "loss": 0.0095, + "rewards/chosen": 3.8158833185831704, + "rewards/margins": 15.508055559794107, + "rewards/rejected": -11.692172241210937, + "step": 7147 + }, + { + "epoch": 0.6530835998172682, + "grad_norm": 0.365234375, + "kl": 0.0, + "learning_rate": 2.697121989463637e-06, + "logits/chosen": 474526080.0, + "logits/rejected": 447716644.5714286, + "logps/chosen": -211.7977294921875, + "logps/rejected": -418.4242466517857, + "loss": 0.0014, + "rewards/chosen": 4.605938911437988, + "rewards/margins": 15.56546061379569, + "rewards/rejected": -10.959521702357701, + "step": 7148 + }, + { + "epoch": 0.6531749657377798, + "grad_norm": 27.125, + "kl": 0.0, + "learning_rate": 2.6958458641105755e-06, + "logits/chosen": 476036160.0, + "logits/rejected": 343364064.0, + "logps/chosen": -166.3372802734375, + "logps/rejected": -649.8359985351562, + "loss": 0.0543, + "rewards/chosen": 2.7918879985809326, + "rewards/margins": 16.98957896232605, + "rewards/rejected": -14.197690963745117, + "step": 7149 + }, + { + "epoch": 0.6532663316582915, + "grad_norm": 3.953125, + "kl": 0.0, + "learning_rate": 2.6945699292897154e-06, + "logits/chosen": 329370837.3333333, + "logits/rejected": 483856742.4, + "logps/chosen": -286.3200276692708, + "logps/rejected": -407.5583251953125, + "loss": 0.0225, + "rewards/chosen": 3.624837875366211, + "rewards/margins": 13.01827507019043, + "rewards/rejected": -9.39343719482422, + "step": 7150 + }, + { + "epoch": 0.6533576975788031, + "grad_norm": 3.234375, + "kl": 0.0, + "learning_rate": 2.693294185106562e-06, + "logits/chosen": 523679914.6666667, + "logits/rejected": 570202880.0, + "logps/chosen": -327.3495279947917, + "logps/rejected": -630.2759399414062, + "loss": 0.0252, + "rewards/chosen": 3.8405431111653647, + "rewards/margins": 17.551738103230793, + "rewards/rejected": -13.71119499206543, + "step": 7151 + }, + { + "epoch": 0.6534490634993148, + "grad_norm": 3.90625, + "kl": 0.0, + "learning_rate": 2.692018631666606e-06, + "logits/chosen": 536983003.4285715, + "logits/rejected": 675950592.0, + "logps/chosen": -212.83818708147322, + "logps/rejected": -606.0611572265625, + "loss": 0.038, + "rewards/chosen": 3.3921522412981306, + "rewards/margins": 13.915302685328893, + "rewards/rejected": -10.523150444030762, + "step": 7152 + }, + { + "epoch": 0.6535404294198264, + "grad_norm": 0.96484375, + "kl": 0.0, + "learning_rate": 2.690743269075324e-06, + "logits/rejected": 453782656.0, + "logps/rejected": -477.9703063964844, + "loss": 0.0017, + "rewards/rejected": -9.41946029663086, + "step": 7153 + }, + { + "epoch": 0.6536317953403381, + "grad_norm": 1.828125, + "kl": 0.0, + "learning_rate": 2.6894680974381803e-06, + "logits/chosen": 290613866.6666667, + "logits/rejected": 516823244.8, + "logps/chosen": -171.18815104166666, + "logps/rejected": -365.7690185546875, + "loss": 0.0094, + "rewards/chosen": 4.572366078694661, + "rewards/margins": 12.807378896077473, + "rewards/rejected": -8.235012817382813, + "step": 7154 + }, + { + "epoch": 0.6537231612608497, + "grad_norm": 2.609375, + "kl": 0.0, + "learning_rate": 2.688193116860618e-06, + "logits/chosen": 1034399616.0, + "logits/rejected": 608823552.0, + "logps/chosen": -229.90591430664062, + "logps/rejected": -480.33465576171875, + "loss": 0.0183, + "rewards/chosen": 3.6799588203430176, + "rewards/margins": 14.396596431732178, + "rewards/rejected": -10.71663761138916, + "step": 7155 + }, + { + "epoch": 0.6538145271813613, + "grad_norm": 2.046875, + "kl": 0.0, + "learning_rate": 2.686918327448064e-06, + "logits/chosen": 657323571.2, + "logits/rejected": 563208661.3333334, + "logps/chosen": -190.6010009765625, + "logps/rejected": -470.7742919921875, + "loss": 0.0145, + "rewards/chosen": 4.12262191772461, + "rewards/margins": 14.717704518636069, + "rewards/rejected": -10.595082600911459, + "step": 7156 + }, + { + "epoch": 0.653905893101873, + "grad_norm": 3.671875, + "kl": 0.0, + "learning_rate": 2.6856437293059325e-06, + "logits/chosen": 685855488.0, + "logits/rejected": 413048672.0, + "logps/chosen": -508.2180582682292, + "logps/rejected": -501.95660400390625, + "loss": 0.0241, + "rewards/chosen": 3.669947942097982, + "rewards/margins": 14.91535218556722, + "rewards/rejected": -11.245404243469238, + "step": 7157 + }, + { + "epoch": 0.6539972590223847, + "grad_norm": 6.28125, + "kl": 0.0, + "learning_rate": 2.6843693225396243e-06, + "logits/chosen": 631356501.3333334, + "logits/rejected": 555517644.8, + "logps/chosen": -280.9049072265625, + "logps/rejected": -507.86953125, + "loss": 0.1298, + "rewards/chosen": 1.6175050735473633, + "rewards/margins": 9.907251930236816, + "rewards/rejected": -8.289746856689453, + "step": 7158 + }, + { + "epoch": 0.6540886249428963, + "grad_norm": 2.25, + "kl": 0.0, + "learning_rate": 2.6830951072545176e-06, + "logits/chosen": 473312736.0, + "logits/rejected": 365339904.0, + "logps/chosen": -434.3688659667969, + "logps/rejected": -387.4873860677083, + "loss": 0.008, + "rewards/chosen": 4.086050510406494, + "rewards/margins": 13.157752831776937, + "rewards/rejected": -9.071702321370443, + "step": 7159 + }, + { + "epoch": 0.654179990863408, + "grad_norm": 2.25, + "kl": 0.0, + "learning_rate": 2.681821083555978e-06, + "logits/chosen": 420835498.6666667, + "logits/rejected": 386092416.0, + "logps/chosen": -298.41180419921875, + "logps/rejected": -351.3739013671875, + "loss": 0.0154, + "rewards/chosen": 4.287162780761719, + "rewards/margins": 11.993383407592773, + "rewards/rejected": -7.706220626831055, + "step": 7160 + }, + { + "epoch": 0.6542713567839196, + "grad_norm": 0.77734375, + "kl": 0.0, + "learning_rate": 2.6805472515493546e-06, + "logits/chosen": 479080618.6666667, + "logits/rejected": 726039680.0, + "logps/chosen": -238.5670369466146, + "logps/rejected": -549.0352172851562, + "loss": 0.0054, + "rewards/chosen": 5.236132621765137, + "rewards/margins": 13.52141284942627, + "rewards/rejected": -8.285280227661133, + "step": 7161 + }, + { + "epoch": 0.6543627227044313, + "grad_norm": 1.6796875, + "kl": 0.0, + "learning_rate": 2.6792736113399855e-06, + "logits/chosen": 1169053866.6666667, + "logits/rejected": 948520345.6, + "logps/chosen": -230.6928914388021, + "logps/rejected": -703.379541015625, + "loss": 0.0078, + "rewards/chosen": 4.581111907958984, + "rewards/margins": 15.002948760986328, + "rewards/rejected": -10.421836853027344, + "step": 7162 + }, + { + "epoch": 0.6544540886249429, + "grad_norm": 3.703125, + "kl": 0.0, + "learning_rate": 2.6780001630331866e-06, + "logits/chosen": 502155072.0, + "logits/rejected": 747544064.0, + "logps/chosen": -319.25830078125, + "logps/rejected": -273.57757568359375, + "loss": 0.0247, + "rewards/chosen": 3.3317604064941406, + "rewards/margins": 10.78650712966919, + "rewards/rejected": -7.454746723175049, + "step": 7163 + }, + { + "epoch": 0.6545454545454545, + "grad_norm": 3.046875, + "kl": 0.0, + "learning_rate": 2.6767269067342604e-06, + "logits/chosen": 618265984.0, + "logits/rejected": 876261068.8, + "logps/chosen": -270.91151936848956, + "logps/rejected": -496.328759765625, + "loss": 0.0156, + "rewards/chosen": 3.7652295430501304, + "rewards/margins": 11.584875233968098, + "rewards/rejected": -7.819645690917969, + "step": 7164 + }, + { + "epoch": 0.6546368204659662, + "grad_norm": 0.984375, + "kl": 0.0, + "learning_rate": 2.6754538425484916e-06, + "logits/chosen": 344101034.6666667, + "logits/rejected": 393667891.2, + "logps/chosen": -275.48276774088544, + "logps/rejected": -612.5689453125, + "loss": 0.005, + "rewards/chosen": 4.801235198974609, + "rewards/margins": 16.205015563964842, + "rewards/rejected": -11.403780364990235, + "step": 7165 + }, + { + "epoch": 0.6547281863864779, + "grad_norm": 1.90625, + "kl": 0.0, + "learning_rate": 2.6741809705811543e-06, + "logits/chosen": 438513472.0, + "logits/rejected": 245011424.0, + "logps/chosen": -428.6556396484375, + "logps/rejected": -368.1783142089844, + "loss": 0.0085, + "rewards/chosen": 4.539248943328857, + "rewards/margins": 14.79557466506958, + "rewards/rejected": -10.256325721740723, + "step": 7166 + }, + { + "epoch": 0.6548195523069895, + "grad_norm": 3.546875, + "kl": 0.0, + "learning_rate": 2.6729082909375016e-06, + "logits/chosen": 473410048.0, + "logits/rejected": 675017856.0, + "logps/chosen": -501.7068176269531, + "logps/rejected": -546.3880004882812, + "loss": 0.0172, + "rewards/chosen": 3.710111141204834, + "rewards/margins": 12.985837459564209, + "rewards/rejected": -9.275726318359375, + "step": 7167 + }, + { + "epoch": 0.6549109182275011, + "grad_norm": 4.34375, + "kl": 0.0, + "learning_rate": 2.671635803722773e-06, + "logits/chosen": 447068160.0, + "logits/rejected": 523179562.6666667, + "logps/chosen": -315.057421875, + "logps/rejected": -399.3547770182292, + "loss": 0.0255, + "rewards/chosen": 3.9702301025390625, + "rewards/margins": 14.001900990804037, + "rewards/rejected": -10.031670888264975, + "step": 7168 + }, + { + "epoch": 0.6550022841480128, + "grad_norm": 6.5625, + "kl": 0.0, + "learning_rate": 2.67036350904219e-06, + "logits/chosen": 447994828.8, + "logits/rejected": 706641450.6666666, + "logps/chosen": -383.6979736328125, + "logps/rejected": -617.1547037760416, + "loss": 0.0394, + "rewards/chosen": 2.769194412231445, + "rewards/margins": 11.80764274597168, + "rewards/rejected": -9.038448333740234, + "step": 7169 + }, + { + "epoch": 0.6550936500685245, + "grad_norm": 1.8515625, + "kl": 0.0, + "learning_rate": 2.669091407000958e-06, + "logits/chosen": 542633856.0, + "logits/rejected": 495079552.0, + "logps/chosen": -370.90643310546875, + "logps/rejected": -557.2852783203125, + "loss": 0.0099, + "rewards/chosen": 4.758805274963379, + "rewards/margins": 11.937725067138672, + "rewards/rejected": -7.178919792175293, + "step": 7170 + }, + { + "epoch": 0.6551850159890361, + "grad_norm": 2.9375, + "kl": 0.0, + "learning_rate": 2.6678194977042727e-06, + "logits/chosen": 403700224.0, + "logits/rejected": 751578794.6666666, + "logps/chosen": -289.187744140625, + "logps/rejected": -516.1049397786459, + "loss": 0.0187, + "rewards/chosen": 4.5867454528808596, + "rewards/margins": 13.574358495076499, + "rewards/rejected": -8.987613042195639, + "step": 7171 + }, + { + "epoch": 0.6552763819095477, + "grad_norm": 3.796875, + "kl": 0.0, + "learning_rate": 2.6665477812573064e-06, + "logits/chosen": 388544469.3333333, + "logits/rejected": 568168038.4, + "logps/chosen": -256.7392985026042, + "logps/rejected": -487.9111328125, + "loss": 0.0133, + "rewards/chosen": 4.200774510701497, + "rewards/margins": 14.162380345662434, + "rewards/rejected": -9.961605834960938, + "step": 7172 + }, + { + "epoch": 0.6553677478300594, + "grad_norm": 4.8125, + "kl": 0.0, + "learning_rate": 2.665276257765219e-06, + "logits/chosen": 565141094.4, + "logits/rejected": 452865536.0, + "logps/chosen": -303.530810546875, + "logps/rejected": -306.61049397786456, + "loss": 0.0326, + "rewards/chosen": 3.631397247314453, + "rewards/margins": 10.780118942260742, + "rewards/rejected": -7.148721694946289, + "step": 7173 + }, + { + "epoch": 0.6554591137505711, + "grad_norm": 1.0390625, + "kl": 0.0, + "learning_rate": 2.6640049273331516e-06, + "logits/chosen": 331068373.3333333, + "logits/rejected": 460795392.0, + "logps/chosen": -175.61785888671875, + "logps/rejected": -435.15126953125, + "loss": 0.0088, + "rewards/chosen": 3.9044663111368814, + "rewards/margins": 12.68944174448649, + "rewards/rejected": -8.78497543334961, + "step": 7174 + }, + { + "epoch": 0.6555504796710827, + "grad_norm": 7.53125, + "kl": 0.0, + "learning_rate": 2.662733790066235e-06, + "logits/chosen": 959553536.0, + "logits/rejected": 949618048.0, + "logps/chosen": -332.09185791015625, + "logps/rejected": -512.77001953125, + "loss": 0.0508, + "rewards/chosen": 2.9222793579101562, + "rewards/margins": 12.7711181640625, + "rewards/rejected": -9.848838806152344, + "step": 7175 + }, + { + "epoch": 0.6556418455915943, + "grad_norm": 20.0, + "kl": 0.0, + "learning_rate": 2.6614628460695786e-06, + "logits/chosen": 279921120.0, + "logits/rejected": 342929184.0, + "logps/chosen": -298.490966796875, + "logps/rejected": -476.434326171875, + "loss": 0.0111, + "rewards/chosen": 5.05055046081543, + "rewards/margins": 15.275686264038086, + "rewards/rejected": -10.225135803222656, + "step": 7176 + }, + { + "epoch": 0.655733211512106, + "grad_norm": 1.046875, + "kl": 0.0, + "learning_rate": 2.6601920954482776e-06, + "logits/chosen": 550917802.6666666, + "logits/rejected": 456062668.8, + "logps/chosen": -347.8001302083333, + "logps/rejected": -486.56884765625, + "loss": 0.0055, + "rewards/chosen": 4.449628194173177, + "rewards/margins": 13.969429524739581, + "rewards/rejected": -9.519801330566406, + "step": 7177 + }, + { + "epoch": 0.6558245774326177, + "grad_norm": 3.03125, + "kl": 0.0, + "learning_rate": 2.6589215383074095e-06, + "logits/chosen": 812438016.0, + "logits/rejected": 593986560.0, + "logps/chosen": -398.47967529296875, + "logps/rejected": -675.1495971679688, + "loss": 0.1286, + "rewards/chosen": 3.8156325817108154, + "rewards/margins": 11.301243543624878, + "rewards/rejected": -7.4856109619140625, + "step": 7178 + }, + { + "epoch": 0.6559159433531293, + "grad_norm": 3.015625, + "kl": 0.0, + "learning_rate": 2.657651174752042e-06, + "logits/chosen": 485599616.0, + "logits/rejected": 308923968.0, + "logps/chosen": -378.78924560546875, + "logps/rejected": -563.0087890625, + "loss": 0.0176, + "rewards/chosen": 3.3231725692749023, + "rewards/margins": 14.312644004821777, + "rewards/rejected": -10.989471435546875, + "step": 7179 + }, + { + "epoch": 0.6560073092736409, + "grad_norm": 3.3125, + "kl": 0.0, + "learning_rate": 2.6563810048872202e-06, + "logits/chosen": 575156224.0, + "logits/rejected": 725827379.2, + "logps/chosen": -358.0816243489583, + "logps/rejected": -343.8868408203125, + "loss": 0.113, + "rewards/chosen": 4.300540288289388, + "rewards/margins": 11.09167340596517, + "rewards/rejected": -6.791133117675781, + "step": 7180 + }, + { + "epoch": 0.6560986751941525, + "grad_norm": 1.0703125, + "kl": 0.0, + "learning_rate": 2.655111028817975e-06, + "logits/chosen": 693610598.4, + "logits/rejected": 440205056.0, + "logps/chosen": -208.809912109375, + "logps/rejected": -415.6100260416667, + "loss": 0.0064, + "rewards/chosen": 4.855489349365234, + "rewards/margins": 14.120038604736328, + "rewards/rejected": -9.264549255371094, + "step": 7181 + }, + { + "epoch": 0.6561900411146643, + "grad_norm": 3.5, + "kl": 0.0, + "learning_rate": 2.6538412466493213e-06, + "logits/chosen": 1166875392.0, + "logits/rejected": 593559680.0, + "logps/chosen": -384.5988464355469, + "logps/rejected": -499.6353454589844, + "loss": 0.0181, + "rewards/chosen": 3.8106813430786133, + "rewards/margins": 12.577912330627441, + "rewards/rejected": -8.767230987548828, + "step": 7182 + }, + { + "epoch": 0.6562814070351759, + "grad_norm": 1.15625, + "kl": 0.0, + "learning_rate": 2.6525716584862576e-06, + "logits/chosen": 583775829.3333334, + "logits/rejected": 573606451.2, + "logps/chosen": -237.3739013671875, + "logps/rejected": -808.909765625, + "loss": 0.0079, + "rewards/chosen": 3.8904701868693032, + "rewards/margins": 15.783915011088054, + "rewards/rejected": -11.89344482421875, + "step": 7183 + }, + { + "epoch": 0.6563727729556875, + "grad_norm": 1.375, + "kl": 0.0, + "learning_rate": 2.651302264433772e-06, + "logits/chosen": 532222528.0, + "logits/rejected": 354692544.0, + "logps/chosen": -419.3949279785156, + "logps/rejected": -386.6043701171875, + "loss": 0.0073, + "rewards/chosen": 4.532453536987305, + "rewards/margins": 13.973319053649902, + "rewards/rejected": -9.440865516662598, + "step": 7184 + }, + { + "epoch": 0.6564641388761991, + "grad_norm": 1.796875, + "kl": 0.0, + "learning_rate": 2.6500330645968263e-06, + "logits/chosen": 353308569.6, + "logits/rejected": 522892458.6666667, + "logps/chosen": -283.859912109375, + "logps/rejected": -558.0904134114584, + "loss": 0.0091, + "rewards/chosen": 4.654727172851563, + "rewards/margins": 15.276320648193359, + "rewards/rejected": -10.621593475341797, + "step": 7185 + }, + { + "epoch": 0.6565555047967109, + "grad_norm": 2.296875, + "kl": 0.0, + "learning_rate": 2.648764059080371e-06, + "logits/chosen": 1087172403.2, + "logits/rejected": 827930453.3333334, + "logps/chosen": -302.3701904296875, + "logps/rejected": -346.0959879557292, + "loss": 0.0197, + "rewards/chosen": 4.348376846313476, + "rewards/margins": 13.027666346232095, + "rewards/rejected": -8.67928949991862, + "step": 7186 + }, + { + "epoch": 0.6566468707172225, + "grad_norm": 0.5, + "kl": 0.0, + "learning_rate": 2.6474952479893445e-06, + "logits/chosen": 557620266.6666666, + "logits/rejected": 516538982.4, + "logps/chosen": -456.1458333333333, + "logps/rejected": -712.6234375, + "loss": 0.0027, + "rewards/chosen": 5.052836100260417, + "rewards/margins": 16.658835856119794, + "rewards/rejected": -11.605999755859376, + "step": 7187 + }, + { + "epoch": 0.6567382366377341, + "grad_norm": 0.31640625, + "kl": 0.0, + "learning_rate": 2.6462266314286655e-06, + "logits/chosen": 1204944256.0, + "logits/rejected": 624633386.6666666, + "logps/chosen": -618.797119140625, + "logps/rejected": -439.8980305989583, + "loss": 0.0015, + "rewards/chosen": 5.603611946105957, + "rewards/margins": 14.410335222880045, + "rewards/rejected": -8.806723276774088, + "step": 7188 + }, + { + "epoch": 0.6568296025582457, + "grad_norm": 1.671875, + "kl": 0.0, + "learning_rate": 2.6449582095032354e-06, + "logits/chosen": 446544960.0, + "logits/rejected": 925531355.4285715, + "logps/chosen": -198.82028198242188, + "logps/rejected": -524.0882742745536, + "loss": 0.008, + "rewards/chosen": 2.7989394664764404, + "rewards/margins": 10.945781878062657, + "rewards/rejected": -8.146842411586217, + "step": 7189 + }, + { + "epoch": 0.6569209684787575, + "grad_norm": 3.453125, + "kl": 0.0, + "learning_rate": 2.643689982317942e-06, + "logits/chosen": 566568192.0, + "logits/rejected": 642763264.0, + "logps/chosen": -301.02939860026044, + "logps/rejected": -731.330078125, + "loss": 0.0145, + "rewards/chosen": 3.85463015238444, + "rewards/margins": 13.702855555216471, + "rewards/rejected": -9.848225402832032, + "step": 7190 + }, + { + "epoch": 0.6570123343992691, + "grad_norm": 2.453125, + "kl": 0.0, + "learning_rate": 2.6424219499776533e-06, + "logits/chosen": 333092064.0, + "logits/rejected": 336105952.0, + "logps/chosen": -321.0411682128906, + "logps/rejected": -433.71661376953125, + "loss": 0.0132, + "rewards/chosen": 4.255160331726074, + "rewards/margins": 12.820752143859863, + "rewards/rejected": -8.565591812133789, + "step": 7191 + }, + { + "epoch": 0.6571037003197807, + "grad_norm": 1.078125, + "kl": 0.0, + "learning_rate": 2.6411541125872274e-06, + "logits/chosen": 641390592.0, + "logits/rejected": 669535424.0, + "logps/chosen": -254.89071655273438, + "logps/rejected": -632.3202514648438, + "loss": 0.0077, + "rewards/chosen": 4.21309757232666, + "rewards/margins": 14.007603645324707, + "rewards/rejected": -9.794506072998047, + "step": 7192 + }, + { + "epoch": 0.6571950662402923, + "grad_norm": 2.171875, + "kl": 0.0, + "learning_rate": 2.6398864702515016e-06, + "logits/chosen": 644284562.2857143, + "logits/rejected": 1650343424.0, + "logps/chosen": -363.81539481026783, + "logps/rejected": -1252.235595703125, + "loss": 0.0173, + "rewards/chosen": 4.216279438563755, + "rewards/margins": 22.58876555306571, + "rewards/rejected": -18.372486114501953, + "step": 7193 + }, + { + "epoch": 0.6572864321608041, + "grad_norm": 46.0, + "kl": 0.0, + "learning_rate": 2.638619023075298e-06, + "logits/chosen": 246221781.33333334, + "logits/rejected": 610280038.4, + "logps/chosen": -289.84234619140625, + "logps/rejected": -874.1837890625, + "loss": 0.0273, + "rewards/chosen": 4.774141311645508, + "rewards/margins": 16.000801467895506, + "rewards/rejected": -11.22666015625, + "step": 7194 + }, + { + "epoch": 0.6573777980813157, + "grad_norm": 4.09375, + "kl": 0.0, + "learning_rate": 2.63735177116342e-06, + "logits/chosen": 565154457.6, + "logits/rejected": 506778197.3333333, + "logps/chosen": -384.186962890625, + "logps/rejected": -761.5502115885416, + "loss": 0.0192, + "rewards/chosen": 3.9368125915527346, + "rewards/margins": 14.197595469156902, + "rewards/rejected": -10.260782877604166, + "step": 7195 + }, + { + "epoch": 0.6574691640018273, + "grad_norm": 3.03125, + "kl": 0.0, + "learning_rate": 2.6360847146206624e-06, + "logits/chosen": 458796288.0, + "logits/rejected": 1171987456.0, + "logps/chosen": -285.053076171875, + "logps/rejected": -717.802734375, + "loss": 0.0261, + "rewards/chosen": 3.2215499877929688, + "rewards/margins": 14.105121612548828, + "rewards/rejected": -10.88357162475586, + "step": 7196 + }, + { + "epoch": 0.6575605299223389, + "grad_norm": 2.765625, + "kl": 0.0, + "learning_rate": 2.6348178535517967e-06, + "logits/chosen": 566171562.6666666, + "logits/rejected": 592969113.6, + "logps/chosen": -433.7919921875, + "logps/rejected": -609.0775390625, + "loss": 0.0128, + "rewards/chosen": 3.7692222595214844, + "rewards/margins": 14.618329620361328, + "rewards/rejected": -10.849107360839843, + "step": 7197 + }, + { + "epoch": 0.6576518958428507, + "grad_norm": 9.125, + "kl": 0.0, + "learning_rate": 2.6335511880615806e-06, + "logits/chosen": 447273514.6666667, + "logits/rejected": 703216025.6, + "logps/chosen": -318.3878173828125, + "logps/rejected": -565.37265625, + "loss": 0.0836, + "rewards/chosen": 4.721630732218425, + "rewards/margins": 11.448745600382487, + "rewards/rejected": -6.727114868164063, + "step": 7198 + }, + { + "epoch": 0.6577432617633623, + "grad_norm": 3.5, + "kl": 0.0, + "learning_rate": 2.632284718254753e-06, + "logits/chosen": 329248640.0, + "logits/rejected": 274644736.0, + "logps/chosen": -138.88494873046875, + "logps/rejected": -221.93426513671875, + "loss": 0.0241, + "rewards/chosen": 4.013197898864746, + "rewards/margins": 11.884499073028564, + "rewards/rejected": -7.871301174163818, + "step": 7199 + }, + { + "epoch": 0.6578346276838739, + "grad_norm": 2.109375, + "kl": 0.0, + "learning_rate": 2.6310184442360442e-06, + "logits/chosen": 589338240.0, + "logits/rejected": 950372224.0, + "logps/chosen": -366.8455810546875, + "logps/rejected": -874.9885864257812, + "loss": 0.0102, + "rewards/chosen": 4.6423797607421875, + "rewards/margins": 14.410994529724121, + "rewards/rejected": -9.768614768981934, + "step": 7200 + }, + { + "epoch": 0.6579259936043855, + "grad_norm": 0.82421875, + "kl": 0.0, + "learning_rate": 2.629752366110161e-06, + "logits/chosen": 294734688.0, + "logits/rejected": 178440768.0, + "logps/chosen": -291.2373352050781, + "logps/rejected": -384.0921630859375, + "loss": 0.0051, + "rewards/chosen": 4.860845565795898, + "rewards/margins": 13.874932289123535, + "rewards/rejected": -9.014086723327637, + "step": 7201 + }, + { + "epoch": 0.6580173595248973, + "grad_norm": 5.8125, + "kl": 0.0, + "learning_rate": 2.628486483981797e-06, + "logits/chosen": 447857365.3333333, + "logits/rejected": 526895974.4, + "logps/chosen": -189.82025146484375, + "logps/rejected": -387.29931640625, + "loss": 0.0171, + "rewards/chosen": 3.9518699645996094, + "rewards/margins": 12.513506317138672, + "rewards/rejected": -8.561636352539063, + "step": 7202 + }, + { + "epoch": 0.6581087254454089, + "grad_norm": 1.5, + "kl": 0.0, + "learning_rate": 2.6272207979556276e-06, + "logits/chosen": 497418538.6666667, + "logits/rejected": 718312089.6, + "logps/chosen": -354.2764078776042, + "logps/rejected": -521.8173828125, + "loss": 0.0106, + "rewards/chosen": 3.733191172281901, + "rewards/margins": 14.086253611246745, + "rewards/rejected": -10.353062438964844, + "step": 7203 + }, + { + "epoch": 0.6582000913659205, + "grad_norm": 3.609375, + "kl": 0.0, + "learning_rate": 2.6259553081363113e-06, + "logits/chosen": 729040000.0, + "logits/rejected": 549251968.0, + "logps/chosen": -409.4471740722656, + "logps/rejected": -684.4464111328125, + "loss": 0.0299, + "rewards/chosen": 3.0913257598876953, + "rewards/margins": 13.34714126586914, + "rewards/rejected": -10.255815505981445, + "step": 7204 + }, + { + "epoch": 0.6582914572864321, + "grad_norm": 3.0, + "kl": 0.0, + "learning_rate": 2.624690014628498e-06, + "logits/chosen": 514229043.2, + "logits/rejected": 692512000.0, + "logps/chosen": -268.994189453125, + "logps/rejected": -546.294921875, + "loss": 0.0254, + "rewards/chosen": 3.749707794189453, + "rewards/margins": 11.620265579223632, + "rewards/rejected": -7.87055778503418, + "step": 7205 + }, + { + "epoch": 0.6583828232069439, + "grad_norm": 3.140625, + "kl": 0.0, + "learning_rate": 2.623424917536812e-06, + "logits/chosen": 403058739.2, + "logits/rejected": 382782464.0, + "logps/chosen": -244.2857421875, + "logps/rejected": -350.1015625, + "loss": 0.0265, + "rewards/chosen": 3.2508216857910157, + "rewards/margins": 11.912487920125326, + "rewards/rejected": -8.66166623433431, + "step": 7206 + }, + { + "epoch": 0.6584741891274555, + "grad_norm": 2.9375, + "kl": 0.0, + "learning_rate": 2.622160016965867e-06, + "logits/chosen": 551723264.0, + "logits/rejected": 414920384.0, + "logps/chosen": -347.5100911458333, + "logps/rejected": -420.413818359375, + "loss": 0.0178, + "rewards/chosen": 4.4836273193359375, + "rewards/margins": 14.039092063903809, + "rewards/rejected": -9.555464744567871, + "step": 7207 + }, + { + "epoch": 0.6585655550479671, + "grad_norm": 6.4375, + "kl": 0.0, + "learning_rate": 2.6208953130202543e-06, + "logits/chosen": 546099520.0, + "logits/rejected": 533193216.0, + "logps/chosen": -365.7597351074219, + "logps/rejected": -704.3355102539062, + "loss": 0.0402, + "rewards/chosen": 2.6268701553344727, + "rewards/margins": 12.890432357788086, + "rewards/rejected": -10.263562202453613, + "step": 7208 + }, + { + "epoch": 0.6586569209684787, + "grad_norm": 2.390625, + "kl": 0.0, + "learning_rate": 2.619630805804558e-06, + "logits/chosen": 476748800.0, + "logits/rejected": 390190668.8, + "logps/chosen": -289.8962809244792, + "logps/rejected": -537.13017578125, + "loss": 0.0126, + "rewards/chosen": 3.6213998794555664, + "rewards/margins": 14.19772891998291, + "rewards/rejected": -10.576329040527344, + "step": 7209 + }, + { + "epoch": 0.6587482868889905, + "grad_norm": 5.3125, + "kl": 0.0, + "learning_rate": 2.618366495423341e-06, + "logits/chosen": 556760780.8, + "logits/rejected": 1003724970.6666666, + "logps/chosen": -434.47861328125, + "logps/rejected": -482.2677408854167, + "loss": 0.0299, + "rewards/chosen": 3.557476806640625, + "rewards/margins": 14.021516927083333, + "rewards/rejected": -10.464040120442709, + "step": 7210 + }, + { + "epoch": 0.6588396528095021, + "grad_norm": 3.765625, + "kl": 0.0, + "learning_rate": 2.617102381981149e-06, + "logits/chosen": 625429888.0, + "logits/rejected": 950186240.0, + "logps/chosen": -332.1756896972656, + "logps/rejected": -356.7206115722656, + "loss": 0.0272, + "rewards/chosen": 2.87031888961792, + "rewards/margins": 11.16235876083374, + "rewards/rejected": -8.29203987121582, + "step": 7211 + }, + { + "epoch": 0.6589310187300137, + "grad_norm": 1.171875, + "kl": 0.0, + "learning_rate": 2.6158384655825085e-06, + "logits/chosen": 468605226.6666667, + "logits/rejected": 635785984.0, + "logps/chosen": -200.69561767578125, + "logps/rejected": -663.172802734375, + "loss": 0.009, + "rewards/chosen": 3.8464972178141275, + "rewards/margins": 14.76939608256022, + "rewards/rejected": -10.922898864746093, + "step": 7212 + }, + { + "epoch": 0.6590223846505253, + "grad_norm": 3.890625, + "kl": 0.0, + "learning_rate": 2.61457474633194e-06, + "logits/chosen": 410458368.0, + "logits/rejected": 1069377344.0, + "logps/chosen": -254.50938415527344, + "logps/rejected": -382.7661437988281, + "loss": 0.0387, + "rewards/chosen": 3.150188446044922, + "rewards/margins": 10.216805934906006, + "rewards/rejected": -7.066617488861084, + "step": 7213 + }, + { + "epoch": 0.659113750571037, + "grad_norm": 15.625, + "kl": 0.0, + "learning_rate": 2.6133112243339377e-06, + "logits/chosen": 332210656.0, + "logits/rejected": 392361408.0, + "logps/chosen": -128.71072387695312, + "logps/rejected": -291.38629150390625, + "loss": 0.1412, + "rewards/chosen": 3.0384247303009033, + "rewards/margins": 8.487787961959839, + "rewards/rejected": -5.4493632316589355, + "step": 7214 + }, + { + "epoch": 0.6592051164915487, + "grad_norm": 7.9375, + "kl": 0.0, + "learning_rate": 2.6120478996929843e-06, + "logits/chosen": 888551552.0, + "logits/rejected": 723094208.0, + "logps/chosen": -383.212158203125, + "logps/rejected": -284.6394348144531, + "loss": 0.1025, + "rewards/chosen": 3.261622428894043, + "rewards/margins": 9.253949165344238, + "rewards/rejected": -5.992326736450195, + "step": 7215 + }, + { + "epoch": 0.6592964824120603, + "grad_norm": 1.4921875, + "kl": 0.0, + "learning_rate": 2.610784772513542e-06, + "logits/chosen": 375448832.0, + "logits/rejected": 531344281.6, + "logps/chosen": -276.80702718098956, + "logps/rejected": -408.3109619140625, + "loss": 0.0091, + "rewards/chosen": 4.134246826171875, + "rewards/margins": 13.185836791992188, + "rewards/rejected": -9.051589965820312, + "step": 7216 + }, + { + "epoch": 0.6593878483325719, + "grad_norm": 2.984375, + "kl": 0.0, + "learning_rate": 2.6095218429000653e-06, + "logits/chosen": 688798515.2, + "logits/rejected": 589089322.6666666, + "logps/chosen": -359.30146484375, + "logps/rejected": -743.25439453125, + "loss": 0.0188, + "rewards/chosen": 3.8012668609619142, + "rewards/margins": 17.555431238810222, + "rewards/rejected": -13.754164377848307, + "step": 7217 + }, + { + "epoch": 0.6594792142530836, + "grad_norm": 3.734375, + "kl": 0.0, + "learning_rate": 2.608259110956983e-06, + "logits/chosen": 440248612.5714286, + "logits/rejected": 442269280.0, + "logps/chosen": -375.9251185825893, + "logps/rejected": -787.693115234375, + "loss": 0.0317, + "rewards/chosen": 3.4844251360212053, + "rewards/margins": 15.933058329990931, + "rewards/rejected": -12.448633193969727, + "step": 7218 + }, + { + "epoch": 0.6595705801735953, + "grad_norm": 2.71875, + "kl": 0.0, + "learning_rate": 2.6069965767887127e-06, + "logits/chosen": 553469440.0, + "logits/rejected": 675258176.0, + "logps/chosen": -360.7659505208333, + "logps/rejected": -365.4196472167969, + "loss": 0.0187, + "rewards/chosen": 4.179227193196614, + "rewards/margins": 12.923746426900227, + "rewards/rejected": -8.744519233703613, + "step": 7219 + }, + { + "epoch": 0.6596619460941069, + "grad_norm": 7.03125, + "kl": 0.0, + "learning_rate": 2.605734240499652e-06, + "logits/chosen": 638789717.3333334, + "logits/rejected": 609349248.0, + "logps/chosen": -216.53715006510416, + "logps/rejected": -524.318603515625, + "loss": 0.0704, + "rewards/chosen": 2.6000712712605796, + "rewards/margins": 12.677179654439291, + "rewards/rejected": -10.077108383178711, + "step": 7220 + }, + { + "epoch": 0.6597533120146185, + "grad_norm": 1.65625, + "kl": 0.0, + "learning_rate": 2.6044721021941887e-06, + "logits/chosen": 304379744.0, + "logits/rejected": 375488384.0, + "logps/chosen": -251.92628479003906, + "logps/rejected": -446.5074462890625, + "loss": 0.0108, + "rewards/chosen": 4.333829879760742, + "rewards/margins": 13.742805480957031, + "rewards/rejected": -9.408975601196289, + "step": 7221 + }, + { + "epoch": 0.6598446779351302, + "grad_norm": 2.6875, + "kl": 0.0, + "learning_rate": 2.603210161976687e-06, + "logits/chosen": 702988083.2, + "logits/rejected": 582403754.6666666, + "logps/chosen": -262.47041015625, + "logps/rejected": -544.1784261067709, + "loss": 0.021, + "rewards/chosen": 3.667485809326172, + "rewards/margins": 12.446556345621744, + "rewards/rejected": -8.779070536295572, + "step": 7222 + }, + { + "epoch": 0.6599360438556419, + "grad_norm": 3.890625, + "kl": 0.0, + "learning_rate": 2.6019484199514975e-06, + "logits/chosen": 427616256.0, + "logits/rejected": 190900640.0, + "logps/chosen": -331.837158203125, + "logps/rejected": -226.9539794921875, + "loss": 0.0257, + "rewards/chosen": 4.038229306538899, + "rewards/margins": 10.908204396565754, + "rewards/rejected": -6.8699750900268555, + "step": 7223 + }, + { + "epoch": 0.6600274097761535, + "grad_norm": 0.93359375, + "kl": 0.0, + "learning_rate": 2.6006868762229566e-06, + "logits/chosen": 352000234.6666667, + "logits/rejected": 446602496.0, + "logps/chosen": -265.7265625, + "logps/rejected": -489.97998046875, + "loss": 0.0049, + "rewards/chosen": 4.700615564982097, + "rewards/margins": 13.770751825968425, + "rewards/rejected": -9.070136260986327, + "step": 7224 + }, + { + "epoch": 0.6601187756966651, + "grad_norm": 1.7265625, + "kl": 0.0, + "learning_rate": 2.599425530895378e-06, + "logits/chosen": 716349610.6666666, + "logits/rejected": 784816640.0, + "logps/chosen": -355.615966796875, + "logps/rejected": -618.3294921875, + "loss": 0.0091, + "rewards/chosen": 3.9108282725016275, + "rewards/margins": 14.448403040568033, + "rewards/rejected": -10.537574768066406, + "step": 7225 + }, + { + "epoch": 0.6602101416171768, + "grad_norm": 19.875, + "kl": 0.0, + "learning_rate": 2.5981643840730684e-06, + "logits/chosen": 998722048.0, + "logits/rejected": 1113189888.0, + "logps/chosen": -481.9829508463542, + "logps/rejected": -352.3109130859375, + "loss": 0.0275, + "rewards/chosen": 4.147365252176921, + "rewards/margins": 11.057626787821452, + "rewards/rejected": -6.910261535644532, + "step": 7226 + }, + { + "epoch": 0.6603015075376885, + "grad_norm": 8.75, + "kl": 0.0, + "learning_rate": 2.5969034358603105e-06, + "logits/chosen": 974870613.3333334, + "logits/rejected": 956027699.2, + "logps/chosen": -125.7648417154948, + "logps/rejected": -451.29443359375, + "loss": 0.0534, + "rewards/chosen": 2.032799402872721, + "rewards/margins": 11.781169764200845, + "rewards/rejected": -9.748370361328124, + "step": 7227 + }, + { + "epoch": 0.6603928734582001, + "grad_norm": 1.6875, + "kl": 0.0, + "learning_rate": 2.5956426863613737e-06, + "logits/chosen": 638914176.0, + "logits/rejected": 411397824.0, + "logps/chosen": -340.6741943359375, + "logps/rejected": -603.0494995117188, + "loss": 0.0133, + "rewards/chosen": 4.144630432128906, + "rewards/margins": 13.02440071105957, + "rewards/rejected": -8.879770278930664, + "step": 7228 + }, + { + "epoch": 0.6604842393787117, + "grad_norm": 3.453125, + "kl": 0.0, + "learning_rate": 2.5943821356805077e-06, + "logits/chosen": 686240576.0, + "logits/rejected": 879045184.0, + "logps/chosen": -377.0169677734375, + "logps/rejected": -509.59466552734375, + "loss": 0.02, + "rewards/chosen": 3.308398485183716, + "rewards/margins": 13.954140901565552, + "rewards/rejected": -10.645742416381836, + "step": 7229 + }, + { + "epoch": 0.6605756052992234, + "grad_norm": 2.015625, + "kl": 0.0, + "learning_rate": 2.5931217839219523e-06, + "logits/chosen": 576918169.6, + "logits/rejected": 567399040.0, + "logps/chosen": -263.624560546875, + "logps/rejected": -652.6995849609375, + "loss": 0.0116, + "rewards/chosen": 4.166251373291016, + "rewards/margins": 15.544226837158202, + "rewards/rejected": -11.377975463867188, + "step": 7230 + }, + { + "epoch": 0.660666971219735, + "grad_norm": 3.015625, + "kl": 0.0, + "learning_rate": 2.5918616311899255e-06, + "logits/chosen": 537053610.6666666, + "logits/rejected": 1035124352.0, + "logps/chosen": -281.5951334635417, + "logps/rejected": -273.80352783203125, + "loss": 0.0252, + "rewards/chosen": 3.9878622690836587, + "rewards/margins": 11.234073321024576, + "rewards/rejected": -7.246211051940918, + "step": 7231 + }, + { + "epoch": 0.6607583371402467, + "grad_norm": 4.125, + "kl": 0.0, + "learning_rate": 2.5906016775886293e-06, + "logits/chosen": 246838293.33333334, + "logits/rejected": 423649024.0, + "logps/chosen": -383.7759195963542, + "logps/rejected": -452.92265625, + "loss": 0.0121, + "rewards/chosen": 4.022322018941243, + "rewards/margins": 12.898968442281088, + "rewards/rejected": -8.876646423339844, + "step": 7232 + }, + { + "epoch": 0.6608497030607583, + "grad_norm": 58.25, + "kl": 0.0, + "learning_rate": 2.5893419232222518e-06, + "logits/chosen": 364663637.3333333, + "logits/rejected": 395649049.6, + "logps/chosen": -278.2760416666667, + "logps/rejected": -375.62216796875, + "loss": 0.0575, + "rewards/chosen": 5.1141204833984375, + "rewards/margins": 12.317362213134766, + "rewards/rejected": -7.2032417297363285, + "step": 7233 + }, + { + "epoch": 0.66094106898127, + "grad_norm": 43.25, + "kl": 0.0, + "learning_rate": 2.588082368194959e-06, + "logits/chosen": 225669552.0, + "logits/rejected": 367429760.0, + "logps/chosen": -199.1732940673828, + "logps/rejected": -648.7479858398438, + "loss": 0.0799, + "rewards/chosen": 3.4458935260772705, + "rewards/margins": 13.963382959365845, + "rewards/rejected": -10.517489433288574, + "step": 7234 + }, + { + "epoch": 0.6610324349017817, + "grad_norm": 15.0625, + "kl": 0.0, + "learning_rate": 2.5868230126109102e-06, + "logits/chosen": 815666880.0, + "logits/rejected": 600038784.0, + "logps/chosen": -398.7142333984375, + "logps/rejected": -364.1646321614583, + "loss": 0.013, + "rewards/chosen": 4.3704118728637695, + "rewards/margins": 12.21751054128011, + "rewards/rejected": -7.847098668416341, + "step": 7235 + }, + { + "epoch": 0.6611238008222933, + "grad_norm": 1.3203125, + "kl": 0.0, + "learning_rate": 2.5855638565742392e-06, + "logits/chosen": 935467178.6666666, + "logits/rejected": 514167705.6, + "logps/chosen": -459.051025390625, + "logps/rejected": -442.94384765625, + "loss": 0.0065, + "rewards/chosen": 4.140545845031738, + "rewards/margins": 12.939280891418457, + "rewards/rejected": -8.798735046386719, + "step": 7236 + }, + { + "epoch": 0.6612151667428049, + "grad_norm": 2.265625, + "kl": 0.0, + "learning_rate": 2.584304900189067e-06, + "logits/chosen": 1174645589.3333333, + "logits/rejected": 699460812.8, + "logps/chosen": -199.41192626953125, + "logps/rejected": -614.8115234375, + "loss": 0.0183, + "rewards/chosen": 3.0068305333455405, + "rewards/margins": 13.81364205678304, + "rewards/rejected": -10.8068115234375, + "step": 7237 + }, + { + "epoch": 0.6613065326633166, + "grad_norm": 3.5625, + "kl": 0.0, + "learning_rate": 2.5830461435594977e-06, + "logits/chosen": 469526976.0, + "logits/rejected": 911293760.0, + "logps/chosen": -240.872314453125, + "logps/rejected": -492.2127685546875, + "loss": 0.0245, + "rewards/chosen": 3.316894292831421, + "rewards/margins": 13.545944929122925, + "rewards/rejected": -10.229050636291504, + "step": 7238 + }, + { + "epoch": 0.6613978985838282, + "grad_norm": 3.25, + "kl": 0.0, + "learning_rate": 2.581787586789619e-06, + "logits/chosen": 677249344.0, + "logits/rejected": 724306368.0, + "logps/chosen": -410.2882995605469, + "logps/rejected": -508.9847412109375, + "loss": 0.0181, + "rewards/chosen": 3.5931365489959717, + "rewards/margins": 15.208770513534546, + "rewards/rejected": -11.615633964538574, + "step": 7239 + }, + { + "epoch": 0.6614892645043399, + "grad_norm": 3.953125, + "kl": 0.0, + "learning_rate": 2.5805292299835015e-06, + "logits/chosen": 1021549248.0, + "logits/rejected": 665206400.0, + "logps/chosen": -246.2264404296875, + "logps/rejected": -360.9006652832031, + "loss": 0.0227, + "rewards/chosen": 3.4487438201904297, + "rewards/margins": 10.437920570373535, + "rewards/rejected": -6.9891767501831055, + "step": 7240 + }, + { + "epoch": 0.6615806304248515, + "grad_norm": 4.5, + "kl": 0.0, + "learning_rate": 2.5792710732452e-06, + "logits/chosen": 404513971.2, + "logits/rejected": 507716309.3333333, + "logps/chosen": -326.0086181640625, + "logps/rejected": -570.054931640625, + "loss": 0.0303, + "rewards/chosen": 3.352606201171875, + "rewards/margins": 12.129446411132813, + "rewards/rejected": -8.776840209960938, + "step": 7241 + }, + { + "epoch": 0.6616719963453632, + "grad_norm": 4.0, + "kl": 0.0, + "learning_rate": 2.5780131166787494e-06, + "logits/chosen": 622581248.0, + "logits/rejected": 383118054.4, + "logps/chosen": -468.7185465494792, + "logps/rejected": -464.23154296875, + "loss": 0.0159, + "rewards/chosen": 3.559107462565104, + "rewards/margins": 12.606135050455729, + "rewards/rejected": -9.047027587890625, + "step": 7242 + }, + { + "epoch": 0.6617633622658748, + "grad_norm": 17.0, + "kl": 0.0, + "learning_rate": 2.576755360388177e-06, + "logits/chosen": 606081945.6, + "logits/rejected": 916611072.0, + "logps/chosen": -276.70185546875, + "logps/rejected": -323.3393147786458, + "loss": 0.0635, + "rewards/chosen": 3.1959266662597656, + "rewards/margins": 9.495046615600586, + "rewards/rejected": -6.29911994934082, + "step": 7243 + }, + { + "epoch": 0.6618547281863865, + "grad_norm": 6.625, + "kl": 0.0, + "learning_rate": 2.5754978044774837e-06, + "logits/chosen": 446492672.0, + "logits/rejected": 493445760.0, + "logps/chosen": -583.031494140625, + "logps/rejected": -443.41217041015625, + "loss": 0.0263, + "rewards/chosen": 3.0087366104125977, + "rewards/margins": 10.493066310882568, + "rewards/rejected": -7.484329700469971, + "step": 7244 + }, + { + "epoch": 0.6619460941068981, + "grad_norm": 2.5625, + "kl": 0.0, + "learning_rate": 2.5742404490506584e-06, + "logits/chosen": 656713472.0, + "logits/rejected": 626210048.0, + "logps/chosen": -224.76039123535156, + "logps/rejected": -486.0449523925781, + "loss": 0.0299, + "rewards/chosen": 3.238450288772583, + "rewards/margins": 13.068959474563599, + "rewards/rejected": -9.830509185791016, + "step": 7245 + }, + { + "epoch": 0.6620374600274098, + "grad_norm": 1.8125, + "kl": 0.0, + "learning_rate": 2.5729832942116705e-06, + "logits/chosen": 608427733.3333334, + "logits/rejected": 621488588.8, + "logps/chosen": -227.8753458658854, + "logps/rejected": -411.382666015625, + "loss": 0.009, + "rewards/chosen": 3.975651423136393, + "rewards/margins": 14.103494135538737, + "rewards/rejected": -10.127842712402344, + "step": 7246 + }, + { + "epoch": 0.6621288259479214, + "grad_norm": 3.578125, + "kl": 0.0, + "learning_rate": 2.5717263400644797e-06, + "logits/chosen": 637346432.0, + "logits/rejected": 408788992.0, + "logps/chosen": -325.34112548828125, + "logps/rejected": -325.7490539550781, + "loss": 0.0392, + "rewards/chosen": 3.0036699771881104, + "rewards/margins": 10.949947118759155, + "rewards/rejected": -7.946277141571045, + "step": 7247 + }, + { + "epoch": 0.6622201918684331, + "grad_norm": 1.7734375, + "kl": 0.0, + "learning_rate": 2.5704695867130224e-06, + "logits/chosen": 346296384.0, + "logits/rejected": 341182272.0, + "logps/chosen": -301.67718505859375, + "logps/rejected": -372.83099365234375, + "loss": 0.01, + "rewards/chosen": 4.480474472045898, + "rewards/margins": 14.521263122558594, + "rewards/rejected": -10.040788650512695, + "step": 7248 + }, + { + "epoch": 0.6623115577889447, + "grad_norm": 4.1875, + "kl": 0.0, + "learning_rate": 2.5692130342612203e-06, + "logits/chosen": 682609536.0, + "logits/rejected": 865899008.0, + "logps/chosen": -327.90338134765625, + "logps/rejected": -552.627685546875, + "loss": 0.0309, + "rewards/chosen": 3.136950969696045, + "rewards/margins": 12.639890193939209, + "rewards/rejected": -9.502939224243164, + "step": 7249 + }, + { + "epoch": 0.6624029237094564, + "grad_norm": 6.65625, + "kl": 0.0, + "learning_rate": 2.5679566828129763e-06, + "logits/chosen": 427369258.6666667, + "logits/rejected": 442464608.0, + "logps/chosen": -318.8853759765625, + "logps/rejected": -606.1193237304688, + "loss": 0.0117, + "rewards/chosen": 4.563482602437337, + "rewards/margins": 12.117680390675861, + "rewards/rejected": -7.554197788238525, + "step": 7250 + }, + { + "epoch": 0.662494289629968, + "grad_norm": 2.640625, + "kl": 0.0, + "learning_rate": 2.5667005324721833e-06, + "logits/chosen": 466368341.3333333, + "logits/rejected": 509175142.4, + "logps/chosen": -322.2242838541667, + "logps/rejected": -555.93564453125, + "loss": 0.0111, + "rewards/chosen": 3.7269163131713867, + "rewards/margins": 15.103504753112793, + "rewards/rejected": -11.376588439941406, + "step": 7251 + }, + { + "epoch": 0.6625856555504797, + "grad_norm": 1.640625, + "kl": 0.0, + "learning_rate": 2.565444583342711e-06, + "logits/chosen": 716053248.0, + "logits/rejected": 1191878912.0, + "logps/chosen": -403.44622802734375, + "logps/rejected": -774.6114501953125, + "loss": 0.0089, + "rewards/chosen": 4.439024448394775, + "rewards/margins": 16.561277866363525, + "rewards/rejected": -12.12225341796875, + "step": 7252 + }, + { + "epoch": 0.6626770214709913, + "grad_norm": 2.5625, + "kl": 0.0, + "learning_rate": 2.564188835528415e-06, + "logits/chosen": 515254835.2, + "logits/rejected": 733717845.3333334, + "logps/chosen": -190.857861328125, + "logps/rejected": -848.8896484375, + "loss": 0.1421, + "rewards/chosen": 2.7378057479858398, + "rewards/margins": 14.500980440775553, + "rewards/rejected": -11.763174692789713, + "step": 7253 + }, + { + "epoch": 0.662768387391503, + "grad_norm": 3.203125, + "kl": 0.0, + "learning_rate": 2.562933289133135e-06, + "logits/chosen": 260098512.0, + "logits/rejected": 728077994.6666666, + "logps/chosen": -274.9027099609375, + "logps/rejected": -704.9860026041666, + "loss": 0.0139, + "rewards/chosen": 3.068115234375, + "rewards/margins": 11.577931722005209, + "rewards/rejected": -8.509816487630209, + "step": 7254 + }, + { + "epoch": 0.6628597533120146, + "grad_norm": 0.640625, + "kl": 0.0, + "learning_rate": 2.561677944260689e-06, + "logits/chosen": 203546730.66666666, + "logits/rejected": 520044134.4, + "logps/chosen": -127.90211995442708, + "logps/rejected": -458.88232421875, + "loss": 0.0032, + "rewards/chosen": 5.480400721232097, + "rewards/margins": 13.518456141153973, + "rewards/rejected": -8.038055419921875, + "step": 7255 + }, + { + "epoch": 0.6629511192325263, + "grad_norm": 15.0625, + "kl": 0.0, + "learning_rate": 2.560422801014888e-06, + "logits/chosen": 234854656.0, + "logits/rejected": 372287914.6666667, + "logps/chosen": -290.6596374511719, + "logps/rejected": -337.1393229166667, + "loss": 0.0201, + "rewards/chosen": 5.234875679016113, + "rewards/margins": 13.566435178120932, + "rewards/rejected": -8.331559499104818, + "step": 7256 + }, + { + "epoch": 0.6630424851530379, + "grad_norm": 4.25, + "kl": 0.0, + "learning_rate": 2.5591678594995173e-06, + "logits/chosen": 905828693.3333334, + "logits/rejected": 537901465.6, + "logps/chosen": -505.605712890625, + "logps/rejected": -423.37197265625, + "loss": 0.0175, + "rewards/chosen": 3.307213465372721, + "rewards/margins": 12.526543299357096, + "rewards/rejected": -9.219329833984375, + "step": 7257 + }, + { + "epoch": 0.6631338510735496, + "grad_norm": 2.734375, + "kl": 0.0, + "learning_rate": 2.5579131198183502e-06, + "logits/chosen": 650165440.0, + "logits/rejected": 880178944.0, + "logps/chosen": -412.2364501953125, + "logps/rejected": -626.5408325195312, + "loss": 0.0157, + "rewards/chosen": 3.783315420150757, + "rewards/margins": 15.257795095443726, + "rewards/rejected": -11.474479675292969, + "step": 7258 + }, + { + "epoch": 0.6632252169940612, + "grad_norm": 3.84375, + "kl": 0.0, + "learning_rate": 2.5566585820751398e-06, + "logits/chosen": 478777824.0, + "logits/rejected": 406613632.0, + "logps/chosen": -238.26095581054688, + "logps/rejected": -473.5455017089844, + "loss": 0.0378, + "rewards/chosen": 2.746854782104492, + "rewards/margins": 11.99709701538086, + "rewards/rejected": -9.250242233276367, + "step": 7259 + }, + { + "epoch": 0.6633165829145728, + "grad_norm": 1.921875, + "kl": 0.0, + "learning_rate": 2.555404246373629e-06, + "logits/chosen": 448603840.0, + "logits/rejected": 381900960.0, + "logps/chosen": -370.05126953125, + "logps/rejected": -233.7124786376953, + "loss": 0.0095, + "rewards/chosen": 4.459575653076172, + "rewards/margins": 12.221391201019287, + "rewards/rejected": -7.761815547943115, + "step": 7260 + }, + { + "epoch": 0.6634079488350845, + "grad_norm": 2.21875, + "kl": 0.0, + "learning_rate": 2.554150112817537e-06, + "logits/chosen": 733913344.0, + "logits/rejected": 688910250.6666666, + "logps/chosen": -134.59146118164062, + "logps/rejected": -531.9881998697916, + "loss": 0.0126, + "rewards/chosen": 3.3838019371032715, + "rewards/margins": 13.835437933603922, + "rewards/rejected": -10.45163599650065, + "step": 7261 + }, + { + "epoch": 0.6634993147555962, + "grad_norm": 2.0625, + "kl": 0.0, + "learning_rate": 2.552896181510569e-06, + "logits/chosen": 475752448.0, + "logits/rejected": 413583360.0, + "logps/chosen": -447.5404866536458, + "logps/rejected": -490.61895751953125, + "loss": 0.0106, + "rewards/chosen": 4.771782557169597, + "rewards/margins": 15.688024202982586, + "rewards/rejected": -10.916241645812988, + "step": 7262 + }, + { + "epoch": 0.6635906806761078, + "grad_norm": 1.3984375, + "kl": 0.0, + "learning_rate": 2.5516424525564127e-06, + "logits/chosen": 502720682.6666667, + "logits/rejected": 501893888.0, + "logps/chosen": -340.5187174479167, + "logps/rejected": -531.390380859375, + "loss": 0.0083, + "rewards/chosen": 3.8413826624552407, + "rewards/margins": 14.04823080698649, + "rewards/rejected": -10.20684814453125, + "step": 7263 + }, + { + "epoch": 0.6636820465966194, + "grad_norm": 12.125, + "kl": 0.0, + "learning_rate": 2.550388926058742e-06, + "logits/chosen": 481719253.3333333, + "logits/rejected": 630416025.6, + "logps/chosen": -322.7132975260417, + "logps/rejected": -196.156005859375, + "loss": 0.0979, + "rewards/chosen": 4.243437767028809, + "rewards/margins": 9.380851936340331, + "rewards/rejected": -5.137414169311524, + "step": 7264 + }, + { + "epoch": 0.6637734125171311, + "grad_norm": 1.2265625, + "kl": 0.0, + "learning_rate": 2.549135602121214e-06, + "logits/chosen": 681280256.0, + "logits/rejected": 1040844458.6666666, + "logps/chosen": -290.781640625, + "logps/rejected": -442.9574788411458, + "loss": 0.0101, + "rewards/chosen": 4.46727180480957, + "rewards/margins": 11.868890762329102, + "rewards/rejected": -7.401618957519531, + "step": 7265 + }, + { + "epoch": 0.6638647784376428, + "grad_norm": 0.63671875, + "kl": 0.0, + "learning_rate": 2.5478824808474613e-06, + "logits/chosen": 184927680.0, + "logits/rejected": 542169088.0, + "logps/chosen": -143.66898600260416, + "logps/rejected": -428.226416015625, + "loss": 0.0051, + "rewards/chosen": 4.496477762858073, + "rewards/margins": 14.414827982584637, + "rewards/rejected": -9.918350219726562, + "step": 7266 + }, + { + "epoch": 0.6639561443581544, + "grad_norm": 2.03125, + "kl": 0.0, + "learning_rate": 2.5466295623411063e-06, + "logits/chosen": 579175104.0, + "logits/rejected": 717223808.0, + "logps/chosen": -274.96136474609375, + "logps/rejected": -724.5150146484375, + "loss": 0.0139, + "rewards/chosen": 4.048657417297363, + "rewards/margins": 15.921442031860352, + "rewards/rejected": -11.872784614562988, + "step": 7267 + }, + { + "epoch": 0.664047510278666, + "grad_norm": 2.5, + "kl": 0.0, + "learning_rate": 2.545376846705757e-06, + "logits/chosen": 553242112.0, + "logits/rejected": 529285222.4, + "logps/chosen": -719.7652994791666, + "logps/rejected": -491.48974609375, + "loss": 0.0068, + "rewards/chosen": 4.329769134521484, + "rewards/margins": 13.374650573730468, + "rewards/rejected": -9.044881439208984, + "step": 7268 + }, + { + "epoch": 0.6641388761991777, + "grad_norm": 3.78125, + "kl": 0.0, + "learning_rate": 2.5441243340450006e-06, + "logits/chosen": 587827797.3333334, + "logits/rejected": 306524441.6, + "logps/chosen": -448.7347819010417, + "logps/rejected": -353.4416748046875, + "loss": 0.0182, + "rewards/chosen": 3.739022890726725, + "rewards/margins": 12.968336550394694, + "rewards/rejected": -9.22931365966797, + "step": 7269 + }, + { + "epoch": 0.6642302421196894, + "grad_norm": 1.359375, + "kl": 0.0, + "learning_rate": 2.542872024462407e-06, + "logits/chosen": 738184106.6666666, + "logits/rejected": 562680371.2, + "logps/chosen": -662.4252522786459, + "logps/rejected": -355.222265625, + "loss": 0.0066, + "rewards/chosen": 4.198543548583984, + "rewards/margins": 13.088388824462891, + "rewards/rejected": -8.889845275878907, + "step": 7270 + }, + { + "epoch": 0.664321608040201, + "grad_norm": 4.09375, + "kl": 0.0, + "learning_rate": 2.5416199180615297e-06, + "logits/chosen": 259393520.0, + "logits/rejected": 697366442.6666666, + "logps/chosen": -137.42636108398438, + "logps/rejected": -631.4086100260416, + "loss": 0.0159, + "rewards/chosen": 3.2191152572631836, + "rewards/margins": 11.29737377166748, + "rewards/rejected": -8.078258514404297, + "step": 7271 + }, + { + "epoch": 0.6644129739607126, + "grad_norm": 0.65625, + "kl": 0.0, + "learning_rate": 2.5403680149459097e-06, + "logits/chosen": 382370880.0, + "logits/rejected": 343114304.0, + "logps/chosen": -389.845458984375, + "logps/rejected": -535.7900390625, + "loss": 0.0025, + "rewards/chosen": 5.129209995269775, + "rewards/margins": 15.916118462880453, + "rewards/rejected": -10.786908467610678, + "step": 7272 + }, + { + "epoch": 0.6645043398812243, + "grad_norm": 3.234375, + "kl": 0.0, + "learning_rate": 2.5391163152190656e-06, + "logits/chosen": 644760874.6666666, + "logits/rejected": 633965056.0, + "logps/chosen": -253.14068603515625, + "logps/rejected": -370.33843994140625, + "loss": 0.0299, + "rewards/chosen": 3.256209055582682, + "rewards/margins": 12.240360895792643, + "rewards/rejected": -8.984151840209961, + "step": 7273 + }, + { + "epoch": 0.664595705801736, + "grad_norm": 54.5, + "kl": 0.0, + "learning_rate": 2.5378648189845025e-06, + "logits/chosen": 907983701.3333334, + "logits/rejected": 446037708.8, + "logps/chosen": -212.2481892903646, + "logps/rejected": -429.90771484375, + "loss": 0.1398, + "rewards/chosen": 2.505279858907064, + "rewards/margins": 11.113829358418783, + "rewards/rejected": -8.608549499511719, + "step": 7274 + }, + { + "epoch": 0.6646870717222476, + "grad_norm": 1.2421875, + "kl": 0.0, + "learning_rate": 2.536613526345705e-06, + "logits/chosen": 460372320.0, + "logits/rejected": 520156992.0, + "logps/chosen": -383.9314880371094, + "logps/rejected": -721.0423583984375, + "loss": 0.0071, + "rewards/chosen": 4.604557037353516, + "rewards/margins": 17.043397903442383, + "rewards/rejected": -12.438840866088867, + "step": 7275 + }, + { + "epoch": 0.6647784376427592, + "grad_norm": 3.109375, + "kl": 0.0, + "learning_rate": 2.5353624374061446e-06, + "logits/chosen": 398199210.6666667, + "logits/rejected": 377025984.0, + "logps/chosen": -381.4086507161458, + "logps/rejected": -566.1229248046875, + "loss": 0.0175, + "rewards/chosen": 4.264952023824056, + "rewards/margins": 15.422007878621418, + "rewards/rejected": -11.157055854797363, + "step": 7276 + }, + { + "epoch": 0.6648698035632709, + "grad_norm": 2.21875, + "kl": 0.0, + "learning_rate": 2.5341115522692764e-06, + "logits/chosen": 427985254.4, + "logits/rejected": 381933653.3333333, + "logps/chosen": -266.909814453125, + "logps/rejected": -477.4473470052083, + "loss": 0.0193, + "rewards/chosen": 3.6643821716308596, + "rewards/margins": 11.537860107421874, + "rewards/rejected": -7.873477935791016, + "step": 7277 + }, + { + "epoch": 0.6649611694837826, + "grad_norm": 2.3125, + "kl": 0.0, + "learning_rate": 2.532860871038536e-06, + "logits/chosen": 504081344.0, + "logits/rejected": 458988970.6666667, + "logps/chosen": -304.1169128417969, + "logps/rejected": -467.834716796875, + "loss": 0.0147, + "rewards/chosen": 3.0201077461242676, + "rewards/margins": 12.58814032872518, + "rewards/rejected": -9.568032582600912, + "step": 7278 + }, + { + "epoch": 0.6650525354042942, + "grad_norm": 0.423828125, + "kl": 0.0, + "learning_rate": 2.531610393817343e-06, + "logits/chosen": 515884416.0, + "logits/rejected": 603509162.6666666, + "logps/chosen": -157.66055297851562, + "logps/rejected": -504.9462076822917, + "loss": 0.0026, + "rewards/chosen": 4.695019721984863, + "rewards/margins": 13.945589383443197, + "rewards/rejected": -9.250569661458334, + "step": 7279 + }, + { + "epoch": 0.6651439013248058, + "grad_norm": 0.443359375, + "kl": 0.0, + "learning_rate": 2.530360120709099e-06, + "logits/chosen": 570433536.0, + "logits/rejected": 560426349.7142857, + "logps/chosen": -232.35047912597656, + "logps/rejected": -727.4010184151786, + "loss": 0.0022, + "rewards/chosen": 4.602910041809082, + "rewards/margins": 13.916240555899483, + "rewards/rejected": -9.313330514090401, + "step": 7280 + }, + { + "epoch": 0.6652352672453175, + "grad_norm": 1.734375, + "kl": 0.0, + "learning_rate": 2.529110051817193e-06, + "logits/chosen": 343933849.6, + "logits/rejected": 687089834.6666666, + "logps/chosen": -280.4392578125, + "logps/rejected": -494.5513102213542, + "loss": 0.023, + "rewards/chosen": 4.143379974365234, + "rewards/margins": 13.916252899169923, + "rewards/rejected": -9.772872924804688, + "step": 7281 + }, + { + "epoch": 0.6653266331658292, + "grad_norm": 1.171875, + "kl": 0.0, + "learning_rate": 2.5278601872449916e-06, + "logits/chosen": 472840106.6666667, + "logits/rejected": 420277504.0, + "logps/chosen": -321.56667073567706, + "logps/rejected": -676.8853515625, + "loss": 0.0075, + "rewards/chosen": 3.9932448069254556, + "rewards/margins": 12.86057268778483, + "rewards/rejected": -8.867327880859374, + "step": 7282 + }, + { + "epoch": 0.6654179990863408, + "grad_norm": 40.5, + "kl": 0.0, + "learning_rate": 2.5266105270958484e-06, + "logits/chosen": 773987328.0, + "logits/rejected": 693537600.0, + "logps/chosen": -302.79757254464283, + "logps/rejected": -740.21630859375, + "loss": 0.0519, + "rewards/chosen": 3.448854718889509, + "rewards/margins": 12.229433332170759, + "rewards/rejected": -8.78057861328125, + "step": 7283 + }, + { + "epoch": 0.6655093650068524, + "grad_norm": 0.65625, + "kl": 0.0, + "learning_rate": 2.525361071473098e-06, + "logits/chosen": 383236181.3333333, + "logits/rejected": 403420006.4, + "logps/chosen": -247.96419270833334, + "logps/rejected": -400.53642578125, + "loss": 0.0053, + "rewards/chosen": 4.66708246866862, + "rewards/margins": 15.015471903483075, + "rewards/rejected": -10.348389434814454, + "step": 7284 + }, + { + "epoch": 0.665600730927364, + "grad_norm": 0.9921875, + "kl": 0.0, + "learning_rate": 2.5241118204800574e-06, + "logits/chosen": 506198176.0, + "logits/rejected": 922890581.3333334, + "logps/chosen": -269.116455078125, + "logps/rejected": -662.22509765625, + "loss": 0.0044, + "rewards/chosen": 5.248286247253418, + "rewards/margins": 15.962603569030762, + "rewards/rejected": -10.714317321777344, + "step": 7285 + }, + { + "epoch": 0.6656920968478758, + "grad_norm": 5.5, + "kl": 6.92747688293457, + "learning_rate": 2.522862774220032e-06, + "logits/chosen": 514102637.71428573, + "logits/rejected": 173803280.0, + "logps/chosen": -340.86648995535717, + "logps/rejected": -160.55838012695312, + "loss": 0.0419, + "rewards/chosen": 4.118916102818081, + "rewards/margins": 10.789472648075648, + "rewards/rejected": -6.670556545257568, + "step": 7286 + }, + { + "epoch": 0.6657834627683874, + "grad_norm": 2.359375, + "kl": 0.0, + "learning_rate": 2.521613932796303e-06, + "logits/chosen": 585864960.0, + "logits/rejected": 417949013.3333333, + "logps/chosen": -359.619140625, + "logps/rejected": -471.4652099609375, + "loss": 0.0085, + "rewards/chosen": 3.5486974716186523, + "rewards/margins": 12.831938743591309, + "rewards/rejected": -9.283241271972656, + "step": 7287 + }, + { + "epoch": 0.665874828688899, + "grad_norm": 1.2421875, + "kl": 0.0, + "learning_rate": 2.5203652963121388e-06, + "logits/chosen": 333009706.6666667, + "logits/rejected": 423468595.2, + "logps/chosen": -234.32267252604166, + "logps/rejected": -475.927734375, + "loss": 0.0067, + "rewards/chosen": 4.538407643636067, + "rewards/margins": 13.149526723225911, + "rewards/rejected": -8.611119079589844, + "step": 7288 + }, + { + "epoch": 0.6659661946094106, + "grad_norm": 2.390625, + "kl": 0.0, + "learning_rate": 2.5191168648707888e-06, + "logits/chosen": 560717312.0, + "logits/rejected": 840752640.0, + "logps/chosen": -384.90545654296875, + "logps/rejected": -460.87957763671875, + "loss": 0.0104, + "rewards/chosen": 4.486030101776123, + "rewards/margins": 13.926441669464111, + "rewards/rejected": -9.440411567687988, + "step": 7289 + }, + { + "epoch": 0.6660575605299224, + "grad_norm": 9.0625, + "kl": 0.0, + "learning_rate": 2.5178686385754903e-06, + "logits/chosen": 726249728.0, + "logits/rejected": 770119296.0, + "logps/chosen": -427.3851013183594, + "logps/rejected": -299.4468994140625, + "loss": 0.1186, + "rewards/chosen": 3.2110557556152344, + "rewards/margins": 8.981364727020264, + "rewards/rejected": -5.770308971405029, + "step": 7290 + }, + { + "epoch": 0.666148926450434, + "grad_norm": 3.109375, + "kl": 0.0, + "learning_rate": 2.5166206175294594e-06, + "logits/chosen": 542357504.0, + "logits/rejected": 410077632.0, + "logps/chosen": -397.42144775390625, + "logps/rejected": -506.83001708984375, + "loss": 0.0228, + "rewards/chosen": 3.345994234085083, + "rewards/margins": 13.654948472976685, + "rewards/rejected": -10.308954238891602, + "step": 7291 + }, + { + "epoch": 0.6662402923709456, + "grad_norm": 2.546875, + "kl": 0.0, + "learning_rate": 2.5153728018358915e-06, + "logits/chosen": 398846336.0, + "logits/rejected": 395921792.0, + "logps/chosen": -269.5565999348958, + "logps/rejected": -409.294482421875, + "loss": 0.0175, + "rewards/chosen": 3.053950627644857, + "rewards/margins": 12.98100446065267, + "rewards/rejected": -9.927053833007813, + "step": 7292 + }, + { + "epoch": 0.6663316582914572, + "grad_norm": 2.984375, + "kl": 0.0, + "learning_rate": 2.5141251915979737e-06, + "logits/chosen": 365009100.8, + "logits/rejected": 623994666.6666666, + "logps/chosen": -348.740771484375, + "logps/rejected": -711.294921875, + "loss": 0.0151, + "rewards/chosen": 4.410969924926758, + "rewards/margins": 13.455290349324546, + "rewards/rejected": -9.044320424397787, + "step": 7293 + }, + { + "epoch": 0.666423024211969, + "grad_norm": 2.984375, + "kl": 0.0, + "learning_rate": 2.5128777869188704e-06, + "logits/chosen": 378716885.3333333, + "logits/rejected": 288129600.0, + "logps/chosen": -338.8962809244792, + "logps/rejected": -466.07861328125, + "loss": 0.0236, + "rewards/chosen": 3.5777161916097007, + "rewards/margins": 13.703505833943685, + "rewards/rejected": -10.125789642333984, + "step": 7294 + }, + { + "epoch": 0.6665143901324806, + "grad_norm": 4.09375, + "kl": 0.0, + "learning_rate": 2.51163058790173e-06, + "logits/chosen": 623968841.1428572, + "logits/rejected": 306831104.0, + "logps/chosen": -379.2610560825893, + "logps/rejected": -570.621337890625, + "loss": 0.0357, + "rewards/chosen": 3.209357125418527, + "rewards/margins": 15.478961808340891, + "rewards/rejected": -12.269604682922363, + "step": 7295 + }, + { + "epoch": 0.6666057560529922, + "grad_norm": 2.5, + "kl": 0.0, + "learning_rate": 2.5103835946496846e-06, + "logits/chosen": 293925600.0, + "logits/rejected": 725309878.8571428, + "logps/chosen": -150.88812255859375, + "logps/rejected": -532.2713797433036, + "loss": 0.1113, + "rewards/chosen": 4.852804660797119, + "rewards/margins": 13.190721443721227, + "rewards/rejected": -8.337916782924108, + "step": 7296 + }, + { + "epoch": 0.6666971219735038, + "grad_norm": 2.921875, + "kl": 0.0, + "learning_rate": 2.5091368072658485e-06, + "logits/chosen": 77029392.0, + "logits/rejected": 560128731.4285715, + "logps/chosen": -443.61859130859375, + "logps/rejected": -639.4034598214286, + "loss": 0.0067, + "rewards/chosen": 3.735180616378784, + "rewards/margins": 11.694967712674822, + "rewards/rejected": -7.959787096296038, + "step": 7297 + }, + { + "epoch": 0.6667884878940156, + "grad_norm": 1.4765625, + "kl": 0.0, + "learning_rate": 2.5078902258533206e-06, + "logits/chosen": 1077813845.3333333, + "logits/rejected": 453645107.2, + "logps/chosen": -411.115234375, + "logps/rejected": -557.443359375, + "loss": 0.0069, + "rewards/chosen": 4.228991826375325, + "rewards/margins": 16.284480412801106, + "rewards/rejected": -12.055488586425781, + "step": 7298 + }, + { + "epoch": 0.6668798538145272, + "grad_norm": 2.859375, + "kl": 0.0, + "learning_rate": 2.5066438505151814e-06, + "logits/chosen": 796691200.0, + "logits/rejected": 539552204.8, + "logps/chosen": -315.64158121744794, + "logps/rejected": -501.5021484375, + "loss": 0.0188, + "rewards/chosen": 2.9852158228556314, + "rewards/margins": 11.840927950541177, + "rewards/rejected": -8.855712127685546, + "step": 7299 + }, + { + "epoch": 0.6669712197350388, + "grad_norm": 7.6875, + "kl": 0.0, + "learning_rate": 2.505397681354495e-06, + "logits/chosen": 590333098.6666666, + "logits/rejected": 295380633.6, + "logps/chosen": -323.3566080729167, + "logps/rejected": -268.118359375, + "loss": 0.0773, + "rewards/chosen": 3.0645176569620767, + "rewards/margins": 9.484611956278483, + "rewards/rejected": -6.420094299316406, + "step": 7300 + }, + { + "epoch": 0.6670625856555504, + "grad_norm": 48.25, + "kl": 0.0, + "learning_rate": 2.5041517184743046e-06, + "logits/chosen": 463784064.0, + "logits/rejected": 468236851.2, + "logps/chosen": -282.16530354817706, + "logps/rejected": -270.24033203125, + "loss": 0.1312, + "rewards/chosen": 3.875149408976237, + "rewards/margins": 9.248082224527995, + "rewards/rejected": -5.372932815551758, + "step": 7301 + }, + { + "epoch": 0.6671539515760622, + "grad_norm": 2.140625, + "kl": 0.0, + "learning_rate": 2.502905961977645e-06, + "logits/chosen": 492050739.2, + "logits/rejected": 699529173.3333334, + "logps/chosen": -309.790625, + "logps/rejected": -705.047607421875, + "loss": 0.0139, + "rewards/chosen": 4.184262084960937, + "rewards/margins": 15.38557637532552, + "rewards/rejected": -11.201314290364584, + "step": 7302 + }, + { + "epoch": 0.6672453174965738, + "grad_norm": 29.375, + "kl": 0.0, + "learning_rate": 2.5016604119675257e-06, + "logits/chosen": 544378282.6666666, + "logits/rejected": 641065574.4, + "logps/chosen": -436.1274820963542, + "logps/rejected": -348.558203125, + "loss": 0.059, + "rewards/chosen": 4.166343053181966, + "rewards/margins": 10.764811833699543, + "rewards/rejected": -6.598468780517578, + "step": 7303 + }, + { + "epoch": 0.6673366834170854, + "grad_norm": 5.25, + "kl": 0.0, + "learning_rate": 2.5004150685469434e-06, + "logits/chosen": 655528064.0, + "logits/rejected": 379639104.0, + "logps/chosen": -215.14743041992188, + "logps/rejected": -333.6553955078125, + "loss": 0.0272, + "rewards/chosen": 3.400587320327759, + "rewards/margins": 11.649011850357056, + "rewards/rejected": -8.248424530029297, + "step": 7304 + }, + { + "epoch": 0.667428049337597, + "grad_norm": 1.078125, + "kl": 0.0, + "learning_rate": 2.4991699318188755e-06, + "logits/chosen": 847939712.0, + "logits/rejected": 506654829.71428573, + "logps/chosen": -294.5871276855469, + "logps/rejected": -460.09068080357144, + "loss": 0.0039, + "rewards/chosen": 3.4787538051605225, + "rewards/margins": 14.698965379170009, + "rewards/rejected": -11.220211574009486, + "step": 7305 + }, + { + "epoch": 0.6675194152581088, + "grad_norm": 3.6875, + "kl": 0.0, + "learning_rate": 2.4979250018862817e-06, + "logits/chosen": 612394349.7142857, + "logits/rejected": 516418976.0, + "logps/chosen": -200.55768694196428, + "logps/rejected": -276.5784912109375, + "loss": 0.0301, + "rewards/chosen": 4.227348600115095, + "rewards/margins": 12.332524572099958, + "rewards/rejected": -8.105175971984863, + "step": 7306 + }, + { + "epoch": 0.6676107811786204, + "grad_norm": 3.34375, + "kl": 0.0, + "learning_rate": 2.49668027885211e-06, + "logits/chosen": 587722547.2, + "logits/rejected": 590233941.3333334, + "logps/chosen": -205.5099365234375, + "logps/rejected": -411.5235188802083, + "loss": 0.0501, + "rewards/chosen": 2.7098415374755858, + "rewards/margins": 10.754380671183267, + "rewards/rejected": -8.044539133707682, + "step": 7307 + }, + { + "epoch": 0.667702147099132, + "grad_norm": 2.53125, + "kl": 0.0, + "learning_rate": 2.4954357628192856e-06, + "logits/chosen": 516419584.0, + "logits/rejected": 339222912.0, + "logps/chosen": -274.0030029296875, + "logps/rejected": -568.6952718098959, + "loss": 0.0204, + "rewards/chosen": 3.9176021575927735, + "rewards/margins": 15.96134910583496, + "rewards/rejected": -12.043746948242188, + "step": 7308 + }, + { + "epoch": 0.6677935130196436, + "grad_norm": 20.75, + "kl": 0.0, + "learning_rate": 2.4941914538907174e-06, + "logits/chosen": 255191658.66666666, + "logits/rejected": 559911219.2, + "logps/chosen": -130.78645833333334, + "logps/rejected": -553.3466796875, + "loss": 0.1238, + "rewards/chosen": 1.64326016108195, + "rewards/margins": 10.84369961420695, + "rewards/rejected": -9.200439453125, + "step": 7309 + }, + { + "epoch": 0.6678848789401554, + "grad_norm": 3.15625, + "kl": 0.0, + "learning_rate": 2.4929473521692983e-06, + "logits/chosen": 663465984.0, + "logits/rejected": 614824832.0, + "logps/chosen": -234.275390625, + "logps/rejected": -582.5178833007812, + "loss": 0.0138, + "rewards/chosen": 4.105710983276367, + "rewards/margins": 14.849650382995605, + "rewards/rejected": -10.743939399719238, + "step": 7310 + }, + { + "epoch": 0.667976244860667, + "grad_norm": 2.5625, + "kl": 0.0, + "learning_rate": 2.491703457757906e-06, + "logits/chosen": 406313557.3333333, + "logits/rejected": 465472358.4, + "logps/chosen": -387.0330810546875, + "logps/rejected": -416.308837890625, + "loss": 0.0179, + "rewards/chosen": 3.9855626424153647, + "rewards/margins": 13.554041035970053, + "rewards/rejected": -9.568478393554688, + "step": 7311 + }, + { + "epoch": 0.6680676107811786, + "grad_norm": 0.4140625, + "kl": 0.0, + "learning_rate": 2.490459770759398e-06, + "logits/chosen": 367805504.0, + "logits/rejected": 818234624.0, + "logps/chosen": -125.30551147460938, + "logps/rejected": -373.9750162760417, + "loss": 0.0021, + "rewards/chosen": 4.872855186462402, + "rewards/margins": 14.532158215840658, + "rewards/rejected": -9.659303029378256, + "step": 7312 + }, + { + "epoch": 0.6681589767016902, + "grad_norm": 1.90625, + "kl": 0.0, + "learning_rate": 2.4892162912766153e-06, + "logits/chosen": 553377728.0, + "logits/rejected": 702059072.0, + "logps/chosen": -288.6654968261719, + "logps/rejected": -367.3536376953125, + "loss": 0.1321, + "rewards/chosen": 1.949291467666626, + "rewards/margins": 10.090553522109985, + "rewards/rejected": -8.14126205444336, + "step": 7313 + }, + { + "epoch": 0.668250342622202, + "grad_norm": 2.390625, + "kl": 0.0, + "learning_rate": 2.487973019412381e-06, + "logits/chosen": 622705817.6, + "logits/rejected": 666184960.0, + "logps/chosen": -463.665673828125, + "logps/rejected": -399.6901041666667, + "loss": 0.0161, + "rewards/chosen": 3.6995777130126952, + "rewards/margins": 12.97316017150879, + "rewards/rejected": -9.273582458496094, + "step": 7314 + }, + { + "epoch": 0.6683417085427136, + "grad_norm": 1.5, + "kl": 0.0, + "learning_rate": 2.4867299552695057e-06, + "logits/chosen": 392812714.6666667, + "logits/rejected": 378137472.0, + "logps/chosen": -317.7039794921875, + "logps/rejected": -386.4769287109375, + "loss": 0.0054, + "rewards/chosen": 5.129907290140788, + "rewards/margins": 14.220809237162271, + "rewards/rejected": -9.090901947021484, + "step": 7315 + }, + { + "epoch": 0.6684330744632252, + "grad_norm": 2.171875, + "kl": 0.0, + "learning_rate": 2.485487098950776e-06, + "logits/chosen": 445479840.0, + "logits/rejected": 392341632.0, + "logps/chosen": -283.4283447265625, + "logps/rejected": -476.8397521972656, + "loss": 0.0136, + "rewards/chosen": 3.825756549835205, + "rewards/margins": 12.513282299041748, + "rewards/rejected": -8.687525749206543, + "step": 7316 + }, + { + "epoch": 0.6685244403837368, + "grad_norm": 5.59375, + "kl": 0.0, + "learning_rate": 2.4842444505589667e-06, + "logits/chosen": 472356224.0, + "logits/rejected": 653666560.0, + "logps/chosen": -311.87890625, + "logps/rejected": -469.7533264160156, + "loss": 0.1045, + "rewards/chosen": 3.6859853267669678, + "rewards/margins": 10.993725538253784, + "rewards/rejected": -7.307740211486816, + "step": 7317 + }, + { + "epoch": 0.6686158063042485, + "grad_norm": 21.25, + "kl": 0.0, + "learning_rate": 2.4830020101968323e-06, + "logits/chosen": 566686912.0, + "logits/rejected": 354393568.0, + "logps/chosen": -436.16058349609375, + "logps/rejected": -558.4283447265625, + "loss": 0.0339, + "rewards/chosen": 2.970963478088379, + "rewards/margins": 10.71478796005249, + "rewards/rejected": -7.743824481964111, + "step": 7318 + }, + { + "epoch": 0.6687071722247602, + "grad_norm": 5.03125, + "kl": 0.0, + "learning_rate": 2.481759777967111e-06, + "logits/chosen": 417907648.0, + "logits/rejected": 455606656.0, + "logps/chosen": -400.2023010253906, + "logps/rejected": -376.8739318847656, + "loss": 0.0307, + "rewards/chosen": 3.0309572219848633, + "rewards/margins": 11.185853004455566, + "rewards/rejected": -8.154895782470703, + "step": 7319 + }, + { + "epoch": 0.6687985381452718, + "grad_norm": 2.15625, + "kl": 0.0, + "learning_rate": 2.480517753972524e-06, + "logits/chosen": 655521664.0, + "logits/rejected": 603129301.3333334, + "logps/chosen": -436.15887451171875, + "logps/rejected": -359.4507649739583, + "loss": 0.1124, + "rewards/chosen": 5.108392715454102, + "rewards/margins": 11.96844482421875, + "rewards/rejected": -6.860052108764648, + "step": 7320 + }, + { + "epoch": 0.6688899040657834, + "grad_norm": 5.59375, + "kl": 0.0, + "learning_rate": 2.479275938315775e-06, + "logits/chosen": 464512768.0, + "logits/rejected": 598441728.0, + "logps/chosen": -287.6021728515625, + "logps/rejected": -527.0133056640625, + "loss": 0.027, + "rewards/chosen": 2.938960313796997, + "rewards/margins": 11.04833436012268, + "rewards/rejected": -8.109374046325684, + "step": 7321 + }, + { + "epoch": 0.6689812699862951, + "grad_norm": 0.70703125, + "kl": 0.0, + "learning_rate": 2.478034331099549e-06, + "logits/chosen": 813145600.0, + "logits/rejected": 505077248.0, + "logps/chosen": -315.1408284505208, + "logps/rejected": -526.5916015625, + "loss": 0.0037, + "rewards/chosen": 4.6946461995442705, + "rewards/margins": 15.897597249348959, + "rewards/rejected": -11.202951049804687, + "step": 7322 + }, + { + "epoch": 0.6690726359068068, + "grad_norm": 4.1875, + "kl": 0.0, + "learning_rate": 2.47679293242652e-06, + "logits/chosen": 769085568.0, + "logits/rejected": 387079168.0, + "logps/chosen": -222.78802490234375, + "logps/rejected": -191.2195281982422, + "loss": 0.0214, + "rewards/chosen": 4.494297504425049, + "rewards/margins": 9.887518405914307, + "rewards/rejected": -5.393220901489258, + "step": 7323 + }, + { + "epoch": 0.6691640018273184, + "grad_norm": 2.4375, + "kl": 0.0, + "learning_rate": 2.4755517423993364e-06, + "logits/chosen": 857440320.0, + "logits/rejected": 602429696.0, + "logps/chosen": -448.7885437011719, + "logps/rejected": -388.9725341796875, + "loss": 0.0155, + "rewards/chosen": 3.503305435180664, + "rewards/margins": 11.568490028381348, + "rewards/rejected": -8.065184593200684, + "step": 7324 + }, + { + "epoch": 0.66925536774783, + "grad_norm": 16.5, + "kl": 0.0, + "learning_rate": 2.4743107611206346e-06, + "logits/chosen": 743220394.6666666, + "logits/rejected": 888503936.0, + "logps/chosen": -388.2694498697917, + "logps/rejected": -614.98779296875, + "loss": 0.0186, + "rewards/chosen": 4.240441004435222, + "rewards/margins": 14.496111551920574, + "rewards/rejected": -10.255670547485352, + "step": 7325 + }, + { + "epoch": 0.6693467336683417, + "grad_norm": 32.25, + "kl": 0.0, + "learning_rate": 2.4730699886930323e-06, + "logits/chosen": 533006378.6666667, + "logits/rejected": 434954912.0, + "logps/chosen": -250.805908203125, + "logps/rejected": -482.4111328125, + "loss": 0.0938, + "rewards/chosen": 2.4236197471618652, + "rewards/margins": 10.425811290740967, + "rewards/rejected": -8.002191543579102, + "step": 7326 + }, + { + "epoch": 0.6694380995888534, + "grad_norm": 1.953125, + "kl": 0.0, + "learning_rate": 2.4718294252191267e-06, + "logits/chosen": 549982080.0, + "logits/rejected": 314645913.6, + "logps/chosen": -418.09765625, + "logps/rejected": -436.373193359375, + "loss": 0.0125, + "rewards/chosen": 3.9176597595214844, + "rewards/margins": 12.749444580078125, + "rewards/rejected": -8.83178482055664, + "step": 7327 + }, + { + "epoch": 0.669529465509365, + "grad_norm": 21.75, + "kl": 0.0, + "learning_rate": 2.470589070801506e-06, + "logits/chosen": 401853632.0, + "logits/rejected": 422306304.0, + "logps/chosen": -296.6410217285156, + "logps/rejected": -340.467041015625, + "loss": 0.0349, + "rewards/chosen": 3.3062267303466797, + "rewards/margins": 9.30125904083252, + "rewards/rejected": -5.99503231048584, + "step": 7328 + }, + { + "epoch": 0.6696208314298766, + "grad_norm": 1.7265625, + "kl": 0.0, + "learning_rate": 2.469348925542733e-06, + "logits/chosen": 663529920.0, + "logits/rejected": 375711488.0, + "logps/chosen": -556.1524658203125, + "logps/rejected": -436.14320591517856, + "loss": 0.0056, + "rewards/chosen": 3.0824952125549316, + "rewards/margins": 12.662978785378593, + "rewards/rejected": -9.580483572823661, + "step": 7329 + }, + { + "epoch": 0.6697121973503883, + "grad_norm": 5.75, + "kl": 0.0, + "learning_rate": 2.4681089895453575e-06, + "logits/chosen": 585649920.0, + "logits/rejected": 894876032.0, + "logps/chosen": -348.8363342285156, + "logps/rejected": -725.2007446289062, + "loss": 0.0382, + "rewards/chosen": 2.5805938243865967, + "rewards/margins": 11.892609357833862, + "rewards/rejected": -9.312015533447266, + "step": 7330 + }, + { + "epoch": 0.6698035632709, + "grad_norm": 3.65625, + "kl": 0.0, + "learning_rate": 2.4668692629119086e-06, + "logits/chosen": 1713981184.0, + "logits/rejected": 618690816.0, + "logps/chosen": -366.9632568359375, + "logps/rejected": -474.395751953125, + "loss": 0.0116, + "rewards/chosen": 3.661285400390625, + "rewards/margins": 11.392302831013996, + "rewards/rejected": -7.731017430623372, + "step": 7331 + }, + { + "epoch": 0.6698949291914116, + "grad_norm": 4.78125, + "kl": 0.0, + "learning_rate": 2.4656297457449023e-06, + "logits/chosen": 429470054.4, + "logits/rejected": 569746773.3333334, + "logps/chosen": -380.4431396484375, + "logps/rejected": -613.598388671875, + "loss": 0.0319, + "rewards/chosen": 3.2556434631347657, + "rewards/margins": 12.68050537109375, + "rewards/rejected": -9.424861907958984, + "step": 7332 + }, + { + "epoch": 0.6699862951119232, + "grad_norm": 2.1875, + "kl": 0.0, + "learning_rate": 2.464390438146835e-06, + "logits/chosen": 535706016.0, + "logits/rejected": 463323968.0, + "logps/chosen": -527.34765625, + "logps/rejected": -596.1073608398438, + "loss": 0.0117, + "rewards/chosen": 3.8535447120666504, + "rewards/margins": 14.386253833770752, + "rewards/rejected": -10.532709121704102, + "step": 7333 + }, + { + "epoch": 0.6700776610324349, + "grad_norm": 3.9375, + "kl": 0.0, + "learning_rate": 2.4631513402201863e-06, + "logits/chosen": 544586688.0, + "logits/rejected": 251873440.0, + "logps/chosen": -391.55108642578125, + "logps/rejected": -247.3106689453125, + "loss": 0.0207, + "rewards/chosen": 3.331118106842041, + "rewards/margins": 11.808790683746338, + "rewards/rejected": -8.477672576904297, + "step": 7334 + }, + { + "epoch": 0.6701690269529466, + "grad_norm": 3.90625, + "kl": 0.0, + "learning_rate": 2.461912452067415e-06, + "logits/chosen": 558116352.0, + "logits/rejected": 389409152.0, + "logps/chosen": -181.2740966796875, + "logps/rejected": -538.138916015625, + "loss": 0.034, + "rewards/chosen": 3.0741764068603517, + "rewards/margins": 13.796016311645507, + "rewards/rejected": -10.721839904785156, + "step": 7335 + }, + { + "epoch": 0.6702603928734582, + "grad_norm": 1.5390625, + "kl": 0.0, + "learning_rate": 2.4606737737909696e-06, + "logits/chosen": 975585450.6666666, + "logits/rejected": 624621977.6, + "logps/chosen": -205.4141845703125, + "logps/rejected": -489.69033203125, + "loss": 0.0064, + "rewards/chosen": 4.648995717366536, + "rewards/margins": 13.119532521565755, + "rewards/rejected": -8.470536804199218, + "step": 7336 + }, + { + "epoch": 0.6703517587939698, + "grad_norm": 1.8515625, + "kl": 0.0, + "learning_rate": 2.459435305493276e-06, + "logits/chosen": 888562496.0, + "logits/rejected": 569078720.0, + "logps/chosen": -373.24261474609375, + "logps/rejected": -598.3795166015625, + "loss": 0.012, + "rewards/chosen": 3.830392599105835, + "rewards/margins": 14.15418028831482, + "rewards/rejected": -10.323787689208984, + "step": 7337 + }, + { + "epoch": 0.6704431247144815, + "grad_norm": 2.3125, + "kl": 0.0, + "learning_rate": 2.4581970472767443e-06, + "logits/chosen": 491929292.8, + "logits/rejected": 520786346.6666667, + "logps/chosen": -350.7049560546875, + "logps/rejected": -506.7234700520833, + "loss": 0.0175, + "rewards/chosen": 3.7210762023925783, + "rewards/margins": 12.983351262410483, + "rewards/rejected": -9.262275060017904, + "step": 7338 + }, + { + "epoch": 0.6705344906349932, + "grad_norm": 2.5625, + "kl": 0.0, + "learning_rate": 2.4569589992437655e-06, + "logits/chosen": 565923328.0, + "logits/rejected": 866295232.0, + "logps/chosen": -290.82373046875, + "logps/rejected": -550.888916015625, + "loss": 0.0197, + "rewards/chosen": 3.6989684104919434, + "rewards/margins": 13.63452959060669, + "rewards/rejected": -9.935561180114746, + "step": 7339 + }, + { + "epoch": 0.6706258565555048, + "grad_norm": 4.125, + "kl": 0.0, + "learning_rate": 2.455721161496714e-06, + "logits/chosen": 553089638.4, + "logits/rejected": 729191082.6666666, + "logps/chosen": -319.855712890625, + "logps/rejected": -502.01171875, + "loss": 0.0237, + "rewards/chosen": 3.759917449951172, + "rewards/margins": 13.707624944051108, + "rewards/rejected": -9.947707494099935, + "step": 7340 + }, + { + "epoch": 0.6707172224760164, + "grad_norm": 2.296875, + "kl": 0.0, + "learning_rate": 2.4544835341379513e-06, + "logits/chosen": 509872128.0, + "logits/rejected": 466182656.0, + "logps/chosen": -389.5501403808594, + "logps/rejected": -525.1731567382812, + "loss": 0.0149, + "rewards/chosen": 3.98984956741333, + "rewards/margins": 13.075367450714111, + "rewards/rejected": -9.085517883300781, + "step": 7341 + }, + { + "epoch": 0.6708085883965281, + "grad_norm": 1.2578125, + "kl": 0.0, + "learning_rate": 2.453246117269816e-06, + "logits/chosen": 460707157.3333333, + "logits/rejected": 478164633.6, + "logps/chosen": -264.67917887369794, + "logps/rejected": -473.4333984375, + "loss": 0.0053, + "rewards/chosen": 4.862593650817871, + "rewards/margins": 13.56680965423584, + "rewards/rejected": -8.704216003417969, + "step": 7342 + }, + { + "epoch": 0.6708999543170397, + "grad_norm": 2.921875, + "kl": 0.0, + "learning_rate": 2.45200891099463e-06, + "logits/chosen": 756126054.4, + "logits/rejected": 859253162.6666666, + "logps/chosen": -260.498046875, + "logps/rejected": -596.7249348958334, + "loss": 0.0179, + "rewards/chosen": 4.007875061035156, + "rewards/margins": 14.999319966634115, + "rewards/rejected": -10.991444905598959, + "step": 7343 + }, + { + "epoch": 0.6709913202375514, + "grad_norm": 5.8125, + "kl": 0.0, + "learning_rate": 2.4507719154146974e-06, + "logits/chosen": 488940134.4, + "logits/rejected": 533342677.3333333, + "logps/chosen": -373.2743896484375, + "logps/rejected": -300.6170654296875, + "loss": 0.0413, + "rewards/chosen": 3.2332244873046876, + "rewards/margins": 10.712127304077148, + "rewards/rejected": -7.478902816772461, + "step": 7344 + }, + { + "epoch": 0.671082686158063, + "grad_norm": 1.5078125, + "kl": 0.0, + "learning_rate": 2.4495351306323124e-06, + "logits/chosen": 928147370.6666666, + "logits/rejected": 781262272.0, + "logps/chosen": -371.5877278645833, + "logps/rejected": -300.68939208984375, + "loss": 0.0088, + "rewards/chosen": 4.819513003031413, + "rewards/margins": 11.546661535898846, + "rewards/rejected": -6.727148532867432, + "step": 7345 + }, + { + "epoch": 0.6711740520785747, + "grad_norm": 1.890625, + "kl": 0.0, + "learning_rate": 2.4482985567497395e-06, + "logits/chosen": 402062122.6666667, + "logits/rejected": 406314086.4, + "logps/chosen": -305.47035725911456, + "logps/rejected": -367.60302734375, + "loss": 0.0088, + "rewards/chosen": 4.324112892150879, + "rewards/margins": 13.74296817779541, + "rewards/rejected": -9.418855285644531, + "step": 7346 + }, + { + "epoch": 0.6712654179990863, + "grad_norm": 1.2109375, + "kl": 0.0, + "learning_rate": 2.447062193869234e-06, + "logits/chosen": 408900288.0, + "logits/rejected": 456949184.0, + "logps/chosen": -322.0719299316406, + "logps/rejected": -492.4233093261719, + "loss": 0.0081, + "rewards/chosen": 4.794483184814453, + "rewards/margins": 14.273244857788086, + "rewards/rejected": -9.478761672973633, + "step": 7347 + }, + { + "epoch": 0.671356783919598, + "grad_norm": 4.0, + "kl": 0.0, + "learning_rate": 2.44582604209303e-06, + "logits/chosen": 724766822.4, + "logits/rejected": 528520405.3333333, + "logps/chosen": -273.4933349609375, + "logps/rejected": -573.9603678385416, + "loss": 0.0418, + "rewards/chosen": 2.7549747467041015, + "rewards/margins": 11.744198226928711, + "rewards/rejected": -8.98922348022461, + "step": 7348 + }, + { + "epoch": 0.6714481498401096, + "grad_norm": 6.375, + "kl": 0.0, + "learning_rate": 2.4445901015233485e-06, + "logits/chosen": 742512179.2, + "logits/rejected": 1239292160.0, + "logps/chosen": -311.7185546875, + "logps/rejected": -418.460693359375, + "loss": 0.0438, + "rewards/chosen": 3.261151123046875, + "rewards/margins": 11.71300900777181, + "rewards/rejected": -8.451857884724935, + "step": 7349 + }, + { + "epoch": 0.6715395157606213, + "grad_norm": 4.21875, + "kl": 0.0, + "learning_rate": 2.443354372262389e-06, + "logits/chosen": 814067660.8, + "logits/rejected": 600857898.6666666, + "logps/chosen": -534.410205078125, + "logps/rejected": -616.644287109375, + "loss": 0.0248, + "rewards/chosen": 3.614402008056641, + "rewards/margins": 13.739354960123698, + "rewards/rejected": -10.124952952067057, + "step": 7350 + }, + { + "epoch": 0.6716308816811329, + "grad_norm": 2.375, + "kl": 0.0, + "learning_rate": 2.4421188544123364e-06, + "logits/chosen": 474060992.0, + "logits/rejected": 467662848.0, + "logps/chosen": -279.3492736816406, + "logps/rejected": -544.5017700195312, + "loss": 0.0194, + "rewards/chosen": 3.339677333831787, + "rewards/margins": 13.215854167938232, + "rewards/rejected": -9.876176834106445, + "step": 7351 + }, + { + "epoch": 0.6717222476016446, + "grad_norm": 3.1875, + "kl": 0.0, + "learning_rate": 2.4408835480753523e-06, + "logits/chosen": 562113984.0, + "logits/rejected": 395596512.0, + "logps/chosen": -296.09423828125, + "logps/rejected": -354.278076171875, + "loss": 0.0148, + "rewards/chosen": 3.9879016876220703, + "rewards/margins": 12.377923011779785, + "rewards/rejected": -8.390021324157715, + "step": 7352 + }, + { + "epoch": 0.6718136135221562, + "grad_norm": 1.7109375, + "kl": 0.0, + "learning_rate": 2.4396484533535903e-06, + "logits/chosen": 310304160.0, + "logits/rejected": 355257045.3333333, + "logps/chosen": -348.2554931640625, + "logps/rejected": -425.5509440104167, + "loss": 0.0066, + "rewards/chosen": 4.558069229125977, + "rewards/margins": 13.23223622639974, + "rewards/rejected": -8.674166997273764, + "step": 7353 + }, + { + "epoch": 0.6719049794426679, + "grad_norm": 1.0390625, + "kl": 0.0, + "learning_rate": 2.438413570349179e-06, + "logits/chosen": 492792012.8, + "logits/rejected": 406789504.0, + "logps/chosen": -330.382373046875, + "logps/rejected": -438.2923990885417, + "loss": 0.1053, + "rewards/chosen": 4.691763305664063, + "rewards/margins": 9.729096158345541, + "rewards/rejected": -5.037332852681478, + "step": 7354 + }, + { + "epoch": 0.6719963453631795, + "grad_norm": 4.78125, + "kl": 0.0, + "learning_rate": 2.437178899164232e-06, + "logits/chosen": 883659605.3333334, + "logits/rejected": 590990694.4, + "logps/chosen": -379.0123291015625, + "logps/rejected": -287.88486328125, + "loss": 0.0256, + "rewards/chosen": 2.6744276682535806, + "rewards/margins": 10.11104113260905, + "rewards/rejected": -7.436613464355469, + "step": 7355 + }, + { + "epoch": 0.6720877112836912, + "grad_norm": 1.796875, + "kl": 0.0, + "learning_rate": 2.4359444399008426e-06, + "logits/chosen": 1286455424.0, + "logits/rejected": 688616576.0, + "logps/chosen": -381.2737121582031, + "logps/rejected": -624.494140625, + "loss": 0.0102, + "rewards/chosen": 4.157339572906494, + "rewards/margins": 14.72988748550415, + "rewards/rejected": -10.572547912597656, + "step": 7356 + }, + { + "epoch": 0.6721790772042028, + "grad_norm": 1.5546875, + "kl": 0.0, + "learning_rate": 2.434710192661094e-06, + "logits/chosen": 604279040.0, + "logits/rejected": 618193766.4, + "logps/chosen": -445.0254313151042, + "logps/rejected": -379.78876953125, + "loss": 0.0078, + "rewards/chosen": 4.17983881632487, + "rewards/margins": 13.329052225748697, + "rewards/rejected": -9.149213409423828, + "step": 7357 + }, + { + "epoch": 0.6722704431247145, + "grad_norm": 2.015625, + "kl": 0.0, + "learning_rate": 2.433476157547044e-06, + "logits/chosen": 597517653.3333334, + "logits/rejected": 888803020.8, + "logps/chosen": -267.74595133463544, + "logps/rejected": -563.401708984375, + "loss": 0.0145, + "rewards/chosen": 3.3583971659342446, + "rewards/margins": 11.537718454996744, + "rewards/rejected": -8.1793212890625, + "step": 7358 + }, + { + "epoch": 0.6723618090452261, + "grad_norm": 2.34375, + "kl": 0.0, + "learning_rate": 2.4322423346607368e-06, + "logits/chosen": 1092544000.0, + "logits/rejected": 625200704.0, + "logps/chosen": -471.9610290527344, + "logps/rejected": -649.8568725585938, + "loss": 0.0151, + "rewards/chosen": 3.7372069358825684, + "rewards/margins": 13.430287837982178, + "rewards/rejected": -9.69308090209961, + "step": 7359 + }, + { + "epoch": 0.6724531749657378, + "grad_norm": 3.625, + "kl": 0.0, + "learning_rate": 2.4310087241041973e-06, + "logits/chosen": 444689792.0, + "logits/rejected": 1160547072.0, + "logps/chosen": -398.3536783854167, + "logps/rejected": -399.2433166503906, + "loss": 0.0216, + "rewards/chosen": 4.047927538553874, + "rewards/margins": 13.212340990702312, + "rewards/rejected": -9.164413452148438, + "step": 7360 + }, + { + "epoch": 0.6725445408862494, + "grad_norm": 2.328125, + "kl": 0.0, + "learning_rate": 2.429775325979432e-06, + "logits/chosen": 1073424896.0, + "logits/rejected": 1130155008.0, + "logps/chosen": -511.853515625, + "logps/rejected": -529.972216796875, + "loss": 0.0105, + "rewards/chosen": 3.722745895385742, + "rewards/margins": 14.20942497253418, + "rewards/rejected": -10.486679077148438, + "step": 7361 + }, + { + "epoch": 0.6726359068067611, + "grad_norm": 70.0, + "kl": 0.0, + "learning_rate": 2.4285421403884363e-06, + "logits/chosen": 395014592.0, + "logits/rejected": 295902080.0, + "logps/chosen": -238.06727600097656, + "logps/rejected": -348.4783935546875, + "loss": 0.088, + "rewards/chosen": 2.7821178436279297, + "rewards/margins": 12.96753215789795, + "rewards/rejected": -10.18541431427002, + "step": 7362 + }, + { + "epoch": 0.6727272727272727, + "grad_norm": 13.625, + "kl": 0.0, + "learning_rate": 2.42730916743318e-06, + "logits/chosen": 522707008.0, + "logits/rejected": 568093376.0, + "logps/chosen": -299.83367919921875, + "logps/rejected": -380.3895568847656, + "loss": 0.0801, + "rewards/chosen": 3.6496715545654297, + "rewards/margins": 11.06042194366455, + "rewards/rejected": -7.410750389099121, + "step": 7363 + }, + { + "epoch": 0.6728186386477844, + "grad_norm": 2.296875, + "kl": 0.0, + "learning_rate": 2.426076407215619e-06, + "logits/chosen": 813792320.0, + "logits/rejected": 620777984.0, + "logps/chosen": -357.3558044433594, + "logps/rejected": -431.2301330566406, + "loss": 0.0115, + "rewards/chosen": 4.597728729248047, + "rewards/margins": 13.950029373168945, + "rewards/rejected": -9.352300643920898, + "step": 7364 + }, + { + "epoch": 0.672910004568296, + "grad_norm": 42.25, + "kl": 0.0, + "learning_rate": 2.4248438598376893e-06, + "logits/chosen": 752283050.6666666, + "logits/rejected": 545148825.6, + "logps/chosen": -240.5206298828125, + "logps/rejected": -535.7634765625, + "loss": 0.196, + "rewards/chosen": 0.6041763623555502, + "rewards/margins": 10.710112031300863, + "rewards/rejected": -10.105935668945312, + "step": 7365 + }, + { + "epoch": 0.6730013704888077, + "grad_norm": 2.109375, + "kl": 0.0, + "learning_rate": 2.4236115254013155e-06, + "logits/chosen": 499933593.6, + "logits/rejected": 1132893696.0, + "logps/chosen": -350.4970703125, + "logps/rejected": -643.161376953125, + "loss": 0.0115, + "rewards/chosen": 4.272372436523438, + "rewards/margins": 14.740626525878906, + "rewards/rejected": -10.468254089355469, + "step": 7366 + }, + { + "epoch": 0.6730927364093193, + "grad_norm": 27.75, + "kl": 0.0, + "learning_rate": 2.422379404008397e-06, + "logits/chosen": 495397088.0, + "logits/rejected": 270301088.0, + "logps/chosen": -297.7958984375, + "logps/rejected": -390.53802490234375, + "loss": 0.0281, + "rewards/chosen": 3.7357726097106934, + "rewards/margins": 11.767662525177002, + "rewards/rejected": -8.031889915466309, + "step": 7367 + }, + { + "epoch": 0.673184102329831, + "grad_norm": 0.6796875, + "kl": 0.0, + "learning_rate": 2.42114749576082e-06, + "logits/chosen": 256864640.0, + "logits/rejected": 460889024.0, + "logps/chosen": -192.31857299804688, + "logps/rejected": -575.1268310546875, + "loss": 0.0041, + "rewards/chosen": 5.038612365722656, + "rewards/margins": 15.143449783325195, + "rewards/rejected": -10.104837417602539, + "step": 7368 + }, + { + "epoch": 0.6732754682503426, + "grad_norm": 0.52734375, + "kl": 0.0, + "learning_rate": 2.419915800760451e-06, + "logits/chosen": 461730730.6666667, + "logits/rejected": 1123230617.6, + "logps/chosen": -300.8448893229167, + "logps/rejected": -639.093505859375, + "loss": 0.0026, + "rewards/chosen": 5.156510353088379, + "rewards/margins": 14.656547737121581, + "rewards/rejected": -9.500037384033202, + "step": 7369 + }, + { + "epoch": 0.6733668341708543, + "grad_norm": 59.25, + "kl": 0.0, + "learning_rate": 2.418684319109139e-06, + "logits/chosen": 450334617.6, + "logits/rejected": 552948778.6666666, + "logps/chosen": -245.0166015625, + "logps/rejected": -569.5476888020834, + "loss": 0.041, + "rewards/chosen": 4.069217681884766, + "rewards/margins": 15.303243001302084, + "rewards/rejected": -11.234025319417318, + "step": 7370 + }, + { + "epoch": 0.6734582000913659, + "grad_norm": 0.73046875, + "kl": 0.0, + "learning_rate": 2.4174530509087193e-06, + "logits/chosen": 680357376.0, + "logits/rejected": 581896917.3333334, + "logps/chosen": -447.6170349121094, + "logps/rejected": -612.1056722005209, + "loss": 0.0029, + "rewards/chosen": 4.647161960601807, + "rewards/margins": 15.880865573883057, + "rewards/rejected": -11.23370361328125, + "step": 7371 + }, + { + "epoch": 0.6735495660118775, + "grad_norm": 1.90625, + "kl": 0.0, + "learning_rate": 2.416221996261006e-06, + "logits/chosen": 749553766.4, + "logits/rejected": 453655850.6666667, + "logps/chosen": -335.7466552734375, + "logps/rejected": -399.2928059895833, + "loss": 0.0127, + "rewards/chosen": 3.914209747314453, + "rewards/margins": 14.04876937866211, + "rewards/rejected": -10.134559631347656, + "step": 7372 + }, + { + "epoch": 0.6736409319323892, + "grad_norm": 2.609375, + "kl": 0.0, + "learning_rate": 2.4149911552677907e-06, + "logits/chosen": 547083349.3333334, + "logits/rejected": 473041280.0, + "logps/chosen": -384.8140462239583, + "logps/rejected": -471.49932861328125, + "loss": 0.0147, + "rewards/chosen": 4.52052370707194, + "rewards/margins": 11.848679224650066, + "rewards/rejected": -7.328155517578125, + "step": 7373 + }, + { + "epoch": 0.6737322978529009, + "grad_norm": 1.984375, + "kl": 0.0, + "learning_rate": 2.4137605280308583e-06, + "logits/chosen": 435522176.0, + "logits/rejected": 423347114.6666667, + "logps/chosen": -192.38833618164062, + "logps/rejected": -350.1875, + "loss": 0.012, + "rewards/chosen": 3.5991744995117188, + "rewards/margins": 13.27146848042806, + "rewards/rejected": -9.672293980916342, + "step": 7374 + }, + { + "epoch": 0.6738236637734125, + "grad_norm": 5.03125, + "kl": 0.0, + "learning_rate": 2.412530114651968e-06, + "logits/chosen": 463742464.0, + "logits/rejected": 565688448.0, + "logps/chosen": -294.823876953125, + "logps/rejected": -604.2856852213541, + "loss": 0.0237, + "rewards/chosen": 3.8419166564941407, + "rewards/margins": 12.172657267252603, + "rewards/rejected": -8.330740610758463, + "step": 7375 + }, + { + "epoch": 0.6739150296939241, + "grad_norm": 2.109375, + "kl": 0.0, + "learning_rate": 2.4112999152328637e-06, + "logits/chosen": 579528857.6, + "logits/rejected": 484539520.0, + "logps/chosen": -307.2866455078125, + "logps/rejected": -452.8251953125, + "loss": 0.0162, + "rewards/chosen": 4.006691741943359, + "rewards/margins": 10.868665186564128, + "rewards/rejected": -6.8619734446207685, + "step": 7376 + }, + { + "epoch": 0.6740063956144358, + "grad_norm": 4.09375, + "kl": 0.0, + "learning_rate": 2.41006992987527e-06, + "logits/chosen": 868878528.0, + "logits/rejected": 456693952.0, + "logps/chosen": -299.51611328125, + "logps/rejected": -447.6959228515625, + "loss": 0.0227, + "rewards/chosen": 3.1460723876953125, + "rewards/margins": 12.643705368041992, + "rewards/rejected": -9.49763298034668, + "step": 7377 + }, + { + "epoch": 0.6740977615349475, + "grad_norm": 2.625, + "kl": 0.0, + "learning_rate": 2.4088401586808983e-06, + "logits/chosen": 701942988.8, + "logits/rejected": 446570240.0, + "logps/chosen": -344.4551025390625, + "logps/rejected": -566.3269449869791, + "loss": 0.0182, + "rewards/chosen": 3.655841827392578, + "rewards/margins": 14.51709238688151, + "rewards/rejected": -10.861250559488932, + "step": 7378 + }, + { + "epoch": 0.6741891274554591, + "grad_norm": 6.71875, + "kl": 0.0, + "learning_rate": 2.4076106017514384e-06, + "logits/chosen": 404320864.0, + "logits/rejected": 804342784.0, + "logps/chosen": -200.95797729492188, + "logps/rejected": -512.0126342773438, + "loss": 0.0222, + "rewards/chosen": 4.230718612670898, + "rewards/margins": 15.280821800231934, + "rewards/rejected": -11.050103187561035, + "step": 7379 + }, + { + "epoch": 0.6742804933759707, + "grad_norm": 1.6171875, + "kl": 0.0, + "learning_rate": 2.406381259188562e-06, + "logits/chosen": 564201984.0, + "logits/rejected": 615200972.8, + "logps/chosen": -445.3171793619792, + "logps/rejected": -508.63623046875, + "loss": 0.0081, + "rewards/chosen": 4.01686414082845, + "rewards/margins": 13.337590154012045, + "rewards/rejected": -9.320726013183593, + "step": 7380 + }, + { + "epoch": 0.6743718592964824, + "grad_norm": 4.0625, + "kl": 0.0, + "learning_rate": 2.4051521310939258e-06, + "logits/chosen": 678670464.0, + "logits/rejected": 443277824.0, + "logps/chosen": -188.2042236328125, + "logps/rejected": -544.9888916015625, + "loss": 0.1391, + "rewards/chosen": 2.7256828943888345, + "rewards/margins": 12.813031832377115, + "rewards/rejected": -10.087348937988281, + "step": 7381 + }, + { + "epoch": 0.6744632252169941, + "grad_norm": 1.046875, + "kl": 0.0, + "learning_rate": 2.4039232175691644e-06, + "logits/chosen": 1370970026.6666667, + "logits/rejected": 677632870.4, + "logps/chosen": -361.65380859375, + "logps/rejected": -583.61435546875, + "loss": 0.0061, + "rewards/chosen": 4.200247446695964, + "rewards/margins": 13.48921839396159, + "rewards/rejected": -9.288970947265625, + "step": 7382 + }, + { + "epoch": 0.6745545911375057, + "grad_norm": 0.65234375, + "kl": 0.0, + "learning_rate": 2.4026945187159018e-06, + "logits/chosen": 273520768.0, + "logits/rejected": 461409749.3333333, + "logps/chosen": -311.51983642578125, + "logps/rejected": -573.4495849609375, + "loss": 0.0025, + "rewards/chosen": 5.027041912078857, + "rewards/margins": 15.091657797495523, + "rewards/rejected": -10.064615885416666, + "step": 7383 + }, + { + "epoch": 0.6746459570580173, + "grad_norm": 1.3984375, + "kl": 0.0, + "learning_rate": 2.4014660346357376e-06, + "logits/chosen": 635106457.6, + "logits/rejected": 540186112.0, + "logps/chosen": -569.72919921875, + "logps/rejected": -285.6088053385417, + "loss": 0.0097, + "rewards/chosen": 4.260298156738282, + "rewards/margins": 12.305768076578776, + "rewards/rejected": -8.045469919840494, + "step": 7384 + }, + { + "epoch": 0.6747373229785291, + "grad_norm": 1.6953125, + "kl": 0.0, + "learning_rate": 2.4002377654302563e-06, + "logits/chosen": 626512384.0, + "logits/rejected": 768567296.0, + "logps/chosen": -353.78521728515625, + "logps/rejected": -575.2901088169643, + "loss": 0.006, + "rewards/chosen": 2.997149705886841, + "rewards/margins": 13.257194348743983, + "rewards/rejected": -10.260044642857142, + "step": 7385 + }, + { + "epoch": 0.6748286888990407, + "grad_norm": 3.328125, + "kl": 0.0, + "learning_rate": 2.399009711201023e-06, + "logits/chosen": 1029391744.0, + "logits/rejected": 959579477.3333334, + "logps/chosen": -331.1248779296875, + "logps/rejected": -603.750732421875, + "loss": 0.0167, + "rewards/chosen": 2.692915439605713, + "rewards/margins": 11.99789031346639, + "rewards/rejected": -9.304974873860678, + "step": 7386 + }, + { + "epoch": 0.6749200548195523, + "grad_norm": 4.40625, + "kl": 0.0, + "learning_rate": 2.397781872049589e-06, + "logits/chosen": 456661952.0, + "logits/rejected": 352576576.0, + "logps/chosen": -359.65716552734375, + "logps/rejected": -513.8951416015625, + "loss": 0.0339, + "rewards/chosen": 2.720425605773926, + "rewards/margins": 12.299880027770996, + "rewards/rejected": -9.57945442199707, + "step": 7387 + }, + { + "epoch": 0.6750114207400639, + "grad_norm": 41.75, + "kl": 0.0, + "learning_rate": 2.396554248077485e-06, + "logits/chosen": 538089216.0, + "logits/rejected": 564425728.0, + "logps/chosen": -406.8243713378906, + "logps/rejected": -467.9150390625, + "loss": 0.0491, + "rewards/chosen": 4.848664283752441, + "rewards/margins": 12.937929471333822, + "rewards/rejected": -8.08926518758138, + "step": 7388 + }, + { + "epoch": 0.6751027866605757, + "grad_norm": 3.171875, + "kl": 0.0, + "learning_rate": 2.3953268393862227e-06, + "logits/chosen": 344510592.0, + "logits/rejected": 627958656.0, + "logps/chosen": -194.20677185058594, + "logps/rejected": -741.7213134765625, + "loss": 0.0281, + "rewards/chosen": 2.841397762298584, + "rewards/margins": 16.472398281097412, + "rewards/rejected": -13.631000518798828, + "step": 7389 + }, + { + "epoch": 0.6751941525810873, + "grad_norm": 1.0546875, + "kl": 0.0, + "learning_rate": 2.394099646077298e-06, + "logits/chosen": 697869056.0, + "logits/rejected": 494046822.4, + "logps/chosen": -552.3588053385416, + "logps/rejected": -612.297705078125, + "loss": 0.0052, + "rewards/chosen": 4.582446416219075, + "rewards/margins": 13.237506993611653, + "rewards/rejected": -8.655060577392579, + "step": 7390 + }, + { + "epoch": 0.6752855185015989, + "grad_norm": 3.125, + "kl": 0.0, + "learning_rate": 2.3928726682521862e-06, + "logits/chosen": 810922560.0, + "logits/rejected": 985355008.0, + "logps/chosen": -290.9048767089844, + "logps/rejected": -510.6463623046875, + "loss": 0.0175, + "rewards/chosen": 3.6737289428710938, + "rewards/margins": 12.668722152709961, + "rewards/rejected": -8.994993209838867, + "step": 7391 + }, + { + "epoch": 0.6753768844221105, + "grad_norm": 1.625, + "kl": 0.0, + "learning_rate": 2.391645906012352e-06, + "logits/chosen": 648444416.0, + "logits/rejected": 588551722.6666666, + "logps/chosen": -408.1292419433594, + "logps/rejected": -634.774658203125, + "loss": 0.0073, + "rewards/chosen": 3.6552765369415283, + "rewards/margins": 12.654290437698364, + "rewards/rejected": -8.999013900756836, + "step": 7392 + }, + { + "epoch": 0.6754682503426223, + "grad_norm": 65.0, + "kl": 0.0, + "learning_rate": 2.3904193594592334e-06, + "logits/chosen": 228893664.0, + "logits/rejected": 397124437.3333333, + "logps/chosen": -193.07357788085938, + "logps/rejected": -486.4571126302083, + "loss": 0.065, + "rewards/chosen": 4.5360283851623535, + "rewards/margins": 12.715394814809164, + "rewards/rejected": -8.17936642964681, + "step": 7393 + }, + { + "epoch": 0.6755596162631339, + "grad_norm": 2.421875, + "kl": 0.0, + "learning_rate": 2.389193028694256e-06, + "logits/chosen": 627030997.3333334, + "logits/rejected": 765097600.0, + "logps/chosen": -253.2012939453125, + "logps/rejected": -484.9342956542969, + "loss": 0.0176, + "rewards/chosen": 4.00160535176595, + "rewards/margins": 14.128632863362629, + "rewards/rejected": -10.12702751159668, + "step": 7394 + }, + { + "epoch": 0.6756509821836455, + "grad_norm": 2.265625, + "kl": 0.0, + "learning_rate": 2.3879669138188238e-06, + "logits/chosen": 628680832.0, + "logits/rejected": 461959616.0, + "logps/chosen": -321.26593017578125, + "logps/rejected": -456.62255859375, + "loss": 0.0113, + "rewards/chosen": 3.951690435409546, + "rewards/margins": 13.247990369796753, + "rewards/rejected": -9.296299934387207, + "step": 7395 + }, + { + "epoch": 0.6757423481041571, + "grad_norm": 24.5, + "kl": 0.0, + "learning_rate": 2.3867410149343284e-06, + "logits/chosen": 459285376.0, + "logits/rejected": 772746240.0, + "logps/chosen": -190.91830444335938, + "logps/rejected": -322.79327392578125, + "loss": 0.1215, + "rewards/chosen": 2.7915990352630615, + "rewards/margins": 9.409224271774292, + "rewards/rejected": -6.6176252365112305, + "step": 7396 + }, + { + "epoch": 0.6758337140246689, + "grad_norm": 1.8125, + "kl": 0.0, + "learning_rate": 2.385515332142139e-06, + "logits/chosen": 699534250.6666666, + "logits/rejected": 659271040.0, + "logps/chosen": -339.2463785807292, + "logps/rejected": -639.5632934570312, + "loss": 0.0104, + "rewards/chosen": 4.984498023986816, + "rewards/margins": 12.747150421142578, + "rewards/rejected": -7.762652397155762, + "step": 7397 + }, + { + "epoch": 0.6759250799451805, + "grad_norm": 0.65234375, + "kl": 0.0, + "learning_rate": 2.384289865543607e-06, + "logits/chosen": 568133312.0, + "logits/rejected": 603993770.6666666, + "logps/chosen": -290.3150634765625, + "logps/rejected": -514.5713704427084, + "loss": 0.0025, + "rewards/chosen": 4.816397190093994, + "rewards/margins": 13.899529616038004, + "rewards/rejected": -9.08313242594401, + "step": 7398 + }, + { + "epoch": 0.6760164458656921, + "grad_norm": 1.9765625, + "kl": 0.0, + "learning_rate": 2.383064615240069e-06, + "logits/chosen": 505232384.0, + "logits/rejected": 463486336.0, + "logps/chosen": -223.6119842529297, + "logps/rejected": -581.1612548828125, + "loss": 0.1281, + "rewards/chosen": 2.297797679901123, + "rewards/margins": 14.299371242523193, + "rewards/rejected": -12.00157356262207, + "step": 7399 + }, + { + "epoch": 0.6761078117862037, + "grad_norm": 3.21875, + "kl": 0.0, + "learning_rate": 2.38183958133284e-06, + "logits/chosen": 391230805.3333333, + "logits/rejected": 471037593.6, + "logps/chosen": -285.5623779296875, + "logps/rejected": -315.7431884765625, + "loss": 0.0203, + "rewards/chosen": 2.9090048472086587, + "rewards/margins": 10.709130732218425, + "rewards/rejected": -7.8001258850097654, + "step": 7400 + }, + { + "epoch": 0.6761991777067154, + "grad_norm": 1.828125, + "kl": 0.0, + "learning_rate": 2.38061476392322e-06, + "logits/chosen": 357943210.6666667, + "logits/rejected": 599552307.2, + "logps/chosen": -315.92340087890625, + "logps/rejected": -435.8404296875, + "loss": 0.01, + "rewards/chosen": 4.737935384114583, + "rewards/margins": 14.11369145711263, + "rewards/rejected": -9.375756072998048, + "step": 7401 + }, + { + "epoch": 0.6762905436272271, + "grad_norm": 2.296875, + "kl": 0.0, + "learning_rate": 2.3793901631124907e-06, + "logits/chosen": 238041968.0, + "logits/rejected": 493679317.3333333, + "logps/chosen": -163.7629852294922, + "logps/rejected": -730.458740234375, + "loss": 0.0133, + "rewards/chosen": 4.1204514503479, + "rewards/margins": 13.996689955393473, + "rewards/rejected": -9.876238505045572, + "step": 7402 + }, + { + "epoch": 0.6763819095477387, + "grad_norm": 1.75, + "kl": 0.0, + "learning_rate": 2.3781657790019118e-06, + "logits/chosen": 687400704.0, + "logits/rejected": 529471936.0, + "logps/chosen": -287.78814697265625, + "logps/rejected": -502.066162109375, + "loss": 0.0104, + "rewards/chosen": 4.182252883911133, + "rewards/margins": 15.711763381958008, + "rewards/rejected": -11.529510498046875, + "step": 7403 + }, + { + "epoch": 0.6764732754682503, + "grad_norm": 1.84375, + "kl": 0.0, + "learning_rate": 2.3769416116927335e-06, + "logits/chosen": 663201331.2, + "logits/rejected": 395876394.6666667, + "logps/chosen": -472.295166015625, + "logps/rejected": -566.282958984375, + "loss": 0.0123, + "rewards/chosen": 4.192135238647461, + "rewards/margins": 16.66349067687988, + "rewards/rejected": -12.471355438232422, + "step": 7404 + }, + { + "epoch": 0.676564641388762, + "grad_norm": 4.5625, + "kl": 0.0, + "learning_rate": 2.3757176612861802e-06, + "logits/chosen": 504671424.0, + "logits/rejected": 412008576.0, + "logps/chosen": -389.6265563964844, + "logps/rejected": -444.66748046875, + "loss": 0.0235, + "rewards/chosen": 3.7241501808166504, + "rewards/margins": 11.838331699371338, + "rewards/rejected": -8.114181518554688, + "step": 7405 + }, + { + "epoch": 0.6766560073092737, + "grad_norm": 2.171875, + "kl": 0.0, + "learning_rate": 2.374493927883462e-06, + "logits/chosen": 392560554.6666667, + "logits/rejected": 418210752.0, + "logps/chosen": -294.0964762369792, + "logps/rejected": -446.26934814453125, + "loss": 0.014, + "rewards/chosen": 4.647576332092285, + "rewards/margins": 14.346821784973145, + "rewards/rejected": -9.69924545288086, + "step": 7406 + }, + { + "epoch": 0.6767473732297853, + "grad_norm": 2.953125, + "kl": 0.0, + "learning_rate": 2.3732704115857687e-06, + "logits/chosen": 523367978.6666667, + "logits/rejected": 525574297.6, + "logps/chosen": -383.5155436197917, + "logps/rejected": -449.947509765625, + "loss": 0.0183, + "rewards/chosen": 3.0393549601236978, + "rewards/margins": 12.362871805826822, + "rewards/rejected": -9.323516845703125, + "step": 7407 + }, + { + "epoch": 0.6768387391502969, + "grad_norm": 4.96875, + "kl": 0.0, + "learning_rate": 2.372047112494276e-06, + "logits/chosen": 416731801.6, + "logits/rejected": 456007637.3333333, + "logps/chosen": -489.219873046875, + "logps/rejected": -442.0284830729167, + "loss": 0.0343, + "rewards/chosen": 3.0321718215942384, + "rewards/margins": 10.757554308573406, + "rewards/rejected": -7.725382486979167, + "step": 7408 + }, + { + "epoch": 0.6769301050708086, + "grad_norm": 3.9375, + "kl": 0.0, + "learning_rate": 2.3708240307101392e-06, + "logits/chosen": 443071488.0, + "logits/rejected": 553122918.4, + "logps/chosen": -401.1985270182292, + "logps/rejected": -628.843896484375, + "loss": 0.0179, + "rewards/chosen": 3.032750447591146, + "rewards/margins": 12.308597310384116, + "rewards/rejected": -9.275846862792969, + "step": 7409 + }, + { + "epoch": 0.6770214709913203, + "grad_norm": 1.9375, + "kl": 0.0, + "learning_rate": 2.3696011663344954e-06, + "logits/chosen": 648386304.0, + "logits/rejected": 668557440.0, + "logps/chosen": -376.1717224121094, + "logps/rejected": -471.0797424316406, + "loss": 0.0095, + "rewards/chosen": 4.534756660461426, + "rewards/margins": 12.977381706237793, + "rewards/rejected": -8.442625045776367, + "step": 7410 + }, + { + "epoch": 0.6771128369118319, + "grad_norm": 2.765625, + "kl": 0.0, + "learning_rate": 2.368378519468464e-06, + "logits/chosen": 635657600.0, + "logits/rejected": 1024376627.2, + "logps/chosen": -328.61049397786456, + "logps/rejected": -570.365380859375, + "loss": 0.0131, + "rewards/chosen": 3.6138219833374023, + "rewards/margins": 13.113160514831543, + "rewards/rejected": -9.499338531494141, + "step": 7411 + }, + { + "epoch": 0.6772042028323435, + "grad_norm": 65.0, + "kl": 0.0, + "learning_rate": 2.3671560902131445e-06, + "logits/chosen": 775600128.0, + "logits/rejected": 599348053.3333334, + "logps/chosen": -431.8572265625, + "logps/rejected": -594.765869140625, + "loss": 0.1696, + "rewards/chosen": 1.5569184303283692, + "rewards/margins": 11.617507266998292, + "rewards/rejected": -10.060588836669922, + "step": 7412 + }, + { + "epoch": 0.6772955687528552, + "grad_norm": 1.4765625, + "kl": 0.0, + "learning_rate": 2.365933878669624e-06, + "logits/chosen": 864418432.0, + "logits/rejected": 499179520.0, + "logps/chosen": -360.40753173828125, + "logps/rejected": -413.3976135253906, + "loss": 0.011, + "rewards/chosen": 4.090616226196289, + "rewards/margins": 13.67831039428711, + "rewards/rejected": -9.58769416809082, + "step": 7413 + }, + { + "epoch": 0.6773869346733669, + "grad_norm": 2.359375, + "kl": 0.0, + "learning_rate": 2.3647118849389673e-06, + "logits/chosen": 626513216.0, + "logits/rejected": 1103031552.0, + "logps/chosen": -290.7344970703125, + "logps/rejected": -501.90472412109375, + "loss": 0.0129, + "rewards/chosen": 3.8183367252349854, + "rewards/margins": 14.204805612564087, + "rewards/rejected": -10.386468887329102, + "step": 7414 + }, + { + "epoch": 0.6774783005938785, + "grad_norm": 2.984375, + "kl": 0.0, + "learning_rate": 2.363490109122221e-06, + "logits/chosen": 878135369.1428572, + "logits/rejected": 421392000.0, + "logps/chosen": -350.50844029017856, + "logps/rejected": -811.704345703125, + "loss": 0.0224, + "rewards/chosen": 4.0605267116001675, + "rewards/margins": 20.302433831351145, + "rewards/rejected": -16.241907119750977, + "step": 7415 + }, + { + "epoch": 0.6775696665143901, + "grad_norm": 1.71875, + "kl": 0.0, + "learning_rate": 2.3622685513204124e-06, + "logits/chosen": 670487978.6666666, + "logits/rejected": 553479987.2, + "logps/chosen": -205.62601725260416, + "logps/rejected": -580.41044921875, + "loss": 0.0081, + "rewards/chosen": 4.248107274373372, + "rewards/margins": 15.504766209920248, + "rewards/rejected": -11.256658935546875, + "step": 7416 + }, + { + "epoch": 0.6776610324349018, + "grad_norm": 3.765625, + "kl": 0.0, + "learning_rate": 2.361047211634558e-06, + "logits/chosen": 387366976.0, + "logits/rejected": 654464128.0, + "logps/chosen": -254.50221252441406, + "logps/rejected": -436.1939697265625, + "loss": 0.0158, + "rewards/chosen": 3.7060675621032715, + "rewards/margins": 11.907997608184814, + "rewards/rejected": -8.201930046081543, + "step": 7417 + }, + { + "epoch": 0.6777523983554135, + "grad_norm": 1.8515625, + "kl": 0.0, + "learning_rate": 2.3598260901656485e-06, + "logits/chosen": 380845824.0, + "logits/rejected": 350891315.2, + "logps/chosen": -350.8308919270833, + "logps/rejected": -425.11240234375, + "loss": 0.0075, + "rewards/chosen": 4.563185691833496, + "rewards/margins": 13.126724815368652, + "rewards/rejected": -8.563539123535156, + "step": 7418 + }, + { + "epoch": 0.6778437642759251, + "grad_norm": 1.734375, + "kl": 0.0, + "learning_rate": 2.358605187014659e-06, + "logits/chosen": 836164992.0, + "logits/rejected": 616786304.0, + "logps/chosen": -339.51495361328125, + "logps/rejected": -222.2303466796875, + "loss": 0.0089, + "rewards/chosen": 4.481049537658691, + "rewards/margins": 11.526628494262695, + "rewards/rejected": -7.045578956604004, + "step": 7419 + }, + { + "epoch": 0.6779351301964367, + "grad_norm": 0.6875, + "kl": 0.0, + "learning_rate": 2.3573845022825465e-06, + "logits/chosen": 600666709.3333334, + "logits/rejected": 650340096.0, + "logps/chosen": -167.9224853515625, + "logps/rejected": -509.772705078125, + "loss": 0.0043, + "rewards/chosen": 4.573629061381022, + "rewards/margins": 15.601044527689616, + "rewards/rejected": -11.027415466308593, + "step": 7420 + }, + { + "epoch": 0.6780264961169484, + "grad_norm": 2.46875, + "kl": 0.0, + "learning_rate": 2.3561640360702525e-06, + "logits/chosen": 506311520.0, + "logits/rejected": 440664746.6666667, + "logps/chosen": -250.18614196777344, + "logps/rejected": -449.593017578125, + "loss": 0.014, + "rewards/chosen": 2.9587230682373047, + "rewards/margins": 12.178707758585611, + "rewards/rejected": -9.219984690348307, + "step": 7421 + }, + { + "epoch": 0.67811786203746, + "grad_norm": 4.125, + "kl": 0.0, + "learning_rate": 2.354943788478698e-06, + "logits/chosen": 574382284.8, + "logits/rejected": 450253056.0, + "logps/chosen": -613.7380859375, + "logps/rejected": -561.324462890625, + "loss": 0.0188, + "rewards/chosen": 3.912891387939453, + "rewards/margins": 14.483872095743815, + "rewards/rejected": -10.570980707804361, + "step": 7422 + }, + { + "epoch": 0.6782092279579717, + "grad_norm": 6.84375, + "kl": 0.0, + "learning_rate": 2.353723759608784e-06, + "logits/chosen": 455108821.3333333, + "logits/rejected": 416001248.0, + "logps/chosen": -290.1900227864583, + "logps/rejected": -412.2572937011719, + "loss": 0.1455, + "rewards/chosen": 2.2463353474934897, + "rewards/margins": 8.610970815022787, + "rewards/rejected": -6.364635467529297, + "step": 7423 + }, + { + "epoch": 0.6783005938784833, + "grad_norm": 0.90625, + "kl": 0.0, + "learning_rate": 2.3525039495613975e-06, + "logits/chosen": 424254233.6, + "logits/rejected": 324212352.0, + "logps/chosen": -388.3412109375, + "logps/rejected": -257.58583577473956, + "loss": 0.0063, + "rewards/chosen": 4.806349563598633, + "rewards/margins": 12.68384246826172, + "rewards/rejected": -7.877492904663086, + "step": 7424 + }, + { + "epoch": 0.678391959798995, + "grad_norm": 1.625, + "kl": 0.0, + "learning_rate": 2.351284358437405e-06, + "logits/chosen": 1582994688.0, + "logits/rejected": 588540117.3333334, + "logps/chosen": -236.84466552734375, + "logps/rejected": -311.271728515625, + "loss": 0.0089, + "rewards/chosen": 3.3961899280548096, + "rewards/margins": 12.749491930007935, + "rewards/rejected": -9.353302001953125, + "step": 7425 + }, + { + "epoch": 0.6784833257195066, + "grad_norm": 2.15625, + "kl": 0.0, + "learning_rate": 2.3500649863376544e-06, + "logits/chosen": 564165504.0, + "logits/rejected": 724948992.0, + "logps/chosen": -540.0330810546875, + "logps/rejected": -434.0771484375, + "loss": 0.0115, + "rewards/chosen": 3.7848076820373535, + "rewards/margins": 12.62214994430542, + "rewards/rejected": -8.837342262268066, + "step": 7426 + }, + { + "epoch": 0.6785746916400183, + "grad_norm": 2.453125, + "kl": 0.0, + "learning_rate": 2.3488458333629777e-06, + "logits/chosen": 405199360.0, + "logits/rejected": 385004704.0, + "logps/chosen": -258.1973063151042, + "logps/rejected": -399.6939697265625, + "loss": 0.0279, + "rewards/chosen": 3.749359130859375, + "rewards/margins": 12.42058277130127, + "rewards/rejected": -8.671223640441895, + "step": 7427 + }, + { + "epoch": 0.6786660575605299, + "grad_norm": 5.96875, + "kl": 0.0, + "learning_rate": 2.3476268996141853e-06, + "logits/chosen": 1177373013.3333333, + "logits/rejected": 883818086.4, + "logps/chosen": -333.3861083984375, + "logps/rejected": -362.9104248046875, + "loss": 0.0325, + "rewards/chosen": 3.3598785400390625, + "rewards/margins": 10.960967254638671, + "rewards/rejected": -7.6010887145996096, + "step": 7428 + }, + { + "epoch": 0.6787574234810416, + "grad_norm": 3.046875, + "kl": 0.0, + "learning_rate": 2.3464081851920758e-06, + "logits/chosen": 496470368.0, + "logits/rejected": 559964544.0, + "logps/chosen": -315.63079833984375, + "logps/rejected": -532.0737915039062, + "loss": 0.0208, + "rewards/chosen": 3.6576976776123047, + "rewards/margins": 14.008618354797363, + "rewards/rejected": -10.350920677185059, + "step": 7429 + }, + { + "epoch": 0.6788487894015532, + "grad_norm": 1.5546875, + "kl": 0.0, + "learning_rate": 2.3451896901974223e-06, + "logits/chosen": 677619968.0, + "logits/rejected": 639616938.6666666, + "logps/chosen": -427.4391174316406, + "logps/rejected": -395.0340169270833, + "loss": 0.0075, + "rewards/chosen": 3.675894260406494, + "rewards/margins": 11.883972009023031, + "rewards/rejected": -8.208077748616537, + "step": 7430 + }, + { + "epoch": 0.6789401553220649, + "grad_norm": 1.1640625, + "kl": 0.0, + "learning_rate": 2.3439714147309845e-06, + "logits/chosen": 645473920.0, + "logits/rejected": 553538688.0, + "logps/chosen": -326.73431396484375, + "logps/rejected": -566.2784423828125, + "loss": 0.0069, + "rewards/chosen": 4.875680923461914, + "rewards/margins": 13.950339317321777, + "rewards/rejected": -9.074658393859863, + "step": 7431 + }, + { + "epoch": 0.6790315212425765, + "grad_norm": 1.484375, + "kl": 0.0, + "learning_rate": 2.342753358893502e-06, + "logits/chosen": 623901824.0, + "logits/rejected": 456095948.8, + "logps/chosen": -414.7308756510417, + "logps/rejected": -641.37587890625, + "loss": 0.0084, + "rewards/chosen": 3.77508544921875, + "rewards/margins": 15.868877410888672, + "rewards/rejected": -12.093791961669922, + "step": 7432 + }, + { + "epoch": 0.6791228871630882, + "grad_norm": 1.0859375, + "kl": 0.0, + "learning_rate": 2.341535522785695e-06, + "logits/chosen": 704497920.0, + "logits/rejected": 830861738.6666666, + "logps/chosen": -455.2515869140625, + "logps/rejected": -711.2665201822916, + "loss": 0.0058, + "rewards/chosen": 4.336865425109863, + "rewards/margins": 14.661955197652182, + "rewards/rejected": -10.325089772542318, + "step": 7433 + }, + { + "epoch": 0.6792142530835998, + "grad_norm": 2.84375, + "kl": 0.0, + "learning_rate": 2.340317906508271e-06, + "logits/chosen": 820563264.0, + "logits/rejected": 426950048.0, + "logps/chosen": -397.80206298828125, + "logps/rejected": -588.103515625, + "loss": 0.0184, + "rewards/chosen": 3.345729112625122, + "rewards/margins": 12.972520112991333, + "rewards/rejected": -9.626791000366211, + "step": 7434 + }, + { + "epoch": 0.6793056190041115, + "grad_norm": 17.75, + "kl": 0.0, + "learning_rate": 2.3391005101619134e-06, + "logits/chosen": 354254043.4285714, + "logits/rejected": 329192000.0, + "logps/chosen": -197.10177176339286, + "logps/rejected": -280.2774658203125, + "loss": 0.2568, + "rewards/chosen": 2.040585926600865, + "rewards/margins": 6.4806236539568225, + "rewards/rejected": -4.440037727355957, + "step": 7435 + }, + { + "epoch": 0.6793969849246231, + "grad_norm": 1.765625, + "kl": 0.0, + "learning_rate": 2.33788333384729e-06, + "logits/chosen": 501891114.6666667, + "logits/rejected": 444704864.0, + "logps/chosen": -259.8542073567708, + "logps/rejected": -575.0072021484375, + "loss": 0.0134, + "rewards/chosen": 4.137457529703776, + "rewards/margins": 15.10065523783366, + "rewards/rejected": -10.963197708129883, + "step": 7436 + }, + { + "epoch": 0.6794883508451348, + "grad_norm": 1.9453125, + "kl": 0.0, + "learning_rate": 2.336666377665048e-06, + "logits/chosen": 584146624.0, + "logits/rejected": 520205376.0, + "logps/chosen": -419.46868896484375, + "logps/rejected": -687.6494140625, + "loss": 0.0122, + "rewards/chosen": 4.272320747375488, + "rewards/margins": 13.780240058898926, + "rewards/rejected": -9.507919311523438, + "step": 7437 + }, + { + "epoch": 0.6795797167656464, + "grad_norm": 2.046875, + "kl": 0.0, + "learning_rate": 2.3354496417158217e-06, + "logits/chosen": 977894656.0, + "logits/rejected": 340637013.3333333, + "logps/chosen": -732.658935546875, + "logps/rejected": -398.2075602213542, + "loss": 0.0065, + "rewards/chosen": 4.526822090148926, + "rewards/margins": 14.520583788553873, + "rewards/rejected": -9.993761698404947, + "step": 7438 + }, + { + "epoch": 0.6796710826861581, + "grad_norm": 2.3125, + "kl": 0.0, + "learning_rate": 2.334233126100223e-06, + "logits/chosen": 407369881.6, + "logits/rejected": 393192362.6666667, + "logps/chosen": -330.031982421875, + "logps/rejected": -604.9313557942709, + "loss": 0.0139, + "rewards/chosen": 4.3024852752685545, + "rewards/margins": 16.021813837687173, + "rewards/rejected": -11.71932856241862, + "step": 7439 + }, + { + "epoch": 0.6797624486066697, + "grad_norm": 10.5625, + "kl": 0.0, + "learning_rate": 2.3330168309188455e-06, + "logits/chosen": 252459120.0, + "logits/rejected": 628863040.0, + "logps/chosen": -361.5862121582031, + "logps/rejected": -620.5606689453125, + "loss": 0.03, + "rewards/chosen": 3.170382022857666, + "rewards/margins": 12.40847635269165, + "rewards/rejected": -9.238094329833984, + "step": 7440 + }, + { + "epoch": 0.6798538145271814, + "grad_norm": 2.5, + "kl": 0.0, + "learning_rate": 2.331800756272266e-06, + "logits/chosen": 429535948.8, + "logits/rejected": 306731584.0, + "logps/chosen": -338.529931640625, + "logps/rejected": -473.130615234375, + "loss": 0.0142, + "rewards/chosen": 4.079225540161133, + "rewards/margins": 15.06302579243978, + "rewards/rejected": -10.983800252278646, + "step": 7441 + }, + { + "epoch": 0.679945180447693, + "grad_norm": 1.8671875, + "kl": 0.0, + "learning_rate": 2.330584902261041e-06, + "logits/chosen": 492856128.0, + "logits/rejected": 409233056.0, + "logps/chosen": -296.8535461425781, + "logps/rejected": -524.1550903320312, + "loss": 0.0159, + "rewards/chosen": 3.5825724601745605, + "rewards/margins": 14.712698459625244, + "rewards/rejected": -11.130125999450684, + "step": 7442 + }, + { + "epoch": 0.6800365463682047, + "grad_norm": 1.765625, + "kl": 0.0, + "learning_rate": 2.3293692689857134e-06, + "logits/chosen": 558401365.3333334, + "logits/rejected": 508525792.0, + "logps/chosen": -316.3900960286458, + "logps/rejected": -670.7697143554688, + "loss": 0.011, + "rewards/chosen": 4.5358937581380205, + "rewards/margins": 15.967430432637531, + "rewards/rejected": -11.431536674499512, + "step": 7443 + }, + { + "epoch": 0.6801279122887163, + "grad_norm": 2.5, + "kl": 0.0, + "learning_rate": 2.3281538565468037e-06, + "logits/chosen": 509969920.0, + "logits/rejected": 362736832.0, + "logps/chosen": -349.0487467447917, + "logps/rejected": -320.07525634765625, + "loss": 0.0196, + "rewards/chosen": 3.847040812174479, + "rewards/margins": 12.602522532145182, + "rewards/rejected": -8.755481719970703, + "step": 7444 + }, + { + "epoch": 0.680219278209228, + "grad_norm": 4.4375, + "kl": 0.0, + "learning_rate": 2.3269386650448145e-06, + "logits/chosen": 404040806.4, + "logits/rejected": 366110122.6666667, + "logps/chosen": -293.29716796875, + "logps/rejected": -369.0293782552083, + "loss": 0.129, + "rewards/chosen": 2.816522216796875, + "rewards/margins": 12.097150421142578, + "rewards/rejected": -9.280628204345703, + "step": 7445 + }, + { + "epoch": 0.6803106441297396, + "grad_norm": 3.796875, + "kl": 0.0, + "learning_rate": 2.3257236945802292e-06, + "logits/chosen": 591396437.3333334, + "logits/rejected": 483558656.0, + "logps/chosen": -336.2200113932292, + "logps/rejected": -526.1453247070312, + "loss": 0.0303, + "rewards/chosen": 3.586276054382324, + "rewards/margins": 12.185591697692871, + "rewards/rejected": -8.599315643310547, + "step": 7446 + }, + { + "epoch": 0.6804020100502512, + "grad_norm": 3.03125, + "kl": 0.0, + "learning_rate": 2.324508945253519e-06, + "logits/chosen": 507937664.0, + "logits/rejected": 197384416.0, + "logps/chosen": -352.7204284667969, + "logps/rejected": -302.8623962402344, + "loss": 0.0173, + "rewards/chosen": 3.7768337726593018, + "rewards/margins": 14.67901349067688, + "rewards/rejected": -10.902179718017578, + "step": 7447 + }, + { + "epoch": 0.6804933759707629, + "grad_norm": 1.046875, + "kl": 0.0, + "learning_rate": 2.32329441716513e-06, + "logits/chosen": 607987456.0, + "logits/rejected": 862460928.0, + "logps/chosen": -337.63909912109375, + "logps/rejected": -444.4641520182292, + "loss": 0.0043, + "rewards/chosen": 4.248424530029297, + "rewards/margins": 12.842027028401693, + "rewards/rejected": -8.593602498372396, + "step": 7448 + }, + { + "epoch": 0.6805847418912746, + "grad_norm": 1.2890625, + "kl": 0.0, + "learning_rate": 2.3220801104154923e-06, + "logits/chosen": 686838528.0, + "logits/rejected": 559565209.6, + "logps/chosen": -491.7857259114583, + "logps/rejected": -431.432958984375, + "loss": 0.0053, + "rewards/chosen": 4.806935628255208, + "rewards/margins": 13.239622243245442, + "rewards/rejected": -8.432686614990235, + "step": 7449 + }, + { + "epoch": 0.6806761078117862, + "grad_norm": 2.734375, + "kl": 0.0, + "learning_rate": 2.320866025105016e-06, + "logits/chosen": 391023872.0, + "logits/rejected": 416437674.6666667, + "logps/chosen": -269.904443359375, + "logps/rejected": -341.43408203125, + "loss": 0.0273, + "rewards/chosen": 3.228902053833008, + "rewards/margins": 13.479172897338866, + "rewards/rejected": -10.25027084350586, + "step": 7450 + }, + { + "epoch": 0.6807674737322978, + "grad_norm": 2.5625, + "kl": 0.0, + "learning_rate": 2.319652161334099e-06, + "logits/chosen": 448897984.0, + "logits/rejected": 511037056.0, + "logps/chosen": -338.28643798828125, + "logps/rejected": -526.9243774414062, + "loss": 0.019, + "rewards/chosen": 3.6894283294677734, + "rewards/margins": 14.509466171264648, + "rewards/rejected": -10.820037841796875, + "step": 7451 + }, + { + "epoch": 0.6808588396528095, + "grad_norm": 3.546875, + "kl": 0.0, + "learning_rate": 2.3184385192031157e-06, + "logits/chosen": 784413644.8, + "logits/rejected": 1191054592.0, + "logps/chosen": -410.40419921875, + "logps/rejected": -495.7948811848958, + "loss": 0.0304, + "rewards/chosen": 3.2877830505371093, + "rewards/margins": 13.192549769083659, + "rewards/rejected": -9.904766718546549, + "step": 7452 + }, + { + "epoch": 0.6809502055733212, + "grad_norm": 3.203125, + "kl": 0.0, + "learning_rate": 2.3172250988124196e-06, + "logits/chosen": 695226880.0, + "logits/rejected": 532557738.6666667, + "logps/chosen": -414.14189453125, + "logps/rejected": -579.9680989583334, + "loss": 0.0169, + "rewards/chosen": 4.069857406616211, + "rewards/margins": 14.276591746012368, + "rewards/rejected": -10.206734339396158, + "step": 7453 + }, + { + "epoch": 0.6810415714938328, + "grad_norm": 4.25, + "kl": 0.0, + "learning_rate": 2.3160119002623497e-06, + "logits/chosen": 406660224.0, + "logits/rejected": 432701354.6666667, + "logps/chosen": -399.183447265625, + "logps/rejected": -514.9016927083334, + "loss": 0.025, + "rewards/chosen": 3.630881500244141, + "rewards/margins": 12.436665089925132, + "rewards/rejected": -8.80578358968099, + "step": 7454 + }, + { + "epoch": 0.6811329374143444, + "grad_norm": 2.28125, + "kl": 0.0, + "learning_rate": 2.3147989236532304e-06, + "logits/chosen": 565328512.0, + "logits/rejected": 1025134720.0, + "logps/chosen": -269.89593505859375, + "logps/rejected": -398.9458312988281, + "loss": 0.0205, + "rewards/chosen": 3.419057607650757, + "rewards/margins": 13.32910704612732, + "rewards/rejected": -9.910049438476562, + "step": 7455 + }, + { + "epoch": 0.6812243033348561, + "grad_norm": 1.109375, + "kl": 0.0, + "learning_rate": 2.3135861690853605e-06, + "logits/chosen": 709002496.0, + "logits/rejected": 536519680.0, + "logps/chosen": -337.22149658203125, + "logps/rejected": -435.409375, + "loss": 0.0061, + "rewards/chosen": 4.153735478719075, + "rewards/margins": 13.886225255330402, + "rewards/rejected": -9.732489776611327, + "step": 7456 + }, + { + "epoch": 0.6813156692553678, + "grad_norm": 1.9609375, + "kl": 0.0, + "learning_rate": 2.3123736366590244e-06, + "logits/chosen": 619073834.6666666, + "logits/rejected": 551246182.4, + "logps/chosen": -313.789306640625, + "logps/rejected": -488.40068359375, + "loss": 0.0083, + "rewards/chosen": 4.236583709716797, + "rewards/margins": 13.473941802978516, + "rewards/rejected": -9.237358093261719, + "step": 7457 + }, + { + "epoch": 0.6814070351758794, + "grad_norm": 3.796875, + "kl": 0.0, + "learning_rate": 2.3111613264744855e-06, + "logits/chosen": 449216682.6666667, + "logits/rejected": 547502208.0, + "logps/chosen": -336.9168294270833, + "logps/rejected": -643.9386596679688, + "loss": 0.0294, + "rewards/chosen": 3.5312267939249673, + "rewards/margins": 16.329053560892742, + "rewards/rejected": -12.797826766967773, + "step": 7458 + }, + { + "epoch": 0.681498401096391, + "grad_norm": 4.09375, + "kl": 0.0, + "learning_rate": 2.309949238631994e-06, + "logits/chosen": 762460160.0, + "logits/rejected": 390551328.0, + "logps/chosen": -543.3650512695312, + "logps/rejected": -231.96197509765625, + "loss": 0.0235, + "rewards/chosen": 3.4500908851623535, + "rewards/margins": 10.702961921691895, + "rewards/rejected": -7.252871036529541, + "step": 7459 + }, + { + "epoch": 0.6815897670169027, + "grad_norm": 0.73828125, + "kl": 0.0, + "learning_rate": 2.3087373732317765e-06, + "logits/chosen": 309720448.0, + "logits/rejected": 462417203.2, + "logps/chosen": -288.8702799479167, + "logps/rejected": -574.62783203125, + "loss": 0.0035, + "rewards/chosen": 4.906885464986165, + "rewards/margins": 14.59257043202718, + "rewards/rejected": -9.685684967041016, + "step": 7460 + }, + { + "epoch": 0.6816811329374144, + "grad_norm": 6.5625, + "kl": 0.0, + "learning_rate": 2.307525730374043e-06, + "logits/chosen": 421811200.0, + "logits/rejected": 333748992.0, + "logps/chosen": -370.48577880859375, + "logps/rejected": -484.29150390625, + "loss": 0.0246, + "rewards/chosen": 3.851346015930176, + "rewards/margins": 14.966891288757324, + "rewards/rejected": -11.115545272827148, + "step": 7461 + }, + { + "epoch": 0.681772498857926, + "grad_norm": 5.65625, + "kl": 0.0, + "learning_rate": 2.3063143101589854e-06, + "logits/chosen": 760420949.3333334, + "logits/rejected": 394936704.0, + "logps/chosen": -306.1746012369792, + "logps/rejected": -350.55078125, + "loss": 0.0532, + "rewards/chosen": 3.0070387522379556, + "rewards/margins": 11.285696665445963, + "rewards/rejected": -8.278657913208008, + "step": 7462 + }, + { + "epoch": 0.6818638647784376, + "grad_norm": 3.71875, + "kl": 0.0, + "learning_rate": 2.3051031126867748e-06, + "logits/chosen": 242202880.0, + "logits/rejected": 317296691.2, + "logps/chosen": -253.81001790364584, + "logps/rejected": -409.603857421875, + "loss": 0.023, + "rewards/chosen": 3.87600040435791, + "rewards/margins": 12.60994815826416, + "rewards/rejected": -8.73394775390625, + "step": 7463 + }, + { + "epoch": 0.6819552306989493, + "grad_norm": 1.9453125, + "kl": 0.0, + "learning_rate": 2.30389213805757e-06, + "logits/chosen": 381316608.0, + "logits/rejected": 643695974.4, + "logps/chosen": -317.1380208333333, + "logps/rejected": -417.1564453125, + "loss": 0.0128, + "rewards/chosen": 3.6745192209879556, + "rewards/margins": 11.33984514872233, + "rewards/rejected": -7.665325927734375, + "step": 7464 + }, + { + "epoch": 0.682046596619461, + "grad_norm": 2.109375, + "kl": 0.0, + "learning_rate": 2.3026813863715053e-06, + "logits/chosen": 653196160.0, + "logits/rejected": 360884416.0, + "logps/chosen": -187.97488403320312, + "logps/rejected": -301.72918701171875, + "loss": 0.0139, + "rewards/chosen": 4.180276870727539, + "rewards/margins": 14.31663703918457, + "rewards/rejected": -10.136360168457031, + "step": 7465 + }, + { + "epoch": 0.6821379625399726, + "grad_norm": 64.0, + "kl": 0.0, + "learning_rate": 2.3014708577286986e-06, + "logits/chosen": 1132680448.0, + "logits/rejected": 691722880.0, + "logps/chosen": -330.44818115234375, + "logps/rejected": -399.734130859375, + "loss": 0.0835, + "rewards/chosen": 4.114204406738281, + "rewards/margins": 11.026531219482422, + "rewards/rejected": -6.912326812744141, + "step": 7466 + }, + { + "epoch": 0.6822293284604842, + "grad_norm": 3.9375, + "kl": 0.0, + "learning_rate": 2.300260552229248e-06, + "logits/chosen": 628325248.0, + "logits/rejected": 437382963.2, + "logps/chosen": -358.3736165364583, + "logps/rejected": -335.0224609375, + "loss": 0.0245, + "rewards/chosen": 2.8476282755533853, + "rewards/margins": 11.44818598429362, + "rewards/rejected": -8.600557708740235, + "step": 7467 + }, + { + "epoch": 0.6823206943809959, + "grad_norm": 3.4375, + "kl": 0.0, + "learning_rate": 2.299050469973238e-06, + "logits/chosen": 369264947.2, + "logits/rejected": 525707904.0, + "logps/chosen": -204.48731689453126, + "logps/rejected": -519.0857340494791, + "loss": 0.0332, + "rewards/chosen": 3.3733314514160155, + "rewards/margins": 11.506887817382813, + "rewards/rejected": -8.133556365966797, + "step": 7468 + }, + { + "epoch": 0.6824120603015076, + "grad_norm": 2.015625, + "kl": 0.0, + "learning_rate": 2.297840611060729e-06, + "logits/chosen": 706723456.0, + "logits/rejected": 408051168.0, + "logps/chosen": -493.4344177246094, + "logps/rejected": -424.15399169921875, + "loss": 0.0121, + "rewards/chosen": 3.784398078918457, + "rewards/margins": 12.604758262634277, + "rewards/rejected": -8.82036018371582, + "step": 7469 + }, + { + "epoch": 0.6825034262220192, + "grad_norm": 1.7265625, + "kl": 0.0, + "learning_rate": 2.296630975591766e-06, + "logits/chosen": 571223594.6666666, + "logits/rejected": 421958246.4, + "logps/chosen": -349.5443929036458, + "logps/rejected": -686.11875, + "loss": 0.0093, + "rewards/chosen": 3.8271716435750327, + "rewards/margins": 15.836361249287924, + "rewards/rejected": -12.00918960571289, + "step": 7470 + }, + { + "epoch": 0.6825947921425308, + "grad_norm": 3.53125, + "kl": 0.0, + "learning_rate": 2.295421563666372e-06, + "logits/chosen": 432902336.0, + "logits/rejected": 623353792.0, + "logps/chosen": -296.224609375, + "logps/rejected": -482.78033447265625, + "loss": 0.0162, + "rewards/chosen": 4.086357116699219, + "rewards/margins": 13.088238716125488, + "rewards/rejected": -9.00188159942627, + "step": 7471 + }, + { + "epoch": 0.6826861580630424, + "grad_norm": 26.5, + "kl": 0.0, + "learning_rate": 2.2942123753845573e-06, + "logits/chosen": 530360149.3333333, + "logits/rejected": 570088896.0, + "logps/chosen": -292.8720703125, + "logps/rejected": -291.3455810546875, + "loss": 0.0678, + "rewards/chosen": 3.9083944956461587, + "rewards/margins": 9.232258001963297, + "rewards/rejected": -5.323863506317139, + "step": 7472 + }, + { + "epoch": 0.6827775239835542, + "grad_norm": 1.984375, + "kl": 0.0, + "learning_rate": 2.29300341084631e-06, + "logits/chosen": 566483584.0, + "logits/rejected": 349324960.0, + "logps/chosen": -321.91705322265625, + "logps/rejected": -433.07269287109375, + "loss": 0.0114, + "rewards/chosen": 3.881467342376709, + "rewards/margins": 13.712161540985107, + "rewards/rejected": -9.830694198608398, + "step": 7473 + }, + { + "epoch": 0.6828688899040658, + "grad_norm": 1.6484375, + "kl": 0.0, + "learning_rate": 2.2917946701515996e-06, + "logits/chosen": 437720234.6666667, + "logits/rejected": 428178892.8, + "logps/chosen": -356.6847737630208, + "logps/rejected": -447.8158203125, + "loss": 0.0089, + "rewards/chosen": 4.16355578104655, + "rewards/margins": 13.885177485148112, + "rewards/rejected": -9.721621704101562, + "step": 7474 + }, + { + "epoch": 0.6829602558245774, + "grad_norm": 14.0, + "kl": 0.0, + "learning_rate": 2.2905861534003783e-06, + "logits/chosen": 400696917.3333333, + "logits/rejected": 268033664.0, + "logps/chosen": -239.8480021158854, + "logps/rejected": -370.9901611328125, + "loss": 0.0189, + "rewards/chosen": 3.7225755055745444, + "rewards/margins": 12.396542485555013, + "rewards/rejected": -8.673966979980468, + "step": 7475 + }, + { + "epoch": 0.683051621745089, + "grad_norm": 56.75, + "kl": 0.0, + "learning_rate": 2.2893778606925767e-06, + "logits/chosen": 1022320384.0, + "logits/rejected": 452871372.8, + "logps/chosen": -408.697509765625, + "logps/rejected": -252.4894775390625, + "loss": 0.0941, + "rewards/chosen": 3.9976412455240884, + "rewards/margins": 10.229767100016275, + "rewards/rejected": -6.232125854492187, + "step": 7476 + }, + { + "epoch": 0.6831429876656008, + "grad_norm": 5.15625, + "kl": 0.0, + "learning_rate": 2.2881697921281134e-06, + "logits/chosen": 413276032.0, + "logits/rejected": 443521740.8, + "logps/chosen": -305.051025390625, + "logps/rejected": -283.8281494140625, + "loss": 0.1419, + "rewards/chosen": 0.827077309290568, + "rewards/margins": 8.540843788782755, + "rewards/rejected": -7.713766479492188, + "step": 7477 + }, + { + "epoch": 0.6832343535861124, + "grad_norm": 2.359375, + "kl": 0.0, + "learning_rate": 2.2869619478068827e-06, + "logits/chosen": 257039701.33333334, + "logits/rejected": 299578905.6, + "logps/chosen": -168.39251708984375, + "logps/rejected": -413.69072265625, + "loss": 0.1206, + "rewards/chosen": 2.5280380249023438, + "rewards/margins": 11.738848114013672, + "rewards/rejected": -9.210810089111328, + "step": 7478 + }, + { + "epoch": 0.683325719506624, + "grad_norm": 4.375, + "kl": 0.0, + "learning_rate": 2.285754327828762e-06, + "logits/chosen": 522425088.0, + "logits/rejected": 241755818.66666666, + "logps/chosen": -406.36953125, + "logps/rejected": -345.0247802734375, + "loss": 0.0348, + "rewards/chosen": 3.1411758422851563, + "rewards/margins": 12.401453908284505, + "rewards/rejected": -9.26027806599935, + "step": 7479 + }, + { + "epoch": 0.6834170854271356, + "grad_norm": 2.53125, + "kl": 0.0, + "learning_rate": 2.2845469322936103e-06, + "logits/chosen": 512975786.6666667, + "logits/rejected": 842189312.0, + "logps/chosen": -247.28548177083334, + "logps/rejected": -475.0388671875, + "loss": 0.0125, + "rewards/chosen": 3.759097417195638, + "rewards/margins": 13.11885846455892, + "rewards/rejected": -9.359761047363282, + "step": 7480 + }, + { + "epoch": 0.6835084513476474, + "grad_norm": 2.21875, + "kl": 0.0, + "learning_rate": 2.283339761301268e-06, + "logits/chosen": 684135833.6, + "logits/rejected": 710036480.0, + "logps/chosen": -324.96025390625, + "logps/rejected": -319.26902262369794, + "loss": 0.014, + "rewards/chosen": 4.179881286621094, + "rewards/margins": 11.22200444539388, + "rewards/rejected": -7.042123158772786, + "step": 7481 + }, + { + "epoch": 0.683599817268159, + "grad_norm": 4.0625, + "kl": 0.0, + "learning_rate": 2.2821328149515576e-06, + "logits/chosen": 406890837.3333333, + "logits/rejected": 382372684.8, + "logps/chosen": -424.6018473307292, + "logps/rejected": -450.469189453125, + "loss": 0.0169, + "rewards/chosen": 3.2449563344319663, + "rewards/margins": 13.998163731892904, + "rewards/rejected": -10.753207397460937, + "step": 7482 + }, + { + "epoch": 0.6836911831886706, + "grad_norm": 2.203125, + "kl": 0.0, + "learning_rate": 2.2809260933442814e-06, + "logits/chosen": 593224320.0, + "logits/rejected": 506265952.0, + "logps/chosen": -408.05810546875, + "logps/rejected": -568.1968383789062, + "loss": 0.0225, + "rewards/chosen": 3.844552516937256, + "rewards/margins": 12.6161208152771, + "rewards/rejected": -8.771568298339844, + "step": 7483 + }, + { + "epoch": 0.6837825491091822, + "grad_norm": 0.8046875, + "kl": 0.0, + "learning_rate": 2.279719596579222e-06, + "logits/chosen": 595741440.0, + "logits/rejected": 475400192.0, + "logps/chosen": -412.35479736328125, + "logps/rejected": -309.6802978515625, + "loss": 0.0048, + "rewards/chosen": 5.213250160217285, + "rewards/margins": 13.623002052307129, + "rewards/rejected": -8.409751892089844, + "step": 7484 + }, + { + "epoch": 0.683873915029694, + "grad_norm": 3.296875, + "kl": 0.0, + "learning_rate": 2.278513324756151e-06, + "logits/chosen": 839357132.8, + "logits/rejected": 603431338.6666666, + "logps/chosen": -263.8530517578125, + "logps/rejected": -540.3841959635416, + "loss": 0.0339, + "rewards/chosen": 3.8413047790527344, + "rewards/margins": 11.341904958089192, + "rewards/rejected": -7.500600179036458, + "step": 7485 + }, + { + "epoch": 0.6839652809502056, + "grad_norm": 2.25, + "kl": 0.0, + "learning_rate": 2.2773072779748123e-06, + "logits/chosen": 406959718.4, + "logits/rejected": 501041493.3333333, + "logps/chosen": -243.5901611328125, + "logps/rejected": -559.170654296875, + "loss": 0.0167, + "rewards/chosen": 4.424459075927734, + "rewards/margins": 14.926109568277994, + "rewards/rejected": -10.50165049235026, + "step": 7486 + }, + { + "epoch": 0.6840566468707172, + "grad_norm": 3.984375, + "kl": 0.0, + "learning_rate": 2.2761014563349354e-06, + "logits/chosen": 454759381.3333333, + "logits/rejected": 340946176.0, + "logps/chosen": -301.58502197265625, + "logps/rejected": -395.3040771484375, + "loss": 0.0253, + "rewards/chosen": 3.841730753580729, + "rewards/margins": 14.01347796122233, + "rewards/rejected": -10.171747207641602, + "step": 7487 + }, + { + "epoch": 0.6841480127912288, + "grad_norm": 3.40625, + "kl": 0.0, + "learning_rate": 2.274895859936229e-06, + "logits/chosen": 926815872.0, + "logits/rejected": 1335482112.0, + "logps/chosen": -591.127685546875, + "logps/rejected": -634.1818237304688, + "loss": 0.0132, + "rewards/chosen": 4.631178379058838, + "rewards/margins": 14.36275053024292, + "rewards/rejected": -9.731572151184082, + "step": 7488 + }, + { + "epoch": 0.6842393787117406, + "grad_norm": 2.1875, + "kl": 0.0, + "learning_rate": 2.2736904888783877e-06, + "logits/chosen": 989812864.0, + "logits/rejected": 751107904.0, + "logps/chosen": -344.81610107421875, + "logps/rejected": -727.5596923828125, + "loss": 0.0113, + "rewards/chosen": 4.068442344665527, + "rewards/margins": 13.740826606750488, + "rewards/rejected": -9.672384262084961, + "step": 7489 + }, + { + "epoch": 0.6843307446322522, + "grad_norm": 9.5625, + "kl": 0.0, + "learning_rate": 2.2724853432610832e-06, + "logits/chosen": 822373205.3333334, + "logits/rejected": 770317248.0, + "logps/chosen": -272.3291829427083, + "logps/rejected": -613.2833251953125, + "loss": 0.0231, + "rewards/chosen": 3.988265037536621, + "rewards/margins": 15.190271377563477, + "rewards/rejected": -11.202006340026855, + "step": 7490 + }, + { + "epoch": 0.6844221105527638, + "grad_norm": 3.71875, + "kl": 0.0, + "learning_rate": 2.27128042318397e-06, + "logits/chosen": 962149696.0, + "logits/rejected": 737174848.0, + "logps/chosen": -206.3031768798828, + "logps/rejected": -368.693359375, + "loss": 0.1274, + "rewards/chosen": 4.128870964050293, + "rewards/margins": 11.601208686828613, + "rewards/rejected": -7.47233772277832, + "step": 7491 + }, + { + "epoch": 0.6845134764732754, + "grad_norm": 3.140625, + "kl": 0.0, + "learning_rate": 2.2700757287466817e-06, + "logits/chosen": 399389909.3333333, + "logits/rejected": 403300768.0, + "logps/chosen": -248.2254842122396, + "logps/rejected": -310.9437255859375, + "loss": 0.1636, + "rewards/chosen": 2.276400883992513, + "rewards/margins": 10.220528920491537, + "rewards/rejected": -7.944128036499023, + "step": 7492 + }, + { + "epoch": 0.6846048423937872, + "grad_norm": 4.34375, + "kl": 0.0, + "learning_rate": 2.2688712600488394e-06, + "logits/chosen": 710608128.0, + "logits/rejected": 1127357952.0, + "logps/chosen": -379.5773518880208, + "logps/rejected": -441.47271728515625, + "loss": 0.0299, + "rewards/chosen": 3.4600823720296225, + "rewards/margins": 12.675892194112143, + "rewards/rejected": -9.21580982208252, + "step": 7493 + }, + { + "epoch": 0.6846962083142988, + "grad_norm": 3.71875, + "kl": 0.0, + "learning_rate": 2.2676670171900395e-06, + "logits/chosen": 379002240.0, + "logits/rejected": 547987712.0, + "logps/chosen": -344.4359130859375, + "logps/rejected": -537.666748046875, + "loss": 0.015, + "rewards/chosen": 4.394133567810059, + "rewards/margins": 14.518136024475098, + "rewards/rejected": -10.124002456665039, + "step": 7494 + }, + { + "epoch": 0.6847875742348104, + "grad_norm": 2.8125, + "kl": 0.0, + "learning_rate": 2.2664630002698616e-06, + "logits/chosen": 389435221.3333333, + "logits/rejected": 578930816.0, + "logps/chosen": -318.7311604817708, + "logps/rejected": -746.1735229492188, + "loss": 0.0166, + "rewards/chosen": 4.307523091634114, + "rewards/margins": 15.698609670003254, + "rewards/rejected": -11.39108657836914, + "step": 7495 + }, + { + "epoch": 0.684878940155322, + "grad_norm": 0.263671875, + "kl": 0.0, + "learning_rate": 2.265259209387867e-06, + "logits/chosen": 406229248.0, + "logits/rejected": 428848076.8, + "logps/chosen": -435.7750244140625, + "logps/rejected": -437.727392578125, + "loss": 0.0014, + "rewards/chosen": 6.157594045003255, + "rewards/margins": 15.57647221883138, + "rewards/rejected": -9.418878173828125, + "step": 7496 + }, + { + "epoch": 0.6849703060758338, + "grad_norm": 1.2734375, + "kl": 0.0, + "learning_rate": 2.264055644643595e-06, + "logits/chosen": 1011292608.0, + "logits/rejected": 468722176.0, + "logps/chosen": -333.2481994628906, + "logps/rejected": -324.6782633463542, + "loss": 0.0076, + "rewards/chosen": 3.6423544883728027, + "rewards/margins": 11.857138474782309, + "rewards/rejected": -8.214783986409506, + "step": 7497 + }, + { + "epoch": 0.6850616719963454, + "grad_norm": 5.125, + "kl": 0.0, + "learning_rate": 2.2628523061365756e-06, + "logits/chosen": 772669440.0, + "logits/rejected": 359676373.3333333, + "logps/chosen": -204.87666015625, + "logps/rejected": -541.2692057291666, + "loss": 0.0325, + "rewards/chosen": 3.724953460693359, + "rewards/margins": 13.027010726928712, + "rewards/rejected": -9.302057266235352, + "step": 7498 + }, + { + "epoch": 0.685153037916857, + "grad_norm": 2.421875, + "kl": 0.0, + "learning_rate": 2.261649193966309e-06, + "logits/chosen": 468393173.3333333, + "logits/rejected": 405938918.4, + "logps/chosen": -198.38155110677084, + "logps/rejected": -406.6048828125, + "loss": 0.0164, + "rewards/chosen": 3.115753491719564, + "rewards/margins": 11.918883069356283, + "rewards/rejected": -8.803129577636719, + "step": 7499 + }, + { + "epoch": 0.6852444038373686, + "grad_norm": 3.65625, + "kl": 0.0, + "learning_rate": 2.2604463082322825e-06, + "logits/chosen": 1171013120.0, + "logits/rejected": 922505472.0, + "logps/chosen": -200.88233947753906, + "logps/rejected": -576.2469482421875, + "loss": 0.022, + "rewards/chosen": 3.7914466857910156, + "rewards/margins": 14.311712265014648, + "rewards/rejected": -10.520265579223633, + "step": 7500 + }, + { + "epoch": 0.6853357697578804, + "grad_norm": 2.890625, + "kl": 0.0, + "learning_rate": 2.2592436490339624e-06, + "logits/chosen": 343236672.0, + "logits/rejected": 575852352.0, + "logps/chosen": -449.1739807128906, + "logps/rejected": -522.6257934570312, + "loss": 0.0147, + "rewards/chosen": 3.9252586364746094, + "rewards/margins": 13.668022155761719, + "rewards/rejected": -9.74276351928711, + "step": 7501 + }, + { + "epoch": 0.685427135678392, + "grad_norm": 5.625, + "kl": 0.0, + "learning_rate": 2.258041216470801e-06, + "logits/chosen": 691998848.0, + "logits/rejected": 612988288.0, + "logps/chosen": -366.03741455078125, + "logps/rejected": -462.143798828125, + "loss": 0.0064, + "rewards/chosen": 5.2935791015625, + "rewards/margins": 14.553527196248373, + "rewards/rejected": -9.259948094685873, + "step": 7502 + }, + { + "epoch": 0.6855185015989036, + "grad_norm": 2.171875, + "kl": 0.0, + "learning_rate": 2.2568390106422263e-06, + "logits/chosen": 612531456.0, + "logits/rejected": 793654272.0, + "logps/chosen": -538.2230631510416, + "logps/rejected": -681.722119140625, + "loss": 0.0112, + "rewards/chosen": 3.575290044148763, + "rewards/margins": 14.977682622273763, + "rewards/rejected": -11.402392578125, + "step": 7503 + }, + { + "epoch": 0.6856098675194152, + "grad_norm": 1.6484375, + "kl": 0.0, + "learning_rate": 2.255637031647649e-06, + "logits/chosen": 559839317.3333334, + "logits/rejected": 357676723.2, + "logps/chosen": -476.0674641927083, + "logps/rejected": -356.918701171875, + "loss": 0.007, + "rewards/chosen": 4.4411195119222, + "rewards/margins": 13.875269444783527, + "rewards/rejected": -9.434149932861327, + "step": 7504 + }, + { + "epoch": 0.685701233439927, + "grad_norm": 3.171875, + "kl": 0.0, + "learning_rate": 2.2544352795864616e-06, + "logits/chosen": 779155797.3333334, + "logits/rejected": 593936793.6, + "logps/chosen": -540.2734781901041, + "logps/rejected": -336.0296142578125, + "loss": 0.0127, + "rewards/chosen": 4.021910667419434, + "rewards/margins": 12.556692314147949, + "rewards/rejected": -8.534781646728515, + "step": 7505 + }, + { + "epoch": 0.6857925993604386, + "grad_norm": 1.8203125, + "kl": 0.0, + "learning_rate": 2.253233754558039e-06, + "logits/chosen": 340342656.0, + "logits/rejected": 503992000.0, + "logps/chosen": -271.2862243652344, + "logps/rejected": -340.1205749511719, + "loss": 0.017, + "rewards/chosen": 3.6621360778808594, + "rewards/margins": 12.403545379638672, + "rewards/rejected": -8.741409301757812, + "step": 7506 + }, + { + "epoch": 0.6858839652809502, + "grad_norm": 0.546875, + "kl": 0.0, + "learning_rate": 2.252032456661736e-06, + "logits/chosen": 217416992.0, + "logits/rejected": 530818816.0, + "logps/chosen": -145.93157958984375, + "logps/rejected": -447.0052083333333, + "loss": 0.0025, + "rewards/chosen": 5.316493988037109, + "rewards/margins": 14.1445681254069, + "rewards/rejected": -8.828074137369791, + "step": 7507 + }, + { + "epoch": 0.6859753312014618, + "grad_norm": 1.5703125, + "kl": 0.0, + "learning_rate": 2.2508313859968874e-06, + "logits/chosen": 628782762.6666666, + "logits/rejected": 757151846.4, + "logps/chosen": -447.9108072916667, + "logps/rejected": -382.138232421875, + "loss": 0.0075, + "rewards/chosen": 4.521676699320476, + "rewards/margins": 11.81808172861735, + "rewards/rejected": -7.296405029296875, + "step": 7508 + }, + { + "epoch": 0.6860666971219735, + "grad_norm": 2.40625, + "kl": 0.0, + "learning_rate": 2.24963054266281e-06, + "logits/chosen": 497282208.0, + "logits/rejected": 681127424.0, + "logps/chosen": -329.66156005859375, + "logps/rejected": -637.4655151367188, + "loss": 0.0132, + "rewards/chosen": 3.7282845973968506, + "rewards/margins": 14.499997854232788, + "rewards/rejected": -10.771713256835938, + "step": 7509 + }, + { + "epoch": 0.6861580630424852, + "grad_norm": 3.6875, + "kl": 0.0, + "learning_rate": 2.2484299267588063e-06, + "logits/chosen": 504681941.3333333, + "logits/rejected": 435910976.0, + "logps/chosen": -239.0306599934896, + "logps/rejected": -397.706298828125, + "loss": 0.0199, + "rewards/chosen": 4.493609110514323, + "rewards/margins": 12.257255236307781, + "rewards/rejected": -7.763646125793457, + "step": 7510 + }, + { + "epoch": 0.6862494289629968, + "grad_norm": 1.609375, + "kl": 0.0, + "learning_rate": 2.2472295383841532e-06, + "logits/chosen": 365774784.0, + "logits/rejected": 388377632.0, + "logps/chosen": -333.9225158691406, + "logps/rejected": -450.20269775390625, + "loss": 0.011, + "rewards/chosen": 3.936868190765381, + "rewards/margins": 14.187831401824951, + "rewards/rejected": -10.25096321105957, + "step": 7511 + }, + { + "epoch": 0.6863407948835084, + "grad_norm": 28.5, + "kl": 0.0, + "learning_rate": 2.2460293776381126e-06, + "logits/chosen": 386443264.0, + "logits/rejected": 468408985.6, + "logps/chosen": -341.8138427734375, + "logps/rejected": -335.6547119140625, + "loss": 0.0191, + "rewards/chosen": 4.8807172775268555, + "rewards/margins": 13.231113243103028, + "rewards/rejected": -8.350395965576173, + "step": 7512 + }, + { + "epoch": 0.6864321608040201, + "grad_norm": 0.359375, + "kl": 0.0, + "learning_rate": 2.244829444619924e-06, + "logits/chosen": 646385600.0, + "logits/rejected": 462863140.5714286, + "logps/chosen": -457.4166259765625, + "logps/rejected": -469.90248325892856, + "loss": 0.001, + "rewards/chosen": 5.0199127197265625, + "rewards/margins": 13.816370282854352, + "rewards/rejected": -8.79645756312779, + "step": 7513 + }, + { + "epoch": 0.6865235267245318, + "grad_norm": 3.46875, + "kl": 0.0, + "learning_rate": 2.243629739428816e-06, + "logits/chosen": 1565674325.3333333, + "logits/rejected": 1043077529.6, + "logps/chosen": -342.4756673177083, + "logps/rejected": -516.201904296875, + "loss": 0.0231, + "rewards/chosen": 3.0507593154907227, + "rewards/margins": 13.567600440979003, + "rewards/rejected": -10.51684112548828, + "step": 7514 + }, + { + "epoch": 0.6866148926450434, + "grad_norm": 1.28125, + "kl": 0.0, + "learning_rate": 2.242430262163989e-06, + "logits/chosen": 638399424.0, + "logits/rejected": 412489301.3333333, + "logps/chosen": -486.048583984375, + "logps/rejected": -420.0867513020833, + "loss": 0.0057, + "rewards/chosen": 3.8318939208984375, + "rewards/margins": 13.53268051147461, + "rewards/rejected": -9.700786590576172, + "step": 7515 + }, + { + "epoch": 0.686706258565555, + "grad_norm": 15.625, + "kl": 0.0, + "learning_rate": 2.241231012924631e-06, + "logits/chosen": 682046976.0, + "logits/rejected": 390306517.3333333, + "logps/chosen": -494.4669189453125, + "logps/rejected": -394.6472574869792, + "loss": 0.1086, + "rewards/chosen": 3.678617000579834, + "rewards/margins": 9.781310558319092, + "rewards/rejected": -6.102693557739258, + "step": 7516 + }, + { + "epoch": 0.6867976244860667, + "grad_norm": 1.5078125, + "kl": 0.0, + "learning_rate": 2.2400319918099073e-06, + "logits/chosen": 434668864.0, + "logits/rejected": 324560192.0, + "logps/chosen": -294.91522216796875, + "logps/rejected": -381.36077880859375, + "loss": 0.0108, + "rewards/chosen": 3.968327045440674, + "rewards/margins": 12.469549655914307, + "rewards/rejected": -8.501222610473633, + "step": 7517 + }, + { + "epoch": 0.6868889904065784, + "grad_norm": 2.265625, + "kl": 0.0, + "learning_rate": 2.238833198918964e-06, + "logits/chosen": 648615296.0, + "logits/rejected": 1219152128.0, + "logps/chosen": -265.8612976074219, + "logps/rejected": -438.1198425292969, + "loss": 0.0126, + "rewards/chosen": 4.100836753845215, + "rewards/margins": 12.454632759094238, + "rewards/rejected": -8.353796005249023, + "step": 7518 + }, + { + "epoch": 0.68698035632709, + "grad_norm": 4.0, + "kl": 0.0, + "learning_rate": 2.2376346343509343e-06, + "logits/chosen": 975338700.8, + "logits/rejected": 600728661.3333334, + "logps/chosen": -317.361181640625, + "logps/rejected": -378.33984375, + "loss": 0.0333, + "rewards/chosen": 3.1694093704223634, + "rewards/margins": 12.944942410786947, + "rewards/rejected": -9.775533040364584, + "step": 7519 + }, + { + "epoch": 0.6870717222476016, + "grad_norm": 1.4609375, + "kl": 0.0, + "learning_rate": 2.2364362982049265e-06, + "logits/chosen": 553070976.0, + "logits/rejected": 467285205.3333333, + "logps/chosen": -334.5627136230469, + "logps/rejected": -400.05126953125, + "loss": 0.0096, + "rewards/chosen": 3.389415740966797, + "rewards/margins": 12.15063730875651, + "rewards/rejected": -8.761221567789713, + "step": 7520 + }, + { + "epoch": 0.6871630881681133, + "grad_norm": 2.1875, + "kl": 0.0, + "learning_rate": 2.2352381905800325e-06, + "logits/chosen": 408128512.0, + "logits/rejected": 586538410.6666666, + "logps/chosen": -327.3482666015625, + "logps/rejected": -696.5621744791666, + "loss": 0.0148, + "rewards/chosen": 4.188266372680664, + "rewards/margins": 12.179230626424154, + "rewards/rejected": -7.990964253743489, + "step": 7521 + }, + { + "epoch": 0.687254454088625, + "grad_norm": 3.421875, + "kl": 0.0, + "learning_rate": 2.2340403115753212e-06, + "logits/chosen": 486657536.0, + "logits/rejected": 428547029.3333333, + "logps/chosen": -185.7716064453125, + "logps/rejected": -504.84130859375, + "loss": 0.0244, + "rewards/chosen": 4.010235214233399, + "rewards/margins": 12.653918329874674, + "rewards/rejected": -8.643683115641275, + "step": 7522 + }, + { + "epoch": 0.6873458200091366, + "grad_norm": 3.828125, + "kl": 0.0, + "learning_rate": 2.2328426612898514e-06, + "logits/chosen": 579924480.0, + "logits/rejected": 951170688.0, + "logps/chosen": -288.855712890625, + "logps/rejected": -385.2728271484375, + "loss": 0.029, + "rewards/chosen": 3.2574260234832764, + "rewards/margins": 11.076559782028198, + "rewards/rejected": -7.819133758544922, + "step": 7523 + }, + { + "epoch": 0.6874371859296482, + "grad_norm": 5.125, + "kl": 0.0, + "learning_rate": 2.231645239822655e-06, + "logits/chosen": 885133504.0, + "logits/rejected": 645300160.0, + "logps/chosen": -370.20379638671875, + "logps/rejected": -597.93115234375, + "loss": 0.0242, + "rewards/chosen": 3.555919647216797, + "rewards/margins": 12.669662475585938, + "rewards/rejected": -9.11374282836914, + "step": 7524 + }, + { + "epoch": 0.6875285518501599, + "grad_norm": 8.5625, + "kl": 0.0, + "learning_rate": 2.230448047272748e-06, + "logits/chosen": 1201904640.0, + "logits/rejected": 698357504.0, + "logps/chosen": -526.8555908203125, + "logps/rejected": -596.6414794921875, + "loss": 0.0224, + "rewards/chosen": 3.2100753784179688, + "rewards/margins": 14.110969543457031, + "rewards/rejected": -10.900894165039062, + "step": 7525 + }, + { + "epoch": 0.6876199177706716, + "grad_norm": 3.265625, + "kl": 0.0, + "learning_rate": 2.229251083739127e-06, + "logits/chosen": 396060992.0, + "logits/rejected": 710055360.0, + "logps/chosen": -232.86050415039062, + "logps/rejected": -496.1868896484375, + "loss": 0.019, + "rewards/chosen": 3.942612886428833, + "rewards/margins": 12.604460954666138, + "rewards/rejected": -8.661848068237305, + "step": 7526 + }, + { + "epoch": 0.6877112836911832, + "grad_norm": 3.140625, + "kl": 0.0, + "learning_rate": 2.2280543493207678e-06, + "logits/chosen": 600703424.0, + "logits/rejected": 759537024.0, + "logps/chosen": -297.7827453613281, + "logps/rejected": -486.8220520019531, + "loss": 0.017, + "rewards/chosen": 3.5769028663635254, + "rewards/margins": 14.432718753814697, + "rewards/rejected": -10.855815887451172, + "step": 7527 + }, + { + "epoch": 0.6878026496116948, + "grad_norm": 3.375, + "kl": 0.0, + "learning_rate": 2.2268578441166332e-06, + "logits/chosen": 361239398.4, + "logits/rejected": 320157034.6666667, + "logps/chosen": -270.587744140625, + "logps/rejected": -299.12939453125, + "loss": 0.0205, + "rewards/chosen": 4.149932861328125, + "rewards/margins": 12.31716537475586, + "rewards/rejected": -8.167232513427734, + "step": 7528 + }, + { + "epoch": 0.6878940155322065, + "grad_norm": 1.5546875, + "kl": 0.0, + "learning_rate": 2.2256615682256615e-06, + "logits/chosen": 293451136.0, + "logits/rejected": 495224576.0, + "logps/chosen": -177.969287109375, + "logps/rejected": -598.53076171875, + "loss": 0.0137, + "rewards/chosen": 4.067091369628907, + "rewards/margins": 13.168511454264323, + "rewards/rejected": -9.101420084635416, + "step": 7529 + }, + { + "epoch": 0.6879853814527181, + "grad_norm": 0.3359375, + "kl": 0.0, + "learning_rate": 2.2244655217467733e-06, + "logits/chosen": 213477216.0, + "logits/rejected": 477380960.0, + "logps/chosen": -202.49240112304688, + "logps/rejected": -592.0811157226562, + "loss": 0.0018, + "rewards/chosen": 5.931879043579102, + "rewards/margins": 14.97679328918457, + "rewards/rejected": -9.044914245605469, + "step": 7530 + }, + { + "epoch": 0.6880767473732298, + "grad_norm": 2.640625, + "kl": 0.0, + "learning_rate": 2.223269704778868e-06, + "logits/chosen": 547104597.3333334, + "logits/rejected": 216304320.0, + "logps/chosen": -339.5286458333333, + "logps/rejected": -408.4642639160156, + "loss": 0.0267, + "rewards/chosen": 3.589721361796061, + "rewards/margins": 14.713946978251139, + "rewards/rejected": -11.124225616455078, + "step": 7531 + }, + { + "epoch": 0.6881681132937414, + "grad_norm": 45.5, + "kl": 0.0, + "learning_rate": 2.2220741174208355e-06, + "logits/chosen": 490590080.0, + "logits/rejected": 569619840.0, + "logps/chosen": -363.4696350097656, + "logps/rejected": -507.5592346191406, + "loss": 0.0669, + "rewards/chosen": 4.378796577453613, + "rewards/margins": 11.823616027832031, + "rewards/rejected": -7.444819450378418, + "step": 7532 + }, + { + "epoch": 0.6882594792142531, + "grad_norm": 1.171875, + "kl": 0.0, + "learning_rate": 2.220878759771533e-06, + "logits/chosen": 724146688.0, + "logits/rejected": 336744557.71428573, + "logps/chosen": -323.2743835449219, + "logps/rejected": -332.98043387276783, + "loss": 0.0039, + "rewards/chosen": 3.453683614730835, + "rewards/margins": 13.557565450668335, + "rewards/rejected": -10.1038818359375, + "step": 7533 + }, + { + "epoch": 0.6883508451347647, + "grad_norm": 2.546875, + "kl": 0.0, + "learning_rate": 2.2196836319298064e-06, + "logits/chosen": 610134912.0, + "logits/rejected": 581950016.0, + "logps/chosen": -296.9454040527344, + "logps/rejected": -681.3773193359375, + "loss": 0.0156, + "rewards/chosen": 3.7741055488586426, + "rewards/margins": 13.166686534881592, + "rewards/rejected": -9.39258098602295, + "step": 7534 + }, + { + "epoch": 0.6884422110552764, + "grad_norm": 49.0, + "kl": 0.0, + "learning_rate": 2.2184887339944854e-06, + "logits/chosen": 687857322.6666666, + "logits/rejected": 379012480.0, + "logps/chosen": -302.3622639973958, + "logps/rejected": -404.92584228515625, + "loss": 0.0752, + "rewards/chosen": 3.1779518127441406, + "rewards/margins": 12.511722564697266, + "rewards/rejected": -9.333770751953125, + "step": 7535 + }, + { + "epoch": 0.688533576975788, + "grad_norm": 2.546875, + "kl": 0.0, + "learning_rate": 2.2172940660643743e-06, + "logits/chosen": 570752341.3333334, + "logits/rejected": 693798963.2, + "logps/chosen": -214.99161783854166, + "logps/rejected": -511.70556640625, + "loss": 0.0149, + "rewards/chosen": 3.6120630900065103, + "rewards/margins": 11.992180887858073, + "rewards/rejected": -8.380117797851563, + "step": 7536 + }, + { + "epoch": 0.6886249428962997, + "grad_norm": 1.765625, + "kl": 0.0, + "learning_rate": 2.2160996282382617e-06, + "logits/chosen": 483219616.0, + "logits/rejected": 512175658.6666667, + "logps/chosen": -335.20623779296875, + "logps/rejected": -471.6399739583333, + "loss": 0.009, + "rewards/chosen": 3.327235460281372, + "rewards/margins": 13.641549984614054, + "rewards/rejected": -10.314314524332682, + "step": 7537 + }, + { + "epoch": 0.6887163088168113, + "grad_norm": 1.390625, + "kl": 0.0, + "learning_rate": 2.2149054206149166e-06, + "logits/chosen": 616977024.0, + "logits/rejected": 618346837.3333334, + "logps/chosen": -396.3961181640625, + "logps/rejected": -716.3905436197916, + "loss": 0.0056, + "rewards/chosen": 3.8172364234924316, + "rewards/margins": 13.63545529047648, + "rewards/rejected": -9.818218866984049, + "step": 7538 + }, + { + "epoch": 0.688807674737323, + "grad_norm": 1.515625, + "kl": 0.0, + "learning_rate": 2.213711443293087e-06, + "logits/chosen": 555942144.0, + "logits/rejected": 465645363.2, + "logps/chosen": -339.04872639973956, + "logps/rejected": -530.453515625, + "loss": 0.0081, + "rewards/chosen": 3.971655527750651, + "rewards/margins": 13.052902475992838, + "rewards/rejected": -9.081246948242187, + "step": 7539 + }, + { + "epoch": 0.6888990406578346, + "grad_norm": 1.6953125, + "kl": 0.0, + "learning_rate": 2.212517696371507e-06, + "logits/chosen": 663989034.6666666, + "logits/rejected": 412534835.2, + "logps/chosen": -235.395751953125, + "logps/rejected": -443.0484375, + "loss": 0.009, + "rewards/chosen": 4.326359113057454, + "rewards/margins": 13.023263104756673, + "rewards/rejected": -8.69690399169922, + "step": 7540 + }, + { + "epoch": 0.6889904065783463, + "grad_norm": 4.9375, + "kl": 0.0, + "learning_rate": 2.211324179948888e-06, + "logits/chosen": 598305194.6666666, + "logits/rejected": 970098944.0, + "logps/chosen": -406.1588134765625, + "logps/rejected": -578.8251953125, + "loss": 0.0315, + "rewards/chosen": 3.8379383087158203, + "rewards/margins": 11.91545295715332, + "rewards/rejected": -8.0775146484375, + "step": 7541 + }, + { + "epoch": 0.6890817724988579, + "grad_norm": 1.875, + "kl": 0.0, + "learning_rate": 2.2101308941239204e-06, + "logits/chosen": 391655424.0, + "logits/rejected": 749664896.0, + "logps/chosen": -273.7970377604167, + "logps/rejected": -289.6824645996094, + "loss": 0.014, + "rewards/chosen": 4.187614440917969, + "rewards/margins": 11.684284210205078, + "rewards/rejected": -7.496669769287109, + "step": 7542 + }, + { + "epoch": 0.6891731384193696, + "grad_norm": 1.484375, + "kl": 0.0, + "learning_rate": 2.2089378389952777e-06, + "logits/chosen": 471213920.0, + "logits/rejected": 373892010.6666667, + "logps/chosen": -202.06747436523438, + "logps/rejected": -461.624267578125, + "loss": 0.0086, + "rewards/chosen": 3.351734161376953, + "rewards/margins": 13.541067759195963, + "rewards/rejected": -10.18933359781901, + "step": 7543 + }, + { + "epoch": 0.6892645043398812, + "grad_norm": 46.0, + "kl": 0.0, + "learning_rate": 2.2077450146616175e-06, + "logits/chosen": 684300544.0, + "logits/rejected": 543504000.0, + "logps/chosen": -314.43310546875, + "logps/rejected": -418.7064208984375, + "loss": 0.1017, + "rewards/chosen": 2.4985716342926025, + "rewards/margins": 13.552847146987915, + "rewards/rejected": -11.054275512695312, + "step": 7544 + }, + { + "epoch": 0.6893558702603929, + "grad_norm": 50.25, + "kl": 0.0, + "learning_rate": 2.2065524212215744e-06, + "logits/chosen": 702170688.0, + "logits/rejected": 476202880.0, + "logps/chosen": -207.929443359375, + "logps/rejected": -555.5001831054688, + "loss": 0.079, + "rewards/chosen": 3.2593531608581543, + "rewards/margins": 11.850688457489014, + "rewards/rejected": -8.59133529663086, + "step": 7545 + }, + { + "epoch": 0.6894472361809045, + "grad_norm": 2.140625, + "kl": 0.0, + "learning_rate": 2.205360058773764e-06, + "logits/chosen": 311069354.6666667, + "logits/rejected": 167492864.0, + "logps/chosen": -169.60499064127603, + "logps/rejected": -274.8843994140625, + "loss": 0.0172, + "rewards/chosen": 4.935675938924153, + "rewards/margins": 11.908313115437824, + "rewards/rejected": -6.972637176513672, + "step": 7546 + }, + { + "epoch": 0.6895386021014162, + "grad_norm": 2.890625, + "kl": 0.0, + "learning_rate": 2.2041679274167832e-06, + "logits/chosen": 467436416.0, + "logits/rejected": 504077107.2, + "logps/chosen": -138.74920654296875, + "logps/rejected": -482.060400390625, + "loss": 0.0241, + "rewards/chosen": 3.187321662902832, + "rewards/margins": 11.876627540588379, + "rewards/rejected": -8.689305877685547, + "step": 7547 + }, + { + "epoch": 0.6896299680219278, + "grad_norm": 2.03125, + "kl": 0.0, + "learning_rate": 2.2029760272492097e-06, + "logits/chosen": 466259285.3333333, + "logits/rejected": 743488307.2, + "logps/chosen": -341.0452880859375, + "logps/rejected": -346.610546875, + "loss": 0.0077, + "rewards/chosen": 4.630688667297363, + "rewards/margins": 13.093855476379394, + "rewards/rejected": -8.463166809082031, + "step": 7548 + }, + { + "epoch": 0.6897213339424395, + "grad_norm": 1.0078125, + "kl": 0.0, + "learning_rate": 2.2017843583696054e-06, + "logits/chosen": 288889164.8, + "logits/rejected": 457919061.3333333, + "logps/chosen": -352.614599609375, + "logps/rejected": -682.3025309244791, + "loss": 0.0053, + "rewards/chosen": 5.120208740234375, + "rewards/margins": 16.806016286214195, + "rewards/rejected": -11.685807545979818, + "step": 7549 + }, + { + "epoch": 0.6898126998629511, + "grad_norm": 3.796875, + "kl": 0.0, + "learning_rate": 2.2005929208765087e-06, + "logits/chosen": 807209386.6666666, + "logits/rejected": 449304832.0, + "logps/chosen": -341.7076822916667, + "logps/rejected": -458.7680358886719, + "loss": 0.0297, + "rewards/chosen": 3.35484250386556, + "rewards/margins": 11.821400006612143, + "rewards/rejected": -8.466557502746582, + "step": 7550 + }, + { + "epoch": 0.6899040657834627, + "grad_norm": 13.8125, + "kl": 0.0, + "learning_rate": 2.1994017148684397e-06, + "logits/chosen": 220286822.4, + "logits/rejected": 200394880.0, + "logps/chosen": -171.1563720703125, + "logps/rejected": -399.2207845052083, + "loss": 0.0819, + "rewards/chosen": 4.421833038330078, + "rewards/margins": 12.065662384033203, + "rewards/rejected": -7.643829345703125, + "step": 7551 + }, + { + "epoch": 0.6899954317039744, + "grad_norm": 2.375, + "kl": 0.0, + "learning_rate": 2.1982107404438994e-06, + "logits/chosen": 580329152.0, + "logits/rejected": 446435392.0, + "logps/chosen": -226.76751708984375, + "logps/rejected": -824.9599609375, + "loss": 0.0226, + "rewards/chosen": 3.40232515335083, + "rewards/margins": 15.6828293800354, + "rewards/rejected": -12.28050422668457, + "step": 7552 + }, + { + "epoch": 0.6900867976244861, + "grad_norm": 2.96875, + "kl": 0.0, + "learning_rate": 2.197019997701374e-06, + "logits/chosen": 881961984.0, + "logits/rejected": 610438016.0, + "logps/chosen": -380.2062174479167, + "logps/rejected": -549.621337890625, + "loss": 0.0233, + "rewards/chosen": 3.8411731719970703, + "rewards/margins": 11.871283531188965, + "rewards/rejected": -8.030110359191895, + "step": 7553 + }, + { + "epoch": 0.6901781635449977, + "grad_norm": 4.875, + "kl": 0.0, + "learning_rate": 2.1958294867393245e-06, + "logits/chosen": 383436224.0, + "logits/rejected": 424878848.0, + "logps/chosen": -289.4970397949219, + "logps/rejected": -505.0177815755208, + "loss": 0.0243, + "rewards/chosen": 2.3397552967071533, + "rewards/margins": 11.816250403722128, + "rewards/rejected": -9.476495107014975, + "step": 7554 + }, + { + "epoch": 0.6902695294655093, + "grad_norm": 5.28125, + "kl": 0.0, + "learning_rate": 2.1946392076561945e-06, + "logits/chosen": 456517568.0, + "logits/rejected": 454285152.0, + "logps/chosen": -294.86004638671875, + "logps/rejected": -555.0567016601562, + "loss": 0.0376, + "rewards/chosen": 2.5905652046203613, + "rewards/margins": 13.267778873443604, + "rewards/rejected": -10.677213668823242, + "step": 7555 + }, + { + "epoch": 0.690360895386021, + "grad_norm": 51.75, + "kl": 0.0, + "learning_rate": 2.193449160550409e-06, + "logits/chosen": 543979264.0, + "logits/rejected": 511792844.8, + "logps/chosen": -426.185546875, + "logps/rejected": -630.874072265625, + "loss": 0.0362, + "rewards/chosen": 2.710508346557617, + "rewards/margins": 12.963630294799804, + "rewards/rejected": -10.253121948242187, + "step": 7556 + }, + { + "epoch": 0.6904522613065327, + "grad_norm": 3.96875, + "kl": 0.0, + "learning_rate": 2.1922593455203755e-06, + "logits/chosen": 884305280.0, + "logits/rejected": 542656960.0, + "logps/chosen": -399.8922119140625, + "logps/rejected": -286.57781982421875, + "loss": 0.0165, + "rewards/chosen": 4.07301139831543, + "rewards/margins": 11.582921981811523, + "rewards/rejected": -7.509910583496094, + "step": 7557 + }, + { + "epoch": 0.6905436272270443, + "grad_norm": 1.1875, + "kl": 0.0, + "learning_rate": 2.19106976266448e-06, + "logits/chosen": 352771712.0, + "logits/rejected": 563193429.3333334, + "logps/chosen": -298.42926025390625, + "logps/rejected": -541.6855061848959, + "loss": 0.0047, + "rewards/chosen": 4.638354301452637, + "rewards/margins": 13.126720110575357, + "rewards/rejected": -8.48836580912272, + "step": 7558 + }, + { + "epoch": 0.6906349931475559, + "grad_norm": 1.390625, + "kl": 0.0, + "learning_rate": 2.1898804120810916e-06, + "logits/chosen": 952813056.0, + "logits/rejected": 888945459.2, + "logps/chosen": -330.90504964192706, + "logps/rejected": -541.4732421875, + "loss": 0.0071, + "rewards/chosen": 4.126217524210612, + "rewards/margins": 14.821211878458659, + "rewards/rejected": -10.694994354248047, + "step": 7559 + }, + { + "epoch": 0.6907263590680676, + "grad_norm": 3.75, + "kl": 0.0, + "learning_rate": 2.188691293868554e-06, + "logits/chosen": 450649344.0, + "logits/rejected": 339739904.0, + "logps/chosen": -287.74542236328125, + "logps/rejected": -375.3216247558594, + "loss": 0.019, + "rewards/chosen": 3.634556770324707, + "rewards/margins": 11.5299711227417, + "rewards/rejected": -7.895414352416992, + "step": 7560 + }, + { + "epoch": 0.6908177249885793, + "grad_norm": 0.91796875, + "kl": 0.0, + "learning_rate": 2.1875024081252e-06, + "logits/chosen": 700596864.0, + "logits/rejected": 1004787712.0, + "logps/chosen": -397.57879638671875, + "logps/rejected": -563.7483258928571, + "loss": 0.0042, + "rewards/chosen": 3.5701873302459717, + "rewards/margins": 11.918092625481743, + "rewards/rejected": -8.34790529523577, + "step": 7561 + }, + { + "epoch": 0.6909090909090909, + "grad_norm": 3.328125, + "kl": 0.0, + "learning_rate": 2.1863137549493387e-06, + "logits/chosen": 516455722.6666667, + "logits/rejected": 948794432.0, + "logps/chosen": -408.8449300130208, + "logps/rejected": -274.63653564453125, + "loss": 0.0247, + "rewards/chosen": 3.552138646443685, + "rewards/margins": 11.288019498189291, + "rewards/rejected": -7.7358808517456055, + "step": 7562 + }, + { + "epoch": 0.6910004568296025, + "grad_norm": 1.25, + "kl": 0.0, + "learning_rate": 2.1851253344392605e-06, + "logits/chosen": 336244416.0, + "logits/rejected": 500056192.0, + "logps/chosen": -244.70095825195312, + "logps/rejected": -442.4254557291667, + "loss": 0.0051, + "rewards/chosen": 4.570446014404297, + "rewards/margins": 13.855186462402344, + "rewards/rejected": -9.284740447998047, + "step": 7563 + }, + { + "epoch": 0.6910918227501142, + "grad_norm": 3.359375, + "kl": 0.0, + "learning_rate": 2.1839371466932353e-06, + "logits/chosen": 532397856.0, + "logits/rejected": 516090912.0, + "logps/chosen": -184.49856567382812, + "logps/rejected": -498.8082275390625, + "loss": 0.0143, + "rewards/chosen": 4.500722408294678, + "rewards/margins": 13.159465312957764, + "rewards/rejected": -8.658742904663086, + "step": 7564 + }, + { + "epoch": 0.6911831886706259, + "grad_norm": 2.9375, + "kl": 0.0, + "learning_rate": 2.182749191809518e-06, + "logits/chosen": 469236906.6666667, + "logits/rejected": 445550489.6, + "logps/chosen": -237.79243977864584, + "logps/rejected": -470.1765625, + "loss": 0.0148, + "rewards/chosen": 3.658797264099121, + "rewards/margins": 11.216994285583496, + "rewards/rejected": -7.558197021484375, + "step": 7565 + }, + { + "epoch": 0.6912745545911375, + "grad_norm": 6.875, + "kl": 0.0, + "learning_rate": 2.1815614698863402e-06, + "logits/chosen": 669002752.0, + "logits/rejected": 1040003392.0, + "logps/chosen": -367.1048278808594, + "logps/rejected": -500.2486572265625, + "loss": 0.0731, + "rewards/chosen": 4.371997833251953, + "rewards/margins": 13.111810684204102, + "rewards/rejected": -8.739812850952148, + "step": 7566 + }, + { + "epoch": 0.6913659205116491, + "grad_norm": 1.0546875, + "kl": 0.0, + "learning_rate": 2.1803739810219156e-06, + "logits/chosen": 424810496.0, + "logits/rejected": 373710848.0, + "logps/chosen": -317.40673828125, + "logps/rejected": -514.181640625, + "loss": 0.0058, + "rewards/chosen": 5.106297969818115, + "rewards/margins": 16.35731840133667, + "rewards/rejected": -11.251020431518555, + "step": 7567 + }, + { + "epoch": 0.6914572864321608, + "grad_norm": 1.1796875, + "kl": 0.0, + "learning_rate": 2.1791867253144384e-06, + "logits/chosen": 419346432.0, + "logits/rejected": 348592864.0, + "logps/chosen": -209.11651611328125, + "logps/rejected": -473.3660583496094, + "loss": 0.0089, + "rewards/chosen": 4.143190860748291, + "rewards/margins": 14.290061473846436, + "rewards/rejected": -10.146870613098145, + "step": 7568 + }, + { + "epoch": 0.6915486523526725, + "grad_norm": 1.765625, + "kl": 0.0, + "learning_rate": 2.1779997028620815e-06, + "logits/chosen": 534641920.0, + "logits/rejected": 655096384.0, + "logps/chosen": -253.97377014160156, + "logps/rejected": -760.4605712890625, + "loss": 0.0171, + "rewards/chosen": 3.6925556659698486, + "rewards/margins": 15.078646898269653, + "rewards/rejected": -11.386091232299805, + "step": 7569 + }, + { + "epoch": 0.6916400182731841, + "grad_norm": 3.328125, + "kl": 0.0, + "learning_rate": 2.1768129137630045e-06, + "logits/chosen": 859403520.0, + "logits/rejected": 525938432.0, + "logps/chosen": -405.5181884765625, + "logps/rejected": -516.2249755859375, + "loss": 0.0194, + "rewards/chosen": 3.6815643310546875, + "rewards/margins": 13.238380432128906, + "rewards/rejected": -9.556816101074219, + "step": 7570 + }, + { + "epoch": 0.6917313841936957, + "grad_norm": 2.25, + "kl": 0.0, + "learning_rate": 2.1756263581153427e-06, + "logits/chosen": 575772416.0, + "logits/rejected": 426940723.2, + "logps/chosen": -386.7910563151042, + "logps/rejected": -455.031396484375, + "loss": 0.0125, + "rewards/chosen": 3.596550941467285, + "rewards/margins": 13.466781044006348, + "rewards/rejected": -9.870230102539063, + "step": 7571 + }, + { + "epoch": 0.6918227501142074, + "grad_norm": 3.71875, + "kl": 0.0, + "learning_rate": 2.174440036017212e-06, + "logits/chosen": 567230105.6, + "logits/rejected": 754367829.3333334, + "logps/chosen": -272.049755859375, + "logps/rejected": -575.7635498046875, + "loss": 0.018, + "rewards/chosen": 4.026126480102539, + "rewards/margins": 13.17972780863444, + "rewards/rejected": -9.1536013285319, + "step": 7572 + }, + { + "epoch": 0.6919141160347191, + "grad_norm": 36.25, + "kl": 0.0, + "learning_rate": 2.173253947566709e-06, + "logits/chosen": 441966848.0, + "logits/rejected": 340910378.6666667, + "logps/chosen": -264.64482421875, + "logps/rejected": -563.91357421875, + "loss": 0.0559, + "rewards/chosen": 2.865354537963867, + "rewards/margins": 14.232319513956705, + "rewards/rejected": -11.366964975992838, + "step": 7573 + }, + { + "epoch": 0.6920054819552307, + "grad_norm": 2.140625, + "kl": 0.0, + "learning_rate": 2.1720680928619164e-06, + "logits/chosen": 364611264.0, + "logits/rejected": 313925248.0, + "logps/chosen": -253.7918701171875, + "logps/rejected": -406.6795349121094, + "loss": 0.0102, + "rewards/chosen": 4.194986343383789, + "rewards/margins": 12.94808292388916, + "rewards/rejected": -8.753096580505371, + "step": 7574 + }, + { + "epoch": 0.6920968478757423, + "grad_norm": 5.28125, + "kl": 0.0, + "learning_rate": 2.1708824720008913e-06, + "logits/chosen": 870046912.0, + "logits/rejected": 517268000.0, + "logps/chosen": -251.6763458251953, + "logps/rejected": -735.4332275390625, + "loss": 0.0204, + "rewards/chosen": 3.834308385848999, + "rewards/margins": 13.919542074203491, + "rewards/rejected": -10.085233688354492, + "step": 7575 + }, + { + "epoch": 0.692188213796254, + "grad_norm": 5.71875, + "kl": 0.0, + "learning_rate": 2.1696970850816735e-06, + "logits/chosen": 572870570.6666666, + "logits/rejected": 338445926.4, + "logps/chosen": -298.7626953125, + "logps/rejected": -447.05068359375, + "loss": 0.0382, + "rewards/chosen": 2.2780443827311196, + "rewards/margins": 11.670798746744792, + "rewards/rejected": -9.392754364013673, + "step": 7576 + }, + { + "epoch": 0.6922795797167657, + "grad_norm": 1.8125, + "kl": 0.0, + "learning_rate": 2.1685119322022814e-06, + "logits/chosen": 975634368.0, + "logits/rejected": 717579072.0, + "logps/chosen": -337.6546630859375, + "logps/rejected": -486.24554443359375, + "loss": 0.0091, + "rewards/chosen": 4.8060712814331055, + "rewards/margins": 14.7074556350708, + "rewards/rejected": -9.901384353637695, + "step": 7577 + }, + { + "epoch": 0.6923709456372773, + "grad_norm": 4.125, + "kl": 0.0, + "learning_rate": 2.16732701346072e-06, + "logits/chosen": 458244992.0, + "logits/rejected": 394700480.0, + "logps/chosen": -279.3127848307292, + "logps/rejected": -582.0465698242188, + "loss": 0.0276, + "rewards/chosen": 4.0224809646606445, + "rewards/margins": 14.392125129699707, + "rewards/rejected": -10.369644165039062, + "step": 7578 + }, + { + "epoch": 0.6924623115577889, + "grad_norm": 1.03125, + "kl": 0.0, + "learning_rate": 2.1661423289549688e-06, + "logits/chosen": 505937493.3333333, + "logits/rejected": 379861324.8, + "logps/chosen": -231.2330118815104, + "logps/rejected": -504.521826171875, + "loss": 0.005, + "rewards/chosen": 4.840895970662435, + "rewards/margins": 15.951865514119465, + "rewards/rejected": -11.110969543457031, + "step": 7579 + }, + { + "epoch": 0.6925536774783005, + "grad_norm": 2.84375, + "kl": 0.0, + "learning_rate": 2.1649578787829912e-06, + "logits/chosen": 1088238762.6666667, + "logits/rejected": 700832512.0, + "logps/chosen": -670.7271321614584, + "logps/rejected": -531.86044921875, + "loss": 0.0091, + "rewards/chosen": 3.800742785135905, + "rewards/margins": 13.347117296854654, + "rewards/rejected": -9.54637451171875, + "step": 7580 + }, + { + "epoch": 0.6926450433988123, + "grad_norm": 2.953125, + "kl": 0.0, + "learning_rate": 2.1637736630427296e-06, + "logits/chosen": 444462890.6666667, + "logits/rejected": 260004448.0, + "logps/chosen": -283.1094970703125, + "logps/rejected": -316.9043273925781, + "loss": 0.0204, + "rewards/chosen": 3.9536059697469077, + "rewards/margins": 12.695658047993978, + "rewards/rejected": -8.74205207824707, + "step": 7581 + }, + { + "epoch": 0.6927364093193239, + "grad_norm": 5.34375, + "kl": 0.0, + "learning_rate": 2.162589681832106e-06, + "logits/chosen": 471229098.6666667, + "logits/rejected": 805397196.8, + "logps/chosen": -360.314697265625, + "logps/rejected": -453.02822265625, + "loss": 0.0161, + "rewards/chosen": 4.308712323506673, + "rewards/margins": 12.451711591084798, + "rewards/rejected": -8.142999267578125, + "step": 7582 + }, + { + "epoch": 0.6928277752398355, + "grad_norm": 4.71875, + "kl": 0.0, + "learning_rate": 2.161405935249029e-06, + "logits/chosen": 720274368.0, + "logits/rejected": 489758208.0, + "logps/chosen": -97.790771484375, + "logps/rejected": -413.9749232700893, + "loss": 0.0892, + "rewards/chosen": 2.6538612842559814, + "rewards/margins": 10.40877788407462, + "rewards/rejected": -7.754916599818638, + "step": 7583 + }, + { + "epoch": 0.6929191411603471, + "grad_norm": 1.796875, + "kl": 0.0, + "learning_rate": 2.1602224233913803e-06, + "logits/chosen": 595658112.0, + "logits/rejected": 433389440.0, + "logps/chosen": -397.9632568359375, + "logps/rejected": -432.6896057128906, + "loss": 0.0098, + "rewards/chosen": 4.129763126373291, + "rewards/margins": 14.576540470123291, + "rewards/rejected": -10.44677734375, + "step": 7584 + }, + { + "epoch": 0.6930105070808589, + "grad_norm": 16.125, + "kl": 0.0, + "learning_rate": 2.1590391463570264e-06, + "logits/chosen": 689015680.0, + "logits/rejected": 346756970.6666667, + "logps/chosen": -423.8282775878906, + "logps/rejected": -410.2493489583333, + "loss": 0.05, + "rewards/chosen": 2.0534470081329346, + "rewards/margins": 11.533007224400839, + "rewards/rejected": -9.479560216267904, + "step": 7585 + }, + { + "epoch": 0.6931018730013705, + "grad_norm": 1.96875, + "kl": 0.0, + "learning_rate": 2.157856104243813e-06, + "logits/chosen": 932523827.2, + "logits/rejected": 1464548693.3333333, + "logps/chosen": -274.961865234375, + "logps/rejected": -523.1318766276041, + "loss": 0.0212, + "rewards/chosen": 3.6019771575927733, + "rewards/margins": 11.614033508300782, + "rewards/rejected": -8.012056350708008, + "step": 7586 + }, + { + "epoch": 0.6931932389218821, + "grad_norm": 2.375, + "kl": 0.0, + "learning_rate": 2.156673297149566e-06, + "logits/chosen": 516742348.8, + "logits/rejected": 689433898.6666666, + "logps/chosen": -369.66416015625, + "logps/rejected": -800.157470703125, + "loss": 0.0154, + "rewards/chosen": 3.9528762817382814, + "rewards/margins": 17.168302408854167, + "rewards/rejected": -13.215426127115885, + "step": 7587 + }, + { + "epoch": 0.6932846048423937, + "grad_norm": 11.0625, + "kl": 0.0, + "learning_rate": 2.1554907251720947e-06, + "logits/chosen": 514952277.3333333, + "logits/rejected": 440917952.0, + "logps/chosen": -304.25390625, + "logps/rejected": -430.98638916015625, + "loss": 0.0438, + "rewards/chosen": 3.323719342549642, + "rewards/margins": 13.174807866414389, + "rewards/rejected": -9.851088523864746, + "step": 7588 + }, + { + "epoch": 0.6933759707629055, + "grad_norm": 1.609375, + "kl": 0.0, + "learning_rate": 2.154308388409184e-06, + "logits/chosen": 940886272.0, + "logits/rejected": 593053824.0, + "logps/chosen": -194.63209533691406, + "logps/rejected": -723.8865966796875, + "loss": 0.0133, + "rewards/chosen": 4.637167930603027, + "rewards/margins": 17.69722080230713, + "rewards/rejected": -13.060052871704102, + "step": 7589 + }, + { + "epoch": 0.6934673366834171, + "grad_norm": 46.25, + "kl": 0.0, + "learning_rate": 2.153126286958603e-06, + "logits/chosen": 482690976.0, + "logits/rejected": 301688800.0, + "logps/chosen": -227.4168243408203, + "logps/rejected": -535.532470703125, + "loss": 0.0409, + "rewards/chosen": 3.0944626331329346, + "rewards/margins": 14.201724290847778, + "rewards/rejected": -11.107261657714844, + "step": 7590 + }, + { + "epoch": 0.6935587026039287, + "grad_norm": 1.8671875, + "kl": 0.0, + "learning_rate": 2.1519444209181023e-06, + "logits/chosen": 495390240.0, + "logits/rejected": 535040384.0, + "logps/chosen": -319.30328369140625, + "logps/rejected": -550.0288696289062, + "loss": 0.0115, + "rewards/chosen": 4.234390735626221, + "rewards/margins": 14.675219058990479, + "rewards/rejected": -10.440828323364258, + "step": 7591 + }, + { + "epoch": 0.6936500685244403, + "grad_norm": 2.5, + "kl": 0.0, + "learning_rate": 2.1507627903854107e-06, + "logits/chosen": 495584460.8, + "logits/rejected": 663960746.6666666, + "logps/chosen": -321.9438232421875, + "logps/rejected": -402.5555826822917, + "loss": 0.0147, + "rewards/chosen": 4.079899597167969, + "rewards/margins": 13.030230967203774, + "rewards/rejected": -8.950331370035807, + "step": 7592 + }, + { + "epoch": 0.6937414344449521, + "grad_norm": 1.046875, + "kl": 0.0, + "learning_rate": 2.1495813954582373e-06, + "logits/chosen": 341186752.0, + "logits/rejected": 322448192.0, + "logps/chosen": -235.52487182617188, + "logps/rejected": -299.09613037109375, + "loss": 0.0047, + "rewards/chosen": 5.234867572784424, + "rewards/margins": 13.085337162017822, + "rewards/rejected": -7.850469589233398, + "step": 7593 + }, + { + "epoch": 0.6938328003654637, + "grad_norm": 0.984375, + "kl": 0.0, + "learning_rate": 2.148400236234271e-06, + "logits/chosen": 459337898.6666667, + "logits/rejected": 641227264.0, + "logps/chosen": -332.5478515625, + "logps/rejected": -401.524169921875, + "loss": 0.0043, + "rewards/chosen": 4.782501220703125, + "rewards/margins": 14.205718994140625, + "rewards/rejected": -9.4232177734375, + "step": 7594 + }, + { + "epoch": 0.6939241662859753, + "grad_norm": 0.451171875, + "kl": 0.0, + "learning_rate": 2.1472193128111862e-06, + "logits/chosen": 710895744.0, + "logits/rejected": 585239125.3333334, + "logps/chosen": -318.11627197265625, + "logps/rejected": -617.2796223958334, + "loss": 0.002, + "rewards/chosen": 5.335481643676758, + "rewards/margins": 15.125822067260742, + "rewards/rejected": -9.790340423583984, + "step": 7595 + }, + { + "epoch": 0.6940155322064869, + "grad_norm": 0.48046875, + "kl": 0.0, + "learning_rate": 2.1460386252866327e-06, + "logits/chosen": 896602368.0, + "logits/rejected": 505364838.4, + "logps/chosen": -293.33241780598956, + "logps/rejected": -557.452685546875, + "loss": 0.003, + "rewards/chosen": 5.1555226643880205, + "rewards/margins": 14.728269704182942, + "rewards/rejected": -9.572747039794923, + "step": 7596 + }, + { + "epoch": 0.6941068981269987, + "grad_norm": 3.6875, + "kl": 0.0, + "learning_rate": 2.1448581737582425e-06, + "logits/chosen": 271493632.0, + "logits/rejected": 665925120.0, + "logps/chosen": -151.90970458984376, + "logps/rejected": -507.432861328125, + "loss": 0.0166, + "rewards/chosen": 4.197369384765625, + "rewards/margins": 13.656032053629556, + "rewards/rejected": -9.458662668863932, + "step": 7597 + }, + { + "epoch": 0.6941982640475103, + "grad_norm": 2.40625, + "kl": 0.0, + "learning_rate": 2.1436779583236257e-06, + "logits/chosen": 1008305536.0, + "logits/rejected": 471485056.0, + "logps/chosen": -340.4273681640625, + "logps/rejected": -306.921142578125, + "loss": 0.0172, + "rewards/chosen": 3.5446107387542725, + "rewards/margins": 10.372109174728394, + "rewards/rejected": -6.827498435974121, + "step": 7598 + }, + { + "epoch": 0.6942896299680219, + "grad_norm": 2.6875, + "kl": 0.0, + "learning_rate": 2.1424979790803794e-06, + "logits/chosen": 691143424.0, + "logits/rejected": 807068352.0, + "logps/chosen": -294.5171203613281, + "logps/rejected": -433.78466796875, + "loss": 0.0199, + "rewards/chosen": 3.2022061347961426, + "rewards/margins": 11.941392421722412, + "rewards/rejected": -8.73918628692627, + "step": 7599 + }, + { + "epoch": 0.6943809958885335, + "grad_norm": 0.376953125, + "kl": 0.0, + "learning_rate": 2.1413182361260747e-06, + "logits/chosen": 495373141.3333333, + "logits/rejected": 664491212.8, + "logps/chosen": -355.728271484375, + "logps/rejected": -433.0640625, + "loss": 0.0017, + "rewards/chosen": 5.554751714070638, + "rewards/margins": 14.49853146870931, + "rewards/rejected": -8.943779754638673, + "step": 7600 + }, + { + "epoch": 0.6944723618090453, + "grad_norm": 2.921875, + "kl": 0.0, + "learning_rate": 2.140138729558266e-06, + "logits/chosen": 786572544.0, + "logits/rejected": 602090837.3333334, + "logps/chosen": -262.7834716796875, + "logps/rejected": -434.0593668619792, + "loss": 0.0199, + "rewards/chosen": 3.807678985595703, + "rewards/margins": 12.688992309570313, + "rewards/rejected": -8.88131332397461, + "step": 7601 + }, + { + "epoch": 0.6945637277295569, + "grad_norm": 1.8828125, + "kl": 0.0, + "learning_rate": 2.138959459474487e-06, + "logits/chosen": 418503808.0, + "logits/rejected": 564304426.6666666, + "logps/chosen": -348.78692626953125, + "logps/rejected": -455.702880859375, + "loss": 0.0062, + "rewards/chosen": 4.4042158126831055, + "rewards/margins": 13.939478874206543, + "rewards/rejected": -9.535263061523438, + "step": 7602 + }, + { + "epoch": 0.6946550936500685, + "grad_norm": 2.109375, + "kl": 0.0, + "learning_rate": 2.13778042597225e-06, + "logits/chosen": 490343712.0, + "logits/rejected": 319693760.0, + "logps/chosen": -256.33270263671875, + "logps/rejected": -503.3162841796875, + "loss": 0.0198, + "rewards/chosen": 3.2796266078948975, + "rewards/margins": 13.016923666000366, + "rewards/rejected": -9.737297058105469, + "step": 7603 + }, + { + "epoch": 0.6947464595705801, + "grad_norm": 0.341796875, + "kl": 0.0, + "learning_rate": 2.1366016291490552e-06, + "logits/chosen": 494185088.0, + "logits/rejected": 277601846.85714287, + "logps/chosen": -310.37518310546875, + "logps/rejected": -334.794921875, + "loss": 0.0016, + "rewards/chosen": 5.0124053955078125, + "rewards/margins": 13.102759769984655, + "rewards/rejected": -8.090354374476842, + "step": 7604 + }, + { + "epoch": 0.6948378254910919, + "grad_norm": 1.7734375, + "kl": 0.0, + "learning_rate": 2.135423069102376e-06, + "logits/chosen": 332331744.0, + "logits/rejected": 582446549.3333334, + "logps/chosen": -269.69989013671875, + "logps/rejected": -488.00390625, + "loss": 0.009, + "rewards/chosen": 3.7168822288513184, + "rewards/margins": 13.371191819508871, + "rewards/rejected": -9.654309590657553, + "step": 7605 + }, + { + "epoch": 0.6949291914116035, + "grad_norm": 3.953125, + "kl": 0.0, + "learning_rate": 2.1342447459296676e-06, + "logits/chosen": 348627072.0, + "logits/rejected": 280276633.6, + "logps/chosen": -299.12904866536456, + "logps/rejected": -349.0484375, + "loss": 0.0148, + "rewards/chosen": 3.7393360137939453, + "rewards/margins": 12.122607803344726, + "rewards/rejected": -8.383271789550781, + "step": 7606 + }, + { + "epoch": 0.6950205573321151, + "grad_norm": 2.78125, + "kl": 0.0, + "learning_rate": 2.1330666597283656e-06, + "logits/chosen": 581881241.6, + "logits/rejected": 463217194.6666667, + "logps/chosen": -278.20537109375, + "logps/rejected": -523.6315104166666, + "loss": 0.0182, + "rewards/chosen": 4.029705810546875, + "rewards/margins": 12.571400451660157, + "rewards/rejected": -8.541694641113281, + "step": 7607 + }, + { + "epoch": 0.6951119232526267, + "grad_norm": 1.53125, + "kl": 0.0, + "learning_rate": 2.1318888105958895e-06, + "logits/chosen": 611620544.0, + "logits/rejected": 558540544.0, + "logps/chosen": -363.51141357421875, + "logps/rejected": -429.3687744140625, + "loss": 0.0073, + "rewards/chosen": 4.579464912414551, + "rewards/margins": 14.11656665802002, + "rewards/rejected": -9.537101745605469, + "step": 7608 + }, + { + "epoch": 0.6952032891731385, + "grad_norm": 3.6875, + "kl": 0.0, + "learning_rate": 2.130711198629635e-06, + "logits/chosen": 710160725.3333334, + "logits/rejected": 574211328.0, + "logps/chosen": -379.8622233072917, + "logps/rejected": -638.92724609375, + "loss": 0.0285, + "rewards/chosen": 3.3377208709716797, + "rewards/margins": 11.507831192016601, + "rewards/rejected": -8.170110321044922, + "step": 7609 + }, + { + "epoch": 0.6952946550936501, + "grad_norm": 3.828125, + "kl": 0.0, + "learning_rate": 2.1295338239269802e-06, + "logits/chosen": 447292864.0, + "logits/rejected": 421220768.0, + "logps/chosen": -463.1224365234375, + "logps/rejected": -792.0280151367188, + "loss": 0.018, + "rewards/chosen": 3.757462501525879, + "rewards/margins": 15.186832427978516, + "rewards/rejected": -11.429369926452637, + "step": 7610 + }, + { + "epoch": 0.6953860210141617, + "grad_norm": 1.671875, + "kl": 0.0, + "learning_rate": 2.1283566865852824e-06, + "logits/chosen": 967806566.4, + "logits/rejected": 269088384.0, + "logps/chosen": -274.816943359375, + "logps/rejected": -251.20914713541666, + "loss": 0.0114, + "rewards/chosen": 4.222625350952148, + "rewards/margins": 13.344256973266601, + "rewards/rejected": -9.121631622314453, + "step": 7611 + }, + { + "epoch": 0.6954773869346733, + "grad_norm": 5.59375, + "kl": 0.0, + "learning_rate": 2.1271797867018785e-06, + "logits/chosen": 702443392.0, + "logits/rejected": 489217024.0, + "logps/chosen": -402.54022216796875, + "logps/rejected": -382.78338623046875, + "loss": 0.0247, + "rewards/chosen": 3.240532875061035, + "rewards/margins": 12.386919975280762, + "rewards/rejected": -9.146387100219727, + "step": 7612 + }, + { + "epoch": 0.695568752855185, + "grad_norm": 18.375, + "kl": 0.0, + "learning_rate": 2.1260031243740925e-06, + "logits/chosen": 1423753301.3333333, + "logits/rejected": 732542720.0, + "logps/chosen": -364.6604817708333, + "logps/rejected": -470.38955078125, + "loss": 0.0336, + "rewards/chosen": 3.296881993611654, + "rewards/margins": 11.62212003072103, + "rewards/rejected": -8.325238037109376, + "step": 7613 + }, + { + "epoch": 0.6956601187756967, + "grad_norm": 1.0234375, + "kl": 0.0, + "learning_rate": 2.1248266996992184e-06, + "logits/chosen": 485997792.0, + "logits/rejected": 323530304.0, + "logps/chosen": -267.2715759277344, + "logps/rejected": -582.7929077148438, + "loss": 0.0087, + "rewards/chosen": 4.184544563293457, + "rewards/margins": 13.148505210876465, + "rewards/rejected": -8.963960647583008, + "step": 7614 + }, + { + "epoch": 0.6957514846962083, + "grad_norm": 9.375, + "kl": 0.0, + "learning_rate": 2.1236505127745355e-06, + "logits/chosen": 523651968.0, + "logits/rejected": 900317056.0, + "logps/chosen": -196.433349609375, + "logps/rejected": -702.519287109375, + "loss": 0.0494, + "rewards/chosen": 2.9296560287475586, + "rewards/margins": 13.006635665893555, + "rewards/rejected": -10.076979637145996, + "step": 7615 + }, + { + "epoch": 0.6958428506167199, + "grad_norm": 1.515625, + "kl": 0.0, + "learning_rate": 2.1224745636973065e-06, + "logits/chosen": 964084800.0, + "logits/rejected": 347843584.0, + "logps/chosen": -484.3397216796875, + "logps/rejected": -492.697509765625, + "loss": 0.0086, + "rewards/chosen": 4.074352264404297, + "rewards/margins": 15.238418579101562, + "rewards/rejected": -11.164066314697266, + "step": 7616 + }, + { + "epoch": 0.6959342165372316, + "grad_norm": 3.609375, + "kl": 0.0, + "learning_rate": 2.1212988525647702e-06, + "logits/chosen": 1065940787.2, + "logits/rejected": 1604511573.3333333, + "logps/chosen": -328.5958984375, + "logps/rejected": -341.9391276041667, + "loss": 0.0231, + "rewards/chosen": 3.7028350830078125, + "rewards/margins": 11.191708882649738, + "rewards/rejected": -7.488873799641927, + "step": 7617 + }, + { + "epoch": 0.6960255824577433, + "grad_norm": 1.515625, + "kl": 0.0, + "learning_rate": 2.120123379474148e-06, + "logits/chosen": 347567360.0, + "logits/rejected": 451803562.6666667, + "logps/chosen": -195.8988037109375, + "logps/rejected": -436.5982259114583, + "loss": 0.0077, + "rewards/chosen": 3.7081871032714844, + "rewards/margins": 14.13210360209147, + "rewards/rejected": -10.423916498819986, + "step": 7618 + }, + { + "epoch": 0.6961169483782549, + "grad_norm": 0.6953125, + "kl": 0.0, + "learning_rate": 2.118948144522639e-06, + "logits/chosen": 461044736.0, + "logits/rejected": 450968405.3333333, + "logps/chosen": -161.6744384765625, + "logps/rejected": -453.8787841796875, + "loss": 0.004, + "rewards/chosen": 4.470479488372803, + "rewards/margins": 13.180033524831137, + "rewards/rejected": -8.709554036458334, + "step": 7619 + }, + { + "epoch": 0.6962083142987665, + "grad_norm": 4.9375, + "kl": 0.0, + "learning_rate": 2.1177731478074227e-06, + "logits/chosen": 491058304.0, + "logits/rejected": 364278688.0, + "logps/chosen": -404.0394694010417, + "logps/rejected": -540.66650390625, + "loss": 0.026, + "rewards/chosen": 4.212820053100586, + "rewards/margins": 15.082550048828125, + "rewards/rejected": -10.869729995727539, + "step": 7620 + }, + { + "epoch": 0.6962996802192782, + "grad_norm": 2.078125, + "kl": 0.0, + "learning_rate": 2.1165983894256647e-06, + "logits/chosen": 244517568.0, + "logits/rejected": 818765397.3333334, + "logps/chosen": -404.649169921875, + "logps/rejected": -625.820068359375, + "loss": 0.0069, + "rewards/chosen": 4.377105712890625, + "rewards/margins": 14.324188868204752, + "rewards/rejected": -9.947083155314127, + "step": 7621 + }, + { + "epoch": 0.6963910461397899, + "grad_norm": 1.2421875, + "kl": 0.0, + "learning_rate": 2.115423869474505e-06, + "logits/chosen": 397763481.6, + "logits/rejected": 336949802.6666667, + "logps/chosen": -212.8892333984375, + "logps/rejected": -457.13232421875, + "loss": 0.007, + "rewards/chosen": 4.805931091308594, + "rewards/margins": 13.558136622111002, + "rewards/rejected": -8.752205530802408, + "step": 7622 + }, + { + "epoch": 0.6964824120603015, + "grad_norm": 1.59375, + "kl": 0.0, + "learning_rate": 2.114249588051065e-06, + "logits/chosen": 367223296.0, + "logits/rejected": 455146720.0, + "logps/chosen": -281.99761962890625, + "logps/rejected": -574.3828125, + "loss": 0.01, + "rewards/chosen": 4.111813545227051, + "rewards/margins": 14.938504219055176, + "rewards/rejected": -10.826690673828125, + "step": 7623 + }, + { + "epoch": 0.6965737779808131, + "grad_norm": 2.53125, + "kl": 0.0, + "learning_rate": 2.113075545252445e-06, + "logits/chosen": 576913715.2, + "logits/rejected": 1215108181.3333333, + "logps/chosen": -312.254150390625, + "logps/rejected": -490.4065348307292, + "loss": 0.0176, + "rewards/chosen": 3.9940223693847656, + "rewards/margins": 13.716654459635416, + "rewards/rejected": -9.72263209025065, + "step": 7624 + }, + { + "epoch": 0.6966651439013248, + "grad_norm": 1.9609375, + "kl": 0.0, + "learning_rate": 2.1119017411757313e-06, + "logits/chosen": 755494229.3333334, + "logits/rejected": 622083840.0, + "logps/chosen": -418.0653483072917, + "logps/rejected": -371.6337646484375, + "loss": 0.011, + "rewards/chosen": 3.837841033935547, + "rewards/margins": 11.368858337402344, + "rewards/rejected": -7.531017303466797, + "step": 7625 + }, + { + "epoch": 0.6967565098218365, + "grad_norm": 2.453125, + "kl": 0.0, + "learning_rate": 2.110728175917985e-06, + "logits/chosen": 576702976.0, + "logits/rejected": 427441433.6, + "logps/chosen": -433.8168131510417, + "logps/rejected": -370.2189697265625, + "loss": 0.0115, + "rewards/chosen": 4.127968470255534, + "rewards/margins": 12.039342371622721, + "rewards/rejected": -7.911373901367187, + "step": 7626 + }, + { + "epoch": 0.6968478757423481, + "grad_norm": 3.546875, + "kl": 0.0, + "learning_rate": 2.1095548495762485e-06, + "logits/chosen": 196513792.0, + "logits/rejected": 579914313.1428572, + "logps/chosen": -78.8651123046875, + "logps/rejected": -362.4823521205357, + "loss": 0.0213, + "rewards/chosen": 1.5987900495529175, + "rewards/margins": 10.707395025662013, + "rewards/rejected": -9.108604976109095, + "step": 7627 + }, + { + "epoch": 0.6969392416628597, + "grad_norm": 1.96875, + "kl": 0.0, + "learning_rate": 2.1083817622475434e-06, + "logits/chosen": 663805013.3333334, + "logits/rejected": 685729228.8, + "logps/chosen": -351.5112711588542, + "logps/rejected": -461.89443359375, + "loss": 0.0109, + "rewards/chosen": 3.648462931315104, + "rewards/margins": 11.914708964029947, + "rewards/rejected": -8.266246032714843, + "step": 7628 + }, + { + "epoch": 0.6970306075833714, + "grad_norm": 3.6875, + "kl": 0.0, + "learning_rate": 2.107208914028877e-06, + "logits/chosen": 703792435.2, + "logits/rejected": 382781696.0, + "logps/chosen": -297.1371337890625, + "logps/rejected": -482.9796142578125, + "loss": 0.0348, + "rewards/chosen": 3.2729183197021485, + "rewards/margins": 14.119208908081054, + "rewards/rejected": -10.846290588378906, + "step": 7629 + }, + { + "epoch": 0.697121973503883, + "grad_norm": 2.28125, + "kl": 0.0, + "learning_rate": 2.1060363050172317e-06, + "logits/chosen": 866792533.3333334, + "logits/rejected": 772115968.0, + "logps/chosen": -316.5259195963542, + "logps/rejected": -610.8005981445312, + "loss": 0.0209, + "rewards/chosen": 3.8372157414754233, + "rewards/margins": 13.4128049214681, + "rewards/rejected": -9.575589179992676, + "step": 7630 + }, + { + "epoch": 0.6972133394243947, + "grad_norm": 2.015625, + "kl": 0.0, + "learning_rate": 2.1048639353095703e-06, + "logits/chosen": 469363552.0, + "logits/rejected": 385018944.0, + "logps/chosen": -290.2403564453125, + "logps/rejected": -576.0907592773438, + "loss": 0.0149, + "rewards/chosen": 3.6245908737182617, + "rewards/margins": 14.260814666748047, + "rewards/rejected": -10.636223793029785, + "step": 7631 + }, + { + "epoch": 0.6973047053449063, + "grad_norm": 0.43359375, + "kl": 0.0, + "learning_rate": 2.1036918050028377e-06, + "logits/chosen": 316566720.0, + "logits/rejected": 465311195.4285714, + "logps/chosen": -346.87640380859375, + "logps/rejected": -535.5653599330357, + "loss": 0.0018, + "rewards/chosen": 4.634799480438232, + "rewards/margins": 14.987430095672607, + "rewards/rejected": -10.352630615234375, + "step": 7632 + }, + { + "epoch": 0.697396071265418, + "grad_norm": 3.09375, + "kl": 0.0, + "learning_rate": 2.1025199141939564e-06, + "logits/chosen": 569745024.0, + "logits/rejected": 643975296.0, + "logps/chosen": -381.5484619140625, + "logps/rejected": -479.3138834635417, + "loss": 0.0098, + "rewards/chosen": 4.37758207321167, + "rewards/margins": 13.108551820119223, + "rewards/rejected": -8.730969746907553, + "step": 7633 + }, + { + "epoch": 0.6974874371859296, + "grad_norm": 1.71875, + "kl": 0.0, + "learning_rate": 2.1013482629798334e-06, + "logits/chosen": 859429504.0, + "logits/rejected": 585986596.5714285, + "logps/chosen": -668.0526123046875, + "logps/rejected": -417.33482142857144, + "loss": 0.0043, + "rewards/chosen": 4.863152980804443, + "rewards/margins": 13.332013607025146, + "rewards/rejected": -8.468860626220703, + "step": 7634 + }, + { + "epoch": 0.6975788031064413, + "grad_norm": 2.40625, + "kl": 0.0, + "learning_rate": 2.1001768514573527e-06, + "logits/chosen": 293124544.0, + "logits/rejected": 370913194.6666667, + "logps/chosen": -432.9041748046875, + "logps/rejected": -316.55780029296875, + "loss": 0.009, + "rewards/chosen": 4.245108127593994, + "rewards/margins": 10.889573256174724, + "rewards/rejected": -6.6444651285807295, + "step": 7635 + }, + { + "epoch": 0.6976701690269529, + "grad_norm": 2.875, + "kl": 0.0, + "learning_rate": 2.0990056797233794e-06, + "logits/chosen": 719498922.6666666, + "logits/rejected": 582870144.0, + "logps/chosen": -170.11686197916666, + "logps/rejected": -507.69793701171875, + "loss": 0.0341, + "rewards/chosen": 3.4181690216064453, + "rewards/margins": 12.71059799194336, + "rewards/rejected": -9.292428970336914, + "step": 7636 + }, + { + "epoch": 0.6977615349474646, + "grad_norm": 2.59375, + "kl": 0.0, + "learning_rate": 2.097834747874756e-06, + "logits/chosen": 653534122.6666666, + "logits/rejected": 743922944.0, + "logps/chosen": -350.4819742838542, + "logps/rejected": -422.7221984863281, + "loss": 0.02, + "rewards/chosen": 4.168094952901204, + "rewards/margins": 11.943436940511067, + "rewards/rejected": -7.775341987609863, + "step": 7637 + }, + { + "epoch": 0.6978529008679762, + "grad_norm": 1.578125, + "kl": 0.0, + "learning_rate": 2.0966640560083113e-06, + "logits/chosen": 596551065.6, + "logits/rejected": 380156117.3333333, + "logps/chosen": -370.05615234375, + "logps/rejected": -338.20591227213544, + "loss": 0.0103, + "rewards/chosen": 4.381970977783203, + "rewards/margins": 11.92472801208496, + "rewards/rejected": -7.542757034301758, + "step": 7638 + }, + { + "epoch": 0.6979442667884879, + "grad_norm": 4.59375, + "kl": 0.0, + "learning_rate": 2.0954936042208496e-06, + "logits/chosen": 420772565.3333333, + "logits/rejected": 287370624.0, + "logps/chosen": -316.1191813151042, + "logps/rejected": -410.8213806152344, + "loss": 0.0291, + "rewards/chosen": 3.790797551472982, + "rewards/margins": 14.742255528767904, + "rewards/rejected": -10.951457977294922, + "step": 7639 + }, + { + "epoch": 0.6980356327089996, + "grad_norm": 8.1875, + "kl": 0.0, + "learning_rate": 2.094323392609158e-06, + "logits/chosen": 801651882.6666666, + "logits/rejected": 731669120.0, + "logps/chosen": -352.4878743489583, + "logps/rejected": -715.2235107421875, + "loss": 0.0408, + "rewards/chosen": 3.3663708368937173, + "rewards/margins": 15.871823946634928, + "rewards/rejected": -12.505453109741211, + "step": 7640 + }, + { + "epoch": 0.6981269986295112, + "grad_norm": 3.1875, + "kl": 0.0, + "learning_rate": 2.0931534212699956e-06, + "logits/chosen": 841023283.2, + "logits/rejected": 474188714.6666667, + "logps/chosen": -291.5919189453125, + "logps/rejected": -382.49560546875, + "loss": 0.0233, + "rewards/chosen": 3.721335601806641, + "rewards/margins": 13.522720209757487, + "rewards/rejected": -9.801384607950846, + "step": 7641 + }, + { + "epoch": 0.6982183645500228, + "grad_norm": 4.5, + "kl": 0.0, + "learning_rate": 2.091983690300115e-06, + "logits/chosen": 611854336.0, + "logits/rejected": 556488405.3333334, + "logps/chosen": -385.642822265625, + "logps/rejected": -677.5196940104166, + "loss": 0.0251, + "rewards/chosen": 3.6640335083007813, + "rewards/margins": 14.040091451009115, + "rewards/rejected": -10.376057942708334, + "step": 7642 + }, + { + "epoch": 0.6983097304705345, + "grad_norm": 1.8046875, + "kl": 0.0, + "learning_rate": 2.09081419979624e-06, + "logits/chosen": 777531050.6666666, + "logits/rejected": 1136301977.6, + "logps/chosen": -211.07548014322916, + "logps/rejected": -485.99853515625, + "loss": 0.0101, + "rewards/chosen": 3.837287267049154, + "rewards/margins": 12.273346074422202, + "rewards/rejected": -8.436058807373048, + "step": 7643 + }, + { + "epoch": 0.6984010963910462, + "grad_norm": 3.78125, + "kl": 0.0, + "learning_rate": 2.0896449498550766e-06, + "logits/chosen": 637727445.3333334, + "logits/rejected": 704090316.8, + "logps/chosen": -456.6116536458333, + "logps/rejected": -515.65068359375, + "loss": 0.0169, + "rewards/chosen": 3.5408032735188804, + "rewards/margins": 13.829270680745443, + "rewards/rejected": -10.288467407226562, + "step": 7644 + }, + { + "epoch": 0.6984924623115578, + "grad_norm": 0.84375, + "kl": 0.0, + "learning_rate": 2.088475940573309e-06, + "logits/chosen": 1572787712.0, + "logits/rejected": 540090581.3333334, + "logps/chosen": -237.65899658203125, + "logps/rejected": -466.997314453125, + "loss": 0.0044, + "rewards/chosen": 4.37504768371582, + "rewards/margins": 12.182253519694012, + "rewards/rejected": -7.80720583597819, + "step": 7645 + }, + { + "epoch": 0.6985838282320694, + "grad_norm": 1.0, + "kl": 0.0, + "learning_rate": 2.0873071720476067e-06, + "logits/chosen": 225056588.8, + "logits/rejected": 378585770.6666667, + "logps/chosen": -198.86817626953126, + "logps/rejected": -490.3174641927083, + "loss": 0.0054, + "rewards/chosen": 5.068443679809571, + "rewards/margins": 14.806947453816733, + "rewards/rejected": -9.738503774007162, + "step": 7646 + }, + { + "epoch": 0.6986751941525811, + "grad_norm": 1.3125, + "kl": 0.0, + "learning_rate": 2.0861386443746145e-06, + "logits/chosen": 920180394.6666666, + "logits/rejected": 671057664.0, + "logps/chosen": -392.233154296875, + "logps/rejected": -485.341650390625, + "loss": 0.0076, + "rewards/chosen": 4.010362307230632, + "rewards/margins": 13.661938540140788, + "rewards/rejected": -9.651576232910156, + "step": 7647 + }, + { + "epoch": 0.6987665600730928, + "grad_norm": 0.90234375, + "kl": 0.0, + "learning_rate": 2.0849703576509595e-06, + "logits/chosen": 310848128.0, + "logits/rejected": 636483072.0, + "logps/chosen": -165.84555053710938, + "logps/rejected": -349.96490478515625, + "loss": 0.0234, + "rewards/chosen": 4.00986909866333, + "rewards/margins": 12.106125354766846, + "rewards/rejected": -8.096256256103516, + "step": 7648 + }, + { + "epoch": 0.6988579259936044, + "grad_norm": 2.890625, + "kl": 0.0, + "learning_rate": 2.0838023119732454e-06, + "logits/chosen": 370457280.0, + "logits/rejected": 304802912.0, + "logps/chosen": -301.3116455078125, + "logps/rejected": -389.6817626953125, + "loss": 0.0165, + "rewards/chosen": 4.290308952331543, + "rewards/margins": 12.07177448272705, + "rewards/rejected": -7.781465530395508, + "step": 7649 + }, + { + "epoch": 0.698949291914116, + "grad_norm": 0.87109375, + "kl": 0.0, + "learning_rate": 2.0826345074380627e-06, + "logits/chosen": 1381002496.0, + "logits/rejected": 1097361554.2857144, + "logps/chosen": -576.203857421875, + "logps/rejected": -639.9441266741071, + "loss": 0.0027, + "rewards/chosen": 3.982421875, + "rewards/margins": 13.316944667271205, + "rewards/rejected": -9.334522792271205, + "step": 7650 + }, + { + "epoch": 0.6990406578346277, + "grad_norm": 2.46875, + "kl": 0.0, + "learning_rate": 2.081466944141976e-06, + "logits/chosen": 434796864.0, + "logits/rejected": 606420416.0, + "logps/chosen": -140.55430603027344, + "logps/rejected": -824.1477661132812, + "loss": 0.0244, + "rewards/chosen": 3.2779808044433594, + "rewards/margins": 13.987970352172852, + "rewards/rejected": -10.709989547729492, + "step": 7651 + }, + { + "epoch": 0.6991320237551394, + "grad_norm": 2.703125, + "kl": 0.0, + "learning_rate": 2.0802996221815326e-06, + "logits/chosen": 577202176.0, + "logits/rejected": 515413888.0, + "logps/chosen": -310.104736328125, + "logps/rejected": -397.35174560546875, + "loss": 0.0162, + "rewards/chosen": 3.9354796409606934, + "rewards/margins": 12.746965885162354, + "rewards/rejected": -8.81148624420166, + "step": 7652 + }, + { + "epoch": 0.699223389675651, + "grad_norm": 39.0, + "kl": 0.0, + "learning_rate": 2.079132541653259e-06, + "logits/chosen": 689925824.0, + "logits/rejected": 519043114.6666667, + "logps/chosen": -285.43994140625, + "logps/rejected": -423.3336181640625, + "loss": 0.0625, + "rewards/chosen": 3.2712388038635254, + "rewards/margins": 10.227467060089111, + "rewards/rejected": -6.956228256225586, + "step": 7653 + }, + { + "epoch": 0.6993147555961626, + "grad_norm": 1.8671875, + "kl": 0.0, + "learning_rate": 2.0779657026536593e-06, + "logits/chosen": 317099306.6666667, + "logits/rejected": 344753817.6, + "logps/chosen": -386.6528727213542, + "logps/rejected": -422.568505859375, + "loss": 0.009, + "rewards/chosen": 3.910165468851725, + "rewards/margins": 12.77282797495524, + "rewards/rejected": -8.862662506103515, + "step": 7654 + }, + { + "epoch": 0.6994061215166743, + "grad_norm": 2.125, + "kl": 0.0, + "learning_rate": 2.076799105279225e-06, + "logits/chosen": 662103859.2, + "logits/rejected": 463463125.3333333, + "logps/chosen": -495.81796875, + "logps/rejected": -263.87416585286456, + "loss": 0.0152, + "rewards/chosen": 3.802576446533203, + "rewards/margins": 12.440439605712891, + "rewards/rejected": -8.637863159179688, + "step": 7655 + }, + { + "epoch": 0.699497487437186, + "grad_norm": 0.7578125, + "kl": 0.0, + "learning_rate": 2.0756327496264204e-06, + "logits/chosen": 506506581.3333333, + "logits/rejected": 242140108.8, + "logps/chosen": -346.19873046875, + "logps/rejected": -487.78125, + "loss": 0.0039, + "rewards/chosen": 4.7049986521403, + "rewards/margins": 15.833610407511394, + "rewards/rejected": -11.128611755371093, + "step": 7656 + }, + { + "epoch": 0.6995888533576976, + "grad_norm": 2.953125, + "kl": 0.0, + "learning_rate": 2.0744666357916925e-06, + "logits/chosen": 920973107.2, + "logits/rejected": 648267946.6666666, + "logps/chosen": -239.1633544921875, + "logps/rejected": -542.2403564453125, + "loss": 0.0242, + "rewards/chosen": 3.7186622619628906, + "rewards/margins": 12.080772399902344, + "rewards/rejected": -8.362110137939453, + "step": 7657 + }, + { + "epoch": 0.6996802192782092, + "grad_norm": 3.515625, + "kl": 0.0, + "learning_rate": 2.073300763871467e-06, + "logits/chosen": 296406528.0, + "logits/rejected": 489106278.4, + "logps/chosen": -194.92500813802084, + "logps/rejected": -390.87626953125, + "loss": 0.0094, + "rewards/chosen": 4.382430712381999, + "rewards/margins": 12.102396074930827, + "rewards/rejected": -7.7199653625488285, + "step": 7658 + }, + { + "epoch": 0.6997715851987208, + "grad_norm": 2.515625, + "kl": 0.0, + "learning_rate": 2.072135133962153e-06, + "logits/chosen": 496632832.0, + "logits/rejected": 386804800.0, + "logps/chosen": -388.3082682291667, + "logps/rejected": -531.2611694335938, + "loss": 0.0188, + "rewards/chosen": 3.7741416295369468, + "rewards/margins": 13.115312894185385, + "rewards/rejected": -9.341171264648438, + "step": 7659 + }, + { + "epoch": 0.6998629511192326, + "grad_norm": 2.59375, + "kl": 0.0, + "learning_rate": 2.0709697461601366e-06, + "logits/chosen": 890037964.8, + "logits/rejected": 505547008.0, + "logps/chosen": -384.62509765625, + "logps/rejected": -494.2698160807292, + "loss": 0.0154, + "rewards/chosen": 4.082450866699219, + "rewards/margins": 12.711741129557291, + "rewards/rejected": -8.629290262858072, + "step": 7660 + }, + { + "epoch": 0.6999543170397442, + "grad_norm": 2.703125, + "kl": 0.0, + "learning_rate": 2.069804600561784e-06, + "logits/chosen": 797662528.0, + "logits/rejected": 492736608.0, + "logps/chosen": -544.8746337890625, + "logps/rejected": -528.1494140625, + "loss": 0.0158, + "rewards/chosen": 3.5694267749786377, + "rewards/margins": 12.076348543167114, + "rewards/rejected": -8.506921768188477, + "step": 7661 + }, + { + "epoch": 0.7000456829602558, + "grad_norm": 0.78125, + "kl": 0.0, + "learning_rate": 2.0686396972634415e-06, + "logits/chosen": 546618026.6666666, + "logits/rejected": 474537011.2, + "logps/chosen": -271.75384521484375, + "logps/rejected": -516.8462890625, + "loss": 0.0044, + "rewards/chosen": 4.717699686686198, + "rewards/margins": 14.78043467203776, + "rewards/rejected": -10.062734985351563, + "step": 7662 + }, + { + "epoch": 0.7001370488807674, + "grad_norm": 2.71875, + "kl": 0.0, + "learning_rate": 2.067475036361435e-06, + "logits/chosen": 585013674.6666666, + "logits/rejected": 213351392.0, + "logps/chosen": -320.46645100911456, + "logps/rejected": -372.9669189453125, + "loss": 0.017, + "rewards/chosen": 3.9972642262776694, + "rewards/margins": 12.788448651631674, + "rewards/rejected": -8.791184425354004, + "step": 7663 + }, + { + "epoch": 0.7002284148012792, + "grad_norm": 8.4375, + "kl": 0.0, + "learning_rate": 2.0663106179520742e-06, + "logits/chosen": 499720192.0, + "logits/rejected": 557936810.6666666, + "logps/chosen": -420.71953125, + "logps/rejected": -669.060302734375, + "loss": 0.024, + "rewards/chosen": 3.6811843872070313, + "rewards/margins": 13.623097991943359, + "rewards/rejected": -9.941913604736328, + "step": 7664 + }, + { + "epoch": 0.7003197807217908, + "grad_norm": 2.4375, + "kl": 0.0, + "learning_rate": 2.0651464421316446e-06, + "logits/chosen": 820517120.0, + "logits/rejected": 679724928.0, + "logps/chosen": -314.1781921386719, + "logps/rejected": -506.88983154296875, + "loss": 0.0158, + "rewards/chosen": 3.640915870666504, + "rewards/margins": 13.673125267028809, + "rewards/rejected": -10.032209396362305, + "step": 7665 + }, + { + "epoch": 0.7004111466423024, + "grad_norm": 3.5, + "kl": 0.0, + "learning_rate": 2.0639825089964116e-06, + "logits/chosen": 610853888.0, + "logits/rejected": 288338240.0, + "logps/chosen": -510.6098937988281, + "logps/rejected": -440.73114013671875, + "loss": 0.0166, + "rewards/chosen": 3.5315017700195312, + "rewards/margins": 15.32595443725586, + "rewards/rejected": -11.794452667236328, + "step": 7666 + }, + { + "epoch": 0.700502512562814, + "grad_norm": 53.0, + "kl": 0.0, + "learning_rate": 2.062818818642623e-06, + "logits/chosen": 559128256.0, + "logits/rejected": 407898208.0, + "logps/chosen": -146.4944610595703, + "logps/rejected": -464.76348876953125, + "loss": 0.075, + "rewards/chosen": 2.6461191177368164, + "rewards/margins": 11.187376022338867, + "rewards/rejected": -8.54125690460205, + "step": 7667 + }, + { + "epoch": 0.7005938784833258, + "grad_norm": 3.375, + "kl": 0.0, + "learning_rate": 2.061655371166504e-06, + "logits/chosen": 471629696.0, + "logits/rejected": 831236544.0, + "logps/chosen": -276.8240051269531, + "logps/rejected": -457.827392578125, + "loss": 0.0249, + "rewards/chosen": 3.2440710067749023, + "rewards/margins": 11.370136260986328, + "rewards/rejected": -8.126065254211426, + "step": 7668 + }, + { + "epoch": 0.7006852444038374, + "grad_norm": 1.609375, + "kl": 0.0, + "learning_rate": 2.060492166664261e-06, + "logits/chosen": 814893226.6666666, + "logits/rejected": 617196697.6, + "logps/chosen": -311.3292643229167, + "logps/rejected": -390.3763671875, + "loss": 0.0072, + "rewards/chosen": 4.702277183532715, + "rewards/margins": 14.356093406677246, + "rewards/rejected": -9.653816223144531, + "step": 7669 + }, + { + "epoch": 0.700776610324349, + "grad_norm": 41.5, + "kl": 0.0, + "learning_rate": 2.0593292052320797e-06, + "logits/chosen": 723816832.0, + "logits/rejected": 793661632.0, + "logps/chosen": -503.92205810546875, + "logps/rejected": -328.05938720703125, + "loss": 0.1072, + "rewards/chosen": 2.30783748626709, + "rewards/margins": 10.082121849060059, + "rewards/rejected": -7.774284362792969, + "step": 7670 + }, + { + "epoch": 0.7008679762448606, + "grad_norm": 3.46875, + "kl": 0.0, + "learning_rate": 2.058166486966128e-06, + "logits/chosen": 355392213.3333333, + "logits/rejected": 214788480.0, + "logps/chosen": -221.03857421875, + "logps/rejected": -403.1348571777344, + "loss": 0.0254, + "rewards/chosen": 3.8964878718058267, + "rewards/margins": 17.25350062052409, + "rewards/rejected": -13.357012748718262, + "step": 7671 + }, + { + "epoch": 0.7009593421653724, + "grad_norm": 3.0, + "kl": 0.0, + "learning_rate": 2.0570040119625524e-06, + "logits/chosen": 301236352.0, + "logits/rejected": 345155072.0, + "logps/chosen": -357.446923828125, + "logps/rejected": -459.7041422526042, + "loss": 0.0203, + "rewards/chosen": 3.6689380645751952, + "rewards/margins": 11.97023048400879, + "rewards/rejected": -8.301292419433594, + "step": 7672 + }, + { + "epoch": 0.701050708085884, + "grad_norm": 3.109375, + "kl": 0.0, + "learning_rate": 2.0558417803174766e-06, + "logits/chosen": 913084416.0, + "logits/rejected": 561805397.3333334, + "logps/chosen": -234.7234375, + "logps/rejected": -285.79880777994794, + "loss": 0.0229, + "rewards/chosen": 3.606796646118164, + "rewards/margins": 10.198134485880534, + "rewards/rejected": -6.59133783976237, + "step": 7673 + }, + { + "epoch": 0.7011420740063956, + "grad_norm": 2.59375, + "kl": 0.0, + "learning_rate": 2.054679792127008e-06, + "logits/chosen": 756034048.0, + "logits/rejected": 495227596.8, + "logps/chosen": -396.8082682291667, + "logps/rejected": -353.36259765625, + "loss": 0.0153, + "rewards/chosen": 3.268803278605143, + "rewards/margins": 12.986826960245768, + "rewards/rejected": -9.718023681640625, + "step": 7674 + }, + { + "epoch": 0.7012334399269072, + "grad_norm": 2.625, + "kl": 0.0, + "learning_rate": 2.0535180474872297e-06, + "logits/chosen": 468211456.0, + "logits/rejected": 534089600.0, + "logps/chosen": -402.2673746744792, + "logps/rejected": -346.951416015625, + "loss": 0.019, + "rewards/chosen": 3.874429702758789, + "rewards/margins": 14.884123802185059, + "rewards/rejected": -11.00969409942627, + "step": 7675 + }, + { + "epoch": 0.701324805847419, + "grad_norm": 1.203125, + "kl": 0.0, + "learning_rate": 2.0523565464942104e-06, + "logits/chosen": 470763072.0, + "logits/rejected": 452131520.0, + "logps/chosen": -189.2669219970703, + "logps/rejected": -385.11285400390625, + "loss": 0.0081, + "rewards/chosen": 4.609242916107178, + "rewards/margins": 12.194342613220215, + "rewards/rejected": -7.585099697113037, + "step": 7676 + }, + { + "epoch": 0.7014161717679306, + "grad_norm": 1.640625, + "kl": 0.0, + "learning_rate": 2.0511952892439947e-06, + "logits/chosen": 570129493.3333334, + "logits/rejected": 786005452.8, + "logps/chosen": -202.68741861979166, + "logps/rejected": -644.58076171875, + "loss": 0.0088, + "rewards/chosen": 4.1636168162028, + "rewards/margins": 15.908976618448893, + "rewards/rejected": -11.745359802246094, + "step": 7677 + }, + { + "epoch": 0.7015075376884422, + "grad_norm": 2.59375, + "kl": 0.0, + "learning_rate": 2.050034275832607e-06, + "logits/chosen": 591398400.0, + "logits/rejected": 396428032.0, + "logps/chosen": -331.7741455078125, + "logps/rejected": -361.7148844401042, + "loss": 0.0167, + "rewards/chosen": 4.147383499145508, + "rewards/margins": 13.111203638712563, + "rewards/rejected": -8.963820139567057, + "step": 7678 + }, + { + "epoch": 0.7015989036089538, + "grad_norm": 3.203125, + "kl": 0.0, + "learning_rate": 2.0488735063560505e-06, + "logits/chosen": 1184538316.8, + "logits/rejected": 607414869.3333334, + "logps/chosen": -452.59716796875, + "logps/rejected": -447.6944986979167, + "loss": 0.0151, + "rewards/chosen": 4.04534683227539, + "rewards/margins": 12.628287251790365, + "rewards/rejected": -8.582940419514975, + "step": 7679 + }, + { + "epoch": 0.7016902695294656, + "grad_norm": 2.390625, + "kl": 0.0, + "learning_rate": 2.0477129809103147e-06, + "logits/chosen": 326942954.6666667, + "logits/rejected": 723015552.0, + "logps/chosen": -297.86572265625, + "logps/rejected": -807.3840942382812, + "loss": 0.0164, + "rewards/chosen": 4.314400990804036, + "rewards/margins": 21.00284703572591, + "rewards/rejected": -16.688446044921875, + "step": 7680 + }, + { + "epoch": 0.7017816354499772, + "grad_norm": 1.265625, + "kl": 0.0, + "learning_rate": 2.0465526995913616e-06, + "logits/chosen": 220968160.0, + "logits/rejected": 449170432.0, + "logps/chosen": -152.23056030273438, + "logps/rejected": -534.71240234375, + "loss": 0.0188, + "rewards/chosen": 3.381483793258667, + "rewards/margins": 11.879551649093628, + "rewards/rejected": -8.498067855834961, + "step": 7681 + }, + { + "epoch": 0.7018730013704888, + "grad_norm": 2.234375, + "kl": 0.0, + "learning_rate": 2.045392662495136e-06, + "logits/chosen": 296643968.0, + "logits/rejected": 349036256.0, + "logps/chosen": -322.40289306640625, + "logps/rejected": -520.9971923828125, + "loss": 0.0085, + "rewards/chosen": 4.613219261169434, + "rewards/margins": 15.143786430358887, + "rewards/rejected": -10.530567169189453, + "step": 7682 + }, + { + "epoch": 0.7019643672910004, + "grad_norm": 1.75, + "kl": 0.0, + "learning_rate": 2.044232869717562e-06, + "logits/chosen": 331903648.0, + "logits/rejected": 462705728.0, + "logps/chosen": -229.97994995117188, + "logps/rejected": -415.4857482910156, + "loss": 0.0269, + "rewards/chosen": 3.7455925941467285, + "rewards/margins": 12.558043003082275, + "rewards/rejected": -8.812450408935547, + "step": 7683 + }, + { + "epoch": 0.7020557332115122, + "grad_norm": 1.9140625, + "kl": 0.0, + "learning_rate": 2.0430733213545422e-06, + "logits/chosen": 471383082.6666667, + "logits/rejected": 508733235.2, + "logps/chosen": -359.4169108072917, + "logps/rejected": -555.84833984375, + "loss": 0.0106, + "rewards/chosen": 3.5886659622192383, + "rewards/margins": 12.537500190734864, + "rewards/rejected": -8.948834228515626, + "step": 7684 + }, + { + "epoch": 0.7021470991320238, + "grad_norm": 1.703125, + "kl": 0.0, + "learning_rate": 2.0419140175019642e-06, + "logits/chosen": 1289110357.3333333, + "logits/rejected": 672370995.2, + "logps/chosen": -236.93221028645834, + "logps/rejected": -599.01943359375, + "loss": 0.0187, + "rewards/chosen": 3.488572438557943, + "rewards/margins": 12.93423589070638, + "rewards/rejected": -9.445663452148438, + "step": 7685 + }, + { + "epoch": 0.7022384650525354, + "grad_norm": 8.75, + "kl": 0.0, + "learning_rate": 2.040754958255689e-06, + "logits/chosen": 576760256.0, + "logits/rejected": 307663584.0, + "logps/chosen": -224.88467407226562, + "logps/rejected": -397.70928955078125, + "loss": 0.031, + "rewards/chosen": 2.8266172409057617, + "rewards/margins": 13.852904319763184, + "rewards/rejected": -11.026287078857422, + "step": 7686 + }, + { + "epoch": 0.702329830973047, + "grad_norm": 2.984375, + "kl": 0.0, + "learning_rate": 2.0395961437115615e-06, + "logits/chosen": 794592460.8, + "logits/rejected": 453805312.0, + "logps/chosen": -469.03349609375, + "logps/rejected": -383.1738688151042, + "loss": 0.022, + "rewards/chosen": 3.4248600006103516, + "rewards/margins": 10.956349054972332, + "rewards/rejected": -7.5314890543619795, + "step": 7687 + }, + { + "epoch": 0.7024211968935588, + "grad_norm": 1.7890625, + "kl": 0.0, + "learning_rate": 2.0384375739654016e-06, + "logits/chosen": 306020224.0, + "logits/rejected": 443123776.0, + "logps/chosen": -187.331298828125, + "logps/rejected": -427.57672119140625, + "loss": 0.0128, + "rewards/chosen": 3.947744607925415, + "rewards/margins": 12.543681383132935, + "rewards/rejected": -8.59593677520752, + "step": 7688 + }, + { + "epoch": 0.7025125628140704, + "grad_norm": 4.125, + "kl": 0.0, + "learning_rate": 2.037279249113017e-06, + "logits/chosen": 315835488.0, + "logits/rejected": 623978752.0, + "logps/chosen": -239.26348876953125, + "logps/rejected": -548.76220703125, + "loss": 0.0231, + "rewards/chosen": 3.7932796478271484, + "rewards/margins": 12.22365951538086, + "rewards/rejected": -8.430379867553711, + "step": 7689 + }, + { + "epoch": 0.702603928734582, + "grad_norm": 3.109375, + "kl": 0.0, + "learning_rate": 2.036121169250189e-06, + "logits/chosen": 745124454.4, + "logits/rejected": 1061787392.0, + "logps/chosen": -409.51611328125, + "logps/rejected": -992.7295735677084, + "loss": 0.0232, + "rewards/chosen": 3.320006561279297, + "rewards/margins": 16.777144622802734, + "rewards/rejected": -13.457138061523438, + "step": 7690 + }, + { + "epoch": 0.7026952946550936, + "grad_norm": 11.6875, + "kl": 0.0, + "learning_rate": 2.0349633344726785e-06, + "logits/chosen": 497354368.0, + "logits/rejected": 482265120.0, + "logps/chosen": -329.15472412109375, + "logps/rejected": -561.6478881835938, + "loss": 0.0366, + "rewards/chosen": 3.014010429382324, + "rewards/margins": 11.455133438110352, + "rewards/rejected": -8.441123008728027, + "step": 7691 + }, + { + "epoch": 0.7027866605756053, + "grad_norm": 72.5, + "kl": 0.0, + "learning_rate": 2.033805744876227e-06, + "logits/chosen": 683245440.0, + "logits/rejected": 854537130.6666666, + "logps/chosen": -340.8006591796875, + "logps/rejected": -419.3988444010417, + "loss": 0.0252, + "rewards/chosen": 2.962118625640869, + "rewards/margins": 11.765450636545816, + "rewards/rejected": -8.803332010904947, + "step": 7692 + }, + { + "epoch": 0.702878026496117, + "grad_norm": 3.1875, + "kl": 0.0, + "learning_rate": 2.0326484005565617e-06, + "logits/chosen": 584264396.8, + "logits/rejected": 352041749.3333333, + "logps/chosen": -451.746435546875, + "logps/rejected": -367.5945638020833, + "loss": 0.0209, + "rewards/chosen": 3.5384841918945313, + "rewards/margins": 13.222559865315755, + "rewards/rejected": -9.684075673421225, + "step": 7693 + }, + { + "epoch": 0.7029693924166286, + "grad_norm": 2.671875, + "kl": 0.0, + "learning_rate": 2.0314913016093784e-06, + "logits/chosen": 530118765.71428573, + "logits/rejected": 1007711808.0, + "logps/chosen": -201.74879673549108, + "logps/rejected": -399.26708984375, + "loss": 0.0207, + "rewards/chosen": 4.192429678780692, + "rewards/margins": 13.686478751046316, + "rewards/rejected": -9.494049072265625, + "step": 7694 + }, + { + "epoch": 0.7030607583371402, + "grad_norm": 1.4375, + "kl": 0.0, + "learning_rate": 2.0303344481303612e-06, + "logits/chosen": 915013056.0, + "logits/rejected": 575891712.0, + "logps/chosen": -439.58135986328125, + "logps/rejected": -504.6927185058594, + "loss": 0.0087, + "rewards/chosen": 4.315402030944824, + "rewards/margins": 12.090620994567871, + "rewards/rejected": -7.775218963623047, + "step": 7695 + }, + { + "epoch": 0.703152124257652, + "grad_norm": 4.6875, + "kl": 0.0, + "learning_rate": 2.0291778402151685e-06, + "logits/chosen": 823270809.6, + "logits/rejected": 1108500992.0, + "logps/chosen": -533.842626953125, + "logps/rejected": -522.1582845052084, + "loss": 0.0295, + "rewards/chosen": 3.0877613067626952, + "rewards/margins": 12.514900588989258, + "rewards/rejected": -9.427139282226562, + "step": 7696 + }, + { + "epoch": 0.7032434901781636, + "grad_norm": 6.90625, + "kl": 0.0, + "learning_rate": 2.028021477959445e-06, + "logits/chosen": 563773312.0, + "logits/rejected": 640395520.0, + "logps/chosen": -362.436767578125, + "logps/rejected": -418.09521484375, + "loss": 0.0277, + "rewards/chosen": 3.5844032764434814, + "rewards/margins": 11.619974374771118, + "rewards/rejected": -8.035571098327637, + "step": 7697 + }, + { + "epoch": 0.7033348560986752, + "grad_norm": 1.8359375, + "kl": 0.0, + "learning_rate": 2.0268653614588086e-06, + "logits/chosen": 779655616.0, + "logits/rejected": 551006144.0, + "logps/chosen": -190.58352661132812, + "logps/rejected": -463.0130615234375, + "loss": 0.0132, + "rewards/chosen": 4.292023181915283, + "rewards/margins": 13.570948123931885, + "rewards/rejected": -9.278924942016602, + "step": 7698 + }, + { + "epoch": 0.7034262220191868, + "grad_norm": 8.3125, + "kl": 0.0, + "learning_rate": 2.025709490808861e-06, + "logits/chosen": 353563584.0, + "logits/rejected": 320814208.0, + "logps/chosen": -397.36474609375, + "logps/rejected": -464.8644104003906, + "loss": 0.0224, + "rewards/chosen": 4.348090171813965, + "rewards/margins": 14.056754112243652, + "rewards/rejected": -9.708663940429688, + "step": 7699 + }, + { + "epoch": 0.7035175879396985, + "grad_norm": 8.375, + "kl": 1.5602760314941406, + "learning_rate": 2.024553866105179e-06, + "logits/chosen": 449334125.71428573, + "logits/rejected": 293925056.0, + "logps/chosen": -282.29799107142856, + "logps/rejected": -256.4339599609375, + "loss": 0.0713, + "rewards/chosen": 2.8390276772635326, + "rewards/margins": 9.16933182307652, + "rewards/rejected": -6.330304145812988, + "step": 7700 + }, + { + "epoch": 0.7036089538602102, + "grad_norm": 5.125, + "kl": 0.0, + "learning_rate": 2.0233984874433253e-06, + "logits/chosen": 880583253.3333334, + "logits/rejected": 598246912.0, + "logps/chosen": -333.3411458333333, + "logps/rejected": -125.75656127929688, + "loss": 0.0499, + "rewards/chosen": 3.1934083302815757, + "rewards/margins": 9.169303258260092, + "rewards/rejected": -5.975894927978516, + "step": 7701 + }, + { + "epoch": 0.7037003197807218, + "grad_norm": 1.375, + "kl": 0.0, + "learning_rate": 2.0222433549188387e-06, + "logits/chosen": 448059648.0, + "logits/rejected": 481865258.6666667, + "logps/chosen": -207.44625854492188, + "logps/rejected": -293.49462890625, + "loss": 0.0098, + "rewards/chosen": 3.3347580432891846, + "rewards/margins": 11.604429165522257, + "rewards/rejected": -8.269671122233072, + "step": 7702 + }, + { + "epoch": 0.7037916857012334, + "grad_norm": 1.0234375, + "kl": 0.0, + "learning_rate": 2.021088468627237e-06, + "logits/chosen": 222197632.0, + "logits/rejected": 525327232.0, + "logps/chosen": -178.1121063232422, + "logps/rejected": -517.876953125, + "loss": 0.009, + "rewards/chosen": 5.15601110458374, + "rewards/margins": 14.521726131439209, + "rewards/rejected": -9.365715026855469, + "step": 7703 + }, + { + "epoch": 0.7038830516217451, + "grad_norm": 2.40625, + "kl": 3.5692214965820312, + "learning_rate": 2.0199338286640186e-06, + "logits/chosen": 559775890.2857143, + "logits/rejected": 241017664.0, + "logps/chosen": -295.21728515625, + "logps/rejected": -326.74981689453125, + "loss": 0.0193, + "rewards/chosen": 4.572417122977121, + "rewards/margins": 13.416802270071848, + "rewards/rejected": -8.844385147094727, + "step": 7704 + }, + { + "epoch": 0.7039744175422568, + "grad_norm": 2.125, + "kl": 0.0, + "learning_rate": 2.0187794351246596e-06, + "logits/chosen": 401630549.3333333, + "logits/rejected": 607176576.0, + "logps/chosen": -278.1002604166667, + "logps/rejected": -607.6982421875, + "loss": 0.0197, + "rewards/chosen": 3.8155174255371094, + "rewards/margins": 12.538427352905273, + "rewards/rejected": -8.722909927368164, + "step": 7705 + }, + { + "epoch": 0.7040657834627684, + "grad_norm": 1.28125, + "kl": 0.0, + "learning_rate": 2.017625288104622e-06, + "logits/chosen": 494482688.0, + "logits/rejected": 341980492.8, + "logps/chosen": -391.8448079427083, + "logps/rejected": -276.509423828125, + "loss": 0.0068, + "rewards/chosen": 4.122521082560222, + "rewards/margins": 12.64838930765788, + "rewards/rejected": -8.525868225097657, + "step": 7706 + }, + { + "epoch": 0.70415714938328, + "grad_norm": 1.9609375, + "kl": 0.0, + "learning_rate": 2.01647138769934e-06, + "logits/chosen": 731461120.0, + "logits/rejected": 444213418.6666667, + "logps/chosen": -399.0999755859375, + "logps/rejected": -518.0562337239584, + "loss": 0.0068, + "rewards/chosen": 4.101118564605713, + "rewards/margins": 12.9927659034729, + "rewards/rejected": -8.891647338867188, + "step": 7707 + }, + { + "epoch": 0.7042485153037917, + "grad_norm": 3.515625, + "kl": 0.0, + "learning_rate": 2.0153177340042324e-06, + "logits/chosen": 791665561.6, + "logits/rejected": 354245674.6666667, + "logps/chosen": -418.6626953125, + "logps/rejected": -848.9519856770834, + "loss": 0.0275, + "rewards/chosen": 3.306759262084961, + "rewards/margins": 13.277629597981772, + "rewards/rejected": -9.97087033589681, + "step": 7708 + }, + { + "epoch": 0.7043398812243034, + "grad_norm": 2.15625, + "kl": 0.0, + "learning_rate": 2.014164327114692e-06, + "logits/chosen": 406104576.0, + "logits/rejected": 532392320.0, + "logps/chosen": -284.8816223144531, + "logps/rejected": -519.9161376953125, + "loss": 0.0109, + "rewards/chosen": 4.320414066314697, + "rewards/margins": 12.384697437286377, + "rewards/rejected": -8.06428337097168, + "step": 7709 + }, + { + "epoch": 0.704431247144815, + "grad_norm": 3.265625, + "kl": 0.0, + "learning_rate": 2.0130111671260986e-06, + "logits/chosen": 447963699.2, + "logits/rejected": 388051413.3333333, + "logps/chosen": -310.97490234375, + "logps/rejected": -506.4522298177083, + "loss": 0.0236, + "rewards/chosen": 3.748113250732422, + "rewards/margins": 14.662450408935547, + "rewards/rejected": -10.914337158203125, + "step": 7710 + }, + { + "epoch": 0.7045226130653266, + "grad_norm": 4.9375, + "kl": 0.0, + "learning_rate": 2.0118582541338076e-06, + "logits/chosen": 347487530.6666667, + "logits/rejected": 409788192.0, + "logps/chosen": -249.78690592447916, + "logps/rejected": -557.9136352539062, + "loss": 0.0253, + "rewards/chosen": 3.963165283203125, + "rewards/margins": 16.214138984680176, + "rewards/rejected": -12.25097370147705, + "step": 7711 + }, + { + "epoch": 0.7046139789858383, + "grad_norm": 2.25, + "kl": 0.0, + "learning_rate": 2.0107055882331526e-06, + "logits/chosen": 635683008.0, + "logits/rejected": 432726496.0, + "logps/chosen": -294.0548400878906, + "logps/rejected": -453.8711853027344, + "loss": 0.0166, + "rewards/chosen": 3.749476671218872, + "rewards/margins": 14.790146112442017, + "rewards/rejected": -11.040669441223145, + "step": 7712 + }, + { + "epoch": 0.70470534490635, + "grad_norm": 48.5, + "kl": 0.0, + "learning_rate": 2.0095531695194477e-06, + "logits/chosen": 500500096.0, + "logits/rejected": 545072576.0, + "logps/chosen": -155.919189453125, + "logps/rejected": -443.682373046875, + "loss": 0.1355, + "rewards/chosen": 2.9626755714416504, + "rewards/margins": 8.605832576751709, + "rewards/rejected": -5.643157005310059, + "step": 7713 + }, + { + "epoch": 0.7047967108268616, + "grad_norm": 4.625, + "kl": 0.0, + "learning_rate": 2.0084009980879903e-06, + "logits/chosen": 600666240.0, + "logits/rejected": 762962048.0, + "logps/chosen": -323.50408935546875, + "logps/rejected": -584.61962890625, + "loss": 0.0321, + "rewards/chosen": 2.8308024406433105, + "rewards/margins": 12.118241786956787, + "rewards/rejected": -9.287439346313477, + "step": 7714 + }, + { + "epoch": 0.7048880767473732, + "grad_norm": 6.53125, + "kl": 0.0, + "learning_rate": 2.0072490740340524e-06, + "logits/chosen": 545235865.6, + "logits/rejected": 809752064.0, + "logps/chosen": -271.105224609375, + "logps/rejected": -434.2992350260417, + "loss": 0.0127, + "rewards/chosen": 4.326351547241211, + "rewards/margins": 12.87477149963379, + "rewards/rejected": -8.548419952392578, + "step": 7715 + }, + { + "epoch": 0.7049794426678849, + "grad_norm": 1.203125, + "kl": 0.0, + "learning_rate": 2.0060973974528873e-06, + "logits/chosen": 562488256.0, + "logits/rejected": 655608320.0, + "logps/chosen": -323.5625915527344, + "logps/rejected": -334.123291015625, + "loss": 0.0068, + "rewards/chosen": 3.7520461082458496, + "rewards/margins": 11.684464931488037, + "rewards/rejected": -7.9324188232421875, + "step": 7716 + }, + { + "epoch": 0.7050708085883965, + "grad_norm": 2.09375, + "kl": 0.0, + "learning_rate": 2.0049459684397286e-06, + "logits/chosen": 501452501.3333333, + "logits/rejected": 410863872.0, + "logps/chosen": -382.7399088541667, + "logps/rejected": -504.48076171875, + "loss": 0.0088, + "rewards/chosen": 4.176350275675456, + "rewards/margins": 14.311660639444987, + "rewards/rejected": -10.13531036376953, + "step": 7717 + }, + { + "epoch": 0.7051621745089082, + "grad_norm": 33.75, + "kl": 0.0, + "learning_rate": 2.0037947870897872e-06, + "logits/chosen": 491735338.6666667, + "logits/rejected": 259947040.0, + "logps/chosen": -254.69498697916666, + "logps/rejected": -260.6230773925781, + "loss": 0.1878, + "rewards/chosen": 2.093134085337321, + "rewards/margins": 9.05804411570231, + "rewards/rejected": -6.96491003036499, + "step": 7718 + }, + { + "epoch": 0.7052535404294198, + "grad_norm": 2.4375, + "kl": 0.0, + "learning_rate": 2.0026438534982576e-06, + "logits/chosen": 609198336.0, + "logits/rejected": 1365452544.0, + "logps/chosen": -242.1114501953125, + "logps/rejected": -672.0875854492188, + "loss": 0.0205, + "rewards/chosen": 3.9930073420206704, + "rewards/margins": 11.884499231974283, + "rewards/rejected": -7.891491889953613, + "step": 7719 + }, + { + "epoch": 0.7053449063499315, + "grad_norm": 3.671875, + "kl": 0.0, + "learning_rate": 2.001493167760312e-06, + "logits/chosen": 388958336.0, + "logits/rejected": 346130240.0, + "logps/chosen": -324.44287109375, + "logps/rejected": -420.3214111328125, + "loss": 0.0224, + "rewards/chosen": 3.4234228134155273, + "rewards/margins": 12.915080070495605, + "rewards/rejected": -9.491657257080078, + "step": 7720 + }, + { + "epoch": 0.7054362722704431, + "grad_norm": 2.578125, + "kl": 0.0, + "learning_rate": 2.0003427299710966e-06, + "logits/chosen": 502654549.3333333, + "logits/rejected": 247789360.0, + "logps/chosen": -320.2230224609375, + "logps/rejected": -498.7269287109375, + "loss": 0.0186, + "rewards/chosen": 3.912349065144857, + "rewards/margins": 14.86359723409017, + "rewards/rejected": -10.951248168945312, + "step": 7721 + }, + { + "epoch": 0.7055276381909548, + "grad_norm": 1.9140625, + "kl": 0.0, + "learning_rate": 1.999192540225747e-06, + "logits/chosen": 431577472.0, + "logits/rejected": 557428736.0, + "logps/chosen": -235.95697021484375, + "logps/rejected": -544.655517578125, + "loss": 0.0168, + "rewards/chosen": 3.474437713623047, + "rewards/margins": 12.482285499572754, + "rewards/rejected": -9.007847785949707, + "step": 7722 + }, + { + "epoch": 0.7056190041114664, + "grad_norm": 2.109375, + "kl": 0.0, + "learning_rate": 1.9980425986193703e-06, + "logits/chosen": 562636646.4, + "logits/rejected": 466917632.0, + "logps/chosen": -389.4980712890625, + "logps/rejected": -563.2189534505209, + "loss": 0.0112, + "rewards/chosen": 5.062363815307617, + "rewards/margins": 14.967451604207358, + "rewards/rejected": -9.90508778889974, + "step": 7723 + }, + { + "epoch": 0.7057103700319781, + "grad_norm": 11.25, + "kl": 0.0, + "learning_rate": 1.996892905247057e-06, + "logits/chosen": 466501427.2, + "logits/rejected": 521823061.3333333, + "logps/chosen": -411.19130859375, + "logps/rejected": -309.3246663411458, + "loss": 0.028, + "rewards/chosen": 3.558293914794922, + "rewards/margins": 11.68577537536621, + "rewards/rejected": -8.127481460571289, + "step": 7724 + }, + { + "epoch": 0.7058017359524897, + "grad_norm": 1.6640625, + "kl": 0.0, + "learning_rate": 1.9957434602038765e-06, + "logits/chosen": 702595669.3333334, + "logits/rejected": 577595136.0, + "logps/chosen": -386.9140625, + "logps/rejected": -588.965576171875, + "loss": 0.0141, + "rewards/chosen": 4.0391496022542315, + "rewards/margins": 14.869822820027668, + "rewards/rejected": -10.830673217773438, + "step": 7725 + }, + { + "epoch": 0.7058931018730014, + "grad_norm": 3.859375, + "kl": 0.0, + "learning_rate": 1.9945942635848745e-06, + "logits/chosen": 841597312.0, + "logits/rejected": 543068800.0, + "logps/chosen": -719.9738159179688, + "logps/rejected": -472.5826110839844, + "loss": 0.0189, + "rewards/chosen": 3.381577968597412, + "rewards/margins": 12.656284809112549, + "rewards/rejected": -9.274706840515137, + "step": 7726 + }, + { + "epoch": 0.705984467793513, + "grad_norm": 1.3671875, + "kl": 0.0, + "learning_rate": 1.9934453154850835e-06, + "logits/chosen": 798252629.3333334, + "logits/rejected": 607822182.4, + "logps/chosen": -308.0253499348958, + "logps/rejected": -588.872412109375, + "loss": 0.006, + "rewards/chosen": 4.493353525797526, + "rewards/margins": 14.414661661783853, + "rewards/rejected": -9.921308135986328, + "step": 7727 + }, + { + "epoch": 0.7060758337140247, + "grad_norm": 4.8125, + "kl": 0.0, + "learning_rate": 1.9922966159995087e-06, + "logits/chosen": 538692309.3333334, + "logits/rejected": 411453888.0, + "logps/chosen": -295.19077555338544, + "logps/rejected": -533.1514892578125, + "loss": 0.1311, + "rewards/chosen": 3.8896004358927407, + "rewards/margins": 14.630221048990885, + "rewards/rejected": -10.740620613098145, + "step": 7728 + }, + { + "epoch": 0.7061671996345363, + "grad_norm": 2.59375, + "kl": 0.0, + "learning_rate": 1.9911481652231364e-06, + "logits/chosen": 328107958.85714287, + "logits/rejected": 952028928.0, + "logps/chosen": -265.70474679129467, + "logps/rejected": -866.3195190429688, + "loss": 0.016, + "rewards/chosen": 4.693303789411273, + "rewards/margins": 17.33997794560024, + "rewards/rejected": -12.646674156188965, + "step": 7729 + }, + { + "epoch": 0.706258565555048, + "grad_norm": 2.78125, + "kl": 0.0, + "learning_rate": 1.9899999632509314e-06, + "logits/chosen": 574018304.0, + "logits/rejected": 306857130.6666667, + "logps/chosen": -252.1721435546875, + "logps/rejected": -340.66200764973956, + "loss": 0.0144, + "rewards/chosen": 4.312525939941406, + "rewards/margins": 13.318962605794272, + "rewards/rejected": -9.006436665852865, + "step": 7730 + }, + { + "epoch": 0.7063499314755596, + "grad_norm": 3.09375, + "kl": 0.0, + "learning_rate": 1.988852010177843e-06, + "logits/chosen": 999915690.6666666, + "logits/rejected": 761885312.0, + "logps/chosen": -277.9999593098958, + "logps/rejected": -814.4420166015625, + "loss": 0.0234, + "rewards/chosen": 3.8468662897745767, + "rewards/margins": 14.635053316752115, + "rewards/rejected": -10.788187026977539, + "step": 7731 + }, + { + "epoch": 0.7064412973960713, + "grad_norm": 1.2734375, + "kl": 0.0, + "learning_rate": 1.9877043060987943e-06, + "logits/chosen": 822453888.0, + "logits/rejected": 535173888.0, + "logps/chosen": -273.5790100097656, + "logps/rejected": -457.6052551269531, + "loss": 0.0106, + "rewards/chosen": 4.02069091796875, + "rewards/margins": 13.348739624023438, + "rewards/rejected": -9.328048706054688, + "step": 7732 + }, + { + "epoch": 0.7065326633165829, + "grad_norm": 2.5, + "kl": 0.0, + "learning_rate": 1.98655685110869e-06, + "logits/chosen": 941365696.0, + "logits/rejected": 629363072.0, + "logps/chosen": -190.84425354003906, + "logps/rejected": -450.43316650390625, + "loss": 0.0206, + "rewards/chosen": 3.6460318565368652, + "rewards/margins": 12.482158184051514, + "rewards/rejected": -8.836126327514648, + "step": 7733 + }, + { + "epoch": 0.7066240292370946, + "grad_norm": 5.09375, + "kl": 0.0, + "learning_rate": 1.9854096453024126e-06, + "logits/chosen": 815029248.0, + "logits/rejected": 564763776.0, + "logps/chosen": -420.5463053385417, + "logps/rejected": -478.5880432128906, + "loss": 0.0345, + "rewards/chosen": 3.4159717559814453, + "rewards/margins": 12.410528182983398, + "rewards/rejected": -8.994556427001953, + "step": 7734 + }, + { + "epoch": 0.7067153951576062, + "grad_norm": 2.78125, + "kl": 0.0, + "learning_rate": 1.9842626887748284e-06, + "logits/chosen": 533748736.0, + "logits/rejected": 301635584.0, + "logps/chosen": -369.94217354910717, + "logps/rejected": -323.4637756347656, + "loss": 0.0306, + "rewards/chosen": 3.773815155029297, + "rewards/margins": 12.990972518920898, + "rewards/rejected": -9.217157363891602, + "step": 7735 + }, + { + "epoch": 0.7068067610781179, + "grad_norm": 3.5625, + "kl": 0.0, + "learning_rate": 1.9831159816207777e-06, + "logits/chosen": 622079232.0, + "logits/rejected": 539529642.6666666, + "logps/chosen": -282.928271484375, + "logps/rejected": -508.9246826171875, + "loss": 0.0255, + "rewards/chosen": 3.4184326171875, + "rewards/margins": 15.680330657958985, + "rewards/rejected": -12.261898040771484, + "step": 7736 + }, + { + "epoch": 0.7068981269986295, + "grad_norm": 3.0, + "kl": 0.0, + "learning_rate": 1.981969523935084e-06, + "logits/chosen": 913300845.7142857, + "logits/rejected": 1229434624.0, + "logps/chosen": -289.5441196986607, + "logps/rejected": -140.83926391601562, + "loss": 0.0233, + "rewards/chosen": 4.074433735438755, + "rewards/margins": 9.177615097590856, + "rewards/rejected": -5.1031813621521, + "step": 7737 + }, + { + "epoch": 0.7069894929191411, + "grad_norm": 1.0546875, + "kl": 0.0, + "learning_rate": 1.980823315812548e-06, + "logits/chosen": 574637610.6666666, + "logits/rejected": 928107724.8, + "logps/chosen": -233.7503662109375, + "logps/rejected": -606.803759765625, + "loss": 0.0067, + "rewards/chosen": 4.1679995854695635, + "rewards/margins": 14.248495801289877, + "rewards/rejected": -10.080496215820313, + "step": 7738 + }, + { + "epoch": 0.7070808588396528, + "grad_norm": 6.25, + "kl": 0.0, + "learning_rate": 1.9796773573479482e-06, + "logits/chosen": 601107925.3333334, + "logits/rejected": 400173152.0, + "logps/chosen": -447.94921875, + "logps/rejected": -349.69281005859375, + "loss": 0.0241, + "rewards/chosen": 3.719507853190104, + "rewards/margins": 10.686960379282633, + "rewards/rejected": -6.967452526092529, + "step": 7739 + }, + { + "epoch": 0.7071722247601645, + "grad_norm": 3.875, + "kl": 0.0, + "learning_rate": 1.9785316486360496e-06, + "logits/chosen": 683461504.0, + "logits/rejected": 366787968.0, + "logps/chosen": -197.0516815185547, + "logps/rejected": -443.8633626302083, + "loss": 0.1112, + "rewards/chosen": 2.6093034744262695, + "rewards/margins": 10.526498476664226, + "rewards/rejected": -7.917195002237956, + "step": 7740 + }, + { + "epoch": 0.7072635906806761, + "grad_norm": 2.765625, + "kl": 0.0, + "learning_rate": 1.9773861897715887e-06, + "logits/chosen": 550388906.6666666, + "logits/rejected": 618828800.0, + "logps/chosen": -263.4466552734375, + "logps/rejected": -460.05283203125, + "loss": 0.0161, + "rewards/chosen": 3.3834667205810547, + "rewards/margins": 12.924215316772461, + "rewards/rejected": -9.540748596191406, + "step": 7741 + }, + { + "epoch": 0.7073549566011877, + "grad_norm": 2.953125, + "kl": 0.0, + "learning_rate": 1.9762409808492846e-06, + "logits/chosen": 633019733.3333334, + "logits/rejected": 1059112640.0, + "logps/chosen": -234.61271158854166, + "logps/rejected": -852.4195556640625, + "loss": 0.0234, + "rewards/chosen": 4.552191416422526, + "rewards/margins": 19.350965181986492, + "rewards/rejected": -14.798773765563965, + "step": 7742 + }, + { + "epoch": 0.7074463225216994, + "grad_norm": 0.66015625, + "kl": 0.0, + "learning_rate": 1.975096021963834e-06, + "logits/chosen": 219960832.0, + "logits/rejected": 433053849.6, + "logps/chosen": -154.94746907552084, + "logps/rejected": -568.82626953125, + "loss": 0.0054, + "rewards/chosen": 4.252515157063802, + "rewards/margins": 14.054819234212239, + "rewards/rejected": -9.802304077148438, + "step": 7743 + }, + { + "epoch": 0.7075376884422111, + "grad_norm": 2.65625, + "kl": 0.0, + "learning_rate": 1.9739513132099176e-06, + "logits/chosen": 420291285.3333333, + "logits/rejected": 360805632.0, + "logps/chosen": -249.41630045572916, + "logps/rejected": -606.7802734375, + "loss": 0.0337, + "rewards/chosen": 3.495673497517904, + "rewards/margins": 12.05496342976888, + "rewards/rejected": -8.559289932250977, + "step": 7744 + }, + { + "epoch": 0.7076290543627227, + "grad_norm": 1.9140625, + "kl": 0.0, + "learning_rate": 1.972806854682191e-06, + "logits/chosen": 353301610.6666667, + "logits/rejected": 447402720.0, + "logps/chosen": -331.4571126302083, + "logps/rejected": -558.7565307617188, + "loss": 0.012, + "rewards/chosen": 4.639228820800781, + "rewards/margins": 14.340503692626953, + "rewards/rejected": -9.701274871826172, + "step": 7745 + }, + { + "epoch": 0.7077204202832343, + "grad_norm": 3.359375, + "kl": 0.0, + "learning_rate": 1.9716626464752896e-06, + "logits/chosen": 649378962.2857143, + "logits/rejected": 598247552.0, + "logps/chosen": -439.5957728794643, + "logps/rejected": -507.5636291503906, + "loss": 0.0227, + "rewards/chosen": 4.142606462751116, + "rewards/margins": 16.249891008649556, + "rewards/rejected": -12.107284545898438, + "step": 7746 + }, + { + "epoch": 0.707811786203746, + "grad_norm": 3.5, + "kl": 0.0, + "learning_rate": 1.970518688683829e-06, + "logits/chosen": 474521770.6666667, + "logits/rejected": 583234867.2, + "logps/chosen": -341.7605794270833, + "logps/rejected": -392.8621337890625, + "loss": 0.0122, + "rewards/chosen": 4.2782847086588545, + "rewards/margins": 12.976005808512369, + "rewards/rejected": -8.697721099853515, + "step": 7747 + }, + { + "epoch": 0.7079031521242577, + "grad_norm": 3.28125, + "kl": 0.0, + "learning_rate": 1.969374981402405e-06, + "logits/chosen": 447622348.8, + "logits/rejected": 178243274.66666666, + "logps/chosen": -330.98876953125, + "logps/rejected": -397.333251953125, + "loss": 0.0251, + "rewards/chosen": 3.4195014953613283, + "rewards/margins": 11.96998774210612, + "rewards/rejected": -8.550486246744791, + "step": 7748 + }, + { + "epoch": 0.7079945180447693, + "grad_norm": 4.625, + "kl": 0.0, + "learning_rate": 1.9682315247255897e-06, + "logits/chosen": 414652979.2, + "logits/rejected": 513508650.6666667, + "logps/chosen": -330.507861328125, + "logps/rejected": -650.8374430338541, + "loss": 0.0176, + "rewards/chosen": 4.7343189239501955, + "rewards/margins": 17.540607325236003, + "rewards/rejected": -12.806288401285807, + "step": 7749 + }, + { + "epoch": 0.7080858839652809, + "grad_norm": 3.96875, + "kl": 0.0, + "learning_rate": 1.9670883187479377e-06, + "logits/chosen": 758246080.0, + "logits/rejected": 984294592.0, + "logps/chosen": -419.672119140625, + "logps/rejected": -793.05419921875, + "loss": 0.0221, + "rewards/chosen": 3.304708480834961, + "rewards/margins": 14.777274131774902, + "rewards/rejected": -11.472565650939941, + "step": 7750 + }, + { + "epoch": 0.7081772498857926, + "grad_norm": 0.62890625, + "kl": 0.0, + "learning_rate": 1.9659453635639795e-06, + "logits/chosen": 503415904.0, + "logits/rejected": 398081493.3333333, + "logps/chosen": -308.2801818847656, + "logps/rejected": -590.5399576822916, + "loss": 0.003, + "rewards/chosen": 4.5762224197387695, + "rewards/margins": 15.580764452616373, + "rewards/rejected": -11.004542032877604, + "step": 7751 + }, + { + "epoch": 0.7082686158063043, + "grad_norm": 1.8828125, + "kl": 0.0, + "learning_rate": 1.96480265926823e-06, + "logits/chosen": 447799509.3333333, + "logits/rejected": 502193817.6, + "logps/chosen": -56.532989501953125, + "logps/rejected": -439.695703125, + "loss": 0.026, + "rewards/chosen": 2.661326249440511, + "rewards/margins": 12.3793651898702, + "rewards/rejected": -9.718038940429688, + "step": 7752 + }, + { + "epoch": 0.7083599817268159, + "grad_norm": 5.3125, + "kl": 0.0, + "learning_rate": 1.9636602059551795e-06, + "logits/chosen": 474611763.2, + "logits/rejected": 360245461.3333333, + "logps/chosen": -199.083447265625, + "logps/rejected": -354.7320963541667, + "loss": 0.1444, + "rewards/chosen": 2.05384521484375, + "rewards/margins": 10.812828191121419, + "rewards/rejected": -8.75898297627767, + "step": 7753 + }, + { + "epoch": 0.7084513476473275, + "grad_norm": 2.40625, + "kl": 0.0, + "learning_rate": 1.962518003719297e-06, + "logits/chosen": 599855104.0, + "logits/rejected": 337408938.6666667, + "logps/chosen": -354.2175048828125, + "logps/rejected": -421.7880045572917, + "loss": 0.022, + "rewards/chosen": 3.794758605957031, + "rewards/margins": 14.432750193277993, + "rewards/rejected": -10.637991587320963, + "step": 7754 + }, + { + "epoch": 0.7085427135678392, + "grad_norm": 1.578125, + "kl": 0.0, + "learning_rate": 1.9613760526550313e-06, + "logits/chosen": 494099072.0, + "logits/rejected": 434953088.0, + "logps/chosen": -243.3557891845703, + "logps/rejected": -475.2808430989583, + "loss": 0.0085, + "rewards/chosen": 3.5774593353271484, + "rewards/margins": 12.401713053385416, + "rewards/rejected": -8.824253718058268, + "step": 7755 + }, + { + "epoch": 0.7086340794883509, + "grad_norm": 1.9609375, + "kl": 0.0, + "learning_rate": 1.9602343528568145e-06, + "logits/chosen": 987603763.2, + "logits/rejected": 561558314.6666666, + "logps/chosen": -295.385888671875, + "logps/rejected": -402.3317057291667, + "loss": 0.0313, + "rewards/chosen": 3.5752487182617188, + "rewards/margins": 11.944000879923502, + "rewards/rejected": -8.368752161661783, + "step": 7756 + }, + { + "epoch": 0.7087254454088625, + "grad_norm": 5.96875, + "kl": 0.0, + "learning_rate": 1.9590929044190523e-06, + "logits/chosen": 402684074.6666667, + "logits/rejected": 480822425.6, + "logps/chosen": -279.7426350911458, + "logps/rejected": -478.947705078125, + "loss": 0.0336, + "rewards/chosen": 3.207199732462565, + "rewards/margins": 12.82646853129069, + "rewards/rejected": -9.619268798828125, + "step": 7757 + }, + { + "epoch": 0.7088168113293741, + "grad_norm": 0.921875, + "kl": 0.0, + "learning_rate": 1.9579517074361326e-06, + "logits/chosen": 802473856.0, + "logits/rejected": 452991573.3333333, + "logps/chosen": -363.38739013671875, + "logps/rejected": -380.75439453125, + "loss": 0.0042, + "rewards/chosen": 4.2942352294921875, + "rewards/margins": 12.474321365356445, + "rewards/rejected": -8.180086135864258, + "step": 7758 + }, + { + "epoch": 0.7089081772498858, + "grad_norm": 1.46875, + "kl": 0.0, + "learning_rate": 1.956810762002421e-06, + "logits/chosen": 726561962.6666666, + "logits/rejected": 579665305.6, + "logps/chosen": -549.8497721354166, + "logps/rejected": -527.93193359375, + "loss": 0.0096, + "rewards/chosen": 3.9071499506632485, + "rewards/margins": 13.751494280497232, + "rewards/rejected": -9.844344329833984, + "step": 7759 + }, + { + "epoch": 0.7089995431703975, + "grad_norm": 4.59375, + "kl": 0.0, + "learning_rate": 1.9556700682122627e-06, + "logits/chosen": 629195556.5714285, + "logits/rejected": 514242432.0, + "logps/chosen": -371.94454520089283, + "logps/rejected": -592.5810546875, + "loss": 0.0401, + "rewards/chosen": 3.833711896623884, + "rewards/margins": 13.472512517656599, + "rewards/rejected": -9.638800621032715, + "step": 7760 + }, + { + "epoch": 0.7090909090909091, + "grad_norm": 3.125, + "kl": 0.0, + "learning_rate": 1.9545296261599844e-06, + "logits/chosen": 490105856.0, + "logits/rejected": 314956544.0, + "logps/chosen": -290.5675862630208, + "logps/rejected": -410.593017578125, + "loss": 0.0185, + "rewards/chosen": 2.9896084467569985, + "rewards/margins": 12.83993345896403, + "rewards/rejected": -9.850325012207032, + "step": 7761 + }, + { + "epoch": 0.7091822750114207, + "grad_norm": 2.390625, + "kl": 0.0, + "learning_rate": 1.953389435939889e-06, + "logits/chosen": 487505715.2, + "logits/rejected": 419056725.3333333, + "logps/chosen": -347.662353515625, + "logps/rejected": -462.7142740885417, + "loss": 0.0424, + "rewards/chosen": 3.0636030197143556, + "rewards/margins": 13.440690294901529, + "rewards/rejected": -10.377087275187174, + "step": 7762 + }, + { + "epoch": 0.7092736409319323, + "grad_norm": 2.25, + "kl": 0.0, + "learning_rate": 1.9522494976462603e-06, + "logits/chosen": 504055705.6, + "logits/rejected": 416875093.3333333, + "logps/chosen": -235.9634765625, + "logps/rejected": -1003.414306640625, + "loss": 0.0162, + "rewards/chosen": 3.779027557373047, + "rewards/margins": 15.823687489827474, + "rewards/rejected": -12.044659932454428, + "step": 7763 + }, + { + "epoch": 0.7093650068524441, + "grad_norm": 2.65625, + "kl": 0.0, + "learning_rate": 1.9511098113733585e-06, + "logits/chosen": 596210496.0, + "logits/rejected": 591414826.6666666, + "logps/chosen": -343.3387145996094, + "logps/rejected": -577.5555826822916, + "loss": 0.0121, + "rewards/chosen": 3.067563056945801, + "rewards/margins": 14.291836738586426, + "rewards/rejected": -11.224273681640625, + "step": 7764 + }, + { + "epoch": 0.7094563727729557, + "grad_norm": 2.078125, + "kl": 0.0, + "learning_rate": 1.9499703772154276e-06, + "logits/chosen": 506982229.3333333, + "logits/rejected": 466920038.4, + "logps/chosen": -286.8470052083333, + "logps/rejected": -426.487890625, + "loss": 0.0148, + "rewards/chosen": 3.4401321411132812, + "rewards/margins": 12.632234191894531, + "rewards/rejected": -9.19210205078125, + "step": 7765 + }, + { + "epoch": 0.7095477386934673, + "grad_norm": 21.75, + "kl": 0.0, + "learning_rate": 1.9488311952666884e-06, + "logits/chosen": 395192106.6666667, + "logits/rejected": 354011392.0, + "logps/chosen": -257.9759521484375, + "logps/rejected": -330.882568359375, + "loss": 0.041, + "rewards/chosen": 3.628237724304199, + "rewards/margins": 10.136143684387207, + "rewards/rejected": -6.507905960083008, + "step": 7766 + }, + { + "epoch": 0.7096391046139789, + "grad_norm": 1.8046875, + "kl": 0.0, + "learning_rate": 1.9476922656213387e-06, + "logits/chosen": 346882496.0, + "logits/rejected": 718141030.4, + "logps/chosen": -208.59326171875, + "logps/rejected": -540.470458984375, + "loss": 0.0109, + "rewards/chosen": 4.205061594645183, + "rewards/margins": 13.633519236246745, + "rewards/rejected": -9.428457641601563, + "step": 7767 + }, + { + "epoch": 0.7097304705344907, + "grad_norm": 1.5078125, + "kl": 0.0, + "learning_rate": 1.946553588373559e-06, + "logits/chosen": 443217312.0, + "logits/rejected": 387165952.0, + "logps/chosen": -270.0257263183594, + "logps/rejected": -438.22833251953125, + "loss": 0.0104, + "rewards/chosen": 4.214802265167236, + "rewards/margins": 12.027732372283936, + "rewards/rejected": -7.812930107116699, + "step": 7768 + }, + { + "epoch": 0.7098218364550023, + "grad_norm": 4.125, + "kl": 0.0, + "learning_rate": 1.9454151636175045e-06, + "logits/chosen": 531566634.6666667, + "logits/rejected": 644995072.0, + "logps/chosen": -366.3457438151042, + "logps/rejected": -739.3197021484375, + "loss": 0.0351, + "rewards/chosen": 3.0377041498819985, + "rewards/margins": 14.533908526102701, + "rewards/rejected": -11.496204376220703, + "step": 7769 + }, + { + "epoch": 0.7099132023755139, + "grad_norm": 1.203125, + "kl": 0.0, + "learning_rate": 1.944276991447317e-06, + "logits/chosen": 591790848.0, + "logits/rejected": 528739942.4, + "logps/chosen": -319.6016031901042, + "logps/rejected": -415.941259765625, + "loss": 0.0082, + "rewards/chosen": 4.056112925211589, + "rewards/margins": 13.294124094645184, + "rewards/rejected": -9.238011169433594, + "step": 7770 + }, + { + "epoch": 0.7100045682960255, + "grad_norm": 3.484375, + "kl": 0.0, + "learning_rate": 1.9431390719571096e-06, + "logits/chosen": 546757888.0, + "logits/rejected": 503656576.0, + "logps/chosen": -431.11480712890625, + "logps/rejected": -475.82501220703125, + "loss": 0.0231, + "rewards/chosen": 3.240800619125366, + "rewards/margins": 13.040612936019897, + "rewards/rejected": -9.799812316894531, + "step": 7771 + }, + { + "epoch": 0.7100959342165373, + "grad_norm": 2.203125, + "kl": 0.0, + "learning_rate": 1.9420014052409793e-06, + "logits/chosen": 594045610.6666666, + "logits/rejected": 334611584.0, + "logps/chosen": -346.9790852864583, + "logps/rejected": -273.8150329589844, + "loss": 0.0138, + "rewards/chosen": 4.29551378885905, + "rewards/margins": 10.486168543497723, + "rewards/rejected": -6.190654754638672, + "step": 7772 + }, + { + "epoch": 0.7101873001370489, + "grad_norm": 1.234375, + "kl": 0.0, + "learning_rate": 1.9408639913929994e-06, + "logits/chosen": 1368964096.0, + "logits/rejected": 708804010.6666666, + "logps/chosen": -494.4013366699219, + "logps/rejected": -547.77392578125, + "loss": 0.0068, + "rewards/chosen": 3.739854574203491, + "rewards/margins": 14.698116699854532, + "rewards/rejected": -10.958262125651041, + "step": 7773 + }, + { + "epoch": 0.7102786660575605, + "grad_norm": 0.734375, + "kl": 0.0, + "learning_rate": 1.9397268305072236e-06, + "logits/chosen": 813442816.0, + "logits/rejected": 474825344.0, + "logps/chosen": -327.134033203125, + "logps/rejected": -286.194580078125, + "loss": 0.0054, + "rewards/chosen": 4.695379257202148, + "rewards/margins": 12.342009544372559, + "rewards/rejected": -7.64663028717041, + "step": 7774 + }, + { + "epoch": 0.7103700319780721, + "grad_norm": 7.9375, + "kl": 0.0, + "learning_rate": 1.9385899226776857e-06, + "logits/chosen": 523496533.3333333, + "logits/rejected": 639948236.8, + "logps/chosen": -181.63741048177084, + "logps/rejected": -401.3746337890625, + "loss": 0.0739, + "rewards/chosen": 3.702862103780111, + "rewards/margins": 12.487066968282065, + "rewards/rejected": -8.784204864501953, + "step": 7775 + }, + { + "epoch": 0.7104613978985839, + "grad_norm": 0.53125, + "kl": 0.0, + "learning_rate": 1.9374532679983944e-06, + "logits/chosen": 1025804544.0, + "logits/rejected": 479616597.3333333, + "logps/chosen": -317.0017395019531, + "logps/rejected": -454.8414306640625, + "loss": 0.0036, + "rewards/chosen": 5.334198951721191, + "rewards/margins": 13.641484896341959, + "rewards/rejected": -8.307285944620768, + "step": 7776 + }, + { + "epoch": 0.7105527638190955, + "grad_norm": 6.46875, + "kl": 0.0, + "learning_rate": 1.9363168665633442e-06, + "logits/chosen": 772744089.6, + "logits/rejected": 755931562.6666666, + "logps/chosen": -185.0179443359375, + "logps/rejected": -410.7874348958333, + "loss": 0.0485, + "rewards/chosen": 3.9698680877685546, + "rewards/margins": 11.316629155476887, + "rewards/rejected": -7.346761067708333, + "step": 7777 + }, + { + "epoch": 0.7106441297396071, + "grad_norm": 2.953125, + "kl": 0.0, + "learning_rate": 1.9351807184665032e-06, + "logits/chosen": 920259993.6, + "logits/rejected": 798296405.3333334, + "logps/chosen": -368.8975341796875, + "logps/rejected": -578.5107421875, + "loss": 0.0181, + "rewards/chosen": 4.1201332092285154, + "rewards/margins": 12.658306884765626, + "rewards/rejected": -8.53817367553711, + "step": 7778 + }, + { + "epoch": 0.7107354956601187, + "grad_norm": 1.8359375, + "kl": 0.0, + "learning_rate": 1.934044823801821e-06, + "logits/chosen": 403968192.0, + "logits/rejected": 454530048.0, + "logps/chosen": -205.90618896484375, + "logps/rejected": -646.4134521484375, + "loss": 0.0151, + "rewards/chosen": 3.535641670227051, + "rewards/margins": 14.035247802734375, + "rewards/rejected": -10.499606132507324, + "step": 7779 + }, + { + "epoch": 0.7108268615806305, + "grad_norm": 1.6484375, + "kl": 0.0, + "learning_rate": 1.9329091826632258e-06, + "logits/chosen": 392927573.3333333, + "logits/rejected": 490968371.2, + "logps/chosen": -198.7041015625, + "logps/rejected": -711.806396484375, + "loss": 0.0111, + "rewards/chosen": 3.6276206970214844, + "rewards/margins": 13.416302490234376, + "rewards/rejected": -9.788681793212891, + "step": 7780 + }, + { + "epoch": 0.7109182275011421, + "grad_norm": 0.5390625, + "kl": 0.0, + "learning_rate": 1.9317737951446213e-06, + "logits/chosen": 477614784.0, + "logits/rejected": 433958546.28571427, + "logps/chosen": -376.1148681640625, + "logps/rejected": -482.0005580357143, + "loss": 0.0022, + "rewards/chosen": 4.353528022766113, + "rewards/margins": 13.638392175946917, + "rewards/rejected": -9.284864153180804, + "step": 7781 + }, + { + "epoch": 0.7110095934216537, + "grad_norm": 4.1875, + "kl": 0.0, + "learning_rate": 1.930638661339899e-06, + "logits/chosen": 488304640.0, + "logits/rejected": 412257280.0, + "logps/chosen": -327.941943359375, + "logps/rejected": -536.5996500651041, + "loss": 0.0242, + "rewards/chosen": 4.067139434814453, + "rewards/margins": 13.82358309427897, + "rewards/rejected": -9.756443659464518, + "step": 7782 + }, + { + "epoch": 0.7111009593421653, + "grad_norm": 1.5, + "kl": 0.0, + "learning_rate": 1.92950378134292e-06, + "logits/chosen": 962356480.0, + "logits/rejected": 560065996.8, + "logps/chosen": -413.1593424479167, + "logps/rejected": -420.54560546875, + "loss": 0.0068, + "rewards/chosen": 4.950958887736003, + "rewards/margins": 14.561566797892254, + "rewards/rejected": -9.61060791015625, + "step": 7783 + }, + { + "epoch": 0.7111923252626771, + "grad_norm": 4.15625, + "kl": 0.0, + "learning_rate": 1.9283691552475298e-06, + "logits/chosen": 526735701.3333333, + "logits/rejected": 366287936.0, + "logps/chosen": -365.0757649739583, + "logps/rejected": -459.8251953125, + "loss": 0.0285, + "rewards/chosen": 3.597130457560221, + "rewards/margins": 13.799143473307291, + "rewards/rejected": -10.20201301574707, + "step": 7784 + }, + { + "epoch": 0.7112836911831887, + "grad_norm": 1.234375, + "kl": 0.0, + "learning_rate": 1.9272347831475498e-06, + "logits/chosen": 423857408.0, + "logits/rejected": 611410858.6666666, + "logps/chosen": -364.23480224609375, + "logps/rejected": -487.4272054036458, + "loss": 0.006, + "rewards/chosen": 3.7720117568969727, + "rewards/margins": 12.577043851216635, + "rewards/rejected": -8.805032094319662, + "step": 7785 + }, + { + "epoch": 0.7113750571037003, + "grad_norm": 3.890625, + "kl": 0.0, + "learning_rate": 1.9261006651367846e-06, + "logits/chosen": 365369301.3333333, + "logits/rejected": 537820364.8, + "logps/chosen": -343.1523030598958, + "logps/rejected": -258.8207275390625, + "loss": 0.1125, + "rewards/chosen": 4.350468635559082, + "rewards/margins": 10.540632820129394, + "rewards/rejected": -6.190164184570312, + "step": 7786 + }, + { + "epoch": 0.7114664230242119, + "grad_norm": 3.671875, + "kl": 0.0, + "learning_rate": 1.9249668013090145e-06, + "logits/chosen": 408194730.6666667, + "logits/rejected": 544288153.6, + "logps/chosen": -220.03499348958334, + "logps/rejected": -757.37861328125, + "loss": 0.0229, + "rewards/chosen": 3.1594457626342773, + "rewards/margins": 15.261376762390137, + "rewards/rejected": -12.10193099975586, + "step": 7787 + }, + { + "epoch": 0.7115577889447237, + "grad_norm": 3.8125, + "kl": 0.0, + "learning_rate": 1.9238331917579995e-06, + "logits/chosen": 487172812.8, + "logits/rejected": 545865984.0, + "logps/chosen": -223.319580078125, + "logps/rejected": -598.6240234375, + "loss": 0.0344, + "rewards/chosen": 3.182653617858887, + "rewards/margins": 13.18358694712321, + "rewards/rejected": -10.000933329264322, + "step": 7788 + }, + { + "epoch": 0.7116491548652353, + "grad_norm": 2.25, + "kl": 0.0, + "learning_rate": 1.9226998365774775e-06, + "logits/chosen": 464165216.0, + "logits/rejected": 249941376.0, + "logps/chosen": -346.1934814453125, + "logps/rejected": -411.9600524902344, + "loss": 0.0122, + "rewards/chosen": 4.677040100097656, + "rewards/margins": 11.846923828125, + "rewards/rejected": -7.169883728027344, + "step": 7789 + }, + { + "epoch": 0.7117405207857469, + "grad_norm": 4.46875, + "kl": 0.0, + "learning_rate": 1.9215667358611658e-06, + "logits/chosen": 613781312.0, + "logits/rejected": 390660522.6666667, + "logps/chosen": -461.7450866699219, + "logps/rejected": -374.7464192708333, + "loss": 0.0163, + "rewards/chosen": 2.6680908203125, + "rewards/margins": 12.901397705078125, + "rewards/rejected": -10.233306884765625, + "step": 7790 + }, + { + "epoch": 0.7118318867062585, + "grad_norm": 1.3828125, + "kl": 0.0, + "learning_rate": 1.920433889702764e-06, + "logits/chosen": 420860160.0, + "logits/rejected": 479508633.6, + "logps/chosen": -252.6910603841146, + "logps/rejected": -607.71435546875, + "loss": 0.0091, + "rewards/chosen": 3.7673463821411133, + "rewards/margins": 13.647455787658691, + "rewards/rejected": -9.880109405517578, + "step": 7791 + }, + { + "epoch": 0.7119232526267703, + "grad_norm": 42.0, + "kl": 0.0, + "learning_rate": 1.9193012981959473e-06, + "logits/chosen": 313788108.8, + "logits/rejected": 558110720.0, + "logps/chosen": -219.675439453125, + "logps/rejected": -490.4103190104167, + "loss": 0.0801, + "rewards/chosen": 3.7400230407714843, + "rewards/margins": 12.276641591389975, + "rewards/rejected": -8.53661855061849, + "step": 7792 + }, + { + "epoch": 0.7120146185472819, + "grad_norm": 3.171875, + "kl": 0.0, + "learning_rate": 1.9181689614343696e-06, + "logits/chosen": 751599872.0, + "logits/rejected": 429264640.0, + "logps/chosen": -193.91586303710938, + "logps/rejected": -505.0894775390625, + "loss": 0.0257, + "rewards/chosen": 3.289233922958374, + "rewards/margins": 13.043724775314331, + "rewards/rejected": -9.754490852355957, + "step": 7793 + }, + { + "epoch": 0.7121059844677935, + "grad_norm": 2.609375, + "kl": 0.0, + "learning_rate": 1.9170368795116635e-06, + "logits/chosen": 855781034.6666666, + "logits/rejected": 658789440.0, + "logps/chosen": -265.72243245442706, + "logps/rejected": -370.5810546875, + "loss": 0.0225, + "rewards/chosen": 3.6400578816731772, + "rewards/margins": 11.625407536824545, + "rewards/rejected": -7.985349655151367, + "step": 7794 + }, + { + "epoch": 0.7121973503883051, + "grad_norm": 2.234375, + "kl": 0.0, + "learning_rate": 1.915905052521445e-06, + "logits/chosen": 648441792.0, + "logits/rejected": 356126848.0, + "logps/chosen": -537.928955078125, + "logps/rejected": -450.3247375488281, + "loss": 0.0152, + "rewards/chosen": 3.5409111976623535, + "rewards/margins": 12.402668476104736, + "rewards/rejected": -8.861757278442383, + "step": 7795 + }, + { + "epoch": 0.7122887163088168, + "grad_norm": 3.875, + "kl": 0.0, + "learning_rate": 1.914773480557304e-06, + "logits/chosen": 602964480.0, + "logits/rejected": 695004928.0, + "logps/chosen": -432.354248046875, + "logps/rejected": -589.090576171875, + "loss": 0.0304, + "rewards/chosen": 2.9628262519836426, + "rewards/margins": 16.94046640396118, + "rewards/rejected": -13.977640151977539, + "step": 7796 + }, + { + "epoch": 0.7123800822293285, + "grad_norm": 3.015625, + "kl": 0.0, + "learning_rate": 1.91364216371281e-06, + "logits/chosen": 508402534.4, + "logits/rejected": 787314773.3333334, + "logps/chosen": -224.376171875, + "logps/rejected": -740.40771484375, + "loss": 0.0247, + "rewards/chosen": 3.8166038513183596, + "rewards/margins": 16.1558474222819, + "rewards/rejected": -12.339243570963541, + "step": 7797 + }, + { + "epoch": 0.7124714481498401, + "grad_norm": 1.171875, + "kl": 0.0, + "learning_rate": 1.9125111020815122e-06, + "logits/chosen": 385881760.0, + "logits/rejected": 396000992.0, + "logps/chosen": -206.69107055664062, + "logps/rejected": -493.386474609375, + "loss": 0.0078, + "rewards/chosen": 4.6289520263671875, + "rewards/margins": 13.754592895507812, + "rewards/rejected": -9.125640869140625, + "step": 7798 + }, + { + "epoch": 0.7125628140703517, + "grad_norm": 2.109375, + "kl": 0.0, + "learning_rate": 1.9113802957569423e-06, + "logits/chosen": 442840800.0, + "logits/rejected": 700856512.0, + "logps/chosen": -274.86651611328125, + "logps/rejected": -494.75738525390625, + "loss": 0.0206, + "rewards/chosen": 3.8618733882904053, + "rewards/margins": 14.210613012313843, + "rewards/rejected": -10.348739624023438, + "step": 7799 + }, + { + "epoch": 0.7126541799908634, + "grad_norm": 2.46875, + "kl": 0.0, + "learning_rate": 1.9102497448326062e-06, + "logits/chosen": 414108501.3333333, + "logits/rejected": 59166368.0, + "logps/chosen": -414.0101725260417, + "logps/rejected": -507.57232666015625, + "loss": 0.0183, + "rewards/chosen": 3.960353215535482, + "rewards/margins": 11.715669949849447, + "rewards/rejected": -7.755316734313965, + "step": 7800 + }, + { + "epoch": 0.7127455459113751, + "grad_norm": 1.3828125, + "kl": 0.0, + "learning_rate": 1.9091194494019876e-06, + "logits/chosen": 540933632.0, + "logits/rejected": 559596141.7142857, + "logps/chosen": -263.7031555175781, + "logps/rejected": -456.078125, + "loss": 0.0054, + "rewards/chosen": 3.13494873046875, + "rewards/margins": 12.395117623465401, + "rewards/rejected": -9.260168892996651, + "step": 7801 + }, + { + "epoch": 0.7128369118318867, + "grad_norm": 1.578125, + "kl": 0.0, + "learning_rate": 1.907989409558551e-06, + "logits/chosen": 506790080.0, + "logits/rejected": 311399387.4285714, + "logps/chosen": -255.6954803466797, + "logps/rejected": -352.8761509486607, + "loss": 0.0064, + "rewards/chosen": 2.954951524734497, + "rewards/margins": 12.177365813936506, + "rewards/rejected": -9.222414289202009, + "step": 7802 + }, + { + "epoch": 0.7129282777523983, + "grad_norm": 1.875, + "kl": 0.0, + "learning_rate": 1.9068596253957438e-06, + "logits/chosen": 641007513.6, + "logits/rejected": 496638293.3333333, + "logps/chosen": -348.17001953125, + "logps/rejected": -612.7061360677084, + "loss": 0.0193, + "rewards/chosen": 3.6579151153564453, + "rewards/margins": 12.750295003255209, + "rewards/rejected": -9.092379887898764, + "step": 7803 + }, + { + "epoch": 0.71301964367291, + "grad_norm": 3.109375, + "kl": 0.0, + "learning_rate": 1.905730097006987e-06, + "logits/chosen": 498322432.0, + "logits/rejected": 455531093.3333333, + "logps/chosen": -423.01865234375, + "logps/rejected": -338.95835367838544, + "loss": 0.0173, + "rewards/chosen": 4.239311981201172, + "rewards/margins": 10.957443364461263, + "rewards/rejected": -6.718131383260091, + "step": 7804 + }, + { + "epoch": 0.7131110095934217, + "grad_norm": 1.6015625, + "kl": 0.0, + "learning_rate": 1.9046008244856823e-06, + "logits/chosen": 652710272.0, + "logits/rejected": 573060864.0, + "logps/chosen": -355.160888671875, + "logps/rejected": -509.11865234375, + "loss": 0.0099, + "rewards/chosen": 3.795099894205729, + "rewards/margins": 12.472116343180337, + "rewards/rejected": -8.677016448974609, + "step": 7805 + }, + { + "epoch": 0.7132023755139333, + "grad_norm": 2.359375, + "kl": 0.0, + "learning_rate": 1.9034718079252086e-06, + "logits/chosen": 718200490.6666666, + "logits/rejected": 918793856.0, + "logps/chosen": -282.92161051432294, + "logps/rejected": -859.3905029296875, + "loss": 0.0151, + "rewards/chosen": 4.296353975931804, + "rewards/margins": 15.240653673807781, + "rewards/rejected": -10.944299697875977, + "step": 7806 + }, + { + "epoch": 0.7132937414344449, + "grad_norm": 2.953125, + "kl": 0.0, + "learning_rate": 1.9023430474189274e-06, + "logits/chosen": 401810560.0, + "logits/rejected": 337196192.0, + "logps/chosen": -299.124755859375, + "logps/rejected": -271.32354736328125, + "loss": 0.0191, + "rewards/chosen": 3.5350265502929688, + "rewards/margins": 10.59407901763916, + "rewards/rejected": -7.059052467346191, + "step": 7807 + }, + { + "epoch": 0.7133851073549566, + "grad_norm": 6.5, + "kl": 0.0, + "learning_rate": 1.9012145430601758e-06, + "logits/chosen": 527601664.0, + "logits/rejected": 988507648.0, + "logps/chosen": -370.31650390625, + "logps/rejected": -1072.637451171875, + "loss": 0.0405, + "rewards/chosen": 2.7915653228759765, + "rewards/margins": 14.239195887247721, + "rewards/rejected": -11.447630564371744, + "step": 7808 + }, + { + "epoch": 0.7134764732754683, + "grad_norm": 3.671875, + "kl": 0.0, + "learning_rate": 1.900086294942271e-06, + "logits/chosen": 345596441.6, + "logits/rejected": 216115968.0, + "logps/chosen": -213.8591064453125, + "logps/rejected": -281.9918619791667, + "loss": 0.0452, + "rewards/chosen": 3.3239456176757813, + "rewards/margins": 9.341841634114584, + "rewards/rejected": -6.017896016438802, + "step": 7809 + }, + { + "epoch": 0.7135678391959799, + "grad_norm": 5.15625, + "kl": 0.0, + "learning_rate": 1.898958303158508e-06, + "logits/chosen": 686434944.0, + "logits/rejected": 545629824.0, + "logps/chosen": -377.8334655761719, + "logps/rejected": -514.2255859375, + "loss": 0.0311, + "rewards/chosen": 2.8939881324768066, + "rewards/margins": 11.419193744659424, + "rewards/rejected": -8.525205612182617, + "step": 7810 + }, + { + "epoch": 0.7136592051164915, + "grad_norm": 4.1875, + "kl": 0.0, + "learning_rate": 1.8978305678021598e-06, + "logits/chosen": 786135978.6666666, + "logits/rejected": 399684032.0, + "logps/chosen": -404.6295572916667, + "logps/rejected": -461.9861145019531, + "loss": 0.0316, + "rewards/chosen": 3.3857091267903647, + "rewards/margins": 13.216422398885092, + "rewards/rejected": -9.830713272094727, + "step": 7811 + }, + { + "epoch": 0.7137505710370032, + "grad_norm": 5.6875, + "kl": 0.0, + "learning_rate": 1.8967030889664834e-06, + "logits/chosen": 594858666.6666666, + "logits/rejected": 534538368.0, + "logps/chosen": -361.8877766927083, + "logps/rejected": -437.9017639160156, + "loss": 0.0476, + "rewards/chosen": 2.8302739461263022, + "rewards/margins": 10.182433923085531, + "rewards/rejected": -7.3521599769592285, + "step": 7812 + }, + { + "epoch": 0.7138419369575149, + "grad_norm": 1.7578125, + "kl": 0.0, + "learning_rate": 1.8955758667447088e-06, + "logits/chosen": 897439061.3333334, + "logits/rejected": 374471488.0, + "logps/chosen": -339.8249918619792, + "logps/rejected": -441.0579833984375, + "loss": 0.0126, + "rewards/chosen": 4.3651383717854815, + "rewards/margins": 18.373504002888996, + "rewards/rejected": -14.008365631103516, + "step": 7813 + }, + { + "epoch": 0.7139333028780265, + "grad_norm": 3.515625, + "kl": 0.0, + "learning_rate": 1.8944489012300466e-06, + "logits/chosen": 562969190.4, + "logits/rejected": 352301333.3333333, + "logps/chosen": -396.6559326171875, + "logps/rejected": -374.7945149739583, + "loss": 0.0264, + "rewards/chosen": 3.158132553100586, + "rewards/margins": 13.662555694580078, + "rewards/rejected": -10.504423141479492, + "step": 7814 + }, + { + "epoch": 0.7140246687985381, + "grad_norm": 3.03125, + "kl": 0.0, + "learning_rate": 1.8933221925156853e-06, + "logits/chosen": 562339520.0, + "logits/rejected": 653778048.0, + "logps/chosen": -306.6129150390625, + "logps/rejected": -526.5962524414062, + "loss": 0.0202, + "rewards/chosen": 3.2006795406341553, + "rewards/margins": 11.149844884872437, + "rewards/rejected": -7.949165344238281, + "step": 7815 + }, + { + "epoch": 0.7141160347190498, + "grad_norm": 7.0625, + "kl": 0.0, + "learning_rate": 1.892195740694796e-06, + "logits/chosen": 701875328.0, + "logits/rejected": 691584192.0, + "logps/chosen": -489.0962219238281, + "logps/rejected": -529.0841064453125, + "loss": 0.092, + "rewards/chosen": 3.8579931259155273, + "rewards/margins": 10.682950973510742, + "rewards/rejected": -6.824957847595215, + "step": 7816 + }, + { + "epoch": 0.7142074006395615, + "grad_norm": 33.25, + "kl": 0.0, + "learning_rate": 1.8910695458605249e-06, + "logits/chosen": 450831360.0, + "logits/rejected": 254790485.33333334, + "logps/chosen": -365.5028076171875, + "logps/rejected": -324.65150960286456, + "loss": 0.0386, + "rewards/chosen": 2.3258774280548096, + "rewards/margins": 8.938002824783325, + "rewards/rejected": -6.612125396728516, + "step": 7817 + }, + { + "epoch": 0.7142987665600731, + "grad_norm": 0.8515625, + "kl": 0.0, + "learning_rate": 1.8899436081059974e-06, + "logits/chosen": 368788053.3333333, + "logits/rejected": 356849408.0, + "logps/chosen": -256.28863525390625, + "logps/rejected": -564.555615234375, + "loss": 0.0039, + "rewards/chosen": 4.8850657145182295, + "rewards/margins": 14.80552469889323, + "rewards/rejected": -9.920458984375, + "step": 7818 + }, + { + "epoch": 0.7143901324805847, + "grad_norm": 5.9375, + "kl": 0.0, + "learning_rate": 1.888817927524318e-06, + "logits/chosen": 770564403.2, + "logits/rejected": 649431253.3333334, + "logps/chosen": -312.29716796875, + "logps/rejected": -425.12646484375, + "loss": 0.0265, + "rewards/chosen": 3.8407215118408202, + "rewards/margins": 14.098452631632487, + "rewards/rejected": -10.257731119791666, + "step": 7819 + }, + { + "epoch": 0.7144814984010964, + "grad_norm": 3.359375, + "kl": 0.0, + "learning_rate": 1.8876925042085686e-06, + "logits/chosen": 208342229.33333334, + "logits/rejected": 620532736.0, + "logps/chosen": -508.3766682942708, + "logps/rejected": -635.171484375, + "loss": 0.0062, + "rewards/chosen": 4.805184046427409, + "rewards/margins": 15.398038355509442, + "rewards/rejected": -10.592854309082032, + "step": 7820 + }, + { + "epoch": 0.714572864321608, + "grad_norm": 7.5, + "kl": 0.0, + "learning_rate": 1.8865673382518146e-06, + "logits/chosen": 460814272.0, + "logits/rejected": 619065216.0, + "logps/chosen": -272.8125, + "logps/rejected": -535.421142578125, + "loss": 0.1315, + "rewards/chosen": 3.061162233352661, + "rewards/margins": 7.831734895706177, + "rewards/rejected": -4.770572662353516, + "step": 7821 + }, + { + "epoch": 0.7146642302421197, + "grad_norm": 2.234375, + "kl": 0.0, + "learning_rate": 1.8854424297470952e-06, + "logits/chosen": 448024384.0, + "logits/rejected": 602738944.0, + "logps/chosen": -217.4232635498047, + "logps/rejected": -319.442138671875, + "loss": 0.0132, + "rewards/chosen": 4.5293426513671875, + "rewards/margins": 12.897445678710938, + "rewards/rejected": -8.36810302734375, + "step": 7822 + }, + { + "epoch": 0.7147555961626313, + "grad_norm": 3.5, + "kl": 0.0, + "learning_rate": 1.88431777878743e-06, + "logits/chosen": 485424768.0, + "logits/rejected": 308394112.0, + "logps/chosen": -206.3011474609375, + "logps/rejected": -304.77239990234375, + "loss": 0.0267, + "rewards/chosen": 3.2831718921661377, + "rewards/margins": 11.983085870742798, + "rewards/rejected": -8.69991397857666, + "step": 7823 + }, + { + "epoch": 0.714846962083143, + "grad_norm": 2.546875, + "kl": 0.0, + "learning_rate": 1.8831933854658152e-06, + "logits/chosen": 827932160.0, + "logits/rejected": 255851059.2, + "logps/chosen": -489.1753336588542, + "logps/rejected": -325.88583984375, + "loss": 0.0103, + "rewards/chosen": 3.783714930216471, + "rewards/margins": 12.917703119913737, + "rewards/rejected": -9.133988189697266, + "step": 7824 + }, + { + "epoch": 0.7149383280036546, + "grad_norm": 2.546875, + "kl": 0.0, + "learning_rate": 1.8820692498752314e-06, + "logits/chosen": 320270080.0, + "logits/rejected": 321317792.0, + "logps/chosen": -262.9852294921875, + "logps/rejected": -386.23065185546875, + "loss": 0.0152, + "rewards/chosen": 3.840524673461914, + "rewards/margins": 11.904090881347656, + "rewards/rejected": -8.063566207885742, + "step": 7825 + }, + { + "epoch": 0.7150296939241663, + "grad_norm": 4.5, + "kl": 0.0, + "learning_rate": 1.8809453721086323e-06, + "logits/chosen": 833383082.6666666, + "logits/rejected": 241203984.0, + "logps/chosen": -378.2169596354167, + "logps/rejected": -274.2856750488281, + "loss": 0.0286, + "rewards/chosen": 3.6515801747639975, + "rewards/margins": 9.563327153523764, + "rewards/rejected": -5.911746978759766, + "step": 7826 + }, + { + "epoch": 0.7151210598446779, + "grad_norm": 6.9375, + "kl": 0.0, + "learning_rate": 1.8798217522589523e-06, + "logits/chosen": 464268083.2, + "logits/rejected": 663463552.0, + "logps/chosen": -247.365869140625, + "logps/rejected": -347.0904134114583, + "loss": 0.0598, + "rewards/chosen": 3.180606460571289, + "rewards/margins": 10.673361078898113, + "rewards/rejected": -7.492754618326823, + "step": 7827 + }, + { + "epoch": 0.7152124257651896, + "grad_norm": 4.65625, + "kl": 0.0, + "learning_rate": 1.8786983904191043e-06, + "logits/chosen": 551740757.3333334, + "logits/rejected": 553370828.8, + "logps/chosen": -369.8859456380208, + "logps/rejected": -640.391650390625, + "loss": 0.0193, + "rewards/chosen": 3.1327854792277017, + "rewards/margins": 12.597375933329264, + "rewards/rejected": -9.464590454101563, + "step": 7828 + }, + { + "epoch": 0.7153037916857012, + "grad_norm": 3.3125, + "kl": 0.0, + "learning_rate": 1.8775752866819796e-06, + "logits/chosen": 765190997.3333334, + "logits/rejected": 360811084.8, + "logps/chosen": -173.330810546875, + "logps/rejected": -443.38076171875, + "loss": 0.0178, + "rewards/chosen": 3.3929055531819663, + "rewards/margins": 11.488063939412436, + "rewards/rejected": -8.09515838623047, + "step": 7829 + }, + { + "epoch": 0.7153951576062129, + "grad_norm": 1.859375, + "kl": 0.0, + "learning_rate": 1.8764524411404494e-06, + "logits/chosen": 332898304.0, + "logits/rejected": 715385088.0, + "logps/chosen": -229.337548828125, + "logps/rejected": -455.476806640625, + "loss": 0.0138, + "rewards/chosen": 4.263447570800781, + "rewards/margins": 14.368714141845704, + "rewards/rejected": -10.105266571044922, + "step": 7830 + }, + { + "epoch": 0.7154865235267245, + "grad_norm": 1.5625, + "kl": 0.0, + "learning_rate": 1.8753298538873615e-06, + "logits/chosen": 671812736.0, + "logits/rejected": 561141184.0, + "logps/chosen": -405.2463684082031, + "logps/rejected": -425.39544677734375, + "loss": 0.0089, + "rewards/chosen": 4.387322902679443, + "rewards/margins": 12.850831508636475, + "rewards/rejected": -8.463508605957031, + "step": 7831 + }, + { + "epoch": 0.7155778894472362, + "grad_norm": 1.8984375, + "kl": 0.0, + "learning_rate": 1.8742075250155423e-06, + "logits/chosen": 500356608.0, + "logits/rejected": 475902156.8, + "logps/chosen": -504.5591634114583, + "logps/rejected": -641.55791015625, + "loss": 0.01, + "rewards/chosen": 4.042517026265462, + "rewards/margins": 14.559634335835774, + "rewards/rejected": -10.517117309570313, + "step": 7832 + }, + { + "epoch": 0.7156692553677478, + "grad_norm": 1.8203125, + "kl": 0.0, + "learning_rate": 1.8730854546178011e-06, + "logits/chosen": 506439232.0, + "logits/rejected": 412748800.0, + "logps/chosen": -272.87017822265625, + "logps/rejected": -254.44631958007812, + "loss": 0.1188, + "rewards/chosen": 4.156741619110107, + "rewards/margins": 8.769335269927979, + "rewards/rejected": -4.612593650817871, + "step": 7833 + }, + { + "epoch": 0.7157606212882595, + "grad_norm": 4.84375, + "kl": 8.972616195678711, + "learning_rate": 1.871963642786922e-06, + "logits/chosen": 330551954.28571427, + "logits/rejected": 517773376.0, + "logps/chosen": -339.4315708705357, + "logps/rejected": -312.7301025390625, + "loss": 0.0277, + "rewards/chosen": 4.676436288016183, + "rewards/margins": 12.032500130789622, + "rewards/rejected": -7.3560638427734375, + "step": 7834 + }, + { + "epoch": 0.7158519872087711, + "grad_norm": 3.546875, + "kl": 0.0, + "learning_rate": 1.8708420896156676e-06, + "logits/chosen": 646613440.0, + "logits/rejected": 476313024.0, + "logps/chosen": -330.7798767089844, + "logps/rejected": -324.0285949707031, + "loss": 0.0205, + "rewards/chosen": 3.2798304557800293, + "rewards/margins": 11.55069875717163, + "rewards/rejected": -8.270868301391602, + "step": 7835 + }, + { + "epoch": 0.7159433531292828, + "grad_norm": 2.75, + "kl": 0.0, + "learning_rate": 1.8697207951967778e-06, + "logits/chosen": 493034496.0, + "logits/rejected": 556900608.0, + "logps/chosen": -370.98974609375, + "logps/rejected": -591.51513671875, + "loss": 0.0132, + "rewards/chosen": 4.190601348876953, + "rewards/margins": 15.19973373413086, + "rewards/rejected": -11.009132385253906, + "step": 7836 + }, + { + "epoch": 0.7160347190497944, + "grad_norm": 1.28125, + "kl": 0.0, + "learning_rate": 1.8685997596229778e-06, + "logits/chosen": 478571296.0, + "logits/rejected": 623507712.0, + "logps/chosen": -459.1351318359375, + "logps/rejected": -641.5851643880209, + "loss": 0.0047, + "rewards/chosen": 4.4925537109375, + "rewards/margins": 14.481889724731445, + "rewards/rejected": -9.989336013793945, + "step": 7837 + }, + { + "epoch": 0.716126084970306, + "grad_norm": 1.5703125, + "kl": 0.0, + "learning_rate": 1.8674789829869644e-06, + "logits/chosen": 842868428.8, + "logits/rejected": 503789994.6666667, + "logps/chosen": -544.089111328125, + "logps/rejected": -612.8338623046875, + "loss": 0.0102, + "rewards/chosen": 4.447724151611328, + "rewards/margins": 17.31510238647461, + "rewards/rejected": -12.867378234863281, + "step": 7838 + }, + { + "epoch": 0.7162174508908177, + "grad_norm": 1.609375, + "kl": 0.0, + "learning_rate": 1.866358465381416e-06, + "logits/chosen": 718501248.0, + "logits/rejected": 413291178.6666667, + "logps/chosen": -168.4173583984375, + "logps/rejected": -361.2386067708333, + "loss": 0.0088, + "rewards/chosen": 3.9259896278381348, + "rewards/margins": 12.189843972524008, + "rewards/rejected": -8.263854344685873, + "step": 7839 + }, + { + "epoch": 0.7163088168113294, + "grad_norm": 19.375, + "kl": 0.0, + "learning_rate": 1.8652382068989883e-06, + "logits/chosen": 453291264.0, + "logits/rejected": 533467852.8, + "logps/chosen": -254.1540730794271, + "logps/rejected": -581.4552734375, + "loss": 0.026, + "rewards/chosen": 2.9271043141682944, + "rewards/margins": 16.12077496846517, + "rewards/rejected": -13.193670654296875, + "step": 7840 + }, + { + "epoch": 0.716400182731841, + "grad_norm": 1.4140625, + "kl": 0.0, + "learning_rate": 1.864118207632315e-06, + "logits/chosen": 441600000.0, + "logits/rejected": 398380373.3333333, + "logps/chosen": -274.2371826171875, + "logps/rejected": -508.7432861328125, + "loss": 0.0097, + "rewards/chosen": 4.296315383911133, + "rewards/margins": 12.473227055867515, + "rewards/rejected": -8.17691167195638, + "step": 7841 + }, + { + "epoch": 0.7164915486523526, + "grad_norm": 2.890625, + "kl": 0.0, + "learning_rate": 1.8629984676740132e-06, + "logits/chosen": 651162794.6666666, + "logits/rejected": 961887436.8, + "logps/chosen": -363.60693359375, + "logps/rejected": -448.2298828125, + "loss": 0.0224, + "rewards/chosen": 2.8862489064534507, + "rewards/margins": 10.938151677449545, + "rewards/rejected": -8.051902770996094, + "step": 7842 + }, + { + "epoch": 0.7165829145728643, + "grad_norm": 18.875, + "kl": 0.0, + "learning_rate": 1.861878987116673e-06, + "logits/chosen": 362270272.0, + "logits/rejected": 563840768.0, + "logps/chosen": -468.2615661621094, + "logps/rejected": -332.3013916015625, + "loss": 0.0482, + "rewards/chosen": 3.352438449859619, + "rewards/margins": 9.978463967641193, + "rewards/rejected": -6.626025517781575, + "step": 7843 + }, + { + "epoch": 0.716674280493376, + "grad_norm": 1.96875, + "kl": 0.0, + "learning_rate": 1.8607597660528654e-06, + "logits/chosen": 525770720.0, + "logits/rejected": 571797824.0, + "logps/chosen": -334.02960205078125, + "logps/rejected": -413.5875549316406, + "loss": 0.0102, + "rewards/chosen": 4.1314287185668945, + "rewards/margins": 16.0537109375, + "rewards/rejected": -11.922282218933105, + "step": 7844 + }, + { + "epoch": 0.7167656464138876, + "grad_norm": 10.4375, + "kl": 0.0, + "learning_rate": 1.8596408045751374e-06, + "logits/chosen": 512343072.0, + "logits/rejected": 392117248.0, + "logps/chosen": -270.115966796875, + "logps/rejected": -387.8907063802083, + "loss": 0.0124, + "rewards/chosen": 3.985809326171875, + "rewards/margins": 12.022638956705729, + "rewards/rejected": -8.036829630533854, + "step": 7845 + }, + { + "epoch": 0.7168570123343992, + "grad_norm": 1.578125, + "kl": 0.0, + "learning_rate": 1.8585221027760209e-06, + "logits/chosen": 297053952.0, + "logits/rejected": 274795680.0, + "logps/chosen": -364.9736328125, + "logps/rejected": -487.48626708984375, + "loss": 0.0087, + "rewards/chosen": 4.522480010986328, + "rewards/margins": 13.544332504272461, + "rewards/rejected": -9.021852493286133, + "step": 7846 + }, + { + "epoch": 0.7169483782549109, + "grad_norm": 4.65625, + "kl": 0.0, + "learning_rate": 1.85740366074802e-06, + "logits/chosen": 626190284.8, + "logits/rejected": 759458474.6666666, + "logps/chosen": -404.843212890625, + "logps/rejected": -464.5614420572917, + "loss": 0.0355, + "rewards/chosen": 2.954234313964844, + "rewards/margins": 11.267483901977538, + "rewards/rejected": -8.313249588012695, + "step": 7847 + }, + { + "epoch": 0.7170397441754226, + "grad_norm": 4.5, + "kl": 0.0, + "learning_rate": 1.8562854785836192e-06, + "logits/chosen": 630146918.4, + "logits/rejected": 447840128.0, + "logps/chosen": -242.342431640625, + "logps/rejected": -445.587890625, + "loss": 0.0358, + "rewards/chosen": 3.2812530517578127, + "rewards/margins": 12.065212122599284, + "rewards/rejected": -8.78395907084147, + "step": 7848 + }, + { + "epoch": 0.7171311100959342, + "grad_norm": 2.125, + "kl": 0.0, + "learning_rate": 1.8551675563752808e-06, + "logits/chosen": 572888362.6666666, + "logits/rejected": 673414016.0, + "logps/chosen": -319.80641682942706, + "logps/rejected": -542.660888671875, + "loss": 0.0165, + "rewards/chosen": 4.396648406982422, + "rewards/margins": 16.536855697631836, + "rewards/rejected": -12.140207290649414, + "step": 7849 + }, + { + "epoch": 0.7172224760164458, + "grad_norm": 2.65625, + "kl": 0.0, + "learning_rate": 1.8540498942154495e-06, + "logits/chosen": 644835008.0, + "logits/rejected": 751097984.0, + "logps/chosen": -341.542236328125, + "logps/rejected": -820.9435424804688, + "loss": 0.0199, + "rewards/chosen": 3.768211603164673, + "rewards/margins": 15.697473764419556, + "rewards/rejected": -11.929262161254883, + "step": 7850 + }, + { + "epoch": 0.7173138419369575, + "grad_norm": 4.34375, + "kl": 0.0, + "learning_rate": 1.852932492196544e-06, + "logits/chosen": 468480512.0, + "logits/rejected": 651986240.0, + "logps/chosen": -436.5517883300781, + "logps/rejected": -746.146240234375, + "loss": 0.0258, + "rewards/chosen": 3.5950212478637695, + "rewards/margins": 16.0269193649292, + "rewards/rejected": -12.43189811706543, + "step": 7851 + }, + { + "epoch": 0.7174052078574692, + "grad_norm": 3.1875, + "kl": 0.0, + "learning_rate": 1.8518153504109636e-06, + "logits/chosen": 553651584.0, + "logits/rejected": 510055008.0, + "logps/chosen": -375.45513916015625, + "logps/rejected": -369.41461181640625, + "loss": 0.0192, + "rewards/chosen": 3.993800640106201, + "rewards/margins": 12.620587825775146, + "rewards/rejected": -8.626787185668945, + "step": 7852 + }, + { + "epoch": 0.7174965737779808, + "grad_norm": 4.78125, + "kl": 0.0, + "learning_rate": 1.8506984689510849e-06, + "logits/chosen": 486989494.85714287, + "logits/rejected": 241451536.0, + "logps/chosen": -343.33900669642856, + "logps/rejected": -600.8912353515625, + "loss": 0.0489, + "rewards/chosen": 2.9018096923828125, + "rewards/margins": 19.691757202148438, + "rewards/rejected": -16.789947509765625, + "step": 7853 + }, + { + "epoch": 0.7175879396984924, + "grad_norm": 2.59375, + "kl": 0.0, + "learning_rate": 1.8495818479092636e-06, + "logits/chosen": 495066154.6666667, + "logits/rejected": 842642944.0, + "logps/chosen": -277.426025390625, + "logps/rejected": -618.1187744140625, + "loss": 0.0145, + "rewards/chosen": 4.513143221537272, + "rewards/margins": 14.239153544108074, + "rewards/rejected": -9.7260103225708, + "step": 7854 + }, + { + "epoch": 0.7176793056190041, + "grad_norm": 2.890625, + "kl": 0.0, + "learning_rate": 1.8484654873778345e-06, + "logits/chosen": 279379891.2, + "logits/rejected": 348576192.0, + "logps/chosen": -340.984423828125, + "logps/rejected": -660.8277994791666, + "loss": 0.0144, + "rewards/chosen": 4.351778030395508, + "rewards/margins": 17.931506474812828, + "rewards/rejected": -13.579728444417318, + "step": 7855 + }, + { + "epoch": 0.7177706715395158, + "grad_norm": 1.7578125, + "kl": 0.0, + "learning_rate": 1.84734938744911e-06, + "logits/chosen": 1104168789.3333333, + "logits/rejected": 521479628.8, + "logps/chosen": -368.5619710286458, + "logps/rejected": -589.40419921875, + "loss": 0.0103, + "rewards/chosen": 3.7816263834635415, + "rewards/margins": 14.644246927897134, + "rewards/rejected": -10.862620544433593, + "step": 7856 + }, + { + "epoch": 0.7178620374600274, + "grad_norm": 3.46875, + "kl": 0.0, + "learning_rate": 1.8462335482153792e-06, + "logits/chosen": 402522880.0, + "logits/rejected": 225756448.0, + "logps/chosen": -301.29718017578125, + "logps/rejected": -439.0836181640625, + "loss": 0.0244, + "rewards/chosen": 3.8555053075154624, + "rewards/margins": 12.884781201680502, + "rewards/rejected": -9.029275894165039, + "step": 7857 + }, + { + "epoch": 0.717953403380539, + "grad_norm": 1.515625, + "kl": 0.0, + "learning_rate": 1.845117969768916e-06, + "logits/chosen": 400488490.6666667, + "logits/rejected": 196345280.0, + "logps/chosen": -266.1123453776042, + "logps/rejected": -401.7892150878906, + "loss": 0.0124, + "rewards/chosen": 4.393564224243164, + "rewards/margins": 16.5703182220459, + "rewards/rejected": -12.176753997802734, + "step": 7858 + }, + { + "epoch": 0.7180447693010507, + "grad_norm": 25.75, + "kl": 0.0, + "learning_rate": 1.8440026522019654e-06, + "logits/chosen": 570358592.0, + "logits/rejected": 502008128.0, + "logps/chosen": -352.39068603515625, + "logps/rejected": -249.6474609375, + "loss": 0.0595, + "rewards/chosen": 2.6374154090881348, + "rewards/margins": 8.875439167022705, + "rewards/rejected": -6.23802375793457, + "step": 7859 + }, + { + "epoch": 0.7181361352215624, + "grad_norm": 2.453125, + "kl": 0.0, + "learning_rate": 1.8428875956067549e-06, + "logits/chosen": 161284000.0, + "logits/rejected": 500216992.0, + "logps/chosen": -226.69094848632812, + "logps/rejected": -702.311767578125, + "loss": 0.0123, + "rewards/chosen": 4.202269554138184, + "rewards/margins": 14.481138229370117, + "rewards/rejected": -10.278868675231934, + "step": 7860 + }, + { + "epoch": 0.718227501142074, + "grad_norm": 4.5, + "kl": 10.6517333984375, + "learning_rate": 1.8417728000754887e-06, + "logits/chosen": 503968032.0, + "logps/chosen": -230.39295959472656, + "loss": 0.0446, + "rewards/chosen": 4.452118873596191, + "step": 7861 + }, + { + "epoch": 0.7183188670625856, + "grad_norm": 4.5625, + "kl": 0.0, + "learning_rate": 1.8406582657003486e-06, + "logits/chosen": 512858368.0, + "logits/rejected": 868690688.0, + "logps/chosen": -215.70266723632812, + "logps/rejected": -795.131103515625, + "loss": 0.0297, + "rewards/chosen": 2.8865914344787598, + "rewards/margins": 14.235527515411377, + "rewards/rejected": -11.348936080932617, + "step": 7862 + }, + { + "epoch": 0.7184102329830973, + "grad_norm": 6.03125, + "kl": 0.0, + "learning_rate": 1.8395439925734998e-06, + "logits/chosen": 675200554.6666666, + "logits/rejected": 474144160.0, + "logps/chosen": -266.8277994791667, + "logps/rejected": -410.72821044921875, + "loss": 0.0509, + "rewards/chosen": 2.959133783976237, + "rewards/margins": 10.206363360087076, + "rewards/rejected": -7.24722957611084, + "step": 7863 + }, + { + "epoch": 0.718501598903609, + "grad_norm": 2.6875, + "kl": 0.0, + "learning_rate": 1.838429980787081e-06, + "logits/chosen": 603535286.8571428, + "logits/rejected": 506640960.0, + "logps/chosen": -363.15401785714283, + "logps/rejected": -431.53466796875, + "loss": 0.0226, + "rewards/chosen": 3.8388470241001675, + "rewards/margins": 13.685554368155344, + "rewards/rejected": -9.846707344055176, + "step": 7864 + }, + { + "epoch": 0.7185929648241206, + "grad_norm": 2.1875, + "kl": 0.0, + "learning_rate": 1.8373162304332093e-06, + "logits/chosen": 592348224.0, + "logits/rejected": 520148128.0, + "logps/chosen": -304.9580993652344, + "logps/rejected": -512.1243286132812, + "loss": 0.0177, + "rewards/chosen": 3.600389242172241, + "rewards/margins": 13.984522581100464, + "rewards/rejected": -10.384133338928223, + "step": 7865 + }, + { + "epoch": 0.7186843307446322, + "grad_norm": 6.5625, + "kl": 0.0, + "learning_rate": 1.8362027416039813e-06, + "logits/chosen": 516416938.6666667, + "logits/rejected": 451283046.4, + "logps/chosen": -398.0456949869792, + "logps/rejected": -504.97236328125, + "loss": 0.0329, + "rewards/chosen": 3.0884520212809243, + "rewards/margins": 13.361335627237954, + "rewards/rejected": -10.27288360595703, + "step": 7866 + }, + { + "epoch": 0.7187756966651438, + "grad_norm": 50.75, + "kl": 0.0, + "learning_rate": 1.8350895143914748e-06, + "logits/chosen": 338918528.0, + "logits/rejected": 446156288.0, + "logps/chosen": -196.20111083984375, + "logps/rejected": -588.091015625, + "loss": 0.0599, + "rewards/chosen": 2.3483352661132812, + "rewards/margins": 12.648458862304688, + "rewards/rejected": -10.300123596191407, + "step": 7867 + }, + { + "epoch": 0.7188670625856556, + "grad_norm": 2.5, + "kl": 0.0, + "learning_rate": 1.833976548887742e-06, + "logits/chosen": 720497066.6666666, + "logits/rejected": 435715481.6, + "logps/chosen": -353.822265625, + "logps/rejected": -491.478955078125, + "loss": 0.0106, + "rewards/chosen": 4.1813615163167315, + "rewards/margins": 13.890377934773763, + "rewards/rejected": -9.709016418457031, + "step": 7868 + }, + { + "epoch": 0.7189584285061672, + "grad_norm": 2.9375, + "kl": 0.0, + "learning_rate": 1.8328638451848146e-06, + "logits/chosen": 639748044.8, + "logits/rejected": 482000682.6666667, + "logps/chosen": -347.34970703125, + "logps/rejected": -410.6468098958333, + "loss": 0.0312, + "rewards/chosen": 3.3975154876708986, + "rewards/margins": 12.467529805501304, + "rewards/rejected": -9.070014317830404, + "step": 7869 + }, + { + "epoch": 0.7190497944266788, + "grad_norm": 1.1171875, + "kl": 0.0, + "learning_rate": 1.831751403374702e-06, + "logits/chosen": 639389610.6666666, + "logits/rejected": 675174246.4, + "logps/chosen": -359.9873453776042, + "logps/rejected": -358.025244140625, + "loss": 0.005, + "rewards/chosen": 4.599242210388184, + "rewards/margins": 12.945524406433105, + "rewards/rejected": -8.346282196044921, + "step": 7870 + }, + { + "epoch": 0.7191411603471904, + "grad_norm": 1.7890625, + "kl": 0.0, + "learning_rate": 1.8306392235493946e-06, + "logits/chosen": 516590656.0, + "logits/rejected": 686946048.0, + "logps/chosen": -297.8150634765625, + "logps/rejected": -831.6761474609375, + "loss": 0.0106, + "rewards/chosen": 4.357434272766113, + "rewards/margins": 16.686327934265137, + "rewards/rejected": -12.328893661499023, + "step": 7871 + }, + { + "epoch": 0.7192325262677022, + "grad_norm": 0.97265625, + "kl": 0.0, + "learning_rate": 1.8295273058008596e-06, + "logits/chosen": 446454528.0, + "logits/rejected": 709139353.6, + "logps/chosen": -236.09501139322916, + "logps/rejected": -593.87861328125, + "loss": 0.0087, + "rewards/chosen": 4.081258773803711, + "rewards/margins": 12.978200912475586, + "rewards/rejected": -8.896942138671875, + "step": 7872 + }, + { + "epoch": 0.7193238921882138, + "grad_norm": 1.859375, + "kl": 0.0, + "learning_rate": 1.8284156502210404e-06, + "logits/chosen": 460743424.0, + "logits/rejected": 677742336.0, + "logps/chosen": -267.23052978515625, + "logps/rejected": -423.2450256347656, + "loss": 0.0112, + "rewards/chosen": 4.549400329589844, + "rewards/margins": 13.47526741027832, + "rewards/rejected": -8.925867080688477, + "step": 7873 + }, + { + "epoch": 0.7194152581087254, + "grad_norm": 1.609375, + "kl": 0.0, + "learning_rate": 1.8273042569018623e-06, + "logits/chosen": 823107754.6666666, + "logits/rejected": 763887513.6, + "logps/chosen": -363.9217936197917, + "logps/rejected": -612.774658203125, + "loss": 0.0078, + "rewards/chosen": 4.426025390625, + "rewards/margins": 13.99590835571289, + "rewards/rejected": -9.56988296508789, + "step": 7874 + }, + { + "epoch": 0.719506624029237, + "grad_norm": 2.375, + "kl": 0.0, + "learning_rate": 1.8261931259352238e-06, + "logits/chosen": 561950003.2, + "logits/rejected": 432543658.6666667, + "logps/chosen": -220.2720947265625, + "logps/rejected": -318.62013753255206, + "loss": 0.0193, + "rewards/chosen": 3.666304016113281, + "rewards/margins": 12.262328211466471, + "rewards/rejected": -8.59602419535319, + "step": 7875 + }, + { + "epoch": 0.7195979899497488, + "grad_norm": 2.59375, + "kl": 0.0, + "learning_rate": 1.8250822574130105e-06, + "logits/chosen": 506720544.0, + "logits/rejected": 441922528.0, + "logps/chosen": -261.92822265625, + "logps/rejected": -489.86077880859375, + "loss": 0.0195, + "rewards/chosen": 3.269341468811035, + "rewards/margins": 12.81290054321289, + "rewards/rejected": -9.543559074401855, + "step": 7876 + }, + { + "epoch": 0.7196893558702604, + "grad_norm": 2.546875, + "kl": 0.0, + "learning_rate": 1.8239716514270778e-06, + "logits/chosen": 626924096.0, + "logits/rejected": 507113152.0, + "logps/chosen": -415.7210693359375, + "logps/rejected": -285.0234375, + "loss": 0.0117, + "rewards/chosen": 4.398467063903809, + "rewards/margins": 11.281422138214111, + "rewards/rejected": -6.882955074310303, + "step": 7877 + }, + { + "epoch": 0.719780721790772, + "grad_norm": 3.078125, + "kl": 0.0, + "learning_rate": 1.8228613080692631e-06, + "logits/chosen": 754585002.6666666, + "logits/rejected": 530896025.6, + "logps/chosen": -366.6224772135417, + "logps/rejected": -383.4811767578125, + "loss": 0.0177, + "rewards/chosen": 3.2837842305501304, + "rewards/margins": 11.568388112386069, + "rewards/rejected": -8.284603881835938, + "step": 7878 + }, + { + "epoch": 0.7198720877112836, + "grad_norm": 1.0625, + "kl": 0.0, + "learning_rate": 1.8217512274313798e-06, + "logits/chosen": 687052800.0, + "logits/rejected": 398715733.3333333, + "logps/chosen": -267.3314453125, + "logps/rejected": -388.9405517578125, + "loss": 0.0073, + "rewards/chosen": 4.625489807128906, + "rewards/margins": 14.765695190429687, + "rewards/rejected": -10.140205383300781, + "step": 7879 + }, + { + "epoch": 0.7199634536317954, + "grad_norm": 2.96875, + "kl": 0.0, + "learning_rate": 1.8206414096052245e-06, + "logits/chosen": 476086930.28571427, + "logits/rejected": 198572992.0, + "logps/chosen": -291.4735630580357, + "logps/rejected": -132.38357543945312, + "loss": 0.0266, + "rewards/chosen": 3.897132328578404, + "rewards/margins": 9.286087921687535, + "rewards/rejected": -5.388955593109131, + "step": 7880 + }, + { + "epoch": 0.720054819552307, + "grad_norm": 3.671875, + "kl": 0.0, + "learning_rate": 1.8195318546825686e-06, + "logits/chosen": 445427200.0, + "logps/chosen": -296.2521667480469, + "loss": 0.0344, + "rewards/chosen": 3.925588846206665, + "step": 7881 + }, + { + "epoch": 0.7201461854728186, + "grad_norm": 5.1875, + "kl": 0.0, + "learning_rate": 1.8184225627551594e-06, + "logits/chosen": 682212096.0, + "logits/rejected": 719852117.3333334, + "logps/chosen": -418.7571105957031, + "logps/rejected": -483.6240234375, + "loss": 0.035, + "rewards/chosen": 3.0375473499298096, + "rewards/margins": 11.329835812250773, + "rewards/rejected": -8.292288462320963, + "step": 7882 + }, + { + "epoch": 0.7202375513933302, + "grad_norm": 3.28125, + "kl": 0.0, + "learning_rate": 1.817313533914724e-06, + "logits/chosen": 517748906.6666667, + "logits/rejected": 849240832.0, + "logps/chosen": -159.5170694986979, + "logps/rejected": -404.448193359375, + "loss": 0.0181, + "rewards/chosen": 4.236517588297526, + "rewards/margins": 11.831194559733074, + "rewards/rejected": -7.594676971435547, + "step": 7883 + }, + { + "epoch": 0.720328917313842, + "grad_norm": 2.828125, + "kl": 0.0, + "learning_rate": 1.8162047682529732e-06, + "logits/chosen": 590526054.4, + "logits/rejected": 355141717.3333333, + "logps/chosen": -335.649560546875, + "logps/rejected": -408.6736246744792, + "loss": 0.0217, + "rewards/chosen": 3.50279541015625, + "rewards/margins": 12.301633707682292, + "rewards/rejected": -8.798838297526041, + "step": 7884 + }, + { + "epoch": 0.7204202832343536, + "grad_norm": 3.203125, + "kl": 1.8048057556152344, + "learning_rate": 1.8150962658615895e-06, + "logits/chosen": 944903253.3333334, + "logits/rejected": 330435904.0, + "logps/chosen": -433.125244140625, + "logps/rejected": -168.6108856201172, + "loss": 0.0256, + "rewards/chosen": 3.9333553314208984, + "rewards/margins": 10.717438220977783, + "rewards/rejected": -6.784082889556885, + "step": 7885 + }, + { + "epoch": 0.7205116491548652, + "grad_norm": 2.671875, + "kl": 0.0, + "learning_rate": 1.8139880268322358e-06, + "logits/chosen": 714738474.6666666, + "logits/rejected": 822810265.6, + "logps/chosen": -283.9927571614583, + "logps/rejected": -546.005126953125, + "loss": 0.0175, + "rewards/chosen": 3.0930614471435547, + "rewards/margins": 12.10128059387207, + "rewards/rejected": -9.008219146728516, + "step": 7886 + }, + { + "epoch": 0.7206030150753768, + "grad_norm": 14.875, + "kl": 0.0, + "learning_rate": 1.8128800512565514e-06, + "logits/chosen": 1262503765.3333333, + "logits/rejected": 597838950.4, + "logps/chosen": -401.9628092447917, + "logps/rejected": -380.4390625, + "loss": 0.0621, + "rewards/chosen": 3.928863525390625, + "rewards/margins": 11.112753295898438, + "rewards/rejected": -7.183889770507813, + "step": 7887 + }, + { + "epoch": 0.7206943809958886, + "grad_norm": 1.2421875, + "kl": 0.0, + "learning_rate": 1.8117723392261594e-06, + "logits/chosen": 498972928.0, + "logits/rejected": 388240224.0, + "logps/chosen": -300.8134765625, + "logps/rejected": -454.996337890625, + "loss": 0.0076, + "rewards/chosen": 4.375319957733154, + "rewards/margins": 14.562958240509033, + "rewards/rejected": -10.187638282775879, + "step": 7888 + }, + { + "epoch": 0.7207857469164002, + "grad_norm": 44.75, + "kl": 0.0, + "learning_rate": 1.8106648908326552e-06, + "logits/chosen": 729279658.6666666, + "logits/rejected": 469469900.8, + "logps/chosen": -195.84566243489584, + "logps/rejected": -477.27578125, + "loss": 0.0525, + "rewards/chosen": 2.8447265625, + "rewards/margins": 9.963597106933594, + "rewards/rejected": -7.118870544433594, + "step": 7889 + }, + { + "epoch": 0.7208771128369118, + "grad_norm": 3.25, + "kl": 0.0, + "learning_rate": 1.809557706167615e-06, + "logits/chosen": 368353792.0, + "logits/rejected": 420568192.0, + "logps/chosen": -380.90155029296875, + "logps/rejected": -489.9551188151042, + "loss": 0.014, + "rewards/chosen": 3.809497356414795, + "rewards/margins": 13.005888144175211, + "rewards/rejected": -9.196390787760416, + "step": 7890 + }, + { + "epoch": 0.7209684787574235, + "grad_norm": 1.640625, + "kl": 0.0, + "learning_rate": 1.8084507853225902e-06, + "logits/chosen": 505157376.0, + "logits/rejected": 936007040.0, + "logps/chosen": -334.06121826171875, + "logps/rejected": -665.1651611328125, + "loss": 0.0107, + "rewards/chosen": 4.252828598022461, + "rewards/margins": 14.211901664733887, + "rewards/rejected": -9.959073066711426, + "step": 7891 + }, + { + "epoch": 0.7210598446779352, + "grad_norm": 5.875, + "kl": 0.0, + "learning_rate": 1.8073441283891168e-06, + "logits/chosen": 551001685.3333334, + "logits/rejected": 661804032.0, + "logps/chosen": -276.4792073567708, + "logps/rejected": -466.80328369140625, + "loss": 0.0412, + "rewards/chosen": 3.353780746459961, + "rewards/margins": 15.724071502685547, + "rewards/rejected": -12.370290756225586, + "step": 7892 + }, + { + "epoch": 0.7211512105984468, + "grad_norm": 2.90625, + "kl": 0.0, + "learning_rate": 1.806237735458704e-06, + "logits/chosen": 556234880.0, + "logits/rejected": 961866752.0, + "logps/chosen": -358.535400390625, + "logps/rejected": -577.5392456054688, + "loss": 0.0159, + "rewards/chosen": 4.606077194213867, + "rewards/margins": 14.835418701171875, + "rewards/rejected": -10.229341506958008, + "step": 7893 + }, + { + "epoch": 0.7212425765189584, + "grad_norm": 7.5, + "kl": 0.0, + "learning_rate": 1.8051316066228392e-06, + "logits/chosen": 638657920.0, + "logits/rejected": 383889042.28571427, + "logps/chosen": -464.1492004394531, + "logps/rejected": -382.30140904017856, + "loss": 0.0141, + "rewards/chosen": 5.011727809906006, + "rewards/margins": 12.505716936928884, + "rewards/rejected": -7.493989127022879, + "step": 7894 + }, + { + "epoch": 0.7213339424394701, + "grad_norm": 2.578125, + "kl": 0.0, + "learning_rate": 1.804025741972989e-06, + "logits/chosen": 344814378.6666667, + "logits/rejected": 562185676.8, + "logps/chosen": -246.990234375, + "logps/rejected": -583.61953125, + "loss": 0.0199, + "rewards/chosen": 3.8656625747680664, + "rewards/margins": 12.817077827453613, + "rewards/rejected": -8.951415252685546, + "step": 7895 + }, + { + "epoch": 0.7214253083599818, + "grad_norm": 3.84375, + "kl": 0.0, + "learning_rate": 1.8029201416005976e-06, + "logits/chosen": 547366400.0, + "logits/rejected": 410983040.0, + "logps/chosen": -325.0645751953125, + "logps/rejected": -274.5289611816406, + "loss": 0.0336, + "rewards/chosen": 2.9472084045410156, + "rewards/margins": 10.381508827209473, + "rewards/rejected": -7.434300422668457, + "step": 7896 + }, + { + "epoch": 0.7215166742804934, + "grad_norm": 1.2109375, + "kl": 0.0, + "learning_rate": 1.8018148055970901e-06, + "logits/chosen": 426881024.0, + "logits/rejected": 480335200.0, + "logps/chosen": -318.8446044921875, + "logps/rejected": -520.8509521484375, + "loss": 0.0072, + "rewards/chosen": 4.463982582092285, + "rewards/margins": 13.384284019470215, + "rewards/rejected": -8.92030143737793, + "step": 7897 + }, + { + "epoch": 0.721608040201005, + "grad_norm": 3.125, + "kl": 0.0, + "learning_rate": 1.8007097340538666e-06, + "logits/chosen": 873545728.0, + "logits/rejected": 886899008.0, + "logps/chosen": -515.5391845703125, + "logps/rejected": -456.4392395019531, + "loss": 0.0177, + "rewards/chosen": 3.5484962463378906, + "rewards/margins": 11.720940589904785, + "rewards/rejected": -8.172444343566895, + "step": 7898 + }, + { + "epoch": 0.7216994061215167, + "grad_norm": 3.1875, + "kl": 0.0, + "learning_rate": 1.7996049270623061e-06, + "logits/chosen": 404337536.0, + "logits/rejected": 419995712.0, + "logps/chosen": -410.6206359863281, + "logps/rejected": -575.774169921875, + "loss": 0.0192, + "rewards/chosen": 3.4700868129730225, + "rewards/margins": 15.916491270065308, + "rewards/rejected": -12.446404457092285, + "step": 7899 + }, + { + "epoch": 0.7217907720420284, + "grad_norm": 3.71875, + "kl": 0.0, + "learning_rate": 1.7985003847137639e-06, + "logits/chosen": 436081203.2, + "logits/rejected": 263976917.33333334, + "logps/chosen": -138.00567626953125, + "logps/rejected": -563.5011393229166, + "loss": 0.0434, + "rewards/chosen": 3.1149723052978517, + "rewards/margins": 14.746663284301757, + "rewards/rejected": -11.631690979003906, + "step": 7900 + }, + { + "epoch": 0.72188213796254, + "grad_norm": 1.65625, + "kl": 0.0, + "learning_rate": 1.7973961070995788e-06, + "logits/chosen": 545092437.3333334, + "logits/rejected": 276098432.0, + "logps/chosen": -435.2314046223958, + "logps/rejected": -381.0332763671875, + "loss": 0.0067, + "rewards/chosen": 4.234132130940755, + "rewards/margins": 13.794814809163412, + "rewards/rejected": -9.560682678222657, + "step": 7901 + }, + { + "epoch": 0.7219735038830516, + "grad_norm": 2.0, + "kl": 0.0, + "learning_rate": 1.7962920943110635e-06, + "logits/chosen": 590305945.6, + "logits/rejected": 502902869.3333333, + "logps/chosen": -435.95791015625, + "logps/rejected": -560.9993896484375, + "loss": 0.0146, + "rewards/chosen": 3.8066524505615233, + "rewards/margins": 14.07936642964681, + "rewards/rejected": -10.272713979085287, + "step": 7902 + }, + { + "epoch": 0.7220648698035633, + "grad_norm": 1.515625, + "kl": 0.0, + "learning_rate": 1.795188346439508e-06, + "logits/chosen": 442934912.0, + "logits/rejected": 570398592.0, + "logps/chosen": -245.9105987548828, + "logps/rejected": -517.6781005859375, + "loss": 0.006, + "rewards/chosen": 4.478418827056885, + "rewards/margins": 13.820155302683512, + "rewards/rejected": -9.341736475626627, + "step": 7903 + }, + { + "epoch": 0.722156235724075, + "grad_norm": 1.53125, + "kl": 0.0, + "learning_rate": 1.7940848635761837e-06, + "logits/chosen": 719024896.0, + "logits/rejected": 1084493568.0, + "logps/chosen": -362.8031311035156, + "logps/rejected": -314.44439697265625, + "loss": 0.0083, + "rewards/chosen": 4.477066516876221, + "rewards/margins": 13.213741779327393, + "rewards/rejected": -8.736675262451172, + "step": 7904 + }, + { + "epoch": 0.7222476016445866, + "grad_norm": 5.5625, + "kl": 0.0, + "learning_rate": 1.7929816458123362e-06, + "logits/chosen": 446355763.2, + "logits/rejected": 686184362.6666666, + "logps/chosen": -280.250146484375, + "logps/rejected": -713.771728515625, + "loss": 0.0248, + "rewards/chosen": 4.5744483947753904, + "rewards/margins": 16.235648600260415, + "rewards/rejected": -11.661200205485025, + "step": 7905 + }, + { + "epoch": 0.7223389675650982, + "grad_norm": 3.25, + "kl": 0.0, + "learning_rate": 1.7918786932391945e-06, + "logits/chosen": 453368320.0, + "logits/rejected": 380788736.0, + "logps/chosen": -352.6952718098958, + "logps/rejected": -373.174609375, + "loss": 0.0239, + "rewards/chosen": 3.000059127807617, + "rewards/margins": 11.544375991821289, + "rewards/rejected": -8.544316864013672, + "step": 7906 + }, + { + "epoch": 0.7224303334856099, + "grad_norm": 3.125, + "kl": 0.0, + "learning_rate": 1.7907760059479624e-06, + "logits/chosen": 669313792.0, + "logits/rejected": 565012352.0, + "logps/chosen": -400.9473470052083, + "logps/rejected": -659.1497192382812, + "loss": 0.0263, + "rewards/chosen": 3.845611572265625, + "rewards/margins": 15.552026748657227, + "rewards/rejected": -11.706415176391602, + "step": 7907 + }, + { + "epoch": 0.7225216994061215, + "grad_norm": 3.640625, + "kl": 0.0, + "learning_rate": 1.7896735840298173e-06, + "logits/chosen": 616539562.6666666, + "logits/rejected": 546512448.0, + "logps/chosen": -302.26458740234375, + "logps/rejected": -379.0428771972656, + "loss": 0.0285, + "rewards/chosen": 3.4133822123209634, + "rewards/margins": 11.090662638346354, + "rewards/rejected": -7.677280426025391, + "step": 7908 + }, + { + "epoch": 0.7226130653266332, + "grad_norm": 6.9375, + "kl": 0.0, + "learning_rate": 1.788571427575924e-06, + "logits/chosen": 453719040.0, + "logits/rejected": 894473600.0, + "logps/chosen": -499.4967447916667, + "logps/rejected": -408.25665283203125, + "loss": 0.0344, + "rewards/chosen": 3.6402365366617837, + "rewards/margins": 12.46243158976237, + "rewards/rejected": -8.822195053100586, + "step": 7909 + }, + { + "epoch": 0.7227044312471448, + "grad_norm": 4.09375, + "kl": 0.0, + "learning_rate": 1.7874695366774191e-06, + "logits/chosen": 969221546.6666666, + "logits/rejected": 270249574.4, + "logps/chosen": -187.4584757486979, + "logps/rejected": -450.83515625, + "loss": 0.0152, + "rewards/chosen": 3.781007766723633, + "rewards/margins": 11.665214920043946, + "rewards/rejected": -7.884207153320313, + "step": 7910 + }, + { + "epoch": 0.7227957971676565, + "grad_norm": 2.390625, + "kl": 0.0, + "learning_rate": 1.786367911425419e-06, + "logits/chosen": 428715904.0, + "logits/rejected": 545050048.0, + "logps/chosen": -236.2305908203125, + "logps/rejected": -529.484375, + "loss": 0.0242, + "rewards/chosen": 3.6222203572591147, + "rewards/margins": 13.084172566731771, + "rewards/rejected": -9.461952209472656, + "step": 7911 + }, + { + "epoch": 0.7228871630881681, + "grad_norm": 3.21875, + "kl": 0.0, + "learning_rate": 1.7852665519110157e-06, + "logits/chosen": 744257792.0, + "logits/rejected": 1868607488.0, + "logps/chosen": -375.3556315104167, + "logps/rejected": -713.40966796875, + "loss": 0.022, + "rewards/chosen": 3.9129155476888022, + "rewards/margins": 12.403793652852377, + "rewards/rejected": -8.490878105163574, + "step": 7912 + }, + { + "epoch": 0.7229785290086798, + "grad_norm": 65.5, + "kl": 0.0, + "learning_rate": 1.7841654582252848e-06, + "logits/chosen": 285186858.6666667, + "logits/rejected": 427418905.6, + "logps/chosen": -512.251708984375, + "logps/rejected": -546.4494140625, + "loss": 0.0621, + "rewards/chosen": 3.640554428100586, + "rewards/margins": 13.456808090209961, + "rewards/rejected": -9.816253662109375, + "step": 7913 + }, + { + "epoch": 0.7230698949291914, + "grad_norm": 1.1328125, + "kl": 0.0, + "learning_rate": 1.7830646304592758e-06, + "logits/chosen": 577072896.0, + "logits/rejected": 639051724.8, + "logps/chosen": -318.34055582682294, + "logps/rejected": -519.342822265625, + "loss": 0.0061, + "rewards/chosen": 4.293300946553548, + "rewards/margins": 14.182561047871907, + "rewards/rejected": -9.88926010131836, + "step": 7914 + }, + { + "epoch": 0.7231612608497031, + "grad_norm": 3.546875, + "kl": 0.0, + "learning_rate": 1.7819640687040158e-06, + "logits/chosen": 291817600.0, + "logits/rejected": 452881322.6666667, + "logps/chosen": -299.1338806152344, + "logps/rejected": -402.80126953125, + "loss": 0.0055, + "rewards/chosen": 4.66756534576416, + "rewards/margins": 12.247513771057129, + "rewards/rejected": -7.579948425292969, + "step": 7915 + }, + { + "epoch": 0.7232526267702147, + "grad_norm": 1.7265625, + "kl": 0.0, + "learning_rate": 1.780863773050512e-06, + "logits/chosen": 691439786.6666666, + "logits/rejected": 445923993.6, + "logps/chosen": -435.0074869791667, + "logps/rejected": -601.8681640625, + "loss": 0.0081, + "rewards/chosen": 3.996354420979818, + "rewards/margins": 14.902068837483725, + "rewards/rejected": -10.905714416503907, + "step": 7916 + }, + { + "epoch": 0.7233439926907264, + "grad_norm": 2.78125, + "kl": 0.0, + "learning_rate": 1.7797637435897464e-06, + "logits/chosen": 439653152.0, + "logits/rejected": 412318112.0, + "logps/chosen": -248.52688598632812, + "logps/rejected": -466.8720703125, + "loss": 0.1265, + "rewards/chosen": 2.6836228370666504, + "rewards/margins": 11.237337589263916, + "rewards/rejected": -8.553714752197266, + "step": 7917 + }, + { + "epoch": 0.723435358611238, + "grad_norm": 53.75, + "kl": 0.0, + "learning_rate": 1.7786639804126842e-06, + "logits/chosen": 651086165.3333334, + "logits/rejected": 617356390.4, + "logps/chosen": -364.2862955729167, + "logps/rejected": -275.5662109375, + "loss": 0.0653, + "rewards/chosen": 3.4721832275390625, + "rewards/margins": 9.804714965820313, + "rewards/rejected": -6.33253173828125, + "step": 7918 + }, + { + "epoch": 0.7235267245317497, + "grad_norm": 4.5, + "kl": 0.0, + "learning_rate": 1.7775644836102652e-06, + "logits/chosen": 318927008.0, + "logits/rejected": 281591424.0, + "logps/chosen": -152.57891845703125, + "logps/rejected": -465.9569091796875, + "loss": 0.0222, + "rewards/chosen": 3.428718090057373, + "rewards/margins": 13.050818920135498, + "rewards/rejected": -9.622100830078125, + "step": 7919 + }, + { + "epoch": 0.7236180904522613, + "grad_norm": 1.6640625, + "kl": 0.0, + "learning_rate": 1.7764652532734067e-06, + "logits/chosen": 549861120.0, + "logits/rejected": 401293482.6666667, + "logps/chosen": -465.30670166015625, + "logps/rejected": -679.9825439453125, + "loss": 0.0076, + "rewards/chosen": 3.476130723953247, + "rewards/margins": 15.828405300776163, + "rewards/rejected": -12.352274576822916, + "step": 7920 + }, + { + "epoch": 0.723709456372773, + "grad_norm": 46.5, + "kl": 0.0, + "learning_rate": 1.775366289493003e-06, + "logits/chosen": 454772032.0, + "logits/rejected": 392523840.0, + "logps/chosen": -275.6033935546875, + "logps/rejected": -372.24896240234375, + "loss": 0.0642, + "rewards/chosen": 3.7874512672424316, + "rewards/margins": 11.711615085601807, + "rewards/rejected": -7.924163818359375, + "step": 7921 + }, + { + "epoch": 0.7238008222932846, + "grad_norm": 3.671875, + "kl": 0.0, + "learning_rate": 1.7742675923599323e-06, + "logits/chosen": 789829836.8, + "logits/rejected": 318216832.0, + "logps/chosen": -480.337890625, + "logps/rejected": -402.8148600260417, + "loss": 0.0121, + "rewards/chosen": 4.559566497802734, + "rewards/margins": 15.775657653808594, + "rewards/rejected": -11.21609115600586, + "step": 7922 + }, + { + "epoch": 0.7238921882137963, + "grad_norm": 2.0, + "kl": 0.0, + "learning_rate": 1.7731691619650444e-06, + "logits/chosen": 365434880.0, + "logits/rejected": 577351372.8, + "logps/chosen": -315.50685628255206, + "logps/rejected": -414.075390625, + "loss": 0.0128, + "rewards/chosen": 4.259497324625651, + "rewards/margins": 12.78384730021159, + "rewards/rejected": -8.524349975585938, + "step": 7923 + }, + { + "epoch": 0.7239835541343079, + "grad_norm": 2.46875, + "kl": 0.0, + "learning_rate": 1.7720709983991695e-06, + "logits/chosen": 360159146.6666667, + "logits/rejected": 523774976.0, + "logps/chosen": -239.58121744791666, + "logps/rejected": -757.8869140625, + "loss": 0.0162, + "rewards/chosen": 3.727764129638672, + "rewards/margins": 14.106774139404298, + "rewards/rejected": -10.379010009765626, + "step": 7924 + }, + { + "epoch": 0.7240749200548195, + "grad_norm": 3.640625, + "kl": 0.0, + "learning_rate": 1.7709731017531156e-06, + "logits/chosen": 689106944.0, + "logits/rejected": 394965376.0, + "logps/chosen": -313.4265625, + "logps/rejected": -810.6691080729166, + "loss": 0.0207, + "rewards/chosen": 3.9676979064941404, + "rewards/margins": 13.26064249674479, + "rewards/rejected": -9.29294459025065, + "step": 7925 + }, + { + "epoch": 0.7241662859753312, + "grad_norm": 2.265625, + "kl": 0.0, + "learning_rate": 1.7698754721176674e-06, + "logits/chosen": 626266624.0, + "logits/rejected": 387353216.0, + "logps/chosen": -362.7677408854167, + "logps/rejected": -456.246240234375, + "loss": 0.013, + "rewards/chosen": 3.6268768310546875, + "rewards/margins": 11.970504760742188, + "rewards/rejected": -8.3436279296875, + "step": 7926 + }, + { + "epoch": 0.7242576518958429, + "grad_norm": 5.1875, + "kl": 0.0, + "learning_rate": 1.7687781095835909e-06, + "logits/chosen": 604235072.0, + "logits/rejected": 586929749.3333334, + "logps/chosen": -683.832763671875, + "logps/rejected": -392.4607747395833, + "loss": 0.018, + "rewards/chosen": 3.097503662109375, + "rewards/margins": 11.027799606323242, + "rewards/rejected": -7.930295944213867, + "step": 7927 + }, + { + "epoch": 0.7243490178163545, + "grad_norm": 5.15625, + "kl": 0.0, + "learning_rate": 1.7676810142416274e-06, + "logits/chosen": 374411443.2, + "logits/rejected": 487910272.0, + "logps/chosen": -287.6, + "logps/rejected": -386.5499674479167, + "loss": 0.0226, + "rewards/chosen": 4.526030349731445, + "rewards/margins": 11.630416361490886, + "rewards/rejected": -7.10438601175944, + "step": 7928 + }, + { + "epoch": 0.7244403837368661, + "grad_norm": 6.34375, + "kl": 0.0, + "learning_rate": 1.7665841861824955e-06, + "logits/chosen": 534842336.0, + "logits/rejected": 402914240.0, + "logps/chosen": -277.868896484375, + "logps/rejected": -376.36944580078125, + "loss": 0.0311, + "rewards/chosen": 3.364058494567871, + "rewards/margins": 11.728903770446777, + "rewards/rejected": -8.364845275878906, + "step": 7929 + }, + { + "epoch": 0.7245317496573778, + "grad_norm": 1.578125, + "kl": 0.0, + "learning_rate": 1.7654876254968917e-06, + "logits/chosen": 341902688.0, + "logits/rejected": 436899498.6666667, + "logps/chosen": -230.1843719482422, + "logps/rejected": -674.1668294270834, + "loss": 0.0089, + "rewards/chosen": 3.3754653930664062, + "rewards/margins": 13.104560852050781, + "rewards/rejected": -9.729095458984375, + "step": 7930 + }, + { + "epoch": 0.7246231155778895, + "grad_norm": 2.296875, + "kl": 0.0, + "learning_rate": 1.7643913322754946e-06, + "logits/chosen": 345694752.0, + "logits/rejected": 389920576.0, + "logps/chosen": -199.08262634277344, + "logps/rejected": -437.1707763671875, + "loss": 0.0214, + "rewards/chosen": 3.290433883666992, + "rewards/margins": 13.083366394042969, + "rewards/rejected": -9.792932510375977, + "step": 7931 + }, + { + "epoch": 0.7247144814984011, + "grad_norm": 1.5078125, + "kl": 0.0, + "learning_rate": 1.763295306608956e-06, + "logits/chosen": 377289856.0, + "logits/rejected": 344200064.0, + "logps/chosen": -271.55214436848956, + "logps/rejected": -367.2814697265625, + "loss": 0.0102, + "rewards/chosen": 4.33837890625, + "rewards/margins": 13.189485168457031, + "rewards/rejected": -8.851106262207031, + "step": 7932 + }, + { + "epoch": 0.7248058474189127, + "grad_norm": 40.0, + "kl": 6.6249260902404785, + "learning_rate": 1.7621995485879062e-06, + "logits/chosen": 776659072.0, + "logps/chosen": -254.74658203125, + "loss": 0.1704, + "rewards/chosen": 2.944887161254883, + "step": 7933 + }, + { + "epoch": 0.7248972133394244, + "grad_norm": 3.015625, + "kl": 0.0, + "learning_rate": 1.7611040583029547e-06, + "logits/chosen": 521856256.0, + "logits/rejected": 580198297.6, + "logps/chosen": -248.72224934895834, + "logps/rejected": -663.974658203125, + "loss": 0.0217, + "rewards/chosen": 2.8366597493489585, + "rewards/margins": 11.948535664876303, + "rewards/rejected": -9.111875915527344, + "step": 7934 + }, + { + "epoch": 0.7249885792599361, + "grad_norm": 2.359375, + "kl": 0.0, + "learning_rate": 1.7600088358446893e-06, + "logits/chosen": 799157504.0, + "logits/rejected": 555829248.0, + "logps/chosen": -318.3414306640625, + "logps/rejected": -252.48602294921875, + "loss": 0.015, + "rewards/chosen": 4.583268165588379, + "rewards/margins": 12.507457733154297, + "rewards/rejected": -7.924189567565918, + "step": 7935 + }, + { + "epoch": 0.7250799451804477, + "grad_norm": 1.5625, + "kl": 0.0, + "learning_rate": 1.7589138813036732e-06, + "logits/chosen": 366471680.0, + "logits/rejected": 432171315.2, + "logps/chosen": -513.6331380208334, + "logps/rejected": -463.590283203125, + "loss": 0.0083, + "rewards/chosen": 3.8799091974894204, + "rewards/margins": 14.02169157663981, + "rewards/rejected": -10.14178237915039, + "step": 7936 + }, + { + "epoch": 0.7251713111009593, + "grad_norm": 2.21875, + "kl": 0.0, + "learning_rate": 1.7578191947704505e-06, + "logits/chosen": 433266278.4, + "logits/rejected": 268259114.66666666, + "logps/chosen": -340.891796875, + "logps/rejected": -345.085205078125, + "loss": 0.0144, + "rewards/chosen": 4.044014739990234, + "rewards/margins": 12.720416514078774, + "rewards/rejected": -8.676401774088541, + "step": 7937 + }, + { + "epoch": 0.725262677021471, + "grad_norm": 1.1328125, + "kl": 0.0, + "learning_rate": 1.756724776335539e-06, + "logits/chosen": 229389909.33333334, + "logits/rejected": 481605836.8, + "logps/chosen": -176.06229654947916, + "logps/rejected": -615.81767578125, + "loss": 0.0062, + "rewards/chosen": 4.437320073445638, + "rewards/margins": 15.608003362019858, + "rewards/rejected": -11.170683288574219, + "step": 7938 + }, + { + "epoch": 0.7253540429419827, + "grad_norm": 45.5, + "kl": 0.0, + "learning_rate": 1.7556306260894402e-06, + "logits/chosen": 440453056.0, + "logits/rejected": 421791445.3333333, + "logps/chosen": -345.5457458496094, + "logps/rejected": -488.2550862630208, + "loss": 0.0312, + "rewards/chosen": 3.019458055496216, + "rewards/margins": 11.985631227493286, + "rewards/rejected": -8.96617317199707, + "step": 7939 + }, + { + "epoch": 0.7254454088624943, + "grad_norm": 1.4921875, + "kl": 0.0, + "learning_rate": 1.7545367441226292e-06, + "logits/chosen": 974994346.6666666, + "logits/rejected": 1128711987.2, + "logps/chosen": -658.7509358723959, + "logps/rejected": -476.00634765625, + "loss": 0.0087, + "rewards/chosen": 3.9621480305989585, + "rewards/margins": 12.88659871419271, + "rewards/rejected": -8.92445068359375, + "step": 7940 + }, + { + "epoch": 0.7255367747830059, + "grad_norm": 4.875, + "kl": 0.0, + "learning_rate": 1.7534431305255596e-06, + "logits/chosen": 459859763.2, + "logits/rejected": 558453802.6666666, + "logps/chosen": -388.43505859375, + "logps/rejected": -792.8111979166666, + "loss": 0.0282, + "rewards/chosen": 3.8421680450439455, + "rewards/margins": 15.457364273071288, + "rewards/rejected": -11.615196228027344, + "step": 7941 + }, + { + "epoch": 0.7256281407035176, + "grad_norm": 3.609375, + "kl": 0.0, + "learning_rate": 1.7523497853886613e-06, + "logits/chosen": 690167125.3333334, + "logits/rejected": 412744243.2, + "logps/chosen": -345.434814453125, + "logps/rejected": -442.065478515625, + "loss": 0.0219, + "rewards/chosen": 3.0657898585001626, + "rewards/margins": 11.647763125101724, + "rewards/rejected": -8.581973266601562, + "step": 7942 + }, + { + "epoch": 0.7257195066240293, + "grad_norm": 2.234375, + "kl": 0.0, + "learning_rate": 1.7512567088023475e-06, + "logits/chosen": 528484576.0, + "logits/rejected": 444008768.0, + "logps/chosen": -216.61473083496094, + "logps/rejected": -508.9849548339844, + "loss": 0.019, + "rewards/chosen": 3.502237319946289, + "rewards/margins": 12.531867027282715, + "rewards/rejected": -9.029629707336426, + "step": 7943 + }, + { + "epoch": 0.7258108725445409, + "grad_norm": 41.25, + "kl": 0.0, + "learning_rate": 1.750163900857003e-06, + "logits/chosen": 363323520.0, + "logits/rejected": 451870037.3333333, + "logps/chosen": -241.39187622070312, + "logps/rejected": -516.7963053385416, + "loss": 0.0562, + "rewards/chosen": 2.1508982181549072, + "rewards/margins": 11.650933821996054, + "rewards/rejected": -9.500035603841146, + "step": 7944 + }, + { + "epoch": 0.7259022384650525, + "grad_norm": 4.125, + "kl": 0.0, + "learning_rate": 1.7490713616429938e-06, + "logits/chosen": 399333760.0, + "logits/rejected": 582391210.6666666, + "logps/chosen": -332.1564453125, + "logps/rejected": -499.1790364583333, + "loss": 0.0183, + "rewards/chosen": 4.289780807495117, + "rewards/margins": 13.516076278686523, + "rewards/rejected": -9.226295471191406, + "step": 7945 + }, + { + "epoch": 0.7259936043855642, + "grad_norm": 2.96875, + "kl": 1.6562786102294922, + "learning_rate": 1.7479790912506628e-06, + "logits/chosen": 666435949.7142857, + "logits/rejected": 272548320.0, + "logps/chosen": -305.27186802455356, + "logps/rejected": -207.73486328125, + "loss": 0.136, + "rewards/chosen": 3.2540040697370256, + "rewards/margins": 12.373624392918178, + "rewards/rejected": -9.119620323181152, + "step": 7946 + }, + { + "epoch": 0.7260849703060759, + "grad_norm": 3.53125, + "kl": 0.0, + "learning_rate": 1.7468870897703282e-06, + "logits/chosen": 769336960.0, + "logits/rejected": 1668869120.0, + "logps/chosen": -242.07301330566406, + "logps/rejected": -675.2079467773438, + "loss": 0.0222, + "rewards/chosen": 3.4205713272094727, + "rewards/margins": 13.357488632202148, + "rewards/rejected": -9.936917304992676, + "step": 7947 + }, + { + "epoch": 0.7261763362265875, + "grad_norm": 2.90625, + "kl": 0.0, + "learning_rate": 1.7457953572922925e-06, + "logits/chosen": 553776822.8571428, + "logits/rejected": 682697728.0, + "logps/chosen": -223.71815708705358, + "logps/rejected": -444.52984619140625, + "loss": 0.0215, + "rewards/chosen": 3.8731689453125, + "rewards/margins": 10.894085884094238, + "rewards/rejected": -7.020916938781738, + "step": 7948 + }, + { + "epoch": 0.7262677021470991, + "grad_norm": 1.3203125, + "kl": 0.0, + "learning_rate": 1.7447038939068301e-06, + "logits/chosen": 557557077.3333334, + "logits/rejected": 619508992.0, + "logps/chosen": -314.08315022786456, + "logps/rejected": -633.332421875, + "loss": 0.008, + "rewards/chosen": 3.8627395629882812, + "rewards/margins": 13.728172302246094, + "rewards/rejected": -9.865432739257812, + "step": 7949 + }, + { + "epoch": 0.7263590680676107, + "grad_norm": 2.0, + "kl": 0.0, + "learning_rate": 1.7436126997041941e-06, + "logits/chosen": 186834773.33333334, + "logits/rejected": 320543590.4, + "logps/chosen": -107.23677571614583, + "logps/rejected": -372.944873046875, + "loss": 0.0161, + "rewards/chosen": 4.0983835856119795, + "rewards/margins": 12.662211100260418, + "rewards/rejected": -8.563827514648438, + "step": 7950 + }, + { + "epoch": 0.7264504339881225, + "grad_norm": 3.328125, + "kl": 0.0, + "learning_rate": 1.7425217747746153e-06, + "logits/chosen": 613432832.0, + "logits/rejected": 761913920.0, + "logps/chosen": -527.822265625, + "logps/rejected": -588.12548828125, + "loss": 0.0173, + "rewards/chosen": 3.8821611404418945, + "rewards/margins": 12.632800102233887, + "rewards/rejected": -8.750638961791992, + "step": 7951 + }, + { + "epoch": 0.7265417999086341, + "grad_norm": 2.640625, + "kl": 0.0, + "learning_rate": 1.7414311192083067e-06, + "logits/chosen": 473740697.6, + "logits/rejected": 514301141.3333333, + "logps/chosen": -242.207666015625, + "logps/rejected": -655.4842529296875, + "loss": 0.0196, + "rewards/chosen": 3.668243408203125, + "rewards/margins": 14.36478042602539, + "rewards/rejected": -10.696537017822266, + "step": 7952 + }, + { + "epoch": 0.7266331658291457, + "grad_norm": 3.953125, + "kl": 0.0, + "learning_rate": 1.7403407330954525e-06, + "logits/chosen": 594968115.2, + "logits/rejected": 700208170.6666666, + "logps/chosen": -409.4801513671875, + "logps/rejected": -447.2882486979167, + "loss": 0.0295, + "rewards/chosen": 3.166061019897461, + "rewards/margins": 14.358526484171549, + "rewards/rejected": -11.192465464274088, + "step": 7953 + }, + { + "epoch": 0.7267245317496573, + "grad_norm": 1.40625, + "kl": 0.0, + "learning_rate": 1.7392506165262186e-06, + "logits/chosen": 394245888.0, + "logits/rejected": 512975328.0, + "logps/chosen": -193.3601531982422, + "logps/rejected": -421.7218933105469, + "loss": 0.015, + "rewards/chosen": 4.151670932769775, + "rewards/margins": 13.37319803237915, + "rewards/rejected": -9.221527099609375, + "step": 7954 + }, + { + "epoch": 0.7268158976701691, + "grad_norm": 1.9921875, + "kl": 0.0, + "learning_rate": 1.7381607695907454e-06, + "logits/chosen": 477721770.6666667, + "logits/rejected": 585778432.0, + "logps/chosen": -195.0157470703125, + "logps/rejected": -524.5877075195312, + "loss": 0.0133, + "rewards/chosen": 4.4375356038411455, + "rewards/margins": 15.337674458821613, + "rewards/rejected": -10.900138854980469, + "step": 7955 + }, + { + "epoch": 0.7269072635906807, + "grad_norm": 1.078125, + "kl": 0.0, + "learning_rate": 1.7370711923791567e-06, + "logits/chosen": 519946393.6, + "logits/rejected": 379328597.3333333, + "logps/chosen": -466.21376953125, + "logps/rejected": -504.8766276041667, + "loss": 0.0067, + "rewards/chosen": 4.72424545288086, + "rewards/margins": 15.323362096150717, + "rewards/rejected": -10.599116643269857, + "step": 7956 + }, + { + "epoch": 0.7269986295111923, + "grad_norm": 1.828125, + "kl": 0.0, + "learning_rate": 1.7359818849815485e-06, + "logits/chosen": 285326890.6666667, + "logits/rejected": 221915468.8, + "logps/chosen": -310.498291015625, + "logps/rejected": -365.4322265625, + "loss": 0.0089, + "rewards/chosen": 4.0047000249226885, + "rewards/margins": 13.946608289082846, + "rewards/rejected": -9.941908264160157, + "step": 7957 + }, + { + "epoch": 0.7270899954317039, + "grad_norm": 4.8125, + "kl": 1.8401527404785156, + "learning_rate": 1.7348928474879967e-06, + "logits/chosen": 408883858.28571427, + "logits/rejected": 338933376.0, + "logps/chosen": -334.9425571986607, + "logps/rejected": -256.2208251953125, + "loss": 0.0347, + "rewards/chosen": 4.030940464564732, + "rewards/margins": 10.74430411202567, + "rewards/rejected": -6.7133636474609375, + "step": 7958 + }, + { + "epoch": 0.7271813613522157, + "grad_norm": 6.34375, + "kl": 0.0, + "learning_rate": 1.7338040799885547e-06, + "logits/chosen": 642050662.4, + "logits/rejected": 504152789.3333333, + "logps/chosen": -303.26689453125, + "logps/rejected": -663.3121744791666, + "loss": 0.1227, + "rewards/chosen": 3.2978561401367186, + "rewards/margins": 12.031798171997071, + "rewards/rejected": -8.733942031860352, + "step": 7959 + }, + { + "epoch": 0.7272727272727273, + "grad_norm": 3.15625, + "kl": 0.0, + "learning_rate": 1.7327155825732517e-06, + "logits/chosen": 325215296.0, + "logits/rejected": 443575637.3333333, + "logps/chosen": -229.40139770507812, + "logps/rejected": -642.4214680989584, + "loss": 0.0119, + "rewards/chosen": 3.941760778427124, + "rewards/margins": 14.960567235946655, + "rewards/rejected": -11.018806457519531, + "step": 7960 + }, + { + "epoch": 0.7273640931932389, + "grad_norm": 3.03125, + "kl": 0.0, + "learning_rate": 1.7316273553321012e-06, + "logits/chosen": 391924565.3333333, + "logits/rejected": 418837939.2, + "logps/chosen": -222.0694783528646, + "logps/rejected": -413.19462890625, + "loss": 0.0161, + "rewards/chosen": 3.4097687403361, + "rewards/margins": 11.991896883646646, + "rewards/rejected": -8.582128143310547, + "step": 7961 + }, + { + "epoch": 0.7274554591137505, + "grad_norm": 3.734375, + "kl": 0.0, + "learning_rate": 1.7305393983550844e-06, + "logits/chosen": 996666368.0, + "logits/rejected": 579259968.0, + "logps/chosen": -306.0086669921875, + "logps/rejected": -435.5126953125, + "loss": 0.0186, + "rewards/chosen": 3.351672887802124, + "rewards/margins": 13.701597929000854, + "rewards/rejected": -10.34992504119873, + "step": 7962 + }, + { + "epoch": 0.7275468250342623, + "grad_norm": 51.25, + "kl": 1.5169830322265625, + "learning_rate": 1.7294517117321657e-06, + "logits/chosen": 449198153.14285713, + "logits/rejected": 221994832.0, + "logps/chosen": -246.70277622767858, + "logps/rejected": -129.6454620361328, + "loss": 0.1195, + "rewards/chosen": 3.1480590275355746, + "rewards/margins": 6.912637369973319, + "rewards/rejected": -3.764578342437744, + "step": 7963 + }, + { + "epoch": 0.7276381909547739, + "grad_norm": 1.8359375, + "kl": 0.0, + "learning_rate": 1.7283642955532898e-06, + "logits/chosen": 436239776.0, + "logits/rejected": 262332560.0, + "logps/chosen": -402.9362487792969, + "logps/rejected": -332.80810546875, + "loss": 0.0065, + "rewards/chosen": 4.97650146484375, + "rewards/margins": 13.995746612548828, + "rewards/rejected": -9.019245147705078, + "step": 7964 + }, + { + "epoch": 0.7277295568752855, + "grad_norm": 1.71875, + "kl": 0.0, + "learning_rate": 1.727277149908374e-06, + "logits/chosen": 1153971882.6666667, + "logits/rejected": 486212864.0, + "logps/chosen": -336.98968505859375, + "logps/rejected": -278.029248046875, + "loss": 0.0108, + "rewards/chosen": 3.8674742380777993, + "rewards/margins": 12.691682306925456, + "rewards/rejected": -8.824208068847657, + "step": 7965 + }, + { + "epoch": 0.7278209227957971, + "grad_norm": 3.359375, + "kl": 0.0, + "learning_rate": 1.7261902748873155e-06, + "logits/chosen": 369145856.0, + "logits/rejected": 480013376.0, + "logps/chosen": -330.64919026692706, + "logps/rejected": -556.1092529296875, + "loss": 0.028, + "rewards/chosen": 3.3669865926106772, + "rewards/margins": 13.952486356099447, + "rewards/rejected": -10.58549976348877, + "step": 7966 + }, + { + "epoch": 0.7279122887163089, + "grad_norm": 1.7265625, + "kl": 0.0, + "learning_rate": 1.7251036705799889e-06, + "logits/chosen": 535377824.0, + "logits/rejected": 734473898.6666666, + "logps/chosen": -218.5763702392578, + "logps/rejected": -547.9069010416666, + "loss": 0.0117, + "rewards/chosen": 3.7401223182678223, + "rewards/margins": 13.37657912572225, + "rewards/rejected": -9.636456807454428, + "step": 7967 + }, + { + "epoch": 0.7280036546368205, + "grad_norm": 1.953125, + "kl": 0.0, + "learning_rate": 1.7240173370762442e-06, + "logits/chosen": 873778432.0, + "logits/rejected": 314298496.0, + "logps/chosen": -273.3075866699219, + "logps/rejected": -310.341796875, + "loss": 0.0152, + "rewards/chosen": 3.5668787956237793, + "rewards/margins": 13.00193166732788, + "rewards/rejected": -9.435052871704102, + "step": 7968 + }, + { + "epoch": 0.7280950205573321, + "grad_norm": 2.171875, + "kl": 0.0, + "learning_rate": 1.7229312744659156e-06, + "logits/chosen": 1301309269.3333333, + "logits/rejected": 1440247705.6, + "logps/chosen": -263.757568359375, + "logps/rejected": -603.288916015625, + "loss": 0.0123, + "rewards/chosen": 3.466134707132975, + "rewards/margins": 12.991723696390787, + "rewards/rejected": -9.525588989257812, + "step": 7969 + }, + { + "epoch": 0.7281863864778437, + "grad_norm": 3.0625, + "kl": 0.0, + "learning_rate": 1.721845482838807e-06, + "logits/chosen": 575040128.0, + "logits/rejected": 609885354.6666666, + "logps/chosen": -240.22718811035156, + "logps/rejected": -409.0901692708333, + "loss": 0.0167, + "rewards/chosen": 2.679929256439209, + "rewards/margins": 11.43876028060913, + "rewards/rejected": -8.758831024169922, + "step": 7970 + }, + { + "epoch": 0.7282777523983555, + "grad_norm": 67.5, + "kl": 0.0, + "learning_rate": 1.7207599622847042e-06, + "logits/chosen": 568164650.6666666, + "logits/rejected": 431985408.0, + "logps/chosen": -248.85286458333334, + "logps/rejected": -519.846630859375, + "loss": 0.0689, + "rewards/chosen": 2.4896605809529624, + "rewards/margins": 13.08118584950765, + "rewards/rejected": -10.591525268554687, + "step": 7971 + }, + { + "epoch": 0.7283691183188671, + "grad_norm": 2.125, + "kl": 0.0, + "learning_rate": 1.7196747128933684e-06, + "logits/chosen": 852968704.0, + "logits/rejected": 590306355.2, + "logps/chosen": -524.5099690755209, + "logps/rejected": -594.85615234375, + "loss": 0.0115, + "rewards/chosen": 3.553389549255371, + "rewards/margins": 13.060717582702637, + "rewards/rejected": -9.507328033447266, + "step": 7972 + }, + { + "epoch": 0.7284604842393787, + "grad_norm": 0.38671875, + "kl": 0.0, + "learning_rate": 1.7185897347545426e-06, + "logits/chosen": 412136384.0, + "logits/rejected": 382425941.3333333, + "logps/chosen": -369.7584228515625, + "logps/rejected": -630.9352213541666, + "loss": 0.0015, + "rewards/chosen": 5.1832098960876465, + "rewards/margins": 17.35282405217489, + "rewards/rejected": -12.16961415608724, + "step": 7973 + }, + { + "epoch": 0.7285518501598903, + "grad_norm": 2.296875, + "kl": 0.0, + "learning_rate": 1.7175050279579426e-06, + "logits/chosen": 1631510630.4, + "logits/rejected": 615730474.6666666, + "logps/chosen": -375.005078125, + "logps/rejected": -510.9677734375, + "loss": 0.0138, + "rewards/chosen": 4.067205429077148, + "rewards/margins": 11.806881586710613, + "rewards/rejected": -7.739676157633464, + "step": 7974 + }, + { + "epoch": 0.7286432160804021, + "grad_norm": 2.328125, + "kl": 0.0, + "learning_rate": 1.7164205925932641e-06, + "logits/chosen": 483799808.0, + "logits/rejected": 328100480.0, + "logps/chosen": -271.9195861816406, + "logps/rejected": -405.39080810546875, + "loss": 0.0122, + "rewards/chosen": 4.093345642089844, + "rewards/margins": 13.352909088134766, + "rewards/rejected": -9.259563446044922, + "step": 7975 + }, + { + "epoch": 0.7287345820009137, + "grad_norm": 4.84375, + "kl": 0.0, + "learning_rate": 1.7153364287501778e-06, + "logits/chosen": 932907885.7142857, + "logits/rejected": 773654592.0, + "logps/chosen": -377.19642857142856, + "logps/rejected": -791.9484252929688, + "loss": 0.0396, + "rewards/chosen": 3.5895293099539622, + "rewards/margins": 12.749575887407575, + "rewards/rejected": -9.160046577453613, + "step": 7976 + }, + { + "epoch": 0.7288259479214253, + "grad_norm": 56.25, + "kl": 0.0, + "learning_rate": 1.7142525365183372e-06, + "logits/chosen": 671078400.0, + "logits/rejected": 500960864.0, + "logps/chosen": -191.93675231933594, + "logps/rejected": -441.4395751953125, + "loss": 0.0449, + "rewards/chosen": 3.025491714477539, + "rewards/margins": 11.673622131347656, + "rewards/rejected": -8.648130416870117, + "step": 7977 + }, + { + "epoch": 0.7289173138419369, + "grad_norm": 1.546875, + "kl": 0.0, + "learning_rate": 1.7131689159873687e-06, + "logits/chosen": 435610794.6666667, + "logits/rejected": 593699686.4, + "logps/chosen": -176.58951822916666, + "logps/rejected": -519.99736328125, + "loss": 0.0072, + "rewards/chosen": 4.987387975056966, + "rewards/margins": 13.776816685994465, + "rewards/rejected": -8.7894287109375, + "step": 7978 + }, + { + "epoch": 0.7290086797624487, + "grad_norm": 1.796875, + "kl": 0.0, + "learning_rate": 1.7120855672468779e-06, + "logits/chosen": 468173376.0, + "logits/rejected": 665262720.0, + "logps/chosen": -233.75955200195312, + "logps/rejected": -719.87109375, + "loss": 0.0097, + "rewards/chosen": 4.282234191894531, + "rewards/margins": 14.080778121948242, + "rewards/rejected": -9.798543930053711, + "step": 7979 + }, + { + "epoch": 0.7291000456829603, + "grad_norm": 3.21875, + "kl": 0.0, + "learning_rate": 1.7110024903864475e-06, + "logits/chosen": 530254233.6, + "logits/rejected": 388677290.6666667, + "logps/chosen": -383.5578125, + "logps/rejected": -550.20703125, + "loss": 0.0178, + "rewards/chosen": 4.1504261016845705, + "rewards/margins": 13.79500821431478, + "rewards/rejected": -9.644582112630209, + "step": 7980 + }, + { + "epoch": 0.7291914116034719, + "grad_norm": 4.125, + "kl": 0.0, + "learning_rate": 1.7099196854956357e-06, + "logits/chosen": 605064960.0, + "logits/rejected": 718202112.0, + "logps/chosen": -400.7591552734375, + "logps/rejected": -617.8937377929688, + "loss": 0.0221, + "rewards/chosen": 3.2740886211395264, + "rewards/margins": 15.059462308883667, + "rewards/rejected": -11.78537368774414, + "step": 7981 + }, + { + "epoch": 0.7292827775239835, + "grad_norm": 4.59375, + "kl": 0.0, + "learning_rate": 1.7088371526639852e-06, + "logits/chosen": 507414442.6666667, + "logits/rejected": 429271040.0, + "logps/chosen": -320.2934163411458, + "logps/rejected": -271.18572998046875, + "loss": 0.0237, + "rewards/chosen": 3.9786192576090493, + "rewards/margins": 10.842260042826334, + "rewards/rejected": -6.863640785217285, + "step": 7982 + }, + { + "epoch": 0.7293741434444952, + "grad_norm": 3.015625, + "kl": 2.3750534057617188, + "learning_rate": 1.7077548919810083e-06, + "logits/chosen": 556473280.0, + "logps/chosen": -381.23004150390625, + "loss": 0.015, + "rewards/chosen": 4.749795436859131, + "step": 7983 + }, + { + "epoch": 0.7294655093650069, + "grad_norm": 5.03125, + "kl": 0.0, + "learning_rate": 1.7066729035361985e-06, + "logits/chosen": 506065766.4, + "logits/rejected": 266870528.0, + "logps/chosen": -358.7745361328125, + "logps/rejected": -353.7278238932292, + "loss": 0.0465, + "rewards/chosen": 3.3297069549560545, + "rewards/margins": 11.744013595581055, + "rewards/rejected": -8.414306640625, + "step": 7984 + }, + { + "epoch": 0.7295568752855185, + "grad_norm": 2.09375, + "kl": 0.0, + "learning_rate": 1.7055911874190246e-06, + "logits/chosen": 450580448.0, + "logits/rejected": 616643200.0, + "logps/chosen": -112.77345275878906, + "logps/rejected": -495.064453125, + "loss": 0.0156, + "rewards/chosen": 2.728832721710205, + "rewards/margins": 12.43491792678833, + "rewards/rejected": -9.706085205078125, + "step": 7985 + }, + { + "epoch": 0.7296482412060301, + "grad_norm": 1.1796875, + "kl": 0.0, + "learning_rate": 1.7045097437189379e-06, + "logits/chosen": 616581312.0, + "logits/rejected": 299416736.0, + "logps/chosen": -242.2721710205078, + "logps/rejected": -442.94683837890625, + "loss": 0.0097, + "rewards/chosen": 4.566387176513672, + "rewards/margins": 14.723569869995117, + "rewards/rejected": -10.157182693481445, + "step": 7986 + }, + { + "epoch": 0.7297396071265418, + "grad_norm": 1.7734375, + "kl": 0.0, + "learning_rate": 1.7034285725253618e-06, + "logits/chosen": 611478912.0, + "logits/rejected": 949485952.0, + "logps/chosen": -438.590576171875, + "logps/rejected": -693.197021484375, + "loss": 0.0085, + "rewards/chosen": 4.557260513305664, + "rewards/margins": 13.250778198242188, + "rewards/rejected": -8.693517684936523, + "step": 7987 + }, + { + "epoch": 0.7298309730470535, + "grad_norm": 1.6640625, + "kl": 0.0, + "learning_rate": 1.7023476739277012e-06, + "logits/chosen": 897141120.0, + "logits/rejected": 599579200.0, + "logps/chosen": -377.9903564453125, + "logps/rejected": -398.16778564453125, + "loss": 0.0078, + "rewards/chosen": 4.7058868408203125, + "rewards/margins": 13.294605255126953, + "rewards/rejected": -8.58871841430664, + "step": 7988 + }, + { + "epoch": 0.7299223389675651, + "grad_norm": 0.361328125, + "kl": 0.0, + "learning_rate": 1.7012670480153315e-06, + "logits/chosen": 561592192.0, + "logits/rejected": 500841472.0, + "logps/chosen": -391.9258728027344, + "logps/rejected": -483.77462332589283, + "loss": 0.0018, + "rewards/chosen": 4.647653102874756, + "rewards/margins": 13.812463283538818, + "rewards/rejected": -9.164810180664062, + "step": 7989 + }, + { + "epoch": 0.7300137048880767, + "grad_norm": 2.765625, + "kl": 0.0, + "learning_rate": 1.7001866948776147e-06, + "logits/chosen": 1012110080.0, + "logits/rejected": 429767168.0, + "logps/chosen": -412.7893371582031, + "logps/rejected": -355.2147216796875, + "loss": 0.0159, + "rewards/chosen": 3.8356423377990723, + "rewards/margins": 12.492181301116943, + "rewards/rejected": -8.656538963317871, + "step": 7990 + }, + { + "epoch": 0.7301050708085884, + "grad_norm": 1.5625, + "kl": 0.0, + "learning_rate": 1.6991066146038848e-06, + "logits/chosen": 1065958272.0, + "logits/rejected": 947836416.0, + "logps/chosen": -86.90013885498047, + "logps/rejected": -605.2977294921875, + "loss": 0.0132, + "rewards/chosen": 2.910566806793213, + "rewards/margins": 13.791070143381754, + "rewards/rejected": -10.880503336588541, + "step": 7991 + }, + { + "epoch": 0.7301964367291001, + "grad_norm": 3.84375, + "kl": 0.0, + "learning_rate": 1.6980268072834543e-06, + "logits/chosen": 688818773.3333334, + "logits/rejected": 535612928.0, + "logps/chosen": -330.20102945963544, + "logps/rejected": -785.883056640625, + "loss": 0.0206, + "rewards/chosen": 2.8579413096110025, + "rewards/margins": 15.271909205118815, + "rewards/rejected": -12.413967895507813, + "step": 7992 + }, + { + "epoch": 0.7302878026496117, + "grad_norm": 4.59375, + "kl": 0.0, + "learning_rate": 1.6969472730056119e-06, + "logits/chosen": 371201877.3333333, + "logits/rejected": 413596185.6, + "logps/chosen": -438.5927734375, + "logps/rejected": -429.0390625, + "loss": 0.024, + "rewards/chosen": 2.81695556640625, + "rewards/margins": 11.328056335449219, + "rewards/rejected": -8.511100769042969, + "step": 7993 + }, + { + "epoch": 0.7303791685701233, + "grad_norm": 1.6953125, + "kl": 0.0, + "learning_rate": 1.6958680118596288e-06, + "logits/chosen": 706871808.0, + "logits/rejected": 491832524.8, + "logps/chosen": -390.8412272135417, + "logps/rejected": -556.581640625, + "loss": 0.01, + "rewards/chosen": 3.6447855631510415, + "rewards/margins": 14.0847292582194, + "rewards/rejected": -10.43994369506836, + "step": 7994 + }, + { + "epoch": 0.730470534490635, + "grad_norm": 2.796875, + "kl": 0.0, + "learning_rate": 1.694789023934747e-06, + "logits/chosen": 649955200.0, + "logits/rejected": 860050022.4, + "logps/chosen": -313.96645100911456, + "logps/rejected": -484.27705078125, + "loss": 0.0115, + "rewards/chosen": 3.769089380900065, + "rewards/margins": 14.216358057657876, + "rewards/rejected": -10.447268676757812, + "step": 7995 + }, + { + "epoch": 0.7305619004111467, + "grad_norm": 2.9375, + "kl": 0.0, + "learning_rate": 1.6937103093201895e-06, + "logits/chosen": 735901120.0, + "logits/rejected": 712153258.6666666, + "logps/chosen": -422.014892578125, + "logps/rejected": -646.0748697916666, + "loss": 0.0126, + "rewards/chosen": 2.981066942214966, + "rewards/margins": 12.653859376907349, + "rewards/rejected": -9.672792434692383, + "step": 7996 + }, + { + "epoch": 0.7306532663316583, + "grad_norm": 3.359375, + "kl": 0.0, + "learning_rate": 1.6926318681051563e-06, + "logits/chosen": 621423232.0, + "logits/rejected": 1210349312.0, + "logps/chosen": -308.7674560546875, + "logps/rejected": -423.5738525390625, + "loss": 0.0158, + "rewards/chosen": 4.184473991394043, + "rewards/margins": 13.439932823181152, + "rewards/rejected": -9.25545883178711, + "step": 7997 + }, + { + "epoch": 0.7307446322521699, + "grad_norm": 0.83984375, + "kl": 0.0, + "learning_rate": 1.6915537003788217e-06, + "logits/chosen": 179909120.0, + "logits/rejected": 348004693.3333333, + "logps/chosen": -190.96604919433594, + "logps/rejected": -610.1387532552084, + "loss": 0.0033, + "rewards/chosen": 4.8822021484375, + "rewards/margins": 15.211601893107096, + "rewards/rejected": -10.329399744669596, + "step": 7998 + }, + { + "epoch": 0.7308359981726816, + "grad_norm": 2.9375, + "kl": 0.0, + "learning_rate": 1.690475806230345e-06, + "logits/chosen": 659218176.0, + "logits/rejected": 803722240.0, + "logps/chosen": -369.4064534505208, + "logps/rejected": -600.704150390625, + "loss": 0.0157, + "rewards/chosen": 3.7739299138387046, + "rewards/margins": 13.2753932317098, + "rewards/rejected": -9.501463317871094, + "step": 7999 + }, + { + "epoch": 0.7309273640931933, + "grad_norm": 13.6875, + "kl": 0.0, + "learning_rate": 1.6893981857488556e-06, + "logits/chosen": 528497888.0, + "logits/rejected": 570754112.0, + "logps/chosen": -389.2122497558594, + "logps/rejected": -411.43646240234375, + "loss": 0.0743, + "rewards/chosen": 4.564411163330078, + "rewards/margins": 10.487627506256104, + "rewards/rejected": -5.923216342926025, + "step": 8000 + }, + { + "epoch": 0.7310187300137049, + "grad_norm": 0.796875, + "kl": 0.0, + "learning_rate": 1.688320839023463e-06, + "logits/chosen": 687825920.0, + "logits/rejected": 623486912.0, + "logps/chosen": -423.98468017578125, + "logps/rejected": -718.0926513671875, + "loss": 0.004, + "rewards/chosen": 5.175654888153076, + "rewards/margins": 16.59028387069702, + "rewards/rejected": -11.414628982543945, + "step": 8001 + }, + { + "epoch": 0.7311100959342165, + "grad_norm": 0.482421875, + "kl": 0.0, + "learning_rate": 1.6872437661432518e-06, + "logits/chosen": 345397760.0, + "logits/rejected": 570077098.6666666, + "logps/chosen": -286.04437255859375, + "logps/rejected": -543.1717122395834, + "loss": 0.0021, + "rewards/chosen": 5.1111249923706055, + "rewards/margins": 13.628453890482584, + "rewards/rejected": -8.517328898111979, + "step": 8002 + }, + { + "epoch": 0.7312014618547282, + "grad_norm": 60.25, + "kl": 0.0, + "learning_rate": 1.6861669671972892e-06, + "logits/chosen": 480368469.3333333, + "logits/rejected": 677025587.2, + "logps/chosen": -373.6952718098958, + "logps/rejected": -488.84404296875, + "loss": 0.0799, + "rewards/chosen": 3.413100242614746, + "rewards/margins": 11.528996086120605, + "rewards/rejected": -8.115895843505859, + "step": 8003 + }, + { + "epoch": 0.7312928277752399, + "grad_norm": 12.125, + "kl": 0.0, + "learning_rate": 1.685090442274615e-06, + "logits/chosen": 745996800.0, + "logits/rejected": 447970901.3333333, + "logps/chosen": -295.343310546875, + "logps/rejected": -481.0760904947917, + "loss": 0.1341, + "rewards/chosen": 2.330609130859375, + "rewards/margins": 11.540502675374348, + "rewards/rejected": -9.209893544514975, + "step": 8004 + }, + { + "epoch": 0.7313841936957515, + "grad_norm": 3.734375, + "kl": 0.0, + "learning_rate": 1.6840141914642482e-06, + "logits/chosen": 360325504.0, + "logits/rejected": 599157077.3333334, + "logps/chosen": -215.3426055908203, + "logps/rejected": -474.8585611979167, + "loss": 0.0224, + "rewards/chosen": 4.095080375671387, + "rewards/margins": 13.313714663187662, + "rewards/rejected": -9.218634287516275, + "step": 8005 + }, + { + "epoch": 0.7314755596162631, + "grad_norm": 3.125, + "kl": 0.0, + "learning_rate": 1.6829382148551832e-06, + "logits/chosen": 479823155.2, + "logits/rejected": 700377770.6666666, + "logps/chosen": -173.254931640625, + "logps/rejected": -539.0110677083334, + "loss": 0.0322, + "rewards/chosen": 3.2961799621582033, + "rewards/margins": 12.508348337809245, + "rewards/rejected": -9.212168375651041, + "step": 8006 + }, + { + "epoch": 0.7315669255367748, + "grad_norm": 0.98828125, + "kl": 0.0, + "learning_rate": 1.6818625125363958e-06, + "logits/chosen": 636310272.0, + "logits/rejected": 576992448.0, + "logps/chosen": -260.53306070963544, + "logps/rejected": -692.0661010742188, + "loss": 0.0064, + "rewards/chosen": 5.312092463175456, + "rewards/margins": 14.024662653605144, + "rewards/rejected": -8.712570190429688, + "step": 8007 + }, + { + "epoch": 0.7316582914572864, + "grad_norm": 2.8125, + "kl": 0.0, + "learning_rate": 1.6807870845968349e-06, + "logits/chosen": 440229845.3333333, + "logits/rejected": 439796256.0, + "logps/chosen": -294.456298828125, + "logps/rejected": -494.6923828125, + "loss": 0.0217, + "rewards/chosen": 3.859368324279785, + "rewards/margins": 13.599276542663574, + "rewards/rejected": -9.739908218383789, + "step": 8008 + }, + { + "epoch": 0.7317496573777981, + "grad_norm": 2.953125, + "kl": 0.0, + "learning_rate": 1.6797119311254294e-06, + "logits/chosen": 514560204.8, + "logits/rejected": 869142016.0, + "logps/chosen": -319.9598388671875, + "logps/rejected": -415.7438151041667, + "loss": 0.0197, + "rewards/chosen": 4.088256072998047, + "rewards/margins": 11.461819839477538, + "rewards/rejected": -7.373563766479492, + "step": 8009 + }, + { + "epoch": 0.7318410232983097, + "grad_norm": 3.546875, + "kl": 0.0, + "learning_rate": 1.678637052211084e-06, + "logits/chosen": 763146240.0, + "logits/rejected": 428461056.0, + "logps/chosen": -402.1634216308594, + "logps/rejected": -515.1483764648438, + "loss": 0.0264, + "rewards/chosen": 3.6526122093200684, + "rewards/margins": 11.693698406219482, + "rewards/rejected": -8.041086196899414, + "step": 8010 + }, + { + "epoch": 0.7319323892188214, + "grad_norm": 2.03125, + "kl": 0.0, + "learning_rate": 1.67756244794268e-06, + "logits/chosen": 545425280.0, + "logits/rejected": 360884224.0, + "logps/chosen": -343.7710774739583, + "logps/rejected": -336.8525390625, + "loss": 0.0095, + "rewards/chosen": 4.03573735555013, + "rewards/margins": 13.582858022054037, + "rewards/rejected": -9.547120666503906, + "step": 8011 + }, + { + "epoch": 0.732023755139333, + "grad_norm": 31.875, + "kl": 0.0, + "learning_rate": 1.6764881184090797e-06, + "logits/chosen": 497239398.4, + "logits/rejected": 1034113024.0, + "logps/chosen": -268.6514404296875, + "logps/rejected": -550.563232421875, + "loss": 0.0519, + "rewards/chosen": 3.387739562988281, + "rewards/margins": 13.37219467163086, + "rewards/rejected": -9.984455108642578, + "step": 8012 + }, + { + "epoch": 0.7321151210598447, + "grad_norm": 1.71875, + "kl": 0.0, + "learning_rate": 1.675414063699119e-06, + "logits/chosen": 325263168.0, + "logits/rejected": 679010133.3333334, + "logps/chosen": -135.99395751953125, + "logps/rejected": -403.4342447916667, + "loss": 0.0091, + "rewards/chosen": 3.800821304321289, + "rewards/margins": 12.735675175984701, + "rewards/rejected": -8.934853871663412, + "step": 8013 + }, + { + "epoch": 0.7322064869803563, + "grad_norm": 9.8125, + "kl": 0.0, + "learning_rate": 1.6743402839016127e-06, + "logits/chosen": 338588313.6, + "logits/rejected": 415447253.3333333, + "logps/chosen": -408.860693359375, + "logps/rejected": -361.2825520833333, + "loss": 0.04, + "rewards/chosen": 3.4154083251953127, + "rewards/margins": 9.017340850830077, + "rewards/rejected": -5.601932525634766, + "step": 8014 + }, + { + "epoch": 0.732297852900868, + "grad_norm": 2.40625, + "kl": 0.0, + "learning_rate": 1.6732667791053514e-06, + "logits/chosen": 460304588.8, + "logits/rejected": 468085845.3333333, + "logps/chosen": -363.96796875, + "logps/rejected": -511.8404947916667, + "loss": 0.0126, + "rewards/chosen": 4.399024200439453, + "rewards/margins": 13.527007293701171, + "rewards/rejected": -9.127983093261719, + "step": 8015 + }, + { + "epoch": 0.7323892188213796, + "grad_norm": 1.921875, + "kl": 0.0, + "learning_rate": 1.6721935493991048e-06, + "logits/chosen": 462320810.6666667, + "logits/rejected": 342110515.2, + "logps/chosen": -200.19000244140625, + "logps/rejected": -447.543505859375, + "loss": 0.0133, + "rewards/chosen": 3.406126022338867, + "rewards/margins": 12.262575912475587, + "rewards/rejected": -8.85644989013672, + "step": 8016 + }, + { + "epoch": 0.7324805847418913, + "grad_norm": 2.59375, + "kl": 0.0, + "learning_rate": 1.671120594871619e-06, + "logits/chosen": 470344384.0, + "logits/rejected": 285716032.0, + "logps/chosen": -423.88360595703125, + "logps/rejected": -305.6005859375, + "loss": 0.0142, + "rewards/chosen": 4.493441104888916, + "rewards/margins": 13.314423084259033, + "rewards/rejected": -8.820981979370117, + "step": 8017 + }, + { + "epoch": 0.7325719506624029, + "grad_norm": 41.25, + "kl": 0.0, + "learning_rate": 1.6700479156116172e-06, + "logits/chosen": 364189312.0, + "logits/rejected": 315420902.4, + "logps/chosen": -169.78732299804688, + "logps/rejected": -299.3455078125, + "loss": 0.0626, + "rewards/chosen": 3.283566157023112, + "rewards/margins": 12.277763239542642, + "rewards/rejected": -8.99419708251953, + "step": 8018 + }, + { + "epoch": 0.7326633165829146, + "grad_norm": 3.125, + "kl": 0.0, + "learning_rate": 1.6689755117077983e-06, + "logits/chosen": 580238378.6666666, + "logits/rejected": 656105216.0, + "logps/chosen": -258.9372151692708, + "logps/rejected": -482.966796875, + "loss": 0.0197, + "rewards/chosen": 3.145615259806315, + "rewards/margins": 13.505728022257486, + "rewards/rejected": -10.360112762451172, + "step": 8019 + }, + { + "epoch": 0.7327546825034262, + "grad_norm": 6.78125, + "kl": 0.0, + "learning_rate": 1.6679033832488433e-06, + "logits/chosen": 644202496.0, + "logits/rejected": 851046058.6666666, + "logps/chosen": -379.2431640625, + "logps/rejected": -403.7210286458333, + "loss": 0.1044, + "rewards/chosen": 2.7371864318847656, + "rewards/margins": 8.506815592447918, + "rewards/rejected": -5.769629160563151, + "step": 8020 + }, + { + "epoch": 0.7328460484239379, + "grad_norm": 2.140625, + "kl": 0.0, + "learning_rate": 1.6668315303234068e-06, + "logits/chosen": 728378496.0, + "logits/rejected": 526942976.0, + "logps/chosen": -358.91839599609375, + "logps/rejected": -624.9979654947916, + "loss": 0.0072, + "rewards/chosen": 3.5639984607696533, + "rewards/margins": 15.929990688959757, + "rewards/rejected": -12.365992228190104, + "step": 8021 + }, + { + "epoch": 0.7329374143444495, + "grad_norm": 5.34375, + "kl": 0.0, + "learning_rate": 1.6657599530201197e-06, + "logits/chosen": 663202969.6, + "logits/rejected": 517131264.0, + "logps/chosen": -331.4561767578125, + "logps/rejected": -488.9918619791667, + "loss": 0.0369, + "rewards/chosen": 3.4573833465576174, + "rewards/margins": 12.543626022338866, + "rewards/rejected": -9.08624267578125, + "step": 8022 + }, + { + "epoch": 0.7330287802649612, + "grad_norm": 5.625, + "kl": 0.0, + "learning_rate": 1.6646886514275906e-06, + "logits/chosen": 407877120.0, + "logits/rejected": 203389520.0, + "logps/chosen": -278.2458902994792, + "logps/rejected": -336.2763366699219, + "loss": 0.0265, + "rewards/chosen": 3.504547119140625, + "rewards/margins": 12.787479400634766, + "rewards/rejected": -9.28293228149414, + "step": 8023 + }, + { + "epoch": 0.7331201461854728, + "grad_norm": 4.375, + "kl": 0.0, + "learning_rate": 1.663617625634409e-06, + "logits/chosen": 696660608.0, + "logits/rejected": 410423744.0, + "logps/chosen": -240.55221557617188, + "logps/rejected": -408.07696533203125, + "loss": 0.0278, + "rewards/chosen": 3.318361282348633, + "rewards/margins": 11.54558277130127, + "rewards/rejected": -8.227221488952637, + "step": 8024 + }, + { + "epoch": 0.7332115121059845, + "grad_norm": 3.359375, + "kl": 0.0, + "learning_rate": 1.6625468757291379e-06, + "logits/chosen": 532256548.5714286, + "logits/rejected": 283563968.0, + "logps/chosen": -295.4528111049107, + "logps/rejected": -452.698486328125, + "loss": 0.0284, + "rewards/chosen": 3.6462674822126115, + "rewards/margins": 11.713027545383998, + "rewards/rejected": -8.066760063171387, + "step": 8025 + }, + { + "epoch": 0.7333028780264961, + "grad_norm": 2.671875, + "kl": 0.0, + "learning_rate": 1.6614764018003177e-06, + "logits/chosen": 342148672.0, + "logits/rejected": 301198720.0, + "logps/chosen": -285.18316650390625, + "logps/rejected": -295.6399841308594, + "loss": 0.0156, + "rewards/chosen": 4.459084510803223, + "rewards/margins": 13.632445335388184, + "rewards/rejected": -9.173360824584961, + "step": 8026 + }, + { + "epoch": 0.7333942439470078, + "grad_norm": 1.46875, + "kl": 0.0, + "learning_rate": 1.6604062039364654e-06, + "logits/chosen": 596389034.6666666, + "logits/rejected": 598117580.8, + "logps/chosen": -244.83160400390625, + "logps/rejected": -595.582568359375, + "loss": 0.011, + "rewards/chosen": 3.6063543955485025, + "rewards/margins": 12.922633997599283, + "rewards/rejected": -9.31627960205078, + "step": 8027 + }, + { + "epoch": 0.7334856098675194, + "grad_norm": 2.34375, + "kl": 0.0, + "learning_rate": 1.6593362822260795e-06, + "logits/chosen": 744055616.0, + "logits/rejected": 1151989504.0, + "logps/chosen": -303.0887451171875, + "logps/rejected": -686.281494140625, + "loss": 0.0152, + "rewards/chosen": 3.6133108139038086, + "rewards/margins": 13.783076286315918, + "rewards/rejected": -10.16976547241211, + "step": 8028 + }, + { + "epoch": 0.733576975788031, + "grad_norm": 11.875, + "kl": 0.0, + "learning_rate": 1.6582666367576318e-06, + "logits/chosen": 719541101.7142857, + "logits/rejected": 1541273600.0, + "logps/chosen": -263.9083949497768, + "logps/rejected": -1339.777099609375, + "loss": 0.0394, + "rewards/chosen": 3.766096387590681, + "rewards/margins": 14.526985440935407, + "rewards/rejected": -10.760889053344727, + "step": 8029 + }, + { + "epoch": 0.7336683417085427, + "grad_norm": 1.296875, + "kl": 0.0, + "learning_rate": 1.6571972676195709e-06, + "logits/chosen": 655862016.0, + "logits/rejected": 391866956.8, + "logps/chosen": -388.69970703125, + "logps/rejected": -479.949853515625, + "loss": 0.0076, + "rewards/chosen": 3.907351811726888, + "rewards/margins": 14.526785405476888, + "rewards/rejected": -10.61943359375, + "step": 8030 + }, + { + "epoch": 0.7337597076290544, + "grad_norm": 0.9453125, + "kl": 0.0, + "learning_rate": 1.6561281749003239e-06, + "logits/chosen": 512407808.0, + "logits/rejected": 484672665.6, + "logps/chosen": -294.39404296875, + "logps/rejected": -608.7806640625, + "loss": 0.0047, + "rewards/chosen": 4.812580108642578, + "rewards/margins": 13.323171234130859, + "rewards/rejected": -8.510591125488281, + "step": 8031 + }, + { + "epoch": 0.733851073549566, + "grad_norm": 3.515625, + "kl": 0.0, + "learning_rate": 1.655059358688294e-06, + "logits/chosen": 433295616.0, + "logits/rejected": 402031180.8, + "logps/chosen": -332.3091227213542, + "logps/rejected": -446.27880859375, + "loss": 0.0155, + "rewards/chosen": 3.8908894856770835, + "rewards/margins": 13.696232350667318, + "rewards/rejected": -9.805342864990234, + "step": 8032 + }, + { + "epoch": 0.7339424394700776, + "grad_norm": 3.109375, + "kl": 0.0, + "learning_rate": 1.653990819071865e-06, + "logits/chosen": 780870741.3333334, + "logits/rejected": 889432832.0, + "logps/chosen": -461.1379801432292, + "logps/rejected": -481.435546875, + "loss": 0.0192, + "rewards/chosen": 4.0666154225667315, + "rewards/margins": 12.675151189168293, + "rewards/rejected": -8.608535766601562, + "step": 8033 + }, + { + "epoch": 0.7340338053905893, + "grad_norm": 1.890625, + "kl": 0.0, + "learning_rate": 1.6529225561393936e-06, + "logits/chosen": 535205120.0, + "logits/rejected": 637120256.0, + "logps/chosen": -396.52471923828125, + "logps/rejected": -588.3341064453125, + "loss": 0.0095, + "rewards/chosen": 4.309192657470703, + "rewards/margins": 14.00775146484375, + "rewards/rejected": -9.698558807373047, + "step": 8034 + }, + { + "epoch": 0.734125171311101, + "grad_norm": 5.6875, + "kl": 0.0, + "learning_rate": 1.6518545699792161e-06, + "logits/chosen": 560932949.3333334, + "logits/rejected": 490060800.0, + "logps/chosen": -329.618408203125, + "logps/rejected": -356.9886474609375, + "loss": 0.026, + "rewards/chosen": 4.2308502197265625, + "rewards/margins": 12.044473648071289, + "rewards/rejected": -7.813623428344727, + "step": 8035 + }, + { + "epoch": 0.7342165372316126, + "grad_norm": 2.78125, + "kl": 0.0, + "learning_rate": 1.6507868606796424e-06, + "logits/chosen": 709834880.0, + "logits/rejected": 532102592.0, + "logps/chosen": -389.50775146484375, + "logps/rejected": -627.994140625, + "loss": 0.0145, + "rewards/chosen": 4.080552101135254, + "rewards/margins": 11.82207727432251, + "rewards/rejected": -7.741525173187256, + "step": 8036 + }, + { + "epoch": 0.7343079031521242, + "grad_norm": 1.375, + "kl": 0.0, + "learning_rate": 1.6497194283289658e-06, + "logits/chosen": 405690538.6666667, + "logits/rejected": 473791846.4, + "logps/chosen": -248.6519571940104, + "logps/rejected": -530.11611328125, + "loss": 0.0124, + "rewards/chosen": 4.332863807678223, + "rewards/margins": 15.196497535705566, + "rewards/rejected": -10.863633728027343, + "step": 8037 + }, + { + "epoch": 0.7343992690726359, + "grad_norm": 3.375, + "kl": 0.0, + "learning_rate": 1.6486522730154513e-06, + "logits/chosen": 726726912.0, + "logits/rejected": 295599488.0, + "logps/chosen": -510.0487365722656, + "logps/rejected": -226.85235595703125, + "loss": 0.0184, + "rewards/chosen": 3.452192783355713, + "rewards/margins": 10.543514251708984, + "rewards/rejected": -7.0913214683532715, + "step": 8038 + }, + { + "epoch": 0.7344906349931476, + "grad_norm": 1.875, + "kl": 0.0, + "learning_rate": 1.6475853948273429e-06, + "logits/chosen": 565975488.0, + "logits/rejected": 445999936.0, + "logps/chosen": -129.74766540527344, + "logps/rejected": -410.5630798339844, + "loss": 0.0155, + "rewards/chosen": 3.709416389465332, + "rewards/margins": 11.793292999267578, + "rewards/rejected": -8.083876609802246, + "step": 8039 + }, + { + "epoch": 0.7345820009136592, + "grad_norm": 1.375, + "kl": 0.0, + "learning_rate": 1.6465187938528615e-06, + "logits/chosen": 428113237.3333333, + "logits/rejected": 517410099.2, + "logps/chosen": -312.9102783203125, + "logps/rejected": -594.851171875, + "loss": 0.0061, + "rewards/chosen": 4.60863463083903, + "rewards/margins": 14.517398516337078, + "rewards/rejected": -9.908763885498047, + "step": 8040 + }, + { + "epoch": 0.7346733668341708, + "grad_norm": 2.265625, + "kl": 0.0, + "learning_rate": 1.6454524701802055e-06, + "logits/chosen": 390018457.6, + "logits/rejected": 472320298.6666667, + "logps/chosen": -207.3138671875, + "logps/rejected": -437.9720865885417, + "loss": 0.0341, + "rewards/chosen": 3.3971351623535155, + "rewards/margins": 13.438246154785157, + "rewards/rejected": -10.04111099243164, + "step": 8041 + }, + { + "epoch": 0.7347647327546825, + "grad_norm": 5.0625, + "kl": 0.0, + "learning_rate": 1.6443864238975494e-06, + "logits/chosen": 450047283.2, + "logits/rejected": 571802154.6666666, + "logps/chosen": -347.04990234375, + "logps/rejected": -857.4697265625, + "loss": 0.0312, + "rewards/chosen": 3.558728790283203, + "rewards/margins": 15.839545186360677, + "rewards/rejected": -12.280816396077475, + "step": 8042 + }, + { + "epoch": 0.7348560986751942, + "grad_norm": 36.5, + "kl": 0.0, + "learning_rate": 1.6433206550930458e-06, + "logits/chosen": 505850304.0, + "logits/rejected": 446555520.0, + "logps/chosen": -194.44296264648438, + "logps/rejected": -671.1344604492188, + "loss": 0.0452, + "rewards/chosen": 2.763044834136963, + "rewards/margins": 12.04236650466919, + "rewards/rejected": -9.279321670532227, + "step": 8043 + }, + { + "epoch": 0.7349474645957058, + "grad_norm": 1.578125, + "kl": 0.0, + "learning_rate": 1.6422551638548219e-06, + "logits/chosen": 1023514624.0, + "logits/rejected": 1093862144.0, + "logps/chosen": -365.9087829589844, + "logps/rejected": -645.1875, + "loss": 0.0085, + "rewards/chosen": 4.407233238220215, + "rewards/margins": 13.901505470275879, + "rewards/rejected": -9.494272232055664, + "step": 8044 + }, + { + "epoch": 0.7350388305162174, + "grad_norm": 4.5625, + "kl": 0.0, + "learning_rate": 1.6411899502709873e-06, + "logits/chosen": 643460736.0, + "logits/rejected": 463990112.0, + "logps/chosen": -404.00225830078125, + "logps/rejected": -446.3464660644531, + "loss": 0.0261, + "rewards/chosen": 3.1302638053894043, + "rewards/margins": 13.137139797210693, + "rewards/rejected": -10.006875991821289, + "step": 8045 + }, + { + "epoch": 0.735130196436729, + "grad_norm": 2.046875, + "kl": 0.0, + "learning_rate": 1.6401250144296239e-06, + "logits/chosen": 365638246.4, + "logits/rejected": 480301738.6666667, + "logps/chosen": -230.412841796875, + "logps/rejected": -614.9911295572916, + "loss": 0.007, + "rewards/chosen": 5.196873474121094, + "rewards/margins": 14.829163360595704, + "rewards/rejected": -9.63228988647461, + "step": 8046 + }, + { + "epoch": 0.7352215623572408, + "grad_norm": 0.6484375, + "kl": 0.0, + "learning_rate": 1.6390603564187918e-06, + "logits/chosen": 653647018.6666666, + "logits/rejected": 409224883.2, + "logps/chosen": -222.3817138671875, + "logps/rejected": -429.89443359375, + "loss": 0.0037, + "rewards/chosen": 4.945423762003581, + "rewards/margins": 13.871986262003581, + "rewards/rejected": -8.9265625, + "step": 8047 + }, + { + "epoch": 0.7353129282777524, + "grad_norm": 5.25, + "kl": 0.0, + "learning_rate": 1.6379959763265268e-06, + "logits/chosen": 489633331.2, + "logits/rejected": 504727552.0, + "logps/chosen": -370.209912109375, + "logps/rejected": -427.0464274088542, + "loss": 0.0288, + "rewards/chosen": 3.357807922363281, + "rewards/margins": 11.287701416015626, + "rewards/rejected": -7.929893493652344, + "step": 8048 + }, + { + "epoch": 0.735404294198264, + "grad_norm": 2.34375, + "kl": 0.0, + "learning_rate": 1.6369318742408464e-06, + "logits/chosen": 508157644.8, + "logits/rejected": 1319038805.3333333, + "logps/chosen": -169.515185546875, + "logps/rejected": -686.6451822916666, + "loss": 0.0266, + "rewards/chosen": 3.6615264892578123, + "rewards/margins": 13.390534845987954, + "rewards/rejected": -9.729008356730143, + "step": 8049 + }, + { + "epoch": 0.7354956601187757, + "grad_norm": 2.84375, + "kl": 0.0, + "learning_rate": 1.635868050249741e-06, + "logits/chosen": 477521493.3333333, + "logits/rejected": 488250176.0, + "logps/chosen": -267.7149658203125, + "logps/rejected": -338.66326904296875, + "loss": 0.0276, + "rewards/chosen": 3.4223410288492837, + "rewards/margins": 11.30190626780192, + "rewards/rejected": -7.879565238952637, + "step": 8050 + }, + { + "epoch": 0.7355870260392874, + "grad_norm": 0.62890625, + "kl": 0.0, + "learning_rate": 1.6348045044411782e-06, + "logits/chosen": 631743936.0, + "logits/rejected": 392460672.0, + "logps/chosen": -423.9068603515625, + "logps/rejected": -457.2174479166667, + "loss": 0.0023, + "rewards/chosen": 5.069901466369629, + "rewards/margins": 14.040066083272299, + "rewards/rejected": -8.97016461690267, + "step": 8051 + }, + { + "epoch": 0.735678391959799, + "grad_norm": 3.21875, + "kl": 0.0, + "learning_rate": 1.6337412369031031e-06, + "logits/chosen": 309231232.0, + "logits/rejected": 624897450.6666666, + "logps/chosen": -260.812109375, + "logps/rejected": -477.714111328125, + "loss": 0.016, + "rewards/chosen": 4.5501056671142575, + "rewards/margins": 12.135974502563476, + "rewards/rejected": -7.585868835449219, + "step": 8052 + }, + { + "epoch": 0.7357697578803106, + "grad_norm": 1.25, + "kl": 0.0, + "learning_rate": 1.6326782477234377e-06, + "logits/chosen": 974275072.0, + "logits/rejected": 907510272.0, + "logps/chosen": -233.47269694010416, + "logps/rejected": -666.61240234375, + "loss": 0.0061, + "rewards/chosen": 4.279625256856282, + "rewards/margins": 15.891914685567219, + "rewards/rejected": -11.612289428710938, + "step": 8053 + }, + { + "epoch": 0.7358611238008222, + "grad_norm": 1.3515625, + "kl": 0.0, + "learning_rate": 1.6316155369900833e-06, + "logits/chosen": 550360874.6666666, + "logits/rejected": 732259174.4, + "logps/chosen": -206.88094075520834, + "logps/rejected": -500.5212890625, + "loss": 0.0068, + "rewards/chosen": 4.737510045369466, + "rewards/margins": 14.001684697469074, + "rewards/rejected": -9.264174652099609, + "step": 8054 + }, + { + "epoch": 0.735952489721334, + "grad_norm": 3.328125, + "kl": 0.0, + "learning_rate": 1.6305531047909156e-06, + "logits/chosen": 464630485.3333333, + "logits/rejected": 438262579.2, + "logps/chosen": -481.4733072916667, + "logps/rejected": -522.023388671875, + "loss": 0.0157, + "rewards/chosen": 3.400570551554362, + "rewards/margins": 13.182894007364908, + "rewards/rejected": -9.782323455810547, + "step": 8055 + }, + { + "epoch": 0.7360438556418456, + "grad_norm": 2.296875, + "kl": 0.0, + "learning_rate": 1.6294909512137874e-06, + "logits/chosen": 704955392.0, + "logits/rejected": 598164138.6666666, + "logps/chosen": -360.54267578125, + "logps/rejected": -564.2432047526041, + "loss": 0.0141, + "rewards/chosen": 3.967765045166016, + "rewards/margins": 13.64872538248698, + "rewards/rejected": -9.680960337320963, + "step": 8056 + }, + { + "epoch": 0.7361352215623572, + "grad_norm": 1.8203125, + "kl": 0.0, + "learning_rate": 1.6284290763465277e-06, + "logits/chosen": 565864960.0, + "logits/rejected": 1069647462.4, + "logps/chosen": -371.1483968098958, + "logps/rejected": -517.517138671875, + "loss": 0.0108, + "rewards/chosen": 3.5417404174804688, + "rewards/margins": 13.507284545898438, + "rewards/rejected": -9.965544128417969, + "step": 8057 + }, + { + "epoch": 0.7362265874828688, + "grad_norm": 2.46875, + "kl": 0.0, + "learning_rate": 1.6273674802769468e-06, + "logits/chosen": 285169612.8, + "logits/rejected": 448353024.0, + "logps/chosen": -198.0585693359375, + "logps/rejected": -426.448974609375, + "loss": 0.0193, + "rewards/chosen": 4.203224945068359, + "rewards/margins": 11.58869883219401, + "rewards/rejected": -7.385473887125651, + "step": 8058 + }, + { + "epoch": 0.7363179534033806, + "grad_norm": 2.859375, + "kl": 0.0, + "learning_rate": 1.6263061630928268e-06, + "logits/chosen": 517521472.0, + "logits/rejected": 872481408.0, + "logps/chosen": -318.10687255859375, + "logps/rejected": -344.15948486328125, + "loss": 0.0166, + "rewards/chosen": 3.740227222442627, + "rewards/margins": 11.640740871429443, + "rewards/rejected": -7.900513648986816, + "step": 8059 + }, + { + "epoch": 0.7364093193238922, + "grad_norm": 2.125, + "kl": 0.0, + "learning_rate": 1.625245124881929e-06, + "logits/chosen": 561339084.8, + "logits/rejected": 576302208.0, + "logps/chosen": -316.7751953125, + "logps/rejected": -575.3022867838541, + "loss": 0.0131, + "rewards/chosen": 4.174147033691407, + "rewards/margins": 13.10737902323405, + "rewards/rejected": -8.933231989542643, + "step": 8060 + }, + { + "epoch": 0.7365006852444038, + "grad_norm": 2.3125, + "kl": 0.0, + "learning_rate": 1.6241843657319923e-06, + "logits/chosen": 633593753.6, + "logits/rejected": 232063317.33333334, + "logps/chosen": -234.1913330078125, + "logps/rejected": -539.1033528645834, + "loss": 0.0238, + "rewards/chosen": 3.6466129302978514, + "rewards/margins": 14.509343592325845, + "rewards/rejected": -10.862730662027994, + "step": 8061 + }, + { + "epoch": 0.7365920511649154, + "grad_norm": 22.125, + "kl": 0.0, + "learning_rate": 1.6231238857307284e-06, + "logits/chosen": 476533333.3333333, + "logits/rejected": 435245670.4, + "logps/chosen": -351.3212076822917, + "logps/rejected": -368.1835205078125, + "loss": 0.092, + "rewards/chosen": 3.9260082244873047, + "rewards/margins": 9.723625946044923, + "rewards/rejected": -5.797617721557617, + "step": 8062 + }, + { + "epoch": 0.7366834170854272, + "grad_norm": 2.5, + "kl": 0.0, + "learning_rate": 1.6220636849658338e-06, + "logits/chosen": 448413184.0, + "logits/rejected": 618431658.6666666, + "logps/chosen": -239.20272827148438, + "logps/rejected": -397.9803466796875, + "loss": 0.0131, + "rewards/chosen": 3.0658349990844727, + "rewards/margins": 11.79035727183024, + "rewards/rejected": -8.724522272745768, + "step": 8063 + }, + { + "epoch": 0.7367747830059388, + "grad_norm": 4.90625, + "kl": 0.0, + "learning_rate": 1.6210037635249754e-06, + "logits/chosen": 231517081.6, + "logits/rejected": 186466816.0, + "logps/chosen": -261.445166015625, + "logps/rejected": -291.0471598307292, + "loss": 0.0221, + "rewards/chosen": 4.036552810668946, + "rewards/margins": 12.747084681193034, + "rewards/rejected": -8.710531870524088, + "step": 8064 + }, + { + "epoch": 0.7368661489264504, + "grad_norm": 2.28125, + "kl": 0.0, + "learning_rate": 1.6199441214957985e-06, + "logits/chosen": 841930496.0, + "logits/rejected": 787490389.3333334, + "logps/chosen": -279.0534912109375, + "logps/rejected": -644.502685546875, + "loss": 0.0142, + "rewards/chosen": 4.230786895751953, + "rewards/margins": 13.505014038085937, + "rewards/rejected": -9.274227142333984, + "step": 8065 + }, + { + "epoch": 0.736957514846962, + "grad_norm": 2.453125, + "kl": 0.0, + "learning_rate": 1.618884758965924e-06, + "logits/chosen": 612051157.3333334, + "logits/rejected": 683284032.0, + "logps/chosen": -314.35056559244794, + "logps/rejected": -360.9402160644531, + "loss": 0.0229, + "rewards/chosen": 3.5687878926595054, + "rewards/margins": 12.467765172322592, + "rewards/rejected": -8.898977279663086, + "step": 8066 + }, + { + "epoch": 0.7370488807674738, + "grad_norm": 2.703125, + "kl": 0.0, + "learning_rate": 1.617825676022955e-06, + "logits/chosen": 417932288.0, + "logits/rejected": 280643200.0, + "logps/chosen": -223.76510184151786, + "logps/rejected": -295.4514465332031, + "loss": 0.0233, + "rewards/chosen": 4.024283272879464, + "rewards/margins": 12.492496354239329, + "rewards/rejected": -8.468213081359863, + "step": 8067 + }, + { + "epoch": 0.7371402466879854, + "grad_norm": 2.171875, + "kl": 0.0, + "learning_rate": 1.616766872754467e-06, + "logits/chosen": 330345600.0, + "logits/rejected": 313111398.4, + "logps/chosen": -212.21126302083334, + "logps/rejected": -412.14775390625, + "loss": 0.011, + "rewards/chosen": 3.850893020629883, + "rewards/margins": 12.723971176147462, + "rewards/rejected": -8.873078155517579, + "step": 8068 + }, + { + "epoch": 0.737231612608497, + "grad_norm": 3.296875, + "kl": 0.0, + "learning_rate": 1.6157083492480101e-06, + "logits/chosen": 420421120.0, + "logits/rejected": 716700876.8, + "logps/chosen": -295.03387451171875, + "logps/rejected": -562.8009765625, + "loss": 0.0149, + "rewards/chosen": 4.053420066833496, + "rewards/margins": 13.293314933776855, + "rewards/rejected": -9.23989486694336, + "step": 8069 + }, + { + "epoch": 0.7373229785290086, + "grad_norm": 2.765625, + "kl": 0.0, + "learning_rate": 1.6146501055911173e-06, + "logits/chosen": 332471040.0, + "logits/rejected": 500700576.0, + "logps/chosen": -205.17759704589844, + "logps/rejected": -611.3945922851562, + "loss": 0.0196, + "rewards/chosen": 3.691112995147705, + "rewards/margins": 15.14721155166626, + "rewards/rejected": -11.456098556518555, + "step": 8070 + }, + { + "epoch": 0.7374143444495204, + "grad_norm": 4.34375, + "kl": 0.0, + "learning_rate": 1.6135921418712959e-06, + "logits/chosen": 402038246.4, + "logits/rejected": 323872768.0, + "logps/chosen": -273.36533203125, + "logps/rejected": -334.80133056640625, + "loss": 0.034, + "rewards/chosen": 2.9406105041503907, + "rewards/margins": 10.808315531412761, + "rewards/rejected": -7.86770502726237, + "step": 8071 + }, + { + "epoch": 0.737505710370032, + "grad_norm": 0.77734375, + "kl": 0.0, + "learning_rate": 1.6125344581760277e-06, + "logits/chosen": 1133284096.0, + "logits/rejected": 548353499.4285715, + "logps/chosen": -366.00592041015625, + "logps/rejected": -505.55311802455356, + "loss": 0.0027, + "rewards/chosen": 3.9341461658477783, + "rewards/margins": 12.887328386306763, + "rewards/rejected": -8.953182220458984, + "step": 8072 + }, + { + "epoch": 0.7375970762905436, + "grad_norm": 6.9375, + "kl": 0.0, + "learning_rate": 1.6114770545927743e-06, + "logits/chosen": 643950976.0, + "logits/rejected": 373907584.0, + "logps/chosen": -449.10968017578125, + "logps/rejected": -433.02593994140625, + "loss": 0.0285, + "rewards/chosen": 3.71244478225708, + "rewards/margins": 14.197147846221924, + "rewards/rejected": -10.484703063964844, + "step": 8073 + }, + { + "epoch": 0.7376884422110552, + "grad_norm": 1.5, + "kl": 0.0, + "learning_rate": 1.610419931208972e-06, + "logits/chosen": 440688448.0, + "logits/rejected": 456862240.0, + "logps/chosen": -271.21527099609375, + "logps/rejected": -737.5526733398438, + "loss": 0.0088, + "rewards/chosen": 4.325989723205566, + "rewards/margins": 17.17934799194336, + "rewards/rejected": -12.853358268737793, + "step": 8074 + }, + { + "epoch": 0.737779808131567, + "grad_norm": 2.5625, + "kl": 0.0, + "learning_rate": 1.609363088112037e-06, + "logits/chosen": 427884467.2, + "logits/rejected": 360184277.3333333, + "logps/chosen": -346.62109375, + "logps/rejected": -447.6495768229167, + "loss": 0.0141, + "rewards/chosen": 4.12864990234375, + "rewards/margins": 13.762967936197917, + "rewards/rejected": -9.634318033854166, + "step": 8075 + }, + { + "epoch": 0.7378711740520786, + "grad_norm": 3.578125, + "kl": 0.0, + "learning_rate": 1.6083065253893605e-06, + "logits/chosen": 640265420.8, + "logits/rejected": 662420138.6666666, + "logps/chosen": -336.524365234375, + "logps/rejected": -657.150634765625, + "loss": 0.0277, + "rewards/chosen": 3.300840377807617, + "rewards/margins": 14.328001022338867, + "rewards/rejected": -11.02716064453125, + "step": 8076 + }, + { + "epoch": 0.7379625399725902, + "grad_norm": 1.8125, + "kl": 0.0, + "learning_rate": 1.6072502431283093e-06, + "logits/chosen": 702757120.0, + "logits/rejected": 652616661.3333334, + "logps/chosen": -267.97210693359375, + "logps/rejected": -510.3044026692708, + "loss": 0.0101, + "rewards/chosen": 3.196977138519287, + "rewards/margins": 13.472596009572348, + "rewards/rejected": -10.27561887105306, + "step": 8077 + }, + { + "epoch": 0.7380539058931018, + "grad_norm": 0.5390625, + "kl": 0.0, + "learning_rate": 1.6061942414162268e-06, + "logits/chosen": 181022800.0, + "logits/rejected": 290854442.6666667, + "logps/chosen": -165.89707946777344, + "logps/rejected": -446.9388020833333, + "loss": 0.0024, + "rewards/chosen": 4.8031182289123535, + "rewards/margins": 14.801271279652914, + "rewards/rejected": -9.99815305074056, + "step": 8078 + }, + { + "epoch": 0.7381452718136136, + "grad_norm": 8.375, + "kl": 0.0, + "learning_rate": 1.6051385203404379e-06, + "logits/chosen": 431426090.6666667, + "logits/rejected": 428630502.4, + "logps/chosen": -605.857421875, + "logps/rejected": -394.6695068359375, + "loss": 0.1114, + "rewards/chosen": 3.430251121520996, + "rewards/margins": 9.226658058166503, + "rewards/rejected": -5.796406936645508, + "step": 8079 + }, + { + "epoch": 0.7382366377341252, + "grad_norm": 5.375, + "kl": 0.0, + "learning_rate": 1.604083079988239e-06, + "logits/chosen": 425276123.4285714, + "logits/rejected": 348424672.0, + "logps/chosen": -341.82718331473217, + "logps/rejected": -588.096923828125, + "loss": 0.0211, + "rewards/chosen": 4.865047999790737, + "rewards/margins": 16.243872233799525, + "rewards/rejected": -11.378824234008789, + "step": 8080 + }, + { + "epoch": 0.7383280036546368, + "grad_norm": 1.2265625, + "kl": 0.0, + "learning_rate": 1.6030279204469056e-06, + "logits/chosen": 314344853.3333333, + "logits/rejected": 389629696.0, + "logps/chosen": -295.16412353515625, + "logps/rejected": -493.67919921875, + "loss": 0.005, + "rewards/chosen": 5.180685679117839, + "rewards/margins": 14.84359308878581, + "rewards/rejected": -9.66290740966797, + "step": 8081 + }, + { + "epoch": 0.7384193695751484, + "grad_norm": 2.265625, + "kl": 0.0, + "learning_rate": 1.6019730418036899e-06, + "logits/chosen": 979216576.0, + "logits/rejected": 583501696.0, + "logps/chosen": -431.3493347167969, + "logps/rejected": -554.705810546875, + "loss": 0.0132, + "rewards/chosen": 3.74582839012146, + "rewards/margins": 13.445922613143921, + "rewards/rejected": -9.700094223022461, + "step": 8082 + }, + { + "epoch": 0.7385107354956602, + "grad_norm": 1.8203125, + "kl": 0.0, + "learning_rate": 1.6009184441458186e-06, + "logits/chosen": 390140202.6666667, + "logits/rejected": 555055308.8, + "logps/chosen": -177.43636067708334, + "logps/rejected": -546.85439453125, + "loss": 0.0166, + "rewards/chosen": 3.372956911722819, + "rewards/margins": 13.632927004496256, + "rewards/rejected": -10.259970092773438, + "step": 8083 + }, + { + "epoch": 0.7386021014161718, + "grad_norm": 13.1875, + "kl": 0.0, + "learning_rate": 1.5998641275605003e-06, + "logits/chosen": 520619562.6666667, + "logits/rejected": 407874688.0, + "logps/chosen": -370.4920654296875, + "logps/rejected": -314.302734375, + "loss": 0.134, + "rewards/chosen": 2.6627405484517417, + "rewards/margins": 9.91123374303182, + "rewards/rejected": -7.248493194580078, + "step": 8084 + }, + { + "epoch": 0.7386934673366834, + "grad_norm": 3.703125, + "kl": 0.0, + "learning_rate": 1.5988100921349154e-06, + "logits/chosen": 453362986.6666667, + "logits/rejected": 1314225920.0, + "logps/chosen": -243.3773193359375, + "logps/rejected": -520.8418579101562, + "loss": 0.0233, + "rewards/chosen": 3.8120644887288413, + "rewards/margins": 13.156380971272787, + "rewards/rejected": -9.344316482543945, + "step": 8085 + }, + { + "epoch": 0.738784833257195, + "grad_norm": 3.6875, + "kl": 0.0, + "learning_rate": 1.5977563379562234e-06, + "logits/chosen": 387314336.0, + "logits/rejected": 241428256.0, + "logps/chosen": -334.7589111328125, + "logps/rejected": -464.3271789550781, + "loss": 0.012, + "rewards/chosen": 4.973904609680176, + "rewards/margins": 14.9348726272583, + "rewards/rejected": -9.960968017578125, + "step": 8086 + }, + { + "epoch": 0.7388761991777067, + "grad_norm": 2.484375, + "kl": 0.0, + "learning_rate": 1.5967028651115579e-06, + "logits/chosen": 690356736.0, + "logits/rejected": 816653414.4, + "logps/chosen": -442.1684977213542, + "logps/rejected": -566.81923828125, + "loss": 0.0115, + "rewards/chosen": 3.8161001205444336, + "rewards/margins": 14.390439414978028, + "rewards/rejected": -10.574339294433594, + "step": 8087 + }, + { + "epoch": 0.7389675650982184, + "grad_norm": 4.40625, + "kl": 0.0, + "learning_rate": 1.5956496736880357e-06, + "logits/chosen": 1289944678.4, + "logits/rejected": 580318848.0, + "logps/chosen": -598.825, + "logps/rejected": -466.7779947916667, + "loss": 0.0192, + "rewards/chosen": 4.326480102539063, + "rewards/margins": 14.286727142333984, + "rewards/rejected": -9.960247039794922, + "step": 8088 + }, + { + "epoch": 0.73905893101873, + "grad_norm": 3.65625, + "kl": 0.0, + "learning_rate": 1.5945967637727422e-06, + "logits/chosen": 538189696.0, + "logits/rejected": 205649472.0, + "logps/chosen": -366.5495910644531, + "logps/rejected": -314.80230712890625, + "loss": 0.0295, + "rewards/chosen": 2.97467041015625, + "rewards/margins": 10.71985673904419, + "rewards/rejected": -7.7451863288879395, + "step": 8089 + }, + { + "epoch": 0.7391502969392416, + "grad_norm": 1.578125, + "kl": 0.0, + "learning_rate": 1.5935441354527452e-06, + "logits/chosen": 389015424.0, + "logits/rejected": 323920960.0, + "logps/chosen": -250.91551208496094, + "logps/rejected": -519.9237060546875, + "loss": 0.0128, + "rewards/chosen": 3.7969069480895996, + "rewards/margins": 13.067609310150146, + "rewards/rejected": -9.270702362060547, + "step": 8090 + }, + { + "epoch": 0.7392416628597533, + "grad_norm": 1.09375, + "kl": 0.0, + "learning_rate": 1.5924917888150855e-06, + "logits/chosen": 760708608.0, + "logits/rejected": 342099050.6666667, + "logps/chosen": -360.1110534667969, + "logps/rejected": -411.807373046875, + "loss": 0.0049, + "rewards/chosen": 4.01825475692749, + "rewards/margins": 13.593016465504965, + "rewards/rejected": -9.574761708577475, + "step": 8091 + }, + { + "epoch": 0.739333028780265, + "grad_norm": 2.0625, + "kl": 0.0, + "learning_rate": 1.5914397239467843e-06, + "logits/chosen": 280330400.0, + "logits/rejected": 393616576.0, + "logps/chosen": -234.2755126953125, + "logps/rejected": -605.2492065429688, + "loss": 0.0152, + "rewards/chosen": 3.795189380645752, + "rewards/margins": 15.235493183135986, + "rewards/rejected": -11.440303802490234, + "step": 8092 + }, + { + "epoch": 0.7394243947007766, + "grad_norm": 1.1875, + "kl": 0.0, + "learning_rate": 1.5903879409348378e-06, + "logits/chosen": 586401792.0, + "logits/rejected": 473462528.0, + "logps/chosen": -466.679248046875, + "logps/rejected": -596.5396728515625, + "loss": 0.0073, + "rewards/chosen": 4.5900310516357425, + "rewards/margins": 13.964048131306967, + "rewards/rejected": -9.374017079671225, + "step": 8093 + }, + { + "epoch": 0.7395157606212882, + "grad_norm": 2.515625, + "kl": 0.0, + "learning_rate": 1.5893364398662175e-06, + "logits/chosen": 370370944.0, + "logits/rejected": 541480192.0, + "logps/chosen": -227.35134887695312, + "logps/rejected": -714.3267822265625, + "loss": 0.0247, + "rewards/chosen": 3.060131072998047, + "rewards/margins": 13.243767738342285, + "rewards/rejected": -10.183636665344238, + "step": 8094 + }, + { + "epoch": 0.7396071265417999, + "grad_norm": 6.5625, + "kl": 0.0, + "learning_rate": 1.5882852208278727e-06, + "logits/chosen": 532033493.3333333, + "logits/rejected": 213497280.0, + "logps/chosen": -341.8354085286458, + "logps/rejected": -144.42794799804688, + "loss": 0.1595, + "rewards/chosen": 2.497568448384603, + "rewards/margins": 8.703100045522055, + "rewards/rejected": -6.205531597137451, + "step": 8095 + }, + { + "epoch": 0.7396984924623116, + "grad_norm": 2.953125, + "kl": 0.0, + "learning_rate": 1.5872342839067305e-06, + "logits/chosen": 764560676.5714285, + "logits/rejected": 716628480.0, + "logps/chosen": -263.13466099330356, + "logps/rejected": -327.1569519042969, + "loss": 0.0315, + "rewards/chosen": 3.85909298488072, + "rewards/margins": 12.56972530909947, + "rewards/rejected": -8.71063232421875, + "step": 8096 + }, + { + "epoch": 0.7397898583828232, + "grad_norm": 3.109375, + "kl": 0.0, + "learning_rate": 1.5861836291896926e-06, + "logits/chosen": 752007552.0, + "logits/rejected": 548379520.0, + "logps/chosen": -410.9685974121094, + "logps/rejected": -390.5859375, + "loss": 0.0157, + "rewards/chosen": 2.7134315967559814, + "rewards/margins": 12.612228314081827, + "rewards/rejected": -9.898796717325846, + "step": 8097 + }, + { + "epoch": 0.7398812243033348, + "grad_norm": 3.09375, + "kl": 0.0, + "learning_rate": 1.585133256763639e-06, + "logits/chosen": 434683520.0, + "logits/rejected": 256161696.0, + "logps/chosen": -248.99600219726562, + "logps/rejected": -322.225341796875, + "loss": 0.0159, + "rewards/chosen": 4.432524681091309, + "rewards/margins": 13.80590534210205, + "rewards/rejected": -9.373380661010742, + "step": 8098 + }, + { + "epoch": 0.7399725902238465, + "grad_norm": 2.828125, + "kl": 0.0, + "learning_rate": 1.5840831667154245e-06, + "logits/chosen": 1060208576.0, + "logits/rejected": 573981056.0, + "logps/chosen": -294.918212890625, + "logps/rejected": -469.39605712890625, + "loss": 0.0217, + "rewards/chosen": 3.365196943283081, + "rewards/margins": 12.938713788986206, + "rewards/rejected": -9.573516845703125, + "step": 8099 + }, + { + "epoch": 0.7400639561443582, + "grad_norm": 15.125, + "kl": 0.0, + "learning_rate": 1.5830333591318836e-06, + "logits/chosen": 512941344.0, + "logits/rejected": 552223488.0, + "logps/chosen": -340.0588073730469, + "logps/rejected": -438.0722351074219, + "loss": 0.0246, + "rewards/chosen": 3.7963075637817383, + "rewards/margins": 11.320866584777832, + "rewards/rejected": -7.524559020996094, + "step": 8100 + }, + { + "epoch": 0.7401553220648698, + "grad_norm": 1.7265625, + "kl": 0.0, + "learning_rate": 1.5819838340998257e-06, + "logits/chosen": 665934336.0, + "logits/rejected": 618159616.0, + "logps/chosen": -359.849609375, + "logps/rejected": -491.39677734375, + "loss": 0.0126, + "rewards/chosen": 3.4693355560302734, + "rewards/margins": 12.055751419067382, + "rewards/rejected": -8.586415863037109, + "step": 8101 + }, + { + "epoch": 0.7402466879853814, + "grad_norm": 2.015625, + "kl": 0.0, + "learning_rate": 1.5809345917060358e-06, + "logits/chosen": 864447616.0, + "logits/rejected": 693065216.0, + "logps/chosen": -337.45953369140625, + "logps/rejected": -530.1693725585938, + "loss": 0.0106, + "rewards/chosen": 4.170950889587402, + "rewards/margins": 13.21997356414795, + "rewards/rejected": -9.049022674560547, + "step": 8102 + }, + { + "epoch": 0.7403380539058931, + "grad_norm": 2.296875, + "kl": 0.0, + "learning_rate": 1.579885632037277e-06, + "logits/chosen": 571464448.0, + "logits/rejected": 570009216.0, + "logps/chosen": -295.49761962890625, + "logps/rejected": -576.029296875, + "loss": 0.0135, + "rewards/chosen": 4.1427764892578125, + "rewards/margins": 12.945180892944336, + "rewards/rejected": -8.802404403686523, + "step": 8103 + }, + { + "epoch": 0.7404294198264048, + "grad_norm": 2.859375, + "kl": 0.0, + "learning_rate": 1.5788369551802869e-06, + "logits/chosen": 740240640.0, + "logits/rejected": 603381657.6, + "logps/chosen": -481.2626953125, + "logps/rejected": -460.01572265625, + "loss": 0.0117, + "rewards/chosen": 3.592653274536133, + "rewards/margins": 11.957866287231445, + "rewards/rejected": -8.365213012695312, + "step": 8104 + }, + { + "epoch": 0.7405207857469164, + "grad_norm": 3.953125, + "kl": 0.0, + "learning_rate": 1.5777885612217842e-06, + "logits/chosen": 548486357.3333334, + "logits/rejected": 180780224.0, + "logps/chosen": -390.5135904947917, + "logps/rejected": -458.32965087890625, + "loss": 0.0262, + "rewards/chosen": 3.897496223449707, + "rewards/margins": 16.87049388885498, + "rewards/rejected": -12.972997665405273, + "step": 8105 + }, + { + "epoch": 0.740612151667428, + "grad_norm": 0.64453125, + "kl": 0.0, + "learning_rate": 1.5767404502484596e-06, + "logits/chosen": 678477184.0, + "logits/rejected": 885598549.3333334, + "logps/chosen": -337.884521484375, + "logps/rejected": -611.8511555989584, + "loss": 0.0039, + "rewards/chosen": 4.5762763023376465, + "rewards/margins": 13.735988775889078, + "rewards/rejected": -9.159712473551432, + "step": 8106 + }, + { + "epoch": 0.7407035175879397, + "grad_norm": 1.953125, + "kl": 0.0, + "learning_rate": 1.5756926223469832e-06, + "logits/chosen": 837099648.0, + "logits/rejected": 450078016.0, + "logps/chosen": -337.2105712890625, + "logps/rejected": -417.6631774902344, + "loss": 0.0133, + "rewards/chosen": 3.817798614501953, + "rewards/margins": 13.844566345214844, + "rewards/rejected": -10.02676773071289, + "step": 8107 + }, + { + "epoch": 0.7407948835084514, + "grad_norm": 2.125, + "kl": 0.0, + "learning_rate": 1.5746450776039985e-06, + "logits/chosen": 641547980.8, + "logits/rejected": 843435264.0, + "logps/chosen": -221.57392578125, + "logps/rejected": -638.2681477864584, + "loss": 0.0153, + "rewards/chosen": 4.043186187744141, + "rewards/margins": 14.405089060465494, + "rewards/rejected": -10.361902872721354, + "step": 8108 + }, + { + "epoch": 0.740886249428963, + "grad_norm": 6.15625, + "kl": 0.0, + "learning_rate": 1.5735978161061304e-06, + "logits/chosen": 658712448.0, + "logits/rejected": 341058688.0, + "logps/chosen": -248.55596923828125, + "logps/rejected": -372.83612060546875, + "loss": 0.1312, + "rewards/chosen": 2.507876396179199, + "rewards/margins": 12.178573608398438, + "rewards/rejected": -9.670697212219238, + "step": 8109 + }, + { + "epoch": 0.7409776153494746, + "grad_norm": 2.796875, + "kl": 0.0, + "learning_rate": 1.5725508379399767e-06, + "logits/chosen": 1045014869.3333334, + "logits/rejected": 781488384.0, + "logps/chosen": -325.25433349609375, + "logps/rejected": -609.2198486328125, + "loss": 0.0208, + "rewards/chosen": 3.6368306477864585, + "rewards/margins": 12.801674207051596, + "rewards/rejected": -9.164843559265137, + "step": 8110 + }, + { + "epoch": 0.7410689812699863, + "grad_norm": 3.921875, + "kl": 0.0, + "learning_rate": 1.5715041431921124e-06, + "logits/chosen": 794697011.2, + "logits/rejected": 558809770.6666666, + "logps/chosen": -377.6364990234375, + "logps/rejected": -653.863037109375, + "loss": 0.0244, + "rewards/chosen": 3.5770225524902344, + "rewards/margins": 12.88371721903483, + "rewards/rejected": -9.306694666544596, + "step": 8111 + }, + { + "epoch": 0.741160347190498, + "grad_norm": 1.1640625, + "kl": 0.0, + "learning_rate": 1.5704577319490887e-06, + "logits/chosen": 599266816.0, + "logits/rejected": 414196266.6666667, + "logps/chosen": -242.3685302734375, + "logps/rejected": -371.0991617838542, + "loss": 0.0048, + "rewards/chosen": 4.770664215087891, + "rewards/margins": 13.87733014424642, + "rewards/rejected": -9.10666592915853, + "step": 8112 + }, + { + "epoch": 0.7412517131110096, + "grad_norm": 2.96875, + "kl": 0.0, + "learning_rate": 1.5694116042974366e-06, + "logits/chosen": 281372128.0, + "logits/rejected": 598352768.0, + "logps/chosen": -260.8265380859375, + "logps/rejected": -517.3759155273438, + "loss": 0.0222, + "rewards/chosen": 3.481092929840088, + "rewards/margins": 12.932426929473877, + "rewards/rejected": -9.451333999633789, + "step": 8113 + }, + { + "epoch": 0.7413430790315212, + "grad_norm": 13.4375, + "kl": 0.0, + "learning_rate": 1.5683657603236595e-06, + "logits/chosen": 679913557.3333334, + "logits/rejected": 896587200.0, + "logps/chosen": -304.8578287760417, + "logps/rejected": -165.55877685546875, + "loss": 0.0264, + "rewards/chosen": 3.5435702006022134, + "rewards/margins": 10.179863611857096, + "rewards/rejected": -6.636293411254883, + "step": 8114 + }, + { + "epoch": 0.7414344449520329, + "grad_norm": 4.40625, + "kl": 0.0, + "learning_rate": 1.5673202001142396e-06, + "logits/chosen": 385863884.8, + "logits/rejected": 436987690.6666667, + "logps/chosen": -363.7744873046875, + "logps/rejected": -468.9114176432292, + "loss": 0.0281, + "rewards/chosen": 3.724134063720703, + "rewards/margins": 14.090105946858724, + "rewards/rejected": -10.365971883138021, + "step": 8115 + }, + { + "epoch": 0.7415258108725445, + "grad_norm": 1.703125, + "kl": 0.0, + "learning_rate": 1.566274923755634e-06, + "logits/chosen": 391393088.0, + "logits/rejected": 674795904.0, + "logps/chosen": -210.93692016601562, + "logps/rejected": -511.81103515625, + "loss": 0.0135, + "rewards/chosen": 4.430985927581787, + "rewards/margins": 14.849117755889893, + "rewards/rejected": -10.418131828308105, + "step": 8116 + }, + { + "epoch": 0.7416171767930562, + "grad_norm": 4.1875, + "kl": 0.0, + "learning_rate": 1.5652299313342772e-06, + "logits/chosen": 609200960.0, + "logits/rejected": 395487200.0, + "logps/chosen": -340.5806884765625, + "logps/rejected": -583.78466796875, + "loss": 0.0333, + "rewards/chosen": 2.7378525733947754, + "rewards/margins": 10.439011573791504, + "rewards/rejected": -7.7011590003967285, + "step": 8117 + }, + { + "epoch": 0.7417085427135678, + "grad_norm": 2.5, + "kl": 0.0, + "learning_rate": 1.5641852229365823e-06, + "logits/chosen": 475504512.0, + "logits/rejected": 378045664.0, + "logps/chosen": -370.83917236328125, + "logps/rejected": -606.88720703125, + "loss": 0.0143, + "rewards/chosen": 3.8409857749938965, + "rewards/margins": 13.137813091278076, + "rewards/rejected": -9.29682731628418, + "step": 8118 + }, + { + "epoch": 0.7417999086340795, + "grad_norm": 45.75, + "kl": 0.0, + "learning_rate": 1.5631407986489362e-06, + "logits/chosen": 222636544.0, + "logits/rejected": 394189376.0, + "logps/chosen": -179.7728729248047, + "logps/rejected": -479.24755859375, + "loss": 0.044, + "rewards/chosen": 4.482808589935303, + "rewards/margins": 14.514390468597412, + "rewards/rejected": -10.03158187866211, + "step": 8119 + }, + { + "epoch": 0.7418912745545911, + "grad_norm": 4.75, + "kl": 0.0, + "learning_rate": 1.562096658557703e-06, + "logits/chosen": 369784224.0, + "logits/rejected": 176187296.0, + "logps/chosen": -306.27911376953125, + "logps/rejected": -260.16876220703125, + "loss": 0.0232, + "rewards/chosen": 3.6605424880981445, + "rewards/margins": 11.972291946411133, + "rewards/rejected": -8.311749458312988, + "step": 8120 + }, + { + "epoch": 0.7419826404751028, + "grad_norm": 2.890625, + "kl": 0.0, + "learning_rate": 1.561052802749221e-06, + "logits/chosen": 681334784.0, + "logits/rejected": 867260928.0, + "logps/chosen": -473.6207682291667, + "logps/rejected": -478.8603515625, + "loss": 0.0169, + "rewards/chosen": 3.277815500895182, + "rewards/margins": 12.823956553141276, + "rewards/rejected": -9.546141052246094, + "step": 8121 + }, + { + "epoch": 0.7420740063956144, + "grad_norm": 4.40625, + "kl": 0.0, + "learning_rate": 1.5600092313098135e-06, + "logits/chosen": 619892736.0, + "logits/rejected": 947631168.0, + "logps/chosen": -276.07818603515625, + "logps/rejected": -628.3909912109375, + "loss": 0.0371, + "rewards/chosen": 2.55981707572937, + "rewards/margins": 13.389764547348022, + "rewards/rejected": -10.829947471618652, + "step": 8122 + }, + { + "epoch": 0.7421653723161261, + "grad_norm": 1.015625, + "kl": 0.0, + "learning_rate": 1.5589659443257682e-06, + "logits/chosen": 681884928.0, + "logits/rejected": 658830643.2, + "logps/chosen": -424.97265625, + "logps/rejected": -639.94287109375, + "loss": 0.0048, + "rewards/chosen": 4.420032501220703, + "rewards/margins": 14.280403900146485, + "rewards/rejected": -9.860371398925782, + "step": 8123 + }, + { + "epoch": 0.7422567382366377, + "grad_norm": 1.5546875, + "kl": 0.0, + "learning_rate": 1.557922941883358e-06, + "logits/chosen": 339029184.0, + "logits/rejected": 470059878.4, + "logps/chosen": -250.41715494791666, + "logps/rejected": -489.1265625, + "loss": 0.0089, + "rewards/chosen": 4.174859682718913, + "rewards/margins": 13.439230410257977, + "rewards/rejected": -9.264370727539063, + "step": 8124 + }, + { + "epoch": 0.7423481041571494, + "grad_norm": 3.078125, + "kl": 0.0, + "learning_rate": 1.5568802240688268e-06, + "logits/chosen": 1171282816.0, + "logits/rejected": 610043904.0, + "logps/chosen": -319.64129638671875, + "logps/rejected": -439.4647216796875, + "loss": 0.0211, + "rewards/chosen": 3.1367344856262207, + "rewards/margins": 12.586403369903564, + "rewards/rejected": -9.449668884277344, + "step": 8125 + }, + { + "epoch": 0.742439470077661, + "grad_norm": 3.0625, + "kl": 0.0, + "learning_rate": 1.5558377909684018e-06, + "logits/chosen": 621648384.0, + "logits/rejected": 561270681.6, + "logps/chosen": -304.2667643229167, + "logps/rejected": -730.23701171875, + "loss": 0.0099, + "rewards/chosen": 4.231939633687337, + "rewards/margins": 13.018906339009604, + "rewards/rejected": -8.786966705322266, + "step": 8126 + }, + { + "epoch": 0.7425308359981727, + "grad_norm": 2.546875, + "kl": 0.0, + "learning_rate": 1.5547956426682808e-06, + "logits/chosen": 452350720.0, + "logits/rejected": 412398489.6, + "logps/chosen": -275.1103108723958, + "logps/rejected": -437.0474609375, + "loss": 0.0084, + "rewards/chosen": 4.257261276245117, + "rewards/margins": 12.384802627563477, + "rewards/rejected": -8.12754135131836, + "step": 8127 + }, + { + "epoch": 0.7426222019186843, + "grad_norm": 2.734375, + "kl": 0.0, + "learning_rate": 1.5537537792546393e-06, + "logits/chosen": 702675712.0, + "logits/rejected": 414966400.0, + "logps/chosen": -426.8981526692708, + "logps/rejected": -643.19775390625, + "loss": 0.0199, + "rewards/chosen": 3.7872085571289062, + "rewards/margins": 15.532745361328125, + "rewards/rejected": -11.745536804199219, + "step": 8128 + }, + { + "epoch": 0.742713567839196, + "grad_norm": 2.609375, + "kl": 0.0, + "learning_rate": 1.5527122008136287e-06, + "logits/chosen": 507016480.0, + "logits/rejected": 430562112.0, + "logps/chosen": -211.62277221679688, + "logps/rejected": -442.6211853027344, + "loss": 0.017, + "rewards/chosen": 3.4667506217956543, + "rewards/margins": 12.773799419403076, + "rewards/rejected": -9.307048797607422, + "step": 8129 + }, + { + "epoch": 0.7428049337597076, + "grad_norm": 0.5, + "kl": 0.0, + "learning_rate": 1.5516709074313807e-06, + "logits/chosen": 352741120.0, + "logits/rejected": 397782698.6666667, + "logps/chosen": -380.0315856933594, + "logps/rejected": -444.6497395833333, + "loss": 0.0022, + "rewards/chosen": 4.848781108856201, + "rewards/margins": 14.519871552785238, + "rewards/rejected": -9.671090443929037, + "step": 8130 + }, + { + "epoch": 0.7428962996802193, + "grad_norm": 5.78125, + "kl": 0.0, + "learning_rate": 1.5506298991939995e-06, + "logits/chosen": 870066944.0, + "logits/rejected": 416258739.2, + "logps/chosen": -265.71287027994794, + "logps/rejected": -436.35537109375, + "loss": 0.0562, + "rewards/chosen": 2.112419764200846, + "rewards/margins": 10.880750147501628, + "rewards/rejected": -8.768330383300782, + "step": 8131 + }, + { + "epoch": 0.7429876656007309, + "grad_norm": 3.828125, + "kl": 0.0, + "learning_rate": 1.5495891761875658e-06, + "logits/chosen": 310320128.0, + "logits/rejected": 471077546.6666667, + "logps/chosen": -129.65472412109375, + "logps/rejected": -594.86865234375, + "loss": 0.0439, + "rewards/chosen": 3.0809030532836914, + "rewards/margins": 12.496339480082193, + "rewards/rejected": -9.415436426798502, + "step": 8132 + }, + { + "epoch": 0.7430790315212426, + "grad_norm": 0.765625, + "kl": 0.0, + "learning_rate": 1.5485487384981374e-06, + "logits/chosen": 587871957.3333334, + "logits/rejected": 616277401.6, + "logps/chosen": -444.310302734375, + "logps/rejected": -707.32236328125, + "loss": 0.0042, + "rewards/chosen": 4.578471819559733, + "rewards/margins": 14.528435198465985, + "rewards/rejected": -9.94996337890625, + "step": 8133 + }, + { + "epoch": 0.7431703974417542, + "grad_norm": 4.03125, + "kl": 0.0, + "learning_rate": 1.547508586211751e-06, + "logits/chosen": 437947818.6666667, + "logits/rejected": 396074240.0, + "logps/chosen": -212.0627237955729, + "logps/rejected": -454.037109375, + "loss": 0.0261, + "rewards/chosen": 2.6569201151529946, + "rewards/margins": 11.25852076212565, + "rewards/rejected": -8.601600646972656, + "step": 8134 + }, + { + "epoch": 0.7432617633622659, + "grad_norm": 2.609375, + "kl": 0.0, + "learning_rate": 1.5464687194144173e-06, + "logits/chosen": 646689024.0, + "logits/rejected": 456699904.0, + "logps/chosen": -526.6502075195312, + "logps/rejected": -577.45068359375, + "loss": 0.0137, + "rewards/chosen": 3.918684959411621, + "rewards/margins": 12.460047721862793, + "rewards/rejected": -8.541362762451172, + "step": 8135 + }, + { + "epoch": 0.7433531292827775, + "grad_norm": 2.375, + "kl": 0.0, + "learning_rate": 1.5454291381921222e-06, + "logits/chosen": 561941350.4, + "logits/rejected": 815529984.0, + "logps/chosen": -382.6620849609375, + "logps/rejected": -625.1238199869791, + "loss": 0.0148, + "rewards/chosen": 3.9085784912109376, + "rewards/margins": 16.573812103271486, + "rewards/rejected": -12.665233612060547, + "step": 8136 + }, + { + "epoch": 0.7434444952032891, + "grad_norm": 5.375, + "kl": 0.0, + "learning_rate": 1.5443898426308307e-06, + "logits/chosen": 874420032.0, + "logits/rejected": 785117824.0, + "logps/chosen": -397.1998291015625, + "logps/rejected": -568.9862670898438, + "loss": 0.0282, + "rewards/chosen": 2.9901833534240723, + "rewards/margins": 12.22473955154419, + "rewards/rejected": -9.234556198120117, + "step": 8137 + }, + { + "epoch": 0.7435358611238008, + "grad_norm": 1.875, + "kl": 0.0, + "learning_rate": 1.5433508328164797e-06, + "logits/chosen": 1293511168.0, + "logits/rejected": 686014976.0, + "logps/chosen": -519.1075846354166, + "logps/rejected": -608.5201171875, + "loss": 0.0066, + "rewards/chosen": 4.148708979288737, + "rewards/margins": 14.729554621378583, + "rewards/rejected": -10.580845642089844, + "step": 8138 + }, + { + "epoch": 0.7436272270443125, + "grad_norm": 4.03125, + "kl": 0.0, + "learning_rate": 1.5423121088349908e-06, + "logits/chosen": 567500864.0, + "logits/rejected": 592434816.0, + "logps/chosen": -234.6009979248047, + "logps/rejected": -567.4911499023438, + "loss": 0.0232, + "rewards/chosen": 3.6503207683563232, + "rewards/margins": 13.363381147384644, + "rewards/rejected": -9.71306037902832, + "step": 8139 + }, + { + "epoch": 0.7437185929648241, + "grad_norm": 5.03125, + "kl": 0.0, + "learning_rate": 1.5412736707722537e-06, + "logits/chosen": 308850112.0, + "logits/rejected": 724246528.0, + "logps/chosen": -243.73985290527344, + "logps/rejected": -535.51171875, + "loss": 0.0245, + "rewards/chosen": 2.3851804733276367, + "rewards/margins": 13.38379192352295, + "rewards/rejected": -10.998611450195312, + "step": 8140 + }, + { + "epoch": 0.7438099588853357, + "grad_norm": 5.59375, + "kl": 0.0, + "learning_rate": 1.5402355187141387e-06, + "logits/chosen": 486980522.6666667, + "logits/rejected": 470678528.0, + "logps/chosen": -359.7734375, + "logps/rejected": -243.73959350585938, + "loss": 0.0397, + "rewards/chosen": 3.533902168273926, + "rewards/margins": 10.473589897155762, + "rewards/rejected": -6.939687728881836, + "step": 8141 + }, + { + "epoch": 0.7439013248058474, + "grad_norm": 4.25, + "kl": 0.0, + "learning_rate": 1.5391976527464886e-06, + "logits/chosen": 557908309.3333334, + "logits/rejected": 167209072.0, + "logps/chosen": -361.6788736979167, + "logps/rejected": -157.03184509277344, + "loss": 0.0218, + "rewards/chosen": 4.340231895446777, + "rewards/margins": 11.18088436126709, + "rewards/rejected": -6.8406524658203125, + "step": 8142 + }, + { + "epoch": 0.7439926907263591, + "grad_norm": 13.6875, + "kl": 0.0, + "learning_rate": 1.53816007295513e-06, + "logits/chosen": 711621546.6666666, + "logits/rejected": 538643763.2, + "logps/chosen": -148.14515177408853, + "logps/rejected": -624.616162109375, + "loss": 0.0269, + "rewards/chosen": 2.9226853052775064, + "rewards/margins": 12.841442171732584, + "rewards/rejected": -9.918756866455078, + "step": 8143 + }, + { + "epoch": 0.7440840566468707, + "grad_norm": 1.796875, + "kl": 0.0, + "learning_rate": 1.5371227794258581e-06, + "logits/chosen": 671961429.3333334, + "logits/rejected": 414974233.6, + "logps/chosen": -413.0872395833333, + "logps/rejected": -473.975, + "loss": 0.0155, + "rewards/chosen": 3.3662439982096353, + "rewards/margins": 13.674544779459636, + "rewards/rejected": -10.30830078125, + "step": 8144 + }, + { + "epoch": 0.7441754225673823, + "grad_norm": 19.0, + "kl": 0.0, + "learning_rate": 1.5360857722444483e-06, + "logits/chosen": 955553280.0, + "logits/rejected": 864521856.0, + "logps/chosen": -305.8813069661458, + "logps/rejected": -616.066650390625, + "loss": 0.0756, + "rewards/chosen": 2.8960288365681968, + "rewards/margins": 16.551084836324055, + "rewards/rejected": -13.65505599975586, + "step": 8145 + }, + { + "epoch": 0.7442667884878941, + "grad_norm": 1.5703125, + "kl": 0.0, + "learning_rate": 1.5350490514966509e-06, + "logits/chosen": 480945237.3333333, + "logits/rejected": 1100990873.6, + "logps/chosen": -447.5546875, + "logps/rejected": -800.509765625, + "loss": 0.0083, + "rewards/chosen": 3.855905532836914, + "rewards/margins": 14.094446182250977, + "rewards/rejected": -10.238540649414062, + "step": 8146 + }, + { + "epoch": 0.7443581544084057, + "grad_norm": 0.93359375, + "kl": 0.0, + "learning_rate": 1.5340126172681924e-06, + "logits/chosen": 417862592.0, + "logits/rejected": 391840448.0, + "logps/chosen": -316.47369384765625, + "logps/rejected": -374.06103515625, + "loss": 0.0051, + "rewards/chosen": 4.998806953430176, + "rewards/margins": 14.021125793457031, + "rewards/rejected": -9.022318840026855, + "step": 8147 + }, + { + "epoch": 0.7444495203289173, + "grad_norm": 3.640625, + "kl": 0.0, + "learning_rate": 1.5329764696447802e-06, + "logits/chosen": 460927392.0, + "logits/rejected": 556901120.0, + "logps/chosen": -322.95050048828125, + "logps/rejected": -599.121826171875, + "loss": 0.0253, + "rewards/chosen": 3.035480260848999, + "rewards/margins": 11.25275206565857, + "rewards/rejected": -8.21727180480957, + "step": 8148 + }, + { + "epoch": 0.7445408862494289, + "grad_norm": 2.609375, + "kl": 0.0, + "learning_rate": 1.5319406087120896e-06, + "logits/chosen": 772713318.4, + "logits/rejected": 580428501.3333334, + "logps/chosen": -434.38046875, + "logps/rejected": -299.4913330078125, + "loss": 0.0197, + "rewards/chosen": 3.897753143310547, + "rewards/margins": 11.842512512207032, + "rewards/rejected": -7.944759368896484, + "step": 8149 + }, + { + "epoch": 0.7446322521699407, + "grad_norm": 1.734375, + "kl": 0.0, + "learning_rate": 1.5309050345557774e-06, + "logits/chosen": 561590476.8, + "logits/rejected": 245814186.66666666, + "logps/chosen": -289.277001953125, + "logps/rejected": -379.8640950520833, + "loss": 0.0129, + "rewards/chosen": 4.411300277709961, + "rewards/margins": 14.235502497355142, + "rewards/rejected": -9.824202219645182, + "step": 8150 + }, + { + "epoch": 0.7447236180904523, + "grad_norm": 3.71875, + "kl": 0.0, + "learning_rate": 1.5298697472614782e-06, + "logits/chosen": 801347498.6666666, + "logits/rejected": 420157337.6, + "logps/chosen": -509.2006022135417, + "logps/rejected": -494.180859375, + "loss": 0.0193, + "rewards/chosen": 2.9953330357869468, + "rewards/margins": 11.271702512105307, + "rewards/rejected": -8.27636947631836, + "step": 8151 + }, + { + "epoch": 0.7448149840109639, + "grad_norm": 2.21875, + "kl": 0.0, + "learning_rate": 1.528834746914799e-06, + "logits/chosen": 542148608.0, + "logits/rejected": 352680981.3333333, + "logps/chosen": -349.09326171875, + "logps/rejected": -599.8644612630209, + "loss": 0.0163, + "rewards/chosen": 3.866315460205078, + "rewards/margins": 14.862345631917318, + "rewards/rejected": -10.99603017171224, + "step": 8152 + }, + { + "epoch": 0.7449063499314755, + "grad_norm": 1.5703125, + "kl": 0.0, + "learning_rate": 1.527800033601326e-06, + "logits/chosen": 210133845.33333334, + "logits/rejected": 512210022.4, + "logps/chosen": -89.95994059244792, + "logps/rejected": -421.77001953125, + "loss": 0.0098, + "rewards/chosen": 3.92255433400472, + "rewards/margins": 13.552736218770345, + "rewards/rejected": -9.630181884765625, + "step": 8153 + }, + { + "epoch": 0.7449977158519873, + "grad_norm": 2.046875, + "kl": 0.0, + "learning_rate": 1.5267656074066173e-06, + "logits/chosen": 604746393.6, + "logits/rejected": 622116181.3333334, + "logps/chosen": -326.7412109375, + "logps/rejected": -674.1177571614584, + "loss": 0.011, + "rewards/chosen": 4.532448959350586, + "rewards/margins": 14.173344802856445, + "rewards/rejected": -9.64089584350586, + "step": 8154 + }, + { + "epoch": 0.7450890817724989, + "grad_norm": 5.53125, + "kl": 0.0, + "learning_rate": 1.5257314684162145e-06, + "logits/chosen": 365796320.0, + "logits/rejected": 437609472.0, + "logps/chosen": -302.24969482421875, + "logps/rejected": -473.8177490234375, + "loss": 0.025, + "rewards/chosen": 3.680023670196533, + "rewards/margins": 12.659395694732666, + "rewards/rejected": -8.979372024536133, + "step": 8155 + }, + { + "epoch": 0.7451804476930105, + "grad_norm": 2.03125, + "kl": 0.0, + "learning_rate": 1.5246976167156296e-06, + "logits/chosen": 850656384.0, + "logits/rejected": 451902080.0, + "logps/chosen": -658.8857421875, + "logps/rejected": -455.14227294921875, + "loss": 0.0118, + "rewards/chosen": 4.157769203186035, + "rewards/margins": 14.398048400878906, + "rewards/rejected": -10.240279197692871, + "step": 8156 + }, + { + "epoch": 0.7452718136135221, + "grad_norm": 0.16796875, + "kl": 0.0, + "learning_rate": 1.5236640523903517e-06, + "logits/chosen": 700146880.0, + "logits/rejected": 379975277.71428573, + "logps/chosen": -403.4277038574219, + "logps/rejected": -356.58314732142856, + "loss": 0.0008, + "rewards/chosen": 5.710986614227295, + "rewards/margins": 13.561320781707764, + "rewards/rejected": -7.850334167480469, + "step": 8157 + }, + { + "epoch": 0.7453631795340339, + "grad_norm": 3.5625, + "kl": 0.0, + "learning_rate": 1.5226307755258484e-06, + "logits/chosen": 642127872.0, + "logits/rejected": 346506922.6666667, + "logps/chosen": -504.59033203125, + "logps/rejected": -289.90325927734375, + "loss": 0.0292, + "rewards/chosen": 3.0777379989624025, + "rewards/margins": 11.890059979756675, + "rewards/rejected": -8.812321980794271, + "step": 8158 + }, + { + "epoch": 0.7454545454545455, + "grad_norm": 49.75, + "kl": 0.0, + "learning_rate": 1.52159778620756e-06, + "logits/chosen": 689314432.0, + "logits/rejected": 846461696.0, + "logps/chosen": -318.38360595703125, + "logps/rejected": -255.36123657226562, + "loss": 0.0851, + "rewards/chosen": 4.045940399169922, + "rewards/margins": 10.604157447814941, + "rewards/rejected": -6.5582170486450195, + "step": 8159 + }, + { + "epoch": 0.7455459113750571, + "grad_norm": 2.15625, + "kl": 0.0, + "learning_rate": 1.5205650845209076e-06, + "logits/chosen": 614218496.0, + "logits/rejected": 1195512576.0, + "logps/chosen": -324.8045349121094, + "logps/rejected": -574.6989135742188, + "loss": 0.014, + "rewards/chosen": 4.2846198081970215, + "rewards/margins": 13.836033344268799, + "rewards/rejected": -9.551413536071777, + "step": 8160 + }, + { + "epoch": 0.7456372772955687, + "grad_norm": 6.3125, + "kl": 0.0, + "learning_rate": 1.5195326705512852e-06, + "logits/chosen": 633759488.0, + "logits/rejected": 1016159232.0, + "logps/chosen": -418.35595703125, + "logps/rejected": -548.4303588867188, + "loss": 0.0194, + "rewards/chosen": 3.70257830619812, + "rewards/margins": 12.405075788497925, + "rewards/rejected": -8.702497482299805, + "step": 8161 + }, + { + "epoch": 0.7457286432160805, + "grad_norm": 2.3125, + "kl": 0.0, + "learning_rate": 1.5185005443840638e-06, + "logits/chosen": 713725696.0, + "logits/rejected": 458910016.0, + "logps/chosen": -268.3898620605469, + "logps/rejected": -379.8675842285156, + "loss": 0.0154, + "rewards/chosen": 3.7004024982452393, + "rewards/margins": 12.370242357254028, + "rewards/rejected": -8.669839859008789, + "step": 8162 + }, + { + "epoch": 0.7458200091365921, + "grad_norm": 7.71875, + "kl": 0.0, + "learning_rate": 1.517468706104589e-06, + "logits/chosen": 440968704.0, + "logits/rejected": 580014933.3333334, + "logps/chosen": -248.527978515625, + "logps/rejected": -998.0732421875, + "loss": 0.0215, + "rewards/chosen": 3.7968589782714846, + "rewards/margins": 16.686563364664714, + "rewards/rejected": -12.889704386393229, + "step": 8163 + }, + { + "epoch": 0.7459113750571037, + "grad_norm": 2.65625, + "kl": 0.0, + "learning_rate": 1.516437155798187e-06, + "logits/chosen": 590417408.0, + "logits/rejected": 566688213.3333334, + "logps/chosen": -342.776904296875, + "logps/rejected": -570.4114583333334, + "loss": 0.0166, + "rewards/chosen": 3.906189727783203, + "rewards/margins": 14.243185170491536, + "rewards/rejected": -10.336995442708334, + "step": 8164 + }, + { + "epoch": 0.7460027409776153, + "grad_norm": 2.390625, + "kl": 0.0, + "learning_rate": 1.515405893550157e-06, + "logits/chosen": 506032800.0, + "logits/rejected": 361756736.0, + "logps/chosen": -216.58407592773438, + "logps/rejected": -453.41943359375, + "loss": 0.0131, + "rewards/chosen": 4.58466911315918, + "rewards/margins": 14.03900146484375, + "rewards/rejected": -9.45433235168457, + "step": 8165 + }, + { + "epoch": 0.746094106898127, + "grad_norm": 3.765625, + "kl": 1.6958198547363281, + "learning_rate": 1.5143749194457736e-06, + "logits/chosen": 548814921.1428572, + "logits/rejected": 507952704.0, + "logps/chosen": -410.76981026785717, + "logps/rejected": -790.8045654296875, + "loss": 0.025, + "rewards/chosen": 4.155872344970703, + "rewards/margins": 13.870814323425293, + "rewards/rejected": -9.71494197845459, + "step": 8166 + }, + { + "epoch": 0.7461854728186387, + "grad_norm": 0.95703125, + "kl": 0.0, + "learning_rate": 1.5133442335702897e-06, + "logits/chosen": 511720192.0, + "logits/rejected": 442937312.0, + "logps/chosen": -345.2781677246094, + "logps/rejected": -425.0416259765625, + "loss": 0.0058, + "rewards/chosen": 4.652307510375977, + "rewards/margins": 13.978065490722656, + "rewards/rejected": -9.32575798034668, + "step": 8167 + }, + { + "epoch": 0.7462768387391503, + "grad_norm": 0.87890625, + "kl": 0.0, + "learning_rate": 1.5123138360089312e-06, + "logits/chosen": 548011776.0, + "logits/rejected": 732887808.0, + "logps/chosen": -234.9403533935547, + "logps/rejected": -439.3012390136719, + "loss": 0.0038, + "rewards/chosen": 5.304932594299316, + "rewards/margins": 14.04794979095459, + "rewards/rejected": -8.743017196655273, + "step": 8168 + }, + { + "epoch": 0.7463682046596619, + "grad_norm": 2.640625, + "kl": 0.0, + "learning_rate": 1.511283726846906e-06, + "logits/chosen": 690167552.0, + "logits/rejected": 306448554.6666667, + "logps/chosen": -329.9681640625, + "logps/rejected": -398.41650390625, + "loss": 0.0193, + "rewards/chosen": 3.9577568054199217, + "rewards/margins": 14.334585316975911, + "rewards/rejected": -10.37682851155599, + "step": 8169 + }, + { + "epoch": 0.7464595705801736, + "grad_norm": 1.5625, + "kl": 0.0, + "learning_rate": 1.5102539061693927e-06, + "logits/chosen": 798601280.0, + "logits/rejected": 610394112.0, + "logps/chosen": -404.148193359375, + "logps/rejected": -780.7619018554688, + "loss": 0.0097, + "rewards/chosen": 4.542240142822266, + "rewards/margins": 15.073978424072266, + "rewards/rejected": -10.53173828125, + "step": 8170 + }, + { + "epoch": 0.7465509365006853, + "grad_norm": 2.203125, + "kl": 0.0, + "learning_rate": 1.5092243740615486e-06, + "logits/chosen": 441754880.0, + "logits/rejected": 694457344.0, + "logps/chosen": -198.50018310546875, + "logps/rejected": -614.4845377604166, + "loss": 0.015, + "rewards/chosen": 4.320059967041016, + "rewards/margins": 14.366348139444987, + "rewards/rejected": -10.04628817240397, + "step": 8171 + }, + { + "epoch": 0.7466423024211969, + "grad_norm": 9.5, + "kl": 0.0, + "learning_rate": 1.5081951306085042e-06, + "logits/chosen": 407213994.6666667, + "logits/rejected": 512691763.2, + "logps/chosen": -302.87636311848956, + "logps/rejected": -334.9067626953125, + "loss": 0.0616, + "rewards/chosen": 3.866168975830078, + "rewards/margins": 11.13720703125, + "rewards/rejected": -7.271038055419922, + "step": 8172 + }, + { + "epoch": 0.7467336683417085, + "grad_norm": 3.140625, + "kl": 0.0, + "learning_rate": 1.5071661758953716e-06, + "logits/chosen": 292637141.3333333, + "logits/rejected": 314117427.2, + "logps/chosen": -183.46464029947916, + "logps/rejected": -574.3984375, + "loss": 0.0153, + "rewards/chosen": 3.9172115325927734, + "rewards/margins": 14.21784782409668, + "rewards/rejected": -10.300636291503906, + "step": 8173 + }, + { + "epoch": 0.7468250342622202, + "grad_norm": 1.984375, + "kl": 0.0, + "learning_rate": 1.5061375100072345e-06, + "logits/chosen": 454421184.0, + "logits/rejected": 455822368.0, + "logps/chosen": -322.97943115234375, + "logps/rejected": -448.3832092285156, + "loss": 0.0106, + "rewards/chosen": 4.423547744750977, + "rewards/margins": 12.465206146240234, + "rewards/rejected": -8.041658401489258, + "step": 8174 + }, + { + "epoch": 0.7469164001827319, + "grad_norm": 2.421875, + "kl": 0.0, + "learning_rate": 1.5051091330291557e-06, + "logits/chosen": 749592371.2, + "logits/rejected": 521596245.3333333, + "logps/chosen": -313.104296875, + "logps/rejected": -566.9919840494791, + "loss": 0.0189, + "rewards/chosen": 3.6209136962890627, + "rewards/margins": 13.809275817871093, + "rewards/rejected": -10.188362121582031, + "step": 8175 + }, + { + "epoch": 0.7470077661032435, + "grad_norm": 3.6875, + "kl": 0.0, + "learning_rate": 1.5040810450461674e-06, + "logits/chosen": 715470272.0, + "logits/rejected": 876381866.6666666, + "logps/chosen": -353.986083984375, + "logps/rejected": -403.4659423828125, + "loss": 0.0149, + "rewards/chosen": 2.8361663818359375, + "rewards/margins": 11.49697494506836, + "rewards/rejected": -8.660808563232422, + "step": 8176 + }, + { + "epoch": 0.7470991320237551, + "grad_norm": 1.5546875, + "kl": 0.0, + "learning_rate": 1.5030532461432884e-06, + "logits/chosen": 226829013.33333334, + "logits/rejected": 491111526.4, + "logps/chosen": -327.8865559895833, + "logps/rejected": -567.0119140625, + "loss": 0.0084, + "rewards/chosen": 4.350858370463054, + "rewards/margins": 14.794623629252115, + "rewards/rejected": -10.443765258789062, + "step": 8177 + }, + { + "epoch": 0.7471904979442668, + "grad_norm": 8.875, + "kl": 0.0, + "learning_rate": 1.5020257364055051e-06, + "logits/chosen": 472860211.2, + "logits/rejected": 337285290.6666667, + "logps/chosen": -399.67958984375, + "logps/rejected": -398.9224853515625, + "loss": 0.0418, + "rewards/chosen": 3.195081329345703, + "rewards/margins": 10.760111745198568, + "rewards/rejected": -7.565030415852864, + "step": 8178 + }, + { + "epoch": 0.7472818638647785, + "grad_norm": 3.984375, + "kl": 0.0, + "learning_rate": 1.5009985159177847e-06, + "logits/chosen": 451330272.0, + "logits/rejected": 607557504.0, + "logps/chosen": -268.3140869140625, + "logps/rejected": -416.11126708984375, + "loss": 0.1252, + "rewards/chosen": 3.8595829010009766, + "rewards/margins": 9.392233848571777, + "rewards/rejected": -5.532650947570801, + "step": 8179 + }, + { + "epoch": 0.7473732297852901, + "grad_norm": 4.375, + "kl": 0.0, + "learning_rate": 1.4999715847650665e-06, + "logits/chosen": 536215466.6666667, + "logits/rejected": 400288928.0, + "logps/chosen": -308.7395426432292, + "logps/rejected": -468.5081787109375, + "loss": 0.0279, + "rewards/chosen": 3.8325042724609375, + "rewards/margins": 12.204360008239746, + "rewards/rejected": -8.371855735778809, + "step": 8180 + }, + { + "epoch": 0.7474645957058017, + "grad_norm": 4.03125, + "kl": 0.0, + "learning_rate": 1.4989449430322705e-06, + "logits/chosen": 990448640.0, + "logits/rejected": 706421120.0, + "logps/chosen": -363.824072265625, + "logps/rejected": -603.1627604166666, + "loss": 0.027, + "rewards/chosen": 3.5477676391601562, + "rewards/margins": 10.869155883789062, + "rewards/rejected": -7.321388244628906, + "step": 8181 + }, + { + "epoch": 0.7475559616263134, + "grad_norm": 2.59375, + "kl": 0.0, + "learning_rate": 1.4979185908042904e-06, + "logits/chosen": 519351125.3333333, + "logits/rejected": 931609344.0, + "logps/chosen": -402.2631022135417, + "logps/rejected": -786.2307739257812, + "loss": 0.0162, + "rewards/chosen": 4.7665761311848955, + "rewards/margins": 14.125607808430988, + "rewards/rejected": -9.359031677246094, + "step": 8182 + }, + { + "epoch": 0.7476473275468251, + "grad_norm": 2.328125, + "kl": 0.0, + "learning_rate": 1.4968925281659952e-06, + "logits/chosen": 431569109.3333333, + "logits/rejected": 451735654.4, + "logps/chosen": -255.3309326171875, + "logps/rejected": -496.65087890625, + "loss": 0.0065, + "rewards/chosen": 4.754243850708008, + "rewards/margins": 15.239325332641602, + "rewards/rejected": -10.485081481933594, + "step": 8183 + }, + { + "epoch": 0.7477386934673367, + "grad_norm": 2.390625, + "kl": 0.0, + "learning_rate": 1.4958667552022293e-06, + "logits/chosen": 432980582.4, + "logits/rejected": 675995562.6666666, + "logps/chosen": -284.326025390625, + "logps/rejected": -579.7950032552084, + "loss": 0.0195, + "rewards/chosen": 4.098702239990234, + "rewards/margins": 14.133776473999024, + "rewards/rejected": -10.035074234008789, + "step": 8184 + }, + { + "epoch": 0.7478300593878483, + "grad_norm": 3.890625, + "kl": 0.0, + "learning_rate": 1.494841271997818e-06, + "logits/chosen": 423115852.8, + "logits/rejected": 743279274.6666666, + "logps/chosen": -248.07587890625, + "logps/rejected": -411.8709309895833, + "loss": 0.0135, + "rewards/chosen": 4.390684127807617, + "rewards/margins": 12.015832901000977, + "rewards/rejected": -7.625148773193359, + "step": 8185 + }, + { + "epoch": 0.74792142530836, + "grad_norm": 1.890625, + "kl": 0.0, + "learning_rate": 1.4938160786375571e-06, + "logits/chosen": 696271040.0, + "logits/rejected": 713880064.0, + "logps/chosen": -405.85943603515625, + "logps/rejected": -498.56903076171875, + "loss": 0.0113, + "rewards/chosen": 3.933650016784668, + "rewards/margins": 13.015445709228516, + "rewards/rejected": -9.081795692443848, + "step": 8186 + }, + { + "epoch": 0.7480127912288717, + "grad_norm": 0.71484375, + "kl": 0.0, + "learning_rate": 1.4927911752062214e-06, + "logits/chosen": 706811264.0, + "logits/rejected": 645478570.6666666, + "logps/chosen": -141.21563720703125, + "logps/rejected": -591.2047119140625, + "loss": 0.0043, + "rewards/chosen": 4.085414886474609, + "rewards/margins": 13.413703282674154, + "rewards/rejected": -9.328288396199545, + "step": 8187 + }, + { + "epoch": 0.7481041571493833, + "grad_norm": 3.78125, + "kl": 0.0, + "learning_rate": 1.4917665617885601e-06, + "logits/chosen": 779131392.0, + "logits/rejected": 428907712.0, + "logps/chosen": -279.8667297363281, + "logps/rejected": -464.57598876953125, + "loss": 0.0243, + "rewards/chosen": 3.1249542236328125, + "rewards/margins": 12.340827941894531, + "rewards/rejected": -9.215873718261719, + "step": 8188 + }, + { + "epoch": 0.7481955230698949, + "grad_norm": 4.125, + "kl": 0.0, + "learning_rate": 1.4907422384692987e-06, + "logits/chosen": 523673907.2, + "logits/rejected": 723227136.0, + "logps/chosen": -298.82626953125, + "logps/rejected": -679.9265950520834, + "loss": 0.0268, + "rewards/chosen": 3.3638763427734375, + "rewards/margins": 13.969347635904947, + "rewards/rejected": -10.60547129313151, + "step": 8189 + }, + { + "epoch": 0.7482868889904066, + "grad_norm": 56.25, + "kl": 0.0, + "learning_rate": 1.489718205333141e-06, + "logits/chosen": 924713856.0, + "logits/rejected": 674129792.0, + "logps/chosen": -401.653564453125, + "logps/rejected": -404.5326334635417, + "loss": 0.0802, + "rewards/chosen": 4.106176853179932, + "rewards/margins": 12.209366957346598, + "rewards/rejected": -8.103190104166666, + "step": 8190 + }, + { + "epoch": 0.7483782549109183, + "grad_norm": 2.421875, + "kl": 0.0, + "learning_rate": 1.4886944624647647e-06, + "logits/chosen": 513169600.0, + "logits/rejected": 495518080.0, + "logps/chosen": -311.1691589355469, + "logps/rejected": -326.80523681640625, + "loss": 0.014, + "rewards/chosen": 3.8798155784606934, + "rewards/margins": 13.147365093231201, + "rewards/rejected": -9.267549514770508, + "step": 8191 + }, + { + "epoch": 0.7484696208314299, + "grad_norm": 3.828125, + "kl": 0.0, + "learning_rate": 1.4876710099488234e-06, + "logits/chosen": 380279381.3333333, + "logits/rejected": 739825920.0, + "logps/chosen": -328.01609293619794, + "logps/rejected": -699.02890625, + "loss": 0.0147, + "rewards/chosen": 3.4412781397501626, + "rewards/margins": 11.125824038187663, + "rewards/rejected": -7.6845458984375, + "step": 8192 + }, + { + "epoch": 0.7485609867519415, + "grad_norm": 21.5, + "kl": 0.0, + "learning_rate": 1.4866478478699453e-06, + "logits/chosen": 1060835942.4, + "logits/rejected": 1056167850.6666666, + "logps/chosen": -222.901611328125, + "logps/rejected": -437.4742024739583, + "loss": 0.1296, + "rewards/chosen": 2.684031295776367, + "rewards/margins": 10.59204241434733, + "rewards/rejected": -7.908011118570964, + "step": 8193 + }, + { + "epoch": 0.7486523526724532, + "grad_norm": 3.578125, + "kl": 0.0, + "learning_rate": 1.48562497631274e-06, + "logits/chosen": 822231405.7142857, + "logits/rejected": 104118624.0, + "logps/chosen": -304.70919363839283, + "logps/rejected": -733.6593017578125, + "loss": 0.0289, + "rewards/chosen": 3.8197122301374162, + "rewards/margins": 13.861106463841029, + "rewards/rejected": -10.041394233703613, + "step": 8194 + }, + { + "epoch": 0.7487437185929648, + "grad_norm": 2.765625, + "kl": 0.0, + "learning_rate": 1.4846023953617877e-06, + "logits/chosen": 512657376.0, + "logits/rejected": 415761280.0, + "logps/chosen": -322.79827880859375, + "logps/rejected": -476.96905517578125, + "loss": 0.0232, + "rewards/chosen": 3.1969947814941406, + "rewards/margins": 13.783288955688477, + "rewards/rejected": -10.586294174194336, + "step": 8195 + }, + { + "epoch": 0.7488350845134765, + "grad_norm": 4.1875, + "kl": 0.0, + "learning_rate": 1.4835801051016463e-06, + "logits/chosen": 441470272.0, + "logits/rejected": 473515221.3333333, + "logps/chosen": -322.9666748046875, + "logps/rejected": -479.1668294270833, + "loss": 0.0185, + "rewards/chosen": 2.5579299926757812, + "rewards/margins": 11.620105743408203, + "rewards/rejected": -9.062175750732422, + "step": 8196 + }, + { + "epoch": 0.7489264504339881, + "grad_norm": 5.125, + "kl": 0.0, + "learning_rate": 1.4825581056168498e-06, + "logits/chosen": 534715712.0, + "logits/rejected": 574855424.0, + "logps/chosen": -388.3976745605469, + "logps/rejected": -530.1044311523438, + "loss": 0.0238, + "rewards/chosen": 4.169028282165527, + "rewards/margins": 12.295546531677246, + "rewards/rejected": -8.126518249511719, + "step": 8197 + }, + { + "epoch": 0.7490178163544998, + "grad_norm": 2.359375, + "kl": 0.0, + "learning_rate": 1.4815363969919067e-06, + "logits/chosen": 501845196.8, + "logits/rejected": 758140501.3333334, + "logps/chosen": -592.549560546875, + "logps/rejected": -562.89794921875, + "loss": 0.0165, + "rewards/chosen": 3.7751625061035154, + "rewards/margins": 14.698245493570962, + "rewards/rejected": -10.923082987467447, + "step": 8198 + }, + { + "epoch": 0.7491091822750114, + "grad_norm": 2.25, + "kl": 0.0, + "learning_rate": 1.480514979311306e-06, + "logits/chosen": 542016512.0, + "logits/rejected": 554931148.8, + "logps/chosen": -351.1066487630208, + "logps/rejected": -410.154345703125, + "loss": 0.0094, + "rewards/chosen": 4.416866620381673, + "rewards/margins": 13.069655927022296, + "rewards/rejected": -8.652789306640624, + "step": 8199 + }, + { + "epoch": 0.7492005481955231, + "grad_norm": 0.75, + "kl": 0.0, + "learning_rate": 1.4794938526595076e-06, + "logits/chosen": 361764992.0, + "logits/rejected": 591399552.0, + "logps/chosen": -207.41238403320312, + "logps/rejected": -530.9709879557291, + "loss": 0.0047, + "rewards/chosen": 4.367178440093994, + "rewards/margins": 13.399398644765219, + "rewards/rejected": -9.032220204671225, + "step": 8200 + }, + { + "epoch": 0.7492919141160347, + "grad_norm": 0.88671875, + "kl": 0.0, + "learning_rate": 1.4784730171209493e-06, + "logits/chosen": 869568085.3333334, + "logits/rejected": 824171827.2, + "logps/chosen": -448.6164143880208, + "logps/rejected": -585.54384765625, + "loss": 0.0051, + "rewards/chosen": 4.341037114461263, + "rewards/margins": 18.280136235555013, + "rewards/rejected": -13.93909912109375, + "step": 8201 + }, + { + "epoch": 0.7493832800365464, + "grad_norm": 1.0234375, + "kl": 0.0, + "learning_rate": 1.477452472780045e-06, + "logits/chosen": 541520896.0, + "logits/rejected": 202515360.0, + "logps/chosen": -415.30218505859375, + "logps/rejected": -282.68463134765625, + "loss": 0.0057, + "rewards/chosen": 4.571148872375488, + "rewards/margins": 14.71767807006836, + "rewards/rejected": -10.146529197692871, + "step": 8202 + }, + { + "epoch": 0.749474645957058, + "grad_norm": 5.78125, + "kl": 0.0, + "learning_rate": 1.4764322197211838e-06, + "logits/chosen": 1328037546.6666667, + "logits/rejected": 884739200.0, + "logps/chosen": -544.631591796875, + "logps/rejected": -975.1503295898438, + "loss": 0.024, + "rewards/chosen": 4.126619338989258, + "rewards/margins": 13.911169052124023, + "rewards/rejected": -9.784549713134766, + "step": 8203 + }, + { + "epoch": 0.7495660118775697, + "grad_norm": 1.6171875, + "kl": 0.0, + "learning_rate": 1.4754122580287317e-06, + "logits/chosen": 677953152.0, + "logits/rejected": 442299392.0, + "logps/chosen": -490.968994140625, + "logps/rejected": -505.5857340494792, + "loss": 0.0075, + "rewards/chosen": 3.5926408767700195, + "rewards/margins": 12.482179959615072, + "rewards/rejected": -8.889539082845053, + "step": 8204 + }, + { + "epoch": 0.7496573777980813, + "grad_norm": 2.5, + "kl": 0.0, + "learning_rate": 1.4743925877870286e-06, + "logits/chosen": 343891430.4, + "logits/rejected": 663195946.6666666, + "logps/chosen": -226.176025390625, + "logps/rejected": -331.56138102213544, + "loss": 0.016, + "rewards/chosen": 4.373315048217774, + "rewards/margins": 12.95803311665853, + "rewards/rejected": -8.584718068440756, + "step": 8205 + }, + { + "epoch": 0.749748743718593, + "grad_norm": 0.373046875, + "kl": 0.0, + "learning_rate": 1.4733732090803937e-06, + "logits/chosen": 358872128.0, + "logits/rejected": 472208640.0, + "logps/chosen": -311.364990234375, + "logps/rejected": -462.1621907552083, + "loss": 0.0016, + "rewards/chosen": 5.265364170074463, + "rewards/margins": 14.945660750071207, + "rewards/rejected": -9.680296579996744, + "step": 8206 + }, + { + "epoch": 0.7498401096391046, + "grad_norm": 3.140625, + "kl": 0.0, + "learning_rate": 1.4723541219931203e-06, + "logits/chosen": 628871765.3333334, + "logits/rejected": 590143897.6, + "logps/chosen": -388.0642496744792, + "logps/rejected": -732.75390625, + "loss": 0.0133, + "rewards/chosen": 3.5578368504842124, + "rewards/margins": 14.712481625874839, + "rewards/rejected": -11.154644775390626, + "step": 8207 + }, + { + "epoch": 0.7499314755596163, + "grad_norm": 3.953125, + "kl": 0.0, + "learning_rate": 1.4713353266094764e-06, + "logits/chosen": 811795916.8, + "logits/rejected": 645898880.0, + "logps/chosen": -411.2228515625, + "logps/rejected": -339.8448079427083, + "loss": 0.0216, + "rewards/chosen": 3.9049148559570312, + "rewards/margins": 12.459693908691406, + "rewards/rejected": -8.554779052734375, + "step": 8208 + }, + { + "epoch": 0.7500228414801279, + "grad_norm": 1.0703125, + "kl": 0.0, + "learning_rate": 1.4703168230137072e-06, + "logits/chosen": 590873685.3333334, + "logits/rejected": 573574041.6, + "logps/chosen": -295.37754313151044, + "logps/rejected": -668.48798828125, + "loss": 0.0055, + "rewards/chosen": 4.375080108642578, + "rewards/margins": 14.934560394287109, + "rewards/rejected": -10.559480285644531, + "step": 8209 + }, + { + "epoch": 0.7501142074006396, + "grad_norm": 2.453125, + "kl": 0.0, + "learning_rate": 1.4692986112900315e-06, + "logits/chosen": 661925273.6, + "logits/rejected": 776068096.0, + "logps/chosen": -486.210205078125, + "logps/rejected": -899.0947265625, + "loss": 0.0305, + "rewards/chosen": 3.187617301940918, + "rewards/margins": 14.732386589050293, + "rewards/rejected": -11.544769287109375, + "step": 8210 + }, + { + "epoch": 0.7502055733211512, + "grad_norm": 2.609375, + "kl": 0.0, + "learning_rate": 1.46828069152265e-06, + "logits/chosen": 761104213.3333334, + "logits/rejected": 1045368832.0, + "logps/chosen": -482.0050048828125, + "logps/rejected": -597.10732421875, + "loss": 0.0115, + "rewards/chosen": 4.162938117980957, + "rewards/margins": 12.797242164611816, + "rewards/rejected": -8.63430404663086, + "step": 8211 + }, + { + "epoch": 0.7502969392416629, + "grad_norm": 2.625, + "kl": 0.0, + "learning_rate": 1.467263063795733e-06, + "logits/chosen": 547796377.6, + "logits/rejected": 253191530.66666666, + "logps/chosen": -359.296630859375, + "logps/rejected": -336.56494140625, + "loss": 0.0197, + "rewards/chosen": 3.652490997314453, + "rewards/margins": 11.630542500813801, + "rewards/rejected": -7.978051503499349, + "step": 8212 + }, + { + "epoch": 0.7503883051621745, + "grad_norm": 12.375, + "kl": 0.0, + "learning_rate": 1.466245728193429e-06, + "logits/chosen": 681988300.8, + "logits/rejected": 428584618.6666667, + "logps/chosen": -440.163232421875, + "logps/rejected": -533.810302734375, + "loss": 0.1445, + "rewards/chosen": 2.1748424530029298, + "rewards/margins": 14.420989608764648, + "rewards/rejected": -12.246147155761719, + "step": 8213 + }, + { + "epoch": 0.7504796710826862, + "grad_norm": 3.296875, + "kl": 0.0, + "learning_rate": 1.4652286847998609e-06, + "logits/chosen": 512132128.0, + "logits/rejected": 594261568.0, + "logps/chosen": -311.7168884277344, + "logps/rejected": -784.8052978515625, + "loss": 0.0227, + "rewards/chosen": 3.301307201385498, + "rewards/margins": 14.253085613250732, + "rewards/rejected": -10.951778411865234, + "step": 8214 + }, + { + "epoch": 0.7505710370031978, + "grad_norm": 2.171875, + "kl": 0.0, + "learning_rate": 1.4642119336991318e-06, + "logits/chosen": 467755904.0, + "logits/rejected": 608005717.3333334, + "logps/chosen": -342.79754638671875, + "logps/rejected": -800.1568196614584, + "loss": 0.0113, + "rewards/chosen": 3.0683107376098633, + "rewards/margins": 14.42699146270752, + "rewards/rejected": -11.358680725097656, + "step": 8215 + }, + { + "epoch": 0.7506624029237094, + "grad_norm": 2.453125, + "kl": 0.0, + "learning_rate": 1.4631954749753164e-06, + "logits/chosen": 571503052.8, + "logits/rejected": 946813098.6666666, + "logps/chosen": -383.45869140625, + "logps/rejected": -693.978759765625, + "loss": 0.0166, + "rewards/chosen": 3.8297073364257814, + "rewards/margins": 13.701529693603515, + "rewards/rejected": -9.871822357177734, + "step": 8216 + }, + { + "epoch": 0.7507537688442211, + "grad_norm": 2.6875, + "kl": 0.0, + "learning_rate": 1.4621793087124653e-06, + "logits/chosen": 553136554.6666666, + "logits/rejected": 535802560.0, + "logps/chosen": -255.98795572916666, + "logps/rejected": -838.711181640625, + "loss": 0.0195, + "rewards/chosen": 4.043912251790364, + "rewards/margins": 20.965033849080402, + "rewards/rejected": -16.92112159729004, + "step": 8217 + }, + { + "epoch": 0.7508451347647328, + "grad_norm": 5.3125, + "kl": 0.0, + "learning_rate": 1.4611634349946068e-06, + "logits/chosen": 548930560.0, + "logits/rejected": 669981440.0, + "logps/chosen": -366.7845764160156, + "logps/rejected": -608.2054443359375, + "loss": 0.0458, + "rewards/chosen": 2.6655569076538086, + "rewards/margins": 12.832874298095703, + "rewards/rejected": -10.167317390441895, + "step": 8218 + }, + { + "epoch": 0.7509365006852444, + "grad_norm": 1.25, + "kl": 0.0, + "learning_rate": 1.460147853905743e-06, + "logits/chosen": 422587434.6666667, + "logits/rejected": 661242880.0, + "logps/chosen": -316.68947347005206, + "logps/rejected": -441.947802734375, + "loss": 0.0073, + "rewards/chosen": 4.207626024881999, + "rewards/margins": 13.740684191385906, + "rewards/rejected": -9.533058166503906, + "step": 8219 + }, + { + "epoch": 0.751027866605756, + "grad_norm": 4.25, + "kl": 0.0, + "learning_rate": 1.4591325655298555e-06, + "logits/chosen": 436567296.0, + "logits/rejected": 516774985.14285713, + "logps/chosen": -265.14501953125, + "logps/rejected": -448.13364955357144, + "loss": 0.0199, + "rewards/chosen": 1.824005126953125, + "rewards/margins": 11.074883597237724, + "rewards/rejected": -9.250878470284599, + "step": 8220 + }, + { + "epoch": 0.7511192325262677, + "grad_norm": 1.5703125, + "kl": 0.0, + "learning_rate": 1.4581175699508982e-06, + "logits/chosen": 848209664.0, + "logits/rejected": 681841749.3333334, + "logps/chosen": -271.521630859375, + "logps/rejected": -590.9130859375, + "loss": 0.0098, + "rewards/chosen": 4.833231353759766, + "rewards/margins": 16.173450215657553, + "rewards/rejected": -11.340218861897787, + "step": 8221 + }, + { + "epoch": 0.7512105984467794, + "grad_norm": 1.3671875, + "kl": 0.0, + "learning_rate": 1.4571028672528016e-06, + "logits/chosen": 646459648.0, + "logits/rejected": 377101531.4285714, + "logps/chosen": -461.8904724121094, + "logps/rejected": -289.8490513392857, + "loss": 0.0056, + "rewards/chosen": 3.3096771240234375, + "rewards/margins": 10.698213849748884, + "rewards/rejected": -7.388536725725446, + "step": 8222 + }, + { + "epoch": 0.751301964367291, + "grad_norm": 2.46875, + "kl": 0.0, + "learning_rate": 1.4560884575194701e-06, + "logits/chosen": 285377612.8, + "logits/rejected": 389077034.6666667, + "logps/chosen": -275.988916015625, + "logps/rejected": -557.460693359375, + "loss": 0.01, + "rewards/chosen": 4.966542053222656, + "rewards/margins": 15.218132781982423, + "rewards/rejected": -10.251590728759766, + "step": 8223 + }, + { + "epoch": 0.7513933302878026, + "grad_norm": 3.796875, + "kl": 0.0, + "learning_rate": 1.45507434083479e-06, + "logits/chosen": 519850291.2, + "logits/rejected": 600548437.3333334, + "logps/chosen": -418.70009765625, + "logps/rejected": -537.4784342447916, + "loss": 0.0258, + "rewards/chosen": 3.5892635345458985, + "rewards/margins": 11.649458185831705, + "rewards/rejected": -8.060194651285807, + "step": 8224 + }, + { + "epoch": 0.7514846962083143, + "grad_norm": 0.8359375, + "kl": 0.0, + "learning_rate": 1.4540605172826178e-06, + "logits/chosen": 439845333.3333333, + "logits/rejected": 354941747.2, + "logps/chosen": -415.7811686197917, + "logps/rejected": -455.39697265625, + "loss": 0.0043, + "rewards/chosen": 4.589964548746745, + "rewards/margins": 14.54190394083659, + "rewards/rejected": -9.951939392089844, + "step": 8225 + }, + { + "epoch": 0.751576062128826, + "grad_norm": 4.25, + "kl": 0.0, + "learning_rate": 1.4530469869467862e-06, + "logits/chosen": 454259609.6, + "logits/rejected": 529861632.0, + "logps/chosen": -235.3364990234375, + "logps/rejected": -389.3368733723958, + "loss": 0.0318, + "rewards/chosen": 3.4083553314208985, + "rewards/margins": 11.333713150024414, + "rewards/rejected": -7.925357818603516, + "step": 8226 + }, + { + "epoch": 0.7516674280493376, + "grad_norm": 3.640625, + "kl": 0.0, + "learning_rate": 1.4520337499111048e-06, + "logits/chosen": 916766105.6, + "logits/rejected": 551561386.6666666, + "logps/chosen": -347.614501953125, + "logps/rejected": -391.899658203125, + "loss": 0.0277, + "rewards/chosen": 3.1654403686523436, + "rewards/margins": 10.106688308715821, + "rewards/rejected": -6.941247940063477, + "step": 8227 + }, + { + "epoch": 0.7517587939698492, + "grad_norm": 9.25, + "kl": 0.0, + "learning_rate": 1.4510208062593607e-06, + "logits/chosen": 650417971.2, + "logits/rejected": 742583552.0, + "logps/chosen": -349.8586669921875, + "logps/rejected": -253.11995442708334, + "loss": 0.0707, + "rewards/chosen": 4.247652816772461, + "rewards/margins": 9.293575731913249, + "rewards/rejected": -5.045922915140788, + "step": 8228 + }, + { + "epoch": 0.7518501598903609, + "grad_norm": 1.9453125, + "kl": 0.0, + "learning_rate": 1.4500081560753154e-06, + "logits/chosen": 543760691.2, + "logits/rejected": 416786858.6666667, + "logps/chosen": -370.285009765625, + "logps/rejected": -462.3452962239583, + "loss": 0.0319, + "rewards/chosen": 3.583299255371094, + "rewards/margins": 12.82761027018229, + "rewards/rejected": -9.244311014811197, + "step": 8229 + }, + { + "epoch": 0.7519415258108726, + "grad_norm": 22.125, + "kl": 0.0, + "learning_rate": 1.448995799442703e-06, + "logits/chosen": 512199884.8, + "logits/rejected": 741330090.6666666, + "logps/chosen": -211.6630859375, + "logps/rejected": -566.3144124348959, + "loss": 0.021, + "rewards/chosen": 4.325561141967773, + "rewards/margins": 10.852133051554361, + "rewards/rejected": -6.526571909586589, + "step": 8230 + }, + { + "epoch": 0.7520328917313842, + "grad_norm": 6.25, + "kl": 0.0, + "learning_rate": 1.4479837364452355e-06, + "logits/chosen": 497115721.14285713, + "logits/rejected": 397173312.0, + "logps/chosen": -443.89432198660717, + "logps/rejected": -546.606201171875, + "loss": 0.0236, + "rewards/chosen": 4.031315394810268, + "rewards/margins": 12.168113299778529, + "rewards/rejected": -8.136797904968262, + "step": 8231 + }, + { + "epoch": 0.7521242576518958, + "grad_norm": 0.53125, + "kl": 0.0, + "learning_rate": 1.4469719671666043e-06, + "logits/chosen": 926470528.0, + "logits/rejected": 408076873.14285713, + "logps/chosen": -95.69281005859375, + "logps/rejected": -447.6103515625, + "loss": 0.003, + "rewards/chosen": 3.806016683578491, + "rewards/margins": 12.924437420708793, + "rewards/rejected": -9.118420737130302, + "step": 8232 + }, + { + "epoch": 0.7522156235724075, + "grad_norm": 3.84375, + "kl": 0.0, + "learning_rate": 1.4459604916904718e-06, + "logits/chosen": 543148339.2, + "logits/rejected": 453320746.6666667, + "logps/chosen": -256.02138671875, + "logps/rejected": -367.9319254557292, + "loss": 0.0367, + "rewards/chosen": 2.9834644317626955, + "rewards/margins": 9.888527425130208, + "rewards/rejected": -6.905062993367513, + "step": 8233 + }, + { + "epoch": 0.7523069894929192, + "grad_norm": 1.7109375, + "kl": 0.0, + "learning_rate": 1.444949310100478e-06, + "logits/chosen": 327066752.0, + "logits/rejected": 559392384.0, + "logps/chosen": -212.13173828125, + "logps/rejected": -641.9088541666666, + "loss": 0.0167, + "rewards/chosen": 4.301860809326172, + "rewards/margins": 16.644625345865883, + "rewards/rejected": -12.342764536539713, + "step": 8234 + }, + { + "epoch": 0.7523983554134308, + "grad_norm": 3.890625, + "kl": 0.0, + "learning_rate": 1.443938422480236e-06, + "logits/chosen": 636707157.3333334, + "logits/rejected": 672479795.2, + "logps/chosen": -405.1847330729167, + "logps/rejected": -488.734521484375, + "loss": 0.011, + "rewards/chosen": 4.487057367960612, + "rewards/margins": 11.592956415812175, + "rewards/rejected": -7.105899047851563, + "step": 8235 + }, + { + "epoch": 0.7524897213339424, + "grad_norm": 32.0, + "kl": 0.0, + "learning_rate": 1.4429278289133407e-06, + "logits/chosen": 449847381.3333333, + "logits/rejected": 553731072.0, + "logps/chosen": -317.144775390625, + "logps/rejected": -507.7734375, + "loss": 0.0232, + "rewards/chosen": 3.9078006744384766, + "rewards/margins": 12.747751998901368, + "rewards/rejected": -8.839951324462891, + "step": 8236 + }, + { + "epoch": 0.752581087254454, + "grad_norm": 4.28125, + "kl": 0.0, + "learning_rate": 1.441917529483357e-06, + "logits/chosen": 607937177.6, + "logits/rejected": 349851605.3333333, + "logps/chosen": -284.47724609375, + "logps/rejected": -412.8280436197917, + "loss": 0.0299, + "rewards/chosen": 3.742999267578125, + "rewards/margins": 11.4652587890625, + "rewards/rejected": -7.722259521484375, + "step": 8237 + }, + { + "epoch": 0.7526724531749658, + "grad_norm": 20.375, + "kl": 0.0, + "learning_rate": 1.440907524273827e-06, + "logits/chosen": 437551104.0, + "logits/rejected": 611471786.6666666, + "logps/chosen": -263.5024658203125, + "logps/rejected": -432.3628336588542, + "loss": 0.0402, + "rewards/chosen": 3.14810733795166, + "rewards/margins": 12.392995007832845, + "rewards/rejected": -9.244887669881185, + "step": 8238 + }, + { + "epoch": 0.7527638190954774, + "grad_norm": 3.859375, + "kl": 0.0, + "learning_rate": 1.4398978133682696e-06, + "logits/chosen": 605458432.0, + "logits/rejected": 363193280.0, + "logps/chosen": -375.0706481933594, + "logps/rejected": -591.2806396484375, + "loss": 0.0256, + "rewards/chosen": 3.2643113136291504, + "rewards/margins": 14.463195323944092, + "rewards/rejected": -11.198884010314941, + "step": 8239 + }, + { + "epoch": 0.752855185015989, + "grad_norm": 2.921875, + "kl": 0.0, + "learning_rate": 1.4388883968501754e-06, + "logits/chosen": 728164010.6666666, + "logits/rejected": 345288576.0, + "logps/chosen": -307.35455322265625, + "logps/rejected": -525.3026733398438, + "loss": 0.0217, + "rewards/chosen": 3.7179648081461587, + "rewards/margins": 14.725702921549479, + "rewards/rejected": -11.00773811340332, + "step": 8240 + }, + { + "epoch": 0.7529465509365006, + "grad_norm": 2.328125, + "kl": 0.0, + "learning_rate": 1.4378792748030184e-06, + "logits/chosen": 504053930.6666667, + "logits/rejected": 380158720.0, + "logps/chosen": -238.13277180989584, + "logps/rejected": -447.6837158203125, + "loss": 0.0222, + "rewards/chosen": 3.817878087361654, + "rewards/margins": 11.098858197530111, + "rewards/rejected": -7.280980110168457, + "step": 8241 + }, + { + "epoch": 0.7530379168570124, + "grad_norm": 4.28125, + "kl": 0.0, + "learning_rate": 1.436870447310242e-06, + "logits/chosen": 796114112.0, + "logits/rejected": 529285536.0, + "logps/chosen": -282.2408447265625, + "logps/rejected": -537.604736328125, + "loss": 0.0203, + "rewards/chosen": 4.557574272155762, + "rewards/margins": 13.749529838562012, + "rewards/rejected": -9.19195556640625, + "step": 8242 + }, + { + "epoch": 0.753129282777524, + "grad_norm": 1.7578125, + "kl": 0.0, + "learning_rate": 1.435861914455266e-06, + "logits/chosen": 374410624.0, + "logits/rejected": 210593280.0, + "logps/chosen": -230.783154296875, + "logps/rejected": -409.8891194661458, + "loss": 0.0163, + "rewards/chosen": 3.9816253662109373, + "rewards/margins": 13.79146842956543, + "rewards/rejected": -9.809843063354492, + "step": 8243 + }, + { + "epoch": 0.7532206486980356, + "grad_norm": 2.265625, + "kl": 0.0, + "learning_rate": 1.434853676321486e-06, + "logits/chosen": 929528832.0, + "logits/rejected": 605024426.6666666, + "logps/chosen": -325.92978515625, + "logps/rejected": -478.4806315104167, + "loss": 0.0124, + "rewards/chosen": 4.235404968261719, + "rewards/margins": 15.16253916422526, + "rewards/rejected": -10.927134195963541, + "step": 8244 + }, + { + "epoch": 0.7533120146185472, + "grad_norm": 4.125, + "kl": 0.0, + "learning_rate": 1.433845732992276e-06, + "logits/chosen": 560998400.0, + "logits/rejected": 326589845.3333333, + "logps/chosen": -347.692919921875, + "logps/rejected": -413.4396158854167, + "loss": 0.0264, + "rewards/chosen": 3.7500274658203123, + "rewards/margins": 12.695310084025063, + "rewards/rejected": -8.945282618204752, + "step": 8245 + }, + { + "epoch": 0.753403380539059, + "grad_norm": 1.6484375, + "kl": 0.0, + "learning_rate": 1.4328380845509837e-06, + "logits/chosen": 301930176.0, + "logits/rejected": 902304170.6666666, + "logps/chosen": -117.64080047607422, + "logps/rejected": -444.345703125, + "loss": 0.0119, + "rewards/chosen": 4.025655746459961, + "rewards/margins": 12.308122634887695, + "rewards/rejected": -8.282466888427734, + "step": 8246 + }, + { + "epoch": 0.7534947464595706, + "grad_norm": 4.0625, + "kl": 0.0, + "learning_rate": 1.431830731080931e-06, + "logits/chosen": 511815637.3333333, + "logits/rejected": 554306048.0, + "logps/chosen": -201.9913330078125, + "logps/rejected": -498.4042053222656, + "loss": 0.0428, + "rewards/chosen": 3.024562199910482, + "rewards/margins": 14.124188741048178, + "rewards/rejected": -11.099626541137695, + "step": 8247 + }, + { + "epoch": 0.7535861123800822, + "grad_norm": 1.5546875, + "kl": 0.0, + "learning_rate": 1.4308236726654151e-06, + "logits/chosen": 422095232.0, + "logits/rejected": 351276006.4, + "logps/chosen": -228.98360188802084, + "logps/rejected": -438.728271484375, + "loss": 0.0215, + "rewards/chosen": 3.33360226949056, + "rewards/margins": 11.471003087361654, + "rewards/rejected": -8.137400817871093, + "step": 8248 + }, + { + "epoch": 0.7536774783005938, + "grad_norm": 4.5625, + "kl": 0.0, + "learning_rate": 1.429816909387714e-06, + "logits/chosen": 464371029.3333333, + "logits/rejected": 548631910.4, + "logps/chosen": -283.99985758463544, + "logps/rejected": -600.1, + "loss": 0.0247, + "rewards/chosen": 3.3522745768229165, + "rewards/margins": 13.22674814860026, + "rewards/rejected": -9.874473571777344, + "step": 8249 + }, + { + "epoch": 0.7537688442211056, + "grad_norm": 2.453125, + "kl": 0.0, + "learning_rate": 1.428810441331076e-06, + "logits/chosen": 580317849.6, + "logits/rejected": 630824234.6666666, + "logps/chosen": -287.405810546875, + "logps/rejected": -536.8454996744791, + "loss": 0.0161, + "rewards/chosen": 4.475481796264648, + "rewards/margins": 12.470099258422852, + "rewards/rejected": -7.994617462158203, + "step": 8250 + }, + { + "epoch": 0.7538602101416172, + "grad_norm": 11.375, + "kl": 15.177886962890625, + "learning_rate": 1.427804268578727e-06, + "logits/chosen": 691403968.0, + "logps/chosen": -419.06060791015625, + "loss": 0.1189, + "rewards/chosen": 3.7974138259887695, + "step": 8251 + }, + { + "epoch": 0.7539515760621288, + "grad_norm": 1.546875, + "kl": 0.0, + "learning_rate": 1.4267983912138672e-06, + "logits/chosen": 223497557.33333334, + "logits/rejected": 564577382.4, + "logps/chosen": -245.2181396484375, + "logps/rejected": -336.069189453125, + "loss": 0.0105, + "rewards/chosen": 4.044927597045898, + "rewards/margins": 12.045620346069336, + "rewards/rejected": -8.000692749023438, + "step": 8252 + }, + { + "epoch": 0.7540429419826404, + "grad_norm": 0.43359375, + "kl": 0.0, + "learning_rate": 1.4257928093196732e-06, + "logits/chosen": 1091298304.0, + "logits/rejected": 414673536.0, + "logps/chosen": -539.79833984375, + "logps/rejected": -453.6648763020833, + "loss": 0.002, + "rewards/chosen": 5.261023044586182, + "rewards/margins": 14.864457289377848, + "rewards/rejected": -9.603434244791666, + "step": 8253 + }, + { + "epoch": 0.7541343079031522, + "grad_norm": 0.376953125, + "kl": 0.0, + "learning_rate": 1.424787522979299e-06, + "logits/chosen": 605117696.0, + "logits/rejected": 434001578.6666667, + "logps/chosen": -327.4710998535156, + "logps/rejected": -589.6029459635416, + "loss": 0.0017, + "rewards/chosen": 5.115997314453125, + "rewards/margins": 16.30835723876953, + "rewards/rejected": -11.192359924316406, + "step": 8254 + }, + { + "epoch": 0.7542256738236638, + "grad_norm": 2.375, + "kl": 0.0, + "learning_rate": 1.4237825322758735e-06, + "logits/chosen": 449405747.2, + "logits/rejected": 610121130.6666666, + "logps/chosen": -335.479931640625, + "logps/rejected": -992.1119791666666, + "loss": 0.0154, + "rewards/chosen": 4.463457870483398, + "rewards/margins": 15.71285514831543, + "rewards/rejected": -11.249397277832031, + "step": 8255 + }, + { + "epoch": 0.7543170397441754, + "grad_norm": 2.453125, + "kl": 0.0, + "learning_rate": 1.4227778372924954e-06, + "logits/chosen": 527284544.0, + "logits/rejected": 372578816.0, + "logps/chosen": -385.14569091796875, + "logps/rejected": -530.0528564453125, + "loss": 0.0193, + "rewards/chosen": 3.327078104019165, + "rewards/margins": 13.868861436843872, + "rewards/rejected": -10.541783332824707, + "step": 8256 + }, + { + "epoch": 0.754408405664687, + "grad_norm": 2.21875, + "kl": 0.0, + "learning_rate": 1.4217734381122478e-06, + "logits/chosen": 342981973.3333333, + "logits/rejected": 444635392.0, + "logps/chosen": -153.9963175455729, + "logps/rejected": -405.195654296875, + "loss": 0.0203, + "rewards/chosen": 3.6489044825236, + "rewards/margins": 12.657672182718912, + "rewards/rejected": -9.008767700195312, + "step": 8257 + }, + { + "epoch": 0.7544997715851988, + "grad_norm": 1.9921875, + "kl": 0.0, + "learning_rate": 1.4207693348181834e-06, + "logits/chosen": 547596697.6, + "logits/rejected": 534921472.0, + "logps/chosen": -362.085595703125, + "logps/rejected": -751.86279296875, + "loss": 0.0111, + "rewards/chosen": 4.536524963378906, + "rewards/margins": 14.182902526855468, + "rewards/rejected": -9.646377563476562, + "step": 8258 + }, + { + "epoch": 0.7545911375057104, + "grad_norm": 0.9140625, + "kl": 0.0, + "learning_rate": 1.419765527493333e-06, + "logits/chosen": 390243840.0, + "logits/rejected": 535166400.0, + "logps/chosen": -198.90841674804688, + "logps/rejected": -697.322265625, + "loss": 0.0147, + "rewards/chosen": 4.883247375488281, + "rewards/margins": 16.027061462402344, + "rewards/rejected": -11.143814086914062, + "step": 8259 + }, + { + "epoch": 0.754682503426222, + "grad_norm": 4.375, + "kl": 0.0, + "learning_rate": 1.4187620162207022e-06, + "logits/chosen": 607514009.6, + "logits/rejected": 480801749.3333333, + "logps/chosen": -344.063720703125, + "logps/rejected": -584.4677734375, + "loss": 0.0256, + "rewards/chosen": 4.0381622314453125, + "rewards/margins": 11.952044169108074, + "rewards/rejected": -7.913881937662761, + "step": 8260 + }, + { + "epoch": 0.7547738693467336, + "grad_norm": 1.4921875, + "kl": 0.0, + "learning_rate": 1.4177588010832699e-06, + "logits/chosen": 663051392.0, + "logits/rejected": 246986560.0, + "logps/chosen": -323.9698893229167, + "logps/rejected": -193.0207061767578, + "loss": 0.0124, + "rewards/chosen": 4.2485659917195635, + "rewards/margins": 12.781900723775227, + "rewards/rejected": -8.533334732055664, + "step": 8261 + }, + { + "epoch": 0.7548652352672454, + "grad_norm": 1.0078125, + "kl": 0.0, + "learning_rate": 1.416755882163996e-06, + "logits/chosen": 770597376.0, + "logits/rejected": 667370240.0, + "logps/chosen": -312.707275390625, + "logps/rejected": -529.6094156901041, + "loss": 0.0049, + "rewards/chosen": 4.19431209564209, + "rewards/margins": 12.576065381368002, + "rewards/rejected": -8.381753285725912, + "step": 8262 + }, + { + "epoch": 0.754956601187757, + "grad_norm": 1.6328125, + "kl": 0.0, + "learning_rate": 1.4157532595458111e-06, + "logits/chosen": 471017164.8, + "logits/rejected": 1030301354.6666666, + "logps/chosen": -204.78564453125, + "logps/rejected": -1222.5955403645833, + "loss": 0.0157, + "rewards/chosen": 4.120108032226563, + "rewards/margins": 17.137349192301432, + "rewards/rejected": -13.01724116007487, + "step": 8263 + }, + { + "epoch": 0.7550479671082686, + "grad_norm": 2.578125, + "kl": 0.0, + "learning_rate": 1.4147509333116228e-06, + "logits/chosen": 699201920.0, + "logits/rejected": 514820768.0, + "logps/chosen": -310.38323974609375, + "logps/rejected": -429.69482421875, + "loss": 0.0132, + "rewards/chosen": 3.9236762523651123, + "rewards/margins": 13.090930700302124, + "rewards/rejected": -9.167254447937012, + "step": 8264 + }, + { + "epoch": 0.7551393330287802, + "grad_norm": 5.78125, + "kl": 0.0, + "learning_rate": 1.413748903544313e-06, + "logits/chosen": 421574451.2, + "logits/rejected": 567471786.6666666, + "logps/chosen": -225.4111328125, + "logps/rejected": -244.8963419596354, + "loss": 0.1333, + "rewards/chosen": 2.5767898559570312, + "rewards/margins": 11.287164052327475, + "rewards/rejected": -8.710374196370443, + "step": 8265 + }, + { + "epoch": 0.755230698949292, + "grad_norm": 2.8125, + "kl": 0.0, + "learning_rate": 1.4127471703267432e-06, + "logits/chosen": 412087833.6, + "logits/rejected": 282056938.6666667, + "logps/chosen": -294.2318115234375, + "logps/rejected": -523.4868570963541, + "loss": 0.0212, + "rewards/chosen": 3.5411403656005858, + "rewards/margins": 12.67371826171875, + "rewards/rejected": -9.132577896118164, + "step": 8266 + }, + { + "epoch": 0.7553220648698036, + "grad_norm": 3.25, + "kl": 0.0, + "learning_rate": 1.4117457337417456e-06, + "logits/chosen": 674746982.4, + "logits/rejected": 293956053.3333333, + "logps/chosen": -404.6625244140625, + "logps/rejected": -370.6946614583333, + "loss": 0.0189, + "rewards/chosen": 3.7645858764648437, + "rewards/margins": 15.272745513916016, + "rewards/rejected": -11.508159637451172, + "step": 8267 + }, + { + "epoch": 0.7554134307903152, + "grad_norm": 3.15625, + "kl": 0.0, + "learning_rate": 1.4107445938721299e-06, + "logits/chosen": 589347200.0, + "logits/rejected": 1230762240.0, + "logps/chosen": -280.95452880859375, + "logps/rejected": -493.27020263671875, + "loss": 0.0146, + "rewards/chosen": 4.608509063720703, + "rewards/margins": 11.392073631286621, + "rewards/rejected": -6.783564567565918, + "step": 8268 + }, + { + "epoch": 0.7555047967108268, + "grad_norm": 2.03125, + "kl": 0.0, + "learning_rate": 1.4097437508006794e-06, + "logits/chosen": 815814400.0, + "logits/rejected": 595718656.0, + "logps/chosen": -342.3723551432292, + "logps/rejected": -533.7013549804688, + "loss": 0.0159, + "rewards/chosen": 4.033904393513997, + "rewards/margins": 11.754469235738117, + "rewards/rejected": -7.720564842224121, + "step": 8269 + }, + { + "epoch": 0.7555961626313386, + "grad_norm": 5.46875, + "kl": 0.0, + "learning_rate": 1.4087432046101578e-06, + "logits/chosen": 580616755.2, + "logits/rejected": 814786730.6666666, + "logps/chosen": -375.640283203125, + "logps/rejected": -434.17578125, + "loss": 0.0325, + "rewards/chosen": 3.6836475372314452, + "rewards/margins": 11.617807133992512, + "rewards/rejected": -7.934159596761067, + "step": 8270 + }, + { + "epoch": 0.7556875285518502, + "grad_norm": 6.15625, + "kl": 0.0, + "learning_rate": 1.4077429553832995e-06, + "logits/chosen": 558297770.6666666, + "logits/rejected": 811667968.0, + "logps/chosen": -330.27964274088544, + "logps/rejected": -499.26263427734375, + "loss": 0.0353, + "rewards/chosen": 3.530419031778971, + "rewards/margins": 13.214720408121744, + "rewards/rejected": -9.684301376342773, + "step": 8271 + }, + { + "epoch": 0.7557788944723618, + "grad_norm": 0.82421875, + "kl": 0.0, + "learning_rate": 1.4067430032028157e-06, + "logits/chosen": 510700096.0, + "logits/rejected": 378726838.85714287, + "logps/chosen": -308.7945556640625, + "logps/rejected": -545.5021623883929, + "loss": 0.0035, + "rewards/chosen": 3.72715163230896, + "rewards/margins": 13.043742077691215, + "rewards/rejected": -9.316590445382255, + "step": 8272 + }, + { + "epoch": 0.7558702603928734, + "grad_norm": 19.375, + "kl": 0.0, + "learning_rate": 1.4057433481513932e-06, + "logits/chosen": 613203865.6, + "logits/rejected": 752396800.0, + "logps/chosen": -320.0416015625, + "logps/rejected": -319.9847412109375, + "loss": 0.1286, + "rewards/chosen": 2.3849018096923826, + "rewards/margins": 10.327276992797852, + "rewards/rejected": -7.942375183105469, + "step": 8273 + }, + { + "epoch": 0.7559616263133851, + "grad_norm": 2.890625, + "kl": 0.0, + "learning_rate": 1.4047439903116933e-06, + "logits/chosen": 877139541.3333334, + "logits/rejected": 543892352.0, + "logps/chosen": -420.1424967447917, + "logps/rejected": -491.06671142578125, + "loss": 0.0177, + "rewards/chosen": 4.230398178100586, + "rewards/margins": 14.040923118591309, + "rewards/rejected": -9.810524940490723, + "step": 8274 + }, + { + "epoch": 0.7560529922338968, + "grad_norm": 1.6640625, + "kl": 0.0, + "learning_rate": 1.4037449297663558e-06, + "logits/chosen": 715557120.0, + "logits/rejected": 553180672.0, + "logps/chosen": -406.0456787109375, + "logps/rejected": -530.947021484375, + "loss": 0.01, + "rewards/chosen": 4.271619415283203, + "rewards/margins": 14.434009806315103, + "rewards/rejected": -10.1623903910319, + "step": 8275 + }, + { + "epoch": 0.7561443581544084, + "grad_norm": 6.34375, + "kl": 0.0, + "learning_rate": 1.4027461665979924e-06, + "logits/chosen": 543290368.0, + "logits/rejected": 384479808.0, + "logps/chosen": -205.15374755859375, + "logps/rejected": -483.7695617675781, + "loss": 0.1353, + "rewards/chosen": 1.9976482391357422, + "rewards/margins": 13.800793647766113, + "rewards/rejected": -11.803145408630371, + "step": 8276 + }, + { + "epoch": 0.75623572407492, + "grad_norm": 3.015625, + "kl": 0.0, + "learning_rate": 1.4017477008891916e-06, + "logits/chosen": 594515712.0, + "logits/rejected": 483731808.0, + "logps/chosen": -375.6000671386719, + "logps/rejected": -460.595703125, + "loss": 0.0204, + "rewards/chosen": 3.321211338043213, + "rewards/margins": 12.876848697662354, + "rewards/rejected": -9.55563735961914, + "step": 8277 + }, + { + "epoch": 0.7563270899954317, + "grad_norm": 2.59375, + "kl": 0.0, + "learning_rate": 1.4007495327225162e-06, + "logits/chosen": 481227840.0, + "logits/rejected": 575375104.0, + "logps/chosen": -454.69097900390625, + "logps/rejected": -447.6039733886719, + "loss": 0.0143, + "rewards/chosen": 4.28266716003418, + "rewards/margins": 13.557275772094727, + "rewards/rejected": -9.274608612060547, + "step": 8278 + }, + { + "epoch": 0.7564184559159434, + "grad_norm": 7.03125, + "kl": 0.0, + "learning_rate": 1.3997516621805086e-06, + "logits/chosen": 407683136.0, + "logits/rejected": 679486976.0, + "logps/chosen": -373.56689453125, + "logps/rejected": -454.89361572265625, + "loss": 0.1102, + "rewards/chosen": 3.868426561355591, + "rewards/margins": 11.379446268081665, + "rewards/rejected": -7.511019706726074, + "step": 8279 + }, + { + "epoch": 0.756509821836455, + "grad_norm": 2.8125, + "kl": 0.0, + "learning_rate": 1.3987540893456808e-06, + "logits/chosen": 649755852.8, + "logits/rejected": 653584597.3333334, + "logps/chosen": -558.134814453125, + "logps/rejected": -392.7259114583333, + "loss": 0.0175, + "rewards/chosen": 3.8511146545410155, + "rewards/margins": 13.059534454345703, + "rewards/rejected": -9.208419799804688, + "step": 8280 + }, + { + "epoch": 0.7566011877569666, + "grad_norm": 3.375, + "kl": 0.0, + "learning_rate": 1.3977568143005237e-06, + "logits/chosen": 659337045.3333334, + "logits/rejected": 731004825.6, + "logps/chosen": -422.410888671875, + "logps/rejected": -381.794580078125, + "loss": 0.1291, + "rewards/chosen": 2.9692373275756836, + "rewards/margins": 8.77205982208252, + "rewards/rejected": -5.802822494506836, + "step": 8281 + }, + { + "epoch": 0.7566925536774783, + "grad_norm": 1.0390625, + "kl": 0.0, + "learning_rate": 1.3967598371275026e-06, + "logits/chosen": 690616576.0, + "logits/rejected": 846725888.0, + "logps/chosen": -190.4345245361328, + "logps/rejected": -492.3461507161458, + "loss": 0.0059, + "rewards/chosen": 3.833517551422119, + "rewards/margins": 14.637364546457926, + "rewards/rejected": -10.803846995035807, + "step": 8282 + }, + { + "epoch": 0.75678391959799, + "grad_norm": 3.09375, + "kl": 0.0, + "learning_rate": 1.3957631579090581e-06, + "logits/chosen": 645124710.4, + "logits/rejected": 541000832.0, + "logps/chosen": -329.724267578125, + "logps/rejected": -311.460693359375, + "loss": 0.024, + "rewards/chosen": 3.29107666015625, + "rewards/margins": 12.215337117513021, + "rewards/rejected": -8.924260457356771, + "step": 8283 + }, + { + "epoch": 0.7568752855185016, + "grad_norm": 2.859375, + "kl": 0.0, + "learning_rate": 1.3947667767276063e-06, + "logits/chosen": 170188160.0, + "logits/rejected": 404250806.85714287, + "logps/chosen": -90.92575073242188, + "logps/rejected": -500.6742466517857, + "loss": 0.0188, + "rewards/chosen": 1.7381401062011719, + "rewards/margins": 12.745676858084542, + "rewards/rejected": -11.00753675188337, + "step": 8284 + }, + { + "epoch": 0.7569666514390132, + "grad_norm": 40.0, + "kl": 0.0, + "learning_rate": 1.3937706936655394e-06, + "logits/chosen": 709542336.0, + "logits/rejected": 682437632.0, + "logps/chosen": -380.5798645019531, + "logps/rejected": -404.8797607421875, + "loss": 0.0505, + "rewards/chosen": 3.9869744777679443, + "rewards/margins": 10.58824610710144, + "rewards/rejected": -6.601271629333496, + "step": 8285 + }, + { + "epoch": 0.7570580173595249, + "grad_norm": 2.84375, + "kl": 0.0, + "learning_rate": 1.3927749088052218e-06, + "logits/chosen": 583111424.0, + "logits/rejected": 1007542681.6, + "logps/chosen": -270.9441731770833, + "logps/rejected": -503.720947265625, + "loss": 0.0192, + "rewards/chosen": 3.1380650202433267, + "rewards/margins": 12.389638964335123, + "rewards/rejected": -9.251573944091797, + "step": 8286 + }, + { + "epoch": 0.7571493832800366, + "grad_norm": 2.78125, + "kl": 0.0, + "learning_rate": 1.3917794222289988e-06, + "logits/chosen": 379766195.2, + "logits/rejected": 356460266.6666667, + "logps/chosen": -288.031884765625, + "logps/rejected": -384.8417561848958, + "loss": 0.0221, + "rewards/chosen": 3.6513362884521485, + "rewards/margins": 12.241047159830728, + "rewards/rejected": -8.58971087137858, + "step": 8287 + }, + { + "epoch": 0.7572407492005482, + "grad_norm": 2.796875, + "kl": 0.0, + "learning_rate": 1.3907842340191875e-06, + "logits/chosen": 532681184.0, + "logits/rejected": 589064704.0, + "logps/chosen": -313.1520080566406, + "logps/rejected": -341.2751770019531, + "loss": 0.022, + "rewards/chosen": 4.100953578948975, + "rewards/margins": 12.078917503356934, + "rewards/rejected": -7.977963924407959, + "step": 8288 + }, + { + "epoch": 0.7573321151210598, + "grad_norm": 4.75, + "kl": 0.0, + "learning_rate": 1.389789344258079e-06, + "logits/chosen": 692652800.0, + "logits/rejected": 249089408.0, + "logps/chosen": -396.4991861979167, + "logps/rejected": -1039.1944580078125, + "loss": 0.0339, + "rewards/chosen": 3.6230738957722983, + "rewards/margins": 15.544226010640463, + "rewards/rejected": -11.921152114868164, + "step": 8289 + }, + { + "epoch": 0.7574234810415715, + "grad_norm": 4.90625, + "kl": 0.0, + "learning_rate": 1.3887947530279411e-06, + "logits/chosen": 704621568.0, + "logits/rejected": 420521907.2, + "logps/chosen": -453.7911783854167, + "logps/rejected": -445.83056640625, + "loss": 0.0256, + "rewards/chosen": 2.9309183756510415, + "rewards/margins": 12.776800791422525, + "rewards/rejected": -9.845882415771484, + "step": 8290 + }, + { + "epoch": 0.7575148469620832, + "grad_norm": 4.375, + "kl": 0.0, + "learning_rate": 1.3878004604110207e-06, + "logits/chosen": 1266361088.0, + "logits/rejected": 513374400.0, + "logps/chosen": -399.9807535807292, + "logps/rejected": -460.74530029296875, + "loss": 0.03, + "rewards/chosen": 3.8779710133870444, + "rewards/margins": 14.290732701619467, + "rewards/rejected": -10.412761688232422, + "step": 8291 + }, + { + "epoch": 0.7576062128825948, + "grad_norm": 4.28125, + "kl": 0.0, + "learning_rate": 1.3868064664895338e-06, + "logits/chosen": 539556480.0, + "logits/rejected": 439605796.5714286, + "logps/chosen": -326.3577880859375, + "logps/rejected": -489.17857142857144, + "loss": 0.015, + "rewards/chosen": 2.0020720958709717, + "rewards/margins": 12.320491416113716, + "rewards/rejected": -10.318419320242745, + "step": 8292 + }, + { + "epoch": 0.7576975788031064, + "grad_norm": 1.828125, + "kl": 0.0, + "learning_rate": 1.3858127713456754e-06, + "logits/chosen": 889640789.3333334, + "logits/rejected": 866048204.8, + "logps/chosen": -432.7923583984375, + "logps/rejected": -357.27890625, + "loss": 0.0105, + "rewards/chosen": 3.666629155476888, + "rewards/margins": 11.78974469502767, + "rewards/rejected": -8.123115539550781, + "step": 8293 + }, + { + "epoch": 0.7577889447236181, + "grad_norm": 2.8125, + "kl": 0.0, + "learning_rate": 1.3848193750616146e-06, + "logits/chosen": 544902976.0, + "logits/rejected": 898286336.0, + "logps/chosen": -326.62109375, + "logps/rejected": -576.3267822265625, + "loss": 0.0129, + "rewards/chosen": 4.209413528442383, + "rewards/margins": 13.478926658630371, + "rewards/rejected": -9.269513130187988, + "step": 8294 + }, + { + "epoch": 0.7578803106441298, + "grad_norm": 6.6875, + "kl": 0.0, + "learning_rate": 1.3838262777194945e-06, + "logits/chosen": 400280985.6, + "logits/rejected": 459738240.0, + "logps/chosen": -353.9322998046875, + "logps/rejected": -854.4471028645834, + "loss": 0.0353, + "rewards/chosen": 3.4116657257080076, + "rewards/margins": 14.952303695678712, + "rewards/rejected": -11.540637969970703, + "step": 8295 + }, + { + "epoch": 0.7579716765646414, + "grad_norm": 48.75, + "kl": 0.0, + "learning_rate": 1.382833479401438e-06, + "logits/chosen": 365101275.4285714, + "logits/rejected": 187392512.0, + "logps/chosen": -238.70427594866072, + "logps/rejected": -354.65264892578125, + "loss": 0.0773, + "rewards/chosen": 4.197975431169782, + "rewards/margins": 14.4959442956107, + "rewards/rejected": -10.297968864440918, + "step": 8296 + }, + { + "epoch": 0.758063042485153, + "grad_norm": 0.91015625, + "kl": 0.0, + "learning_rate": 1.3818409801895393e-06, + "logits/chosen": 327444181.3333333, + "logits/rejected": 525905664.0, + "logps/chosen": -292.0163167317708, + "logps/rejected": -420.595361328125, + "loss": 0.0051, + "rewards/chosen": 4.641984303792317, + "rewards/margins": 12.340211232503254, + "rewards/rejected": -7.6982269287109375, + "step": 8297 + }, + { + "epoch": 0.7581544084056647, + "grad_norm": 20.25, + "kl": 0.0, + "learning_rate": 1.3808487801658682e-06, + "logits/chosen": 363791974.4, + "logits/rejected": 503077930.6666667, + "logps/chosen": -321.31201171875, + "logps/rejected": -452.2931315104167, + "loss": 0.0392, + "rewards/chosen": 3.9054500579833986, + "rewards/margins": 10.548705291748046, + "rewards/rejected": -6.643255233764648, + "step": 8298 + }, + { + "epoch": 0.7582457743261763, + "grad_norm": 4.21875, + "kl": 0.0, + "learning_rate": 1.3798568794124696e-06, + "logits/chosen": 658291404.8, + "logits/rejected": 499874261.3333333, + "logps/chosen": -342.442041015625, + "logps/rejected": -575.64599609375, + "loss": 0.0379, + "rewards/chosen": 2.861550521850586, + "rewards/margins": 12.714045079549155, + "rewards/rejected": -9.852494557698568, + "step": 8299 + }, + { + "epoch": 0.758337140246688, + "grad_norm": 3.4375, + "kl": 0.0, + "learning_rate": 1.378865278011367e-06, + "logits/chosen": 589621760.0, + "logits/rejected": 288014912.0, + "logps/chosen": -408.1256917317708, + "logps/rejected": -430.9631042480469, + "loss": 0.0184, + "rewards/chosen": 3.885248819986979, + "rewards/margins": 12.701162974039713, + "rewards/rejected": -8.815914154052734, + "step": 8300 + }, + { + "epoch": 0.7584285061671996, + "grad_norm": 2.609375, + "kl": 0.0, + "learning_rate": 1.3778739760445552e-06, + "logits/chosen": 348135584.0, + "logits/rejected": 311570304.0, + "logps/chosen": -269.8395080566406, + "logps/rejected": -512.0165405273438, + "loss": 0.0158, + "rewards/chosen": 4.359105587005615, + "rewards/margins": 14.276122570037842, + "rewards/rejected": -9.917016983032227, + "step": 8301 + }, + { + "epoch": 0.7585198720877113, + "grad_norm": 13.1875, + "kl": 0.0, + "learning_rate": 1.3768829735940065e-06, + "logits/chosen": 561683392.0, + "logits/rejected": 741331648.0, + "logps/chosen": -365.86602783203125, + "logps/rejected": -515.970458984375, + "loss": 0.0608, + "rewards/chosen": 3.366887331008911, + "rewards/margins": 10.665203332901001, + "rewards/rejected": -7.29831600189209, + "step": 8302 + }, + { + "epoch": 0.7586112380082229, + "grad_norm": 3.0625, + "kl": 0.0, + "learning_rate": 1.3758922707416667e-06, + "logits/chosen": 433376597.3333333, + "logits/rejected": 732923968.0, + "logps/chosen": -234.59647623697916, + "logps/rejected": -956.1572875976562, + "loss": 0.0223, + "rewards/chosen": 4.0067094167073565, + "rewards/margins": 16.634795506795246, + "rewards/rejected": -12.62808609008789, + "step": 8303 + }, + { + "epoch": 0.7587026039287346, + "grad_norm": 1.1484375, + "kl": 0.0, + "learning_rate": 1.3749018675694559e-06, + "logits/chosen": 686814805.3333334, + "logits/rejected": 393224857.6, + "logps/chosen": -323.5045572916667, + "logps/rejected": -458.6484375, + "loss": 0.0059, + "rewards/chosen": 4.206698735555013, + "rewards/margins": 13.921785863240558, + "rewards/rejected": -9.715087127685546, + "step": 8304 + }, + { + "epoch": 0.7587939698492462, + "grad_norm": 2.640625, + "kl": 0.0, + "learning_rate": 1.3739117641592752e-06, + "logits/chosen": 657058816.0, + "logits/rejected": 519765248.0, + "logps/chosen": -499.9806315104167, + "logps/rejected": -422.06572265625, + "loss": 0.0154, + "rewards/chosen": 3.1909395853678384, + "rewards/margins": 12.060969034830729, + "rewards/rejected": -8.87002944946289, + "step": 8305 + }, + { + "epoch": 0.7588853357697579, + "grad_norm": 6.46875, + "kl": 0.0, + "learning_rate": 1.3729219605929955e-06, + "logits/chosen": 916023705.6, + "logits/rejected": 690907946.6666666, + "logps/chosen": -337.0040771484375, + "logps/rejected": -772.7517903645834, + "loss": 0.0388, + "rewards/chosen": 3.498554229736328, + "rewards/margins": 16.235209147135414, + "rewards/rejected": -12.736654917399088, + "step": 8306 + }, + { + "epoch": 0.7589767016902695, + "grad_norm": 1.6171875, + "kl": 0.0, + "learning_rate": 1.371932456952464e-06, + "logits/chosen": 871303808.0, + "logits/rejected": 1209809024.0, + "logps/chosen": -285.87176513671875, + "logps/rejected": -445.5521545410156, + "loss": 0.007, + "rewards/chosen": 4.981531620025635, + "rewards/margins": 11.86939001083374, + "rewards/rejected": -6.8878583908081055, + "step": 8307 + }, + { + "epoch": 0.7590680676107812, + "grad_norm": 2.625, + "kl": 0.0, + "learning_rate": 1.3709432533195016e-06, + "logits/chosen": 341716160.0, + "logits/rejected": 334457450.6666667, + "logps/chosen": -309.7421875, + "logps/rejected": -472.6202392578125, + "loss": 0.0103, + "rewards/chosen": 4.774041652679443, + "rewards/margins": 14.956579367319742, + "rewards/rejected": -10.182537714640299, + "step": 8308 + }, + { + "epoch": 0.7591594335312928, + "grad_norm": 5.125, + "kl": 0.0, + "learning_rate": 1.3699543497759115e-06, + "logits/chosen": 374575328.0, + "logits/rejected": 691427712.0, + "logps/chosen": -202.3642578125, + "logps/rejected": -334.3098449707031, + "loss": 0.1163, + "rewards/chosen": 3.6460037231445312, + "rewards/margins": 10.585303783416748, + "rewards/rejected": -6.939300060272217, + "step": 8309 + }, + { + "epoch": 0.7592507994518045, + "grad_norm": 5.0, + "kl": 0.0, + "learning_rate": 1.3689657464034617e-06, + "logits/chosen": 474049365.3333333, + "logits/rejected": 485567488.0, + "logps/chosen": -210.33367919921875, + "logps/rejected": -508.802099609375, + "loss": 0.0104, + "rewards/chosen": 4.089865048726399, + "rewards/margins": 12.695249112447101, + "rewards/rejected": -8.605384063720702, + "step": 8310 + }, + { + "epoch": 0.7593421653723161, + "grad_norm": 2.09375, + "kl": 0.0, + "learning_rate": 1.3679774432839016e-06, + "logits/chosen": 678657536.0, + "logits/rejected": 801470156.8, + "logps/chosen": -271.36395263671875, + "logps/rejected": -507.84072265625, + "loss": 0.0128, + "rewards/chosen": 3.7396672566731772, + "rewards/margins": 13.29858144124349, + "rewards/rejected": -9.558914184570312, + "step": 8311 + }, + { + "epoch": 0.7594335312928278, + "grad_norm": 1.9140625, + "kl": 0.0, + "learning_rate": 1.3669894404989564e-06, + "logits/chosen": 534747392.0, + "logits/rejected": 388986777.6, + "logps/chosen": -320.8298746744792, + "logps/rejected": -674.500390625, + "loss": 0.0133, + "rewards/chosen": 3.3499879837036133, + "rewards/margins": 15.15846118927002, + "rewards/rejected": -11.808473205566406, + "step": 8312 + }, + { + "epoch": 0.7595248972133394, + "grad_norm": 2.828125, + "kl": 0.0, + "learning_rate": 1.3660017381303242e-06, + "logits/chosen": 746490880.0, + "logits/rejected": 357242700.8, + "logps/chosen": -589.9669596354166, + "logps/rejected": -528.53935546875, + "loss": 0.0134, + "rewards/chosen": 3.3587252298990884, + "rewards/margins": 14.454564158121745, + "rewards/rejected": -11.095838928222657, + "step": 8313 + }, + { + "epoch": 0.7596162631338511, + "grad_norm": 0.036865234375, + "kl": 0.0, + "learning_rate": 1.365014336259678e-06, + "logits/rejected": 475693152.0, + "logps/rejected": -669.850341796875, + "loss": 0.0001, + "rewards/rejected": -11.030515670776367, + "step": 8314 + }, + { + "epoch": 0.7597076290543627, + "grad_norm": 3.296875, + "kl": 0.0, + "learning_rate": 1.3640272349686678e-06, + "logits/chosen": 495653600.0, + "logits/rejected": 422990496.0, + "logps/chosen": -265.07708740234375, + "logps/rejected": -447.47418212890625, + "loss": 0.0214, + "rewards/chosen": 3.382810115814209, + "rewards/margins": 13.746241092681885, + "rewards/rejected": -10.363430976867676, + "step": 8315 + }, + { + "epoch": 0.7597989949748744, + "grad_norm": 1.78125, + "kl": 0.0, + "learning_rate": 1.363040434338916e-06, + "logits/chosen": 813525162.6666666, + "logits/rejected": 671369011.2, + "logps/chosen": -360.0463460286458, + "logps/rejected": -625.227392578125, + "loss": 0.0107, + "rewards/chosen": 3.797440528869629, + "rewards/margins": 14.26661548614502, + "rewards/rejected": -10.46917495727539, + "step": 8316 + }, + { + "epoch": 0.759890360895386, + "grad_norm": 2.6875, + "kl": 0.0, + "learning_rate": 1.362053934452024e-06, + "logits/chosen": 282224682.6666667, + "logits/rejected": 747500224.0, + "logps/chosen": -228.8346964518229, + "logps/rejected": -484.4925842285156, + "loss": 0.0133, + "rewards/chosen": 4.8878434499104815, + "rewards/margins": 14.672774632771809, + "rewards/rejected": -9.784931182861328, + "step": 8317 + }, + { + "epoch": 0.7599817268158977, + "grad_norm": 1.515625, + "kl": 0.0, + "learning_rate": 1.3610677353895652e-06, + "logits/chosen": 484335200.0, + "logits/rejected": 400632576.0, + "logps/chosen": -245.79531860351562, + "logps/rejected": -499.7273254394531, + "loss": 0.01, + "rewards/chosen": 3.994452714920044, + "rewards/margins": 14.103485345840454, + "rewards/rejected": -10.10903263092041, + "step": 8318 + }, + { + "epoch": 0.7600730927364093, + "grad_norm": 3.953125, + "kl": 0.0, + "learning_rate": 1.36008183723309e-06, + "logits/chosen": 502525184.0, + "logits/rejected": 291397632.0, + "logps/chosen": -271.206298828125, + "logps/rejected": -437.46002197265625, + "loss": 0.0304, + "rewards/chosen": 3.792091687520345, + "rewards/margins": 13.115548451741537, + "rewards/rejected": -9.323456764221191, + "step": 8319 + }, + { + "epoch": 0.760164458656921, + "grad_norm": 3.46875, + "kl": 0.0, + "learning_rate": 1.35909624006412e-06, + "logits/chosen": 450297280.0, + "logits/rejected": 494855456.0, + "logps/chosen": -351.04986572265625, + "logps/rejected": -406.76983642578125, + "loss": 0.0149, + "rewards/chosen": 4.257725715637207, + "rewards/margins": 13.13059139251709, + "rewards/rejected": -8.872865676879883, + "step": 8320 + }, + { + "epoch": 0.7602558245774326, + "grad_norm": 2.171875, + "kl": 0.0, + "learning_rate": 1.3581109439641587e-06, + "logits/chosen": 597320704.0, + "logits/rejected": 1163732864.0, + "logps/chosen": -303.4920654296875, + "logps/rejected": -529.9683227539062, + "loss": 0.0141, + "rewards/chosen": 4.0430521965026855, + "rewards/margins": 11.801790714263916, + "rewards/rejected": -7.7587385177612305, + "step": 8321 + }, + { + "epoch": 0.7603471904979443, + "grad_norm": 2.25, + "kl": 0.0, + "learning_rate": 1.3571259490146798e-06, + "logits/chosen": 462112341.3333333, + "logits/rejected": 340068480.0, + "logps/chosen": -254.3058064778646, + "logps/rejected": -398.9426574707031, + "loss": 0.0175, + "rewards/chosen": 4.273759841918945, + "rewards/margins": 12.94080924987793, + "rewards/rejected": -8.667049407958984, + "step": 8322 + }, + { + "epoch": 0.7604385564184559, + "grad_norm": 1.75, + "kl": 0.0, + "learning_rate": 1.356141255297133e-06, + "logits/chosen": 577278293.3333334, + "logits/rejected": 632376268.8, + "logps/chosen": -215.1134236653646, + "logps/rejected": -545.360693359375, + "loss": 0.0108, + "rewards/chosen": 3.6802285512288413, + "rewards/margins": 13.959965642293295, + "rewards/rejected": -10.279737091064453, + "step": 8323 + }, + { + "epoch": 0.7605299223389675, + "grad_norm": 3.46875, + "kl": 0.0, + "learning_rate": 1.3551568628929434e-06, + "logits/chosen": 579837184.0, + "logits/rejected": 456148053.3333333, + "logps/chosen": -317.9497375488281, + "logps/rejected": -608.338134765625, + "loss": 0.0189, + "rewards/chosen": 2.543509006500244, + "rewards/margins": 12.735540866851807, + "rewards/rejected": -10.192031860351562, + "step": 8324 + }, + { + "epoch": 0.7606212882594792, + "grad_norm": 0.7578125, + "kl": 0.0, + "learning_rate": 1.3541727718835097e-06, + "logits/chosen": 525754880.0, + "logits/rejected": 426791936.0, + "logps/chosen": -353.70556640625, + "logps/rejected": -300.8759521484375, + "loss": 0.0048, + "rewards/chosen": 4.587148030598958, + "rewards/margins": 12.655264790852865, + "rewards/rejected": -8.068116760253906, + "step": 8325 + }, + { + "epoch": 0.7607126541799909, + "grad_norm": 0.96484375, + "kl": 0.0, + "learning_rate": 1.3531889823502092e-06, + "logits/chosen": 669784576.0, + "logits/rejected": 386472917.3333333, + "logps/chosen": -296.0090576171875, + "logps/rejected": -590.9964192708334, + "loss": 0.0066, + "rewards/chosen": 4.705908203125, + "rewards/margins": 16.155526224772135, + "rewards/rejected": -11.449618021647135, + "step": 8326 + }, + { + "epoch": 0.7608040201005025, + "grad_norm": 5.0, + "kl": 0.0, + "learning_rate": 1.3522054943743917e-06, + "logits/chosen": 564306278.4, + "logits/rejected": 548568917.3333334, + "logps/chosen": -267.2607421875, + "logps/rejected": -671.7459309895834, + "loss": 0.1387, + "rewards/chosen": 2.1983402252197264, + "rewards/margins": 13.11642468770345, + "rewards/rejected": -10.918084462483725, + "step": 8327 + }, + { + "epoch": 0.7608953860210141, + "grad_norm": 4.125, + "kl": 0.0, + "learning_rate": 1.3512223080373826e-06, + "logits/chosen": 645631744.0, + "logits/rejected": 710051669.3333334, + "logps/chosen": -416.64068603515625, + "logps/rejected": -501.7416585286458, + "loss": 0.0181, + "rewards/chosen": 2.7128143310546875, + "rewards/margins": 11.863815307617188, + "rewards/rejected": -9.1510009765625, + "step": 8328 + }, + { + "epoch": 0.7609867519415258, + "grad_norm": 0.408203125, + "kl": 0.0, + "learning_rate": 1.3502394234204803e-06, + "logits/chosen": 438164928.0, + "logits/rejected": 507487744.0, + "logps/chosen": -407.9393310546875, + "logps/rejected": -632.7202845982143, + "loss": 0.0016, + "rewards/chosen": 4.377728462219238, + "rewards/margins": 14.822582108633858, + "rewards/rejected": -10.44485364641462, + "step": 8329 + }, + { + "epoch": 0.7610781178620375, + "grad_norm": 1.234375, + "kl": 0.0, + "learning_rate": 1.3492568406049627e-06, + "logits/chosen": 536045696.0, + "logits/rejected": 484680928.0, + "logps/chosen": -485.4808044433594, + "logps/rejected": -443.96575927734375, + "loss": 0.0079, + "rewards/chosen": 4.675558090209961, + "rewards/margins": 13.069160461425781, + "rewards/rejected": -8.39360237121582, + "step": 8330 + }, + { + "epoch": 0.7611694837825491, + "grad_norm": 1.8984375, + "kl": 0.0, + "learning_rate": 1.3482745596720804e-06, + "logits/chosen": 693065088.0, + "logits/rejected": 1176032256.0, + "logps/chosen": -364.97210693359375, + "logps/rejected": -652.4867553710938, + "loss": 0.0089, + "rewards/chosen": 4.172060966491699, + "rewards/margins": 14.408061027526855, + "rewards/rejected": -10.236000061035156, + "step": 8331 + }, + { + "epoch": 0.7612608497030607, + "grad_norm": 2.4375, + "kl": 0.0, + "learning_rate": 1.3472925807030579e-06, + "logits/chosen": 252743872.0, + "logits/rejected": 693652160.0, + "logps/chosen": -141.36834716796875, + "logps/rejected": -766.204833984375, + "loss": 0.0286, + "rewards/chosen": 3.572833299636841, + "rewards/margins": 13.428565263748169, + "rewards/rejected": -9.855731964111328, + "step": 8332 + }, + { + "epoch": 0.7613522156235724, + "grad_norm": 2.75, + "kl": 0.0, + "learning_rate": 1.346310903779094e-06, + "logits/chosen": 660589056.0, + "logits/rejected": 1517831850.6666667, + "logps/chosen": -382.54033203125, + "logps/rejected": -671.0814615885416, + "loss": 0.0171, + "rewards/chosen": 3.8593372344970702, + "rewards/margins": 12.256817245483399, + "rewards/rejected": -8.397480010986328, + "step": 8333 + }, + { + "epoch": 0.7614435815440841, + "grad_norm": 0.9453125, + "kl": 0.0, + "learning_rate": 1.3453295289813677e-06, + "logits/chosen": 506886656.0, + "logits/rejected": 478150016.0, + "logps/chosen": -317.9093017578125, + "logps/rejected": -446.21295166015625, + "loss": 0.0063, + "rewards/chosen": 4.560920715332031, + "rewards/margins": 12.767688751220703, + "rewards/rejected": -8.206768035888672, + "step": 8334 + }, + { + "epoch": 0.7615349474645957, + "grad_norm": 2.796875, + "kl": 0.0, + "learning_rate": 1.344348456391028e-06, + "logits/chosen": 576812800.0, + "logits/rejected": 648968755.2, + "logps/chosen": -212.32625325520834, + "logps/rejected": -509.187841796875, + "loss": 0.0197, + "rewards/chosen": 3.0216217041015625, + "rewards/margins": 12.924559020996094, + "rewards/rejected": -9.902937316894532, + "step": 8335 + }, + { + "epoch": 0.7616263133851073, + "grad_norm": 5.1875, + "kl": 0.0, + "learning_rate": 1.3433676860892025e-06, + "logits/chosen": 549143296.0, + "logits/rejected": 542928384.0, + "logps/chosen": -443.5523274739583, + "logps/rejected": -369.8460693359375, + "loss": 0.0292, + "rewards/chosen": 3.40840212504069, + "rewards/margins": 16.130720774332683, + "rewards/rejected": -12.722318649291992, + "step": 8336 + }, + { + "epoch": 0.761717679305619, + "grad_norm": 1.3125, + "kl": 0.0, + "learning_rate": 1.3423872181569868e-06, + "logits/chosen": 509157792.0, + "logits/rejected": 537917184.0, + "logps/chosen": -335.17962646484375, + "logps/rejected": -513.908447265625, + "loss": 0.0085, + "rewards/chosen": 4.393202304840088, + "rewards/margins": 12.488114833831787, + "rewards/rejected": -8.0949125289917, + "step": 8337 + }, + { + "epoch": 0.7618090452261307, + "grad_norm": 81.5, + "kl": 0.0, + "learning_rate": 1.3414070526754613e-06, + "logits/chosen": 459477802.6666667, + "logits/rejected": 536055712.0, + "logps/chosen": -270.1695963541667, + "logps/rejected": -598.953857421875, + "loss": 0.0576, + "rewards/chosen": 3.416803995768229, + "rewards/margins": 13.203742663065592, + "rewards/rejected": -9.786938667297363, + "step": 8338 + }, + { + "epoch": 0.7619004111466423, + "grad_norm": 3.109375, + "kl": 0.0, + "learning_rate": 1.3404271897256743e-06, + "logits/chosen": 229666672.0, + "logits/rejected": 529501738.6666667, + "logps/chosen": -118.78644561767578, + "logps/rejected": -618.97412109375, + "loss": 0.1296, + "rewards/chosen": -0.3647247552871704, + "rewards/margins": 10.809611996014914, + "rewards/rejected": -11.174336751302084, + "step": 8339 + }, + { + "epoch": 0.7619917770671539, + "grad_norm": 2.09375, + "kl": 0.0, + "learning_rate": 1.3394476293886527e-06, + "logits/chosen": 562951082.6666666, + "logits/rejected": 494429491.2, + "logps/chosen": -218.832275390625, + "logps/rejected": -578.91640625, + "loss": 0.012, + "rewards/chosen": 4.087443033854167, + "rewards/margins": 13.857811228434244, + "rewards/rejected": -9.770368194580078, + "step": 8340 + }, + { + "epoch": 0.7620831429876656, + "grad_norm": 4.15625, + "kl": 0.0, + "learning_rate": 1.3384683717453944e-06, + "logits/chosen": 434370457.6, + "logits/rejected": 369011712.0, + "logps/chosen": -308.752685546875, + "logps/rejected": -538.43408203125, + "loss": 0.0316, + "rewards/chosen": 3.085023880004883, + "rewards/margins": 11.343783696492514, + "rewards/rejected": -8.25875981648763, + "step": 8341 + }, + { + "epoch": 0.7621745089081773, + "grad_norm": 3.71875, + "kl": 0.0, + "learning_rate": 1.3374894168768786e-06, + "logits/chosen": 503106355.2, + "logits/rejected": 388201130.6666667, + "logps/chosen": -275.7560791015625, + "logps/rejected": -444.5357259114583, + "loss": 0.0303, + "rewards/chosen": 3.3043258666992186, + "rewards/margins": 10.804676818847657, + "rewards/rejected": -7.5003509521484375, + "step": 8342 + }, + { + "epoch": 0.7622658748286889, + "grad_norm": 18.125, + "kl": 0.0, + "learning_rate": 1.336510764864054e-06, + "logits/chosen": 1206545408.0, + "logits/rejected": 696255104.0, + "logps/chosen": -306.7337158203125, + "logps/rejected": -550.775146484375, + "loss": 0.0859, + "rewards/chosen": 2.613465118408203, + "rewards/margins": 11.615353393554688, + "rewards/rejected": -9.001888275146484, + "step": 8343 + }, + { + "epoch": 0.7623572407492005, + "grad_norm": 4.4375, + "kl": 0.0, + "learning_rate": 1.3355324157878462e-06, + "logits/chosen": 996306496.0, + "logits/rejected": 1010236032.0, + "logps/chosen": -264.158447265625, + "logps/rejected": -641.5731201171875, + "loss": 0.0353, + "rewards/chosen": 3.614602565765381, + "rewards/margins": 14.182467937469482, + "rewards/rejected": -10.567865371704102, + "step": 8344 + }, + { + "epoch": 0.7624486066697121, + "grad_norm": 4.21875, + "kl": 0.0, + "learning_rate": 1.3345543697291557e-06, + "logits/chosen": 650005760.0, + "logits/rejected": 429416288.0, + "logps/chosen": -264.19573974609375, + "logps/rejected": -547.427734375, + "loss": 0.0306, + "rewards/chosen": 2.9009270668029785, + "rewards/margins": 12.713572978973389, + "rewards/rejected": -9.81264591217041, + "step": 8345 + }, + { + "epoch": 0.7625399725902239, + "grad_norm": 1.21875, + "kl": 0.0, + "learning_rate": 1.3335766267688566e-06, + "logits/chosen": 314927936.0, + "logits/rejected": 433841728.0, + "logps/chosen": -292.1354675292969, + "logps/rejected": -597.6525268554688, + "loss": 0.007, + "rewards/chosen": 4.914933204650879, + "rewards/margins": 16.783674240112305, + "rewards/rejected": -11.868741035461426, + "step": 8346 + }, + { + "epoch": 0.7626313385107355, + "grad_norm": 2.765625, + "kl": 0.0, + "learning_rate": 1.3325991869878013e-06, + "logits/chosen": 459918677.3333333, + "logits/rejected": 408394598.4, + "logps/chosen": -398.2374267578125, + "logps/rejected": -440.906982421875, + "loss": 0.0133, + "rewards/chosen": 3.6368592580159507, + "rewards/margins": 14.441974004109701, + "rewards/rejected": -10.80511474609375, + "step": 8347 + }, + { + "epoch": 0.7627227044312471, + "grad_norm": 5.5625, + "kl": 0.0, + "learning_rate": 1.3316220504668143e-06, + "logits/chosen": 463342752.0, + "logits/rejected": 574245312.0, + "logps/chosen": -275.6566162109375, + "logps/rejected": -349.383056640625, + "loss": 0.025, + "rewards/chosen": 3.57171368598938, + "rewards/margins": 11.779065370559692, + "rewards/rejected": -8.207351684570312, + "step": 8348 + }, + { + "epoch": 0.7628140703517587, + "grad_norm": 3.875, + "kl": 0.0, + "learning_rate": 1.330645217286695e-06, + "logits/chosen": 775795072.0, + "logits/rejected": 675148416.0, + "logps/chosen": -222.74798583984375, + "logps/rejected": -576.446533203125, + "loss": 0.0286, + "rewards/chosen": 3.0303263664245605, + "rewards/margins": 11.624829769134521, + "rewards/rejected": -8.594503402709961, + "step": 8349 + }, + { + "epoch": 0.7629054362722705, + "grad_norm": 47.5, + "kl": 0.0, + "learning_rate": 1.3296686875282178e-06, + "logits/chosen": 700140032.0, + "logits/rejected": 857036224.0, + "logps/chosen": -339.4615478515625, + "logps/rejected": -406.4187316894531, + "loss": 0.0841, + "rewards/chosen": 2.7683732509613037, + "rewards/margins": 13.674834966659546, + "rewards/rejected": -10.906461715698242, + "step": 8350 + }, + { + "epoch": 0.7629968021927821, + "grad_norm": 2.09375, + "kl": 0.0, + "learning_rate": 1.3286924612721347e-06, + "logits/chosen": 573247146.6666666, + "logits/rejected": 461994854.4, + "logps/chosen": -525.770751953125, + "logps/rejected": -324.6685546875, + "loss": 0.0121, + "rewards/chosen": 3.639183680216471, + "rewards/margins": 13.666815058390299, + "rewards/rejected": -10.027631378173828, + "step": 8351 + }, + { + "epoch": 0.7630881681132937, + "grad_norm": 4.1875, + "kl": 0.0, + "learning_rate": 1.3277165385991702e-06, + "logits/chosen": 1159472896.0, + "logits/rejected": 331100832.0, + "logps/chosen": -446.54718017578125, + "logps/rejected": -232.65464782714844, + "loss": 0.025, + "rewards/chosen": 3.0713348388671875, + "rewards/margins": 10.54081678390503, + "rewards/rejected": -7.469481945037842, + "step": 8352 + }, + { + "epoch": 0.7631795340338053, + "grad_norm": 4.5625, + "kl": 0.0, + "learning_rate": 1.3267409195900233e-06, + "logits/chosen": 254184816.0, + "logits/rejected": 549502933.3333334, + "logps/chosen": -254.89654541015625, + "logps/rejected": -597.120849609375, + "loss": 0.0246, + "rewards/chosen": 2.3049659729003906, + "rewards/margins": 12.215591430664062, + "rewards/rejected": -9.910625457763672, + "step": 8353 + }, + { + "epoch": 0.7632708999543171, + "grad_norm": 2.53125, + "kl": 0.0, + "learning_rate": 1.3257656043253674e-06, + "logits/chosen": 556997888.0, + "logits/rejected": 353789632.0, + "logps/chosen": -334.27036539713544, + "logps/rejected": -575.798583984375, + "loss": 0.017, + "rewards/chosen": 4.223128000895183, + "rewards/margins": 15.766859690348308, + "rewards/rejected": -11.543731689453125, + "step": 8354 + }, + { + "epoch": 0.7633622658748287, + "grad_norm": 67.0, + "kl": 0.0, + "learning_rate": 1.3247905928858552e-06, + "logits/chosen": 1633324202.6666667, + "logits/rejected": 470804736.0, + "logps/chosen": -197.1946004231771, + "logps/rejected": -503.927734375, + "loss": 0.0784, + "rewards/chosen": 1.8901891708374023, + "rewards/margins": 13.61247615814209, + "rewards/rejected": -11.722286987304688, + "step": 8355 + }, + { + "epoch": 0.7634536317953403, + "grad_norm": 3.734375, + "kl": 0.0, + "learning_rate": 1.323815885352109e-06, + "logits/chosen": 462509482.6666667, + "logits/rejected": 1159238144.0, + "logps/chosen": -272.16363525390625, + "logps/rejected": -495.9173583984375, + "loss": 0.0188, + "rewards/chosen": 4.295261065165202, + "rewards/margins": 14.350510279337566, + "rewards/rejected": -10.055249214172363, + "step": 8356 + }, + { + "epoch": 0.7635449977158519, + "grad_norm": 1.9140625, + "kl": 0.0, + "learning_rate": 1.3228414818047286e-06, + "logits/chosen": 435497779.2, + "logits/rejected": 555786026.6666666, + "logps/chosen": -366.6229736328125, + "logps/rejected": -501.7576497395833, + "loss": 0.0136, + "rewards/chosen": 4.258723449707031, + "rewards/margins": 14.370406341552734, + "rewards/rejected": -10.111682891845703, + "step": 8357 + }, + { + "epoch": 0.7636363636363637, + "grad_norm": 2.375, + "kl": 0.0, + "learning_rate": 1.3218673823242879e-06, + "logits/chosen": 671011754.6666666, + "logits/rejected": 569260595.2, + "logps/chosen": -275.0899251302083, + "logps/rejected": -521.50849609375, + "loss": 0.011, + "rewards/chosen": 3.9324938456217446, + "rewards/margins": 11.271824900309245, + "rewards/rejected": -7.3393310546875, + "step": 8358 + }, + { + "epoch": 0.7637277295568753, + "grad_norm": 2.78125, + "kl": 0.0, + "learning_rate": 1.3208935869913347e-06, + "logits/chosen": 680036992.0, + "logits/rejected": 516682649.6, + "logps/chosen": -407.6488850911458, + "logps/rejected": -479.49443359375, + "loss": 0.0173, + "rewards/chosen": 3.132068951924642, + "rewards/margins": 11.238437207539876, + "rewards/rejected": -8.106368255615234, + "step": 8359 + }, + { + "epoch": 0.7638190954773869, + "grad_norm": 3.28125, + "kl": 0.0, + "learning_rate": 1.3199200958863956e-06, + "logits/chosen": 662337344.0, + "logits/rejected": 1128240640.0, + "logps/chosen": -248.87950134277344, + "logps/rejected": -529.9188842773438, + "loss": 0.0266, + "rewards/chosen": 3.1949548721313477, + "rewards/margins": 12.437190055847168, + "rewards/rejected": -9.24223518371582, + "step": 8360 + }, + { + "epoch": 0.7639104613978985, + "grad_norm": 3.46875, + "kl": 0.0, + "learning_rate": 1.3189469090899682e-06, + "logits/chosen": 642846634.6666666, + "logits/rejected": 611580544.0, + "logps/chosen": -363.7137451171875, + "logps/rejected": -430.119873046875, + "loss": 0.0257, + "rewards/chosen": 3.6319897969563804, + "rewards/margins": 11.205791791280111, + "rewards/rejected": -7.5738019943237305, + "step": 8361 + }, + { + "epoch": 0.7640018273184103, + "grad_norm": 1.2265625, + "kl": 0.0, + "learning_rate": 1.3179740266825253e-06, + "logits/chosen": 864877158.4, + "logits/rejected": 1084670293.3333333, + "logps/chosen": -281.0661376953125, + "logps/rejected": -547.58447265625, + "loss": 0.0094, + "rewards/chosen": 4.563136291503906, + "rewards/margins": 13.47501932779948, + "rewards/rejected": -8.911883036295572, + "step": 8362 + }, + { + "epoch": 0.7640931932389219, + "grad_norm": 1.65625, + "kl": 0.0, + "learning_rate": 1.317001448744516e-06, + "logits/chosen": 512257177.6, + "logits/rejected": 616905685.3333334, + "logps/chosen": -292.08740234375, + "logps/rejected": -431.5367024739583, + "loss": 0.012, + "rewards/chosen": 4.2186637878417965, + "rewards/margins": 14.700658416748047, + "rewards/rejected": -10.48199462890625, + "step": 8363 + }, + { + "epoch": 0.7641845591594335, + "grad_norm": 3.34375, + "kl": 0.0, + "learning_rate": 1.3160291753563625e-06, + "logits/chosen": 320761024.0, + "logits/rejected": 768388437.3333334, + "logps/chosen": -246.31695556640625, + "logps/rejected": -581.8663330078125, + "loss": 0.0109, + "rewards/chosen": 4.0512261390686035, + "rewards/margins": 15.479151248931885, + "rewards/rejected": -11.427925109863281, + "step": 8364 + }, + { + "epoch": 0.7642759250799451, + "grad_norm": 2.375, + "kl": 0.0, + "learning_rate": 1.315057206598464e-06, + "logits/chosen": 640839168.0, + "logits/rejected": 364487104.0, + "logps/chosen": -230.46307373046875, + "logps/rejected": -279.9101257324219, + "loss": 0.0157, + "rewards/chosen": 3.986541748046875, + "rewards/margins": 11.589916229248047, + "rewards/rejected": -7.603374481201172, + "step": 8365 + }, + { + "epoch": 0.7643672910004569, + "grad_norm": 1.5234375, + "kl": 0.0, + "learning_rate": 1.3140855425511928e-06, + "logits/chosen": 243431792.0, + "logits/rejected": 396923093.3333333, + "logps/chosen": -155.64132690429688, + "logps/rejected": -567.7905680338541, + "loss": 0.008, + "rewards/chosen": 4.665539741516113, + "rewards/margins": 14.027225812276205, + "rewards/rejected": -9.361686070760092, + "step": 8366 + }, + { + "epoch": 0.7644586569209685, + "grad_norm": 1.3203125, + "kl": 0.0, + "learning_rate": 1.3131141832948946e-06, + "logits/chosen": 305092074.6666667, + "logits/rejected": 344217292.8, + "logps/chosen": -184.5379638671875, + "logps/rejected": -499.57021484375, + "loss": 0.008, + "rewards/chosen": 4.322117169698079, + "rewards/margins": 14.46625779469808, + "rewards/rejected": -10.144140625, + "step": 8367 + }, + { + "epoch": 0.7645500228414801, + "grad_norm": 3.296875, + "kl": 0.0, + "learning_rate": 1.3121431289098958e-06, + "logits/chosen": 508421504.0, + "logits/rejected": 373874560.0, + "logps/chosen": -235.7528839111328, + "logps/rejected": -323.1004638671875, + "loss": 0.0164, + "rewards/chosen": 4.06815767288208, + "rewards/margins": 13.46873140335083, + "rewards/rejected": -9.40057373046875, + "step": 8368 + }, + { + "epoch": 0.7646413887619917, + "grad_norm": 2.40625, + "kl": 0.0, + "learning_rate": 1.3111723794764909e-06, + "logits/chosen": 668764672.0, + "logits/rejected": 430470314.6666667, + "logps/chosen": -409.1162109375, + "logps/rejected": -674.5680745442709, + "loss": 0.016, + "rewards/chosen": 4.0225788116455075, + "rewards/margins": 16.36543871561686, + "rewards/rejected": -12.342859903971354, + "step": 8369 + }, + { + "epoch": 0.7647327546825035, + "grad_norm": 1.578125, + "kl": 0.0, + "learning_rate": 1.3102019350749528e-06, + "logits/chosen": 548069824.0, + "logits/rejected": 511836800.0, + "logps/chosen": -332.3739318847656, + "logps/rejected": -546.629638671875, + "loss": 0.0097, + "rewards/chosen": 4.566577911376953, + "rewards/margins": 14.038698196411133, + "rewards/rejected": -9.47212028503418, + "step": 8370 + }, + { + "epoch": 0.7648241206030151, + "grad_norm": 2.59375, + "kl": 0.0, + "learning_rate": 1.309231795785526e-06, + "logits/chosen": 658946986.6666666, + "logits/rejected": 731483033.6, + "logps/chosen": -419.3387044270833, + "logps/rejected": -547.81884765625, + "loss": 0.0146, + "rewards/chosen": 3.7600110371907554, + "rewards/margins": 14.297004445393881, + "rewards/rejected": -10.536993408203125, + "step": 8371 + }, + { + "epoch": 0.7649154865235267, + "grad_norm": 2.40625, + "kl": 0.0, + "learning_rate": 1.3082619616884362e-06, + "logits/chosen": 427811302.4, + "logits/rejected": 397284778.6666667, + "logps/chosen": -246.27841796875, + "logps/rejected": -685.6192220052084, + "loss": 0.0179, + "rewards/chosen": 4.259693908691406, + "rewards/margins": 13.922086207071938, + "rewards/rejected": -9.662392298380533, + "step": 8372 + }, + { + "epoch": 0.7650068524440383, + "grad_norm": 2.40625, + "kl": 0.0, + "learning_rate": 1.3072924328638769e-06, + "logits/chosen": 434007040.0, + "logits/rejected": 385407296.0, + "logps/chosen": -366.3511962890625, + "logps/rejected": -463.0106201171875, + "loss": 0.0136, + "rewards/chosen": 4.127837181091309, + "rewards/margins": 14.235156059265137, + "rewards/rejected": -10.107318878173828, + "step": 8373 + }, + { + "epoch": 0.76509821836455, + "grad_norm": 2.03125, + "kl": 0.0, + "learning_rate": 1.3063232093920197e-06, + "logits/chosen": 780392533.3333334, + "logits/rejected": 622174464.0, + "logps/chosen": -388.5479329427083, + "logps/rejected": -370.5446044921875, + "loss": 0.0076, + "rewards/chosen": 4.609646479288737, + "rewards/margins": 12.795503107706708, + "rewards/rejected": -8.18585662841797, + "step": 8374 + }, + { + "epoch": 0.7651895842850617, + "grad_norm": 7.4375, + "kl": 0.0, + "learning_rate": 1.3053542913530104e-06, + "logits/chosen": 732335257.6, + "logits/rejected": 345968021.3333333, + "logps/chosen": -553.19619140625, + "logps/rejected": -325.3523356119792, + "loss": 0.0451, + "rewards/chosen": 2.9533012390136717, + "rewards/margins": 11.730066426595052, + "rewards/rejected": -8.77676518758138, + "step": 8375 + }, + { + "epoch": 0.7652809502055733, + "grad_norm": 1.8515625, + "kl": 0.0, + "learning_rate": 1.304385678826968e-06, + "logits/chosen": 1045179562.6666666, + "logits/rejected": 704285568.0, + "logps/chosen": -436.9300944010417, + "logps/rejected": -422.786865234375, + "loss": 0.0121, + "rewards/chosen": 4.557316780090332, + "rewards/margins": 12.281302452087402, + "rewards/rejected": -7.72398567199707, + "step": 8376 + }, + { + "epoch": 0.7653723161260849, + "grad_norm": 2.578125, + "kl": 0.0, + "learning_rate": 1.3034173718939913e-06, + "logits/chosen": 495870668.8, + "logits/rejected": 671029376.0, + "logps/chosen": -366.7972412109375, + "logps/rejected": -470.8934733072917, + "loss": 0.0158, + "rewards/chosen": 4.168972778320312, + "rewards/margins": 14.429076639811196, + "rewards/rejected": -10.260103861490885, + "step": 8377 + }, + { + "epoch": 0.7654636820465966, + "grad_norm": 1.3203125, + "kl": 0.0, + "learning_rate": 1.3024493706341485e-06, + "logits/chosen": 633015552.0, + "logits/rejected": 333825100.8, + "logps/chosen": -401.8513590494792, + "logps/rejected": -472.2263671875, + "loss": 0.0085, + "rewards/chosen": 3.9398924509684243, + "rewards/margins": 13.450092951456705, + "rewards/rejected": -9.510200500488281, + "step": 8378 + }, + { + "epoch": 0.7655550479671083, + "grad_norm": 23.75, + "kl": 0.0, + "learning_rate": 1.3014816751274834e-06, + "logits/chosen": 613242240.0, + "logits/rejected": 431260576.0, + "logps/chosen": -202.62710571289062, + "logps/rejected": -390.2420349121094, + "loss": 0.0342, + "rewards/chosen": 3.4303102493286133, + "rewards/margins": 11.87475299835205, + "rewards/rejected": -8.444442749023438, + "step": 8379 + }, + { + "epoch": 0.7656464138876199, + "grad_norm": 1.5546875, + "kl": 0.0, + "learning_rate": 1.3005142854540153e-06, + "logits/chosen": 1759176576.0, + "logits/rejected": 541129258.6666666, + "logps/chosen": -223.36502075195312, + "logps/rejected": -475.4803873697917, + "loss": 0.0114, + "rewards/chosen": 3.1461615562438965, + "rewards/margins": 13.35921303431193, + "rewards/rejected": -10.213051478068033, + "step": 8380 + }, + { + "epoch": 0.7657377798081315, + "grad_norm": 2.328125, + "kl": 0.0, + "learning_rate": 1.2995472016937405e-06, + "logits/chosen": 530936371.2, + "logits/rejected": 200943338.66666666, + "logps/chosen": -292.822314453125, + "logps/rejected": -406.7693684895833, + "loss": 0.028, + "rewards/chosen": 3.387337875366211, + "rewards/margins": 16.356527837117515, + "rewards/rejected": -12.969189961751303, + "step": 8381 + }, + { + "epoch": 0.7658291457286432, + "grad_norm": 2.703125, + "kl": 0.0, + "learning_rate": 1.298580423926627e-06, + "logits/chosen": 521882624.0, + "logits/rejected": 503562656.0, + "logps/chosen": -364.296875, + "logps/rejected": -528.9772338867188, + "loss": 0.0203, + "rewards/chosen": 3.2068347930908203, + "rewards/margins": 11.44741439819336, + "rewards/rejected": -8.240579605102539, + "step": 8382 + }, + { + "epoch": 0.7659205116491549, + "grad_norm": 3.84375, + "kl": 0.0, + "learning_rate": 1.2976139522326176e-06, + "logits/chosen": 822399680.0, + "logits/rejected": 797155648.0, + "logps/chosen": -404.94244384765625, + "logps/rejected": -465.40228271484375, + "loss": 0.0178, + "rewards/chosen": 3.7329659461975098, + "rewards/margins": 12.468056201934814, + "rewards/rejected": -8.735090255737305, + "step": 8383 + }, + { + "epoch": 0.7660118775696665, + "grad_norm": 2.28125, + "kl": 0.0, + "learning_rate": 1.2966477866916294e-06, + "logits/chosen": 446361088.0, + "logits/rejected": 438505130.6666667, + "logps/chosen": -289.6574462890625, + "logps/rejected": -607.718994140625, + "loss": 0.0267, + "rewards/chosen": 3.6004570007324217, + "rewards/margins": 14.348395792643228, + "rewards/rejected": -10.747938791910807, + "step": 8384 + }, + { + "epoch": 0.7661032434901781, + "grad_norm": 3.359375, + "kl": 0.0, + "learning_rate": 1.295681927383559e-06, + "logits/chosen": 626711552.0, + "logits/rejected": 694350080.0, + "logps/chosen": -422.361328125, + "logps/rejected": -684.027685546875, + "loss": 0.0099, + "rewards/chosen": 4.549180348714192, + "rewards/margins": 13.145194753011069, + "rewards/rejected": -8.596014404296875, + "step": 8385 + }, + { + "epoch": 0.7661946094106898, + "grad_norm": 1.46875, + "kl": 0.0, + "learning_rate": 1.2947163743882712e-06, + "logits/chosen": 993110869.3333334, + "logits/rejected": 807963904.0, + "logps/chosen": -362.0122477213542, + "logps/rejected": -709.299658203125, + "loss": 0.0067, + "rewards/chosen": 4.077956199645996, + "rewards/margins": 13.60804042816162, + "rewards/rejected": -9.530084228515625, + "step": 8386 + }, + { + "epoch": 0.7662859753312015, + "grad_norm": 33.75, + "kl": 0.0, + "learning_rate": 1.2937511277856096e-06, + "logits/chosen": 603590357.3333334, + "logits/rejected": 574284902.4, + "logps/chosen": -375.6752115885417, + "logps/rejected": -598.859033203125, + "loss": 0.1044, + "rewards/chosen": 2.991264979044596, + "rewards/margins": 11.398911158243815, + "rewards/rejected": -8.407646179199219, + "step": 8387 + }, + { + "epoch": 0.7663773412517131, + "grad_norm": 2.78125, + "kl": 0.0, + "learning_rate": 1.2927861876553904e-06, + "logits/chosen": 580360768.0, + "logits/rejected": 646944448.0, + "logps/chosen": -244.51377868652344, + "logps/rejected": -750.8505249023438, + "loss": 0.0228, + "rewards/chosen": 3.330331802368164, + "rewards/margins": 12.59694766998291, + "rewards/rejected": -9.266615867614746, + "step": 8388 + }, + { + "epoch": 0.7664687071722247, + "grad_norm": 4.34375, + "kl": 0.0, + "learning_rate": 1.2918215540774054e-06, + "logits/chosen": 398847232.0, + "logits/rejected": 540764032.0, + "logps/chosen": -332.9111328125, + "logps/rejected": -490.4015197753906, + "loss": 0.0238, + "rewards/chosen": 3.444873571395874, + "rewards/margins": 13.480489492416382, + "rewards/rejected": -10.035615921020508, + "step": 8389 + }, + { + "epoch": 0.7665600730927364, + "grad_norm": 4.125, + "kl": 0.0, + "learning_rate": 1.2908572271314213e-06, + "logits/chosen": 629599360.0, + "logits/rejected": 556518016.0, + "logps/chosen": -248.0888875325521, + "logps/rejected": -720.9501342773438, + "loss": 0.0459, + "rewards/chosen": 3.225401242574056, + "rewards/margins": 13.715049107869467, + "rewards/rejected": -10.48964786529541, + "step": 8390 + }, + { + "epoch": 0.7666514390132481, + "grad_norm": 3.046875, + "kl": 0.0, + "learning_rate": 1.2898932068971776e-06, + "logits/chosen": 361079398.4, + "logits/rejected": 506565845.3333333, + "logps/chosen": -198.54197998046874, + "logps/rejected": -690.4108072916666, + "loss": 0.0218, + "rewards/chosen": 3.6395187377929688, + "rewards/margins": 11.796853383382162, + "rewards/rejected": -8.157334645589193, + "step": 8391 + }, + { + "epoch": 0.7667428049337597, + "grad_norm": 9.25, + "kl": 0.0, + "learning_rate": 1.28892949345439e-06, + "logits/chosen": 404509504.0, + "logits/rejected": 472746240.0, + "logps/chosen": -394.38128662109375, + "logps/rejected": -589.02001953125, + "loss": 0.0508, + "rewards/chosen": 3.586992025375366, + "rewards/margins": 10.63873839378357, + "rewards/rejected": -7.051746368408203, + "step": 8392 + }, + { + "epoch": 0.7668341708542713, + "grad_norm": 2.515625, + "kl": 0.0, + "learning_rate": 1.2879660868827508e-06, + "logits/chosen": 546471936.0, + "logits/rejected": 421574476.8, + "logps/chosen": -274.5218098958333, + "logps/rejected": -507.02314453125, + "loss": 0.0176, + "rewards/chosen": 3.266221046447754, + "rewards/margins": 12.15935459136963, + "rewards/rejected": -8.893133544921875, + "step": 8393 + }, + { + "epoch": 0.766925536774783, + "grad_norm": 3.671875, + "kl": 0.0, + "learning_rate": 1.2870029872619234e-06, + "logits/chosen": 274747392.0, + "logits/rejected": 519680085.3333333, + "logps/chosen": -224.515283203125, + "logps/rejected": -696.4928385416666, + "loss": 0.0249, + "rewards/chosen": 3.7805877685546876, + "rewards/margins": 14.045699055989584, + "rewards/rejected": -10.265111287434896, + "step": 8394 + }, + { + "epoch": 0.7670169026952947, + "grad_norm": 42.0, + "kl": 0.0, + "learning_rate": 1.2860401946715474e-06, + "logits/chosen": 707454421.3333334, + "logits/rejected": 770040985.6, + "logps/chosen": -472.2481282552083, + "logps/rejected": -470.94443359375, + "loss": 0.0858, + "rewards/chosen": 5.052492777506511, + "rewards/margins": 11.886965433756512, + "rewards/rejected": -6.83447265625, + "step": 8395 + }, + { + "epoch": 0.7671082686158063, + "grad_norm": 0.99609375, + "kl": 0.0, + "learning_rate": 1.2850777091912364e-06, + "logits/chosen": 965654912.0, + "logits/rejected": 605142272.0, + "logps/chosen": -270.57305908203125, + "logps/rejected": -362.30517578125, + "loss": 0.0073, + "rewards/chosen": 4.257403373718262, + "rewards/margins": 12.486496925354004, + "rewards/rejected": -8.229093551635742, + "step": 8396 + }, + { + "epoch": 0.7671996345363179, + "grad_norm": 1.5234375, + "kl": 0.0, + "learning_rate": 1.2841155309005776e-06, + "logits/chosen": 496036736.0, + "logits/rejected": 623378560.0, + "logps/chosen": -417.80389404296875, + "logps/rejected": -401.5283203125, + "loss": 0.0088, + "rewards/chosen": 4.3512282371521, + "rewards/margins": 11.410489559173584, + "rewards/rejected": -7.059261322021484, + "step": 8397 + }, + { + "epoch": 0.7672910004568296, + "grad_norm": 69.0, + "kl": 0.0, + "learning_rate": 1.2831536598791382e-06, + "logits/chosen": 608950080.0, + "logits/rejected": 737480448.0, + "logps/chosen": -277.8221130371094, + "logps/rejected": -580.717529296875, + "loss": 0.065, + "rewards/chosen": 3.448218584060669, + "rewards/margins": 13.891410112380981, + "rewards/rejected": -10.443191528320312, + "step": 8398 + }, + { + "epoch": 0.7673823663773413, + "grad_norm": 6.25, + "kl": 0.0, + "learning_rate": 1.2821920962064532e-06, + "logits/chosen": 596936405.3333334, + "logits/rejected": 487239008.0, + "logps/chosen": -305.17681884765625, + "logps/rejected": -308.49969482421875, + "loss": 0.1417, + "rewards/chosen": 2.9310906728108725, + "rewards/margins": 7.65458091100057, + "rewards/rejected": -4.723490238189697, + "step": 8399 + }, + { + "epoch": 0.7674737322978529, + "grad_norm": 2.109375, + "kl": 0.0, + "learning_rate": 1.281230839962036e-06, + "logits/chosen": 778931097.6, + "logits/rejected": 491416533.3333333, + "logps/chosen": -411.95478515625, + "logps/rejected": -436.4691162109375, + "loss": 0.0121, + "rewards/chosen": 4.161824035644531, + "rewards/margins": 13.44458465576172, + "rewards/rejected": -9.282760620117188, + "step": 8400 + }, + { + "epoch": 0.7675650982183646, + "grad_norm": 2.46875, + "kl": 0.0, + "learning_rate": 1.2802698912253713e-06, + "logits/chosen": 644911308.8, + "logits/rejected": 923559594.6666666, + "logps/chosen": -342.3541748046875, + "logps/rejected": -730.338134765625, + "loss": 0.0144, + "rewards/chosen": 4.360988616943359, + "rewards/margins": 14.20013682047526, + "rewards/rejected": -9.8391482035319, + "step": 8401 + }, + { + "epoch": 0.7676564641388762, + "grad_norm": 3.28125, + "kl": 0.0, + "learning_rate": 1.2793092500759247e-06, + "logits/chosen": 486150400.0, + "logits/rejected": 227448672.0, + "logps/chosen": -417.9547119140625, + "logps/rejected": -371.7103271484375, + "loss": 0.0186, + "rewards/chosen": 3.536489963531494, + "rewards/margins": 12.81137990951538, + "rewards/rejected": -9.274889945983887, + "step": 8402 + }, + { + "epoch": 0.7677478300593878, + "grad_norm": 2.484375, + "kl": 0.0, + "learning_rate": 1.2783489165931296e-06, + "logits/chosen": 519720277.3333333, + "logits/rejected": 385130368.0, + "logps/chosen": -201.36014811197916, + "logps/rejected": -350.4514465332031, + "loss": 0.019, + "rewards/chosen": 4.379222551981608, + "rewards/margins": 13.53516165415446, + "rewards/rejected": -9.155939102172852, + "step": 8403 + }, + { + "epoch": 0.7678391959798995, + "grad_norm": 31.25, + "kl": 0.0, + "learning_rate": 1.2773888908563969e-06, + "logits/chosen": 476245350.4, + "logits/rejected": 1432608085.3333333, + "logps/chosen": -240.441455078125, + "logps/rejected": -403.7032877604167, + "loss": 0.1049, + "rewards/chosen": 3.6931396484375, + "rewards/margins": 15.286428070068359, + "rewards/rejected": -11.59328842163086, + "step": 8404 + }, + { + "epoch": 0.7679305619004112, + "grad_norm": 2.859375, + "kl": 0.0, + "learning_rate": 1.2764291729451112e-06, + "logits/chosen": 971518549.3333334, + "logits/rejected": 629358182.4, + "logps/chosen": -564.0372721354166, + "logps/rejected": -487.80234375, + "loss": 0.0116, + "rewards/chosen": 3.683901786804199, + "rewards/margins": 13.769782829284669, + "rewards/rejected": -10.08588104248047, + "step": 8405 + }, + { + "epoch": 0.7680219278209228, + "grad_norm": 4.0, + "kl": 0.0, + "learning_rate": 1.2754697629386348e-06, + "logits/chosen": 598514102.8571428, + "logits/rejected": 435005888.0, + "logps/chosen": -312.56821986607144, + "logps/rejected": -398.3052062988281, + "loss": 0.0399, + "rewards/chosen": 3.089191436767578, + "rewards/margins": 11.998185157775879, + "rewards/rejected": -8.9089937210083, + "step": 8406 + }, + { + "epoch": 0.7681132937414344, + "grad_norm": 2.734375, + "kl": 0.0, + "learning_rate": 1.2745106609162999e-06, + "logits/chosen": 365884842.6666667, + "logits/rejected": 493597542.4, + "logps/chosen": -279.6254069010417, + "logps/rejected": -421.09853515625, + "loss": 0.0194, + "rewards/chosen": 3.257814407348633, + "rewards/margins": 14.037540054321289, + "rewards/rejected": -10.779725646972656, + "step": 8407 + }, + { + "epoch": 0.7682046596619461, + "grad_norm": 4.53125, + "kl": 0.0, + "learning_rate": 1.2735518669574164e-06, + "logits/chosen": 1014890325.3333334, + "logits/rejected": 873332224.0, + "logps/chosen": -353.9338785807292, + "logps/rejected": -647.2196044921875, + "loss": 0.0192, + "rewards/chosen": 3.847412427266439, + "rewards/margins": 13.497370084126791, + "rewards/rejected": -9.649957656860352, + "step": 8408 + }, + { + "epoch": 0.7682960255824578, + "grad_norm": 2.421875, + "kl": 0.0, + "learning_rate": 1.2725933811412672e-06, + "logits/chosen": 727665834.6666666, + "logits/rejected": 972329164.8, + "logps/chosen": -223.37699381510416, + "logps/rejected": -592.838427734375, + "loss": 0.0224, + "rewards/chosen": 3.664852778116862, + "rewards/margins": 12.607502619425455, + "rewards/rejected": -8.942649841308594, + "step": 8409 + }, + { + "epoch": 0.7683873915029694, + "grad_norm": 1.46875, + "kl": 0.0, + "learning_rate": 1.2716352035471085e-06, + "logits/chosen": 623849881.6, + "logits/rejected": 258266048.0, + "logps/chosen": -275.20517578125, + "logps/rejected": -351.6482340494792, + "loss": 0.0107, + "rewards/chosen": 4.263478088378906, + "rewards/margins": 12.476859537760415, + "rewards/rejected": -8.21338144938151, + "step": 8410 + }, + { + "epoch": 0.768478757423481, + "grad_norm": 3.359375, + "kl": 0.0, + "learning_rate": 1.270677334254176e-06, + "logits/chosen": 599163136.0, + "logits/rejected": 192346032.0, + "logps/chosen": -418.4489339192708, + "logps/rejected": -253.48046875, + "loss": 0.0267, + "rewards/chosen": 3.568589210510254, + "rewards/margins": 12.219077110290527, + "rewards/rejected": -8.650487899780273, + "step": 8411 + }, + { + "epoch": 0.7685701233439927, + "grad_norm": 3.484375, + "kl": 0.0, + "learning_rate": 1.2697197733416754e-06, + "logits/chosen": 346703904.0, + "logits/rejected": 610387712.0, + "logps/chosen": -91.95236206054688, + "logps/rejected": -630.29833984375, + "loss": 0.0344, + "rewards/chosen": 2.826996326446533, + "rewards/margins": 11.033546924591064, + "rewards/rejected": -8.206550598144531, + "step": 8412 + }, + { + "epoch": 0.7686614892645044, + "grad_norm": 3.34375, + "kl": 0.0, + "learning_rate": 1.2687625208887872e-06, + "logits/chosen": 684117196.8, + "logits/rejected": 376950272.0, + "logps/chosen": -419.5515625, + "logps/rejected": -323.33734130859375, + "loss": 0.0247, + "rewards/chosen": 3.324748992919922, + "rewards/margins": 11.070615768432617, + "rewards/rejected": -7.745866775512695, + "step": 8413 + }, + { + "epoch": 0.768752855185016, + "grad_norm": 3.171875, + "kl": 0.0, + "learning_rate": 1.2678055769746667e-06, + "logits/chosen": 995604480.0, + "logits/rejected": 800131584.0, + "logps/chosen": -277.9079284667969, + "logps/rejected": -494.382568359375, + "loss": 0.0121, + "rewards/chosen": 4.269265174865723, + "rewards/margins": 13.547261873881022, + "rewards/rejected": -9.277996699015299, + "step": 8414 + }, + { + "epoch": 0.7688442211055276, + "grad_norm": 23.375, + "kl": 0.0, + "learning_rate": 1.2668489416784464e-06, + "logits/chosen": 457498419.2, + "logits/rejected": 325550634.6666667, + "logps/chosen": -372.805126953125, + "logps/rejected": -359.412109375, + "loss": 0.0618, + "rewards/chosen": 3.7833221435546873, + "rewards/margins": 12.987381744384766, + "rewards/rejected": -9.204059600830078, + "step": 8415 + }, + { + "epoch": 0.7689355870260393, + "grad_norm": 2.390625, + "kl": 0.0, + "learning_rate": 1.2658926150792321e-06, + "logits/chosen": 597079616.0, + "logits/rejected": 442037632.0, + "logps/chosen": -409.14215087890625, + "logps/rejected": -661.7821044921875, + "loss": 0.0138, + "rewards/chosen": 3.718341112136841, + "rewards/margins": 14.133713960647583, + "rewards/rejected": -10.415372848510742, + "step": 8416 + }, + { + "epoch": 0.769026952946551, + "grad_norm": 3.390625, + "kl": 0.0, + "learning_rate": 1.2649365972560996e-06, + "logits/chosen": 776735436.8, + "logits/rejected": 143777408.0, + "logps/chosen": -482.12978515625, + "logps/rejected": -606.0109049479166, + "loss": 0.0248, + "rewards/chosen": 3.496295166015625, + "rewards/margins": 12.279860687255859, + "rewards/rejected": -8.783565521240234, + "step": 8417 + }, + { + "epoch": 0.7691183188670626, + "grad_norm": 2.90625, + "kl": 0.0, + "learning_rate": 1.2639808882881032e-06, + "logits/chosen": 1102294323.2, + "logits/rejected": 566219861.3333334, + "logps/chosen": -386.96474609375, + "logps/rejected": -622.0716959635416, + "loss": 0.0169, + "rewards/chosen": 4.008527374267578, + "rewards/margins": 15.865426635742187, + "rewards/rejected": -11.85689926147461, + "step": 8418 + }, + { + "epoch": 0.7692096847875742, + "grad_norm": 6.875, + "kl": 0.0, + "learning_rate": 1.2630254882542737e-06, + "logits/chosen": 445715968.0, + "logits/rejected": 440086930.28571427, + "logps/chosen": -322.99420166015625, + "logps/rejected": -542.5908551897321, + "loss": 0.0688, + "rewards/chosen": 5.091821193695068, + "rewards/margins": 14.183804443904332, + "rewards/rejected": -9.091983250209264, + "step": 8419 + }, + { + "epoch": 0.7693010507080859, + "grad_norm": 3.375, + "kl": 4.326681137084961, + "learning_rate": 1.2620703972336124e-06, + "logits/chosen": 516219428.5714286, + "logits/rejected": 342622176.0, + "logps/chosen": -212.96200125558036, + "logps/rejected": -560.8289794921875, + "loss": 0.0273, + "rewards/chosen": 4.601411546979632, + "rewards/margins": 15.27192565373012, + "rewards/rejected": -10.670514106750488, + "step": 8420 + }, + { + "epoch": 0.7693924166285976, + "grad_norm": 6.1875, + "kl": 0.0, + "learning_rate": 1.2611156153050963e-06, + "logits/chosen": 1045849526.8571428, + "logits/rejected": 539464960.0, + "logps/chosen": -274.23660714285717, + "logps/rejected": -372.491943359375, + "loss": 0.0626, + "rewards/chosen": 3.0287159511021207, + "rewards/margins": 12.469094140189036, + "rewards/rejected": -9.440378189086914, + "step": 8421 + }, + { + "epoch": 0.7694837825491092, + "grad_norm": 1.6953125, + "kl": 0.0, + "learning_rate": 1.2601611425476761e-06, + "logits/chosen": 878770816.0, + "logits/rejected": 1175262720.0, + "logps/chosen": -298.33953857421875, + "logps/rejected": -794.68212890625, + "loss": 0.012, + "rewards/chosen": 4.028416633605957, + "rewards/margins": 12.973896980285645, + "rewards/rejected": -8.945480346679688, + "step": 8422 + }, + { + "epoch": 0.7695751484696208, + "grad_norm": 20.375, + "kl": 0.0, + "learning_rate": 1.2592069790402806e-06, + "logits/chosen": 769263769.6, + "logits/rejected": 402510464.0, + "logps/chosen": -364.6987548828125, + "logps/rejected": -434.5775960286458, + "loss": 0.0225, + "rewards/chosen": 4.302836990356445, + "rewards/margins": 14.417358016967773, + "rewards/rejected": -10.114521026611328, + "step": 8423 + }, + { + "epoch": 0.7696665143901325, + "grad_norm": 2.109375, + "kl": 0.0, + "learning_rate": 1.2582531248618085e-06, + "logits/chosen": 422809514.6666667, + "logits/rejected": 468427968.0, + "logps/chosen": -258.8794759114583, + "logps/rejected": -480.09521484375, + "loss": 0.0156, + "rewards/chosen": 4.285857200622559, + "rewards/margins": 14.029731750488281, + "rewards/rejected": -9.743874549865723, + "step": 8424 + }, + { + "epoch": 0.7697578803106442, + "grad_norm": 1.6953125, + "kl": 0.0, + "learning_rate": 1.2572995800911348e-06, + "logits/chosen": 601585194.6666666, + "logits/rejected": 279258432.0, + "logps/chosen": -343.8867594401042, + "logps/rejected": -264.7084045410156, + "loss": 0.0138, + "rewards/chosen": 4.263973871866862, + "rewards/margins": 12.797306696573894, + "rewards/rejected": -8.533332824707031, + "step": 8425 + }, + { + "epoch": 0.7698492462311558, + "grad_norm": 13.75, + "kl": 0.0, + "learning_rate": 1.2563463448071078e-06, + "logits/chosen": 696081664.0, + "logits/rejected": 952775296.0, + "logps/chosen": -485.71771240234375, + "logps/rejected": -292.8907165527344, + "loss": 0.0279, + "rewards/chosen": 3.1299545764923096, + "rewards/margins": 10.80189299583435, + "rewards/rejected": -7.671938419342041, + "step": 8426 + }, + { + "epoch": 0.7699406121516674, + "grad_norm": 1.515625, + "kl": 0.0, + "learning_rate": 1.2553934190885535e-06, + "logits/chosen": 727635865.6, + "logits/rejected": 483456512.0, + "logps/chosen": -308.99765625, + "logps/rejected": -559.7353108723959, + "loss": 0.0089, + "rewards/chosen": 5.188739013671875, + "rewards/margins": 15.132730356852214, + "rewards/rejected": -9.943991343180338, + "step": 8427 + }, + { + "epoch": 0.770031978072179, + "grad_norm": 3.078125, + "kl": 0.0, + "learning_rate": 1.2544408030142696e-06, + "logits/chosen": 594936729.6, + "logits/rejected": 350294656.0, + "logps/chosen": -300.5123779296875, + "logps/rejected": -396.5869954427083, + "loss": 0.0264, + "rewards/chosen": 3.49881591796875, + "rewards/margins": 13.951783879597983, + "rewards/rejected": -10.452967961629232, + "step": 8428 + }, + { + "epoch": 0.7701233439926908, + "grad_norm": 13.1875, + "kl": 0.0, + "learning_rate": 1.2534884966630272e-06, + "logits/chosen": 678836672.0, + "logits/rejected": 579797674.6666666, + "logps/chosen": -260.089599609375, + "logps/rejected": -527.7332763671875, + "loss": 0.0244, + "rewards/chosen": 3.3604049682617188, + "rewards/margins": 11.913201649983725, + "rewards/rejected": -8.552796681722006, + "step": 8429 + }, + { + "epoch": 0.7702147099132024, + "grad_norm": 1.4765625, + "kl": 0.0, + "learning_rate": 1.2525365001135741e-06, + "logits/chosen": 411965977.6, + "logits/rejected": 373271893.3333333, + "logps/chosen": -315.75283203125, + "logps/rejected": -701.96533203125, + "loss": 0.0093, + "rewards/chosen": 4.617942810058594, + "rewards/margins": 16.40129597981771, + "rewards/rejected": -11.783353169759115, + "step": 8430 + }, + { + "epoch": 0.770306075833714, + "grad_norm": 2.453125, + "kl": 0.0, + "learning_rate": 1.2515848134446301e-06, + "logits/chosen": 542550698.6666666, + "logits/rejected": 704486144.0, + "logps/chosen": -377.4728597005208, + "logps/rejected": -692.9840087890625, + "loss": 0.0165, + "rewards/chosen": 4.286953608194987, + "rewards/margins": 12.219744841257732, + "rewards/rejected": -7.932791233062744, + "step": 8431 + }, + { + "epoch": 0.7703974417542256, + "grad_norm": 6.53125, + "kl": 5.255517959594727, + "learning_rate": 1.2506334367348933e-06, + "logits/chosen": 498450048.0, + "logps/chosen": -290.68060302734375, + "loss": 0.0604, + "rewards/chosen": 3.482433795928955, + "step": 8432 + }, + { + "epoch": 0.7704888076747374, + "grad_norm": 2.078125, + "kl": 0.0, + "learning_rate": 1.2496823700630329e-06, + "logits/chosen": 647554048.0, + "logits/rejected": 409160064.0, + "logps/chosen": -262.0334065755208, + "logps/rejected": -423.49912109375, + "loss": 0.0161, + "rewards/chosen": 3.438309669494629, + "rewards/margins": 12.234826850891114, + "rewards/rejected": -8.796517181396485, + "step": 8433 + }, + { + "epoch": 0.770580173595249, + "grad_norm": 2.28125, + "kl": 0.0, + "learning_rate": 1.2487316135076931e-06, + "logits/chosen": 714440384.0, + "logits/rejected": 399724928.0, + "logps/chosen": -342.95343017578125, + "logps/rejected": -463.3875732421875, + "loss": 0.0121, + "rewards/chosen": 3.98532772064209, + "rewards/margins": 12.98180866241455, + "rewards/rejected": -8.996480941772461, + "step": 8434 + }, + { + "epoch": 0.7706715395157606, + "grad_norm": 2.90625, + "kl": 0.0, + "learning_rate": 1.247781167147491e-06, + "logits/chosen": 469609557.3333333, + "logits/rejected": 720313664.0, + "logps/chosen": -388.8188883463542, + "logps/rejected": -843.1451416015625, + "loss": 0.0181, + "rewards/chosen": 4.125856081644694, + "rewards/margins": 13.794734636942547, + "rewards/rejected": -9.668878555297852, + "step": 8435 + }, + { + "epoch": 0.7707629054362722, + "grad_norm": 1.8671875, + "kl": 0.0, + "learning_rate": 1.2468310310610227e-06, + "logits/chosen": 600899968.0, + "logits/rejected": 721523392.0, + "logps/chosen": -283.91375732421875, + "logps/rejected": -484.1163635253906, + "loss": 0.0104, + "rewards/chosen": 4.73490047454834, + "rewards/margins": 13.540122985839844, + "rewards/rejected": -8.805222511291504, + "step": 8436 + }, + { + "epoch": 0.770854271356784, + "grad_norm": 4.125, + "kl": 0.0, + "learning_rate": 1.2458812053268543e-06, + "logits/chosen": 459628970.6666667, + "logits/rejected": 98881112.0, + "logps/chosen": -403.293701171875, + "logps/rejected": -635.1575927734375, + "loss": 0.0245, + "rewards/chosen": 4.0684458414713545, + "rewards/margins": 16.354268709818523, + "rewards/rejected": -12.285822868347168, + "step": 8437 + }, + { + "epoch": 0.7709456372772956, + "grad_norm": 3.65625, + "kl": 0.0, + "learning_rate": 1.2449316900235275e-06, + "logits/chosen": 1077052211.2, + "logits/rejected": 606423125.3333334, + "logps/chosen": -442.952734375, + "logps/rejected": -461.6158040364583, + "loss": 0.0256, + "rewards/chosen": 3.417239761352539, + "rewards/margins": 12.734607315063476, + "rewards/rejected": -9.317367553710938, + "step": 8438 + }, + { + "epoch": 0.7710370031978072, + "grad_norm": 0.57421875, + "kl": 0.0, + "learning_rate": 1.243982485229559e-06, + "logits/chosen": 345690272.0, + "logits/rejected": 473565888.0, + "logps/chosen": -176.39517211914062, + "logps/rejected": -531.5028076171875, + "loss": 0.004, + "rewards/chosen": 5.043790340423584, + "rewards/margins": 15.06526517868042, + "rewards/rejected": -10.021474838256836, + "step": 8439 + }, + { + "epoch": 0.7711283691183188, + "grad_norm": 4.25, + "kl": 0.0, + "learning_rate": 1.2430335910234365e-06, + "logits/chosen": 784796364.8, + "logits/rejected": 652135722.6666666, + "logps/chosen": -372.981591796875, + "logps/rejected": -549.6538899739584, + "loss": 0.0262, + "rewards/chosen": 3.4872249603271483, + "rewards/margins": 13.28951555887858, + "rewards/rejected": -9.802290598551432, + "step": 8440 + }, + { + "epoch": 0.7712197350388306, + "grad_norm": 2.90625, + "kl": 0.0, + "learning_rate": 1.2420850074836288e-06, + "logits/chosen": 576264089.6, + "logits/rejected": 608339669.3333334, + "logps/chosen": -383.6909912109375, + "logps/rejected": -377.1603190104167, + "loss": 0.0205, + "rewards/chosen": 3.9498550415039064, + "rewards/margins": 12.16568489074707, + "rewards/rejected": -8.215829849243164, + "step": 8441 + }, + { + "epoch": 0.7713111009593422, + "grad_norm": 2.5, + "kl": 0.0, + "learning_rate": 1.2411367346885727e-06, + "logits/chosen": 625897344.0, + "logits/rejected": 826376192.0, + "logps/chosen": -442.0965576171875, + "logps/rejected": -488.3085123697917, + "loss": 0.0092, + "rewards/chosen": 3.471217393875122, + "rewards/margins": 13.34371050198873, + "rewards/rejected": -9.872493108113607, + "step": 8442 + }, + { + "epoch": 0.7714024668798538, + "grad_norm": 16.5, + "kl": 35.90379333496094, + "learning_rate": 1.2401887727166828e-06, + "logits/chosen": 999031744.0, + "logps/chosen": -869.6777954101562, + "loss": 0.294, + "rewards/chosen": 6.419641494750977, + "step": 8443 + }, + { + "epoch": 0.7714938328003654, + "grad_norm": 7.25, + "kl": 8.865949630737305, + "learning_rate": 1.2392411216463451e-06, + "logits/chosen": 426613942.85714287, + "logits/rejected": 884040960.0, + "logps/chosen": -346.95333426339283, + "logps/rejected": -501.2235107421875, + "loss": 0.0639, + "rewards/chosen": 3.7949676513671875, + "rewards/margins": 13.3399076461792, + "rewards/rejected": -9.544939994812012, + "step": 8444 + }, + { + "epoch": 0.7715851987208772, + "grad_norm": 2.578125, + "kl": 0.0, + "learning_rate": 1.2382937815559232e-06, + "logits/chosen": 596681728.0, + "logits/rejected": 662251904.0, + "logps/chosen": -317.0569152832031, + "logps/rejected": -610.2861938476562, + "loss": 0.0108, + "rewards/chosen": 4.140364646911621, + "rewards/margins": 14.77611255645752, + "rewards/rejected": -10.635747909545898, + "step": 8445 + }, + { + "epoch": 0.7716765646413888, + "grad_norm": 2.0, + "kl": 0.0, + "learning_rate": 1.237346752523752e-06, + "logits/chosen": 574175616.0, + "logits/rejected": 393463680.0, + "logps/chosen": -291.3389587402344, + "logps/rejected": -399.0336507161458, + "loss": 0.0087, + "rewards/chosen": 3.644808292388916, + "rewards/margins": 12.119066715240479, + "rewards/rejected": -8.474258422851562, + "step": 8446 + }, + { + "epoch": 0.7717679305619004, + "grad_norm": 2.796875, + "kl": 0.0, + "learning_rate": 1.2364000346281413e-06, + "logits/chosen": 595455317.3333334, + "logits/rejected": 565921382.4, + "logps/chosen": -331.8204752604167, + "logps/rejected": -349.638916015625, + "loss": 0.0123, + "rewards/chosen": 3.8208182652791343, + "rewards/margins": 12.856656583150228, + "rewards/rejected": -9.035838317871093, + "step": 8447 + }, + { + "epoch": 0.771859296482412, + "grad_norm": 4.03125, + "kl": 0.0, + "learning_rate": 1.235453627947379e-06, + "logits/chosen": 552852224.0, + "logits/rejected": 1085920256.0, + "logps/chosen": -245.5531005859375, + "logps/rejected": -382.74481201171875, + "loss": 0.0238, + "rewards/chosen": 3.3618409633636475, + "rewards/margins": 12.775678396224976, + "rewards/rejected": -9.413837432861328, + "step": 8448 + }, + { + "epoch": 0.7719506624029238, + "grad_norm": 2.609375, + "kl": 0.0, + "learning_rate": 1.2345075325597217e-06, + "logits/chosen": 395113376.0, + "logits/rejected": 534387648.0, + "logps/chosen": -354.6217041015625, + "logps/rejected": -360.57708740234375, + "loss": 0.0209, + "rewards/chosen": 3.941472053527832, + "rewards/margins": 12.112885475158691, + "rewards/rejected": -8.17141342163086, + "step": 8449 + }, + { + "epoch": 0.7720420283234354, + "grad_norm": 3.46875, + "kl": 0.0, + "learning_rate": 1.2335617485434043e-06, + "logits/chosen": 1302483968.0, + "logits/rejected": 663420288.0, + "logps/chosen": -349.2669677734375, + "logps/rejected": -457.0113932291667, + "loss": 0.0126, + "rewards/chosen": 3.160116672515869, + "rewards/margins": 12.34490696589152, + "rewards/rejected": -9.18479029337565, + "step": 8450 + }, + { + "epoch": 0.772133394243947, + "grad_norm": 2.40625, + "kl": 0.0, + "learning_rate": 1.2326162759766325e-06, + "logits/chosen": 661888384.0, + "logits/rejected": 478832704.0, + "logps/chosen": -463.8697509765625, + "logps/rejected": -600.8272094726562, + "loss": 0.012, + "rewards/chosen": 3.9652183055877686, + "rewards/margins": 13.698528051376343, + "rewards/rejected": -9.733309745788574, + "step": 8451 + }, + { + "epoch": 0.7722247601644586, + "grad_norm": 2.5625, + "kl": 0.0, + "learning_rate": 1.2316711149375882e-06, + "logits/chosen": 549366528.0, + "logits/rejected": 502646080.0, + "logps/chosen": -292.3935953776042, + "logps/rejected": -596.153564453125, + "loss": 0.0195, + "rewards/chosen": 4.199064254760742, + "rewards/margins": 15.097576141357422, + "rewards/rejected": -10.89851188659668, + "step": 8452 + }, + { + "epoch": 0.7723161260849704, + "grad_norm": 9.5625, + "kl": 0.0, + "learning_rate": 1.23072626550443e-06, + "logits/chosen": 718473728.0, + "logits/rejected": 854847232.0, + "logps/chosen": -479.1451009114583, + "logps/rejected": -503.763330078125, + "loss": 0.0508, + "rewards/chosen": 3.3115666707356772, + "rewards/margins": 10.704463704427084, + "rewards/rejected": -7.392897033691407, + "step": 8453 + }, + { + "epoch": 0.772407492005482, + "grad_norm": 1.828125, + "kl": 0.0, + "learning_rate": 1.229781727755286e-06, + "logits/chosen": 632165845.3333334, + "logits/rejected": 326365120.0, + "logps/chosen": -329.87111409505206, + "logps/rejected": -175.0840606689453, + "loss": 0.0133, + "rewards/chosen": 4.181925455729167, + "rewards/margins": 10.011924425760906, + "rewards/rejected": -5.829998970031738, + "step": 8454 + }, + { + "epoch": 0.7724988579259936, + "grad_norm": 2.171875, + "kl": 0.0, + "learning_rate": 1.2288375017682614e-06, + "logits/chosen": 478835264.0, + "logits/rejected": 453933056.0, + "logps/chosen": -345.14947509765625, + "logps/rejected": -561.3700358072916, + "loss": 0.0086, + "rewards/chosen": 4.135287284851074, + "rewards/margins": 13.354295412699381, + "rewards/rejected": -9.219008127848307, + "step": 8455 + }, + { + "epoch": 0.7725902238465052, + "grad_norm": 0.94140625, + "kl": 0.0, + "learning_rate": 1.2278935876214338e-06, + "logits/chosen": 741458752.0, + "logits/rejected": 601872310.8571428, + "logps/chosen": -33.40069580078125, + "logps/rejected": -440.1556919642857, + "loss": 0.0067, + "rewards/chosen": 2.9256343841552734, + "rewards/margins": 12.049497059413365, + "rewards/rejected": -9.123862675258092, + "step": 8456 + }, + { + "epoch": 0.772681589767017, + "grad_norm": 2.8125, + "kl": 0.0, + "learning_rate": 1.226949985392858e-06, + "logits/chosen": 593467904.0, + "logits/rejected": 533917098.6666667, + "logps/chosen": -350.2921630859375, + "logps/rejected": -366.1883544921875, + "loss": 0.0225, + "rewards/chosen": 3.5280242919921876, + "rewards/margins": 12.100504811604818, + "rewards/rejected": -8.57248051961263, + "step": 8457 + }, + { + "epoch": 0.7727729556875286, + "grad_norm": 4.8125, + "kl": 0.0, + "learning_rate": 1.2260066951605608e-06, + "logits/chosen": 606990592.0, + "logits/rejected": 393859072.0, + "logps/chosen": -523.5650024414062, + "logps/rejected": -401.76043701171875, + "loss": 0.0234, + "rewards/chosen": 3.4393677711486816, + "rewards/margins": 12.19431734085083, + "rewards/rejected": -8.754949569702148, + "step": 8458 + }, + { + "epoch": 0.7728643216080402, + "grad_norm": 55.75, + "kl": 0.0, + "learning_rate": 1.2250637170025425e-06, + "logits/chosen": 386700714.6666667, + "logits/rejected": 452168396.8, + "logps/chosen": -318.53466796875, + "logps/rejected": -389.1337890625, + "loss": 0.0843, + "rewards/chosen": 2.9629974365234375, + "rewards/margins": 10.329361724853516, + "rewards/rejected": -7.366364288330078, + "step": 8459 + }, + { + "epoch": 0.7729556875285518, + "grad_norm": 1.1953125, + "kl": 0.0, + "learning_rate": 1.2241210509967799e-06, + "logits/chosen": 523551296.0, + "logits/rejected": 292366165.3333333, + "logps/chosen": -260.625244140625, + "logps/rejected": -341.4883219401042, + "loss": 0.0066, + "rewards/chosen": 3.8904404640197754, + "rewards/margins": 12.34572458267212, + "rewards/rejected": -8.455284118652344, + "step": 8460 + }, + { + "epoch": 0.7730470534490635, + "grad_norm": 2.828125, + "kl": 0.0, + "learning_rate": 1.2231786972212206e-06, + "logits/chosen": 566790875.4285715, + "logits/rejected": 576541696.0, + "logps/chosen": -354.94517299107144, + "logps/rejected": -438.59735107421875, + "loss": 0.0195, + "rewards/chosen": 4.235213143484933, + "rewards/margins": 14.783880097525461, + "rewards/rejected": -10.548666954040527, + "step": 8461 + }, + { + "epoch": 0.7731384193695752, + "grad_norm": 3.53125, + "kl": 0.0, + "learning_rate": 1.2222366557537911e-06, + "logits/chosen": 684760384.0, + "logits/rejected": 713790784.0, + "logps/chosen": -479.5823974609375, + "logps/rejected": -503.40533447265625, + "loss": 0.0156, + "rewards/chosen": 4.011265277862549, + "rewards/margins": 12.549669742584229, + "rewards/rejected": -8.53840446472168, + "step": 8462 + }, + { + "epoch": 0.7732297852900868, + "grad_norm": 0.68359375, + "kl": 0.0, + "learning_rate": 1.2212949266723884e-06, + "logits/chosen": 485917952.0, + "logits/rejected": 452647722.6666667, + "logps/chosen": -442.3819885253906, + "logps/rejected": -521.0025227864584, + "loss": 0.003, + "rewards/chosen": 4.480560302734375, + "rewards/margins": 13.623491287231445, + "rewards/rejected": -9.14293098449707, + "step": 8463 + }, + { + "epoch": 0.7733211512105984, + "grad_norm": 3.0625, + "kl": 0.0, + "learning_rate": 1.2203535100548853e-06, + "logits/chosen": 696499626.6666666, + "logits/rejected": 462808928.0, + "logps/chosen": -480.5313313802083, + "logps/rejected": -442.5819091796875, + "loss": 0.0218, + "rewards/chosen": 3.6868158976236978, + "rewards/margins": 11.586395899454752, + "rewards/rejected": -7.899580001831055, + "step": 8464 + }, + { + "epoch": 0.7734125171311101, + "grad_norm": 0.9609375, + "kl": 0.0, + "learning_rate": 1.2194124059791258e-06, + "logits/chosen": 426367040.0, + "logits/rejected": 822532096.0, + "logps/chosen": -285.2917175292969, + "logps/rejected": -305.48370361328125, + "loss": 0.0064, + "rewards/chosen": 4.679766654968262, + "rewards/margins": 12.440878868103027, + "rewards/rejected": -7.761112213134766, + "step": 8465 + }, + { + "epoch": 0.7735038830516218, + "grad_norm": 3.453125, + "kl": 0.0, + "learning_rate": 1.2184716145229336e-06, + "logits/chosen": 586861568.0, + "logits/rejected": 679773504.0, + "logps/chosen": -359.3643493652344, + "logps/rejected": -507.8109436035156, + "loss": 0.0236, + "rewards/chosen": 3.799672842025757, + "rewards/margins": 12.426655530929565, + "rewards/rejected": -8.626982688903809, + "step": 8466 + }, + { + "epoch": 0.7735952489721334, + "grad_norm": 26.75, + "kl": 0.0, + "learning_rate": 1.2175311357641029e-06, + "logits/chosen": 270523072.0, + "logits/rejected": 685166762.6666666, + "logps/chosen": -195.81329345703125, + "logps/rejected": -535.255126953125, + "loss": 0.0303, + "rewards/chosen": 5.603424072265625, + "rewards/margins": 14.30927594502767, + "rewards/rejected": -8.705851872762045, + "step": 8467 + }, + { + "epoch": 0.773686614892645, + "grad_norm": 4.3125, + "kl": 0.0, + "learning_rate": 1.2165909697804012e-06, + "logits/chosen": 425370624.0, + "logits/rejected": 1214148736.0, + "logps/chosen": -253.83650716145834, + "logps/rejected": -818.957763671875, + "loss": 0.0542, + "rewards/chosen": 2.9128634134928384, + "rewards/margins": 15.829363505045572, + "rewards/rejected": -12.916500091552734, + "step": 8468 + }, + { + "epoch": 0.7737779808131567, + "grad_norm": 2.9375, + "kl": 0.0, + "learning_rate": 1.2156511166495715e-06, + "logits/chosen": 601287360.0, + "logits/rejected": 426677632.0, + "logps/chosen": -552.571044921875, + "logps/rejected": -643.0367431640625, + "loss": 0.0162, + "rewards/chosen": 3.7601349353790283, + "rewards/margins": 14.543662309646606, + "rewards/rejected": -10.783527374267578, + "step": 8469 + }, + { + "epoch": 0.7738693467336684, + "grad_norm": 2.859375, + "kl": 0.0, + "learning_rate": 1.2147115764493345e-06, + "logits/chosen": 601679462.4, + "logits/rejected": 313236160.0, + "logps/chosen": -446.88203125, + "logps/rejected": -341.19964599609375, + "loss": 0.0194, + "rewards/chosen": 3.775812530517578, + "rewards/margins": 12.114561462402344, + "rewards/rejected": -8.338748931884766, + "step": 8470 + }, + { + "epoch": 0.77396071265418, + "grad_norm": 4.21875, + "kl": 0.0, + "learning_rate": 1.2137723492573766e-06, + "logits/chosen": 754335296.0, + "logits/rejected": 513763680.0, + "logps/chosen": -385.51416015625, + "logps/rejected": -613.9114990234375, + "loss": 0.0257, + "rewards/chosen": 3.016726016998291, + "rewards/margins": 14.769116878509521, + "rewards/rejected": -11.75239086151123, + "step": 8471 + }, + { + "epoch": 0.7740520785746916, + "grad_norm": 2.140625, + "kl": 0.0, + "learning_rate": 1.212833435151366e-06, + "logits/chosen": 640092620.8, + "logits/rejected": 780051456.0, + "logps/chosen": -195.1490966796875, + "logps/rejected": -441.24560546875, + "loss": 0.1345, + "rewards/chosen": 2.8100337982177734, + "rewards/margins": 11.300423940022787, + "rewards/rejected": -8.490390141805014, + "step": 8472 + }, + { + "epoch": 0.7741434444952033, + "grad_norm": 2.25, + "kl": 0.0, + "learning_rate": 1.2118948342089392e-06, + "logits/chosen": 497080704.0, + "logits/rejected": 298831308.8, + "logps/chosen": -319.4466145833333, + "logps/rejected": -363.575830078125, + "loss": 0.0102, + "rewards/chosen": 4.170351028442383, + "rewards/margins": 12.938401412963866, + "rewards/rejected": -8.768050384521484, + "step": 8473 + }, + { + "epoch": 0.774234810415715, + "grad_norm": 2.96875, + "kl": 0.0, + "learning_rate": 1.210956546507714e-06, + "logits/chosen": 370146474.6666667, + "logits/rejected": 227529574.4, + "logps/chosen": -213.7808837890625, + "logps/rejected": -403.5421142578125, + "loss": 0.0163, + "rewards/chosen": 3.56679630279541, + "rewards/margins": 13.829448127746582, + "rewards/rejected": -10.262651824951172, + "step": 8474 + }, + { + "epoch": 0.7743261763362266, + "grad_norm": 3.453125, + "kl": 0.0, + "learning_rate": 1.210018572125276e-06, + "logits/chosen": 597226240.0, + "logits/rejected": 609659520.0, + "logps/chosen": -340.7566223144531, + "logps/rejected": -428.3320617675781, + "loss": 0.0181, + "rewards/chosen": 4.0075836181640625, + "rewards/margins": 12.0123872756958, + "rewards/rejected": -8.004803657531738, + "step": 8475 + }, + { + "epoch": 0.7744175422567382, + "grad_norm": 10.75, + "kl": 0.0, + "learning_rate": 1.209080911139187e-06, + "logits/chosen": 460577426.28571427, + "logits/rejected": 475896512.0, + "logps/chosen": -389.9697265625, + "logps/rejected": -165.3448944091797, + "loss": 0.0407, + "rewards/chosen": 3.6677916390555247, + "rewards/margins": 10.277095113481794, + "rewards/rejected": -6.6093034744262695, + "step": 8476 + }, + { + "epoch": 0.7745089081772499, + "grad_norm": 7.5, + "kl": 0.0, + "learning_rate": 1.2081435636269823e-06, + "logits/chosen": 645500518.4, + "logits/rejected": 437148928.0, + "logps/chosen": -320.315087890625, + "logps/rejected": -269.78774007161456, + "loss": 0.0474, + "rewards/chosen": 2.95650577545166, + "rewards/margins": 9.88411808013916, + "rewards/rejected": -6.9276123046875, + "step": 8477 + }, + { + "epoch": 0.7746002740977616, + "grad_norm": 5.8125, + "kl": 0.0, + "learning_rate": 1.2072065296661733e-06, + "logits/chosen": 604528486.4, + "logits/rejected": 588498005.3333334, + "logps/chosen": -356.515380859375, + "logps/rejected": -779.6072591145834, + "loss": 0.0378, + "rewards/chosen": 3.0912288665771483, + "rewards/margins": 14.290403111775717, + "rewards/rejected": -11.199174245198568, + "step": 8478 + }, + { + "epoch": 0.7746916400182732, + "grad_norm": 2.40625, + "kl": 0.0, + "learning_rate": 1.2062698093342435e-06, + "logits/chosen": 455113728.0, + "logits/rejected": 296227968.0, + "logps/chosen": -385.349951171875, + "logps/rejected": -471.9921061197917, + "loss": 0.0199, + "rewards/chosen": 3.578695297241211, + "rewards/margins": 13.95306396484375, + "rewards/rejected": -10.374368667602539, + "step": 8479 + }, + { + "epoch": 0.7747830059387848, + "grad_norm": 1.0, + "kl": 0.0, + "learning_rate": 1.2053334027086506e-06, + "logits/chosen": 1103771392.0, + "logits/rejected": 414860544.0, + "logps/chosen": -345.3287353515625, + "logps/rejected": -405.56744384765625, + "loss": 0.0069, + "rewards/chosen": 4.598624229431152, + "rewards/margins": 12.323210716247559, + "rewards/rejected": -7.724586486816406, + "step": 8480 + }, + { + "epoch": 0.7748743718592965, + "grad_norm": 2.21875, + "kl": 0.0, + "learning_rate": 1.2043973098668266e-06, + "logits/chosen": 1206615040.0, + "logits/rejected": 545640192.0, + "logps/chosen": -408.690283203125, + "logps/rejected": -415.764404296875, + "loss": 0.0141, + "rewards/chosen": 4.057796859741211, + "rewards/margins": 12.540817006429037, + "rewards/rejected": -8.483020146687826, + "step": 8481 + }, + { + "epoch": 0.7749657377798082, + "grad_norm": 25.875, + "kl": 0.0, + "learning_rate": 1.2034615308861775e-06, + "logits/chosen": 968601497.6, + "logits/rejected": 546727765.3333334, + "logps/chosen": -231.0013916015625, + "logps/rejected": -361.140869140625, + "loss": 0.0448, + "rewards/chosen": 3.2524688720703123, + "rewards/margins": 10.29509874979655, + "rewards/rejected": -7.042629877726237, + "step": 8482 + }, + { + "epoch": 0.7750571037003198, + "grad_norm": 2.78125, + "kl": 0.0, + "learning_rate": 1.2025260658440846e-06, + "logits/chosen": 604378009.6, + "logits/rejected": 515531349.3333333, + "logps/chosen": -431.55859375, + "logps/rejected": -576.9222819010416, + "loss": 0.019, + "rewards/chosen": 3.5232307434082033, + "rewards/margins": 16.216288757324218, + "rewards/rejected": -12.693058013916016, + "step": 8483 + }, + { + "epoch": 0.7751484696208314, + "grad_norm": 1.890625, + "kl": 0.0, + "learning_rate": 1.2015909148179021e-06, + "logits/chosen": 630706261.3333334, + "logits/rejected": 458613145.6, + "logps/chosen": -368.3426106770833, + "logps/rejected": -390.37099609375, + "loss": 0.0098, + "rewards/chosen": 3.872536023457845, + "rewards/margins": 13.479801495869955, + "rewards/rejected": -9.60726547241211, + "step": 8484 + }, + { + "epoch": 0.7752398355413431, + "grad_norm": 2.578125, + "kl": 0.0, + "learning_rate": 1.200656077884958e-06, + "logits/chosen": 689512345.6, + "logits/rejected": 1310451200.0, + "logps/chosen": -409.551025390625, + "logps/rejected": -427.1025390625, + "loss": 0.0135, + "rewards/chosen": 4.125140380859375, + "rewards/margins": 11.676202646891277, + "rewards/rejected": -7.551062266031901, + "step": 8485 + }, + { + "epoch": 0.7753312014618547, + "grad_norm": 2.734375, + "kl": 0.0, + "learning_rate": 1.1997215551225533e-06, + "logits/chosen": 963728998.4, + "logits/rejected": 433152853.3333333, + "logps/chosen": -321.20810546875, + "logps/rejected": -339.4974772135417, + "loss": 0.0226, + "rewards/chosen": 3.955096435546875, + "rewards/margins": 11.87315165201823, + "rewards/rejected": -7.9180552164713545, + "step": 8486 + }, + { + "epoch": 0.7754225673823664, + "grad_norm": 0.90625, + "kl": 0.0, + "learning_rate": 1.1987873466079663e-06, + "logits/chosen": 630384320.0, + "logits/rejected": 311604406.85714287, + "logps/chosen": -131.6226806640625, + "logps/rejected": -455.14317103794644, + "loss": 0.0038, + "rewards/chosen": 3.5942704677581787, + "rewards/margins": 12.719496079853602, + "rewards/rejected": -9.125225612095424, + "step": 8487 + }, + { + "epoch": 0.775513933302878, + "grad_norm": 2.40625, + "kl": 0.0, + "learning_rate": 1.1978534524184475e-06, + "logits/chosen": 383279052.8, + "logits/rejected": 436210218.6666667, + "logps/chosen": -283.2425537109375, + "logps/rejected": -583.421630859375, + "loss": 0.016, + "rewards/chosen": 3.995710754394531, + "rewards/margins": 13.711981201171875, + "rewards/rejected": -9.716270446777344, + "step": 8488 + }, + { + "epoch": 0.7756052992233897, + "grad_norm": 2.21875, + "kl": 0.0, + "learning_rate": 1.1969198726312204e-06, + "logits/chosen": 406267221.3333333, + "logits/rejected": 480112793.6, + "logps/chosen": -322.1937662760417, + "logps/rejected": -460.040380859375, + "loss": 0.0112, + "rewards/chosen": 3.615091323852539, + "rewards/margins": 14.195874404907226, + "rewards/rejected": -10.580783081054687, + "step": 8489 + }, + { + "epoch": 0.7756966651439013, + "grad_norm": 2.796875, + "kl": 0.0, + "learning_rate": 1.1959866073234815e-06, + "logits/chosen": 516950400.0, + "logits/rejected": 440290611.2, + "logps/chosen": -252.02362060546875, + "logps/rejected": -622.9931640625, + "loss": 0.0166, + "rewards/chosen": 3.102264404296875, + "rewards/margins": 13.801470947265624, + "rewards/rejected": -10.69920654296875, + "step": 8490 + }, + { + "epoch": 0.775788031064413, + "grad_norm": 4.1875, + "kl": 0.0, + "learning_rate": 1.1950536565724075e-06, + "logits/chosen": 340440832.0, + "logits/rejected": 489999008.0, + "logps/chosen": -163.63414001464844, + "logps/rejected": -735.6236572265625, + "loss": 0.0463, + "rewards/chosen": 2.819061517715454, + "rewards/margins": 14.776424646377563, + "rewards/rejected": -11.95736312866211, + "step": 8491 + }, + { + "epoch": 0.7758793969849246, + "grad_norm": 2.734375, + "kl": 0.0, + "learning_rate": 1.1941210204551419e-06, + "logits/chosen": 505701376.0, + "logits/rejected": 519254080.0, + "logps/chosen": -326.0537109375, + "logps/rejected": -460.7583923339844, + "loss": 0.0176, + "rewards/chosen": 3.3579797744750977, + "rewards/margins": 13.4599027633667, + "rewards/rejected": -10.101922988891602, + "step": 8492 + }, + { + "epoch": 0.7759707629054363, + "grad_norm": 3.09375, + "kl": 0.0, + "learning_rate": 1.1931886990488057e-06, + "logits/chosen": 708669235.2, + "logits/rejected": 440228181.3333333, + "logps/chosen": -320.627197265625, + "logps/rejected": -426.4383138020833, + "loss": 0.0211, + "rewards/chosen": 3.5323509216308593, + "rewards/margins": 13.640772247314453, + "rewards/rejected": -10.108421325683594, + "step": 8493 + }, + { + "epoch": 0.7760621288259479, + "grad_norm": 7.5, + "kl": 0.0, + "learning_rate": 1.1922566924304935e-06, + "logits/chosen": 602653696.0, + "logits/rejected": 1002930995.2, + "logps/chosen": -287.5338948567708, + "logps/rejected": -371.8326904296875, + "loss": 0.0392, + "rewards/chosen": 2.5005617141723633, + "rewards/margins": 11.23695240020752, + "rewards/rejected": -8.736390686035156, + "step": 8494 + }, + { + "epoch": 0.7761534947464596, + "grad_norm": 1.71875, + "kl": 0.0, + "learning_rate": 1.1913250006772714e-06, + "logits/chosen": 714323558.4, + "logits/rejected": 830551893.3333334, + "logps/chosen": -369.423388671875, + "logps/rejected": -370.381591796875, + "loss": 0.0159, + "rewards/chosen": 3.7946277618408204, + "rewards/margins": 13.793012873331705, + "rewards/rejected": -9.998385111490885, + "step": 8495 + }, + { + "epoch": 0.7762448606669712, + "grad_norm": 2.640625, + "kl": 0.0, + "learning_rate": 1.1903936238661868e-06, + "logits/chosen": 646363008.0, + "logits/rejected": 469566634.6666667, + "logps/chosen": -532.6213989257812, + "logps/rejected": -541.2290445963541, + "loss": 0.011, + "rewards/chosen": 3.243635654449463, + "rewards/margins": 13.224483331044516, + "rewards/rejected": -9.980847676595053, + "step": 8496 + }, + { + "epoch": 0.7763362265874829, + "grad_norm": 1.9140625, + "kl": 0.0, + "learning_rate": 1.189462562074251e-06, + "logits/chosen": 665673600.0, + "logits/rejected": 930411520.0, + "logps/chosen": -375.80816650390625, + "logps/rejected": -460.7047932942708, + "loss": 0.0089, + "rewards/chosen": 3.3334898948669434, + "rewards/margins": 13.655578136444092, + "rewards/rejected": -10.322088241577148, + "step": 8497 + }, + { + "epoch": 0.7764275925079945, + "grad_norm": 4.09375, + "kl": 0.0, + "learning_rate": 1.1885318153784548e-06, + "logits/chosen": 666548608.0, + "logits/rejected": 294065824.0, + "logps/chosen": -416.678466796875, + "logps/rejected": -415.2027587890625, + "loss": 0.0213, + "rewards/chosen": 3.8212761878967285, + "rewards/margins": 12.587414264678955, + "rewards/rejected": -8.766138076782227, + "step": 8498 + }, + { + "epoch": 0.7765189584285062, + "grad_norm": 2.53125, + "kl": 0.0, + "learning_rate": 1.187601383855765e-06, + "logits/chosen": 419147733.3333333, + "logits/rejected": 537810329.6, + "logps/chosen": -428.69482421875, + "logps/rejected": -308.307275390625, + "loss": 0.0132, + "rewards/chosen": 3.7700535456339517, + "rewards/margins": 13.90459410349528, + "rewards/rejected": -10.134540557861328, + "step": 8499 + }, + { + "epoch": 0.7766103243490178, + "grad_norm": 2.546875, + "kl": 0.0, + "learning_rate": 1.1866712675831177e-06, + "logits/chosen": 586341478.4, + "logits/rejected": 530684032.0, + "logps/chosen": -341.341162109375, + "logps/rejected": -396.8889567057292, + "loss": 0.0136, + "rewards/chosen": 4.374796295166016, + "rewards/margins": 13.358257039388022, + "rewards/rejected": -8.983460744222006, + "step": 8500 + }, + { + "epoch": 0.7767016902695295, + "grad_norm": 0.578125, + "kl": 0.0, + "learning_rate": 1.1857414666374255e-06, + "logits/chosen": 165249888.0, + "logits/rejected": 331357482.6666667, + "logps/chosen": -147.0980224609375, + "logps/rejected": -404.1466471354167, + "loss": 0.0189, + "rewards/chosen": 3.9551830291748047, + "rewards/margins": 13.739154179890951, + "rewards/rejected": -9.783971150716146, + "step": 8501 + }, + { + "epoch": 0.7767930561900411, + "grad_norm": 2.75, + "kl": 0.0, + "learning_rate": 1.184811981095574e-06, + "logits/chosen": 542968883.2, + "logits/rejected": 637434965.3333334, + "logps/chosen": -352.917529296875, + "logps/rejected": -652.5066324869791, + "loss": 0.014, + "rewards/chosen": 4.140807342529297, + "rewards/margins": 12.259093475341796, + "rewards/rejected": -8.1182861328125, + "step": 8502 + }, + { + "epoch": 0.7768844221105528, + "grad_norm": 4.53125, + "kl": 0.0, + "learning_rate": 1.183882811034422e-06, + "logits/chosen": 386741196.8, + "logits/rejected": 418033024.0, + "logps/chosen": -269.44814453125, + "logps/rejected": -801.6583658854166, + "loss": 0.1384, + "rewards/chosen": 3.540876007080078, + "rewards/margins": 11.990365600585937, + "rewards/rejected": -8.44948959350586, + "step": 8503 + }, + { + "epoch": 0.7769757880310644, + "grad_norm": 2.03125, + "kl": 0.0, + "learning_rate": 1.1829539565308057e-06, + "logits/chosen": 424945280.0, + "logits/rejected": 528373792.0, + "logps/chosen": -266.80865478515625, + "logps/rejected": -810.3638916015625, + "loss": 0.0121, + "rewards/chosen": 4.509147644042969, + "rewards/margins": 16.834409713745117, + "rewards/rejected": -12.325262069702148, + "step": 8504 + }, + { + "epoch": 0.7770671539515761, + "grad_norm": 3.421875, + "kl": 0.0, + "learning_rate": 1.1820254176615315e-06, + "logits/chosen": 629433088.0, + "logits/rejected": 666495360.0, + "logps/chosen": -335.7950439453125, + "logps/rejected": -521.1671752929688, + "loss": 0.0202, + "rewards/chosen": 3.3495495319366455, + "rewards/margins": 14.909639596939087, + "rewards/rejected": -11.560090065002441, + "step": 8505 + }, + { + "epoch": 0.7771585198720877, + "grad_norm": 2.703125, + "kl": 0.0, + "learning_rate": 1.1810971945033807e-06, + "logits/chosen": 326295628.8, + "logits/rejected": 540333056.0, + "logps/chosen": -227.6662841796875, + "logps/rejected": -520.2916666666666, + "loss": 0.0285, + "rewards/chosen": 3.7393867492675783, + "rewards/margins": 11.704047393798827, + "rewards/rejected": -7.96466064453125, + "step": 8506 + }, + { + "epoch": 0.7772498857925993, + "grad_norm": 25.625, + "kl": 0.0, + "learning_rate": 1.1801692871331076e-06, + "logits/chosen": 524867392.0, + "logits/rejected": 498432128.0, + "logps/chosen": -306.39251708984375, + "logps/rejected": -405.53326416015625, + "loss": 0.0419, + "rewards/chosen": 3.548473358154297, + "rewards/margins": 9.847911834716797, + "rewards/rejected": -6.2994384765625, + "step": 8507 + }, + { + "epoch": 0.777341251713111, + "grad_norm": 2.609375, + "kl": 0.0, + "learning_rate": 1.1792416956274443e-06, + "logits/chosen": 379610016.0, + "logits/rejected": 552015061.3333334, + "logps/chosen": -186.97238159179688, + "logps/rejected": -464.7740885416667, + "loss": 0.0117, + "rewards/chosen": 3.409964084625244, + "rewards/margins": 13.174697717030844, + "rewards/rejected": -9.7647336324056, + "step": 8508 + }, + { + "epoch": 0.7774326176336227, + "grad_norm": 2.125, + "kl": 0.0, + "learning_rate": 1.178314420063093e-06, + "logits/chosen": 956021056.0, + "logits/rejected": 623970048.0, + "logps/chosen": -272.05072021484375, + "logps/rejected": -571.7673950195312, + "loss": 0.0148, + "rewards/chosen": 4.207393646240234, + "rewards/margins": 14.656800270080566, + "rewards/rejected": -10.449406623840332, + "step": 8509 + }, + { + "epoch": 0.7775239835541343, + "grad_norm": 3.34375, + "kl": 0.0, + "learning_rate": 1.17738746051673e-06, + "logits/chosen": 858174683.4285715, + "logits/rejected": 680836160.0, + "logps/chosen": -280.6268833705357, + "logps/rejected": -623.2003173828125, + "loss": 0.0236, + "rewards/chosen": 4.036591666085379, + "rewards/margins": 17.999644415719168, + "rewards/rejected": -13.963052749633789, + "step": 8510 + }, + { + "epoch": 0.777615349474646, + "grad_norm": 3.453125, + "kl": 0.0, + "learning_rate": 1.1764608170650061e-06, + "logits/chosen": 583784064.0, + "logits/rejected": 587584896.0, + "logps/chosen": -415.77789306640625, + "logps/rejected": -486.53778076171875, + "loss": 0.0213, + "rewards/chosen": 3.246354818344116, + "rewards/margins": 12.482296705245972, + "rewards/rejected": -9.235941886901855, + "step": 8511 + }, + { + "epoch": 0.7777067153951576, + "grad_norm": 2.59375, + "kl": 0.0, + "learning_rate": 1.1755344897845478e-06, + "logits/chosen": 579422822.4, + "logits/rejected": 257203605.33333334, + "logps/chosen": -343.8308349609375, + "logps/rejected": -431.1271158854167, + "loss": 0.0196, + "rewards/chosen": 3.66365966796875, + "rewards/margins": 16.804639434814455, + "rewards/rejected": -13.140979766845703, + "step": 8512 + }, + { + "epoch": 0.7777980813156693, + "grad_norm": 3.125, + "kl": 0.0, + "learning_rate": 1.1746084787519524e-06, + "logits/chosen": 709378432.0, + "logits/rejected": 599640268.8, + "logps/chosen": -230.4691162109375, + "logps/rejected": -552.5283203125, + "loss": 0.0184, + "rewards/chosen": 3.103705724080404, + "rewards/margins": 13.059559758504232, + "rewards/rejected": -9.955854034423828, + "step": 8513 + }, + { + "epoch": 0.7778894472361809, + "grad_norm": 3.515625, + "kl": 0.0, + "learning_rate": 1.1736827840437932e-06, + "logits/chosen": 517826560.0, + "logits/rejected": 741461120.0, + "logps/chosen": -328.29638671875, + "logps/rejected": -327.3412170410156, + "loss": 0.0231, + "rewards/chosen": 3.3125481605529785, + "rewards/margins": 12.723299503326416, + "rewards/rejected": -9.410751342773438, + "step": 8514 + }, + { + "epoch": 0.7779808131566925, + "grad_norm": 54.25, + "kl": 0.0, + "learning_rate": 1.1727574057366154e-06, + "logits/chosen": 329256704.0, + "logits/rejected": 584862720.0, + "logps/chosen": -160.29559326171875, + "logps/rejected": -546.84375, + "loss": 0.0657, + "rewards/chosen": 2.9099245071411133, + "rewards/margins": 14.849740028381348, + "rewards/rejected": -11.939815521240234, + "step": 8515 + }, + { + "epoch": 0.7780721790772042, + "grad_norm": 2.75, + "kl": 0.0, + "learning_rate": 1.171832343906939e-06, + "logits/chosen": 583972249.6, + "logits/rejected": 438875648.0, + "logps/chosen": -388.53291015625, + "logps/rejected": -337.9121907552083, + "loss": 0.022, + "rewards/chosen": 3.4184638977050783, + "rewards/margins": 10.926359685262044, + "rewards/rejected": -7.507895787556966, + "step": 8516 + }, + { + "epoch": 0.7781635449977159, + "grad_norm": 2.65625, + "kl": 0.0, + "learning_rate": 1.1709075986312607e-06, + "logits/chosen": 464683008.0, + "logits/rejected": 647855616.0, + "logps/chosen": -364.493896484375, + "logps/rejected": -576.4588216145834, + "loss": 0.0184, + "rewards/chosen": 4.093996047973633, + "rewards/margins": 12.876555760701498, + "rewards/rejected": -8.782559712727865, + "step": 8517 + }, + { + "epoch": 0.7782549109182275, + "grad_norm": 6.84375, + "kl": 0.0, + "learning_rate": 1.1699831699860464e-06, + "logits/chosen": 679508326.4, + "logits/rejected": 945764181.3333334, + "logps/chosen": -373.015380859375, + "logps/rejected": -409.3206380208333, + "loss": 0.0714, + "rewards/chosen": 3.5300853729248045, + "rewards/margins": 9.463277308146159, + "rewards/rejected": -5.9331919352213545, + "step": 8518 + }, + { + "epoch": 0.7783462768387391, + "grad_norm": 65.0, + "kl": 0.0, + "learning_rate": 1.1690590580477384e-06, + "logits/chosen": 406606656.0, + "logits/rejected": 451958720.0, + "logps/chosen": -256.07489013671875, + "logps/rejected": -549.7577514648438, + "loss": 0.0763, + "rewards/chosen": 2.5469613075256348, + "rewards/margins": 12.301867008209229, + "rewards/rejected": -9.754905700683594, + "step": 8519 + }, + { + "epoch": 0.7784376427592508, + "grad_norm": 3.046875, + "kl": 0.0, + "learning_rate": 1.16813526289275e-06, + "logits/chosen": 568284672.0, + "logits/rejected": 647840085.3333334, + "logps/chosen": -430.711767578125, + "logps/rejected": -303.96600341796875, + "loss": 0.0164, + "rewards/chosen": 4.050016784667969, + "rewards/margins": 13.069589996337891, + "rewards/rejected": -9.019573211669922, + "step": 8520 + }, + { + "epoch": 0.7785290086797625, + "grad_norm": 0.73828125, + "kl": 0.0, + "learning_rate": 1.167211784597474e-06, + "logits/chosen": 589362048.0, + "logits/rejected": 350739123.2, + "logps/chosen": -572.0116373697916, + "logps/rejected": -412.7982421875, + "loss": 0.0037, + "rewards/chosen": 5.161244710286458, + "rewards/margins": 13.909827168782552, + "rewards/rejected": -8.748582458496093, + "step": 8521 + }, + { + "epoch": 0.7786203746002741, + "grad_norm": 2.125, + "kl": 0.0, + "learning_rate": 1.1662886232382715e-06, + "logits/chosen": 485243434.6666667, + "logits/rejected": 285699584.0, + "logps/chosen": -194.73771158854166, + "logps/rejected": -430.61962890625, + "loss": 0.0114, + "rewards/chosen": 4.070895512898763, + "rewards/margins": 13.130806096394856, + "rewards/rejected": -9.059910583496094, + "step": 8522 + }, + { + "epoch": 0.7787117405207857, + "grad_norm": 22.25, + "kl": 0.0, + "learning_rate": 1.1653657788914813e-06, + "logits/chosen": 492080597.3333333, + "logits/rejected": 388883648.0, + "logps/chosen": -316.3446858723958, + "logps/rejected": -249.42747497558594, + "loss": 0.0321, + "rewards/chosen": 3.8800233205159507, + "rewards/margins": 11.227443059285482, + "rewards/rejected": -7.347419738769531, + "step": 8523 + }, + { + "epoch": 0.7788031064412974, + "grad_norm": 4.03125, + "kl": 0.0, + "learning_rate": 1.1644432516334099e-06, + "logits/chosen": 307972928.0, + "logits/rejected": 415537920.0, + "logps/chosen": -313.51706949869794, + "logps/rejected": -529.1691284179688, + "loss": 0.029, + "rewards/chosen": 4.475150108337402, + "rewards/margins": 13.977099418640137, + "rewards/rejected": -9.501949310302734, + "step": 8524 + }, + { + "epoch": 0.7788944723618091, + "grad_norm": 2.03125, + "kl": 0.0, + "learning_rate": 1.1635210415403453e-06, + "logits/chosen": 758930688.0, + "logits/rejected": 508269866.6666667, + "logps/chosen": -375.9939453125, + "logps/rejected": -546.5804036458334, + "loss": 0.0138, + "rewards/chosen": 4.262959289550781, + "rewards/margins": 13.622863260904946, + "rewards/rejected": -9.359903971354166, + "step": 8525 + }, + { + "epoch": 0.7789858382823207, + "grad_norm": 1.2890625, + "kl": 0.0, + "learning_rate": 1.1625991486885452e-06, + "logits/chosen": 283217984.0, + "logits/rejected": 627744512.0, + "logps/chosen": -399.94976806640625, + "logps/rejected": -671.6835327148438, + "loss": 0.0071, + "rewards/chosen": 4.360775947570801, + "rewards/margins": 14.582828521728516, + "rewards/rejected": -10.222052574157715, + "step": 8526 + }, + { + "epoch": 0.7790772042028323, + "grad_norm": 20.375, + "kl": 0.0, + "learning_rate": 1.1616775731542412e-06, + "logits/chosen": 587546496.0, + "logits/rejected": 1181173120.0, + "logps/chosen": -271.8463134765625, + "logps/rejected": -520.95458984375, + "loss": 0.1041, + "rewards/chosen": 3.4689576625823975, + "rewards/margins": 13.899458646774292, + "rewards/rejected": -10.430500984191895, + "step": 8527 + }, + { + "epoch": 0.779168570123344, + "grad_norm": 10.0625, + "kl": 0.0, + "learning_rate": 1.1607563150136374e-06, + "logits/chosen": 369470592.0, + "logits/rejected": 442073429.3333333, + "logps/chosen": -246.547607421875, + "logps/rejected": -609.6358235677084, + "loss": 0.114, + "rewards/chosen": 3.3848335266113283, + "rewards/margins": 10.451968383789062, + "rewards/rejected": -7.067134857177734, + "step": 8528 + }, + { + "epoch": 0.7792599360438557, + "grad_norm": 2.3125, + "kl": 0.0, + "learning_rate": 1.159835374342917e-06, + "logits/rejected": 639252480.0, + "logps/rejected": -469.056884765625, + "loss": 0.0023, + "rewards/rejected": -8.375277519226074, + "step": 8529 + }, + { + "epoch": 0.7793513019643673, + "grad_norm": 4.625, + "kl": 0.0, + "learning_rate": 1.158914751218231e-06, + "logits/chosen": 765067861.3333334, + "logits/rejected": 847366144.0, + "logps/chosen": -335.5300699869792, + "logps/rejected": -458.43740234375, + "loss": 0.0267, + "rewards/chosen": 2.6061293284098306, + "rewards/margins": 11.312571589152018, + "rewards/rejected": -8.706442260742188, + "step": 8530 + }, + { + "epoch": 0.7794426678848789, + "grad_norm": 3.75, + "kl": 0.0, + "learning_rate": 1.157994445715706e-06, + "logits/chosen": 560378944.0, + "logits/rejected": 845143872.0, + "logps/chosen": -407.106689453125, + "logps/rejected": -438.71539306640625, + "loss": 0.0244, + "rewards/chosen": 3.62253999710083, + "rewards/margins": 12.130896091461182, + "rewards/rejected": -8.508356094360352, + "step": 8531 + }, + { + "epoch": 0.7795340338053905, + "grad_norm": 2.328125, + "kl": 0.0, + "learning_rate": 1.1570744579114435e-06, + "logits/chosen": 690128230.4, + "logits/rejected": 241054357.33333334, + "logps/chosen": -318.9149658203125, + "logps/rejected": -437.9352620442708, + "loss": 0.0165, + "rewards/chosen": 3.8990455627441407, + "rewards/margins": 13.94693972269694, + "rewards/rejected": -10.047894159952799, + "step": 8532 + }, + { + "epoch": 0.7796253997259023, + "grad_norm": 3.1875, + "kl": 0.0, + "learning_rate": 1.1561547878815172e-06, + "logits/chosen": 914202624.0, + "logits/rejected": 1333569365.3333333, + "logps/chosen": -373.5990234375, + "logps/rejected": -771.1090494791666, + "loss": 0.0197, + "rewards/chosen": 3.972583770751953, + "rewards/margins": 12.6062863667806, + "rewards/rejected": -8.633702596028646, + "step": 8533 + }, + { + "epoch": 0.7797167656464139, + "grad_norm": 2.03125, + "kl": 0.0, + "learning_rate": 1.1552354357019763e-06, + "logits/chosen": 700412330.6666666, + "logits/rejected": 1132211097.6, + "logps/chosen": -428.7267659505208, + "logps/rejected": -549.38671875, + "loss": 0.014, + "rewards/chosen": 3.3978726069132485, + "rewards/margins": 13.015356127421061, + "rewards/rejected": -9.617483520507813, + "step": 8534 + }, + { + "epoch": 0.7798081315669255, + "grad_norm": 4.125, + "kl": 0.0, + "learning_rate": 1.1543164014488428e-06, + "logits/chosen": 413390976.0, + "logits/rejected": 594331648.0, + "logps/chosen": -454.68524169921875, + "logps/rejected": -758.2880859375, + "loss": 0.0164, + "rewards/chosen": 3.064619541168213, + "rewards/margins": 14.557356357574463, + "rewards/rejected": -11.49273681640625, + "step": 8535 + }, + { + "epoch": 0.7798994974874371, + "grad_norm": 6.28125, + "kl": 0.0, + "learning_rate": 1.1533976851981121e-06, + "logits/chosen": 496361252.5714286, + "logits/rejected": 323060448.0, + "logps/chosen": -301.5391322544643, + "logps/rejected": -694.668212890625, + "loss": 0.0372, + "rewards/chosen": 3.648113795689174, + "rewards/margins": 17.537494250706263, + "rewards/rejected": -13.88938045501709, + "step": 8536 + }, + { + "epoch": 0.7799908634079489, + "grad_norm": 51.75, + "kl": 0.0, + "learning_rate": 1.1524792870257518e-06, + "logits/chosen": 566347562.6666666, + "logits/rejected": 326193510.4, + "logps/chosen": -162.60367838541666, + "logps/rejected": -302.214013671875, + "loss": 0.0724, + "rewards/chosen": 3.030546506245931, + "rewards/margins": 8.860268338521323, + "rewards/rejected": -5.829721832275391, + "step": 8537 + }, + { + "epoch": 0.7800822293284605, + "grad_norm": 0.1875, + "kl": 0.0, + "learning_rate": 1.1515612070077075e-06, + "logits/chosen": 978480896.0, + "logits/rejected": 536883346.2857143, + "logps/chosen": -170.63394165039062, + "logps/rejected": -592.8104073660714, + "loss": 0.0008, + "rewards/chosen": 5.198108196258545, + "rewards/margins": 15.86226224899292, + "rewards/rejected": -10.664154052734375, + "step": 8538 + }, + { + "epoch": 0.7801735952489721, + "grad_norm": 3.609375, + "kl": 0.0, + "learning_rate": 1.150643445219895e-06, + "logits/chosen": 627492147.2, + "logits/rejected": 383986176.0, + "logps/chosen": -450.255224609375, + "logps/rejected": -449.3008626302083, + "loss": 0.0208, + "rewards/chosen": 3.5780380249023436, + "rewards/margins": 12.459837722778321, + "rewards/rejected": -8.881799697875977, + "step": 8539 + }, + { + "epoch": 0.7802649611694837, + "grad_norm": 1.6328125, + "kl": 0.0, + "learning_rate": 1.1497260017382044e-06, + "logits/chosen": 316651968.0, + "logits/rejected": 527450528.0, + "logps/chosen": -258.30617268880206, + "logps/rejected": -465.46881103515625, + "loss": 0.1318, + "rewards/chosen": 3.136040369669596, + "rewards/margins": 12.73743979136149, + "rewards/rejected": -9.601399421691895, + "step": 8540 + }, + { + "epoch": 0.7803563270899955, + "grad_norm": 2.265625, + "kl": 0.0, + "learning_rate": 1.1488088766384987e-06, + "logits/chosen": 511554016.0, + "logits/rejected": 718208448.0, + "logps/chosen": -225.17626953125, + "logps/rejected": -557.0994873046875, + "loss": 0.0141, + "rewards/chosen": 4.001906871795654, + "rewards/margins": 13.13534688949585, + "rewards/rejected": -9.133440017700195, + "step": 8541 + }, + { + "epoch": 0.7804476930105071, + "grad_norm": 5.0625, + "kl": 0.0, + "learning_rate": 1.1478920699966179e-06, + "logits/chosen": 642744576.0, + "logits/rejected": 341079616.0, + "logps/chosen": -397.7904052734375, + "logps/rejected": -237.55853271484375, + "loss": 0.0351, + "rewards/chosen": 3.155989646911621, + "rewards/margins": 10.18094778060913, + "rewards/rejected": -7.02495813369751, + "step": 8542 + }, + { + "epoch": 0.7805390589310187, + "grad_norm": 3.859375, + "kl": 0.0, + "learning_rate": 1.1469755818883726e-06, + "logits/chosen": 550384725.3333334, + "logits/rejected": 201653584.0, + "logps/chosen": -307.1575520833333, + "logps/rejected": -420.29339599609375, + "loss": 0.033, + "rewards/chosen": 3.4634548823038735, + "rewards/margins": 15.536938349405924, + "rewards/rejected": -12.07348346710205, + "step": 8543 + }, + { + "epoch": 0.7806304248515303, + "grad_norm": 1.125, + "kl": 0.0, + "learning_rate": 1.1460594123895473e-06, + "logits/chosen": 600348288.0, + "logits/rejected": 470104064.0, + "logps/chosen": -192.7150115966797, + "logps/rejected": -340.24169921875, + "loss": 0.0061, + "rewards/chosen": 3.0792863368988037, + "rewards/margins": 11.084359339305333, + "rewards/rejected": -8.00507300240653, + "step": 8544 + }, + { + "epoch": 0.7807217907720421, + "grad_norm": 1.546875, + "kl": 0.0, + "learning_rate": 1.1451435615759015e-06, + "logits/chosen": 305005568.0, + "logits/rejected": 436857120.0, + "logps/chosen": -189.9044647216797, + "logps/rejected": -539.9622802734375, + "loss": 0.0068, + "rewards/chosen": 4.6779069900512695, + "rewards/margins": 15.893242835998535, + "rewards/rejected": -11.215335845947266, + "step": 8545 + }, + { + "epoch": 0.7808131566925537, + "grad_norm": 2.703125, + "kl": 0.0, + "learning_rate": 1.1442280295231656e-06, + "logits/chosen": 549207756.8, + "logits/rejected": 480280277.3333333, + "logps/chosen": -247.198583984375, + "logps/rejected": -361.0194498697917, + "loss": 0.0297, + "rewards/chosen": 3.509189224243164, + "rewards/margins": 11.372055943806966, + "rewards/rejected": -7.862866719563802, + "step": 8546 + }, + { + "epoch": 0.7809045226130653, + "grad_norm": 3.859375, + "kl": 0.0, + "learning_rate": 1.1433128163070484e-06, + "logits/chosen": 1109709568.0, + "logits/rejected": 454727616.0, + "logps/chosen": -288.1939697265625, + "logps/rejected": -377.00946044921875, + "loss": 0.0208, + "rewards/chosen": 3.6894659996032715, + "rewards/margins": 13.243018627166748, + "rewards/rejected": -9.553552627563477, + "step": 8547 + }, + { + "epoch": 0.7809958885335769, + "grad_norm": 1.359375, + "kl": 0.0, + "learning_rate": 1.1423979220032284e-06, + "logits/chosen": 182778026.66666666, + "logits/rejected": 481633484.8, + "logps/chosen": -310.66969807942706, + "logps/rejected": -507.889990234375, + "loss": 0.005, + "rewards/chosen": 4.492376327514648, + "rewards/margins": 13.661584854125977, + "rewards/rejected": -9.169208526611328, + "step": 8548 + }, + { + "epoch": 0.7810872544540887, + "grad_norm": 2.9375, + "kl": 0.0, + "learning_rate": 1.1414833466873587e-06, + "logits/chosen": 509920213.3333333, + "logits/rejected": 392380723.2, + "logps/chosen": -388.8327229817708, + "logps/rejected": -391.76494140625, + "loss": 0.016, + "rewards/chosen": 3.398148854573568, + "rewards/margins": 11.78001454671224, + "rewards/rejected": -8.381865692138671, + "step": 8549 + }, + { + "epoch": 0.7811786203746003, + "grad_norm": 3.984375, + "kl": 0.0, + "learning_rate": 1.1405690904350664e-06, + "logits/chosen": 468137728.0, + "logits/rejected": 424085120.0, + "logps/chosen": -398.0869140625, + "logps/rejected": -509.1126708984375, + "loss": 0.0281, + "rewards/chosen": 2.982302188873291, + "rewards/margins": 11.912509441375732, + "rewards/rejected": -8.930207252502441, + "step": 8550 + }, + { + "epoch": 0.7812699862951119, + "grad_norm": 2.765625, + "kl": 0.0, + "learning_rate": 1.1396551533219512e-06, + "logits/chosen": 437166438.4, + "logits/rejected": 503269205.3333333, + "logps/chosen": -326.7064208984375, + "logps/rejected": -803.3141276041666, + "loss": 0.0154, + "rewards/chosen": 4.489265441894531, + "rewards/margins": 16.640446980794273, + "rewards/rejected": -12.15118153889974, + "step": 8551 + }, + { + "epoch": 0.7813613522156235, + "grad_norm": 2.984375, + "kl": 0.0, + "learning_rate": 1.1387415354235887e-06, + "logits/chosen": 456424362.6666667, + "logits/rejected": 468798412.8, + "logps/chosen": -361.5235188802083, + "logps/rejected": -560.99912109375, + "loss": 0.0159, + "rewards/chosen": 4.105723063151042, + "rewards/margins": 13.329033915201823, + "rewards/rejected": -9.223310852050782, + "step": 8552 + }, + { + "epoch": 0.7814527181361353, + "grad_norm": 3.15625, + "kl": 0.0, + "learning_rate": 1.1378282368155257e-06, + "logits/chosen": 605890201.6, + "logits/rejected": 338202965.3333333, + "logps/chosen": -380.613720703125, + "logps/rejected": -359.7260335286458, + "loss": 0.0199, + "rewards/chosen": 3.674775314331055, + "rewards/margins": 11.34759381612142, + "rewards/rejected": -7.672818501790364, + "step": 8553 + }, + { + "epoch": 0.7815440840566469, + "grad_norm": 0.73828125, + "kl": 0.0, + "learning_rate": 1.1369152575732823e-06, + "logits/chosen": 411066112.0, + "logits/rejected": 331935680.0, + "logps/chosen": -373.2845764160156, + "logps/rejected": -377.403076171875, + "loss": 0.0031, + "rewards/chosen": 4.523896217346191, + "rewards/margins": 14.225982348124186, + "rewards/rejected": -9.702086130777994, + "step": 8554 + }, + { + "epoch": 0.7816354499771585, + "grad_norm": 4.46875, + "kl": 0.0, + "learning_rate": 1.1360025977723566e-06, + "logits/chosen": 410197674.6666667, + "logits/rejected": 269338752.0, + "logps/chosen": -385.5160725911458, + "logps/rejected": -572.6561279296875, + "loss": 0.0271, + "rewards/chosen": 4.1152544021606445, + "rewards/margins": 12.687697410583496, + "rewards/rejected": -8.572443008422852, + "step": 8555 + }, + { + "epoch": 0.7817268158976701, + "grad_norm": 7.0625, + "kl": 5.370458602905273, + "learning_rate": 1.1350902574882144e-06, + "logits/chosen": 615221504.0, + "logps/chosen": -284.7458801269531, + "loss": 0.0565, + "rewards/chosen": 3.7183828353881836, + "step": 8556 + }, + { + "epoch": 0.7818181818181819, + "grad_norm": 4.84375, + "kl": 0.0, + "learning_rate": 1.1341782367962994e-06, + "logits/chosen": 1054755413.3333334, + "logits/rejected": 445098688.0, + "logps/chosen": -403.5059000651042, + "logps/rejected": -458.8005065917969, + "loss": 0.0432, + "rewards/chosen": 3.0536937713623047, + "rewards/margins": 14.631542205810547, + "rewards/rejected": -11.577848434448242, + "step": 8557 + }, + { + "epoch": 0.7819095477386935, + "grad_norm": 7.5625, + "kl": 3.32244873046875, + "learning_rate": 1.133266535772025e-06, + "logits/chosen": 639002916.5714285, + "logits/rejected": 477182912.0, + "logps/chosen": -482.3656529017857, + "logps/rejected": -452.7802429199219, + "loss": 0.0628, + "rewards/chosen": 3.13391603742327, + "rewards/margins": 12.214909689767019, + "rewards/rejected": -9.08099365234375, + "step": 8558 + }, + { + "epoch": 0.7820009136592051, + "grad_norm": 2.875, + "kl": 0.0, + "learning_rate": 1.1323551544907835e-06, + "logits/chosen": 481995980.8, + "logits/rejected": 425973034.6666667, + "logps/chosen": -201.41837158203126, + "logps/rejected": -450.3168131510417, + "loss": 0.0241, + "rewards/chosen": 3.883599853515625, + "rewards/margins": 12.739670054117838, + "rewards/rejected": -8.856070200602213, + "step": 8559 + }, + { + "epoch": 0.7820922795797167, + "grad_norm": 2.328125, + "kl": 0.0, + "learning_rate": 1.1314440930279363e-06, + "logits/chosen": 768644480.0, + "logits/rejected": 399113824.0, + "logps/chosen": -306.9136962890625, + "logps/rejected": -427.16162109375, + "loss": 0.0212, + "rewards/chosen": 3.6808013916015625, + "rewards/margins": 13.883167266845703, + "rewards/rejected": -10.20236587524414, + "step": 8560 + }, + { + "epoch": 0.7821836455002285, + "grad_norm": 0.98046875, + "kl": 0.0, + "learning_rate": 1.1305333514588195e-06, + "logits/chosen": 445606912.0, + "logits/rejected": 373568704.0, + "logps/chosen": -355.99200439453125, + "logps/rejected": -458.55096435546875, + "loss": 0.006, + "rewards/chosen": 4.720774173736572, + "rewards/margins": 14.240383625030518, + "rewards/rejected": -9.519609451293945, + "step": 8561 + }, + { + "epoch": 0.7822750114207401, + "grad_norm": 4.84375, + "kl": 0.0, + "learning_rate": 1.1296229298587413e-06, + "logits/chosen": 308542080.0, + "logits/rejected": 707141717.3333334, + "logps/chosen": -257.002783203125, + "logps/rejected": -409.1687825520833, + "loss": 0.0269, + "rewards/chosen": 3.751844787597656, + "rewards/margins": 14.386970265706381, + "rewards/rejected": -10.635125478108725, + "step": 8562 + }, + { + "epoch": 0.7823663773412517, + "grad_norm": 0.9375, + "kl": 0.0, + "learning_rate": 1.128712828302989e-06, + "logits/chosen": 1671230464.0, + "logits/rejected": 727833685.3333334, + "logps/chosen": -674.473876953125, + "logps/rejected": -678.2186279296875, + "loss": 0.0036, + "rewards/chosen": 4.2777557373046875, + "rewards/margins": 13.244546254475912, + "rewards/rejected": -8.966790517171225, + "step": 8563 + }, + { + "epoch": 0.7824577432617633, + "grad_norm": 3.796875, + "kl": 0.0, + "learning_rate": 1.1278030468668172e-06, + "logits/chosen": 481078272.0, + "logits/rejected": 1032161792.0, + "logps/chosen": -237.96842447916666, + "logps/rejected": -296.07891845703125, + "loss": 0.0326, + "rewards/chosen": 3.572760264078776, + "rewards/margins": 11.554694811503092, + "rewards/rejected": -7.981934547424316, + "step": 8564 + }, + { + "epoch": 0.782549109182275, + "grad_norm": 7.15625, + "kl": 0.0, + "learning_rate": 1.126893585625457e-06, + "logits/chosen": 518788352.0, + "logits/rejected": 826075648.0, + "logps/chosen": -341.572216796875, + "logps/rejected": -631.4486897786459, + "loss": 0.0364, + "rewards/chosen": 3.1850048065185548, + "rewards/margins": 12.333893712361654, + "rewards/rejected": -9.1488889058431, + "step": 8565 + }, + { + "epoch": 0.7826404751027867, + "grad_norm": 0.4375, + "kl": 0.0, + "learning_rate": 1.1259844446541119e-06, + "logits/chosen": 564520320.0, + "logits/rejected": 381928864.0, + "logps/chosen": -280.72906494140625, + "logps/rejected": -384.4459228515625, + "loss": 0.0033, + "rewards/chosen": 5.190601825714111, + "rewards/margins": 12.816856861114502, + "rewards/rejected": -7.626255035400391, + "step": 8566 + }, + { + "epoch": 0.7827318410232983, + "grad_norm": 70.5, + "kl": 0.0, + "learning_rate": 1.1250756240279581e-06, + "logits/chosen": 1021899584.0, + "logits/rejected": 603375616.0, + "logps/chosen": -693.493896484375, + "logps/rejected": -519.6978759765625, + "loss": 0.0627, + "rewards/chosen": 4.242149353027344, + "rewards/margins": 11.561852931976318, + "rewards/rejected": -7.319703578948975, + "step": 8567 + }, + { + "epoch": 0.7828232069438099, + "grad_norm": 1.6171875, + "kl": 0.0, + "learning_rate": 1.1241671238221501e-06, + "logits/chosen": 615956437.3333334, + "logits/rejected": 520067686.4, + "logps/chosen": -502.5472819010417, + "logps/rejected": -605.15634765625, + "loss": 0.0062, + "rewards/chosen": 4.446058591206868, + "rewards/margins": 15.036952145894368, + "rewards/rejected": -10.5908935546875, + "step": 8568 + }, + { + "epoch": 0.7829145728643216, + "grad_norm": 3.8125, + "kl": 0.0, + "learning_rate": 1.1232589441118103e-06, + "logits/chosen": 714145472.0, + "logits/rejected": 557102720.0, + "logps/chosen": -342.1187744140625, + "logps/rejected": -388.2989807128906, + "loss": 0.0279, + "rewards/chosen": 2.9826996326446533, + "rewards/margins": 9.66437554359436, + "rewards/rejected": -6.681675910949707, + "step": 8569 + }, + { + "epoch": 0.7830059387848333, + "grad_norm": 1.796875, + "kl": 0.0, + "learning_rate": 1.122351084972037e-06, + "logits/chosen": 1348284544.0, + "logits/rejected": 789782272.0, + "logps/chosen": -418.8464660644531, + "logps/rejected": -470.564453125, + "loss": 0.0113, + "rewards/chosen": 3.9464917182922363, + "rewards/margins": 13.216892719268799, + "rewards/rejected": -9.270401000976562, + "step": 8570 + }, + { + "epoch": 0.7830973047053449, + "grad_norm": 1.796875, + "kl": 0.0, + "learning_rate": 1.1214435464779006e-06, + "logits/chosen": 451394176.0, + "logits/rejected": 416094016.0, + "logps/chosen": -325.7637939453125, + "logps/rejected": -533.8331298828125, + "loss": 0.0086, + "rewards/chosen": 4.444658279418945, + "rewards/margins": 13.203310012817383, + "rewards/rejected": -8.758651733398438, + "step": 8571 + }, + { + "epoch": 0.7831886706258565, + "grad_norm": 2.25, + "kl": 0.0, + "learning_rate": 1.1205363287044485e-06, + "logits/chosen": 711628160.0, + "logits/rejected": 459805312.0, + "logps/chosen": -483.4190368652344, + "logps/rejected": -505.6370544433594, + "loss": 0.0145, + "rewards/chosen": 3.604489326477051, + "rewards/margins": 12.515021324157715, + "rewards/rejected": -8.910531997680664, + "step": 8572 + }, + { + "epoch": 0.7832800365463682, + "grad_norm": 2.421875, + "kl": 0.0, + "learning_rate": 1.1196294317266975e-06, + "logits/chosen": 569521877.3333334, + "logits/rejected": 616583872.0, + "logps/chosen": -372.6935221354167, + "logps/rejected": -497.1121826171875, + "loss": 0.0212, + "rewards/chosen": 4.156012852986653, + "rewards/margins": 11.454465707143147, + "rewards/rejected": -7.298452854156494, + "step": 8573 + }, + { + "epoch": 0.7833714024668799, + "grad_norm": 38.25, + "kl": 0.0, + "learning_rate": 1.1187228556196412e-06, + "logits/chosen": 456931040.0, + "logits/rejected": 401024960.0, + "logps/chosen": -205.18838500976562, + "logps/rejected": -582.273193359375, + "loss": 0.09, + "rewards/chosen": 2.1900670528411865, + "rewards/margins": 13.43198561668396, + "rewards/rejected": -11.241918563842773, + "step": 8574 + }, + { + "epoch": 0.7834627683873915, + "grad_norm": 0.75, + "kl": 0.0, + "learning_rate": 1.1178166004582431e-06, + "logits/chosen": 696152917.3333334, + "logits/rejected": 1234129510.4, + "logps/chosen": -318.17628987630206, + "logps/rejected": -615.6251953125, + "loss": 0.0042, + "rewards/chosen": 4.6807708740234375, + "rewards/margins": 13.765560913085938, + "rewards/rejected": -9.0847900390625, + "step": 8575 + }, + { + "epoch": 0.7835541343079031, + "grad_norm": 9.375, + "kl": 0.0, + "learning_rate": 1.1169106663174417e-06, + "logits/chosen": 519459968.0, + "logits/rejected": 595104640.0, + "logps/chosen": -366.057861328125, + "logps/rejected": -566.3414306640625, + "loss": 0.0533, + "rewards/chosen": 4.572740077972412, + "rewards/margins": 12.027831554412842, + "rewards/rejected": -7.45509147644043, + "step": 8576 + }, + { + "epoch": 0.7836455002284148, + "grad_norm": 1.125, + "kl": 0.0, + "learning_rate": 1.1160050532721527e-06, + "logits/chosen": 579317504.0, + "logits/rejected": 1242693376.0, + "logps/chosen": -264.86357421875, + "logps/rejected": -556.6007486979166, + "loss": 0.0106, + "rewards/chosen": 4.130219650268555, + "rewards/margins": 13.304633967081706, + "rewards/rejected": -9.17441431681315, + "step": 8577 + }, + { + "epoch": 0.7837368661489265, + "grad_norm": 5.6875, + "kl": 0.0, + "learning_rate": 1.115099761397259e-06, + "logits/chosen": 537612672.0, + "logits/rejected": 882346112.0, + "logps/chosen": -216.2831268310547, + "logps/rejected": -448.8774108886719, + "loss": 0.1367, + "rewards/chosen": 1.7582132816314697, + "rewards/margins": 11.131019830703735, + "rewards/rejected": -9.372806549072266, + "step": 8578 + }, + { + "epoch": 0.7838282320694381, + "grad_norm": 2.359375, + "kl": 0.0, + "learning_rate": 1.1141947907676198e-06, + "logits/chosen": 472603050.6666667, + "logits/rejected": 673902387.2, + "logps/chosen": -440.2205810546875, + "logps/rejected": -673.366552734375, + "loss": 0.0068, + "rewards/chosen": 4.764148712158203, + "rewards/margins": 13.455985260009765, + "rewards/rejected": -8.691836547851562, + "step": 8579 + }, + { + "epoch": 0.7839195979899497, + "grad_norm": 3.46875, + "kl": 0.0, + "learning_rate": 1.1132901414580694e-06, + "logits/chosen": 676399424.0, + "logits/rejected": 760046506.6666666, + "logps/chosen": -393.4819641113281, + "logps/rejected": -643.44482421875, + "loss": 0.0141, + "rewards/chosen": 2.9561891555786133, + "rewards/margins": 13.869741757710775, + "rewards/rejected": -10.913552602132162, + "step": 8580 + }, + { + "epoch": 0.7840109639104614, + "grad_norm": 3.515625, + "kl": 0.0, + "learning_rate": 1.112385813543413e-06, + "logits/chosen": 754881126.4, + "logits/rejected": 1576937984.0, + "logps/chosen": -299.2831787109375, + "logps/rejected": -615.5750325520834, + "loss": 0.0273, + "rewards/chosen": 3.487286376953125, + "rewards/margins": 10.38961804707845, + "rewards/rejected": -6.902331670125325, + "step": 8581 + }, + { + "epoch": 0.784102329830973, + "grad_norm": 2.296875, + "kl": 0.0, + "learning_rate": 1.1114818070984308e-06, + "logits/chosen": 930125888.0, + "logits/rejected": 632688256.0, + "logps/chosen": -278.44970703125, + "logps/rejected": -456.45074462890625, + "loss": 0.0123, + "rewards/chosen": 3.7608659267425537, + "rewards/margins": 13.868926763534546, + "rewards/rejected": -10.108060836791992, + "step": 8582 + }, + { + "epoch": 0.7841936957514847, + "grad_norm": 3.609375, + "kl": 0.0, + "learning_rate": 1.1105781221978735e-06, + "logits/chosen": 610938965.3333334, + "logits/rejected": 628718016.0, + "logps/chosen": -356.3043212890625, + "logps/rejected": -705.4083251953125, + "loss": 0.0179, + "rewards/chosen": 4.119919459025065, + "rewards/margins": 13.067276636759441, + "rewards/rejected": -8.947357177734375, + "step": 8583 + }, + { + "epoch": 0.7842850616719963, + "grad_norm": 1.0078125, + "kl": 0.0, + "learning_rate": 1.1096747589164702e-06, + "logits/chosen": 492275520.0, + "logits/rejected": 734232576.0, + "logps/chosen": -220.60548400878906, + "logps/rejected": -317.53704833984375, + "loss": 0.006, + "rewards/chosen": 4.19399881362915, + "rewards/margins": 12.145511468251545, + "rewards/rejected": -7.9515126546223955, + "step": 8584 + }, + { + "epoch": 0.784376427592508, + "grad_norm": 56.5, + "kl": 0.0, + "learning_rate": 1.1087717173289203e-06, + "logits/chosen": 557525094.4, + "logits/rejected": 574548394.6666666, + "logps/chosen": -247.4197509765625, + "logps/rejected": -538.6781412760416, + "loss": 0.06, + "rewards/chosen": 3.185527038574219, + "rewards/margins": 13.01365966796875, + "rewards/rejected": -9.828132629394531, + "step": 8585 + }, + { + "epoch": 0.7844677935130197, + "grad_norm": 1.78125, + "kl": 0.0, + "learning_rate": 1.1078689975098961e-06, + "logits/chosen": 642116608.0, + "logits/rejected": 547629525.3333334, + "logps/chosen": -270.8551025390625, + "logps/rejected": -351.9025065104167, + "loss": 0.0098, + "rewards/chosen": 3.3355178833007812, + "rewards/margins": 12.21914545694987, + "rewards/rejected": -8.883627573649088, + "step": 8586 + }, + { + "epoch": 0.7845591594335313, + "grad_norm": 5.125, + "kl": 0.0, + "learning_rate": 1.1069665995340445e-06, + "logits/chosen": 550896025.6, + "logits/rejected": 601985621.3333334, + "logps/chosen": -199.763037109375, + "logps/rejected": -423.5404459635417, + "loss": 0.0248, + "rewards/chosen": 3.854638671875, + "rewards/margins": 12.071216201782226, + "rewards/rejected": -8.216577529907227, + "step": 8587 + }, + { + "epoch": 0.7846505253540429, + "grad_norm": 2.859375, + "kl": 0.0, + "learning_rate": 1.1060645234759836e-06, + "logits/chosen": 816242005.3333334, + "logits/rejected": 467550822.4, + "logps/chosen": -288.31154378255206, + "logps/rejected": -499.3771484375, + "loss": 0.0156, + "rewards/chosen": 3.179149309794108, + "rewards/margins": 15.765376726786295, + "rewards/rejected": -12.586227416992188, + "step": 8588 + }, + { + "epoch": 0.7847418912745546, + "grad_norm": 3.875, + "kl": 0.0, + "learning_rate": 1.1051627694103106e-06, + "logits/chosen": 799142758.4, + "logits/rejected": 439226325.3333333, + "logps/chosen": -312.615771484375, + "logps/rejected": -413.5760091145833, + "loss": 0.0331, + "rewards/chosen": 3.069224166870117, + "rewards/margins": 11.277485020955403, + "rewards/rejected": -8.208260854085287, + "step": 8589 + }, + { + "epoch": 0.7848332571950662, + "grad_norm": 4.46875, + "kl": 0.0, + "learning_rate": 1.1042613374115896e-06, + "logits/chosen": 855535872.0, + "logits/rejected": 606698752.0, + "logps/chosen": -251.77972412109375, + "logps/rejected": -710.5599365234375, + "loss": 0.0368, + "rewards/chosen": 3.357643445332845, + "rewards/margins": 13.875905354817709, + "rewards/rejected": -10.518261909484863, + "step": 8590 + }, + { + "epoch": 0.7849246231155779, + "grad_norm": 2.9375, + "kl": 0.0, + "learning_rate": 1.1033602275543614e-06, + "logits/chosen": 668313386.6666666, + "logits/rejected": 732883648.0, + "logps/chosen": -337.8651123046875, + "logps/rejected": -733.3232421875, + "loss": 0.0245, + "rewards/chosen": 4.121969223022461, + "rewards/margins": 14.345006942749023, + "rewards/rejected": -10.223037719726562, + "step": 8591 + }, + { + "epoch": 0.7850159890360895, + "grad_norm": 0.1875, + "kl": 0.0, + "learning_rate": 1.1024594399131377e-06, + "logits/chosen": 424736928.0, + "logits/rejected": 996935021.7142857, + "logps/chosen": -334.4064025878906, + "logps/rejected": -618.8344029017857, + "loss": 0.0006, + "rewards/chosen": 5.53804349899292, + "rewards/margins": 15.20778226852417, + "rewards/rejected": -9.66973876953125, + "step": 8592 + }, + { + "epoch": 0.7851073549566012, + "grad_norm": 2.046875, + "kl": 0.0, + "learning_rate": 1.1015589745624079e-06, + "logits/chosen": 511294656.0, + "logits/rejected": 453108928.0, + "logps/chosen": -304.8594970703125, + "logps/rejected": -444.5477600097656, + "loss": 0.013, + "rewards/chosen": 3.8521862030029297, + "rewards/margins": 13.217114448547363, + "rewards/rejected": -9.364928245544434, + "step": 8593 + }, + { + "epoch": 0.7851987208771128, + "grad_norm": 3.65625, + "kl": 0.0, + "learning_rate": 1.1006588315766314e-06, + "logits/chosen": 1437788842.6666667, + "logits/rejected": 729628467.2, + "logps/chosen": -259.40631103515625, + "logps/rejected": -649.230078125, + "loss": 0.03, + "rewards/chosen": 2.67430845896403, + "rewards/margins": 12.949400011698405, + "rewards/rejected": -10.275091552734375, + "step": 8594 + }, + { + "epoch": 0.7852900867976245, + "grad_norm": 3.859375, + "kl": 0.0, + "learning_rate": 1.0997590110302413e-06, + "logits/chosen": 772322596.5714285, + "logits/rejected": 540473664.0, + "logps/chosen": -367.3101283482143, + "logps/rejected": -483.4674987792969, + "loss": 0.0298, + "rewards/chosen": 3.7831262860979353, + "rewards/margins": 11.564135483333043, + "rewards/rejected": -7.781009197235107, + "step": 8595 + }, + { + "epoch": 0.7853814527181361, + "grad_norm": 2.859375, + "kl": 0.0, + "learning_rate": 1.0988595129976444e-06, + "logits/chosen": 458778931.2, + "logits/rejected": 606950826.6666666, + "logps/chosen": -277.049658203125, + "logps/rejected": -546.9053141276041, + "loss": 0.0227, + "rewards/chosen": 3.6794387817382814, + "rewards/margins": 12.726588439941406, + "rewards/rejected": -9.047149658203125, + "step": 8596 + }, + { + "epoch": 0.7854728186386478, + "grad_norm": 37.5, + "kl": 0.0, + "learning_rate": 1.0979603375532194e-06, + "logits/chosen": 660735658.6666666, + "logits/rejected": 1097945600.0, + "logps/chosen": -433.4131266276042, + "logps/rejected": -953.5313110351562, + "loss": 0.0422, + "rewards/chosen": 3.6094512939453125, + "rewards/margins": 17.2286434173584, + "rewards/rejected": -13.619192123413086, + "step": 8597 + }, + { + "epoch": 0.7855641845591594, + "grad_norm": 5.21875, + "kl": 0.0, + "learning_rate": 1.097061484771323e-06, + "logits/chosen": 368211840.0, + "logits/rejected": 370156800.0, + "logps/chosen": -350.15850830078125, + "logps/rejected": -522.6292724609375, + "loss": 0.0323, + "rewards/chosen": 3.104275703430176, + "rewards/margins": 13.962499618530273, + "rewards/rejected": -10.858223915100098, + "step": 8598 + }, + { + "epoch": 0.7856555504796711, + "grad_norm": 2.234375, + "kl": 0.0, + "learning_rate": 1.0961629547262804e-06, + "logits/chosen": 554772928.0, + "logits/rejected": 912897728.0, + "logps/chosen": -370.1807861328125, + "logps/rejected": -325.0704650878906, + "loss": 0.0118, + "rewards/chosen": 4.320272445678711, + "rewards/margins": 12.00760555267334, + "rewards/rejected": -7.687333106994629, + "step": 8599 + }, + { + "epoch": 0.7857469164001827, + "grad_norm": 0.306640625, + "kl": 0.0, + "learning_rate": 1.095264747492391e-06, + "logits/chosen": 903152128.0, + "logits/rejected": 534365769.14285713, + "logps/chosen": -328.2796936035156, + "logps/rejected": -553.7926199776786, + "loss": 0.0007, + "rewards/chosen": 6.4629669189453125, + "rewards/margins": 16.327811104910715, + "rewards/rejected": -9.864844185965401, + "step": 8600 + }, + { + "epoch": 0.7858382823206944, + "grad_norm": 2.484375, + "kl": 0.0, + "learning_rate": 1.0943668631439274e-06, + "logits/chosen": 360173568.0, + "logits/rejected": 549823616.0, + "logps/chosen": -379.20013427734375, + "logps/rejected": -402.28692626953125, + "loss": 0.0118, + "rewards/chosen": 4.271435737609863, + "rewards/margins": 13.29118824005127, + "rewards/rejected": -9.019752502441406, + "step": 8601 + }, + { + "epoch": 0.785929648241206, + "grad_norm": 2.375, + "kl": 0.0, + "learning_rate": 1.0934693017551395e-06, + "logits/chosen": 590155417.6, + "logits/rejected": 339535744.0, + "logps/chosen": -395.521142578125, + "logps/rejected": -411.49609375, + "loss": 0.0174, + "rewards/chosen": 3.6053394317626952, + "rewards/margins": 13.867771530151368, + "rewards/rejected": -10.262432098388672, + "step": 8602 + }, + { + "epoch": 0.7860210141617177, + "grad_norm": 10.3125, + "kl": 0.0, + "learning_rate": 1.0925720634002456e-06, + "logits/chosen": 469956821.3333333, + "logits/rejected": 382149120.0, + "logps/chosen": -376.9966227213542, + "logps/rejected": -696.8260498046875, + "loss": 0.0173, + "rewards/chosen": 4.778535525004069, + "rewards/margins": 12.978456179300945, + "rewards/rejected": -8.199920654296875, + "step": 8603 + }, + { + "epoch": 0.7861123800822293, + "grad_norm": 0.91796875, + "kl": 0.0, + "learning_rate": 1.0916751481534389e-06, + "logits/chosen": 694041664.0, + "logits/rejected": 472236928.0, + "logps/chosen": -487.55035400390625, + "logps/rejected": -506.177734375, + "loss": 0.004, + "rewards/chosen": 4.984576225280762, + "rewards/margins": 16.050299644470215, + "rewards/rejected": -11.065723419189453, + "step": 8604 + }, + { + "epoch": 0.786203746002741, + "grad_norm": 2.796875, + "kl": 0.0, + "learning_rate": 1.0907785560888857e-06, + "logits/chosen": 574379776.0, + "logits/rejected": 758286144.0, + "logps/chosen": -262.021240234375, + "logps/rejected": -588.2086791992188, + "loss": 0.0173, + "rewards/chosen": 3.5888593196868896, + "rewards/margins": 16.654249906539917, + "rewards/rejected": -13.065390586853027, + "step": 8605 + }, + { + "epoch": 0.7862951119232526, + "grad_norm": 2.21875, + "kl": 0.0, + "learning_rate": 1.0898822872807264e-06, + "logits/chosen": 838646323.2, + "logits/rejected": 449495509.3333333, + "logps/chosen": -457.08857421875, + "logps/rejected": -452.8038736979167, + "loss": 0.0135, + "rewards/chosen": 4.029468154907226, + "rewards/margins": 14.35242551167806, + "rewards/rejected": -10.322957356770834, + "step": 8606 + }, + { + "epoch": 0.7863864778437643, + "grad_norm": 3.03125, + "kl": 0.0, + "learning_rate": 1.0889863418030738e-06, + "logits/chosen": 715749973.3333334, + "logits/rejected": 354490176.0, + "logps/chosen": -340.71103922526044, + "logps/rejected": -396.47503662109375, + "loss": 0.024, + "rewards/chosen": 4.119575500488281, + "rewards/margins": 13.879777908325195, + "rewards/rejected": -9.760202407836914, + "step": 8607 + }, + { + "epoch": 0.7864778437642759, + "grad_norm": 6.90625, + "kl": 0.0, + "learning_rate": 1.0880907197300144e-06, + "logits/chosen": 566772096.0, + "logits/rejected": 680943744.0, + "logps/chosen": -339.29461669921875, + "logps/rejected": -455.8292541503906, + "loss": 0.0185, + "rewards/chosen": 3.714019775390625, + "rewards/margins": 10.539912223815918, + "rewards/rejected": -6.825892448425293, + "step": 8608 + }, + { + "epoch": 0.7865692096847876, + "grad_norm": 2.84375, + "kl": 0.0, + "learning_rate": 1.0871954211356068e-06, + "logits/chosen": 1072162133.3333334, + "logits/rejected": 463773824.0, + "logps/chosen": -369.0961507161458, + "logps/rejected": -372.61065673828125, + "loss": 0.022, + "rewards/chosen": 3.7033093770345054, + "rewards/margins": 9.989190419514975, + "rewards/rejected": -6.285881042480469, + "step": 8609 + }, + { + "epoch": 0.7866605756052992, + "grad_norm": 3.484375, + "kl": 0.0, + "learning_rate": 1.0863004460938858e-06, + "logits/chosen": 530293504.0, + "logits/rejected": 621780096.0, + "logps/chosen": -309.5795084635417, + "logps/rejected": -577.8345336914062, + "loss": 0.0285, + "rewards/chosen": 3.628098805745443, + "rewards/margins": 13.409240086873373, + "rewards/rejected": -9.78114128112793, + "step": 8610 + }, + { + "epoch": 0.7867519415258108, + "grad_norm": 2.78125, + "kl": 0.0, + "learning_rate": 1.0854057946788572e-06, + "logits/chosen": 763322026.6666666, + "logits/rejected": 408830771.2, + "logps/chosen": -248.95817057291666, + "logps/rejected": -636.522998046875, + "loss": 0.0155, + "rewards/chosen": 3.438976287841797, + "rewards/margins": 13.385730743408203, + "rewards/rejected": -9.946754455566406, + "step": 8611 + }, + { + "epoch": 0.7868433074463225, + "grad_norm": 2.921875, + "kl": 0.0, + "learning_rate": 1.0845114669644995e-06, + "logits/chosen": 322424576.0, + "logits/rejected": 660360192.0, + "logps/chosen": -212.43032836914062, + "logps/rejected": -709.6541748046875, + "loss": 0.0153, + "rewards/chosen": 4.026071548461914, + "rewards/margins": 15.228364944458008, + "rewards/rejected": -11.202293395996094, + "step": 8612 + }, + { + "epoch": 0.7869346733668342, + "grad_norm": 2.34375, + "kl": 0.0, + "learning_rate": 1.0836174630247643e-06, + "logits/chosen": 1158082048.0, + "logits/rejected": 635148672.0, + "logps/chosen": -434.3343811035156, + "logps/rejected": -511.38800048828125, + "loss": 0.0168, + "rewards/chosen": 3.5477280616760254, + "rewards/margins": 12.962321758270264, + "rewards/rejected": -9.414593696594238, + "step": 8613 + }, + { + "epoch": 0.7870260392873458, + "grad_norm": 47.25, + "kl": 0.0, + "learning_rate": 1.0827237829335801e-06, + "logits/chosen": 577977856.0, + "logits/rejected": 393436352.0, + "logps/chosen": -503.38165283203125, + "logps/rejected": -350.73553466796875, + "loss": 0.1153, + "rewards/chosen": 3.6954903602600098, + "rewards/margins": 9.859696388244629, + "rewards/rejected": -6.164206027984619, + "step": 8614 + }, + { + "epoch": 0.7871174052078574, + "grad_norm": 1.25, + "kl": 0.0, + "learning_rate": 1.081830426764845e-06, + "logits/chosen": 792277376.0, + "logits/rejected": 472435968.0, + "logps/chosen": -201.35757446289062, + "logps/rejected": -597.024658203125, + "loss": 0.007, + "rewards/chosen": 3.817662239074707, + "rewards/margins": 12.841338793436686, + "rewards/rejected": -9.023676554361979, + "step": 8615 + }, + { + "epoch": 0.7872087711283691, + "grad_norm": 3.921875, + "kl": 0.0, + "learning_rate": 1.0809373945924307e-06, + "logits/chosen": 526997760.0, + "logits/rejected": 515722137.6, + "logps/chosen": -119.4339090983073, + "logps/rejected": -356.026025390625, + "loss": 0.0289, + "rewards/chosen": 3.141289393107096, + "rewards/margins": 12.856815210978189, + "rewards/rejected": -9.715525817871093, + "step": 8616 + }, + { + "epoch": 0.7873001370488808, + "grad_norm": 2.640625, + "kl": 0.0, + "learning_rate": 1.0800446864901832e-06, + "logits/chosen": 655589741.7142857, + "logits/rejected": 184228672.0, + "logps/chosen": -379.74473353794644, + "logps/rejected": -182.6083984375, + "loss": 0.0152, + "rewards/chosen": 4.413901192801339, + "rewards/margins": 12.951564652579172, + "rewards/rejected": -8.537663459777832, + "step": 8617 + }, + { + "epoch": 0.7873915029693924, + "grad_norm": 3.640625, + "kl": 0.0, + "learning_rate": 1.0791523025319189e-06, + "logits/chosen": 677307392.0, + "logits/rejected": 473494336.0, + "logps/chosen": -259.7764485677083, + "logps/rejected": -217.13601684570312, + "loss": 0.027, + "rewards/chosen": 3.80815060933431, + "rewards/margins": 10.905239423116049, + "rewards/rejected": -7.097088813781738, + "step": 8618 + }, + { + "epoch": 0.787482868889904, + "grad_norm": 2.578125, + "kl": 0.0, + "learning_rate": 1.078260242791434e-06, + "logits/chosen": 506216448.0, + "logits/rejected": 718512640.0, + "logps/chosen": -318.2948404947917, + "logps/rejected": -333.0780334472656, + "loss": 0.0154, + "rewards/chosen": 4.572690327962239, + "rewards/margins": 12.162857850392658, + "rewards/rejected": -7.59016752243042, + "step": 8619 + }, + { + "epoch": 0.7875742348104157, + "grad_norm": 1.96875, + "kl": 0.0, + "learning_rate": 1.0773685073424906e-06, + "logits/chosen": 443865120.0, + "logits/rejected": 859579562.6666666, + "logps/chosen": -258.52374267578125, + "logps/rejected": -785.869873046875, + "loss": 0.0071, + "rewards/chosen": 3.8029496669769287, + "rewards/margins": 15.390677531560263, + "rewards/rejected": -11.587727864583334, + "step": 8620 + }, + { + "epoch": 0.7876656007309274, + "grad_norm": 2.515625, + "kl": 0.0, + "learning_rate": 1.0764770962588278e-06, + "logits/chosen": 382464000.0, + "logits/rejected": 392491296.0, + "logps/chosen": -256.9469909667969, + "logps/rejected": -420.20172119140625, + "loss": 0.0155, + "rewards/chosen": 3.8161418437957764, + "rewards/margins": 12.671820402145386, + "rewards/rejected": -8.85567855834961, + "step": 8621 + }, + { + "epoch": 0.787756966651439, + "grad_norm": 3.53125, + "kl": 0.0, + "learning_rate": 1.0755860096141557e-06, + "logits/chosen": 652517376.0, + "logits/rejected": 311211488.0, + "logps/chosen": -380.78955078125, + "logps/rejected": -364.75408935546875, + "loss": 0.0258, + "rewards/chosen": 3.676640192667643, + "rewards/margins": 10.859071413675943, + "rewards/rejected": -7.182431221008301, + "step": 8622 + }, + { + "epoch": 0.7878483325719506, + "grad_norm": 2.359375, + "kl": 0.0, + "learning_rate": 1.0746952474821615e-06, + "logits/chosen": 509967513.6, + "logits/rejected": 511876352.0, + "logps/chosen": -434.399951171875, + "logps/rejected": -632.5528971354166, + "loss": 0.0136, + "rewards/chosen": 4.29992446899414, + "rewards/margins": 11.713268534342447, + "rewards/rejected": -7.413344065348308, + "step": 8623 + }, + { + "epoch": 0.7879396984924623, + "grad_norm": 2.6875, + "kl": 0.0, + "learning_rate": 1.0738048099365016e-06, + "logits/chosen": 283606357.3333333, + "logits/rejected": 366769075.2, + "logps/chosen": -105.60435994466145, + "logps/rejected": -486.811083984375, + "loss": 0.0213, + "rewards/chosen": 3.3246707916259766, + "rewards/margins": 13.255191421508789, + "rewards/rejected": -9.930520629882812, + "step": 8624 + }, + { + "epoch": 0.788031064412974, + "grad_norm": 1.7578125, + "kl": 0.0, + "learning_rate": 1.072914697050807e-06, + "logits/chosen": 632820736.0, + "logits/rejected": 583154346.6666666, + "logps/chosen": -469.97833251953125, + "logps/rejected": -715.1171875, + "loss": 0.0065, + "rewards/chosen": 4.055365085601807, + "rewards/margins": 13.737305800120035, + "rewards/rejected": -9.681940714518229, + "step": 8625 + }, + { + "epoch": 0.7881224303334856, + "grad_norm": 1.2890625, + "kl": 0.0, + "learning_rate": 1.0720249088986801e-06, + "logits/chosen": 328647296.0, + "logits/rejected": 517753504.0, + "logps/chosen": -225.65750122070312, + "logps/rejected": -537.56787109375, + "loss": 0.0109, + "rewards/chosen": 4.204544544219971, + "rewards/margins": 13.858675479888916, + "rewards/rejected": -9.654130935668945, + "step": 8626 + }, + { + "epoch": 0.7882137962539972, + "grad_norm": 1.5703125, + "kl": 0.0, + "learning_rate": 1.071135445553701e-06, + "logits/chosen": 550484992.0, + "logits/rejected": 455444576.0, + "logps/chosen": -397.54669189453125, + "logps/rejected": -609.34130859375, + "loss": 0.009, + "rewards/chosen": 4.221682548522949, + "rewards/margins": 17.23154067993164, + "rewards/rejected": -13.009858131408691, + "step": 8627 + }, + { + "epoch": 0.7883051621745089, + "grad_norm": 1.2109375, + "kl": 0.0, + "learning_rate": 1.0702463070894192e-06, + "logits/chosen": 488456832.0, + "logits/rejected": 320576768.0, + "logps/chosen": -221.52069091796875, + "logps/rejected": -436.420166015625, + "loss": 0.0048, + "rewards/chosen": 4.461199760437012, + "rewards/margins": 14.29654852549235, + "rewards/rejected": -9.835348765055338, + "step": 8628 + }, + { + "epoch": 0.7883965280950206, + "grad_norm": 1.1328125, + "kl": 0.0, + "learning_rate": 1.0693574935793571e-06, + "logits/chosen": 490455142.4, + "logits/rejected": 568751018.6666666, + "logps/chosen": -407.118408203125, + "logps/rejected": -575.0608317057291, + "loss": 0.006, + "rewards/chosen": 4.965557479858399, + "rewards/margins": 14.16112174987793, + "rewards/rejected": -9.195564270019531, + "step": 8629 + }, + { + "epoch": 0.7884878940155322, + "grad_norm": 1.484375, + "kl": 0.0, + "learning_rate": 1.0684690050970125e-06, + "logits/chosen": 750078272.0, + "logits/rejected": 312257664.0, + "logps/chosen": -491.63067626953125, + "logps/rejected": -524.656005859375, + "loss": 0.0081, + "rewards/chosen": 3.461778402328491, + "rewards/margins": 12.296347697575888, + "rewards/rejected": -8.834569295247396, + "step": 8630 + }, + { + "epoch": 0.7885792599360438, + "grad_norm": 12.875, + "kl": 0.0, + "learning_rate": 1.067580841715855e-06, + "logits/chosen": 563283648.0, + "logits/rejected": 837775872.0, + "logps/chosen": -697.502197265625, + "logps/rejected": -471.1974182128906, + "loss": 0.0698, + "rewards/chosen": 3.2541580200195312, + "rewards/margins": 10.436195373535156, + "rewards/rejected": -7.182037353515625, + "step": 8631 + }, + { + "epoch": 0.7886706258565555, + "grad_norm": 2.0625, + "kl": 0.0, + "learning_rate": 1.066693003509327e-06, + "logits/chosen": 1006430412.8, + "logits/rejected": 331752832.0, + "logps/chosen": -397.6513671875, + "logps/rejected": -382.659912109375, + "loss": 0.0141, + "rewards/chosen": 4.034381866455078, + "rewards/margins": 11.934834162394207, + "rewards/rejected": -7.900452295939128, + "step": 8632 + }, + { + "epoch": 0.7887619917770672, + "grad_norm": 1.2578125, + "kl": 0.0, + "learning_rate": 1.0658054905508446e-06, + "logits/chosen": 365627050.6666667, + "logits/rejected": 404644659.2, + "logps/chosen": -233.99825032552084, + "logps/rejected": -603.20595703125, + "loss": 0.0086, + "rewards/chosen": 4.085690498352051, + "rewards/margins": 15.379770088195801, + "rewards/rejected": -11.29407958984375, + "step": 8633 + }, + { + "epoch": 0.7888533576975788, + "grad_norm": 1.9140625, + "kl": 0.0, + "learning_rate": 1.0649183029137962e-06, + "logits/chosen": 341322944.0, + "logits/rejected": 499659136.0, + "logps/chosen": -419.0655212402344, + "logps/rejected": -380.6739501953125, + "loss": 0.0247, + "rewards/chosen": 3.4605727195739746, + "rewards/margins": 12.803255875905355, + "rewards/rejected": -9.34268315633138, + "step": 8634 + }, + { + "epoch": 0.7889447236180904, + "grad_norm": 2.484375, + "kl": 0.0, + "learning_rate": 1.0640314406715457e-06, + "logits/chosen": 587163904.0, + "logits/rejected": 512827596.8, + "logps/chosen": -340.6871744791667, + "logps/rejected": -494.47978515625, + "loss": 0.0117, + "rewards/chosen": 3.7914346059163413, + "rewards/margins": 14.407489903767905, + "rewards/rejected": -10.616055297851563, + "step": 8635 + }, + { + "epoch": 0.789036089538602, + "grad_norm": 1.7734375, + "kl": 0.0, + "learning_rate": 1.0631449038974274e-06, + "logits/chosen": 743691093.3333334, + "logits/rejected": 613993536.0, + "logps/chosen": -247.18990071614584, + "logps/rejected": -761.79931640625, + "loss": 0.0132, + "rewards/chosen": 4.567778905232747, + "rewards/margins": 15.77600320180257, + "rewards/rejected": -11.208224296569824, + "step": 8636 + }, + { + "epoch": 0.7891274554591138, + "grad_norm": 4.71875, + "kl": 0.0, + "learning_rate": 1.0622586926647498e-06, + "logits/chosen": 632964522.6666666, + "logits/rejected": 557216640.0, + "logps/chosen": -413.7510172526042, + "logps/rejected": -810.6312255859375, + "loss": 0.0395, + "rewards/chosen": 3.1122331619262695, + "rewards/margins": 12.728822708129883, + "rewards/rejected": -9.616589546203613, + "step": 8637 + }, + { + "epoch": 0.7892188213796254, + "grad_norm": 3.984375, + "kl": 0.0, + "learning_rate": 1.0613728070467944e-06, + "logits/chosen": 348102336.0, + "logits/rejected": 555938560.0, + "logps/chosen": -411.6445617675781, + "logps/rejected": -407.13446044921875, + "loss": 0.0203, + "rewards/chosen": 3.525261402130127, + "rewards/margins": 12.863440036773682, + "rewards/rejected": -9.338178634643555, + "step": 8638 + }, + { + "epoch": 0.789310187300137, + "grad_norm": 3.109375, + "kl": 0.0, + "learning_rate": 1.0604872471168144e-06, + "logits/chosen": 467363925.3333333, + "logits/rejected": 231437158.4, + "logps/chosen": -315.3863525390625, + "logps/rejected": -288.658740234375, + "loss": 0.0142, + "rewards/chosen": 3.432413419087728, + "rewards/margins": 11.234938748677571, + "rewards/rejected": -7.802525329589844, + "step": 8639 + }, + { + "epoch": 0.7894015532206486, + "grad_norm": 1.328125, + "kl": 0.0, + "learning_rate": 1.05960201294804e-06, + "logits/chosen": 384190624.0, + "logits/rejected": 640825216.0, + "logps/chosen": -295.4031677246094, + "logps/rejected": -601.6660766601562, + "loss": 0.0071, + "rewards/chosen": 4.6004180908203125, + "rewards/margins": 14.130887031555176, + "rewards/rejected": -9.530468940734863, + "step": 8640 + }, + { + "epoch": 0.7894929191411604, + "grad_norm": 3.25, + "kl": 0.0, + "learning_rate": 1.0587171046136706e-06, + "logits/chosen": 561843520.0, + "logits/rejected": 570197248.0, + "logps/chosen": -464.12615966796875, + "logps/rejected": -361.83734130859375, + "loss": 0.0152, + "rewards/chosen": 3.726588487625122, + "rewards/margins": 11.800734758377075, + "rewards/rejected": -8.074146270751953, + "step": 8641 + }, + { + "epoch": 0.789584285061672, + "grad_norm": 2.359375, + "kl": 0.0, + "learning_rate": 1.0578325221868797e-06, + "logits/chosen": 980815701.3333334, + "logits/rejected": 636282688.0, + "logps/chosen": -360.0523274739583, + "logps/rejected": -513.4829711914062, + "loss": 0.0186, + "rewards/chosen": 3.8006979624430337, + "rewards/margins": 13.64659563700358, + "rewards/rejected": -9.845897674560547, + "step": 8642 + }, + { + "epoch": 0.7896756509821836, + "grad_norm": 2.953125, + "kl": 0.0, + "learning_rate": 1.0569482657408132e-06, + "logits/chosen": 480369066.6666667, + "logits/rejected": 373853312.0, + "logps/chosen": -184.3923543294271, + "logps/rejected": -509.4859619140625, + "loss": 0.0301, + "rewards/chosen": 3.662883758544922, + "rewards/margins": 15.801590919494629, + "rewards/rejected": -12.138707160949707, + "step": 8643 + }, + { + "epoch": 0.7897670169026952, + "grad_norm": 3.375, + "kl": 0.0, + "learning_rate": 1.0560643353485929e-06, + "logits/chosen": 364723370.6666667, + "logits/rejected": 367555328.0, + "logps/chosen": -213.53580729166666, + "logps/rejected": -476.1154296875, + "loss": 0.0176, + "rewards/chosen": 3.2133913040161133, + "rewards/margins": 10.509102058410644, + "rewards/rejected": -7.295710754394531, + "step": 8644 + }, + { + "epoch": 0.789858382823207, + "grad_norm": 1.4375, + "kl": 0.0, + "learning_rate": 1.0551807310833107e-06, + "logits/chosen": 720727756.8, + "logits/rejected": 369192277.3333333, + "logps/chosen": -311.4212890625, + "logps/rejected": -370.8384195963542, + "loss": 0.0115, + "rewards/chosen": 4.217160034179687, + "rewards/margins": 12.474524943033853, + "rewards/rejected": -8.257364908854166, + "step": 8645 + }, + { + "epoch": 0.7899497487437186, + "grad_norm": 2.125, + "kl": 0.0, + "learning_rate": 1.0542974530180327e-06, + "logits/chosen": 378462028.8, + "logits/rejected": 749778602.6666666, + "logps/chosen": -140.90631103515625, + "logps/rejected": -337.7049967447917, + "loss": 0.0196, + "rewards/chosen": 3.697953796386719, + "rewards/margins": 11.5410400390625, + "rewards/rejected": -7.843086242675781, + "step": 8646 + }, + { + "epoch": 0.7900411146642302, + "grad_norm": 2.640625, + "kl": 0.0, + "learning_rate": 1.0534145012257962e-06, + "logits/chosen": 474033024.0, + "logits/rejected": 471674240.0, + "logps/chosen": -228.9341278076172, + "logps/rejected": -412.0787658691406, + "loss": 0.0173, + "rewards/chosen": 3.681105136871338, + "rewards/margins": 12.8315110206604, + "rewards/rejected": -9.150405883789062, + "step": 8647 + }, + { + "epoch": 0.7901324805847418, + "grad_norm": 1.859375, + "kl": 0.0, + "learning_rate": 1.0525318757796155e-06, + "logits/chosen": 445370837.3333333, + "logits/rejected": 490963558.4, + "logps/chosen": -335.0249837239583, + "logps/rejected": -430.5232421875, + "loss": 0.0109, + "rewards/chosen": 3.888312339782715, + "rewards/margins": 12.096749687194825, + "rewards/rejected": -8.20843734741211, + "step": 8648 + }, + { + "epoch": 0.7902238465052536, + "grad_norm": 0.80078125, + "kl": 0.0, + "learning_rate": 1.0516495767524747e-06, + "logits/chosen": 516270944.0, + "logits/rejected": 693576533.3333334, + "logps/chosen": -334.3023681640625, + "logps/rejected": -493.2405192057292, + "loss": 0.0035, + "rewards/chosen": 4.627464294433594, + "rewards/margins": 13.560155232747396, + "rewards/rejected": -8.932690938313803, + "step": 8649 + }, + { + "epoch": 0.7903152124257652, + "grad_norm": 3.8125, + "kl": 0.0, + "learning_rate": 1.0507676042173321e-06, + "logits/chosen": 632017237.3333334, + "logits/rejected": 456541376.0, + "logps/chosen": -419.3019205729167, + "logps/rejected": -332.65826416015625, + "loss": 0.0272, + "rewards/chosen": 3.6974461873372397, + "rewards/margins": 13.487222035725912, + "rewards/rejected": -9.789775848388672, + "step": 8650 + }, + { + "epoch": 0.7904065783462768, + "grad_norm": 2.921875, + "kl": 0.0, + "learning_rate": 1.0498859582471176e-06, + "logits/chosen": 411060480.0, + "logits/rejected": 296626624.0, + "logps/chosen": -368.181689453125, + "logps/rejected": -388.3001302083333, + "loss": 0.0168, + "rewards/chosen": 4.32861213684082, + "rewards/margins": 13.883385467529298, + "rewards/rejected": -9.554773330688477, + "step": 8651 + }, + { + "epoch": 0.7904979442667884, + "grad_norm": 1.21875, + "kl": 0.0, + "learning_rate": 1.0490046389147352e-06, + "logits/chosen": 761089920.0, + "logits/rejected": 457905993.14285713, + "logps/chosen": -150.86904907226562, + "logps/rejected": -632.7103794642857, + "loss": 0.0051, + "rewards/chosen": 3.170275926589966, + "rewards/margins": 14.861438035964966, + "rewards/rejected": -11.691162109375, + "step": 8652 + }, + { + "epoch": 0.7905893101873002, + "grad_norm": 2.515625, + "kl": 0.0, + "learning_rate": 1.0481236462930628e-06, + "logits/chosen": 572339200.0, + "logits/rejected": 573304985.6, + "logps/chosen": -302.3276774088542, + "logps/rejected": -697.267724609375, + "loss": 0.023, + "rewards/chosen": 3.371697425842285, + "rewards/margins": 14.30461368560791, + "rewards/rejected": -10.932916259765625, + "step": 8653 + }, + { + "epoch": 0.7906806761078118, + "grad_norm": 2.703125, + "kl": 0.0, + "learning_rate": 1.0472429804549505e-06, + "logits/chosen": 382181952.0, + "logits/rejected": 522864480.0, + "logps/chosen": -243.99563598632812, + "logps/rejected": -471.0858459472656, + "loss": 0.0132, + "rewards/chosen": 4.245697498321533, + "rewards/margins": 12.464577198028564, + "rewards/rejected": -8.218879699707031, + "step": 8654 + }, + { + "epoch": 0.7907720420283234, + "grad_norm": 2.6875, + "kl": 0.0, + "learning_rate": 1.0463626414732203e-06, + "logits/chosen": 559571797.3333334, + "logits/rejected": 720950988.8, + "logps/chosen": -396.9795328776042, + "logps/rejected": -444.2326171875, + "loss": 0.0115, + "rewards/chosen": 4.250780423482259, + "rewards/margins": 12.648541959126788, + "rewards/rejected": -8.39776153564453, + "step": 8655 + }, + { + "epoch": 0.7908634079488351, + "grad_norm": 2.234375, + "kl": 0.0, + "learning_rate": 1.0454826294206672e-06, + "logits/chosen": 553132492.8, + "logits/rejected": 794484309.3333334, + "logps/chosen": -278.8221435546875, + "logps/rejected": -647.7124837239584, + "loss": 0.0169, + "rewards/chosen": 3.9157699584960937, + "rewards/margins": 14.250851058959961, + "rewards/rejected": -10.335081100463867, + "step": 8656 + }, + { + "epoch": 0.7909547738693468, + "grad_norm": 3.640625, + "kl": 0.0, + "learning_rate": 1.0446029443700638e-06, + "logits/chosen": 470910250.6666667, + "logits/rejected": 356882976.0, + "logps/chosen": -327.00714111328125, + "logps/rejected": -437.744873046875, + "loss": 0.0346, + "rewards/chosen": 3.4036553700764975, + "rewards/margins": 10.05436642964681, + "rewards/rejected": -6.6507110595703125, + "step": 8657 + }, + { + "epoch": 0.7910461397898584, + "grad_norm": 2.171875, + "kl": 0.0, + "learning_rate": 1.043723586394148e-06, + "logits/chosen": 712633770.6666666, + "logits/rejected": 967987712.0, + "logps/chosen": -389.188720703125, + "logps/rejected": -360.19805908203125, + "loss": 0.0111, + "rewards/chosen": 4.789833068847656, + "rewards/margins": 13.061458587646484, + "rewards/rejected": -8.271625518798828, + "step": 8658 + }, + { + "epoch": 0.79113750571037, + "grad_norm": 3.125, + "kl": 0.0, + "learning_rate": 1.0428445555656363e-06, + "logits/chosen": 760323584.0, + "logits/rejected": 553209600.0, + "logps/chosen": -590.3975423177084, + "logps/rejected": -631.931396484375, + "loss": 0.0143, + "rewards/chosen": 3.3125101725260415, + "rewards/margins": 13.929073588053384, + "rewards/rejected": -10.616563415527343, + "step": 8659 + }, + { + "epoch": 0.7912288716308817, + "grad_norm": 3.625, + "kl": 0.0, + "learning_rate": 1.041965851957214e-06, + "logits/chosen": 368343168.0, + "logits/rejected": 515102336.0, + "logps/chosen": -237.40057373046875, + "logps/rejected": -401.18072509765625, + "loss": 0.0184, + "rewards/chosen": 3.9383034706115723, + "rewards/margins": 12.920885562896729, + "rewards/rejected": -8.982582092285156, + "step": 8660 + }, + { + "epoch": 0.7913202375513934, + "grad_norm": 3.3125, + "kl": 0.0, + "learning_rate": 1.0410874756415456e-06, + "logits/chosen": 450001766.4, + "logits/rejected": 562816896.0, + "logps/chosen": -371.638037109375, + "logps/rejected": -667.8476969401041, + "loss": 0.0222, + "rewards/chosen": 3.988785171508789, + "rewards/margins": 12.232949956258139, + "rewards/rejected": -8.24416478474935, + "step": 8661 + }, + { + "epoch": 0.791411603471905, + "grad_norm": 1.8984375, + "kl": 0.0, + "learning_rate": 1.0402094266912626e-06, + "logits/chosen": 546284672.0, + "logits/rejected": 414756064.0, + "logps/chosen": -275.90570068359375, + "logps/rejected": -381.2041015625, + "loss": 0.0128, + "rewards/chosen": 3.666837692260742, + "rewards/margins": 13.60460090637207, + "rewards/rejected": -9.937763214111328, + "step": 8662 + }, + { + "epoch": 0.7915029693924166, + "grad_norm": 2.53125, + "kl": 0.0, + "learning_rate": 1.0393317051789714e-06, + "logits/chosen": 492767936.0, + "logits/rejected": 389722208.0, + "logps/chosen": -353.3052062988281, + "logps/rejected": -340.553466796875, + "loss": 0.0153, + "rewards/chosen": 4.111536502838135, + "rewards/margins": 12.446035861968994, + "rewards/rejected": -8.33449935913086, + "step": 8663 + }, + { + "epoch": 0.7915943353129283, + "grad_norm": 30.0, + "kl": 0.0, + "learning_rate": 1.038454311177251e-06, + "logits/chosen": 863555925.3333334, + "logits/rejected": 455736217.6, + "logps/chosen": -465.0377604166667, + "logps/rejected": -364.161279296875, + "loss": 0.0347, + "rewards/chosen": 3.2604411443074546, + "rewards/margins": 12.724453290303549, + "rewards/rejected": -9.464012145996094, + "step": 8664 + }, + { + "epoch": 0.79168570123344, + "grad_norm": 8.5, + "kl": 0.0, + "learning_rate": 1.0375772447586557e-06, + "logits/chosen": 434731315.2, + "logits/rejected": 576382549.3333334, + "logps/chosen": -427.395166015625, + "logps/rejected": -683.510986328125, + "loss": 0.0444, + "rewards/chosen": 2.750037956237793, + "rewards/margins": 14.412579663594563, + "rewards/rejected": -11.662541707356771, + "step": 8665 + }, + { + "epoch": 0.7917770671539516, + "grad_norm": 3.3125, + "kl": 0.0, + "learning_rate": 1.0367005059957097e-06, + "logits/chosen": 665087040.0, + "logits/rejected": 699706752.0, + "logps/chosen": -547.4556884765625, + "logps/rejected": -587.5421752929688, + "loss": 0.0191, + "rewards/chosen": 3.321044921875, + "rewards/margins": 13.496670722961426, + "rewards/rejected": -10.175625801086426, + "step": 8666 + }, + { + "epoch": 0.7918684330744632, + "grad_norm": 3.09375, + "kl": 0.0, + "learning_rate": 1.0358240949609104e-06, + "logits/chosen": 530582720.0, + "logits/rejected": 670136789.3333334, + "logps/chosen": -275.5185852050781, + "logps/rejected": -611.7991536458334, + "loss": 0.0176, + "rewards/chosen": 2.6383895874023438, + "rewards/margins": 12.087940216064453, + "rewards/rejected": -9.44955062866211, + "step": 8667 + }, + { + "epoch": 0.7919597989949749, + "grad_norm": 3.328125, + "kl": 0.0, + "learning_rate": 1.034948011726729e-06, + "logits/chosen": 457659232.0, + "logits/rejected": 490224981.3333333, + "logps/chosen": -268.64556884765625, + "logps/rejected": -504.9018961588542, + "loss": 0.0165, + "rewards/chosen": 2.7141051292419434, + "rewards/margins": 12.593170960744223, + "rewards/rejected": -9.87906583150228, + "step": 8668 + }, + { + "epoch": 0.7920511649154865, + "grad_norm": 38.25, + "kl": 0.0, + "learning_rate": 1.0340722563656109e-06, + "logits/chosen": 662540928.0, + "logits/rejected": 795851776.0, + "logps/chosen": -426.35546875, + "logps/rejected": -522.88720703125, + "loss": 0.0924, + "rewards/chosen": 2.770738124847412, + "rewards/margins": 11.600729783376059, + "rewards/rejected": -8.829991658528646, + "step": 8669 + }, + { + "epoch": 0.7921425308359982, + "grad_norm": 2.484375, + "kl": 0.0, + "learning_rate": 1.0331968289499723e-06, + "logits/chosen": 465536716.8, + "logits/rejected": 245916885.33333334, + "logps/chosen": -313.006884765625, + "logps/rejected": -465.453369140625, + "loss": 0.0153, + "rewards/chosen": 4.0970458984375, + "rewards/margins": 15.210519154866537, + "rewards/rejected": -11.113473256429037, + "step": 8670 + }, + { + "epoch": 0.7922338967565098, + "grad_norm": 2.375, + "kl": 0.0, + "learning_rate": 1.0323217295522026e-06, + "logits/chosen": 740204288.0, + "logits/rejected": 571223082.6666666, + "logps/chosen": -226.74652099609375, + "logps/rejected": -398.3461100260417, + "loss": 0.0195, + "rewards/chosen": 3.293889045715332, + "rewards/margins": 12.522220929463705, + "rewards/rejected": -9.228331883748373, + "step": 8671 + }, + { + "epoch": 0.7923252626770215, + "grad_norm": 2.296875, + "kl": 0.0, + "learning_rate": 1.0314469582446645e-06, + "logits/chosen": 438960469.3333333, + "logits/rejected": 334757632.0, + "logps/chosen": -365.7367350260417, + "logps/rejected": -476.1125, + "loss": 0.0126, + "rewards/chosen": 3.82320245107015, + "rewards/margins": 13.82436440785726, + "rewards/rejected": -10.00116195678711, + "step": 8672 + }, + { + "epoch": 0.7924166285975331, + "grad_norm": 1.765625, + "kl": 0.0, + "learning_rate": 1.030572515099692e-06, + "logits/chosen": 685283157.3333334, + "logits/rejected": 561260697.6, + "logps/chosen": -388.9205322265625, + "logps/rejected": -667.41064453125, + "loss": 0.0091, + "rewards/chosen": 3.9825929005940757, + "rewards/margins": 13.68289655049642, + "rewards/rejected": -9.700303649902343, + "step": 8673 + }, + { + "epoch": 0.7925079945180448, + "grad_norm": 2.0625, + "kl": 0.0, + "learning_rate": 1.0296984001895964e-06, + "logits/chosen": 712659148.8, + "logits/rejected": 302121941.3333333, + "logps/chosen": -428.3763671875, + "logps/rejected": -622.1134847005209, + "loss": 0.0124, + "rewards/chosen": 4.395683670043946, + "rewards/margins": 15.107738876342774, + "rewards/rejected": -10.712055206298828, + "step": 8674 + }, + { + "epoch": 0.7925993604385564, + "grad_norm": 1.359375, + "kl": 0.0, + "learning_rate": 1.028824613586658e-06, + "logits/chosen": 229169552.0, + "logits/rejected": 336439040.0, + "logps/chosen": -247.30343627929688, + "logps/rejected": -478.6102818080357, + "loss": 0.006, + "rewards/chosen": 3.0574920177459717, + "rewards/margins": 12.890471492494855, + "rewards/rejected": -9.832979474748884, + "step": 8675 + }, + { + "epoch": 0.7926907263590681, + "grad_norm": 2.90625, + "kl": 0.0, + "learning_rate": 1.0279511553631295e-06, + "logits/chosen": 480715827.2, + "logits/rejected": 376028117.3333333, + "logps/chosen": -295.5242431640625, + "logps/rejected": -357.7821858723958, + "loss": 0.0218, + "rewards/chosen": 3.7163711547851563, + "rewards/margins": 13.499275461832681, + "rewards/rejected": -9.782904307047525, + "step": 8676 + }, + { + "epoch": 0.7927820922795797, + "grad_norm": 0.53125, + "kl": 0.0, + "learning_rate": 1.0270780255912378e-06, + "logits/chosen": 416444928.0, + "logits/rejected": 695949738.6666666, + "logps/chosen": -374.503662109375, + "logps/rejected": -560.3492838541666, + "loss": 0.0024, + "rewards/chosen": 4.8135881423950195, + "rewards/margins": 13.48729165395101, + "rewards/rejected": -8.67370351155599, + "step": 8677 + }, + { + "epoch": 0.7928734582000914, + "grad_norm": 2.0625, + "kl": 0.0, + "learning_rate": 1.0262052243431846e-06, + "logits/chosen": 475968921.6, + "logits/rejected": 784434176.0, + "logps/chosen": -274.872509765625, + "logps/rejected": -731.9102376302084, + "loss": 0.0168, + "rewards/chosen": 4.099995422363281, + "rewards/margins": 13.056916681925454, + "rewards/rejected": -8.956921259562174, + "step": 8678 + }, + { + "epoch": 0.792964824120603, + "grad_norm": 11.3125, + "kl": 0.0, + "learning_rate": 1.025332751691141e-06, + "logits/chosen": 1432446976.0, + "logits/rejected": 808792371.2, + "logps/chosen": -486.5319417317708, + "logps/rejected": -377.67314453125, + "loss": 0.0235, + "rewards/chosen": 3.672268549601237, + "rewards/margins": 10.608927790323893, + "rewards/rejected": -6.936659240722657, + "step": 8679 + }, + { + "epoch": 0.7930561900411147, + "grad_norm": 2.25, + "kl": 0.0, + "learning_rate": 1.0244606077072533e-06, + "logits/chosen": 1036873792.0, + "logits/rejected": 747826496.0, + "logps/chosen": -279.84747314453125, + "logps/rejected": -592.906005859375, + "loss": 0.0147, + "rewards/chosen": 3.601863145828247, + "rewards/margins": 13.300736665725708, + "rewards/rejected": -9.698873519897461, + "step": 8680 + }, + { + "epoch": 0.7931475559616263, + "grad_norm": 2.5, + "kl": 0.0, + "learning_rate": 1.023588792463639e-06, + "logits/chosen": 454263091.2, + "logits/rejected": 984739242.6666666, + "logps/chosen": -255.0214599609375, + "logps/rejected": -503.7089029947917, + "loss": 0.0197, + "rewards/chosen": 3.7579456329345704, + "rewards/margins": 13.849113082885742, + "rewards/rejected": -10.091167449951172, + "step": 8681 + }, + { + "epoch": 0.793238921882138, + "grad_norm": 12.625, + "kl": 0.0, + "learning_rate": 1.0227173060323874e-06, + "logits/chosen": 427053977.6, + "logits/rejected": 559888725.3333334, + "logps/chosen": -226.92900390625, + "logps/rejected": -657.3412679036459, + "loss": 0.0232, + "rewards/chosen": 3.9452919006347655, + "rewards/margins": 10.249388122558594, + "rewards/rejected": -6.304096221923828, + "step": 8682 + }, + { + "epoch": 0.7933302878026496, + "grad_norm": 2.1875, + "kl": 0.0, + "learning_rate": 1.0218461484855663e-06, + "logits/chosen": 490424320.0, + "logits/rejected": 486527296.0, + "logps/chosen": -241.6981201171875, + "logps/rejected": -619.1844482421875, + "loss": 0.0174, + "rewards/chosen": 4.417599042256673, + "rewards/margins": 14.725128491719563, + "rewards/rejected": -10.30752944946289, + "step": 8683 + }, + { + "epoch": 0.7934216537231613, + "grad_norm": 5.5, + "kl": 0.0, + "learning_rate": 1.0209753198952111e-06, + "logits/chosen": 755886080.0, + "logits/rejected": 588137216.0, + "logps/chosen": -418.9399820963542, + "logps/rejected": -486.6419982910156, + "loss": 0.0364, + "rewards/chosen": 3.4915520350138345, + "rewards/margins": 14.80886427561442, + "rewards/rejected": -11.317312240600586, + "step": 8684 + }, + { + "epoch": 0.7935130196436729, + "grad_norm": 4.46875, + "kl": 0.0, + "learning_rate": 1.0201048203333281e-06, + "logits/chosen": 908448128.0, + "logits/rejected": 590075328.0, + "logps/chosen": -386.28900146484375, + "logps/rejected": -318.437744140625, + "loss": 0.1207, + "rewards/chosen": 3.1766107082366943, + "rewards/margins": 9.131503343582153, + "rewards/rejected": -5.954892635345459, + "step": 8685 + }, + { + "epoch": 0.7936043855641846, + "grad_norm": 29.0, + "kl": 0.0, + "learning_rate": 1.019234649871903e-06, + "logits/chosen": 592351616.0, + "logits/rejected": 427233728.0, + "logps/chosen": -315.5977478027344, + "logps/rejected": -598.4830322265625, + "loss": 0.0548, + "rewards/chosen": 2.6393001079559326, + "rewards/margins": 11.36447787284851, + "rewards/rejected": -8.725177764892578, + "step": 8686 + }, + { + "epoch": 0.7936957514846962, + "grad_norm": 3.3125, + "kl": 0.0, + "learning_rate": 1.01836480858289e-06, + "logits/chosen": 784251306.6666666, + "logits/rejected": 510254912.0, + "logps/chosen": -334.9356282552083, + "logps/rejected": -485.9989318847656, + "loss": 0.0256, + "rewards/chosen": 3.5237064361572266, + "rewards/margins": 13.367721557617188, + "rewards/rejected": -9.844015121459961, + "step": 8687 + }, + { + "epoch": 0.7937871174052079, + "grad_norm": 1.6171875, + "kl": 0.0, + "learning_rate": 1.0174952965382167e-06, + "logits/chosen": 401643904.0, + "logits/rejected": 471869525.3333333, + "logps/chosen": -246.2421875, + "logps/rejected": -398.6261800130208, + "loss": 0.0117, + "rewards/chosen": 4.200158309936524, + "rewards/margins": 12.982805760701499, + "rewards/rejected": -8.782647450764975, + "step": 8688 + }, + { + "epoch": 0.7938784833257195, + "grad_norm": 2.890625, + "kl": 0.0, + "learning_rate": 1.0166261138097822e-06, + "logits/chosen": 650724556.8, + "logits/rejected": 989724501.3333334, + "logps/chosen": -210.642333984375, + "logps/rejected": -894.33154296875, + "loss": 0.0249, + "rewards/chosen": 3.9196212768554686, + "rewards/margins": 17.430064646402993, + "rewards/rejected": -13.510443369547525, + "step": 8689 + }, + { + "epoch": 0.7939698492462312, + "grad_norm": 4.90625, + "kl": 0.0, + "learning_rate": 1.0157572604694626e-06, + "logits/chosen": 762748074.6666666, + "logits/rejected": 440978752.0, + "logps/chosen": -385.7913004557292, + "logps/rejected": -455.77276611328125, + "loss": 0.0343, + "rewards/chosen": 3.3526792526245117, + "rewards/margins": 12.050671577453613, + "rewards/rejected": -8.697992324829102, + "step": 8690 + }, + { + "epoch": 0.7940612151667428, + "grad_norm": 4.0625, + "kl": 0.0, + "learning_rate": 1.0148887365891025e-06, + "logits/chosen": 500927808.0, + "logits/rejected": 297058592.0, + "logps/chosen": -546.6392822265625, + "logps/rejected": -408.09259033203125, + "loss": 0.0189, + "rewards/chosen": 3.5101027488708496, + "rewards/margins": 13.705230236053467, + "rewards/rejected": -10.195127487182617, + "step": 8691 + }, + { + "epoch": 0.7941525810872545, + "grad_norm": 1.84375, + "kl": 0.0, + "learning_rate": 1.0140205422405213e-06, + "logits/chosen": 536457676.8, + "logits/rejected": 600342954.6666666, + "logps/chosen": -427.275, + "logps/rejected": -309.42311604817706, + "loss": 0.0104, + "rewards/chosen": 4.602679443359375, + "rewards/margins": 12.994955571492515, + "rewards/rejected": -8.392276128133139, + "step": 8692 + }, + { + "epoch": 0.7942439470077661, + "grad_norm": 4.0625, + "kl": 0.0, + "learning_rate": 1.0131526774955103e-06, + "logits/chosen": 400338432.0, + "logits/rejected": 274812629.3333333, + "logps/chosen": -352.0902587890625, + "logps/rejected": -317.23016357421875, + "loss": 0.0216, + "rewards/chosen": 4.160678100585938, + "rewards/margins": 12.859610748291015, + "rewards/rejected": -8.698932647705078, + "step": 8693 + }, + { + "epoch": 0.7943353129282777, + "grad_norm": 0.578125, + "kl": 0.0, + "learning_rate": 1.0122851424258328e-06, + "logits/chosen": 766083328.0, + "logits/rejected": 510712649.14285713, + "logps/chosen": -593.9751586914062, + "logps/rejected": -413.005859375, + "loss": 0.0022, + "rewards/chosen": 4.490307807922363, + "rewards/margins": 13.172376223972865, + "rewards/rejected": -8.682068416050502, + "step": 8694 + }, + { + "epoch": 0.7944266788487894, + "grad_norm": 14.625, + "kl": 0.0, + "learning_rate": 1.0114179371032278e-06, + "logits/chosen": 472568268.8, + "logits/rejected": 345900842.6666667, + "logps/chosen": -276.181396484375, + "logps/rejected": -371.1759440104167, + "loss": 0.0217, + "rewards/chosen": 4.033364105224609, + "rewards/margins": 11.989534505208333, + "rewards/rejected": -7.956170399983724, + "step": 8695 + }, + { + "epoch": 0.7945180447693011, + "grad_norm": 0.625, + "kl": 0.0, + "learning_rate": 1.0105510615994051e-06, + "logits/chosen": 1183177472.0, + "logits/rejected": 490945462.85714287, + "logps/chosen": -142.1919403076172, + "logps/rejected": -384.68314034598217, + "loss": 0.0029, + "rewards/chosen": 3.847834825515747, + "rewards/margins": 13.718960183007377, + "rewards/rejected": -9.87112535749163, + "step": 8696 + }, + { + "epoch": 0.7946094106898127, + "grad_norm": 2.890625, + "kl": 0.0, + "learning_rate": 1.0096845159860464e-06, + "logits/chosen": 266574131.2, + "logits/rejected": 304710485.3333333, + "logps/chosen": -177.40947265625, + "logps/rejected": -449.5368245442708, + "loss": 0.0261, + "rewards/chosen": 3.5781570434570313, + "rewards/margins": 15.385140991210937, + "rewards/rejected": -11.806983947753906, + "step": 8697 + }, + { + "epoch": 0.7947007766103243, + "grad_norm": 3.78125, + "kl": 0.0, + "learning_rate": 1.008818300334805e-06, + "logits/chosen": 462830421.3333333, + "logits/rejected": 352828096.0, + "logps/chosen": -330.57216389973956, + "logps/rejected": -284.52032470703125, + "loss": 0.0303, + "rewards/chosen": 3.2501633961995444, + "rewards/margins": 12.725249608357748, + "rewards/rejected": -9.475086212158203, + "step": 8698 + }, + { + "epoch": 0.794792142530836, + "grad_norm": 4.46875, + "kl": 0.0, + "learning_rate": 1.0079524147173131e-06, + "logits/chosen": 719058176.0, + "logits/rejected": 345560021.3333333, + "logps/chosen": -432.83017578125, + "logps/rejected": -533.1434733072916, + "loss": 0.0353, + "rewards/chosen": 3.1042932510375976, + "rewards/margins": 14.107833290100098, + "rewards/rejected": -11.0035400390625, + "step": 8699 + }, + { + "epoch": 0.7948835084513477, + "grad_norm": 2.046875, + "kl": 0.0, + "learning_rate": 1.007086859205169e-06, + "logits/chosen": 331995840.0, + "logits/rejected": 537425100.8, + "logps/chosen": -206.96451822916666, + "logps/rejected": -529.5427734375, + "loss": 0.0105, + "rewards/chosen": 4.570811589558919, + "rewards/margins": 12.560330327351888, + "rewards/rejected": -7.989518737792968, + "step": 8700 + }, + { + "epoch": 0.7949748743718593, + "grad_norm": 2.28125, + "kl": 0.0, + "learning_rate": 1.0062216338699471e-06, + "logits/chosen": 332756070.4, + "logits/rejected": 506329514.6666667, + "logps/chosen": -220.795263671875, + "logps/rejected": -460.3506673177083, + "loss": 0.0226, + "rewards/chosen": 4.527878952026367, + "rewards/margins": 13.978446833292644, + "rewards/rejected": -9.450567881266275, + "step": 8701 + }, + { + "epoch": 0.7950662402923709, + "grad_norm": 5.34375, + "kl": 0.0, + "learning_rate": 1.0053567387831924e-06, + "logits/chosen": 421941952.0, + "logits/rejected": 310451904.0, + "logps/chosen": -272.1473083496094, + "logps/rejected": -483.850341796875, + "loss": 0.0152, + "rewards/chosen": 3.6055946350097656, + "rewards/margins": 13.184696197509766, + "rewards/rejected": -9.5791015625, + "step": 8702 + }, + { + "epoch": 0.7951576062128826, + "grad_norm": 2.03125, + "kl": 0.0, + "learning_rate": 1.0044921740164232e-06, + "logits/chosen": 349685152.0, + "logits/rejected": 834752000.0, + "logps/chosen": -127.41940307617188, + "logps/rejected": -610.1802978515625, + "loss": 0.017, + "rewards/chosen": 3.774595260620117, + "rewards/margins": 14.174938201904297, + "rewards/rejected": -10.40034294128418, + "step": 8703 + }, + { + "epoch": 0.7952489721333943, + "grad_norm": 4.4375, + "kl": 0.0, + "learning_rate": 1.003627939641133e-06, + "logits/chosen": 601329877.3333334, + "logits/rejected": 611575936.0, + "logps/chosen": -320.02956136067706, + "logps/rejected": -753.3980712890625, + "loss": 0.0301, + "rewards/chosen": 3.342548688252767, + "rewards/margins": 12.354284604390463, + "rewards/rejected": -9.011735916137695, + "step": 8704 + }, + { + "epoch": 0.7953403380539059, + "grad_norm": 3.59375, + "kl": 0.0, + "learning_rate": 1.0027640357287849e-06, + "logits/chosen": 948728149.3333334, + "logits/rejected": 425062323.2, + "logps/chosen": -362.6963297526042, + "logps/rejected": -467.317431640625, + "loss": 0.0224, + "rewards/chosen": 3.304202397664388, + "rewards/margins": 13.264520390828451, + "rewards/rejected": -9.960317993164063, + "step": 8705 + }, + { + "epoch": 0.7954317039744175, + "grad_norm": 1.0390625, + "kl": 0.0, + "learning_rate": 1.0019004623508161e-06, + "logits/chosen": 364545664.0, + "logits/rejected": 510826240.0, + "logps/chosen": -186.65052795410156, + "logps/rejected": -637.2667643229166, + "loss": 0.005, + "rewards/chosen": 4.011620044708252, + "rewards/margins": 14.224808851877848, + "rewards/rejected": -10.213188807169596, + "step": 8706 + }, + { + "epoch": 0.7955230698949292, + "grad_norm": 2.140625, + "kl": 0.0, + "learning_rate": 1.0010372195786345e-06, + "logits/chosen": 714271232.0, + "logits/rejected": 269314611.2, + "logps/chosen": -399.652587890625, + "logps/rejected": -339.244580078125, + "loss": 0.0124, + "rewards/chosen": 3.6967366536458335, + "rewards/margins": 11.550702412923178, + "rewards/rejected": -7.853965759277344, + "step": 8707 + }, + { + "epoch": 0.7956144358154409, + "grad_norm": 0.77734375, + "kl": 0.0, + "learning_rate": 1.0001743074836245e-06, + "logits/chosen": 503593152.0, + "logits/rejected": 278438976.0, + "logps/chosen": -360.0918273925781, + "logps/rejected": -421.60540771484375, + "loss": 0.0048, + "rewards/chosen": 4.8782958984375, + "rewards/margins": 12.411754131317139, + "rewards/rejected": -7.533458232879639, + "step": 8708 + }, + { + "epoch": 0.7957058017359525, + "grad_norm": 2.15625, + "kl": 0.0, + "learning_rate": 9.993117261371404e-07, + "logits/chosen": 586881126.4, + "logits/rejected": 366824874.6666667, + "logps/chosen": -316.9271484375, + "logps/rejected": -429.9530436197917, + "loss": 0.0163, + "rewards/chosen": 3.7915924072265623, + "rewards/margins": 14.263096618652344, + "rewards/rejected": -10.471504211425781, + "step": 8709 + }, + { + "epoch": 0.7957971676564641, + "grad_norm": 3.953125, + "kl": 0.0, + "learning_rate": 9.984494756105096e-07, + "logits/chosen": 337021760.0, + "logits/rejected": 452322918.4, + "logps/chosen": -375.0213216145833, + "logps/rejected": -519.654296875, + "loss": 0.0109, + "rewards/chosen": 4.300682067871094, + "rewards/margins": 13.352986907958984, + "rewards/rejected": -9.05230484008789, + "step": 8710 + }, + { + "epoch": 0.7958885335769758, + "grad_norm": 5.0, + "kl": 0.0, + "learning_rate": 9.97587555975032e-07, + "logits/chosen": 733286809.6, + "logits/rejected": 732789589.3333334, + "logps/chosen": -562.53115234375, + "logps/rejected": -427.4259033203125, + "loss": 0.0249, + "rewards/chosen": 3.836504364013672, + "rewards/margins": 13.06348139444987, + "rewards/rejected": -9.226977030436197, + "step": 8711 + }, + { + "epoch": 0.7959798994974875, + "grad_norm": 2.78125, + "kl": 0.0, + "learning_rate": 9.967259673019797e-07, + "logits/chosen": 432845312.0, + "logits/rejected": 327659050.6666667, + "logps/chosen": -280.838232421875, + "logps/rejected": -626.8932291666666, + "loss": 0.0219, + "rewards/chosen": 3.6285987854003907, + "rewards/margins": 17.333641306559244, + "rewards/rejected": -13.705042521158854, + "step": 8712 + }, + { + "epoch": 0.7960712654179991, + "grad_norm": 0.80078125, + "kl": 0.0, + "learning_rate": 9.958647096625994e-07, + "logits/chosen": 945618090.6666666, + "logits/rejected": 640222873.6, + "logps/chosen": -135.19866943359375, + "logps/rejected": -545.98466796875, + "loss": 0.0075, + "rewards/chosen": 3.978338877360026, + "rewards/margins": 13.98479258219401, + "rewards/rejected": -10.006453704833984, + "step": 8713 + }, + { + "epoch": 0.7961626313385107, + "grad_norm": 0.54296875, + "kl": 0.0, + "learning_rate": 9.950037831281085e-07, + "logits/chosen": 600375936.0, + "logits/rejected": 451991210.6666667, + "logps/chosen": -306.2960205078125, + "logps/rejected": -468.7648518880208, + "loss": 0.0024, + "rewards/chosen": 5.270975589752197, + "rewards/margins": 14.341208616892496, + "rewards/rejected": -9.070233027140299, + "step": 8714 + }, + { + "epoch": 0.7962539972590224, + "grad_norm": 3.65625, + "kl": 0.0, + "learning_rate": 9.941431877696955e-07, + "logits/chosen": 508644044.8, + "logits/rejected": 1196080554.6666667, + "logps/chosen": -234.48349609375, + "logps/rejected": -686.2639973958334, + "loss": 0.0279, + "rewards/chosen": 3.310761642456055, + "rewards/margins": 14.794160079956054, + "rewards/rejected": -11.4833984375, + "step": 8715 + }, + { + "epoch": 0.7963453631795341, + "grad_norm": 1.8671875, + "kl": 0.0, + "learning_rate": 9.932829236585277e-07, + "logits/chosen": 626659157.3333334, + "logits/rejected": 1307187200.0, + "logps/chosen": -405.8398030598958, + "logps/rejected": -1061.4183349609375, + "loss": 0.0128, + "rewards/chosen": 4.231701215108235, + "rewards/margins": 13.572738011678059, + "rewards/rejected": -9.341036796569824, + "step": 8716 + }, + { + "epoch": 0.7964367291000457, + "grad_norm": 28.75, + "kl": 0.0, + "learning_rate": 9.92422990865739e-07, + "logits/chosen": 561642788.5714285, + "logits/rejected": 591567232.0, + "logps/chosen": -234.48297991071428, + "logps/rejected": -431.27911376953125, + "loss": 0.1063, + "rewards/chosen": 3.902123042515346, + "rewards/margins": 3.1726583327565874, + "rewards/rejected": 0.7294647097587585, + "step": 8717 + }, + { + "epoch": 0.7965280950205573, + "grad_norm": 1.875, + "kl": 0.0, + "learning_rate": 9.915633894624383e-07, + "logits/chosen": 259528048.0, + "logits/rejected": 377191040.0, + "logps/chosen": -215.85244750976562, + "logps/rejected": -457.69329833984375, + "loss": 0.014, + "rewards/chosen": 4.067509651184082, + "rewards/margins": 14.244807243347168, + "rewards/rejected": -10.177297592163086, + "step": 8718 + }, + { + "epoch": 0.796619460941069, + "grad_norm": 1.375, + "kl": 0.0, + "learning_rate": 9.907041195197042e-07, + "logits/chosen": 505849728.0, + "logits/rejected": 438952576.0, + "logps/chosen": -365.2859191894531, + "logps/rejected": -425.3112386067708, + "loss": 0.0047, + "rewards/chosen": 4.145376682281494, + "rewards/margins": 14.503206729888916, + "rewards/rejected": -10.357830047607422, + "step": 8719 + }, + { + "epoch": 0.7967108268615807, + "grad_norm": 2.515625, + "kl": 0.0, + "learning_rate": 9.898451811085946e-07, + "logits/chosen": 501340074.6666667, + "logits/rejected": 468184166.4, + "logps/chosen": -356.3366292317708, + "logps/rejected": -495.47353515625, + "loss": 0.014, + "rewards/chosen": 3.360153834025065, + "rewards/margins": 11.173413721720378, + "rewards/rejected": -7.813259887695312, + "step": 8720 + }, + { + "epoch": 0.7968021927820923, + "grad_norm": 2.671875, + "kl": 0.0, + "learning_rate": 9.889865743001332e-07, + "logits/chosen": 454162602.6666667, + "logits/rejected": 659928627.2, + "logps/chosen": -314.06951904296875, + "logps/rejected": -445.85390625, + "loss": 0.0138, + "rewards/chosen": 3.997447967529297, + "rewards/margins": 11.667816925048829, + "rewards/rejected": -7.670368957519531, + "step": 8721 + }, + { + "epoch": 0.7968935587026039, + "grad_norm": 2.109375, + "kl": 0.0, + "learning_rate": 9.88128299165319e-07, + "logits/chosen": 306328992.0, + "logits/rejected": 633228032.0, + "logps/chosen": -395.6900634765625, + "logps/rejected": -450.9486083984375, + "loss": 0.0093, + "rewards/chosen": 3.3623077869415283, + "rewards/margins": 11.386587699254354, + "rewards/rejected": -8.024279912312826, + "step": 8722 + }, + { + "epoch": 0.7969849246231155, + "grad_norm": 2.015625, + "kl": 0.0, + "learning_rate": 9.87270355775124e-07, + "logits/chosen": 588870464.0, + "logits/rejected": 520685824.0, + "logps/chosen": -325.2497253417969, + "logps/rejected": -578.7021077473959, + "loss": 0.0078, + "rewards/chosen": 4.359701633453369, + "rewards/margins": 12.709398746490479, + "rewards/rejected": -8.34969711303711, + "step": 8723 + }, + { + "epoch": 0.7970762905436273, + "grad_norm": 3.5625, + "kl": 0.0, + "learning_rate": 9.864127442004906e-07, + "logits/chosen": 729349222.4, + "logits/rejected": 518559829.3333333, + "logps/chosen": -405.69296875, + "logps/rejected": -607.0296223958334, + "loss": 0.0205, + "rewards/chosen": 3.6388648986816405, + "rewards/margins": 15.993575795491537, + "rewards/rejected": -12.354710896809896, + "step": 8724 + }, + { + "epoch": 0.7971676564641389, + "grad_norm": 2.46875, + "kl": 0.0, + "learning_rate": 9.855554645123371e-07, + "logits/chosen": 649095125.3333334, + "logits/rejected": 641644480.0, + "logps/chosen": -409.8072102864583, + "logps/rejected": -604.8135986328125, + "loss": 0.0158, + "rewards/chosen": 4.331007639567058, + "rewards/margins": 13.941517512003582, + "rewards/rejected": -9.610509872436523, + "step": 8725 + }, + { + "epoch": 0.7972590223846505, + "grad_norm": 3.859375, + "kl": 0.0, + "learning_rate": 9.84698516781553e-07, + "logits/chosen": 633689984.0, + "logits/rejected": 731691264.0, + "logps/chosen": -360.82269287109375, + "logps/rejected": -741.3860473632812, + "loss": 0.0243, + "rewards/chosen": 3.1250081062316895, + "rewards/margins": 12.997255802154541, + "rewards/rejected": -9.872247695922852, + "step": 8726 + }, + { + "epoch": 0.7973503883051621, + "grad_norm": 2.171875, + "kl": 0.0, + "learning_rate": 9.838419010789984e-07, + "logits/chosen": 682119387.4285715, + "logits/rejected": 572904960.0, + "logps/chosen": -313.50980050223217, + "logps/rejected": -171.68423461914062, + "loss": 0.0228, + "rewards/chosen": 3.943978718348912, + "rewards/margins": 11.712152889796666, + "rewards/rejected": -7.768174171447754, + "step": 8727 + }, + { + "epoch": 0.7974417542256739, + "grad_norm": 1.8515625, + "kl": 0.0, + "learning_rate": 9.829856174755071e-07, + "logits/chosen": 414117930.6666667, + "logits/rejected": 163214208.0, + "logps/chosen": -270.8099772135417, + "logps/rejected": -251.4093475341797, + "loss": 0.0126, + "rewards/chosen": 4.379800478617351, + "rewards/margins": 12.726874987284344, + "rewards/rejected": -8.347074508666992, + "step": 8728 + }, + { + "epoch": 0.7975331201461855, + "grad_norm": 4.125, + "kl": 0.0, + "learning_rate": 9.821296660418878e-07, + "logits/chosen": 546964138.6666666, + "logits/rejected": 187354112.0, + "logps/chosen": -396.071044921875, + "logps/rejected": -449.2964782714844, + "loss": 0.0301, + "rewards/chosen": 3.5334459940592446, + "rewards/margins": 11.845995585123697, + "rewards/rejected": -8.312549591064453, + "step": 8729 + }, + { + "epoch": 0.7976244860666971, + "grad_norm": 1.4921875, + "kl": 0.0, + "learning_rate": 9.812740468489186e-07, + "logits/chosen": 251632656.0, + "logits/rejected": 459378752.0, + "logps/chosen": -223.7308349609375, + "logps/rejected": -580.4866943359375, + "loss": 0.0069, + "rewards/chosen": 4.980583190917969, + "rewards/margins": 15.39474105834961, + "rewards/rejected": -10.41415786743164, + "step": 8730 + }, + { + "epoch": 0.7977158519872087, + "grad_norm": 4.125, + "kl": 0.0, + "learning_rate": 9.804187599673514e-07, + "logits/chosen": 452353920.0, + "logits/rejected": 380105824.0, + "logps/chosen": -260.831787109375, + "logps/rejected": -374.3488464355469, + "loss": 0.0247, + "rewards/chosen": 3.8880016803741455, + "rewards/margins": 12.777021646499634, + "rewards/rejected": -8.889019966125488, + "step": 8731 + }, + { + "epoch": 0.7978072179077205, + "grad_norm": 4.1875, + "kl": 0.0, + "learning_rate": 9.795638054679097e-07, + "logits/chosen": 559237717.3333334, + "logits/rejected": 612419840.0, + "logps/chosen": -386.2059326171875, + "logps/rejected": -573.6959838867188, + "loss": 0.0308, + "rewards/chosen": 3.4695262908935547, + "rewards/margins": 13.931581497192383, + "rewards/rejected": -10.462055206298828, + "step": 8732 + }, + { + "epoch": 0.7978985838282321, + "grad_norm": 3.5625, + "kl": 0.0, + "learning_rate": 9.78709183421292e-07, + "logits/chosen": 487241813.3333333, + "logits/rejected": 494365536.0, + "logps/chosen": -296.5424397786458, + "logps/rejected": -487.8506774902344, + "loss": 0.0222, + "rewards/chosen": 3.8268470764160156, + "rewards/margins": 15.893532752990723, + "rewards/rejected": -12.066685676574707, + "step": 8733 + }, + { + "epoch": 0.7979899497487437, + "grad_norm": 2.5, + "kl": 0.0, + "learning_rate": 9.77854893898167e-07, + "logits/chosen": 248688533.33333334, + "logits/rejected": 486998528.0, + "logps/chosen": -170.63362630208334, + "logps/rejected": -459.228955078125, + "loss": 0.0088, + "rewards/chosen": 4.477399826049805, + "rewards/margins": 12.87386360168457, + "rewards/rejected": -8.396463775634766, + "step": 8734 + }, + { + "epoch": 0.7980813156692553, + "grad_norm": 1.390625, + "kl": 0.0, + "learning_rate": 9.770009369691763e-07, + "logits/chosen": 606285952.0, + "logits/rejected": 349996224.0, + "logps/chosen": -441.2422790527344, + "logps/rejected": -403.22674560546875, + "loss": 0.0084, + "rewards/chosen": 4.171391487121582, + "rewards/margins": 16.18702220916748, + "rewards/rejected": -12.015630722045898, + "step": 8735 + }, + { + "epoch": 0.7981726815897671, + "grad_norm": 2.8125, + "kl": 0.0, + "learning_rate": 9.761473127049336e-07, + "logits/chosen": 652745472.0, + "logits/rejected": 373436672.0, + "logps/chosen": -357.7131652832031, + "logps/rejected": -487.41827392578125, + "loss": 0.0151, + "rewards/chosen": 3.6926605701446533, + "rewards/margins": 14.488765001296997, + "rewards/rejected": -10.796104431152344, + "step": 8736 + }, + { + "epoch": 0.7982640475102787, + "grad_norm": 3.609375, + "kl": 0.0, + "learning_rate": 9.75294021176027e-07, + "logits/chosen": 440611737.6, + "logits/rejected": 709756202.6666666, + "logps/chosen": -297.9707763671875, + "logps/rejected": -651.9208577473959, + "loss": 0.0266, + "rewards/chosen": 3.6501235961914062, + "rewards/margins": 12.697373708089193, + "rewards/rejected": -9.047250111897787, + "step": 8737 + }, + { + "epoch": 0.7983554134307903, + "grad_norm": 3.828125, + "kl": 0.0, + "learning_rate": 9.744410624530148e-07, + "logits/chosen": 663443200.0, + "logits/rejected": 582660032.0, + "logps/chosen": -261.4237467447917, + "logps/rejected": -458.8257141113281, + "loss": 0.0342, + "rewards/chosen": 3.4979419708251953, + "rewards/margins": 13.304381370544434, + "rewards/rejected": -9.806439399719238, + "step": 8738 + }, + { + "epoch": 0.7984467793513019, + "grad_norm": 1.4609375, + "kl": 0.0, + "learning_rate": 9.73588436606429e-07, + "logits/chosen": 476191692.8, + "logits/rejected": 408233301.3333333, + "logps/chosen": -339.4924560546875, + "logps/rejected": -517.8437906901041, + "loss": 0.0082, + "rewards/chosen": 4.85246467590332, + "rewards/margins": 14.688900629679361, + "rewards/rejected": -9.836435953776041, + "step": 8739 + }, + { + "epoch": 0.7985381452718137, + "grad_norm": 2.484375, + "kl": 0.0, + "learning_rate": 9.72736143706773e-07, + "logits/chosen": 492054240.0, + "logits/rejected": 355105856.0, + "logps/chosen": -139.822509765625, + "logps/rejected": -435.593017578125, + "loss": 0.0177, + "rewards/chosen": 4.426207065582275, + "rewards/margins": 13.713315486907959, + "rewards/rejected": -9.287108421325684, + "step": 8740 + }, + { + "epoch": 0.7986295111923253, + "grad_norm": 3.359375, + "kl": 0.0, + "learning_rate": 9.718841838245257e-07, + "logits/chosen": 956104640.0, + "logits/rejected": 1466609920.0, + "logps/chosen": -239.15225219726562, + "logps/rejected": -678.9044189453125, + "loss": 0.0334, + "rewards/chosen": 3.022137403488159, + "rewards/margins": 13.825779676437378, + "rewards/rejected": -10.803642272949219, + "step": 8741 + }, + { + "epoch": 0.7987208771128369, + "grad_norm": 1.875, + "kl": 0.0, + "learning_rate": 9.710325570301348e-07, + "logits/chosen": 620217898.6666666, + "logits/rejected": 954174054.4, + "logps/chosen": -234.09708658854166, + "logps/rejected": -685.728173828125, + "loss": 0.0115, + "rewards/chosen": 3.981448491414388, + "rewards/margins": 12.617953618367514, + "rewards/rejected": -8.636505126953125, + "step": 8742 + }, + { + "epoch": 0.7988122430333485, + "grad_norm": 35.0, + "kl": 0.0, + "learning_rate": 9.701812633940227e-07, + "logits/chosen": 621580480.0, + "logits/rejected": 1079870208.0, + "logps/chosen": -244.36785888671875, + "logps/rejected": -408.3446044921875, + "loss": 0.0394, + "rewards/chosen": 2.6788930892944336, + "rewards/margins": 12.282122611999512, + "rewards/rejected": -9.603229522705078, + "step": 8743 + }, + { + "epoch": 0.7989036089538603, + "grad_norm": 86.0, + "kl": 0.0, + "learning_rate": 9.693303029865825e-07, + "logits/chosen": 445748480.0, + "logits/rejected": 659161344.0, + "logps/chosen": -256.1038513183594, + "logps/rejected": -557.6907958984375, + "loss": 0.0774, + "rewards/chosen": 3.08286452293396, + "rewards/margins": 13.296348333358765, + "rewards/rejected": -10.213483810424805, + "step": 8744 + }, + { + "epoch": 0.7989949748743719, + "grad_norm": 1.375, + "kl": 0.0, + "learning_rate": 9.6847967587818e-07, + "logits/chosen": 359598336.0, + "logits/rejected": 442938931.2, + "logps/chosen": -243.14701334635416, + "logps/rejected": -439.992578125, + "loss": 0.008, + "rewards/chosen": 4.085803985595703, + "rewards/margins": 15.766598510742188, + "rewards/rejected": -11.680794525146485, + "step": 8745 + }, + { + "epoch": 0.7990863407948835, + "grad_norm": 4.0, + "kl": 0.0, + "learning_rate": 9.676293821391568e-07, + "logits/chosen": 279581184.0, + "logits/rejected": 757084160.0, + "logps/chosen": -427.051416015625, + "logps/rejected": -904.5952962239584, + "loss": 0.0259, + "rewards/chosen": 3.1563447952270507, + "rewards/margins": 12.197176043192545, + "rewards/rejected": -9.040831247965494, + "step": 8746 + }, + { + "epoch": 0.7991777067153951, + "grad_norm": 0.1396484375, + "kl": 0.0, + "learning_rate": 9.667794218398236e-07, + "logits/chosen": 247095472.0, + "logits/rejected": 466590098.28571427, + "logps/chosen": -156.35076904296875, + "logps/rejected": -342.36171177455356, + "loss": 0.0006, + "rewards/chosen": 6.013858318328857, + "rewards/margins": 15.361702033451625, + "rewards/rejected": -9.347843715122767, + "step": 8747 + }, + { + "epoch": 0.7992690726359069, + "grad_norm": 2.796875, + "kl": 0.0, + "learning_rate": 9.659297950504632e-07, + "logits/chosen": 417342784.0, + "logits/rejected": 508240170.6666667, + "logps/chosen": -175.17352294921875, + "logps/rejected": -442.3674723307292, + "loss": 0.0116, + "rewards/chosen": 4.4110260009765625, + "rewards/margins": 13.435763676961264, + "rewards/rejected": -9.024737675984701, + "step": 8748 + }, + { + "epoch": 0.7993604385564185, + "grad_norm": 1.359375, + "kl": 0.0, + "learning_rate": 9.650805018413307e-07, + "logits/chosen": 860368768.0, + "logits/rejected": 484173866.6666667, + "logps/chosen": -432.09130859375, + "logps/rejected": -579.2418619791666, + "loss": 0.0061, + "rewards/chosen": 3.697619676589966, + "rewards/margins": 15.022269487380981, + "rewards/rejected": -11.324649810791016, + "step": 8749 + }, + { + "epoch": 0.7994518044769301, + "grad_norm": 2.375, + "kl": 0.0, + "learning_rate": 9.642315422826575e-07, + "logits/chosen": 463742272.0, + "logits/rejected": 431851648.0, + "logps/chosen": -233.80560302734375, + "logps/rejected": -376.037109375, + "loss": 0.0199, + "rewards/chosen": 3.381283760070801, + "rewards/margins": 11.153671741485596, + "rewards/rejected": -7.772387981414795, + "step": 8750 + }, + { + "epoch": 0.7995431703974417, + "grad_norm": 3.5625, + "kl": 0.0, + "learning_rate": 9.63382916444644e-07, + "logits/chosen": 420528384.0, + "logits/rejected": 260031744.0, + "logps/chosen": -337.66455078125, + "logps/rejected": -436.390234375, + "loss": 0.0217, + "rewards/chosen": 2.838807741800944, + "rewards/margins": 12.766782442728678, + "rewards/rejected": -9.927974700927734, + "step": 8751 + }, + { + "epoch": 0.7996345363179534, + "grad_norm": 1.9296875, + "kl": 0.0, + "learning_rate": 9.625346243974631e-07, + "logits/chosen": 610308710.4, + "logits/rejected": 1232826538.6666667, + "logps/chosen": -323.4810791015625, + "logps/rejected": -664.1372477213541, + "loss": 0.0128, + "rewards/chosen": 4.326633071899414, + "rewards/margins": 15.121928787231445, + "rewards/rejected": -10.795295715332031, + "step": 8752 + }, + { + "epoch": 0.7997259022384651, + "grad_norm": 5.59375, + "kl": 0.0, + "learning_rate": 9.616866662112606e-07, + "logits/chosen": 364889856.0, + "logits/rejected": 493482803.2, + "logps/chosen": -150.59379069010416, + "logps/rejected": -518.57216796875, + "loss": 0.0308, + "rewards/chosen": 3.396718978881836, + "rewards/margins": 13.919227981567383, + "rewards/rejected": -10.522509002685547, + "step": 8753 + }, + { + "epoch": 0.7998172681589767, + "grad_norm": 0.29296875, + "kl": 0.0, + "learning_rate": 9.608390419561537e-07, + "logits/chosen": 546910464.0, + "logits/rejected": 332969152.0, + "logps/chosen": -262.090576171875, + "logps/rejected": -425.1475423177083, + "loss": 0.0013, + "rewards/chosen": 5.952960968017578, + "rewards/margins": 15.424373626708984, + "rewards/rejected": -9.471412658691406, + "step": 8754 + }, + { + "epoch": 0.7999086340794883, + "grad_norm": 3.84375, + "kl": 0.0, + "learning_rate": 9.599917517022362e-07, + "logits/chosen": 706318122.6666666, + "logits/rejected": 524878643.2, + "logps/chosen": -504.5760904947917, + "logps/rejected": -596.2431640625, + "loss": 0.0201, + "rewards/chosen": 3.0800628662109375, + "rewards/margins": 11.134967041015624, + "rewards/rejected": -8.054904174804687, + "step": 8755 + }, + { + "epoch": 0.8, + "grad_norm": 2.328125, + "kl": 0.0, + "learning_rate": 9.591447955195688e-07, + "logits/chosen": 491775232.0, + "logits/rejected": 500113834.6666667, + "logps/chosen": -571.292822265625, + "logps/rejected": -567.003173828125, + "loss": 0.0161, + "rewards/chosen": 3.869597625732422, + "rewards/margins": 12.431360117594402, + "rewards/rejected": -8.561762491861979, + "step": 8756 + }, + { + "epoch": 0.8000913659205117, + "grad_norm": 0.77734375, + "kl": 0.0, + "learning_rate": 9.582981734781876e-07, + "logits/chosen": 677682432.0, + "logits/rejected": 622682496.0, + "logps/chosen": -513.7556762695312, + "logps/rejected": -522.8121948242188, + "loss": 0.005, + "rewards/chosen": 4.9460649490356445, + "rewards/margins": 13.287897109985352, + "rewards/rejected": -8.341832160949707, + "step": 8757 + }, + { + "epoch": 0.8001827318410233, + "grad_norm": 5.09375, + "kl": 0.0, + "learning_rate": 9.574518856480986e-07, + "logits/chosen": 809709994.6666666, + "logits/rejected": 851719424.0, + "logps/chosen": -184.74629720052084, + "logps/rejected": -344.718505859375, + "loss": 0.0282, + "rewards/chosen": 3.8710409800211587, + "rewards/margins": 11.402036348978678, + "rewards/rejected": -7.5309953689575195, + "step": 8758 + }, + { + "epoch": 0.8002740977615349, + "grad_norm": 1.6953125, + "kl": 0.0, + "learning_rate": 9.566059320992854e-07, + "logits/chosen": 481557888.0, + "logits/rejected": 558877312.0, + "logps/chosen": -248.5833282470703, + "logps/rejected": -456.29278564453125, + "loss": 0.009, + "rewards/chosen": 4.676508903503418, + "rewards/margins": 14.125052452087402, + "rewards/rejected": -9.448543548583984, + "step": 8759 + }, + { + "epoch": 0.8003654636820466, + "grad_norm": 1.8671875, + "kl": 0.0, + "learning_rate": 9.557603129016984e-07, + "logits/chosen": 488933760.0, + "logits/rejected": 701454336.0, + "logps/chosen": -403.7433776855469, + "logps/rejected": -520.705322265625, + "loss": 0.0097, + "rewards/chosen": 4.286964416503906, + "rewards/margins": 13.202644348144531, + "rewards/rejected": -8.915679931640625, + "step": 8760 + }, + { + "epoch": 0.8004568296025583, + "grad_norm": 1.796875, + "kl": 0.0, + "learning_rate": 9.549150281252633e-07, + "logits/chosen": 519271776.0, + "logits/rejected": 402181589.3333333, + "logps/chosen": -238.78118896484375, + "logps/rejected": -442.1370035807292, + "loss": 0.007, + "rewards/chosen": 4.122208595275879, + "rewards/margins": 14.551517804463705, + "rewards/rejected": -10.429309209187826, + "step": 8761 + }, + { + "epoch": 0.8005481955230699, + "grad_norm": 3.390625, + "kl": 0.0, + "learning_rate": 9.540700778398754e-07, + "logits/chosen": 545024614.4, + "logits/rejected": 554936448.0, + "logps/chosen": -283.4849609375, + "logps/rejected": -524.864013671875, + "loss": 0.0294, + "rewards/chosen": 3.305402374267578, + "rewards/margins": 12.460650380452474, + "rewards/rejected": -9.155248006184896, + "step": 8762 + }, + { + "epoch": 0.8006395614435815, + "grad_norm": 4.59375, + "kl": 0.0, + "learning_rate": 9.532254621154063e-07, + "logits/chosen": 423709738.6666667, + "logits/rejected": 250563248.0, + "logps/chosen": -194.10015869140625, + "logps/rejected": -515.957763671875, + "loss": 0.0526, + "rewards/chosen": 2.762864430745443, + "rewards/margins": 13.864505132039389, + "rewards/rejected": -11.101640701293945, + "step": 8763 + }, + { + "epoch": 0.8007309273640932, + "grad_norm": 0.65625, + "kl": 0.0, + "learning_rate": 9.523811810216993e-07, + "logits/chosen": 389939498.6666667, + "logits/rejected": 419529420.8, + "logps/chosen": -262.08290608723956, + "logps/rejected": -394.9171875, + "loss": 0.0033, + "rewards/chosen": 5.1676451365153, + "rewards/margins": 13.849476496378582, + "rewards/rejected": -8.681831359863281, + "step": 8764 + }, + { + "epoch": 0.8008222932846049, + "grad_norm": 5.90625, + "kl": 0.0, + "learning_rate": 9.51537234628565e-07, + "logits/chosen": 493269056.0, + "logits/rejected": 304785856.0, + "logps/chosen": -195.2037353515625, + "logps/rejected": -303.397705078125, + "loss": 0.0335, + "rewards/chosen": 3.8203582763671875, + "rewards/margins": 11.444910049438477, + "rewards/rejected": -7.624551773071289, + "step": 8765 + }, + { + "epoch": 0.8009136592051165, + "grad_norm": 2.3125, + "kl": 0.0, + "learning_rate": 9.506936230057906e-07, + "logits/chosen": 588349824.0, + "logits/rejected": 768159180.8, + "logps/chosen": -445.3153076171875, + "logps/rejected": -340.3389404296875, + "loss": 0.0143, + "rewards/chosen": 3.891139348347982, + "rewards/margins": 13.577124150594075, + "rewards/rejected": -9.685984802246093, + "step": 8766 + }, + { + "epoch": 0.8010050251256281, + "grad_norm": 2.15625, + "kl": 0.0, + "learning_rate": 9.498503462231374e-07, + "logits/chosen": 388082368.0, + "logits/rejected": 452446720.0, + "logps/chosen": -336.5224609375, + "logps/rejected": -404.7502136230469, + "loss": 0.0151, + "rewards/chosen": 3.914214849472046, + "rewards/margins": 13.068092107772827, + "rewards/rejected": -9.153877258300781, + "step": 8767 + }, + { + "epoch": 0.8010963910461398, + "grad_norm": 4.625, + "kl": 0.0, + "learning_rate": 9.490074043503344e-07, + "logits/chosen": 521926144.0, + "logits/rejected": 362250048.0, + "logps/chosen": -362.1929931640625, + "logps/rejected": -339.174560546875, + "loss": 0.0213, + "rewards/chosen": 3.3674099445343018, + "rewards/margins": 11.66078782081604, + "rewards/rejected": -8.293377876281738, + "step": 8768 + }, + { + "epoch": 0.8011877569666515, + "grad_norm": 8.5, + "kl": 0.0, + "learning_rate": 9.481647974570863e-07, + "logits/chosen": 511265792.0, + "logits/rejected": 491116586.6666667, + "logps/chosen": -336.166552734375, + "logps/rejected": -350.855224609375, + "loss": 0.0768, + "rewards/chosen": 4.450992584228516, + "rewards/margins": 9.813957214355469, + "rewards/rejected": -5.362964630126953, + "step": 8769 + }, + { + "epoch": 0.8012791228871631, + "grad_norm": 4.0, + "kl": 0.0, + "learning_rate": 9.473225256130663e-07, + "logits/chosen": 727991808.0, + "logits/rejected": 637214720.0, + "logps/chosen": -489.63193359375, + "logps/rejected": -351.2827962239583, + "loss": 0.017, + "rewards/chosen": 3.8081634521484373, + "rewards/margins": 13.673231252034505, + "rewards/rejected": -9.865067799886068, + "step": 8770 + }, + { + "epoch": 0.8013704888076747, + "grad_norm": 1.5703125, + "kl": 0.0, + "learning_rate": 9.464805888879264e-07, + "logits/chosen": 653286314.6666666, + "logits/rejected": 733937024.0, + "logps/chosen": -356.6473795572917, + "logps/rejected": -1291.9639892578125, + "loss": 0.0139, + "rewards/chosen": 4.227807362874349, + "rewards/margins": 16.86412461598714, + "rewards/rejected": -12.636317253112793, + "step": 8771 + }, + { + "epoch": 0.8014618547281864, + "grad_norm": 3.390625, + "kl": 0.0, + "learning_rate": 9.456389873512856e-07, + "logits/chosen": 501491763.2, + "logits/rejected": 680925269.3333334, + "logps/chosen": -375.581591796875, + "logps/rejected": -751.89453125, + "loss": 0.0224, + "rewards/chosen": 3.6802955627441407, + "rewards/margins": 14.547257995605468, + "rewards/rejected": -10.866962432861328, + "step": 8772 + }, + { + "epoch": 0.801553220648698, + "grad_norm": 2.0625, + "kl": 0.0, + "learning_rate": 9.447977210727355e-07, + "logits/chosen": 304520032.0, + "logits/rejected": 333335392.0, + "logps/chosen": -287.30859375, + "logps/rejected": -592.984375, + "loss": 0.0113, + "rewards/chosen": 3.987309217453003, + "rewards/margins": 15.981487035751343, + "rewards/rejected": -11.99417781829834, + "step": 8773 + }, + { + "epoch": 0.8016445865692097, + "grad_norm": 2.578125, + "kl": 0.0, + "learning_rate": 9.439567901218416e-07, + "logits/chosen": 654908096.0, + "logits/rejected": 587930752.0, + "logps/chosen": -490.02508544921875, + "logps/rejected": -724.6373291015625, + "loss": 0.0163, + "rewards/chosen": 3.593517303466797, + "rewards/margins": 13.130416870117188, + "rewards/rejected": -9.53689956665039, + "step": 8774 + }, + { + "epoch": 0.8017359524897213, + "grad_norm": 1.7421875, + "kl": 0.0, + "learning_rate": 9.431161945681394e-07, + "logits/chosen": 830579916.8, + "logits/rejected": 338271381.3333333, + "logps/chosen": -443.98349609375, + "logps/rejected": -415.916748046875, + "loss": 0.0106, + "rewards/chosen": 4.206108093261719, + "rewards/margins": 14.125896453857422, + "rewards/rejected": -9.919788360595703, + "step": 8775 + }, + { + "epoch": 0.801827318410233, + "grad_norm": 6.96875, + "kl": 0.0, + "learning_rate": 9.422759344811421e-07, + "logits/chosen": 623327436.8, + "logits/rejected": 679667114.6666666, + "logps/chosen": -473.9216796875, + "logps/rejected": -760.717041015625, + "loss": 0.0343, + "rewards/chosen": 3.370472717285156, + "rewards/margins": 13.48328094482422, + "rewards/rejected": -10.112808227539062, + "step": 8776 + }, + { + "epoch": 0.8019186843307446, + "grad_norm": 3.34375, + "kl": 0.0, + "learning_rate": 9.414360099303288e-07, + "logits/chosen": 650747187.2, + "logits/rejected": 476354560.0, + "logps/chosen": -311.0037353515625, + "logps/rejected": -592.1997884114584, + "loss": 0.0254, + "rewards/chosen": 3.377775192260742, + "rewards/margins": 13.14083162943522, + "rewards/rejected": -9.763056437174479, + "step": 8777 + }, + { + "epoch": 0.8020100502512563, + "grad_norm": 2.703125, + "kl": 0.0, + "learning_rate": 9.405964209851542e-07, + "logits/chosen": 420934144.0, + "logits/rejected": 285118058.6666667, + "logps/chosen": -377.6716796875, + "logps/rejected": -419.3554280598958, + "loss": 0.0163, + "rewards/chosen": 3.8179290771484373, + "rewards/margins": 13.89090805053711, + "rewards/rejected": -10.072978973388672, + "step": 8778 + }, + { + "epoch": 0.8021014161717679, + "grad_norm": 53.25, + "kl": 0.0, + "learning_rate": 9.397571677150435e-07, + "logits/chosen": 245292560.0, + "logits/rejected": 502379072.0, + "logps/chosen": -367.6546630859375, + "logps/rejected": -581.603515625, + "loss": 0.0982, + "rewards/chosen": 2.9455177783966064, + "rewards/margins": 11.702821016311646, + "rewards/rejected": -8.757303237915039, + "step": 8779 + }, + { + "epoch": 0.8021927820922796, + "grad_norm": 2.953125, + "kl": 0.0, + "learning_rate": 9.389182501893967e-07, + "logits/chosen": 523965747.2, + "logits/rejected": 271991424.0, + "logps/chosen": -263.1657958984375, + "logps/rejected": -510.0986328125, + "loss": 0.0305, + "rewards/chosen": 3.2189567565917967, + "rewards/margins": 13.66560567220052, + "rewards/rejected": -10.446648915608725, + "step": 8780 + }, + { + "epoch": 0.8022841480127912, + "grad_norm": 4.5625, + "kl": 0.0, + "learning_rate": 9.380796684775845e-07, + "logits/chosen": 533698340.5714286, + "logits/rejected": 1078083328.0, + "logps/chosen": -348.53445870535717, + "logps/rejected": -715.8175659179688, + "loss": 0.0334, + "rewards/chosen": 3.4777352469308034, + "rewards/margins": 12.759650911603655, + "rewards/rejected": -9.281915664672852, + "step": 8781 + }, + { + "epoch": 0.8023755139333029, + "grad_norm": 1.0, + "kl": 0.0, + "learning_rate": 9.372414226489485e-07, + "logits/chosen": 716648832.0, + "logits/rejected": 604203648.0, + "logps/chosen": -304.9755554199219, + "logps/rejected": -533.5538330078125, + "loss": 0.0062, + "rewards/chosen": 4.768924236297607, + "rewards/margins": 12.382229328155518, + "rewards/rejected": -7.61330509185791, + "step": 8782 + }, + { + "epoch": 0.8024668798538145, + "grad_norm": 2.96875, + "kl": 0.0, + "learning_rate": 9.364035127728038e-07, + "logits/chosen": 413336064.0, + "logits/rejected": 309455680.0, + "logps/chosen": -288.08001708984375, + "logps/rejected": -412.4617614746094, + "loss": 0.016, + "rewards/chosen": 4.016293525695801, + "rewards/margins": 13.320286750793457, + "rewards/rejected": -9.303993225097656, + "step": 8783 + }, + { + "epoch": 0.8025582457743262, + "grad_norm": 3.578125, + "kl": 0.0, + "learning_rate": 9.355659389184396e-07, + "logits/chosen": 467621952.0, + "logits/rejected": 620934314.6666666, + "logps/chosen": -401.0377197265625, + "logps/rejected": -553.3905436197916, + "loss": 0.0162, + "rewards/chosen": 2.7212822437286377, + "rewards/margins": 12.89997410774231, + "rewards/rejected": -10.178691864013672, + "step": 8784 + }, + { + "epoch": 0.8026496116948378, + "grad_norm": 2.15625, + "kl": 0.0, + "learning_rate": 9.347287011551142e-07, + "logits/chosen": 999751296.0, + "logits/rejected": 908305115.4285715, + "logps/chosen": -386.8531494140625, + "logps/rejected": -619.0288783482143, + "loss": 0.0082, + "rewards/chosen": 2.6777589321136475, + "rewards/margins": 13.733832802091326, + "rewards/rejected": -11.056073869977679, + "step": 8785 + }, + { + "epoch": 0.8027409776153495, + "grad_norm": 2.390625, + "kl": 0.0, + "learning_rate": 9.338917995520603e-07, + "logits/chosen": 380505728.0, + "logits/rejected": 600089472.0, + "logps/chosen": -188.0504353841146, + "logps/rejected": -799.6171264648438, + "loss": 0.0236, + "rewards/chosen": 3.7353175481160483, + "rewards/margins": 13.2721160252889, + "rewards/rejected": -9.536798477172852, + "step": 8786 + }, + { + "epoch": 0.8028323435358611, + "grad_norm": 4.5625, + "kl": 0.0, + "learning_rate": 9.330552341784805e-07, + "logits/chosen": 605248170.6666666, + "logits/rejected": 722855808.0, + "logps/chosen": -418.8699137369792, + "logps/rejected": -398.6845703125, + "loss": 0.0292, + "rewards/chosen": 3.6425259908040366, + "rewards/margins": 12.000140508015951, + "rewards/rejected": -8.357614517211914, + "step": 8787 + }, + { + "epoch": 0.8029237094563728, + "grad_norm": 1.59375, + "kl": 0.0, + "learning_rate": 9.322190051035512e-07, + "logits/chosen": 398629696.0, + "logits/rejected": 575832832.0, + "logps/chosen": -194.60308837890625, + "logps/rejected": -672.1888427734375, + "loss": 0.0105, + "rewards/chosen": 4.391107082366943, + "rewards/margins": 13.409696102142334, + "rewards/rejected": -9.01858901977539, + "step": 8788 + }, + { + "epoch": 0.8030150753768844, + "grad_norm": 0.79296875, + "kl": 0.0, + "learning_rate": 9.313831123964224e-07, + "logits/chosen": 632665536.0, + "logits/rejected": 385275538.28571427, + "logps/chosen": -129.3446044921875, + "logps/rejected": -418.27015904017856, + "loss": 0.0034, + "rewards/chosen": 3.6686768531799316, + "rewards/margins": 12.850821018218994, + "rewards/rejected": -9.182144165039062, + "step": 8789 + }, + { + "epoch": 0.8031064412973961, + "grad_norm": 2.625, + "kl": 0.0, + "learning_rate": 9.305475561262134e-07, + "logits/chosen": 548897484.8, + "logits/rejected": 608551594.6666666, + "logps/chosen": -296.2874267578125, + "logps/rejected": -557.9930826822916, + "loss": 0.0186, + "rewards/chosen": 3.931475067138672, + "rewards/margins": 13.876187388102213, + "rewards/rejected": -9.944712320963541, + "step": 8790 + }, + { + "epoch": 0.8031978072179077, + "grad_norm": 2.09375, + "kl": 0.0, + "learning_rate": 9.297123363620169e-07, + "logits/chosen": 630581952.0, + "logits/rejected": 399322496.0, + "logps/chosen": -296.4940185546875, + "logps/rejected": -487.83148193359375, + "loss": 0.0096, + "rewards/chosen": 4.523601531982422, + "rewards/margins": 15.023533821105957, + "rewards/rejected": -10.499932289123535, + "step": 8791 + }, + { + "epoch": 0.8032891731384194, + "grad_norm": 26.5, + "kl": 0.0, + "learning_rate": 9.288774531728989e-07, + "logits/chosen": 612426496.0, + "logits/rejected": 482220672.0, + "logps/chosen": -379.8827311197917, + "logps/rejected": -573.008056640625, + "loss": 0.0291, + "rewards/chosen": 3.7323923110961914, + "rewards/margins": 14.928301811218262, + "rewards/rejected": -11.19590950012207, + "step": 8792 + }, + { + "epoch": 0.803380539058931, + "grad_norm": 0.93359375, + "kl": 0.0, + "learning_rate": 9.280429066278951e-07, + "logits/chosen": 751476224.0, + "logits/rejected": 452600115.2, + "logps/chosen": -490.0155843098958, + "logps/rejected": -452.740380859375, + "loss": 0.0047, + "rewards/chosen": 4.737906773885091, + "rewards/margins": 13.301696904500325, + "rewards/rejected": -8.563790130615235, + "step": 8793 + }, + { + "epoch": 0.8034719049794427, + "grad_norm": 1.5, + "kl": 0.0, + "learning_rate": 9.272086967960159e-07, + "logits/chosen": 325896768.0, + "logits/rejected": 363437440.0, + "logps/chosen": -266.51239013671875, + "logps/rejected": -570.5616455078125, + "loss": 0.0187, + "rewards/chosen": 3.8708229064941406, + "rewards/margins": 15.836067199707031, + "rewards/rejected": -11.96524429321289, + "step": 8794 + }, + { + "epoch": 0.8035632708999543, + "grad_norm": 1.96875, + "kl": 0.0, + "learning_rate": 9.263748237462416e-07, + "logits/chosen": 629679872.0, + "logits/rejected": 790424320.0, + "logps/chosen": -273.1207763671875, + "logps/rejected": -523.9345296223959, + "loss": 0.0152, + "rewards/chosen": 4.372090148925781, + "rewards/margins": 12.729155985514321, + "rewards/rejected": -8.357065836588541, + "step": 8795 + }, + { + "epoch": 0.803654636820466, + "grad_norm": 5.40625, + "kl": 0.0, + "learning_rate": 9.255412875475256e-07, + "logits/chosen": 1180756070.4, + "logits/rejected": 531437354.6666667, + "logps/chosen": -351.07353515625, + "logps/rejected": -705.6656087239584, + "loss": 0.0307, + "rewards/chosen": 3.516377258300781, + "rewards/margins": 12.787258911132813, + "rewards/rejected": -9.270881652832031, + "step": 8796 + }, + { + "epoch": 0.8037460027409776, + "grad_norm": 3.015625, + "kl": 0.0, + "learning_rate": 9.247080882687953e-07, + "logits/chosen": 755830579.2, + "logits/rejected": 766322346.6666666, + "logps/chosen": -507.722412109375, + "logps/rejected": -208.4703572591146, + "loss": 0.0206, + "rewards/chosen": 3.6248565673828126, + "rewards/margins": 10.0492982228597, + "rewards/rejected": -6.424441655476888, + "step": 8797 + }, + { + "epoch": 0.8038373686614892, + "grad_norm": 35.0, + "kl": 0.0, + "learning_rate": 9.238752259789486e-07, + "logits/chosen": 388153941.3333333, + "logits/rejected": 306174566.4, + "logps/chosen": -210.69771321614584, + "logps/rejected": -414.704638671875, + "loss": 0.0289, + "rewards/chosen": 3.2026468912760415, + "rewards/margins": 11.619542185465495, + "rewards/rejected": -8.416895294189453, + "step": 8798 + }, + { + "epoch": 0.8039287345820009, + "grad_norm": 59.25, + "kl": 0.0, + "learning_rate": 9.230427007468534e-07, + "logits/chosen": 599346176.0, + "logits/rejected": 272959180.8, + "logps/chosen": -473.6883138020833, + "logps/rejected": -391.6911865234375, + "loss": 0.0559, + "rewards/chosen": 2.3496268590291343, + "rewards/margins": 12.022974332173666, + "rewards/rejected": -9.673347473144531, + "step": 8799 + }, + { + "epoch": 0.8040201005025126, + "grad_norm": 2.1875, + "kl": 0.0, + "learning_rate": 9.222105126413522e-07, + "logits/chosen": 564273664.0, + "logits/rejected": 764808192.0, + "logps/chosen": -277.16156005859375, + "logps/rejected": -457.5291748046875, + "loss": 0.0133, + "rewards/chosen": 4.107908248901367, + "rewards/margins": 12.172372817993164, + "rewards/rejected": -8.064464569091797, + "step": 8800 + }, + { + "epoch": 0.8041114664230242, + "grad_norm": 4.96875, + "kl": 0.0, + "learning_rate": 9.213786617312614e-07, + "logits/chosen": 1051530581.3333334, + "logits/rejected": 488203808.0, + "logps/chosen": -333.0668538411458, + "logps/rejected": -314.97479248046875, + "loss": 0.0354, + "rewards/chosen": 3.3373190561930337, + "rewards/margins": 13.503349939982096, + "rewards/rejected": -10.166030883789062, + "step": 8801 + }, + { + "epoch": 0.8042028323435358, + "grad_norm": 2.859375, + "kl": 0.0, + "learning_rate": 9.205471480853661e-07, + "logits/chosen": 472071168.0, + "logits/rejected": 630120384.0, + "logps/chosen": -343.66107177734375, + "logps/rejected": -531.056884765625, + "loss": 0.0162, + "rewards/chosen": 3.9714274406433105, + "rewards/margins": 12.165721416473389, + "rewards/rejected": -8.194293975830078, + "step": 8802 + }, + { + "epoch": 0.8042941982640475, + "grad_norm": 4.8125, + "kl": 0.0, + "learning_rate": 9.197159717724247e-07, + "logits/chosen": 438133555.2, + "logits/rejected": 509955840.0, + "logps/chosen": -419.928515625, + "logps/rejected": -543.2845052083334, + "loss": 0.0239, + "rewards/chosen": 3.9121940612792967, + "rewards/margins": 13.124319203694661, + "rewards/rejected": -9.212125142415365, + "step": 8803 + }, + { + "epoch": 0.8043855641845592, + "grad_norm": 3.234375, + "kl": 0.0, + "learning_rate": 9.188851328611664e-07, + "logits/chosen": 855062016.0, + "logits/rejected": 896064614.4, + "logps/chosen": -531.8148193359375, + "logps/rejected": -623.44736328125, + "loss": 0.0164, + "rewards/chosen": 3.3887449900309243, + "rewards/margins": 11.667817560831706, + "rewards/rejected": -8.279072570800782, + "step": 8804 + }, + { + "epoch": 0.8044769301050708, + "grad_norm": 2.703125, + "kl": 0.0, + "learning_rate": 9.180546314202965e-07, + "logits/chosen": 495175594.6666667, + "logits/rejected": 450197708.8, + "logps/chosen": -365.3898111979167, + "logps/rejected": -486.4912109375, + "loss": 0.0133, + "rewards/chosen": 4.048302332560222, + "rewards/margins": 13.46166598002116, + "rewards/rejected": -9.413363647460937, + "step": 8805 + }, + { + "epoch": 0.8045682960255824, + "grad_norm": 2.640625, + "kl": 0.0, + "learning_rate": 9.172244675184888e-07, + "logits/chosen": 557863321.6, + "logits/rejected": 674437589.3333334, + "logps/chosen": -372.685302734375, + "logps/rejected": -505.506103515625, + "loss": 0.014, + "rewards/chosen": 4.474489212036133, + "rewards/margins": 13.81651496887207, + "rewards/rejected": -9.342025756835938, + "step": 8806 + }, + { + "epoch": 0.8046596619460941, + "grad_norm": 1.734375, + "kl": 0.0, + "learning_rate": 9.163946412243896e-07, + "logits/chosen": 416269952.0, + "logits/rejected": 445515264.0, + "logps/chosen": -355.6571044921875, + "logps/rejected": -382.56494140625, + "loss": 0.0126, + "rewards/chosen": 4.021999359130859, + "rewards/margins": 13.774885177612305, + "rewards/rejected": -9.752885818481445, + "step": 8807 + }, + { + "epoch": 0.8047510278666058, + "grad_norm": 3.40625, + "kl": 0.0, + "learning_rate": 9.155651526066179e-07, + "logits/chosen": 590933145.6, + "logits/rejected": 342120640.0, + "logps/chosen": -374.2481201171875, + "logps/rejected": -361.6222737630208, + "loss": 0.0213, + "rewards/chosen": 3.7425697326660154, + "rewards/margins": 12.567464955647786, + "rewards/rejected": -8.824895222981771, + "step": 8808 + }, + { + "epoch": 0.8048423937871174, + "grad_norm": 10.6875, + "kl": 0.0, + "learning_rate": 9.147360017337636e-07, + "logits/chosen": 505060352.0, + "logits/rejected": 492546560.0, + "logps/chosen": -373.09404296875, + "logps/rejected": -446.0785319010417, + "loss": 0.0265, + "rewards/chosen": 3.919147491455078, + "rewards/margins": 9.82829475402832, + "rewards/rejected": -5.909147262573242, + "step": 8809 + }, + { + "epoch": 0.804933759707629, + "grad_norm": 4.8125, + "kl": 0.0, + "learning_rate": 9.139071886743928e-07, + "logits/chosen": 748454451.2, + "logits/rejected": 817705130.6666666, + "logps/chosen": -315.7408203125, + "logps/rejected": -474.6650797526042, + "loss": 0.0329, + "rewards/chosen": 3.2377647399902343, + "rewards/margins": 10.369395955403645, + "rewards/rejected": -7.131631215413411, + "step": 8810 + }, + { + "epoch": 0.8050251256281407, + "grad_norm": 0.62890625, + "kl": 0.0, + "learning_rate": 9.130787134970392e-07, + "logits/chosen": 753860928.0, + "logits/rejected": 727692288.0, + "logps/chosen": -575.5771484375, + "logps/rejected": -607.7288643973214, + "loss": 0.0019, + "rewards/chosen": 4.20350980758667, + "rewards/margins": 15.082456384386335, + "rewards/rejected": -10.878946576799665, + "step": 8811 + }, + { + "epoch": 0.8051164915486524, + "grad_norm": 6.5, + "kl": 0.0, + "learning_rate": 9.122505762702094e-07, + "logits/chosen": 1436053760.0, + "logits/rejected": 1158126208.0, + "logps/chosen": -493.9911804199219, + "logps/rejected": -537.42822265625, + "loss": 0.023, + "rewards/chosen": 3.329005479812622, + "rewards/margins": 11.523118257522583, + "rewards/rejected": -8.194112777709961, + "step": 8812 + }, + { + "epoch": 0.805207857469164, + "grad_norm": 3.8125, + "kl": 0.0, + "learning_rate": 9.114227770623818e-07, + "logits/chosen": 372303232.0, + "logits/rejected": 455228800.0, + "logps/chosen": -288.341552734375, + "logps/rejected": -496.9512634277344, + "loss": 0.0227, + "rewards/chosen": 3.447906970977783, + "rewards/margins": 14.333459377288818, + "rewards/rejected": -10.885552406311035, + "step": 8813 + }, + { + "epoch": 0.8052992233896756, + "grad_norm": 4.96875, + "kl": 0.0, + "learning_rate": 9.105953159420106e-07, + "logits/chosen": 610693120.0, + "logits/rejected": 435431509.3333333, + "logps/chosen": -341.392822265625, + "logps/rejected": -303.061767578125, + "loss": 0.0312, + "rewards/chosen": 3.7198535919189455, + "rewards/margins": 12.273345184326171, + "rewards/rejected": -8.553491592407227, + "step": 8814 + }, + { + "epoch": 0.8053905893101873, + "grad_norm": 2.09375, + "kl": 0.0, + "learning_rate": 9.097681929775176e-07, + "logits/chosen": 584185408.0, + "logits/rejected": 511608416.0, + "logps/chosen": -470.6827392578125, + "logps/rejected": -304.9200744628906, + "loss": 0.0123, + "rewards/chosen": 3.852410316467285, + "rewards/margins": 11.613164901733398, + "rewards/rejected": -7.760754585266113, + "step": 8815 + }, + { + "epoch": 0.805481955230699, + "grad_norm": 6.65625, + "kl": 0.0, + "learning_rate": 9.089414082372982e-07, + "logits/chosen": 503915520.0, + "logits/rejected": 529332800.0, + "logps/chosen": -460.6890869140625, + "logps/rejected": -481.1290283203125, + "loss": 0.0221, + "rewards/chosen": 3.8254334131876626, + "rewards/margins": 10.665122667948404, + "rewards/rejected": -6.839689254760742, + "step": 8816 + }, + { + "epoch": 0.8055733211512106, + "grad_norm": 4.0, + "kl": 0.0, + "learning_rate": 9.081149617897195e-07, + "logits/chosen": 753487104.0, + "logits/rejected": 575073408.0, + "logps/chosen": -377.2499694824219, + "logps/rejected": -364.6837158203125, + "loss": 0.0233, + "rewards/chosen": 3.3046813011169434, + "rewards/margins": 11.616389751434326, + "rewards/rejected": -8.311708450317383, + "step": 8817 + }, + { + "epoch": 0.8056646870717222, + "grad_norm": 3.421875, + "kl": 0.0, + "learning_rate": 9.072888537031216e-07, + "logits/chosen": 390489548.8, + "logits/rejected": 237405034.66666666, + "logps/chosen": -272.13427734375, + "logps/rejected": -373.8677571614583, + "loss": 0.0242, + "rewards/chosen": 3.685150909423828, + "rewards/margins": 12.569813537597657, + "rewards/rejected": -8.884662628173828, + "step": 8818 + }, + { + "epoch": 0.8057560529922339, + "grad_norm": 2.3125, + "kl": 0.0, + "learning_rate": 9.064630840458155e-07, + "logits/chosen": 424208460.8, + "logits/rejected": 291399509.3333333, + "logps/chosen": -192.1107666015625, + "logps/rejected": -663.2347005208334, + "loss": 0.0251, + "rewards/chosen": 3.7370452880859375, + "rewards/margins": 13.60226821899414, + "rewards/rejected": -9.865222930908203, + "step": 8819 + }, + { + "epoch": 0.8058474189127456, + "grad_norm": 3.84375, + "kl": 0.0, + "learning_rate": 9.056376528860849e-07, + "logits/chosen": 455115520.0, + "logits/rejected": 581653452.8, + "logps/chosen": -329.258056640625, + "logps/rejected": -617.137451171875, + "loss": 0.0238, + "rewards/chosen": 2.748652140299479, + "rewards/margins": 11.89330851236979, + "rewards/rejected": -9.144656372070312, + "step": 8820 + }, + { + "epoch": 0.8059387848332572, + "grad_norm": 1.21875, + "kl": 0.0, + "learning_rate": 9.048125602921843e-07, + "logits/chosen": 625529557.3333334, + "logits/rejected": 1039797145.6, + "logps/chosen": -432.9866536458333, + "logps/rejected": -716.07470703125, + "loss": 0.0073, + "rewards/chosen": 3.975674311319987, + "rewards/margins": 14.240434137980143, + "rewards/rejected": -10.264759826660157, + "step": 8821 + }, + { + "epoch": 0.8060301507537688, + "grad_norm": 4.375, + "kl": 0.0, + "learning_rate": 9.039878063323432e-07, + "logits/chosen": 1596428970.6666667, + "logits/rejected": 684365926.4, + "logps/chosen": -364.0211588541667, + "logps/rejected": -369.857958984375, + "loss": 0.0258, + "rewards/chosen": 2.896413803100586, + "rewards/margins": 12.398781967163085, + "rewards/rejected": -9.5023681640625, + "step": 8822 + }, + { + "epoch": 0.8061215166742804, + "grad_norm": 2.671875, + "kl": 0.0, + "learning_rate": 9.031633910747606e-07, + "logits/chosen": 316976576.0, + "logits/rejected": 670041344.0, + "logps/chosen": -112.43778991699219, + "logps/rejected": -556.3544514973959, + "loss": 0.0155, + "rewards/chosen": 2.9548981189727783, + "rewards/margins": 13.572708686192831, + "rewards/rejected": -10.617810567220053, + "step": 8823 + }, + { + "epoch": 0.8062128825947922, + "grad_norm": 5.21875, + "kl": 6.417991638183594, + "learning_rate": 9.023393145876069e-07, + "logits/chosen": 367766954.6666667, + "logits/rejected": 277817856.0, + "logps/chosen": -318.98752848307294, + "logps/rejected": -298.81121826171875, + "loss": 0.0347, + "rewards/chosen": 4.558214823404948, + "rewards/margins": 10.481709639231365, + "rewards/rejected": -5.923494815826416, + "step": 8824 + }, + { + "epoch": 0.8063042485153038, + "grad_norm": 0.84765625, + "kl": 0.0, + "learning_rate": 9.015155769390249e-07, + "logits/chosen": 376460704.0, + "logits/rejected": 652314282.6666666, + "logps/chosen": -252.1361083984375, + "logps/rejected": -381.3155924479167, + "loss": 0.0043, + "rewards/chosen": 4.3528594970703125, + "rewards/margins": 12.619463602701822, + "rewards/rejected": -8.26660410563151, + "step": 8825 + }, + { + "epoch": 0.8063956144358154, + "grad_norm": 0.55859375, + "kl": 0.0, + "learning_rate": 9.006921781971334e-07, + "logits/chosen": 1131852202.6666667, + "logits/rejected": 542555187.2, + "logps/chosen": -535.8720296223959, + "logps/rejected": -493.687890625, + "loss": 0.002, + "rewards/chosen": 5.451127370198567, + "rewards/margins": 14.305534108479819, + "rewards/rejected": -8.85440673828125, + "step": 8826 + }, + { + "epoch": 0.806486980356327, + "grad_norm": 4.625, + "kl": 0.0, + "learning_rate": 8.998691184300173e-07, + "logits/chosen": 707670784.0, + "logits/rejected": 525778016.0, + "logps/chosen": -514.7380981445312, + "logps/rejected": -468.5275573730469, + "loss": 0.0161, + "rewards/chosen": 4.064496994018555, + "rewards/margins": 12.745885848999023, + "rewards/rejected": -8.681388854980469, + "step": 8827 + }, + { + "epoch": 0.8065783462768388, + "grad_norm": 2.859375, + "kl": 0.0, + "learning_rate": 8.990463977057368e-07, + "logits/chosen": 523433301.3333333, + "logits/rejected": 1374881536.0, + "logps/chosen": -282.10400390625, + "logps/rejected": -747.5369873046875, + "loss": 0.0255, + "rewards/chosen": 3.7215449015299478, + "rewards/margins": 12.740801493326822, + "rewards/rejected": -9.019256591796875, + "step": 8828 + }, + { + "epoch": 0.8066697121973504, + "grad_norm": 12.6875, + "kl": 5.752124786376953, + "learning_rate": 8.982240160923228e-07, + "logits/chosen": 407794907.4285714, + "logits/rejected": 620980096.0, + "logps/chosen": -269.31529017857144, + "logps/rejected": -587.5050659179688, + "loss": 0.0645, + "rewards/chosen": 3.8104967389787947, + "rewards/margins": 6.302604845591954, + "rewards/rejected": -2.492108106613159, + "step": 8829 + }, + { + "epoch": 0.806761078117862, + "grad_norm": 5.78125, + "kl": 0.0, + "learning_rate": 8.974019736577777e-07, + "logits/chosen": 542741760.0, + "logits/rejected": 371488938.6666667, + "logps/chosen": -479.05302734375, + "logps/rejected": -329.4429931640625, + "loss": 0.0326, + "rewards/chosen": 2.9846567153930663, + "rewards/margins": 10.84145933787028, + "rewards/rejected": -7.856802622477214, + "step": 8830 + }, + { + "epoch": 0.8068524440383736, + "grad_norm": 18.75, + "kl": 0.0, + "learning_rate": 8.965802704700787e-07, + "logits/chosen": 670496000.0, + "logits/rejected": 392902048.0, + "logps/chosen": -395.689208984375, + "logps/rejected": -374.64019775390625, + "loss": 0.0339, + "rewards/chosen": 4.021955490112305, + "rewards/margins": 12.061288833618164, + "rewards/rejected": -8.03933334350586, + "step": 8831 + }, + { + "epoch": 0.8069438099588854, + "grad_norm": 3.875, + "kl": 0.0, + "learning_rate": 8.957589065971728e-07, + "logits/chosen": 570240768.0, + "logits/rejected": 607541145.6, + "logps/chosen": -287.2230631510417, + "logps/rejected": -345.8345703125, + "loss": 0.0228, + "rewards/chosen": 3.031879742940267, + "rewards/margins": 10.734101422627766, + "rewards/rejected": -7.7022216796875, + "step": 8832 + }, + { + "epoch": 0.807035175879397, + "grad_norm": 3.59375, + "kl": 0.0, + "learning_rate": 8.949378821069788e-07, + "logits/chosen": 574268416.0, + "logits/rejected": 353382400.0, + "logps/chosen": -315.4855041503906, + "logps/rejected": -433.00469970703125, + "loss": 0.0236, + "rewards/chosen": 3.1262643337249756, + "rewards/margins": 11.141030073165894, + "rewards/rejected": -8.014765739440918, + "step": 8833 + }, + { + "epoch": 0.8071265417999086, + "grad_norm": 84.0, + "kl": 0.0, + "learning_rate": 8.941171970673862e-07, + "logits/chosen": 550094528.0, + "logits/rejected": 458015552.0, + "logps/chosen": -356.8556823730469, + "logps/rejected": -493.2156677246094, + "loss": 0.1478, + "rewards/chosen": 3.8569560050964355, + "rewards/margins": 8.684162616729736, + "rewards/rejected": -4.827206611633301, + "step": 8834 + }, + { + "epoch": 0.8072179077204202, + "grad_norm": 4.90625, + "kl": 0.0, + "learning_rate": 8.932968515462615e-07, + "logits/chosen": 653654886.4, + "logits/rejected": 893812565.3333334, + "logps/chosen": -271.407080078125, + "logps/rejected": -509.5362955729167, + "loss": 0.0396, + "rewards/chosen": 3.0258899688720704, + "rewards/margins": 13.006606038411459, + "rewards/rejected": -9.980716069539389, + "step": 8835 + }, + { + "epoch": 0.807309273640932, + "grad_norm": 1.4296875, + "kl": 0.0, + "learning_rate": 8.924768456114369e-07, + "logits/chosen": 712117952.0, + "logits/rejected": 456244992.0, + "logps/chosen": -591.37744140625, + "logps/rejected": -565.16064453125, + "loss": 0.0092, + "rewards/chosen": 4.348466873168945, + "rewards/margins": 13.465719223022461, + "rewards/rejected": -9.117252349853516, + "step": 8836 + }, + { + "epoch": 0.8074006395614436, + "grad_norm": 3.578125, + "kl": 0.0, + "learning_rate": 8.916571793307205e-07, + "logits/chosen": 546683989.3333334, + "logits/rejected": 452631648.0, + "logps/chosen": -413.8465576171875, + "logps/rejected": -420.0965881347656, + "loss": 0.0236, + "rewards/chosen": 3.893495559692383, + "rewards/margins": 13.55368709564209, + "rewards/rejected": -9.660191535949707, + "step": 8837 + }, + { + "epoch": 0.8074920054819552, + "grad_norm": 1.7109375, + "kl": 0.0, + "learning_rate": 8.908378527718903e-07, + "logits/chosen": 682159424.0, + "logits/rejected": 578854784.0, + "logps/chosen": -295.11627197265625, + "logps/rejected": -572.51953125, + "loss": 0.0099, + "rewards/chosen": 4.101502895355225, + "rewards/margins": 12.970929622650146, + "rewards/rejected": -8.869426727294922, + "step": 8838 + }, + { + "epoch": 0.8075833714024668, + "grad_norm": 1.5078125, + "kl": 0.0, + "learning_rate": 8.900188660026965e-07, + "logits/chosen": 439556352.0, + "logits/rejected": 899961241.6, + "logps/chosen": -276.888671875, + "logps/rejected": -385.8578125, + "loss": 0.0086, + "rewards/chosen": 4.417226791381836, + "rewards/margins": 13.295083236694335, + "rewards/rejected": -8.8778564453125, + "step": 8839 + }, + { + "epoch": 0.8076747373229786, + "grad_norm": 2.4375, + "kl": 0.0, + "learning_rate": 8.892002190908638e-07, + "logits/chosen": 695642965.3333334, + "logits/rejected": 335002912.0, + "logps/chosen": -536.2579345703125, + "logps/rejected": -583.5631103515625, + "loss": 0.0184, + "rewards/chosen": 3.9112491607666016, + "rewards/margins": 15.352927207946777, + "rewards/rejected": -11.441678047180176, + "step": 8840 + }, + { + "epoch": 0.8077661032434902, + "grad_norm": 2.265625, + "kl": 0.0, + "learning_rate": 8.88381912104086e-07, + "logits/chosen": 1085093973.3333333, + "logits/rejected": 806188544.0, + "logps/chosen": -537.892333984375, + "logps/rejected": -425.767431640625, + "loss": 0.0127, + "rewards/chosen": 3.528792381286621, + "rewards/margins": 12.965521812438965, + "rewards/rejected": -9.436729431152344, + "step": 8841 + }, + { + "epoch": 0.8078574691640018, + "grad_norm": 2.28125, + "kl": 0.0, + "learning_rate": 8.875639451100288e-07, + "logits/chosen": 633605717.3333334, + "logits/rejected": 301885376.0, + "logps/chosen": -286.3240152994792, + "logps/rejected": -418.121826171875, + "loss": 0.0203, + "rewards/chosen": 3.8187440236409507, + "rewards/margins": 12.700236638387045, + "rewards/rejected": -8.881492614746094, + "step": 8842 + }, + { + "epoch": 0.8079488350845134, + "grad_norm": 3.765625, + "kl": 0.0, + "learning_rate": 8.867463181763297e-07, + "logits/chosen": 309020544.0, + "logits/rejected": 467089408.0, + "logps/chosen": -543.653076171875, + "logps/rejected": -434.6088053385417, + "loss": 0.0126, + "rewards/chosen": 2.9806947708129883, + "rewards/margins": 12.645004590352377, + "rewards/rejected": -9.664309819539389, + "step": 8843 + }, + { + "epoch": 0.8080402010050252, + "grad_norm": 2.5625, + "kl": 0.0, + "learning_rate": 8.85929031370601e-07, + "logits/chosen": 372388053.3333333, + "logits/rejected": 830556800.0, + "logps/chosen": -384.6315104166667, + "logps/rejected": -532.4773559570312, + "loss": 0.0157, + "rewards/chosen": 4.239674886067708, + "rewards/margins": 12.27380593617757, + "rewards/rejected": -8.034131050109863, + "step": 8844 + }, + { + "epoch": 0.8081315669255368, + "grad_norm": 3.25, + "kl": 0.0, + "learning_rate": 8.85112084760425e-07, + "logits/chosen": 644908544.0, + "logits/rejected": 471873450.6666667, + "logps/chosen": -275.87347412109375, + "logps/rejected": -559.9488932291666, + "loss": 0.0157, + "rewards/chosen": 2.763139247894287, + "rewards/margins": 12.226761023203531, + "rewards/rejected": -9.463621775309244, + "step": 8845 + }, + { + "epoch": 0.8082229328460484, + "grad_norm": 2.546875, + "kl": 0.0, + "learning_rate": 8.842954784133517e-07, + "logits/chosen": 781022976.0, + "logits/rejected": 650550208.0, + "logps/chosen": -340.8103332519531, + "logps/rejected": -385.053466796875, + "loss": 0.0143, + "rewards/chosen": 3.9936416149139404, + "rewards/margins": 13.705673933029175, + "rewards/rejected": -9.712032318115234, + "step": 8846 + }, + { + "epoch": 0.80831429876656, + "grad_norm": 2.5625, + "kl": 0.0, + "learning_rate": 8.834792123969105e-07, + "logits/chosen": 683128661.3333334, + "logits/rejected": 772759168.0, + "logps/chosen": -328.5295817057292, + "logps/rejected": -459.72943115234375, + "loss": 0.0199, + "rewards/chosen": 3.7120866775512695, + "rewards/margins": 12.627568244934082, + "rewards/rejected": -8.915481567382812, + "step": 8847 + }, + { + "epoch": 0.8084056646870718, + "grad_norm": 2.3125, + "kl": 0.0, + "learning_rate": 8.826632867785983e-07, + "logits/chosen": 633191552.0, + "logits/rejected": 418762410.6666667, + "logps/chosen": -507.625244140625, + "logps/rejected": -407.4685465494792, + "loss": 0.0085, + "rewards/chosen": 4.0158538818359375, + "rewards/margins": 13.535534540812174, + "rewards/rejected": -9.519680658976236, + "step": 8848 + }, + { + "epoch": 0.8084970306075834, + "grad_norm": 1.140625, + "kl": 0.0, + "learning_rate": 8.818477016258842e-07, + "logits/chosen": 537992320.0, + "logits/rejected": 991620096.0, + "logps/chosen": -231.6468505859375, + "logps/rejected": -440.2238362630208, + "loss": 0.0058, + "rewards/chosen": 4.414129257202148, + "rewards/margins": 14.657627741495768, + "rewards/rejected": -10.24349848429362, + "step": 8849 + }, + { + "epoch": 0.808588396528095, + "grad_norm": 2.90625, + "kl": 0.0, + "learning_rate": 8.810324570062101e-07, + "logits/chosen": 589082112.0, + "logits/rejected": 603052160.0, + "logps/chosen": -387.3909912109375, + "logps/rejected": -638.62255859375, + "loss": 0.0169, + "rewards/chosen": 3.491252899169922, + "rewards/margins": 13.516792297363281, + "rewards/rejected": -10.02553939819336, + "step": 8850 + }, + { + "epoch": 0.8086797624486066, + "grad_norm": 3.671875, + "kl": 0.0, + "learning_rate": 8.802175529869872e-07, + "logits/chosen": 456409600.0, + "logits/rejected": 278684352.0, + "logps/chosen": -320.1507873535156, + "logps/rejected": -231.4620361328125, + "loss": 0.0252, + "rewards/chosen": 3.65238094329834, + "rewards/margins": 10.694698333740234, + "rewards/rejected": -7.0423173904418945, + "step": 8851 + }, + { + "epoch": 0.8087711283691184, + "grad_norm": 2.609375, + "kl": 0.0, + "learning_rate": 8.794029896356032e-07, + "logits/chosen": 262883056.0, + "logits/rejected": 320470930.28571427, + "logps/chosen": -50.71724319458008, + "logps/rejected": -392.01834542410717, + "loss": 0.0145, + "rewards/chosen": 2.0634167194366455, + "rewards/margins": 11.450291327067784, + "rewards/rejected": -9.386874607631139, + "step": 8852 + }, + { + "epoch": 0.80886249428963, + "grad_norm": 4.6875, + "kl": 0.0, + "learning_rate": 8.785887670194137e-07, + "logits/chosen": 577980196.5714285, + "logits/rejected": 328087744.0, + "logps/chosen": -205.612548828125, + "logps/rejected": -199.79832458496094, + "loss": 0.0454, + "rewards/chosen": 3.167172295706613, + "rewards/margins": 9.433499200003489, + "rewards/rejected": -6.266326904296875, + "step": 8853 + }, + { + "epoch": 0.8089538602101416, + "grad_norm": 65.5, + "kl": 0.0, + "learning_rate": 8.777748852057472e-07, + "logits/chosen": 1086558003.2, + "logits/rejected": 968009557.3333334, + "logps/chosen": -455.419921875, + "logps/rejected": -271.45782470703125, + "loss": 0.0516, + "rewards/chosen": 3.6978988647460938, + "rewards/margins": 10.421550750732422, + "rewards/rejected": -6.723651885986328, + "step": 8854 + }, + { + "epoch": 0.8090452261306532, + "grad_norm": 3.265625, + "kl": 0.0, + "learning_rate": 8.769613442619035e-07, + "logits/chosen": 707835221.3333334, + "logits/rejected": 364894156.8, + "logps/chosen": -224.70023600260416, + "logps/rejected": -568.515087890625, + "loss": 0.0224, + "rewards/chosen": 2.9311351776123047, + "rewards/margins": 14.986220932006836, + "rewards/rejected": -12.055085754394531, + "step": 8855 + }, + { + "epoch": 0.809136592051165, + "grad_norm": 4.03125, + "kl": 0.0, + "learning_rate": 8.761481442551573e-07, + "logits/chosen": 518351008.0, + "logits/rejected": 615241536.0, + "logps/chosen": -320.2381896972656, + "logps/rejected": -546.7474365234375, + "loss": 0.0198, + "rewards/chosen": 3.872681140899658, + "rewards/margins": 12.081634998321533, + "rewards/rejected": -8.208953857421875, + "step": 8856 + }, + { + "epoch": 0.8092279579716766, + "grad_norm": 1.4765625, + "kl": 0.0, + "learning_rate": 8.753352852527513e-07, + "logits/chosen": 281548864.0, + "logits/rejected": 453044608.0, + "logps/chosen": -224.91033935546875, + "logps/rejected": -390.4228820800781, + "loss": 0.0074, + "rewards/chosen": 5.146257400512695, + "rewards/margins": 13.443243026733398, + "rewards/rejected": -8.296985626220703, + "step": 8857 + }, + { + "epoch": 0.8093193238921882, + "grad_norm": 1.390625, + "kl": 0.0, + "learning_rate": 8.745227673219008e-07, + "logits/chosen": 489370240.0, + "logits/rejected": 715803520.0, + "logps/chosen": -257.83306884765625, + "logps/rejected": -491.028076171875, + "loss": 0.006, + "rewards/chosen": 4.597131729125977, + "rewards/margins": 16.299507776896156, + "rewards/rejected": -11.702376047770182, + "step": 8858 + }, + { + "epoch": 0.8094106898126998, + "grad_norm": 2.1875, + "kl": 0.0, + "learning_rate": 8.737105905297943e-07, + "logits/chosen": 447359616.0, + "logits/rejected": 466625962.6666667, + "logps/chosen": -216.49844360351562, + "logps/rejected": -406.3304036458333, + "loss": 0.0106, + "rewards/chosen": 3.3294601440429688, + "rewards/margins": 13.36629867553711, + "rewards/rejected": -10.03683853149414, + "step": 8859 + }, + { + "epoch": 0.8095020557332115, + "grad_norm": 58.25, + "kl": 0.0, + "learning_rate": 8.728987549435897e-07, + "logits/chosen": 290621312.0, + "logits/rejected": 386946944.0, + "logps/chosen": -226.01220703125, + "logps/rejected": -413.2862243652344, + "loss": 0.1122, + "rewards/chosen": 3.0730579921177457, + "rewards/margins": 11.193670136587961, + "rewards/rejected": -8.120612144470215, + "step": 8860 + }, + { + "epoch": 0.8095934216537232, + "grad_norm": 4.25, + "kl": 0.0, + "learning_rate": 8.720872606304209e-07, + "logits/chosen": 480153760.0, + "logits/rejected": 491927552.0, + "logps/chosen": -364.5760192871094, + "logps/rejected": -452.8316345214844, + "loss": 0.0209, + "rewards/chosen": 3.9121291637420654, + "rewards/margins": 14.427656888961792, + "rewards/rejected": -10.515527725219727, + "step": 8861 + }, + { + "epoch": 0.8096847875742348, + "grad_norm": 1.9140625, + "kl": 0.0, + "learning_rate": 8.7127610765739e-07, + "logits/chosen": 934338432.0, + "logits/rejected": 843734698.6666666, + "logps/chosen": -342.90985107421875, + "logps/rejected": -489.10693359375, + "loss": 0.012, + "rewards/chosen": 3.346806287765503, + "rewards/margins": 12.59605860710144, + "rewards/rejected": -9.249252319335938, + "step": 8862 + }, + { + "epoch": 0.8097761534947464, + "grad_norm": 0.95703125, + "kl": 0.0, + "learning_rate": 8.704652960915716e-07, + "logits/chosen": 374626112.0, + "logits/rejected": 459524181.3333333, + "logps/chosen": -188.95074462890625, + "logps/rejected": -623.6636149088541, + "loss": 0.0043, + "rewards/chosen": 4.668262481689453, + "rewards/margins": 14.055133819580078, + "rewards/rejected": -9.386871337890625, + "step": 8863 + }, + { + "epoch": 0.8098675194152581, + "grad_norm": 3.734375, + "kl": 3.222776412963867, + "learning_rate": 8.696548260000104e-07, + "logits/chosen": 820141056.0, + "logits/rejected": 397348384.0, + "logps/chosen": -411.0489501953125, + "logps/rejected": -328.644775390625, + "loss": 0.0267, + "rewards/chosen": 4.073552131652832, + "rewards/margins": 11.806439399719238, + "rewards/rejected": -7.732887268066406, + "step": 8864 + }, + { + "epoch": 0.8099588853357698, + "grad_norm": 3.953125, + "kl": 0.0, + "learning_rate": 8.688446974497283e-07, + "logits/chosen": 752487552.0, + "logits/rejected": 339111488.0, + "logps/chosen": -285.2617492675781, + "logps/rejected": -493.7569580078125, + "loss": 0.025, + "rewards/chosen": 3.115917682647705, + "rewards/margins": 12.860666751861572, + "rewards/rejected": -9.744749069213867, + "step": 8865 + }, + { + "epoch": 0.8100502512562814, + "grad_norm": 3.5625, + "kl": 0.0, + "learning_rate": 8.680349105077134e-07, + "logits/chosen": 628842325.3333334, + "logits/rejected": 291654272.0, + "logps/chosen": -374.6956380208333, + "logps/rejected": -489.98809814453125, + "loss": 0.0302, + "rewards/chosen": 3.290866216023763, + "rewards/margins": 13.87737782796224, + "rewards/rejected": -10.586511611938477, + "step": 8866 + }, + { + "epoch": 0.810141617176793, + "grad_norm": 2.65625, + "kl": 0.0, + "learning_rate": 8.672254652409279e-07, + "logits/chosen": 1096531558.4, + "logits/rejected": 584534485.3333334, + "logps/chosen": -328.5664794921875, + "logps/rejected": -388.0670166015625, + "loss": 0.0196, + "rewards/chosen": 3.6538101196289063, + "rewards/margins": 12.296389516194662, + "rewards/rejected": -8.642579396565756, + "step": 8867 + }, + { + "epoch": 0.8102329830973047, + "grad_norm": 2.21875, + "kl": 2.7437286376953125, + "learning_rate": 8.664163617163046e-07, + "logits/chosen": 780254208.0, + "logits/rejected": 765334464.0, + "logps/chosen": -375.15122767857144, + "logps/rejected": -422.91705322265625, + "loss": 0.014, + "rewards/chosen": 4.7525787353515625, + "rewards/margins": 12.481378078460693, + "rewards/rejected": -7.728799343109131, + "step": 8868 + }, + { + "epoch": 0.8103243490178164, + "grad_norm": 10.25, + "kl": 0.0, + "learning_rate": 8.656076000007507e-07, + "logits/chosen": 932871616.0, + "logits/rejected": 509504384.0, + "logps/chosen": -592.2940063476562, + "logps/rejected": -339.8533935546875, + "loss": 0.091, + "rewards/chosen": 3.3614487648010254, + "rewards/margins": 10.484462261199951, + "rewards/rejected": -7.123013496398926, + "step": 8869 + }, + { + "epoch": 0.810415714938328, + "grad_norm": 1.8125, + "kl": 0.0, + "learning_rate": 8.647991801611421e-07, + "logits/chosen": 859529152.0, + "logits/rejected": 434417621.3333333, + "logps/chosen": -490.23394775390625, + "logps/rejected": -430.7277018229167, + "loss": 0.008, + "rewards/chosen": 3.476824998855591, + "rewards/margins": 12.523769934972128, + "rewards/rejected": -9.046944936116537, + "step": 8870 + }, + { + "epoch": 0.8105070808588396, + "grad_norm": 1.5078125, + "kl": 0.0, + "learning_rate": 8.639911022643288e-07, + "logits/chosen": 547656064.0, + "logits/rejected": 349827776.0, + "logps/chosen": -328.74444580078125, + "logps/rejected": -352.3812255859375, + "loss": 0.0138, + "rewards/chosen": 3.6819214820861816, + "rewards/margins": 12.283785343170166, + "rewards/rejected": -8.601863861083984, + "step": 8871 + }, + { + "epoch": 0.8105984467793513, + "grad_norm": 3.265625, + "kl": 0.0, + "learning_rate": 8.631833663771282e-07, + "logits/chosen": 874150720.0, + "logits/rejected": 709881344.0, + "logps/chosen": -245.64297485351562, + "logps/rejected": -510.13531494140625, + "loss": 0.0282, + "rewards/chosen": 2.913607120513916, + "rewards/margins": 11.615731716156006, + "rewards/rejected": -8.70212459564209, + "step": 8872 + }, + { + "epoch": 0.810689812699863, + "grad_norm": 2.171875, + "kl": 0.0, + "learning_rate": 8.62375972566336e-07, + "logits/chosen": 295666099.2, + "logits/rejected": 423431338.6666667, + "logps/chosen": -173.93231201171875, + "logps/rejected": -411.2263997395833, + "loss": 0.024, + "rewards/chosen": 3.8143661499023436, + "rewards/margins": 11.585088221232096, + "rewards/rejected": -7.770722071329753, + "step": 8873 + }, + { + "epoch": 0.8107811786203746, + "grad_norm": 4.5625, + "kl": 0.0, + "learning_rate": 8.615689208987155e-07, + "logits/chosen": 751189248.0, + "logits/rejected": 1361526613.3333333, + "logps/chosen": -352.8705810546875, + "logps/rejected": -345.677978515625, + "loss": 0.0312, + "rewards/chosen": 3.2886383056640627, + "rewards/margins": 11.534878285725913, + "rewards/rejected": -8.24623998006185, + "step": 8874 + }, + { + "epoch": 0.8108725445408862, + "grad_norm": 4.46875, + "kl": 0.0, + "learning_rate": 8.607622114410014e-07, + "logits/chosen": 397392725.3333333, + "logits/rejected": 474090803.2, + "logps/chosen": -367.814208984375, + "logps/rejected": -346.794580078125, + "loss": 0.0394, + "rewards/chosen": 2.521530787150065, + "rewards/margins": 11.027204004923503, + "rewards/rejected": -8.505673217773438, + "step": 8875 + }, + { + "epoch": 0.8109639104613979, + "grad_norm": 1.6484375, + "kl": 0.0, + "learning_rate": 8.599558442598998e-07, + "logits/chosen": 695699456.0, + "logits/rejected": 550877354.6666666, + "logps/chosen": -448.971435546875, + "logps/rejected": -361.8819173177083, + "loss": 0.0088, + "rewards/chosen": 3.449794054031372, + "rewards/margins": 11.579918464024862, + "rewards/rejected": -8.13012440999349, + "step": 8876 + }, + { + "epoch": 0.8110552763819096, + "grad_norm": 3.578125, + "kl": 0.0, + "learning_rate": 8.591498194220932e-07, + "logits/chosen": 771387392.0, + "logits/rejected": 380890240.0, + "logps/chosen": -270.44069417317706, + "logps/rejected": -534.2723388671875, + "loss": 0.0219, + "rewards/chosen": 3.7470925649007163, + "rewards/margins": 13.211261113484701, + "rewards/rejected": -9.464168548583984, + "step": 8877 + }, + { + "epoch": 0.8111466423024212, + "grad_norm": 1.1796875, + "kl": 0.0, + "learning_rate": 8.583441369942308e-07, + "logits/chosen": 494255392.0, + "logits/rejected": 561724074.6666666, + "logps/chosen": -324.5763854980469, + "logps/rejected": -566.1263834635416, + "loss": 0.0063, + "rewards/chosen": 4.060934543609619, + "rewards/margins": 13.767970244089762, + "rewards/rejected": -9.707035700480143, + "step": 8878 + }, + { + "epoch": 0.8112380082229328, + "grad_norm": 0.984375, + "kl": 0.0, + "learning_rate": 8.575387970429344e-07, + "logits/chosen": 384203456.0, + "logits/rejected": 533495082.6666667, + "logps/chosen": -186.82347106933594, + "logps/rejected": -602.8502197265625, + "loss": 0.0042, + "rewards/chosen": 4.480603218078613, + "rewards/margins": 14.597346305847168, + "rewards/rejected": -10.116743087768555, + "step": 8879 + }, + { + "epoch": 0.8113293741434445, + "grad_norm": 1.1796875, + "kl": 0.0, + "learning_rate": 8.567337996347996e-07, + "logits/chosen": 794887232.0, + "logits/rejected": 761853220.5714285, + "logps/chosen": -504.6251525878906, + "logps/rejected": -523.8880440848214, + "loss": 0.004, + "rewards/chosen": 3.476644992828369, + "rewards/margins": 12.125532627105713, + "rewards/rejected": -8.648887634277344, + "step": 8880 + }, + { + "epoch": 0.8114207400639561, + "grad_norm": 2.96875, + "kl": 0.0, + "learning_rate": 8.559291448363893e-07, + "logits/chosen": 521936320.0, + "logits/rejected": 713491712.0, + "logps/chosen": -410.4346923828125, + "logps/rejected": -607.1920166015625, + "loss": 0.019, + "rewards/chosen": 3.483295202255249, + "rewards/margins": 11.77387022972107, + "rewards/rejected": -8.29057502746582, + "step": 8881 + }, + { + "epoch": 0.8115121059844678, + "grad_norm": 4.15625, + "kl": 0.0, + "learning_rate": 8.551248327142447e-07, + "logits/chosen": 448648396.8, + "logits/rejected": 311770176.0, + "logps/chosen": -259.434912109375, + "logps/rejected": -584.5200602213541, + "loss": 0.0292, + "rewards/chosen": 3.3650047302246096, + "rewards/margins": 14.705720774332683, + "rewards/rejected": -11.340716044108072, + "step": 8882 + }, + { + "epoch": 0.8116034719049794, + "grad_norm": 2.078125, + "kl": 0.0, + "learning_rate": 8.543208633348732e-07, + "logits/chosen": 472560932.5714286, + "logits/rejected": 886550784.0, + "logps/chosen": -162.67780412946428, + "logps/rejected": -173.90768432617188, + "loss": 0.0217, + "rewards/chosen": 4.130604335239956, + "rewards/margins": 9.471603461674281, + "rewards/rejected": -5.340999126434326, + "step": 8883 + }, + { + "epoch": 0.8116948378254911, + "grad_norm": 48.0, + "kl": 0.0, + "learning_rate": 8.53517236764756e-07, + "logits/chosen": 611767232.0, + "logits/rejected": 462093653.3333333, + "logps/chosen": -440.672119140625, + "logps/rejected": -398.0817057291667, + "loss": 0.0999, + "rewards/chosen": 3.663182258605957, + "rewards/margins": 10.575720151265461, + "rewards/rejected": -6.912537892659505, + "step": 8884 + }, + { + "epoch": 0.8117862037460027, + "grad_norm": 1.53125, + "kl": 0.0, + "learning_rate": 8.527139530703438e-07, + "logits/chosen": 627237461.3333334, + "logits/rejected": 168186534.4, + "logps/chosen": -287.4494222005208, + "logps/rejected": -205.4569580078125, + "loss": 0.0082, + "rewards/chosen": 4.27105712890625, + "rewards/margins": 12.001333618164063, + "rewards/rejected": -7.730276489257813, + "step": 8885 + }, + { + "epoch": 0.8118775696665144, + "grad_norm": 2.078125, + "kl": 0.0, + "learning_rate": 8.519110123180635e-07, + "logits/chosen": 483302604.8, + "logits/rejected": 381546624.0, + "logps/chosen": -326.312548828125, + "logps/rejected": -486.4099527994792, + "loss": 0.014, + "rewards/chosen": 3.9397861480712892, + "rewards/margins": 13.411814244588218, + "rewards/rejected": -9.472028096516928, + "step": 8886 + }, + { + "epoch": 0.811968935587026, + "grad_norm": 5.40625, + "kl": 0.0, + "learning_rate": 8.511084145743099e-07, + "logits/chosen": 333141930.6666667, + "logits/rejected": 459007488.0, + "logps/chosen": -134.96626790364584, + "logps/rejected": -511.806640625, + "loss": 0.0208, + "rewards/chosen": 3.9934908548990884, + "rewards/margins": 12.991238657633463, + "rewards/rejected": -8.997747802734375, + "step": 8887 + }, + { + "epoch": 0.8120603015075377, + "grad_norm": 0.5078125, + "kl": 0.0, + "learning_rate": 8.503061599054497e-07, + "logits/chosen": 621293440.0, + "logits/rejected": 412260949.3333333, + "logps/chosen": -506.1816101074219, + "logps/rejected": -504.596435546875, + "loss": 0.002, + "rewards/chosen": 4.888885498046875, + "rewards/margins": 15.70660146077474, + "rewards/rejected": -10.817715962727865, + "step": 8888 + }, + { + "epoch": 0.8121516674280493, + "grad_norm": 3.9375, + "kl": 0.0, + "learning_rate": 8.49504248377821e-07, + "logits/chosen": 561956693.3333334, + "logits/rejected": 802062848.0, + "logps/chosen": -365.7560221354167, + "logps/rejected": -478.8783874511719, + "loss": 0.0261, + "rewards/chosen": 3.560140291849772, + "rewards/margins": 14.010129610697428, + "rewards/rejected": -10.449989318847656, + "step": 8889 + }, + { + "epoch": 0.812243033348561, + "grad_norm": 1.4921875, + "kl": 0.0, + "learning_rate": 8.487026800577375e-07, + "logits/chosen": 529429760.0, + "logits/rejected": 543116185.6, + "logps/chosen": -272.20163981119794, + "logps/rejected": -526.6888671875, + "loss": 0.0073, + "rewards/chosen": 4.547440528869629, + "rewards/margins": 13.17098331451416, + "rewards/rejected": -8.623542785644531, + "step": 8890 + }, + { + "epoch": 0.8123343992690726, + "grad_norm": 7.09375, + "kl": 0.0, + "learning_rate": 8.479014550114789e-07, + "logits/chosen": 556966195.2, + "logits/rejected": 471507370.6666667, + "logps/chosen": -307.375146484375, + "logps/rejected": -576.443603515625, + "loss": 0.0325, + "rewards/chosen": 3.4739994049072265, + "rewards/margins": 14.603619003295899, + "rewards/rejected": -11.129619598388672, + "step": 8891 + }, + { + "epoch": 0.8124257651895843, + "grad_norm": 3.671875, + "kl": 0.0, + "learning_rate": 8.471005733053006e-07, + "logits/chosen": 988028928.0, + "logits/rejected": 820052032.0, + "logps/chosen": -372.1857096354167, + "logps/rejected": -645.8275146484375, + "loss": 0.0273, + "rewards/chosen": 3.6319592793782554, + "rewards/margins": 15.199851353963217, + "rewards/rejected": -11.567892074584961, + "step": 8892 + }, + { + "epoch": 0.8125171311100959, + "grad_norm": 1.2890625, + "kl": 0.0, + "learning_rate": 8.463000350054273e-07, + "logits/chosen": 718516838.4, + "logits/rejected": 421978368.0, + "logps/chosen": -337.7302001953125, + "logps/rejected": -367.2415364583333, + "loss": 0.0172, + "rewards/chosen": 4.1916656494140625, + "rewards/margins": 11.43753433227539, + "rewards/rejected": -7.245868682861328, + "step": 8893 + }, + { + "epoch": 0.8126084970306076, + "grad_norm": 1.703125, + "kl": 0.0, + "learning_rate": 8.454998401780551e-07, + "logits/chosen": 359282508.8, + "logits/rejected": 418403157.3333333, + "logps/chosen": -254.6681884765625, + "logps/rejected": -422.6846110026042, + "loss": 0.0112, + "rewards/chosen": 4.7338813781738285, + "rewards/margins": 11.927866363525391, + "rewards/rejected": -7.1939849853515625, + "step": 8894 + }, + { + "epoch": 0.8126998629511192, + "grad_norm": 4.5625, + "kl": 0.0, + "learning_rate": 8.446999888893547e-07, + "logits/chosen": 408528096.0, + "logits/rejected": 410239573.3333333, + "logps/chosen": -285.1492919921875, + "logps/rejected": -434.1165771484375, + "loss": 0.0146, + "rewards/chosen": 3.2092385292053223, + "rewards/margins": 13.032799879709879, + "rewards/rejected": -9.823561350504557, + "step": 8895 + }, + { + "epoch": 0.8127912288716309, + "grad_norm": 3.25, + "kl": 0.0, + "learning_rate": 8.439004812054658e-07, + "logits/chosen": 960523264.0, + "logits/rejected": 825574144.0, + "logps/chosen": -257.74697265625, + "logps/rejected": -464.3077392578125, + "loss": 0.0279, + "rewards/chosen": 3.180740737915039, + "rewards/margins": 14.238163375854493, + "rewards/rejected": -11.057422637939453, + "step": 8896 + }, + { + "epoch": 0.8128825947921425, + "grad_norm": 1.40625, + "kl": 0.0, + "learning_rate": 8.431013171924995e-07, + "logits/chosen": 665825728.0, + "logits/rejected": 470103680.0, + "logps/chosen": -409.00213623046875, + "logps/rejected": -881.26416015625, + "loss": 0.0082, + "rewards/chosen": 4.145536422729492, + "rewards/margins": 13.466178894042969, + "rewards/rejected": -9.320642471313477, + "step": 8897 + }, + { + "epoch": 0.8129739607126542, + "grad_norm": 5.34375, + "kl": 0.0, + "learning_rate": 8.423024969165405e-07, + "logits/chosen": 500733120.0, + "logits/rejected": 381870208.0, + "logps/chosen": -296.610107421875, + "logps/rejected": -393.1484375, + "loss": 0.2246, + "rewards/chosen": 2.8961029052734375, + "rewards/margins": 9.925895690917969, + "rewards/rejected": -7.029792785644531, + "step": 8898 + }, + { + "epoch": 0.8130653266331658, + "grad_norm": 1.6875, + "kl": 0.0, + "learning_rate": 8.415040204436426e-07, + "logits/chosen": 437454592.0, + "logits/rejected": 365142272.0, + "logps/chosen": -427.96201171875, + "logps/rejected": -556.1569010416666, + "loss": 0.0357, + "rewards/chosen": 3.9388404846191407, + "rewards/margins": 14.306376393636068, + "rewards/rejected": -10.367535909016928, + "step": 8899 + }, + { + "epoch": 0.8131566925536775, + "grad_norm": 2.90625, + "kl": 0.0, + "learning_rate": 8.407058878398328e-07, + "logits/chosen": 722854016.0, + "logits/rejected": 1146198400.0, + "logps/chosen": -277.61431884765625, + "logps/rejected": -553.22119140625, + "loss": 0.0165, + "rewards/chosen": 3.887458086013794, + "rewards/margins": 15.608861207962036, + "rewards/rejected": -11.721403121948242, + "step": 8900 + }, + { + "epoch": 0.8132480584741891, + "grad_norm": 3.9375, + "kl": 0.0, + "learning_rate": 8.399080991711095e-07, + "logits/chosen": 833379840.0, + "logits/rejected": 1094724437.3333333, + "logps/chosen": -248.3055419921875, + "logps/rejected": -543.21044921875, + "loss": 0.0458, + "rewards/chosen": 2.7681852340698243, + "rewards/margins": 12.850564257303873, + "rewards/rejected": -10.082379023234049, + "step": 8901 + }, + { + "epoch": 0.8133394243947007, + "grad_norm": 10.3125, + "kl": 0.0, + "learning_rate": 8.391106545034416e-07, + "logits/chosen": 864819507.2, + "logits/rejected": 975832661.3333334, + "logps/chosen": -267.715869140625, + "logps/rejected": -532.139892578125, + "loss": 0.0535, + "rewards/chosen": 3.360617446899414, + "rewards/margins": 10.82764892578125, + "rewards/rejected": -7.467031478881836, + "step": 8902 + }, + { + "epoch": 0.8134307903152124, + "grad_norm": 13.625, + "kl": 0.0, + "learning_rate": 8.383135539027714e-07, + "logits/chosen": 563205120.0, + "logits/rejected": 414057088.0, + "logps/chosen": -427.62750244140625, + "logps/rejected": -418.8331298828125, + "loss": 0.1163, + "rewards/chosen": 3.018977403640747, + "rewards/margins": 11.185804923375448, + "rewards/rejected": -8.166827519734701, + "step": 8903 + }, + { + "epoch": 0.8135221562357241, + "grad_norm": 1.46875, + "kl": 0.0, + "learning_rate": 8.375167974350118e-07, + "logits/chosen": 467381824.0, + "logits/rejected": 436872032.0, + "logps/chosen": -362.7818603515625, + "logps/rejected": -512.9049072265625, + "loss": 0.0073, + "rewards/chosen": 4.446733474731445, + "rewards/margins": 13.684356689453125, + "rewards/rejected": -9.23762321472168, + "step": 8904 + }, + { + "epoch": 0.8136135221562357, + "grad_norm": 3.765625, + "kl": 0.0, + "learning_rate": 8.367203851660472e-07, + "logits/chosen": 837912320.0, + "logits/rejected": 756676608.0, + "logps/chosen": -475.5811767578125, + "logps/rejected": -584.4888916015625, + "loss": 0.0116, + "rewards/chosen": 4.030770301818848, + "rewards/margins": 14.036036491394043, + "rewards/rejected": -10.005266189575195, + "step": 8905 + }, + { + "epoch": 0.8137048880767473, + "grad_norm": 2.265625, + "kl": 0.0, + "learning_rate": 8.359243171617315e-07, + "logits/chosen": 685251686.4, + "logits/rejected": 566052778.6666666, + "logps/chosen": -286.1956787109375, + "logps/rejected": -487.7089029947917, + "loss": 0.0261, + "rewards/chosen": 4.210264587402344, + "rewards/margins": 15.25429890950521, + "rewards/rejected": -11.044034322102865, + "step": 8906 + }, + { + "epoch": 0.813796253997259, + "grad_norm": 1.9296875, + "kl": 0.0, + "learning_rate": 8.35128593487895e-07, + "logits/chosen": 1015763712.0, + "logits/rejected": 1319026176.0, + "logps/chosen": -457.80633544921875, + "logps/rejected": -648.8046875, + "loss": 0.01, + "rewards/chosen": 4.287966728210449, + "rewards/margins": 14.21469783782959, + "rewards/rejected": -9.92673110961914, + "step": 8907 + }, + { + "epoch": 0.8138876199177707, + "grad_norm": 1.8671875, + "kl": 0.0, + "learning_rate": 8.34333214210335e-07, + "logits/chosen": 475514464.0, + "logits/rejected": 464090048.0, + "logps/chosen": -287.77203369140625, + "logps/rejected": -390.34796142578125, + "loss": 0.0121, + "rewards/chosen": 4.471748352050781, + "rewards/margins": 12.092598915100098, + "rewards/rejected": -7.620850563049316, + "step": 8908 + }, + { + "epoch": 0.8139789858382823, + "grad_norm": 1.75, + "kl": 0.0, + "learning_rate": 8.335381793948233e-07, + "logits/chosen": 792103104.0, + "logits/rejected": 1507955968.0, + "logps/chosen": -252.56564331054688, + "logps/rejected": -856.142578125, + "loss": 0.0135, + "rewards/chosen": 3.641272783279419, + "rewards/margins": 15.166239023208618, + "rewards/rejected": -11.5249662399292, + "step": 8909 + }, + { + "epoch": 0.8140703517587939, + "grad_norm": 2.171875, + "kl": 0.0, + "learning_rate": 8.327434891071001e-07, + "logits/chosen": 496204074.6666667, + "logits/rejected": 939835801.6, + "logps/chosen": -253.35245768229166, + "logps/rejected": -444.090625, + "loss": 0.0131, + "rewards/chosen": 3.9231090545654297, + "rewards/margins": 13.434000778198243, + "rewards/rejected": -9.510891723632813, + "step": 8910 + }, + { + "epoch": 0.8141617176793057, + "grad_norm": 3.734375, + "kl": 0.0, + "learning_rate": 8.319491434128795e-07, + "logits/chosen": 593683097.6, + "logits/rejected": 953988010.6666666, + "logps/chosen": -328.431005859375, + "logps/rejected": -533.3623453776041, + "loss": 0.0163, + "rewards/chosen": 4.252291870117188, + "rewards/margins": 12.71037089029948, + "rewards/rejected": -8.458079020182291, + "step": 8911 + }, + { + "epoch": 0.8142530835998173, + "grad_norm": 9.0625, + "kl": 0.0, + "learning_rate": 8.311551423778474e-07, + "logits/chosen": 839338688.0, + "logits/rejected": 470794386.28571427, + "logps/chosen": -302.2420349121094, + "logps/rejected": -355.0491420200893, + "loss": 0.079, + "rewards/chosen": 3.888314962387085, + "rewards/margins": 11.097065005983625, + "rewards/rejected": -7.20875004359654, + "step": 8912 + }, + { + "epoch": 0.8143444495203289, + "grad_norm": 56.5, + "kl": 0.0, + "learning_rate": 8.303614860676601e-07, + "logits/chosen": 711104128.0, + "logits/rejected": 798599808.0, + "logps/chosen": -387.3938293457031, + "logps/rejected": -362.01373291015625, + "loss": 0.0649, + "rewards/chosen": 3.667412757873535, + "rewards/margins": 12.471068382263184, + "rewards/rejected": -8.803655624389648, + "step": 8913 + }, + { + "epoch": 0.8144358154408405, + "grad_norm": 27.75, + "kl": 0.0, + "learning_rate": 8.295681745479455e-07, + "logits/chosen": 661801856.0, + "logits/rejected": 436508757.3333333, + "logps/chosen": -340.9317932128906, + "logps/rejected": -524.552734375, + "loss": 0.0189, + "rewards/chosen": 4.507843017578125, + "rewards/margins": 13.59347661336263, + "rewards/rejected": -9.085633595784506, + "step": 8914 + }, + { + "epoch": 0.8145271813613523, + "grad_norm": 1.9453125, + "kl": 0.0, + "learning_rate": 8.287752078843019e-07, + "logits/chosen": 780361581.7142857, + "logits/rejected": 583403520.0, + "logps/chosen": -340.5945521763393, + "logps/rejected": -1084.5380859375, + "loss": 0.0145, + "rewards/chosen": 4.533756801060268, + "rewards/margins": 13.848759242466517, + "rewards/rejected": -9.31500244140625, + "step": 8915 + }, + { + "epoch": 0.8146185472818639, + "grad_norm": 1.859375, + "kl": 0.0, + "learning_rate": 8.279825861423018e-07, + "logits/chosen": 618585258.6666666, + "logits/rejected": 276177075.2, + "logps/chosen": -405.1741536458333, + "logps/rejected": -325.707763671875, + "loss": 0.011, + "rewards/chosen": 4.630002975463867, + "rewards/margins": 14.61436424255371, + "rewards/rejected": -9.984361267089843, + "step": 8916 + }, + { + "epoch": 0.8147099132023755, + "grad_norm": 87.5, + "kl": 0.0, + "learning_rate": 8.271903093874878e-07, + "logits/chosen": 629943296.0, + "logits/rejected": 557951890.2857143, + "logps/chosen": -401.5010070800781, + "logps/rejected": -596.7451869419643, + "loss": 0.0546, + "rewards/chosen": 3.7603302001953125, + "rewards/margins": 12.799079895019531, + "rewards/rejected": -9.038749694824219, + "step": 8917 + }, + { + "epoch": 0.8148012791228871, + "grad_norm": 74.5, + "kl": 0.0, + "learning_rate": 8.263983776853734e-07, + "logits/chosen": 521927594.6666667, + "logits/rejected": 451751520.0, + "logps/chosen": -382.5857747395833, + "logps/rejected": -334.12457275390625, + "loss": 0.0824, + "rewards/chosen": 3.348519961039225, + "rewards/margins": 9.524855295817057, + "rewards/rejected": -6.176335334777832, + "step": 8918 + }, + { + "epoch": 0.8148926450433989, + "grad_norm": 1.5234375, + "kl": 0.0, + "learning_rate": 8.256067911014426e-07, + "logits/chosen": 497331968.0, + "logits/rejected": 256538880.0, + "logps/chosen": -362.36669921875, + "logps/rejected": -448.40615234375, + "loss": 0.0082, + "rewards/chosen": 3.8781251907348633, + "rewards/margins": 12.704477119445801, + "rewards/rejected": -8.826351928710938, + "step": 8919 + }, + { + "epoch": 0.8149840109639105, + "grad_norm": 4.28125, + "kl": 0.0, + "learning_rate": 8.248155497011545e-07, + "logits/chosen": 487454427.4285714, + "logits/rejected": 201383120.0, + "logps/chosen": -244.64372907366072, + "logps/rejected": -316.0215759277344, + "loss": 0.0365, + "rewards/chosen": 3.6150414603097096, + "rewards/margins": 11.620806421552386, + "rewards/rejected": -8.005764961242676, + "step": 8920 + }, + { + "epoch": 0.8150753768844221, + "grad_norm": 2.734375, + "kl": 0.0, + "learning_rate": 8.240246535499369e-07, + "logits/chosen": 322879308.8, + "logits/rejected": 458888704.0, + "logps/chosen": -310.4107421875, + "logps/rejected": -571.9947916666666, + "loss": 0.0202, + "rewards/chosen": 3.9469619750976563, + "rewards/margins": 13.14074223836263, + "rewards/rejected": -9.193780263264975, + "step": 8921 + }, + { + "epoch": 0.8151667428049337, + "grad_norm": 3.15625, + "kl": 0.0, + "learning_rate": 8.232341027131885e-07, + "logits/chosen": 510655232.0, + "logits/rejected": 690370112.0, + "logps/chosen": -333.8652648925781, + "logps/rejected": -608.863037109375, + "loss": 0.0185, + "rewards/chosen": 4.186404228210449, + "rewards/margins": 14.071131706237793, + "rewards/rejected": -9.884727478027344, + "step": 8922 + }, + { + "epoch": 0.8152581087254455, + "grad_norm": 39.0, + "kl": 0.0, + "learning_rate": 8.22443897256282e-07, + "logits/chosen": 292735360.0, + "logits/rejected": 587352640.0, + "logps/chosen": -186.81495666503906, + "logps/rejected": -439.016357421875, + "loss": 0.0498, + "rewards/chosen": 3.849431037902832, + "rewards/margins": 11.521132469177246, + "rewards/rejected": -7.671701431274414, + "step": 8923 + }, + { + "epoch": 0.8153494746459571, + "grad_norm": 1.796875, + "kl": 0.0, + "learning_rate": 8.216540372445575e-07, + "logits/chosen": 591780608.0, + "logits/rejected": 772331571.2, + "logps/chosen": -411.2253824869792, + "logps/rejected": -492.89169921875, + "loss": 0.0103, + "rewards/chosen": 3.9428879419962564, + "rewards/margins": 14.184265200297038, + "rewards/rejected": -10.241377258300782, + "step": 8924 + }, + { + "epoch": 0.8154408405664687, + "grad_norm": 0.66015625, + "kl": 0.0, + "learning_rate": 8.208645227433338e-07, + "logits/chosen": 316659712.0, + "logits/rejected": 515317077.3333333, + "logps/chosen": -257.06378173828125, + "logps/rejected": -429.6722005208333, + "loss": 0.003, + "rewards/chosen": 4.682219505310059, + "rewards/margins": 13.21790854136149, + "rewards/rejected": -8.535689036051432, + "step": 8925 + }, + { + "epoch": 0.8155322064869803, + "grad_norm": 3.140625, + "kl": 0.0, + "learning_rate": 8.200753538178918e-07, + "logits/chosen": 425987891.2, + "logits/rejected": 493929984.0, + "logps/chosen": -260.62607421875, + "logps/rejected": -595.2474365234375, + "loss": 0.0206, + "rewards/chosen": 3.6598556518554686, + "rewards/margins": 12.299249776204427, + "rewards/rejected": -8.639394124348959, + "step": 8926 + }, + { + "epoch": 0.8156235724074921, + "grad_norm": 4.1875, + "kl": 0.0, + "learning_rate": 8.192865305334891e-07, + "logits/chosen": 358694937.6, + "logits/rejected": 455912192.0, + "logps/chosen": -227.5625732421875, + "logps/rejected": -411.742431640625, + "loss": 0.0318, + "rewards/chosen": 3.4534576416015623, + "rewards/margins": 14.77271728515625, + "rewards/rejected": -11.319259643554688, + "step": 8927 + }, + { + "epoch": 0.8157149383280037, + "grad_norm": 1.9765625, + "kl": 0.0, + "learning_rate": 8.184980529553566e-07, + "logits/chosen": 479389152.0, + "logits/rejected": 577656640.0, + "logps/chosen": -367.8571472167969, + "logps/rejected": -459.834716796875, + "loss": 0.0076, + "rewards/chosen": 4.624824047088623, + "rewards/margins": 12.283989429473877, + "rewards/rejected": -7.659165382385254, + "step": 8928 + }, + { + "epoch": 0.8158063042485153, + "grad_norm": 3.609375, + "kl": 0.0, + "learning_rate": 8.177099211486922e-07, + "logits/chosen": 485162720.0, + "logits/rejected": 1059729664.0, + "logps/chosen": -530.8297119140625, + "logps/rejected": -673.8741048177084, + "loss": 0.008, + "rewards/chosen": 3.5599961280822754, + "rewards/margins": 13.307262897491455, + "rewards/rejected": -9.74726676940918, + "step": 8929 + }, + { + "epoch": 0.8158976701690269, + "grad_norm": 1.875, + "kl": 0.0, + "learning_rate": 8.169221351786683e-07, + "logits/chosen": 733390592.0, + "logits/rejected": 322355104.0, + "logps/chosen": -285.11395263671875, + "logps/rejected": -405.71136474609375, + "loss": 0.0114, + "rewards/chosen": 4.408867359161377, + "rewards/margins": 14.878481388092041, + "rewards/rejected": -10.469614028930664, + "step": 8930 + }, + { + "epoch": 0.8159890360895387, + "grad_norm": 3.890625, + "kl": 0.0, + "learning_rate": 8.16134695110426e-07, + "logits/chosen": 486542028.8, + "logits/rejected": 291547477.3333333, + "logps/chosen": -319.412841796875, + "logps/rejected": -452.4722086588542, + "loss": 0.0209, + "rewards/chosen": 4.419094848632812, + "rewards/margins": 11.982624816894532, + "rewards/rejected": -7.563529968261719, + "step": 8931 + }, + { + "epoch": 0.8160804020100503, + "grad_norm": 1.0859375, + "kl": 0.0, + "learning_rate": 8.153476010090789e-07, + "logits/chosen": 702639360.0, + "logits/rejected": 1093022573.7142856, + "logps/chosen": -334.64678955078125, + "logps/rejected": -477.9135044642857, + "loss": 0.0047, + "rewards/chosen": 3.265881299972534, + "rewards/margins": 13.404665436063494, + "rewards/rejected": -10.13878413609096, + "step": 8932 + }, + { + "epoch": 0.8161717679305619, + "grad_norm": 2.359375, + "kl": 0.0, + "learning_rate": 8.145608529397153e-07, + "logits/chosen": 703535744.0, + "logits/rejected": 430629568.0, + "logps/chosen": -318.11322021484375, + "logps/rejected": -431.4210205078125, + "loss": 0.0134, + "rewards/chosen": 4.200349807739258, + "rewards/margins": 14.511611938476562, + "rewards/rejected": -10.311262130737305, + "step": 8933 + }, + { + "epoch": 0.8162631338510735, + "grad_norm": 3.03125, + "kl": 0.0, + "learning_rate": 8.137744509673895e-07, + "logits/chosen": 721805653.3333334, + "logits/rejected": 527354880.0, + "logps/chosen": -301.4625651041667, + "logps/rejected": -498.173486328125, + "loss": 0.0223, + "rewards/chosen": 3.1401875813802085, + "rewards/margins": 11.814139302571615, + "rewards/rejected": -8.673951721191406, + "step": 8934 + }, + { + "epoch": 0.8163544997715853, + "grad_norm": 4.15625, + "kl": 0.0, + "learning_rate": 8.12988395157131e-07, + "logits/chosen": 644166229.3333334, + "logits/rejected": 183190112.0, + "logps/chosen": -301.16579182942706, + "logps/rejected": -236.68679809570312, + "loss": 0.1327, + "rewards/chosen": 3.1913331349690757, + "rewards/margins": 12.677776654561361, + "rewards/rejected": -9.486443519592285, + "step": 8935 + }, + { + "epoch": 0.8164458656920969, + "grad_norm": 4.84375, + "kl": 0.0, + "learning_rate": 8.122026855739374e-07, + "logits/chosen": 661012940.8, + "logits/rejected": 411232000.0, + "logps/chosen": -428.17109375, + "logps/rejected": -473.4461263020833, + "loss": 0.029, + "rewards/chosen": 3.461621856689453, + "rewards/margins": 13.535728073120117, + "rewards/rejected": -10.074106216430664, + "step": 8936 + }, + { + "epoch": 0.8165372316126085, + "grad_norm": 31.0, + "kl": 0.0, + "learning_rate": 8.114173222827814e-07, + "logits/chosen": 354405952.0, + "logits/rejected": 280568480.0, + "logps/chosen": -177.38046264648438, + "logps/rejected": -422.8042907714844, + "loss": 0.1415, + "rewards/chosen": 2.0143680572509766, + "rewards/margins": 11.663761138916016, + "rewards/rejected": -9.649393081665039, + "step": 8937 + }, + { + "epoch": 0.8166285975331201, + "grad_norm": 3.859375, + "kl": 0.0, + "learning_rate": 8.106323053486054e-07, + "logits/chosen": 358751232.0, + "logits/rejected": 459124992.0, + "logps/chosen": -244.7323974609375, + "logps/rejected": -642.9475911458334, + "loss": 0.0275, + "rewards/chosen": 3.3851173400878904, + "rewards/margins": 14.733179219563802, + "rewards/rejected": -11.348061879475912, + "step": 8938 + }, + { + "epoch": 0.8167199634536318, + "grad_norm": 3.140625, + "kl": 0.0, + "learning_rate": 8.098476348363227e-07, + "logits/chosen": 447299626.6666667, + "logits/rejected": 331803648.0, + "logps/chosen": -340.97609456380206, + "logps/rejected": -434.43829345703125, + "loss": 0.0171, + "rewards/chosen": 4.443006197611491, + "rewards/margins": 13.439290682474773, + "rewards/rejected": -8.996284484863281, + "step": 8939 + }, + { + "epoch": 0.8168113293741435, + "grad_norm": 4.34375, + "kl": 0.0, + "learning_rate": 8.090633108108159e-07, + "logits/chosen": 435523328.0, + "logits/rejected": 384315562.6666667, + "logps/chosen": -308.080419921875, + "logps/rejected": -456.5555013020833, + "loss": 0.0388, + "rewards/chosen": 3.025721549987793, + "rewards/margins": 11.718223253885904, + "rewards/rejected": -8.692501703898111, + "step": 8940 + }, + { + "epoch": 0.8169026952946551, + "grad_norm": 1.1796875, + "kl": 0.0, + "learning_rate": 8.082793333369454e-07, + "logits/chosen": 617764224.0, + "logits/rejected": 454696490.6666667, + "logps/chosen": -427.8846435546875, + "logps/rejected": -482.8884684244792, + "loss": 0.0065, + "rewards/chosen": 3.6749207973480225, + "rewards/margins": 13.180963277816772, + "rewards/rejected": -9.50604248046875, + "step": 8941 + }, + { + "epoch": 0.8169940612151667, + "grad_norm": 3.6875, + "kl": 0.0, + "learning_rate": 8.074957024795366e-07, + "logits/chosen": 712728746.6666666, + "logits/rejected": 471184230.4, + "logps/chosen": -359.7705078125, + "logps/rejected": -445.069580078125, + "loss": 0.019, + "rewards/chosen": 3.000246047973633, + "rewards/margins": 12.38575325012207, + "rewards/rejected": -9.385507202148437, + "step": 8942 + }, + { + "epoch": 0.8170854271356784, + "grad_norm": 2.078125, + "kl": 0.0, + "learning_rate": 8.067124183033887e-07, + "logits/chosen": 582161664.0, + "logits/rejected": 553909888.0, + "logps/chosen": -307.8686218261719, + "logps/rejected": -370.1846008300781, + "loss": 0.0132, + "rewards/chosen": 4.035717010498047, + "rewards/margins": 11.518861770629883, + "rewards/rejected": -7.483144760131836, + "step": 8943 + }, + { + "epoch": 0.8171767930561901, + "grad_norm": 3.6875, + "kl": 0.0, + "learning_rate": 8.05929480873272e-07, + "logits/chosen": 547785642.6666666, + "logits/rejected": 411107532.8, + "logps/chosen": -269.31886800130206, + "logps/rejected": -373.8927734375, + "loss": 0.022, + "rewards/chosen": 3.3730637232462564, + "rewards/margins": 12.185423342386882, + "rewards/rejected": -8.812359619140626, + "step": 8944 + }, + { + "epoch": 0.8172681589767017, + "grad_norm": 1.7890625, + "kl": 0.0, + "learning_rate": 8.051468902539272e-07, + "logits/chosen": 492039987.2, + "logits/rejected": 436976042.6666667, + "logps/chosen": -283.650537109375, + "logps/rejected": -514.8693033854166, + "loss": 0.0128, + "rewards/chosen": 4.2069244384765625, + "rewards/margins": 15.872215270996094, + "rewards/rejected": -11.665290832519531, + "step": 8945 + }, + { + "epoch": 0.8173595248972133, + "grad_norm": 3.609375, + "kl": 0.0, + "learning_rate": 8.043646465100696e-07, + "logits/chosen": 690500096.0, + "logits/rejected": 1510240000.0, + "logps/chosen": -358.5704345703125, + "logps/rejected": -629.46533203125, + "loss": 0.0307, + "rewards/chosen": 3.1913846333821616, + "rewards/margins": 12.869616826375326, + "rewards/rejected": -9.678232192993164, + "step": 8946 + }, + { + "epoch": 0.817450890817725, + "grad_norm": 5.03125, + "kl": 0.0, + "learning_rate": 8.035827497063825e-07, + "logits/chosen": 607085568.0, + "logits/rejected": 410084576.0, + "logps/chosen": -264.8023376464844, + "logps/rejected": -565.659912109375, + "loss": 0.1353, + "rewards/chosen": 1.7994797229766846, + "rewards/margins": 11.269341707229614, + "rewards/rejected": -9.46986198425293, + "step": 8947 + }, + { + "epoch": 0.8175422567382367, + "grad_norm": 2.46875, + "kl": 0.0, + "learning_rate": 8.028011999075213e-07, + "logits/chosen": 433849728.0, + "logits/rejected": 530596992.0, + "logps/chosen": -234.05947875976562, + "logps/rejected": -541.1885375976562, + "loss": 0.0136, + "rewards/chosen": 4.440309524536133, + "rewards/margins": 12.622527122497559, + "rewards/rejected": -8.182217597961426, + "step": 8948 + }, + { + "epoch": 0.8176336226587483, + "grad_norm": 1.046875, + "kl": 0.0, + "learning_rate": 8.020199971781118e-07, + "logits/chosen": 759641856.0, + "logits/rejected": 666356736.0, + "logps/chosen": -523.9771728515625, + "logps/rejected": -709.4507649739584, + "loss": 0.0044, + "rewards/chosen": 4.126745700836182, + "rewards/margins": 14.883331139882406, + "rewards/rejected": -10.756585439046225, + "step": 8949 + }, + { + "epoch": 0.8177249885792599, + "grad_norm": 2.25, + "kl": 0.0, + "learning_rate": 8.012391415827542e-07, + "logits/chosen": 481887573.3333333, + "logits/rejected": 346335462.4, + "logps/chosen": -283.84922281901044, + "logps/rejected": -394.78515625, + "loss": 0.0161, + "rewards/chosen": 3.361830393473307, + "rewards/margins": 12.887077585856119, + "rewards/rejected": -9.525247192382812, + "step": 8950 + }, + { + "epoch": 0.8178163544997716, + "grad_norm": 2.234375, + "kl": 0.0, + "learning_rate": 8.004586331860176e-07, + "logits/chosen": 962420053.3333334, + "logits/rejected": 540456960.0, + "logps/chosen": -270.90386962890625, + "logps/rejected": -370.38934326171875, + "loss": 0.1393, + "rewards/chosen": 2.494161605834961, + "rewards/margins": 11.899491310119629, + "rewards/rejected": -9.405329704284668, + "step": 8951 + }, + { + "epoch": 0.8179077204202833, + "grad_norm": 1.6875, + "kl": 0.0, + "learning_rate": 7.996784720524431e-07, + "logits/chosen": 539552896.0, + "logits/rejected": 331233173.3333333, + "logps/chosen": -263.13494873046875, + "logps/rejected": -336.7786865234375, + "loss": 0.012, + "rewards/chosen": 3.035304307937622, + "rewards/margins": 13.100133180618286, + "rewards/rejected": -10.064828872680664, + "step": 8952 + }, + { + "epoch": 0.8179990863407949, + "grad_norm": 4.03125, + "kl": 0.0, + "learning_rate": 7.988986582465403e-07, + "logits/chosen": 556046336.0, + "logits/rejected": 493595818.6666667, + "logps/chosen": -218.1306640625, + "logps/rejected": -234.37471516927084, + "loss": 0.0316, + "rewards/chosen": 3.280020523071289, + "rewards/margins": 11.85472755432129, + "rewards/rejected": -8.57470703125, + "step": 8953 + }, + { + "epoch": 0.8180904522613065, + "grad_norm": 2.84375, + "kl": 0.0, + "learning_rate": 7.981191918327952e-07, + "logits/chosen": 663801216.0, + "logits/rejected": 538443840.0, + "logps/chosen": -499.448974609375, + "logps/rejected": -684.8814697265625, + "loss": 0.0201, + "rewards/chosen": 3.7086172103881836, + "rewards/margins": 13.92080307006836, + "rewards/rejected": -10.212185859680176, + "step": 8954 + }, + { + "epoch": 0.8181818181818182, + "grad_norm": 4.15625, + "kl": 0.0, + "learning_rate": 7.97340072875662e-07, + "logits/chosen": 618065493.3333334, + "logits/rejected": 384315584.0, + "logps/chosen": -348.1385091145833, + "logps/rejected": -493.64971923828125, + "loss": 0.022, + "rewards/chosen": 4.261836687723796, + "rewards/margins": 11.019056002298992, + "rewards/rejected": -6.757219314575195, + "step": 8955 + }, + { + "epoch": 0.8182731841023299, + "grad_norm": 2.765625, + "kl": 0.0, + "learning_rate": 7.965613014395662e-07, + "logits/chosen": 459894869.3333333, + "logits/rejected": 316320819.2, + "logps/chosen": -311.7582600911458, + "logps/rejected": -264.75732421875, + "loss": 0.0168, + "rewards/chosen": 3.5272394816080728, + "rewards/margins": 10.331160990397136, + "rewards/rejected": -6.803921508789062, + "step": 8956 + }, + { + "epoch": 0.8183645500228415, + "grad_norm": 7.28125, + "kl": 0.0, + "learning_rate": 7.957828775889037e-07, + "logits/chosen": 498984320.0, + "logits/rejected": 473053081.6, + "logps/chosen": -336.78615315755206, + "logps/rejected": -394.897314453125, + "loss": 0.0127, + "rewards/chosen": 4.404155731201172, + "rewards/margins": 12.970189666748047, + "rewards/rejected": -8.566033935546875, + "step": 8957 + }, + { + "epoch": 0.8184559159433531, + "grad_norm": 2.375, + "kl": 0.0, + "learning_rate": 7.950048013880457e-07, + "logits/chosen": 632387136.0, + "logits/rejected": 612703872.0, + "logps/chosen": -445.15179443359375, + "logps/rejected": -530.2133178710938, + "loss": 0.0131, + "rewards/chosen": 4.112051486968994, + "rewards/margins": 13.04702615737915, + "rewards/rejected": -8.934974670410156, + "step": 8958 + }, + { + "epoch": 0.8185472818638648, + "grad_norm": 3.984375, + "kl": 0.0, + "learning_rate": 7.942270729013302e-07, + "logits/chosen": 735759189.3333334, + "logits/rejected": 892459072.0, + "logps/chosen": -247.6941935221354, + "logps/rejected": -673.361328125, + "loss": 0.1527, + "rewards/chosen": 2.0021637280782065, + "rewards/margins": 12.266968568166098, + "rewards/rejected": -10.26480484008789, + "step": 8959 + }, + { + "epoch": 0.8186386477843764, + "grad_norm": 8.3125, + "kl": 0.0, + "learning_rate": 7.934496921930684e-07, + "logits/chosen": 180238112.0, + "logits/rejected": 413893632.0, + "logps/chosen": -150.7972412109375, + "logps/rejected": -468.430419921875, + "loss": 0.1129, + "rewards/chosen": -2.2099838256835938, + "rewards/margins": 7.239416939871651, + "rewards/rejected": -9.449400765555245, + "step": 8960 + }, + { + "epoch": 0.8187300137048881, + "grad_norm": 3.234375, + "kl": 0.0, + "learning_rate": 7.926726593275414e-07, + "logits/chosen": 497126400.0, + "logits/rejected": 348764757.3333333, + "logps/chosen": -358.2393798828125, + "logps/rejected": -497.2919921875, + "loss": 0.0265, + "rewards/chosen": 3.2908695220947264, + "rewards/margins": 14.42882932027181, + "rewards/rejected": -11.137959798177084, + "step": 8961 + }, + { + "epoch": 0.8188213796253997, + "grad_norm": 2.90625, + "kl": 0.0, + "learning_rate": 7.918959743690052e-07, + "logits/chosen": 421392554.6666667, + "logits/rejected": 254436096.0, + "logps/chosen": -327.2376302083333, + "logps/rejected": -305.70977783203125, + "loss": 0.0151, + "rewards/chosen": 4.237244923909505, + "rewards/margins": 12.479060490926106, + "rewards/rejected": -8.241815567016602, + "step": 8962 + }, + { + "epoch": 0.8189127455459114, + "grad_norm": 2.453125, + "kl": 0.0, + "learning_rate": 7.911196373816826e-07, + "logits/chosen": 622007808.0, + "logits/rejected": 850790464.0, + "logps/chosen": -476.3079833984375, + "logps/rejected": -660.609130859375, + "loss": 0.0115, + "rewards/chosen": 4.424324989318848, + "rewards/margins": 15.140176773071289, + "rewards/rejected": -10.715851783752441, + "step": 8963 + }, + { + "epoch": 0.819004111466423, + "grad_norm": 2.09375, + "kl": 0.0, + "learning_rate": 7.903436484297699e-07, + "logits/chosen": 665367680.0, + "logits/rejected": 302495795.2, + "logps/chosen": -354.4518636067708, + "logps/rejected": -368.860888671875, + "loss": 0.014, + "rewards/chosen": 3.3169987996419272, + "rewards/margins": 13.320194753011068, + "rewards/rejected": -10.00319595336914, + "step": 8964 + }, + { + "epoch": 0.8190954773869347, + "grad_norm": 2.625, + "kl": 0.0, + "learning_rate": 7.895680075774342e-07, + "logits/chosen": 1353392512.0, + "logits/rejected": 576115797.3333334, + "logps/chosen": -257.634765625, + "logps/rejected": -387.5115152994792, + "loss": 0.0099, + "rewards/chosen": 3.5357491970062256, + "rewards/margins": 12.169856150945028, + "rewards/rejected": -8.634106953938803, + "step": 8965 + }, + { + "epoch": 0.8191868433074463, + "grad_norm": 4.125, + "kl": 0.0, + "learning_rate": 7.887927148888124e-07, + "logits/chosen": 790736640.0, + "logits/rejected": 547028352.0, + "logps/chosen": -484.6152648925781, + "logps/rejected": -389.73333740234375, + "loss": 0.1133, + "rewards/chosen": 3.647724151611328, + "rewards/margins": 9.466371536254883, + "rewards/rejected": -5.818647384643555, + "step": 8966 + }, + { + "epoch": 0.819278209227958, + "grad_norm": 1.03125, + "kl": 0.0, + "learning_rate": 7.880177704280168e-07, + "logits/chosen": 476418848.0, + "logits/rejected": 614783360.0, + "logps/chosen": -302.9097900390625, + "logps/rejected": -610.9845581054688, + "loss": 0.005, + "rewards/chosen": 5.028175354003906, + "rewards/margins": 13.979094505310059, + "rewards/rejected": -8.950919151306152, + "step": 8967 + }, + { + "epoch": 0.8193695751484696, + "grad_norm": 3.96875, + "kl": 0.0, + "learning_rate": 7.872431742591268e-07, + "logits/chosen": 613550272.0, + "logits/rejected": 768560000.0, + "logps/chosen": -280.84356689453125, + "logps/rejected": -424.92828369140625, + "loss": 0.0249, + "rewards/chosen": 3.170722007751465, + "rewards/margins": 12.888606071472168, + "rewards/rejected": -9.717884063720703, + "step": 8968 + }, + { + "epoch": 0.8194609410689813, + "grad_norm": 1.4765625, + "kl": 0.0, + "learning_rate": 7.864689264461939e-07, + "logits/chosen": 747545856.0, + "logits/rejected": 812023859.2, + "logps/chosen": -420.2834065755208, + "logps/rejected": -428.381103515625, + "loss": 0.0077, + "rewards/chosen": 4.049418131510417, + "rewards/margins": 13.206061045328777, + "rewards/rejected": -9.15664291381836, + "step": 8969 + }, + { + "epoch": 0.8195523069894929, + "grad_norm": 3.109375, + "kl": 0.0, + "learning_rate": 7.85695027053241e-07, + "logits/chosen": 480284000.0, + "logits/rejected": 793619200.0, + "logps/chosen": -168.92318725585938, + "logps/rejected": -486.8056945800781, + "loss": 0.1471, + "rewards/chosen": 1.0883545875549316, + "rewards/margins": 11.254476070404053, + "rewards/rejected": -10.166121482849121, + "step": 8970 + }, + { + "epoch": 0.8196436729100046, + "grad_norm": 1.890625, + "kl": 0.0, + "learning_rate": 7.849214761442637e-07, + "logits/chosen": 532423424.0, + "logits/rejected": 420858419.2, + "logps/chosen": -306.6376139322917, + "logps/rejected": -397.779296875, + "loss": 0.0112, + "rewards/chosen": 3.8308868408203125, + "rewards/margins": 12.220359802246094, + "rewards/rejected": -8.389472961425781, + "step": 8971 + }, + { + "epoch": 0.8197350388305162, + "grad_norm": 4.46875, + "kl": 0.0, + "learning_rate": 7.841482737832262e-07, + "logits/chosen": 537828249.6, + "logits/rejected": 530133888.0, + "logps/chosen": -410.2328125, + "logps/rejected": -629.7578938802084, + "loss": 0.0167, + "rewards/chosen": 4.176788330078125, + "rewards/margins": 13.083429972330729, + "rewards/rejected": -8.906641642252604, + "step": 8972 + }, + { + "epoch": 0.8198264047510279, + "grad_norm": 4.375, + "kl": 0.0, + "learning_rate": 7.833754200340665e-07, + "logits/chosen": 617535274.6666666, + "logits/rejected": 361766048.0, + "logps/chosen": -273.68267822265625, + "logps/rejected": -344.41485595703125, + "loss": 0.0265, + "rewards/chosen": 4.078802108764648, + "rewards/margins": 13.38193130493164, + "rewards/rejected": -9.303129196166992, + "step": 8973 + }, + { + "epoch": 0.8199177706715395, + "grad_norm": 3.75, + "kl": 0.0, + "learning_rate": 7.826029149606912e-07, + "logits/chosen": 395647274.6666667, + "logits/rejected": 339618201.6, + "logps/chosen": -392.4315999348958, + "logps/rejected": -456.3154296875, + "loss": 0.0138, + "rewards/chosen": 4.312965393066406, + "rewards/margins": 12.125389099121094, + "rewards/rejected": -7.8124237060546875, + "step": 8974 + }, + { + "epoch": 0.8200091365920512, + "grad_norm": 2.765625, + "kl": 0.0, + "learning_rate": 7.818307586269791e-07, + "logits/chosen": 1715689344.0, + "logits/rejected": 776419968.0, + "logps/chosen": -563.2471313476562, + "logps/rejected": -692.966064453125, + "loss": 0.0164, + "rewards/chosen": 3.544978380203247, + "rewards/margins": 13.373915910720825, + "rewards/rejected": -9.828937530517578, + "step": 8975 + }, + { + "epoch": 0.8201005025125628, + "grad_norm": 6.75, + "kl": 0.0, + "learning_rate": 7.81058951096782e-07, + "logits/chosen": 428007509.3333333, + "logits/rejected": 529053286.4, + "logps/chosen": -222.88688151041666, + "logps/rejected": -418.475244140625, + "loss": 0.1191, + "rewards/chosen": 2.597407341003418, + "rewards/margins": 11.816943168640137, + "rewards/rejected": -9.219535827636719, + "step": 8976 + }, + { + "epoch": 0.8201918684330745, + "grad_norm": 4.5625, + "kl": 0.0, + "learning_rate": 7.802874924339204e-07, + "logits/chosen": 420872106.6666667, + "logits/rejected": 309140992.0, + "logps/chosen": -414.1573893229167, + "logps/rejected": -366.6072021484375, + "loss": 0.0246, + "rewards/chosen": 2.877312342325846, + "rewards/margins": 10.581313196818034, + "rewards/rejected": -7.704000854492188, + "step": 8977 + }, + { + "epoch": 0.8202832343535861, + "grad_norm": 1.8515625, + "kl": 0.0, + "learning_rate": 7.795163827021862e-07, + "logits/chosen": 479648992.0, + "logits/rejected": 554700544.0, + "logps/chosen": -449.448486328125, + "logps/rejected": -507.1833801269531, + "loss": 0.0117, + "rewards/chosen": 3.843519449234009, + "rewards/margins": 15.229534387588501, + "rewards/rejected": -11.386014938354492, + "step": 8978 + }, + { + "epoch": 0.8203746002740978, + "grad_norm": 15.75, + "kl": 0.0, + "learning_rate": 7.787456219653438e-07, + "logits/chosen": 901553792.0, + "logits/rejected": 663748096.0, + "logps/chosen": -186.80026245117188, + "logps/rejected": -425.9969482421875, + "loss": 0.1243, + "rewards/chosen": 0.06660008430480957, + "rewards/margins": 7.907902320226033, + "rewards/rejected": -7.841302235921224, + "step": 8979 + }, + { + "epoch": 0.8204659661946094, + "grad_norm": 2.703125, + "kl": 0.0, + "learning_rate": 7.779752102871279e-07, + "logits/chosen": 537135616.0, + "logits/rejected": 685473689.6, + "logps/chosen": -286.1601969401042, + "logps/rejected": -430.425390625, + "loss": 0.017, + "rewards/chosen": 3.1779438654581704, + "rewards/margins": 12.189325396219889, + "rewards/rejected": -9.011381530761719, + "step": 8980 + }, + { + "epoch": 0.820557332115121, + "grad_norm": 4.8125, + "kl": 0.0, + "learning_rate": 7.772051477312436e-07, + "logits/chosen": 538938828.8, + "logits/rejected": 643375872.0, + "logps/chosen": -256.473388671875, + "logps/rejected": -364.3000895182292, + "loss": 0.0328, + "rewards/chosen": 3.228781890869141, + "rewards/margins": 11.499691518147788, + "rewards/rejected": -8.270909627278646, + "step": 8981 + }, + { + "epoch": 0.8206486980356327, + "grad_norm": 0.84375, + "kl": 0.0, + "learning_rate": 7.764354343613679e-07, + "logits/chosen": 616261440.0, + "logits/rejected": 421936810.6666667, + "logps/chosen": -390.4331970214844, + "logps/rejected": -564.491943359375, + "loss": 0.0029, + "rewards/chosen": 4.579453468322754, + "rewards/margins": 13.72103468577067, + "rewards/rejected": -9.141581217447916, + "step": 8982 + }, + { + "epoch": 0.8207400639561444, + "grad_norm": 3.234375, + "kl": 0.0, + "learning_rate": 7.756660702411506e-07, + "logits/chosen": 808374272.0, + "logits/rejected": 386891264.0, + "logps/chosen": -237.93868001302084, + "logps/rejected": -451.0933532714844, + "loss": 0.0281, + "rewards/chosen": 3.7202021280924478, + "rewards/margins": 13.338990847269693, + "rewards/rejected": -9.618788719177246, + "step": 8983 + }, + { + "epoch": 0.820831429876656, + "grad_norm": 2.4375, + "kl": 0.0, + "learning_rate": 7.7489705543421e-07, + "logits/chosen": 596874112.0, + "logits/rejected": 698365760.0, + "logps/chosen": -284.10504150390625, + "logps/rejected": -593.51953125, + "loss": 0.0166, + "rewards/chosen": 4.202347278594971, + "rewards/margins": 15.572668552398682, + "rewards/rejected": -11.370321273803711, + "step": 8984 + }, + { + "epoch": 0.8209227957971676, + "grad_norm": 3.546875, + "kl": 0.0, + "learning_rate": 7.741283900041363e-07, + "logits/chosen": 522271232.0, + "logits/rejected": 624794538.6666666, + "logps/chosen": -485.8017578125, + "logps/rejected": -716.4793294270834, + "loss": 0.0156, + "rewards/chosen": 4.224647521972656, + "rewards/margins": 13.51954460144043, + "rewards/rejected": -9.294897079467773, + "step": 8985 + }, + { + "epoch": 0.8210141617176793, + "grad_norm": 3.546875, + "kl": 0.0, + "learning_rate": 7.733600740144915e-07, + "logits/chosen": 517248921.6, + "logits/rejected": 355124949.3333333, + "logps/chosen": -397.4634521484375, + "logps/rejected": -478.8365071614583, + "loss": 0.0304, + "rewards/chosen": 3.07741641998291, + "rewards/margins": 14.37679131825765, + "rewards/rejected": -11.29937489827474, + "step": 8986 + }, + { + "epoch": 0.821105527638191, + "grad_norm": 50.0, + "kl": 0.0, + "learning_rate": 7.725921075288068e-07, + "logits/chosen": 868786380.8, + "logits/rejected": 361003221.3333333, + "logps/chosen": -188.18157958984375, + "logps/rejected": -392.9977213541667, + "loss": 0.2028, + "rewards/chosen": 1.9026287078857422, + "rewards/margins": 12.976557540893555, + "rewards/rejected": -11.073928833007812, + "step": 8987 + }, + { + "epoch": 0.8211968935587026, + "grad_norm": 1.046875, + "kl": 0.0, + "learning_rate": 7.718244906105876e-07, + "logits/chosen": 390525107.2, + "logits/rejected": 498172458.6666667, + "logps/chosen": -122.4869140625, + "logps/rejected": -490.09716796875, + "loss": 0.0101, + "rewards/chosen": 4.159634399414062, + "rewards/margins": 13.68385518391927, + "rewards/rejected": -9.524220784505209, + "step": 8988 + }, + { + "epoch": 0.8212882594792142, + "grad_norm": 2.890625, + "kl": 0.0, + "learning_rate": 7.710572233233082e-07, + "logits/chosen": 454209280.0, + "logits/rejected": 422438752.0, + "logps/chosen": -339.5455017089844, + "logps/rejected": -446.1008605957031, + "loss": 0.0219, + "rewards/chosen": 3.1764302253723145, + "rewards/margins": 11.678795337677002, + "rewards/rejected": -8.502365112304688, + "step": 8989 + }, + { + "epoch": 0.8213796253997259, + "grad_norm": 36.25, + "kl": 0.0, + "learning_rate": 7.702903057304146e-07, + "logits/chosen": 471537728.0, + "logits/rejected": 363700928.0, + "logps/chosen": -207.48748779296875, + "logps/rejected": -454.1763916015625, + "loss": 0.0345, + "rewards/chosen": 3.8921351432800293, + "rewards/margins": 11.516730308532715, + "rewards/rejected": -7.6245951652526855, + "step": 8990 + }, + { + "epoch": 0.8214709913202376, + "grad_norm": 4.21875, + "kl": 0.0, + "learning_rate": 7.695237378953224e-07, + "logits/chosen": 488817590.85714287, + "logits/rejected": 666033024.0, + "logps/chosen": -263.30796595982144, + "logps/rejected": -484.18975830078125, + "loss": 0.0352, + "rewards/chosen": 3.381941114153181, + "rewards/margins": 12.914262090410505, + "rewards/rejected": -9.532320976257324, + "step": 8991 + }, + { + "epoch": 0.8215623572407492, + "grad_norm": 2.21875, + "kl": 0.0, + "learning_rate": 7.687575198814223e-07, + "logits/chosen": 283942528.0, + "logits/rejected": 716528768.0, + "logps/chosen": -302.6019592285156, + "logps/rejected": -735.199462890625, + "loss": 0.0109, + "rewards/chosen": 4.119209289550781, + "rewards/margins": 13.695262908935547, + "rewards/rejected": -9.576053619384766, + "step": 8992 + }, + { + "epoch": 0.8216537231612608, + "grad_norm": 7.90625, + "kl": 0.0, + "learning_rate": 7.679916517520719e-07, + "logits/chosen": 467416768.0, + "logits/rejected": 404256288.0, + "logps/chosen": -272.2381591796875, + "logps/rejected": -418.2347106933594, + "loss": 0.0245, + "rewards/chosen": 3.3910627365112305, + "rewards/margins": 12.47340202331543, + "rewards/rejected": -9.0823392868042, + "step": 8993 + }, + { + "epoch": 0.8217450890817725, + "grad_norm": 2.21875, + "kl": 0.0, + "learning_rate": 7.672261335706011e-07, + "logits/chosen": 480865920.0, + "logits/rejected": 620354090.6666666, + "logps/chosen": -327.8199157714844, + "logps/rejected": -594.2135823567709, + "loss": 0.0104, + "rewards/chosen": 3.2122421264648438, + "rewards/margins": 11.814032236735025, + "rewards/rejected": -8.601790110270182, + "step": 8994 + }, + { + "epoch": 0.8218364550022842, + "grad_norm": 4.09375, + "kl": 0.0, + "learning_rate": 7.664609654003118e-07, + "logits/chosen": 841927338.6666666, + "logits/rejected": 391976601.6, + "logps/chosen": -382.0062255859375, + "logps/rejected": -341.8568359375, + "loss": 0.1225, + "rewards/chosen": 1.8961607615152996, + "rewards/margins": 11.528019587198893, + "rewards/rejected": -9.631858825683594, + "step": 8995 + }, + { + "epoch": 0.8219278209227958, + "grad_norm": 1.265625, + "kl": 0.0, + "learning_rate": 7.656961473044744e-07, + "logits/chosen": 763582549.3333334, + "logits/rejected": 550741299.2, + "logps/chosen": -355.1365559895833, + "logps/rejected": -566.27275390625, + "loss": 0.0063, + "rewards/chosen": 4.243965148925781, + "rewards/margins": 15.889937591552734, + "rewards/rejected": -11.645972442626952, + "step": 8996 + }, + { + "epoch": 0.8220191868433074, + "grad_norm": 3.8125, + "kl": 0.0, + "learning_rate": 7.649316793463357e-07, + "logits/chosen": 637691093.3333334, + "logits/rejected": 278902348.8, + "logps/chosen": -627.6136881510416, + "logps/rejected": -400.9909423828125, + "loss": 0.0163, + "rewards/chosen": 3.676990826924642, + "rewards/margins": 13.937125714619954, + "rewards/rejected": -10.260134887695312, + "step": 8997 + }, + { + "epoch": 0.8221105527638191, + "grad_norm": 12.25, + "kl": 0.0, + "learning_rate": 7.641675615891081e-07, + "logits/chosen": 706878656.0, + "logits/rejected": 549862912.0, + "logps/chosen": -480.6170349121094, + "logps/rejected": -452.8279724121094, + "loss": 0.0405, + "rewards/chosen": 3.121612548828125, + "rewards/margins": 9.73669147491455, + "rewards/rejected": -6.615078926086426, + "step": 8998 + }, + { + "epoch": 0.8222019186843308, + "grad_norm": 1.4140625, + "kl": 0.0, + "learning_rate": 7.634037940959777e-07, + "logits/chosen": 600169386.6666666, + "logits/rejected": 738674739.2, + "logps/chosen": -352.3439127604167, + "logps/rejected": -401.059765625, + "loss": 0.0065, + "rewards/chosen": 4.2484385172526045, + "rewards/margins": 12.890786997477214, + "rewards/rejected": -8.642348480224609, + "step": 8999 + }, + { + "epoch": 0.8222932846048424, + "grad_norm": 8.125, + "kl": 0.0, + "learning_rate": 7.626403769300989e-07, + "logits/chosen": 305532586.6666667, + "logits/rejected": 487559833.6, + "logps/chosen": -201.81974283854166, + "logps/rejected": -541.155859375, + "loss": 0.1239, + "rewards/chosen": 1.6569228172302246, + "rewards/margins": 10.903645992279053, + "rewards/rejected": -9.246723175048828, + "step": 9000 + }, + { + "epoch": 0.822384650525354, + "grad_norm": 3.9375, + "kl": 0.0, + "learning_rate": 7.618773101546023e-07, + "logits/chosen": 520818912.0, + "logits/rejected": 427125952.0, + "logps/chosen": -422.50909423828125, + "logps/rejected": -485.48040771484375, + "loss": 0.0235, + "rewards/chosen": 3.5475635528564453, + "rewards/margins": 14.391670227050781, + "rewards/rejected": -10.844106674194336, + "step": 9001 + }, + { + "epoch": 0.8224760164458657, + "grad_norm": 2.859375, + "kl": 0.0, + "learning_rate": 7.611145938325848e-07, + "logits/chosen": 521175910.4, + "logits/rejected": 336749781.3333333, + "logps/chosen": -314.42900390625, + "logps/rejected": -656.0244140625, + "loss": 0.0156, + "rewards/chosen": 4.148109054565429, + "rewards/margins": 18.721203740437826, + "rewards/rejected": -14.573094685872396, + "step": 9002 + }, + { + "epoch": 0.8225673823663774, + "grad_norm": 2.953125, + "kl": 0.0, + "learning_rate": 7.603522280271164e-07, + "logits/chosen": 778893653.3333334, + "logits/rejected": 705990246.4, + "logps/chosen": -178.50958251953125, + "logps/rejected": -677.44345703125, + "loss": 0.016, + "rewards/chosen": 3.6321821212768555, + "rewards/margins": 12.541895866394043, + "rewards/rejected": -8.909713745117188, + "step": 9003 + }, + { + "epoch": 0.822658748286889, + "grad_norm": 7.21875, + "kl": 0.0, + "learning_rate": 7.595902128012367e-07, + "logits/chosen": 478564522.6666667, + "logits/rejected": 534924928.0, + "logps/chosen": -393.2390950520833, + "logps/rejected": -625.86767578125, + "loss": 0.0478, + "rewards/chosen": 3.7223119735717773, + "rewards/margins": 14.19509506225586, + "rewards/rejected": -10.472783088684082, + "step": 9004 + }, + { + "epoch": 0.8227501142074006, + "grad_norm": 1.953125, + "kl": 0.0, + "learning_rate": 7.588285482179597e-07, + "logits/chosen": 512582176.0, + "logits/rejected": 384485024.0, + "logps/chosen": -399.52227783203125, + "logps/rejected": -557.611083984375, + "loss": 0.0145, + "rewards/chosen": 3.5746817588806152, + "rewards/margins": 12.467566013336182, + "rewards/rejected": -8.892884254455566, + "step": 9005 + }, + { + "epoch": 0.8228414801279123, + "grad_norm": 3.8125, + "kl": 0.0, + "learning_rate": 7.580672343402662e-07, + "logits/chosen": 627077888.0, + "logits/rejected": 487607616.0, + "logps/chosen": -326.86029052734375, + "logps/rejected": -609.090087890625, + "loss": 0.0264, + "rewards/chosen": 2.949280261993408, + "rewards/margins": 13.31132459640503, + "rewards/rejected": -10.362044334411621, + "step": 9006 + }, + { + "epoch": 0.822932846048424, + "grad_norm": 0.0257568359375, + "kl": 0.0, + "learning_rate": 7.573062712311092e-07, + "logits/rejected": 497794048.0, + "logps/rejected": -633.7330322265625, + "loss": 0.0001, + "rewards/rejected": -10.14096450805664, + "step": 9007 + }, + { + "epoch": 0.8230242119689356, + "grad_norm": 1.3203125, + "kl": 0.0, + "learning_rate": 7.565456589534137e-07, + "logits/chosen": 717323008.0, + "logits/rejected": 511282585.6, + "logps/chosen": -522.9439290364584, + "logps/rejected": -393.0132080078125, + "loss": 0.007, + "rewards/chosen": 4.050459861755371, + "rewards/margins": 11.80244426727295, + "rewards/rejected": -7.751984405517578, + "step": 9008 + }, + { + "epoch": 0.8231155778894472, + "grad_norm": 6.40625, + "kl": 0.0, + "learning_rate": 7.557853975700763e-07, + "logits/chosen": 817124454.4, + "logits/rejected": 506178986.6666667, + "logps/chosen": -255.5486328125, + "logps/rejected": -538.9375406901041, + "loss": 0.1215, + "rewards/chosen": 3.338429641723633, + "rewards/margins": 12.377370071411132, + "rewards/rejected": -9.0389404296875, + "step": 9009 + }, + { + "epoch": 0.8232069438099588, + "grad_norm": 4.9375, + "kl": 0.0, + "learning_rate": 7.550254871439633e-07, + "logits/chosen": 521630048.0, + "logits/rejected": 804650944.0, + "logps/chosen": -458.59515380859375, + "logps/rejected": -510.244384765625, + "loss": 0.0263, + "rewards/chosen": 3.1217586994171143, + "rewards/margins": 13.261870622634888, + "rewards/rejected": -10.140111923217773, + "step": 9010 + }, + { + "epoch": 0.8232983097304706, + "grad_norm": 2.234375, + "kl": 0.0, + "learning_rate": 7.542659277379122e-07, + "logits/chosen": 770831552.0, + "logits/rejected": 376551360.0, + "logps/chosen": -480.0733642578125, + "logps/rejected": -507.3092346191406, + "loss": 0.0119, + "rewards/chosen": 4.107220649719238, + "rewards/margins": 13.794966697692871, + "rewards/rejected": -9.687746047973633, + "step": 9011 + }, + { + "epoch": 0.8233896756509822, + "grad_norm": 2.53125, + "kl": 0.0, + "learning_rate": 7.535067194147294e-07, + "logits/chosen": 636097382.4, + "logits/rejected": 702920362.6666666, + "logps/chosen": -172.20692138671876, + "logps/rejected": -480.6088460286458, + "loss": 0.0169, + "rewards/chosen": 4.23160400390625, + "rewards/margins": 13.57421849568685, + "rewards/rejected": -9.3426144917806, + "step": 9012 + }, + { + "epoch": 0.8234810415714938, + "grad_norm": 3.640625, + "kl": 0.0, + "learning_rate": 7.527478622371981e-07, + "logits/chosen": 581444096.0, + "logits/rejected": 412228224.0, + "logps/chosen": -331.94964599609375, + "logps/rejected": -277.1607360839844, + "loss": 0.0307, + "rewards/chosen": 3.2233424186706543, + "rewards/margins": 12.16600751876831, + "rewards/rejected": -8.942665100097656, + "step": 9013 + }, + { + "epoch": 0.8235724074920054, + "grad_norm": 1.4296875, + "kl": 0.0, + "learning_rate": 7.519893562680663e-07, + "logits/chosen": 453343680.0, + "logits/rejected": 467802528.0, + "logps/chosen": -393.1964111328125, + "logps/rejected": -495.95281982421875, + "loss": 0.0093, + "rewards/chosen": 3.998488664627075, + "rewards/margins": 13.010697603225708, + "rewards/rejected": -9.012208938598633, + "step": 9014 + }, + { + "epoch": 0.8236637734125172, + "grad_norm": 3.125, + "kl": 0.0, + "learning_rate": 7.512312015700568e-07, + "logits/chosen": 695404672.0, + "logits/rejected": 646810816.0, + "logps/chosen": -357.6683044433594, + "logps/rejected": -450.9006042480469, + "loss": 0.0132, + "rewards/chosen": 3.8668317794799805, + "rewards/margins": 12.308438301086426, + "rewards/rejected": -8.441606521606445, + "step": 9015 + }, + { + "epoch": 0.8237551393330288, + "grad_norm": 1.2890625, + "kl": 0.0, + "learning_rate": 7.50473398205861e-07, + "logits/chosen": 368075616.0, + "logits/rejected": 483463616.0, + "logps/chosen": -242.5082244873047, + "logps/rejected": -392.341796875, + "loss": 0.0081, + "rewards/chosen": 4.223956108093262, + "rewards/margins": 13.2606840133667, + "rewards/rejected": -9.036727905273438, + "step": 9016 + }, + { + "epoch": 0.8238465052535404, + "grad_norm": 2.859375, + "kl": 0.0, + "learning_rate": 7.497159462381409e-07, + "logits/chosen": 653801301.3333334, + "logits/rejected": 560746048.0, + "logps/chosen": -400.0207926432292, + "logps/rejected": -530.14306640625, + "loss": 0.0186, + "rewards/chosen": 3.8233461380004883, + "rewards/margins": 12.830451011657715, + "rewards/rejected": -9.007104873657227, + "step": 9017 + }, + { + "epoch": 0.823937871174052, + "grad_norm": 1.6953125, + "kl": 0.0, + "learning_rate": 7.489588457295339e-07, + "logits/chosen": 501144320.0, + "logits/rejected": 526067712.0, + "logps/chosen": -342.2928873697917, + "logps/rejected": -530.635595703125, + "loss": 0.0085, + "rewards/chosen": 4.334850629170735, + "rewards/margins": 15.467245038350423, + "rewards/rejected": -11.132394409179687, + "step": 9018 + }, + { + "epoch": 0.8240292370945638, + "grad_norm": 2.265625, + "kl": 0.0, + "learning_rate": 7.482020967426429e-07, + "logits/chosen": 579070668.8, + "logits/rejected": 517981056.0, + "logps/chosen": -342.498046875, + "logps/rejected": -363.2322998046875, + "loss": 0.0217, + "rewards/chosen": 3.574640655517578, + "rewards/margins": 12.806156539916993, + "rewards/rejected": -9.231515884399414, + "step": 9019 + }, + { + "epoch": 0.8241206030150754, + "grad_norm": 1.8671875, + "kl": 0.0, + "learning_rate": 7.474456993400453e-07, + "logits/chosen": 440593834.6666667, + "logits/rejected": 441007513.6, + "logps/chosen": -235.6768798828125, + "logps/rejected": -538.93974609375, + "loss": 0.0155, + "rewards/chosen": 3.643930435180664, + "rewards/margins": 13.271955490112305, + "rewards/rejected": -9.62802505493164, + "step": 9020 + }, + { + "epoch": 0.824211968935587, + "grad_norm": 3.984375, + "kl": 0.0, + "learning_rate": 7.466896535842865e-07, + "logits/chosen": 671869781.3333334, + "logits/rejected": 350605408.0, + "logps/chosen": -368.5294596354167, + "logps/rejected": -513.7008666992188, + "loss": 0.0253, + "rewards/chosen": 3.5536416371663413, + "rewards/margins": 13.18686040242513, + "rewards/rejected": -9.633218765258789, + "step": 9021 + }, + { + "epoch": 0.8243033348560986, + "grad_norm": 2.953125, + "kl": 0.0, + "learning_rate": 7.459339595378861e-07, + "logits/chosen": 570948044.8, + "logits/rejected": 688347136.0, + "logps/chosen": -207.49677734375, + "logps/rejected": -518.2150065104166, + "loss": 0.0261, + "rewards/chosen": 3.753287506103516, + "rewards/margins": 14.858836364746093, + "rewards/rejected": -11.105548858642578, + "step": 9022 + }, + { + "epoch": 0.8243947007766104, + "grad_norm": 0.6484375, + "kl": 0.0, + "learning_rate": 7.451786172633329e-07, + "logits/chosen": 385197269.3333333, + "logits/rejected": 477984563.2, + "logps/chosen": -222.71809895833334, + "logps/rejected": -532.6060546875, + "loss": 0.0038, + "rewards/chosen": 4.630056381225586, + "rewards/margins": 15.878041458129882, + "rewards/rejected": -11.247985076904296, + "step": 9023 + }, + { + "epoch": 0.824486066697122, + "grad_norm": 1.109375, + "kl": 0.0, + "learning_rate": 7.444236268230864e-07, + "logits/chosen": 968443776.0, + "logits/rejected": 629358890.6666666, + "logps/chosen": -164.63644409179688, + "logps/rejected": -432.275146484375, + "loss": 0.0078, + "rewards/chosen": 3.707977294921875, + "rewards/margins": 13.127786000569662, + "rewards/rejected": -9.419808705647787, + "step": 9024 + }, + { + "epoch": 0.8245774326176336, + "grad_norm": 2.984375, + "kl": 0.0, + "learning_rate": 7.436689882795755e-07, + "logits/chosen": 641369856.0, + "logits/rejected": 621489152.0, + "logps/chosen": -303.2214111328125, + "logps/rejected": -561.0179036458334, + "loss": 0.0202, + "rewards/chosen": 3.5856727600097655, + "rewards/margins": 14.48232625325521, + "rewards/rejected": -10.896653493245443, + "step": 9025 + }, + { + "epoch": 0.8246687985381452, + "grad_norm": 0.984375, + "kl": 0.0, + "learning_rate": 7.429147016952054e-07, + "logits/chosen": 343617578.6666667, + "logits/rejected": 424465254.4, + "logps/chosen": -251.0357869466146, + "logps/rejected": -430.827392578125, + "loss": 0.009, + "rewards/chosen": 4.5793107350667315, + "rewards/margins": 11.590620549519857, + "rewards/rejected": -7.011309814453125, + "step": 9026 + }, + { + "epoch": 0.824760164458657, + "grad_norm": 21.375, + "kl": 0.0, + "learning_rate": 7.421607671323461e-07, + "logits/chosen": 544018304.0, + "logps/chosen": -242.96849060058594, + "loss": 0.0684, + "rewards/chosen": 3.3144278526306152, + "step": 9027 + }, + { + "epoch": 0.8248515303791686, + "grad_norm": 2.3125, + "kl": 0.0, + "learning_rate": 7.414071846533416e-07, + "logits/chosen": 493785152.0, + "logits/rejected": 362205376.0, + "logps/chosen": -308.3310852050781, + "logps/rejected": -425.15576171875, + "loss": 0.0163, + "rewards/chosen": 3.447843313217163, + "rewards/margins": 11.775136709213257, + "rewards/rejected": -8.327293395996094, + "step": 9028 + }, + { + "epoch": 0.8249428962996802, + "grad_norm": 1.078125, + "kl": 0.0, + "learning_rate": 7.406539543205066e-07, + "logits/chosen": 418514528.0, + "logits/rejected": 462842843.4285714, + "logps/chosen": -468.3679504394531, + "logps/rejected": -508.99351283482144, + "loss": 0.0038, + "rewards/chosen": 3.5269012451171875, + "rewards/margins": 13.547872270856585, + "rewards/rejected": -10.020971025739398, + "step": 9029 + }, + { + "epoch": 0.8250342622201918, + "grad_norm": 9.0, + "kl": 0.0, + "learning_rate": 7.399010761961245e-07, + "logits/chosen": 446005888.0, + "logits/rejected": 342089504.0, + "logps/chosen": -301.831787109375, + "logps/rejected": -546.3841552734375, + "loss": 0.0385, + "rewards/chosen": 3.963974952697754, + "rewards/margins": 14.748652458190918, + "rewards/rejected": -10.784677505493164, + "step": 9030 + }, + { + "epoch": 0.8251256281407036, + "grad_norm": 67.0, + "kl": 0.0, + "learning_rate": 7.391485503424539e-07, + "logits/chosen": 447045120.0, + "logits/rejected": 316848332.8, + "logps/chosen": -232.57661946614584, + "logps/rejected": -211.4103271484375, + "loss": 0.0727, + "rewards/chosen": 3.625194231669108, + "rewards/margins": 9.640764300028483, + "rewards/rejected": -6.015570068359375, + "step": 9031 + }, + { + "epoch": 0.8252169940612152, + "grad_norm": 30.625, + "kl": 0.0, + "learning_rate": 7.383963768217211e-07, + "logits/chosen": 617844096.0, + "logits/rejected": 656898880.0, + "logps/chosen": -302.63677978515625, + "logps/rejected": -634.2530517578125, + "loss": 0.1149, + "rewards/chosen": 2.9892125129699707, + "rewards/margins": 13.603774547576904, + "rewards/rejected": -10.614562034606934, + "step": 9032 + }, + { + "epoch": 0.8253083599817268, + "grad_norm": 1.5390625, + "kl": 0.0, + "learning_rate": 7.376445556961215e-07, + "logits/chosen": 531929376.0, + "logits/rejected": 493160320.0, + "logps/chosen": -207.8238525390625, + "logps/rejected": -489.319091796875, + "loss": 0.0095, + "rewards/chosen": 3.2527551651000977, + "rewards/margins": 12.233643531799316, + "rewards/rejected": -8.980888366699219, + "step": 9033 + }, + { + "epoch": 0.8253997259022384, + "grad_norm": 0.87890625, + "kl": 0.0, + "learning_rate": 7.368930870278268e-07, + "logits/chosen": 540408320.0, + "logits/rejected": 578896230.4, + "logps/chosen": -283.5805257161458, + "logps/rejected": -646.08671875, + "loss": 0.0038, + "rewards/chosen": 4.846656163533528, + "rewards/margins": 14.43042386372884, + "rewards/rejected": -9.583767700195313, + "step": 9034 + }, + { + "epoch": 0.8254910918227502, + "grad_norm": 1.609375, + "kl": 0.0, + "learning_rate": 7.361419708789752e-07, + "logits/chosen": 743387443.2, + "logits/rejected": 557428266.6666666, + "logps/chosen": -409.74560546875, + "logps/rejected": -363.4798177083333, + "loss": 0.0096, + "rewards/chosen": 4.296317672729492, + "rewards/margins": 12.869384384155273, + "rewards/rejected": -8.573066711425781, + "step": 9035 + }, + { + "epoch": 0.8255824577432618, + "grad_norm": 1.2890625, + "kl": 0.0, + "learning_rate": 7.353912073116771e-07, + "logits/chosen": 473449984.0, + "logits/rejected": 322647232.0, + "logps/chosen": -333.61676025390625, + "logps/rejected": -358.760009765625, + "loss": 0.0084, + "rewards/chosen": 4.414216041564941, + "rewards/margins": 14.281161308288574, + "rewards/rejected": -9.866945266723633, + "step": 9036 + }, + { + "epoch": 0.8256738236637734, + "grad_norm": 0.95703125, + "kl": 0.0, + "learning_rate": 7.346407963880137e-07, + "logits/chosen": 542344618.6666666, + "logits/rejected": 479562547.2, + "logps/chosen": -410.263916015625, + "logps/rejected": -555.500927734375, + "loss": 0.0043, + "rewards/chosen": 4.555951754252116, + "rewards/margins": 15.085118929545086, + "rewards/rejected": -10.529167175292969, + "step": 9037 + }, + { + "epoch": 0.825765189584285, + "grad_norm": 33.75, + "kl": 0.0, + "learning_rate": 7.338907381700356e-07, + "logits/chosen": 683374899.2, + "logits/rejected": 885581909.3333334, + "logps/chosen": -218.0951171875, + "logps/rejected": -527.1385498046875, + "loss": 0.1193, + "rewards/chosen": 2.719960021972656, + "rewards/margins": 13.627631632486978, + "rewards/rejected": -10.907671610514322, + "step": 9038 + }, + { + "epoch": 0.8258565555047968, + "grad_norm": 1.8671875, + "kl": 0.0, + "learning_rate": 7.331410327197686e-07, + "logits/chosen": 728811878.4, + "logits/rejected": 457563733.3333333, + "logps/chosen": -451.0357421875, + "logps/rejected": -551.8665771484375, + "loss": 0.0093, + "rewards/chosen": 4.712841415405274, + "rewards/margins": 15.188695017496745, + "rewards/rejected": -10.47585360209147, + "step": 9039 + }, + { + "epoch": 0.8259479214253084, + "grad_norm": 1.3359375, + "kl": 0.0, + "learning_rate": 7.323916800992042e-07, + "logits/chosen": 341168981.3333333, + "logits/rejected": 376451430.4, + "logps/chosen": -326.9604899088542, + "logps/rejected": -444.14697265625, + "loss": 0.0084, + "rewards/chosen": 4.603853225708008, + "rewards/margins": 14.50835304260254, + "rewards/rejected": -9.904499816894532, + "step": 9040 + }, + { + "epoch": 0.82603928734582, + "grad_norm": 3.765625, + "kl": 0.0, + "learning_rate": 7.31642680370308e-07, + "logits/chosen": 207780320.0, + "logits/rejected": 270382720.0, + "logps/chosen": -337.04315185546875, + "logps/rejected": -414.2877197265625, + "loss": 0.0145, + "rewards/chosen": 3.7676308155059814, + "rewards/margins": 11.714254140853882, + "rewards/rejected": -7.9466233253479, + "step": 9041 + }, + { + "epoch": 0.8261306532663316, + "grad_norm": 0.59765625, + "kl": 0.0, + "learning_rate": 7.308940335950138e-07, + "logits/chosen": 576801664.0, + "logits/rejected": 620504371.2, + "logps/chosen": -193.5550333658854, + "logps/rejected": -555.22294921875, + "loss": 0.0036, + "rewards/chosen": 4.837506612141927, + "rewards/margins": 13.989404042561848, + "rewards/rejected": -9.151897430419922, + "step": 9042 + }, + { + "epoch": 0.8262220191868433, + "grad_norm": 3.375, + "kl": 0.0, + "learning_rate": 7.301457398352302e-07, + "logits/chosen": 568477824.0, + "logits/rejected": 640665536.0, + "logps/chosen": -377.12115478515625, + "logps/rejected": -563.0721435546875, + "loss": 0.0274, + "rewards/chosen": 2.9726624488830566, + "rewards/margins": 13.750494480133057, + "rewards/rejected": -10.77783203125, + "step": 9043 + }, + { + "epoch": 0.826313385107355, + "grad_norm": 2.0, + "kl": 0.0, + "learning_rate": 7.293977991528328e-07, + "logits/chosen": 534457792.0, + "logits/rejected": 540805290.6666666, + "logps/chosen": -373.9853515625, + "logps/rejected": -609.0470784505209, + "loss": 0.0104, + "rewards/chosen": 3.3010056018829346, + "rewards/margins": 12.955141623814901, + "rewards/rejected": -9.654136021931967, + "step": 9044 + }, + { + "epoch": 0.8264047510278666, + "grad_norm": 2.375, + "kl": 0.0, + "learning_rate": 7.286502116096688e-07, + "logits/chosen": 659623765.3333334, + "logits/rejected": 469190656.0, + "logps/chosen": -385.6084798177083, + "logps/rejected": -237.939697265625, + "loss": 0.0249, + "rewards/chosen": 3.7540562947591147, + "rewards/margins": 10.252742608388266, + "rewards/rejected": -6.49868631362915, + "step": 9045 + }, + { + "epoch": 0.8264961169483782, + "grad_norm": 3.515625, + "kl": 0.0, + "learning_rate": 7.279029772675572e-07, + "logits/chosen": 404555110.4, + "logits/rejected": 497169749.3333333, + "logps/chosen": -226.2669189453125, + "logps/rejected": -676.8519287109375, + "loss": 0.0286, + "rewards/chosen": 3.499322509765625, + "rewards/margins": 13.964430745442709, + "rewards/rejected": -10.465108235677084, + "step": 9046 + }, + { + "epoch": 0.82658748286889, + "grad_norm": 2.65625, + "kl": 0.0, + "learning_rate": 7.271560961882879e-07, + "logits/chosen": 231302208.0, + "logits/rejected": 527415808.0, + "logps/chosen": -340.8694254557292, + "logps/rejected": -465.0755859375, + "loss": 0.0114, + "rewards/chosen": 4.1800384521484375, + "rewards/margins": 13.7262451171875, + "rewards/rejected": -9.546206665039062, + "step": 9047 + }, + { + "epoch": 0.8266788487894016, + "grad_norm": 7.3125, + "kl": 0.0, + "learning_rate": 7.264095684336209e-07, + "logits/chosen": 530389913.6, + "logits/rejected": 481260330.6666667, + "logps/chosen": -300.03955078125, + "logps/rejected": -290.31939697265625, + "loss": 0.11, + "rewards/chosen": 3.803350830078125, + "rewards/margins": 9.235549036661784, + "rewards/rejected": -5.432198206583659, + "step": 9048 + }, + { + "epoch": 0.8267702147099132, + "grad_norm": 2.671875, + "kl": 0.0, + "learning_rate": 7.256633940652868e-07, + "logits/chosen": 486743168.0, + "logits/rejected": 275171392.0, + "logps/chosen": -305.9001159667969, + "logps/rejected": -420.3643798828125, + "loss": 0.0132, + "rewards/chosen": 4.128812789916992, + "rewards/margins": 12.856836318969727, + "rewards/rejected": -8.728023529052734, + "step": 9049 + }, + { + "epoch": 0.8268615806304248, + "grad_norm": 49.25, + "kl": 0.0, + "learning_rate": 7.249175731449875e-07, + "logits/chosen": 535531456.0, + "logits/rejected": 400622933.3333333, + "logps/chosen": -228.16322326660156, + "logps/rejected": -484.8757731119792, + "loss": 0.0446, + "rewards/chosen": 1.5792266130447388, + "rewards/margins": 11.245158314704895, + "rewards/rejected": -9.665931701660156, + "step": 9050 + }, + { + "epoch": 0.8269529465509365, + "grad_norm": 2.15625, + "kl": 0.0, + "learning_rate": 7.241721057343942e-07, + "logits/chosen": 525923108.5714286, + "logits/rejected": 391161792.0, + "logps/chosen": -304.93868582589283, + "logps/rejected": -225.4424285888672, + "loss": 0.0155, + "rewards/chosen": 4.551641464233398, + "rewards/margins": 12.468879222869873, + "rewards/rejected": -7.917237758636475, + "step": 9051 + }, + { + "epoch": 0.8270443124714482, + "grad_norm": 1.8671875, + "kl": 0.0, + "learning_rate": 7.234269918951526e-07, + "logits/chosen": 581532288.0, + "logits/rejected": 403919530.6666667, + "logps/chosen": -235.7516326904297, + "logps/rejected": -604.9493001302084, + "loss": 0.0091, + "rewards/chosen": 3.560359477996826, + "rewards/margins": 15.473760763804117, + "rewards/rejected": -11.913401285807291, + "step": 9052 + }, + { + "epoch": 0.8271356783919598, + "grad_norm": 8.4375, + "kl": 0.0, + "learning_rate": 7.226822316888754e-07, + "logits/chosen": 548103168.0, + "logits/rejected": 405741354.6666667, + "logps/chosen": -478.662353515625, + "logps/rejected": -625.5292154947916, + "loss": 0.0269, + "rewards/chosen": 2.9819931983947754, + "rewards/margins": 13.009291172027588, + "rewards/rejected": -10.027297973632812, + "step": 9053 + }, + { + "epoch": 0.8272270443124714, + "grad_norm": 4.96875, + "kl": 0.0, + "learning_rate": 7.219378251771475e-07, + "logits/chosen": 410930816.0, + "logits/rejected": 781030912.0, + "logps/chosen": -364.2455139160156, + "logps/rejected": -681.6194458007812, + "loss": 0.0285, + "rewards/chosen": 3.2744028568267822, + "rewards/margins": 15.378593683242798, + "rewards/rejected": -12.104190826416016, + "step": 9054 + }, + { + "epoch": 0.8273184102329831, + "grad_norm": 1.9453125, + "kl": 0.0, + "learning_rate": 7.211937724215229e-07, + "logits/chosen": 373336160.0, + "logits/rejected": 435836736.0, + "logps/chosen": -166.45953369140625, + "logps/rejected": -473.04241943359375, + "loss": 0.0182, + "rewards/chosen": 4.011329650878906, + "rewards/margins": 13.434240341186523, + "rewards/rejected": -9.422910690307617, + "step": 9055 + }, + { + "epoch": 0.8274097761534948, + "grad_norm": 0.47265625, + "kl": 0.0, + "learning_rate": 7.204500734835307e-07, + "logits/chosen": 659265792.0, + "logits/rejected": 662242450.2857143, + "logps/chosen": -77.08744812011719, + "logps/rejected": -723.4601004464286, + "loss": 0.0026, + "rewards/chosen": 3.96927809715271, + "rewards/margins": 15.20287925856454, + "rewards/rejected": -11.23360116141183, + "step": 9056 + }, + { + "epoch": 0.8275011420740064, + "grad_norm": 4.375, + "kl": 0.0, + "learning_rate": 7.197067284246668e-07, + "logits/chosen": 429274086.4, + "logits/rejected": 699988437.3333334, + "logps/chosen": -343.03515625, + "logps/rejected": -342.8867594401042, + "loss": 0.0292, + "rewards/chosen": 4.034903335571289, + "rewards/margins": 12.173652776082356, + "rewards/rejected": -8.138749440511068, + "step": 9057 + }, + { + "epoch": 0.827592507994518, + "grad_norm": 26.875, + "kl": 0.0, + "learning_rate": 7.189637373063979e-07, + "logits/chosen": 793502208.0, + "logits/rejected": 476209561.6, + "logps/chosen": -543.577392578125, + "logps/rejected": -371.2021728515625, + "loss": 0.08, + "rewards/chosen": 4.704473495483398, + "rewards/margins": 10.455399703979491, + "rewards/rejected": -5.750926208496094, + "step": 9058 + }, + { + "epoch": 0.8276838739150297, + "grad_norm": 1.5859375, + "kl": 0.0, + "learning_rate": 7.182211001901634e-07, + "logits/chosen": 792003635.2, + "logits/rejected": 568674602.6666666, + "logps/chosen": -306.6103515625, + "logps/rejected": -561.7462158203125, + "loss": 0.0099, + "rewards/chosen": 4.276824951171875, + "rewards/margins": 16.729086558024086, + "rewards/rejected": -12.452261606852213, + "step": 9059 + }, + { + "epoch": 0.8277752398355414, + "grad_norm": 1.09375, + "kl": 0.0, + "learning_rate": 7.174788171373731e-07, + "logits/chosen": 560661196.8, + "logits/rejected": 577411840.0, + "logps/chosen": -394.9935546875, + "logps/rejected": -610.651611328125, + "loss": 0.0079, + "rewards/chosen": 4.449913406372071, + "rewards/margins": 15.421486282348633, + "rewards/rejected": -10.971572875976562, + "step": 9060 + }, + { + "epoch": 0.827866605756053, + "grad_norm": 5.9375, + "kl": 0.0, + "learning_rate": 7.167368882094055e-07, + "logits/chosen": 483773440.0, + "logits/rejected": 492005333.3333333, + "logps/chosen": -410.277001953125, + "logps/rejected": -345.87548828125, + "loss": 0.0237, + "rewards/chosen": 4.066530227661133, + "rewards/margins": 13.874255498250326, + "rewards/rejected": -9.807725270589193, + "step": 9061 + }, + { + "epoch": 0.8279579716765646, + "grad_norm": 3.515625, + "kl": 0.0, + "learning_rate": 7.159953134676118e-07, + "logits/chosen": 525302848.0, + "logits/rejected": 562000640.0, + "logps/chosen": -429.45184326171875, + "logps/rejected": -523.2761840820312, + "loss": 0.015, + "rewards/chosen": 4.323727607727051, + "rewards/margins": 14.8711576461792, + "rewards/rejected": -10.547430038452148, + "step": 9062 + }, + { + "epoch": 0.8280493375970763, + "grad_norm": 5.4375, + "kl": 0.0, + "learning_rate": 7.152540929733126e-07, + "logits/chosen": 453381120.0, + "logits/rejected": 432746432.0, + "logps/chosen": -290.9591064453125, + "logps/rejected": -580.0364990234375, + "loss": 0.0376, + "rewards/chosen": 2.7437963485717773, + "rewards/margins": 11.19437026977539, + "rewards/rejected": -8.450573921203613, + "step": 9063 + }, + { + "epoch": 0.828140703517588, + "grad_norm": 1.34375, + "kl": 0.0, + "learning_rate": 7.145132267878025e-07, + "logits/chosen": 598842368.0, + "logits/rejected": 899978752.0, + "logps/chosen": -364.822265625, + "logps/rejected": -321.68072509765625, + "loss": 0.0051, + "rewards/chosen": 4.516879081726074, + "rewards/margins": 12.270189603169758, + "rewards/rejected": -7.753310521443685, + "step": 9064 + }, + { + "epoch": 0.8282320694380996, + "grad_norm": 2.140625, + "kl": 0.0, + "learning_rate": 7.137727149723417e-07, + "logits/chosen": 561578368.0, + "logits/rejected": 508466688.0, + "logps/chosen": -430.98199462890625, + "logps/rejected": -434.0307922363281, + "loss": 0.0109, + "rewards/chosen": 3.8784050941467285, + "rewards/margins": 13.032328128814697, + "rewards/rejected": -9.153923034667969, + "step": 9065 + }, + { + "epoch": 0.8283234353586112, + "grad_norm": 7.6875, + "kl": 0.0, + "learning_rate": 7.130325575881652e-07, + "logits/chosen": 795729920.0, + "logits/rejected": 940138496.0, + "logps/chosen": -343.2164713541667, + "logps/rejected": -387.9433349609375, + "loss": 0.0201, + "rewards/chosen": 3.197920481363932, + "rewards/margins": 11.008594767252603, + "rewards/rejected": -7.8106742858886715, + "step": 9066 + }, + { + "epoch": 0.8284148012791229, + "grad_norm": 1.109375, + "kl": 0.0, + "learning_rate": 7.122927546964753e-07, + "logits/chosen": 506852032.0, + "logits/rejected": 654829909.3333334, + "logps/chosen": -326.56439208984375, + "logps/rejected": -415.2696940104167, + "loss": 0.005, + "rewards/chosen": 4.055600643157959, + "rewards/margins": 12.558909893035889, + "rewards/rejected": -8.50330924987793, + "step": 9067 + }, + { + "epoch": 0.8285061671996345, + "grad_norm": 4.21875, + "kl": 0.0, + "learning_rate": 7.115533063584495e-07, + "logits/chosen": 541504512.0, + "logits/rejected": 543584512.0, + "logps/chosen": -306.538720703125, + "logps/rejected": -324.94639078776044, + "loss": 0.0379, + "rewards/chosen": 3.2359954833984377, + "rewards/margins": 11.754356892903647, + "rewards/rejected": -8.518361409505209, + "step": 9068 + }, + { + "epoch": 0.8285975331201462, + "grad_norm": 1.5546875, + "kl": 0.0, + "learning_rate": 7.108142126352313e-07, + "logits/chosen": 694443178.6666666, + "logits/rejected": 524979904.0, + "logps/chosen": -288.9670817057292, + "logps/rejected": -570.6787109375, + "loss": 0.0127, + "rewards/chosen": 4.1926619211832685, + "rewards/margins": 14.728715578715008, + "rewards/rejected": -10.536053657531738, + "step": 9069 + }, + { + "epoch": 0.8286888990406578, + "grad_norm": 6.3125, + "kl": 0.0, + "learning_rate": 7.100754735879378e-07, + "logits/chosen": 418189260.8, + "logits/rejected": 650489429.3333334, + "logps/chosen": -255.068408203125, + "logps/rejected": -311.9053548177083, + "loss": 0.0323, + "rewards/chosen": 4.787556076049805, + "rewards/margins": 10.233705266316733, + "rewards/rejected": -5.446149190266927, + "step": 9070 + }, + { + "epoch": 0.8287802649611695, + "grad_norm": 2.140625, + "kl": 0.0, + "learning_rate": 7.093370892776558e-07, + "logits/chosen": 929180800.0, + "logits/rejected": 394649088.0, + "logps/chosen": -463.8707580566406, + "logps/rejected": -592.79443359375, + "loss": 0.0095, + "rewards/chosen": 3.253936767578125, + "rewards/margins": 14.14272689819336, + "rewards/rejected": -10.888790130615234, + "step": 9071 + }, + { + "epoch": 0.8288716308816811, + "grad_norm": 14.375, + "kl": 0.0, + "learning_rate": 7.085990597654413e-07, + "logits/chosen": 310679232.0, + "logits/rejected": 268564352.0, + "logps/chosen": -266.4225158691406, + "logps/rejected": -348.688720703125, + "loss": 0.0252, + "rewards/chosen": 4.082447528839111, + "rewards/margins": 11.915156841278076, + "rewards/rejected": -7.832709312438965, + "step": 9072 + }, + { + "epoch": 0.8289629968021928, + "grad_norm": 3.5, + "kl": 0.0, + "learning_rate": 7.078613851123251e-07, + "logits/chosen": 445627136.0, + "logits/rejected": 500815520.0, + "logps/chosen": -362.340576171875, + "logps/rejected": -463.01953125, + "loss": 0.0235, + "rewards/chosen": 3.8909994761149087, + "rewards/margins": 13.122281710306803, + "rewards/rejected": -9.231282234191895, + "step": 9073 + }, + { + "epoch": 0.8290543627227044, + "grad_norm": 2.046875, + "kl": 0.0, + "learning_rate": 7.071240653793049e-07, + "logits/chosen": 505404211.2, + "logits/rejected": 377597269.3333333, + "logps/chosen": -335.97265625, + "logps/rejected": -414.2783203125, + "loss": 0.016, + "rewards/chosen": 3.9017318725585937, + "rewards/margins": 15.93679402669271, + "rewards/rejected": -12.035062154134115, + "step": 9074 + }, + { + "epoch": 0.8291457286432161, + "grad_norm": 0.4140625, + "kl": 0.0, + "learning_rate": 7.063871006273498e-07, + "logits/chosen": 765388032.0, + "logits/rejected": 555872402.2857143, + "logps/chosen": -505.52239990234375, + "logps/rejected": -636.0659877232143, + "loss": 0.0016, + "rewards/chosen": 4.557745456695557, + "rewards/margins": 16.682274614061626, + "rewards/rejected": -12.124529157366071, + "step": 9075 + }, + { + "epoch": 0.8292370945637277, + "grad_norm": 4.3125, + "kl": 0.0, + "learning_rate": 7.056504909173994e-07, + "logits/chosen": 400877926.4, + "logits/rejected": 423307264.0, + "logps/chosen": -331.4497802734375, + "logps/rejected": -483.059814453125, + "loss": 0.0264, + "rewards/chosen": 3.562744140625, + "rewards/margins": 12.372358957926432, + "rewards/rejected": -8.809614817301432, + "step": 9076 + }, + { + "epoch": 0.8293284604842394, + "grad_norm": 2.15625, + "kl": 0.0, + "learning_rate": 7.049142363103662e-07, + "logits/chosen": 932328704.0, + "logits/rejected": 761486400.0, + "logps/chosen": -348.4444580078125, + "logps/rejected": -291.93499755859375, + "loss": 0.015, + "rewards/chosen": 4.075284957885742, + "rewards/margins": 11.357694625854492, + "rewards/rejected": -7.28240966796875, + "step": 9077 + }, + { + "epoch": 0.829419826404751, + "grad_norm": 2.359375, + "kl": 0.0, + "learning_rate": 7.041783368671312e-07, + "logits/chosen": 515562325.3333333, + "logits/rejected": 577879296.0, + "logps/chosen": -311.2357991536458, + "logps/rejected": -570.848291015625, + "loss": 0.016, + "rewards/chosen": 4.046227773030599, + "rewards/margins": 12.923226674397785, + "rewards/rejected": -8.876998901367188, + "step": 9078 + }, + { + "epoch": 0.8295111923252627, + "grad_norm": 0.8203125, + "kl": 0.0, + "learning_rate": 7.034427926485459e-07, + "logits/chosen": 709373952.0, + "logits/rejected": 474836992.0, + "logps/chosen": -296.547607421875, + "logps/rejected": -496.5842590332031, + "loss": 0.0057, + "rewards/chosen": 4.524494171142578, + "rewards/margins": 13.107152938842773, + "rewards/rejected": -8.582658767700195, + "step": 9079 + }, + { + "epoch": 0.8296025582457743, + "grad_norm": 4.21875, + "kl": 0.0, + "learning_rate": 7.027076037154329e-07, + "logits/chosen": 588583424.0, + "logits/rejected": 543034368.0, + "logps/chosen": -331.94667561848956, + "logps/rejected": -652.1356811523438, + "loss": 0.0252, + "rewards/chosen": 3.9772094090779624, + "rewards/margins": 11.819087823232016, + "rewards/rejected": -7.841878414154053, + "step": 9080 + }, + { + "epoch": 0.829693924166286, + "grad_norm": 3.359375, + "kl": 0.0, + "learning_rate": 7.01972770128585e-07, + "logits/chosen": 870572160.0, + "logits/rejected": 441683157.3333333, + "logps/chosen": -280.4608154296875, + "logps/rejected": -534.9149576822916, + "loss": 0.0076, + "rewards/chosen": 4.144536018371582, + "rewards/margins": 13.181902885437012, + "rewards/rejected": -9.03736686706543, + "step": 9081 + }, + { + "epoch": 0.8297852900867976, + "grad_norm": 3.140625, + "kl": 0.0, + "learning_rate": 7.012382919487681e-07, + "logits/chosen": 678168064.0, + "logits/rejected": 865504938.6666666, + "logps/chosen": -354.97919921875, + "logps/rejected": -367.6483561197917, + "loss": 0.0186, + "rewards/chosen": 4.151299285888672, + "rewards/margins": 16.057450103759766, + "rewards/rejected": -11.906150817871094, + "step": 9082 + }, + { + "epoch": 0.8298766560073093, + "grad_norm": 1.6328125, + "kl": 0.0, + "learning_rate": 7.005041692367154e-07, + "logits/chosen": 243646336.0, + "logits/rejected": 488534169.6, + "logps/chosen": -130.35248819986978, + "logps/rejected": -459.2203125, + "loss": 0.012, + "rewards/chosen": 4.3734556833903, + "rewards/margins": 12.396522394816081, + "rewards/rejected": -8.02306671142578, + "step": 9083 + }, + { + "epoch": 0.8299680219278209, + "grad_norm": 3.296875, + "kl": 0.0, + "learning_rate": 6.997704020531321e-07, + "logits/chosen": 541152102.4, + "logits/rejected": 836049920.0, + "logps/chosen": -288.778466796875, + "logps/rejected": -507.7703857421875, + "loss": 0.0118, + "rewards/chosen": 4.757429504394532, + "rewards/margins": 14.765782419840495, + "rewards/rejected": -10.008352915445963, + "step": 9084 + }, + { + "epoch": 0.8300593878483326, + "grad_norm": 2.796875, + "kl": 0.0, + "learning_rate": 6.99036990458693e-07, + "logits/chosen": 521363821.71428573, + "logits/rejected": 217717920.0, + "logps/chosen": -419.46146065848217, + "logps/rejected": -455.1188049316406, + "loss": 0.024, + "rewards/chosen": 4.0723114013671875, + "rewards/margins": 14.941055297851562, + "rewards/rejected": -10.868743896484375, + "step": 9085 + }, + { + "epoch": 0.8301507537688442, + "grad_norm": 5.125, + "kl": 0.0, + "learning_rate": 6.983039345140474e-07, + "logits/chosen": 602697045.3333334, + "logits/rejected": 514058035.2, + "logps/chosen": -282.10451253255206, + "logps/rejected": -538.27548828125, + "loss": 0.0312, + "rewards/chosen": 2.466588338216146, + "rewards/margins": 12.652313741048177, + "rewards/rejected": -10.18572540283203, + "step": 9086 + }, + { + "epoch": 0.8302421196893559, + "grad_norm": 5.15625, + "kl": 0.0, + "learning_rate": 6.975712342798097e-07, + "logits/chosen": 402821440.0, + "logits/rejected": 610910634.6666666, + "logps/chosen": -366.06634521484375, + "logps/rejected": -440.4701334635417, + "loss": 0.0101, + "rewards/chosen": 5.3659820556640625, + "rewards/margins": 13.402125676472982, + "rewards/rejected": -8.03614362080892, + "step": 9087 + }, + { + "epoch": 0.8303334856098675, + "grad_norm": 2.71875, + "kl": 0.0, + "learning_rate": 6.968388898165679e-07, + "logits/chosen": 465839530.6666667, + "logits/rejected": 627079987.2, + "logps/chosen": -330.8245849609375, + "logps/rejected": -678.85126953125, + "loss": 0.0142, + "rewards/chosen": 3.633897145589193, + "rewards/margins": 14.068198903401694, + "rewards/rejected": -10.4343017578125, + "step": 9088 + }, + { + "epoch": 0.8304248515303791, + "grad_norm": 1.34375, + "kl": 0.0, + "learning_rate": 6.961069011848793e-07, + "logits/chosen": 386693461.3333333, + "logits/rejected": 501123481.6, + "logps/chosen": -160.80224609375, + "logps/rejected": -620.357763671875, + "loss": 0.0078, + "rewards/chosen": 4.25798225402832, + "rewards/margins": 13.990850448608398, + "rewards/rejected": -9.732868194580078, + "step": 9089 + }, + { + "epoch": 0.8305162174508908, + "grad_norm": 3.203125, + "kl": 0.0, + "learning_rate": 6.953752684452741e-07, + "logits/chosen": 254108825.6, + "logits/rejected": 372930986.6666667, + "logps/chosen": -305.2028564453125, + "logps/rejected": -534.857177734375, + "loss": 0.0198, + "rewards/chosen": 4.372438049316406, + "rewards/margins": 14.796382522583007, + "rewards/rejected": -10.423944473266602, + "step": 9090 + }, + { + "epoch": 0.8306075833714025, + "grad_norm": 31.5, + "kl": 0.0, + "learning_rate": 6.946439916582514e-07, + "logits/chosen": 666924970.6666666, + "logits/rejected": 603459788.8, + "logps/chosen": -181.8310546875, + "logps/rejected": -878.3568359375, + "loss": 0.0236, + "rewards/chosen": 3.560252825419108, + "rewards/margins": 14.802708880106607, + "rewards/rejected": -11.2424560546875, + "step": 9091 + }, + { + "epoch": 0.8306989492919141, + "grad_norm": 3.859375, + "kl": 0.0, + "learning_rate": 6.939130708842801e-07, + "logits/chosen": 486274986.6666667, + "logits/rejected": 458709056.0, + "logps/chosen": -283.1344401041667, + "logps/rejected": -710.1514282226562, + "loss": 0.0219, + "rewards/chosen": 4.0571543375651045, + "rewards/margins": 18.88189188639323, + "rewards/rejected": -14.824737548828125, + "step": 9092 + }, + { + "epoch": 0.8307903152124257, + "grad_norm": 5.03125, + "kl": 0.0, + "learning_rate": 6.931825061838005e-07, + "logits/chosen": 729162496.0, + "logits/rejected": 364607744.0, + "logps/chosen": -264.05950927734375, + "logps/rejected": -369.2946472167969, + "loss": 0.0379, + "rewards/chosen": 2.79435396194458, + "rewards/margins": 12.303879261016846, + "rewards/rejected": -9.509525299072266, + "step": 9093 + }, + { + "epoch": 0.8308816811329374, + "grad_norm": 1.7890625, + "kl": 0.0, + "learning_rate": 6.924522976172249e-07, + "logits/chosen": 629675878.4, + "logits/rejected": 565012480.0, + "logps/chosen": -373.2600830078125, + "logps/rejected": -383.0352376302083, + "loss": 0.0147, + "rewards/chosen": 4.0392601013183596, + "rewards/margins": 14.264276377360027, + "rewards/rejected": -10.225016276041666, + "step": 9094 + }, + { + "epoch": 0.8309730470534491, + "grad_norm": 3.84375, + "kl": 0.0, + "learning_rate": 6.91722445244934e-07, + "logits/chosen": 544240725.3333334, + "logits/rejected": 287125120.0, + "logps/chosen": -318.3127848307292, + "logps/rejected": -474.147900390625, + "loss": 0.013, + "rewards/chosen": 4.322377522786458, + "rewards/margins": 15.492726643880207, + "rewards/rejected": -11.17034912109375, + "step": 9095 + }, + { + "epoch": 0.8310644129739607, + "grad_norm": 2.40625, + "kl": 0.0, + "learning_rate": 6.909929491272799e-07, + "logits/chosen": 509363456.0, + "logits/rejected": 549273536.0, + "logps/chosen": -372.1060384114583, + "logps/rejected": -268.4313049316406, + "loss": 0.0137, + "rewards/chosen": 4.413144429524739, + "rewards/margins": 13.499124844868977, + "rewards/rejected": -9.085980415344238, + "step": 9096 + }, + { + "epoch": 0.8311557788944723, + "grad_norm": 0.375, + "kl": 0.0, + "learning_rate": 6.90263809324584e-07, + "logits/chosen": 282354560.0, + "logits/rejected": 420492178.28571427, + "logps/chosen": -270.60504150390625, + "logps/rejected": -500.8212890625, + "loss": 0.0018, + "rewards/chosen": 4.897122383117676, + "rewards/margins": 14.691065515790667, + "rewards/rejected": -9.793943132672991, + "step": 9097 + }, + { + "epoch": 0.831247144814984, + "grad_norm": 3.703125, + "kl": 0.0, + "learning_rate": 6.895350258971411e-07, + "logits/chosen": 713166555.4285715, + "logits/rejected": 220854752.0, + "logps/chosen": -219.18184988839286, + "logps/rejected": -318.9580078125, + "loss": 0.04, + "rewards/chosen": 3.314739772251674, + "rewards/margins": 13.717652865818568, + "rewards/rejected": -10.402913093566895, + "step": 9098 + }, + { + "epoch": 0.8313385107354957, + "grad_norm": 3.546875, + "kl": 0.0, + "learning_rate": 6.888065989052151e-07, + "logits/chosen": 559683413.3333334, + "logits/rejected": 1192654592.0, + "logps/chosen": -299.71921793619794, + "logps/rejected": -541.4713745117188, + "loss": 0.0253, + "rewards/chosen": 3.6476001739501953, + "rewards/margins": 10.323033809661865, + "rewards/rejected": -6.67543363571167, + "step": 9099 + }, + { + "epoch": 0.8314298766560073, + "grad_norm": 5.65625, + "kl": 0.0, + "learning_rate": 6.880785284090385e-07, + "logits/chosen": 412975104.0, + "logits/rejected": 318872085.3333333, + "logps/chosen": -346.92294921875, + "logps/rejected": -332.8870442708333, + "loss": 0.02, + "rewards/chosen": 4.094434356689453, + "rewards/margins": 11.976757939656576, + "rewards/rejected": -7.882323582967122, + "step": 9100 + }, + { + "epoch": 0.8315212425765189, + "grad_norm": 3.453125, + "kl": 0.0, + "learning_rate": 6.873508144688173e-07, + "logits/chosen": 687701760.0, + "logits/rejected": 448453568.0, + "logps/chosen": -365.8134460449219, + "logps/rejected": -543.2338256835938, + "loss": 0.0146, + "rewards/chosen": 4.552956581115723, + "rewards/margins": 16.945247650146484, + "rewards/rejected": -12.392291069030762, + "step": 9101 + }, + { + "epoch": 0.8316126084970306, + "grad_norm": 3.65625, + "kl": 0.0, + "learning_rate": 6.866234571447245e-07, + "logits/chosen": 495592448.0, + "logits/rejected": 882109440.0, + "logps/chosen": -272.48801676432294, + "logps/rejected": -643.09208984375, + "loss": 0.0164, + "rewards/chosen": 3.588252385457357, + "rewards/margins": 12.704831822713217, + "rewards/rejected": -9.11657943725586, + "step": 9102 + }, + { + "epoch": 0.8317039744175423, + "grad_norm": 1.7734375, + "kl": 0.0, + "learning_rate": 6.858964564969089e-07, + "logits/chosen": 484165529.6, + "logits/rejected": 545270528.0, + "logps/chosen": -294.128173828125, + "logps/rejected": -743.4644368489584, + "loss": 0.0117, + "rewards/chosen": 4.288028717041016, + "rewards/margins": 15.08712641398112, + "rewards/rejected": -10.799097696940104, + "step": 9103 + }, + { + "epoch": 0.8317953403380539, + "grad_norm": 3.921875, + "kl": 0.0, + "learning_rate": 6.851698125854856e-07, + "logits/chosen": 331704832.0, + "logits/rejected": 380093568.0, + "logps/chosen": -360.6899719238281, + "logps/rejected": -546.36279296875, + "loss": 0.0121, + "rewards/chosen": 3.8587465286254883, + "rewards/margins": 14.402153333028158, + "rewards/rejected": -10.54340680440267, + "step": 9104 + }, + { + "epoch": 0.8318867062585655, + "grad_norm": 4.0625, + "kl": 0.0, + "learning_rate": 6.844435254705407e-07, + "logits/chosen": 594735308.8, + "logits/rejected": 781787648.0, + "logps/chosen": -454.95263671875, + "logps/rejected": -503.5785725911458, + "loss": 0.0252, + "rewards/chosen": 3.748515319824219, + "rewards/margins": 12.885070037841796, + "rewards/rejected": -9.136554718017578, + "step": 9105 + }, + { + "epoch": 0.8319780721790772, + "grad_norm": 2.03125, + "kl": 0.0, + "learning_rate": 6.837175952121305e-07, + "logits/chosen": 587885909.3333334, + "logits/rejected": 631537792.0, + "logps/chosen": -379.7143961588542, + "logps/rejected": -449.69488525390625, + "loss": 0.0136, + "rewards/chosen": 4.207754135131836, + "rewards/margins": 13.180913925170898, + "rewards/rejected": -8.973159790039062, + "step": 9106 + }, + { + "epoch": 0.8320694380995889, + "grad_norm": 1.6171875, + "kl": 0.0, + "learning_rate": 6.829920218702851e-07, + "logits/chosen": 570809813.3333334, + "logits/rejected": 936621465.6, + "logps/chosen": -266.74961344401044, + "logps/rejected": -672.503662109375, + "loss": 0.0089, + "rewards/chosen": 4.237992286682129, + "rewards/margins": 14.74037570953369, + "rewards/rejected": -10.502383422851562, + "step": 9107 + }, + { + "epoch": 0.8321608040201005, + "grad_norm": 2.328125, + "kl": 0.0, + "learning_rate": 6.822668055050013e-07, + "logits/chosen": 431776877.71428573, + "logits/rejected": 477192512.0, + "logps/chosen": -316.0404575892857, + "logps/rejected": -548.60888671875, + "loss": 0.0125, + "rewards/chosen": 4.964910234723773, + "rewards/margins": 16.6655022757394, + "rewards/rejected": -11.700592041015625, + "step": 9108 + }, + { + "epoch": 0.8322521699406121, + "grad_norm": 4.0625, + "kl": 0.0, + "learning_rate": 6.815419461762485e-07, + "logits/chosen": 356432742.4, + "logits/rejected": 425198080.0, + "logps/chosen": -207.869384765625, + "logps/rejected": -526.1173909505209, + "loss": 0.0326, + "rewards/chosen": 3.0064552307128904, + "rewards/margins": 13.015788523356118, + "rewards/rejected": -10.009333292643229, + "step": 9109 + }, + { + "epoch": 0.8323435358611238, + "grad_norm": 2.65625, + "kl": 0.0, + "learning_rate": 6.808174439439641e-07, + "logits/chosen": 273331097.6, + "logits/rejected": 516691029.3333333, + "logps/chosen": -262.0408203125, + "logps/rejected": -412.4310709635417, + "loss": 0.0218, + "rewards/chosen": 3.9404659271240234, + "rewards/margins": 15.549762090047201, + "rewards/rejected": -11.609296162923178, + "step": 9110 + }, + { + "epoch": 0.8324349017816355, + "grad_norm": 2.78125, + "kl": 0.0, + "learning_rate": 6.800932988680598e-07, + "logits/chosen": 542316646.4, + "logits/rejected": 563589077.3333334, + "logps/chosen": -303.927587890625, + "logps/rejected": -389.1713460286458, + "loss": 0.0199, + "rewards/chosen": 3.5979251861572266, + "rewards/margins": 13.995482762654623, + "rewards/rejected": -10.397557576497396, + "step": 9111 + }, + { + "epoch": 0.8325262677021471, + "grad_norm": 2.234375, + "kl": 0.0, + "learning_rate": 6.793695110084159e-07, + "logits/chosen": 467048021.3333333, + "logits/rejected": 521244832.0, + "logps/chosen": -350.6539306640625, + "logps/rejected": -723.9216918945312, + "loss": 0.0173, + "rewards/chosen": 4.1212107340494795, + "rewards/margins": 14.484329859415691, + "rewards/rejected": -10.363119125366211, + "step": 9112 + }, + { + "epoch": 0.8326176336226587, + "grad_norm": 2.28125, + "kl": 0.0, + "learning_rate": 6.786460804248812e-07, + "logits/chosen": 472853248.0, + "logits/rejected": 246696384.0, + "logps/chosen": -330.9842224121094, + "logps/rejected": -406.294677734375, + "loss": 0.0135, + "rewards/chosen": 4.275754928588867, + "rewards/margins": 15.507865905761719, + "rewards/rejected": -11.232110977172852, + "step": 9113 + }, + { + "epoch": 0.8327089995431703, + "grad_norm": 2.90625, + "kl": 0.0, + "learning_rate": 6.779230071772763e-07, + "logits/chosen": 979299008.0, + "logits/rejected": 572207680.0, + "logps/chosen": -245.6337432861328, + "logps/rejected": -646.8812255859375, + "loss": 0.0187, + "rewards/chosen": 3.498161792755127, + "rewards/margins": 14.140156269073486, + "rewards/rejected": -10.64199447631836, + "step": 9114 + }, + { + "epoch": 0.8328003654636821, + "grad_norm": 4.65625, + "kl": 0.0, + "learning_rate": 6.772002913253944e-07, + "logits/chosen": 691266304.0, + "logits/rejected": 546894250.6666666, + "logps/chosen": -405.3500732421875, + "logps/rejected": -357.4566243489583, + "loss": 0.0213, + "rewards/chosen": 4.12978515625, + "rewards/margins": 11.398418680826822, + "rewards/rejected": -7.268633524576823, + "step": 9115 + }, + { + "epoch": 0.8328917313841937, + "grad_norm": 4.3125, + "kl": 0.0, + "learning_rate": 6.764779329289973e-07, + "logits/chosen": 301639116.8, + "logits/rejected": 555110229.3333334, + "logps/chosen": -227.318603515625, + "logps/rejected": -751.4359537760416, + "loss": 0.0338, + "rewards/chosen": 3.0204490661621093, + "rewards/margins": 14.691639963785807, + "rewards/rejected": -11.671190897623697, + "step": 9116 + }, + { + "epoch": 0.8329830973047053, + "grad_norm": 0.9140625, + "kl": 0.0, + "learning_rate": 6.757559320478169e-07, + "logits/chosen": 351365632.0, + "logits/rejected": 399187968.0, + "logps/chosen": -355.53253173828125, + "logps/rejected": -446.93798828125, + "loss": 0.0048, + "rewards/chosen": 5.1162109375, + "rewards/margins": 13.102343559265137, + "rewards/rejected": -7.986132621765137, + "step": 9117 + }, + { + "epoch": 0.8330744632252169, + "grad_norm": 3.234375, + "kl": 0.0, + "learning_rate": 6.750342887415545e-07, + "logits/chosen": 699541546.6666666, + "logits/rejected": 1588806144.0, + "logps/chosen": -390.5214029947917, + "logps/rejected": -691.237548828125, + "loss": 0.0291, + "rewards/chosen": 3.3839438756306968, + "rewards/margins": 10.823803265889486, + "rewards/rejected": -7.439859390258789, + "step": 9118 + }, + { + "epoch": 0.8331658291457287, + "grad_norm": 2.953125, + "kl": 0.0, + "learning_rate": 6.743130030698858e-07, + "logits/chosen": 658038976.0, + "logits/rejected": 780282240.0, + "logps/chosen": -514.1369018554688, + "logps/rejected": -520.412841796875, + "loss": 0.0178, + "rewards/chosen": 3.4128623008728027, + "rewards/margins": 13.532938480377197, + "rewards/rejected": -10.120076179504395, + "step": 9119 + }, + { + "epoch": 0.8332571950662403, + "grad_norm": 1.7578125, + "kl": 0.0, + "learning_rate": 6.735920750924535e-07, + "logits/chosen": 315815648.0, + "logits/rejected": 391861888.0, + "logps/chosen": -211.26654052734375, + "logps/rejected": -430.0731506347656, + "loss": 0.009, + "rewards/chosen": 4.687007904052734, + "rewards/margins": 13.14224624633789, + "rewards/rejected": -8.455238342285156, + "step": 9120 + }, + { + "epoch": 0.8333485609867519, + "grad_norm": 1.3671875, + "kl": 0.0, + "learning_rate": 6.728715048688711e-07, + "logits/chosen": 564745856.0, + "logits/rejected": 305290944.0, + "logps/chosen": -331.43743896484375, + "logps/rejected": -450.30645751953125, + "loss": 0.0087, + "rewards/chosen": 4.399533271789551, + "rewards/margins": 15.692808151245117, + "rewards/rejected": -11.293274879455566, + "step": 9121 + }, + { + "epoch": 0.8334399269072635, + "grad_norm": 2.375, + "kl": 0.0, + "learning_rate": 6.721512924587243e-07, + "logits/chosen": 604668160.0, + "logits/rejected": 852897792.0, + "logps/chosen": -283.93170166015625, + "logps/rejected": -723.8391723632812, + "loss": 0.0173, + "rewards/chosen": 3.612288475036621, + "rewards/margins": 12.036142349243164, + "rewards/rejected": -8.423853874206543, + "step": 9122 + }, + { + "epoch": 0.8335312928277753, + "grad_norm": 21.0, + "kl": 0.0, + "learning_rate": 6.714314379215653e-07, + "logits/chosen": 226477909.33333334, + "logits/rejected": 477368320.0, + "logps/chosen": -257.625, + "logps/rejected": -349.5209228515625, + "loss": 0.0387, + "rewards/chosen": 2.4247543017069497, + "rewards/margins": 11.184255949656167, + "rewards/rejected": -8.759501647949218, + "step": 9123 + }, + { + "epoch": 0.8336226587482869, + "grad_norm": 2.21875, + "kl": 0.0, + "learning_rate": 6.707119413169228e-07, + "logits/chosen": 1415214506.6666667, + "logits/rejected": 978574745.6, + "logps/chosen": -435.3655598958333, + "logps/rejected": -444.0302734375, + "loss": 0.0089, + "rewards/chosen": 4.271128336588542, + "rewards/margins": 11.384565989176433, + "rewards/rejected": -7.113437652587891, + "step": 9124 + }, + { + "epoch": 0.8337140246687985, + "grad_norm": 4.1875, + "kl": 0.0, + "learning_rate": 6.699928027042913e-07, + "logits/chosen": 424687104.0, + "logits/rejected": 522565990.4, + "logps/chosen": -318.90317789713544, + "logps/rejected": -583.6912109375, + "loss": 0.0206, + "rewards/chosen": 3.328425725301107, + "rewards/margins": 13.19873898824056, + "rewards/rejected": -9.870313262939453, + "step": 9125 + }, + { + "epoch": 0.8338053905893101, + "grad_norm": 2.421875, + "kl": 0.0, + "learning_rate": 6.69274022143136e-07, + "logits/chosen": 468064192.0, + "logits/rejected": 585064576.0, + "logps/chosen": -369.66937255859375, + "logps/rejected": -656.5402221679688, + "loss": 0.0128, + "rewards/chosen": 3.99562931060791, + "rewards/margins": 12.868199348449707, + "rewards/rejected": -8.872570037841797, + "step": 9126 + }, + { + "epoch": 0.8338967565098219, + "grad_norm": 1.109375, + "kl": 0.0, + "learning_rate": 6.685555996928933e-07, + "logits/chosen": 569963349.3333334, + "logits/rejected": 325682995.2, + "logps/chosen": -298.3227132161458, + "logps/rejected": -274.7205322265625, + "loss": 0.0071, + "rewards/chosen": 4.271501541137695, + "rewards/margins": 12.135510635375976, + "rewards/rejected": -7.864009094238281, + "step": 9127 + }, + { + "epoch": 0.8339881224303335, + "grad_norm": 2.953125, + "kl": 0.0, + "learning_rate": 6.678375354129718e-07, + "logits/chosen": 539938176.0, + "logits/rejected": 586318848.0, + "logps/chosen": -383.0167541503906, + "logps/rejected": -453.6990051269531, + "loss": 0.0168, + "rewards/chosen": 3.6350135803222656, + "rewards/margins": 11.915386199951172, + "rewards/rejected": -8.280372619628906, + "step": 9128 + }, + { + "epoch": 0.8340794883508451, + "grad_norm": 10.75, + "kl": 0.0, + "learning_rate": 6.671198293627479e-07, + "logits/chosen": 449413546.6666667, + "logits/rejected": 582099558.4, + "logps/chosen": -318.02683512369794, + "logps/rejected": -298.4138671875, + "loss": 0.068, + "rewards/chosen": 5.013156255086263, + "rewards/margins": 11.106146748860677, + "rewards/rejected": -6.092990493774414, + "step": 9129 + }, + { + "epoch": 0.8341708542713567, + "grad_norm": 3.65625, + "kl": 0.0, + "learning_rate": 6.66402481601569e-07, + "logits/chosen": 309478656.0, + "logits/rejected": 343980970.6666667, + "logps/chosen": -235.9882568359375, + "logps/rejected": -612.40771484375, + "loss": 0.0223, + "rewards/chosen": 3.629673385620117, + "rewards/margins": 16.145270411173502, + "rewards/rejected": -12.515597025553385, + "step": 9130 + }, + { + "epoch": 0.8342622201918685, + "grad_norm": 5.625, + "kl": 0.0, + "learning_rate": 6.656854921887529e-07, + "logits/chosen": 2634635264.0, + "logits/rejected": 747580562.2857143, + "logps/chosen": -433.54302978515625, + "logps/rejected": -574.1017020089286, + "loss": 0.0184, + "rewards/chosen": 1.7617645263671875, + "rewards/margins": 13.367599487304688, + "rewards/rejected": -11.6058349609375, + "step": 9131 + }, + { + "epoch": 0.8343535861123801, + "grad_norm": 1.4296875, + "kl": 0.0, + "learning_rate": 6.649688611835875e-07, + "logits/chosen": 626943701.3333334, + "logits/rejected": 733222604.8, + "logps/chosen": -425.1142578125, + "logps/rejected": -467.718017578125, + "loss": 0.0091, + "rewards/chosen": 3.902928670247396, + "rewards/margins": 13.410353597005209, + "rewards/rejected": -9.507424926757812, + "step": 9132 + }, + { + "epoch": 0.8344449520328917, + "grad_norm": 15.0625, + "kl": 0.0, + "learning_rate": 6.642525886453332e-07, + "logits/chosen": 364046250.6666667, + "logits/rejected": 836604032.0, + "logps/chosen": -194.87357584635416, + "logps/rejected": -470.0638122558594, + "loss": 0.1437, + "rewards/chosen": 2.2218968073527017, + "rewards/margins": 9.820316950480143, + "rewards/rejected": -7.598420143127441, + "step": 9133 + }, + { + "epoch": 0.8345363179534033, + "grad_norm": 3.609375, + "kl": 0.0, + "learning_rate": 6.635366746332184e-07, + "logits/chosen": 464277632.0, + "logits/rejected": 574147891.2, + "logps/chosen": -463.3180338541667, + "logps/rejected": -550.828369140625, + "loss": 0.0176, + "rewards/chosen": 3.168529192606608, + "rewards/margins": 13.704389635721842, + "rewards/rejected": -10.535860443115235, + "step": 9134 + }, + { + "epoch": 0.8346276838739151, + "grad_norm": 32.5, + "kl": 0.0, + "learning_rate": 6.628211192064421e-07, + "logits/chosen": 490591914.6666667, + "logits/rejected": 665019596.8, + "logps/chosen": -265.73486328125, + "logps/rejected": -355.185791015625, + "loss": 0.0367, + "rewards/chosen": 3.747712771097819, + "rewards/margins": 10.89860922495524, + "rewards/rejected": -7.150896453857422, + "step": 9135 + }, + { + "epoch": 0.8347190497944267, + "grad_norm": 46.5, + "kl": 0.0, + "learning_rate": 6.621059224241738e-07, + "logits/chosen": 561008576.0, + "logits/rejected": 528773824.0, + "logps/chosen": -210.817138671875, + "logps/rejected": -417.54534912109375, + "loss": 0.0396, + "rewards/chosen": 3.1243929862976074, + "rewards/margins": 9.32054090499878, + "rewards/rejected": -6.196147918701172, + "step": 9136 + }, + { + "epoch": 0.8348104157149383, + "grad_norm": 4.78125, + "kl": 0.0, + "learning_rate": 6.613910843455551e-07, + "logits/chosen": 686904466.2857143, + "logits/rejected": 324220800.0, + "logps/chosen": -268.79087611607144, + "logps/rejected": -279.794677734375, + "loss": 0.0395, + "rewards/chosen": 3.2398785182407925, + "rewards/margins": 12.836868149893625, + "rewards/rejected": -9.596989631652832, + "step": 9137 + }, + { + "epoch": 0.8349017816354499, + "grad_norm": 1.875, + "kl": 0.0, + "learning_rate": 6.606766050296953e-07, + "logits/chosen": 666125107.2, + "logits/rejected": 778872490.6666666, + "logps/chosen": -450.20771484375, + "logps/rejected": -459.37109375, + "loss": 0.0126, + "rewards/chosen": 4.147737884521485, + "rewards/margins": 12.23135986328125, + "rewards/rejected": -8.083621978759766, + "step": 9138 + }, + { + "epoch": 0.8349931475559617, + "grad_norm": 5.25, + "kl": 0.0, + "learning_rate": 6.599624845356761e-07, + "logits/chosen": 516963157.3333333, + "logits/rejected": 421009344.0, + "logps/chosen": -358.6075439453125, + "logps/rejected": -514.2588500976562, + "loss": 0.0366, + "rewards/chosen": 3.435724894205729, + "rewards/margins": 14.681091944376627, + "rewards/rejected": -11.245367050170898, + "step": 9139 + }, + { + "epoch": 0.8350845134764733, + "grad_norm": 5.96875, + "kl": 0.0, + "learning_rate": 6.592487229225475e-07, + "logits/chosen": 531421888.0, + "logits/rejected": 604101312.0, + "logps/chosen": -352.68658447265625, + "logps/rejected": -621.0022583007812, + "loss": 0.1093, + "rewards/chosen": 3.4440572261810303, + "rewards/margins": 10.435276746749878, + "rewards/rejected": -6.991219520568848, + "step": 9140 + }, + { + "epoch": 0.8351758793969849, + "grad_norm": 4.5, + "kl": 0.0, + "learning_rate": 6.585353202493322e-07, + "logits/chosen": 371956821.3333333, + "logits/rejected": 500045926.4, + "logps/chosen": -293.1697184244792, + "logps/rejected": -518.0703125, + "loss": 0.0977, + "rewards/chosen": 4.936201095581055, + "rewards/margins": 13.66659049987793, + "rewards/rejected": -8.730389404296876, + "step": 9141 + }, + { + "epoch": 0.8352672453174965, + "grad_norm": 5.15625, + "kl": 0.0, + "learning_rate": 6.578222765750208e-07, + "logits/chosen": 714807040.0, + "logits/rejected": 731454208.0, + "logps/chosen": -397.13970947265625, + "logps/rejected": -810.8777465820312, + "loss": 0.0283, + "rewards/chosen": 3.0432252883911133, + "rewards/margins": 13.788012504577637, + "rewards/rejected": -10.744787216186523, + "step": 9142 + }, + { + "epoch": 0.8353586112380083, + "grad_norm": 58.5, + "kl": 0.0, + "learning_rate": 6.571095919585763e-07, + "logits/chosen": 558592640.0, + "logits/rejected": 300265920.0, + "logps/chosen": -283.2742919921875, + "logps/rejected": -492.14703369140625, + "loss": 0.0775, + "rewards/chosen": 2.529224395751953, + "rewards/margins": 14.507650375366211, + "rewards/rejected": -11.978425979614258, + "step": 9143 + }, + { + "epoch": 0.8354499771585199, + "grad_norm": 1.875, + "kl": 0.0, + "learning_rate": 6.563972664589297e-07, + "logits/chosen": 432701664.0, + "logits/rejected": 373610965.3333333, + "logps/chosen": -176.40171813964844, + "logps/rejected": -420.126708984375, + "loss": 0.0126, + "rewards/chosen": 3.003999710083008, + "rewards/margins": 10.992886225382488, + "rewards/rejected": -7.9888865152994795, + "step": 9144 + }, + { + "epoch": 0.8355413430790315, + "grad_norm": 11.0625, + "kl": 0.0, + "learning_rate": 6.556853001349856e-07, + "logits/chosen": 537097685.3333334, + "logits/rejected": 205355696.0, + "logps/chosen": -215.8458251953125, + "logps/rejected": -224.76275634765625, + "loss": 0.1221, + "rewards/chosen": 3.3551479975382485, + "rewards/margins": 11.299858729044596, + "rewards/rejected": -7.944710731506348, + "step": 9145 + }, + { + "epoch": 0.8356327089995431, + "grad_norm": 1.9921875, + "kl": 0.0, + "learning_rate": 6.549736930456163e-07, + "logits/chosen": 561043712.0, + "logits/rejected": 480623923.2, + "logps/chosen": -219.83612060546875, + "logps/rejected": -454.2115234375, + "loss": 0.0151, + "rewards/chosen": 3.591618220011393, + "rewards/margins": 12.464021174112956, + "rewards/rejected": -8.872402954101563, + "step": 9146 + }, + { + "epoch": 0.8357240749200548, + "grad_norm": 3.484375, + "kl": 0.0, + "learning_rate": 6.542624452496648e-07, + "logits/chosen": 725855914.6666666, + "logits/rejected": 829265715.2, + "logps/chosen": -476.227783203125, + "logps/rejected": -678.6576171875, + "loss": 0.0166, + "rewards/chosen": 3.1785624821980796, + "rewards/margins": 13.934374554951987, + "rewards/rejected": -10.755812072753907, + "step": 9147 + }, + { + "epoch": 0.8358154408405665, + "grad_norm": 16.75, + "kl": 0.0, + "learning_rate": 6.535515568059441e-07, + "logits/chosen": 1524115114.6666667, + "logits/rejected": 696193792.0, + "logps/chosen": -378.3184000651042, + "logps/rejected": -715.56806640625, + "loss": 0.0266, + "rewards/chosen": 2.898735682169596, + "rewards/margins": 13.966130701700846, + "rewards/rejected": -11.06739501953125, + "step": 9148 + }, + { + "epoch": 0.8359068067610781, + "grad_norm": 2.390625, + "kl": 0.0, + "learning_rate": 6.528410277732406e-07, + "logits/chosen": 637244416.0, + "logits/rejected": 597630361.6, + "logps/chosen": -420.3137613932292, + "logps/rejected": -498.91455078125, + "loss": 0.0137, + "rewards/chosen": 3.305217425028483, + "rewards/margins": 12.272180620829264, + "rewards/rejected": -8.966963195800782, + "step": 9149 + }, + { + "epoch": 0.8359981726815897, + "grad_norm": 1.6171875, + "kl": 0.0, + "learning_rate": 6.521308582103059e-07, + "logits/chosen": 996056960.0, + "logits/rejected": 629686784.0, + "logps/chosen": -523.03662109375, + "logps/rejected": -573.7332356770834, + "loss": 0.0073, + "rewards/chosen": 3.5609841346740723, + "rewards/margins": 13.517663160959879, + "rewards/rejected": -9.956679026285807, + "step": 9150 + }, + { + "epoch": 0.8360895386021014, + "grad_norm": 3.09375, + "kl": 0.0, + "learning_rate": 6.514210481758654e-07, + "logits/chosen": 394671923.2, + "logits/rejected": 1267483306.6666667, + "logps/chosen": -186.69111328125, + "logps/rejected": -270.0937906901042, + "loss": 0.0321, + "rewards/chosen": 3.0870874404907225, + "rewards/margins": 11.347584215799966, + "rewards/rejected": -8.260496775309244, + "step": 9151 + }, + { + "epoch": 0.8361809045226131, + "grad_norm": 1.90625, + "kl": 0.0, + "learning_rate": 6.507115977286144e-07, + "logits/chosen": 1376111360.0, + "logits/rejected": 500480987.4285714, + "logps/chosen": -987.12060546875, + "logps/rejected": -513.8580496651786, + "loss": 0.0056, + "rewards/chosen": 3.582855224609375, + "rewards/margins": 12.05812018258231, + "rewards/rejected": -8.475264957972936, + "step": 9152 + }, + { + "epoch": 0.8362722704431247, + "grad_norm": 0.419921875, + "kl": 0.0, + "learning_rate": 6.500025069272153e-07, + "logits/chosen": 655849088.0, + "logits/rejected": 362416128.0, + "logps/chosen": -198.0323486328125, + "logps/rejected": -438.8439127604167, + "loss": 0.003, + "rewards/chosen": 4.541064262390137, + "rewards/margins": 13.13636557261149, + "rewards/rejected": -8.595301310221354, + "step": 9153 + }, + { + "epoch": 0.8363636363636363, + "grad_norm": 4.59375, + "kl": 0.0, + "learning_rate": 6.49293775830307e-07, + "logits/chosen": 464835072.0, + "logits/rejected": 356124778.6666667, + "logps/chosen": -525.7416015625, + "logps/rejected": -538.4813232421875, + "loss": 0.0159, + "rewards/chosen": 3.7256179809570313, + "rewards/margins": 14.546276346842447, + "rewards/rejected": -10.820658365885416, + "step": 9154 + }, + { + "epoch": 0.836455002284148, + "grad_norm": 1.0703125, + "kl": 0.0, + "learning_rate": 6.485854044964928e-07, + "logits/chosen": 256243584.0, + "logits/rejected": 433777408.0, + "logps/chosen": -243.55044555664062, + "logps/rejected": -234.88018798828125, + "loss": 0.0053, + "rewards/chosen": 4.878077030181885, + "rewards/margins": 11.741631031036377, + "rewards/rejected": -6.863554000854492, + "step": 9155 + }, + { + "epoch": 0.8365463682046597, + "grad_norm": 3.390625, + "kl": 0.0, + "learning_rate": 6.478773929843491e-07, + "logits/chosen": 835163648.0, + "logits/rejected": 983154261.3333334, + "logps/chosen": -401.3656494140625, + "logps/rejected": -715.908447265625, + "loss": 0.0244, + "rewards/chosen": 3.400054931640625, + "rewards/margins": 14.143226623535156, + "rewards/rejected": -10.743171691894531, + "step": 9156 + }, + { + "epoch": 0.8366377341251713, + "grad_norm": 1.59375, + "kl": 0.0, + "learning_rate": 6.471697413524203e-07, + "logits/chosen": 365832640.0, + "logits/rejected": 379391936.0, + "logps/chosen": -272.19439697265625, + "logps/rejected": -497.30706787109375, + "loss": 0.0086, + "rewards/chosen": 4.3449907302856445, + "rewards/margins": 14.197763442993164, + "rewards/rejected": -9.85277271270752, + "step": 9157 + }, + { + "epoch": 0.8367291000456829, + "grad_norm": 3.03125, + "kl": 0.0, + "learning_rate": 6.464624496592259e-07, + "logits/chosen": 571534549.3333334, + "logits/rejected": 453031987.2, + "logps/chosen": -481.6566569010417, + "logps/rejected": -550.327197265625, + "loss": 0.0167, + "rewards/chosen": 3.687358856201172, + "rewards/margins": 14.153846740722656, + "rewards/rejected": -10.466487884521484, + "step": 9158 + }, + { + "epoch": 0.8368204659661946, + "grad_norm": 3.4375, + "kl": 0.0, + "learning_rate": 6.457555179632496e-07, + "logits/chosen": 627397734.4, + "logits/rejected": 923593216.0, + "logps/chosen": -324.620654296875, + "logps/rejected": -388.1058756510417, + "loss": 0.0242, + "rewards/chosen": 3.781725311279297, + "rewards/margins": 12.907654190063477, + "rewards/rejected": -9.12592887878418, + "step": 9159 + }, + { + "epoch": 0.8369118318867063, + "grad_norm": 0.87109375, + "kl": 0.0, + "learning_rate": 6.450489463229493e-07, + "logits/chosen": 554692160.0, + "logits/rejected": 439195520.0, + "logps/chosen": -263.1041259765625, + "logps/rejected": -436.9542236328125, + "loss": 0.0062, + "rewards/chosen": 4.595624923706055, + "rewards/margins": 13.064640045166016, + "rewards/rejected": -8.469015121459961, + "step": 9160 + }, + { + "epoch": 0.8370031978072179, + "grad_norm": 2.9375, + "kl": 0.0, + "learning_rate": 6.443427347967507e-07, + "logits/chosen": 460619136.0, + "logits/rejected": 507637920.0, + "logps/chosen": -311.77105712890625, + "logps/rejected": -559.9678344726562, + "loss": 0.0213, + "rewards/chosen": 3.15560245513916, + "rewards/margins": 11.887211799621582, + "rewards/rejected": -8.731609344482422, + "step": 9161 + }, + { + "epoch": 0.8370945637277295, + "grad_norm": 2.0, + "kl": 0.0, + "learning_rate": 6.436368834430529e-07, + "logits/chosen": 497794764.8, + "logits/rejected": 472176640.0, + "logps/chosen": -406.2720947265625, + "logps/rejected": -523.6064860026041, + "loss": 0.0116, + "rewards/chosen": 4.439046096801758, + "rewards/margins": 13.673822911580405, + "rewards/rejected": -9.234776814778646, + "step": 9162 + }, + { + "epoch": 0.8371859296482412, + "grad_norm": 3.421875, + "kl": 0.0, + "learning_rate": 6.429313923202224e-07, + "logits/chosen": 574451712.0, + "logits/rejected": 457085056.0, + "logps/chosen": -210.1146240234375, + "logps/rejected": -465.7298177083333, + "loss": 0.0309, + "rewards/chosen": 3.6943504333496096, + "rewards/margins": 12.381851069132487, + "rewards/rejected": -8.687500635782877, + "step": 9163 + }, + { + "epoch": 0.8372772955687529, + "grad_norm": 4.46875, + "kl": 0.0, + "learning_rate": 6.422262614865959e-07, + "logits/chosen": 444918016.0, + "logits/rejected": 413505194.6666667, + "logps/chosen": -272.434716796875, + "logps/rejected": -645.65625, + "loss": 0.0268, + "rewards/chosen": 3.4594161987304686, + "rewards/margins": 13.390395609537759, + "rewards/rejected": -9.930979410807291, + "step": 9164 + }, + { + "epoch": 0.8373686614892645, + "grad_norm": 6.0625, + "kl": 0.0, + "learning_rate": 6.41521491000483e-07, + "logits/chosen": 609226956.8, + "logits/rejected": 522792704.0, + "logps/chosen": -332.6639892578125, + "logps/rejected": -542.7901611328125, + "loss": 0.0409, + "rewards/chosen": 3.0755336761474608, + "rewards/margins": 12.891628392537434, + "rewards/rejected": -9.816094716389975, + "step": 9165 + }, + { + "epoch": 0.8374600274097762, + "grad_norm": 1.828125, + "kl": 0.0, + "learning_rate": 6.408170809201597e-07, + "logits/chosen": 540600576.0, + "logits/rejected": 403303968.0, + "logps/chosen": -338.5880126953125, + "logps/rejected": -403.982666015625, + "loss": 0.1345, + "rewards/chosen": 1.7779418230056763, + "rewards/margins": 10.19330608844757, + "rewards/rejected": -8.415364265441895, + "step": 9166 + }, + { + "epoch": 0.8375513933302878, + "grad_norm": 0.82421875, + "kl": 0.0, + "learning_rate": 6.401130313038756e-07, + "logits/chosen": 520325632.0, + "logits/rejected": 440070997.3333333, + "logps/chosen": -233.13154296875, + "logps/rejected": -474.285400390625, + "loss": 0.0049, + "rewards/chosen": 5.378826141357422, + "rewards/margins": 13.475056330362955, + "rewards/rejected": -8.096230189005533, + "step": 9167 + }, + { + "epoch": 0.8376427592507995, + "grad_norm": 3.59375, + "kl": 0.0, + "learning_rate": 6.394093422098479e-07, + "logits/chosen": 612001408.0, + "logits/rejected": 446174528.0, + "logps/chosen": -477.1206461588542, + "logps/rejected": -394.63861083984375, + "loss": 0.0238, + "rewards/chosen": 3.7543509801228843, + "rewards/margins": 12.153238614400228, + "rewards/rejected": -8.398887634277344, + "step": 9168 + }, + { + "epoch": 0.8377341251713111, + "grad_norm": 62.75, + "kl": 0.0, + "learning_rate": 6.387060136962652e-07, + "logits/chosen": 578921062.4, + "logits/rejected": 655435392.0, + "logps/chosen": -246.219580078125, + "logps/rejected": -783.5196126302084, + "loss": 0.0629, + "rewards/chosen": 3.362314987182617, + "rewards/margins": 12.376140721638997, + "rewards/rejected": -9.01382573445638, + "step": 9169 + }, + { + "epoch": 0.8378254910918228, + "grad_norm": 3.46875, + "kl": 0.0, + "learning_rate": 6.380030458212882e-07, + "logits/chosen": 365326131.2, + "logits/rejected": 461025621.3333333, + "logps/chosen": -227.56552734375, + "logps/rejected": -571.5428873697916, + "loss": 0.0286, + "rewards/chosen": 3.1955028533935548, + "rewards/margins": 14.369829432169595, + "rewards/rejected": -11.174326578776041, + "step": 9170 + }, + { + "epoch": 0.8379168570123344, + "grad_norm": 3.78125, + "kl": 0.0, + "learning_rate": 6.373004386430442e-07, + "logits/chosen": 495667584.0, + "logits/rejected": 340806826.6666667, + "logps/chosen": -354.830078125, + "logps/rejected": -447.8463541666667, + "loss": 0.0171, + "rewards/chosen": 2.8249595165252686, + "rewards/margins": 12.577017545700073, + "rewards/rejected": -9.752058029174805, + "step": 9171 + }, + { + "epoch": 0.838008222932846, + "grad_norm": 4.46875, + "kl": 0.0, + "learning_rate": 6.36598192219633e-07, + "logits/chosen": 578797760.0, + "logits/rejected": 406738432.0, + "logps/chosen": -616.7398681640625, + "logps/rejected": -587.0396118164062, + "loss": 0.0222, + "rewards/chosen": 3.554363250732422, + "rewards/margins": 14.809850692749023, + "rewards/rejected": -11.255487442016602, + "step": 9172 + }, + { + "epoch": 0.8380995888533577, + "grad_norm": 0.51171875, + "kl": 0.0, + "learning_rate": 6.358963066091228e-07, + "logits/chosen": 245794432.0, + "logits/rejected": 555535360.0, + "logps/chosen": -178.80357360839844, + "logps/rejected": -398.2617885044643, + "loss": 0.0019, + "rewards/chosen": 4.213183879852295, + "rewards/margins": 14.609668254852295, + "rewards/rejected": -10.396484375, + "step": 9173 + }, + { + "epoch": 0.8381909547738694, + "grad_norm": 1.875, + "kl": 0.0, + "learning_rate": 6.351947818695531e-07, + "logits/chosen": 651789056.0, + "logits/rejected": 483015104.0, + "logps/chosen": -273.6532287597656, + "logps/rejected": -626.4769287109375, + "loss": 0.0127, + "rewards/chosen": 3.7558741569519043, + "rewards/margins": 15.008899211883545, + "rewards/rejected": -11.25302505493164, + "step": 9174 + }, + { + "epoch": 0.838282320694381, + "grad_norm": 4.40625, + "kl": 0.0, + "learning_rate": 6.34493618058935e-07, + "logits/chosen": 411415072.0, + "logits/rejected": 420267861.3333333, + "logps/chosen": -241.58706665039062, + "logps/rejected": -638.45361328125, + "loss": 0.013, + "rewards/chosen": 2.9626283645629883, + "rewards/margins": 13.286548296610514, + "rewards/rejected": -10.323919932047525, + "step": 9175 + }, + { + "epoch": 0.8383736866148926, + "grad_norm": 3.09375, + "kl": 0.0, + "learning_rate": 6.337928152352479e-07, + "logits/chosen": 462070016.0, + "logits/rejected": 379903200.0, + "logps/chosen": -394.2919921875, + "logps/rejected": -532.2772827148438, + "loss": 0.0232, + "rewards/chosen": 3.6904729207356772, + "rewards/margins": 15.56522305806478, + "rewards/rejected": -11.874750137329102, + "step": 9176 + }, + { + "epoch": 0.8384650525354043, + "grad_norm": 3.59375, + "kl": 0.0, + "learning_rate": 6.330923734564403e-07, + "logits/chosen": 253764128.0, + "logits/rejected": 443289216.0, + "logps/chosen": -181.00201416015625, + "logps/rejected": -372.1755676269531, + "loss": 0.0209, + "rewards/chosen": 3.807955026626587, + "rewards/margins": 12.118911981582642, + "rewards/rejected": -8.310956954956055, + "step": 9177 + }, + { + "epoch": 0.838556418455916, + "grad_norm": 2.109375, + "kl": 0.0, + "learning_rate": 6.323922927804327e-07, + "logits/chosen": 636644224.0, + "logits/rejected": 861977856.0, + "logps/chosen": -316.5291748046875, + "logps/rejected": -771.355224609375, + "loss": 0.0082, + "rewards/chosen": 4.587128639221191, + "rewards/margins": 14.530375480651855, + "rewards/rejected": -9.943246841430664, + "step": 9178 + }, + { + "epoch": 0.8386477843764276, + "grad_norm": 1.4609375, + "kl": 0.0, + "learning_rate": 6.31692573265117e-07, + "logits/chosen": 476677056.0, + "logits/rejected": 390171008.0, + "logps/chosen": -303.6439208984375, + "logps/rejected": -441.1221618652344, + "loss": 0.008, + "rewards/chosen": 4.628323078155518, + "rewards/margins": 13.867475986480713, + "rewards/rejected": -9.239152908325195, + "step": 9179 + }, + { + "epoch": 0.8387391502969392, + "grad_norm": 4.78125, + "kl": 0.0, + "learning_rate": 6.309932149683517e-07, + "logits/chosen": 208770764.8, + "logits/rejected": 573029717.3333334, + "logps/chosen": -201.75294189453126, + "logps/rejected": -632.294677734375, + "loss": 0.0342, + "rewards/chosen": 3.465550994873047, + "rewards/margins": 12.43537228902181, + "rewards/rejected": -8.969821294148764, + "step": 9180 + }, + { + "epoch": 0.8388305162174509, + "grad_norm": 3.90625, + "kl": 0.0, + "learning_rate": 6.302942179479676e-07, + "logits/chosen": 504001331.2, + "logits/rejected": 498485973.3333333, + "logps/chosen": -267.917333984375, + "logps/rejected": -579.9640706380209, + "loss": 0.0269, + "rewards/chosen": 3.394919586181641, + "rewards/margins": 13.8119016011556, + "rewards/rejected": -10.416982014973959, + "step": 9181 + }, + { + "epoch": 0.8389218821379626, + "grad_norm": 2.21875, + "kl": 0.0, + "learning_rate": 6.29595582261765e-07, + "logits/chosen": 388835968.0, + "logits/rejected": 349599072.0, + "logps/chosen": -392.6508483886719, + "logps/rejected": -347.28277587890625, + "loss": 0.0136, + "rewards/chosen": 4.374117374420166, + "rewards/margins": 12.407789707183838, + "rewards/rejected": -8.033672332763672, + "step": 9182 + }, + { + "epoch": 0.8390132480584742, + "grad_norm": 3.453125, + "kl": 0.0, + "learning_rate": 6.288973079675153e-07, + "logits/chosen": 666088089.6, + "logits/rejected": 1258162688.0, + "logps/chosen": -338.421484375, + "logps/rejected": -363.2027994791667, + "loss": 0.0213, + "rewards/chosen": 3.7351005554199217, + "rewards/margins": 12.479851531982423, + "rewards/rejected": -8.7447509765625, + "step": 9183 + }, + { + "epoch": 0.8391046139789858, + "grad_norm": 1.2421875, + "kl": 0.0, + "learning_rate": 6.281993951229598e-07, + "logits/chosen": 775242666.6666666, + "logits/rejected": 430242150.4, + "logps/chosen": -403.585205078125, + "logps/rejected": -385.9589599609375, + "loss": 0.0063, + "rewards/chosen": 4.315165201822917, + "rewards/margins": 13.81974131266276, + "rewards/rejected": -9.504576110839844, + "step": 9184 + }, + { + "epoch": 0.8391959798994975, + "grad_norm": 1.5390625, + "kl": 0.0, + "learning_rate": 6.275018437858083e-07, + "logits/chosen": 568373248.0, + "logits/rejected": 686370240.0, + "logps/chosen": -369.2744445800781, + "logps/rejected": -423.31787109375, + "loss": 0.0095, + "rewards/chosen": 4.431681156158447, + "rewards/margins": 13.367772579193115, + "rewards/rejected": -8.936091423034668, + "step": 9185 + }, + { + "epoch": 0.8392873458200092, + "grad_norm": 4.03125, + "kl": 0.0, + "learning_rate": 6.268046540137424e-07, + "logits/chosen": 561523626.6666666, + "logits/rejected": 744122828.8, + "logps/chosen": -338.7908121744792, + "logps/rejected": -493.595849609375, + "loss": 0.0324, + "rewards/chosen": 2.9243958791097007, + "rewards/margins": 12.971239598592122, + "rewards/rejected": -10.046843719482421, + "step": 9186 + }, + { + "epoch": 0.8393787117405208, + "grad_norm": 6.9375, + "kl": 0.0, + "learning_rate": 6.261078258644121e-07, + "logits/chosen": 538124595.2, + "logits/rejected": 444560896.0, + "logps/chosen": -340.739892578125, + "logps/rejected": -464.6114908854167, + "loss": 0.0471, + "rewards/chosen": 2.9496971130371095, + "rewards/margins": 11.984379069010416, + "rewards/rejected": -9.034681955973307, + "step": 9187 + }, + { + "epoch": 0.8394700776610324, + "grad_norm": 6.4375, + "kl": 0.0, + "learning_rate": 6.254113593954403e-07, + "logits/chosen": 316361152.0, + "logits/rejected": 277095552.0, + "logps/chosen": -344.82501220703125, + "logps/rejected": -320.352783203125, + "loss": 0.0212, + "rewards/chosen": 3.8230552673339844, + "rewards/margins": 12.153471946716309, + "rewards/rejected": -8.330416679382324, + "step": 9188 + }, + { + "epoch": 0.839561443581544, + "grad_norm": 4.46875, + "kl": 0.0, + "learning_rate": 6.24715254664418e-07, + "logits/chosen": 394955904.0, + "logits/rejected": 250472000.0, + "logps/chosen": -399.5090576171875, + "logps/rejected": -432.9749348958333, + "loss": 0.0254, + "rewards/chosen": 3.924243927001953, + "rewards/margins": 12.68433698018392, + "rewards/rejected": -8.760093053181967, + "step": 9189 + }, + { + "epoch": 0.8396528095020558, + "grad_norm": 3.046875, + "kl": 0.0, + "learning_rate": 6.240195117289061e-07, + "logits/chosen": 550768192.0, + "logits/rejected": 496669312.0, + "logps/chosen": -361.34814453125, + "logps/rejected": -674.4743041992188, + "loss": 0.0158, + "rewards/chosen": 3.9003684520721436, + "rewards/margins": 14.092255353927612, + "rewards/rejected": -10.191886901855469, + "step": 9190 + }, + { + "epoch": 0.8397441754225674, + "grad_norm": 3.765625, + "kl": 0.0, + "learning_rate": 6.233241306464349e-07, + "logits/chosen": 388979532.8, + "logits/rejected": 658868480.0, + "logps/chosen": -272.854345703125, + "logps/rejected": -654.6642252604166, + "loss": 0.03, + "rewards/chosen": 3.4093399047851562, + "rewards/margins": 14.664728800455729, + "rewards/rejected": -11.255388895670572, + "step": 9191 + }, + { + "epoch": 0.839835541343079, + "grad_norm": 1.1171875, + "kl": 0.0, + "learning_rate": 6.226291114745086e-07, + "logits/chosen": 237030448.0, + "logits/rejected": 552634965.3333334, + "logps/chosen": -244.67108154296875, + "logps/rejected": -424.9676106770833, + "loss": 0.003, + "rewards/chosen": 5.762003421783447, + "rewards/margins": 14.609386285146078, + "rewards/rejected": -8.84738286336263, + "step": 9192 + }, + { + "epoch": 0.8399269072635906, + "grad_norm": 1.9140625, + "kl": 0.0, + "learning_rate": 6.219344542705985e-07, + "logits/chosen": 459831594.6666667, + "logits/rejected": 516251648.0, + "logps/chosen": -229.814453125, + "logps/rejected": -468.038720703125, + "loss": 0.0118, + "rewards/chosen": 3.818333625793457, + "rewards/margins": 12.729181098937989, + "rewards/rejected": -8.910847473144532, + "step": 9193 + }, + { + "epoch": 0.8400182731841024, + "grad_norm": 3.625, + "kl": 0.0, + "learning_rate": 6.212401590921447e-07, + "logits/chosen": 661133824.0, + "logits/rejected": 275930944.0, + "logps/chosen": -346.91400146484375, + "logps/rejected": -598.1097412109375, + "loss": 0.022, + "rewards/chosen": 3.5441060066223145, + "rewards/margins": 13.470642566680908, + "rewards/rejected": -9.926536560058594, + "step": 9194 + }, + { + "epoch": 0.840109639104614, + "grad_norm": 2.046875, + "kl": 0.0, + "learning_rate": 6.205462259965583e-07, + "logits/chosen": 901463040.0, + "logits/rejected": 598436992.0, + "logps/chosen": -426.77001953125, + "logps/rejected": -293.05126953125, + "loss": 0.0143, + "rewards/chosen": 3.795413017272949, + "rewards/margins": 11.588653564453125, + "rewards/rejected": -7.793240547180176, + "step": 9195 + }, + { + "epoch": 0.8402010050251256, + "grad_norm": 3.1875, + "kl": 0.0, + "learning_rate": 6.198526550412232e-07, + "logits/chosen": 616613312.0, + "logits/rejected": 597936960.0, + "logps/chosen": -472.82379150390625, + "logps/rejected": -568.3323364257812, + "loss": 0.0176, + "rewards/chosen": 3.5201072692871094, + "rewards/margins": 12.969039916992188, + "rewards/rejected": -9.448932647705078, + "step": 9196 + }, + { + "epoch": 0.8402923709456372, + "grad_norm": 2.796875, + "kl": 0.0, + "learning_rate": 6.19159446283491e-07, + "logits/chosen": 458120192.0, + "logits/rejected": 289500032.0, + "logps/chosen": -346.6822509765625, + "logps/rejected": -245.08834838867188, + "loss": 0.0169, + "rewards/chosen": 3.5361058712005615, + "rewards/margins": 12.202277898788452, + "rewards/rejected": -8.66617202758789, + "step": 9197 + }, + { + "epoch": 0.840383736866149, + "grad_norm": 1.7421875, + "kl": 0.0, + "learning_rate": 6.184665997806832e-07, + "logits/chosen": 507994112.0, + "logits/rejected": 634642112.0, + "logps/chosen": -233.52362060546875, + "logps/rejected": -568.92333984375, + "loss": 0.0128, + "rewards/chosen": 4.707950592041016, + "rewards/margins": 13.363988876342773, + "rewards/rejected": -8.656038284301758, + "step": 9198 + }, + { + "epoch": 0.8404751027866606, + "grad_norm": 2.328125, + "kl": 0.0, + "learning_rate": 6.177741155900901e-07, + "logits/chosen": 383959360.0, + "logits/rejected": 509505365.3333333, + "logps/chosen": -189.19656372070312, + "logps/rejected": -433.0654296875, + "loss": 0.0074, + "rewards/chosen": 4.102820873260498, + "rewards/margins": 14.709221680959066, + "rewards/rejected": -10.606400807698568, + "step": 9199 + }, + { + "epoch": 0.8405664687071722, + "grad_norm": 2.9375, + "kl": 0.0, + "learning_rate": 6.170819937689771e-07, + "logits/chosen": 333286496.0, + "logits/rejected": 382862976.0, + "logps/chosen": -230.98001098632812, + "logps/rejected": -462.24560546875, + "loss": 0.0294, + "rewards/chosen": 2.9014761447906494, + "rewards/margins": 14.91782021522522, + "rewards/rejected": -12.01634407043457, + "step": 9200 + }, + { + "epoch": 0.8406578346276838, + "grad_norm": 8.0625, + "kl": 0.0, + "learning_rate": 6.163902343745748e-07, + "logits/chosen": 546913792.0, + "logits/rejected": 502085017.6, + "logps/chosen": -208.74652099609375, + "logps/rejected": -520.00390625, + "loss": 0.0459, + "rewards/chosen": 2.1499547958374023, + "rewards/margins": 11.53624324798584, + "rewards/rejected": -9.386288452148438, + "step": 9201 + }, + { + "epoch": 0.8407492005481956, + "grad_norm": 3.296875, + "kl": 0.0, + "learning_rate": 6.156988374640843e-07, + "logits/chosen": 802719104.0, + "logits/rejected": 811244480.0, + "logps/chosen": -350.627685546875, + "logps/rejected": -886.7462158203125, + "loss": 0.0237, + "rewards/chosen": 3.244217872619629, + "rewards/margins": 14.58060359954834, + "rewards/rejected": -11.336385726928711, + "step": 9202 + }, + { + "epoch": 0.8408405664687072, + "grad_norm": 1.25, + "kl": 0.0, + "learning_rate": 6.150078030946777e-07, + "logits/chosen": 704619200.0, + "logits/rejected": 623931520.0, + "logps/chosen": -601.759033203125, + "logps/rejected": -545.3097330729166, + "loss": 0.0052, + "rewards/chosen": 3.898843288421631, + "rewards/margins": 14.269766648610434, + "rewards/rejected": -10.370923360188803, + "step": 9203 + }, + { + "epoch": 0.8409319323892188, + "grad_norm": 57.25, + "kl": 0.0, + "learning_rate": 6.143171313234991e-07, + "logits/chosen": 302884309.3333333, + "logits/rejected": 588207820.8, + "logps/chosen": -180.96333821614584, + "logps/rejected": -583.00908203125, + "loss": 0.0677, + "rewards/chosen": 3.8256285985310874, + "rewards/margins": 12.51217187245687, + "rewards/rejected": -8.686543273925782, + "step": 9204 + }, + { + "epoch": 0.8410232983097304, + "grad_norm": 4.15625, + "kl": 0.0, + "learning_rate": 6.136268222076597e-07, + "logits/chosen": 403731456.0, + "logits/rejected": 386036544.0, + "logps/chosen": -250.9781290690104, + "logps/rejected": -465.1302185058594, + "loss": 0.0282, + "rewards/chosen": 3.8716615041097007, + "rewards/margins": 15.93420155843099, + "rewards/rejected": -12.062540054321289, + "step": 9205 + }, + { + "epoch": 0.8411146642302422, + "grad_norm": 3.375, + "kl": 0.0, + "learning_rate": 6.129368758042414e-07, + "logits/chosen": 616163840.0, + "logits/rejected": 411105216.0, + "logps/chosen": -335.4288330078125, + "logps/rejected": -374.30517578125, + "loss": 0.0241, + "rewards/chosen": 3.8563003540039062, + "rewards/margins": 13.447668075561523, + "rewards/rejected": -9.591367721557617, + "step": 9206 + }, + { + "epoch": 0.8412060301507538, + "grad_norm": 1.03125, + "kl": 0.0, + "learning_rate": 6.12247292170296e-07, + "logits/chosen": 574860928.0, + "logits/rejected": 454187571.2, + "logps/chosen": -520.5362955729166, + "logps/rejected": -571.77001953125, + "loss": 0.0058, + "rewards/chosen": 4.162646611531575, + "rewards/margins": 14.435835393269855, + "rewards/rejected": -10.27318878173828, + "step": 9207 + }, + { + "epoch": 0.8412973960712654, + "grad_norm": 1.8828125, + "kl": 0.0, + "learning_rate": 6.115580713628455e-07, + "logits/chosen": 376166528.0, + "logits/rejected": 518073600.0, + "logps/chosen": -279.084814453125, + "logps/rejected": -522.3570149739584, + "loss": 0.0146, + "rewards/chosen": 4.163381576538086, + "rewards/margins": 14.249133427937826, + "rewards/rejected": -10.08575185139974, + "step": 9208 + }, + { + "epoch": 0.841388761991777, + "grad_norm": 2.015625, + "kl": 0.0, + "learning_rate": 6.108692134388833e-07, + "logits/chosen": 523214304.0, + "logits/rejected": 593436608.0, + "logps/chosen": -337.43255615234375, + "logps/rejected": -553.5953369140625, + "loss": 0.0105, + "rewards/chosen": 4.165482521057129, + "rewards/margins": 14.615632057189941, + "rewards/rejected": -10.450149536132812, + "step": 9209 + }, + { + "epoch": 0.8414801279122888, + "grad_norm": 1.6875, + "kl": 0.0, + "learning_rate": 6.101807184553715e-07, + "logits/chosen": 762740672.0, + "logits/rejected": 649150080.0, + "logps/chosen": -482.9178466796875, + "logps/rejected": -501.4554850260417, + "loss": 0.0091, + "rewards/chosen": 3.745234966278076, + "rewards/margins": 11.889785925547281, + "rewards/rejected": -8.144550959269205, + "step": 9210 + }, + { + "epoch": 0.8415714938328004, + "grad_norm": 2.0625, + "kl": 0.0, + "learning_rate": 6.094925864692414e-07, + "logits/chosen": 999638592.0, + "logits/rejected": 511364778.6666667, + "logps/chosen": -364.06036376953125, + "logps/rejected": -582.947265625, + "loss": 0.0076, + "rewards/chosen": 4.219148635864258, + "rewards/margins": 13.063645044962565, + "rewards/rejected": -8.844496409098307, + "step": 9211 + }, + { + "epoch": 0.841662859753312, + "grad_norm": 3.5625, + "kl": 0.0, + "learning_rate": 6.088048175373945e-07, + "logits/chosen": 1145802752.0, + "logits/rejected": 516148309.3333333, + "logps/chosen": -367.960546875, + "logps/rejected": -554.0000813802084, + "loss": 0.0187, + "rewards/chosen": 3.9792022705078125, + "rewards/margins": 14.261220296223959, + "rewards/rejected": -10.282018025716146, + "step": 9212 + }, + { + "epoch": 0.8417542256738236, + "grad_norm": 47.5, + "kl": 0.0, + "learning_rate": 6.081174117167049e-07, + "logits/chosen": 790146901.3333334, + "logits/rejected": 435510425.6, + "logps/chosen": -243.32039388020834, + "logps/rejected": -372.764306640625, + "loss": 0.0302, + "rewards/chosen": 4.488293647766113, + "rewards/margins": 11.675751686096191, + "rewards/rejected": -7.187458038330078, + "step": 9213 + }, + { + "epoch": 0.8418455915943354, + "grad_norm": 9.125, + "kl": 0.0, + "learning_rate": 6.074303690640132e-07, + "logits/chosen": 1269005568.0, + "logits/rejected": 768385408.0, + "logps/chosen": -529.21728515625, + "logps/rejected": -609.2615356445312, + "loss": 0.0159, + "rewards/chosen": 4.226592063903809, + "rewards/margins": 11.93687629699707, + "rewards/rejected": -7.710284233093262, + "step": 9214 + }, + { + "epoch": 0.841936957514847, + "grad_norm": 1.1875, + "kl": 0.0, + "learning_rate": 6.067436896361323e-07, + "logits/chosen": 507461580.8, + "logits/rejected": 650934101.3333334, + "logps/chosen": -256.57109375, + "logps/rejected": -373.2021891276042, + "loss": 0.007, + "rewards/chosen": 4.838938140869141, + "rewards/margins": 14.064019266764323, + "rewards/rejected": -9.225081125895182, + "step": 9215 + }, + { + "epoch": 0.8420283234353586, + "grad_norm": 1.2109375, + "kl": 0.0, + "learning_rate": 6.06057373489844e-07, + "logits/chosen": 472692544.0, + "logits/rejected": 650426368.0, + "logps/chosen": -168.476318359375, + "logps/rejected": -513.4543050130209, + "loss": 0.0095, + "rewards/chosen": 3.402055025100708, + "rewards/margins": 11.624453783035278, + "rewards/rejected": -8.22239875793457, + "step": 9216 + }, + { + "epoch": 0.8421196893558702, + "grad_norm": 0.326171875, + "kl": 0.0, + "learning_rate": 6.053714206818989e-07, + "logits/chosen": 246182512.0, + "logits/rejected": 435488597.3333333, + "logps/chosen": -220.2774658203125, + "logps/rejected": -549.732421875, + "loss": 0.0015, + "rewards/chosen": 5.22353458404541, + "rewards/margins": 15.46790599822998, + "rewards/rejected": -10.24437141418457, + "step": 9217 + }, + { + "epoch": 0.842211055276382, + "grad_norm": 2.0, + "kl": 0.0, + "learning_rate": 6.046858312690212e-07, + "logits/chosen": 621248192.0, + "logits/rejected": 532158880.0, + "logps/chosen": -274.72711181640625, + "logps/rejected": -444.2261962890625, + "loss": 0.0143, + "rewards/chosen": 3.7417593002319336, + "rewards/margins": 12.799072265625, + "rewards/rejected": -9.057312965393066, + "step": 9218 + }, + { + "epoch": 0.8423024211968936, + "grad_norm": 4.4375, + "kl": 0.0, + "learning_rate": 6.040006053079029e-07, + "logits/chosen": 542072758.8571428, + "logits/rejected": 154123968.0, + "logps/chosen": -355.4125279017857, + "logps/rejected": -214.57803344726562, + "loss": 0.0294, + "rewards/chosen": 3.8436173030308316, + "rewards/margins": 12.759554726736887, + "rewards/rejected": -8.915937423706055, + "step": 9219 + }, + { + "epoch": 0.8423937871174052, + "grad_norm": 6.59375, + "kl": 0.0, + "learning_rate": 6.033157428552027e-07, + "logits/chosen": 396159104.0, + "logits/rejected": 638288384.0, + "logps/chosen": -289.7157897949219, + "logps/rejected": -591.7886962890625, + "loss": 0.0902, + "rewards/chosen": 5.010063648223877, + "rewards/margins": 13.07063913345337, + "rewards/rejected": -8.060575485229492, + "step": 9220 + }, + { + "epoch": 0.8424851530379168, + "grad_norm": 5.71875, + "kl": 0.0, + "learning_rate": 6.026312439675553e-07, + "logits/chosen": 648113344.0, + "logits/rejected": 422129792.0, + "logps/chosen": -518.8818359375, + "logps/rejected": -340.7674560546875, + "loss": 0.0327, + "rewards/chosen": 3.315204620361328, + "rewards/margins": 10.552947044372559, + "rewards/rejected": -7.2377424240112305, + "step": 9221 + }, + { + "epoch": 0.8425765189584286, + "grad_norm": 2.21875, + "kl": 0.0, + "learning_rate": 6.019471087015611e-07, + "logits/chosen": 679623936.0, + "logits/rejected": 332316608.0, + "logps/chosen": -344.05096435546875, + "logps/rejected": -302.87457275390625, + "loss": 0.0121, + "rewards/chosen": 3.9014639854431152, + "rewards/margins": 12.469467639923096, + "rewards/rejected": -8.56800365447998, + "step": 9222 + }, + { + "epoch": 0.8426678848789402, + "grad_norm": 1.34375, + "kl": 0.0, + "learning_rate": 6.012633371137927e-07, + "logits/chosen": 383969952.0, + "logits/rejected": 404731264.0, + "logps/chosen": -359.4927062988281, + "logps/rejected": -217.15191650390625, + "loss": 0.0189, + "rewards/chosen": 4.201143264770508, + "rewards/margins": 11.43502950668335, + "rewards/rejected": -7.233886241912842, + "step": 9223 + }, + { + "epoch": 0.8427592507994518, + "grad_norm": 4.21875, + "kl": 0.0, + "learning_rate": 6.0057992926079e-07, + "logits/chosen": 437295030.85714287, + "logits/rejected": 594880896.0, + "logps/chosen": -234.646240234375, + "logps/rejected": -661.0360107421875, + "loss": 0.0345, + "rewards/chosen": 3.5247464861188615, + "rewards/margins": 13.861220904758998, + "rewards/rejected": -10.336474418640137, + "step": 9224 + }, + { + "epoch": 0.8428506167199634, + "grad_norm": 1.9609375, + "kl": 0.0, + "learning_rate": 5.998968851990666e-07, + "logits/chosen": 621780864.0, + "logits/rejected": 296775808.0, + "logps/chosen": -572.3837280273438, + "logps/rejected": -354.7969970703125, + "loss": 0.0087, + "rewards/chosen": 3.3840255737304688, + "rewards/margins": 12.466761271158854, + "rewards/rejected": -9.082735697428385, + "step": 9225 + }, + { + "epoch": 0.8429419826404752, + "grad_norm": 3.890625, + "kl": 0.0, + "learning_rate": 5.99214204985103e-07, + "logits/chosen": 603714474.6666666, + "logits/rejected": 246457136.0, + "logps/chosen": -403.7163899739583, + "logps/rejected": -383.91461181640625, + "loss": 0.03, + "rewards/chosen": 3.803123156229655, + "rewards/margins": 15.511879603068033, + "rewards/rejected": -11.708756446838379, + "step": 9226 + }, + { + "epoch": 0.8430333485609868, + "grad_norm": 3.375, + "kl": 0.0, + "learning_rate": 5.9853188867535e-07, + "logits/chosen": 563027584.0, + "logits/rejected": 658528768.0, + "logps/chosen": -321.8985290527344, + "logps/rejected": -705.072021484375, + "loss": 0.0185, + "rewards/chosen": 3.4745306968688965, + "rewards/margins": 13.393779277801514, + "rewards/rejected": -9.919248580932617, + "step": 9227 + }, + { + "epoch": 0.8431247144814984, + "grad_norm": 1.3125, + "kl": 0.0, + "learning_rate": 5.978499363262297e-07, + "logits/chosen": 649421994.6666666, + "logits/rejected": 305546752.0, + "logps/chosen": -433.7552490234375, + "logps/rejected": -451.5169921875, + "loss": 0.0075, + "rewards/chosen": 3.9859739939371743, + "rewards/margins": 13.336961237589518, + "rewards/rejected": -9.350987243652344, + "step": 9228 + }, + { + "epoch": 0.84321608040201, + "grad_norm": 8.75, + "kl": 0.0, + "learning_rate": 5.97168347994132e-07, + "logits/chosen": 872100761.6, + "logits/rejected": 794649941.3333334, + "logps/chosen": -322.888671875, + "logps/rejected": -594.0685221354166, + "loss": 0.0415, + "rewards/chosen": 3.271623229980469, + "rewards/margins": 13.319698588053384, + "rewards/rejected": -10.048075358072916, + "step": 9229 + }, + { + "epoch": 0.8433074463225217, + "grad_norm": 3.9375, + "kl": 0.0, + "learning_rate": 5.964871237354192e-07, + "logits/chosen": 411231296.0, + "logits/rejected": 579607552.0, + "logps/chosen": -312.5015563964844, + "logps/rejected": -518.6913452148438, + "loss": 0.028, + "rewards/chosen": 3.470083713531494, + "rewards/margins": 13.661686420440674, + "rewards/rejected": -10.19160270690918, + "step": 9230 + }, + { + "epoch": 0.8433988122430334, + "grad_norm": 7.71875, + "kl": 0.0, + "learning_rate": 5.958062636064227e-07, + "logits/chosen": 581160960.0, + "logits/rejected": 728841216.0, + "logps/chosen": -195.18421936035156, + "logps/rejected": -466.61468505859375, + "loss": 0.0411, + "rewards/chosen": 3.270167827606201, + "rewards/margins": 11.035449028015137, + "rewards/rejected": -7.7652812004089355, + "step": 9231 + }, + { + "epoch": 0.843490178163545, + "grad_norm": 2.109375, + "kl": 0.0, + "learning_rate": 5.951257676634414e-07, + "logits/chosen": 667908864.0, + "logits/rejected": 366825568.0, + "logps/chosen": -368.97216796875, + "logps/rejected": -399.8083801269531, + "loss": 0.0141, + "rewards/chosen": 3.5770866870880127, + "rewards/margins": 13.265883684158325, + "rewards/rejected": -9.688796997070312, + "step": 9232 + }, + { + "epoch": 0.8435815440840566, + "grad_norm": 0.28515625, + "kl": 0.0, + "learning_rate": 5.944456359627471e-07, + "logits/chosen": 151803264.0, + "logits/rejected": 354697216.0, + "logps/chosen": -85.84607696533203, + "logps/rejected": -445.78526088169644, + "loss": 0.0015, + "rewards/chosen": 4.8750481605529785, + "rewards/margins": 13.288865430014473, + "rewards/rejected": -8.413817269461495, + "step": 9233 + }, + { + "epoch": 0.8436729100045683, + "grad_norm": 4.28125, + "kl": 0.0, + "learning_rate": 5.937658685605807e-07, + "logits/chosen": 777746773.3333334, + "logits/rejected": 839157760.0, + "logps/chosen": -572.0618489583334, + "logps/rejected": -706.1400390625, + "loss": 0.0221, + "rewards/chosen": 3.4146076838175454, + "rewards/margins": 12.732213274637857, + "rewards/rejected": -9.317605590820312, + "step": 9234 + }, + { + "epoch": 0.84376427592508, + "grad_norm": 3.125, + "kl": 0.0, + "learning_rate": 5.930864655131524e-07, + "logits/chosen": 425566720.0, + "logits/rejected": 713437888.0, + "logps/chosen": -292.82244873046875, + "logps/rejected": -638.5444946289062, + "loss": 0.0183, + "rewards/chosen": 3.78275203704834, + "rewards/margins": 17.52950954437256, + "rewards/rejected": -13.746757507324219, + "step": 9235 + }, + { + "epoch": 0.8438556418455916, + "grad_norm": 8.375, + "kl": 0.0, + "learning_rate": 5.924074268766422e-07, + "logits/chosen": 772329856.0, + "logits/rejected": 494010282.6666667, + "logps/chosen": -221.46759033203125, + "logps/rejected": -465.6664225260417, + "loss": 0.0932, + "rewards/chosen": 2.687645435333252, + "rewards/margins": 11.03620704015096, + "rewards/rejected": -8.348561604817709, + "step": 9236 + }, + { + "epoch": 0.8439470077661032, + "grad_norm": 3.03125, + "kl": 0.0, + "learning_rate": 5.917287527072007e-07, + "logits/chosen": 484255296.0, + "logits/rejected": 415965888.0, + "logps/chosen": -207.22752380371094, + "logps/rejected": -621.2476196289062, + "loss": 0.0252, + "rewards/chosen": 3.3977904319763184, + "rewards/margins": 13.74107313156128, + "rewards/rejected": -10.343282699584961, + "step": 9237 + }, + { + "epoch": 0.8440383736866149, + "grad_norm": 1.75, + "kl": 0.0, + "learning_rate": 5.91050443060947e-07, + "logits/chosen": 708160768.0, + "logits/rejected": 612991552.0, + "logps/chosen": -486.1935119628906, + "logps/rejected": -475.53363037109375, + "loss": 0.0114, + "rewards/chosen": 3.7810966968536377, + "rewards/margins": 12.377812623977661, + "rewards/rejected": -8.596715927124023, + "step": 9238 + }, + { + "epoch": 0.8441297396071266, + "grad_norm": 3.015625, + "kl": 0.0, + "learning_rate": 5.903724979939729e-07, + "logits/chosen": 546681792.0, + "logits/rejected": 745917440.0, + "logps/chosen": -113.63485717773438, + "logps/rejected": -469.3691813151042, + "loss": 0.0222, + "rewards/chosen": 3.2925214767456055, + "rewards/margins": 13.085622469584147, + "rewards/rejected": -9.793100992838541, + "step": 9239 + }, + { + "epoch": 0.8442211055276382, + "grad_norm": 4.625, + "kl": 0.0, + "learning_rate": 5.896949175623367e-07, + "logits/chosen": 394419200.0, + "logits/rejected": 235226214.4, + "logps/chosen": -287.4471435546875, + "logps/rejected": -344.874462890625, + "loss": 0.0262, + "rewards/chosen": 2.690805117289225, + "rewards/margins": 11.58615576426188, + "rewards/rejected": -8.895350646972656, + "step": 9240 + }, + { + "epoch": 0.8443124714481498, + "grad_norm": 0.93359375, + "kl": 0.0, + "learning_rate": 5.890177018220688e-07, + "logits/chosen": 839244672.0, + "logits/rejected": 771722496.0, + "logps/chosen": -600.783935546875, + "logps/rejected": -642.3886311848959, + "loss": 0.0041, + "rewards/chosen": 4.529563903808594, + "rewards/margins": 12.950919469197592, + "rewards/rejected": -8.421355565388998, + "step": 9241 + }, + { + "epoch": 0.8444038373686615, + "grad_norm": 3.8125, + "kl": 0.0, + "learning_rate": 5.883408508291671e-07, + "logits/chosen": 561840426.6666666, + "logits/rejected": 610898227.2, + "logps/chosen": -396.1619466145833, + "logps/rejected": -597.761865234375, + "loss": 0.0204, + "rewards/chosen": 2.9366671244303384, + "rewards/margins": 13.00568873087565, + "rewards/rejected": -10.069021606445313, + "step": 9242 + }, + { + "epoch": 0.8444952032891732, + "grad_norm": 2.09375, + "kl": 0.0, + "learning_rate": 5.876643646396035e-07, + "logits/chosen": 630452864.0, + "logits/rejected": 942840320.0, + "logps/chosen": -364.99334716796875, + "logps/rejected": -279.6329345703125, + "loss": 0.0101, + "rewards/chosen": 4.730482578277588, + "rewards/margins": 11.356486320495605, + "rewards/rejected": -6.626003742218018, + "step": 9243 + }, + { + "epoch": 0.8445865692096848, + "grad_norm": 3.625, + "kl": 0.0, + "learning_rate": 5.869882433093154e-07, + "logits/chosen": 841076121.6, + "logits/rejected": 633660629.3333334, + "logps/chosen": -352.6562255859375, + "logps/rejected": -465.4860432942708, + "loss": 0.0246, + "rewards/chosen": 3.6657787322998048, + "rewards/margins": 14.664616521199545, + "rewards/rejected": -10.99883778889974, + "step": 9244 + }, + { + "epoch": 0.8446779351301964, + "grad_norm": 1.578125, + "kl": 0.0, + "learning_rate": 5.863124868942121e-07, + "logits/chosen": 588095146.6666666, + "logits/rejected": 518639411.2, + "logps/chosen": -331.9453125, + "logps/rejected": -393.7080810546875, + "loss": 0.0075, + "rewards/chosen": 3.970128377278646, + "rewards/margins": 12.309772237141928, + "rewards/rejected": -8.339643859863282, + "step": 9245 + }, + { + "epoch": 0.8447693010507081, + "grad_norm": 2.515625, + "kl": 0.0, + "learning_rate": 5.856370954501722e-07, + "logits/chosen": 637028138.6666666, + "logits/rejected": 431334604.8, + "logps/chosen": -551.74462890625, + "logps/rejected": -523.419091796875, + "loss": 0.0134, + "rewards/chosen": 3.6446800231933594, + "rewards/margins": 13.797503662109374, + "rewards/rejected": -10.152823638916015, + "step": 9246 + }, + { + "epoch": 0.8448606669712198, + "grad_norm": 1.5390625, + "kl": 0.0, + "learning_rate": 5.849620690330448e-07, + "logits/chosen": 451882956.8, + "logits/rejected": 543248725.3333334, + "logps/chosen": -157.27847900390626, + "logps/rejected": -483.964599609375, + "loss": 0.0237, + "rewards/chosen": 4.193520355224609, + "rewards/margins": 14.800502268473306, + "rewards/rejected": -10.606981913248697, + "step": 9247 + }, + { + "epoch": 0.8449520328917314, + "grad_norm": 1.5546875, + "kl": 0.0, + "learning_rate": 5.842874076986482e-07, + "logits/chosen": 459026752.0, + "logits/rejected": 724656042.6666666, + "logps/chosen": -195.94284057617188, + "logps/rejected": -473.1861979166667, + "loss": 0.0088, + "rewards/chosen": 3.8817529678344727, + "rewards/margins": 13.630895932515463, + "rewards/rejected": -9.74914296468099, + "step": 9248 + }, + { + "epoch": 0.845043398812243, + "grad_norm": 1.1171875, + "kl": 0.0, + "learning_rate": 5.836131115027704e-07, + "logits/chosen": 529652032.0, + "logits/rejected": 511720576.0, + "logps/chosen": -243.24151611328125, + "logps/rejected": -383.06787109375, + "loss": 0.0064, + "rewards/chosen": 5.131533622741699, + "rewards/margins": 15.526124954223633, + "rewards/rejected": -10.394591331481934, + "step": 9249 + }, + { + "epoch": 0.8451347647327547, + "grad_norm": 2.90625, + "kl": 0.0, + "learning_rate": 5.829391805011686e-07, + "logits/chosen": 1047083827.2, + "logits/rejected": 475857664.0, + "logps/chosen": -264.6102294921875, + "logps/rejected": -321.3370361328125, + "loss": 0.1277, + "rewards/chosen": 3.234193801879883, + "rewards/margins": 7.876468658447266, + "rewards/rejected": -4.642274856567383, + "step": 9250 + }, + { + "epoch": 0.8452261306532663, + "grad_norm": 44.0, + "kl": 0.0, + "learning_rate": 5.822656147495725e-07, + "logits/chosen": 835218048.0, + "logits/rejected": 874115456.0, + "logps/chosen": -390.7344665527344, + "logps/rejected": -488.75732421875, + "loss": 0.0889, + "rewards/chosen": 4.04808235168457, + "rewards/margins": 10.59077262878418, + "rewards/rejected": -6.542690277099609, + "step": 9251 + }, + { + "epoch": 0.845317496573778, + "grad_norm": 2.828125, + "kl": 0.0, + "learning_rate": 5.815924143036789e-07, + "logits/chosen": 1078749184.0, + "logits/rejected": 707277312.0, + "logps/chosen": -329.7255859375, + "logps/rejected": -831.8893432617188, + "loss": 0.0169, + "rewards/chosen": 3.384608268737793, + "rewards/margins": 16.661038398742676, + "rewards/rejected": -13.276430130004883, + "step": 9252 + }, + { + "epoch": 0.8454088624942896, + "grad_norm": 7.15625, + "kl": 0.0, + "learning_rate": 5.809195792191552e-07, + "logits/chosen": 680198016.0, + "logits/rejected": 459094570.6666667, + "logps/chosen": -340.30950927734375, + "logps/rejected": -246.6293741861979, + "loss": 0.1035, + "rewards/chosen": 2.318974256515503, + "rewards/margins": 9.204246600468952, + "rewards/rejected": -6.88527234395345, + "step": 9253 + }, + { + "epoch": 0.8455002284148013, + "grad_norm": 1.9375, + "kl": 0.0, + "learning_rate": 5.802471095516371e-07, + "logits/chosen": 363215189.3333333, + "logits/rejected": 517208192.0, + "logps/chosen": -326.7006429036458, + "logps/rejected": -390.66717529296875, + "loss": 0.0173, + "rewards/chosen": 3.852947235107422, + "rewards/margins": 14.00957202911377, + "rewards/rejected": -10.156624794006348, + "step": 9254 + }, + { + "epoch": 0.845591594335313, + "grad_norm": 1.6015625, + "kl": 0.0, + "learning_rate": 5.79575005356735e-07, + "logits/chosen": 508818773.3333333, + "logits/rejected": 482183168.0, + "logps/chosen": -467.9586588541667, + "logps/rejected": -416.6939453125, + "loss": 0.0087, + "rewards/chosen": 3.863706588745117, + "rewards/margins": 13.176701736450195, + "rewards/rejected": -9.312995147705077, + "step": 9255 + }, + { + "epoch": 0.8456829602558246, + "grad_norm": 3.46875, + "kl": 0.0, + "learning_rate": 5.78903266690023e-07, + "logits/chosen": 557484992.0, + "logits/rejected": 646296448.0, + "logps/chosen": -467.37469482421875, + "logps/rejected": -536.4130249023438, + "loss": 0.0208, + "rewards/chosen": 3.322401523590088, + "rewards/margins": 13.065294742584229, + "rewards/rejected": -9.74289321899414, + "step": 9256 + }, + { + "epoch": 0.8457743261763362, + "grad_norm": 41.75, + "kl": 0.0, + "learning_rate": 5.782318936070491e-07, + "logits/chosen": 976946304.0, + "logits/rejected": 460186432.0, + "logps/chosen": -146.42544555664062, + "logps/rejected": -253.43885803222656, + "loss": 0.1058, + "rewards/chosen": 2.6110880374908447, + "rewards/margins": 8.727150201797485, + "rewards/rejected": -6.116062164306641, + "step": 9257 + }, + { + "epoch": 0.8458656920968479, + "grad_norm": 10.4375, + "kl": 0.0, + "learning_rate": 5.775608861633286e-07, + "logits/chosen": 720966400.0, + "logits/rejected": 636413184.0, + "logps/chosen": -306.3535461425781, + "logps/rejected": -691.12744140625, + "loss": 0.1187, + "rewards/chosen": 2.4831230640411377, + "rewards/margins": 11.535779237747192, + "rewards/rejected": -9.052656173706055, + "step": 9258 + }, + { + "epoch": 0.8459570580173595, + "grad_norm": 3.359375, + "kl": 0.0, + "learning_rate": 5.76890244414347e-07, + "logits/chosen": 451494464.0, + "logits/rejected": 654169984.0, + "logps/chosen": -192.69505310058594, + "logps/rejected": -763.917724609375, + "loss": 0.0121, + "rewards/chosen": 4.770869731903076, + "rewards/margins": 15.486252307891846, + "rewards/rejected": -10.71538257598877, + "step": 9259 + }, + { + "epoch": 0.8460484239378712, + "grad_norm": 1.234375, + "kl": 0.0, + "learning_rate": 5.762199684155623e-07, + "logits/chosen": 532557312.0, + "logits/rejected": 471769600.0, + "logps/chosen": -338.69826253255206, + "logps/rejected": -579.911865234375, + "loss": 0.0084, + "rewards/chosen": 3.9340731302897134, + "rewards/margins": 14.158450571695964, + "rewards/rejected": -10.22437744140625, + "step": 9260 + }, + { + "epoch": 0.8461397898583828, + "grad_norm": 2.84375, + "kl": 0.0, + "learning_rate": 5.755500582223994e-07, + "logits/chosen": 659719424.0, + "logits/rejected": 455084441.6, + "logps/chosen": -528.9522298177084, + "logps/rejected": -316.136083984375, + "loss": 0.0142, + "rewards/chosen": 3.325667381286621, + "rewards/margins": 12.616568565368652, + "rewards/rejected": -9.290901184082031, + "step": 9261 + }, + { + "epoch": 0.8462311557788945, + "grad_norm": 2.265625, + "kl": 0.0, + "learning_rate": 5.748805138902524e-07, + "logits/chosen": 396526796.8, + "logits/rejected": 709552469.3333334, + "logps/chosen": -239.384326171875, + "logps/rejected": -705.2764485677084, + "loss": 0.0157, + "rewards/chosen": 4.059734344482422, + "rewards/margins": 14.881790161132812, + "rewards/rejected": -10.82205581665039, + "step": 9262 + }, + { + "epoch": 0.8463225216994061, + "grad_norm": 3.671875, + "kl": 0.0, + "learning_rate": 5.742113354744872e-07, + "logits/chosen": 442177877.3333333, + "logits/rejected": 1006309990.4, + "logps/chosen": -333.35935465494794, + "logps/rejected": -484.42587890625, + "loss": 0.0229, + "rewards/chosen": 3.9051148096720376, + "rewards/margins": 11.086229006449381, + "rewards/rejected": -7.181114196777344, + "step": 9263 + }, + { + "epoch": 0.8464138876199178, + "grad_norm": 3.828125, + "kl": 0.0, + "learning_rate": 5.735425230304386e-07, + "logits/chosen": 376351590.4, + "logits/rejected": 244489685.33333334, + "logps/chosen": -471.88642578125, + "logps/rejected": -310.5681559244792, + "loss": 0.0155, + "rewards/chosen": 4.048651885986328, + "rewards/margins": 14.405176035563152, + "rewards/rejected": -10.356524149576822, + "step": 9264 + }, + { + "epoch": 0.8465052535404294, + "grad_norm": 4.6875, + "kl": 0.0, + "learning_rate": 5.728740766134122e-07, + "logits/chosen": 398306201.6, + "logits/rejected": 329948586.6666667, + "logps/chosen": -315.8505859375, + "logps/rejected": -250.57763671875, + "loss": 0.0315, + "rewards/chosen": 3.4363563537597654, + "rewards/margins": 10.478330103556315, + "rewards/rejected": -7.04197374979655, + "step": 9265 + }, + { + "epoch": 0.8465966194609411, + "grad_norm": 3.328125, + "kl": 0.0, + "learning_rate": 5.722059962786813e-07, + "logits/chosen": 576505152.0, + "logits/rejected": 814194624.0, + "logps/chosen": -506.65863037109375, + "logps/rejected": -700.141845703125, + "loss": 0.0201, + "rewards/chosen": 3.4896483421325684, + "rewards/margins": 13.312161922454834, + "rewards/rejected": -9.822513580322266, + "step": 9266 + }, + { + "epoch": 0.8466879853814527, + "grad_norm": 3.828125, + "kl": 0.44768333435058594, + "learning_rate": 5.715382820814885e-07, + "logits/chosen": 622776362.6666666, + "logits/rejected": 345038592.0, + "logps/chosen": -430.767333984375, + "logps/rejected": -279.1832580566406, + "loss": 0.0292, + "rewards/chosen": 3.497677485148112, + "rewards/margins": 11.723180452982584, + "rewards/rejected": -8.225502967834473, + "step": 9267 + }, + { + "epoch": 0.8467793513019644, + "grad_norm": 4.53125, + "kl": 0.0, + "learning_rate": 5.708709340770507e-07, + "logits/chosen": 744631296.0, + "logits/rejected": 445289696.0, + "logps/chosen": -218.00732421875, + "logps/rejected": -365.1142578125, + "loss": 0.0301, + "rewards/chosen": 3.1770544052124023, + "rewards/margins": 11.676335334777832, + "rewards/rejected": -8.49928092956543, + "step": 9268 + }, + { + "epoch": 0.846870717222476, + "grad_norm": 10.8125, + "kl": 0.0, + "learning_rate": 5.702039523205499e-07, + "logits/chosen": 389272256.0, + "logits/rejected": 658661961.1428572, + "logps/chosen": -257.0871276855469, + "logps/rejected": -459.3583286830357, + "loss": 0.0367, + "rewards/chosen": 7.248312473297119, + "rewards/margins": 16.549629960741314, + "rewards/rejected": -9.301317487444196, + "step": 9269 + }, + { + "epoch": 0.8469620831429877, + "grad_norm": 0.796875, + "kl": 0.0, + "learning_rate": 5.695373368671392e-07, + "logits/chosen": 571575040.0, + "logits/rejected": 667970602.6666666, + "logps/chosen": -271.23834228515625, + "logps/rejected": -364.2976888020833, + "loss": 0.0026, + "rewards/chosen": 4.871437072753906, + "rewards/margins": 13.575652440388998, + "rewards/rejected": -8.704215367635092, + "step": 9270 + }, + { + "epoch": 0.8470534490634993, + "grad_norm": 6.125, + "kl": 0.0, + "learning_rate": 5.688710877719417e-07, + "logits/chosen": 463629604.5714286, + "logits/rejected": 204000960.0, + "logps/chosen": -350.75816127232144, + "logps/rejected": -216.1331329345703, + "loss": 0.0178, + "rewards/chosen": 4.451807294573102, + "rewards/margins": 12.61772278376988, + "rewards/rejected": -8.165915489196777, + "step": 9271 + }, + { + "epoch": 0.847144814984011, + "grad_norm": 4.09375, + "kl": 0.0, + "learning_rate": 5.682052050900482e-07, + "logits/chosen": 540285337.6, + "logits/rejected": 740360960.0, + "logps/chosen": -396.098974609375, + "logps/rejected": -737.10400390625, + "loss": 0.0236, + "rewards/chosen": 3.8444450378417967, + "rewards/margins": 14.140424219767251, + "rewards/rejected": -10.295979181925455, + "step": 9272 + }, + { + "epoch": 0.8472361809045226, + "grad_norm": 4.03125, + "kl": 0.0, + "learning_rate": 5.675396888765255e-07, + "logits/chosen": 657751722.6666666, + "logits/rejected": 615547904.0, + "logps/chosen": -419.423583984375, + "logps/rejected": -363.2853088378906, + "loss": 0.029, + "rewards/chosen": 3.347883860270182, + "rewards/margins": 11.467913309733072, + "rewards/rejected": -8.12002944946289, + "step": 9273 + }, + { + "epoch": 0.8473275468250343, + "grad_norm": 67.5, + "kl": 0.0, + "learning_rate": 5.668745391864017e-07, + "logits/chosen": 473319424.0, + "logits/rejected": 1180556629.3333333, + "logps/chosen": -394.359375, + "logps/rejected": -571.0583902994791, + "loss": 0.1001, + "rewards/chosen": 3.5507366180419924, + "rewards/margins": 10.606562169392904, + "rewards/rejected": -7.055825551350911, + "step": 9274 + }, + { + "epoch": 0.8474189127455459, + "grad_norm": 95.0, + "kl": 0.0, + "learning_rate": 5.662097560746788e-07, + "logits/chosen": 580454707.2, + "logits/rejected": 411940778.6666667, + "logps/chosen": -303.8603271484375, + "logps/rejected": -445.0972493489583, + "loss": 0.075, + "rewards/chosen": 3.324443817138672, + "rewards/margins": 13.697203318277994, + "rewards/rejected": -10.372759501139322, + "step": 9275 + }, + { + "epoch": 0.8475102786660575, + "grad_norm": 1.609375, + "kl": 0.0, + "learning_rate": 5.655453395963301e-07, + "logits/chosen": 790205440.0, + "logits/rejected": 802727936.0, + "logps/chosen": -337.89892578125, + "logps/rejected": -716.4524536132812, + "loss": 0.0105, + "rewards/chosen": 4.1165876388549805, + "rewards/margins": 16.838967323303223, + "rewards/rejected": -12.722379684448242, + "step": 9276 + }, + { + "epoch": 0.8476016445865692, + "grad_norm": 2.703125, + "kl": 0.0, + "learning_rate": 5.648812898062955e-07, + "logits/chosen": 572342186.6666666, + "logits/rejected": 552905728.0, + "logps/chosen": -300.2174479166667, + "logps/rejected": -903.2695922851562, + "loss": 0.0224, + "rewards/chosen": 4.121899922688802, + "rewards/margins": 18.40734036763509, + "rewards/rejected": -14.285440444946289, + "step": 9277 + }, + { + "epoch": 0.8476930105070809, + "grad_norm": 15.6875, + "kl": 0.0, + "learning_rate": 5.642176067594857e-07, + "logits/chosen": 555330560.0, + "logits/rejected": 540231936.0, + "logps/chosen": -416.1722819010417, + "logps/rejected": -385.2638854980469, + "loss": 0.0391, + "rewards/chosen": 3.463052749633789, + "rewards/margins": 9.787224292755127, + "rewards/rejected": -6.324171543121338, + "step": 9278 + }, + { + "epoch": 0.8477843764275925, + "grad_norm": 0.486328125, + "kl": 0.0, + "learning_rate": 5.635542905107816e-07, + "logits/chosen": 346216768.0, + "logits/rejected": 451065856.0, + "logps/chosen": -314.1870422363281, + "logps/rejected": -550.0220540364584, + "loss": 0.0026, + "rewards/chosen": 4.91010856628418, + "rewards/margins": 14.063467025756836, + "rewards/rejected": -9.153358459472656, + "step": 9279 + }, + { + "epoch": 0.8478757423481041, + "grad_norm": 1.671875, + "kl": 0.0, + "learning_rate": 5.628913411150322e-07, + "logits/chosen": 340523584.0, + "logits/rejected": 409621632.0, + "logps/chosen": -342.12017822265625, + "logps/rejected": -573.392333984375, + "loss": 0.0111, + "rewards/chosen": 4.076152324676514, + "rewards/margins": 14.425872325897217, + "rewards/rejected": -10.349720001220703, + "step": 9280 + }, + { + "epoch": 0.8479671082686158, + "grad_norm": 15.0, + "kl": 0.0, + "learning_rate": 5.622287586270591e-07, + "logits/chosen": 508351808.0, + "logits/rejected": 625860480.0, + "logps/chosen": -282.0155029296875, + "logps/rejected": -330.94219970703125, + "loss": 0.0194, + "rewards/chosen": 4.05858850479126, + "rewards/margins": 12.510952472686768, + "rewards/rejected": -8.452363967895508, + "step": 9281 + }, + { + "epoch": 0.8480584741891275, + "grad_norm": 2.9375, + "kl": 0.0, + "learning_rate": 5.61566543101651e-07, + "logits/chosen": 646187328.0, + "logits/rejected": 704489685.3333334, + "logps/chosen": -411.43328857421875, + "logps/rejected": -666.1236572265625, + "loss": 0.0149, + "rewards/chosen": 2.8213486671447754, + "rewards/margins": 12.37446641921997, + "rewards/rejected": -9.553117752075195, + "step": 9282 + }, + { + "epoch": 0.8481498401096391, + "grad_norm": 1.0234375, + "kl": 0.0, + "learning_rate": 5.609046945935665e-07, + "logits/chosen": 756314112.0, + "logits/rejected": 484603861.3333333, + "logps/chosen": -220.61257934570312, + "logps/rejected": -402.958740234375, + "loss": 0.0038, + "rewards/chosen": 5.28706169128418, + "rewards/margins": 13.768078486124674, + "rewards/rejected": -8.481016794840494, + "step": 9283 + }, + { + "epoch": 0.8482412060301507, + "grad_norm": 2.125, + "kl": 0.0, + "learning_rate": 5.602432131575336e-07, + "logits/chosen": 437409587.2, + "logits/rejected": 292081408.0, + "logps/chosen": -277.848046875, + "logps/rejected": -397.0690104166667, + "loss": 0.0116, + "rewards/chosen": 4.555786895751953, + "rewards/margins": 13.155192184448243, + "rewards/rejected": -8.599405288696289, + "step": 9284 + }, + { + "epoch": 0.8483325719506624, + "grad_norm": 1.203125, + "kl": 0.0, + "learning_rate": 5.595820988482531e-07, + "logits/chosen": 505000832.0, + "logits/rejected": 334048128.0, + "logps/chosen": -298.28515625, + "logps/rejected": -485.5768737792969, + "loss": 0.0068, + "rewards/chosen": 4.647530555725098, + "rewards/margins": 13.627543449401855, + "rewards/rejected": -8.980012893676758, + "step": 9285 + }, + { + "epoch": 0.8484239378711741, + "grad_norm": 3.328125, + "kl": 0.0, + "learning_rate": 5.589213517203912e-07, + "logits/chosen": 1148997120.0, + "logits/rejected": 436641664.0, + "logps/chosen": -566.65283203125, + "logps/rejected": -269.29791259765625, + "loss": 0.0195, + "rewards/chosen": 3.372401714324951, + "rewards/margins": 10.521710872650146, + "rewards/rejected": -7.149309158325195, + "step": 9286 + }, + { + "epoch": 0.8485153037916857, + "grad_norm": 9.25, + "kl": 0.0, + "learning_rate": 5.582609718285864e-07, + "logits/chosen": 718137344.0, + "logits/rejected": 413334502.4, + "logps/chosen": -394.1920166015625, + "logps/rejected": -402.9595703125, + "loss": 0.0292, + "rewards/chosen": 2.9939346313476562, + "rewards/margins": 11.956114959716796, + "rewards/rejected": -8.96218032836914, + "step": 9287 + }, + { + "epoch": 0.8486066697121973, + "grad_norm": 3.125, + "kl": 0.0, + "learning_rate": 5.57600959227445e-07, + "logits/chosen": 235921024.0, + "logits/rejected": 584660531.2, + "logps/chosen": -216.74688720703125, + "logps/rejected": -382.8303466796875, + "loss": 0.0131, + "rewards/chosen": 3.998110771179199, + "rewards/margins": 11.867692375183106, + "rewards/rejected": -7.869581604003907, + "step": 9288 + }, + { + "epoch": 0.848698035632709, + "grad_norm": 2.3125, + "kl": 0.0, + "learning_rate": 5.569413139715441e-07, + "logits/chosen": 661579520.0, + "logits/rejected": 454975846.4, + "logps/chosen": -205.4425048828125, + "logps/rejected": -400.6718017578125, + "loss": 0.016, + "rewards/chosen": 3.2488012313842773, + "rewards/margins": 13.451364707946777, + "rewards/rejected": -10.2025634765625, + "step": 9289 + }, + { + "epoch": 0.8487894015532207, + "grad_norm": 14.6875, + "kl": 0.0, + "learning_rate": 5.562820361154315e-07, + "logits/chosen": 553250389.3333334, + "logits/rejected": 490331200.0, + "logps/chosen": -204.69478352864584, + "logps/rejected": -177.29689025878906, + "loss": 0.1213, + "rewards/chosen": 3.2301511764526367, + "rewards/margins": 11.105918884277344, + "rewards/rejected": -7.875767707824707, + "step": 9290 + }, + { + "epoch": 0.8488807674737323, + "grad_norm": 1.5546875, + "kl": 0.0, + "learning_rate": 5.556231257136225e-07, + "logits/chosen": 511517888.0, + "logits/rejected": 467277994.6666667, + "logps/chosen": -190.6834716796875, + "logps/rejected": -494.8483072916667, + "loss": 0.0081, + "rewards/chosen": 3.4366555213928223, + "rewards/margins": 12.718271096547445, + "rewards/rejected": -9.281615575154623, + "step": 9291 + }, + { + "epoch": 0.8489721333942439, + "grad_norm": 18.25, + "kl": 0.0, + "learning_rate": 5.549645828206029e-07, + "logits/chosen": 404715733.3333333, + "logits/rejected": 618598809.6, + "logps/chosen": -346.3336181640625, + "logps/rejected": -623.38916015625, + "loss": 0.0319, + "rewards/chosen": 4.493293126424153, + "rewards/margins": 12.964737828572591, + "rewards/rejected": -8.471444702148437, + "step": 9292 + }, + { + "epoch": 0.8490634993147556, + "grad_norm": 3.3125, + "kl": 0.0, + "learning_rate": 5.543064074908272e-07, + "logits/chosen": 695643072.0, + "logits/rejected": 348822848.0, + "logps/chosen": -376.03167724609375, + "logps/rejected": -398.17950439453125, + "loss": 0.0206, + "rewards/chosen": 3.9695663452148438, + "rewards/margins": 14.17175006866455, + "rewards/rejected": -10.202183723449707, + "step": 9293 + }, + { + "epoch": 0.8491548652352673, + "grad_norm": 0.92578125, + "kl": 0.0, + "learning_rate": 5.536485997787228e-07, + "logits/chosen": 1029712000.0, + "logits/rejected": 605542400.0, + "logps/chosen": -465.248046875, + "logps/rejected": -465.0793762207031, + "loss": 0.0056, + "rewards/chosen": 4.608072280883789, + "rewards/margins": 14.617371559143066, + "rewards/rejected": -10.009299278259277, + "step": 9294 + }, + { + "epoch": 0.8492462311557789, + "grad_norm": 2.03125, + "kl": 0.0, + "learning_rate": 5.529911597386822e-07, + "logits/chosen": 615785600.0, + "logits/rejected": 770984806.4, + "logps/chosen": -375.6841634114583, + "logps/rejected": -493.801171875, + "loss": 0.0125, + "rewards/chosen": 3.405920664469401, + "rewards/margins": 14.86535441080729, + "rewards/rejected": -11.45943374633789, + "step": 9295 + }, + { + "epoch": 0.8493375970762905, + "grad_norm": 2.1875, + "kl": 0.0, + "learning_rate": 5.523340874250704e-07, + "logits/chosen": 525896160.0, + "logits/rejected": 669419477.3333334, + "logps/chosen": -242.9844970703125, + "logps/rejected": -487.3817545572917, + "loss": 0.008, + "rewards/chosen": 4.526022434234619, + "rewards/margins": 14.502312819163004, + "rewards/rejected": -9.976290384928385, + "step": 9296 + }, + { + "epoch": 0.8494289629968022, + "grad_norm": 2.9375, + "kl": 0.0, + "learning_rate": 5.516773828922201e-07, + "logits/chosen": 490646826.6666667, + "logits/rejected": 747840102.4, + "logps/chosen": -155.91749064127603, + "logps/rejected": -447.6955078125, + "loss": 0.0152, + "rewards/chosen": 3.5812721252441406, + "rewards/margins": 13.459705352783203, + "rewards/rejected": -9.878433227539062, + "step": 9297 + }, + { + "epoch": 0.8495203289173139, + "grad_norm": 10.8125, + "kl": 0.0, + "learning_rate": 5.510210461944365e-07, + "logits/chosen": 572313292.8, + "logits/rejected": 380365482.6666667, + "logps/chosen": -338.55810546875, + "logps/rejected": -284.0415852864583, + "loss": 0.0242, + "rewards/chosen": 3.6222469329833986, + "rewards/margins": 11.447437795003255, + "rewards/rejected": -7.8251908620198565, + "step": 9298 + }, + { + "epoch": 0.8496116948378255, + "grad_norm": 1.40625, + "kl": 0.0, + "learning_rate": 5.503650773859914e-07, + "logits/chosen": 515371200.0, + "logits/rejected": 421863904.0, + "logps/chosen": -241.50790405273438, + "logps/rejected": -415.5384521484375, + "loss": 0.0109, + "rewards/chosen": 3.8613457679748535, + "rewards/margins": 12.745913982391357, + "rewards/rejected": -8.884568214416504, + "step": 9299 + }, + { + "epoch": 0.8497030607583371, + "grad_norm": 3.21875, + "kl": 0.0, + "learning_rate": 5.497094765211292e-07, + "logits/chosen": 920572672.0, + "logits/rejected": 877000064.0, + "logps/chosen": -573.53173828125, + "logps/rejected": -426.82269287109375, + "loss": 0.0145, + "rewards/chosen": 3.7581682205200195, + "rewards/margins": 12.451350212097168, + "rewards/rejected": -8.693181991577148, + "step": 9300 + }, + { + "epoch": 0.8497944266788487, + "grad_norm": 4.25, + "kl": 0.0, + "learning_rate": 5.49054243654058e-07, + "logits/chosen": 380445354.6666667, + "logits/rejected": 375787904.0, + "logps/chosen": -329.2139892578125, + "logps/rejected": -754.6011962890625, + "loss": 0.0309, + "rewards/chosen": 3.7747824986775718, + "rewards/margins": 18.89065678914388, + "rewards/rejected": -15.115874290466309, + "step": 9301 + }, + { + "epoch": 0.8498857925993605, + "grad_norm": 75.5, + "kl": 0.0, + "learning_rate": 5.483993788389636e-07, + "logits/chosen": 620330837.3333334, + "logits/rejected": 655433472.0, + "logps/chosen": -397.5656331380208, + "logps/rejected": -457.6234436035156, + "loss": 0.0681, + "rewards/chosen": 3.4273478190104165, + "rewards/margins": 10.364827315012613, + "rewards/rejected": -6.937479496002197, + "step": 9302 + }, + { + "epoch": 0.8499771585198721, + "grad_norm": 4.28125, + "kl": 0.0, + "learning_rate": 5.477448821299947e-07, + "logits/chosen": 1019787468.8, + "logits/rejected": 351196586.6666667, + "logps/chosen": -554.569140625, + "logps/rejected": -436.3348795572917, + "loss": 0.0198, + "rewards/chosen": 3.8322113037109373, + "rewards/margins": 13.57004623413086, + "rewards/rejected": -9.737834930419922, + "step": 9303 + }, + { + "epoch": 0.8500685244403837, + "grad_norm": 1.9296875, + "kl": 0.0, + "learning_rate": 5.470907535812736e-07, + "logits/chosen": 489310310.4, + "logits/rejected": 496777301.3333333, + "logps/chosen": -296.7869140625, + "logps/rejected": -463.6275634765625, + "loss": 0.0133, + "rewards/chosen": 3.9834110260009767, + "rewards/margins": 14.798686854044597, + "rewards/rejected": -10.81527582804362, + "step": 9304 + }, + { + "epoch": 0.8501598903608953, + "grad_norm": 4.15625, + "kl": 0.0, + "learning_rate": 5.464369932468888e-07, + "logits/chosen": 439419187.2, + "logits/rejected": 466653184.0, + "logps/chosen": -367.31103515625, + "logps/rejected": -356.790771484375, + "loss": 0.0319, + "rewards/chosen": 3.014605712890625, + "rewards/margins": 11.526744715372722, + "rewards/rejected": -8.512139002482096, + "step": 9305 + }, + { + "epoch": 0.8502512562814071, + "grad_norm": 1.8046875, + "kl": 0.0, + "learning_rate": 5.457836011809026e-07, + "logits/chosen": 685007786.6666666, + "logits/rejected": 442043187.2, + "logps/chosen": -603.4260660807291, + "logps/rejected": -470.054296875, + "loss": 0.0086, + "rewards/chosen": 3.880526860555013, + "rewards/margins": 12.774482854207358, + "rewards/rejected": -8.893955993652344, + "step": 9306 + }, + { + "epoch": 0.8503426222019187, + "grad_norm": 4.40625, + "kl": 0.0, + "learning_rate": 5.451305774373434e-07, + "logits/chosen": 444923538.28571427, + "logits/rejected": 551214080.0, + "logps/chosen": -318.37974330357144, + "logps/rejected": -731.3701171875, + "loss": 0.0293, + "rewards/chosen": 4.065952845982143, + "rewards/margins": 14.048039027622767, + "rewards/rejected": -9.982086181640625, + "step": 9307 + }, + { + "epoch": 0.8504339881224303, + "grad_norm": 14.75, + "kl": 0.0, + "learning_rate": 5.444779220702096e-07, + "logits/rejected": 453712320.0, + "logps/rejected": -426.8368835449219, + "loss": 0.0076, + "rewards/rejected": -7.848681449890137, + "step": 9308 + }, + { + "epoch": 0.8505253540429419, + "grad_norm": 2.578125, + "kl": 0.0, + "learning_rate": 5.438256351334709e-07, + "logits/chosen": 500082432.0, + "logits/rejected": 1679456512.0, + "logps/chosen": -272.6776529947917, + "logps/rejected": -401.3249206542969, + "loss": 0.0162, + "rewards/chosen": 4.397263526916504, + "rewards/margins": 11.335728645324707, + "rewards/rejected": -6.938465118408203, + "step": 9309 + }, + { + "epoch": 0.8506167199634537, + "grad_norm": 3.15625, + "kl": 0.0, + "learning_rate": 5.431737166810635e-07, + "logits/chosen": 1072018560.0, + "logits/rejected": 595606912.0, + "logps/chosen": -210.158203125, + "logps/rejected": -418.280029296875, + "loss": 0.0178, + "rewards/chosen": 3.7331342697143555, + "rewards/margins": 12.922537803649902, + "rewards/rejected": -9.189403533935547, + "step": 9310 + }, + { + "epoch": 0.8507080858839653, + "grad_norm": 1.8125, + "kl": 0.0, + "learning_rate": 5.425221667668978e-07, + "logits/chosen": 536591264.0, + "logits/rejected": 594405290.6666666, + "logps/chosen": -104.4517822265625, + "logps/rejected": -578.4900309244791, + "loss": 0.0081, + "rewards/chosen": 3.7654147148132324, + "rewards/margins": 15.03071928024292, + "rewards/rejected": -11.265304565429688, + "step": 9311 + }, + { + "epoch": 0.8507994518044769, + "grad_norm": 2.59375, + "kl": 0.0, + "learning_rate": 5.418709854448489e-07, + "logits/chosen": 301473996.8, + "logits/rejected": 342342933.3333333, + "logps/chosen": -317.395703125, + "logps/rejected": -379.9194742838542, + "loss": 0.0146, + "rewards/chosen": 4.665088653564453, + "rewards/margins": 13.497393925984701, + "rewards/rejected": -8.832305272420248, + "step": 9312 + }, + { + "epoch": 0.8508908177249885, + "grad_norm": 0.45703125, + "kl": 0.0, + "learning_rate": 5.412201727687644e-07, + "logits/chosen": 681232448.0, + "logits/rejected": 899398729.1428572, + "logps/chosen": -282.90533447265625, + "logps/rejected": -546.3602469308036, + "loss": 0.0018, + "rewards/chosen": 5.364139080047607, + "rewards/margins": 15.173868519919258, + "rewards/rejected": -9.809729439871651, + "step": 9313 + }, + { + "epoch": 0.8509821836455003, + "grad_norm": 6.75, + "kl": 0.0, + "learning_rate": 5.405697287924589e-07, + "logits/chosen": 605599104.0, + "logits/rejected": 542692010.6666666, + "logps/chosen": -232.8573455810547, + "logps/rejected": -579.6907958984375, + "loss": 0.0094, + "rewards/chosen": 4.328226566314697, + "rewards/margins": 14.117599964141846, + "rewards/rejected": -9.789373397827148, + "step": 9314 + }, + { + "epoch": 0.8510735495660119, + "grad_norm": 1.75, + "kl": 0.0, + "learning_rate": 5.399196535697204e-07, + "logits/chosen": 504083904.0, + "logits/rejected": 476522368.0, + "logps/chosen": -257.1243591308594, + "logps/rejected": -382.8739929199219, + "loss": 0.0113, + "rewards/chosen": 4.14868688583374, + "rewards/margins": 12.433833599090576, + "rewards/rejected": -8.285146713256836, + "step": 9315 + }, + { + "epoch": 0.8511649154865235, + "grad_norm": 6.40625, + "kl": 0.0, + "learning_rate": 5.392699471543028e-07, + "logits/chosen": 393246368.0, + "logits/rejected": 576411264.0, + "logps/chosen": -212.64584350585938, + "logps/rejected": -469.4604899088542, + "loss": 0.0132, + "rewards/chosen": 3.0453457832336426, + "rewards/margins": 12.32245365778605, + "rewards/rejected": -9.277107874552408, + "step": 9316 + }, + { + "epoch": 0.8512562814070351, + "grad_norm": 4.03125, + "kl": 0.0, + "learning_rate": 5.386206095999313e-07, + "logits/chosen": 522981056.0, + "logits/rejected": 1096713130.6666667, + "logps/chosen": -348.4444885253906, + "logps/rejected": -653.092041015625, + "loss": 0.0222, + "rewards/chosen": 2.3518662452697754, + "rewards/margins": 12.896083037058512, + "rewards/rejected": -10.544216791788736, + "step": 9317 + }, + { + "epoch": 0.8513476473275469, + "grad_norm": 56.75, + "kl": 0.0, + "learning_rate": 5.379716409602986e-07, + "logits/chosen": 504621397.3333333, + "logits/rejected": 922582220.8, + "logps/chosen": -201.1087646484375, + "logps/rejected": -490.86748046875, + "loss": 0.0759, + "rewards/chosen": 3.0973523457845054, + "rewards/margins": 11.025940450032552, + "rewards/rejected": -7.928588104248047, + "step": 9318 + }, + { + "epoch": 0.8514390132480585, + "grad_norm": 2.40625, + "kl": 0.0, + "learning_rate": 5.373230412890706e-07, + "logits/chosen": 637033280.0, + "logits/rejected": 307006112.0, + "logps/chosen": -576.435546875, + "logps/rejected": -426.25689697265625, + "loss": 0.0138, + "rewards/chosen": 3.6917953491210938, + "rewards/margins": 12.26580810546875, + "rewards/rejected": -8.574012756347656, + "step": 9319 + }, + { + "epoch": 0.8515303791685701, + "grad_norm": 0.9609375, + "kl": 0.0, + "learning_rate": 5.366748106398795e-07, + "logits/chosen": 282366944.0, + "logits/rejected": 525930459.4285714, + "logps/chosen": -80.02479553222656, + "logps/rejected": -476.48193359375, + "loss": 0.0063, + "rewards/chosen": 2.9792556762695312, + "rewards/margins": 12.487654549734932, + "rewards/rejected": -9.508398873465401, + "step": 9320 + }, + { + "epoch": 0.8516217450890817, + "grad_norm": 3.875, + "kl": 0.0, + "learning_rate": 5.360269490663278e-07, + "logits/chosen": 507915136.0, + "logits/rejected": 337824204.8, + "logps/chosen": -281.49169921875, + "logps/rejected": -352.3101806640625, + "loss": 0.0125, + "rewards/chosen": 4.372620264689128, + "rewards/margins": 12.109400049845377, + "rewards/rejected": -7.73677978515625, + "step": 9321 + }, + { + "epoch": 0.8517131110095935, + "grad_norm": 3.28125, + "kl": 0.0, + "learning_rate": 5.353794566219877e-07, + "logits/chosen": 963003904.0, + "logits/rejected": 1019427737.6, + "logps/chosen": -309.9015706380208, + "logps/rejected": -482.064404296875, + "loss": 0.0176, + "rewards/chosen": 3.2986625035603843, + "rewards/margins": 11.98319772084554, + "rewards/rejected": -8.684535217285156, + "step": 9322 + }, + { + "epoch": 0.8518044769301051, + "grad_norm": 0.77734375, + "kl": 0.0, + "learning_rate": 5.347323333604005e-07, + "logits/chosen": 308949077.3333333, + "logits/rejected": 524525260.8, + "logps/chosen": -267.72515869140625, + "logps/rejected": -493.853125, + "loss": 0.0033, + "rewards/chosen": 5.292119026184082, + "rewards/margins": 13.878837776184081, + "rewards/rejected": -8.58671875, + "step": 9323 + }, + { + "epoch": 0.8518958428506167, + "grad_norm": 0.62109375, + "kl": 0.0, + "learning_rate": 5.34085579335078e-07, + "logits/chosen": 536458848.0, + "logits/rejected": 423182976.0, + "logps/chosen": -111.45858764648438, + "logps/rejected": -435.49609375, + "loss": 0.0048, + "rewards/chosen": 3.946859121322632, + "rewards/margins": 14.706886688868204, + "rewards/rejected": -10.760027567545572, + "step": 9324 + }, + { + "epoch": 0.8519872087711283, + "grad_norm": 79.5, + "kl": 0.0, + "learning_rate": 5.33439194599501e-07, + "logits/chosen": 650752853.3333334, + "logits/rejected": 1168250163.2, + "logps/chosen": -380.9159342447917, + "logps/rejected": -691.8990234375, + "loss": 0.0893, + "rewards/chosen": 2.2123616536458335, + "rewards/margins": 9.664566548665364, + "rewards/rejected": -7.452204895019531, + "step": 9325 + }, + { + "epoch": 0.8520785746916401, + "grad_norm": 3.5625, + "kl": 0.0, + "learning_rate": 5.327931792071183e-07, + "logits/chosen": 522607411.2, + "logits/rejected": 378287274.6666667, + "logps/chosen": -397.763427734375, + "logps/rejected": -453.5170084635417, + "loss": 0.0207, + "rewards/chosen": 3.725817108154297, + "rewards/margins": 14.803550465901694, + "rewards/rejected": -11.077733357747396, + "step": 9326 + }, + { + "epoch": 0.8521699406121517, + "grad_norm": 2.328125, + "kl": 0.0, + "learning_rate": 5.321475332113507e-07, + "logits/chosen": 506036128.0, + "logits/rejected": 301955072.0, + "logps/chosen": -343.2782287597656, + "logps/rejected": -362.41204833984375, + "loss": 0.0129, + "rewards/chosen": 4.025176048278809, + "rewards/margins": 15.1445894241333, + "rewards/rejected": -11.119413375854492, + "step": 9327 + }, + { + "epoch": 0.8522613065326633, + "grad_norm": 2.703125, + "kl": 0.0, + "learning_rate": 5.315022566655865e-07, + "logits/chosen": 734970752.0, + "logits/rejected": 441554848.0, + "logps/chosen": -562.3543701171875, + "logps/rejected": -371.86968994140625, + "loss": 0.0157, + "rewards/chosen": 3.8839235305786133, + "rewards/margins": 11.929219245910645, + "rewards/rejected": -8.045295715332031, + "step": 9328 + }, + { + "epoch": 0.8523526724531749, + "grad_norm": 19.25, + "kl": 0.0, + "learning_rate": 5.308573496231839e-07, + "logits/chosen": 300497984.0, + "logits/rejected": 176166912.0, + "logps/chosen": -237.70965576171875, + "logps/rejected": -287.92327880859375, + "loss": 0.1326, + "rewards/chosen": 2.8035383224487305, + "rewards/margins": 10.279311180114746, + "rewards/rejected": -7.475772857666016, + "step": 9329 + }, + { + "epoch": 0.8524440383736867, + "grad_norm": 1.0859375, + "kl": 0.0, + "learning_rate": 5.302128121374711e-07, + "logits/chosen": 540913664.0, + "logits/rejected": 493173845.3333333, + "logps/chosen": -391.9461364746094, + "logps/rejected": -579.7349853515625, + "loss": 0.004, + "rewards/chosen": 4.719400882720947, + "rewards/margins": 14.202938556671143, + "rewards/rejected": -9.483537673950195, + "step": 9330 + }, + { + "epoch": 0.8525354042941983, + "grad_norm": 4.21875, + "kl": 0.0, + "learning_rate": 5.295686442617442e-07, + "logits/chosen": 325982400.0, + "logits/rejected": 558877184.0, + "logps/chosen": -151.8931427001953, + "logps/rejected": -566.5263671875, + "loss": 0.0911, + "rewards/chosen": 5.047010898590088, + "rewards/margins": 13.56062718800136, + "rewards/rejected": -8.513616289411273, + "step": 9331 + }, + { + "epoch": 0.8526267702147099, + "grad_norm": 2.21875, + "kl": 0.0, + "learning_rate": 5.289248460492719e-07, + "logits/chosen": 189010512.0, + "logits/rejected": 343869842.28571427, + "logps/chosen": -129.14688110351562, + "logps/rejected": -481.9666224888393, + "loss": 0.0036, + "rewards/chosen": 4.8197479248046875, + "rewards/margins": 14.552238464355469, + "rewards/rejected": -9.732490539550781, + "step": 9332 + }, + { + "epoch": 0.8527181361352215, + "grad_norm": 44.5, + "kl": 0.0, + "learning_rate": 5.282814175532897e-07, + "logits/chosen": 693445632.0, + "logits/rejected": 627101593.6, + "logps/chosen": -334.6363525390625, + "logps/rejected": -403.58544921875, + "loss": 0.0745, + "rewards/chosen": 3.303569475809733, + "rewards/margins": 11.452409807840983, + "rewards/rejected": -8.14884033203125, + "step": 9333 + }, + { + "epoch": 0.8528095020557332, + "grad_norm": 1.125, + "kl": 0.0, + "learning_rate": 5.276383588270024e-07, + "logits/chosen": 529878613.3333333, + "logits/rejected": 429196364.8, + "logps/chosen": -300.5253092447917, + "logps/rejected": -475.259375, + "loss": 0.0082, + "rewards/chosen": 4.372765858968099, + "rewards/margins": 12.878951009114584, + "rewards/rejected": -8.506185150146484, + "step": 9334 + }, + { + "epoch": 0.8529008679762449, + "grad_norm": 1.46875, + "kl": 0.0, + "learning_rate": 5.269956699235846e-07, + "logits/chosen": 411650112.0, + "logits/rejected": 413293141.3333333, + "logps/chosen": -310.4338073730469, + "logps/rejected": -559.9659830729166, + "loss": 0.0053, + "rewards/chosen": 4.264305114746094, + "rewards/margins": 14.627532958984375, + "rewards/rejected": -10.363227844238281, + "step": 9335 + }, + { + "epoch": 0.8529922338967565, + "grad_norm": 25.375, + "kl": 0.0, + "learning_rate": 5.263533508961827e-07, + "logits/chosen": 708635072.0, + "logits/rejected": 580508288.0, + "logps/chosen": -389.9710693359375, + "logps/rejected": -634.5225830078125, + "loss": 0.0196, + "rewards/chosen": 3.730473279953003, + "rewards/margins": 12.281588315963745, + "rewards/rejected": -8.551115036010742, + "step": 9336 + }, + { + "epoch": 0.8530835998172681, + "grad_norm": 1.296875, + "kl": 0.0, + "learning_rate": 5.257114017979098e-07, + "logits/chosen": 625557708.8, + "logits/rejected": 822814464.0, + "logps/chosen": -383.3206298828125, + "logps/rejected": -393.4791666666667, + "loss": 0.0088, + "rewards/chosen": 4.278375244140625, + "rewards/margins": 13.56855354309082, + "rewards/rejected": -9.290178298950195, + "step": 9337 + }, + { + "epoch": 0.8531749657377798, + "grad_norm": 2.484375, + "kl": 0.0, + "learning_rate": 5.250698226818479e-07, + "logits/chosen": 616524458.6666666, + "logits/rejected": 415291187.2, + "logps/chosen": -501.8250325520833, + "logps/rejected": -422.43974609375, + "loss": 0.0114, + "rewards/chosen": 3.9082276026407876, + "rewards/margins": 12.40700003306071, + "rewards/rejected": -8.498772430419923, + "step": 9338 + }, + { + "epoch": 0.8532663316582915, + "grad_norm": 5.375, + "kl": 0.0, + "learning_rate": 5.244286136010502e-07, + "logits/chosen": 599545941.3333334, + "logits/rejected": 194504960.0, + "logps/chosen": -440.300048828125, + "logps/rejected": -282.24200439453125, + "loss": 0.033, + "rewards/chosen": 3.8896299997965493, + "rewards/margins": 12.220442454020182, + "rewards/rejected": -8.330812454223633, + "step": 9339 + }, + { + "epoch": 0.8533576975788031, + "grad_norm": 1.4921875, + "kl": 0.0, + "learning_rate": 5.237877746085401e-07, + "logits/chosen": 481334304.0, + "logits/rejected": 427971285.3333333, + "logps/chosen": -471.91070556640625, + "logps/rejected": -573.9295247395834, + "loss": 0.0067, + "rewards/chosen": 3.5988478660583496, + "rewards/margins": 14.399280707041422, + "rewards/rejected": -10.800432840983072, + "step": 9340 + }, + { + "epoch": 0.8534490634993147, + "grad_norm": 1.765625, + "kl": 0.0, + "learning_rate": 5.231473057573078e-07, + "logits/chosen": 384594528.0, + "logits/rejected": 491877056.0, + "logps/chosen": -308.67431640625, + "logps/rejected": -431.070068359375, + "loss": 0.009, + "rewards/chosen": 4.242178440093994, + "rewards/margins": 12.936745166778564, + "rewards/rejected": -8.69456672668457, + "step": 9341 + }, + { + "epoch": 0.8535404294198264, + "grad_norm": 3.0625, + "kl": 0.0, + "learning_rate": 5.22507207100314e-07, + "logits/chosen": 539153664.0, + "logits/rejected": 549392384.0, + "logps/chosen": -301.4170837402344, + "logps/rejected": -340.1231689453125, + "loss": 0.029, + "rewards/chosen": 3.6951661109924316, + "rewards/margins": 11.699581623077393, + "rewards/rejected": -8.004415512084961, + "step": 9342 + }, + { + "epoch": 0.8536317953403381, + "grad_norm": 1.90625, + "kl": 0.0, + "learning_rate": 5.218674786904898e-07, + "logits/chosen": 454022400.0, + "logits/rejected": 403702229.3333333, + "logps/chosen": -179.44720458984375, + "logps/rejected": -251.1082763671875, + "loss": 0.0132, + "rewards/chosen": 2.957392930984497, + "rewards/margins": 11.47757395108541, + "rewards/rejected": -8.520181020100912, + "step": 9343 + }, + { + "epoch": 0.8537231612608497, + "grad_norm": 2.578125, + "kl": 0.0, + "learning_rate": 5.212281205807329e-07, + "logits/chosen": 435188096.0, + "logits/rejected": 457932896.0, + "logps/chosen": -381.51519775390625, + "logps/rejected": -450.5072021484375, + "loss": 0.016, + "rewards/chosen": 3.9802372455596924, + "rewards/margins": 13.283808946609497, + "rewards/rejected": -9.303571701049805, + "step": 9344 + }, + { + "epoch": 0.8538145271813613, + "grad_norm": 0.94921875, + "kl": 0.0, + "learning_rate": 5.205891328239149e-07, + "logits/chosen": 537974784.0, + "logits/rejected": 242716251.42857143, + "logps/chosen": -438.091552734375, + "logps/rejected": -423.77504185267856, + "loss": 0.0031, + "rewards/chosen": 3.7627625465393066, + "rewards/margins": 14.236077104296003, + "rewards/rejected": -10.473314557756696, + "step": 9345 + }, + { + "epoch": 0.853905893101873, + "grad_norm": 2.78125, + "kl": 0.0, + "learning_rate": 5.199505154728729e-07, + "logits/chosen": 482552371.2, + "logits/rejected": 569303296.0, + "logps/chosen": -311.21640625, + "logps/rejected": -385.0372721354167, + "loss": 0.0196, + "rewards/chosen": 3.741416168212891, + "rewards/margins": 12.80633977254232, + "rewards/rejected": -9.064923604329428, + "step": 9346 + }, + { + "epoch": 0.8539972590223847, + "grad_norm": 1.359375, + "kl": 0.0, + "learning_rate": 5.193122685804147e-07, + "logits/chosen": 493834976.0, + "logits/rejected": 955324160.0, + "logps/chosen": -215.4405517578125, + "logps/rejected": -380.5017903645833, + "loss": 0.1019, + "rewards/chosen": 4.905729293823242, + "rewards/margins": 12.060958226521809, + "rewards/rejected": -7.155228932698567, + "step": 9347 + }, + { + "epoch": 0.8540886249428963, + "grad_norm": 3.140625, + "kl": 0.0, + "learning_rate": 5.186743921993165e-07, + "logits/chosen": 596691046.4, + "logits/rejected": 407467264.0, + "logps/chosen": -462.97216796875, + "logps/rejected": -305.9800618489583, + "loss": 0.0174, + "rewards/chosen": 3.853141021728516, + "rewards/margins": 10.532628885904948, + "rewards/rejected": -6.679487864176433, + "step": 9348 + }, + { + "epoch": 0.8541799908634079, + "grad_norm": 1.6171875, + "kl": 0.0, + "learning_rate": 5.180368863823271e-07, + "logits/chosen": 242852928.0, + "logits/rejected": 810734372.5714285, + "logps/chosen": -46.11701965332031, + "logps/rejected": -519.9555315290179, + "loss": 0.0082, + "rewards/chosen": 2.7015717029571533, + "rewards/margins": 12.222423928124565, + "rewards/rejected": -9.520852225167411, + "step": 9349 + }, + { + "epoch": 0.8542713567839196, + "grad_norm": 3.453125, + "kl": 0.0, + "learning_rate": 5.173997511821599e-07, + "logits/chosen": 493360448.0, + "logits/rejected": 338184469.3333333, + "logps/chosen": -376.38922119140625, + "logps/rejected": -370.5076090494792, + "loss": 0.1226, + "rewards/chosen": 0.3607501983642578, + "rewards/margins": 9.693105697631836, + "rewards/rejected": -9.332355499267578, + "step": 9350 + }, + { + "epoch": 0.8543627227044313, + "grad_norm": 2.328125, + "kl": 0.0, + "learning_rate": 5.167629866515017e-07, + "logits/chosen": 664320597.3333334, + "logits/rejected": 556841523.2, + "logps/chosen": -250.0418701171875, + "logps/rejected": -350.5178955078125, + "loss": 0.0161, + "rewards/chosen": 3.2672958374023438, + "rewards/margins": 11.390487670898438, + "rewards/rejected": -8.123191833496094, + "step": 9351 + }, + { + "epoch": 0.8544540886249429, + "grad_norm": 0.69140625, + "kl": 0.0, + "learning_rate": 5.161265928430065e-07, + "logits/chosen": 589142528.0, + "logits/rejected": 581415116.8, + "logps/chosen": -302.5010172526042, + "logps/rejected": -391.839111328125, + "loss": 0.0043, + "rewards/chosen": 4.632043838500977, + "rewards/margins": 12.633392715454102, + "rewards/rejected": -8.001348876953125, + "step": 9352 + }, + { + "epoch": 0.8545454545454545, + "grad_norm": 3.09375, + "kl": 0.0, + "learning_rate": 5.154905698092977e-07, + "logits/chosen": 282940608.0, + "logits/rejected": 200056064.0, + "logps/chosen": -304.56243896484375, + "logps/rejected": -417.4779459635417, + "loss": 0.0106, + "rewards/chosen": 3.9611001014709473, + "rewards/margins": 14.243532021840414, + "rewards/rejected": -10.282431920369467, + "step": 9353 + }, + { + "epoch": 0.8546368204659662, + "grad_norm": 4.53125, + "kl": 0.0, + "learning_rate": 5.148549176029687e-07, + "logits/chosen": 537768128.0, + "logits/rejected": 717222976.0, + "logps/chosen": -181.03744506835938, + "logps/rejected": -1160.738037109375, + "loss": 0.0228, + "rewards/chosen": 3.891486167907715, + "rewards/margins": 21.17941951751709, + "rewards/rejected": -17.287933349609375, + "step": 9354 + }, + { + "epoch": 0.8547281863864779, + "grad_norm": 4.03125, + "kl": 0.0, + "learning_rate": 5.142196362765828e-07, + "logits/chosen": 483053158.4, + "logits/rejected": 491209813.3333333, + "logps/chosen": -342.920751953125, + "logps/rejected": -478.79541015625, + "loss": 0.022, + "rewards/chosen": 3.80892333984375, + "rewards/margins": 13.633849080403646, + "rewards/rejected": -9.824925740559896, + "step": 9355 + }, + { + "epoch": 0.8548195523069895, + "grad_norm": 10.1875, + "kl": 0.0, + "learning_rate": 5.135847258826699e-07, + "logits/chosen": 1986982784.0, + "logits/rejected": 745118122.6666666, + "logps/chosen": -401.8343200683594, + "logps/rejected": -486.4101969401042, + "loss": 0.0255, + "rewards/chosen": 2.2733216285705566, + "rewards/margins": 10.321698983510336, + "rewards/rejected": -8.04837735493978, + "step": 9356 + }, + { + "epoch": 0.8549109182275011, + "grad_norm": 4.5625, + "kl": 0.0, + "learning_rate": 5.129501864737335e-07, + "logits/chosen": 329945312.0, + "logits/rejected": 456698240.0, + "logps/chosen": -291.2425537109375, + "logps/rejected": -577.01171875, + "loss": 0.0353, + "rewards/chosen": 3.245051622390747, + "rewards/margins": 14.178537607192993, + "rewards/rejected": -10.933485984802246, + "step": 9357 + }, + { + "epoch": 0.8550022841480128, + "grad_norm": 7.375, + "kl": 0.0, + "learning_rate": 5.123160181022435e-07, + "logits/chosen": 609202483.2, + "logits/rejected": 575902677.3333334, + "logps/chosen": -377.3127685546875, + "logps/rejected": -324.32806396484375, + "loss": 0.0356, + "rewards/chosen": 3.918218994140625, + "rewards/margins": 8.573561096191407, + "rewards/rejected": -4.655342102050781, + "step": 9358 + }, + { + "epoch": 0.8550936500685244, + "grad_norm": 8.875, + "kl": 0.0, + "learning_rate": 5.116822208206396e-07, + "logits/chosen": 792443776.0, + "logits/rejected": 454667424.0, + "logps/chosen": -365.28997802734375, + "logps/rejected": -416.3323974609375, + "loss": 0.0779, + "rewards/chosen": 4.071480751037598, + "rewards/margins": 11.465641021728516, + "rewards/rejected": -7.394160270690918, + "step": 9359 + }, + { + "epoch": 0.8551850159890361, + "grad_norm": 2.828125, + "kl": 0.0, + "learning_rate": 5.110487946813297e-07, + "logits/chosen": 571018137.6, + "logits/rejected": 328461738.6666667, + "logps/chosen": -401.3856201171875, + "logps/rejected": -431.5691731770833, + "loss": 0.0182, + "rewards/chosen": 3.8144229888916015, + "rewards/margins": 15.551639938354493, + "rewards/rejected": -11.73721694946289, + "step": 9360 + }, + { + "epoch": 0.8552763819095477, + "grad_norm": 36.0, + "kl": 0.0, + "learning_rate": 5.104157397366943e-07, + "logits/chosen": 395006634.6666667, + "logits/rejected": 387431884.8, + "logps/chosen": -206.50240071614584, + "logps/rejected": -486.795703125, + "loss": 0.0549, + "rewards/chosen": 2.5170933405558267, + "rewards/margins": 12.075360552469888, + "rewards/rejected": -9.558267211914062, + "step": 9361 + }, + { + "epoch": 0.8553677478300594, + "grad_norm": 5.03125, + "kl": 0.0, + "learning_rate": 5.097830560390804e-07, + "logits/chosen": 472691200.0, + "logits/rejected": 291593344.0, + "logps/chosen": -246.85626220703125, + "logps/rejected": -386.67431640625, + "loss": 0.116, + "rewards/chosen": 3.579646110534668, + "rewards/margins": 9.483246326446533, + "rewards/rejected": -5.903600215911865, + "step": 9362 + }, + { + "epoch": 0.855459113750571, + "grad_norm": 0.0206298828125, + "kl": 0.0, + "learning_rate": 5.09150743640805e-07, + "logits/rejected": 430480128.0, + "logps/rejected": -478.10284423828125, + "loss": 0.0001, + "rewards/rejected": -11.003694534301758, + "step": 9363 + }, + { + "epoch": 0.8555504796710827, + "grad_norm": 8.6875, + "kl": 0.0, + "learning_rate": 5.085188025941545e-07, + "logits/chosen": 339642816.0, + "logits/rejected": 414947584.0, + "logps/chosen": -273.2567138671875, + "logps/rejected": -527.7646484375, + "loss": 0.0196, + "rewards/chosen": 3.76971435546875, + "rewards/margins": 12.746153831481934, + "rewards/rejected": -8.976439476013184, + "step": 9364 + }, + { + "epoch": 0.8556418455915943, + "grad_norm": 6.25, + "kl": 0.0, + "learning_rate": 5.078872329513834e-07, + "logits/chosen": 757114470.4, + "logits/rejected": 782153045.3333334, + "logps/chosen": -501.39375, + "logps/rejected": -437.160400390625, + "loss": 0.0357, + "rewards/chosen": 3.0669445037841796, + "rewards/margins": 12.52861696879069, + "rewards/rejected": -9.46167246500651, + "step": 9365 + }, + { + "epoch": 0.855733211512106, + "grad_norm": 1.3671875, + "kl": 0.0, + "learning_rate": 5.072560347647188e-07, + "logits/chosen": 547149977.6, + "logits/rejected": 478937770.6666667, + "logps/chosen": -313.844580078125, + "logps/rejected": -449.888671875, + "loss": 0.0082, + "rewards/chosen": 4.714609527587891, + "rewards/margins": 13.31921869913737, + "rewards/rejected": -8.604609171549479, + "step": 9366 + }, + { + "epoch": 0.8558245774326176, + "grad_norm": 3.484375, + "kl": 0.0, + "learning_rate": 5.066252080863543e-07, + "logits/chosen": 834787840.0, + "logits/rejected": 698911590.4, + "logps/chosen": -233.45808919270834, + "logps/rejected": -606.271923828125, + "loss": 0.0156, + "rewards/chosen": 3.381544748942057, + "rewards/margins": 13.269818369547526, + "rewards/rejected": -9.888273620605469, + "step": 9367 + }, + { + "epoch": 0.8559159433531293, + "grad_norm": 3.96875, + "kl": 0.0, + "learning_rate": 5.059947529684523e-07, + "logits/chosen": 591925568.0, + "logits/rejected": 994027776.0, + "logps/chosen": -332.2615661621094, + "logps/rejected": -860.1689453125, + "loss": 0.023, + "rewards/chosen": 3.2417330741882324, + "rewards/margins": 14.72583532333374, + "rewards/rejected": -11.484102249145508, + "step": 9368 + }, + { + "epoch": 0.8560073092736409, + "grad_norm": 0.67578125, + "kl": 0.0, + "learning_rate": 5.053646694631459e-07, + "logits/chosen": 632995328.0, + "logits/rejected": 571208499.2, + "logps/chosen": -274.61989339192706, + "logps/rejected": -583.999072265625, + "loss": 0.004, + "rewards/chosen": 4.676644961039226, + "rewards/margins": 13.776340548197428, + "rewards/rejected": -9.099695587158203, + "step": 9369 + }, + { + "epoch": 0.8560986751941526, + "grad_norm": 1.078125, + "kl": 0.0, + "learning_rate": 5.047349576225386e-07, + "logits/chosen": 448317781.3333333, + "logits/rejected": 376215398.4, + "logps/chosen": -406.3767903645833, + "logps/rejected": -433.95478515625, + "loss": 0.005, + "rewards/chosen": 4.697732925415039, + "rewards/margins": 12.662948989868164, + "rewards/rejected": -7.965216064453125, + "step": 9370 + }, + { + "epoch": 0.8561900411146642, + "grad_norm": 4.71875, + "kl": 0.0, + "learning_rate": 5.041056174987008e-07, + "logits/chosen": 399084074.6666667, + "logits/rejected": 258955776.0, + "logps/chosen": -200.02473958333334, + "logps/rejected": -448.7216796875, + "loss": 0.0461, + "rewards/chosen": 3.155454635620117, + "rewards/margins": 12.245222091674805, + "rewards/rejected": -9.089767456054688, + "step": 9371 + }, + { + "epoch": 0.8562814070351759, + "grad_norm": 5.34375, + "kl": 0.0, + "learning_rate": 5.034766491436727e-07, + "logits/chosen": 592617685.3333334, + "logits/rejected": 327888896.0, + "logps/chosen": -275.43064371744794, + "logps/rejected": -249.0596466064453, + "loss": 0.0474, + "rewards/chosen": 2.9800949096679688, + "rewards/margins": 11.613000869750977, + "rewards/rejected": -8.632905960083008, + "step": 9372 + }, + { + "epoch": 0.8563727729556875, + "grad_norm": 2.53125, + "kl": 0.0, + "learning_rate": 5.028480526094648e-07, + "logits/chosen": 953310310.4, + "logits/rejected": 556849450.6666666, + "logps/chosen": -333.39013671875, + "logps/rejected": -670.8486735026041, + "loss": 0.021, + "rewards/chosen": 3.6994705200195312, + "rewards/margins": 16.101042429606117, + "rewards/rejected": -12.401571909586588, + "step": 9373 + }, + { + "epoch": 0.8564641388761992, + "grad_norm": 61.0, + "kl": 0.0, + "learning_rate": 5.02219827948055e-07, + "logits/chosen": 833136512.0, + "logits/rejected": 500095936.0, + "logps/chosen": -494.744873046875, + "logps/rejected": -581.1607666015625, + "loss": 0.0754, + "rewards/chosen": 2.815416097640991, + "rewards/margins": 13.14307951927185, + "rewards/rejected": -10.32766342163086, + "step": 9374 + }, + { + "epoch": 0.8565555047967108, + "grad_norm": 2.953125, + "kl": 0.0, + "learning_rate": 5.01591975211393e-07, + "logits/chosen": 401567872.0, + "logits/rejected": 380061952.0, + "logps/chosen": -296.2744140625, + "logps/rejected": -411.9989013671875, + "loss": 0.0161, + "rewards/chosen": 4.1088361740112305, + "rewards/margins": 13.421675682067871, + "rewards/rejected": -9.31283950805664, + "step": 9375 + }, + { + "epoch": 0.8566468707172225, + "grad_norm": 3.140625, + "kl": 0.0, + "learning_rate": 5.009644944513964e-07, + "logits/chosen": 475558080.0, + "logits/rejected": 399876032.0, + "logps/chosen": -323.4110107421875, + "logps/rejected": -420.87762451171875, + "loss": 0.0201, + "rewards/chosen": 3.362109661102295, + "rewards/margins": 10.998061180114746, + "rewards/rejected": -7.635951519012451, + "step": 9376 + }, + { + "epoch": 0.8567382366377341, + "grad_norm": 4.375, + "kl": 0.0, + "learning_rate": 5.003373857199517e-07, + "logits/chosen": 642611302.4, + "logits/rejected": 243761237.33333334, + "logps/chosen": -318.3827880859375, + "logps/rejected": -325.7977701822917, + "loss": 0.0475, + "rewards/chosen": 2.911835861206055, + "rewards/margins": 10.761671574910482, + "rewards/rejected": -7.849835713704427, + "step": 9377 + }, + { + "epoch": 0.8568296025582458, + "grad_norm": 1.71875, + "kl": 0.0, + "learning_rate": 4.997106490689136e-07, + "logits/chosen": 696788778.6666666, + "logits/rejected": 571472486.4, + "logps/chosen": -319.7763264973958, + "logps/rejected": -440.33017578125, + "loss": 0.0108, + "rewards/chosen": 3.89036496480306, + "rewards/margins": 14.237780125935872, + "rewards/rejected": -10.347415161132812, + "step": 9378 + }, + { + "epoch": 0.8569209684787574, + "grad_norm": 3.484375, + "kl": 0.0, + "learning_rate": 4.990842845501099e-07, + "logits/chosen": 288023040.0, + "logits/rejected": 285731430.4, + "logps/chosen": -336.2423095703125, + "logps/rejected": -387.260546875, + "loss": 0.0122, + "rewards/chosen": 4.719113667805989, + "rewards/margins": 13.825375111897785, + "rewards/rejected": -9.106261444091796, + "step": 9379 + }, + { + "epoch": 0.857012334399269, + "grad_norm": 0.091796875, + "kl": 0.0, + "learning_rate": 4.984582922153352e-07, + "logits/chosen": 393813728.0, + "logits/rejected": 765408256.0, + "logps/chosen": -319.4214172363281, + "logps/rejected": -595.4069475446429, + "loss": 0.0005, + "rewards/chosen": 6.9162139892578125, + "rewards/margins": 16.35107421875, + "rewards/rejected": -9.434860229492188, + "step": 9380 + }, + { + "epoch": 0.8571037003197807, + "grad_norm": 0.48828125, + "kl": 0.0, + "learning_rate": 4.978326721163496e-07, + "logits/chosen": 573631829.3333334, + "logits/rejected": 623560089.6, + "logps/chosen": -215.80476888020834, + "logps/rejected": -549.8546875, + "loss": 0.0032, + "rewards/chosen": 4.924429575602214, + "rewards/margins": 14.653035227457682, + "rewards/rejected": -9.728605651855469, + "step": 9381 + }, + { + "epoch": 0.8571950662402924, + "grad_norm": 6.375, + "kl": 0.0, + "learning_rate": 4.972074243048896e-07, + "logits/chosen": 309776746.6666667, + "logits/rejected": 465738137.6, + "logps/chosen": -238.52962239583334, + "logps/rejected": -404.395556640625, + "loss": 0.0259, + "rewards/chosen": 4.20726203918457, + "rewards/margins": 12.216729354858398, + "rewards/rejected": -8.009467315673827, + "step": 9382 + }, + { + "epoch": 0.857286432160804, + "grad_norm": 0.06396484375, + "kl": 0.0, + "learning_rate": 4.965825488326564e-07, + "logits/rejected": 343774944.0, + "logps/rejected": -479.3944091796875, + "loss": 0.0003, + "rewards/rejected": -8.685955047607422, + "step": 9383 + }, + { + "epoch": 0.8573777980813156, + "grad_norm": 2.1875, + "kl": 0.0, + "learning_rate": 4.959580457513214e-07, + "logits/chosen": 329731797.3333333, + "logits/rejected": 590037555.2, + "logps/chosen": -256.33673095703125, + "logps/rejected": -489.573779296875, + "loss": 0.0183, + "rewards/chosen": 3.1406612396240234, + "rewards/margins": 13.321162033081055, + "rewards/rejected": -10.180500793457032, + "step": 9384 + }, + { + "epoch": 0.8574691640018273, + "grad_norm": 0.69140625, + "kl": 0.0, + "learning_rate": 4.953339151125253e-07, + "logits/chosen": 1151458218.6666667, + "logits/rejected": 432129382.4, + "logps/chosen": -321.13437906901044, + "logps/rejected": -364.004248046875, + "loss": 0.0037, + "rewards/chosen": 4.65138594309489, + "rewards/margins": 15.188178698221844, + "rewards/rejected": -10.536792755126953, + "step": 9385 + }, + { + "epoch": 0.857560529922339, + "grad_norm": 27.75, + "kl": 0.0, + "learning_rate": 4.947101569678764e-07, + "logits/chosen": 363373158.4, + "logits/rejected": 648532224.0, + "logps/chosen": -157.7472412109375, + "logps/rejected": -639.0437825520834, + "loss": 0.1244, + "rewards/chosen": 2.3849864959716798, + "rewards/margins": 13.801880772908529, + "rewards/rejected": -11.41689427693685, + "step": 9386 + }, + { + "epoch": 0.8576518958428506, + "grad_norm": 17.875, + "kl": 0.0, + "learning_rate": 4.940867713689563e-07, + "logits/chosen": 556309120.0, + "logits/rejected": 334501344.0, + "logps/chosen": -160.76780700683594, + "logps/rejected": -271.345703125, + "loss": 0.114, + "rewards/chosen": 2.0050406455993652, + "rewards/margins": 8.701330661773682, + "rewards/rejected": -6.696290016174316, + "step": 9387 + }, + { + "epoch": 0.8577432617633622, + "grad_norm": 2.59375, + "kl": 0.0, + "learning_rate": 4.934637583673119e-07, + "logits/chosen": 565484800.0, + "logits/rejected": 645872713.1428572, + "logps/chosen": -354.5811767578125, + "logps/rejected": -461.27406529017856, + "loss": 0.008, + "rewards/chosen": 2.7773988246917725, + "rewards/margins": 11.931078808648246, + "rewards/rejected": -9.153679983956474, + "step": 9388 + }, + { + "epoch": 0.8578346276838739, + "grad_norm": 1.9453125, + "kl": 0.0, + "learning_rate": 4.928411180144605e-07, + "logits/chosen": 1317264128.0, + "logits/rejected": 637470080.0, + "logps/chosen": -373.48260498046875, + "logps/rejected": -606.9526977539062, + "loss": 0.0092, + "rewards/chosen": 4.2625932693481445, + "rewards/margins": 16.55286693572998, + "rewards/rejected": -12.290273666381836, + "step": 9389 + }, + { + "epoch": 0.8579259936043856, + "grad_norm": 2.140625, + "kl": 0.0, + "learning_rate": 4.92218850361888e-07, + "logits/chosen": 796666048.0, + "logits/rejected": 327252096.0, + "logps/chosen": -298.62677001953125, + "logps/rejected": -412.241943359375, + "loss": 0.0104, + "rewards/chosen": 3.482438802719116, + "rewards/margins": 12.482059876124064, + "rewards/rejected": -8.999621073404947, + "step": 9390 + }, + { + "epoch": 0.8580173595248972, + "grad_norm": 2.703125, + "kl": 0.0, + "learning_rate": 4.915969554610523e-07, + "logits/chosen": 533376073.14285713, + "logits/rejected": 689104000.0, + "logps/chosen": -343.60501534598217, + "logps/rejected": -877.495361328125, + "loss": 0.0173, + "rewards/chosen": 4.197509765625, + "rewards/margins": 13.9237060546875, + "rewards/rejected": -9.7261962890625, + "step": 9391 + }, + { + "epoch": 0.8581087254454088, + "grad_norm": 3.75, + "kl": 0.0, + "learning_rate": 4.909754333633771e-07, + "logits/chosen": 751152128.0, + "logits/rejected": 400813312.0, + "logps/chosen": -286.397265625, + "logps/rejected": -436.5544840494792, + "loss": 0.0207, + "rewards/chosen": 3.9751182556152345, + "rewards/margins": 13.022608693440755, + "rewards/rejected": -9.047490437825521, + "step": 9392 + }, + { + "epoch": 0.8582000913659205, + "grad_norm": 2.09375, + "kl": 0.0, + "learning_rate": 4.903542841202563e-07, + "logits/chosen": 434971340.8, + "logits/rejected": 576109525.3333334, + "logps/chosen": -445.374072265625, + "logps/rejected": -341.5238444010417, + "loss": 0.0122, + "rewards/chosen": 4.427670669555664, + "rewards/margins": 12.566774876912437, + "rewards/rejected": -8.139104207356771, + "step": 9393 + }, + { + "epoch": 0.8582914572864322, + "grad_norm": 3.0, + "kl": 0.0, + "learning_rate": 4.897335077830534e-07, + "logits/chosen": 568514645.3333334, + "logits/rejected": 469190208.0, + "logps/chosen": -266.6004231770833, + "logps/rejected": -517.043701171875, + "loss": 0.02, + "rewards/chosen": 3.9856878916422525, + "rewards/margins": 15.849377314249674, + "rewards/rejected": -11.863689422607422, + "step": 9394 + }, + { + "epoch": 0.8583828232069438, + "grad_norm": 3.296875, + "kl": 0.0, + "learning_rate": 4.891131044031e-07, + "logits/chosen": 883633792.0, + "logits/rejected": 640345728.0, + "logps/chosen": -585.4573974609375, + "logps/rejected": -552.7105712890625, + "loss": 0.0172, + "rewards/chosen": 3.6435317993164062, + "rewards/margins": 11.812088966369629, + "rewards/rejected": -8.168557167053223, + "step": 9395 + }, + { + "epoch": 0.8584741891274554, + "grad_norm": 2.234375, + "kl": 0.0, + "learning_rate": 4.88493074031699e-07, + "logits/chosen": 921433600.0, + "logits/rejected": 751405824.0, + "logps/chosen": -302.678955078125, + "logps/rejected": -883.0574951171875, + "loss": 0.0151, + "rewards/chosen": 4.222462336222331, + "rewards/margins": 17.990511576334637, + "rewards/rejected": -13.768049240112305, + "step": 9396 + }, + { + "epoch": 0.858565555047967, + "grad_norm": 9.875, + "kl": 0.0, + "learning_rate": 4.878734167201215e-07, + "logits/chosen": 718977664.0, + "logits/rejected": 569470720.0, + "logps/chosen": -345.0056457519531, + "logps/rejected": -524.555419921875, + "loss": 0.0239, + "rewards/chosen": 3.4710330963134766, + "rewards/margins": 14.215095520019531, + "rewards/rejected": -10.744062423706055, + "step": 9397 + }, + { + "epoch": 0.8586569209684788, + "grad_norm": 11.3125, + "kl": 13.928394317626953, + "learning_rate": 4.872541325196061e-07, + "logits/chosen": 559535552.0, + "logps/chosen": -333.8002624511719, + "loss": 0.1257, + "rewards/chosen": 3.7057886123657227, + "step": 9398 + }, + { + "epoch": 0.8587482868889904, + "grad_norm": 41.5, + "kl": 0.0, + "learning_rate": 4.866352214813618e-07, + "logits/chosen": 167109760.0, + "logits/rejected": 347034016.0, + "logps/chosen": -218.77748107910156, + "logps/rejected": -353.12890625, + "loss": 0.0575, + "rewards/chosen": 3.1799733638763428, + "rewards/margins": 11.3760507106781, + "rewards/rejected": -8.196077346801758, + "step": 9399 + }, + { + "epoch": 0.858839652809502, + "grad_norm": 3.21875, + "kl": 0.0, + "learning_rate": 4.860166836565683e-07, + "logits/chosen": 621330304.0, + "logits/rejected": 672348736.0, + "logps/chosen": -433.8385925292969, + "logps/rejected": -567.715087890625, + "loss": 0.0204, + "rewards/chosen": 3.342473030090332, + "rewards/margins": 12.712874412536621, + "rewards/rejected": -9.370401382446289, + "step": 9400 + }, + { + "epoch": 0.8589310187300137, + "grad_norm": 2.328125, + "kl": 0.0, + "learning_rate": 4.853985190963723e-07, + "logits/chosen": 496850730.6666667, + "logits/rejected": 705574144.0, + "logps/chosen": -254.11189778645834, + "logps/rejected": -502.34002685546875, + "loss": 0.0133, + "rewards/chosen": 4.709531148274739, + "rewards/margins": 13.997067769368488, + "rewards/rejected": -9.28753662109375, + "step": 9401 + }, + { + "epoch": 0.8590223846505254, + "grad_norm": 35.25, + "kl": 0.0, + "learning_rate": 4.847807278518901e-07, + "logits/chosen": 433394249.14285713, + "logits/rejected": 993606016.0, + "logps/chosen": -290.2781459263393, + "logps/rejected": -1216.9368896484375, + "loss": 0.1811, + "rewards/chosen": 2.2645918982369557, + "rewards/margins": 14.574955667768206, + "rewards/rejected": -12.31036376953125, + "step": 9402 + }, + { + "epoch": 0.859113750571037, + "grad_norm": 1.75, + "kl": 0.0, + "learning_rate": 4.84163309974206e-07, + "logits/chosen": 560460160.0, + "logits/rejected": 419315072.0, + "logps/chosen": -592.2628173828125, + "logps/rejected": -384.827880859375, + "loss": 0.0079, + "rewards/chosen": 4.709566593170166, + "rewards/margins": 12.739393711090088, + "rewards/rejected": -8.029827117919922, + "step": 9403 + }, + { + "epoch": 0.8592051164915486, + "grad_norm": 5.3125, + "kl": 0.0, + "learning_rate": 4.835462655143769e-07, + "logits/chosen": 448815872.0, + "logits/rejected": 386094976.0, + "logps/chosen": -277.0435791015625, + "logps/rejected": -420.1072998046875, + "loss": 0.0331, + "rewards/chosen": 3.290623188018799, + "rewards/margins": 12.831906795501709, + "rewards/rejected": -9.54128360748291, + "step": 9404 + }, + { + "epoch": 0.8592964824120602, + "grad_norm": 3.796875, + "kl": 0.0, + "learning_rate": 4.829295945234258e-07, + "logits/chosen": 449680544.0, + "logits/rejected": 341722432.0, + "logps/chosen": -337.955078125, + "logps/rejected": -468.5899658203125, + "loss": 0.0258, + "rewards/chosen": 2.9973957538604736, + "rewards/margins": 12.529610872268677, + "rewards/rejected": -9.532215118408203, + "step": 9405 + }, + { + "epoch": 0.859387848332572, + "grad_norm": 3.140625, + "kl": 0.0, + "learning_rate": 4.823132970523459e-07, + "logits/chosen": 466618752.0, + "logits/rejected": 1022178201.6, + "logps/chosen": -410.0594889322917, + "logps/rejected": -539.72490234375, + "loss": 0.0093, + "rewards/chosen": 5.264837582906087, + "rewards/margins": 13.586306699117024, + "rewards/rejected": -8.321469116210938, + "step": 9406 + }, + { + "epoch": 0.8594792142530836, + "grad_norm": 1.6953125, + "kl": 0.0, + "learning_rate": 4.81697373152099e-07, + "logits/chosen": 778780928.0, + "logits/rejected": 372764992.0, + "logps/chosen": -362.32110595703125, + "logps/rejected": -415.0405578613281, + "loss": 0.0103, + "rewards/chosen": 4.301356315612793, + "rewards/margins": 12.826350212097168, + "rewards/rejected": -8.524993896484375, + "step": 9407 + }, + { + "epoch": 0.8595705801735952, + "grad_norm": 7.125, + "kl": 0.0, + "learning_rate": 4.810818228736164e-07, + "logits/chosen": 527490368.0, + "logits/rejected": 466899264.0, + "logps/chosen": -301.1123046875, + "logps/rejected": -543.5909423828125, + "loss": 0.0175, + "rewards/chosen": 3.8425261974334717, + "rewards/margins": 13.026488065719604, + "rewards/rejected": -9.183961868286133, + "step": 9408 + }, + { + "epoch": 0.8596619460941068, + "grad_norm": 2.359375, + "kl": 0.0, + "learning_rate": 4.804666462677982e-07, + "logits/chosen": 528756448.0, + "logits/rejected": 396060800.0, + "logps/chosen": -418.802734375, + "logps/rejected": -474.76739501953125, + "loss": 0.0143, + "rewards/chosen": 3.8206894397735596, + "rewards/margins": 14.433558702468872, + "rewards/rejected": -10.612869262695312, + "step": 9409 + }, + { + "epoch": 0.8597533120146186, + "grad_norm": 6.5625, + "kl": 0.0, + "learning_rate": 4.798518433855137e-07, + "logits/chosen": 457884745.14285713, + "logits/rejected": 730326784.0, + "logps/chosen": -372.11244419642856, + "logps/rejected": -643.84423828125, + "loss": 0.0296, + "rewards/chosen": 3.9376577649797713, + "rewards/margins": 13.065856388636998, + "rewards/rejected": -9.128198623657227, + "step": 9410 + }, + { + "epoch": 0.8598446779351302, + "grad_norm": 4.8125, + "kl": 0.0, + "learning_rate": 4.792374142776007e-07, + "logits/chosen": 691787571.2, + "logits/rejected": 466010581.3333333, + "logps/chosen": -305.04228515625, + "logps/rejected": -270.9950764973958, + "loss": 0.0274, + "rewards/chosen": 3.713591766357422, + "rewards/margins": 12.607994588216147, + "rewards/rejected": -8.894402821858725, + "step": 9411 + }, + { + "epoch": 0.8599360438556418, + "grad_norm": 5.65625, + "kl": 0.0, + "learning_rate": 4.786233589948691e-07, + "logits/chosen": 660679808.0, + "logits/rejected": 751376128.0, + "logps/chosen": -372.035400390625, + "logps/rejected": -612.8186442057291, + "loss": 0.0436, + "rewards/chosen": 3.107574462890625, + "rewards/margins": 12.602677663167318, + "rewards/rejected": -9.495103200276693, + "step": 9412 + }, + { + "epoch": 0.8600274097761534, + "grad_norm": 66.0, + "kl": 5.466390609741211, + "learning_rate": 4.780096775880938e-07, + "logits/chosen": 541513142.8571428, + "logits/rejected": 579952256.0, + "logps/chosen": -253.47316196986608, + "logps/rejected": -220.68206787109375, + "loss": 0.1149, + "rewards/chosen": 3.361833299909319, + "rewards/margins": 11.517027582441058, + "rewards/rejected": -8.155194282531738, + "step": 9413 + }, + { + "epoch": 0.8601187756966652, + "grad_norm": 4.0, + "kl": 0.0, + "learning_rate": 4.773963701080214e-07, + "logits/chosen": 510109952.0, + "logits/rejected": 454579242.6666667, + "logps/chosen": -355.3761474609375, + "logps/rejected": -366.8756510416667, + "loss": 0.0227, + "rewards/chosen": 4.07947998046875, + "rewards/margins": 12.640525945027669, + "rewards/rejected": -8.56104596455892, + "step": 9414 + }, + { + "epoch": 0.8602101416171768, + "grad_norm": 1.59375, + "kl": 0.0, + "learning_rate": 4.7678343660536644e-07, + "logits/chosen": 1305307520.0, + "logits/rejected": 605328512.0, + "logps/chosen": -382.1943359375, + "logps/rejected": -488.89739990234375, + "loss": 0.0108, + "rewards/chosen": 3.9111952781677246, + "rewards/margins": 12.3611159324646, + "rewards/rejected": -8.449920654296875, + "step": 9415 + }, + { + "epoch": 0.8603015075376884, + "grad_norm": 49.25, + "kl": 0.0, + "learning_rate": 4.76170877130811e-07, + "logits/chosen": 445220096.0, + "logits/rejected": 581378867.2, + "logps/chosen": -290.1190185546875, + "logps/rejected": -505.809375, + "loss": 0.0378, + "rewards/chosen": 2.44967254002889, + "rewards/margins": 11.811386712392173, + "rewards/rejected": -9.361714172363282, + "step": 9416 + }, + { + "epoch": 0.8603928734582, + "grad_norm": 3.53125, + "kl": 0.0, + "learning_rate": 4.755586917350113e-07, + "logits/chosen": 453170739.2, + "logits/rejected": 399635456.0, + "logps/chosen": -299.627734375, + "logps/rejected": -472.5062662760417, + "loss": 0.018, + "rewards/chosen": 4.007841110229492, + "rewards/margins": 12.263630549112955, + "rewards/rejected": -8.255789438883463, + "step": 9417 + }, + { + "epoch": 0.8604842393787118, + "grad_norm": 1.6953125, + "kl": 0.0, + "learning_rate": 4.7494688046858806e-07, + "logits/chosen": 575334144.0, + "logits/rejected": 935832512.0, + "logps/chosen": -273.484375, + "logps/rejected": -263.86688232421875, + "loss": 0.0132, + "rewards/chosen": 4.600930213928223, + "rewards/margins": 12.969389915466309, + "rewards/rejected": -8.368459701538086, + "step": 9418 + }, + { + "epoch": 0.8605756052992234, + "grad_norm": 3.6875, + "kl": 0.0, + "learning_rate": 4.7433544338213233e-07, + "logits/chosen": 489776725.3333333, + "logits/rejected": 591574848.0, + "logps/chosen": -405.4984944661458, + "logps/rejected": -679.9659423828125, + "loss": 0.0247, + "rewards/chosen": 3.7303670247395835, + "rewards/margins": 13.691924413045248, + "rewards/rejected": -9.961557388305664, + "step": 9419 + }, + { + "epoch": 0.860666971219735, + "grad_norm": 2.359375, + "kl": 0.0, + "learning_rate": 4.7372438052620317e-07, + "logits/chosen": 503368512.0, + "logits/rejected": 452486336.0, + "logps/chosen": -344.3193054199219, + "logps/rejected": -404.4667053222656, + "loss": 0.1255, + "rewards/chosen": 2.549494743347168, + "rewards/margins": 11.787545204162598, + "rewards/rejected": -9.23805046081543, + "step": 9420 + }, + { + "epoch": 0.8607583371402467, + "grad_norm": 5.625, + "kl": 0.0, + "learning_rate": 4.7311369195133127e-07, + "logits/chosen": 760508032.0, + "logits/rejected": 684589504.0, + "logps/chosen": -369.08941650390625, + "logps/rejected": -388.82470703125, + "loss": 0.0205, + "rewards/chosen": 3.4280788898468018, + "rewards/margins": 12.10688328742981, + "rewards/rejected": -8.678804397583008, + "step": 9421 + }, + { + "epoch": 0.8608497030607584, + "grad_norm": 1.5078125, + "kl": 0.0, + "learning_rate": 4.7250337770801514e-07, + "logits/chosen": 523549738.6666667, + "logits/rejected": 454260940.8, + "logps/chosen": -407.8486735026042, + "logps/rejected": -539.69677734375, + "loss": 0.0058, + "rewards/chosen": 4.537940343221028, + "rewards/margins": 13.921906407674154, + "rewards/rejected": -9.383966064453125, + "step": 9422 + }, + { + "epoch": 0.86094106898127, + "grad_norm": 2.078125, + "kl": 0.0, + "learning_rate": 4.7189343784672103e-07, + "logits/chosen": 489148714.6666667, + "logits/rejected": 478986444.8, + "logps/chosen": -357.3802490234375, + "logps/rejected": -450.630078125, + "loss": 0.0119, + "rewards/chosen": 3.605001449584961, + "rewards/margins": 13.079033279418946, + "rewards/rejected": -9.474031829833985, + "step": 9423 + }, + { + "epoch": 0.8610324349017816, + "grad_norm": 0.1611328125, + "kl": 0.0, + "learning_rate": 4.7128387241788533e-07, + "logits/chosen": 291738880.0, + "logits/rejected": 449821513.14285713, + "logps/chosen": -401.13592529296875, + "logps/rejected": -450.43826729910717, + "loss": 0.0005, + "rewards/chosen": 5.757174968719482, + "rewards/margins": 15.954742499760219, + "rewards/rejected": -10.197567531040736, + "step": 9424 + }, + { + "epoch": 0.8611238008222933, + "grad_norm": 61.0, + "kl": 0.0, + "learning_rate": 4.706746814719143e-07, + "logits/chosen": 696878037.3333334, + "logits/rejected": 956199116.8, + "logps/chosen": -315.67633056640625, + "logps/rejected": -374.2626953125, + "loss": 0.0617, + "rewards/chosen": 3.073602040608724, + "rewards/margins": 10.109400685628255, + "rewards/rejected": -7.035798645019531, + "step": 9425 + }, + { + "epoch": 0.861215166742805, + "grad_norm": 1.4453125, + "kl": 0.0, + "learning_rate": 4.7006586505918273e-07, + "logits/chosen": 380355157.3333333, + "logits/rejected": 380522656.0, + "logps/chosen": -191.25618489583334, + "logps/rejected": -332.4444580078125, + "loss": 0.0108, + "rewards/chosen": 4.7762807210286455, + "rewards/margins": 17.531887372334797, + "rewards/rejected": -12.755606651306152, + "step": 9426 + }, + { + "epoch": 0.8613065326633166, + "grad_norm": 0.55859375, + "kl": 0.0, + "learning_rate": 4.6945742323003254e-07, + "logits/chosen": 1424170368.0, + "logits/rejected": 573284754.2857143, + "logps/chosen": -596.414306640625, + "logps/rejected": -575.5981794084821, + "loss": 0.0016, + "rewards/chosen": 4.400622844696045, + "rewards/margins": 13.919624396732875, + "rewards/rejected": -9.51900155203683, + "step": 9427 + }, + { + "epoch": 0.8613978985838282, + "grad_norm": 15.75, + "kl": 0.0, + "learning_rate": 4.6884935603477733e-07, + "logits/chosen": 524604928.0, + "logits/rejected": 464634965.3333333, + "logps/chosen": -194.4939208984375, + "logps/rejected": -369.0957845052083, + "loss": 0.0369, + "rewards/chosen": 3.5030487060546873, + "rewards/margins": 12.13873748779297, + "rewards/rejected": -8.635688781738281, + "step": 9428 + }, + { + "epoch": 0.8614892645043399, + "grad_norm": 2.796875, + "kl": 0.0, + "learning_rate": 4.682416635236975e-07, + "logits/chosen": 1109511936.0, + "logits/rejected": 563787008.0, + "logps/chosen": -458.89666748046875, + "logps/rejected": -588.1217854817709, + "loss": 0.014, + "rewards/chosen": 2.886826992034912, + "rewards/margins": 12.81149435043335, + "rewards/rejected": -9.924667358398438, + "step": 9429 + }, + { + "epoch": 0.8615806304248516, + "grad_norm": 3.28125, + "kl": 0.0, + "learning_rate": 4.676343457470456e-07, + "logits/chosen": 373676384.0, + "logits/rejected": 702405440.0, + "logps/chosen": -193.1101837158203, + "logps/rejected": -273.5904235839844, + "loss": 0.0252, + "rewards/chosen": 3.6447267532348633, + "rewards/margins": 11.222365379333496, + "rewards/rejected": -7.577638626098633, + "step": 9430 + }, + { + "epoch": 0.8616719963453632, + "grad_norm": 3.671875, + "kl": 0.0, + "learning_rate": 4.670274027550392e-07, + "logits/chosen": 456223360.0, + "logits/rejected": 776434688.0, + "logps/chosen": -187.63749186197916, + "logps/rejected": -626.04697265625, + "loss": 0.0235, + "rewards/chosen": 2.863757769266764, + "rewards/margins": 13.927808062235513, + "rewards/rejected": -11.06405029296875, + "step": 9431 + }, + { + "epoch": 0.8617633622658748, + "grad_norm": 5.34375, + "kl": 0.0, + "learning_rate": 4.664208345978682e-07, + "logits/chosen": 1076460236.8, + "logits/rejected": 629290794.6666666, + "logps/chosen": -342.047998046875, + "logps/rejected": -488.8283284505208, + "loss": 0.0286, + "rewards/chosen": 3.311238479614258, + "rewards/margins": 10.327290725708007, + "rewards/rejected": -7.01605224609375, + "step": 9432 + }, + { + "epoch": 0.8618547281863865, + "grad_norm": 1.4921875, + "kl": 0.0, + "learning_rate": 4.6581464132568863e-07, + "logits/chosen": 796212608.0, + "logits/rejected": 646504106.6666666, + "logps/chosen": -408.51763916015625, + "logps/rejected": -520.5343424479166, + "loss": 0.008, + "rewards/chosen": 3.6656785011291504, + "rewards/margins": 11.539234002431233, + "rewards/rejected": -7.873555501302083, + "step": 9433 + }, + { + "epoch": 0.8619460941068982, + "grad_norm": 2.53125, + "kl": 0.0, + "learning_rate": 4.6520882298863037e-07, + "logits/chosen": 889953894.4, + "logits/rejected": 567262976.0, + "logps/chosen": -477.196923828125, + "logps/rejected": -596.0756022135416, + "loss": 0.0156, + "rewards/chosen": 4.172983932495117, + "rewards/margins": 15.033065923055013, + "rewards/rejected": -10.860081990559896, + "step": 9434 + }, + { + "epoch": 0.8620374600274098, + "grad_norm": 2.296875, + "kl": 0.0, + "learning_rate": 4.646033796367855e-07, + "logits/chosen": 657242880.0, + "logits/rejected": 521674080.0, + "logps/chosen": -381.29827880859375, + "logps/rejected": -652.5940551757812, + "loss": 0.017, + "rewards/chosen": 3.745591402053833, + "rewards/margins": 13.782684564590454, + "rewards/rejected": -10.037093162536621, + "step": 9435 + }, + { + "epoch": 0.8621288259479214, + "grad_norm": 1.953125, + "kl": 0.0, + "learning_rate": 4.639983113202196e-07, + "logits/chosen": 882247509.3333334, + "logits/rejected": 516751820.8, + "logps/chosen": -661.2408854166666, + "logps/rejected": -420.59091796875, + "loss": 0.0086, + "rewards/chosen": 4.0822194417317705, + "rewards/margins": 12.84193623860677, + "rewards/rejected": -8.759716796875, + "step": 9436 + }, + { + "epoch": 0.8622201918684331, + "grad_norm": 2.140625, + "kl": 0.0, + "learning_rate": 4.633936180889653e-07, + "logits/chosen": 832117632.0, + "logits/rejected": 517320362.6666667, + "logps/chosen": -689.6866455078125, + "logps/rejected": -533.1405029296875, + "loss": 0.0075, + "rewards/chosen": 3.601912021636963, + "rewards/margins": 12.912430922190348, + "rewards/rejected": -9.310518900553385, + "step": 9437 + }, + { + "epoch": 0.8623115577889447, + "grad_norm": 3.78125, + "kl": 0.0, + "learning_rate": 4.6278929999302726e-07, + "logits/chosen": 450692710.4, + "logits/rejected": 933003178.6666666, + "logps/chosen": -272.150439453125, + "logps/rejected": -591.1012776692709, + "loss": 0.0264, + "rewards/chosen": 3.5931442260742186, + "rewards/margins": 16.42649917602539, + "rewards/rejected": -12.833354949951172, + "step": 9438 + }, + { + "epoch": 0.8624029237094564, + "grad_norm": 1.453125, + "kl": 0.0, + "learning_rate": 4.621853570823759e-07, + "logits/chosen": 366861056.0, + "logits/rejected": 568376490.6666666, + "logps/chosen": -201.7720184326172, + "logps/rejected": -602.3944498697916, + "loss": 0.0063, + "rewards/chosen": 4.140763759613037, + "rewards/margins": 14.972774664560953, + "rewards/rejected": -10.832010904947916, + "step": 9439 + }, + { + "epoch": 0.862494289629968, + "grad_norm": 0.96484375, + "kl": 0.0, + "learning_rate": 4.6158178940695186e-07, + "logits/chosen": 524235712.0, + "logits/rejected": 611786922.6666666, + "logps/chosen": -372.872802734375, + "logps/rejected": -452.9950764973958, + "loss": 0.005, + "rewards/chosen": 4.033394813537598, + "rewards/margins": 11.807487169901531, + "rewards/rejected": -7.774092356363933, + "step": 9440 + }, + { + "epoch": 0.8625856555504797, + "grad_norm": 1.7890625, + "kl": 0.0, + "learning_rate": 4.60978597016663e-07, + "logits/chosen": 590007040.0, + "logits/rejected": 368234624.0, + "logps/chosen": -369.2423095703125, + "logps/rejected": -505.9437662760417, + "loss": 0.009, + "rewards/chosen": 3.362509250640869, + "rewards/margins": 12.991084893544516, + "rewards/rejected": -9.628575642903646, + "step": 9441 + }, + { + "epoch": 0.8626770214709913, + "grad_norm": 16.125, + "kl": 0.0, + "learning_rate": 4.603757799613906e-07, + "logits/chosen": 757526101.3333334, + "logits/rejected": 1449984256.0, + "logps/chosen": -343.1792805989583, + "logps/rejected": -595.56884765625, + "loss": 0.0783, + "rewards/chosen": 3.191110293070475, + "rewards/margins": 12.169688860575357, + "rewards/rejected": -8.978578567504883, + "step": 9442 + }, + { + "epoch": 0.862768387391503, + "grad_norm": 2.25, + "kl": 0.0, + "learning_rate": 4.597733382909808e-07, + "logits/chosen": 714277802.6666666, + "logits/rejected": 644597760.0, + "logps/chosen": -321.79327392578125, + "logps/rejected": -891.9876953125, + "loss": 0.0159, + "rewards/chosen": 3.1630935668945312, + "rewards/margins": 17.49853973388672, + "rewards/rejected": -14.335446166992188, + "step": 9443 + }, + { + "epoch": 0.8628597533120146, + "grad_norm": 1.5703125, + "kl": 0.0, + "learning_rate": 4.591712720552494e-07, + "logits/chosen": 573985856.0, + "logits/rejected": 559334848.0, + "logps/chosen": -441.6175537109375, + "logps/rejected": -410.00152587890625, + "loss": 0.0087, + "rewards/chosen": 4.410958766937256, + "rewards/margins": 13.39701223373413, + "rewards/rejected": -8.986053466796875, + "step": 9444 + }, + { + "epoch": 0.8629511192325263, + "grad_norm": 2.171875, + "kl": 0.0, + "learning_rate": 4.5856958130398146e-07, + "logits/chosen": 554066304.0, + "logits/rejected": 510998425.6, + "logps/chosen": -454.4705403645833, + "logps/rejected": -436.862939453125, + "loss": 0.013, + "rewards/chosen": 3.5040629704793296, + "rewards/margins": 12.46904099782308, + "rewards/rejected": -8.96497802734375, + "step": 9445 + }, + { + "epoch": 0.8630424851530379, + "grad_norm": 1.6328125, + "kl": 0.0, + "learning_rate": 4.5796826608693277e-07, + "logits/chosen": 624674508.8, + "logits/rejected": 468850218.6666667, + "logps/chosen": -333.7503173828125, + "logps/rejected": -379.676513671875, + "loss": 0.0119, + "rewards/chosen": 4.036156845092774, + "rewards/margins": 13.984842809041343, + "rewards/rejected": -9.948685963948568, + "step": 9446 + }, + { + "epoch": 0.8631338510735496, + "grad_norm": 3.8125, + "kl": 0.0, + "learning_rate": 4.5736732645382534e-07, + "logits/chosen": 406320237.71428573, + "logits/rejected": 169192656.0, + "logps/chosen": -288.64365931919644, + "logps/rejected": -371.70599365234375, + "loss": 0.0282, + "rewards/chosen": 3.8455227443150113, + "rewards/margins": 15.58824144090925, + "rewards/rejected": -11.742718696594238, + "step": 9447 + }, + { + "epoch": 0.8632252169940612, + "grad_norm": 5.96875, + "kl": 0.0, + "learning_rate": 4.567667624543515e-07, + "logits/chosen": 634389461.3333334, + "logits/rejected": 340318617.6, + "logps/chosen": -500.826904296875, + "logps/rejected": -521.820068359375, + "loss": 0.0332, + "rewards/chosen": 2.6225159962972007, + "rewards/margins": 12.851702245076497, + "rewards/rejected": -10.229186248779296, + "step": 9448 + }, + { + "epoch": 0.8633165829145729, + "grad_norm": 2.890625, + "kl": 0.0, + "learning_rate": 4.561665741381727e-07, + "logits/chosen": 484180275.2, + "logits/rejected": 445041152.0, + "logps/chosen": -315.99130859375, + "logps/rejected": -429.3814697265625, + "loss": 0.0164, + "rewards/chosen": 4.010396575927734, + "rewards/margins": 12.199142710367838, + "rewards/rejected": -8.188746134440104, + "step": 9449 + }, + { + "epoch": 0.8634079488350845, + "grad_norm": 4.4375, + "kl": 0.0, + "learning_rate": 4.5556676155491807e-07, + "logits/chosen": 649018572.8, + "logits/rejected": 409157376.0, + "logps/chosen": -449.17890625, + "logps/rejected": -594.290771484375, + "loss": 0.0305, + "rewards/chosen": 3.0863561630249023, + "rewards/margins": 11.622574806213379, + "rewards/rejected": -8.536218643188477, + "step": 9450 + }, + { + "epoch": 0.8634993147555962, + "grad_norm": 3.203125, + "kl": 0.0, + "learning_rate": 4.549673247541875e-07, + "logits/chosen": 550301013.3333334, + "logits/rejected": 416327264.0, + "logps/chosen": -345.986572265625, + "logps/rejected": -556.8896484375, + "loss": 0.0251, + "rewards/chosen": 3.7344093322753906, + "rewards/margins": 12.727387428283691, + "rewards/rejected": -8.9929780960083, + "step": 9451 + }, + { + "epoch": 0.8635906806761078, + "grad_norm": 1.9921875, + "kl": 0.0, + "learning_rate": 4.54368263785549e-07, + "logits/chosen": 500887552.0, + "logits/rejected": 757081804.8, + "logps/chosen": -321.9369710286458, + "logps/rejected": -766.274755859375, + "loss": 0.0087, + "rewards/chosen": 3.9461580912272134, + "rewards/margins": 14.084044138590494, + "rewards/rejected": -10.137886047363281, + "step": 9452 + }, + { + "epoch": 0.8636820465966195, + "grad_norm": 29.625, + "kl": 0.0, + "learning_rate": 4.5376957869853857e-07, + "logits/chosen": 1115753574.4, + "logits/rejected": 471959296.0, + "logps/chosen": -247.936865234375, + "logps/rejected": -393.7027587890625, + "loss": 0.0722, + "rewards/chosen": 2.8244556427001952, + "rewards/margins": 12.463665390014649, + "rewards/rejected": -9.639209747314453, + "step": 9453 + }, + { + "epoch": 0.8637734125171311, + "grad_norm": 0.373046875, + "kl": 0.0, + "learning_rate": 4.53171269542661e-07, + "logits/chosen": 1654420224.0, + "logits/rejected": 603979264.0, + "logps/chosen": -191.00103759765625, + "logps/rejected": -365.7261439732143, + "loss": 0.0015, + "rewards/chosen": 4.604255676269531, + "rewards/margins": 13.12715584891183, + "rewards/rejected": -8.522900172642299, + "step": 9454 + }, + { + "epoch": 0.8638647784376428, + "grad_norm": 4.875, + "kl": 0.0, + "learning_rate": 4.525733363673934e-07, + "logits/chosen": 529616025.6, + "logits/rejected": 821421312.0, + "logps/chosen": -314.0965087890625, + "logps/rejected": -579.091064453125, + "loss": 0.1325, + "rewards/chosen": 3.093007278442383, + "rewards/margins": 12.037729771931968, + "rewards/rejected": -8.944722493489584, + "step": 9455 + }, + { + "epoch": 0.8639561443581544, + "grad_norm": 3.34375, + "kl": 0.0, + "learning_rate": 4.5197577922217795e-07, + "logits/chosen": 501798720.0, + "logits/rejected": 405751360.0, + "logps/chosen": -421.8843994140625, + "logps/rejected": -415.40386962890625, + "loss": 0.0193, + "rewards/chosen": 3.9869630336761475, + "rewards/margins": 11.613830327987671, + "rewards/rejected": -7.626867294311523, + "step": 9456 + }, + { + "epoch": 0.8640475102786661, + "grad_norm": 2.84375, + "kl": 0.0, + "learning_rate": 4.5137859815642724e-07, + "logits/chosen": 635705702.4, + "logits/rejected": 526714368.0, + "logps/chosen": -347.527783203125, + "logps/rejected": -502.4275309244792, + "loss": 0.0229, + "rewards/chosen": 3.625230407714844, + "rewards/margins": 12.90050786336263, + "rewards/rejected": -9.275277455647787, + "step": 9457 + }, + { + "epoch": 0.8641388761991777, + "grad_norm": 0.9453125, + "kl": 0.0, + "learning_rate": 4.507817932195224e-07, + "logits/chosen": 499586457.6, + "logits/rejected": 656823808.0, + "logps/chosen": -341.872021484375, + "logps/rejected": -604.8704020182291, + "loss": 0.0051, + "rewards/chosen": 4.959303283691407, + "rewards/margins": 15.339002990722657, + "rewards/rejected": -10.37969970703125, + "step": 9458 + }, + { + "epoch": 0.8642302421196894, + "grad_norm": 4.46875, + "kl": 0.0, + "learning_rate": 4.501853644608134e-07, + "logits/chosen": 1398782634.6666667, + "logits/rejected": 911369728.0, + "logps/chosen": -335.9991455078125, + "logps/rejected": -198.04315185546875, + "loss": 0.0448, + "rewards/chosen": 2.9944165547688804, + "rewards/margins": 11.190591176350912, + "rewards/rejected": -8.196174621582031, + "step": 9459 + }, + { + "epoch": 0.864321608040201, + "grad_norm": 1.96875, + "kl": 0.0, + "learning_rate": 4.495893119296213e-07, + "logits/chosen": 306166464.0, + "logits/rejected": 393765920.0, + "logps/chosen": -236.70916748046875, + "logps/rejected": -501.2619934082031, + "loss": 0.0077, + "rewards/chosen": 4.891590118408203, + "rewards/margins": 15.300690650939941, + "rewards/rejected": -10.409100532531738, + "step": 9460 + }, + { + "epoch": 0.8644129739607127, + "grad_norm": 1.4453125, + "kl": 0.0, + "learning_rate": 4.489936356752317e-07, + "logits/chosen": 476860992.0, + "logits/rejected": 857771264.0, + "logps/chosen": -230.0279541015625, + "logps/rejected": -489.810791015625, + "loss": 0.0113, + "rewards/chosen": 3.9309678077697754, + "rewards/margins": 14.28612756729126, + "rewards/rejected": -10.355159759521484, + "step": 9461 + }, + { + "epoch": 0.8645043398812243, + "grad_norm": 2.8125, + "kl": 0.0, + "learning_rate": 4.483983357469013e-07, + "logits/chosen": 313019033.6, + "logits/rejected": 416938922.6666667, + "logps/chosen": -356.952392578125, + "logps/rejected": -358.8822021484375, + "loss": 0.0182, + "rewards/chosen": 3.6895450592041015, + "rewards/margins": 11.50323969523112, + "rewards/rejected": -7.8136946360270185, + "step": 9462 + }, + { + "epoch": 0.864595705801736, + "grad_norm": 1.6875, + "kl": 0.0, + "learning_rate": 4.478034121938579e-07, + "logits/chosen": 611845290.6666666, + "logits/rejected": 767775539.2, + "logps/chosen": -129.69916788736978, + "logps/rejected": -436.71142578125, + "loss": 0.012, + "rewards/chosen": 3.6318591435750327, + "rewards/margins": 11.687564404805501, + "rewards/rejected": -8.055705261230468, + "step": 9463 + }, + { + "epoch": 0.8646870717222476, + "grad_norm": 0.33984375, + "kl": 0.0, + "learning_rate": 4.472088650652956e-07, + "logits/chosen": 234562069.33333334, + "logits/rejected": 366781158.4, + "logps/chosen": -243.3813680013021, + "logps/rejected": -490.1392578125, + "loss": 0.0019, + "rewards/chosen": 5.4884897867838545, + "rewards/margins": 14.295118204752605, + "rewards/rejected": -8.80662841796875, + "step": 9464 + }, + { + "epoch": 0.8647784376427593, + "grad_norm": 7.1875, + "kl": 0.0, + "learning_rate": 4.4661469441037776e-07, + "logits/chosen": 580240768.0, + "logits/rejected": 573750144.0, + "logps/chosen": -427.03607177734375, + "logps/rejected": -683.2205810546875, + "loss": 0.1227, + "rewards/chosen": 3.392533302307129, + "rewards/margins": 8.558011054992676, + "rewards/rejected": -5.165477752685547, + "step": 9465 + }, + { + "epoch": 0.8648698035632709, + "grad_norm": 2.296875, + "kl": 0.0, + "learning_rate": 4.460209002782362e-07, + "logits/chosen": 534381952.0, + "logits/rejected": 420882112.0, + "logps/chosen": -332.9220275878906, + "logps/rejected": -477.906982421875, + "loss": 0.0124, + "rewards/chosen": 4.152029037475586, + "rewards/margins": 13.524919509887695, + "rewards/rejected": -9.37289047241211, + "step": 9466 + }, + { + "epoch": 0.8649611694837825, + "grad_norm": 26.75, + "kl": 0.0, + "learning_rate": 4.454274827179722e-07, + "logits/chosen": 432654745.6, + "logits/rejected": 380147626.6666667, + "logps/chosen": -201.360400390625, + "logps/rejected": -257.3789469401042, + "loss": 0.0497, + "rewards/chosen": 3.250527191162109, + "rewards/margins": 11.164242553710938, + "rewards/rejected": -7.913715362548828, + "step": 9467 + }, + { + "epoch": 0.8650525354042942, + "grad_norm": 3.3125, + "kl": 0.0, + "learning_rate": 4.4483444177865654e-07, + "logits/chosen": 451527808.0, + "logits/rejected": 400227200.0, + "logps/chosen": -266.53021240234375, + "logps/rejected": -415.65765380859375, + "loss": 0.0216, + "rewards/chosen": 3.529979705810547, + "rewards/margins": 11.666133880615234, + "rewards/rejected": -8.136154174804688, + "step": 9468 + }, + { + "epoch": 0.8651439013248059, + "grad_norm": 2.65625, + "kl": 0.0, + "learning_rate": 4.4424177750932827e-07, + "logits/chosen": 421401472.0, + "logits/rejected": 280953856.0, + "logps/chosen": -354.753173828125, + "logps/rejected": -302.06573486328125, + "loss": 0.0194, + "rewards/chosen": 3.365830898284912, + "rewards/margins": 11.715945720672607, + "rewards/rejected": -8.350114822387695, + "step": 9469 + }, + { + "epoch": 0.8652352672453175, + "grad_norm": 2.546875, + "kl": 0.0, + "learning_rate": 4.436494899589955e-07, + "logits/chosen": 456495018.6666667, + "logits/rejected": 302228531.2, + "logps/chosen": -315.4052734375, + "logps/rejected": -253.81162109375, + "loss": 0.0115, + "rewards/chosen": 4.230405807495117, + "rewards/margins": 12.81019172668457, + "rewards/rejected": -8.579785919189453, + "step": 9470 + }, + { + "epoch": 0.8653266331658291, + "grad_norm": 1.0078125, + "kl": 0.0, + "learning_rate": 4.4305757917663284e-07, + "logits/chosen": 365883488.0, + "logits/rejected": 334356608.0, + "logps/chosen": -405.2048645019531, + "logps/rejected": -364.21771240234375, + "loss": 0.0048, + "rewards/chosen": 5.237011909484863, + "rewards/margins": 13.526501655578613, + "rewards/rejected": -8.28948974609375, + "step": 9471 + }, + { + "epoch": 0.8654179990863408, + "grad_norm": 3.1875, + "kl": 0.0, + "learning_rate": 4.424660452111884e-07, + "logits/chosen": 556058733.7142857, + "logits/rejected": 390557152.0, + "logps/chosen": -347.56295340401783, + "logps/rejected": -663.2684326171875, + "loss": 0.0329, + "rewards/chosen": 3.5506973266601562, + "rewards/margins": 13.973817825317383, + "rewards/rejected": -10.423120498657227, + "step": 9472 + }, + { + "epoch": 0.8655093650068525, + "grad_norm": 21.5, + "kl": 0.0, + "learning_rate": 4.4187488811157586e-07, + "logits/chosen": 378245376.0, + "logits/rejected": 375208038.4, + "logps/chosen": -318.5509440104167, + "logps/rejected": -479.0927734375, + "loss": 0.0348, + "rewards/chosen": 2.4435160954793296, + "rewards/margins": 11.988993390401205, + "rewards/rejected": -9.545477294921875, + "step": 9473 + }, + { + "epoch": 0.8656007309273641, + "grad_norm": 2.8125, + "kl": 0.0, + "learning_rate": 4.412841079266778e-07, + "logits/chosen": 741517482.6666666, + "logits/rejected": 615053312.0, + "logps/chosen": -176.79632568359375, + "logps/rejected": -487.936865234375, + "loss": 0.0175, + "rewards/chosen": 4.195815404256185, + "rewards/margins": 13.606094487508138, + "rewards/rejected": -9.410279083251954, + "step": 9474 + }, + { + "epoch": 0.8656920968478757, + "grad_norm": 0.78125, + "kl": 0.0, + "learning_rate": 4.4069370470534556e-07, + "logits/chosen": 363233952.0, + "logits/rejected": 375159968.0, + "logps/chosen": -243.43252563476562, + "logps/rejected": -478.39691162109375, + "loss": 0.0047, + "rewards/chosen": 4.754990100860596, + "rewards/margins": 15.752111911773682, + "rewards/rejected": -10.997121810913086, + "step": 9475 + }, + { + "epoch": 0.8657834627683874, + "grad_norm": 1.671875, + "kl": 0.0, + "learning_rate": 4.4010367849640247e-07, + "logits/chosen": 557040128.0, + "logits/rejected": 853860556.8, + "logps/chosen": -260.14117431640625, + "logps/rejected": -742.77880859375, + "loss": 0.0074, + "rewards/chosen": 4.259482383728027, + "rewards/margins": 15.377570152282715, + "rewards/rejected": -11.118087768554688, + "step": 9476 + }, + { + "epoch": 0.8658748286888991, + "grad_norm": 6.34375, + "kl": 0.0, + "learning_rate": 4.3951402934863606e-07, + "logits/chosen": 632836761.6, + "logits/rejected": 482084736.0, + "logps/chosen": -364.7435791015625, + "logps/rejected": -476.7054850260417, + "loss": 0.1313, + "rewards/chosen": 3.596091461181641, + "rewards/margins": 9.210003407796224, + "rewards/rejected": -5.613911946614583, + "step": 9477 + }, + { + "epoch": 0.8659661946094107, + "grad_norm": 2.421875, + "kl": 0.0, + "learning_rate": 4.389247573108063e-07, + "logits/chosen": 574809813.3333334, + "logits/rejected": 483808972.8, + "logps/chosen": -333.70225016276044, + "logps/rejected": -422.7919921875, + "loss": 0.1176, + "rewards/chosen": 3.2165969212849936, + "rewards/margins": 10.253626950581868, + "rewards/rejected": -7.037030029296875, + "step": 9478 + }, + { + "epoch": 0.8660575605299223, + "grad_norm": 3.921875, + "kl": 0.0, + "learning_rate": 4.3833586243163917e-07, + "logits/chosen": 915891814.4, + "logits/rejected": 553023829.3333334, + "logps/chosen": -182.62587890625, + "logps/rejected": -677.2648111979166, + "loss": 0.024, + "rewards/chosen": 3.662938690185547, + "rewards/margins": 12.748625691731771, + "rewards/rejected": -9.085687001546225, + "step": 9479 + }, + { + "epoch": 0.866148926450434, + "grad_norm": 23.5, + "kl": 0.0, + "learning_rate": 4.377473447598313e-07, + "logits/chosen": 562079232.0, + "logits/rejected": 327130931.2, + "logps/chosen": -535.794921875, + "logps/rejected": -226.464697265625, + "loss": 0.0249, + "rewards/chosen": 3.8381805419921875, + "rewards/margins": 11.136056518554687, + "rewards/rejected": -7.2978759765625, + "step": 9480 + }, + { + "epoch": 0.8662402923709457, + "grad_norm": 1.8203125, + "kl": 0.0, + "learning_rate": 4.371592043440481e-07, + "logits/chosen": 873203648.0, + "logits/rejected": 577468544.0, + "logps/chosen": -512.1683349609375, + "logps/rejected": -455.3223876953125, + "loss": 0.0094, + "rewards/chosen": 4.73879861831665, + "rewards/margins": 13.384809970855713, + "rewards/rejected": -8.646011352539062, + "step": 9481 + }, + { + "epoch": 0.8663316582914573, + "grad_norm": 3.015625, + "kl": 0.0, + "learning_rate": 4.3657144123292303e-07, + "logits/chosen": 729653674.6666666, + "logits/rejected": 389798118.4, + "logps/chosen": -565.5750732421875, + "logps/rejected": -378.535205078125, + "loss": 0.0148, + "rewards/chosen": 3.851470629374186, + "rewards/margins": 12.772314135233561, + "rewards/rejected": -8.920843505859375, + "step": 9482 + }, + { + "epoch": 0.8664230242119689, + "grad_norm": 3.53125, + "kl": 0.8146839141845703, + "learning_rate": 4.3598405547505886e-07, + "logits/chosen": 456696320.0, + "logits/rejected": 341963840.0, + "logps/chosen": -304.36453683035717, + "logps/rejected": -447.37469482421875, + "loss": 0.0292, + "rewards/chosen": 3.742131096976144, + "rewards/margins": 10.50510106767927, + "rewards/rejected": -6.762969970703125, + "step": 9483 + }, + { + "epoch": 0.8665143901324805, + "grad_norm": 2.40625, + "kl": 0.0, + "learning_rate": 4.353970471190261e-07, + "logits/chosen": 429589600.0, + "logits/rejected": 360068534.85714287, + "logps/chosen": -438.31231689453125, + "logps/rejected": -478.64735630580356, + "loss": 0.0081, + "rewards/chosen": 2.672466993331909, + "rewards/margins": 13.211682081222534, + "rewards/rejected": -10.539215087890625, + "step": 9484 + }, + { + "epoch": 0.8666057560529923, + "grad_norm": 3.078125, + "kl": 4.351940155029297, + "learning_rate": 4.3481041621336595e-07, + "logits/chosen": 572054418.2857143, + "logits/rejected": 1599909632.0, + "logps/chosen": -392.27880859375, + "logps/rejected": -388.5992126464844, + "loss": 0.021, + "rewards/chosen": 4.489576067243304, + "rewards/margins": 14.362144197736468, + "rewards/rejected": -9.872568130493164, + "step": 9485 + }, + { + "epoch": 0.8666971219735039, + "grad_norm": 4.15625, + "kl": 0.0, + "learning_rate": 4.3422416280658743e-07, + "logits/chosen": 375124845.71428573, + "logits/rejected": 411142464.0, + "logps/chosen": -264.18265206473217, + "logps/rejected": -432.1499938964844, + "loss": 0.0355, + "rewards/chosen": 4.103512355259487, + "rewards/margins": 10.308441230228969, + "rewards/rejected": -6.204928874969482, + "step": 9486 + }, + { + "epoch": 0.8667884878940155, + "grad_norm": 2.90625, + "kl": 0.0, + "learning_rate": 4.336382869471684e-07, + "logits/chosen": 745533440.0, + "logits/rejected": 652513945.6, + "logps/chosen": -269.22690836588544, + "logps/rejected": -436.35341796875, + "loss": 0.0175, + "rewards/chosen": 3.1037658055623374, + "rewards/margins": 9.76761163075765, + "rewards/rejected": -6.663845825195312, + "step": 9487 + }, + { + "epoch": 0.8668798538145271, + "grad_norm": 4.0, + "kl": 0.0, + "learning_rate": 4.330527886835528e-07, + "logits/chosen": 421622144.0, + "logits/rejected": 451375744.0, + "logps/chosen": -366.5232421875, + "logps/rejected": -314.4936116536458, + "loss": 0.0235, + "rewards/chosen": 3.748082733154297, + "rewards/margins": 11.673309580485027, + "rewards/rejected": -7.9252268473307295, + "step": 9488 + }, + { + "epoch": 0.8669712197350389, + "grad_norm": 2.796875, + "kl": 0.0, + "learning_rate": 4.3246766806415927e-07, + "logits/chosen": 1221227690.6666667, + "logits/rejected": 489756480.0, + "logps/chosen": -302.91546630859375, + "logps/rejected": -397.39263916015625, + "loss": 0.0164, + "rewards/chosen": 4.122679392496745, + "rewards/margins": 11.802812735239666, + "rewards/rejected": -7.68013334274292, + "step": 9489 + }, + { + "epoch": 0.8670625856555505, + "grad_norm": 2.34375, + "kl": 0.0, + "learning_rate": 4.3188292513737016e-07, + "logits/chosen": 709871462.4, + "logits/rejected": 325281749.3333333, + "logps/chosen": -338.59130859375, + "logps/rejected": -569.0387776692709, + "loss": 0.0202, + "rewards/chosen": 3.601548767089844, + "rewards/margins": 16.859015401204427, + "rewards/rejected": -13.257466634114584, + "step": 9490 + }, + { + "epoch": 0.8671539515760621, + "grad_norm": 4.5, + "kl": 0.0, + "learning_rate": 4.3129855995153846e-07, + "logits/chosen": 604192870.4, + "logits/rejected": 432067456.0, + "logps/chosen": -372.021435546875, + "logps/rejected": -691.3486328125, + "loss": 0.0236, + "rewards/chosen": 3.3715023040771483, + "rewards/margins": 15.673534774780274, + "rewards/rejected": -12.302032470703125, + "step": 9491 + }, + { + "epoch": 0.8672453174965737, + "grad_norm": 3.625, + "kl": 0.0, + "learning_rate": 4.307145725549855e-07, + "logits/chosen": 531720155.4285714, + "logits/rejected": 598520512.0, + "logps/chosen": -340.77256556919644, + "logps/rejected": -623.9178466796875, + "loss": 0.0296, + "rewards/chosen": 3.6026079995291576, + "rewards/margins": 11.954451833452497, + "rewards/rejected": -8.35184383392334, + "step": 9492 + }, + { + "epoch": 0.8673366834170855, + "grad_norm": 8.1875, + "kl": 0.0, + "learning_rate": 4.301309629960021e-07, + "logits/chosen": 443479765.3333333, + "logits/rejected": 740839884.8, + "logps/chosen": -228.81536865234375, + "logps/rejected": -417.544677734375, + "loss": 0.0893, + "rewards/chosen": 3.389420509338379, + "rewards/margins": 10.335882759094238, + "rewards/rejected": -6.946462249755859, + "step": 9493 + }, + { + "epoch": 0.8674280493375971, + "grad_norm": 14.1875, + "kl": 0.0, + "learning_rate": 4.29547731322848e-07, + "logits/chosen": 641866368.0, + "logits/rejected": 437981344.0, + "logps/chosen": -229.07177734375, + "logps/rejected": -617.9150390625, + "loss": 0.0588, + "rewards/chosen": 3.2911481857299805, + "rewards/margins": 14.959980964660645, + "rewards/rejected": -11.668832778930664, + "step": 9494 + }, + { + "epoch": 0.8675194152581087, + "grad_norm": 0.1162109375, + "kl": 0.0, + "learning_rate": 4.289648775837502e-07, + "logits/chosen": 349637248.0, + "logits/rejected": 696576512.0, + "logps/chosen": -195.51980590820312, + "logps/rejected": -382.4866420200893, + "loss": 0.0007, + "rewards/chosen": 6.026286602020264, + "rewards/margins": 15.14268445968628, + "rewards/rejected": -9.116397857666016, + "step": 9495 + }, + { + "epoch": 0.8676107811786203, + "grad_norm": 2.953125, + "kl": 0.0, + "learning_rate": 4.283824018269045e-07, + "logits/chosen": 959416320.0, + "logits/rejected": 528577728.0, + "logps/chosen": -373.36602783203125, + "logps/rejected": -555.02099609375, + "loss": 0.0194, + "rewards/chosen": 3.3913066387176514, + "rewards/margins": 14.512325048446655, + "rewards/rejected": -11.121018409729004, + "step": 9496 + }, + { + "epoch": 0.8677021470991321, + "grad_norm": 2.328125, + "kl": 0.0, + "learning_rate": 4.27800304100478e-07, + "logits/chosen": 579972096.0, + "logits/rejected": 371590688.0, + "logps/chosen": -306.41162109375, + "logps/rejected": -376.3998718261719, + "loss": 0.0139, + "rewards/chosen": 4.090832233428955, + "rewards/margins": 13.41133165359497, + "rewards/rejected": -9.320499420166016, + "step": 9497 + }, + { + "epoch": 0.8677935130196437, + "grad_norm": 3.84375, + "kl": 0.0, + "learning_rate": 4.272185844526039e-07, + "logits/chosen": 768836900.5714285, + "logits/rejected": 450986720.0, + "logps/chosen": -440.75802176339283, + "logps/rejected": -356.8428039550781, + "loss": 0.0441, + "rewards/chosen": 3.6279449462890625, + "rewards/margins": 12.33978271484375, + "rewards/rejected": -8.711837768554688, + "step": 9498 + }, + { + "epoch": 0.8678848789401553, + "grad_norm": 0.25390625, + "kl": 0.0, + "learning_rate": 4.266372429313853e-07, + "logits/chosen": 156540032.0, + "logits/rejected": 359444662.85714287, + "logps/chosen": -80.11241149902344, + "logps/rejected": -439.37904575892856, + "loss": 0.0012, + "rewards/chosen": 4.79054594039917, + "rewards/margins": 15.055709906986781, + "rewards/rejected": -10.265163966587611, + "step": 9499 + }, + { + "epoch": 0.8679762448606669, + "grad_norm": 2.5, + "kl": 0.0, + "learning_rate": 4.2605627958489324e-07, + "logits/chosen": 252677984.0, + "logits/rejected": 518484394.6666667, + "logps/chosen": -497.3653564453125, + "logps/rejected": -499.601318359375, + "loss": 0.0093, + "rewards/chosen": 3.620938301086426, + "rewards/margins": 13.806113243103027, + "rewards/rejected": -10.185174942016602, + "step": 9500 + }, + { + "epoch": 0.8680676107811787, + "grad_norm": 0.1259765625, + "kl": 0.0, + "learning_rate": 4.254756944611682e-07, + "logits/chosen": 244033600.0, + "logits/rejected": 380621677.71428573, + "logps/chosen": -180.53744506835938, + "logps/rejected": -463.83517020089283, + "loss": 0.0005, + "rewards/chosen": 5.870611667633057, + "rewards/margins": 16.4769218308585, + "rewards/rejected": -10.606310163225446, + "step": 9501 + }, + { + "epoch": 0.8681589767016903, + "grad_norm": 1.90625, + "kl": 0.0, + "learning_rate": 4.248954876082195e-07, + "logits/chosen": 574578389.3333334, + "logits/rejected": 526749900.8, + "logps/chosen": -345.6160481770833, + "logps/rejected": -448.2470703125, + "loss": 0.0106, + "rewards/chosen": 3.64788818359375, + "rewards/margins": 13.915900421142577, + "rewards/rejected": -10.268012237548827, + "step": 9502 + }, + { + "epoch": 0.8682503426222019, + "grad_norm": 5.0, + "kl": 0.0, + "learning_rate": 4.2431565907402497e-07, + "logits/chosen": 518378837.3333333, + "logits/rejected": 497974976.0, + "logps/chosen": -236.7643025716146, + "logps/rejected": -367.43804931640625, + "loss": 0.0316, + "rewards/chosen": 4.486215591430664, + "rewards/margins": 12.6383638381958, + "rewards/rejected": -8.152148246765137, + "step": 9503 + }, + { + "epoch": 0.8683417085427135, + "grad_norm": 2.265625, + "kl": 0.0, + "learning_rate": 4.2373620890653057e-07, + "logits/chosen": 517219392.0, + "logits/rejected": 364155136.0, + "logps/chosen": -354.6632080078125, + "logps/rejected": -408.75860595703125, + "loss": 0.0136, + "rewards/chosen": 4.040175437927246, + "rewards/margins": 13.971158027648926, + "rewards/rejected": -9.93098258972168, + "step": 9504 + }, + { + "epoch": 0.8684330744632253, + "grad_norm": 52.75, + "kl": 0.0, + "learning_rate": 4.231571371536508e-07, + "logits/chosen": 478504768.0, + "logits/rejected": 790038186.6666666, + "logps/chosen": -98.49955749511719, + "logps/rejected": -555.9224853515625, + "loss": 0.0797, + "rewards/chosen": 2.2392349243164062, + "rewards/margins": 8.951950073242188, + "rewards/rejected": -6.712715148925781, + "step": 9505 + }, + { + "epoch": 0.8685244403837369, + "grad_norm": 1.9375, + "kl": 0.0, + "learning_rate": 4.225784438632713e-07, + "logits/chosen": 595347046.4, + "logits/rejected": 277551274.6666667, + "logps/chosen": -394.7020751953125, + "logps/rejected": -365.7989095052083, + "loss": 0.0138, + "rewards/chosen": 4.0264839172363285, + "rewards/margins": 11.254721069335938, + "rewards/rejected": -7.228237152099609, + "step": 9506 + }, + { + "epoch": 0.8686158063042485, + "grad_norm": 0.7265625, + "kl": 0.0, + "learning_rate": 4.2200012908324427e-07, + "logits/chosen": 536394176.0, + "logits/rejected": 1142677760.0, + "logps/chosen": -215.69044494628906, + "logps/rejected": -675.266357421875, + "loss": 0.0045, + "rewards/chosen": 4.784660339355469, + "rewards/margins": 15.020906448364258, + "rewards/rejected": -10.236246109008789, + "step": 9507 + }, + { + "epoch": 0.8687071722247601, + "grad_norm": 10.4375, + "kl": 0.0, + "learning_rate": 4.2142219286138987e-07, + "logits/chosen": 1024509440.0, + "logits/rejected": 559266304.0, + "logps/chosen": -353.0901184082031, + "logps/rejected": -391.348388671875, + "loss": 0.1281, + "rewards/chosen": 2.188260555267334, + "rewards/margins": 10.235902309417725, + "rewards/rejected": -8.04764175415039, + "step": 9508 + }, + { + "epoch": 0.8687985381452719, + "grad_norm": 3.78125, + "kl": 0.0, + "learning_rate": 4.2084463524549866e-07, + "logits/chosen": 782472960.0, + "logits/rejected": 524548480.0, + "logps/chosen": -374.6197102864583, + "logps/rejected": -480.85430908203125, + "loss": 0.0358, + "rewards/chosen": 3.8288745880126953, + "rewards/margins": 14.846309661865234, + "rewards/rejected": -11.017435073852539, + "step": 9509 + }, + { + "epoch": 0.8688899040657835, + "grad_norm": 0.7734375, + "kl": 0.0, + "learning_rate": 4.2026745628332863e-07, + "logits/chosen": 473450432.0, + "logits/rejected": 379910112.0, + "logps/chosen": -174.21392822265625, + "logps/rejected": -601.9659423828125, + "loss": 0.0055, + "rewards/chosen": 4.680550575256348, + "rewards/margins": 14.166707038879395, + "rewards/rejected": -9.486156463623047, + "step": 9510 + }, + { + "epoch": 0.8689812699862951, + "grad_norm": 67.5, + "kl": 0.0, + "learning_rate": 4.196906560226083e-07, + "logits/chosen": 743224012.8, + "logits/rejected": 537836629.3333334, + "logps/chosen": -419.89912109375, + "logps/rejected": -323.9557698567708, + "loss": 0.0808, + "rewards/chosen": 3.9858665466308594, + "rewards/margins": 9.609689712524414, + "rewards/rejected": -5.623823165893555, + "step": 9511 + }, + { + "epoch": 0.8690726359068067, + "grad_norm": 3.328125, + "kl": 0.0, + "learning_rate": 4.1911423451103383e-07, + "logits/chosen": 293497258.6666667, + "logits/rejected": 365343769.6, + "logps/chosen": -387.1732584635417, + "logps/rejected": -324.051416015625, + "loss": 0.0162, + "rewards/chosen": 3.1332263946533203, + "rewards/margins": 11.951531600952148, + "rewards/rejected": -8.818305206298827, + "step": 9512 + }, + { + "epoch": 0.8691640018273185, + "grad_norm": 2.484375, + "kl": 0.0, + "learning_rate": 4.185381917962689e-07, + "logits/chosen": 463741747.2, + "logits/rejected": 662859605.3333334, + "logps/chosen": -190.7089599609375, + "logps/rejected": -479.6795247395833, + "loss": 0.0272, + "rewards/chosen": 3.257323455810547, + "rewards/margins": 11.445372772216796, + "rewards/rejected": -8.18804931640625, + "step": 9513 + }, + { + "epoch": 0.8692553677478301, + "grad_norm": 2.390625, + "kl": 0.0, + "learning_rate": 4.179625279259475e-07, + "logits/chosen": 585427413.3333334, + "logits/rejected": 1223390464.0, + "logps/chosen": -311.3872477213542, + "logps/rejected": -493.49920654296875, + "loss": 0.0194, + "rewards/chosen": 4.295352935791016, + "rewards/margins": 14.121639251708984, + "rewards/rejected": -9.826286315917969, + "step": 9514 + }, + { + "epoch": 0.8693467336683417, + "grad_norm": 1.7265625, + "kl": 0.0, + "learning_rate": 4.1738724294767164e-07, + "logits/chosen": 419641514.6666667, + "logits/rejected": 533767782.4, + "logps/chosen": -302.68178304036456, + "logps/rejected": -538.86826171875, + "loss": 0.0097, + "rewards/chosen": 4.496851285298665, + "rewards/margins": 14.817703946431479, + "rewards/rejected": -10.320852661132813, + "step": 9515 + }, + { + "epoch": 0.8694380995888533, + "grad_norm": 3.875, + "kl": 0.0, + "learning_rate": 4.1681233690901155e-07, + "logits/chosen": 1343967232.0, + "logits/rejected": 646584217.6, + "logps/chosen": -390.9170328776042, + "logps/rejected": -466.959228515625, + "loss": 0.0156, + "rewards/chosen": 3.5941244761149087, + "rewards/margins": 12.477614466349284, + "rewards/rejected": -8.883489990234375, + "step": 9516 + }, + { + "epoch": 0.869529465509365, + "grad_norm": 2.734375, + "kl": 1.447296142578125, + "learning_rate": 4.162378098575065e-07, + "logits/chosen": 842960896.0, + "logits/rejected": 556703232.0, + "logps/chosen": -345.9439290364583, + "logps/rejected": -370.92388916015625, + "loss": 0.0238, + "rewards/chosen": 4.014907519022624, + "rewards/margins": 14.043677965799969, + "rewards/rejected": -10.028770446777344, + "step": 9517 + }, + { + "epoch": 0.8696208314298767, + "grad_norm": 1.3671875, + "kl": 0.0, + "learning_rate": 4.1566366184066564e-07, + "logits/chosen": 519232682.6666667, + "logits/rejected": 516418662.4, + "logps/chosen": -466.9659423828125, + "logps/rejected": -455.7146484375, + "loss": 0.0092, + "rewards/chosen": 3.897740681966146, + "rewards/margins": 11.671268208821616, + "rewards/rejected": -7.773527526855469, + "step": 9518 + }, + { + "epoch": 0.8697121973503883, + "grad_norm": 8.875, + "kl": 0.0, + "learning_rate": 4.1508989290596545e-07, + "logits/chosen": 557573120.0, + "logits/rejected": 347557888.0, + "logps/chosen": -429.9757486979167, + "logps/rejected": -393.891357421875, + "loss": 0.0227, + "rewards/chosen": 3.316648483276367, + "rewards/margins": 9.872294998168945, + "rewards/rejected": -6.555646514892578, + "step": 9519 + }, + { + "epoch": 0.8698035632708999, + "grad_norm": 3.375, + "kl": 0.0, + "learning_rate": 4.1451650310085076e-07, + "logits/chosen": 560514048.0, + "logits/rejected": 922723904.0, + "logps/chosen": -302.3337707519531, + "logps/rejected": -340.4126892089844, + "loss": 0.0249, + "rewards/chosen": 2.9909415245056152, + "rewards/margins": 11.400615215301514, + "rewards/rejected": -8.409673690795898, + "step": 9520 + }, + { + "epoch": 0.8698949291914116, + "grad_norm": 4.96875, + "kl": 4.288505554199219, + "learning_rate": 4.139434924727359e-07, + "logits/chosen": 722296393.1428572, + "logits/rejected": 294112128.0, + "logps/chosen": -500.35518973214283, + "logps/rejected": -443.3950500488281, + "loss": 0.033, + "rewards/chosen": 3.8463497161865234, + "rewards/margins": 14.749356269836426, + "rewards/rejected": -10.903006553649902, + "step": 9521 + }, + { + "epoch": 0.8699862951119233, + "grad_norm": 22.25, + "kl": 0.0, + "learning_rate": 4.133708610690024e-07, + "logits/chosen": 726404403.2, + "logits/rejected": 385783253.3333333, + "logps/chosen": -250.159716796875, + "logps/rejected": -409.903076171875, + "loss": 0.0427, + "rewards/chosen": 3.0654090881347655, + "rewards/margins": 10.218416595458985, + "rewards/rejected": -7.153007507324219, + "step": 9522 + }, + { + "epoch": 0.8700776610324349, + "grad_norm": 1.3984375, + "kl": 0.0, + "learning_rate": 4.1279860893700343e-07, + "logits/chosen": 452041280.0, + "logits/rejected": 369900373.3333333, + "logps/chosen": -378.653564453125, + "logps/rejected": -350.3969319661458, + "loss": 0.0066, + "rewards/chosen": 4.263542175292969, + "rewards/margins": 12.814581553141275, + "rewards/rejected": -8.551039377848307, + "step": 9523 + }, + { + "epoch": 0.8701690269529465, + "grad_norm": 3.828125, + "kl": 0.0, + "learning_rate": 4.122267361240584e-07, + "logits/chosen": 387437516.8, + "logits/rejected": 264118656.0, + "logps/chosen": -229.711474609375, + "logps/rejected": -300.5005696614583, + "loss": 0.0262, + "rewards/chosen": 3.4508800506591797, + "rewards/margins": 11.663501739501953, + "rewards/rejected": -8.212621688842773, + "step": 9524 + }, + { + "epoch": 0.8702603928734582, + "grad_norm": 3.640625, + "kl": 0.0, + "learning_rate": 4.116552426774556e-07, + "logits/chosen": 1630813568.0, + "logits/rejected": 587189077.3333334, + "logps/chosen": -475.6390380859375, + "logps/rejected": -469.8883463541667, + "loss": 0.1118, + "rewards/chosen": 3.327281951904297, + "rewards/margins": 10.11945406595866, + "rewards/rejected": -6.792172114054362, + "step": 9525 + }, + { + "epoch": 0.8703517587939699, + "grad_norm": 7.71875, + "kl": 0.0, + "learning_rate": 4.1108412864445167e-07, + "logits/chosen": 290402496.0, + "logits/rejected": 539968409.6, + "logps/chosen": -183.4181925455729, + "logps/rejected": -345.2063232421875, + "loss": 0.047, + "rewards/chosen": 3.0962823232014975, + "rewards/margins": 9.21093381245931, + "rewards/rejected": -6.114651489257812, + "step": 9526 + }, + { + "epoch": 0.8704431247144815, + "grad_norm": 3.609375, + "kl": 0.0, + "learning_rate": 4.105133940722733e-07, + "logits/chosen": 784422741.3333334, + "logits/rejected": 609818624.0, + "logps/chosen": -213.9950968424479, + "logps/rejected": -811.4573974609375, + "loss": 0.0282, + "rewards/chosen": 3.7866503397623696, + "rewards/margins": 18.450984636942547, + "rewards/rejected": -14.664334297180176, + "step": 9527 + }, + { + "epoch": 0.8705344906349931, + "grad_norm": 5.34375, + "kl": 0.0, + "learning_rate": 4.0994303900811494e-07, + "logits/chosen": 512652608.0, + "logits/rejected": 368635050.6666667, + "logps/chosen": -561.35400390625, + "logps/rejected": -466.0533447265625, + "loss": 0.0171, + "rewards/chosen": 2.6582977771759033, + "rewards/margins": 11.793145418167114, + "rewards/rejected": -9.134847640991211, + "step": 9528 + }, + { + "epoch": 0.8706258565555048, + "grad_norm": 0.318359375, + "kl": 0.0, + "learning_rate": 4.0937306349913996e-07, + "logits/chosen": 182545456.0, + "logits/rejected": 538252373.3333334, + "logps/chosen": -177.77639770507812, + "logps/rejected": -589.7742513020834, + "loss": 0.0015, + "rewards/chosen": 5.335360050201416, + "rewards/margins": 16.0802903175354, + "rewards/rejected": -10.744930267333984, + "step": 9529 + }, + { + "epoch": 0.8707172224760165, + "grad_norm": 2.609375, + "kl": 0.0, + "learning_rate": 4.0880346759247904e-07, + "logits/chosen": 533400746.6666667, + "logits/rejected": 605461248.0, + "logps/chosen": -369.980224609375, + "logps/rejected": -613.6970703125, + "loss": 0.0142, + "rewards/chosen": 3.548639933268229, + "rewards/margins": 12.295337168375651, + "rewards/rejected": -8.746697235107423, + "step": 9530 + }, + { + "epoch": 0.8708085883965281, + "grad_norm": 2.4375, + "kl": 0.0, + "learning_rate": 4.082342513352322e-07, + "logits/chosen": 484810624.0, + "logits/rejected": 455058720.0, + "logps/chosen": -288.2179361979167, + "logps/rejected": -512.508544921875, + "loss": 0.0179, + "rewards/chosen": 4.211508433024089, + "rewards/margins": 13.058698336283367, + "rewards/rejected": -8.847189903259277, + "step": 9531 + }, + { + "epoch": 0.8708999543170397, + "grad_norm": 16.75, + "kl": 0.0, + "learning_rate": 4.0766541477446963e-07, + "logits/chosen": 649826090.6666666, + "logits/rejected": 1140973568.0, + "logps/chosen": -315.01930745442706, + "logps/rejected": -510.074853515625, + "loss": 0.0251, + "rewards/chosen": 4.445687611897786, + "rewards/margins": 11.539767201741537, + "rewards/rejected": -7.09407958984375, + "step": 9532 + }, + { + "epoch": 0.8709913202375514, + "grad_norm": 3.421875, + "kl": 0.0, + "learning_rate": 4.070969579572287e-07, + "logits/chosen": 629709824.0, + "logits/rejected": 464112554.6666667, + "logps/chosen": -396.969287109375, + "logps/rejected": -441.9149576822917, + "loss": 0.0226, + "rewards/chosen": 3.5374683380126952, + "rewards/margins": 12.980187479654948, + "rewards/rejected": -9.442719141642252, + "step": 9533 + }, + { + "epoch": 0.8710826861580631, + "grad_norm": 20.875, + "kl": 0.0, + "learning_rate": 4.0652888093051513e-07, + "logits/chosen": 468300390.4, + "logits/rejected": 403225941.3333333, + "logps/chosen": -209.30673828125, + "logps/rejected": -217.0009765625, + "loss": 0.0381, + "rewards/chosen": 3.939362335205078, + "rewards/margins": 9.492588806152344, + "rewards/rejected": -5.553226470947266, + "step": 9534 + }, + { + "epoch": 0.8711740520785747, + "grad_norm": 2.921875, + "kl": 0.0, + "learning_rate": 4.05961183741303e-07, + "logits/chosen": 331387840.0, + "logits/rejected": 414278144.0, + "logps/chosen": -242.02032470703125, + "logps/rejected": -441.8865559895833, + "loss": 0.0113, + "rewards/chosen": 3.363779067993164, + "rewards/margins": 10.831480026245117, + "rewards/rejected": -7.467700958251953, + "step": 9535 + }, + { + "epoch": 0.8712654179990863, + "grad_norm": 4.09375, + "kl": 0.0, + "learning_rate": 4.0539386643653643e-07, + "logits/chosen": 414849621.3333333, + "logits/rejected": 251211232.0, + "logps/chosen": -293.01177978515625, + "logps/rejected": -435.94683837890625, + "loss": 0.0284, + "rewards/chosen": 3.8124160766601562, + "rewards/margins": 14.965794563293457, + "rewards/rejected": -11.1533784866333, + "step": 9536 + }, + { + "epoch": 0.871356783919598, + "grad_norm": 1.984375, + "kl": 0.0, + "learning_rate": 4.048269290631279e-07, + "logits/chosen": 693968640.0, + "logits/rejected": 463534848.0, + "logps/chosen": -415.841015625, + "logps/rejected": -616.7843831380209, + "loss": 0.0154, + "rewards/chosen": 3.7624885559082033, + "rewards/margins": 13.779439798990886, + "rewards/rejected": -10.016951243082682, + "step": 9537 + }, + { + "epoch": 0.8714481498401097, + "grad_norm": 1.609375, + "kl": 0.0, + "learning_rate": 4.042603716679566e-07, + "logits/chosen": 404208896.0, + "logits/rejected": 199895424.0, + "logps/chosen": -256.6825256347656, + "logps/rejected": -259.2069091796875, + "loss": 0.0083, + "rewards/chosen": 4.461849689483643, + "rewards/margins": 13.435591220855713, + "rewards/rejected": -8.97374153137207, + "step": 9538 + }, + { + "epoch": 0.8715395157606213, + "grad_norm": 3.15625, + "kl": 0.0, + "learning_rate": 4.036941942978717e-07, + "logits/chosen": 308703402.6666667, + "logits/rejected": 523488768.0, + "logps/chosen": -263.21201578776044, + "logps/rejected": -557.96533203125, + "loss": 0.0165, + "rewards/chosen": 3.4309412638346353, + "rewards/margins": 12.557177225748697, + "rewards/rejected": -9.126235961914062, + "step": 9539 + }, + { + "epoch": 0.8716308816811329, + "grad_norm": 1.125, + "kl": 0.0, + "learning_rate": 4.031283969996919e-07, + "logits/chosen": 813608618.6666666, + "logits/rejected": 536451123.2, + "logps/chosen": -524.5326334635416, + "logps/rejected": -419.1576171875, + "loss": 0.0064, + "rewards/chosen": 4.490902582804362, + "rewards/margins": 12.856164423624676, + "rewards/rejected": -8.365261840820313, + "step": 9540 + }, + { + "epoch": 0.8717222476016446, + "grad_norm": 0.6796875, + "kl": 0.0, + "learning_rate": 4.02562979820203e-07, + "logits/chosen": 391549235.2, + "logits/rejected": 501153664.0, + "logps/chosen": -212.2824951171875, + "logps/rejected": -621.7320963541666, + "loss": 0.1269, + "rewards/chosen": 3.3612319946289064, + "rewards/margins": 13.91382204691569, + "rewards/rejected": -10.552590052286783, + "step": 9541 + }, + { + "epoch": 0.8718136135221562, + "grad_norm": 2.390625, + "kl": 0.0, + "learning_rate": 4.0199794280615934e-07, + "logits/chosen": 497186713.6, + "logits/rejected": 594905472.0, + "logps/chosen": -260.9804443359375, + "logps/rejected": -590.080322265625, + "loss": 0.0181, + "rewards/chosen": 3.804875946044922, + "rewards/margins": 13.351960118611654, + "rewards/rejected": -9.547084172566732, + "step": 9542 + }, + { + "epoch": 0.8719049794426679, + "grad_norm": 4.40625, + "kl": 0.0, + "learning_rate": 4.01433286004283e-07, + "logits/chosen": 497627392.0, + "logits/rejected": 491059029.3333333, + "logps/chosen": -378.97991943359375, + "logps/rejected": -422.899169921875, + "loss": 0.1219, + "rewards/chosen": 3.7349319458007812, + "rewards/margins": 10.023980458577473, + "rewards/rejected": -6.289048512776692, + "step": 9543 + }, + { + "epoch": 0.8719963453631795, + "grad_norm": 7.0, + "kl": 0.0, + "learning_rate": 4.0086900946126886e-07, + "logits/chosen": 480507040.0, + "logits/rejected": 291662037.3333333, + "logps/chosen": -250.02635192871094, + "logps/rejected": -321.187744140625, + "loss": 0.0366, + "rewards/chosen": 2.639669895172119, + "rewards/margins": 10.360114256540935, + "rewards/rejected": -7.720444361368815, + "step": 9544 + }, + { + "epoch": 0.8720877112836912, + "grad_norm": 64.0, + "kl": 0.0, + "learning_rate": 4.0030511322377506e-07, + "logits/chosen": 703121280.0, + "logits/rejected": 1030325248.0, + "logps/chosen": -449.7880045572917, + "logps/rejected": -301.22210693359375, + "loss": 0.0531, + "rewards/chosen": 3.1423956553141275, + "rewards/margins": 9.761011759440104, + "rewards/rejected": -6.618616104125977, + "step": 9545 + }, + { + "epoch": 0.8721790772042028, + "grad_norm": 19.375, + "kl": 0.0, + "learning_rate": 3.997415973384311e-07, + "logits/chosen": 605689130.6666666, + "logits/rejected": 751047270.4, + "logps/chosen": -385.1597493489583, + "logps/rejected": -479.512109375, + "loss": 0.0361, + "rewards/chosen": 3.65665594736735, + "rewards/margins": 11.891617647806802, + "rewards/rejected": -8.234961700439452, + "step": 9546 + }, + { + "epoch": 0.8722704431247145, + "grad_norm": 2.15625, + "kl": 0.0, + "learning_rate": 3.991784618518346e-07, + "logits/chosen": 605703253.3333334, + "logits/rejected": 500080896.0, + "logps/chosen": -193.82666015625, + "logps/rejected": -580.586181640625, + "loss": 0.02, + "rewards/chosen": 3.385570208231608, + "rewards/margins": 14.324804369608561, + "rewards/rejected": -10.939234161376953, + "step": 9547 + }, + { + "epoch": 0.8723618090452261, + "grad_norm": 4.0, + "kl": 0.0, + "learning_rate": 3.986157068105523e-07, + "logits/chosen": 313767731.2, + "logits/rejected": 349219968.0, + "logps/chosen": -243.22763671875, + "logps/rejected": -561.2286783854166, + "loss": 0.0311, + "rewards/chosen": 3.725122833251953, + "rewards/margins": 14.247587585449219, + "rewards/rejected": -10.522464752197266, + "step": 9548 + }, + { + "epoch": 0.8724531749657378, + "grad_norm": 2.890625, + "kl": 0.0, + "learning_rate": 3.98053332261118e-07, + "logits/chosen": 368021920.0, + "logits/rejected": 493360981.3333333, + "logps/chosen": -312.4979553222656, + "logps/rejected": -645.5378824869791, + "loss": 0.0146, + "rewards/chosen": 2.857766628265381, + "rewards/margins": 12.702233791351318, + "rewards/rejected": -9.844467163085938, + "step": 9549 + }, + { + "epoch": 0.8725445408862494, + "grad_norm": 2.546875, + "kl": 0.0, + "learning_rate": 3.9749133825003505e-07, + "logits/chosen": 756148053.3333334, + "logits/rejected": 430608000.0, + "logps/chosen": -268.6628011067708, + "logps/rejected": -508.662109375, + "loss": 0.0144, + "rewards/chosen": 4.79088560740153, + "rewards/margins": 16.868285814921062, + "rewards/rejected": -12.077400207519531, + "step": 9550 + }, + { + "epoch": 0.8726359068067611, + "grad_norm": 34.0, + "kl": 0.0, + "learning_rate": 3.969297248237758e-07, + "logits/chosen": 589934144.0, + "logits/rejected": 301263040.0, + "logps/chosen": -283.54736328125, + "logps/rejected": -460.6565755208333, + "loss": 0.0873, + "rewards/chosen": 2.1534361839294434, + "rewards/margins": 10.295014222462973, + "rewards/rejected": -8.14157803853353, + "step": 9551 + }, + { + "epoch": 0.8727272727272727, + "grad_norm": 1.7421875, + "kl": 0.0, + "learning_rate": 3.963684920287786e-07, + "logits/chosen": 592321920.0, + "logits/rejected": 437887712.0, + "logps/chosen": -249.57452392578125, + "logps/rejected": -528.2876586914062, + "loss": 0.0156, + "rewards/chosen": 3.818005084991455, + "rewards/margins": 15.453603267669678, + "rewards/rejected": -11.635598182678223, + "step": 9552 + }, + { + "epoch": 0.8728186386477844, + "grad_norm": 2.0, + "kl": 0.0, + "learning_rate": 3.958076399114547e-07, + "logits/chosen": 386797504.0, + "logits/rejected": 298955584.0, + "logps/chosen": -323.17535400390625, + "logps/rejected": -391.7462463378906, + "loss": 0.0112, + "rewards/chosen": 3.9569497108459473, + "rewards/margins": 13.041269779205322, + "rewards/rejected": -9.084320068359375, + "step": 9553 + }, + { + "epoch": 0.872910004568296, + "grad_norm": 10.75, + "kl": 0.0, + "learning_rate": 3.952471685181797e-07, + "logits/chosen": 665178726.4, + "logits/rejected": 856502954.6666666, + "logps/chosen": -396.9289794921875, + "logps/rejected": -412.1908365885417, + "loss": 0.0263, + "rewards/chosen": 3.5525348663330076, + "rewards/margins": 10.843982187906901, + "rewards/rejected": -7.2914473215738935, + "step": 9554 + }, + { + "epoch": 0.8730013704888077, + "grad_norm": 2.4375, + "kl": 0.0, + "learning_rate": 3.9468707789530047e-07, + "logits/chosen": 557202636.8, + "logits/rejected": 596411136.0, + "logps/chosen": -312.9564697265625, + "logps/rejected": -603.6996256510416, + "loss": 0.0217, + "rewards/chosen": 3.5997276306152344, + "rewards/margins": 11.519690831502277, + "rewards/rejected": -7.919963200887044, + "step": 9555 + }, + { + "epoch": 0.8730927364093193, + "grad_norm": 4.40625, + "kl": 0.0, + "learning_rate": 3.9412736808912944e-07, + "logits/chosen": 646683776.0, + "logits/rejected": 431093589.3333333, + "logps/chosen": -426.7672424316406, + "logps/rejected": -382.6730143229167, + "loss": 0.0988, + "rewards/chosen": 3.7217941284179688, + "rewards/margins": 11.349333445231121, + "rewards/rejected": -7.627539316813151, + "step": 9556 + }, + { + "epoch": 0.873184102329831, + "grad_norm": 1.28125, + "kl": 0.0, + "learning_rate": 3.9356803914595175e-07, + "logits/chosen": 665674666.6666666, + "logits/rejected": 317198720.0, + "logps/chosen": -628.281982421875, + "logps/rejected": -364.3909912109375, + "loss": 0.0049, + "rewards/chosen": 4.758324940999349, + "rewards/margins": 14.237101491292318, + "rewards/rejected": -9.478776550292968, + "step": 9557 + }, + { + "epoch": 0.8732754682503426, + "grad_norm": 1.40625, + "kl": 0.0, + "learning_rate": 3.930090911120177e-07, + "logits/chosen": 309768000.0, + "logits/rejected": 356239189.3333333, + "logps/chosen": -149.81365966796875, + "logps/rejected": -469.4710286458333, + "loss": 0.0064, + "rewards/chosen": 4.031515121459961, + "rewards/margins": 12.576790491739908, + "rewards/rejected": -8.545275370279947, + "step": 9558 + }, + { + "epoch": 0.8733668341708543, + "grad_norm": 7.25, + "kl": 0.0, + "learning_rate": 3.924505240335469e-07, + "logits/chosen": 435481395.2, + "logits/rejected": 639667029.3333334, + "logps/chosen": -340.3327392578125, + "logps/rejected": -621.6037190755209, + "loss": 0.0374, + "rewards/chosen": 3.4451980590820312, + "rewards/margins": 13.847379048665365, + "rewards/rejected": -10.402180989583334, + "step": 9559 + }, + { + "epoch": 0.8734582000913659, + "grad_norm": 3.671875, + "kl": 0.0, + "learning_rate": 3.918923379567274e-07, + "logits/chosen": 500991317.3333333, + "logits/rejected": 495911456.0, + "logps/chosen": -349.2081705729167, + "logps/rejected": -598.7305297851562, + "loss": 0.0281, + "rewards/chosen": 3.4978866577148438, + "rewards/margins": 12.523153305053711, + "rewards/rejected": -9.025266647338867, + "step": 9560 + }, + { + "epoch": 0.8735495660118776, + "grad_norm": 2.703125, + "kl": 0.0, + "learning_rate": 3.9133453292771683e-07, + "logits/chosen": 668822976.0, + "logits/rejected": 394623648.0, + "logps/chosen": -388.5078125, + "logps/rejected": -468.80126953125, + "loss": 0.0166, + "rewards/chosen": 3.7471041679382324, + "rewards/margins": 11.675251007080078, + "rewards/rejected": -7.928146839141846, + "step": 9561 + }, + { + "epoch": 0.8736409319323892, + "grad_norm": 2.984375, + "kl": 0.0, + "learning_rate": 3.9077710899264045e-07, + "logits/chosen": 565132236.8, + "logits/rejected": 477328554.6666667, + "logps/chosen": -303.9441650390625, + "logps/rejected": -486.4532063802083, + "loss": 0.0254, + "rewards/chosen": 3.3513317108154297, + "rewards/margins": 11.666257858276367, + "rewards/rejected": -8.314926147460938, + "step": 9562 + }, + { + "epoch": 0.8737322978529009, + "grad_norm": 5.34375, + "kl": 0.0, + "learning_rate": 3.902200661975919e-07, + "logits/chosen": 415048533.3333333, + "logits/rejected": 558379520.0, + "logps/chosen": -491.1695963541667, + "logps/rejected": -557.4734497070312, + "loss": 0.0251, + "rewards/chosen": 3.7627989451090493, + "rewards/margins": 13.185301462809244, + "rewards/rejected": -9.422502517700195, + "step": 9563 + }, + { + "epoch": 0.8738236637734125, + "grad_norm": 3.359375, + "kl": 0.0, + "learning_rate": 3.896634045886333e-07, + "logits/chosen": 727404748.8, + "logits/rejected": 474538581.3333333, + "logps/chosen": -251.124560546875, + "logps/rejected": -399.7715657552083, + "loss": 0.0226, + "rewards/chosen": 4.043608474731445, + "rewards/margins": 11.860761133829753, + "rewards/rejected": -7.817152659098308, + "step": 9564 + }, + { + "epoch": 0.8739150296939242, + "grad_norm": 6.5, + "kl": 0.0, + "learning_rate": 3.891071242117944e-07, + "logits/chosen": 1114956800.0, + "logits/rejected": 599349196.8, + "logps/chosen": -266.78904215494794, + "logps/rejected": -490.421875, + "loss": 0.0376, + "rewards/chosen": 2.4813919067382812, + "rewards/margins": 12.515266418457031, + "rewards/rejected": -10.03387451171875, + "step": 9565 + }, + { + "epoch": 0.8740063956144358, + "grad_norm": 5.15625, + "kl": 0.0, + "learning_rate": 3.885512251130763e-07, + "logits/chosen": 520836169.14285713, + "logits/rejected": 590382208.0, + "logps/chosen": -494.1049107142857, + "logps/rejected": -745.9415283203125, + "loss": 0.0334, + "rewards/chosen": 3.8587635585239957, + "rewards/margins": 13.339544160025461, + "rewards/rejected": -9.480780601501465, + "step": 9566 + }, + { + "epoch": 0.8740977615349474, + "grad_norm": 4.03125, + "kl": 0.0, + "learning_rate": 3.879957073384466e-07, + "logits/chosen": 476934195.2, + "logits/rejected": 477441280.0, + "logps/chosen": -311.7975830078125, + "logps/rejected": -400.053466796875, + "loss": 0.0235, + "rewards/chosen": 3.82828369140625, + "rewards/margins": 11.831550343831381, + "rewards/rejected": -8.00326665242513, + "step": 9567 + }, + { + "epoch": 0.8741891274554591, + "grad_norm": 2.421875, + "kl": 0.0, + "learning_rate": 3.8744057093383914e-07, + "logits/chosen": 481602662.4, + "logits/rejected": 370079701.3333333, + "logps/chosen": -320.78095703125, + "logps/rejected": -421.422607421875, + "loss": 0.0146, + "rewards/chosen": 4.077386474609375, + "rewards/margins": 11.30473543802897, + "rewards/rejected": -7.227348963419597, + "step": 9568 + }, + { + "epoch": 0.8742804933759708, + "grad_norm": 1.28125, + "kl": 0.0, + "learning_rate": 3.868858159451611e-07, + "logits/chosen": 577163840.0, + "logits/rejected": 1021251840.0, + "logps/chosen": -287.50982666015625, + "logps/rejected": -565.2899169921875, + "loss": 0.007, + "rewards/chosen": 4.605652809143066, + "rewards/margins": 13.954473495483398, + "rewards/rejected": -9.348820686340332, + "step": 9569 + }, + { + "epoch": 0.8743718592964824, + "grad_norm": 1.203125, + "kl": 0.0, + "learning_rate": 3.863314424182846e-07, + "logits/chosen": 820202752.0, + "logits/rejected": 510401920.0, + "logps/chosen": -365.66119384765625, + "logps/rejected": -430.9990234375, + "loss": 0.1244, + "rewards/chosen": 0.8683357238769531, + "rewards/margins": 9.571594874064127, + "rewards/rejected": -8.703259150187174, + "step": 9570 + }, + { + "epoch": 0.874463225216994, + "grad_norm": 1.8828125, + "kl": 0.0, + "learning_rate": 3.857774503990513e-07, + "logits/chosen": 620616396.8, + "logits/rejected": 764051285.3333334, + "logps/chosen": -314.32802734375, + "logps/rejected": -636.5999755859375, + "loss": 0.026, + "rewards/chosen": 3.646501922607422, + "rewards/margins": 12.387539927164713, + "rewards/rejected": -8.741038004557291, + "step": 9571 + }, + { + "epoch": 0.8745545911375057, + "grad_norm": 2.234375, + "kl": 0.0, + "learning_rate": 3.8522383993327086e-07, + "logits/chosen": 856095680.0, + "logits/rejected": 599761536.0, + "logps/chosen": -455.9179992675781, + "logps/rejected": -779.619140625, + "loss": 0.0121, + "rewards/chosen": 3.9017136096954346, + "rewards/margins": 14.912245035171509, + "rewards/rejected": -11.010531425476074, + "step": 9572 + }, + { + "epoch": 0.8746459570580174, + "grad_norm": 54.5, + "kl": 0.0, + "learning_rate": 3.8467061106672143e-07, + "logits/chosen": 280683690.6666667, + "logits/rejected": 441583001.6, + "logps/chosen": -154.1743367513021, + "logps/rejected": -457.001416015625, + "loss": 0.0741, + "rewards/chosen": 3.5091708501180015, + "rewards/margins": 12.576188977559408, + "rewards/rejected": -9.067018127441406, + "step": 9573 + }, + { + "epoch": 0.874737322978529, + "grad_norm": 2.890625, + "kl": 0.0, + "learning_rate": 3.8411776384515154e-07, + "logits/chosen": 294331968.0, + "logits/rejected": 566131200.0, + "logps/chosen": -483.23138427734375, + "logps/rejected": -462.7843017578125, + "loss": 0.0115, + "rewards/chosen": 4.115814208984375, + "rewards/margins": 14.075027465820312, + "rewards/rejected": -9.959213256835938, + "step": 9574 + }, + { + "epoch": 0.8748286888990406, + "grad_norm": 2.625, + "kl": 0.0, + "learning_rate": 3.8356529831427514e-07, + "logits/chosen": 559462604.8, + "logits/rejected": 417321472.0, + "logps/chosen": -326.183349609375, + "logps/rejected": -504.0739339192708, + "loss": 0.0178, + "rewards/chosen": 4.043017196655273, + "rewards/margins": 12.965067672729493, + "rewards/rejected": -8.922050476074219, + "step": 9575 + }, + { + "epoch": 0.8749200548195523, + "grad_norm": 2.59375, + "kl": 0.0, + "learning_rate": 3.8301321451977626e-07, + "logits/chosen": 364529305.6, + "logits/rejected": 475659989.3333333, + "logps/chosen": -301.444677734375, + "logps/rejected": -604.464599609375, + "loss": 0.0138, + "rewards/chosen": 4.230851745605468, + "rewards/margins": 15.970442708333334, + "rewards/rejected": -11.739590962727865, + "step": 9576 + }, + { + "epoch": 0.875011420740064, + "grad_norm": 2.0625, + "kl": 0.0, + "learning_rate": 3.824615125073067e-07, + "logits/chosen": 545705472.0, + "logits/rejected": 535693760.0, + "logps/chosen": -291.7200927734375, + "logps/rejected": -251.6436767578125, + "loss": 0.0162, + "rewards/chosen": 4.3234357833862305, + "rewards/margins": 12.156323432922363, + "rewards/rejected": -7.832887649536133, + "step": 9577 + }, + { + "epoch": 0.8751027866605756, + "grad_norm": 2.375, + "kl": 0.0, + "learning_rate": 3.819101923224877e-07, + "logits/chosen": 872928960.0, + "logits/rejected": 648021504.0, + "logps/chosen": -320.1856689453125, + "logps/rejected": -487.19580078125, + "loss": 0.0179, + "rewards/chosen": 3.382028102874756, + "rewards/margins": 13.211853504180908, + "rewards/rejected": -9.829825401306152, + "step": 9578 + }, + { + "epoch": 0.8751941525810872, + "grad_norm": 0.671875, + "kl": 0.0, + "learning_rate": 3.81359254010909e-07, + "logits/chosen": 1286076672.0, + "logits/rejected": 793627084.8, + "logps/chosen": -446.3841959635417, + "logps/rejected": -438.362158203125, + "loss": 0.004, + "rewards/chosen": 4.684717178344727, + "rewards/margins": 12.70716438293457, + "rewards/rejected": -8.022447204589843, + "step": 9579 + }, + { + "epoch": 0.8752855185015989, + "grad_norm": 2.5625, + "kl": 0.0, + "learning_rate": 3.8080869761812745e-07, + "logits/chosen": 490791628.8, + "logits/rejected": 410616064.0, + "logps/chosen": -438.32216796875, + "logps/rejected": -451.0045979817708, + "loss": 0.0176, + "rewards/chosen": 4.037462615966797, + "rewards/margins": 11.523106638590495, + "rewards/rejected": -7.485644022623698, + "step": 9580 + }, + { + "epoch": 0.8753768844221106, + "grad_norm": 2.90625, + "kl": 0.0, + "learning_rate": 3.802585231896672e-07, + "logits/chosen": 449544806.4, + "logits/rejected": 530388864.0, + "logps/chosen": -406.615185546875, + "logps/rejected": -616.2152506510416, + "loss": 0.0195, + "rewards/chosen": 3.9019813537597656, + "rewards/margins": 13.37707773844401, + "rewards/rejected": -9.475096384684244, + "step": 9581 + }, + { + "epoch": 0.8754682503426222, + "grad_norm": 2.859375, + "kl": 0.0, + "learning_rate": 3.7970873077102567e-07, + "logits/chosen": 580433664.0, + "logits/rejected": 282143744.0, + "logps/chosen": -346.14822823660717, + "logps/rejected": -409.2518310546875, + "loss": 0.02, + "rewards/chosen": 4.245672498430524, + "rewards/margins": 13.122556005205427, + "rewards/rejected": -8.876883506774902, + "step": 9582 + }, + { + "epoch": 0.8755596162631338, + "grad_norm": 2.5, + "kl": 0.0, + "learning_rate": 3.7915932040766434e-07, + "logits/chosen": 1698867840.0, + "logits/rejected": 1108160512.0, + "logps/chosen": -366.958251953125, + "logps/rejected": -493.0760904947917, + "loss": 0.0102, + "rewards/chosen": 3.464712619781494, + "rewards/margins": 12.835769494374594, + "rewards/rejected": -9.3710568745931, + "step": 9583 + }, + { + "epoch": 0.8756509821836455, + "grad_norm": 2.28125, + "kl": 0.0, + "learning_rate": 3.7861029214501355e-07, + "logits/chosen": 461201510.4, + "logits/rejected": 532835413.3333333, + "logps/chosen": -285.6671630859375, + "logps/rejected": -608.12109375, + "loss": 0.014, + "rewards/chosen": 4.284966278076172, + "rewards/margins": 13.43106600443522, + "rewards/rejected": -9.146099726359049, + "step": 9584 + }, + { + "epoch": 0.8757423481041572, + "grad_norm": 7.8125, + "kl": 0.0, + "learning_rate": 3.7806164602847415e-07, + "logits/chosen": 601520085.3333334, + "logits/rejected": 878062796.8, + "logps/chosen": -353.1292317708333, + "logps/rejected": -535.760205078125, + "loss": 0.0131, + "rewards/chosen": 4.346385955810547, + "rewards/margins": 14.030098724365235, + "rewards/rejected": -9.683712768554688, + "step": 9585 + }, + { + "epoch": 0.8758337140246688, + "grad_norm": 4.03125, + "kl": 0.0, + "learning_rate": 3.7751338210341217e-07, + "logits/chosen": 503668064.0, + "logits/rejected": 485752832.0, + "logps/chosen": -351.14837646484375, + "logps/rejected": -494.4281921386719, + "loss": 0.0182, + "rewards/chosen": 3.7594847679138184, + "rewards/margins": 11.576079368591309, + "rewards/rejected": -7.81659460067749, + "step": 9586 + }, + { + "epoch": 0.8759250799451804, + "grad_norm": 1.359375, + "kl": 0.0, + "learning_rate": 3.769655004151657e-07, + "logits/chosen": 452619776.0, + "logits/rejected": 652454016.0, + "logps/chosen": -265.228759765625, + "logps/rejected": -376.68292236328125, + "loss": 0.0092, + "rewards/chosen": 4.405444145202637, + "rewards/margins": 13.323752403259277, + "rewards/rejected": -8.91830825805664, + "step": 9587 + }, + { + "epoch": 0.876016445865692, + "grad_norm": 2.703125, + "kl": 0.0, + "learning_rate": 3.7641800100903957e-07, + "logits/chosen": 440806809.6, + "logits/rejected": 386164864.0, + "logps/chosen": -277.0276123046875, + "logps/rejected": -656.036376953125, + "loss": 0.0167, + "rewards/chosen": 4.2616737365722654, + "rewards/margins": 14.321924336751302, + "rewards/rejected": -10.060250600179037, + "step": 9588 + }, + { + "epoch": 0.8761078117862038, + "grad_norm": 1.296875, + "kl": 0.0, + "learning_rate": 3.7587088393030604e-07, + "logits/chosen": 266275737.6, + "logits/rejected": 523893248.0, + "logps/chosen": -155.40941162109374, + "logps/rejected": -412.1803385416667, + "loss": 0.013, + "rewards/chosen": 4.058147430419922, + "rewards/margins": 10.960243225097656, + "rewards/rejected": -6.902095794677734, + "step": 9589 + }, + { + "epoch": 0.8761991777067154, + "grad_norm": 0.283203125, + "kl": 0.0, + "learning_rate": 3.7532414922420604e-07, + "logits/chosen": 263021136.0, + "logits/rejected": 445704192.0, + "logps/chosen": -346.7259521484375, + "logps/rejected": -380.80643136160717, + "loss": 0.0016, + "rewards/chosen": 5.584173679351807, + "rewards/margins": 13.533016409192767, + "rewards/rejected": -7.94884272984096, + "step": 9590 + }, + { + "epoch": 0.876290543627227, + "grad_norm": 1.890625, + "kl": 0.0, + "learning_rate": 3.7477779693595115e-07, + "logits/chosen": 537764416.0, + "logits/rejected": 664783616.0, + "logps/chosen": -332.1519775390625, + "logps/rejected": -756.0816650390625, + "loss": 0.0108, + "rewards/chosen": 4.05192756652832, + "rewards/margins": 15.944663047790527, + "rewards/rejected": -11.892735481262207, + "step": 9591 + }, + { + "epoch": 0.8763819095477386, + "grad_norm": 9.5, + "kl": 1.886336326599121, + "learning_rate": 3.7423182711071927e-07, + "logits/chosen": 473749650.28571427, + "logits/rejected": 437556288.0, + "logps/chosen": -303.3461216517857, + "logps/rejected": -402.3445739746094, + "loss": 0.0597, + "rewards/chosen": 2.958658218383789, + "rewards/margins": 10.192630290985107, + "rewards/rejected": -7.233972072601318, + "step": 9592 + }, + { + "epoch": 0.8764732754682504, + "grad_norm": 1.703125, + "kl": 0.0, + "learning_rate": 3.736862397936564e-07, + "logits/chosen": 532928921.6, + "logits/rejected": 715271594.6666666, + "logps/chosen": -338.45302734375, + "logps/rejected": -605.6555582682291, + "loss": 0.0099, + "rewards/chosen": 4.644356536865234, + "rewards/margins": 12.728395589192708, + "rewards/rejected": -8.084039052327475, + "step": 9593 + }, + { + "epoch": 0.876564641388762, + "grad_norm": 3.9375, + "kl": 0.0, + "learning_rate": 3.731410350298775e-07, + "logits/chosen": 447069632.0, + "logits/rejected": 369730720.0, + "logps/chosen": -205.06158447265625, + "logps/rejected": -441.26593017578125, + "loss": 0.1428, + "rewards/chosen": 1.291210651397705, + "rewards/margins": 10.560791492462158, + "rewards/rejected": -9.269580841064453, + "step": 9594 + }, + { + "epoch": 0.8766560073092736, + "grad_norm": 35.25, + "kl": 0.0, + "learning_rate": 3.725962128644661e-07, + "logits/chosen": 926602240.0, + "logits/rejected": 691567718.4, + "logps/chosen": -379.1863199869792, + "logps/rejected": -434.410009765625, + "loss": 0.039, + "rewards/chosen": 3.4798361460367837, + "rewards/margins": 11.91112429300944, + "rewards/rejected": -8.431288146972657, + "step": 9595 + }, + { + "epoch": 0.8767473732297852, + "grad_norm": 4.34375, + "kl": 0.42559814453125, + "learning_rate": 3.7205177334247445e-07, + "logits/chosen": 996086345.1428572, + "logits/rejected": 486744576.0, + "logps/chosen": -475.1874302455357, + "logps/rejected": -258.9960632324219, + "loss": 0.0365, + "rewards/chosen": 3.5237944466727122, + "rewards/margins": 9.473090921129499, + "rewards/rejected": -5.949296474456787, + "step": 9596 + }, + { + "epoch": 0.876838739150297, + "grad_norm": 2.171875, + "kl": 0.0, + "learning_rate": 3.71507716508922e-07, + "logits/chosen": 600175769.6, + "logits/rejected": 508999210.6666667, + "logps/chosen": -377.913330078125, + "logps/rejected": -472.0948893229167, + "loss": 0.0173, + "rewards/chosen": 3.7967208862304687, + "rewards/margins": 13.680938466389975, + "rewards/rejected": -9.884217580159506, + "step": 9597 + }, + { + "epoch": 0.8769301050708086, + "grad_norm": 20.625, + "kl": 0.0, + "learning_rate": 3.709640424087968e-07, + "logits/chosen": 731207744.0, + "logits/rejected": 456833760.0, + "logps/chosen": -163.42811584472656, + "logps/rejected": -444.42913818359375, + "loss": 0.1119, + "rewards/chosen": 2.8349366188049316, + "rewards/margins": 12.36287546157837, + "rewards/rejected": -9.527938842773438, + "step": 9598 + }, + { + "epoch": 0.8770214709913202, + "grad_norm": 3.921875, + "kl": 0.0, + "learning_rate": 3.7042075108705665e-07, + "logits/chosen": 503483801.6, + "logits/rejected": 311429248.0, + "logps/chosen": -358.87177734375, + "logps/rejected": -442.1316731770833, + "loss": 0.0313, + "rewards/chosen": 3.6573818206787108, + "rewards/margins": 11.402384567260743, + "rewards/rejected": -7.745002746582031, + "step": 9599 + }, + { + "epoch": 0.8771128369118318, + "grad_norm": 1.765625, + "kl": 0.0, + "learning_rate": 3.698778425886268e-07, + "logits/chosen": 540763136.0, + "logits/rejected": 625874602.6666666, + "logps/chosen": -294.4075927734375, + "logps/rejected": -434.6562906901042, + "loss": 0.0093, + "rewards/chosen": 3.3328490257263184, + "rewards/margins": 12.376195430755615, + "rewards/rejected": -9.043346405029297, + "step": 9600 + }, + { + "epoch": 0.8772042028323436, + "grad_norm": 6.59375, + "kl": 0.0, + "learning_rate": 3.6933531695840073e-07, + "logits/chosen": 645826764.8, + "logits/rejected": 831103146.6666666, + "logps/chosen": -439.887939453125, + "logps/rejected": -749.9557291666666, + "loss": 0.0368, + "rewards/chosen": 3.0839052200317383, + "rewards/margins": 12.564148902893066, + "rewards/rejected": -9.480243682861328, + "step": 9601 + }, + { + "epoch": 0.8772955687528552, + "grad_norm": 5.90625, + "kl": 0.0, + "learning_rate": 3.687931742412382e-07, + "logits/chosen": 448812352.0, + "logits/rejected": 562458752.0, + "logps/chosen": -374.213134765625, + "logps/rejected": -547.8941243489584, + "loss": 0.112, + "rewards/chosen": 3.823324680328369, + "rewards/margins": 11.179110050201416, + "rewards/rejected": -7.355785369873047, + "step": 9602 + }, + { + "epoch": 0.8773869346733668, + "grad_norm": 0.9296875, + "kl": 0.0, + "learning_rate": 3.682514144819727e-07, + "logits/chosen": 355996569.6, + "logits/rejected": 526458752.0, + "logps/chosen": -250.930224609375, + "logps/rejected": -350.3302001953125, + "loss": 0.0068, + "rewards/chosen": 4.671306991577149, + "rewards/margins": 12.603002293904623, + "rewards/rejected": -7.931695302327474, + "step": 9603 + }, + { + "epoch": 0.8774783005938784, + "grad_norm": 29.5, + "kl": 0.0, + "learning_rate": 3.677100377254006e-07, + "logits/chosen": 745504640.0, + "logits/rejected": 713520640.0, + "logps/chosen": -477.583984375, + "logps/rejected": -329.5674641927083, + "loss": 0.0401, + "rewards/chosen": 4.333010673522949, + "rewards/margins": 11.07732105255127, + "rewards/rejected": -6.74431037902832, + "step": 9604 + }, + { + "epoch": 0.8775696665143902, + "grad_norm": 3.65625, + "kl": 0.0, + "learning_rate": 3.671690440162895e-07, + "logits/chosen": 772981440.0, + "logits/rejected": 1300791680.0, + "logps/chosen": -353.98712158203125, + "logps/rejected": -641.6010131835938, + "loss": 0.0136, + "rewards/chosen": 4.332820415496826, + "rewards/margins": 12.810128688812256, + "rewards/rejected": -8.47730827331543, + "step": 9605 + }, + { + "epoch": 0.8776610324349018, + "grad_norm": 2.703125, + "kl": 0.0, + "learning_rate": 3.6662843339937416e-07, + "logits/chosen": 631183616.0, + "logits/rejected": 962997760.0, + "logps/chosen": -317.145654296875, + "logps/rejected": -617.5457356770834, + "loss": 0.0188, + "rewards/chosen": 3.710858154296875, + "rewards/margins": 11.943517812093098, + "rewards/rejected": -8.232659657796225, + "step": 9606 + }, + { + "epoch": 0.8777523983554134, + "grad_norm": 2.578125, + "kl": 0.0, + "learning_rate": 3.6608820591935767e-07, + "logits/chosen": 513011968.0, + "logits/rejected": 484045472.0, + "logps/chosen": -333.7507019042969, + "logps/rejected": -413.0955810546875, + "loss": 0.0168, + "rewards/chosen": 3.4368720054626465, + "rewards/margins": 11.589906215667725, + "rewards/rejected": -8.153034210205078, + "step": 9607 + }, + { + "epoch": 0.877843764275925, + "grad_norm": 1.8203125, + "kl": 0.0, + "learning_rate": 3.6554836162091266e-07, + "logits/chosen": 390505472.0, + "logits/rejected": 692751616.0, + "logps/chosen": -302.94683837890625, + "logps/rejected": -549.091796875, + "loss": 0.0149, + "rewards/chosen": 4.011617183685303, + "rewards/margins": 13.392787456512451, + "rewards/rejected": -9.381170272827148, + "step": 9608 + }, + { + "epoch": 0.8779351301964368, + "grad_norm": 2.46875, + "kl": 0.0, + "learning_rate": 3.6500890054867956e-07, + "logits/chosen": 497118976.0, + "logits/rejected": 505183897.6, + "logps/chosen": -289.8085530598958, + "logps/rejected": -551.1771484375, + "loss": 0.0273, + "rewards/chosen": 2.8038276036580405, + "rewards/margins": 11.82915414174398, + "rewards/rejected": -9.025326538085938, + "step": 9609 + }, + { + "epoch": 0.8780264961169484, + "grad_norm": 3.359375, + "kl": 0.0, + "learning_rate": 3.6446982274726594e-07, + "logits/chosen": 402593504.0, + "logits/rejected": 370878912.0, + "logps/chosen": -342.43463134765625, + "logps/rejected": -195.77078247070312, + "loss": 0.0173, + "rewards/chosen": 3.993813991546631, + "rewards/margins": 10.327272891998291, + "rewards/rejected": -6.33345890045166, + "step": 9610 + }, + { + "epoch": 0.87811786203746, + "grad_norm": 1.6171875, + "kl": 0.0, + "learning_rate": 3.6393112826124843e-07, + "logits/chosen": 349891264.0, + "logits/rejected": 460706918.4, + "logps/chosen": -242.06494140625, + "logps/rejected": -670.915478515625, + "loss": 0.0089, + "rewards/chosen": 4.19752820332845, + "rewards/margins": 12.496621831258139, + "rewards/rejected": -8.299093627929688, + "step": 9611 + }, + { + "epoch": 0.8782092279579716, + "grad_norm": 2.546875, + "kl": 0.0, + "learning_rate": 3.6339281713517304e-07, + "logits/chosen": 704808652.8, + "logits/rejected": 526884522.6666667, + "logps/chosen": -226.3262939453125, + "logps/rejected": -649.2228190104166, + "loss": 0.0173, + "rewards/chosen": 4.0028430938720705, + "rewards/margins": 14.502863438924155, + "rewards/rejected": -10.500020345052084, + "step": 9612 + }, + { + "epoch": 0.8783005938784834, + "grad_norm": 4.8125, + "kl": 0.0, + "learning_rate": 3.628548894135525e-07, + "logits/chosen": 519885670.4, + "logits/rejected": 580786261.3333334, + "logps/chosen": -415.9119140625, + "logps/rejected": -275.72747802734375, + "loss": 0.0327, + "rewards/chosen": 3.1558189392089844, + "rewards/margins": 9.393889109293621, + "rewards/rejected": -6.238070170084636, + "step": 9613 + }, + { + "epoch": 0.878391959798995, + "grad_norm": 21.75, + "kl": 0.0, + "learning_rate": 3.623173451408685e-07, + "logits/chosen": 490398848.0, + "logits/rejected": 697663872.0, + "logps/chosen": -240.26676432291666, + "logps/rejected": -655.9917602539062, + "loss": 0.1218, + "rewards/chosen": 3.5713650385538735, + "rewards/margins": 14.31700579325358, + "rewards/rejected": -10.745640754699707, + "step": 9614 + }, + { + "epoch": 0.8784833257195066, + "grad_norm": 18.0, + "kl": 0.0, + "learning_rate": 3.617801843615709e-07, + "logits/chosen": 648423253.3333334, + "logits/rejected": 519269734.4, + "logps/chosen": -406.8468424479167, + "logps/rejected": -590.9095703125, + "loss": 0.0611, + "rewards/chosen": 3.5022691090901694, + "rewards/margins": 14.020404179890951, + "rewards/rejected": -10.518135070800781, + "step": 9615 + }, + { + "epoch": 0.8785746916400182, + "grad_norm": 1.3125, + "kl": 0.0, + "learning_rate": 3.612434071200771e-07, + "logits/chosen": 604409728.0, + "logits/rejected": 669817173.3333334, + "logps/chosen": -101.12154388427734, + "logps/rejected": -485.38623046875, + "loss": 0.0081, + "rewards/chosen": 3.4352753162384033, + "rewards/margins": 12.96592895189921, + "rewards/rejected": -9.530653635660807, + "step": 9616 + }, + { + "epoch": 0.87866605756053, + "grad_norm": 2.984375, + "kl": 0.0, + "learning_rate": 3.60707013460776e-07, + "logits/chosen": 543304704.0, + "logits/rejected": 423077568.0, + "logps/chosen": -279.1001892089844, + "logps/rejected": -548.2238159179688, + "loss": 0.0252, + "rewards/chosen": 3.1607799530029297, + "rewards/margins": 14.081016540527344, + "rewards/rejected": -10.920236587524414, + "step": 9617 + }, + { + "epoch": 0.8787574234810416, + "grad_norm": 4.84375, + "kl": 0.0, + "learning_rate": 3.6017100342802037e-07, + "logits/chosen": 741580970.6666666, + "logits/rejected": 728352972.8, + "logps/chosen": -518.9473470052084, + "logps/rejected": -318.401611328125, + "loss": 0.0256, + "rewards/chosen": 2.6741822560628257, + "rewards/margins": 10.86219342549642, + "rewards/rejected": -8.188011169433594, + "step": 9618 + }, + { + "epoch": 0.8788487894015532, + "grad_norm": 2.3125, + "kl": 0.0, + "learning_rate": 3.596353770661337e-07, + "logits/chosen": 967142092.8, + "logits/rejected": 707104981.3333334, + "logps/chosen": -204.43564453125, + "logps/rejected": -516.9615885416666, + "loss": 0.0277, + "rewards/chosen": 3.6624305725097654, + "rewards/margins": 14.609803009033204, + "rewards/rejected": -10.947372436523438, + "step": 9619 + }, + { + "epoch": 0.8789401553220648, + "grad_norm": 1.3125, + "kl": 0.0, + "learning_rate": 3.591001344194067e-07, + "logits/chosen": 431735744.0, + "logits/rejected": 418625066.6666667, + "logps/chosen": -358.25836181640625, + "logps/rejected": -413.1365559895833, + "loss": 0.0052, + "rewards/chosen": 4.134844779968262, + "rewards/margins": 13.363390922546387, + "rewards/rejected": -9.228546142578125, + "step": 9620 + }, + { + "epoch": 0.8790315212425766, + "grad_norm": 1.6640625, + "kl": 0.0, + "learning_rate": 3.585652755321012e-07, + "logits/chosen": 911523840.0, + "logits/rejected": 552350566.4, + "logps/chosen": -364.3184407552083, + "logps/rejected": -504.2599609375, + "loss": 0.0079, + "rewards/chosen": 3.9143616358439126, + "rewards/margins": 13.87879721323649, + "rewards/rejected": -9.964435577392578, + "step": 9621 + }, + { + "epoch": 0.8791228871630882, + "grad_norm": 8.75, + "kl": 0.0, + "learning_rate": 3.5803080044844283e-07, + "logits/chosen": 579793728.0, + "logits/rejected": 1069808384.0, + "logps/chosen": -622.4617919921875, + "logps/rejected": -611.4779663085938, + "loss": 0.0255, + "rewards/chosen": 4.243920803070068, + "rewards/margins": 13.870313167572021, + "rewards/rejected": -9.626392364501953, + "step": 9622 + }, + { + "epoch": 0.8792142530835998, + "grad_norm": 1.4609375, + "kl": 0.0, + "learning_rate": 3.5749670921262744e-07, + "logits/chosen": 356218720.0, + "logits/rejected": 541829376.0, + "logps/chosen": -751.6292114257812, + "logps/rejected": -580.2200927734375, + "loss": 0.0039, + "rewards/chosen": 4.278578281402588, + "rewards/margins": 13.364477634429932, + "rewards/rejected": -9.085899353027344, + "step": 9623 + }, + { + "epoch": 0.8793056190041114, + "grad_norm": 1.125, + "kl": 0.0, + "learning_rate": 3.5696300186882194e-07, + "logits/chosen": 408088405.3333333, + "logits/rejected": 351101286.4, + "logps/chosen": -205.4364217122396, + "logps/rejected": -575.02783203125, + "loss": 0.0056, + "rewards/chosen": 4.860241889953613, + "rewards/margins": 16.00786609649658, + "rewards/rejected": -11.147624206542968, + "step": 9624 + }, + { + "epoch": 0.8793969849246231, + "grad_norm": 3.140625, + "kl": 0.0, + "learning_rate": 3.5642967846115705e-07, + "logits/chosen": 674479189.3333334, + "logits/rejected": 339126592.0, + "logps/chosen": -342.2591552734375, + "logps/rejected": -667.7598266601562, + "loss": 0.0225, + "rewards/chosen": 3.848184585571289, + "rewards/margins": 18.83258819580078, + "rewards/rejected": -14.984403610229492, + "step": 9625 + }, + { + "epoch": 0.8794883508451348, + "grad_norm": 68.5, + "kl": 0.0, + "learning_rate": 3.5589673903373365e-07, + "logits/chosen": 485699379.2, + "logits/rejected": 1031880618.6666666, + "logps/chosen": -221.09755859375, + "logps/rejected": -666.6050618489584, + "loss": 0.0449, + "rewards/chosen": 3.4624073028564455, + "rewards/margins": 14.33388760884603, + "rewards/rejected": -10.871480305989584, + "step": 9626 + }, + { + "epoch": 0.8795797167656464, + "grad_norm": 0.578125, + "kl": 0.0, + "learning_rate": 3.5536418363062207e-07, + "logits/chosen": 516294912.0, + "logits/rejected": 1151038122.6666667, + "logps/chosen": -336.84930419921875, + "logps/rejected": -745.919189453125, + "loss": 0.0045, + "rewards/chosen": 4.22556209564209, + "rewards/margins": 13.95046583811442, + "rewards/rejected": -9.72490374247233, + "step": 9627 + }, + { + "epoch": 0.879671082686158, + "grad_norm": 1.40625, + "kl": 0.0, + "learning_rate": 3.548320122958582e-07, + "logits/chosen": 312625365.3333333, + "logits/rejected": 414859264.0, + "logps/chosen": -223.03767903645834, + "logps/rejected": -538.303759765625, + "loss": 0.007, + "rewards/chosen": 4.196123123168945, + "rewards/margins": 13.923888778686523, + "rewards/rejected": -9.727765655517578, + "step": 9628 + }, + { + "epoch": 0.8797624486066697, + "grad_norm": 28.25, + "kl": 2.7054882049560547, + "learning_rate": 3.54300225073449e-07, + "logits/chosen": 384778837.3333333, + "logits/rejected": 550913664.0, + "logps/chosen": -224.3651123046875, + "logps/rejected": -204.6971435546875, + "loss": 0.1448, + "rewards/chosen": 4.2811228434244795, + "rewards/margins": 8.43369213740031, + "rewards/rejected": -4.15256929397583, + "step": 9629 + }, + { + "epoch": 0.8798538145271814, + "grad_norm": 3.390625, + "kl": 0.0, + "learning_rate": 3.5376882200736764e-07, + "logits/chosen": 325307520.0, + "logits/rejected": 458030890.6666667, + "logps/chosen": -256.53837890625, + "logps/rejected": -729.72216796875, + "loss": 0.0126, + "rewards/chosen": 4.366529083251953, + "rewards/margins": 16.344736989339193, + "rewards/rejected": -11.97820790608724, + "step": 9630 + }, + { + "epoch": 0.879945180447693, + "grad_norm": 7.53125, + "kl": 0.0, + "learning_rate": 3.5323780314155676e-07, + "logits/chosen": 410785689.6, + "logits/rejected": 449792085.3333333, + "logps/chosen": -309.50185546875, + "logps/rejected": -129.51132202148438, + "loss": 0.1161, + "rewards/chosen": 4.497852325439453, + "rewards/margins": 7.602388381958008, + "rewards/rejected": -3.1045360565185547, + "step": 9631 + }, + { + "epoch": 0.8800365463682046, + "grad_norm": 2.078125, + "kl": 0.0, + "learning_rate": 3.5270716851992517e-07, + "logits/chosen": 638803328.0, + "logits/rejected": 1095894323.2, + "logps/chosen": -354.5724283854167, + "logps/rejected": -374.7339599609375, + "loss": 0.0102, + "rewards/chosen": 4.009610493977864, + "rewards/margins": 13.890372975667319, + "rewards/rejected": -9.880762481689453, + "step": 9632 + }, + { + "epoch": 0.8801279122887163, + "grad_norm": 2.125, + "kl": 0.0, + "learning_rate": 3.521769181863538e-07, + "logits/chosen": 529944960.0, + "logits/rejected": 799186752.0, + "logps/chosen": -327.4563903808594, + "logps/rejected": -352.91302490234375, + "loss": 0.01, + "rewards/chosen": 4.43359375, + "rewards/margins": 13.213930130004883, + "rewards/rejected": -8.780336380004883, + "step": 9633 + }, + { + "epoch": 0.880219278209228, + "grad_norm": 6.46875, + "kl": 0.0, + "learning_rate": 3.516470521846882e-07, + "logits/chosen": 517561958.4, + "logits/rejected": 228292778.66666666, + "logps/chosen": -368.115283203125, + "logps/rejected": -277.21877034505206, + "loss": 0.0798, + "rewards/chosen": 3.997509765625, + "rewards/margins": 10.995037206013997, + "rewards/rejected": -6.997527440388997, + "step": 9634 + }, + { + "epoch": 0.8803106441297396, + "grad_norm": 2.84375, + "kl": 0.0, + "learning_rate": 3.511175705587433e-07, + "logits/chosen": 484808992.0, + "logits/rejected": 682283840.0, + "logps/chosen": -394.7447509765625, + "logps/rejected": -350.35723876953125, + "loss": 0.0159, + "rewards/chosen": 3.523277759552002, + "rewards/margins": 11.448570728302002, + "rewards/rejected": -7.92529296875, + "step": 9635 + }, + { + "epoch": 0.8804020100502512, + "grad_norm": 6.6875, + "kl": 0.0, + "learning_rate": 3.505884733523024e-07, + "logits/chosen": 921231488.0, + "logits/rejected": 754473984.0, + "logps/chosen": -394.37579345703125, + "logps/rejected": -452.42462158203125, + "loss": 0.0411, + "rewards/chosen": 2.6641311645507812, + "rewards/margins": 11.543112754821777, + "rewards/rejected": -8.878981590270996, + "step": 9636 + }, + { + "epoch": 0.8804933759707629, + "grad_norm": 2.34375, + "kl": 0.0, + "learning_rate": 3.500597606091166e-07, + "logits/chosen": 252662624.0, + "logits/rejected": 434814240.0, + "logps/chosen": -389.8135986328125, + "logps/rejected": -548.0348510742188, + "loss": 0.0223, + "rewards/chosen": 3.6249241828918457, + "rewards/margins": 12.463504314422607, + "rewards/rejected": -8.838580131530762, + "step": 9637 + }, + { + "epoch": 0.8805847418912746, + "grad_norm": 1.2578125, + "kl": 0.0, + "learning_rate": 3.4953143237290656e-07, + "logits/chosen": 441557162.6666667, + "logits/rejected": 605875251.2, + "logps/chosen": -237.09549967447916, + "logps/rejected": -401.353955078125, + "loss": 0.0088, + "rewards/chosen": 4.149862925211589, + "rewards/margins": 11.796039072672526, + "rewards/rejected": -7.646176147460937, + "step": 9638 + }, + { + "epoch": 0.8806761078117862, + "grad_norm": 1.9609375, + "kl": 0.0, + "learning_rate": 3.4900348868735947e-07, + "logits/chosen": 288780202.6666667, + "logits/rejected": 485680435.2, + "logps/chosen": -226.6103719075521, + "logps/rejected": -564.02939453125, + "loss": 0.0107, + "rewards/chosen": 4.048422495524089, + "rewards/margins": 12.72311045328776, + "rewards/rejected": -8.674687957763672, + "step": 9639 + }, + { + "epoch": 0.8807674737322978, + "grad_norm": 5.71875, + "kl": 0.0, + "learning_rate": 3.4847592959613095e-07, + "logits/chosen": 606516309.3333334, + "logits/rejected": 211798816.0, + "logps/chosen": -361.973388671875, + "logps/rejected": -327.4593505859375, + "loss": 0.0554, + "rewards/chosen": 2.762997627258301, + "rewards/margins": 13.082621574401855, + "rewards/rejected": -10.319623947143555, + "step": 9640 + }, + { + "epoch": 0.8808588396528095, + "grad_norm": 0.486328125, + "kl": 0.0, + "learning_rate": 3.4794875514284567e-07, + "logits/chosen": 361469120.0, + "logits/rejected": 813416533.3333334, + "logps/chosen": -237.25189208984375, + "logps/rejected": -499.026123046875, + "loss": 0.0033, + "rewards/chosen": 4.623048782348633, + "rewards/margins": 13.554864883422852, + "rewards/rejected": -8.931816101074219, + "step": 9641 + }, + { + "epoch": 0.8809502055733212, + "grad_norm": 1.8203125, + "kl": 0.0, + "learning_rate": 3.474219653710964e-07, + "logits/chosen": 454282956.8, + "logits/rejected": 403386112.0, + "logps/chosen": -234.763037109375, + "logps/rejected": -419.8506673177083, + "loss": 0.0165, + "rewards/chosen": 3.8721519470214845, + "rewards/margins": 12.441448465983072, + "rewards/rejected": -8.569296518961588, + "step": 9642 + }, + { + "epoch": 0.8810415714938328, + "grad_norm": 1.6640625, + "kl": 0.0, + "learning_rate": 3.4689556032444396e-07, + "logits/chosen": 381303193.6, + "logits/rejected": 512071338.6666667, + "logps/chosen": -269.0308349609375, + "logps/rejected": -462.066650390625, + "loss": 0.133, + "rewards/chosen": 2.5644554138183593, + "rewards/margins": 13.40643793741862, + "rewards/rejected": -10.84198252360026, + "step": 9643 + }, + { + "epoch": 0.8811329374143444, + "grad_norm": 0.56640625, + "kl": 0.0, + "learning_rate": 3.463695400464162e-07, + "logits/chosen": 483972416.0, + "logits/rejected": 507047082.6666667, + "logps/chosen": -299.1223449707031, + "logps/rejected": -542.802490234375, + "loss": 0.003, + "rewards/chosen": 4.759848594665527, + "rewards/margins": 14.680214881896973, + "rewards/rejected": -9.920366287231445, + "step": 9644 + }, + { + "epoch": 0.8812243033348561, + "grad_norm": 2.984375, + "kl": 0.0, + "learning_rate": 3.458439045805101e-07, + "logits/chosen": 772137386.6666666, + "logits/rejected": 418851200.0, + "logps/chosen": -324.06687418619794, + "logps/rejected": -371.8204345703125, + "loss": 0.0258, + "rewards/chosen": 3.5747693379720054, + "rewards/margins": 12.17798646291097, + "rewards/rejected": -8.603217124938965, + "step": 9645 + }, + { + "epoch": 0.8813156692553678, + "grad_norm": 5.4375, + "kl": 0.0, + "learning_rate": 3.453186539701925e-07, + "logits/chosen": 443737280.0, + "logits/rejected": 607070208.0, + "logps/chosen": -334.0543212890625, + "logps/rejected": -381.51324462890625, + "loss": 0.0207, + "rewards/chosen": 4.046050071716309, + "rewards/margins": 12.02378225326538, + "rewards/rejected": -7.977732181549072, + "step": 9646 + }, + { + "epoch": 0.8814070351758794, + "grad_norm": 4.15625, + "kl": 0.0, + "learning_rate": 3.4479378825889534e-07, + "logits/chosen": 316874325.3333333, + "logits/rejected": 460649216.0, + "logps/chosen": -231.72257486979166, + "logps/rejected": -583.9122314453125, + "loss": 0.0356, + "rewards/chosen": 4.0917558670043945, + "rewards/margins": 13.628335952758789, + "rewards/rejected": -9.536580085754395, + "step": 9647 + }, + { + "epoch": 0.881498401096391, + "grad_norm": 3.15625, + "kl": 0.0, + "learning_rate": 3.442693074900211e-07, + "logits/chosen": 864167204.5714285, + "logits/rejected": 817995584.0, + "logps/chosen": -309.27064732142856, + "logps/rejected": -269.8188171386719, + "loss": 0.0246, + "rewards/chosen": 4.269759314400809, + "rewards/margins": 10.872887270791189, + "rewards/rejected": -6.603127956390381, + "step": 9648 + }, + { + "epoch": 0.8815897670169027, + "grad_norm": 2.15625, + "kl": 0.0, + "learning_rate": 3.4374521170693733e-07, + "logits/chosen": 471996352.0, + "logits/rejected": 571544874.6666666, + "logps/chosen": -365.9388122558594, + "logps/rejected": -686.9813639322916, + "loss": 0.0122, + "rewards/chosen": 3.1558423042297363, + "rewards/margins": 12.343706607818604, + "rewards/rejected": -9.187864303588867, + "step": 9649 + }, + { + "epoch": 0.8816811329374143, + "grad_norm": 2.65625, + "kl": 0.0, + "learning_rate": 3.4322150095298434e-07, + "logits/chosen": 493470208.0, + "logits/rejected": 211596384.0, + "logps/chosen": -317.7165222167969, + "logps/rejected": -320.58331298828125, + "loss": 0.0188, + "rewards/chosen": 3.7336416244506836, + "rewards/margins": 13.621638298034668, + "rewards/rejected": -9.887996673583984, + "step": 9650 + }, + { + "epoch": 0.881772498857926, + "grad_norm": 4.3125, + "kl": 0.0, + "learning_rate": 3.42698175271467e-07, + "logits/chosen": 600836096.0, + "logits/rejected": 573720064.0, + "logps/chosen": -296.76979573567706, + "logps/rejected": -609.5185546875, + "loss": 0.0454, + "rewards/chosen": 3.3026355107625327, + "rewards/margins": 13.890956242879232, + "rewards/rejected": -10.5883207321167, + "step": 9651 + }, + { + "epoch": 0.8818638647784376, + "grad_norm": 3.828125, + "kl": 0.0, + "learning_rate": 3.4217523470566006e-07, + "logits/chosen": 477739434.6666667, + "logits/rejected": 609843904.0, + "logps/chosen": -331.2894694010417, + "logps/rejected": -464.9576721191406, + "loss": 0.0226, + "rewards/chosen": 4.108071009318034, + "rewards/margins": 12.556040445963543, + "rewards/rejected": -8.447969436645508, + "step": 9652 + }, + { + "epoch": 0.8819552306989493, + "grad_norm": 2.203125, + "kl": 0.0, + "learning_rate": 3.4165267929880397e-07, + "logits/chosen": 572825216.0, + "logits/rejected": 632797397.3333334, + "logps/chosen": -401.0353698730469, + "logps/rejected": -477.9488118489583, + "loss": 0.009, + "rewards/chosen": 3.782365322113037, + "rewards/margins": 12.468372503916422, + "rewards/rejected": -8.686007181803385, + "step": 9653 + }, + { + "epoch": 0.8820465966194609, + "grad_norm": 3.296875, + "kl": 0.0, + "learning_rate": 3.411305090941125e-07, + "logits/chosen": 613236480.0, + "logits/rejected": 357434784.0, + "logps/chosen": -372.19097900390625, + "logps/rejected": -349.0626220703125, + "loss": 0.0206, + "rewards/chosen": 3.369868755340576, + "rewards/margins": 12.299037456512451, + "rewards/rejected": -8.929168701171875, + "step": 9654 + }, + { + "epoch": 0.8821379625399726, + "grad_norm": 4.59375, + "kl": 0.0, + "learning_rate": 3.406087241347616e-07, + "logits/chosen": 491358784.0, + "logits/rejected": 402246816.0, + "logps/chosen": -235.63970947265625, + "logps/rejected": -301.53704833984375, + "loss": 0.0222, + "rewards/chosen": 3.7739553451538086, + "rewards/margins": 12.221840858459473, + "rewards/rejected": -8.447885513305664, + "step": 9655 + }, + { + "epoch": 0.8822293284604842, + "grad_norm": 2.8125, + "kl": 0.0, + "learning_rate": 3.4008732446389967e-07, + "logits/chosen": 481588256.0, + "logits/rejected": 350917344.0, + "logps/chosen": -215.76129150390625, + "logps/rejected": -341.64727783203125, + "loss": 0.0191, + "rewards/chosen": 3.431021213531494, + "rewards/margins": 12.196627140045166, + "rewards/rejected": -8.765605926513672, + "step": 9656 + }, + { + "epoch": 0.8823206943809959, + "grad_norm": 3.296875, + "kl": 0.0, + "learning_rate": 3.3956631012464103e-07, + "logits/chosen": 588747221.3333334, + "logits/rejected": 465929113.6, + "logps/chosen": -490.8629150390625, + "logps/rejected": -631.467236328125, + "loss": 0.0143, + "rewards/chosen": 3.709853490193685, + "rewards/margins": 13.34608472188314, + "rewards/rejected": -9.636231231689454, + "step": 9657 + }, + { + "epoch": 0.8824120603015075, + "grad_norm": 3.28125, + "kl": 0.0, + "learning_rate": 3.390456811600673e-07, + "logits/chosen": 398089024.0, + "logits/rejected": 419228544.0, + "logps/chosen": -282.3255920410156, + "logps/rejected": -611.5389811197916, + "loss": 0.1205, + "rewards/chosen": 4.363615989685059, + "rewards/margins": 13.989812533060709, + "rewards/rejected": -9.62619654337565, + "step": 9658 + }, + { + "epoch": 0.8825034262220192, + "grad_norm": 1.953125, + "kl": 0.0, + "learning_rate": 3.3852543761323186e-07, + "logits/chosen": 1126575308.8, + "logits/rejected": 751161088.0, + "logps/chosen": -421.568701171875, + "logps/rejected": -1089.3583984375, + "loss": 0.0136, + "rewards/chosen": 4.452561569213867, + "rewards/margins": 16.167451095581054, + "rewards/rejected": -11.714889526367188, + "step": 9659 + }, + { + "epoch": 0.8825947921425308, + "grad_norm": 1.8203125, + "kl": 0.0, + "learning_rate": 3.3800557952715365e-07, + "logits/chosen": 557788979.2, + "logits/rejected": 916005120.0, + "logps/chosen": -300.630419921875, + "logps/rejected": -438.4167887369792, + "loss": 0.0093, + "rewards/chosen": 4.672748565673828, + "rewards/margins": 14.048899332682291, + "rewards/rejected": -9.376150767008463, + "step": 9660 + }, + { + "epoch": 0.8826861580630425, + "grad_norm": 3.703125, + "kl": 0.0, + "learning_rate": 3.374861069448199e-07, + "logits/chosen": 589403093.3333334, + "logits/rejected": 586586931.2, + "logps/chosen": -516.5558268229166, + "logps/rejected": -671.06708984375, + "loss": 0.0183, + "rewards/chosen": 3.036543846130371, + "rewards/margins": 11.71711025238037, + "rewards/rejected": -8.68056640625, + "step": 9661 + }, + { + "epoch": 0.8827775239835541, + "grad_norm": 1.515625, + "kl": 0.0, + "learning_rate": 3.369670199091846e-07, + "logits/chosen": 1008179200.0, + "logits/rejected": 453850112.0, + "logps/chosen": -449.0390625, + "logps/rejected": -546.328857421875, + "loss": 0.009, + "rewards/chosen": 4.400662994384765, + "rewards/margins": 15.008465067545572, + "rewards/rejected": -10.607802073160807, + "step": 9662 + }, + { + "epoch": 0.8828688899040658, + "grad_norm": 2.609375, + "kl": 0.0, + "learning_rate": 3.364483184631734e-07, + "logits/chosen": 516078131.2, + "logits/rejected": 608280618.6666666, + "logps/chosen": -411.5783203125, + "logps/rejected": -466.0784505208333, + "loss": 0.0206, + "rewards/chosen": 3.606404113769531, + "rewards/margins": 11.968688456217446, + "rewards/rejected": -8.362284342447916, + "step": 9663 + }, + { + "epoch": 0.8829602558245774, + "grad_norm": 5.8125, + "kl": 0.0, + "learning_rate": 3.359300026496781e-07, + "logits/chosen": 480064000.0, + "logits/rejected": 317905280.0, + "logps/chosen": -282.8268636067708, + "logps/rejected": -397.6341552734375, + "loss": 0.0311, + "rewards/chosen": 3.741556167602539, + "rewards/margins": 13.902109146118164, + "rewards/rejected": -10.160552978515625, + "step": 9664 + }, + { + "epoch": 0.8830516217450891, + "grad_norm": 1.5, + "kl": 0.0, + "learning_rate": 3.3541207251155783e-07, + "logits/chosen": 1298796373.3333333, + "logits/rejected": 860451020.8, + "logps/chosen": -538.0392252604166, + "logps/rejected": -671.22509765625, + "loss": 0.0059, + "rewards/chosen": 4.213777542114258, + "rewards/margins": 12.955617141723632, + "rewards/rejected": -8.741839599609374, + "step": 9665 + }, + { + "epoch": 0.8831429876656007, + "grad_norm": 6.96875, + "kl": 0.0, + "learning_rate": 3.3489452809164046e-07, + "logits/chosen": 450423456.0, + "logits/rejected": 537484160.0, + "logps/chosen": -334.9498291015625, + "logps/rejected": -495.0227355957031, + "loss": 0.0267, + "rewards/chosen": 3.065854787826538, + "rewards/margins": 13.44712233543396, + "rewards/rejected": -10.381267547607422, + "step": 9666 + }, + { + "epoch": 0.8832343535861124, + "grad_norm": 1.671875, + "kl": 0.0, + "learning_rate": 3.343773694327218e-07, + "logits/chosen": 1399584768.0, + "logits/rejected": 768236544.0, + "logps/chosen": -568.5608317057291, + "logps/rejected": -545.21171875, + "loss": 0.0076, + "rewards/chosen": 4.073046048482259, + "rewards/margins": 13.482945950826007, + "rewards/rejected": -9.40989990234375, + "step": 9667 + }, + { + "epoch": 0.883325719506624, + "grad_norm": 2.625, + "kl": 0.0, + "learning_rate": 3.338605965775682e-07, + "logits/chosen": 994768640.0, + "logits/rejected": 632825344.0, + "logps/chosen": -222.14752197265625, + "logps/rejected": -480.5875244140625, + "loss": 0.0173, + "rewards/chosen": 3.7290780544281006, + "rewards/margins": 12.481351613998413, + "rewards/rejected": -8.752273559570312, + "step": 9668 + }, + { + "epoch": 0.8834170854271357, + "grad_norm": 1.265625, + "kl": 0.0, + "learning_rate": 3.333442095689099e-07, + "logits/chosen": 548903893.3333334, + "logits/rejected": 608561766.4, + "logps/chosen": -325.7397867838542, + "logps/rejected": -446.60400390625, + "loss": 0.0084, + "rewards/chosen": 3.895827293395996, + "rewards/margins": 13.472591209411622, + "rewards/rejected": -9.576763916015626, + "step": 9669 + }, + { + "epoch": 0.8835084513476473, + "grad_norm": 0.8359375, + "kl": 0.0, + "learning_rate": 3.3282820844944776e-07, + "logits/chosen": 757279573.3333334, + "logits/rejected": 782706534.4, + "logps/chosen": -190.958984375, + "logps/rejected": -586.09794921875, + "loss": 0.0133, + "rewards/chosen": 3.9231739044189453, + "rewards/margins": 14.08769416809082, + "rewards/rejected": -10.164520263671875, + "step": 9670 + }, + { + "epoch": 0.883599817268159, + "grad_norm": 3.03125, + "kl": 0.0, + "learning_rate": 3.3231259326184983e-07, + "logits/chosen": 368627675.4285714, + "logits/rejected": 285848992.0, + "logps/chosen": -301.78104073660717, + "logps/rejected": -324.1570739746094, + "loss": 0.0201, + "rewards/chosen": 4.515852519444057, + "rewards/margins": 13.90088517325265, + "rewards/rejected": -9.385032653808594, + "step": 9671 + }, + { + "epoch": 0.8836911831886706, + "grad_norm": 2.90625, + "kl": 0.0, + "learning_rate": 3.317973640487543e-07, + "logits/chosen": 626669120.0, + "logits/rejected": 721065472.0, + "logps/chosen": -316.80291748046875, + "logps/rejected": -568.3131713867188, + "loss": 0.0199, + "rewards/chosen": 3.438539981842041, + "rewards/margins": 11.394599437713623, + "rewards/rejected": -7.956059455871582, + "step": 9672 + }, + { + "epoch": 0.8837825491091823, + "grad_norm": 2.84375, + "kl": 0.0, + "learning_rate": 3.312825208527648e-07, + "logits/chosen": 661582293.3333334, + "logits/rejected": 325794432.0, + "logps/chosen": -415.959228515625, + "logps/rejected": -400.73614501953125, + "loss": 0.0157, + "rewards/chosen": 4.383185704549153, + "rewards/margins": 14.35625394185384, + "rewards/rejected": -9.973068237304688, + "step": 9673 + }, + { + "epoch": 0.8838739150296939, + "grad_norm": 44.5, + "kl": 0.0, + "learning_rate": 3.307680637164545e-07, + "logits/chosen": 286237926.4, + "logits/rejected": 335764565.3333333, + "logps/chosen": -178.7839111328125, + "logps/rejected": -462.7516682942708, + "loss": 0.1409, + "rewards/chosen": 2.3135711669921877, + "rewards/margins": 11.643307240804038, + "rewards/rejected": -9.32973607381185, + "step": 9674 + }, + { + "epoch": 0.8839652809502055, + "grad_norm": 4.75, + "kl": 0.0, + "learning_rate": 3.3025399268236393e-07, + "logits/chosen": 504512938.6666667, + "logits/rejected": 573746688.0, + "logps/chosen": -408.6663411458333, + "logps/rejected": -745.71435546875, + "loss": 0.025, + "rewards/chosen": 3.978510538736979, + "rewards/margins": 23.736584345499676, + "rewards/rejected": -19.758073806762695, + "step": 9675 + }, + { + "epoch": 0.8840566468707173, + "grad_norm": 2.0, + "kl": 0.0, + "learning_rate": 3.297403077930017e-07, + "logits/chosen": 629672021.3333334, + "logits/rejected": 581027584.0, + "logps/chosen": -400.9582112630208, + "logps/rejected": -549.548583984375, + "loss": 0.0106, + "rewards/chosen": 3.689589500427246, + "rewards/margins": 14.855118370056152, + "rewards/rejected": -11.165528869628906, + "step": 9676 + }, + { + "epoch": 0.8841480127912289, + "grad_norm": 2.234375, + "kl": 0.0, + "learning_rate": 3.29227009090845e-07, + "logits/chosen": 407750485.3333333, + "logits/rejected": 537448704.0, + "logps/chosen": -199.44610595703125, + "logps/rejected": -361.3895263671875, + "loss": 0.0123, + "rewards/chosen": 4.25888188680013, + "rewards/margins": 12.498184712727863, + "rewards/rejected": -8.239302825927734, + "step": 9677 + }, + { + "epoch": 0.8842393787117405, + "grad_norm": 5.625, + "kl": 0.0, + "learning_rate": 3.287140966183389e-07, + "logits/chosen": 334231372.8, + "logits/rejected": 451276117.3333333, + "logps/chosen": -224.96044921875, + "logps/rejected": -508.7538655598958, + "loss": 0.0392, + "rewards/chosen": 3.381536865234375, + "rewards/margins": 12.560747528076172, + "rewards/rejected": -9.179210662841797, + "step": 9678 + }, + { + "epoch": 0.8843307446322521, + "grad_norm": 12.125, + "kl": 0.0, + "learning_rate": 3.282015704178965e-07, + "logits/chosen": 961865113.6, + "logits/rejected": 885681920.0, + "logps/chosen": -434.06689453125, + "logps/rejected": -384.5284016927083, + "loss": 0.0868, + "rewards/chosen": 4.110355377197266, + "rewards/margins": 10.622841517130535, + "rewards/rejected": -6.5124861399332685, + "step": 9679 + }, + { + "epoch": 0.8844221105527639, + "grad_norm": 2.25, + "kl": 0.0, + "learning_rate": 3.2768943053189893e-07, + "logits/chosen": 770018560.0, + "logits/rejected": 902399658.6666666, + "logps/chosen": -327.38470458984375, + "logps/rejected": -384.4583333333333, + "loss": 0.0126, + "rewards/chosen": 3.128997802734375, + "rewards/margins": 11.582351684570312, + "rewards/rejected": -8.453353881835938, + "step": 9680 + }, + { + "epoch": 0.8845134764732755, + "grad_norm": 1.4296875, + "kl": 0.0, + "learning_rate": 3.271776770026963e-07, + "logits/chosen": 393691520.0, + "logits/rejected": 260243840.0, + "logps/chosen": -291.086083984375, + "logps/rejected": -451.1820882161458, + "loss": 0.0088, + "rewards/chosen": 4.43294677734375, + "rewards/margins": 14.099656677246093, + "rewards/rejected": -9.666709899902344, + "step": 9681 + }, + { + "epoch": 0.8846048423937871, + "grad_norm": 37.5, + "kl": 0.0, + "learning_rate": 3.266663098726047e-07, + "logits/chosen": 536170400.0, + "logits/rejected": 329751040.0, + "logps/chosen": -392.77142333984375, + "logps/rejected": -375.5873718261719, + "loss": 0.1002, + "rewards/chosen": 3.016561508178711, + "rewards/margins": 14.399200439453125, + "rewards/rejected": -11.382638931274414, + "step": 9682 + }, + { + "epoch": 0.8846962083142987, + "grad_norm": 3.59375, + "kl": 0.0, + "learning_rate": 3.261553291839092e-07, + "logits/chosen": 681498026.6666666, + "logits/rejected": 497759078.4, + "logps/chosen": -357.4051106770833, + "logps/rejected": -468.446630859375, + "loss": 0.0226, + "rewards/chosen": 2.7824198404947915, + "rewards/margins": 12.072098795572916, + "rewards/rejected": -9.289678955078125, + "step": 9683 + }, + { + "epoch": 0.8847875742348105, + "grad_norm": 2.75, + "kl": 0.0, + "learning_rate": 3.256447349788644e-07, + "logits/chosen": 715538073.6, + "logits/rejected": 291867669.3333333, + "logps/chosen": -498.1330078125, + "logps/rejected": -460.6110026041667, + "loss": 0.0157, + "rewards/chosen": 4.245731353759766, + "rewards/margins": 15.188685099283854, + "rewards/rejected": -10.942953745524088, + "step": 9684 + }, + { + "epoch": 0.8848789401553221, + "grad_norm": 1.796875, + "kl": 0.0, + "learning_rate": 3.2513452729969144e-07, + "logits/chosen": 617924864.0, + "logits/rejected": 320381344.0, + "logps/chosen": -255.28645833333334, + "logps/rejected": -332.5142517089844, + "loss": 0.0129, + "rewards/chosen": 4.45491886138916, + "rewards/margins": 13.428759574890137, + "rewards/rejected": -8.973840713500977, + "step": 9685 + }, + { + "epoch": 0.8849703060758337, + "grad_norm": 1.078125, + "kl": 0.0, + "learning_rate": 3.246247061885793e-07, + "logits/chosen": 828106752.0, + "logits/rejected": 418110592.0, + "logps/chosen": -417.1781005859375, + "logps/rejected": -503.6962076822917, + "loss": 0.005, + "rewards/chosen": 3.896258592605591, + "rewards/margins": 14.564626614252726, + "rewards/rejected": -10.668368021647135, + "step": 9686 + }, + { + "epoch": 0.8850616719963453, + "grad_norm": 3.3125, + "kl": 0.0, + "learning_rate": 3.241152716876861e-07, + "logits/chosen": 595716672.0, + "logits/rejected": 397094400.0, + "logps/chosen": -370.28485107421875, + "logps/rejected": -593.2724609375, + "loss": 0.0149, + "rewards/chosen": 4.221867561340332, + "rewards/margins": 13.37503719329834, + "rewards/rejected": -9.153169631958008, + "step": 9687 + }, + { + "epoch": 0.8851530379168571, + "grad_norm": 8.0625, + "kl": 0.0, + "learning_rate": 3.236062238391358e-07, + "logits/chosen": 463232170.6666667, + "logits/rejected": 423784345.6, + "logps/chosen": -317.8787434895833, + "logps/rejected": -414.9736328125, + "loss": 0.1126, + "rewards/chosen": 2.585303624471029, + "rewards/margins": 13.354845555623372, + "rewards/rejected": -10.769541931152343, + "step": 9688 + }, + { + "epoch": 0.8852444038373687, + "grad_norm": 2.671875, + "kl": 0.0, + "learning_rate": 3.230975626850236e-07, + "logits/chosen": 364272096.0, + "logits/rejected": 552889408.0, + "logps/chosen": -236.2436065673828, + "logps/rejected": -417.9993896484375, + "loss": 0.031, + "rewards/chosen": 2.9438610076904297, + "rewards/margins": 12.956815719604492, + "rewards/rejected": -10.012954711914062, + "step": 9689 + }, + { + "epoch": 0.8853357697578803, + "grad_norm": 9.5625, + "kl": 0.0, + "learning_rate": 3.225892882674109e-07, + "logits/chosen": 357420492.8, + "logits/rejected": 559506517.3333334, + "logps/chosen": -244.144970703125, + "logps/rejected": -450.2696940104167, + "loss": 0.0855, + "rewards/chosen": 4.343304824829102, + "rewards/margins": 11.666794967651366, + "rewards/rejected": -7.323490142822266, + "step": 9690 + }, + { + "epoch": 0.8854271356783919, + "grad_norm": 56.0, + "kl": 0.0, + "learning_rate": 3.2208140062832685e-07, + "logits/chosen": 696250304.0, + "logits/rejected": 985869653.3333334, + "logps/chosen": -441.5428466796875, + "logps/rejected": -286.4849853515625, + "loss": 0.0451, + "rewards/chosen": 3.5868425369262695, + "rewards/margins": 10.513740857442219, + "rewards/rejected": -6.92689832051595, + "step": 9691 + }, + { + "epoch": 0.8855185015989037, + "grad_norm": 3.203125, + "kl": 0.0, + "learning_rate": 3.2157389980976783e-07, + "logits/chosen": 313217484.8, + "logits/rejected": 315652842.6666667, + "logps/chosen": -255.157861328125, + "logps/rejected": -399.0531819661458, + "loss": 0.0165, + "rewards/chosen": 4.224664306640625, + "rewards/margins": 12.240537897745767, + "rewards/rejected": -8.015873591105143, + "step": 9692 + }, + { + "epoch": 0.8856098675194153, + "grad_norm": 5.625, + "kl": 0.0, + "learning_rate": 3.21066785853702e-07, + "logits/chosen": 240023616.0, + "logits/rejected": 595468373.3333334, + "logps/chosen": -436.2166748046875, + "logps/rejected": -532.47412109375, + "loss": 0.0168, + "rewards/chosen": 2.724905490875244, + "rewards/margins": 11.688979307810465, + "rewards/rejected": -8.96407381693522, + "step": 9693 + }, + { + "epoch": 0.8857012334399269, + "grad_norm": 3.78125, + "kl": 0.0, + "learning_rate": 3.205600588020613e-07, + "logits/chosen": 916307456.0, + "logits/rejected": 562210389.3333334, + "logps/chosen": -659.2167358398438, + "logps/rejected": -376.4064534505208, + "loss": 0.0137, + "rewards/chosen": 2.9906814098358154, + "rewards/margins": 11.439644575119019, + "rewards/rejected": -8.448963165283203, + "step": 9694 + }, + { + "epoch": 0.8857925993604385, + "grad_norm": 1.3125, + "kl": 0.0, + "learning_rate": 3.200537186967484e-07, + "logits/chosen": 680468778.6666666, + "logits/rejected": 488205056.0, + "logps/chosen": -272.40651448567706, + "logps/rejected": -461.790234375, + "loss": 0.0061, + "rewards/chosen": 4.539933522542317, + "rewards/margins": 14.373370869954428, + "rewards/rejected": -9.83343734741211, + "step": 9695 + }, + { + "epoch": 0.8858839652809503, + "grad_norm": 1.984375, + "kl": 0.0, + "learning_rate": 3.1954776557963086e-07, + "logits/chosen": 488081517.71428573, + "logits/rejected": 285572416.0, + "logps/chosen": -341.66469029017856, + "logps/rejected": -501.09747314453125, + "loss": 0.014, + "rewards/chosen": 4.724659511021206, + "rewards/margins": 15.388050624302455, + "rewards/rejected": -10.66339111328125, + "step": 9696 + }, + { + "epoch": 0.8859753312014619, + "grad_norm": 2.75, + "kl": 0.0, + "learning_rate": 3.1904219949254856e-07, + "logits/chosen": 553539413.3333334, + "logits/rejected": 749504153.6, + "logps/chosen": -339.4987386067708, + "logps/rejected": -568.87822265625, + "loss": 0.0157, + "rewards/chosen": 3.2360312143961587, + "rewards/margins": 12.56656634012858, + "rewards/rejected": -9.330535125732421, + "step": 9697 + }, + { + "epoch": 0.8860666971219735, + "grad_norm": 8.3125, + "kl": 0.0, + "learning_rate": 3.1853702047730697e-07, + "logits/chosen": 529606613.3333333, + "logits/rejected": 322264544.0, + "logps/chosen": -219.65155029296875, + "logps/rejected": -206.50027465820312, + "loss": 0.0638, + "rewards/chosen": 4.38152535756429, + "rewards/margins": 8.440567811330158, + "rewards/rejected": -4.059042453765869, + "step": 9698 + }, + { + "epoch": 0.8861580630424851, + "grad_norm": 16.375, + "kl": 0.0, + "learning_rate": 3.180322285756782e-07, + "logits/chosen": 562877476.5714285, + "logits/rejected": 514708608.0, + "logps/chosen": -322.60438755580356, + "logps/rejected": -1424.53271484375, + "loss": 0.0368, + "rewards/chosen": 3.8439036778041293, + "rewards/margins": 24.615363257271902, + "rewards/rejected": -20.771459579467773, + "step": 9699 + }, + { + "epoch": 0.8862494289629969, + "grad_norm": 2.578125, + "kl": 0.0, + "learning_rate": 3.175278238294055e-07, + "logits/chosen": 576717721.6, + "logits/rejected": 516932394.6666667, + "logps/chosen": -435.041796875, + "logps/rejected": -534.582763671875, + "loss": 0.0182, + "rewards/chosen": 3.7232742309570312, + "rewards/margins": 15.079765319824219, + "rewards/rejected": -11.356491088867188, + "step": 9700 + }, + { + "epoch": 0.8863407948835085, + "grad_norm": 4.03125, + "kl": 3.484302520751953, + "learning_rate": 3.1702380628019723e-07, + "logits/chosen": 537307428.5714285, + "logits/rejected": 704217984.0, + "logps/chosen": -480.37587193080356, + "logps/rejected": -526.4056396484375, + "loss": 0.028, + "rewards/chosen": 4.16645622253418, + "rewards/margins": 13.365046501159668, + "rewards/rejected": -9.198590278625488, + "step": 9701 + }, + { + "epoch": 0.8864321608040201, + "grad_norm": 1.078125, + "kl": 0.0, + "learning_rate": 3.1652017596973174e-07, + "logits/chosen": 315847744.0, + "logits/rejected": 470346678.85714287, + "logps/chosen": -219.0970001220703, + "logps/rejected": -513.6348702566964, + "loss": 0.0035, + "rewards/chosen": 3.576887607574463, + "rewards/margins": 14.30356672831944, + "rewards/rejected": -10.726679120744977, + "step": 9702 + }, + { + "epoch": 0.8865235267245317, + "grad_norm": 2.421875, + "kl": 0.0, + "learning_rate": 3.1601693293965397e-07, + "logits/chosen": 596062272.0, + "logits/rejected": 322408512.0, + "logps/chosen": -265.8503112792969, + "logps/rejected": -429.954833984375, + "loss": 0.0171, + "rewards/chosen": 3.3837828636169434, + "rewards/margins": 12.149438381195068, + "rewards/rejected": -8.765655517578125, + "step": 9703 + }, + { + "epoch": 0.8866148926450435, + "grad_norm": 8.5, + "kl": 0.0, + "learning_rate": 3.1551407723157734e-07, + "logits/chosen": 655833344.0, + "logits/rejected": 1118441252.5714285, + "logps/chosen": -392.0536804199219, + "logps/rejected": -440.2372349330357, + "loss": 0.0443, + "rewards/chosen": 2.9119904041290283, + "rewards/margins": 11.591911145618983, + "rewards/rejected": -8.679920741489955, + "step": 9704 + }, + { + "epoch": 0.8867062585655551, + "grad_norm": 4.03125, + "kl": 0.0, + "learning_rate": 3.1501160888708413e-07, + "logits/chosen": 631610282.6666666, + "logits/rejected": 1099014348.8, + "logps/chosen": -312.2357177734375, + "logps/rejected": -637.70234375, + "loss": 0.0185, + "rewards/chosen": 3.0316670735677085, + "rewards/margins": 13.549084981282553, + "rewards/rejected": -10.517417907714844, + "step": 9705 + }, + { + "epoch": 0.8867976244860667, + "grad_norm": 2.40625, + "kl": 0.0, + "learning_rate": 3.1450952794772394e-07, + "logits/chosen": 1129072128.0, + "logits/rejected": 1146799206.4, + "logps/chosen": -448.4745686848958, + "logps/rejected": -552.8240234375, + "loss": 0.0123, + "rewards/chosen": 3.4215828577677407, + "rewards/margins": 13.221176973978677, + "rewards/rejected": -9.799594116210937, + "step": 9706 + }, + { + "epoch": 0.8868889904065783, + "grad_norm": 2.9375, + "kl": 0.0, + "learning_rate": 3.1400783445501347e-07, + "logits/chosen": 751804608.0, + "logits/rejected": 963861589.3333334, + "logps/chosen": -374.3720703125, + "logps/rejected": -446.4471842447917, + "loss": 0.0158, + "rewards/chosen": 2.705915927886963, + "rewards/margins": 11.912522474924723, + "rewards/rejected": -9.20660654703776, + "step": 9707 + }, + { + "epoch": 0.88698035632709, + "grad_norm": 3.125, + "kl": 0.0, + "learning_rate": 3.135065284504385e-07, + "logits/chosen": 573469952.0, + "logits/rejected": 414605504.0, + "logps/chosen": -388.636962890625, + "logps/rejected": -363.35015869140625, + "loss": 0.021, + "rewards/chosen": 3.4038305282592773, + "rewards/margins": 11.828389167785645, + "rewards/rejected": -8.424558639526367, + "step": 9708 + }, + { + "epoch": 0.8870717222476017, + "grad_norm": 1.5234375, + "kl": 0.0, + "learning_rate": 3.130056099754514e-07, + "logits/chosen": 533806080.0, + "logits/rejected": 562265753.6, + "logps/chosen": -197.2413330078125, + "logps/rejected": -327.5869384765625, + "loss": 0.0117, + "rewards/chosen": 3.61783504486084, + "rewards/margins": 11.245419883728028, + "rewards/rejected": -7.627584838867188, + "step": 9709 + }, + { + "epoch": 0.8871630881681133, + "grad_norm": 2.171875, + "kl": 0.0, + "learning_rate": 3.1250507907147563e-07, + "logits/chosen": 397028672.0, + "logits/rejected": 418147370.6666667, + "logps/chosen": -156.79139709472656, + "logps/rejected": -471.099365234375, + "loss": 0.0117, + "rewards/chosen": 3.4967877864837646, + "rewards/margins": 12.277635971705118, + "rewards/rejected": -8.780848185221354, + "step": 9710 + }, + { + "epoch": 0.8872544540886249, + "grad_norm": 2.5, + "kl": 0.0, + "learning_rate": 3.1200493577989875e-07, + "logits/chosen": 570355968.0, + "logits/rejected": 826958028.8, + "logps/chosen": -216.1681925455729, + "logps/rejected": -570.2484375, + "loss": 0.0128, + "rewards/chosen": 3.5339171091715493, + "rewards/margins": 12.71535784403483, + "rewards/rejected": -9.181440734863282, + "step": 9711 + }, + { + "epoch": 0.8873458200091366, + "grad_norm": 12.75, + "kl": 7.4171295166015625, + "learning_rate": 3.1150518014207877e-07, + "logits/chosen": 645092352.0, + "logits/rejected": 332877568.0, + "logps/chosen": -407.9868861607143, + "logps/rejected": -272.7968444824219, + "loss": 0.0891, + "rewards/chosen": 3.4991209847586497, + "rewards/margins": 11.550870214189802, + "rewards/rejected": -8.051749229431152, + "step": 9712 + }, + { + "epoch": 0.8874371859296483, + "grad_norm": 1.171875, + "kl": 0.0, + "learning_rate": 3.1100581219933923e-07, + "logits/chosen": 535276458.6666667, + "logits/rejected": 488685772.8, + "logps/chosen": -351.272216796875, + "logps/rejected": -384.11376953125, + "loss": 0.008, + "rewards/chosen": 4.041146914164226, + "rewards/margins": 12.359792391459148, + "rewards/rejected": -8.318645477294922, + "step": 9713 + }, + { + "epoch": 0.8875285518501599, + "grad_norm": 1.234375, + "kl": 0.0, + "learning_rate": 3.10506831992976e-07, + "logits/chosen": 293882752.0, + "logits/rejected": 433082965.3333333, + "logps/chosen": -69.53715515136719, + "logps/rejected": -652.6588541666666, + "loss": 0.009, + "rewards/chosen": 3.7086567878723145, + "rewards/margins": 15.243605772654215, + "rewards/rejected": -11.5349489847819, + "step": 9714 + }, + { + "epoch": 0.8876199177706715, + "grad_norm": 1.4609375, + "kl": 0.0, + "learning_rate": 3.1000823956424785e-07, + "logits/chosen": 580510933.3333334, + "logits/rejected": 503843020.8, + "logps/chosen": -417.7317301432292, + "logps/rejected": -537.32509765625, + "loss": 0.0094, + "rewards/chosen": 3.897342046101888, + "rewards/margins": 11.29148801167806, + "rewards/rejected": -7.394145965576172, + "step": 9715 + }, + { + "epoch": 0.8877112836911832, + "grad_norm": 2.828125, + "kl": 0.0, + "learning_rate": 3.0951003495438504e-07, + "logits/chosen": 580056678.4, + "logits/rejected": 609290666.6666666, + "logps/chosen": -372.310693359375, + "logps/rejected": -271.4679768880208, + "loss": 0.021, + "rewards/chosen": 3.6963050842285154, + "rewards/margins": 12.089012908935548, + "rewards/rejected": -8.392707824707031, + "step": 9716 + }, + { + "epoch": 0.8878026496116949, + "grad_norm": 0.57421875, + "kl": 0.0, + "learning_rate": 3.090122182045835e-07, + "logits/chosen": 488478165.3333333, + "logits/rejected": 405027481.6, + "logps/chosen": -249.9547119140625, + "logps/rejected": -560.524609375, + "loss": 0.0036, + "rewards/chosen": 5.924082438151042, + "rewards/margins": 14.06614023844401, + "rewards/rejected": -8.142057800292969, + "step": 9717 + }, + { + "epoch": 0.8878940155322065, + "grad_norm": 2.03125, + "kl": 0.0, + "learning_rate": 3.0851478935600864e-07, + "logits/chosen": 1104091989.3333333, + "logits/rejected": 1573464883.2, + "logps/chosen": -272.59690348307294, + "logps/rejected": -637.147265625, + "loss": 0.0111, + "rewards/chosen": 3.8025201161702475, + "rewards/margins": 14.200008519490561, + "rewards/rejected": -10.397488403320313, + "step": 9718 + }, + { + "epoch": 0.8879853814527181, + "grad_norm": 3.234375, + "kl": 0.0, + "learning_rate": 3.080177484497937e-07, + "logits/chosen": 446151424.0, + "logits/rejected": 325826624.0, + "logps/chosen": -276.04217529296875, + "logps/rejected": -343.83843994140625, + "loss": 0.0212, + "rewards/chosen": 3.3058085441589355, + "rewards/margins": 11.793881893157959, + "rewards/rejected": -8.488073348999023, + "step": 9719 + }, + { + "epoch": 0.8880767473732298, + "grad_norm": 3.421875, + "kl": 0.0, + "learning_rate": 3.0752109552703856e-07, + "logits/chosen": 328940672.0, + "logits/rejected": 440399667.2, + "logps/chosen": -287.6990966796875, + "logps/rejected": -392.798486328125, + "loss": 0.0124, + "rewards/chosen": 4.25906499226888, + "rewards/margins": 13.688798268636067, + "rewards/rejected": -9.429733276367188, + "step": 9720 + }, + { + "epoch": 0.8881681132937415, + "grad_norm": 2.234375, + "kl": 0.0, + "learning_rate": 3.0702483062881206e-07, + "logits/chosen": 227376341.33333334, + "logits/rejected": 664259123.2, + "logps/chosen": -402.3660888671875, + "logps/rejected": -547.32080078125, + "loss": 0.0076, + "rewards/chosen": 4.321338335673015, + "rewards/margins": 13.791943041483563, + "rewards/rejected": -9.470604705810548, + "step": 9721 + }, + { + "epoch": 0.8882594792142531, + "grad_norm": 1.15625, + "kl": 0.0, + "learning_rate": 3.0652895379614966e-07, + "logits/chosen": 503170218.6666667, + "logits/rejected": 496572928.0, + "logps/chosen": -352.6141764322917, + "logps/rejected": -455.12529296875, + "loss": 0.0059, + "rewards/chosen": 4.396783192952474, + "rewards/margins": 14.903870900472004, + "rewards/rejected": -10.507087707519531, + "step": 9722 + }, + { + "epoch": 0.8883508451347647, + "grad_norm": 2.34375, + "kl": 0.0, + "learning_rate": 3.0603346507005806e-07, + "logits/chosen": 829815910.4, + "logits/rejected": 854550698.6666666, + "logps/chosen": -302.1168212890625, + "logps/rejected": -602.7821858723959, + "loss": 0.0149, + "rewards/chosen": 3.9333118438720702, + "rewards/margins": 16.355991744995116, + "rewards/rejected": -12.422679901123047, + "step": 9723 + }, + { + "epoch": 0.8884422110552764, + "grad_norm": 1.5390625, + "kl": 0.0, + "learning_rate": 3.0553836449150777e-07, + "logits/chosen": 556501845.3333334, + "logits/rejected": 593159270.4, + "logps/chosen": -503.2278645833333, + "logps/rejected": -593.38369140625, + "loss": 0.0069, + "rewards/chosen": 4.295261700948079, + "rewards/margins": 13.26935838063558, + "rewards/rejected": -8.9740966796875, + "step": 9724 + }, + { + "epoch": 0.888533576975788, + "grad_norm": 8.625, + "kl": 0.0, + "learning_rate": 3.050436521014399e-07, + "logits/chosen": 457944832.0, + "logits/rejected": 695864780.8, + "logps/chosen": -234.088134765625, + "logps/rejected": -605.49775390625, + "loss": 0.1426, + "rewards/chosen": 2.335088094075521, + "rewards/margins": 9.54380849202474, + "rewards/rejected": -7.208720397949219, + "step": 9725 + }, + { + "epoch": 0.8886249428962997, + "grad_norm": 2.78125, + "kl": 0.0, + "learning_rate": 3.0454932794076133e-07, + "logits/chosen": 507947072.0, + "logits/rejected": 561139456.0, + "logps/chosen": -171.1442108154297, + "logps/rejected": -328.06634521484375, + "loss": 0.0189, + "rewards/chosen": 3.312474012374878, + "rewards/margins": 13.077685117721558, + "rewards/rejected": -9.76521110534668, + "step": 9726 + }, + { + "epoch": 0.8887163088168113, + "grad_norm": 4.09375, + "kl": 0.0, + "learning_rate": 3.040553920503503e-07, + "logits/chosen": 526294528.0, + "logits/rejected": 706487936.0, + "logps/chosen": -384.3227945963542, + "logps/rejected": -390.0263671875, + "loss": 0.0262, + "rewards/chosen": 3.6206607818603516, + "rewards/margins": 11.035361289978027, + "rewards/rejected": -7.414700508117676, + "step": 9727 + }, + { + "epoch": 0.888807674737323, + "grad_norm": 1.1484375, + "kl": 0.0, + "learning_rate": 3.0356184447104976e-07, + "logits/chosen": 416308480.0, + "logits/rejected": 613899520.0, + "logps/chosen": -294.54937744140625, + "logps/rejected": -473.3707275390625, + "loss": 0.0073, + "rewards/chosen": 4.451529502868652, + "rewards/margins": 12.727246284484863, + "rewards/rejected": -8.275716781616211, + "step": 9728 + }, + { + "epoch": 0.8888990406578346, + "grad_norm": 5.9375, + "kl": 0.0, + "learning_rate": 3.030686852436704e-07, + "logits/chosen": 319672128.0, + "logits/rejected": 738063104.0, + "logps/chosen": -241.72743225097656, + "logps/rejected": -354.8259684244792, + "loss": 0.013, + "rewards/chosen": 5.885169982910156, + "rewards/margins": 13.020578384399414, + "rewards/rejected": -7.135408401489258, + "step": 9729 + }, + { + "epoch": 0.8889904065783463, + "grad_norm": 1.5, + "kl": 0.0, + "learning_rate": 3.0257591440899235e-07, + "logits/chosen": 582700928.0, + "logits/rejected": 493898905.6, + "logps/chosen": -642.824462890625, + "logps/rejected": -539.10703125, + "loss": 0.0069, + "rewards/chosen": 4.297610600789388, + "rewards/margins": 12.799508794148764, + "rewards/rejected": -8.501898193359375, + "step": 9730 + }, + { + "epoch": 0.8890817724988579, + "grad_norm": 1.390625, + "kl": 0.0, + "learning_rate": 3.0208353200776366e-07, + "logits/chosen": 638844672.0, + "logits/rejected": 1010053888.0, + "logps/chosen": -547.2611694335938, + "logps/rejected": -437.7427978515625, + "loss": 0.0048, + "rewards/chosen": 4.484484672546387, + "rewards/margins": 12.323233604431152, + "rewards/rejected": -7.838748931884766, + "step": 9731 + }, + { + "epoch": 0.8891731384193696, + "grad_norm": 3.8125, + "kl": 0.0, + "learning_rate": 3.0159153808070054e-07, + "logits/chosen": 700724326.4, + "logits/rejected": 865583616.0, + "logps/chosen": -376.08427734375, + "logps/rejected": -427.8557942708333, + "loss": 0.0243, + "rewards/chosen": 3.392877960205078, + "rewards/margins": 12.258078638712565, + "rewards/rejected": -8.865200678507486, + "step": 9732 + }, + { + "epoch": 0.8892645043398812, + "grad_norm": 2.0625, + "kl": 0.0, + "learning_rate": 3.010999326684849e-07, + "logits/chosen": 554576512.0, + "logits/rejected": 1048275328.0, + "logps/chosen": -382.7500305175781, + "logps/rejected": -664.5891723632812, + "loss": 0.0114, + "rewards/chosen": 4.120082855224609, + "rewards/margins": 15.416070938110352, + "rewards/rejected": -11.295988082885742, + "step": 9733 + }, + { + "epoch": 0.8893558702603929, + "grad_norm": 36.75, + "kl": 0.0, + "learning_rate": 3.006087158117682e-07, + "logits/chosen": 628843673.6, + "logits/rejected": 313462634.6666667, + "logps/chosen": -388.3872314453125, + "logps/rejected": -358.0215250651042, + "loss": 0.0852, + "rewards/chosen": 3.636766815185547, + "rewards/margins": 10.534127298990885, + "rewards/rejected": -6.897360483805339, + "step": 9734 + }, + { + "epoch": 0.8894472361809045, + "grad_norm": 1.71875, + "kl": 0.0, + "learning_rate": 3.001178875511701e-07, + "logits/chosen": 860219200.0, + "logits/rejected": 494876608.0, + "logps/chosen": -196.22193908691406, + "logps/rejected": -427.675537109375, + "loss": 0.0149, + "rewards/chosen": 3.5622339248657227, + "rewards/margins": 12.296845436096191, + "rewards/rejected": -8.734611511230469, + "step": 9735 + }, + { + "epoch": 0.8895386021014162, + "grad_norm": 1.34375, + "kl": 0.0, + "learning_rate": 2.996274479272776e-07, + "logits/chosen": 216312960.0, + "logits/rejected": 312874176.0, + "logps/chosen": -187.1135711669922, + "logps/rejected": -441.8924967447917, + "loss": 0.0054, + "rewards/chosen": 4.848125457763672, + "rewards/margins": 14.706581751505533, + "rewards/rejected": -9.858456293741861, + "step": 9736 + }, + { + "epoch": 0.8896299680219278, + "grad_norm": 2.1875, + "kl": 0.0, + "learning_rate": 2.9913739698064545e-07, + "logits/chosen": 744437418.6666666, + "logits/rejected": 432914688.0, + "logps/chosen": -378.9506022135417, + "logps/rejected": -397.338623046875, + "loss": 0.0162, + "rewards/chosen": 4.086004892985026, + "rewards/margins": 13.475739161173504, + "rewards/rejected": -9.389734268188477, + "step": 9737 + }, + { + "epoch": 0.8897213339424395, + "grad_norm": 25.5, + "kl": 0.0, + "learning_rate": 2.986477347517952e-07, + "logits/chosen": 544280640.0, + "logits/rejected": 406385728.0, + "logps/chosen": -432.19439697265625, + "logps/rejected": -454.093994140625, + "loss": 0.0356, + "rewards/chosen": 4.119991302490234, + "rewards/margins": 10.47127628326416, + "rewards/rejected": -6.351284980773926, + "step": 9738 + }, + { + "epoch": 0.8898126998629511, + "grad_norm": 3.453125, + "kl": 0.0, + "learning_rate": 2.9815846128121885e-07, + "logits/chosen": 707433881.6, + "logits/rejected": 358353024.0, + "logps/chosen": -454.18486328125, + "logps/rejected": -410.473388671875, + "loss": 0.0209, + "rewards/chosen": 3.5310882568359374, + "rewards/margins": 12.45174191792806, + "rewards/rejected": -8.920653661092123, + "step": 9739 + }, + { + "epoch": 0.8899040657834628, + "grad_norm": 2.109375, + "kl": 0.0, + "learning_rate": 2.9766957660937455e-07, + "logits/chosen": 337779754.6666667, + "logits/rejected": 470091366.4, + "logps/chosen": -406.2403971354167, + "logps/rejected": -615.8947265625, + "loss": 0.0094, + "rewards/chosen": 3.7586488723754883, + "rewards/margins": 13.176188850402832, + "rewards/rejected": -9.417539978027344, + "step": 9740 + }, + { + "epoch": 0.8899954317039744, + "grad_norm": 2.328125, + "kl": 0.0, + "learning_rate": 2.971810807766884e-07, + "logits/chosen": 678923968.0, + "logits/rejected": 619875712.0, + "logps/chosen": -423.5150451660156, + "logps/rejected": -618.9256591796875, + "loss": 0.0138, + "rewards/chosen": 3.7156078815460205, + "rewards/margins": 14.120325326919556, + "rewards/rejected": -10.404717445373535, + "step": 9741 + }, + { + "epoch": 0.8900867976244861, + "grad_norm": 2.65625, + "kl": 0.0, + "learning_rate": 2.9669297382355354e-07, + "logits/chosen": 574744874.6666666, + "logits/rejected": 494997811.2, + "logps/chosen": -304.68666585286456, + "logps/rejected": -530.213427734375, + "loss": 0.014, + "rewards/chosen": 3.698491414388021, + "rewards/margins": 14.266829427083334, + "rewards/rejected": -10.568338012695312, + "step": 9742 + }, + { + "epoch": 0.8901781635449977, + "grad_norm": 1.3515625, + "kl": 0.0, + "learning_rate": 2.9620525579033274e-07, + "logits/chosen": 435310677.3333333, + "logits/rejected": 778080819.2, + "logps/chosen": -202.12150065104166, + "logps/rejected": -432.05478515625, + "loss": 0.0099, + "rewards/chosen": 4.344574928283691, + "rewards/margins": 14.756926918029786, + "rewards/rejected": -10.412351989746094, + "step": 9743 + }, + { + "epoch": 0.8902695294655094, + "grad_norm": 38.0, + "kl": 0.0, + "learning_rate": 2.95717926717356e-07, + "logits/chosen": 407299456.0, + "logits/rejected": 478776512.0, + "logps/chosen": -341.2613220214844, + "logps/rejected": -458.5779724121094, + "loss": 0.0446, + "rewards/chosen": 4.604557991027832, + "rewards/margins": 11.653324127197266, + "rewards/rejected": -7.048766136169434, + "step": 9744 + }, + { + "epoch": 0.890360895386021, + "grad_norm": 5.21875, + "kl": 0.0, + "learning_rate": 2.9523098664492046e-07, + "logits/chosen": 554628480.0, + "logits/rejected": 435049984.0, + "logps/chosen": -399.60040283203125, + "logps/rejected": -622.0457763671875, + "loss": 0.0259, + "rewards/chosen": 3.2767350673675537, + "rewards/margins": 12.943618535995483, + "rewards/rejected": -9.66688346862793, + "step": 9745 + }, + { + "epoch": 0.8904522613065327, + "grad_norm": 3.171875, + "kl": 0.0, + "learning_rate": 2.947444356132917e-07, + "logits/chosen": 625731686.4, + "logits/rejected": 489269589.3333333, + "logps/chosen": -221.450244140625, + "logps/rejected": -540.3631998697916, + "loss": 0.0177, + "rewards/chosen": 4.019578552246093, + "rewards/margins": 13.212757364908853, + "rewards/rejected": -9.19317881266276, + "step": 9746 + }, + { + "epoch": 0.8905436272270443, + "grad_norm": 2.875, + "kl": 0.0, + "learning_rate": 2.94258273662702e-07, + "logits/chosen": 695573696.0, + "logits/rejected": 467598784.0, + "logps/chosen": -251.39910888671875, + "logps/rejected": -524.8583984375, + "loss": 0.0208, + "rewards/chosen": 3.359091281890869, + "rewards/margins": 13.055795192718506, + "rewards/rejected": -9.696703910827637, + "step": 9747 + }, + { + "epoch": 0.890634993147556, + "grad_norm": 3.328125, + "kl": 0.0, + "learning_rate": 2.937725008333547e-07, + "logits/chosen": 897295360.0, + "logits/rejected": 557286400.0, + "logps/chosen": -501.10321044921875, + "logps/rejected": -437.7752990722656, + "loss": 0.0219, + "rewards/chosen": 3.1559853553771973, + "rewards/margins": 14.018947124481201, + "rewards/rejected": -10.862961769104004, + "step": 9748 + }, + { + "epoch": 0.8907263590680676, + "grad_norm": 38.0, + "kl": 0.0, + "learning_rate": 2.9328711716541725e-07, + "logits/chosen": 475799552.0, + "logits/rejected": 704736832.0, + "logps/chosen": -191.59312438964844, + "logps/rejected": -444.898681640625, + "loss": 0.0263, + "rewards/chosen": 3.4935905933380127, + "rewards/margins": 12.993556261062622, + "rewards/rejected": -9.49996566772461, + "step": 9749 + }, + { + "epoch": 0.8908177249885793, + "grad_norm": 5.53125, + "kl": 0.0, + "learning_rate": 2.928021226990263e-07, + "logits/chosen": 498674858.6666667, + "logits/rejected": 614194560.0, + "logps/chosen": -338.6885172526042, + "logps/rejected": -1048.14404296875, + "loss": 0.0602, + "rewards/chosen": 2.8713245391845703, + "rewards/margins": 21.589981079101562, + "rewards/rejected": -18.718656539916992, + "step": 9750 + }, + { + "epoch": 0.8909090909090909, + "grad_norm": 65.0, + "kl": 0.0, + "learning_rate": 2.9231751747428647e-07, + "logits/chosen": 618624384.0, + "logits/rejected": 833015936.0, + "logps/chosen": -346.0440368652344, + "logps/rejected": -615.8934326171875, + "loss": 0.0698, + "rewards/chosen": 2.552302837371826, + "rewards/margins": 12.870082378387451, + "rewards/rejected": -10.317779541015625, + "step": 9751 + }, + { + "epoch": 0.8910004568296026, + "grad_norm": 3.21875, + "kl": 0.0, + "learning_rate": 2.9183330153127023e-07, + "logits/chosen": 1115462997.3333333, + "logits/rejected": 545236992.0, + "logps/chosen": -301.2262369791667, + "logps/rejected": -464.895556640625, + "loss": 0.0231, + "rewards/chosen": 3.282801946004232, + "rewards/margins": 13.26251843770345, + "rewards/rejected": -9.979716491699218, + "step": 9752 + }, + { + "epoch": 0.8910918227501142, + "grad_norm": 2.625, + "kl": 0.0, + "learning_rate": 2.913494749100182e-07, + "logits/chosen": 471567136.0, + "logits/rejected": 552335232.0, + "logps/chosen": -390.431640625, + "logps/rejected": -520.1387939453125, + "loss": 0.0127, + "rewards/chosen": 3.8994195461273193, + "rewards/margins": 13.918131589889526, + "rewards/rejected": -10.018712043762207, + "step": 9753 + }, + { + "epoch": 0.8911831886706258, + "grad_norm": 1.421875, + "kl": 0.0, + "learning_rate": 2.908660376505379e-07, + "logits/chosen": 635142954.6666666, + "logits/rejected": 980281139.2, + "logps/chosen": -349.6289876302083, + "logps/rejected": -574.105615234375, + "loss": 0.0072, + "rewards/chosen": 4.006661097208659, + "rewards/margins": 12.984526697794596, + "rewards/rejected": -8.977865600585938, + "step": 9754 + }, + { + "epoch": 0.8912745545911375, + "grad_norm": 1.8046875, + "kl": 0.0, + "learning_rate": 2.903829897928051e-07, + "logits/chosen": 594733465.6, + "logits/rejected": 1495835136.0, + "logps/chosen": -341.708642578125, + "logps/rejected": -583.6144612630209, + "loss": 0.012, + "rewards/chosen": 4.285694885253906, + "rewards/margins": 13.826116689046223, + "rewards/rejected": -9.540421803792318, + "step": 9755 + }, + { + "epoch": 0.8913659205116492, + "grad_norm": 5.15625, + "kl": 2.5554141998291016, + "learning_rate": 2.8990033137676397e-07, + "logits/chosen": 689829595.4285715, + "logits/rejected": 385644224.0, + "logps/chosen": -305.57889229910717, + "logps/rejected": -532.3251953125, + "loss": 0.0431, + "rewards/chosen": 3.3776062556675504, + "rewards/margins": 9.735308987753733, + "rewards/rejected": -6.357702732086182, + "step": 9756 + }, + { + "epoch": 0.8914572864321608, + "grad_norm": 11.875, + "kl": 0.0, + "learning_rate": 2.8941806244232486e-07, + "logits/chosen": 413294745.6, + "logits/rejected": 513718698.6666667, + "logps/chosen": -274.4985595703125, + "logps/rejected": -776.23583984375, + "loss": 0.0309, + "rewards/chosen": 4.098035430908203, + "rewards/margins": 14.672820027669271, + "rewards/rejected": -10.574784596761068, + "step": 9757 + }, + { + "epoch": 0.8915486523526724, + "grad_norm": 1.2421875, + "kl": 0.0, + "learning_rate": 2.8893618302936797e-07, + "logits/chosen": 496416832.0, + "logits/rejected": 469624608.0, + "logps/chosen": -313.019287109375, + "logps/rejected": -398.9631042480469, + "loss": 0.0068, + "rewards/chosen": 4.645007133483887, + "rewards/margins": 14.577103614807129, + "rewards/rejected": -9.932096481323242, + "step": 9758 + }, + { + "epoch": 0.8916400182731841, + "grad_norm": 2.390625, + "kl": 0.0, + "learning_rate": 2.884546931777388e-07, + "logits/chosen": 486394453.3333333, + "logits/rejected": 206845104.0, + "logps/chosen": -265.0830485026042, + "logps/rejected": -293.33660888671875, + "loss": 0.0155, + "rewards/chosen": 4.4448191324869795, + "rewards/margins": 15.222228686014812, + "rewards/rejected": -10.777409553527832, + "step": 9759 + }, + { + "epoch": 0.8917313841936958, + "grad_norm": 4.28125, + "kl": 0.0, + "learning_rate": 2.879735929272537e-07, + "logits/chosen": 476963072.0, + "logits/rejected": 572371763.2, + "logps/chosen": -206.7335205078125, + "logps/rejected": -401.7406005859375, + "loss": 0.0213, + "rewards/chosen": 3.7572269439697266, + "rewards/margins": 12.42381019592285, + "rewards/rejected": -8.666583251953124, + "step": 9760 + }, + { + "epoch": 0.8918227501142074, + "grad_norm": 7.96875, + "kl": 0.0, + "learning_rate": 2.8749288231769423e-07, + "logits/chosen": 368047680.0, + "logits/rejected": 369555328.0, + "logps/chosen": -246.89556884765625, + "logps/rejected": -354.4415283203125, + "loss": 0.0827, + "rewards/chosen": 5.77187967300415, + "rewards/margins": 12.823854923248291, + "rewards/rejected": -7.051975250244141, + "step": 9761 + }, + { + "epoch": 0.891914116034719, + "grad_norm": 1.5546875, + "kl": 0.0, + "learning_rate": 2.870125613888114e-07, + "logits/chosen": 525569760.0, + "logits/rejected": 417833600.0, + "logps/chosen": -274.12310791015625, + "logps/rejected": -518.647705078125, + "loss": 0.0089, + "rewards/chosen": 3.3373823165893555, + "rewards/margins": 12.869983355204264, + "rewards/rejected": -9.532601038614908, + "step": 9762 + }, + { + "epoch": 0.8920054819552307, + "grad_norm": 4.375, + "kl": 0.0, + "learning_rate": 2.865326301803234e-07, + "logits/chosen": 723148714.6666666, + "logits/rejected": 743028480.0, + "logps/chosen": -511.6409505208333, + "logps/rejected": -611.99384765625, + "loss": 0.0207, + "rewards/chosen": 3.1056833267211914, + "rewards/margins": 13.695852088928223, + "rewards/rejected": -10.590168762207032, + "step": 9763 + }, + { + "epoch": 0.8920968478757424, + "grad_norm": 2.140625, + "kl": 0.0, + "learning_rate": 2.8605308873191405e-07, + "logits/chosen": 701776768.0, + "logits/rejected": 310763488.0, + "logps/chosen": -335.3515319824219, + "logps/rejected": -543.25830078125, + "loss": 0.0107, + "rewards/chosen": 4.326257705688477, + "rewards/margins": 15.283863067626953, + "rewards/rejected": -10.957605361938477, + "step": 9764 + }, + { + "epoch": 0.892188213796254, + "grad_norm": 1.703125, + "kl": 0.0, + "learning_rate": 2.8557393708324e-07, + "logits/chosen": 386864864.0, + "logits/rejected": 402921792.0, + "logps/chosen": -290.8660583496094, + "logps/rejected": -391.3873596191406, + "loss": 0.0113, + "rewards/chosen": 3.9251816272735596, + "rewards/margins": 12.83753228187561, + "rewards/rejected": -8.91235065460205, + "step": 9765 + }, + { + "epoch": 0.8922795797167656, + "grad_norm": 1.921875, + "kl": 0.0, + "learning_rate": 2.850951752739212e-07, + "logits/chosen": 693763392.0, + "logits/rejected": 675077184.0, + "logps/chosen": -367.85040283203125, + "logps/rejected": -701.2701416015625, + "loss": 0.0135, + "rewards/chosen": 3.686983585357666, + "rewards/margins": 17.25723123550415, + "rewards/rejected": -13.570247650146484, + "step": 9766 + }, + { + "epoch": 0.8923709456372773, + "grad_norm": 2.921875, + "kl": 0.0, + "learning_rate": 2.8461680334354647e-07, + "logits/chosen": 640489728.0, + "logits/rejected": 526680832.0, + "logps/chosen": -261.1198323567708, + "logps/rejected": -485.26055908203125, + "loss": 0.0229, + "rewards/chosen": 3.7777201334635415, + "rewards/margins": 14.064636866251627, + "rewards/rejected": -10.286916732788086, + "step": 9767 + }, + { + "epoch": 0.892462311557789, + "grad_norm": 1.7265625, + "kl": 0.0, + "learning_rate": 2.841388213316726e-07, + "logits/chosen": 410661418.6666667, + "logits/rejected": 351830336.0, + "logps/chosen": -379.596923828125, + "logps/rejected": -431.8153076171875, + "loss": 0.0086, + "rewards/chosen": 5.036827723185222, + "rewards/margins": 13.522537867228191, + "rewards/rejected": -8.485710144042969, + "step": 9768 + }, + { + "epoch": 0.8925536774783006, + "grad_norm": 30.375, + "kl": 0.0, + "learning_rate": 2.836612292778251e-07, + "logits/chosen": 970266560.0, + "logits/rejected": 526078016.0, + "logps/chosen": -309.59332275390625, + "logps/rejected": -395.1790771484375, + "loss": 0.0436, + "rewards/chosen": 3.145070791244507, + "rewards/margins": 12.061110258102417, + "rewards/rejected": -8.91603946685791, + "step": 9769 + }, + { + "epoch": 0.8926450433988122, + "grad_norm": 2.546875, + "kl": 0.0, + "learning_rate": 2.831840272214964e-07, + "logits/chosen": 359729856.0, + "logits/rejected": 467203840.0, + "logps/chosen": -341.1363525390625, + "logps/rejected": -441.0901794433594, + "loss": 0.0172, + "rewards/chosen": 3.703237533569336, + "rewards/margins": 12.224287033081055, + "rewards/rejected": -8.521049499511719, + "step": 9770 + }, + { + "epoch": 0.8927364093193239, + "grad_norm": 0.98046875, + "kl": 0.0, + "learning_rate": 2.827072152021465e-07, + "logits/chosen": 568135552.0, + "logits/rejected": 890523733.3333334, + "logps/chosen": -281.4851989746094, + "logps/rejected": -398.4064127604167, + "loss": 0.0046, + "rewards/chosen": 4.437567710876465, + "rewards/margins": 13.206528663635254, + "rewards/rejected": -8.768960952758789, + "step": 9771 + }, + { + "epoch": 0.8928277752398356, + "grad_norm": 2.28125, + "kl": 0.0, + "learning_rate": 2.8223079325920333e-07, + "logits/rejected": 618237952.0, + "logps/rejected": -494.72021484375, + "loss": 0.1011, + "rewards/rejected": -8.949467658996582, + "step": 9772 + }, + { + "epoch": 0.8929191411603472, + "grad_norm": 1.375, + "kl": 0.0, + "learning_rate": 2.817547614320615e-07, + "logits/chosen": 323921971.2, + "logits/rejected": 309818602.6666667, + "logps/chosen": -275.5464111328125, + "logps/rejected": -407.4342447916667, + "loss": 0.01, + "rewards/chosen": 4.619966888427735, + "rewards/margins": 14.637277984619141, + "rewards/rejected": -10.017311096191406, + "step": 9773 + }, + { + "epoch": 0.8930105070808588, + "grad_norm": 3.265625, + "kl": 0.0, + "learning_rate": 2.8127911976008615e-07, + "logits/chosen": 779146956.8, + "logits/rejected": 702223360.0, + "logps/chosen": -370.2498291015625, + "logps/rejected": -690.25341796875, + "loss": 0.0227, + "rewards/chosen": 3.5409881591796877, + "rewards/margins": 13.846320978800456, + "rewards/rejected": -10.305332819620768, + "step": 9774 + }, + { + "epoch": 0.8931018730013704, + "grad_norm": 3.203125, + "kl": 0.0, + "learning_rate": 2.8080386828260754e-07, + "logits/chosen": 635785216.0, + "logits/rejected": 508044096.0, + "logps/chosen": -277.35784912109375, + "logps/rejected": -375.15740966796875, + "loss": 0.0258, + "rewards/chosen": 3.554088274637858, + "rewards/margins": 11.818105379740397, + "rewards/rejected": -8.264017105102539, + "step": 9775 + }, + { + "epoch": 0.8931932389218822, + "grad_norm": 3.8125, + "kl": 0.0, + "learning_rate": 2.8032900703892473e-07, + "logits/chosen": 856168521.1428572, + "logits/rejected": 566323072.0, + "logps/chosen": -287.8115931919643, + "logps/rejected": -362.9071044921875, + "loss": 0.0343, + "rewards/chosen": 3.6750790732247487, + "rewards/margins": 11.890634264264788, + "rewards/rejected": -8.215555191040039, + "step": 9776 + }, + { + "epoch": 0.8932846048423938, + "grad_norm": 2.890625, + "kl": 0.0, + "learning_rate": 2.7985453606830357e-07, + "logits/chosen": 561114944.0, + "logits/rejected": 1215600512.0, + "logps/chosen": -368.3634338378906, + "logps/rejected": -700.028564453125, + "loss": 0.0107, + "rewards/chosen": 4.9302263259887695, + "rewards/margins": 15.050127029418945, + "rewards/rejected": -10.119900703430176, + "step": 9777 + }, + { + "epoch": 0.8933759707629054, + "grad_norm": 2.015625, + "kl": 0.0, + "learning_rate": 2.7938045540997995e-07, + "logits/chosen": 512381269.3333333, + "logits/rejected": 911190323.2, + "logps/chosen": -290.08935546875, + "logps/rejected": -481.0521484375, + "loss": 0.009, + "rewards/chosen": 4.015908241271973, + "rewards/margins": 14.189528846740723, + "rewards/rejected": -10.17362060546875, + "step": 9778 + }, + { + "epoch": 0.893467336683417, + "grad_norm": 0.4453125, + "kl": 0.0, + "learning_rate": 2.789067651031552e-07, + "logits/chosen": 555806549.3333334, + "logits/rejected": 701249484.8, + "logps/chosen": -214.38214111328125, + "logps/rejected": -971.9767578125, + "loss": 0.0024, + "rewards/chosen": 5.246946334838867, + "rewards/margins": 18.102897262573244, + "rewards/rejected": -12.855950927734375, + "step": 9779 + }, + { + "epoch": 0.8935587026039288, + "grad_norm": 5.59375, + "kl": 0.0, + "learning_rate": 2.784334651869991e-07, + "logits/chosen": 418184832.0, + "logits/rejected": 491079680.0, + "logps/chosen": -407.3968505859375, + "logps/rejected": -384.128173828125, + "loss": 0.0465, + "rewards/chosen": 2.362673759460449, + "rewards/margins": 10.404847145080566, + "rewards/rejected": -8.042173385620117, + "step": 9780 + }, + { + "epoch": 0.8936500685244404, + "grad_norm": 1.5703125, + "kl": 0.0, + "learning_rate": 2.7796055570064817e-07, + "logits/chosen": 631678464.0, + "logits/rejected": 369271744.0, + "logps/chosen": -480.4383544921875, + "logps/rejected": -528.064453125, + "loss": 0.01, + "rewards/chosen": 3.9455370903015137, + "rewards/margins": 14.307126522064209, + "rewards/rejected": -10.361589431762695, + "step": 9781 + }, + { + "epoch": 0.893741434444952, + "grad_norm": 1.3984375, + "kl": 0.0, + "learning_rate": 2.774880366832106e-07, + "logits/chosen": 433365833.14285713, + "logits/rejected": 471037504.0, + "logps/chosen": -232.55671037946428, + "logps/rejected": -696.818603515625, + "loss": 0.0106, + "rewards/chosen": 4.741796766008649, + "rewards/margins": 11.934203897203718, + "rewards/rejected": -7.192407131195068, + "step": 9782 + }, + { + "epoch": 0.8938328003654636, + "grad_norm": 3.3125, + "kl": 0.0, + "learning_rate": 2.770159081737561e-07, + "logits/chosen": 292328338.28571427, + "logits/rejected": 559746176.0, + "logps/chosen": -295.1133510044643, + "logps/rejected": -670.8441162109375, + "loss": 0.024, + "rewards/chosen": 3.937199456351144, + "rewards/margins": 13.819828850882395, + "rewards/rejected": -9.88262939453125, + "step": 9783 + }, + { + "epoch": 0.8939241662859754, + "grad_norm": 3.34375, + "kl": 0.0, + "learning_rate": 2.7654417021132694e-07, + "logits/chosen": 621572778.6666666, + "logits/rejected": 607133798.4, + "logps/chosen": -302.8981119791667, + "logps/rejected": -325.401904296875, + "loss": 0.017, + "rewards/chosen": 3.429145177205404, + "rewards/margins": 11.233457310994467, + "rewards/rejected": -7.804312133789063, + "step": 9784 + }, + { + "epoch": 0.894015532206487, + "grad_norm": 2.5625, + "kl": 0.0, + "learning_rate": 2.760728228349302e-07, + "logits/chosen": 548176537.6, + "logits/rejected": 580184832.0, + "logps/chosen": -310.889111328125, + "logps/rejected": -826.875244140625, + "loss": 0.0124, + "rewards/chosen": 4.529863357543945, + "rewards/margins": 16.426223119099937, + "rewards/rejected": -11.89635976155599, + "step": 9785 + }, + { + "epoch": 0.8941068981269986, + "grad_norm": 1.8828125, + "kl": 0.0, + "learning_rate": 2.7560186608354355e-07, + "logits/chosen": 560069339.4285715, + "logits/rejected": 243377760.0, + "logps/chosen": -446.55887276785717, + "logps/rejected": -118.79846954345703, + "loss": 0.0109, + "rewards/chosen": 4.924591064453125, + "rewards/margins": 11.370530605316162, + "rewards/rejected": -6.445939540863037, + "step": 9786 + }, + { + "epoch": 0.8941982640475102, + "grad_norm": 2.609375, + "kl": 0.0, + "learning_rate": 2.751312999961103e-07, + "logits/chosen": 660249088.0, + "logits/rejected": 529942112.0, + "logps/chosen": -281.0015869140625, + "logps/rejected": -432.5483093261719, + "loss": 0.0158, + "rewards/chosen": 3.8830995559692383, + "rewards/margins": 12.167014122009277, + "rewards/rejected": -8.283914566040039, + "step": 9787 + }, + { + "epoch": 0.894289629968022, + "grad_norm": 1.328125, + "kl": 0.0, + "learning_rate": 2.7466112461154106e-07, + "logits/chosen": 969803434.6666666, + "logits/rejected": 658620800.0, + "logps/chosen": -362.781494140625, + "logps/rejected": -659.0128173828125, + "loss": 0.0105, + "rewards/chosen": 4.308908462524414, + "rewards/margins": 13.426877975463867, + "rewards/rejected": -9.117969512939453, + "step": 9788 + }, + { + "epoch": 0.8943809958885336, + "grad_norm": 2.421875, + "kl": 0.0, + "learning_rate": 2.741913399687157e-07, + "logits/chosen": 880972185.6, + "logits/rejected": 603369386.6666666, + "logps/chosen": -455.58173828125, + "logps/rejected": -461.8931477864583, + "loss": 0.0147, + "rewards/chosen": 4.097016906738281, + "rewards/margins": 13.0874267578125, + "rewards/rejected": -8.990409851074219, + "step": 9789 + }, + { + "epoch": 0.8944723618090452, + "grad_norm": 4.9375, + "kl": 0.0, + "learning_rate": 2.7372194610648105e-07, + "logits/chosen": 252848016.0, + "logits/rejected": 503894624.0, + "logps/chosen": -505.85809326171875, + "logps/rejected": -391.1602783203125, + "loss": 0.0149, + "rewards/chosen": 3.8394954204559326, + "rewards/margins": 11.900912046432495, + "rewards/rejected": -8.061416625976562, + "step": 9790 + }, + { + "epoch": 0.8945637277295568, + "grad_norm": 2.609375, + "kl": 0.0, + "learning_rate": 2.732529430636516e-07, + "logits/chosen": 431729459.2, + "logits/rejected": 854689706.6666666, + "logps/chosen": -237.6451171875, + "logps/rejected": -598.1284993489584, + "loss": 0.0251, + "rewards/chosen": 3.251250076293945, + "rewards/margins": 14.044316736857095, + "rewards/rejected": -10.79306666056315, + "step": 9791 + }, + { + "epoch": 0.8946550936500686, + "grad_norm": 3.53125, + "kl": 0.0, + "learning_rate": 2.727843308790096e-07, + "logits/chosen": 425555968.0, + "logits/rejected": 371074048.0, + "logps/chosen": -271.8912760416667, + "logps/rejected": -385.18902587890625, + "loss": 0.0303, + "rewards/chosen": 3.6684443155924478, + "rewards/margins": 11.679107348124186, + "rewards/rejected": -8.010663032531738, + "step": 9792 + }, + { + "epoch": 0.8947464595705802, + "grad_norm": 0.60546875, + "kl": 0.0, + "learning_rate": 2.723161095913046e-07, + "logits/chosen": 532843264.0, + "logits/rejected": 654442432.0, + "logps/chosen": -267.62384033203125, + "logps/rejected": -649.812744140625, + "loss": 0.0036, + "rewards/chosen": 5.492765426635742, + "rewards/margins": 13.9356689453125, + "rewards/rejected": -8.442903518676758, + "step": 9793 + }, + { + "epoch": 0.8948378254910918, + "grad_norm": 4.25, + "kl": 0.0, + "learning_rate": 2.7184827923925403e-07, + "logits/chosen": 432540211.2, + "logits/rejected": 421416704.0, + "logps/chosen": -253.2522216796875, + "logps/rejected": -381.107421875, + "loss": 0.0292, + "rewards/chosen": 3.604787826538086, + "rewards/margins": 14.558070500691732, + "rewards/rejected": -10.953282674153646, + "step": 9794 + }, + { + "epoch": 0.8949291914116034, + "grad_norm": 0.625, + "kl": 0.0, + "learning_rate": 2.7138083986154417e-07, + "logits/chosen": 608352896.0, + "logits/rejected": 417596013.71428573, + "logps/chosen": -575.1893310546875, + "logps/rejected": -538.2314453125, + "loss": 0.0018, + "rewards/chosen": 4.27838134765625, + "rewards/margins": 14.857934134347099, + "rewards/rejected": -10.579552786690849, + "step": 9795 + }, + { + "epoch": 0.8950205573321152, + "grad_norm": 0.2578125, + "kl": 0.0, + "learning_rate": 2.7091379149682683e-07, + "logits/chosen": 491464192.0, + "logits/rejected": 762682002.2857143, + "logps/chosen": -318.6048278808594, + "logps/rejected": -505.84859793526783, + "loss": 0.0013, + "rewards/chosen": 4.859021186828613, + "rewards/margins": 14.11963585444859, + "rewards/rejected": -9.260614667619977, + "step": 9796 + }, + { + "epoch": 0.8951119232526268, + "grad_norm": 1.3828125, + "kl": 0.0, + "learning_rate": 2.704471341837234e-07, + "logits/chosen": 756576768.0, + "logits/rejected": 757811840.0, + "logps/chosen": -341.9844970703125, + "logps/rejected": -567.587646484375, + "loss": 0.0075, + "rewards/chosen": 4.852065086364746, + "rewards/margins": 15.133927345275879, + "rewards/rejected": -10.281862258911133, + "step": 9797 + }, + { + "epoch": 0.8952032891731384, + "grad_norm": 2.0, + "kl": 0.0, + "learning_rate": 2.6998086796082066e-07, + "logits/chosen": 723260928.0, + "logits/rejected": 648225280.0, + "logps/chosen": -328.1837972005208, + "logps/rejected": -528.9837890625, + "loss": 0.0136, + "rewards/chosen": 3.382766087849935, + "rewards/margins": 12.91761105855306, + "rewards/rejected": -9.534844970703125, + "step": 9798 + }, + { + "epoch": 0.89529465509365, + "grad_norm": 2.109375, + "kl": 0.0, + "learning_rate": 2.6951499286667626e-07, + "logits/chosen": 625681024.0, + "logits/rejected": 451883136.0, + "logps/chosen": -280.8379211425781, + "logps/rejected": -355.17950439453125, + "loss": 0.0191, + "rewards/chosen": 3.426168918609619, + "rewards/margins": 13.588390827178955, + "rewards/rejected": -10.162221908569336, + "step": 9799 + }, + { + "epoch": 0.8953860210141618, + "grad_norm": 1.46875, + "kl": 0.0, + "learning_rate": 2.690495089398132e-07, + "logits/chosen": 434955520.0, + "logits/rejected": 488427776.0, + "logps/chosen": -382.1827392578125, + "logps/rejected": -628.5234375, + "loss": 0.0072, + "rewards/chosen": 3.515608310699463, + "rewards/margins": 14.87477477391561, + "rewards/rejected": -11.359166463216146, + "step": 9800 + }, + { + "epoch": 0.8954773869346734, + "grad_norm": 2.046875, + "kl": 0.0, + "learning_rate": 2.6858441621872235e-07, + "logits/chosen": 997654272.0, + "logits/rejected": 1037682346.6666666, + "logps/chosen": -312.27490234375, + "logps/rejected": -553.2925618489584, + "loss": 0.0176, + "rewards/chosen": 2.903761863708496, + "rewards/margins": 13.842703819274902, + "rewards/rejected": -10.938941955566406, + "step": 9801 + }, + { + "epoch": 0.895568752855185, + "grad_norm": 2.203125, + "kl": 0.0, + "learning_rate": 2.681197147418624e-07, + "logits/chosen": 770699456.0, + "logits/rejected": 510220842.6666667, + "logps/chosen": -188.66275024414062, + "logps/rejected": -418.569091796875, + "loss": 0.0145, + "rewards/chosen": 2.9023513793945312, + "rewards/margins": 11.705829620361328, + "rewards/rejected": -8.803478240966797, + "step": 9802 + }, + { + "epoch": 0.8956601187756966, + "grad_norm": 4.25, + "kl": 0.0, + "learning_rate": 2.676554045476604e-07, + "logits/chosen": 546112109.7142857, + "logits/rejected": 404087808.0, + "logps/chosen": -274.6705845424107, + "logps/rejected": -315.6473388671875, + "loss": 0.0404, + "rewards/chosen": 3.2464381626674106, + "rewards/margins": 10.709298270089285, + "rewards/rejected": -7.462860107421875, + "step": 9803 + }, + { + "epoch": 0.8957514846962084, + "grad_norm": 0.93359375, + "kl": 0.0, + "learning_rate": 2.671914856745106e-07, + "logits/chosen": 550266880.0, + "logits/rejected": 749420714.6666666, + "logps/chosen": -354.7498474121094, + "logps/rejected": -528.5028483072916, + "loss": 0.004, + "rewards/chosen": 4.371585845947266, + "rewards/margins": 13.96808115641276, + "rewards/rejected": -9.596495310465494, + "step": 9804 + }, + { + "epoch": 0.89584285061672, + "grad_norm": 6.8125, + "kl": 0.0, + "learning_rate": 2.667279581607746e-07, + "logits/chosen": 620471893.3333334, + "logits/rejected": 345411686.4, + "logps/chosen": -436.0072428385417, + "logps/rejected": -278.797021484375, + "loss": 0.0917, + "rewards/chosen": 4.540052731831868, + "rewards/margins": 11.382977231343588, + "rewards/rejected": -6.842924499511719, + "step": 9805 + }, + { + "epoch": 0.8959342165372316, + "grad_norm": 1.3359375, + "kl": 0.0, + "learning_rate": 2.662648220447811e-07, + "logits/chosen": 406924032.0, + "logits/rejected": 440159786.6666667, + "logps/chosen": -314.601806640625, + "logps/rejected": -522.2943522135416, + "loss": 0.0081, + "rewards/chosen": 4.475140380859375, + "rewards/margins": 13.374138641357423, + "rewards/rejected": -8.898998260498047, + "step": 9806 + }, + { + "epoch": 0.8960255824577432, + "grad_norm": 3.34375, + "kl": 0.0, + "learning_rate": 2.6580207736482787e-07, + "logits/chosen": 589008128.0, + "logits/rejected": 398042176.0, + "logps/chosen": -372.9925537109375, + "logps/rejected": -528.575439453125, + "loss": 0.0203, + "rewards/chosen": 3.468728542327881, + "rewards/margins": 14.670855045318604, + "rewards/rejected": -11.202126502990723, + "step": 9807 + }, + { + "epoch": 0.896116948378255, + "grad_norm": 22.5, + "kl": 0.0, + "learning_rate": 2.6533972415918044e-07, + "logits/chosen": 460796032.0, + "logits/rejected": 614080102.4, + "logps/chosen": -284.7383626302083, + "logps/rejected": -574.90615234375, + "loss": 0.0205, + "rewards/chosen": 4.120814959208171, + "rewards/margins": 13.866609636942545, + "rewards/rejected": -9.745794677734375, + "step": 9808 + }, + { + "epoch": 0.8962083142987666, + "grad_norm": 2.859375, + "kl": 0.0, + "learning_rate": 2.648777624660698e-07, + "logits/chosen": 541025600.0, + "logits/rejected": 587378048.0, + "logps/chosen": -355.8335266113281, + "logps/rejected": -575.0899658203125, + "loss": 0.0161, + "rewards/chosen": 3.9061803817749023, + "rewards/margins": 12.967999458312988, + "rewards/rejected": -9.061819076538086, + "step": 9809 + }, + { + "epoch": 0.8962996802192782, + "grad_norm": 2.3125, + "kl": 0.0, + "learning_rate": 2.644161923236954e-07, + "logits/chosen": 323930016.0, + "logits/rejected": 511819904.0, + "logps/chosen": -244.57925415039062, + "logps/rejected": -540.64892578125, + "loss": 0.0132, + "rewards/chosen": 4.412054061889648, + "rewards/margins": 13.791125297546387, + "rewards/rejected": -9.379071235656738, + "step": 9810 + }, + { + "epoch": 0.8963910461397898, + "grad_norm": 3.015625, + "kl": 0.0, + "learning_rate": 2.639550137702268e-07, + "logits/chosen": 558706112.0, + "logits/rejected": 266687424.0, + "logps/chosen": -352.03192138671875, + "logps/rejected": -467.2812093098958, + "loss": 0.0158, + "rewards/chosen": 2.7506651878356934, + "rewards/margins": 11.006749629974365, + "rewards/rejected": -8.256084442138672, + "step": 9811 + }, + { + "epoch": 0.8964824120603015, + "grad_norm": 1.515625, + "kl": 0.0, + "learning_rate": 2.634942268437979e-07, + "logits/chosen": 852041045.3333334, + "logits/rejected": 687655731.2, + "logps/chosen": -298.811279296875, + "logps/rejected": -733.380859375, + "loss": 0.0071, + "rewards/chosen": 4.320070266723633, + "rewards/margins": 14.801405715942384, + "rewards/rejected": -10.48133544921875, + "step": 9812 + }, + { + "epoch": 0.8965737779808132, + "grad_norm": 1.96875, + "kl": 0.0, + "learning_rate": 2.630338315825115e-07, + "logits/chosen": 498077013.3333333, + "logits/rejected": 653906688.0, + "logps/chosen": -313.8494466145833, + "logps/rejected": -703.102783203125, + "loss": 0.0136, + "rewards/chosen": 4.089759190877278, + "rewards/margins": 14.56541856129964, + "rewards/rejected": -10.475659370422363, + "step": 9813 + }, + { + "epoch": 0.8966651439013248, + "grad_norm": 3.484375, + "kl": 0.0, + "learning_rate": 2.625738280244383e-07, + "logits/chosen": 513013862.4, + "logits/rejected": 425640789.3333333, + "logps/chosen": -307.946435546875, + "logps/rejected": -438.1599934895833, + "loss": 0.0254, + "rewards/chosen": 3.367035675048828, + "rewards/margins": 10.968941497802735, + "rewards/rejected": -7.601905822753906, + "step": 9814 + }, + { + "epoch": 0.8967565098218364, + "grad_norm": 2.5, + "kl": 0.0, + "learning_rate": 2.6211421620761567e-07, + "logits/chosen": 504838528.0, + "logits/rejected": 970538944.0, + "logps/chosen": -287.46893310546875, + "logps/rejected": -806.0152587890625, + "loss": 0.0198, + "rewards/chosen": 3.974842071533203, + "rewards/margins": 18.516456604003906, + "rewards/rejected": -14.541614532470703, + "step": 9815 + }, + { + "epoch": 0.8968478757423481, + "grad_norm": 1.9765625, + "kl": 0.0, + "learning_rate": 2.6165499617005095e-07, + "logits/chosen": 432818560.0, + "logits/rejected": 478570197.3333333, + "logps/chosen": -417.7925720214844, + "logps/rejected": -458.7819010416667, + "loss": 0.0053, + "rewards/chosen": 4.414935111999512, + "rewards/margins": 13.44017759958903, + "rewards/rejected": -9.025242487589518, + "step": 9816 + }, + { + "epoch": 0.8969392416628598, + "grad_norm": 4.46875, + "kl": 0.0, + "learning_rate": 2.6119616794971605e-07, + "logits/chosen": 588219699.2, + "logits/rejected": 604850560.0, + "logps/chosen": -415.328125, + "logps/rejected": -502.5916748046875, + "loss": 0.0225, + "rewards/chosen": 4.170603942871094, + "rewards/margins": 12.172066243489585, + "rewards/rejected": -8.00146230061849, + "step": 9817 + }, + { + "epoch": 0.8970306075833714, + "grad_norm": 1.1953125, + "kl": 0.0, + "learning_rate": 2.6073773158455164e-07, + "logits/chosen": 528918976.0, + "logits/rejected": 1059156032.0, + "logps/chosen": -234.42295837402344, + "logps/rejected": -398.3227233886719, + "loss": 0.0102, + "rewards/chosen": 4.053046226501465, + "rewards/margins": 13.517645835876465, + "rewards/rejected": -9.464599609375, + "step": 9818 + }, + { + "epoch": 0.897121973503883, + "grad_norm": 3.703125, + "kl": 0.0, + "learning_rate": 2.602796871124663e-07, + "logits/chosen": 1152358997.3333333, + "logits/rejected": 482403328.0, + "logps/chosen": -357.7423095703125, + "logps/rejected": -379.93896484375, + "loss": 0.0236, + "rewards/chosen": 2.9575684865315757, + "rewards/margins": 10.653366216023763, + "rewards/rejected": -7.695797729492187, + "step": 9819 + }, + { + "epoch": 0.8972133394243947, + "grad_norm": 2.6875, + "kl": 0.0, + "learning_rate": 2.5982203457133704e-07, + "logits/chosen": 961477939.2, + "logits/rejected": 461403904.0, + "logps/chosen": -195.7660400390625, + "logps/rejected": -304.43210856119794, + "loss": 0.0227, + "rewards/chosen": 4.049595642089844, + "rewards/margins": 13.27158940633138, + "rewards/rejected": -9.221993764241537, + "step": 9820 + }, + { + "epoch": 0.8973047053449064, + "grad_norm": 12.3125, + "kl": 0.0, + "learning_rate": 2.593647739990068e-07, + "logits/chosen": 545478016.0, + "logits/rejected": 694077056.0, + "logps/chosen": -318.3155517578125, + "logps/rejected": -547.0234375, + "loss": 0.0979, + "rewards/chosen": 3.6462769508361816, + "rewards/margins": 12.616783618927002, + "rewards/rejected": -8.97050666809082, + "step": 9821 + }, + { + "epoch": 0.897396071265418, + "grad_norm": 16.625, + "kl": 0.0, + "learning_rate": 2.5890790543328716e-07, + "logits/chosen": 390264992.0, + "logits/rejected": 379450816.0, + "logps/chosen": -272.6634826660156, + "logps/rejected": -371.7315673828125, + "loss": 0.0453, + "rewards/chosen": 2.887810230255127, + "rewards/margins": 11.50767469406128, + "rewards/rejected": -8.619864463806152, + "step": 9822 + }, + { + "epoch": 0.8974874371859296, + "grad_norm": 1.828125, + "kl": 0.0, + "learning_rate": 2.58451428911955e-07, + "logits/chosen": 406388224.0, + "logits/rejected": 410930278.4, + "logps/chosen": -211.1478068033854, + "logps/rejected": -456.949560546875, + "loss": 0.0098, + "rewards/chosen": 4.218203862508138, + "rewards/margins": 12.835672124226889, + "rewards/rejected": -8.61746826171875, + "step": 9823 + }, + { + "epoch": 0.8975788031064413, + "grad_norm": 3.421875, + "kl": 0.0, + "learning_rate": 2.5799534447275953e-07, + "logits/chosen": 731446528.0, + "logits/rejected": 712593536.0, + "logps/chosen": -483.8063151041667, + "logps/rejected": -529.25537109375, + "loss": 0.0245, + "rewards/chosen": 3.769198735555013, + "rewards/margins": 11.191348393758139, + "rewards/rejected": -7.422149658203125, + "step": 9824 + }, + { + "epoch": 0.897670169026953, + "grad_norm": 5.5, + "kl": 0.0, + "learning_rate": 2.5753965215341347e-07, + "logits/chosen": 446995163.4285714, + "logits/rejected": 626202752.0, + "logps/chosen": -296.40335518973217, + "logps/rejected": -653.22802734375, + "loss": 0.0535, + "rewards/chosen": 3.1960062299455916, + "rewards/margins": 12.137956074305944, + "rewards/rejected": -8.941949844360352, + "step": 9825 + }, + { + "epoch": 0.8977615349474646, + "grad_norm": 40.5, + "kl": 0.0, + "learning_rate": 2.570843519915983e-07, + "logits/chosen": 393789952.0, + "logits/rejected": 293118720.0, + "logps/chosen": -202.0724080403646, + "logps/rejected": -360.13787841796875, + "loss": 0.0892, + "rewards/chosen": 3.742321014404297, + "rewards/margins": 12.6679048538208, + "rewards/rejected": -8.925583839416504, + "step": 9826 + }, + { + "epoch": 0.8978529008679762, + "grad_norm": 1.2265625, + "kl": 0.0, + "learning_rate": 2.5662944402496284e-07, + "logits/chosen": 713662976.0, + "logits/rejected": 448369792.0, + "logps/chosen": -595.3658447265625, + "logps/rejected": -524.422119140625, + "loss": 0.0061, + "rewards/chosen": 4.6623430252075195, + "rewards/margins": 16.290424346923828, + "rewards/rejected": -11.628081321716309, + "step": 9827 + }, + { + "epoch": 0.8979442667884879, + "grad_norm": 3.6875, + "kl": 0.0, + "learning_rate": 2.5617492829112356e-07, + "logits/chosen": 407406656.0, + "logits/rejected": 615295744.0, + "logps/chosen": -279.07916259765625, + "logps/rejected": -549.2230224609375, + "loss": 0.0204, + "rewards/chosen": 4.294894218444824, + "rewards/margins": 16.54472064971924, + "rewards/rejected": -12.249826431274414, + "step": 9828 + }, + { + "epoch": 0.8980356327089996, + "grad_norm": 1.6953125, + "kl": 0.0, + "learning_rate": 2.557208048276655e-07, + "logits/chosen": 625754752.0, + "logits/rejected": 408929536.0, + "logps/chosen": -357.4869689941406, + "logps/rejected": -424.9731140136719, + "loss": 0.0128, + "rewards/chosen": 3.830050230026245, + "rewards/margins": 12.903803586959839, + "rewards/rejected": -9.073753356933594, + "step": 9829 + }, + { + "epoch": 0.8981269986295112, + "grad_norm": 4.375, + "kl": 0.0, + "learning_rate": 2.5526707367214077e-07, + "logits/chosen": 390909312.0, + "logits/rejected": 272690816.0, + "logps/chosen": -299.0155944824219, + "logps/rejected": -488.7037353515625, + "loss": 0.0261, + "rewards/chosen": 3.3114748001098633, + "rewards/margins": 12.718669891357422, + "rewards/rejected": -9.407195091247559, + "step": 9830 + }, + { + "epoch": 0.8982183645500228, + "grad_norm": 3.71875, + "kl": 0.0, + "learning_rate": 2.548137348620677e-07, + "logits/chosen": 653006592.0, + "logits/rejected": 388435916.8, + "logps/chosen": -448.2833251953125, + "logps/rejected": -402.846337890625, + "loss": 0.0148, + "rewards/chosen": 4.042195638020833, + "rewards/margins": 12.90410130818685, + "rewards/rejected": -8.861905670166015, + "step": 9831 + }, + { + "epoch": 0.8983097304705345, + "grad_norm": 0.16796875, + "kl": 0.0, + "learning_rate": 2.5436078843493306e-07, + "logits/rejected": 375440704.0, + "logps/rejected": -467.86029052734375, + "loss": 0.0008, + "rewards/rejected": -8.762319564819336, + "step": 9832 + }, + { + "epoch": 0.8984010963910462, + "grad_norm": 3.8125, + "kl": 0.0, + "learning_rate": 2.5390823442819234e-07, + "logits/chosen": 838330675.2, + "logits/rejected": 632390912.0, + "logps/chosen": -405.91103515625, + "logps/rejected": -360.6796468098958, + "loss": 0.019, + "rewards/chosen": 4.03833236694336, + "rewards/margins": 12.779979451497397, + "rewards/rejected": -8.741647084554037, + "step": 9833 + }, + { + "epoch": 0.8984924623115578, + "grad_norm": 2.03125, + "kl": 0.0, + "learning_rate": 2.534560728792673e-07, + "logits/chosen": 487151520.0, + "logits/rejected": 856209536.0, + "logps/chosen": -241.24002075195312, + "logps/rejected": -644.6414184570312, + "loss": 0.0131, + "rewards/chosen": 4.121585369110107, + "rewards/margins": 14.665915966033936, + "rewards/rejected": -10.544330596923828, + "step": 9834 + }, + { + "epoch": 0.8985838282320694, + "grad_norm": 0.72265625, + "kl": 0.0, + "learning_rate": 2.5300430382554864e-07, + "logits/chosen": 342934080.0, + "logits/rejected": 516594176.0, + "logps/chosen": -145.02835083007812, + "logps/rejected": -550.5745239257812, + "loss": 0.0068, + "rewards/chosen": 4.3568902015686035, + "rewards/margins": 14.401469707489014, + "rewards/rejected": -10.04457950592041, + "step": 9835 + }, + { + "epoch": 0.8986751941525811, + "grad_norm": 57.25, + "kl": 2.725292205810547, + "learning_rate": 2.5255292730438976e-07, + "logits/chosen": 403344530.28571427, + "logits/rejected": 573946624.0, + "logps/chosen": -303.3401576450893, + "logps/rejected": -490.0550842285156, + "loss": 0.0682, + "rewards/chosen": 3.6778782435825894, + "rewards/margins": 13.390348298209055, + "rewards/rejected": -9.712470054626465, + "step": 9836 + }, + { + "epoch": 0.8987665600730927, + "grad_norm": 1.5859375, + "kl": 0.0, + "learning_rate": 2.521019433531191e-07, + "logits/chosen": 336406570.6666667, + "logits/rejected": 468261785.6, + "logps/chosen": -217.970703125, + "logps/rejected": -604.254052734375, + "loss": 0.0121, + "rewards/chosen": 4.0646317799886065, + "rewards/margins": 13.412302525838214, + "rewards/rejected": -9.347670745849609, + "step": 9837 + }, + { + "epoch": 0.8988579259936044, + "grad_norm": 1.9296875, + "kl": 0.0, + "learning_rate": 2.5165135200902745e-07, + "logits/chosen": 812706496.0, + "logits/rejected": 408437589.3333333, + "logps/chosen": -519.7261352539062, + "logps/rejected": -443.6448567708333, + "loss": 0.1099, + "rewards/chosen": 4.073051452636719, + "rewards/margins": 12.890605926513672, + "rewards/rejected": -8.817554473876953, + "step": 9838 + }, + { + "epoch": 0.898949291914116, + "grad_norm": 1.875, + "kl": 0.0, + "learning_rate": 2.512011533093739e-07, + "logits/chosen": 625580160.0, + "logits/rejected": 545961088.0, + "logps/chosen": -209.0885772705078, + "logps/rejected": -401.06396484375, + "loss": 0.0106, + "rewards/chosen": 4.276874542236328, + "rewards/margins": 13.80363655090332, + "rewards/rejected": -9.526762008666992, + "step": 9839 + }, + { + "epoch": 0.8990406578346277, + "grad_norm": 1.765625, + "kl": 0.0, + "learning_rate": 2.507513472913864e-07, + "logits/chosen": 586165589.3333334, + "logits/rejected": 517989580.8, + "logps/chosen": -275.1215006510417, + "logps/rejected": -591.80107421875, + "loss": 0.0085, + "rewards/chosen": 4.033235549926758, + "rewards/margins": 15.294878005981445, + "rewards/rejected": -11.261642456054688, + "step": 9840 + }, + { + "epoch": 0.8991320237551393, + "grad_norm": 53.75, + "kl": 0.0, + "learning_rate": 2.503019339922602e-07, + "logits/chosen": 459501632.0, + "logits/rejected": 506246784.0, + "logps/chosen": -166.7777557373047, + "logps/rejected": -474.95245361328125, + "loss": 0.0817, + "rewards/chosen": 3.763005256652832, + "rewards/margins": 11.651318550109863, + "rewards/rejected": -7.888313293457031, + "step": 9841 + }, + { + "epoch": 0.899223389675651, + "grad_norm": 2.71875, + "kl": 0.0, + "learning_rate": 2.4985291344915675e-07, + "logits/chosen": 484914688.0, + "logits/rejected": 552803072.0, + "logps/chosen": -281.33697509765625, + "logps/rejected": -518.2396240234375, + "loss": 0.0207, + "rewards/chosen": 3.2587714195251465, + "rewards/margins": 14.1233229637146, + "rewards/rejected": -10.864551544189453, + "step": 9842 + }, + { + "epoch": 0.8993147555961626, + "grad_norm": 3.03125, + "kl": 0.0, + "learning_rate": 2.4940428569920683e-07, + "logits/chosen": 748667238.4, + "logits/rejected": 490803840.0, + "logps/chosen": -394.9162353515625, + "logps/rejected": -478.5048014322917, + "loss": 0.0209, + "rewards/chosen": 3.605142593383789, + "rewards/margins": 10.812564214070637, + "rewards/rejected": -7.207421620686849, + "step": 9843 + }, + { + "epoch": 0.8994061215166743, + "grad_norm": 4.9375, + "kl": 0.0, + "learning_rate": 2.489560507795069e-07, + "logits/chosen": 372670464.0, + "logits/rejected": 192984640.0, + "logps/chosen": -325.6689453125, + "logps/rejected": -405.7860921223958, + "loss": 0.0405, + "rewards/chosen": 3.020580291748047, + "rewards/margins": 14.76177724202474, + "rewards/rejected": -11.741196950276693, + "step": 9844 + }, + { + "epoch": 0.8994974874371859, + "grad_norm": 0.77734375, + "kl": 0.0, + "learning_rate": 2.485082087271218e-07, + "logits/chosen": 477552256.0, + "logits/rejected": 463236288.0, + "logps/chosen": -423.60137939453125, + "logps/rejected": -590.7508544921875, + "loss": 0.0043, + "rewards/chosen": 5.066858291625977, + "rewards/margins": 14.397336959838867, + "rewards/rejected": -9.33047866821289, + "step": 9845 + }, + { + "epoch": 0.8995888533576976, + "grad_norm": 2.515625, + "kl": 0.0, + "learning_rate": 2.480607595790846e-07, + "logits/chosen": 381304883.2, + "logits/rejected": 602059008.0, + "logps/chosen": -142.51318359375, + "logps/rejected": -572.3662109375, + "loss": 0.0322, + "rewards/chosen": 3.6439971923828125, + "rewards/margins": 12.552497863769531, + "rewards/rejected": -8.908500671386719, + "step": 9846 + }, + { + "epoch": 0.8996802192782092, + "grad_norm": 1.5859375, + "kl": 0.0, + "learning_rate": 2.476137033723952e-07, + "logits/chosen": 697978572.8, + "logits/rejected": 769332138.6666666, + "logps/chosen": -417.13203125, + "logps/rejected": -793.3748372395834, + "loss": 0.0107, + "rewards/chosen": 4.1773124694824215, + "rewards/margins": 18.527420298258463, + "rewards/rejected": -14.350107828776041, + "step": 9847 + }, + { + "epoch": 0.8997715851987209, + "grad_norm": 0.59765625, + "kl": 0.0, + "learning_rate": 2.4716704014402013e-07, + "logits/chosen": 954292053.3333334, + "logits/rejected": 628765644.8, + "logps/chosen": -241.37939453125, + "logps/rejected": -515.57607421875, + "loss": 0.0032, + "rewards/chosen": 4.967245737711589, + "rewards/margins": 15.304918162027995, + "rewards/rejected": -10.337672424316406, + "step": 9848 + }, + { + "epoch": 0.8998629511192325, + "grad_norm": 1.3671875, + "kl": 0.0, + "learning_rate": 2.467207699308949e-07, + "logits/chosen": 360829760.0, + "logits/rejected": 463816416.0, + "logps/chosen": -333.14697265625, + "logps/rejected": -447.96734619140625, + "loss": 0.0066, + "rewards/chosen": 4.900599002838135, + "rewards/margins": 14.9239182472229, + "rewards/rejected": -10.023319244384766, + "step": 9849 + }, + { + "epoch": 0.8999543170397442, + "grad_norm": 1.8203125, + "kl": 0.0, + "learning_rate": 2.462748927699221e-07, + "logits/chosen": 650472106.6666666, + "logits/rejected": 890589388.8, + "logps/chosen": -287.09812418619794, + "logps/rejected": -445.519873046875, + "loss": 0.0095, + "rewards/chosen": 3.9834060668945312, + "rewards/margins": 14.548341369628906, + "rewards/rejected": -10.564935302734375, + "step": 9850 + }, + { + "epoch": 0.9000456829602558, + "grad_norm": 2.6875, + "kl": 0.0, + "learning_rate": 2.4582940869797177e-07, + "logits/chosen": 488437792.0, + "logits/rejected": 538548096.0, + "logps/chosen": -357.30865478515625, + "logps/rejected": -835.434814453125, + "loss": 0.0135, + "rewards/chosen": 4.044161319732666, + "rewards/margins": 13.872808933258057, + "rewards/rejected": -9.82864761352539, + "step": 9851 + }, + { + "epoch": 0.9001370488807675, + "grad_norm": 3.1875, + "kl": 0.0, + "learning_rate": 2.4538431775188053e-07, + "logits/chosen": 700710848.0, + "logits/rejected": 886675626.6666666, + "logps/chosen": -234.49249267578125, + "logps/rejected": -602.8397623697916, + "loss": 0.0098, + "rewards/chosen": 3.5103912353515625, + "rewards/margins": 12.621405919392904, + "rewards/rejected": -9.111014684041342, + "step": 9852 + }, + { + "epoch": 0.9002284148012791, + "grad_norm": 5.09375, + "kl": 0.0, + "learning_rate": 2.449396199684534e-07, + "logits/chosen": 591979520.0, + "logits/rejected": 422109107.2, + "logps/chosen": -206.53401692708334, + "logps/rejected": -397.60029296875, + "loss": 0.1106, + "rewards/chosen": 3.5173327128092446, + "rewards/margins": 10.819669596354167, + "rewards/rejected": -7.302336883544922, + "step": 9853 + }, + { + "epoch": 0.9003197807217908, + "grad_norm": 5.53125, + "kl": 0.0, + "learning_rate": 2.444953153844637e-07, + "logits/chosen": 412672810.6666667, + "logits/rejected": 854910272.0, + "logps/chosen": -305.61163330078125, + "logps/rejected": -418.290771484375, + "loss": 0.034, + "rewards/chosen": 3.8023910522460938, + "rewards/margins": 12.659175872802734, + "rewards/rejected": -8.85678482055664, + "step": 9854 + }, + { + "epoch": 0.9004111466423024, + "grad_norm": 0.65625, + "kl": 0.0, + "learning_rate": 2.4405140403665093e-07, + "logits/chosen": 576458197.3333334, + "logits/rejected": 736927846.4, + "logps/chosen": -275.3297119140625, + "logps/rejected": -677.237451171875, + "loss": 0.0036, + "rewards/chosen": 4.849257151285808, + "rewards/margins": 16.87849146525065, + "rewards/rejected": -12.029234313964844, + "step": 9855 + }, + { + "epoch": 0.9005025125628141, + "grad_norm": 2.21875, + "kl": 0.0, + "learning_rate": 2.436078859617219e-07, + "logits/chosen": 427989401.6, + "logits/rejected": 378239829.3333333, + "logps/chosen": -269.758056640625, + "logps/rejected": -393.0873616536458, + "loss": 0.0112, + "rewards/chosen": 4.587954711914063, + "rewards/margins": 12.74335594177246, + "rewards/rejected": -8.155401229858398, + "step": 9856 + }, + { + "epoch": 0.9005938784833257, + "grad_norm": 2.546875, + "kl": 0.0, + "learning_rate": 2.431647611963517e-07, + "logits/chosen": 750845235.2, + "logits/rejected": 535333162.6666667, + "logps/chosen": -261.2282470703125, + "logps/rejected": -469.9464518229167, + "loss": 0.017, + "rewards/chosen": 3.9555587768554688, + "rewards/margins": 13.728950500488281, + "rewards/rejected": -9.773391723632812, + "step": 9857 + }, + { + "epoch": 0.9006852444038373, + "grad_norm": 2.53125, + "kl": 0.0, + "learning_rate": 2.4272202977718205e-07, + "logits/chosen": 640629760.0, + "logits/rejected": 1326340403.2, + "logps/chosen": -347.9064127604167, + "logps/rejected": -878.6419921875, + "loss": 0.0313, + "rewards/chosen": 3.0696690877278647, + "rewards/margins": 13.88802693684896, + "rewards/rejected": -10.818357849121094, + "step": 9858 + }, + { + "epoch": 0.900776610324349, + "grad_norm": 3.421875, + "kl": 0.0, + "learning_rate": 2.4227969174082376e-07, + "logits/chosen": 523532800.0, + "logits/rejected": 407769344.0, + "logps/chosen": -401.0384928385417, + "logps/rejected": -423.14716796875, + "loss": 0.0223, + "rewards/chosen": 2.78935178120931, + "rewards/margins": 11.262848790486654, + "rewards/rejected": -8.473497009277343, + "step": 9859 + }, + { + "epoch": 0.9008679762448607, + "grad_norm": 3.375, + "kl": 0.0, + "learning_rate": 2.418377471238542e-07, + "logits/chosen": 717440512.0, + "logits/rejected": 457929248.0, + "logps/chosen": -371.12139892578125, + "logps/rejected": -428.4222412109375, + "loss": 0.0205, + "rewards/chosen": 3.5796120166778564, + "rewards/margins": 13.480406999588013, + "rewards/rejected": -9.900794982910156, + "step": 9860 + }, + { + "epoch": 0.9009593421653723, + "grad_norm": 31.875, + "kl": 0.0, + "learning_rate": 2.4139619596281747e-07, + "logits/chosen": 455485856.0, + "logits/rejected": 392923328.0, + "logps/chosen": -314.12738037109375, + "logps/rejected": -424.98919677734375, + "loss": 0.0267, + "rewards/chosen": 3.3460912704467773, + "rewards/margins": 11.928970336914062, + "rewards/rejected": -8.582879066467285, + "step": 9861 + }, + { + "epoch": 0.901050708085884, + "grad_norm": 3.25, + "kl": 0.0, + "learning_rate": 2.4095503829422606e-07, + "logits/chosen": 405625130.6666667, + "logits/rejected": 398673740.8, + "logps/chosen": -336.6695963541667, + "logps/rejected": -526.29365234375, + "loss": 0.0154, + "rewards/chosen": 4.372607549031575, + "rewards/margins": 13.655734380086262, + "rewards/rejected": -9.283126831054688, + "step": 9862 + }, + { + "epoch": 0.9011420740063956, + "grad_norm": 3.4375, + "kl": 0.0, + "learning_rate": 2.405142741545591e-07, + "logits/chosen": 530201600.0, + "logits/rejected": 1249483776.0, + "logps/chosen": -294.2432861328125, + "logps/rejected": -549.2840576171875, + "loss": 0.0263, + "rewards/chosen": 3.9725755055745444, + "rewards/margins": 14.891151746114096, + "rewards/rejected": -10.91857624053955, + "step": 9863 + }, + { + "epoch": 0.9012334399269073, + "grad_norm": 0.59765625, + "kl": 0.0, + "learning_rate": 2.400739035802641e-07, + "logits/chosen": 798232960.0, + "logits/rejected": 675331200.0, + "logps/chosen": -276.01171875, + "logps/rejected": -763.6097005208334, + "loss": 0.0024, + "rewards/chosen": 5.014410495758057, + "rewards/margins": 15.785625298817953, + "rewards/rejected": -10.771214803059896, + "step": 9864 + }, + { + "epoch": 0.9013248058474189, + "grad_norm": 2.03125, + "kl": 0.0, + "learning_rate": 2.3963392660775576e-07, + "logits/chosen": 529690720.0, + "logits/rejected": 408211200.0, + "logps/chosen": -353.1547546386719, + "logps/rejected": -377.05218505859375, + "loss": 0.0116, + "rewards/chosen": 4.363409996032715, + "rewards/margins": 10.776814460754395, + "rewards/rejected": -6.41340446472168, + "step": 9865 + }, + { + "epoch": 0.9014161717679305, + "grad_norm": 1.4375, + "kl": 0.0, + "learning_rate": 2.391943432734151e-07, + "logits/chosen": 569441109.3333334, + "logits/rejected": 460648448.0, + "logps/chosen": -369.6332600911458, + "logps/rejected": -405.36279296875, + "loss": 0.0094, + "rewards/chosen": 3.752145449320475, + "rewards/margins": 12.45819403330485, + "rewards/rejected": -8.706048583984375, + "step": 9866 + }, + { + "epoch": 0.9015075376884422, + "grad_norm": 2.484375, + "kl": 0.0, + "learning_rate": 2.3875515361359347e-07, + "logits/chosen": 981403328.0, + "logits/rejected": 407674944.0, + "logps/chosen": -329.6847839355469, + "logps/rejected": -360.79754638671875, + "loss": 0.0137, + "rewards/chosen": 3.987015724182129, + "rewards/margins": 12.349217414855957, + "rewards/rejected": -8.362201690673828, + "step": 9867 + }, + { + "epoch": 0.9015989036089539, + "grad_norm": 0.92578125, + "kl": 0.0, + "learning_rate": 2.3831635766460638e-07, + "logits/chosen": 503914688.0, + "logits/rejected": 744450048.0, + "logps/chosen": -367.1804504394531, + "logps/rejected": -581.43798828125, + "loss": 0.0025, + "rewards/chosen": 3.9372832775115967, + "rewards/margins": 14.880374533789498, + "rewards/rejected": -10.943091256277901, + "step": 9868 + }, + { + "epoch": 0.9016902695294655, + "grad_norm": 1.8828125, + "kl": 0.0, + "learning_rate": 2.3787795546273807e-07, + "logits/chosen": 651799808.0, + "logits/rejected": 609802624.0, + "logps/chosen": -347.24822998046875, + "logps/rejected": -620.81201171875, + "loss": 0.012, + "rewards/chosen": 3.9088079929351807, + "rewards/margins": 14.641878366470337, + "rewards/rejected": -10.733070373535156, + "step": 9869 + }, + { + "epoch": 0.9017816354499771, + "grad_norm": 2.140625, + "kl": 0.0, + "learning_rate": 2.3743994704424067e-07, + "logits/chosen": 755996928.0, + "logits/rejected": 532051296.0, + "logps/chosen": -338.4684651692708, + "logps/rejected": -401.3765563964844, + "loss": 0.0146, + "rewards/chosen": 4.184320131937663, + "rewards/margins": 11.53186861673991, + "rewards/rejected": -7.347548484802246, + "step": 9870 + }, + { + "epoch": 0.9018730013704888, + "grad_norm": 40.75, + "kl": 0.0, + "learning_rate": 2.3700233244533412e-07, + "logits/chosen": 907121792.0, + "logits/rejected": 483033600.0, + "logps/chosen": -236.7524871826172, + "logps/rejected": -420.1197509765625, + "loss": 0.1025, + "rewards/chosen": 2.090240478515625, + "rewards/margins": 12.076253890991211, + "rewards/rejected": -9.986013412475586, + "step": 9871 + }, + { + "epoch": 0.9019643672910005, + "grad_norm": 4.125, + "kl": 4.876012802124023, + "learning_rate": 2.3656511170220498e-07, + "logits/chosen": 563787584.0, + "logps/chosen": -260.5943603515625, + "loss": 0.0222, + "rewards/chosen": 4.640518665313721, + "step": 9872 + }, + { + "epoch": 0.9020557332115121, + "grad_norm": 2.890625, + "kl": 0.0, + "learning_rate": 2.361282848510066e-07, + "logits/chosen": 457933312.0, + "logits/rejected": 414709504.0, + "logps/chosen": -515.3531901041666, + "logps/rejected": -734.09833984375, + "loss": 0.0156, + "rewards/chosen": 3.2106593449910483, + "rewards/margins": 16.649853070576984, + "rewards/rejected": -13.439193725585938, + "step": 9873 + }, + { + "epoch": 0.9021470991320237, + "grad_norm": 2.25, + "kl": 0.0, + "learning_rate": 2.3569185192785948e-07, + "logits/chosen": 935599808.0, + "logits/rejected": 778015168.0, + "logps/chosen": -216.756591796875, + "logps/rejected": -451.61505126953125, + "loss": 0.0095, + "rewards/chosen": 4.3933796882629395, + "rewards/margins": 12.635961055755615, + "rewards/rejected": -8.242581367492676, + "step": 9874 + }, + { + "epoch": 0.9022384650525354, + "grad_norm": 2.9375, + "kl": 0.0, + "learning_rate": 2.3525581296885535e-07, + "logits/chosen": 899607244.8, + "logits/rejected": 850080682.6666666, + "logps/chosen": -353.03203125, + "logps/rejected": -483.12890625, + "loss": 0.0199, + "rewards/chosen": 3.560136413574219, + "rewards/margins": 11.957386906941732, + "rewards/rejected": -8.397250493367514, + "step": 9875 + }, + { + "epoch": 0.9023298309730471, + "grad_norm": 4.3125, + "kl": 0.0, + "learning_rate": 2.3482016801004926e-07, + "logits/chosen": 485054784.0, + "logits/rejected": 274472533.3333333, + "logps/chosen": -401.8499755859375, + "logps/rejected": -359.3603515625, + "loss": 0.0069, + "rewards/chosen": 4.069676399230957, + "rewards/margins": 13.242463747660318, + "rewards/rejected": -9.172787348429361, + "step": 9876 + }, + { + "epoch": 0.9024211968935587, + "grad_norm": 5.1875, + "kl": 0.0, + "learning_rate": 2.3438491708746458e-07, + "logits/chosen": 453264597.3333333, + "logits/rejected": 483759104.0, + "logps/chosen": -308.14479573567706, + "logps/rejected": -473.715576171875, + "loss": 0.0221, + "rewards/chosen": 3.769232432047526, + "rewards/margins": 13.722896258036295, + "rewards/rejected": -9.95366382598877, + "step": 9877 + }, + { + "epoch": 0.9025125628140703, + "grad_norm": 4.03125, + "kl": 0.0, + "learning_rate": 2.3395006023709255e-07, + "logits/chosen": 573759488.0, + "logits/rejected": 582609152.0, + "logps/chosen": -357.7733642578125, + "logps/rejected": -490.7009684244792, + "loss": 0.0267, + "rewards/chosen": 3.5893348693847655, + "rewards/margins": 10.931850179036458, + "rewards/rejected": -7.342515309651692, + "step": 9878 + }, + { + "epoch": 0.902603928734582, + "grad_norm": 3.25, + "kl": 0.0, + "learning_rate": 2.335155974948916e-07, + "logits/chosen": 543108973.7142857, + "logits/rejected": 326937088.0, + "logps/chosen": -274.25003487723217, + "logps/rejected": -363.3152770996094, + "loss": 0.029, + "rewards/chosen": 3.8294473375592912, + "rewards/margins": 12.33189160483224, + "rewards/rejected": -8.50244426727295, + "step": 9879 + }, + { + "epoch": 0.9026952946550937, + "grad_norm": 2.46875, + "kl": 0.0, + "learning_rate": 2.3308152889678915e-07, + "logits/chosen": 579773030.4, + "logits/rejected": 453809066.6666667, + "logps/chosen": -387.796435546875, + "logps/rejected": -371.7787679036458, + "loss": 0.0168, + "rewards/chosen": 3.8050773620605467, + "rewards/margins": 13.890123494466145, + "rewards/rejected": -10.0850461324056, + "step": 9880 + }, + { + "epoch": 0.9027866605756053, + "grad_norm": 0.376953125, + "kl": 0.0, + "learning_rate": 2.3264785447867754e-07, + "logits/chosen": 378872128.0, + "logits/rejected": 578251477.3333334, + "logps/chosen": -363.35198974609375, + "logps/rejected": -708.671875, + "loss": 0.0014, + "rewards/chosen": 5.441917419433594, + "rewards/margins": 16.457533518473305, + "rewards/rejected": -11.015616099039713, + "step": 9881 + }, + { + "epoch": 0.9028780264961169, + "grad_norm": 2.125, + "kl": 0.0, + "learning_rate": 2.3221457427641815e-07, + "logits/chosen": 318248618.6666667, + "logits/rejected": 490667296.0, + "logps/chosen": -414.5909830729167, + "logps/rejected": -313.7322082519531, + "loss": 0.0147, + "rewards/chosen": 4.309552510579427, + "rewards/margins": 13.092238744099934, + "rewards/rejected": -8.782686233520508, + "step": 9882 + }, + { + "epoch": 0.9029693924166285, + "grad_norm": 5.15625, + "kl": 0.0, + "learning_rate": 2.3178168832583836e-07, + "logits/chosen": 345993654.85714287, + "logits/rejected": 589545984.0, + "logps/chosen": -331.80018833705356, + "logps/rejected": -836.7489013671875, + "loss": 0.1535, + "rewards/chosen": 2.3767247881208147, + "rewards/margins": 16.353971072605678, + "rewards/rejected": -13.977246284484863, + "step": 9883 + }, + { + "epoch": 0.9030607583371403, + "grad_norm": 3.15625, + "kl": 0.0, + "learning_rate": 2.3134919666273513e-07, + "logits/chosen": 380713728.0, + "logits/rejected": 497381184.0, + "logps/chosen": -232.08204650878906, + "logps/rejected": -530.5267333984375, + "loss": 0.0242, + "rewards/chosen": 3.1310112476348877, + "rewards/margins": 14.167049646377563, + "rewards/rejected": -11.036038398742676, + "step": 9884 + }, + { + "epoch": 0.9031521242576519, + "grad_norm": 2.25, + "kl": 0.0, + "learning_rate": 2.3091709932287042e-07, + "logits/chosen": 390376512.0, + "logits/rejected": 416190688.0, + "logps/chosen": -251.22970581054688, + "logps/rejected": -496.76519775390625, + "loss": 0.0137, + "rewards/chosen": 4.183444976806641, + "rewards/margins": 14.155943870544434, + "rewards/rejected": -9.972498893737793, + "step": 9885 + }, + { + "epoch": 0.9032434901781635, + "grad_norm": 0.41015625, + "kl": 0.0, + "learning_rate": 2.304853963419751e-07, + "logits/chosen": 257201280.0, + "logits/rejected": 579862425.6, + "logps/chosen": -180.09110514322916, + "logps/rejected": -479.348291015625, + "loss": 0.0022, + "rewards/chosen": 5.266666412353516, + "rewards/margins": 14.311320495605468, + "rewards/rejected": -9.044654083251952, + "step": 9886 + }, + { + "epoch": 0.9033348560986751, + "grad_norm": 3.90625, + "kl": 0.0, + "learning_rate": 2.3005408775574723e-07, + "logits/chosen": 392260138.6666667, + "logits/rejected": 599443328.0, + "logps/chosen": -286.06333414713544, + "logps/rejected": -808.0634765625, + "loss": 0.0312, + "rewards/chosen": 3.7673476537068686, + "rewards/margins": 13.079226811726889, + "rewards/rejected": -9.31187915802002, + "step": 9887 + }, + { + "epoch": 0.9034262220191869, + "grad_norm": 59.5, + "kl": 0.0, + "learning_rate": 2.296231735998511e-07, + "logits/chosen": 818071722.6666666, + "logits/rejected": 300596416.0, + "logps/chosen": -226.64412434895834, + "logps/rejected": -381.5870056152344, + "loss": 0.0631, + "rewards/chosen": 3.3192214965820312, + "rewards/margins": 11.36755657196045, + "rewards/rejected": -8.048335075378418, + "step": 9888 + }, + { + "epoch": 0.9035175879396985, + "grad_norm": 3.234375, + "kl": 0.0, + "learning_rate": 2.2919265390992095e-07, + "logits/chosen": 935691776.0, + "logits/rejected": 738339669.3333334, + "logps/chosen": -355.811328125, + "logps/rejected": -848.0020345052084, + "loss": 0.0196, + "rewards/chosen": 3.786711502075195, + "rewards/margins": 13.322947311401368, + "rewards/rejected": -9.536235809326172, + "step": 9889 + }, + { + "epoch": 0.9036089538602101, + "grad_norm": 2.125, + "kl": 0.0, + "learning_rate": 2.2876252872155558e-07, + "logits/chosen": 803068313.6, + "logits/rejected": 918500693.3333334, + "logps/chosen": -328.83740234375, + "logps/rejected": -357.6670735677083, + "loss": 0.0104, + "rewards/chosen": 4.5780586242675785, + "rewards/margins": 11.143385314941407, + "rewards/rejected": -6.565326690673828, + "step": 9890 + }, + { + "epoch": 0.9037003197807217, + "grad_norm": 4.375, + "kl": 0.0, + "learning_rate": 2.283327980703215e-07, + "logits/chosen": 436492288.0, + "logits/rejected": 253348192.0, + "logps/chosen": -291.21922084263394, + "logps/rejected": -500.9578552246094, + "loss": 0.0363, + "rewards/chosen": 3.397830145699637, + "rewards/margins": 13.540576117379324, + "rewards/rejected": -10.142745971679688, + "step": 9891 + }, + { + "epoch": 0.9037916857012335, + "grad_norm": 5.21875, + "kl": 0.0, + "learning_rate": 2.2790346199175528e-07, + "logits/chosen": 495070912.0, + "logits/rejected": 689209216.0, + "logps/chosen": -369.13311767578125, + "logps/rejected": -512.1388549804688, + "loss": 0.0183, + "rewards/chosen": 3.9727346897125244, + "rewards/margins": 11.938885927200317, + "rewards/rejected": -7.966151237487793, + "step": 9892 + }, + { + "epoch": 0.9038830516217451, + "grad_norm": 11.1875, + "kl": 0.0, + "learning_rate": 2.2747452052135854e-07, + "logits/chosen": 416280352.0, + "logits/rejected": 201512800.0, + "logps/chosen": -332.9256286621094, + "logps/rejected": -323.44183349609375, + "loss": 0.0861, + "rewards/chosen": 4.677003860473633, + "rewards/margins": 11.972186088562012, + "rewards/rejected": -7.295182228088379, + "step": 9893 + }, + { + "epoch": 0.9039744175422567, + "grad_norm": 1.46875, + "kl": 0.0, + "learning_rate": 2.2704597369460067e-07, + "logits/chosen": 468496947.2, + "logits/rejected": 440036693.3333333, + "logps/chosen": -288.78291015625, + "logps/rejected": -454.044189453125, + "loss": 0.0092, + "rewards/chosen": 4.377777862548828, + "rewards/margins": 12.838930257161458, + "rewards/rejected": -8.46115239461263, + "step": 9894 + }, + { + "epoch": 0.9040657834627683, + "grad_norm": 5.09375, + "kl": 0.0, + "learning_rate": 2.2661782154691725e-07, + "logits/chosen": 326049459.2, + "logits/rejected": 223079125.33333334, + "logps/chosen": -403.909033203125, + "logps/rejected": -300.1502278645833, + "loss": 0.0298, + "rewards/chosen": 3.672438049316406, + "rewards/margins": 14.043168131510416, + "rewards/rejected": -10.37073008219401, + "step": 9895 + }, + { + "epoch": 0.9041571493832801, + "grad_norm": 3.203125, + "kl": 0.0, + "learning_rate": 2.2619006411371437e-07, + "logits/chosen": 797851520.0, + "logits/rejected": 477143808.0, + "logps/chosen": -426.01580810546875, + "logps/rejected": -613.5531005859375, + "loss": 0.0177, + "rewards/chosen": 3.5688316822052, + "rewards/margins": 12.779536485671997, + "rewards/rejected": -9.210704803466797, + "step": 9896 + }, + { + "epoch": 0.9042485153037917, + "grad_norm": 2.640625, + "kl": 0.0, + "learning_rate": 2.2576270143036316e-07, + "logits/chosen": 578107172.5714285, + "logits/rejected": 722001280.0, + "logps/chosen": -245.9852294921875, + "logps/rejected": -823.9852294921875, + "loss": 0.0457, + "rewards/chosen": 3.286932809012277, + "rewards/margins": 13.229938370840891, + "rewards/rejected": -9.943005561828613, + "step": 9897 + }, + { + "epoch": 0.9043398812243033, + "grad_norm": 3.078125, + "kl": 0.0, + "learning_rate": 2.2533573353220206e-07, + "logits/chosen": 397559872.0, + "logits/rejected": 529059552.0, + "logps/chosen": -327.2528076171875, + "logps/rejected": -326.4583740234375, + "loss": 0.0224, + "rewards/chosen": 3.3302807807922363, + "rewards/margins": 11.670103549957275, + "rewards/rejected": -8.339822769165039, + "step": 9898 + }, + { + "epoch": 0.9044312471448149, + "grad_norm": 2.453125, + "kl": 0.0, + "learning_rate": 2.249091604545378e-07, + "logits/chosen": 542074197.3333334, + "logits/rejected": 603067008.0, + "logps/chosen": -243.95210774739584, + "logps/rejected": -540.8370361328125, + "loss": 0.0215, + "rewards/chosen": 3.924316088358561, + "rewards/margins": 13.025158564249674, + "rewards/rejected": -9.100842475891113, + "step": 9899 + }, + { + "epoch": 0.9045226130653267, + "grad_norm": 2.703125, + "kl": 0.0, + "learning_rate": 2.2448298223264276e-07, + "logits/chosen": 552431829.3333334, + "logits/rejected": 672968396.8, + "logps/chosen": -394.76171875, + "logps/rejected": -477.422265625, + "loss": 0.0124, + "rewards/chosen": 4.237545013427734, + "rewards/margins": 12.672394561767579, + "rewards/rejected": -8.434849548339844, + "step": 9900 + }, + { + "epoch": 0.9046139789858383, + "grad_norm": 1.859375, + "kl": 0.0, + "learning_rate": 2.240571989017598e-07, + "logits/chosen": 132161008.0, + "logits/rejected": 427450666.6666667, + "logps/chosen": -175.50140380859375, + "logps/rejected": -357.3361409505208, + "loss": 0.0089, + "rewards/chosen": 4.063475131988525, + "rewards/margins": 13.297546227773031, + "rewards/rejected": -9.234071095784506, + "step": 9901 + }, + { + "epoch": 0.9047053449063499, + "grad_norm": 2.34375, + "kl": 0.0, + "learning_rate": 2.2363181049709637e-07, + "logits/chosen": 561759744.0, + "logits/rejected": 252350566.4, + "logps/chosen": -386.7694905598958, + "logps/rejected": -418.446875, + "loss": 0.0111, + "rewards/chosen": 3.6515636444091797, + "rewards/margins": 14.204815292358399, + "rewards/rejected": -10.553251647949219, + "step": 9902 + }, + { + "epoch": 0.9047967108268615, + "grad_norm": 2.75, + "kl": 0.0, + "learning_rate": 2.232068170538282e-07, + "logits/chosen": 677078016.0, + "logits/rejected": 546647936.0, + "logps/chosen": -540.0450439453125, + "logps/rejected": -514.4996948242188, + "loss": 0.0136, + "rewards/chosen": 4.6039252281188965, + "rewards/margins": 13.622849941253662, + "rewards/rejected": -9.018924713134766, + "step": 9903 + }, + { + "epoch": 0.9048880767473733, + "grad_norm": 1.984375, + "kl": 0.0, + "learning_rate": 2.2278221860709714e-07, + "logits/chosen": 564214886.4, + "logits/rejected": 427857834.6666667, + "logps/chosen": -255.078173828125, + "logps/rejected": -766.38818359375, + "loss": 0.0139, + "rewards/chosen": 4.635005187988281, + "rewards/margins": 17.61461944580078, + "rewards/rejected": -12.9796142578125, + "step": 9904 + }, + { + "epoch": 0.9049794426678849, + "grad_norm": 3.234375, + "kl": 0.0, + "learning_rate": 2.223580151920157e-07, + "logits/chosen": 361322700.8, + "logits/rejected": 539478741.3333334, + "logps/chosen": -369.06826171875, + "logps/rejected": -359.5170491536458, + "loss": 0.1112, + "rewards/chosen": 4.807421493530273, + "rewards/margins": 10.126045354207356, + "rewards/rejected": -5.318623860677083, + "step": 9905 + }, + { + "epoch": 0.9050708085883965, + "grad_norm": 2.140625, + "kl": 0.0, + "learning_rate": 2.2193420684366084e-07, + "logits/chosen": 356861226.6666667, + "logits/rejected": 533767968.0, + "logps/chosen": -276.4307047526042, + "logps/rejected": -621.091064453125, + "loss": 0.013, + "rewards/chosen": 4.309476534525554, + "rewards/margins": 13.467981974283855, + "rewards/rejected": -9.1585054397583, + "step": 9906 + }, + { + "epoch": 0.9051621745089081, + "grad_norm": 1.1875, + "kl": 0.0, + "learning_rate": 2.2151079359707728e-07, + "logits/chosen": 457016320.0, + "logits/rejected": 462363084.8, + "logps/chosen": -186.5040283203125, + "logps/rejected": -539.15400390625, + "loss": 0.0178, + "rewards/chosen": 4.04459285736084, + "rewards/margins": 13.927745628356934, + "rewards/rejected": -9.883152770996094, + "step": 9907 + }, + { + "epoch": 0.9052535404294199, + "grad_norm": 3.4375, + "kl": 0.0, + "learning_rate": 2.21087775487277e-07, + "logits/chosen": 595733299.2, + "logits/rejected": 229306048.0, + "logps/chosen": -328.578369140625, + "logps/rejected": -227.87886555989584, + "loss": 0.0218, + "rewards/chosen": 3.48184814453125, + "rewards/margins": 10.707937622070313, + "rewards/rejected": -7.2260894775390625, + "step": 9908 + }, + { + "epoch": 0.9053449063499315, + "grad_norm": 1.7734375, + "kl": 0.0, + "learning_rate": 2.2066515254923982e-07, + "logits/chosen": 717582131.2, + "logits/rejected": 1107817813.3333333, + "logps/chosen": -307.922705078125, + "logps/rejected": -437.7508951822917, + "loss": 0.011, + "rewards/chosen": 4.315869903564453, + "rewards/margins": 13.86062266031901, + "rewards/rejected": -9.544752756754557, + "step": 9909 + }, + { + "epoch": 0.9054362722704431, + "grad_norm": 3.921875, + "kl": 0.0, + "learning_rate": 2.2024292481791332e-07, + "logits/chosen": 676642432.0, + "logits/rejected": 391722112.0, + "logps/chosen": -420.20849609375, + "logps/rejected": -360.7227783203125, + "loss": 0.0132, + "rewards/chosen": 4.430742263793945, + "rewards/margins": 11.122763633728027, + "rewards/rejected": -6.692021369934082, + "step": 9910 + }, + { + "epoch": 0.9055276381909547, + "grad_norm": 1.34375, + "kl": 0.0, + "learning_rate": 2.198210923282118e-07, + "logits/chosen": 877430784.0, + "logits/rejected": 834103808.0, + "logps/chosen": -206.24332682291666, + "logps/rejected": -380.913037109375, + "loss": 0.0112, + "rewards/chosen": 3.5320914586385093, + "rewards/margins": 12.91515267690023, + "rewards/rejected": -9.38306121826172, + "step": 9911 + }, + { + "epoch": 0.9056190041114665, + "grad_norm": 96.5, + "kl": 0.0, + "learning_rate": 2.1939965511501682e-07, + "logits/chosen": 485015936.0, + "logits/rejected": 307494553.6, + "logps/chosen": -323.05979410807294, + "logps/rejected": -438.323583984375, + "loss": 0.0524, + "rewards/chosen": 4.049760182698567, + "rewards/margins": 13.088334401448567, + "rewards/rejected": -9.03857421875, + "step": 9912 + }, + { + "epoch": 0.9057103700319781, + "grad_norm": 2.203125, + "kl": 0.0, + "learning_rate": 2.1897861321317603e-07, + "logits/chosen": 489847705.6, + "logits/rejected": 424326869.3333333, + "logps/chosen": -328.334912109375, + "logps/rejected": -587.8837890625, + "loss": 0.015, + "rewards/chosen": 4.517076110839843, + "rewards/margins": 15.046540832519531, + "rewards/rejected": -10.529464721679688, + "step": 9913 + }, + { + "epoch": 0.9058017359524897, + "grad_norm": 4.1875, + "kl": 0.0, + "learning_rate": 2.1855796665750718e-07, + "logits/chosen": 665425920.0, + "logits/rejected": 756110656.0, + "logps/chosen": -254.61519368489584, + "logps/rejected": -494.98663330078125, + "loss": 0.1428, + "rewards/chosen": 2.8841911951700845, + "rewards/margins": 12.660831133524576, + "rewards/rejected": -9.776639938354492, + "step": 9914 + }, + { + "epoch": 0.9058931018730013, + "grad_norm": 18.625, + "kl": 0.0, + "learning_rate": 2.1813771548279405e-07, + "logits/chosen": 489625258.6666667, + "logits/rejected": 363058073.6, + "logps/chosen": -288.7665201822917, + "logps/rejected": -489.938916015625, + "loss": 0.0898, + "rewards/chosen": 4.455630938212077, + "rewards/margins": 12.463494555155435, + "rewards/rejected": -8.007863616943359, + "step": 9915 + }, + { + "epoch": 0.905984467793513, + "grad_norm": 0.68359375, + "kl": 0.0, + "learning_rate": 2.1771785972378667e-07, + "logits/chosen": 766222506.6666666, + "logits/rejected": 477008588.8, + "logps/chosen": -536.7574055989584, + "logps/rejected": -542.0962890625, + "loss": 0.0029, + "rewards/chosen": 5.234206199645996, + "rewards/margins": 14.990032768249511, + "rewards/rejected": -9.755826568603515, + "step": 9916 + }, + { + "epoch": 0.9060758337140247, + "grad_norm": 51.75, + "kl": 0.0, + "learning_rate": 2.1729839941520336e-07, + "logits/chosen": 720561305.6, + "logits/rejected": 619017728.0, + "logps/chosen": -296.297802734375, + "logps/rejected": -549.6055908203125, + "loss": 0.0439, + "rewards/chosen": 3.535969924926758, + "rewards/margins": 11.666391881306968, + "rewards/rejected": -8.130421956380209, + "step": 9917 + }, + { + "epoch": 0.9061671996345363, + "grad_norm": 2.984375, + "kl": 0.0, + "learning_rate": 2.168793345917297e-07, + "logits/chosen": 554627532.8, + "logits/rejected": 323490432.0, + "logps/chosen": -348.3070556640625, + "logps/rejected": -364.9796549479167, + "loss": 0.0194, + "rewards/chosen": 4.15410041809082, + "rewards/margins": 12.479791005452475, + "rewards/rejected": -8.325690587361654, + "step": 9918 + }, + { + "epoch": 0.9062585655550479, + "grad_norm": 3.65625, + "kl": 0.0, + "learning_rate": 2.1646066528801857e-07, + "logits/chosen": 806275520.0, + "logits/rejected": 479562240.0, + "logps/chosen": -433.54205322265625, + "logps/rejected": -371.8807678222656, + "loss": 0.1241, + "rewards/chosen": 4.227503776550293, + "rewards/margins": 10.9710054397583, + "rewards/rejected": -6.743501663208008, + "step": 9919 + }, + { + "epoch": 0.9063499314755596, + "grad_norm": 52.25, + "kl": 0.0, + "learning_rate": 2.1604239153869e-07, + "logits/chosen": 824790186.6666666, + "logits/rejected": 357901440.0, + "logps/chosen": -319.4926350911458, + "logps/rejected": -261.4888916015625, + "loss": 0.1064, + "rewards/chosen": 3.1641174952189126, + "rewards/margins": 10.731131235758463, + "rewards/rejected": -7.567013740539551, + "step": 9920 + }, + { + "epoch": 0.9064412973960713, + "grad_norm": 62.25, + "kl": 0.0, + "learning_rate": 2.156245133783308e-07, + "logits/chosen": 1499819136.0, + "logits/rejected": 659639210.6666666, + "logps/chosen": -374.5055236816406, + "logps/rejected": -547.53759765625, + "loss": 0.031, + "rewards/chosen": 3.1793212890625, + "rewards/margins": 11.252510070800781, + "rewards/rejected": -8.073188781738281, + "step": 9921 + }, + { + "epoch": 0.9065326633165829, + "grad_norm": 2.515625, + "kl": 0.0, + "learning_rate": 2.1520703084149664e-07, + "logits/chosen": 422408960.0, + "logits/rejected": 427018752.0, + "logps/chosen": -287.14447021484375, + "logps/rejected": -441.1708679199219, + "loss": 0.0216, + "rewards/chosen": 3.688995838165283, + "rewards/margins": 12.851674556732178, + "rewards/rejected": -9.162678718566895, + "step": 9922 + }, + { + "epoch": 0.9066240292370945, + "grad_norm": 1.96875, + "kl": 0.0, + "learning_rate": 2.147899439627088e-07, + "logits/chosen": 553116876.8, + "logits/rejected": 454033749.3333333, + "logps/chosen": -262.9591552734375, + "logps/rejected": -434.720458984375, + "loss": 0.015, + "rewards/chosen": 3.85164794921875, + "rewards/margins": 12.157567596435547, + "rewards/rejected": -8.305919647216797, + "step": 9923 + }, + { + "epoch": 0.9067153951576062, + "grad_norm": 2.0625, + "kl": 0.0, + "learning_rate": 2.1437325277645639e-07, + "logits/chosen": 715220582.4, + "logits/rejected": 727178154.6666666, + "logps/chosen": -288.69462890625, + "logps/rejected": -635.9968668619791, + "loss": 0.0168, + "rewards/chosen": 3.865225982666016, + "rewards/margins": 13.842823537190757, + "rewards/rejected": -9.97759755452474, + "step": 9924 + }, + { + "epoch": 0.9068067610781179, + "grad_norm": 2.328125, + "kl": 0.039264678955078125, + "learning_rate": 2.1395695731719623e-07, + "logits/chosen": 660029098.6666666, + "logits/rejected": 821949824.0, + "logps/chosen": -388.7155354817708, + "logps/rejected": -324.18359375, + "loss": 0.0132, + "rewards/chosen": 4.389366785685222, + "rewards/margins": 11.886237303415935, + "rewards/rejected": -7.496870517730713, + "step": 9925 + }, + { + "epoch": 0.9068981269986295, + "grad_norm": 3.640625, + "kl": 0.0, + "learning_rate": 2.1354105761935196e-07, + "logits/chosen": 860831129.6, + "logits/rejected": 622902229.3333334, + "logps/chosen": -429.2201171875, + "logps/rejected": -535.092041015625, + "loss": 0.0223, + "rewards/chosen": 3.7052848815917967, + "rewards/margins": 12.539176432291665, + "rewards/rejected": -8.83389155069987, + "step": 9926 + }, + { + "epoch": 0.9069894929191412, + "grad_norm": 5.96875, + "kl": 7.2749481201171875, + "learning_rate": 2.1312555371731493e-07, + "logits/chosen": 777680457.1428572, + "logits/rejected": 494266912.0, + "logps/chosen": -367.58726283482144, + "logps/rejected": -450.93658447265625, + "loss": 0.0519, + "rewards/chosen": 3.849491664341518, + "rewards/margins": 13.021513530186244, + "rewards/rejected": -9.172021865844727, + "step": 9927 + }, + { + "epoch": 0.9070808588396528, + "grad_norm": 2.71875, + "kl": 0.0, + "learning_rate": 2.1271044564544374e-07, + "logits/chosen": 1052581632.0, + "logits/rejected": 610471058.2857143, + "logps/chosen": -453.52752685546875, + "logps/rejected": -485.9471958705357, + "loss": 0.009, + "rewards/chosen": 2.8038270473480225, + "rewards/margins": 10.871790783745903, + "rewards/rejected": -8.06796373639788, + "step": 9928 + }, + { + "epoch": 0.9071722247601645, + "grad_norm": 8.9375, + "kl": 0.0, + "learning_rate": 2.1229573343806264e-07, + "logits/chosen": 694628761.6, + "logits/rejected": 569390165.3333334, + "logps/chosen": -347.656396484375, + "logps/rejected": -699.0087890625, + "loss": 0.1263, + "rewards/chosen": 2.5492069244384767, + "rewards/margins": 14.456347020467124, + "rewards/rejected": -11.907140096028646, + "step": 9929 + }, + { + "epoch": 0.9072635906806761, + "grad_norm": 3.859375, + "kl": 0.0, + "learning_rate": 2.118814171294653e-07, + "logits/chosen": 486989760.0, + "logits/rejected": 835966976.0, + "logps/chosen": -249.5620574951172, + "logps/rejected": -797.16064453125, + "loss": 0.0308, + "rewards/chosen": 3.044093370437622, + "rewards/margins": 16.500685930252075, + "rewards/rejected": -13.456592559814453, + "step": 9930 + }, + { + "epoch": 0.9073549566011878, + "grad_norm": 2.453125, + "kl": 0.0, + "learning_rate": 2.114674967539121e-07, + "logits/chosen": 465098016.0, + "logits/rejected": 616041088.0, + "logps/chosen": -346.6302795410156, + "logps/rejected": -562.580078125, + "loss": 0.0134, + "rewards/chosen": 3.7531371116638184, + "rewards/margins": 13.542311191558838, + "rewards/rejected": -9.78917407989502, + "step": 9931 + }, + { + "epoch": 0.9074463225216994, + "grad_norm": 3.15625, + "kl": 0.0, + "learning_rate": 2.1105397234563064e-07, + "logits/chosen": 396325888.0, + "logits/rejected": 677732224.0, + "logps/chosen": -279.2468959263393, + "logps/rejected": -459.74884033203125, + "loss": 0.0232, + "rewards/chosen": 3.9206747327532088, + "rewards/margins": 12.430941036769322, + "rewards/rejected": -8.510266304016113, + "step": 9932 + }, + { + "epoch": 0.907537688442211, + "grad_norm": 2.15625, + "kl": 0.0, + "learning_rate": 2.1064084393881468e-07, + "logits/chosen": 448302368.0, + "logits/rejected": 434245696.0, + "logps/chosen": -310.98883056640625, + "logps/rejected": -452.2247619628906, + "loss": 0.0093, + "rewards/chosen": 4.9496355056762695, + "rewards/margins": 14.73287296295166, + "rewards/rejected": -9.78323745727539, + "step": 9933 + }, + { + "epoch": 0.9076290543627227, + "grad_norm": 1.453125, + "kl": 0.0, + "learning_rate": 2.102281115676258e-07, + "logits/chosen": 324064768.0, + "logits/rejected": 459680042.6666667, + "logps/chosen": -237.4395751953125, + "logps/rejected": -619.5344645182291, + "loss": 0.0103, + "rewards/chosen": 3.307831287384033, + "rewards/margins": 13.33231782913208, + "rewards/rejected": -10.024486541748047, + "step": 9934 + }, + { + "epoch": 0.9077204202832344, + "grad_norm": 3.53125, + "kl": 0.0, + "learning_rate": 2.0981577526619501e-07, + "logits/chosen": 367508019.2, + "logits/rejected": 777367040.0, + "logps/chosen": -366.4435546875, + "logps/rejected": -558.734619140625, + "loss": 0.0243, + "rewards/chosen": 3.5148521423339845, + "rewards/margins": 13.236405944824218, + "rewards/rejected": -9.721553802490234, + "step": 9935 + }, + { + "epoch": 0.907811786203746, + "grad_norm": 0.6484375, + "kl": 0.0, + "learning_rate": 2.094038350686173e-07, + "logits/chosen": 883792832.0, + "logits/rejected": 758008393.1428572, + "logps/chosen": -632.014892578125, + "logps/rejected": -651.9398716517857, + "loss": 0.0018, + "rewards/chosen": 4.231970310211182, + "rewards/margins": 14.649888924189977, + "rewards/rejected": -10.417918613978795, + "step": 9936 + }, + { + "epoch": 0.9079031521242577, + "grad_norm": 2.09375, + "kl": 0.0, + "learning_rate": 2.0899229100895647e-07, + "logits/chosen": 487367475.2, + "logits/rejected": 1013210282.6666666, + "logps/chosen": -263.10205078125, + "logps/rejected": -758.763671875, + "loss": 0.0162, + "rewards/chosen": 3.866864776611328, + "rewards/margins": 11.23841183980306, + "rewards/rejected": -7.3715470631917315, + "step": 9937 + }, + { + "epoch": 0.9079945180447693, + "grad_norm": 2.09375, + "kl": 0.0, + "learning_rate": 2.085811431212431e-07, + "logits/chosen": 593647744.0, + "logits/rejected": 648966229.3333334, + "logps/chosen": -224.97183227539062, + "logps/rejected": -633.7267252604166, + "loss": 0.0125, + "rewards/chosen": 3.0621538162231445, + "rewards/margins": 12.776291211446127, + "rewards/rejected": -9.714137395222982, + "step": 9938 + }, + { + "epoch": 0.908085883965281, + "grad_norm": 9.625, + "kl": 0.0, + "learning_rate": 2.0817039143947614e-07, + "logits/chosen": 499526041.6, + "logits/rejected": 175214229.33333334, + "logps/chosen": -301.344189453125, + "logps/rejected": -369.0445149739583, + "loss": 0.1249, + "rewards/chosen": 2.8502933502197267, + "rewards/margins": 10.568116378784179, + "rewards/rejected": -7.717823028564453, + "step": 9939 + }, + { + "epoch": 0.9081772498857926, + "grad_norm": 5.0, + "kl": 0.0, + "learning_rate": 2.0776003599762063e-07, + "logits/chosen": 399269290.6666667, + "logits/rejected": 603381196.8, + "logps/chosen": -290.93267822265625, + "logps/rejected": -713.830615234375, + "loss": 0.0287, + "rewards/chosen": 2.625925064086914, + "rewards/margins": 12.63233985900879, + "rewards/rejected": -10.006414794921875, + "step": 9940 + }, + { + "epoch": 0.9082686158063042, + "grad_norm": 1.078125, + "kl": 0.0, + "learning_rate": 2.0735007682960884e-07, + "logits/chosen": 600763648.0, + "logits/rejected": 667862720.0, + "logps/chosen": -182.89862060546875, + "logps/rejected": -418.2354736328125, + "loss": 0.0095, + "rewards/chosen": 4.194775104522705, + "rewards/margins": 13.767189502716064, + "rewards/rejected": -9.57241439819336, + "step": 9941 + }, + { + "epoch": 0.9083599817268159, + "grad_norm": 6.3125, + "kl": 0.0, + "learning_rate": 2.0694051396934089e-07, + "logits/chosen": 466120576.0, + "logits/rejected": 549698645.3333334, + "logps/chosen": -230.74183654785156, + "logps/rejected": -413.3771565755208, + "loss": 0.0244, + "rewards/chosen": 3.3594651222229004, + "rewards/margins": 12.628255685170492, + "rewards/rejected": -9.268790562947592, + "step": 9942 + }, + { + "epoch": 0.9084513476473276, + "grad_norm": 1.5390625, + "kl": 0.0, + "learning_rate": 2.0653134745068416e-07, + "logits/chosen": 742392490.6666666, + "logits/rejected": 271632537.6, + "logps/chosen": -503.0653076171875, + "logps/rejected": -336.71005859375, + "loss": 0.0076, + "rewards/chosen": 4.011588414510091, + "rewards/margins": 13.656885655721027, + "rewards/rejected": -9.645297241210937, + "step": 9943 + }, + { + "epoch": 0.9085427135678392, + "grad_norm": 1.7734375, + "kl": 0.0, + "learning_rate": 2.0612257730747209e-07, + "logits/chosen": 372718688.0, + "logits/rejected": 283637056.0, + "logps/chosen": -286.7458801269531, + "logps/rejected": -522.4611206054688, + "loss": 0.0098, + "rewards/chosen": 4.738656044006348, + "rewards/margins": 15.944012641906738, + "rewards/rejected": -11.20535659790039, + "step": 9944 + }, + { + "epoch": 0.9086340794883508, + "grad_norm": 1.1796875, + "kl": 0.0, + "learning_rate": 2.0571420357350659e-07, + "logits/chosen": 425774873.6, + "logits/rejected": 312510720.0, + "logps/chosen": -319.328564453125, + "logps/rejected": -368.1381429036458, + "loss": 0.008, + "rewards/chosen": 4.6327880859375, + "rewards/margins": 13.899128341674805, + "rewards/rejected": -9.266340255737305, + "step": 9945 + }, + { + "epoch": 0.9087254454088625, + "grad_norm": 2.671875, + "kl": 0.0, + "learning_rate": 2.0530622628255613e-07, + "logits/chosen": 813765120.0, + "logits/rejected": 444075008.0, + "logps/chosen": -419.0619140625, + "logps/rejected": -322.053955078125, + "loss": 0.016, + "rewards/chosen": 3.7524463653564455, + "rewards/margins": 11.88785285949707, + "rewards/rejected": -8.135406494140625, + "step": 9946 + }, + { + "epoch": 0.9088168113293742, + "grad_norm": 2.28125, + "kl": 0.0, + "learning_rate": 2.0489864546835715e-07, + "logits/chosen": 754321856.0, + "logits/rejected": 795069525.3333334, + "logps/chosen": -160.3810577392578, + "logps/rejected": -385.7105712890625, + "loss": 0.014, + "rewards/chosen": 2.8622450828552246, + "rewards/margins": 11.780994574228922, + "rewards/rejected": -8.918749491373697, + "step": 9947 + }, + { + "epoch": 0.9089081772498858, + "grad_norm": 25.125, + "kl": 0.0, + "learning_rate": 2.044914611646126e-07, + "logits/chosen": 595250278.4, + "logits/rejected": 533702912.0, + "logps/chosen": -136.84306640625, + "logps/rejected": -477.5625, + "loss": 0.134, + "rewards/chosen": 2.7239334106445314, + "rewards/margins": 11.831214141845702, + "rewards/rejected": -9.107280731201172, + "step": 9948 + }, + { + "epoch": 0.9089995431703974, + "grad_norm": 3.0625, + "kl": 0.0, + "learning_rate": 2.040846734049934e-07, + "logits/chosen": 579368896.0, + "logits/rejected": 590319189.3333334, + "logps/chosen": -308.8480224609375, + "logps/rejected": -517.1737467447916, + "loss": 0.11, + "rewards/chosen": 4.337701320648193, + "rewards/margins": 13.339867432912191, + "rewards/rejected": -9.002166112263998, + "step": 9949 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 9.4375, + "kl": 0.0, + "learning_rate": 2.036782822231359e-07, + "logits/chosen": 612425088.0, + "logits/rejected": 721521024.0, + "logps/chosen": -378.52178955078125, + "logps/rejected": -417.73284912109375, + "loss": 0.0349, + "rewards/chosen": 3.0345146656036377, + "rewards/margins": 8.978402853012085, + "rewards/rejected": -5.943888187408447, + "step": 9950 + }, + { + "epoch": 0.9091822750114208, + "grad_norm": 1.0234375, + "kl": 0.0, + "learning_rate": 2.032722876526455e-07, + "logits/chosen": 314388096.0, + "logits/rejected": 572380379.4285715, + "logps/chosen": -252.4200897216797, + "logps/rejected": -605.1455078125, + "loss": 0.0046, + "rewards/chosen": 3.3909225463867188, + "rewards/margins": 13.238397870744977, + "rewards/rejected": -9.847475324358259, + "step": 9951 + }, + { + "epoch": 0.9092736409319324, + "grad_norm": 3.546875, + "kl": 0.0, + "learning_rate": 2.0286668972709422e-07, + "logits/chosen": 594684416.0, + "logits/rejected": 488488480.0, + "logps/chosen": -372.23602294921875, + "logps/rejected": -518.7059326171875, + "loss": 0.0181, + "rewards/chosen": 3.724951982498169, + "rewards/margins": 13.951346635818481, + "rewards/rejected": -10.226394653320312, + "step": 9952 + }, + { + "epoch": 0.909365006852444, + "grad_norm": 1.4921875, + "kl": 0.0, + "learning_rate": 2.024614884800219e-07, + "logits/chosen": 290481493.3333333, + "logits/rejected": 344547942.4, + "logps/chosen": -248.87076822916666, + "logps/rejected": -605.235205078125, + "loss": 0.0062, + "rewards/chosen": 4.895101547241211, + "rewards/margins": 15.909300613403321, + "rewards/rejected": -11.01419906616211, + "step": 9953 + }, + { + "epoch": 0.9094563727729557, + "grad_norm": 2.890625, + "kl": 0.0, + "learning_rate": 2.0205668394493395e-07, + "logits/chosen": 644502528.0, + "logits/rejected": 464262058.6666667, + "logps/chosen": -415.3443359375, + "logps/rejected": -286.61301676432294, + "loss": 0.0177, + "rewards/chosen": 3.7091156005859376, + "rewards/margins": 11.314810689290365, + "rewards/rejected": -7.605695088704427, + "step": 9954 + }, + { + "epoch": 0.9095477386934674, + "grad_norm": 2.15625, + "kl": 0.0, + "learning_rate": 2.0165227615530358e-07, + "logits/chosen": 340792678.4, + "logits/rejected": 299310954.6666667, + "logps/chosen": -311.961328125, + "logps/rejected": -397.3301595052083, + "loss": 0.0173, + "rewards/chosen": 3.819559097290039, + "rewards/margins": 12.552748998006185, + "rewards/rejected": -8.733189900716146, + "step": 9955 + }, + { + "epoch": 0.909639104613979, + "grad_norm": 3.578125, + "kl": 0.0, + "learning_rate": 2.0124826514457353e-07, + "logits/chosen": 460305792.0, + "logits/rejected": 357028192.0, + "logps/chosen": -319.3755798339844, + "logps/rejected": -284.9019470214844, + "loss": 0.0145, + "rewards/chosen": 4.435092449188232, + "rewards/margins": 12.628514766693115, + "rewards/rejected": -8.193422317504883, + "step": 9956 + }, + { + "epoch": 0.9097304705344906, + "grad_norm": 6.1875, + "kl": 0.5961570739746094, + "learning_rate": 2.008446509461498e-07, + "logits/chosen": 510640859.4285714, + "logits/rejected": 639015680.0, + "logps/chosen": -338.7760532924107, + "logps/rejected": -409.8691711425781, + "loss": 0.0444, + "rewards/chosen": 3.6776030404227122, + "rewards/margins": 12.800964627947126, + "rewards/rejected": -9.123361587524414, + "step": 9957 + }, + { + "epoch": 0.9098218364550023, + "grad_norm": 3.296875, + "kl": 0.0, + "learning_rate": 2.0044143359340852e-07, + "logits/chosen": 861294592.0, + "logits/rejected": 561730496.0, + "logps/chosen": -275.84059651692706, + "logps/rejected": -595.0380249023438, + "loss": 0.0326, + "rewards/chosen": 3.542258898417155, + "rewards/margins": 15.433686892191568, + "rewards/rejected": -11.891427993774414, + "step": 9958 + }, + { + "epoch": 0.909913202375514, + "grad_norm": 1.46875, + "kl": 0.0, + "learning_rate": 2.000386131196913e-07, + "logits/chosen": 400520064.0, + "logits/rejected": 438383712.0, + "logps/chosen": -248.57791137695312, + "logps/rejected": -571.3235473632812, + "loss": 0.0103, + "rewards/chosen": 4.033719062805176, + "rewards/margins": 13.248941421508789, + "rewards/rejected": -9.215222358703613, + "step": 9959 + }, + { + "epoch": 0.9100045682960256, + "grad_norm": 2.28125, + "kl": 0.0, + "learning_rate": 1.9963618955830877e-07, + "logits/chosen": 711038400.0, + "logits/rejected": 699117248.0, + "logps/chosen": -401.7130432128906, + "logps/rejected": -495.473876953125, + "loss": 0.0154, + "rewards/chosen": 3.6514501571655273, + "rewards/margins": 14.166975021362305, + "rewards/rejected": -10.515524864196777, + "step": 9960 + }, + { + "epoch": 0.9100959342165372, + "grad_norm": 4.34375, + "kl": 0.0, + "learning_rate": 1.9923416294253705e-07, + "logits/chosen": 359512685.71428573, + "logits/rejected": 158885264.0, + "logps/chosen": -302.74696568080356, + "logps/rejected": -206.73110961914062, + "loss": 0.0394, + "rewards/chosen": 3.2937938145228793, + "rewards/margins": 9.284361021859304, + "rewards/rejected": -5.990567207336426, + "step": 9961 + }, + { + "epoch": 0.9101873001370488, + "grad_norm": 2.15625, + "kl": 0.0, + "learning_rate": 1.988325333056196e-07, + "logits/chosen": 466298112.0, + "logits/rejected": 870357094.4, + "logps/chosen": -289.2027587890625, + "logps/rejected": -669.11337890625, + "loss": 0.0118, + "rewards/chosen": 3.6853694915771484, + "rewards/margins": 12.931284713745118, + "rewards/rejected": -9.24591522216797, + "step": 9962 + }, + { + "epoch": 0.9102786660575606, + "grad_norm": 4.8125, + "kl": 0.0, + "learning_rate": 1.984313006807681e-07, + "logits/chosen": 541838438.4, + "logits/rejected": 486529280.0, + "logps/chosen": -340.3026123046875, + "logps/rejected": -739.6712239583334, + "loss": 0.0254, + "rewards/chosen": 3.5109878540039063, + "rewards/margins": 17.628647104899088, + "rewards/rejected": -14.117659250895182, + "step": 9963 + }, + { + "epoch": 0.9103700319780722, + "grad_norm": 4.3125, + "kl": 0.0, + "learning_rate": 1.9803046510115998e-07, + "logits/chosen": 319063424.0, + "logits/rejected": 475084416.0, + "logps/chosen": -113.24589538574219, + "logps/rejected": -630.5450846354166, + "loss": 0.0257, + "rewards/chosen": 3.0736231803894043, + "rewards/margins": 12.83601967493693, + "rewards/rejected": -9.762396494547525, + "step": 9964 + }, + { + "epoch": 0.9104613978985838, + "grad_norm": 2.203125, + "kl": 0.0, + "learning_rate": 1.9763002659994146e-07, + "logits/chosen": 600011392.0, + "logits/rejected": 398245312.0, + "logps/chosen": -409.02435302734375, + "logps/rejected": -490.31903076171875, + "loss": 0.0131, + "rewards/chosen": 4.015268802642822, + "rewards/margins": 14.943783283233643, + "rewards/rejected": -10.92851448059082, + "step": 9965 + }, + { + "epoch": 0.9105527638190954, + "grad_norm": 4.59375, + "kl": 0.0, + "learning_rate": 1.9722998521022497e-07, + "logits/chosen": 265413734.4, + "logits/rejected": 410813952.0, + "logps/chosen": -313.8263671875, + "logps/rejected": -422.625, + "loss": 0.0136, + "rewards/chosen": 4.452635192871094, + "rewards/margins": 13.40767199198405, + "rewards/rejected": -8.955036799112955, + "step": 9966 + }, + { + "epoch": 0.9106441297396072, + "grad_norm": 1.796875, + "kl": 0.0, + "learning_rate": 1.9683034096509014e-07, + "logits/chosen": 514529408.0, + "logits/rejected": 382178278.4, + "logps/chosen": -333.66943359375, + "logps/rejected": -473.206103515625, + "loss": 0.01, + "rewards/chosen": 3.820547103881836, + "rewards/margins": 12.509373092651368, + "rewards/rejected": -8.688825988769532, + "step": 9967 + }, + { + "epoch": 0.9107354956601188, + "grad_norm": 5.34375, + "kl": 0.0, + "learning_rate": 1.964310938975833e-07, + "logits/chosen": 609875029.3333334, + "logits/rejected": 972925440.0, + "logps/chosen": -325.44590250651044, + "logps/rejected": -592.1079711914062, + "loss": 0.0272, + "rewards/chosen": 4.1073252360026045, + "rewards/margins": 14.980984369913738, + "rewards/rejected": -10.873659133911133, + "step": 9968 + }, + { + "epoch": 0.9108268615806304, + "grad_norm": 2.40625, + "kl": 0.0, + "learning_rate": 1.9603224404071974e-07, + "logits/chosen": 677178112.0, + "logits/rejected": 963304618.6666666, + "logps/chosen": -496.11025390625, + "logps/rejected": -296.9155680338542, + "loss": 0.0156, + "rewards/chosen": 3.9043323516845705, + "rewards/margins": 11.322912724812825, + "rewards/rejected": -7.418580373128255, + "step": 9969 + }, + { + "epoch": 0.910918227501142, + "grad_norm": 16.375, + "kl": 0.0, + "learning_rate": 1.9563379142747974e-07, + "logits/chosen": 935872768.0, + "logits/rejected": 519960352.0, + "logps/chosen": -374.9798583984375, + "logps/rejected": -438.2056884765625, + "loss": 0.08, + "rewards/chosen": 3.4002785682678223, + "rewards/margins": 9.950208187103271, + "rewards/rejected": -6.549929618835449, + "step": 9970 + }, + { + "epoch": 0.9110095934216538, + "grad_norm": 2.40625, + "kl": 0.0, + "learning_rate": 1.9523573609081137e-07, + "logits/chosen": 792543317.3333334, + "logits/rejected": 790383718.4, + "logps/chosen": -335.4567464192708, + "logps/rejected": -681.5162109375, + "loss": 0.0109, + "rewards/chosen": 3.7656129201253257, + "rewards/margins": 14.214814122517904, + "rewards/rejected": -10.449201202392578, + "step": 9971 + }, + { + "epoch": 0.9111009593421654, + "grad_norm": 2.46875, + "kl": 0.0, + "learning_rate": 1.9483807806362997e-07, + "logits/chosen": 472531660.8, + "logits/rejected": 499633877.3333333, + "logps/chosen": -294.4142822265625, + "logps/rejected": -514.6964518229166, + "loss": 0.0174, + "rewards/chosen": 3.954903793334961, + "rewards/margins": 15.573454411824546, + "rewards/rejected": -11.618550618489584, + "step": 9972 + }, + { + "epoch": 0.911192325262677, + "grad_norm": 0.38671875, + "kl": 0.0, + "learning_rate": 1.9444081737881924e-07, + "logits/chosen": 352996160.0, + "logits/rejected": 803030442.6666666, + "logps/chosen": -151.76123046875, + "logps/rejected": -505.400634765625, + "loss": 0.0017, + "rewards/chosen": 5.327637672424316, + "rewards/margins": 16.096805254618324, + "rewards/rejected": -10.76916758219401, + "step": 9973 + }, + { + "epoch": 0.9112836911831886, + "grad_norm": 1.359375, + "kl": 0.0, + "learning_rate": 1.9404395406922848e-07, + "logits/chosen": 370989568.0, + "logits/rejected": 422901248.0, + "logps/chosen": -405.1165771484375, + "logps/rejected": -426.5682373046875, + "loss": 0.0055, + "rewards/chosen": 3.9985382556915283, + "rewards/margins": 14.6332848072052, + "rewards/rejected": -10.634746551513672, + "step": 9974 + }, + { + "epoch": 0.9113750571037004, + "grad_norm": 5.09375, + "kl": 0.0, + "learning_rate": 1.9364748816767476e-07, + "logits/chosen": 647507609.6, + "logits/rejected": 545897173.3333334, + "logps/chosen": -433.221484375, + "logps/rejected": -434.6560872395833, + "loss": 0.0347, + "rewards/chosen": 2.996063232421875, + "rewards/margins": 11.409860610961914, + "rewards/rejected": -8.413797378540039, + "step": 9975 + }, + { + "epoch": 0.911466423024212, + "grad_norm": 3.015625, + "kl": 0.0, + "learning_rate": 1.9325141970694127e-07, + "logits/chosen": 581185792.0, + "logits/rejected": 818431692.8, + "logps/chosen": -206.89229329427084, + "logps/rejected": -656.232080078125, + "loss": 0.0173, + "rewards/chosen": 3.2748587926228843, + "rewards/margins": 12.169977506001791, + "rewards/rejected": -8.895118713378906, + "step": 9976 + }, + { + "epoch": 0.9115577889447236, + "grad_norm": 55.5, + "kl": 0.0, + "learning_rate": 1.928557487197802e-07, + "logits/chosen": 413209514.6666667, + "logits/rejected": 280034201.6, + "logps/chosen": -159.64871215820312, + "logps/rejected": -238.5335693359375, + "loss": 0.138, + "rewards/chosen": 2.593063990275065, + "rewards/margins": 9.096448389689128, + "rewards/rejected": -6.503384399414062, + "step": 9977 + }, + { + "epoch": 0.9116491548652352, + "grad_norm": 2.578125, + "kl": 0.0, + "learning_rate": 1.9246047523890977e-07, + "logits/chosen": 660994496.0, + "logits/rejected": 673718208.0, + "logps/chosen": -423.447021484375, + "logps/rejected": -419.7539978027344, + "loss": 0.0147, + "rewards/chosen": 3.9494149684906006, + "rewards/margins": 13.886302709579468, + "rewards/rejected": -9.936887741088867, + "step": 9978 + }, + { + "epoch": 0.911740520785747, + "grad_norm": 2.359375, + "kl": 0.0, + "learning_rate": 1.9206559929701496e-07, + "logits/chosen": 481658240.0, + "logits/rejected": 542640576.0, + "logps/chosen": -289.4605712890625, + "logps/rejected": -670.3258666992188, + "loss": 0.0187, + "rewards/chosen": 3.572525978088379, + "rewards/margins": 14.374141693115234, + "rewards/rejected": -10.801615715026855, + "step": 9979 + }, + { + "epoch": 0.9118318867062586, + "grad_norm": 3.828125, + "kl": 0.0, + "learning_rate": 1.91671120926748e-07, + "logits/chosen": 670353280.0, + "logits/rejected": 724039040.0, + "logps/chosen": -506.0187072753906, + "logps/rejected": -374.18939208984375, + "loss": 0.0241, + "rewards/chosen": 3.4478001594543457, + "rewards/margins": 12.761921405792236, + "rewards/rejected": -9.31412124633789, + "step": 9980 + }, + { + "epoch": 0.9119232526267702, + "grad_norm": 3.15625, + "kl": 0.0, + "learning_rate": 1.9127704016073002e-07, + "logits/chosen": 594948437.3333334, + "logits/rejected": 1100569344.0, + "logps/chosen": -398.0839029947917, + "logps/rejected": -784.1141967773438, + "loss": 0.029, + "rewards/chosen": 3.7593819300333657, + "rewards/margins": 13.693649927775065, + "rewards/rejected": -9.9342679977417, + "step": 9981 + }, + { + "epoch": 0.9120146185472818, + "grad_norm": 3.171875, + "kl": 0.0, + "learning_rate": 1.9088335703154658e-07, + "logits/chosen": 536199270.4, + "logits/rejected": 381469354.6666667, + "logps/chosen": -274.015966796875, + "logps/rejected": -521.412353515625, + "loss": 0.0225, + "rewards/chosen": 3.6228981018066406, + "rewards/margins": 12.349604924519857, + "rewards/rejected": -8.726706822713217, + "step": 9982 + }, + { + "epoch": 0.9121059844677936, + "grad_norm": 3.28125, + "kl": 0.0, + "learning_rate": 1.904900715717528e-07, + "logits/chosen": 719543552.0, + "logits/rejected": 349559456.0, + "logps/chosen": -337.8332214355469, + "logps/rejected": -312.901123046875, + "loss": 0.0178, + "rewards/chosen": 4.549121856689453, + "rewards/margins": 12.396139144897461, + "rewards/rejected": -7.847017288208008, + "step": 9983 + }, + { + "epoch": 0.9121973503883052, + "grad_norm": 0.0830078125, + "kl": 0.0, + "learning_rate": 1.900971838138682e-07, + "logits/rejected": 616227968.0, + "logps/rejected": -513.2875366210938, + "loss": 0.0002, + "rewards/rejected": -9.71798324584961, + "step": 9984 + }, + { + "epoch": 0.9122887163088168, + "grad_norm": 1.4765625, + "kl": 0.0, + "learning_rate": 1.897046937903818e-07, + "logits/chosen": 401306496.0, + "logits/rejected": 578073344.0, + "logps/chosen": -334.71240234375, + "logps/rejected": -418.0325622558594, + "loss": 0.01, + "rewards/chosen": 4.518087863922119, + "rewards/margins": 13.447953701019287, + "rewards/rejected": -8.929865837097168, + "step": 9985 + }, + { + "epoch": 0.9123800822293284, + "grad_norm": 2.171875, + "kl": 0.0, + "learning_rate": 1.8931260153374876e-07, + "logits/chosen": 590325632.0, + "logits/rejected": 323314976.0, + "logps/chosen": -348.45904541015625, + "logps/rejected": -506.8544921875, + "loss": 0.01, + "rewards/chosen": 4.299381256103516, + "rewards/margins": 13.526269912719727, + "rewards/rejected": -9.226888656616211, + "step": 9986 + }, + { + "epoch": 0.9124714481498402, + "grad_norm": 6.5, + "kl": 0.0, + "learning_rate": 1.8892090707639198e-07, + "logits/chosen": 573658240.0, + "logits/rejected": 407918976.0, + "logps/chosen": -301.6582845052083, + "logps/rejected": -254.3152618408203, + "loss": 0.1428, + "rewards/chosen": 2.3373332023620605, + "rewards/margins": 9.018767356872559, + "rewards/rejected": -6.681434154510498, + "step": 9987 + }, + { + "epoch": 0.9125628140703518, + "grad_norm": 0.8203125, + "kl": 0.0, + "learning_rate": 1.8852961045070062e-07, + "logits/rejected": 557247936.0, + "logps/rejected": -630.6134033203125, + "loss": 0.0016, + "rewards/rejected": -8.280996322631836, + "step": 9988 + }, + { + "epoch": 0.9126541799908634, + "grad_norm": 3.140625, + "kl": 0.0, + "learning_rate": 1.881387116890304e-07, + "logits/chosen": 428835264.0, + "logits/rejected": 428054336.0, + "logps/chosen": -360.2449951171875, + "logps/rejected": -520.950927734375, + "loss": 0.017, + "rewards/chosen": 4.123174667358398, + "rewards/margins": 15.354589462280273, + "rewards/rejected": -11.231414794921875, + "step": 9989 + }, + { + "epoch": 0.912745545911375, + "grad_norm": 1.40625, + "kl": 0.0, + "learning_rate": 1.8774821082370665e-07, + "logits/chosen": 166887216.0, + "logits/rejected": 1024759296.0, + "logps/chosen": -226.860107421875, + "logps/rejected": -975.0361328125, + "loss": 0.006, + "rewards/chosen": 4.477361679077148, + "rewards/margins": 14.931512832641602, + "rewards/rejected": -10.454151153564453, + "step": 9990 + }, + { + "epoch": 0.9128369118318868, + "grad_norm": 1.7265625, + "kl": 0.0, + "learning_rate": 1.873581078870196e-07, + "logits/chosen": 725019033.6, + "logits/rejected": 465413205.3333333, + "logps/chosen": -314.053564453125, + "logps/rejected": -438.9875895182292, + "loss": 0.0117, + "rewards/chosen": 4.502758026123047, + "rewards/margins": 12.011725107828777, + "rewards/rejected": -7.5089670817057295, + "step": 9991 + }, + { + "epoch": 0.9129282777523984, + "grad_norm": 19.125, + "kl": 0.0, + "learning_rate": 1.869684029112262e-07, + "logits/chosen": 469613738.6666667, + "logits/rejected": 448680499.2, + "logps/chosen": -304.1579996744792, + "logps/rejected": -438.85224609375, + "loss": 0.0263, + "rewards/chosen": 3.4275925954182944, + "rewards/margins": 9.82151730855306, + "rewards/rejected": -6.393924713134766, + "step": 9992 + }, + { + "epoch": 0.91301964367291, + "grad_norm": 2.59375, + "kl": 0.0, + "learning_rate": 1.8657909592855295e-07, + "logits/chosen": 360539264.0, + "logits/rejected": 629232640.0, + "logps/chosen": -269.8127136230469, + "logps/rejected": -675.510986328125, + "loss": 0.0237, + "rewards/chosen": 3.2032504081726074, + "rewards/margins": 11.863044261932373, + "rewards/rejected": -8.659793853759766, + "step": 9993 + }, + { + "epoch": 0.9131110095934216, + "grad_norm": 18.0, + "kl": 0.0, + "learning_rate": 1.8619018697119018e-07, + "logits/chosen": 486696746.6666667, + "logits/rejected": 738710067.2, + "logps/chosen": -397.127685546875, + "logps/rejected": -668.5408203125, + "loss": 0.0353, + "rewards/chosen": 3.169663111368815, + "rewards/margins": 12.732979456583658, + "rewards/rejected": -9.563316345214844, + "step": 9994 + }, + { + "epoch": 0.9132023755139334, + "grad_norm": 3.234375, + "kl": 0.0, + "learning_rate": 1.8580167607129828e-07, + "logits/chosen": 470740832.0, + "logits/rejected": 582316928.0, + "logps/chosen": -387.840087890625, + "logps/rejected": -580.1051025390625, + "loss": 0.0196, + "rewards/chosen": 3.336371421813965, + "rewards/margins": 14.678179740905762, + "rewards/rejected": -11.341808319091797, + "step": 9995 + }, + { + "epoch": 0.913293741434445, + "grad_norm": 0.66796875, + "kl": 0.0, + "learning_rate": 1.8541356326100436e-07, + "logits/chosen": 173516896.0, + "logits/rejected": 546905600.0, + "logps/chosen": -91.85491180419922, + "logps/rejected": -579.5, + "loss": 0.0029, + "rewards/chosen": 3.8967506885528564, + "rewards/margins": 11.995141540254865, + "rewards/rejected": -8.098390851702009, + "step": 9996 + }, + { + "epoch": 0.9133851073549566, + "grad_norm": 7.3125, + "kl": 0.0, + "learning_rate": 1.850258485723988e-07, + "logits/chosen": 609706240.0, + "logits/rejected": 706629440.0, + "logps/chosen": -383.2813720703125, + "logps/rejected": -350.1923828125, + "loss": 0.1029, + "rewards/chosen": 4.044726371765137, + "rewards/margins": 11.454447746276855, + "rewards/rejected": -7.409721374511719, + "step": 9997 + }, + { + "epoch": 0.9134764732754682, + "grad_norm": 0.68359375, + "kl": 0.0, + "learning_rate": 1.8463853203754488e-07, + "logits/chosen": 892038144.0, + "logits/rejected": 878282956.8, + "logps/chosen": -602.3609212239584, + "logps/rejected": -692.24375, + "loss": 0.0033, + "rewards/chosen": 4.868028004964192, + "rewards/margins": 16.646734364827473, + "rewards/rejected": -11.778706359863282, + "step": 9998 + }, + { + "epoch": 0.91356783919598, + "grad_norm": 2.109375, + "kl": 0.0, + "learning_rate": 1.8425161368846923e-07, + "logits/chosen": 477936224.0, + "logits/rejected": 712818048.0, + "logps/chosen": -304.32830810546875, + "logps/rejected": -762.3541259765625, + "loss": 0.0108, + "rewards/chosen": 4.269552230834961, + "rewards/margins": 13.48604965209961, + "rewards/rejected": -9.216497421264648, + "step": 9999 + }, + { + "epoch": 0.9136592051164916, + "grad_norm": 3.96875, + "kl": 0.0, + "learning_rate": 1.8386509355716565e-07, + "logits/chosen": 457882112.0, + "logits/rejected": 558241408.0, + "logps/chosen": -251.0760498046875, + "logps/rejected": -275.55267333984375, + "loss": 0.0338, + "rewards/chosen": 3.30730406443278, + "rewards/margins": 10.761140982309977, + "rewards/rejected": -7.453836917877197, + "step": 10000 + }, + { + "epoch": 0.9137505710370032, + "grad_norm": 2.734375, + "kl": 0.0, + "learning_rate": 1.8347897167559636e-07, + "logits/chosen": 610353664.0, + "logits/rejected": 714800128.0, + "logps/chosen": -266.5053405761719, + "logps/rejected": -751.5918579101562, + "loss": 0.0105, + "rewards/chosen": 4.467464447021484, + "rewards/margins": 14.440352439880371, + "rewards/rejected": -9.972887992858887, + "step": 10001 + }, + { + "epoch": 0.9138419369575148, + "grad_norm": 75.5, + "kl": 0.0, + "learning_rate": 1.830932480756903e-07, + "logits/chosen": 568267878.4, + "logits/rejected": 663275648.0, + "logps/chosen": -285.01416015625, + "logps/rejected": -681.7890218098959, + "loss": 0.073, + "rewards/chosen": 3.3609214782714845, + "rewards/margins": 11.938967641194662, + "rewards/rejected": -8.578046162923178, + "step": 10002 + }, + { + "epoch": 0.9139333028780265, + "grad_norm": 2.578125, + "kl": 0.0, + "learning_rate": 1.8270792278934302e-07, + "logits/chosen": 522288576.0, + "logits/rejected": 529251072.0, + "logps/chosen": -292.8893737792969, + "logps/rejected": -565.0078125, + "loss": 0.0156, + "rewards/chosen": 3.822953701019287, + "rewards/margins": 12.586015224456787, + "rewards/rejected": -8.7630615234375, + "step": 10003 + }, + { + "epoch": 0.9140246687985382, + "grad_norm": 52.0, + "kl": 0.0, + "learning_rate": 1.823229958484174e-07, + "logits/chosen": 696547328.0, + "logits/rejected": 539338069.3333334, + "logps/chosen": -514.0705078125, + "logps/rejected": -359.2747395833333, + "loss": 0.0879, + "rewards/chosen": 4.161694717407227, + "rewards/margins": 9.71539675394694, + "rewards/rejected": -5.553702036539714, + "step": 10004 + }, + { + "epoch": 0.9141160347190498, + "grad_norm": 0.0703125, + "kl": 0.0, + "learning_rate": 1.8193846728474296e-07, + "logits/rejected": 884745216.0, + "logps/rejected": -510.20001220703125, + "loss": 0.0003, + "rewards/rejected": -10.293380737304688, + "step": 10005 + }, + { + "epoch": 0.9142074006395614, + "grad_norm": 3.390625, + "kl": 0.0, + "learning_rate": 1.815543371301165e-07, + "logits/chosen": 463093452.8, + "logits/rejected": 521158741.3333333, + "logps/chosen": -187.6255859375, + "logps/rejected": -521.993896484375, + "loss": 0.0316, + "rewards/chosen": 3.142866325378418, + "rewards/margins": 11.388075065612792, + "rewards/rejected": -8.245208740234375, + "step": 10006 + }, + { + "epoch": 0.9142987665600731, + "grad_norm": 3.140625, + "kl": 0.0, + "learning_rate": 1.8117060541630317e-07, + "logits/chosen": 277311488.0, + "logits/rejected": 611030784.0, + "logps/chosen": -295.0485026041667, + "logps/rejected": -517.59931640625, + "loss": 0.0245, + "rewards/chosen": 2.8739147186279297, + "rewards/margins": 11.92821159362793, + "rewards/rejected": -9.054296875, + "step": 10007 + }, + { + "epoch": 0.9143901324805848, + "grad_norm": 1.671875, + "kl": 0.0, + "learning_rate": 1.8078727217503366e-07, + "logits/chosen": 595676586.6666666, + "logits/rejected": 1017833062.4, + "logps/chosen": -296.3504638671875, + "logps/rejected": -424.52568359375, + "loss": 0.0186, + "rewards/chosen": 3.3926963806152344, + "rewards/margins": 12.34635238647461, + "rewards/rejected": -8.953656005859376, + "step": 10008 + }, + { + "epoch": 0.9144814984010964, + "grad_norm": 12.3125, + "kl": 0.0, + "learning_rate": 1.8040433743800546e-07, + "logits/chosen": 865237632.0, + "logps/chosen": -311.0071716308594, + "loss": 0.1378, + "rewards/chosen": 3.3614065647125244, + "step": 10009 + }, + { + "epoch": 0.914572864321608, + "grad_norm": 1.2578125, + "kl": 0.0, + "learning_rate": 1.8002180123688318e-07, + "logits/chosen": 828512576.0, + "logits/rejected": 396223456.0, + "logps/chosen": -336.89691162109375, + "logps/rejected": -453.08551025390625, + "loss": 0.0075, + "rewards/chosen": 4.28872013092041, + "rewards/margins": 13.378944396972656, + "rewards/rejected": -9.090224266052246, + "step": 10010 + }, + { + "epoch": 0.9146642302421197, + "grad_norm": 2.109375, + "kl": 0.0, + "learning_rate": 1.7963966360330044e-07, + "logits/chosen": 313724373.3333333, + "logits/rejected": 581244723.2, + "logps/chosen": -226.41316731770834, + "logps/rejected": -481.660546875, + "loss": 0.1213, + "rewards/chosen": 3.6367807388305664, + "rewards/margins": 11.881867408752441, + "rewards/rejected": -8.245086669921875, + "step": 10011 + }, + { + "epoch": 0.9147555961626314, + "grad_norm": 2.96875, + "kl": 0.0, + "learning_rate": 1.792579245688564e-07, + "logits/chosen": 303289984.0, + "logits/rejected": 411038016.0, + "logps/chosen": -179.427978515625, + "logps/rejected": -542.16748046875, + "loss": 0.0214, + "rewards/chosen": 3.76547908782959, + "rewards/margins": 13.787859916687012, + "rewards/rejected": -10.022380828857422, + "step": 10012 + }, + { + "epoch": 0.914846962083143, + "grad_norm": 5.15625, + "kl": 0.0, + "learning_rate": 1.7887658416511634e-07, + "logits/chosen": 611377868.8, + "logits/rejected": 1402856704.0, + "logps/chosen": -336.070166015625, + "logps/rejected": -430.0513102213542, + "loss": 0.0304, + "rewards/chosen": 3.6987625122070313, + "rewards/margins": 13.088746007283529, + "rewards/rejected": -9.389983495076498, + "step": 10013 + }, + { + "epoch": 0.9149383280036546, + "grad_norm": 13.625, + "kl": 0.0, + "learning_rate": 1.784956424236145e-07, + "logits/chosen": 542738090.6666666, + "logits/rejected": 428336332.8, + "logps/chosen": -355.5343424479167, + "logps/rejected": -568.68857421875, + "loss": 0.0358, + "rewards/chosen": 3.0197715759277344, + "rewards/margins": 9.818931579589844, + "rewards/rejected": -6.799160003662109, + "step": 10014 + }, + { + "epoch": 0.9150296939241663, + "grad_norm": 1.578125, + "kl": 0.0, + "learning_rate": 1.7811509937584958e-07, + "logits/chosen": 582255155.2, + "logits/rejected": 419915776.0, + "logps/chosen": -459.7564453125, + "logps/rejected": -360.4244791666667, + "loss": 0.0074, + "rewards/chosen": 4.812509918212891, + "rewards/margins": 14.690828704833985, + "rewards/rejected": -9.878318786621094, + "step": 10015 + }, + { + "epoch": 0.915121059844678, + "grad_norm": 1.6328125, + "kl": 0.0, + "learning_rate": 1.7773495505329141e-07, + "logits/chosen": 436215488.0, + "logits/rejected": 348226346.6666667, + "logps/chosen": -355.9258728027344, + "logps/rejected": -378.9852294921875, + "loss": 0.0087, + "rewards/chosen": 3.4091796875, + "rewards/margins": 12.112377802530924, + "rewards/rejected": -8.703198115030924, + "step": 10016 + }, + { + "epoch": 0.9152124257651896, + "grad_norm": 2.171875, + "kl": 0.0, + "learning_rate": 1.7735520948737263e-07, + "logits/chosen": 825005056.0, + "logits/rejected": 600060774.4, + "logps/chosen": -423.3974202473958, + "logps/rejected": -382.270849609375, + "loss": 0.0124, + "rewards/chosen": 3.407257080078125, + "rewards/margins": 12.658694458007812, + "rewards/rejected": -9.251437377929687, + "step": 10017 + }, + { + "epoch": 0.9153037916857012, + "grad_norm": 2.671875, + "kl": 0.0, + "learning_rate": 1.7697586270949585e-07, + "logits/chosen": 474523904.0, + "logits/rejected": 428232448.0, + "logps/chosen": -213.49625651041666, + "logps/rejected": -541.89775390625, + "loss": 0.0173, + "rewards/chosen": 3.293349266052246, + "rewards/margins": 11.928568077087402, + "rewards/rejected": -8.635218811035156, + "step": 10018 + }, + { + "epoch": 0.9153951576062129, + "grad_norm": 2.71875, + "kl": 0.0, + "learning_rate": 1.765969147510277e-07, + "logits/chosen": 875179417.6, + "logits/rejected": 939772501.3333334, + "logps/chosen": -422.4822265625, + "logps/rejected": -437.2691650390625, + "loss": 0.018, + "rewards/chosen": 3.7212432861328124, + "rewards/margins": 12.003397369384766, + "rewards/rejected": -8.282154083251953, + "step": 10019 + }, + { + "epoch": 0.9154865235267245, + "grad_norm": 3.84375, + "kl": 0.0, + "learning_rate": 1.7621836564330585e-07, + "logits/chosen": 482330709.3333333, + "logits/rejected": 582537984.0, + "logps/chosen": -403.8545328776042, + "logps/rejected": -479.05380859375, + "loss": 0.0174, + "rewards/chosen": 3.5390412012736, + "rewards/margins": 11.622078005472819, + "rewards/rejected": -8.083036804199219, + "step": 10020 + }, + { + "epoch": 0.9155778894472362, + "grad_norm": 0.169921875, + "kl": 0.0, + "learning_rate": 1.758402154176314e-07, + "logits/chosen": 212848336.0, + "logits/rejected": 344574390.85714287, + "logps/chosen": -174.204833984375, + "logps/rejected": -431.99630301339283, + "loss": 0.0009, + "rewards/chosen": 5.430418491363525, + "rewards/margins": 14.359769753047399, + "rewards/rejected": -8.929351261683873, + "step": 10021 + }, + { + "epoch": 0.9156692553677478, + "grad_norm": 3.359375, + "kl": 0.0, + "learning_rate": 1.754624641052749e-07, + "logits/chosen": 536321962.6666667, + "logits/rejected": 467386265.6, + "logps/chosen": -371.3831380208333, + "logps/rejected": -427.30673828125, + "loss": 0.0154, + "rewards/chosen": 3.627567927042643, + "rewards/margins": 13.207787958780925, + "rewards/rejected": -9.580220031738282, + "step": 10022 + }, + { + "epoch": 0.9157606212882595, + "grad_norm": 6.4375, + "kl": 0.0, + "learning_rate": 1.750851117374719e-07, + "logits/chosen": 517085337.6, + "logits/rejected": 927132416.0, + "logps/chosen": -382.529443359375, + "logps/rejected": -841.6197102864584, + "loss": 0.0299, + "rewards/chosen": 3.524117279052734, + "rewards/margins": 13.928648122151692, + "rewards/rejected": -10.404530843098959, + "step": 10023 + }, + { + "epoch": 0.9158519872087711, + "grad_norm": 3.046875, + "kl": 0.0, + "learning_rate": 1.747081583454263e-07, + "logits/chosen": 604629888.0, + "logits/rejected": 296887392.0, + "logps/chosen": -271.2569885253906, + "logps/rejected": -361.1524353027344, + "loss": 0.0251, + "rewards/chosen": 3.077406406402588, + "rewards/margins": 11.887278079986572, + "rewards/rejected": -8.809871673583984, + "step": 10024 + }, + { + "epoch": 0.9159433531292828, + "grad_norm": 2.09375, + "kl": 0.0, + "learning_rate": 1.7433160396030935e-07, + "logits/chosen": 758767360.0, + "logits/rejected": 1195942144.0, + "logps/chosen": -144.3097381591797, + "logps/rejected": -450.045166015625, + "loss": 0.0144, + "rewards/chosen": 4.169558525085449, + "rewards/margins": 13.696913719177246, + "rewards/rejected": -9.527355194091797, + "step": 10025 + }, + { + "epoch": 0.9160347190497944, + "grad_norm": 2.015625, + "kl": 0.0, + "learning_rate": 1.7395544861325718e-07, + "logits/chosen": 430719072.0, + "logits/rejected": 445900800.0, + "logps/chosen": -321.0939636230469, + "logps/rejected": -526.2255045572916, + "loss": 0.0078, + "rewards/chosen": 4.0430908203125, + "rewards/margins": 13.468893686930338, + "rewards/rejected": -9.425802866617838, + "step": 10026 + }, + { + "epoch": 0.9161260849703061, + "grad_norm": 0.287109375, + "kl": 0.0, + "learning_rate": 1.7357969233537498e-07, + "logits/chosen": 381099072.0, + "logits/rejected": 489387702.85714287, + "logps/chosen": -354.7742614746094, + "logps/rejected": -514.3353794642857, + "loss": 0.001, + "rewards/chosen": 5.198935031890869, + "rewards/margins": 15.33675241470337, + "rewards/rejected": -10.1378173828125, + "step": 10027 + }, + { + "epoch": 0.9162174508908177, + "grad_norm": 3.109375, + "kl": 0.0, + "learning_rate": 1.732043351577345e-07, + "logits/chosen": 422398003.2, + "logits/rejected": 429336405.3333333, + "logps/chosen": -304.3853759765625, + "logps/rejected": -501.7218424479167, + "loss": 0.0242, + "rewards/chosen": 3.621052932739258, + "rewards/margins": 11.449269485473632, + "rewards/rejected": -7.828216552734375, + "step": 10028 + }, + { + "epoch": 0.9163088168113294, + "grad_norm": 6.28125, + "kl": 1.050313949584961, + "learning_rate": 1.728293771113748e-07, + "logits/chosen": 724624896.0, + "logits/rejected": 730650368.0, + "logps/chosen": -267.22140066964283, + "logps/rejected": -742.6005859375, + "loss": 0.0393, + "rewards/chosen": 3.913231985909598, + "rewards/margins": 13.754821913582937, + "rewards/rejected": -9.84158992767334, + "step": 10029 + }, + { + "epoch": 0.916400182731841, + "grad_norm": 1.984375, + "kl": 0.0, + "learning_rate": 1.724548182273006e-07, + "logits/chosen": 822886272.0, + "logits/rejected": 621735296.0, + "logps/chosen": -370.19342041015625, + "logps/rejected": -327.42755126953125, + "loss": 0.0141, + "rewards/chosen": 3.558276414871216, + "rewards/margins": 12.287806749343872, + "rewards/rejected": -8.729530334472656, + "step": 10030 + }, + { + "epoch": 0.9164915486523527, + "grad_norm": 4.53125, + "kl": 0.0, + "learning_rate": 1.7208065853648425e-07, + "logits/chosen": 433193898.6666667, + "logits/rejected": 552027200.0, + "logps/chosen": -269.5342610677083, + "logps/rejected": -664.3031005859375, + "loss": 0.0341, + "rewards/chosen": 3.3885936737060547, + "rewards/margins": 14.810895919799805, + "rewards/rejected": -11.42230224609375, + "step": 10031 + }, + { + "epoch": 0.9165829145728643, + "grad_norm": 2.984375, + "kl": 0.0, + "learning_rate": 1.7170689806986607e-07, + "logits/chosen": 591128192.0, + "logits/rejected": 618861568.0, + "logps/chosen": -284.7992858886719, + "logps/rejected": -472.1703186035156, + "loss": 0.0203, + "rewards/chosen": 3.3318426609039307, + "rewards/margins": 12.623381853103638, + "rewards/rejected": -9.291539192199707, + "step": 10032 + }, + { + "epoch": 0.916674280493376, + "grad_norm": 8.375, + "kl": 0.0, + "learning_rate": 1.713335368583524e-07, + "logits/chosen": 529512416.0, + "logits/rejected": 850991616.0, + "logps/chosen": -330.7684326171875, + "logps/rejected": -511.0929870605469, + "loss": 0.0917, + "rewards/chosen": 4.334256172180176, + "rewards/margins": 11.465338706970215, + "rewards/rejected": -7.131082534790039, + "step": 10033 + }, + { + "epoch": 0.9167656464138876, + "grad_norm": 1.3828125, + "kl": 0.0, + "learning_rate": 1.7096057493281693e-07, + "logits/chosen": 1557216256.0, + "logits/rejected": 685128345.6, + "logps/chosen": -310.39565022786456, + "logps/rejected": -496.955078125, + "loss": 0.0118, + "rewards/chosen": 3.7498343785603843, + "rewards/margins": 13.516811307271322, + "rewards/rejected": -9.766976928710937, + "step": 10034 + }, + { + "epoch": 0.9168570123343993, + "grad_norm": 1.7578125, + "kl": 0.0, + "learning_rate": 1.7058801232409993e-07, + "logits/chosen": 419844320.0, + "logits/rejected": 470049450.6666667, + "logps/chosen": -495.20941162109375, + "logps/rejected": -376.7816975911458, + "loss": 0.0095, + "rewards/chosen": 3.4522414207458496, + "rewards/margins": 12.551198482513428, + "rewards/rejected": -9.098957061767578, + "step": 10035 + }, + { + "epoch": 0.9169483782549109, + "grad_norm": 2.28125, + "kl": 0.0, + "learning_rate": 1.7021584906300792e-07, + "logits/chosen": 631054165.3333334, + "logits/rejected": 748538726.4, + "logps/chosen": -376.1955973307292, + "logps/rejected": -613.53681640625, + "loss": 0.0097, + "rewards/chosen": 4.147861480712891, + "rewards/margins": 14.265129852294923, + "rewards/rejected": -10.117268371582032, + "step": 10036 + }, + { + "epoch": 0.9170397441754226, + "grad_norm": 0.12890625, + "kl": 0.0, + "learning_rate": 1.698440851803168e-07, + "logits/chosen": 1190322816.0, + "logits/rejected": 484894683.4285714, + "logps/chosen": -138.87591552734375, + "logps/rejected": -558.85595703125, + "loss": 0.0007, + "rewards/chosen": 5.1932373046875, + "rewards/margins": 15.820714678083148, + "rewards/rejected": -10.627477373395648, + "step": 10037 + }, + { + "epoch": 0.9171311100959342, + "grad_norm": 2.484375, + "kl": 0.0, + "learning_rate": 1.6947272070676756e-07, + "logits/chosen": 326897664.0, + "logits/rejected": 690561152.0, + "logps/chosen": -185.63174438476562, + "logps/rejected": -587.765869140625, + "loss": 0.0144, + "rewards/chosen": 3.895415782928467, + "rewards/margins": 12.92244291305542, + "rewards/rejected": -9.027027130126953, + "step": 10038 + }, + { + "epoch": 0.9172224760164459, + "grad_norm": 5.9375, + "kl": 0.0, + "learning_rate": 1.691017556730684e-07, + "logits/chosen": 763049472.0, + "logits/rejected": 697337088.0, + "logps/chosen": -346.174267578125, + "logps/rejected": -424.7794596354167, + "loss": 0.0318, + "rewards/chosen": 3.668701171875, + "rewards/margins": 11.0902099609375, + "rewards/rejected": -7.4215087890625, + "step": 10039 + }, + { + "epoch": 0.9173138419369575, + "grad_norm": 1.2890625, + "kl": 0.0, + "learning_rate": 1.687311901098948e-07, + "logits/chosen": 322166677.3333333, + "logits/rejected": 353997926.4, + "logps/chosen": -373.0032552083333, + "logps/rejected": -396.3345703125, + "loss": 0.0068, + "rewards/chosen": 4.79579480489095, + "rewards/margins": 13.677429326375325, + "rewards/rejected": -8.881634521484376, + "step": 10040 + }, + { + "epoch": 0.9174052078574692, + "grad_norm": 2.03125, + "kl": 0.0, + "learning_rate": 1.683610240478889e-07, + "logits/chosen": 822370969.6, + "logits/rejected": 1339592106.6666667, + "logps/chosen": -323.27646484375, + "logps/rejected": -825.75927734375, + "loss": 0.0153, + "rewards/chosen": 3.867376708984375, + "rewards/margins": 13.946298980712891, + "rewards/rejected": -10.078922271728516, + "step": 10041 + }, + { + "epoch": 0.9174965737779808, + "grad_norm": 1.8984375, + "kl": 0.0, + "learning_rate": 1.6799125751766066e-07, + "logits/chosen": 457556224.0, + "logits/rejected": 385361152.0, + "logps/chosen": -443.5205078125, + "logps/rejected": -407.57470703125, + "loss": 0.0104, + "rewards/chosen": 4.5179706573486325, + "rewards/margins": 11.601088460286459, + "rewards/rejected": -7.083117802937825, + "step": 10042 + }, + { + "epoch": 0.9175879396984925, + "grad_norm": 0.7578125, + "kl": 0.0, + "learning_rate": 1.6762189054978562e-07, + "logits/chosen": 463747776.0, + "logits/rejected": 515030826.6666667, + "logps/chosen": -237.37791442871094, + "logps/rejected": -530.7304280598959, + "loss": 0.003, + "rewards/chosen": 4.790343761444092, + "rewards/margins": 15.482601960500082, + "rewards/rejected": -10.69225819905599, + "step": 10043 + }, + { + "epoch": 0.9176793056190041, + "grad_norm": 2.09375, + "kl": 0.0, + "learning_rate": 1.6725292317480712e-07, + "logits/chosen": 341871274.6666667, + "logits/rejected": 371263564.8, + "logps/chosen": -229.618408203125, + "logps/rejected": -338.204736328125, + "loss": 0.0059, + "rewards/chosen": 4.728789329528809, + "rewards/margins": 13.39514980316162, + "rewards/rejected": -8.666360473632812, + "step": 10044 + }, + { + "epoch": 0.9177706715395157, + "grad_norm": 3.78125, + "kl": 0.0, + "learning_rate": 1.6688435542323522e-07, + "logits/chosen": 493345484.8, + "logits/rejected": 467482112.0, + "logps/chosen": -200.46409912109374, + "logps/rejected": -518.2057698567709, + "loss": 0.0253, + "rewards/chosen": 3.6327140808105467, + "rewards/margins": 12.027708180745442, + "rewards/rejected": -8.394994099934896, + "step": 10045 + }, + { + "epoch": 0.9178620374600274, + "grad_norm": 2.71875, + "kl": 0.0, + "learning_rate": 1.6651618732554774e-07, + "logits/chosen": 838113126.4, + "logits/rejected": 491801898.6666667, + "logps/chosen": -372.6318359375, + "logps/rejected": -602.6186930338541, + "loss": 0.016, + "rewards/chosen": 3.8765228271484373, + "rewards/margins": 14.115264383951821, + "rewards/rejected": -10.238741556803385, + "step": 10046 + }, + { + "epoch": 0.9179534033805391, + "grad_norm": 2.375, + "kl": 0.0, + "learning_rate": 1.6614841891218815e-07, + "logits/chosen": 489569536.0, + "logits/rejected": 418486912.0, + "logps/chosen": -334.6214599609375, + "logps/rejected": -415.9645080566406, + "loss": 0.0158, + "rewards/chosen": 3.7854928970336914, + "rewards/margins": 12.509516716003418, + "rewards/rejected": -8.724023818969727, + "step": 10047 + }, + { + "epoch": 0.9180447693010507, + "grad_norm": 1.984375, + "kl": 0.0, + "learning_rate": 1.6578105021356815e-07, + "logits/chosen": 554737322.6666666, + "logits/rejected": 546572902.4, + "logps/chosen": -161.25080362955728, + "logps/rejected": -408.7058349609375, + "loss": 0.0126, + "rewards/chosen": 4.107810656229655, + "rewards/margins": 12.607661120096843, + "rewards/rejected": -8.499850463867187, + "step": 10048 + }, + { + "epoch": 0.9181361352215623, + "grad_norm": 2.046875, + "kl": 0.0, + "learning_rate": 1.6541408126006464e-07, + "logits/chosen": 570719385.6, + "logits/rejected": 628528896.0, + "logps/chosen": -361.54375, + "logps/rejected": -701.2571614583334, + "loss": 0.0091, + "rewards/chosen": 4.572051239013672, + "rewards/margins": 14.122869364420573, + "rewards/rejected": -9.5508181254069, + "step": 10049 + }, + { + "epoch": 0.918227501142074, + "grad_norm": 2.1875, + "kl": 0.0, + "learning_rate": 1.6504751208202385e-07, + "logits/chosen": 532586496.0, + "logits/rejected": 965346816.0, + "logps/chosen": -254.70843505859375, + "logps/rejected": -631.92431640625, + "loss": 0.0104, + "rewards/chosen": 4.313117027282715, + "rewards/margins": 12.531691551208496, + "rewards/rejected": -8.218574523925781, + "step": 10050 + }, + { + "epoch": 0.9183188670625857, + "grad_norm": 4.375, + "kl": 0.0, + "learning_rate": 1.6468134270975655e-07, + "logits/chosen": 511700821.3333333, + "logits/rejected": 491916492.8, + "logps/chosen": -311.9204915364583, + "logps/rejected": -558.5181640625, + "loss": 0.0205, + "rewards/chosen": 3.556333859761556, + "rewards/margins": 12.058340390523275, + "rewards/rejected": -8.502006530761719, + "step": 10051 + }, + { + "epoch": 0.9184102329830973, + "grad_norm": 7.40625, + "kl": 0.0, + "learning_rate": 1.643155731735413e-07, + "logits/chosen": 372993472.0, + "logits/rejected": 467005482.6666667, + "logps/chosen": -273.8919677734375, + "logps/rejected": -444.0728759765625, + "loss": 0.0909, + "rewards/chosen": 3.808712959289551, + "rewards/margins": 13.844426155090332, + "rewards/rejected": -10.035713195800781, + "step": 10052 + }, + { + "epoch": 0.9185015989036089, + "grad_norm": 2.0625, + "kl": 0.0, + "learning_rate": 1.6395020350362557e-07, + "logits/chosen": 348057600.0, + "logits/rejected": 286882858.6666667, + "logps/chosen": -338.0475830078125, + "logps/rejected": -599.9618326822916, + "loss": 0.0097, + "rewards/chosen": 4.625511169433594, + "rewards/margins": 16.28354517618815, + "rewards/rejected": -11.658034006754557, + "step": 10053 + }, + { + "epoch": 0.9185929648241206, + "grad_norm": 0.32421875, + "kl": 0.0, + "learning_rate": 1.6358523373022071e-07, + "logits/chosen": 340770080.0, + "logits/rejected": 470980800.0, + "logps/chosen": -283.173583984375, + "logps/rejected": -544.1572875976562, + "loss": 0.0021, + "rewards/chosen": 5.746767044067383, + "rewards/margins": 14.355964660644531, + "rewards/rejected": -8.609197616577148, + "step": 10054 + }, + { + "epoch": 0.9186843307446323, + "grad_norm": 1.984375, + "kl": 0.0, + "learning_rate": 1.6322066388350655e-07, + "logits/chosen": 555003187.2, + "logits/rejected": 525340416.0, + "logps/chosen": -369.837548828125, + "logps/rejected": -625.4758707682291, + "loss": 0.0147, + "rewards/chosen": 3.8191577911376955, + "rewards/margins": 16.683647028605144, + "rewards/rejected": -12.864489237467447, + "step": 10055 + }, + { + "epoch": 0.9187756966651439, + "grad_norm": 3.796875, + "kl": 0.0, + "learning_rate": 1.6285649399363003e-07, + "logits/chosen": 859398656.0, + "logits/rejected": 455469568.0, + "logps/chosen": -409.995703125, + "logps/rejected": -478.9647623697917, + "loss": 0.132, + "rewards/chosen": 2.851752281188965, + "rewards/margins": 12.918534914652506, + "rewards/rejected": -10.066782633463541, + "step": 10056 + }, + { + "epoch": 0.9188670625856555, + "grad_norm": 3.984375, + "kl": 0.0, + "learning_rate": 1.624927240907037e-07, + "logits/chosen": 637886720.0, + "logits/rejected": 392653568.0, + "logps/chosen": -328.34234619140625, + "logps/rejected": -398.7244873046875, + "loss": 0.0198, + "rewards/chosen": 3.7111358642578125, + "rewards/margins": 13.177976608276367, + "rewards/rejected": -9.466840744018555, + "step": 10057 + }, + { + "epoch": 0.9189584285061672, + "grad_norm": 2.1875, + "kl": 0.0, + "learning_rate": 1.6212935420480912e-07, + "logits/chosen": 766209216.0, + "logits/rejected": 574233088.0, + "logps/chosen": -289.0738830566406, + "logps/rejected": -430.7730407714844, + "loss": 0.0128, + "rewards/chosen": 3.944035768508911, + "rewards/margins": 13.030908823013306, + "rewards/rejected": -9.086873054504395, + "step": 10058 + }, + { + "epoch": 0.9190497944266789, + "grad_norm": 22.375, + "kl": 0.0, + "learning_rate": 1.6176638436599278e-07, + "logits/chosen": 431268192.0, + "logits/rejected": 329767328.0, + "logps/chosen": -221.64324951171875, + "logps/rejected": -419.27911376953125, + "loss": 0.0274, + "rewards/chosen": 3.0152018070220947, + "rewards/margins": 11.526211023330688, + "rewards/rejected": -8.511009216308594, + "step": 10059 + }, + { + "epoch": 0.9191411603471905, + "grad_norm": 0.58203125, + "kl": 0.0, + "learning_rate": 1.6140381460426957e-07, + "logits/chosen": 587031808.0, + "logits/rejected": 1139493683.2, + "logps/chosen": -199.98333740234375, + "logps/rejected": -539.10927734375, + "loss": 0.0036, + "rewards/chosen": 4.8226369222005205, + "rewards/margins": 14.045853169759113, + "rewards/rejected": -9.223216247558593, + "step": 10060 + }, + { + "epoch": 0.9192325262677021, + "grad_norm": 1.921875, + "kl": 0.0, + "learning_rate": 1.6104164494961994e-07, + "logits/chosen": 629182848.0, + "logits/rejected": 418660992.0, + "logps/chosen": -330.2108968098958, + "logps/rejected": -543.3033447265625, + "loss": 0.0137, + "rewards/chosen": 4.3105723063151045, + "rewards/margins": 13.938893000284832, + "rewards/rejected": -9.628320693969727, + "step": 10061 + }, + { + "epoch": 0.9193238921882138, + "grad_norm": 4.8125, + "kl": 0.0, + "learning_rate": 1.6067987543199215e-07, + "logits/chosen": 579866624.0, + "logits/rejected": 810965248.0, + "logps/chosen": -378.24765625, + "logps/rejected": -422.0965983072917, + "loss": 0.0252, + "rewards/chosen": 3.85784912109375, + "rewards/margins": 11.565605799357098, + "rewards/rejected": -7.707756678263347, + "step": 10062 + }, + { + "epoch": 0.9194152581087255, + "grad_norm": 1.3828125, + "kl": 0.0, + "learning_rate": 1.6031850608130172e-07, + "logits/chosen": 427874986.6666667, + "logits/rejected": 562479104.0, + "logps/chosen": -218.048828125, + "logps/rejected": -587.8685546875, + "loss": 0.0085, + "rewards/chosen": 4.237228711446126, + "rewards/margins": 14.50305512746175, + "rewards/rejected": -10.265826416015624, + "step": 10063 + }, + { + "epoch": 0.9195066240292371, + "grad_norm": 5.34375, + "kl": 0.0, + "learning_rate": 1.5995753692743032e-07, + "logits/chosen": 381224096.0, + "logits/rejected": 258340576.0, + "logps/chosen": -254.90597534179688, + "logps/rejected": -335.8769226074219, + "loss": 0.1149, + "rewards/chosen": 3.398249626159668, + "rewards/margins": 11.23337459564209, + "rewards/rejected": -7.835124969482422, + "step": 10064 + }, + { + "epoch": 0.9195979899497487, + "grad_norm": 2.6875, + "kl": 0.0, + "learning_rate": 1.5959696800022683e-07, + "logits/chosen": 549772544.0, + "logits/rejected": 590977344.0, + "logps/chosen": -275.2590026855469, + "logps/rejected": -461.1553039550781, + "loss": 0.0214, + "rewards/chosen": 3.1632862091064453, + "rewards/margins": 13.030519485473633, + "rewards/rejected": -9.867233276367188, + "step": 10065 + }, + { + "epoch": 0.9196893558702603, + "grad_norm": 0.84765625, + "kl": 0.0, + "learning_rate": 1.5923679932950577e-07, + "logits/chosen": 472262784.0, + "logits/rejected": 364345190.4, + "logps/chosen": -315.16632080078125, + "logps/rejected": -489.45908203125, + "loss": 0.0046, + "rewards/chosen": 4.827088038126628, + "rewards/margins": 15.42371431986491, + "rewards/rejected": -10.596626281738281, + "step": 10066 + }, + { + "epoch": 0.9197807217907721, + "grad_norm": 22.125, + "kl": 0.0, + "learning_rate": 1.5887703094505158e-07, + "logits/chosen": 769729877.3333334, + "logits/rejected": 435446169.6, + "logps/chosen": -384.4720865885417, + "logps/rejected": -424.640966796875, + "loss": 0.0219, + "rewards/chosen": 3.44735050201416, + "rewards/margins": 13.301832008361817, + "rewards/rejected": -9.854481506347657, + "step": 10067 + }, + { + "epoch": 0.9198720877112837, + "grad_norm": 3.828125, + "kl": 0.0, + "learning_rate": 1.5851766287661275e-07, + "logits/chosen": 465626026.6666667, + "logits/rejected": 449540928.0, + "logps/chosen": -193.0286865234375, + "logps/rejected": -620.393798828125, + "loss": 0.0362, + "rewards/chosen": 3.395127296447754, + "rewards/margins": 13.596087455749512, + "rewards/rejected": -10.200960159301758, + "step": 10068 + }, + { + "epoch": 0.9199634536317953, + "grad_norm": 10.875, + "kl": 0.0, + "learning_rate": 1.5815869515390603e-07, + "logits/chosen": 359166560.0, + "logits/rejected": 345403136.0, + "logps/chosen": -276.4580078125, + "logps/rejected": -369.7805480957031, + "loss": 0.0986, + "rewards/chosen": 4.220523357391357, + "rewards/margins": 11.16041612625122, + "rewards/rejected": -6.939892768859863, + "step": 10069 + }, + { + "epoch": 0.920054819552307, + "grad_norm": 2.171875, + "kl": 0.0, + "learning_rate": 1.578001278066138e-07, + "logits/chosen": 773949568.0, + "logits/rejected": 782295978.6666666, + "logps/chosen": -248.52186584472656, + "logps/rejected": -577.6570231119791, + "loss": 0.0122, + "rewards/chosen": 3.034655809402466, + "rewards/margins": 13.483259121576944, + "rewards/rejected": -10.448603312174479, + "step": 10070 + }, + { + "epoch": 0.9201461854728187, + "grad_norm": 1.5703125, + "kl": 0.0, + "learning_rate": 1.5744196086438789e-07, + "logits/chosen": 479371221.3333333, + "logits/rejected": 558758860.8, + "logps/chosen": -284.9714762369792, + "logps/rejected": -521.92939453125, + "loss": 0.0086, + "rewards/chosen": 4.185222625732422, + "rewards/margins": 13.773487091064453, + "rewards/rejected": -9.588264465332031, + "step": 10071 + }, + { + "epoch": 0.9202375513933303, + "grad_norm": 38.5, + "kl": 0.0, + "learning_rate": 1.5708419435684463e-07, + "logits/chosen": 899678310.4, + "logits/rejected": 1159128832.0, + "logps/chosen": -311.32275390625, + "logps/rejected": -301.97544352213544, + "loss": 0.0562, + "rewards/chosen": 4.581941223144531, + "rewards/margins": 9.352082506815592, + "rewards/rejected": -4.7701412836710615, + "step": 10072 + }, + { + "epoch": 0.9203289173138419, + "grad_norm": 2.53125, + "kl": 0.0, + "learning_rate": 1.5672682831356756e-07, + "logits/chosen": 485503385.6, + "logits/rejected": 479143680.0, + "logps/chosen": -318.569677734375, + "logps/rejected": -431.162109375, + "loss": 0.0162, + "rewards/chosen": 4.012321472167969, + "rewards/margins": 13.174649556477865, + "rewards/rejected": -9.162328084309896, + "step": 10073 + }, + { + "epoch": 0.9204202832343535, + "grad_norm": 3.078125, + "kl": 0.0, + "learning_rate": 1.563698627641075e-07, + "logits/chosen": 430642585.6, + "logits/rejected": 343775253.3333333, + "logps/chosen": -401.35888671875, + "logps/rejected": -393.4739176432292, + "loss": 0.0169, + "rewards/chosen": 4.340219116210937, + "rewards/margins": 13.434505716959634, + "rewards/rejected": -9.094286600748697, + "step": 10074 + }, + { + "epoch": 0.9205116491548653, + "grad_norm": 3.8125, + "kl": 0.0, + "learning_rate": 1.5601329773798246e-07, + "logits/chosen": 828979456.0, + "logits/rejected": 577576960.0, + "logps/chosen": -308.46307373046875, + "logps/rejected": -382.4613952636719, + "loss": 0.0161, + "rewards/chosen": 4.193781852722168, + "rewards/margins": 13.127835273742676, + "rewards/rejected": -8.934053421020508, + "step": 10075 + }, + { + "epoch": 0.9206030150753769, + "grad_norm": 1.703125, + "kl": 0.0, + "learning_rate": 1.5565713326467835e-07, + "logits/chosen": 552099174.4, + "logits/rejected": 557657770.6666666, + "logps/chosen": -389.5685302734375, + "logps/rejected": -706.604736328125, + "loss": 0.0105, + "rewards/chosen": 4.45935287475586, + "rewards/margins": 13.204160435994467, + "rewards/rejected": -8.744807561238607, + "step": 10076 + }, + { + "epoch": 0.9206943809958885, + "grad_norm": 4.375, + "kl": 0.0, + "learning_rate": 1.5530136937364493e-07, + "logits/chosen": 838893824.0, + "logits/rejected": 704433365.3333334, + "logps/chosen": -310.09991455078125, + "logps/rejected": -414.6563313802083, + "loss": 0.022, + "rewards/chosen": 2.3764543533325195, + "rewards/margins": 11.23535950978597, + "rewards/rejected": -8.858905156453451, + "step": 10077 + }, + { + "epoch": 0.9207857469164001, + "grad_norm": 1.9453125, + "kl": 0.0, + "learning_rate": 1.5494600609430032e-07, + "logits/chosen": 334107545.6, + "logits/rejected": 636671744.0, + "logps/chosen": -379.5314453125, + "logps/rejected": -631.6561279296875, + "loss": 0.0133, + "rewards/chosen": 4.054503631591797, + "rewards/margins": 14.379946263631187, + "rewards/rejected": -10.325442632039389, + "step": 10078 + }, + { + "epoch": 0.9208771128369119, + "grad_norm": 4.0625, + "kl": 0.0, + "learning_rate": 1.545910434560316e-07, + "logits/chosen": 682312857.6, + "logits/rejected": 271220480.0, + "logps/chosen": -244.6435546875, + "logps/rejected": -404.8800862630208, + "loss": 0.0403, + "rewards/chosen": 3.1948007583618163, + "rewards/margins": 14.747552935282389, + "rewards/rejected": -11.552752176920572, + "step": 10079 + }, + { + "epoch": 0.9209684787574235, + "grad_norm": 2.859375, + "kl": 0.0, + "learning_rate": 1.5423648148818914e-07, + "logits/chosen": 553899776.0, + "logits/rejected": 928351158.8571428, + "logps/chosen": -321.93768310546875, + "logps/rejected": -421.6914760044643, + "loss": 0.0091, + "rewards/chosen": 2.594067335128784, + "rewards/margins": 11.549846819468907, + "rewards/rejected": -8.955779484340123, + "step": 10080 + }, + { + "epoch": 0.9210598446779351, + "grad_norm": 5.1875, + "kl": 0.0, + "learning_rate": 1.538823202200934e-07, + "logits/chosen": 544093696.0, + "logits/rejected": 495720064.0, + "logps/chosen": -323.40234375, + "logps/rejected": -576.7139485677084, + "loss": 0.0387, + "rewards/chosen": 2.8367389678955077, + "rewards/margins": 12.168415451049805, + "rewards/rejected": -9.331676483154297, + "step": 10081 + }, + { + "epoch": 0.9211512105984467, + "grad_norm": 1.6875, + "kl": 0.0, + "learning_rate": 1.5352855968102874e-07, + "logits/chosen": 560939776.0, + "logits/rejected": 381948006.4, + "logps/chosen": -191.5755411783854, + "logps/rejected": -454.058642578125, + "loss": 0.01, + "rewards/chosen": 3.7919336954752603, + "rewards/margins": 13.430201975504557, + "rewards/rejected": -9.638268280029298, + "step": 10082 + }, + { + "epoch": 0.9212425765189585, + "grad_norm": 2.703125, + "kl": 0.0, + "learning_rate": 1.5317519990024898e-07, + "logits/chosen": 665588787.2, + "logits/rejected": 584794965.3333334, + "logps/chosen": -253.836279296875, + "logps/rejected": -642.6853434244791, + "loss": 0.0158, + "rewards/chosen": 4.280452728271484, + "rewards/margins": 14.844909032185873, + "rewards/rejected": -10.564456303914389, + "step": 10083 + }, + { + "epoch": 0.9213339424394701, + "grad_norm": 2.15625, + "kl": 0.0, + "learning_rate": 1.528222409069735e-07, + "logits/chosen": 488401459.2, + "logits/rejected": 579633066.6666666, + "logps/chosen": -363.203369140625, + "logps/rejected": -304.491943359375, + "loss": 0.014, + "rewards/chosen": 3.9998828887939455, + "rewards/margins": 11.743713251749675, + "rewards/rejected": -7.7438303629557295, + "step": 10084 + }, + { + "epoch": 0.9214253083599817, + "grad_norm": 1.0625, + "kl": 0.0, + "learning_rate": 1.5246968273038844e-07, + "logits/chosen": 902367829.3333334, + "logits/rejected": 701425792.0, + "logps/chosen": -572.8689778645834, + "logps/rejected": -758.1422729492188, + "loss": 0.006, + "rewards/chosen": 5.0335954030354815, + "rewards/margins": 20.99500306447347, + "rewards/rejected": -15.961407661437988, + "step": 10085 + }, + { + "epoch": 0.9215166742804933, + "grad_norm": 2.484375, + "kl": 0.0, + "learning_rate": 1.5211752539964707e-07, + "logits/chosen": 576599360.0, + "logits/rejected": 487805056.0, + "logps/chosen": -409.4621276855469, + "logps/rejected": -605.47802734375, + "loss": 0.0193, + "rewards/chosen": 3.282111167907715, + "rewards/margins": 13.411958694458008, + "rewards/rejected": -10.129847526550293, + "step": 10086 + }, + { + "epoch": 0.9216080402010051, + "grad_norm": 0.56640625, + "kl": 0.0, + "learning_rate": 1.5176576894386953e-07, + "logits/chosen": 586934186.6666666, + "logits/rejected": 410520780.8, + "logps/chosen": -512.3787434895834, + "logps/rejected": -501.27587890625, + "loss": 0.0024, + "rewards/chosen": 5.207334518432617, + "rewards/margins": 15.71147117614746, + "rewards/rejected": -10.504136657714843, + "step": 10087 + }, + { + "epoch": 0.9216994061215167, + "grad_norm": 1.25, + "kl": 0.0, + "learning_rate": 1.514144133921436e-07, + "logits/chosen": 745852032.0, + "logits/rejected": 806494549.3333334, + "logps/chosen": -495.9447937011719, + "logps/rejected": -444.7867431640625, + "loss": 0.005, + "rewards/chosen": 4.191998481750488, + "rewards/margins": 12.560996691385904, + "rewards/rejected": -8.368998209635416, + "step": 10088 + }, + { + "epoch": 0.9217907720420283, + "grad_norm": 1.625, + "kl": 0.0, + "learning_rate": 1.510634587735227e-07, + "logits/chosen": 453539737.6, + "logits/rejected": 586013141.3333334, + "logps/chosen": -270.075, + "logps/rejected": -513.5384928385416, + "loss": 0.0131, + "rewards/chosen": 4.285205459594726, + "rewards/margins": 16.49816983540853, + "rewards/rejected": -12.212964375813803, + "step": 10089 + }, + { + "epoch": 0.9218821379625399, + "grad_norm": 2.859375, + "kl": 0.0, + "learning_rate": 1.5071290511702695e-07, + "logits/chosen": 433344960.0, + "logits/rejected": 468139200.0, + "logps/chosen": -373.7886047363281, + "logps/rejected": -355.8211669921875, + "loss": 0.0162, + "rewards/chosen": 4.013581275939941, + "rewards/margins": 12.048882484436035, + "rewards/rejected": -8.035301208496094, + "step": 10090 + }, + { + "epoch": 0.9219735038830517, + "grad_norm": 2.453125, + "kl": 0.0, + "learning_rate": 1.5036275245164377e-07, + "logits/chosen": 546790848.0, + "logits/rejected": 544299200.0, + "logps/chosen": -330.7665710449219, + "logps/rejected": -618.640625, + "loss": 0.0143, + "rewards/chosen": 3.898921251296997, + "rewards/margins": 13.320681810379028, + "rewards/rejected": -9.421760559082031, + "step": 10091 + }, + { + "epoch": 0.9220648698035633, + "grad_norm": 2.8125, + "kl": 0.0, + "learning_rate": 1.5001300080632887e-07, + "logits/chosen": 468367872.0, + "logits/rejected": 721455530.6666666, + "logps/chosen": -346.110302734375, + "logps/rejected": -250.0341593424479, + "loss": 0.0169, + "rewards/chosen": 4.121955108642578, + "rewards/margins": 11.822189203898112, + "rewards/rejected": -7.700234095255534, + "step": 10092 + }, + { + "epoch": 0.9221562357240749, + "grad_norm": 1.4453125, + "kl": 0.0, + "learning_rate": 1.496636502100024e-07, + "logits/chosen": 554158336.0, + "logits/rejected": 346989568.0, + "logps/chosen": -442.271484375, + "logps/rejected": -473.1478515625, + "loss": 0.0079, + "rewards/chosen": 3.948509852091471, + "rewards/margins": 14.96324373881022, + "rewards/rejected": -11.01473388671875, + "step": 10093 + }, + { + "epoch": 0.9222476016445865, + "grad_norm": 21.75, + "kl": 0.0, + "learning_rate": 1.4931470069155296e-07, + "logits/chosen": 330486732.8, + "logits/rejected": 437115306.6666667, + "logps/chosen": -292.445703125, + "logps/rejected": -648.5955403645834, + "loss": 0.0278, + "rewards/chosen": 5.272257232666016, + "rewards/margins": 13.013426844278971, + "rewards/rejected": -7.741169611612956, + "step": 10094 + }, + { + "epoch": 0.9223389675650983, + "grad_norm": 2.53125, + "kl": 0.0, + "learning_rate": 1.4896615227983468e-07, + "logits/chosen": 962065510.4, + "logits/rejected": 476978474.6666667, + "logps/chosen": -195.749755859375, + "logps/rejected": -397.3983968098958, + "loss": 0.0148, + "rewards/chosen": 4.266005325317383, + "rewards/margins": 13.82495511372884, + "rewards/rejected": -9.558949788411459, + "step": 10095 + }, + { + "epoch": 0.9224303334856099, + "grad_norm": 42.5, + "kl": 0.0, + "learning_rate": 1.4861800500367007e-07, + "logits/chosen": 510461440.0, + "logits/rejected": 321167018.6666667, + "logps/chosen": -204.17655029296876, + "logps/rejected": -613.16943359375, + "loss": 0.0779, + "rewards/chosen": 2.7811019897460936, + "rewards/margins": 17.203594970703126, + "rewards/rejected": -14.422492980957031, + "step": 10096 + }, + { + "epoch": 0.9225216994061215, + "grad_norm": 14.375, + "kl": 0.0, + "learning_rate": 1.482702588918472e-07, + "logits/chosen": 616367744.0, + "logits/rejected": 503090858.6666667, + "logps/chosen": -229.57244873046875, + "logps/rejected": -342.3762613932292, + "loss": 0.0434, + "rewards/chosen": 2.835911750793457, + "rewards/margins": 11.163716316223145, + "rewards/rejected": -8.327804565429688, + "step": 10097 + }, + { + "epoch": 0.9226130653266331, + "grad_norm": 1.296875, + "kl": 0.0, + "learning_rate": 1.4792291397312198e-07, + "logits/chosen": 475154816.0, + "logits/rejected": 788835498.6666666, + "logps/chosen": -226.25173950195312, + "logps/rejected": -637.44189453125, + "loss": 0.008, + "rewards/chosen": 3.4669618606567383, + "rewards/margins": 12.785480181376139, + "rewards/rejected": -9.3185183207194, + "step": 10098 + }, + { + "epoch": 0.9227044312471449, + "grad_norm": 1.671875, + "kl": 0.0, + "learning_rate": 1.475759702762164e-07, + "logits/chosen": 383386080.0, + "logits/rejected": 686609621.3333334, + "logps/chosen": -145.38717651367188, + "logps/rejected": -587.137451171875, + "loss": 0.0099, + "rewards/chosen": 3.2085347175598145, + "rewards/margins": 14.598668893178305, + "rewards/rejected": -11.39013417561849, + "step": 10099 + }, + { + "epoch": 0.9227957971676565, + "grad_norm": 3.203125, + "kl": 0.0, + "learning_rate": 1.4722942782981863e-07, + "logits/chosen": 765121024.0, + "logits/rejected": 1099500288.0, + "logps/chosen": -284.45261928013394, + "logps/rejected": -794.6717529296875, + "loss": 0.0202, + "rewards/chosen": 4.598730359758649, + "rewards/margins": 17.370404515947612, + "rewards/rejected": -12.771674156188965, + "step": 10100 + }, + { + "epoch": 0.9228871630881681, + "grad_norm": 1.3359375, + "kl": 0.0, + "learning_rate": 1.468832866625858e-07, + "logits/chosen": 607487872.0, + "logits/rejected": 847201450.6666666, + "logps/chosen": -437.7821960449219, + "logps/rejected": -481.6626790364583, + "loss": 0.0056, + "rewards/chosen": 3.9651641845703125, + "rewards/margins": 13.478034337361654, + "rewards/rejected": -9.512870152791342, + "step": 10101 + }, + { + "epoch": 0.9229785290086797, + "grad_norm": 2.671875, + "kl": 0.0, + "learning_rate": 1.4653754680314047e-07, + "logits/chosen": 620928000.0, + "logits/rejected": 572688896.0, + "logps/chosen": -485.0347595214844, + "logps/rejected": -652.8654174804688, + "loss": 0.0144, + "rewards/chosen": 3.7309982776641846, + "rewards/margins": 14.336275339126587, + "rewards/rejected": -10.605277061462402, + "step": 10102 + }, + { + "epoch": 0.9230698949291914, + "grad_norm": 1.1796875, + "kl": 0.0, + "learning_rate": 1.4619220828007097e-07, + "logits/chosen": 907890048.0, + "logits/rejected": 635552256.0, + "logps/chosen": -426.50274658203125, + "logps/rejected": -420.1252848307292, + "loss": 0.0047, + "rewards/chosen": 4.203010559082031, + "rewards/margins": 11.638984044392902, + "rewards/rejected": -7.435973485310872, + "step": 10103 + }, + { + "epoch": 0.9231612608497031, + "grad_norm": 1.3203125, + "kl": 0.0, + "learning_rate": 1.4584727112193497e-07, + "logits/chosen": 852180160.0, + "logits/rejected": 767244672.0, + "logps/chosen": -355.28985595703125, + "logps/rejected": -795.0654907226562, + "loss": 0.0063, + "rewards/chosen": 4.991237640380859, + "rewards/margins": 17.411977767944336, + "rewards/rejected": -12.420740127563477, + "step": 10104 + }, + { + "epoch": 0.9232526267702147, + "grad_norm": 3.859375, + "kl": 0.0, + "learning_rate": 1.455027353572541e-07, + "logits/chosen": 550064640.0, + "logits/rejected": 482770176.0, + "logps/chosen": -595.0823974609375, + "logps/rejected": -564.1055501302084, + "loss": 0.0126, + "rewards/chosen": 2.943875312805176, + "rewards/margins": 12.689585049947103, + "rewards/rejected": -9.745709737141928, + "step": 10105 + }, + { + "epoch": 0.9233439926907263, + "grad_norm": 2.84375, + "kl": 0.0, + "learning_rate": 1.4515860101452007e-07, + "logits/chosen": 760141952.0, + "logits/rejected": 746763776.0, + "logps/chosen": -342.7874450683594, + "logps/rejected": -451.157470703125, + "loss": 0.0055, + "rewards/chosen": 4.478154182434082, + "rewards/margins": 12.771342277526855, + "rewards/rejected": -8.293188095092773, + "step": 10106 + }, + { + "epoch": 0.923435358611238, + "grad_norm": 16.875, + "kl": 9.383031845092773, + "learning_rate": 1.4481486812218783e-07, + "logits/chosen": 464788626.28571427, + "logits/rejected": 672990208.0, + "logps/chosen": -270.6718226841518, + "logps/rejected": -828.4908447265625, + "loss": 0.15, + "rewards/chosen": 3.4270351954868863, + "rewards/margins": 11.269771916525706, + "rewards/rejected": -7.842736721038818, + "step": 10107 + }, + { + "epoch": 0.9235267245317497, + "grad_norm": 3.578125, + "kl": 0.0, + "learning_rate": 1.4447153670868185e-07, + "logits/chosen": 607140096.0, + "logits/rejected": 561809956.5714285, + "logps/chosen": -321.8966979980469, + "logps/rejected": -417.825927734375, + "loss": 0.1035, + "rewards/chosen": 3.228402853012085, + "rewards/margins": 11.193375757762364, + "rewards/rejected": -7.964972904750279, + "step": 10108 + }, + { + "epoch": 0.9236180904522613, + "grad_norm": 2.765625, + "kl": 0.0, + "learning_rate": 1.4412860680239228e-07, + "logits/chosen": 303436906.6666667, + "logits/rejected": 522632128.0, + "logps/chosen": -331.0694580078125, + "logps/rejected": -783.1468505859375, + "loss": 0.0164, + "rewards/chosen": 4.639315923055013, + "rewards/margins": 15.323128064473469, + "rewards/rejected": -10.683812141418457, + "step": 10109 + }, + { + "epoch": 0.9237094563727729, + "grad_norm": 1.3125, + "kl": 0.0, + "learning_rate": 1.4378607843167635e-07, + "logits/chosen": 463011360.0, + "logits/rejected": 391314592.0, + "logps/chosen": -402.32098388671875, + "logps/rejected": -599.52490234375, + "loss": 0.0096, + "rewards/chosen": 4.088758945465088, + "rewards/margins": 16.032183170318604, + "rewards/rejected": -11.943424224853516, + "step": 10110 + }, + { + "epoch": 0.9238008222932846, + "grad_norm": 15.8125, + "kl": 0.0, + "learning_rate": 1.4344395162485814e-07, + "logits/chosen": 756165760.0, + "logits/rejected": 1269517056.0, + "logps/chosen": -313.5408020019531, + "logps/rejected": -456.5102844238281, + "loss": 0.0321, + "rewards/chosen": 3.5376405715942383, + "rewards/margins": 11.623022079467773, + "rewards/rejected": -8.085381507873535, + "step": 10111 + }, + { + "epoch": 0.9238921882137963, + "grad_norm": 1.625, + "kl": 0.0, + "learning_rate": 1.4310222641022775e-07, + "logits/chosen": 1050072405.3333334, + "logits/rejected": 550560204.8, + "logps/chosen": -321.1605631510417, + "logps/rejected": -401.24501953125, + "loss": 0.0052, + "rewards/chosen": 4.7108503977457685, + "rewards/margins": 14.375668970743817, + "rewards/rejected": -9.664818572998048, + "step": 10112 + }, + { + "epoch": 0.9239835541343079, + "grad_norm": 1.84375, + "kl": 0.0, + "learning_rate": 1.4276090281604317e-07, + "logits/chosen": 387666144.0, + "logits/rejected": 1291560448.0, + "logps/chosen": -206.98638916015625, + "logps/rejected": -618.11083984375, + "loss": 0.0117, + "rewards/chosen": 4.330785751342773, + "rewards/margins": 14.058221817016602, + "rewards/rejected": -9.727436065673828, + "step": 10113 + }, + { + "epoch": 0.9240749200548195, + "grad_norm": 2.46875, + "kl": 0.0, + "learning_rate": 1.42419980870529e-07, + "logits/chosen": 477461792.0, + "logits/rejected": 638066304.0, + "logps/chosen": -287.19415283203125, + "logps/rejected": -528.4645589192709, + "loss": 0.0108, + "rewards/chosen": 3.356343984603882, + "rewards/margins": 11.636495987574259, + "rewards/rejected": -8.280152002970377, + "step": 10114 + }, + { + "epoch": 0.9241662859753312, + "grad_norm": 5.4375, + "kl": 0.0, + "learning_rate": 1.420794606018755e-07, + "logits/chosen": 528456277.3333333, + "logits/rejected": 445011264.0, + "logps/chosen": -284.6263834635417, + "logps/rejected": -578.92724609375, + "loss": 0.0497, + "rewards/chosen": 3.1853176752726235, + "rewards/margins": 12.058761278788248, + "rewards/rejected": -8.873443603515625, + "step": 10115 + }, + { + "epoch": 0.9242576518958429, + "grad_norm": 2.671875, + "kl": 0.0, + "learning_rate": 1.417393420382407e-07, + "logits/chosen": 411184256.0, + "logits/rejected": 570485162.6666666, + "logps/chosen": -118.26283264160156, + "logps/rejected": -530.805908203125, + "loss": 0.0124, + "rewards/chosen": 3.5853686332702637, + "rewards/margins": 12.497705618540445, + "rewards/rejected": -8.912336985270182, + "step": 10116 + }, + { + "epoch": 0.9243490178163545, + "grad_norm": 2.453125, + "kl": 0.0, + "learning_rate": 1.4139962520775042e-07, + "logits/chosen": 456600480.0, + "logits/rejected": 736377216.0, + "logps/chosen": -285.26348876953125, + "logps/rejected": -597.3141479492188, + "loss": 0.0244, + "rewards/chosen": 3.5349793434143066, + "rewards/margins": 15.215653896331787, + "rewards/rejected": -11.68067455291748, + "step": 10117 + }, + { + "epoch": 0.9244403837368661, + "grad_norm": 1.1171875, + "kl": 0.0, + "learning_rate": 1.4106031013849498e-07, + "logits/chosen": 425619029.3333333, + "logits/rejected": 579405824.0, + "logps/chosen": -439.6309000651042, + "logps/rejected": -900.7294921875, + "loss": 0.0062, + "rewards/chosen": 5.308681805928548, + "rewards/margins": 16.017380078633625, + "rewards/rejected": -10.708698272705078, + "step": 10118 + }, + { + "epoch": 0.9245317496573778, + "grad_norm": 3.625, + "kl": 0.0, + "learning_rate": 1.4072139685853247e-07, + "logits/chosen": 448300032.0, + "logits/rejected": 517726080.0, + "logps/chosen": -292.5789794921875, + "logps/rejected": -488.8233642578125, + "loss": 0.0195, + "rewards/chosen": 3.760270357131958, + "rewards/margins": 11.887046098709106, + "rewards/rejected": -8.126775741577148, + "step": 10119 + }, + { + "epoch": 0.9246231155778895, + "grad_norm": 0.18359375, + "kl": 0.0, + "learning_rate": 1.403828853958883e-07, + "logits/chosen": 425206208.0, + "logits/rejected": 406028288.0, + "logps/chosen": -410.09625244140625, + "logps/rejected": -508.3084309895833, + "loss": 0.0007, + "rewards/chosen": 6.190741062164307, + "rewards/margins": 15.976906935373941, + "rewards/rejected": -9.786165873209635, + "step": 10120 + }, + { + "epoch": 0.9247144814984011, + "grad_norm": 1.375, + "kl": 0.0, + "learning_rate": 1.4004477577855392e-07, + "logits/chosen": 600797184.0, + "logits/rejected": 450759744.0, + "logps/chosen": -341.4203796386719, + "logps/rejected": -283.66363525390625, + "loss": 0.0085, + "rewards/chosen": 4.398345470428467, + "rewards/margins": 14.127362728118896, + "rewards/rejected": -9.72901725769043, + "step": 10121 + }, + { + "epoch": 0.9248058474189127, + "grad_norm": 2.171875, + "kl": 0.0, + "learning_rate": 1.3970706803448864e-07, + "logits/chosen": 660475520.0, + "logits/rejected": 449598432.0, + "logps/chosen": -366.16595458984375, + "logps/rejected": -354.2386474609375, + "loss": 0.0123, + "rewards/chosen": 4.1351728439331055, + "rewards/margins": 13.200835227966309, + "rewards/rejected": -9.065662384033203, + "step": 10122 + }, + { + "epoch": 0.9248972133394244, + "grad_norm": 4.4375, + "kl": 0.0, + "learning_rate": 1.393697621916168e-07, + "logits/chosen": 678506956.8, + "logits/rejected": 510647637.3333333, + "logps/chosen": -441.26435546875, + "logps/rejected": -459.3515625, + "loss": 0.0316, + "rewards/chosen": 3.279624176025391, + "rewards/margins": 13.584667078653972, + "rewards/rejected": -10.30504290262858, + "step": 10123 + }, + { + "epoch": 0.924988579259936, + "grad_norm": 10.5, + "kl": 0.0, + "learning_rate": 1.3903285827783108e-07, + "logits/chosen": 620816793.6, + "logits/rejected": 448387669.3333333, + "logps/chosen": -322.581787109375, + "logps/rejected": -268.5532633463542, + "loss": 0.148, + "rewards/chosen": 2.42403564453125, + "rewards/margins": 10.17967249552409, + "rewards/rejected": -7.755636850992839, + "step": 10124 + }, + { + "epoch": 0.9250799451804477, + "grad_norm": 3.828125, + "kl": 0.0, + "learning_rate": 1.3869635632098976e-07, + "logits/chosen": 547711180.8, + "logits/rejected": 547629568.0, + "logps/chosen": -361.903662109375, + "logps/rejected": -471.828857421875, + "loss": 0.0249, + "rewards/chosen": 3.4831443786621095, + "rewards/margins": 14.052461242675781, + "rewards/rejected": -10.569316864013672, + "step": 10125 + }, + { + "epoch": 0.9251713111009593, + "grad_norm": 27.5, + "kl": 0.0, + "learning_rate": 1.3836025634891948e-07, + "logits/chosen": 1080214400.0, + "logits/rejected": 1437673728.0, + "logps/chosen": -269.6430969238281, + "logps/rejected": -445.0083312988281, + "loss": 0.3015, + "rewards/chosen": 1.1137751340866089, + "rewards/margins": 8.056410193443298, + "rewards/rejected": -6.9426350593566895, + "step": 10126 + }, + { + "epoch": 0.925262677021471, + "grad_norm": 6.21875, + "kl": 0.0, + "learning_rate": 1.380245583894113e-07, + "logits/chosen": 264872480.0, + "logits/rejected": 497392288.0, + "logps/chosen": -193.26705932617188, + "logps/rejected": -715.739501953125, + "loss": 0.0313, + "rewards/chosen": 3.6209335327148438, + "rewards/margins": 14.399528503417969, + "rewards/rejected": -10.778594970703125, + "step": 10127 + }, + { + "epoch": 0.9253540429419826, + "grad_norm": 2.03125, + "kl": 0.0, + "learning_rate": 1.376892624702253e-07, + "logits/chosen": 358732576.0, + "logits/rejected": 262044144.0, + "logps/chosen": -174.69229125976562, + "logps/rejected": -492.2649841308594, + "loss": 0.018, + "rewards/chosen": 3.320420503616333, + "rewards/margins": 14.809026002883911, + "rewards/rejected": -11.488605499267578, + "step": 10128 + }, + { + "epoch": 0.9254454088624943, + "grad_norm": 3.890625, + "kl": 0.0, + "learning_rate": 1.3735436861908702e-07, + "logits/chosen": 562187072.0, + "logits/rejected": 567908608.0, + "logps/chosen": -175.01373291015625, + "logps/rejected": -685.13720703125, + "loss": 0.0372, + "rewards/chosen": 2.5812010765075684, + "rewards/margins": 14.25559663772583, + "rewards/rejected": -11.674395561218262, + "step": 10129 + }, + { + "epoch": 0.9255367747830059, + "grad_norm": 2.4375, + "kl": 0.0, + "learning_rate": 1.3701987686368934e-07, + "logits/chosen": 809576320.0, + "logits/rejected": 476921258.6666667, + "logps/chosen": -265.29156494140625, + "logps/rejected": -433.3748372395833, + "loss": 0.013, + "rewards/chosen": 2.966169834136963, + "rewards/margins": 12.16057825088501, + "rewards/rejected": -9.194408416748047, + "step": 10130 + }, + { + "epoch": 0.9256281407035176, + "grad_norm": 1.453125, + "kl": 0.0, + "learning_rate": 1.3668578723169067e-07, + "logits/chosen": 466682528.0, + "logits/rejected": 349251456.0, + "logps/chosen": -275.8409118652344, + "logps/rejected": -446.6582438151042, + "loss": 0.007, + "rewards/chosen": 3.5741934776306152, + "rewards/margins": 13.221449693044027, + "rewards/rejected": -9.647256215413412, + "step": 10131 + }, + { + "epoch": 0.9257195066240292, + "grad_norm": 1.8125, + "kl": 0.0, + "learning_rate": 1.3635209975071838e-07, + "logits/chosen": 385300160.0, + "logits/rejected": 430300672.0, + "logps/chosen": -231.6673583984375, + "logps/rejected": -883.63818359375, + "loss": 0.0161, + "rewards/chosen": 3.676650047302246, + "rewards/margins": 17.63796329498291, + "rewards/rejected": -13.961313247680664, + "step": 10132 + }, + { + "epoch": 0.9258108725445409, + "grad_norm": 1.6484375, + "kl": 0.0, + "learning_rate": 1.3601881444836374e-07, + "logits/chosen": 441392320.0, + "logits/rejected": 614741418.6666666, + "logps/chosen": -336.6786804199219, + "logps/rejected": -410.6790364583333, + "loss": 0.0069, + "rewards/chosen": 4.193310737609863, + "rewards/margins": 12.138696988423664, + "rewards/rejected": -7.945386250813802, + "step": 10133 + }, + { + "epoch": 0.9259022384650525, + "grad_norm": 0.099609375, + "kl": 0.0, + "learning_rate": 1.35685931352188e-07, + "logits/rejected": 433621440.0, + "logps/rejected": -434.41729736328125, + "loss": 0.0006, + "rewards/rejected": -8.039780616760254, + "step": 10134 + }, + { + "epoch": 0.9259936043855642, + "grad_norm": 2.234375, + "kl": 0.0, + "learning_rate": 1.3535345048971694e-07, + "logits/chosen": 700080179.2, + "logits/rejected": 623042133.3333334, + "logps/chosen": -261.587158203125, + "logps/rejected": -480.3603515625, + "loss": 0.0184, + "rewards/chosen": 3.683690643310547, + "rewards/margins": 12.042376454671224, + "rewards/rejected": -8.358685811360678, + "step": 10135 + }, + { + "epoch": 0.9260849703060758, + "grad_norm": 2.359375, + "kl": 0.0, + "learning_rate": 1.3502137188844357e-07, + "logits/chosen": 641426944.0, + "logits/rejected": 869265856.0, + "logps/chosen": -345.62109375, + "logps/rejected": -437.0975341796875, + "loss": 0.0161, + "rewards/chosen": 3.461395263671875, + "rewards/margins": 13.783438682556152, + "rewards/rejected": -10.322043418884277, + "step": 10136 + }, + { + "epoch": 0.9261763362265875, + "grad_norm": 3.421875, + "kl": 0.0, + "learning_rate": 1.3468969557582756e-07, + "logits/chosen": 589468057.6, + "logits/rejected": 622856746.6666666, + "logps/chosen": -448.65390625, + "logps/rejected": -503.544189453125, + "loss": 0.0248, + "rewards/chosen": 3.2674976348876954, + "rewards/margins": 12.805804824829101, + "rewards/rejected": -9.538307189941406, + "step": 10137 + }, + { + "epoch": 0.9262677021470991, + "grad_norm": 6.65625, + "kl": 0.0, + "learning_rate": 1.3435842157929534e-07, + "logits/chosen": 578748672.0, + "logits/rejected": 902227370.6666666, + "logps/chosen": -343.3651123046875, + "logps/rejected": -513.4207763671875, + "loss": 0.1026, + "rewards/chosen": 4.643026828765869, + "rewards/margins": 12.773303826649984, + "rewards/rejected": -8.130276997884115, + "step": 10138 + }, + { + "epoch": 0.9263590680676108, + "grad_norm": 2.609375, + "kl": 0.0, + "learning_rate": 1.340275499262411e-07, + "logits/chosen": 641918933.3333334, + "logits/rejected": 657238272.0, + "logps/chosen": -312.1140543619792, + "logps/rejected": -553.88740234375, + "loss": 0.0146, + "rewards/chosen": 3.398202578226725, + "rewards/margins": 12.596090761820475, + "rewards/rejected": -9.19788818359375, + "step": 10139 + }, + { + "epoch": 0.9264504339881224, + "grad_norm": 0.8984375, + "kl": 0.0, + "learning_rate": 1.336970806440241e-07, + "logits/chosen": 376768192.0, + "logits/rejected": 768483840.0, + "logps/chosen": -218.30184936523438, + "logps/rejected": -581.5312151227679, + "loss": 0.004, + "rewards/chosen": 3.492907762527466, + "rewards/margins": 12.351028272083827, + "rewards/rejected": -8.858120509556361, + "step": 10140 + }, + { + "epoch": 0.9265417999086341, + "grad_norm": 1.7734375, + "kl": 0.0, + "learning_rate": 1.333670137599713e-07, + "logits/chosen": 413686016.0, + "logits/rejected": 524179840.0, + "logps/chosen": -310.07208251953125, + "logps/rejected": -582.470458984375, + "loss": 0.0123, + "rewards/chosen": 3.8820197582244873, + "rewards/margins": 14.306221723556519, + "rewards/rejected": -10.424201965332031, + "step": 10141 + }, + { + "epoch": 0.9266331658291457, + "grad_norm": 0.060302734375, + "kl": 0.0, + "learning_rate": 1.3303734930137535e-07, + "logits/chosen": 252297584.0, + "logits/rejected": 479644233.14285713, + "logps/chosen": -373.79168701171875, + "logps/rejected": -544.0180315290179, + "loss": 0.0003, + "rewards/chosen": 6.5430908203125, + "rewards/margins": 16.728195190429688, + "rewards/rejected": -10.185104370117188, + "step": 10142 + }, + { + "epoch": 0.9267245317496574, + "grad_norm": 3.734375, + "kl": 0.0, + "learning_rate": 1.327080872954978e-07, + "logits/chosen": 478202880.0, + "logits/rejected": 779520128.0, + "logps/chosen": -347.81005859375, + "logps/rejected": -947.5093994140625, + "loss": 0.0282, + "rewards/chosen": 3.9128101893833707, + "rewards/margins": 13.705003602164133, + "rewards/rejected": -9.792193412780762, + "step": 10143 + }, + { + "epoch": 0.926815897670169, + "grad_norm": 1.34375, + "kl": 0.0, + "learning_rate": 1.3237922776956514e-07, + "logits/chosen": 522195488.0, + "logits/rejected": 472047744.0, + "logps/chosen": -379.41766357421875, + "logps/rejected": -482.344482421875, + "loss": 0.0058, + "rewards/chosen": 4.906486511230469, + "rewards/margins": 13.889699935913086, + "rewards/rejected": -8.983213424682617, + "step": 10144 + }, + { + "epoch": 0.9269072635906807, + "grad_norm": 2.359375, + "kl": 0.0, + "learning_rate": 1.3205077075077065e-07, + "logits/chosen": 446051328.0, + "logits/rejected": 771848320.0, + "logps/chosen": -301.0402018229167, + "logps/rejected": -388.88671875, + "loss": 0.0195, + "rewards/chosen": 4.040205637613933, + "rewards/margins": 14.0027281443278, + "rewards/rejected": -9.962522506713867, + "step": 10145 + }, + { + "epoch": 0.9269986295111923, + "grad_norm": 7.28125, + "kl": 0.0, + "learning_rate": 1.3172271626627486e-07, + "logits/chosen": 550555392.0, + "logits/rejected": 729483776.0, + "logps/chosen": -260.51055908203125, + "logps/rejected": -350.0838317871094, + "loss": 0.0256, + "rewards/chosen": 4.7367095947265625, + "rewards/margins": 12.847613334655762, + "rewards/rejected": -8.1109037399292, + "step": 10146 + }, + { + "epoch": 0.927089995431704, + "grad_norm": 0.5546875, + "kl": 0.0, + "learning_rate": 1.3139506434320493e-07, + "logits/chosen": 921570176.0, + "logits/rejected": 987337472.0, + "logps/chosen": -485.98907470703125, + "logps/rejected": -695.6644694010416, + "loss": 0.0025, + "rewards/chosen": 4.668362617492676, + "rewards/margins": 15.230103492736816, + "rewards/rejected": -10.56174087524414, + "step": 10147 + }, + { + "epoch": 0.9271813613522156, + "grad_norm": 3.359375, + "kl": 0.0, + "learning_rate": 1.3106781500865417e-07, + "logits/chosen": 798505420.8, + "logits/rejected": 298292800.0, + "logps/chosen": -443.64609375, + "logps/rejected": -409.215576171875, + "loss": 0.0224, + "rewards/chosen": 3.7200672149658205, + "rewards/margins": 10.90172945658366, + "rewards/rejected": -7.181662241617839, + "step": 10148 + }, + { + "epoch": 0.9272727272727272, + "grad_norm": 3.265625, + "kl": 0.0, + "learning_rate": 1.307409682896843e-07, + "logits/chosen": 266799360.0, + "logits/rejected": 182505056.0, + "logps/chosen": -301.5654296875, + "logps/rejected": -308.4959716796875, + "loss": 0.0203, + "rewards/chosen": 3.450129985809326, + "rewards/margins": 12.682693004608154, + "rewards/rejected": -9.232563018798828, + "step": 10149 + }, + { + "epoch": 0.9273640931932389, + "grad_norm": 2.734375, + "kl": 0.0, + "learning_rate": 1.304145242133209e-07, + "logits/chosen": 957829696.0, + "logits/rejected": 621654912.0, + "logps/chosen": -288.87371826171875, + "logps/rejected": -425.046142578125, + "loss": 0.0192, + "rewards/chosen": 3.3789210319519043, + "rewards/margins": 14.91957712173462, + "rewards/rejected": -11.540656089782715, + "step": 10150 + }, + { + "epoch": 0.9274554591137506, + "grad_norm": 1.1640625, + "kl": 0.0, + "learning_rate": 1.300884828065585e-07, + "logits/chosen": 496965632.0, + "logits/rejected": 537081446.4, + "logps/chosen": -309.7127278645833, + "logps/rejected": -475.8673828125, + "loss": 0.0064, + "rewards/chosen": 4.431800842285156, + "rewards/margins": 14.183634948730468, + "rewards/rejected": -9.751834106445312, + "step": 10151 + }, + { + "epoch": 0.9275468250342622, + "grad_norm": 2.609375, + "kl": 0.0, + "learning_rate": 1.2976284409635832e-07, + "logits/chosen": 766130176.0, + "logits/rejected": 521713280.0, + "logps/chosen": -398.3416442871094, + "logps/rejected": -651.323486328125, + "loss": 0.0165, + "rewards/chosen": 3.5025155544281006, + "rewards/margins": 14.102451086044312, + "rewards/rejected": -10.599935531616211, + "step": 10152 + }, + { + "epoch": 0.9276381909547738, + "grad_norm": 1.9765625, + "kl": 0.0, + "learning_rate": 1.2943760810964712e-07, + "logits/chosen": 493164864.0, + "logits/rejected": 634874944.0, + "logps/chosen": -375.9163818359375, + "logps/rejected": -289.3674011230469, + "loss": 0.0122, + "rewards/chosen": 4.091076374053955, + "rewards/margins": 11.246082782745361, + "rewards/rejected": -7.155006408691406, + "step": 10153 + }, + { + "epoch": 0.9277295568752855, + "grad_norm": 1.6875, + "kl": 0.0, + "learning_rate": 1.2911277487331897e-07, + "logits/chosen": 593883562.6666666, + "logits/rejected": 613067980.8, + "logps/chosen": -430.7427571614583, + "logps/rejected": -592.7228515625, + "loss": 0.0078, + "rewards/chosen": 4.519998550415039, + "rewards/margins": 14.750371170043945, + "rewards/rejected": -10.230372619628906, + "step": 10154 + }, + { + "epoch": 0.9278209227957972, + "grad_norm": 5.125, + "kl": 0.0, + "learning_rate": 1.2878834441423406e-07, + "logits/chosen": 246695573.33333334, + "logits/rejected": 389134848.0, + "logps/chosen": -182.2392781575521, + "logps/rejected": -370.136328125, + "loss": 0.1105, + "rewards/chosen": 4.238910675048828, + "rewards/margins": 11.052136993408203, + "rewards/rejected": -6.813226318359375, + "step": 10155 + }, + { + "epoch": 0.9279122887163088, + "grad_norm": 26.25, + "kl": 0.0, + "learning_rate": 1.284643167592209e-07, + "logits/chosen": 431526485.3333333, + "logits/rejected": 899561472.0, + "logps/chosen": -264.82427978515625, + "logps/rejected": -354.2793212890625, + "loss": 0.101, + "rewards/chosen": 3.855353037516276, + "rewards/margins": 10.504077402750651, + "rewards/rejected": -6.648724365234375, + "step": 10156 + }, + { + "epoch": 0.9280036546368204, + "grad_norm": 2.515625, + "kl": 0.0, + "learning_rate": 1.281406919350736e-07, + "logits/chosen": 804598101.3333334, + "logits/rejected": 1077410816.0, + "logps/chosen": -284.1642659505208, + "logps/rejected": -599.50576171875, + "loss": 0.0133, + "rewards/chosen": 3.7203292846679688, + "rewards/margins": 13.622653198242187, + "rewards/rejected": -9.902323913574218, + "step": 10157 + }, + { + "epoch": 0.9280950205573321, + "grad_norm": 2.421875, + "kl": 0.0, + "learning_rate": 1.2781746996855083e-07, + "logits/chosen": 636307114.6666666, + "logits/rejected": 530929049.6, + "logps/chosen": -394.3998209635417, + "logps/rejected": -612.5927734375, + "loss": 0.1252, + "rewards/chosen": 1.563015302022298, + "rewards/margins": 12.785576947530112, + "rewards/rejected": -11.222561645507813, + "step": 10158 + }, + { + "epoch": 0.9281863864778438, + "grad_norm": 1.8671875, + "kl": 0.0, + "learning_rate": 1.2749465088638225e-07, + "logits/chosen": 429769504.0, + "logits/rejected": 346868544.0, + "logps/chosen": -296.7663269042969, + "logps/rejected": -350.1900329589844, + "loss": 0.0148, + "rewards/chosen": 3.852057933807373, + "rewards/margins": 13.005744457244873, + "rewards/rejected": -9.1536865234375, + "step": 10159 + }, + { + "epoch": 0.9282777523983554, + "grad_norm": 3.359375, + "kl": 0.0, + "learning_rate": 1.271722347152615e-07, + "logits/chosen": 869375744.0, + "logits/rejected": 1121329920.0, + "logps/chosen": -216.77872721354166, + "logps/rejected": -554.748291015625, + "loss": 0.0274, + "rewards/chosen": 3.442214330037435, + "rewards/margins": 13.505818684895834, + "rewards/rejected": -10.063604354858398, + "step": 10160 + }, + { + "epoch": 0.928369118318867, + "grad_norm": 3.09375, + "kl": 0.0, + "learning_rate": 1.26850221481849e-07, + "logits/chosen": 640566101.3333334, + "logits/rejected": 419676364.8, + "logps/chosen": -406.285400390625, + "logps/rejected": -879.4580078125, + "loss": 0.0202, + "rewards/chosen": 3.05582332611084, + "rewards/margins": 12.292349815368652, + "rewards/rejected": -9.236526489257812, + "step": 10161 + }, + { + "epoch": 0.9284604842393787, + "grad_norm": 2.4375, + "kl": 0.0, + "learning_rate": 1.2652861121277227e-07, + "logits/chosen": 658135744.0, + "logits/rejected": 521575552.0, + "logps/chosen": -212.13613891601562, + "logps/rejected": -375.128173828125, + "loss": 0.1364, + "rewards/chosen": 1.7651896476745605, + "rewards/margins": 11.806580066680908, + "rewards/rejected": -10.041390419006348, + "step": 10162 + }, + { + "epoch": 0.9285518501598904, + "grad_norm": 1.921875, + "kl": 0.0, + "learning_rate": 1.262074039346256e-07, + "logits/chosen": 408292010.6666667, + "logits/rejected": 334396825.6, + "logps/chosen": -217.6260986328125, + "logps/rejected": -469.774267578125, + "loss": 0.0128, + "rewards/chosen": 3.628948529561361, + "rewards/margins": 14.814051373799643, + "rewards/rejected": -11.185102844238282, + "step": 10163 + }, + { + "epoch": 0.928643216080402, + "grad_norm": 1.4296875, + "kl": 0.0, + "learning_rate": 1.2588659967396998e-07, + "logits/chosen": 983099712.0, + "logits/rejected": 519617152.0, + "logps/chosen": -317.0567626953125, + "logps/rejected": -504.5399169921875, + "loss": 0.0065, + "rewards/chosen": 3.8410797119140625, + "rewards/margins": 14.657175699869791, + "rewards/rejected": -10.816095987955729, + "step": 10164 + }, + { + "epoch": 0.9287345820009136, + "grad_norm": 0.87890625, + "kl": 0.0, + "learning_rate": 1.2556619845733308e-07, + "logits/chosen": 496355123.2, + "logits/rejected": 617540096.0, + "logps/chosen": -439.05927734375, + "logps/rejected": -483.0936279296875, + "loss": 0.0057, + "rewards/chosen": 4.965191650390625, + "rewards/margins": 12.061223475138346, + "rewards/rejected": -7.096031824747722, + "step": 10165 + }, + { + "epoch": 0.9288259479214253, + "grad_norm": 5.34375, + "kl": 0.0, + "learning_rate": 1.252462003112087e-07, + "logits/chosen": 530002986.6666667, + "logits/rejected": 207680528.0, + "logps/chosen": -452.1178792317708, + "logps/rejected": -380.35968017578125, + "loss": 0.0214, + "rewards/chosen": 3.954050381978353, + "rewards/margins": 15.130728085835775, + "rewards/rejected": -11.176677703857422, + "step": 10166 + }, + { + "epoch": 0.928917313841937, + "grad_norm": 1.1328125, + "kl": 0.0, + "learning_rate": 1.249266052620579e-07, + "logits/chosen": 384857696.0, + "logits/rejected": 753787733.3333334, + "logps/chosen": -276.001220703125, + "logps/rejected": -445.6466471354167, + "loss": 0.0044, + "rewards/chosen": 5.227163791656494, + "rewards/margins": 13.894016106923422, + "rewards/rejected": -8.666852315266928, + "step": 10167 + }, + { + "epoch": 0.9290086797624486, + "grad_norm": 2.203125, + "kl": 0.0, + "learning_rate": 1.2460741333630844e-07, + "logits/chosen": 969587968.0, + "logits/rejected": 782684569.6, + "logps/chosen": -254.50569661458334, + "logps/rejected": -658.02275390625, + "loss": 0.0074, + "rewards/chosen": 4.8664290110270185, + "rewards/margins": 14.101202265421549, + "rewards/rejected": -9.234773254394531, + "step": 10168 + }, + { + "epoch": 0.9291000456829602, + "grad_norm": 7.21875, + "kl": 0.0, + "learning_rate": 1.2428862456035472e-07, + "logits/chosen": 265883904.0, + "logits/rejected": 575201638.4, + "logps/chosen": -203.5245361328125, + "logps/rejected": -530.016015625, + "loss": 0.053, + "rewards/chosen": 3.9053688049316406, + "rewards/margins": 13.724504852294922, + "rewards/rejected": -9.819136047363282, + "step": 10169 + }, + { + "epoch": 0.9291914116034719, + "grad_norm": 50.25, + "kl": 0.0, + "learning_rate": 1.2397023896055737e-07, + "logits/chosen": 528103241.14285713, + "logits/rejected": 1278125184.0, + "logps/chosen": -302.10232979910717, + "logps/rejected": -498.9620361328125, + "loss": 0.0807, + "rewards/chosen": 2.9948517935616628, + "rewards/margins": 14.910333360944476, + "rewards/rejected": -11.915481567382812, + "step": 10170 + }, + { + "epoch": 0.9292827775239836, + "grad_norm": 1.1875, + "kl": 0.0, + "learning_rate": 1.2365225656324308e-07, + "logits/chosen": 520064682.6666667, + "logits/rejected": 670022451.2, + "logps/chosen": -381.6980387369792, + "logps/rejected": -464.298828125, + "loss": 0.0059, + "rewards/chosen": 4.4603729248046875, + "rewards/margins": 13.069799041748047, + "rewards/rejected": -8.609426116943359, + "step": 10171 + }, + { + "epoch": 0.9293741434444952, + "grad_norm": 2.828125, + "kl": 0.0, + "learning_rate": 1.2333467739470696e-07, + "logits/chosen": 321260992.0, + "logits/rejected": 388141337.6, + "logps/chosen": -285.34775797526044, + "logps/rejected": -482.818017578125, + "loss": 0.0155, + "rewards/chosen": 3.6442909240722656, + "rewards/margins": 11.956295013427734, + "rewards/rejected": -8.312004089355469, + "step": 10172 + }, + { + "epoch": 0.9294655093650068, + "grad_norm": 1.5078125, + "kl": 0.0, + "learning_rate": 1.2301750148121018e-07, + "logits/chosen": 747199658.6666666, + "logits/rejected": 426368128.0, + "logps/chosen": -421.1733805338542, + "logps/rejected": -479.95302734375, + "loss": 0.0076, + "rewards/chosen": 3.9457998275756836, + "rewards/margins": 14.145134544372558, + "rewards/rejected": -10.199334716796875, + "step": 10173 + }, + { + "epoch": 0.9295568752855184, + "grad_norm": 2.78125, + "kl": 0.0, + "learning_rate": 1.2270072884897954e-07, + "logits/chosen": 645660928.0, + "logits/rejected": 339939328.0, + "logps/chosen": -353.1954345703125, + "logps/rejected": -327.50543212890625, + "loss": 0.0212, + "rewards/chosen": 3.2043938636779785, + "rewards/margins": 13.029032230377197, + "rewards/rejected": -9.824638366699219, + "step": 10174 + }, + { + "epoch": 0.9296482412060302, + "grad_norm": 2.21875, + "kl": 0.0, + "learning_rate": 1.2238435952420968e-07, + "logits/chosen": 404756256.0, + "logits/rejected": 454224960.0, + "logps/chosen": -263.60736083984375, + "logps/rejected": -494.42864990234375, + "loss": 0.0096, + "rewards/chosen": 4.717911243438721, + "rewards/margins": 15.23761796951294, + "rewards/rejected": -10.519706726074219, + "step": 10175 + }, + { + "epoch": 0.9297396071265418, + "grad_norm": 1.8203125, + "kl": 0.0, + "learning_rate": 1.2206839353306076e-07, + "logits/chosen": 716412224.0, + "logits/rejected": 472925525.3333333, + "logps/chosen": -322.443603515625, + "logps/rejected": -566.4783121744791, + "loss": 0.0083, + "rewards/chosen": 3.4984283447265625, + "rewards/margins": 12.990813573201498, + "rewards/rejected": -9.492385228474935, + "step": 10176 + }, + { + "epoch": 0.9298309730470534, + "grad_norm": 2.453125, + "kl": 0.0, + "learning_rate": 1.2175283090166079e-07, + "logits/chosen": 703793152.0, + "logits/rejected": 546118485.3333334, + "logps/chosen": -248.6102294921875, + "logps/rejected": -592.2313232421875, + "loss": 0.0101, + "rewards/chosen": 4.639729309082031, + "rewards/margins": 14.272522735595704, + "rewards/rejected": -9.632793426513672, + "step": 10177 + }, + { + "epoch": 0.929922338967565, + "grad_norm": 1.796875, + "kl": 0.0, + "learning_rate": 1.2143767165610386e-07, + "logits/chosen": 588158336.0, + "logits/rejected": 316995776.0, + "logps/chosen": -361.60467529296875, + "logps/rejected": -391.33294677734375, + "loss": 0.0093, + "rewards/chosen": 4.29502534866333, + "rewards/margins": 14.18997049331665, + "rewards/rejected": -9.89494514465332, + "step": 10178 + }, + { + "epoch": 0.9300137048880768, + "grad_norm": 1.875, + "kl": 0.0, + "learning_rate": 1.2112291582245083e-07, + "logits/chosen": 274653461.3333333, + "logits/rejected": 503252940.8, + "logps/chosen": -230.20682779947916, + "logps/rejected": -419.430517578125, + "loss": 0.0104, + "rewards/chosen": 3.9780654907226562, + "rewards/margins": 12.891703796386718, + "rewards/rejected": -8.913638305664062, + "step": 10179 + }, + { + "epoch": 0.9301050708085884, + "grad_norm": 0.6015625, + "kl": 0.0, + "learning_rate": 1.208085634267281e-07, + "logits/chosen": 409376768.0, + "logits/rejected": 618149546.6666666, + "logps/chosen": -340.29315185546875, + "logps/rejected": -593.8461100260416, + "loss": 0.0032, + "rewards/chosen": 4.381446838378906, + "rewards/margins": 14.638201395670572, + "rewards/rejected": -10.256754557291666, + "step": 10180 + }, + { + "epoch": 0.9301964367291, + "grad_norm": 1.3359375, + "kl": 0.0, + "learning_rate": 1.2049461449493093e-07, + "logits/chosen": 1226334464.0, + "logits/rejected": 359020960.0, + "logps/chosen": -301.5694580078125, + "logps/rejected": -375.01751708984375, + "loss": 0.0064, + "rewards/chosen": 4.693531036376953, + "rewards/margins": 14.260520935058594, + "rewards/rejected": -9.56698989868164, + "step": 10181 + }, + { + "epoch": 0.9302878026496118, + "grad_norm": 46.25, + "kl": 0.0, + "learning_rate": 1.2018106905301973e-07, + "logits/chosen": 708227968.0, + "logits/rejected": 646678528.0, + "logps/chosen": -356.50103759765625, + "logps/rejected": -714.4173583984375, + "loss": 0.108, + "rewards/chosen": 2.4218921661376953, + "rewards/margins": 13.364484786987305, + "rewards/rejected": -10.94259262084961, + "step": 10182 + }, + { + "epoch": 0.9303791685701234, + "grad_norm": 4.625, + "kl": 0.0, + "learning_rate": 1.1986792712692153e-07, + "logits/chosen": 374100955.4285714, + "logits/rejected": 319415008.0, + "logps/chosen": -313.0487583705357, + "logps/rejected": -459.0840759277344, + "loss": 0.0303, + "rewards/chosen": 4.001607894897461, + "rewards/margins": 12.900365829467773, + "rewards/rejected": -8.898757934570312, + "step": 10183 + }, + { + "epoch": 0.930470534490635, + "grad_norm": 0.1279296875, + "kl": 0.0, + "learning_rate": 1.1955518874253003e-07, + "logits/chosen": 178173760.0, + "logits/rejected": 477177344.0, + "logps/chosen": -69.98106384277344, + "logps/rejected": -456.85581752232144, + "loss": 0.0007, + "rewards/chosen": 5.6122002601623535, + "rewards/margins": 14.387383801596505, + "rewards/rejected": -8.775183541434151, + "step": 10184 + }, + { + "epoch": 0.9305619004111466, + "grad_norm": 1.7109375, + "kl": 0.0, + "learning_rate": 1.1924285392570621e-07, + "logits/chosen": 749908377.6, + "logits/rejected": 620986965.3333334, + "logps/chosen": -219.3168701171875, + "logps/rejected": -683.22509765625, + "loss": 0.0143, + "rewards/chosen": 4.000639724731445, + "rewards/margins": 17.284200159708657, + "rewards/rejected": -13.283560434977213, + "step": 10185 + }, + { + "epoch": 0.9306532663316583, + "grad_norm": 9.0, + "kl": 0.0, + "learning_rate": 1.1893092270227724e-07, + "logits/chosen": 408091712.0, + "logits/rejected": 379236992.0, + "logps/chosen": -285.794189453125, + "logps/rejected": -509.56085205078125, + "loss": 0.0134, + "rewards/chosen": 4.573957920074463, + "rewards/margins": 12.940547466278076, + "rewards/rejected": -8.366589546203613, + "step": 10186 + }, + { + "epoch": 0.93074463225217, + "grad_norm": 2.6875, + "kl": 0.0, + "learning_rate": 1.1861939509803688e-07, + "logits/chosen": 403408640.0, + "logits/rejected": 441218816.0, + "logps/chosen": -276.563232421875, + "logps/rejected": -462.19091796875, + "loss": 0.0342, + "rewards/chosen": 3.408327579498291, + "rewards/margins": 12.633589267730713, + "rewards/rejected": -9.225261688232422, + "step": 10187 + }, + { + "epoch": 0.9308359981726816, + "grad_norm": 1.90625, + "kl": 0.0, + "learning_rate": 1.183082711387451e-07, + "logits/chosen": 571510016.0, + "logits/rejected": 491272000.0, + "logps/chosen": -377.929443359375, + "logps/rejected": -456.91668701171875, + "loss": 0.0093, + "rewards/chosen": 4.493837356567383, + "rewards/margins": 13.419099807739258, + "rewards/rejected": -8.925262451171875, + "step": 10188 + }, + { + "epoch": 0.9309273640931932, + "grad_norm": 1.34375, + "kl": 0.0, + "learning_rate": 1.1799755085013021e-07, + "logits/chosen": 813878080.0, + "logits/rejected": 429075712.0, + "logps/chosen": -256.5063171386719, + "logps/rejected": -357.1216227213542, + "loss": 0.0066, + "rewards/chosen": 3.659069776535034, + "rewards/margins": 12.396844307581583, + "rewards/rejected": -8.737774531046549, + "step": 10189 + }, + { + "epoch": 0.9310187300137049, + "grad_norm": 1.140625, + "kl": 0.0, + "learning_rate": 1.1768723425788442e-07, + "logits/chosen": 334677696.0, + "logits/rejected": 505797504.0, + "logps/chosen": -198.1941680908203, + "logps/rejected": -529.9403889973959, + "loss": 0.0056, + "rewards/chosen": 4.158833980560303, + "rewards/margins": 13.270523230234781, + "rewards/rejected": -9.111689249674479, + "step": 10190 + }, + { + "epoch": 0.9311100959342166, + "grad_norm": 2.90625, + "kl": 0.0, + "learning_rate": 1.1737732138766944e-07, + "logits/chosen": 342294933.3333333, + "logits/rejected": 656376627.2, + "logps/chosen": -160.9348347981771, + "logps/rejected": -550.9890625, + "loss": 0.0242, + "rewards/chosen": 3.4606825510660806, + "rewards/margins": 14.566020838419595, + "rewards/rejected": -11.105338287353515, + "step": 10191 + }, + { + "epoch": 0.9312014618547282, + "grad_norm": 4.28125, + "kl": 0.0, + "learning_rate": 1.1706781226511088e-07, + "logits/chosen": 1187758250.6666667, + "logits/rejected": 520688768.0, + "logps/chosen": -458.0313313802083, + "logps/rejected": -373.9219970703125, + "loss": 0.0307, + "rewards/chosen": 3.367882410685221, + "rewards/margins": 11.131441752115885, + "rewards/rejected": -7.763559341430664, + "step": 10192 + }, + { + "epoch": 0.9312928277752398, + "grad_norm": 49.5, + "kl": 0.0, + "learning_rate": 1.167587069158027e-07, + "logits/chosen": 742212608.0, + "logits/rejected": 670786474.6666666, + "logps/chosen": -313.747802734375, + "logps/rejected": -471.6165364583333, + "loss": 0.0436, + "rewards/chosen": 4.677484512329102, + "rewards/margins": 12.742573420206705, + "rewards/rejected": -8.065088907877604, + "step": 10193 + }, + { + "epoch": 0.9313841936957515, + "grad_norm": 3.09375, + "kl": 0.0, + "learning_rate": 1.164500053653056e-07, + "logits/chosen": 481059157.3333333, + "logits/rejected": 343468608.0, + "logps/chosen": -257.9434407552083, + "logps/rejected": -377.9611511230469, + "loss": 0.0211, + "rewards/chosen": 3.8872753779093423, + "rewards/margins": 12.221646944681803, + "rewards/rejected": -8.334371566772461, + "step": 10194 + }, + { + "epoch": 0.9314755596162632, + "grad_norm": 3.25, + "kl": 0.0, + "learning_rate": 1.1614170763914634e-07, + "logits/chosen": 1339897088.0, + "logits/rejected": 427815782.4, + "logps/chosen": -309.16127522786456, + "logps/rejected": -457.141943359375, + "loss": 0.0264, + "rewards/chosen": 3.2304261525472007, + "rewards/margins": 13.434111150105794, + "rewards/rejected": -10.203684997558593, + "step": 10195 + }, + { + "epoch": 0.9315669255367748, + "grad_norm": 5.3125, + "kl": 0.0, + "learning_rate": 1.1583381376281733e-07, + "logits/chosen": 397509939.2, + "logits/rejected": 580515925.3333334, + "logps/chosen": -283.869873046875, + "logps/rejected": -502.0542805989583, + "loss": 0.0343, + "rewards/chosen": 4.029894256591797, + "rewards/margins": 11.963714345296225, + "rewards/rejected": -7.933820088704427, + "step": 10196 + }, + { + "epoch": 0.9316582914572864, + "grad_norm": 2.140625, + "kl": 0.0, + "learning_rate": 1.1552632376177874e-07, + "logits/chosen": 259556416.0, + "logits/rejected": 423357632.0, + "logps/chosen": -153.34234619140625, + "logps/rejected": -570.9443969726562, + "loss": 0.0119, + "rewards/chosen": 4.57499361038208, + "rewards/margins": 13.77428674697876, + "rewards/rejected": -9.19929313659668, + "step": 10197 + }, + { + "epoch": 0.9317496573777981, + "grad_norm": 1.6796875, + "kl": 0.0, + "learning_rate": 1.1521923766145804e-07, + "logits/chosen": 216568928.0, + "logits/rejected": 408175530.6666667, + "logps/chosen": -291.646240234375, + "logps/rejected": -554.4374186197916, + "loss": 0.0062, + "rewards/chosen": 4.264904975891113, + "rewards/margins": 14.698653856913248, + "rewards/rejected": -10.433748881022135, + "step": 10198 + }, + { + "epoch": 0.9318410232983098, + "grad_norm": 11.6875, + "kl": 0.0, + "learning_rate": 1.1491255548724822e-07, + "logits/chosen": 424279104.0, + "logits/rejected": 620045056.0, + "logps/chosen": -289.87725830078125, + "logps/rejected": -427.1494445800781, + "loss": 0.0647, + "rewards/chosen": 3.828029155731201, + "rewards/margins": 9.818809032440186, + "rewards/rejected": -5.990779876708984, + "step": 10199 + }, + { + "epoch": 0.9319323892188214, + "grad_norm": 2.5625, + "kl": 0.0, + "learning_rate": 1.1460627726450846e-07, + "logits/chosen": 483322304.0, + "logits/rejected": 456423936.0, + "logps/chosen": -385.8384704589844, + "logps/rejected": -337.1943664550781, + "loss": 0.0143, + "rewards/chosen": 4.048892974853516, + "rewards/margins": 11.115440368652344, + "rewards/rejected": -7.066547393798828, + "step": 10200 + }, + { + "epoch": 0.932023755139333, + "grad_norm": 2.71875, + "kl": 0.0, + "learning_rate": 1.1430040301856515e-07, + "logits/chosen": 789054890.6666666, + "logits/rejected": 945045094.4, + "logps/chosen": -323.5846354166667, + "logps/rejected": -534.754541015625, + "loss": 0.0111, + "rewards/chosen": 3.840669314066569, + "rewards/margins": 15.582524172465005, + "rewards/rejected": -11.741854858398437, + "step": 10201 + }, + { + "epoch": 0.9321151210598447, + "grad_norm": 0.87109375, + "kl": 0.0, + "learning_rate": 1.1399493277471141e-07, + "logits/chosen": 550423040.0, + "logits/rejected": 561822634.6666666, + "logps/chosen": -348.1121337890625, + "logps/rejected": -320.64512125651044, + "loss": 0.0056, + "rewards/chosen": 4.92333869934082, + "rewards/margins": 14.695663579305013, + "rewards/rejected": -9.772324879964193, + "step": 10202 + }, + { + "epoch": 0.9322064869803564, + "grad_norm": 2.203125, + "kl": 0.0, + "learning_rate": 1.1368986655820757e-07, + "logits/chosen": 515701845.3333333, + "logits/rejected": 547073843.2, + "logps/chosen": -233.09429931640625, + "logps/rejected": -515.505859375, + "loss": 0.0107, + "rewards/chosen": 4.556789398193359, + "rewards/margins": 13.030889129638672, + "rewards/rejected": -8.474099731445312, + "step": 10203 + }, + { + "epoch": 0.932297852900868, + "grad_norm": 1.4609375, + "kl": 0.0, + "learning_rate": 1.1338520439427848e-07, + "logits/chosen": 593336405.3333334, + "logits/rejected": 858851430.4, + "logps/chosen": -334.64479573567706, + "logps/rejected": -541.32509765625, + "loss": 0.0089, + "rewards/chosen": 3.8705005645751953, + "rewards/margins": 12.800510787963868, + "rewards/rejected": -8.930010223388672, + "step": 10204 + }, + { + "epoch": 0.9323892188213796, + "grad_norm": 3.875, + "kl": 0.0, + "learning_rate": 1.1308094630811784e-07, + "logits/chosen": 577925248.0, + "logits/rejected": 654840256.0, + "logps/chosen": -258.91412353515625, + "logps/rejected": -858.2125244140625, + "loss": 0.0261, + "rewards/chosen": 3.2079684734344482, + "rewards/margins": 13.941553831100464, + "rewards/rejected": -10.733585357666016, + "step": 10205 + }, + { + "epoch": 0.9324805847418913, + "grad_norm": 0.8046875, + "kl": 0.0, + "learning_rate": 1.1277709232488443e-07, + "logits/chosen": 422662101.3333333, + "logits/rejected": 455847731.2, + "logps/chosen": -208.1847127278646, + "logps/rejected": -526.390771484375, + "loss": 0.0055, + "rewards/chosen": 4.2402299245198565, + "rewards/margins": 12.916693242390949, + "rewards/rejected": -8.676463317871093, + "step": 10206 + }, + { + "epoch": 0.932571950662403, + "grad_norm": 3.609375, + "kl": 0.0, + "learning_rate": 1.1247364246970427e-07, + "logits/chosen": 900975701.3333334, + "logits/rejected": 313519104.0, + "logps/chosen": -400.3503011067708, + "logps/rejected": -491.71923828125, + "loss": 0.025, + "rewards/chosen": 3.6164379119873047, + "rewards/margins": 14.584322929382324, + "rewards/rejected": -10.96788501739502, + "step": 10207 + }, + { + "epoch": 0.9326633165829146, + "grad_norm": 3.25, + "kl": 0.0, + "learning_rate": 1.1217059676767006e-07, + "logits/chosen": 507314816.0, + "logits/rejected": 540028006.4, + "logps/chosen": -383.8607584635417, + "logps/rejected": -482.569921875, + "loss": 0.0138, + "rewards/chosen": 4.005036989847819, + "rewards/margins": 12.908104006449381, + "rewards/rejected": -8.903067016601563, + "step": 10208 + }, + { + "epoch": 0.9327546825034262, + "grad_norm": 3.71875, + "kl": 0.0, + "learning_rate": 1.118679552438412e-07, + "logits/chosen": 689896576.0, + "logits/rejected": 604727296.0, + "logps/chosen": -239.6393585205078, + "logps/rejected": -460.50738525390625, + "loss": 0.0239, + "rewards/chosen": 3.2060558795928955, + "rewards/margins": 11.312429666519165, + "rewards/rejected": -8.10637378692627, + "step": 10209 + }, + { + "epoch": 0.9328460484239379, + "grad_norm": 24.875, + "kl": 0.0, + "learning_rate": 1.1156571792324212e-07, + "logits/chosen": 329194432.0, + "logits/rejected": 561278310.4, + "logps/chosen": -304.7682698567708, + "logps/rejected": -468.73662109375, + "loss": 0.1082, + "rewards/chosen": 3.1213480631510415, + "rewards/margins": 11.295711771647134, + "rewards/rejected": -8.174363708496093, + "step": 10210 + }, + { + "epoch": 0.9329374143444495, + "grad_norm": 1.828125, + "kl": 0.0, + "learning_rate": 1.112638848308667e-07, + "logits/chosen": 1017085120.0, + "logits/rejected": 509779264.0, + "logps/chosen": -472.3975830078125, + "logps/rejected": -343.796875, + "loss": 0.0109, + "rewards/chosen": 4.3504767417907715, + "rewards/margins": 11.805249691009521, + "rewards/rejected": -7.45477294921875, + "step": 10211 + }, + { + "epoch": 0.9330287802649612, + "grad_norm": 0.6328125, + "kl": 0.0, + "learning_rate": 1.1096245599167221e-07, + "logits/chosen": 285924288.0, + "logits/rejected": 417081088.0, + "logps/chosen": -269.3952941894531, + "logps/rejected": -615.3280436197916, + "loss": 0.0028, + "rewards/chosen": 4.752248764038086, + "rewards/margins": 14.395844141642252, + "rewards/rejected": -9.643595377604166, + "step": 10212 + }, + { + "epoch": 0.9331201461854728, + "grad_norm": 6.375, + "kl": 0.0, + "learning_rate": 1.1066143143058483e-07, + "logits/chosen": 594702464.0, + "logps/chosen": -332.674072265625, + "loss": 0.0503, + "rewards/chosen": 3.4835779666900635, + "step": 10213 + }, + { + "epoch": 0.9332115121059845, + "grad_norm": 3.28125, + "kl": 0.0, + "learning_rate": 1.1036081117249575e-07, + "logits/chosen": 592315264.0, + "logits/rejected": 430179904.0, + "logps/chosen": -328.1783040364583, + "logps/rejected": -480.5702819824219, + "loss": 0.0266, + "rewards/chosen": 3.550503412882487, + "rewards/margins": 12.441921869913736, + "rewards/rejected": -8.89141845703125, + "step": 10214 + }, + { + "epoch": 0.9333028780264961, + "grad_norm": 2.953125, + "kl": 0.0, + "learning_rate": 1.1006059524226509e-07, + "logits/chosen": 561784320.0, + "logits/rejected": 689085312.0, + "logps/chosen": -311.4728698730469, + "logps/rejected": -380.9564208984375, + "loss": 0.0185, + "rewards/chosen": 3.4622950553894043, + "rewards/margins": 12.768964290618896, + "rewards/rejected": -9.306669235229492, + "step": 10215 + }, + { + "epoch": 0.9333942439470078, + "grad_norm": 12.5625, + "kl": 0.0, + "learning_rate": 1.0976078366471632e-07, + "logits/chosen": 513433728.0, + "logits/rejected": 743268480.0, + "logps/chosen": -441.09014892578125, + "logps/rejected": -659.4739990234375, + "loss": 0.0162, + "rewards/chosen": 4.715771675109863, + "rewards/margins": 13.1665620803833, + "rewards/rejected": -8.450790405273438, + "step": 10216 + }, + { + "epoch": 0.9334856098675194, + "grad_norm": 1.8359375, + "kl": 0.0, + "learning_rate": 1.0946137646464183e-07, + "logits/chosen": 362165280.0, + "logits/rejected": 581572416.0, + "logps/chosen": -283.32989501953125, + "logps/rejected": -724.63232421875, + "loss": 0.0092, + "rewards/chosen": 4.370676517486572, + "rewards/margins": 14.070152759552002, + "rewards/rejected": -9.69947624206543, + "step": 10217 + }, + { + "epoch": 0.9335769757880311, + "grad_norm": 0.2431640625, + "kl": 0.0, + "learning_rate": 1.0916237366679961e-07, + "logits/chosen": 850378368.0, + "logits/rejected": 385737728.0, + "logps/chosen": -190.27687072753906, + "logps/rejected": -482.19151088169644, + "loss": 0.0012, + "rewards/chosen": 6.336116313934326, + "rewards/margins": 16.012053421565465, + "rewards/rejected": -9.675937107631139, + "step": 10218 + }, + { + "epoch": 0.9336683417085427, + "grad_norm": 0.77734375, + "kl": 0.0, + "learning_rate": 1.088637752959143e-07, + "logits/chosen": 991350400.0, + "logits/rejected": 461394528.0, + "logps/chosen": -475.19757080078125, + "logps/rejected": -456.0459899902344, + "loss": 0.0051, + "rewards/chosen": 4.682900428771973, + "rewards/margins": 14.411933898925781, + "rewards/rejected": -9.729033470153809, + "step": 10219 + }, + { + "epoch": 0.9337597076290544, + "grad_norm": 0.85546875, + "kl": 0.0, + "learning_rate": 1.0856558137667784e-07, + "logits/chosen": 706812288.0, + "logits/rejected": 686686720.0, + "logps/chosen": -547.310546875, + "logps/rejected": -519.6879185267857, + "loss": 0.0033, + "rewards/chosen": 3.726086378097534, + "rewards/margins": 13.27897116116115, + "rewards/rejected": -9.552884783063616, + "step": 10220 + }, + { + "epoch": 0.933851073549566, + "grad_norm": 0.83984375, + "kl": 0.0, + "learning_rate": 1.0826779193374715e-07, + "logits/chosen": 452249344.0, + "logits/rejected": 321920204.8, + "logps/chosen": -287.5961507161458, + "logps/rejected": -319.33857421875, + "loss": 0.0049, + "rewards/chosen": 4.464979807535808, + "rewards/margins": 13.690206400553386, + "rewards/rejected": -9.225226593017577, + "step": 10221 + }, + { + "epoch": 0.9339424394700777, + "grad_norm": 1.015625, + "kl": 0.0, + "learning_rate": 1.0797040699174754e-07, + "logits/chosen": 642911829.3333334, + "logits/rejected": 620689817.6, + "logps/chosen": -277.0584309895833, + "logps/rejected": -519.56923828125, + "loss": 0.0049, + "rewards/chosen": 4.8627621332804365, + "rewards/margins": 14.798745409647623, + "rewards/rejected": -9.935983276367187, + "step": 10222 + }, + { + "epoch": 0.9340338053905893, + "grad_norm": 1.546875, + "kl": 0.0, + "learning_rate": 1.0767342657526936e-07, + "logits/chosen": 603456921.6, + "logits/rejected": 629741440.0, + "logps/chosen": -232.942529296875, + "logps/rejected": -480.30322265625, + "loss": 0.0101, + "rewards/chosen": 4.442604064941406, + "rewards/margins": 14.502159118652344, + "rewards/rejected": -10.059555053710938, + "step": 10223 + }, + { + "epoch": 0.934125171311101, + "grad_norm": 1.578125, + "kl": 0.0, + "learning_rate": 1.0737685070887016e-07, + "logits/chosen": 1128485632.0, + "logits/rejected": 728143360.0, + "logps/chosen": -277.91058349609375, + "logps/rejected": -598.571044921875, + "loss": 0.0111, + "rewards/chosen": 4.182559013366699, + "rewards/margins": 14.340996742248535, + "rewards/rejected": -10.158437728881836, + "step": 10224 + }, + { + "epoch": 0.9342165372316126, + "grad_norm": 7.96875, + "kl": 0.0, + "learning_rate": 1.0708067941707478e-07, + "logits/chosen": 276641484.8, + "logits/rejected": 244522837.33333334, + "logps/chosen": -212.1181396484375, + "logps/rejected": -282.7242024739583, + "loss": 0.0196, + "rewards/chosen": 3.816770553588867, + "rewards/margins": 12.590478897094727, + "rewards/rejected": -8.77370834350586, + "step": 10225 + }, + { + "epoch": 0.9343079031521243, + "grad_norm": 1.1328125, + "kl": 0.0, + "learning_rate": 1.0678491272437309e-07, + "logits/chosen": 592621440.0, + "logits/rejected": 307620672.0, + "logps/chosen": -323.0992126464844, + "logps/rejected": -380.76385498046875, + "loss": 0.0054, + "rewards/chosen": 4.830440521240234, + "rewards/margins": 16.200212478637695, + "rewards/rejected": -11.369771957397461, + "step": 10226 + }, + { + "epoch": 0.9343992690726359, + "grad_norm": 2.828125, + "kl": 0.0, + "learning_rate": 1.0648955065522216e-07, + "logits/chosen": 496184012.8, + "logits/rejected": 600763562.6666666, + "logps/chosen": -345.1234375, + "logps/rejected": -375.8067626953125, + "loss": 0.0198, + "rewards/chosen": 3.6726593017578124, + "rewards/margins": 11.124276224772135, + "rewards/rejected": -7.451616923014323, + "step": 10227 + }, + { + "epoch": 0.9344906349931476, + "grad_norm": 5.09375, + "kl": 0.0, + "learning_rate": 1.061945932340469e-07, + "logits/chosen": 557782570.6666666, + "logits/rejected": 709573171.2, + "logps/chosen": -274.0662434895833, + "logps/rejected": -602.024267578125, + "loss": 0.1088, + "rewards/chosen": 3.8507347106933594, + "rewards/margins": 13.734091186523438, + "rewards/rejected": -9.883356475830078, + "step": 10228 + }, + { + "epoch": 0.9345820009136592, + "grad_norm": 2.03125, + "kl": 0.0, + "learning_rate": 1.0590004048523617e-07, + "logits/chosen": 580184768.0, + "logits/rejected": 573476608.0, + "logps/chosen": -240.32334899902344, + "logps/rejected": -531.893310546875, + "loss": 0.0094, + "rewards/chosen": 3.4429550170898438, + "rewards/margins": 13.192042032877604, + "rewards/rejected": -9.74908701578776, + "step": 10229 + }, + { + "epoch": 0.9346733668341709, + "grad_norm": 2.984375, + "kl": 0.0, + "learning_rate": 1.0560589243314711e-07, + "logits/chosen": 455771136.0, + "logits/rejected": 438629939.2, + "logps/chosen": -313.041259765625, + "logps/rejected": -394.26953125, + "loss": 0.015, + "rewards/chosen": 3.4353516896565757, + "rewards/margins": 10.988540776570638, + "rewards/rejected": -7.553189086914062, + "step": 10230 + }, + { + "epoch": 0.9347647327546825, + "grad_norm": 0.30078125, + "kl": 0.0, + "learning_rate": 1.0531214910210308e-07, + "logits/chosen": 170071477.33333334, + "logits/rejected": 360780723.2, + "logps/chosen": -157.72479248046875, + "logps/rejected": -382.040966796875, + "loss": 0.0022, + "rewards/chosen": 5.425568262736003, + "rewards/margins": 14.559560267130536, + "rewards/rejected": -9.133992004394532, + "step": 10231 + }, + { + "epoch": 0.9348560986751941, + "grad_norm": 2.03125, + "kl": 0.0, + "learning_rate": 1.0501881051639407e-07, + "logits/chosen": 493086003.2, + "logits/rejected": 464515712.0, + "logps/chosen": -307.23212890625, + "logps/rejected": -338.638427734375, + "loss": 0.0099, + "rewards/chosen": 4.8517814636230465, + "rewards/margins": 11.971501922607422, + "rewards/rejected": -7.119720458984375, + "step": 10232 + }, + { + "epoch": 0.9349474645957058, + "grad_norm": 2.671875, + "kl": 0.0, + "learning_rate": 1.0472587670027678e-07, + "logits/chosen": 828483456.0, + "logits/rejected": 552700800.0, + "logps/chosen": -329.2943115234375, + "logps/rejected": -323.86989339192706, + "loss": 0.0116, + "rewards/chosen": 3.1303162574768066, + "rewards/margins": 11.186628818511963, + "rewards/rejected": -8.056312561035156, + "step": 10233 + }, + { + "epoch": 0.9350388305162175, + "grad_norm": 2.640625, + "kl": 0.0, + "learning_rate": 1.0443334767797353e-07, + "logits/chosen": 723056786.2857143, + "logits/rejected": 506513024.0, + "logps/chosen": -297.53006417410717, + "logps/rejected": -971.46533203125, + "loss": 0.0146, + "rewards/chosen": 4.599909101213727, + "rewards/margins": 25.647339139665874, + "rewards/rejected": -21.04743003845215, + "step": 10234 + }, + { + "epoch": 0.9351301964367291, + "grad_norm": 5.75, + "kl": 0.0, + "learning_rate": 1.0414122347367384e-07, + "logits/chosen": 283824800.0, + "logits/rejected": 467826336.0, + "logps/chosen": -408.95330810546875, + "logps/rejected": -462.28509521484375, + "loss": 0.0332, + "rewards/chosen": 3.955324649810791, + "rewards/margins": 13.498898029327393, + "rewards/rejected": -9.543573379516602, + "step": 10235 + }, + { + "epoch": 0.9352215623572407, + "grad_norm": 35.5, + "kl": 0.0, + "learning_rate": 1.0384950411153394e-07, + "logits/chosen": 313809689.6, + "logits/rejected": 254196288.0, + "logps/chosen": -260.63251953125, + "logps/rejected": -284.1280517578125, + "loss": 0.0508, + "rewards/chosen": 3.6633926391601563, + "rewards/margins": 11.133682505289714, + "rewards/rejected": -7.470289866129558, + "step": 10236 + }, + { + "epoch": 0.9353129282777524, + "grad_norm": 1.7578125, + "kl": 0.0, + "learning_rate": 1.0355818961567677e-07, + "logits/chosen": 432423526.4, + "logits/rejected": 801897984.0, + "logps/chosen": -189.070947265625, + "logps/rejected": -695.6726888020834, + "loss": 0.0092, + "rewards/chosen": 5.225350189208984, + "rewards/margins": 15.336637624104817, + "rewards/rejected": -10.111287434895834, + "step": 10237 + }, + { + "epoch": 0.9354042941982641, + "grad_norm": 23.25, + "kl": 0.0, + "learning_rate": 1.0326728001019026e-07, + "logits/chosen": 453120614.4, + "logits/rejected": 615319637.3333334, + "logps/chosen": -302.77421875, + "logps/rejected": -307.07094319661456, + "loss": 0.0872, + "rewards/chosen": 2.7186681747436525, + "rewards/margins": 7.764401308695476, + "rewards/rejected": -5.045733133951823, + "step": 10238 + }, + { + "epoch": 0.9354956601187757, + "grad_norm": 1.3984375, + "kl": 0.0, + "learning_rate": 1.029767753191302e-07, + "logits/chosen": 1040583987.2, + "logits/rejected": 570341973.3333334, + "logps/chosen": -333.64609375, + "logps/rejected": -565.6341552734375, + "loss": 0.0095, + "rewards/chosen": 4.4191947937011715, + "rewards/margins": 15.401748911539713, + "rewards/rejected": -10.982554117838541, + "step": 10239 + }, + { + "epoch": 0.9355870260392873, + "grad_norm": 1.8359375, + "kl": 0.0, + "learning_rate": 1.02686675566519e-07, + "logits/chosen": 655193728.0, + "logits/rejected": 595885440.0, + "logps/chosen": -431.6202799479167, + "logps/rejected": -672.8015747070312, + "loss": 0.012, + "rewards/chosen": 4.823437372843425, + "rewards/margins": 14.210693041483562, + "rewards/rejected": -9.387255668640137, + "step": 10240 + }, + { + "epoch": 0.935678391959799, + "grad_norm": 1.3671875, + "kl": 0.0, + "learning_rate": 1.023969807763453e-07, + "logits/chosen": 804199168.0, + "logits/rejected": 512212480.0, + "logps/chosen": -758.7576293945312, + "logps/rejected": -519.4375, + "loss": 0.0061, + "rewards/chosen": 3.987645149230957, + "rewards/margins": 12.684099515279135, + "rewards/rejected": -8.696454366048178, + "step": 10241 + }, + { + "epoch": 0.9357697578803107, + "grad_norm": 3.1875, + "kl": 0.0, + "learning_rate": 1.0210769097256435e-07, + "logits/chosen": 556007808.0, + "logits/rejected": 464971648.0, + "logps/chosen": -463.326171875, + "logps/rejected": -639.3675537109375, + "loss": 0.0194, + "rewards/chosen": 3.5429391860961914, + "rewards/margins": 12.979207038879395, + "rewards/rejected": -9.436267852783203, + "step": 10242 + }, + { + "epoch": 0.9358611238008223, + "grad_norm": 5.71875, + "kl": 0.0, + "learning_rate": 1.0181880617909701e-07, + "logits/chosen": 546029248.0, + "logits/rejected": 543920938.6666666, + "logps/chosen": -457.1883850097656, + "logps/rejected": -679.6196695963541, + "loss": 0.015, + "rewards/chosen": 3.4414169788360596, + "rewards/margins": 13.339661200841268, + "rewards/rejected": -9.898244222005209, + "step": 10243 + }, + { + "epoch": 0.9359524897213339, + "grad_norm": 2.1875, + "kl": 0.0, + "learning_rate": 1.015303264198314e-07, + "logits/chosen": 499665152.0, + "logits/rejected": 321637184.0, + "logps/chosen": -369.788818359375, + "logps/rejected": -195.59515380859375, + "loss": 0.0168, + "rewards/chosen": 3.9060722986857095, + "rewards/margins": 11.649930159250895, + "rewards/rejected": -7.7438578605651855, + "step": 10244 + }, + { + "epoch": 0.9360438556418456, + "grad_norm": 5.03125, + "kl": 0.0, + "learning_rate": 1.012422517186229e-07, + "logits/chosen": 592913365.3333334, + "logits/rejected": 1530610816.0, + "logps/chosen": -232.8139851888021, + "logps/rejected": -672.5064086914062, + "loss": 0.0284, + "rewards/chosen": 4.676440556844075, + "rewards/margins": 10.79486592610677, + "rewards/rejected": -6.118425369262695, + "step": 10245 + }, + { + "epoch": 0.9361352215623573, + "grad_norm": 2.71875, + "kl": 0.0, + "learning_rate": 1.0095458209929243e-07, + "logits/chosen": 378227302.4, + "logits/rejected": 532667733.3333333, + "logps/chosen": -274.140869140625, + "logps/rejected": -577.3932291666666, + "loss": 0.0233, + "rewards/chosen": 3.2729957580566404, + "rewards/margins": 14.965504455566407, + "rewards/rejected": -11.692508697509766, + "step": 10246 + }, + { + "epoch": 0.9362265874828689, + "grad_norm": 2.328125, + "kl": 0.0, + "learning_rate": 1.0066731758562709e-07, + "logits/chosen": 756261973.3333334, + "logits/rejected": 577293824.0, + "logps/chosen": -563.7069091796875, + "logps/rejected": -505.210498046875, + "loss": 0.012, + "rewards/chosen": 3.4768829345703125, + "rewards/margins": 12.170318603515625, + "rewards/rejected": -8.693435668945312, + "step": 10247 + }, + { + "epoch": 0.9363179534033805, + "grad_norm": 54.25, + "kl": 0.0, + "learning_rate": 1.0038045820138065e-07, + "logits/chosen": 413689557.3333333, + "logits/rejected": 408119552.0, + "logps/chosen": -280.2644856770833, + "logps/rejected": -181.8723907470703, + "loss": 0.1018, + "rewards/chosen": 3.4433631896972656, + "rewards/margins": 10.269658088684082, + "rewards/rejected": -6.826294898986816, + "step": 10248 + }, + { + "epoch": 0.9364093193238922, + "grad_norm": 1.9375, + "kl": 0.0, + "learning_rate": 1.0009400397027469e-07, + "logits/chosen": 592957504.0, + "logits/rejected": 524783296.0, + "logps/chosen": -409.8372497558594, + "logps/rejected": -668.16259765625, + "loss": 0.0128, + "rewards/chosen": 3.7885048389434814, + "rewards/margins": 13.366276025772095, + "rewards/rejected": -9.577771186828613, + "step": 10249 + }, + { + "epoch": 0.9365006852444039, + "grad_norm": 1.8828125, + "kl": 0.0, + "learning_rate": 9.980795491599638e-08, + "logits/chosen": 256114346.66666666, + "logits/rejected": 409078835.2, + "logps/chosen": -212.59195963541666, + "logps/rejected": -317.3696533203125, + "loss": 0.1296, + "rewards/chosen": 4.142398198445638, + "rewards/margins": 9.281040700276693, + "rewards/rejected": -5.138642501831055, + "step": 10250 + }, + { + "epoch": 0.9365920511649155, + "grad_norm": 2.46875, + "kl": 0.0, + "learning_rate": 9.9522311062199e-08, + "logits/chosen": 463615104.0, + "logits/rejected": 530960736.0, + "logps/chosen": -235.02294921875, + "logps/rejected": -639.1036987304688, + "loss": 0.0168, + "rewards/chosen": 3.525753974914551, + "rewards/margins": 14.928393363952637, + "rewards/rejected": -11.402639389038086, + "step": 10251 + }, + { + "epoch": 0.9366834170854271, + "grad_norm": 2.90625, + "kl": 0.0, + "learning_rate": 9.923707243250147e-08, + "logits/chosen": 673525043.2, + "logits/rejected": 1598154410.6666667, + "logps/chosen": -464.012109375, + "logps/rejected": -544.343505859375, + "loss": 0.0221, + "rewards/chosen": 3.5984756469726564, + "rewards/margins": 12.267780812581382, + "rewards/rejected": -8.669305165608725, + "step": 10252 + }, + { + "epoch": 0.9367747830059387, + "grad_norm": 3.515625, + "kl": 0.0, + "learning_rate": 9.895223905049267e-08, + "logits/chosen": 632452309.3333334, + "logits/rejected": 380148889.6, + "logps/chosen": -443.4278564453125, + "logps/rejected": -338.9336181640625, + "loss": 0.1212, + "rewards/chosen": 4.061592102050781, + "rewards/margins": 10.347892379760742, + "rewards/rejected": -6.286300277709961, + "step": 10253 + }, + { + "epoch": 0.9368661489264505, + "grad_norm": 0.76953125, + "kl": 0.0, + "learning_rate": 9.866781093972377e-08, + "logits/chosen": 747180672.0, + "logits/rejected": 651255936.0, + "logps/chosen": -315.4722900390625, + "logps/rejected": -679.1895751953125, + "loss": 0.0055, + "rewards/chosen": 4.580442905426025, + "rewards/margins": 15.023205280303955, + "rewards/rejected": -10.44276237487793, + "step": 10254 + }, + { + "epoch": 0.9369575148469621, + "grad_norm": 2.734375, + "kl": 0.0, + "learning_rate": 9.838378812371541e-08, + "logits/chosen": 540190361.6, + "logits/rejected": 512775424.0, + "logps/chosen": -295.2658935546875, + "logps/rejected": -632.2215576171875, + "loss": 0.0222, + "rewards/chosen": 3.4325279235839843, + "rewards/margins": 13.299916330973307, + "rewards/rejected": -9.867388407389322, + "step": 10255 + }, + { + "epoch": 0.9370488807674737, + "grad_norm": 2.640625, + "kl": 0.0, + "learning_rate": 9.810017062595322e-08, + "logits/chosen": 457337856.0, + "logits/rejected": 329836202.6666667, + "logps/chosen": -336.303369140625, + "logps/rejected": -450.5398356119792, + "loss": 0.0189, + "rewards/chosen": 3.563123321533203, + "rewards/margins": 15.003793334960937, + "rewards/rejected": -11.440670013427734, + "step": 10256 + }, + { + "epoch": 0.9371402466879853, + "grad_norm": 1.6640625, + "kl": 0.0, + "learning_rate": 9.781695846988958e-08, + "logits/chosen": 1199647744.0, + "logits/rejected": 591353152.0, + "logps/chosen": -286.34332275390625, + "logps/rejected": -388.42828369140625, + "loss": 0.0107, + "rewards/chosen": 3.9085028171539307, + "rewards/margins": 13.432840585708618, + "rewards/rejected": -9.524337768554688, + "step": 10257 + }, + { + "epoch": 0.9372316126084971, + "grad_norm": 49.5, + "kl": 0.0, + "learning_rate": 9.753415167894409e-08, + "logits/chosen": 423286208.0, + "logits/rejected": 249121424.0, + "logps/chosen": -290.83966064453125, + "logps/rejected": -360.13787841796875, + "loss": 0.0713, + "rewards/chosen": 2.255584955215454, + "rewards/margins": 12.448461771011353, + "rewards/rejected": -10.192876815795898, + "step": 10258 + }, + { + "epoch": 0.9373229785290087, + "grad_norm": 2.078125, + "kl": 0.0, + "learning_rate": 9.725175027650191e-08, + "logits/chosen": 352242892.8, + "logits/rejected": 450956245.3333333, + "logps/chosen": -295.836083984375, + "logps/rejected": -667.675537109375, + "loss": 0.0131, + "rewards/chosen": 4.071054840087891, + "rewards/margins": 14.824602508544922, + "rewards/rejected": -10.753547668457031, + "step": 10259 + }, + { + "epoch": 0.9374143444495203, + "grad_norm": 3.078125, + "kl": 0.0, + "learning_rate": 9.696975428591493e-08, + "logits/chosen": 737431552.0, + "logits/rejected": 829993301.3333334, + "logps/chosen": -351.4537048339844, + "logps/rejected": -434.5934244791667, + "loss": 0.0142, + "rewards/chosen": 3.267333984375, + "rewards/margins": 11.823602040608725, + "rewards/rejected": -8.556268056233725, + "step": 10260 + }, + { + "epoch": 0.9375057103700319, + "grad_norm": 3.625, + "kl": 0.0, + "learning_rate": 9.668816373050117e-08, + "logits/chosen": 458498048.0, + "logits/rejected": 392858880.0, + "logps/chosen": -328.898583984375, + "logps/rejected": -493.0391438802083, + "loss": 0.0252, + "rewards/chosen": 3.551984405517578, + "rewards/margins": 13.087447102864584, + "rewards/rejected": -9.535462697347006, + "step": 10261 + }, + { + "epoch": 0.9375970762905437, + "grad_norm": 12.0625, + "kl": 0.0, + "learning_rate": 9.640697863354697e-08, + "logits/chosen": 672488384.0, + "logits/rejected": 663379392.0, + "logps/chosen": -440.7635498046875, + "logps/rejected": -398.953125, + "loss": 0.0481, + "rewards/chosen": 2.616516590118408, + "rewards/margins": 10.346810817718506, + "rewards/rejected": -7.730294227600098, + "step": 10262 + }, + { + "epoch": 0.9376884422110553, + "grad_norm": 5.28125, + "kl": 0.0, + "learning_rate": 9.612619901830211e-08, + "logits/chosen": 746377676.8, + "logits/rejected": 526292778.6666667, + "logps/chosen": -442.95244140625, + "logps/rejected": -546.4296061197916, + "loss": 0.0289, + "rewards/chosen": 3.439226531982422, + "rewards/margins": 11.684098943074545, + "rewards/rejected": -8.244872411092123, + "step": 10263 + }, + { + "epoch": 0.9377798081315669, + "grad_norm": 2.3125, + "kl": 0.0, + "learning_rate": 9.584582490798633e-08, + "logits/chosen": 345523648.0, + "logits/rejected": 575585280.0, + "logps/chosen": -255.30433654785156, + "logps/rejected": -600.4503784179688, + "loss": 0.0133, + "rewards/chosen": 3.999814510345459, + "rewards/margins": 11.884144306182861, + "rewards/rejected": -7.884329795837402, + "step": 10264 + }, + { + "epoch": 0.9378711740520785, + "grad_norm": 1.78125, + "kl": 0.0, + "learning_rate": 9.556585632578163e-08, + "logits/chosen": 500711456.0, + "logits/rejected": 610418688.0, + "logps/chosen": -213.80807495117188, + "logps/rejected": -537.5850219726562, + "loss": 0.0111, + "rewards/chosen": 4.293712615966797, + "rewards/margins": 13.815230369567871, + "rewards/rejected": -9.521517753601074, + "step": 10265 + }, + { + "epoch": 0.9379625399725903, + "grad_norm": 8.0625, + "kl": 0.0, + "learning_rate": 9.528629329484007e-08, + "logits/chosen": 989096345.6, + "logits/rejected": 899433642.6666666, + "logps/chosen": -458.52958984375, + "logps/rejected": -545.4240315755209, + "loss": 0.0429, + "rewards/chosen": 3.359230804443359, + "rewards/margins": 11.41582883199056, + "rewards/rejected": -8.056598027547201, + "step": 10266 + }, + { + "epoch": 0.9380539058931019, + "grad_norm": 1.25, + "kl": 0.0, + "learning_rate": 9.500713583827926e-08, + "logits/chosen": 448738858.6666667, + "logits/rejected": 319435571.2, + "logps/chosen": -306.4696858723958, + "logps/rejected": -470.39189453125, + "loss": 0.0058, + "rewards/chosen": 4.901627858479817, + "rewards/margins": 14.509966786702474, + "rewards/rejected": -9.608338928222656, + "step": 10267 + }, + { + "epoch": 0.9381452718136135, + "grad_norm": 7.53125, + "kl": 0.0, + "learning_rate": 9.472838397918294e-08, + "logits/chosen": 756008768.0, + "logits/rejected": 657696256.0, + "logps/chosen": -402.06170654296875, + "logps/rejected": -422.1459045410156, + "loss": 0.0448, + "rewards/chosen": 4.133945465087891, + "rewards/margins": 11.883834838867188, + "rewards/rejected": -7.749889373779297, + "step": 10268 + }, + { + "epoch": 0.9382366377341251, + "grad_norm": 6.3125, + "kl": 0.0, + "learning_rate": 9.445003774059991e-08, + "logits/chosen": 679034624.0, + "logits/rejected": 437623104.0, + "logps/chosen": -348.86907958984375, + "logps/rejected": -563.0807495117188, + "loss": 0.0295, + "rewards/chosen": 3.270669937133789, + "rewards/margins": 12.249588012695312, + "rewards/rejected": -8.978918075561523, + "step": 10269 + }, + { + "epoch": 0.9383280036546369, + "grad_norm": 2.640625, + "kl": 0.20965194702148438, + "learning_rate": 9.417209714554842e-08, + "logits/chosen": 937611178.6666666, + "logits/rejected": 499632416.0, + "logps/chosen": -360.5370686848958, + "logps/rejected": -492.2408752441406, + "loss": 0.0157, + "rewards/chosen": 4.3492685953776045, + "rewards/margins": 14.947768847147625, + "rewards/rejected": -10.59850025177002, + "step": 10270 + }, + { + "epoch": 0.9384193695751485, + "grad_norm": 0.97265625, + "kl": 0.0, + "learning_rate": 9.389456221701121e-08, + "logits/chosen": 1184089472.0, + "logits/rejected": 578294613.3333334, + "logps/chosen": -553.9205322265625, + "logps/rejected": -490.2007649739583, + "loss": 0.0043, + "rewards/chosen": 4.142735481262207, + "rewards/margins": 12.895930926005045, + "rewards/rejected": -8.753195444742838, + "step": 10271 + }, + { + "epoch": 0.9385107354956601, + "grad_norm": 4.3125, + "kl": 0.0, + "learning_rate": 9.361743297793713e-08, + "logits/chosen": 388106794.6666667, + "logits/rejected": 455961600.0, + "logps/chosen": -528.20166015625, + "logps/rejected": -490.098388671875, + "loss": 0.017, + "rewards/chosen": 3.6305421193440757, + "rewards/margins": 13.74013532002767, + "rewards/rejected": -10.109593200683594, + "step": 10272 + }, + { + "epoch": 0.9386021014161717, + "grad_norm": 1.125, + "kl": 0.0, + "learning_rate": 9.334070945124284e-08, + "logits/chosen": 475746688.0, + "logits/rejected": 583824896.0, + "logps/chosen": -262.35150146484375, + "logps/rejected": -520.9777018229166, + "loss": 0.0048, + "rewards/chosen": 4.083975315093994, + "rewards/margins": 12.4036914507548, + "rewards/rejected": -8.319716135660807, + "step": 10273 + }, + { + "epoch": 0.9386934673366835, + "grad_norm": 3.890625, + "kl": 0.0, + "learning_rate": 9.306439165981063e-08, + "logits/chosen": 672707072.0, + "logits/rejected": 721710208.0, + "logps/chosen": -394.2197265625, + "logps/rejected": -792.583740234375, + "loss": 0.0214, + "rewards/chosen": 3.8007691701253257, + "rewards/margins": 16.156894048055012, + "rewards/rejected": -12.356124877929688, + "step": 10274 + }, + { + "epoch": 0.9387848332571951, + "grad_norm": 9.0625, + "kl": 0.0, + "learning_rate": 9.278847962648996e-08, + "logits/chosen": 526275942.4, + "logits/rejected": 793421994.6666666, + "logps/chosen": -496.158203125, + "logps/rejected": -446.8973795572917, + "loss": 0.0154, + "rewards/chosen": 4.619071197509766, + "rewards/margins": 13.731014760335288, + "rewards/rejected": -9.111943562825521, + "step": 10275 + }, + { + "epoch": 0.9388761991777067, + "grad_norm": 4.59375, + "kl": 0.0, + "learning_rate": 9.251297337409537e-08, + "logits/chosen": 455362889.14285713, + "logits/rejected": 135292480.0, + "logps/chosen": -248.78982979910714, + "logps/rejected": -125.78231811523438, + "loss": 0.0435, + "rewards/chosen": 3.2386975969587053, + "rewards/margins": 7.843741485050746, + "rewards/rejected": -4.605043888092041, + "step": 10276 + }, + { + "epoch": 0.9389675650982183, + "grad_norm": 1.8671875, + "kl": 0.0, + "learning_rate": 9.223787292540865e-08, + "logits/chosen": 484710656.0, + "logits/rejected": 371426688.0, + "logps/chosen": -314.482666015625, + "logps/rejected": -453.9633483886719, + "loss": 0.0114, + "rewards/chosen": 4.302330493927002, + "rewards/margins": 13.191904544830322, + "rewards/rejected": -8.88957405090332, + "step": 10277 + }, + { + "epoch": 0.9390589310187301, + "grad_norm": 2.6875, + "kl": 0.0, + "learning_rate": 9.19631783031788e-08, + "logits/chosen": 864989056.0, + "logits/rejected": 405861376.0, + "logps/chosen": -471.7940979003906, + "logps/rejected": -541.4539794921875, + "loss": 0.0152, + "rewards/chosen": 3.9442214965820312, + "rewards/margins": 13.655377388000488, + "rewards/rejected": -9.711155891418457, + "step": 10278 + }, + { + "epoch": 0.9391502969392417, + "grad_norm": 65.5, + "kl": 0.0, + "learning_rate": 9.16888895301199e-08, + "logits/chosen": 600638656.0, + "logits/rejected": 711864640.0, + "logps/chosen": -342.28155517578125, + "logps/rejected": -548.5931396484375, + "loss": 0.089, + "rewards/chosen": 3.539989471435547, + "rewards/margins": 13.832216262817383, + "rewards/rejected": -10.292226791381836, + "step": 10279 + }, + { + "epoch": 0.9392416628597533, + "grad_norm": 37.75, + "kl": 0.0, + "learning_rate": 9.141500662891378e-08, + "logits/chosen": 645141650.2857143, + "logits/rejected": 190129008.0, + "logps/chosen": -256.76524135044644, + "logps/rejected": -234.03787231445312, + "loss": 0.1242, + "rewards/chosen": 2.4323131016322543, + "rewards/margins": 10.870026724679128, + "rewards/rejected": -8.437713623046875, + "step": 10280 + }, + { + "epoch": 0.9393330287802649, + "grad_norm": 2.96875, + "kl": 0.0, + "learning_rate": 9.114152962220734e-08, + "logits/chosen": 593752115.2, + "logits/rejected": 738862421.3333334, + "logps/chosen": -271.3359619140625, + "logps/rejected": -407.6295572916667, + "loss": 0.0222, + "rewards/chosen": 4.253182220458984, + "rewards/margins": 13.630457051595052, + "rewards/rejected": -9.377274831136068, + "step": 10281 + }, + { + "epoch": 0.9394243947007767, + "grad_norm": 4.65625, + "kl": 0.0, + "learning_rate": 9.086845853261417e-08, + "logits/chosen": 546874112.0, + "logits/rejected": 511891200.0, + "logps/chosen": -389.443017578125, + "logps/rejected": -520.3802897135416, + "loss": 0.023, + "rewards/chosen": 3.7362895965576173, + "rewards/margins": 12.685211435953775, + "rewards/rejected": -8.948921839396158, + "step": 10282 + }, + { + "epoch": 0.9395157606212883, + "grad_norm": 3.359375, + "kl": 0.0, + "learning_rate": 9.059579338271618e-08, + "logits/chosen": 514607648.0, + "logits/rejected": 414625856.0, + "logps/chosen": -309.75018310546875, + "logps/rejected": -454.851806640625, + "loss": 0.0254, + "rewards/chosen": 3.2430877685546875, + "rewards/margins": 11.636103630065918, + "rewards/rejected": -8.39301586151123, + "step": 10283 + }, + { + "epoch": 0.9396071265417999, + "grad_norm": 1.875, + "kl": 0.0, + "learning_rate": 9.032353419505923e-08, + "logits/chosen": 471816490.6666667, + "logits/rejected": 1456369920.0, + "logps/chosen": -267.7984212239583, + "logps/rejected": -541.7902221679688, + "loss": 0.0145, + "rewards/chosen": 4.352184295654297, + "rewards/margins": 15.541197776794434, + "rewards/rejected": -11.189013481140137, + "step": 10284 + }, + { + "epoch": 0.9396984924623115, + "grad_norm": 1.34375, + "kl": 0.0, + "learning_rate": 9.0051680992157e-08, + "logits/chosen": 645168256.0, + "logits/rejected": 520360064.0, + "logps/chosen": -244.3028564453125, + "logps/rejected": -423.23138427734375, + "loss": 0.0088, + "rewards/chosen": 4.160290241241455, + "rewards/margins": 13.588294506072998, + "rewards/rejected": -9.428004264831543, + "step": 10285 + }, + { + "epoch": 0.9397898583828233, + "grad_norm": 3.078125, + "kl": 0.0, + "learning_rate": 8.97802337964887e-08, + "logits/chosen": 497597888.0, + "logits/rejected": 642338368.0, + "logps/chosen": -357.7943115234375, + "logps/rejected": -314.33685302734375, + "loss": 0.0182, + "rewards/chosen": 3.6630699634552, + "rewards/margins": 12.216248750686646, + "rewards/rejected": -8.553178787231445, + "step": 10286 + }, + { + "epoch": 0.9398812243033349, + "grad_norm": 2.78125, + "kl": 0.0, + "learning_rate": 8.950919263050084e-08, + "logits/chosen": 499796960.0, + "logits/rejected": 402032448.0, + "logps/chosen": -248.7033233642578, + "logps/rejected": -286.54071044921875, + "loss": 0.0217, + "rewards/chosen": 3.3554673194885254, + "rewards/margins": 13.229227542877197, + "rewards/rejected": -9.873760223388672, + "step": 10287 + }, + { + "epoch": 0.9399725902238465, + "grad_norm": 3.546875, + "kl": 0.0, + "learning_rate": 8.923855751660603e-08, + "logits/chosen": 904379989.3333334, + "logits/rejected": 751529113.6, + "logps/chosen": -308.89381917317706, + "logps/rejected": -530.1080078125, + "loss": 0.0201, + "rewards/chosen": 3.4035466512044272, + "rewards/margins": 12.214847310384116, + "rewards/rejected": -8.811300659179688, + "step": 10288 + }, + { + "epoch": 0.9400639561443581, + "grad_norm": 1.375, + "kl": 0.0, + "learning_rate": 8.896832847718361e-08, + "logits/chosen": 452242048.0, + "logits/rejected": 766146560.0, + "logps/chosen": -211.82733154296875, + "logps/rejected": -473.7909458705357, + "loss": 0.0056, + "rewards/chosen": 3.0676636695861816, + "rewards/margins": 13.300393990107946, + "rewards/rejected": -10.232730320521764, + "step": 10289 + }, + { + "epoch": 0.9401553220648698, + "grad_norm": 1.2265625, + "kl": 0.0, + "learning_rate": 8.869850553457903e-08, + "logits/chosen": 241053808.0, + "logits/rejected": 531185408.0, + "logps/chosen": -170.51129150390625, + "logps/rejected": -599.69384765625, + "loss": 0.0077, + "rewards/chosen": 3.85078763961792, + "rewards/margins": 13.145058472951254, + "rewards/rejected": -9.294270833333334, + "step": 10290 + }, + { + "epoch": 0.9402466879853815, + "grad_norm": 1.84375, + "kl": 0.0, + "learning_rate": 8.842908871110334e-08, + "logits/chosen": 561220032.0, + "logits/rejected": 606861952.0, + "logps/chosen": -132.91336059570312, + "logps/rejected": -531.2943929036459, + "loss": 0.0164, + "rewards/chosen": 2.8100998401641846, + "rewards/margins": 11.839362382888794, + "rewards/rejected": -9.02926254272461, + "step": 10291 + }, + { + "epoch": 0.9403380539058931, + "grad_norm": 1.6796875, + "kl": 0.0, + "learning_rate": 8.816007802903537e-08, + "logits/chosen": 702640742.4, + "logits/rejected": 714269568.0, + "logps/chosen": -358.7220458984375, + "logps/rejected": -759.5311686197916, + "loss": 0.0115, + "rewards/chosen": 4.43755874633789, + "rewards/margins": 15.609761810302734, + "rewards/rejected": -11.172203063964844, + "step": 10292 + }, + { + "epoch": 0.9404294198264047, + "grad_norm": 4.71875, + "kl": 0.0, + "learning_rate": 8.789147351062011e-08, + "logits/chosen": 436298666.6666667, + "logits/rejected": 517262624.0, + "logps/chosen": -312.6780192057292, + "logps/rejected": -664.501953125, + "loss": 0.0389, + "rewards/chosen": 3.373323440551758, + "rewards/margins": 12.792337417602539, + "rewards/rejected": -9.419013977050781, + "step": 10293 + }, + { + "epoch": 0.9405207857469164, + "grad_norm": 0.2353515625, + "kl": 0.0, + "learning_rate": 8.762327517806756e-08, + "logits/rejected": 298445920.0, + "logps/rejected": -495.9443054199219, + "loss": 0.001, + "rewards/rejected": -9.917168617248535, + "step": 10294 + }, + { + "epoch": 0.9406121516674281, + "grad_norm": 4.75, + "kl": 0.0, + "learning_rate": 8.735548305355613e-08, + "logits/chosen": 305279136.0, + "logits/rejected": 468527616.0, + "logps/chosen": -344.415283203125, + "logps/rejected": -451.6621398925781, + "loss": 0.0332, + "rewards/chosen": 2.681195020675659, + "rewards/margins": 11.637224912643433, + "rewards/rejected": -8.956029891967773, + "step": 10295 + }, + { + "epoch": 0.9407035175879397, + "grad_norm": 6.21875, + "kl": 0.0, + "learning_rate": 8.708809715922973e-08, + "logits/chosen": 649796949.3333334, + "logits/rejected": 673215488.0, + "logps/chosen": -224.97843424479166, + "logps/rejected": -254.578369140625, + "loss": 0.0531, + "rewards/chosen": 3.390373865763346, + "rewards/margins": 12.07961908976237, + "rewards/rejected": -8.689245223999023, + "step": 10296 + }, + { + "epoch": 0.9407948835084513, + "grad_norm": 4.4375, + "kl": 0.0, + "learning_rate": 8.682111751719901e-08, + "logits/chosen": 303263584.0, + "logits/rejected": 548140672.0, + "logps/chosen": -239.19387817382812, + "logps/rejected": -612.23046875, + "loss": 0.0422, + "rewards/chosen": 2.4441165924072266, + "rewards/margins": 9.863362789154053, + "rewards/rejected": -7.419246196746826, + "step": 10297 + }, + { + "epoch": 0.940886249428963, + "grad_norm": 3.359375, + "kl": 0.0, + "learning_rate": 8.655454414953968e-08, + "logits/chosen": 427361322.6666667, + "logits/rejected": 353265824.0, + "logps/chosen": -349.3048095703125, + "logps/rejected": -366.129150390625, + "loss": 0.0236, + "rewards/chosen": 4.038356781005859, + "rewards/margins": 9.835390567779541, + "rewards/rejected": -5.797033786773682, + "step": 10298 + }, + { + "epoch": 0.9409776153494747, + "grad_norm": 5.84375, + "kl": 0.0, + "learning_rate": 8.628837707829519e-08, + "logits/chosen": 447958528.0, + "logits/rejected": 493866368.0, + "logps/chosen": -261.8668212890625, + "logps/rejected": -503.181396484375, + "loss": 0.0114, + "rewards/chosen": 4.4871649742126465, + "rewards/margins": 13.8367018699646, + "rewards/rejected": -9.349536895751953, + "step": 10299 + }, + { + "epoch": 0.9410689812699863, + "grad_norm": 3.921875, + "kl": 0.0, + "learning_rate": 8.602261632547626e-08, + "logits/chosen": 514805760.0, + "logits/rejected": 316998528.0, + "logps/chosen": -264.916162109375, + "logps/rejected": -275.6317545572917, + "loss": 0.0224, + "rewards/chosen": 3.570104217529297, + "rewards/margins": 10.734288787841797, + "rewards/rejected": -7.1641845703125, + "step": 10300 + }, + { + "epoch": 0.9411603471904979, + "grad_norm": 4.125, + "kl": 0.0, + "learning_rate": 8.575726191305755e-08, + "logits/chosen": 563204454.4, + "logits/rejected": 927209642.6666666, + "logps/chosen": -343.9551025390625, + "logps/rejected": -569.9302978515625, + "loss": 0.0146, + "rewards/chosen": 4.350249481201172, + "rewards/margins": 12.665846379597983, + "rewards/rejected": -8.31559689839681, + "step": 10301 + }, + { + "epoch": 0.9412517131110096, + "grad_norm": 2.65625, + "kl": 0.0, + "learning_rate": 8.549231386298151e-08, + "logits/chosen": 552536064.0, + "logits/rejected": 447104554.6666667, + "logps/chosen": -482.1186218261719, + "logps/rejected": -599.1923421223959, + "loss": 0.0113, + "rewards/chosen": 3.0655150413513184, + "rewards/margins": 12.545520941416422, + "rewards/rejected": -9.480005900065104, + "step": 10302 + }, + { + "epoch": 0.9413430790315213, + "grad_norm": 3.203125, + "kl": 0.0, + "learning_rate": 8.522777219715728e-08, + "logits/chosen": 517561258.6666667, + "logits/rejected": 778399808.0, + "logps/chosen": -343.9092610677083, + "logps/rejected": -518.7225952148438, + "loss": 0.0225, + "rewards/chosen": 3.949343681335449, + "rewards/margins": 14.14039134979248, + "rewards/rejected": -10.191047668457031, + "step": 10303 + }, + { + "epoch": 0.9414344449520329, + "grad_norm": 2.390625, + "kl": 0.0, + "learning_rate": 8.496363693746013e-08, + "logits/chosen": 607449386.6666666, + "logits/rejected": 285091891.2, + "logps/chosen": -289.1609700520833, + "logps/rejected": -342.6413818359375, + "loss": 0.0123, + "rewards/chosen": 4.065708160400391, + "rewards/margins": 13.427997589111328, + "rewards/rejected": -9.362289428710938, + "step": 10304 + }, + { + "epoch": 0.9415258108725445, + "grad_norm": 2.875, + "kl": 0.0, + "learning_rate": 8.469990810573148e-08, + "logits/chosen": 797818965.3333334, + "logits/rejected": 706516224.0, + "logps/chosen": -488.0214436848958, + "logps/rejected": -524.8414916992188, + "loss": 0.0215, + "rewards/chosen": 4.008074442545573, + "rewards/margins": 12.783623377482098, + "rewards/rejected": -8.775548934936523, + "step": 10305 + }, + { + "epoch": 0.9416171767930562, + "grad_norm": 54.5, + "kl": 0.0, + "learning_rate": 8.443658572377889e-08, + "logits/chosen": 649793664.0, + "logits/rejected": 600321462.8571428, + "logps/chosen": -514.20654296875, + "logps/rejected": -396.26450892857144, + "loss": 0.0859, + "rewards/chosen": 3.43536376953125, + "rewards/margins": 12.322964804513115, + "rewards/rejected": -8.887601034981865, + "step": 10306 + }, + { + "epoch": 0.9417085427135679, + "grad_norm": 4.4375, + "kl": 0.0, + "learning_rate": 8.417366981337715e-08, + "logits/chosen": 592934400.0, + "logits/rejected": 417137664.0, + "logps/chosen": -451.57265625, + "logps/rejected": -451.5795084635417, + "loss": 0.0156, + "rewards/chosen": 4.514923095703125, + "rewards/margins": 10.695851008097332, + "rewards/rejected": -6.180927912394206, + "step": 10307 + }, + { + "epoch": 0.9417999086340795, + "grad_norm": 2.859375, + "kl": 0.0, + "learning_rate": 8.391116039626668e-08, + "logits/chosen": 720072089.6, + "logits/rejected": 574558250.6666666, + "logps/chosen": -486.392431640625, + "logps/rejected": -252.30586751302084, + "loss": 0.0153, + "rewards/chosen": 3.855612945556641, + "rewards/margins": 11.829449208577474, + "rewards/rejected": -7.973836263020833, + "step": 10308 + }, + { + "epoch": 0.9418912745545911, + "grad_norm": 1.0546875, + "kl": 0.0, + "learning_rate": 8.364905749415453e-08, + "logits/chosen": 492185760.0, + "logits/rejected": 317700704.0, + "logps/chosen": -298.58465576171875, + "logps/rejected": -389.7314453125, + "loss": 0.007, + "rewards/chosen": 4.517012596130371, + "rewards/margins": 13.992056846618652, + "rewards/rejected": -9.475044250488281, + "step": 10309 + }, + { + "epoch": 0.9419826404751028, + "grad_norm": 2.765625, + "kl": 0.0, + "learning_rate": 8.338736112871448e-08, + "logits/chosen": 478346605.71428573, + "logits/rejected": 264761248.0, + "logps/chosen": -361.10362025669644, + "logps/rejected": -539.4414672851562, + "loss": 0.0337, + "rewards/chosen": 3.838689531598772, + "rewards/margins": 17.03638240269252, + "rewards/rejected": -13.19769287109375, + "step": 10310 + }, + { + "epoch": 0.9420740063956144, + "grad_norm": 3.625, + "kl": 0.0, + "learning_rate": 8.312607132158645e-08, + "logits/chosen": 647417446.4, + "logits/rejected": 503241557.3333333, + "logps/chosen": -372.0864990234375, + "logps/rejected": -415.18212890625, + "loss": 0.0261, + "rewards/chosen": 3.335139846801758, + "rewards/margins": 11.822455978393554, + "rewards/rejected": -8.487316131591797, + "step": 10311 + }, + { + "epoch": 0.9421653723161261, + "grad_norm": 0.5234375, + "kl": 0.0, + "learning_rate": 8.286518809437594e-08, + "logits/chosen": 384409088.0, + "logits/rejected": 590404608.0, + "logps/chosen": -343.2052307128906, + "logps/rejected": -531.7143961588541, + "loss": 0.0022, + "rewards/chosen": 5.291550636291504, + "rewards/margins": 14.508945147196451, + "rewards/rejected": -9.217394510904947, + "step": 10312 + }, + { + "epoch": 0.9422567382366377, + "grad_norm": 3.046875, + "kl": 0.0, + "learning_rate": 8.260471146865678e-08, + "logits/chosen": 430823104.0, + "logits/rejected": 502356032.0, + "logps/chosen": -239.0015869140625, + "logps/rejected": -567.7539672851562, + "loss": 0.0138, + "rewards/chosen": 4.119137763977051, + "rewards/margins": 15.15485668182373, + "rewards/rejected": -11.03571891784668, + "step": 10313 + }, + { + "epoch": 0.9423481041571494, + "grad_norm": 0.87109375, + "kl": 0.0, + "learning_rate": 8.234464146596677e-08, + "logits/chosen": 595538176.0, + "logits/rejected": 687374933.3333334, + "logps/chosen": -417.7834167480469, + "logps/rejected": -636.4007568359375, + "loss": 0.0039, + "rewards/chosen": 4.1482696533203125, + "rewards/margins": 15.286036173502604, + "rewards/rejected": -11.137766520182291, + "step": 10314 + }, + { + "epoch": 0.942439470077661, + "grad_norm": 5.625, + "kl": 0.0, + "learning_rate": 8.208497810781202e-08, + "logits/chosen": 711057706.6666666, + "logits/rejected": 936515392.0, + "logps/chosen": -357.3473714192708, + "logps/rejected": -403.7683410644531, + "loss": 0.0389, + "rewards/chosen": 3.5301602681477866, + "rewards/margins": 13.237772305806478, + "rewards/rejected": -9.707612037658691, + "step": 10315 + }, + { + "epoch": 0.9425308359981727, + "grad_norm": 2.71875, + "kl": 0.0, + "learning_rate": 8.18257214156637e-08, + "logits/chosen": 449360832.0, + "logits/rejected": 412634176.0, + "logps/chosen": -359.56451416015625, + "logps/rejected": -597.6641845703125, + "loss": 0.0178, + "rewards/chosen": 3.6630501747131348, + "rewards/margins": 13.310805797576904, + "rewards/rejected": -9.64775562286377, + "step": 10316 + }, + { + "epoch": 0.9426222019186843, + "grad_norm": 3.328125, + "kl": 0.0, + "learning_rate": 8.156687141096131e-08, + "logits/chosen": 690752128.0, + "logits/rejected": 456368448.0, + "logps/chosen": -257.0520324707031, + "logps/rejected": -348.9740905761719, + "loss": 0.0194, + "rewards/chosen": 4.078251838684082, + "rewards/margins": 12.213528633117676, + "rewards/rejected": -8.135276794433594, + "step": 10317 + }, + { + "epoch": 0.942713567839196, + "grad_norm": 16.125, + "kl": 0.0, + "learning_rate": 8.130842811510775e-08, + "logits/chosen": 661564501.3333334, + "logits/rejected": 416422092.8, + "logps/chosen": -392.004150390625, + "logps/rejected": -451.36396484375, + "loss": 0.0219, + "rewards/chosen": 3.8237152099609375, + "rewards/margins": 12.06276626586914, + "rewards/rejected": -8.239051055908202, + "step": 10318 + }, + { + "epoch": 0.9428049337597076, + "grad_norm": 1.75, + "kl": 0.0, + "learning_rate": 8.10503915494748e-08, + "logits/chosen": 778244900.5714285, + "logits/rejected": 413478016.0, + "logps/chosen": -462.2095424107143, + "logps/rejected": -622.365966796875, + "loss": 0.0113, + "rewards/chosen": 4.515426090785435, + "rewards/margins": 14.096602848597936, + "rewards/rejected": -9.5811767578125, + "step": 10319 + }, + { + "epoch": 0.9428962996802193, + "grad_norm": 22.25, + "kl": 0.0, + "learning_rate": 8.079276173539874e-08, + "logits/chosen": 456980096.0, + "logits/rejected": 824606822.4, + "logps/chosen": -248.74381510416666, + "logps/rejected": -267.35078125, + "loss": 0.0837, + "rewards/chosen": 3.673598289489746, + "rewards/margins": 9.468270683288575, + "rewards/rejected": -5.7946723937988285, + "step": 10320 + }, + { + "epoch": 0.9429876656007309, + "grad_norm": 3.578125, + "kl": 0.0, + "learning_rate": 8.053553869418418e-08, + "logits/chosen": 464707754.6666667, + "logits/rejected": 345565760.0, + "logps/chosen": -311.73606363932294, + "logps/rejected": -331.9118347167969, + "loss": 0.0222, + "rewards/chosen": 3.8500022888183594, + "rewards/margins": 10.823956489562988, + "rewards/rejected": -6.973954200744629, + "step": 10321 + }, + { + "epoch": 0.9430790315212426, + "grad_norm": 1.53125, + "kl": 0.0, + "learning_rate": 8.027872244710078e-08, + "logits/chosen": 656621056.0, + "logits/rejected": 742580224.0, + "logps/chosen": -247.046826171875, + "logps/rejected": -807.4651692708334, + "loss": 0.0133, + "rewards/chosen": 4.481835556030274, + "rewards/margins": 13.416859817504882, + "rewards/rejected": -8.93502426147461, + "step": 10322 + }, + { + "epoch": 0.9431703974417542, + "grad_norm": 1.546875, + "kl": 0.0, + "learning_rate": 8.002231301538488e-08, + "logits/chosen": 693050026.6666666, + "logits/rejected": 964955648.0, + "logps/chosen": -425.3968098958333, + "logps/rejected": -691.64501953125, + "loss": 0.0095, + "rewards/chosen": 3.703251520792643, + "rewards/margins": 13.235606257120768, + "rewards/rejected": -9.532354736328125, + "step": 10323 + }, + { + "epoch": 0.9432617633622659, + "grad_norm": 17.5, + "kl": 4.7388763427734375, + "learning_rate": 7.976631042023842e-08, + "logits/chosen": 559683029.3333334, + "logits/rejected": 1328988928.0, + "logps/chosen": -274.459716796875, + "logps/rejected": -398.8894348144531, + "loss": 0.1357, + "rewards/chosen": 3.394926389058431, + "rewards/margins": 8.575561364491781, + "rewards/rejected": -5.18063497543335, + "step": 10324 + }, + { + "epoch": 0.9433531292827775, + "grad_norm": 17.25, + "kl": 0.0, + "learning_rate": 7.951071468283166e-08, + "logits/chosen": 547313600.0, + "logits/rejected": 298691200.0, + "logps/chosen": -392.1104736328125, + "logps/rejected": -503.1016540527344, + "loss": 0.0121, + "rewards/chosen": 4.19455623626709, + "rewards/margins": 16.770742416381836, + "rewards/rejected": -12.576186180114746, + "step": 10325 + }, + { + "epoch": 0.9434444952032892, + "grad_norm": 3.625, + "kl": 0.0, + "learning_rate": 7.92555258242994e-08, + "logits/chosen": 642256640.0, + "logits/rejected": 540725888.0, + "logps/chosen": -377.408935546875, + "logps/rejected": -515.339111328125, + "loss": 0.0315, + "rewards/chosen": 3.2793585459391275, + "rewards/margins": 12.763004938761393, + "rewards/rejected": -9.483646392822266, + "step": 10326 + }, + { + "epoch": 0.9435358611238008, + "grad_norm": 2.328125, + "kl": 0.0, + "learning_rate": 7.900074386574363e-08, + "logits/chosen": 1086646272.0, + "logits/rejected": 532008499.2, + "logps/chosen": -212.60919189453125, + "logps/rejected": -427.715283203125, + "loss": 0.0108, + "rewards/chosen": 4.2189896901448565, + "rewards/margins": 12.91431058247884, + "rewards/rejected": -8.695320892333985, + "step": 10327 + }, + { + "epoch": 0.9436272270443125, + "grad_norm": 1.3203125, + "kl": 0.0, + "learning_rate": 7.87463688282325e-08, + "logits/chosen": 797564160.0, + "logits/rejected": 1095793766.4, + "logps/chosen": -489.72998046875, + "logps/rejected": -658.81611328125, + "loss": 0.0075, + "rewards/chosen": 4.0280609130859375, + "rewards/margins": 12.83013458251953, + "rewards/rejected": -8.802073669433593, + "step": 10328 + }, + { + "epoch": 0.9437185929648241, + "grad_norm": 1.7578125, + "kl": 0.0, + "learning_rate": 7.849240073279918e-08, + "logits/chosen": 760118528.0, + "logits/rejected": 487687526.4, + "logps/chosen": -374.1163736979167, + "logps/rejected": -460.234765625, + "loss": 0.0076, + "rewards/chosen": 4.726975758870442, + "rewards/margins": 12.94686559041341, + "rewards/rejected": -8.219889831542968, + "step": 10329 + }, + { + "epoch": 0.9438099588853358, + "grad_norm": 1.2578125, + "kl": 0.0, + "learning_rate": 7.823883960044632e-08, + "logits/chosen": 520015701.3333333, + "logits/rejected": 272723328.0, + "logps/chosen": -319.2068277994792, + "logps/rejected": -273.33612060546875, + "loss": 0.0133, + "rewards/chosen": 4.17592175801595, + "rewards/margins": 11.587036450703938, + "rewards/rejected": -7.411114692687988, + "step": 10330 + }, + { + "epoch": 0.9439013248058474, + "grad_norm": 4.9375, + "kl": 0.0, + "learning_rate": 7.798568545214047e-08, + "logits/chosen": 344774144.0, + "logits/rejected": 346145749.3333333, + "logps/chosen": -193.3000244140625, + "logps/rejected": -449.7742513020833, + "loss": 0.0518, + "rewards/chosen": 3.0234664916992187, + "rewards/margins": 12.081336085001627, + "rewards/rejected": -9.057869593302408, + "step": 10331 + }, + { + "epoch": 0.943992690726359, + "grad_norm": 1.3515625, + "kl": 0.0, + "learning_rate": 7.77329383088149e-08, + "logits/chosen": 496208704.0, + "logits/rejected": 520492128.0, + "logps/chosen": -388.5949401855469, + "logps/rejected": -466.8388671875, + "loss": 0.0075, + "rewards/chosen": 4.489526748657227, + "rewards/margins": 13.524921417236328, + "rewards/rejected": -9.035394668579102, + "step": 10332 + }, + { + "epoch": 0.9440840566468707, + "grad_norm": 2.453125, + "kl": 0.0, + "learning_rate": 7.748059819136843e-08, + "logits/chosen": 775334912.0, + "logits/rejected": 1522365824.0, + "logps/chosen": -354.230712890625, + "logps/rejected": -438.67535400390625, + "loss": 0.019, + "rewards/chosen": 3.8587093353271484, + "rewards/margins": 13.973361015319824, + "rewards/rejected": -10.114651679992676, + "step": 10333 + }, + { + "epoch": 0.9441754225673824, + "grad_norm": 3.625, + "kl": 0.0, + "learning_rate": 7.72286651206694e-08, + "logits/chosen": 517129164.8, + "logits/rejected": 292646506.6666667, + "logps/chosen": -372.0580322265625, + "logps/rejected": -381.8040771484375, + "loss": 0.0216, + "rewards/chosen": 3.5624015808105467, + "rewards/margins": 13.141159820556641, + "rewards/rejected": -9.578758239746094, + "step": 10334 + }, + { + "epoch": 0.944266788487894, + "grad_norm": 2.203125, + "kl": 0.0, + "learning_rate": 7.697713911754945e-08, + "logits/chosen": 491534336.0, + "logits/rejected": 341789056.0, + "logps/chosen": -375.990234375, + "logps/rejected": -391.48577880859375, + "loss": 0.0136, + "rewards/chosen": 3.999702215194702, + "rewards/margins": 12.279470205307007, + "rewards/rejected": -8.279767990112305, + "step": 10335 + }, + { + "epoch": 0.9443581544084056, + "grad_norm": 2.359375, + "kl": 0.0, + "learning_rate": 7.672602020280695e-08, + "logits/chosen": 1275013376.0, + "logits/rejected": 504381024.0, + "logps/chosen": -388.9017333984375, + "logps/rejected": -353.5894775390625, + "loss": 0.0135, + "rewards/chosen": 4.204194068908691, + "rewards/margins": 14.699263572692871, + "rewards/rejected": -10.49506950378418, + "step": 10336 + }, + { + "epoch": 0.9444495203289173, + "grad_norm": 1.234375, + "kl": 0.0, + "learning_rate": 7.647530839720696e-08, + "logits/chosen": 531088512.0, + "logits/rejected": 538239104.0, + "logps/chosen": -372.0782165527344, + "logps/rejected": -455.18597412109375, + "loss": 0.0076, + "rewards/chosen": 4.352055072784424, + "rewards/margins": 13.361464023590088, + "rewards/rejected": -9.009408950805664, + "step": 10337 + }, + { + "epoch": 0.944540886249429, + "grad_norm": 2.984375, + "kl": 0.0, + "learning_rate": 7.622500372148179e-08, + "logits/chosen": 1274457216.0, + "logits/rejected": 1219196672.0, + "logps/chosen": -199.251708984375, + "logps/rejected": -518.3419596354166, + "loss": 0.0113, + "rewards/chosen": 3.36715030670166, + "rewards/margins": 12.378350893656412, + "rewards/rejected": -9.011200586954752, + "step": 10338 + }, + { + "epoch": 0.9446322521699406, + "grad_norm": 2.046875, + "kl": 0.0, + "learning_rate": 7.597510619632931e-08, + "logits/chosen": 881734912.0, + "logits/rejected": 506878912.0, + "logps/chosen": -455.6356201171875, + "logps/rejected": -451.57147216796875, + "loss": 0.0132, + "rewards/chosen": 3.7435126304626465, + "rewards/margins": 14.480058193206787, + "rewards/rejected": -10.73654556274414, + "step": 10339 + }, + { + "epoch": 0.9447236180904522, + "grad_norm": 2.6875, + "kl": 0.0, + "learning_rate": 7.572561584241355e-08, + "logits/chosen": 615387818.6666666, + "logits/rejected": 440510412.8, + "logps/chosen": -340.8114013671875, + "logps/rejected": -323.915234375, + "loss": 0.0162, + "rewards/chosen": 3.108626365661621, + "rewards/margins": 12.126064109802247, + "rewards/rejected": -9.017437744140626, + "step": 10340 + }, + { + "epoch": 0.9448149840109639, + "grad_norm": 1.6015625, + "kl": 0.0, + "learning_rate": 7.54765326803647e-08, + "logits/chosen": 766159445.3333334, + "logits/rejected": 447760640.0, + "logps/chosen": -457.9818522135417, + "logps/rejected": -525.6236328125, + "loss": 0.0085, + "rewards/chosen": 3.8658936818440757, + "rewards/margins": 13.252140935262045, + "rewards/rejected": -9.386247253417968, + "step": 10341 + }, + { + "epoch": 0.9449063499314756, + "grad_norm": 1.6640625, + "kl": 0.0, + "learning_rate": 7.522785673077959e-08, + "logits/chosen": 1070123008.0, + "logits/rejected": 486185062.4, + "logps/chosen": -272.228515625, + "logps/rejected": -360.81328125, + "loss": 0.0069, + "rewards/chosen": 4.449185689290364, + "rewards/margins": 13.487463887532552, + "rewards/rejected": -9.038278198242187, + "step": 10342 + }, + { + "epoch": 0.9449977158519872, + "grad_norm": 5.75, + "kl": 0.0, + "learning_rate": 7.497958801422178e-08, + "logits/chosen": 297347520.0, + "logits/rejected": 331603712.0, + "logps/chosen": -216.6551055908203, + "logps/rejected": -559.5035807291666, + "loss": 0.0158, + "rewards/chosen": 4.36761474609375, + "rewards/margins": 14.743029276529947, + "rewards/rejected": -10.375414530436197, + "step": 10343 + }, + { + "epoch": 0.9450890817724988, + "grad_norm": 2.953125, + "kl": 0.0, + "learning_rate": 7.473172655122152e-08, + "logits/chosen": 428456396.8, + "logits/rejected": 526603136.0, + "logps/chosen": -377.45869140625, + "logps/rejected": -451.8128255208333, + "loss": 0.018, + "rewards/chosen": 4.010537719726562, + "rewards/margins": 11.069666798909505, + "rewards/rejected": -7.059129079182942, + "step": 10344 + }, + { + "epoch": 0.9451804476930105, + "grad_norm": 3.359375, + "kl": 0.0, + "learning_rate": 7.448427236227296e-08, + "logits/chosen": 454308044.8, + "logits/rejected": 501150805.3333333, + "logps/chosen": -364.887255859375, + "logps/rejected": -449.78955078125, + "loss": 0.0203, + "rewards/chosen": 3.554266357421875, + "rewards/margins": 13.06782251993815, + "rewards/rejected": -9.513556162516275, + "step": 10345 + }, + { + "epoch": 0.9452718136135222, + "grad_norm": 0.81640625, + "kl": 0.0, + "learning_rate": 7.423722546783918e-08, + "logits/chosen": 417183488.0, + "logits/rejected": 436863488.0, + "logps/chosen": -114.04367065429688, + "logps/rejected": -486.3048909505208, + "loss": 0.0036, + "rewards/chosen": 4.440779685974121, + "rewards/margins": 13.695555686950684, + "rewards/rejected": -9.254776000976562, + "step": 10346 + }, + { + "epoch": 0.9453631795340338, + "grad_norm": 2.921875, + "kl": 0.0, + "learning_rate": 7.399058588834884e-08, + "logits/chosen": 601048704.0, + "logits/rejected": 641072192.0, + "logps/chosen": -189.2323455810547, + "logps/rejected": -357.0097351074219, + "loss": 0.02, + "rewards/chosen": 3.4471259117126465, + "rewards/margins": 11.53761625289917, + "rewards/rejected": -8.090490341186523, + "step": 10347 + }, + { + "epoch": 0.9454545454545454, + "grad_norm": 2.75, + "kl": 0.0, + "learning_rate": 7.374435364419675e-08, + "logits/chosen": 429495893.3333333, + "logits/rejected": 515058380.8, + "logps/chosen": -346.522216796875, + "logps/rejected": -558.6794921875, + "loss": 0.0116, + "rewards/chosen": 3.902320226033529, + "rewards/margins": 12.383493169148764, + "rewards/rejected": -8.481172943115235, + "step": 10348 + }, + { + "epoch": 0.9455459113750571, + "grad_norm": 3.71875, + "kl": 0.0, + "learning_rate": 7.349852875574381e-08, + "logits/chosen": 814520576.0, + "logits/rejected": 864631936.0, + "logps/chosen": -391.81732177734375, + "logps/rejected": -639.5665283203125, + "loss": 0.0266, + "rewards/chosen": 3.313013792037964, + "rewards/margins": 12.167597532272339, + "rewards/rejected": -8.854583740234375, + "step": 10349 + }, + { + "epoch": 0.9456372772955688, + "grad_norm": 18.75, + "kl": 0.0, + "learning_rate": 7.325311124331713e-08, + "logits/chosen": 441101344.0, + "logits/rejected": 411038176.0, + "logps/chosen": -174.620361328125, + "logps/rejected": -375.8890686035156, + "loss": 0.0389, + "rewards/chosen": 3.1053013801574707, + "rewards/margins": 11.09489393234253, + "rewards/rejected": -7.989592552185059, + "step": 10350 + }, + { + "epoch": 0.9457286432160804, + "grad_norm": 1.359375, + "kl": 0.0, + "learning_rate": 7.300810112721102e-08, + "logits/chosen": 688358912.0, + "logits/rejected": 911829333.3333334, + "logps/chosen": -433.82529296875, + "logps/rejected": -590.545166015625, + "loss": 0.0094, + "rewards/chosen": 4.396434020996094, + "rewards/margins": 16.582176717122397, + "rewards/rejected": -12.185742696126303, + "step": 10351 + }, + { + "epoch": 0.945820009136592, + "grad_norm": 1.21875, + "kl": 0.0, + "learning_rate": 7.276349842768538e-08, + "logits/chosen": 689996288.0, + "logits/rejected": 1077872896.0, + "logps/chosen": -230.8128204345703, + "logps/rejected": -680.4046630859375, + "loss": 0.0095, + "rewards/chosen": 4.012641906738281, + "rewards/margins": 12.348285675048828, + "rewards/rejected": -8.335643768310547, + "step": 10352 + }, + { + "epoch": 0.9459113750571037, + "grad_norm": 1.0234375, + "kl": 0.0, + "learning_rate": 7.25193031649668e-08, + "logits/chosen": 335216992.0, + "logits/rejected": 618452672.0, + "logps/chosen": -265.3132629394531, + "logps/rejected": -489.9245300292969, + "loss": 0.0071, + "rewards/chosen": 4.449484348297119, + "rewards/margins": 14.875044345855713, + "rewards/rejected": -10.425559997558594, + "step": 10353 + }, + { + "epoch": 0.9460027409776154, + "grad_norm": 4.84375, + "kl": 0.0, + "learning_rate": 7.227551535924748e-08, + "logits/chosen": 402134323.2, + "logits/rejected": 326762560.0, + "logps/chosen": -305.462939453125, + "logps/rejected": -346.5670979817708, + "loss": 0.1384, + "rewards/chosen": 2.096497917175293, + "rewards/margins": 10.671772066752116, + "rewards/rejected": -8.575274149576822, + "step": 10354 + }, + { + "epoch": 0.946094106898127, + "grad_norm": 4.15625, + "kl": 0.0, + "learning_rate": 7.203213503068629e-08, + "logits/chosen": 893218304.0, + "logits/rejected": 520588096.0, + "logps/chosen": -444.72650146484375, + "logps/rejected": -426.3790283203125, + "loss": 0.0249, + "rewards/chosen": 3.111515998840332, + "rewards/margins": 12.69480037689209, + "rewards/rejected": -9.583284378051758, + "step": 10355 + }, + { + "epoch": 0.9461854728186386, + "grad_norm": 1.1953125, + "kl": 0.0, + "learning_rate": 7.178916219940934e-08, + "logits/chosen": 659114752.0, + "logits/rejected": 614549094.4, + "logps/chosen": -340.6770833333333, + "logps/rejected": -382.58603515625, + "loss": 0.0066, + "rewards/chosen": 4.040143330891927, + "rewards/margins": 12.985481770833331, + "rewards/rejected": -8.945338439941406, + "step": 10356 + }, + { + "epoch": 0.9462768387391503, + "grad_norm": 7.34375, + "kl": 0.0, + "learning_rate": 7.15465968855078e-08, + "logits/chosen": 673229516.8, + "logits/rejected": 489913770.6666667, + "logps/chosen": -257.031005859375, + "logps/rejected": -533.7646891276041, + "loss": 0.0467, + "rewards/chosen": 3.4493331909179688, + "rewards/margins": 13.10805638631185, + "rewards/rejected": -9.65872319539388, + "step": 10357 + }, + { + "epoch": 0.946368204659662, + "grad_norm": 3.125, + "kl": 0.0, + "learning_rate": 7.13044391090395e-08, + "logits/chosen": 626769152.0, + "logits/rejected": 489780576.0, + "logps/chosen": -251.14436848958334, + "logps/rejected": -456.9241027832031, + "loss": 0.019, + "rewards/chosen": 3.9540888468424478, + "rewards/margins": 12.744436899820963, + "rewards/rejected": -8.790348052978516, + "step": 10358 + }, + { + "epoch": 0.9464595705801736, + "grad_norm": 1.6015625, + "kl": 0.0, + "learning_rate": 7.106268889002899e-08, + "logits/chosen": 818660010.6666666, + "logits/rejected": 675604019.2, + "logps/chosen": -430.8936767578125, + "logps/rejected": -488.141552734375, + "loss": 0.0077, + "rewards/chosen": 4.232746124267578, + "rewards/margins": 14.390244293212891, + "rewards/rejected": -10.157498168945313, + "step": 10359 + }, + { + "epoch": 0.9465509365006852, + "grad_norm": 3.703125, + "kl": 0.0, + "learning_rate": 7.082134624846582e-08, + "logits/chosen": 586524928.0, + "logits/rejected": 668485568.0, + "logps/chosen": -314.13438197544644, + "logps/rejected": -844.0791625976562, + "loss": 0.0289, + "rewards/chosen": 3.6361754281180247, + "rewards/margins": 12.276147161211286, + "rewards/rejected": -8.639971733093262, + "step": 10360 + }, + { + "epoch": 0.9466423024211968, + "grad_norm": 2.671875, + "kl": 0.0, + "learning_rate": 7.058041120430792e-08, + "logits/chosen": 488232874.6666667, + "logits/rejected": 389791641.6, + "logps/chosen": -403.8323567708333, + "logps/rejected": -446.737060546875, + "loss": 0.0129, + "rewards/chosen": 4.2212575276692705, + "rewards/margins": 12.051988728841145, + "rewards/rejected": -7.830731201171875, + "step": 10361 + }, + { + "epoch": 0.9467336683417086, + "grad_norm": 2.046875, + "kl": 0.0, + "learning_rate": 7.033988377747825e-08, + "logits/chosen": 1075194441.142857, + "logits/rejected": 544272640.0, + "logps/chosen": -400.8462611607143, + "logps/rejected": -494.9570007324219, + "loss": 0.0165, + "rewards/chosen": 4.052729742867606, + "rewards/margins": 13.64101995740618, + "rewards/rejected": -9.588290214538574, + "step": 10362 + }, + { + "epoch": 0.9468250342622202, + "grad_norm": 1.921875, + "kl": 0.0, + "learning_rate": 7.009976398786533e-08, + "logits/chosen": 628825728.0, + "logits/rejected": 1029820757.3333334, + "logps/chosen": -361.2696533203125, + "logps/rejected": -452.7270914713542, + "loss": 0.0077, + "rewards/chosen": 4.03955078125, + "rewards/margins": 11.892578125, + "rewards/rejected": -7.85302734375, + "step": 10363 + }, + { + "epoch": 0.9469164001827318, + "grad_norm": 0.80078125, + "kl": 0.0, + "learning_rate": 6.986005185532552e-08, + "logits/chosen": 293811946.6666667, + "logits/rejected": 384286003.2, + "logps/chosen": -246.8621826171875, + "logps/rejected": -473.476953125, + "loss": 0.0054, + "rewards/chosen": 4.642015139261882, + "rewards/margins": 13.59910208384196, + "rewards/rejected": -8.957086944580078, + "step": 10364 + }, + { + "epoch": 0.9470077661032434, + "grad_norm": 7.90625, + "kl": 0.0, + "learning_rate": 6.962074739968072e-08, + "logits/chosen": 445336512.0, + "logits/rejected": 870281152.0, + "logps/chosen": -240.097900390625, + "logps/rejected": -447.1423034667969, + "loss": 0.0359, + "rewards/chosen": 3.7866086959838867, + "rewards/margins": 11.011568069458008, + "rewards/rejected": -7.224959373474121, + "step": 10365 + }, + { + "epoch": 0.9470991320237552, + "grad_norm": 11.625, + "kl": 0.0, + "learning_rate": 6.9381850640719e-08, + "logits/chosen": 869254400.0, + "logits/rejected": 322436704.0, + "logps/chosen": -293.0177001953125, + "logps/rejected": -375.1788024902344, + "loss": 0.0213, + "rewards/chosen": 3.4711523056030273, + "rewards/margins": 13.135801315307617, + "rewards/rejected": -9.66464900970459, + "step": 10366 + }, + { + "epoch": 0.9471904979442668, + "grad_norm": 1.6015625, + "kl": 0.0, + "learning_rate": 6.914336159819513e-08, + "logits/chosen": 683377715.2, + "logits/rejected": 596967680.0, + "logps/chosen": -657.4197265625, + "logps/rejected": -581.3440755208334, + "loss": 0.0095, + "rewards/chosen": 4.273302459716797, + "rewards/margins": 13.250128173828125, + "rewards/rejected": -8.976825714111328, + "step": 10367 + }, + { + "epoch": 0.9472818638647784, + "grad_norm": 1.6328125, + "kl": 0.0, + "learning_rate": 6.890528029182942e-08, + "logits/chosen": 609074483.2, + "logits/rejected": 449705130.6666667, + "logps/chosen": -442.3568359375, + "logps/rejected": -580.5332845052084, + "loss": 0.0087, + "rewards/chosen": 4.949653625488281, + "rewards/margins": 17.642757415771484, + "rewards/rejected": -12.693103790283203, + "step": 10368 + }, + { + "epoch": 0.94737322978529, + "grad_norm": 2.578125, + "kl": 0.0, + "learning_rate": 6.866760674131002e-08, + "logits/chosen": 556403285.3333334, + "logits/rejected": 386573414.4, + "logps/chosen": -405.4077962239583, + "logps/rejected": -591.65859375, + "loss": 0.0131, + "rewards/chosen": 3.8727696736653647, + "rewards/margins": 14.066956075032552, + "rewards/rejected": -10.194186401367187, + "step": 10369 + }, + { + "epoch": 0.9474645957058018, + "grad_norm": 2.578125, + "kl": 0.0, + "learning_rate": 6.843034096629009e-08, + "logits/chosen": 659602752.0, + "logits/rejected": 712503765.3333334, + "logps/chosen": -337.3863830566406, + "logps/rejected": -378.9645182291667, + "loss": 0.0121, + "rewards/chosen": 3.121281385421753, + "rewards/margins": 11.633764346440634, + "rewards/rejected": -8.51248296101888, + "step": 10370 + }, + { + "epoch": 0.9475559616263134, + "grad_norm": 4.21875, + "kl": 0.0, + "learning_rate": 6.819348298638839e-08, + "logits/chosen": 485359411.2, + "logits/rejected": 413299626.6666667, + "logps/chosen": -270.233203125, + "logps/rejected": -562.888671875, + "loss": 0.031, + "rewards/chosen": 3.7217124938964843, + "rewards/margins": 12.754791005452475, + "rewards/rejected": -9.03307851155599, + "step": 10371 + }, + { + "epoch": 0.947647327546825, + "grad_norm": 4.3125, + "kl": 0.0, + "learning_rate": 6.795703282119148e-08, + "logits/chosen": 382335616.0, + "logits/rejected": 481401536.0, + "logps/chosen": -371.47796630859375, + "logps/rejected": -610.710693359375, + "loss": 0.0273, + "rewards/chosen": 3.3751327991485596, + "rewards/margins": 13.221305131912231, + "rewards/rejected": -9.846172332763672, + "step": 10372 + }, + { + "epoch": 0.9477386934673366, + "grad_norm": 5.375, + "kl": 0.0, + "learning_rate": 6.772099049025149e-08, + "logits/chosen": 622682112.0, + "logits/rejected": 459262304.0, + "logps/chosen": -157.1625213623047, + "logps/rejected": -492.2825622558594, + "loss": 0.0325, + "rewards/chosen": 3.150667190551758, + "rewards/margins": 13.889791488647461, + "rewards/rejected": -10.739124298095703, + "step": 10373 + }, + { + "epoch": 0.9478300593878484, + "grad_norm": 1.5, + "kl": 0.0, + "learning_rate": 6.748535601308726e-08, + "logits/chosen": 352836480.0, + "logits/rejected": 478140032.0, + "logps/chosen": -245.16062927246094, + "logps/rejected": -468.6410319010417, + "loss": 0.0077, + "rewards/chosen": 4.065293312072754, + "rewards/margins": 12.322576204935709, + "rewards/rejected": -8.257282892862955, + "step": 10374 + }, + { + "epoch": 0.94792142530836, + "grad_norm": 9.875, + "kl": 0.0, + "learning_rate": 6.725012940918263e-08, + "logits/chosen": 352829909.3333333, + "logits/rejected": 280066816.0, + "logps/chosen": -285.9149576822917, + "logps/rejected": -260.1334228515625, + "loss": 0.1242, + "rewards/chosen": 3.361871083577474, + "rewards/margins": 8.778419812520346, + "rewards/rejected": -5.416548728942871, + "step": 10375 + }, + { + "epoch": 0.9480127912288716, + "grad_norm": 2.921875, + "kl": 0.0, + "learning_rate": 6.701531069799039e-08, + "logits/chosen": 435427020.8, + "logits/rejected": 674225280.0, + "logps/chosen": -203.94359130859374, + "logps/rejected": -491.8660888671875, + "loss": 0.0196, + "rewards/chosen": 4.535871887207032, + "rewards/margins": 14.088915252685547, + "rewards/rejected": -9.553043365478516, + "step": 10376 + }, + { + "epoch": 0.9481041571493832, + "grad_norm": 4.53125, + "kl": 0.0, + "learning_rate": 6.678089989892611e-08, + "logits/chosen": 608943488.0, + "logits/rejected": 743679616.0, + "logps/chosen": -259.4392395019531, + "logps/rejected": -719.1893310546875, + "loss": 0.0358, + "rewards/chosen": 2.677978754043579, + "rewards/margins": 13.434160470962524, + "rewards/rejected": -10.756181716918945, + "step": 10377 + }, + { + "epoch": 0.948195523069895, + "grad_norm": 1.3359375, + "kl": 0.0, + "learning_rate": 6.654689703137429e-08, + "logits/chosen": 531851776.0, + "logits/rejected": 659661098.6666666, + "logps/chosen": -299.568994140625, + "logps/rejected": -435.9427083333333, + "loss": 0.0096, + "rewards/chosen": 4.476389312744141, + "rewards/margins": 12.625306193033854, + "rewards/rejected": -8.148916880289713, + "step": 10378 + }, + { + "epoch": 0.9482868889904066, + "grad_norm": 1.9921875, + "kl": 0.0, + "learning_rate": 6.631330211468445e-08, + "logits/chosen": 1503764309.3333333, + "logits/rejected": 710501376.0, + "logps/chosen": -270.22743733723956, + "logps/rejected": -404.2243896484375, + "loss": 0.0179, + "rewards/chosen": 3.6964238484700522, + "rewards/margins": 12.914795430501302, + "rewards/rejected": -9.21837158203125, + "step": 10379 + }, + { + "epoch": 0.9483782549109182, + "grad_norm": 2.53125, + "kl": 0.0, + "learning_rate": 6.608011516817336e-08, + "logits/chosen": 588304064.0, + "logits/rejected": 352060480.0, + "logps/chosen": -386.57049560546875, + "logps/rejected": -426.84765625, + "loss": 0.0164, + "rewards/chosen": 3.6565942764282227, + "rewards/margins": 14.303414344787598, + "rewards/rejected": -10.646820068359375, + "step": 10380 + }, + { + "epoch": 0.9484696208314298, + "grad_norm": 1.4765625, + "kl": 0.0, + "learning_rate": 6.584733621112282e-08, + "logits/chosen": 330363413.3333333, + "logits/rejected": 440042137.6, + "logps/chosen": -280.4073486328125, + "logps/rejected": -405.588330078125, + "loss": 0.009, + "rewards/chosen": 3.7910130818684897, + "rewards/margins": 12.933287556966146, + "rewards/rejected": -9.142274475097656, + "step": 10381 + }, + { + "epoch": 0.9485609867519416, + "grad_norm": 22.5, + "kl": 0.0, + "learning_rate": 6.561496526278189e-08, + "logits/chosen": 374138816.0, + "logits/rejected": 636712448.0, + "logps/chosen": -223.73757934570312, + "logps/rejected": -579.9353841145834, + "loss": 0.0239, + "rewards/chosen": 3.725806713104248, + "rewards/margins": 11.40075381596883, + "rewards/rejected": -7.674947102864583, + "step": 10382 + }, + { + "epoch": 0.9486523526724532, + "grad_norm": 1.7109375, + "kl": 0.0, + "learning_rate": 6.538300234236461e-08, + "logits/chosen": 465123648.0, + "logits/rejected": 344922069.3333333, + "logps/chosen": -364.2792663574219, + "logps/rejected": -450.4567057291667, + "loss": 0.0083, + "rewards/chosen": 3.407667636871338, + "rewards/margins": 12.403177738189697, + "rewards/rejected": -8.99551010131836, + "step": 10383 + }, + { + "epoch": 0.9487437185929648, + "grad_norm": 4.78125, + "kl": 3.2588119506835938, + "learning_rate": 6.515144746905289e-08, + "logits/chosen": 688636544.0, + "logps/chosen": -354.0546569824219, + "loss": 0.0541, + "rewards/chosen": 3.626898765563965, + "step": 10384 + }, + { + "epoch": 0.9488350845134764, + "grad_norm": 4.1875, + "kl": 0.0, + "learning_rate": 6.492030066199417e-08, + "logits/chosen": 439866304.0, + "logits/rejected": 817600256.0, + "logps/chosen": -300.65142822265625, + "logps/rejected": -512.0142822265625, + "loss": 0.0354, + "rewards/chosen": 3.2399258613586426, + "rewards/margins": 11.505159854888916, + "rewards/rejected": -8.265233993530273, + "step": 10385 + }, + { + "epoch": 0.9489264504339882, + "grad_norm": 1.640625, + "kl": 0.0, + "learning_rate": 6.468956194030263e-08, + "logits/chosen": 795538304.0, + "logits/rejected": 615487573.3333334, + "logps/chosen": -579.782470703125, + "logps/rejected": -588.7870686848959, + "loss": 0.0064, + "rewards/chosen": 3.6742889881134033, + "rewards/margins": 14.117687781651815, + "rewards/rejected": -10.443398793538412, + "step": 10386 + }, + { + "epoch": 0.9490178163544998, + "grad_norm": 4.78125, + "kl": 0.0, + "learning_rate": 6.445923132305687e-08, + "logits/chosen": 568235417.6, + "logits/rejected": 453308074.6666667, + "logps/chosen": -439.76005859375, + "logps/rejected": -490.9508056640625, + "loss": 0.035, + "rewards/chosen": 3.0318689346313477, + "rewards/margins": 13.93386173248291, + "rewards/rejected": -10.901992797851562, + "step": 10387 + }, + { + "epoch": 0.9491091822750114, + "grad_norm": 3.65625, + "kl": 0.0, + "learning_rate": 6.422930882930389e-08, + "logits/chosen": 341002035.2, + "logits/rejected": 394022144.0, + "logps/chosen": -254.610986328125, + "logps/rejected": -514.5619303385416, + "loss": 0.0256, + "rewards/chosen": 4.199658203125, + "rewards/margins": 12.13405049641927, + "rewards/rejected": -7.9343922932942705, + "step": 10388 + }, + { + "epoch": 0.949200548195523, + "grad_norm": 3.765625, + "kl": 0.0, + "learning_rate": 6.399979447805682e-08, + "logits/chosen": 573270835.2, + "logits/rejected": 906286592.0, + "logps/chosen": -531.86484375, + "logps/rejected": -461.9949137369792, + "loss": 0.0202, + "rewards/chosen": 4.027780151367187, + "rewards/margins": 13.311745580037435, + "rewards/rejected": -9.283965428670248, + "step": 10389 + }, + { + "epoch": 0.9492919141160348, + "grad_norm": 1.015625, + "kl": 0.0, + "learning_rate": 6.377068828829325e-08, + "logits/chosen": 512978773.3333333, + "logits/rejected": 422998374.4, + "logps/chosen": -446.7781982421875, + "logps/rejected": -364.477197265625, + "loss": 0.0055, + "rewards/chosen": 4.448452631632487, + "rewards/margins": 12.836663691202801, + "rewards/rejected": -8.388211059570313, + "step": 10390 + }, + { + "epoch": 0.9493832800365464, + "grad_norm": 1.2109375, + "kl": 0.0, + "learning_rate": 6.354199027895858e-08, + "logits/chosen": 704749952.0, + "logits/rejected": 557228416.0, + "logps/chosen": -445.8183898925781, + "logps/rejected": -549.9055786132812, + "loss": 0.0066, + "rewards/chosen": 4.347519874572754, + "rewards/margins": 13.997632026672363, + "rewards/rejected": -9.65011215209961, + "step": 10391 + }, + { + "epoch": 0.949474645957058, + "grad_norm": 1.078125, + "kl": 0.0, + "learning_rate": 6.331370046896435e-08, + "logits/chosen": 847902336.0, + "logits/rejected": 635661531.4285715, + "logps/chosen": -254.28253173828125, + "logps/rejected": -586.3351004464286, + "loss": 0.0046, + "rewards/chosen": 3.30808424949646, + "rewards/margins": 13.407231501170568, + "rewards/rejected": -10.099147251674108, + "step": 10392 + }, + { + "epoch": 0.9495660118775696, + "grad_norm": 1.2109375, + "kl": 0.0, + "learning_rate": 6.308581887718712e-08, + "logits/chosen": 518645856.0, + "logits/rejected": 640682816.0, + "logps/chosen": -280.34649658203125, + "logps/rejected": -481.9020690917969, + "loss": 0.0082, + "rewards/chosen": 4.471518516540527, + "rewards/margins": 13.514370918273926, + "rewards/rejected": -9.042852401733398, + "step": 10393 + }, + { + "epoch": 0.9496573777980813, + "grad_norm": 1.0078125, + "kl": 0.0, + "learning_rate": 6.285834552247127e-08, + "logits/chosen": 324014101.3333333, + "logits/rejected": 793031065.6, + "logps/chosen": -176.2245076497396, + "logps/rejected": -473.547802734375, + "loss": 0.0058, + "rewards/chosen": 5.099283218383789, + "rewards/margins": 14.974782943725586, + "rewards/rejected": -9.875499725341797, + "step": 10394 + }, + { + "epoch": 0.949748743718593, + "grad_norm": 4.53125, + "kl": 0.0, + "learning_rate": 6.263128042362676e-08, + "logits/chosen": 653375360.0, + "logits/rejected": 893501504.0, + "logps/chosen": -538.140625, + "logps/rejected": -411.0272216796875, + "loss": 0.0302, + "rewards/chosen": 2.889380693435669, + "rewards/margins": 9.78050684928894, + "rewards/rejected": -6.8911261558532715, + "step": 10395 + }, + { + "epoch": 0.9498401096391046, + "grad_norm": 0.765625, + "kl": 0.0, + "learning_rate": 6.240462359942967e-08, + "logits/chosen": 1120559616.0, + "logits/rejected": 519185024.0, + "logps/chosen": -556.585693359375, + "logps/rejected": -556.6719563802084, + "loss": 0.0033, + "rewards/chosen": 4.43267822265625, + "rewards/margins": 13.64212163289388, + "rewards/rejected": -9.20944341023763, + "step": 10396 + }, + { + "epoch": 0.9499314755596162, + "grad_norm": 2.515625, + "kl": 0.0, + "learning_rate": 6.217837506862223e-08, + "logits/chosen": 279181952.0, + "logits/rejected": 367623040.0, + "logps/chosen": -246.5866241455078, + "logps/rejected": -358.88385009765625, + "loss": 0.0215, + "rewards/chosen": 3.939901828765869, + "rewards/margins": 13.149475574493408, + "rewards/rejected": -9.209573745727539, + "step": 10397 + }, + { + "epoch": 0.9500228414801279, + "grad_norm": 6.25, + "kl": 0.0, + "learning_rate": 6.195253484991392e-08, + "logits/chosen": 480096204.8, + "logits/rejected": 379504554.6666667, + "logps/chosen": -378.67431640625, + "logps/rejected": -521.18994140625, + "loss": 0.0127, + "rewards/chosen": 4.509154891967773, + "rewards/margins": 13.642314529418945, + "rewards/rejected": -9.133159637451172, + "step": 10398 + }, + { + "epoch": 0.9501142074006396, + "grad_norm": 1.8671875, + "kl": 0.0, + "learning_rate": 6.172710296197814e-08, + "logits/chosen": 356122944.0, + "logits/rejected": 544057920.0, + "logps/chosen": -236.92276000976562, + "logps/rejected": -554.67333984375, + "loss": 0.0127, + "rewards/chosen": 4.041694641113281, + "rewards/margins": 14.63127326965332, + "rewards/rejected": -10.589578628540039, + "step": 10399 + }, + { + "epoch": 0.9502055733211512, + "grad_norm": 1.859375, + "kl": 0.0, + "learning_rate": 6.150207942345721e-08, + "logits/chosen": 458833824.0, + "logits/rejected": 596619520.0, + "logps/chosen": -287.1043701171875, + "logps/rejected": -625.6398111979166, + "loss": 0.0082, + "rewards/chosen": 3.4685516357421875, + "rewards/margins": 13.159177780151367, + "rewards/rejected": -9.69062614440918, + "step": 10400 + }, + { + "epoch": 0.9502969392416628, + "grad_norm": 8.875, + "kl": 0.0, + "learning_rate": 6.127746425295788e-08, + "logits/chosen": 629992908.8, + "logits/rejected": 671720192.0, + "logps/chosen": -407.8013916015625, + "logps/rejected": -501.0442708333333, + "loss": 0.0332, + "rewards/chosen": 3.6349205017089843, + "rewards/margins": 10.854120635986328, + "rewards/rejected": -7.219200134277344, + "step": 10401 + }, + { + "epoch": 0.9503883051621745, + "grad_norm": 2.3125, + "kl": 0.0, + "learning_rate": 6.105325746905421e-08, + "logits/chosen": 875989824.0, + "logits/rejected": 442314976.0, + "logps/chosen": -356.0409851074219, + "logps/rejected": -341.0584716796875, + "loss": 0.0153, + "rewards/chosen": 4.466198921203613, + "rewards/margins": 11.608817100524902, + "rewards/rejected": -7.142618179321289, + "step": 10402 + }, + { + "epoch": 0.9504796710826862, + "grad_norm": 3.28125, + "kl": 0.0, + "learning_rate": 6.082945909028526e-08, + "logits/chosen": 658846016.0, + "logits/rejected": 403252224.0, + "logps/chosen": -650.4168090820312, + "logps/rejected": -446.11077880859375, + "loss": 0.0174, + "rewards/chosen": 3.425004720687866, + "rewards/margins": 12.306631803512573, + "rewards/rejected": -8.881627082824707, + "step": 10403 + }, + { + "epoch": 0.9505710370031978, + "grad_norm": 2.71875, + "kl": 0.0, + "learning_rate": 6.060606913515843e-08, + "logits/chosen": 493076480.0, + "logits/rejected": 297719552.0, + "logps/chosen": -355.837646484375, + "logps/rejected": -340.40386962890625, + "loss": 0.0178, + "rewards/chosen": 3.4092206954956055, + "rewards/margins": 11.276505470275879, + "rewards/rejected": -7.867284774780273, + "step": 10404 + }, + { + "epoch": 0.9506624029237094, + "grad_norm": 4.375, + "kl": 0.0, + "learning_rate": 6.038308762214451e-08, + "logits/chosen": 295343061.3333333, + "logits/rejected": 430490675.2, + "logps/chosen": -179.66455078125, + "logps/rejected": -494.28447265625, + "loss": 0.0262, + "rewards/chosen": 2.626429875691732, + "rewards/margins": 11.242934544881185, + "rewards/rejected": -8.616504669189453, + "step": 10405 + }, + { + "epoch": 0.9507537688442211, + "grad_norm": 3.3125, + "kl": 0.0, + "learning_rate": 6.016051456968264e-08, + "logits/chosen": 757549056.0, + "logits/rejected": 358601728.0, + "logps/chosen": -264.4855041503906, + "logps/rejected": -479.59906005859375, + "loss": 0.0252, + "rewards/chosen": 3.6087262630462646, + "rewards/margins": 14.014361143112183, + "rewards/rejected": -10.405634880065918, + "step": 10406 + }, + { + "epoch": 0.9508451347647328, + "grad_norm": 1.3125, + "kl": 0.0, + "learning_rate": 5.99383499961781e-08, + "logits/chosen": 522376768.0, + "logits/rejected": 388666770.28571427, + "logps/chosen": -183.4268798828125, + "logps/rejected": -459.66294642857144, + "loss": 0.0065, + "rewards/chosen": 2.958439588546753, + "rewards/margins": 11.657063790730067, + "rewards/rejected": -8.698624202183314, + "step": 10407 + }, + { + "epoch": 0.9509365006852444, + "grad_norm": 1.96875, + "kl": 0.0, + "learning_rate": 5.971659392000118e-08, + "logits/chosen": 569949952.0, + "logits/rejected": 560370278.4, + "logps/chosen": -236.99434407552084, + "logps/rejected": -583.087060546875, + "loss": 0.0115, + "rewards/chosen": 3.790703455607096, + "rewards/margins": 13.309024683634439, + "rewards/rejected": -9.518321228027343, + "step": 10408 + }, + { + "epoch": 0.951027866605756, + "grad_norm": 1.4140625, + "kl": 0.0, + "learning_rate": 5.949524635948889e-08, + "logits/chosen": 404433504.0, + "logits/rejected": 237990448.0, + "logps/chosen": -419.1551818847656, + "logps/rejected": -395.3219909667969, + "loss": 0.0074, + "rewards/chosen": 4.5287370681762695, + "rewards/margins": 13.171355247497559, + "rewards/rejected": -8.642618179321289, + "step": 10409 + }, + { + "epoch": 0.9511192325262677, + "grad_norm": 1.96875, + "kl": 0.0, + "learning_rate": 5.927430733294492e-08, + "logits/chosen": 857832640.0, + "logits/rejected": 1105307776.0, + "logps/chosen": -447.1986999511719, + "logps/rejected": -635.2434692382812, + "loss": 0.0126, + "rewards/chosen": 3.924267530441284, + "rewards/margins": 15.103127241134644, + "rewards/rejected": -11.17885971069336, + "step": 10410 + }, + { + "epoch": 0.9512105984467794, + "grad_norm": 1.8359375, + "kl": 0.0, + "learning_rate": 5.9053776858639114e-08, + "logits/chosen": 642623744.0, + "logits/rejected": 734519193.6, + "logps/chosen": -468.1397705078125, + "logps/rejected": -493.80712890625, + "loss": 0.0093, + "rewards/chosen": 3.681677500406901, + "rewards/margins": 14.849968210856119, + "rewards/rejected": -11.168290710449218, + "step": 10411 + }, + { + "epoch": 0.951301964367291, + "grad_norm": 2.5625, + "kl": 0.0, + "learning_rate": 5.883365495480742e-08, + "logits/chosen": 593163434.6666666, + "logits/rejected": 585169664.0, + "logps/chosen": -230.73763020833334, + "logps/rejected": -495.665234375, + "loss": 0.0149, + "rewards/chosen": 3.385542869567871, + "rewards/margins": 13.074370384216309, + "rewards/rejected": -9.688827514648438, + "step": 10412 + }, + { + "epoch": 0.9513933302878026, + "grad_norm": 3.140625, + "kl": 0.0, + "learning_rate": 5.86139416396514e-08, + "logits/chosen": 284618598.4, + "logits/rejected": 358529365.3333333, + "logps/chosen": -219.616064453125, + "logps/rejected": -550.7525634765625, + "loss": 0.0393, + "rewards/chosen": 3.18885498046875, + "rewards/margins": 16.278832244873048, + "rewards/rejected": -13.089977264404297, + "step": 10413 + }, + { + "epoch": 0.9514846962083143, + "grad_norm": 1.59375, + "kl": 0.0, + "learning_rate": 5.839463693133873e-08, + "logits/chosen": 665438500.5714285, + "logits/rejected": 1103709824.0, + "logps/chosen": -359.98890904017856, + "logps/rejected": -261.0858459472656, + "loss": 0.01, + "rewards/chosen": 4.707993643624442, + "rewards/margins": 11.514603751046316, + "rewards/rejected": -6.806610107421875, + "step": 10414 + }, + { + "epoch": 0.951576062128826, + "grad_norm": 4.15625, + "kl": 0.0, + "learning_rate": 5.8175740848004925e-08, + "logits/chosen": 620407978.6666666, + "logits/rejected": 352450624.0, + "logps/chosen": -388.3312174479167, + "logps/rejected": -315.75836181640625, + "loss": 0.0283, + "rewards/chosen": 3.7283554077148438, + "rewards/margins": 10.033873558044434, + "rewards/rejected": -6.30551815032959, + "step": 10415 + }, + { + "epoch": 0.9516674280493376, + "grad_norm": 1.4765625, + "kl": 0.0, + "learning_rate": 5.7957253407751045e-08, + "logits/chosen": 523389152.0, + "logits/rejected": 448148906.6666667, + "logps/chosen": -317.1673889160156, + "logps/rejected": -516.7533365885416, + "loss": 0.0081, + "rewards/chosen": 3.5040855407714844, + "rewards/margins": 13.377510706583658, + "rewards/rejected": -9.873425165812174, + "step": 10416 + }, + { + "epoch": 0.9517587939698492, + "grad_norm": 3.1875, + "kl": 0.0, + "learning_rate": 5.773917462864265e-08, + "logits/chosen": 1046809792.0, + "logits/rejected": 575560661.3333334, + "logps/chosen": -389.1695556640625, + "logps/rejected": -441.2035319010417, + "loss": 0.1063, + "rewards/chosen": 4.5890607833862305, + "rewards/margins": 11.535723686218262, + "rewards/rejected": -6.946662902832031, + "step": 10417 + }, + { + "epoch": 0.9518501598903609, + "grad_norm": 0.15234375, + "kl": 0.0, + "learning_rate": 5.752150452871308e-08, + "logits/chosen": 230935392.0, + "logits/rejected": 441709994.6666667, + "logps/chosen": -241.24957275390625, + "logps/rejected": -486.8805338541667, + "loss": 0.0011, + "rewards/chosen": 5.886211395263672, + "rewards/margins": 15.471587498982748, + "rewards/rejected": -9.585376103719076, + "step": 10418 + }, + { + "epoch": 0.9519415258108725, + "grad_norm": 2.171875, + "kl": 0.0, + "learning_rate": 5.730424312596294e-08, + "logits/chosen": 786949717.3333334, + "logits/rejected": 548588441.6, + "logps/chosen": -578.5892740885416, + "logps/rejected": -432.1720703125, + "loss": 0.0107, + "rewards/chosen": 3.6728636423746743, + "rewards/margins": 11.386142603556316, + "rewards/rejected": -7.713278961181641, + "step": 10419 + }, + { + "epoch": 0.9520328917313842, + "grad_norm": 2.796875, + "kl": 0.0, + "learning_rate": 5.7087390438356205e-08, + "logits/chosen": 444137472.0, + "logits/rejected": 357483584.0, + "logps/chosen": -284.4242757161458, + "logps/rejected": -615.7301635742188, + "loss": 0.0213, + "rewards/chosen": 3.8751939137776694, + "rewards/margins": 13.894953091939291, + "rewards/rejected": -10.019759178161621, + "step": 10420 + }, + { + "epoch": 0.9521242576518958, + "grad_norm": 4.25, + "kl": 0.0, + "learning_rate": 5.687094648382518e-08, + "logits/chosen": 469468864.0, + "logits/rejected": 416126336.0, + "logps/chosen": -353.4136657714844, + "logps/rejected": -535.6024169921875, + "loss": 0.1284, + "rewards/chosen": 2.734557867050171, + "rewards/margins": 12.280565023422241, + "rewards/rejected": -9.54600715637207, + "step": 10421 + }, + { + "epoch": 0.9522156235724075, + "grad_norm": 2.140625, + "kl": 0.0, + "learning_rate": 5.665491128026834e-08, + "logits/chosen": 430472704.0, + "logits/rejected": 241819024.0, + "logps/chosen": -291.2925618489583, + "logps/rejected": -323.4603271484375, + "loss": 0.0213, + "rewards/chosen": 3.9348586400349936, + "rewards/margins": 12.495001157124838, + "rewards/rejected": -8.560142517089844, + "step": 10422 + }, + { + "epoch": 0.9523069894929191, + "grad_norm": 1.265625, + "kl": 0.0, + "learning_rate": 5.6439284845548057e-08, + "logits/chosen": 815158912.0, + "logits/rejected": 360400896.0, + "logps/chosen": -669.8955078125, + "logps/rejected": -574.4104817708334, + "loss": 0.0049, + "rewards/chosen": 4.155966281890869, + "rewards/margins": 13.396059513092041, + "rewards/rejected": -9.240093231201172, + "step": 10423 + }, + { + "epoch": 0.9523983554134308, + "grad_norm": 1.625, + "kl": 0.0, + "learning_rate": 5.622406719749729e-08, + "logits/chosen": 443020928.0, + "logits/rejected": 822634304.0, + "logps/chosen": -362.8199768066406, + "logps/rejected": -428.67474365234375, + "loss": 0.0087, + "rewards/chosen": 4.426408767700195, + "rewards/margins": 11.638851165771484, + "rewards/rejected": -7.212442398071289, + "step": 10424 + }, + { + "epoch": 0.9524897213339424, + "grad_norm": 3.453125, + "kl": 0.0, + "learning_rate": 5.6009258353910137e-08, + "logits/chosen": 575618816.0, + "logits/rejected": 297657152.0, + "logps/chosen": -178.2308837890625, + "logps/rejected": -394.2858072916667, + "loss": 0.0275, + "rewards/chosen": 3.5117225646972656, + "rewards/margins": 13.176752090454102, + "rewards/rejected": -9.665029525756836, + "step": 10425 + }, + { + "epoch": 0.9525810872544541, + "grad_norm": 3.609375, + "kl": 0.0, + "learning_rate": 5.579485833255016e-08, + "logits/chosen": 498125738.6666667, + "logits/rejected": 333427148.8, + "logps/chosen": -285.9337158203125, + "logps/rejected": -406.9231201171875, + "loss": 0.0193, + "rewards/chosen": 3.3411951065063477, + "rewards/margins": 13.302161598205567, + "rewards/rejected": -9.960966491699219, + "step": 10426 + }, + { + "epoch": 0.9526724531749657, + "grad_norm": 2.640625, + "kl": 0.0, + "learning_rate": 5.5580867151146525e-08, + "logits/chosen": 520566067.2, + "logits/rejected": 1092937984.0, + "logps/chosen": -274.4521484375, + "logps/rejected": -364.3920084635417, + "loss": 0.0188, + "rewards/chosen": 3.6464302062988283, + "rewards/margins": 12.610928090413413, + "rewards/rejected": -8.964497884114584, + "step": 10427 + }, + { + "epoch": 0.9527638190954774, + "grad_norm": 1.1640625, + "kl": 0.0, + "learning_rate": 5.536728482739451e-08, + "logits/chosen": 661510912.0, + "logits/rejected": 438480076.8, + "logps/chosen": -164.52651977539062, + "logps/rejected": -564.915234375, + "loss": 0.0138, + "rewards/chosen": 3.985164006551107, + "rewards/margins": 12.896486028035483, + "rewards/rejected": -8.911322021484375, + "step": 10428 + }, + { + "epoch": 0.952855185015989, + "grad_norm": 3.09375, + "kl": 0.0, + "learning_rate": 5.515411137895443e-08, + "logits/chosen": 829477376.0, + "logits/rejected": 991980544.0, + "logps/chosen": -321.1872965494792, + "logps/rejected": -382.63372802734375, + "loss": 0.0266, + "rewards/chosen": 3.76350466410319, + "rewards/margins": 12.050748507181803, + "rewards/rejected": -8.287243843078613, + "step": 10429 + }, + { + "epoch": 0.9529465509365007, + "grad_norm": 1.453125, + "kl": 0.0, + "learning_rate": 5.494134682345442e-08, + "logits/chosen": 364741888.0, + "logits/rejected": 403800320.0, + "logps/chosen": -234.61062622070312, + "logps/rejected": -401.34814453125, + "loss": 0.0057, + "rewards/chosen": 4.192091464996338, + "rewards/margins": 12.446473280588785, + "rewards/rejected": -8.254381815592447, + "step": 10430 + }, + { + "epoch": 0.9530379168570123, + "grad_norm": 26.375, + "kl": 0.0, + "learning_rate": 5.472899117848818e-08, + "logits/chosen": 368517888.0, + "logits/rejected": 430667673.6, + "logps/chosen": -297.3537190755208, + "logps/rejected": -524.684716796875, + "loss": 0.0187, + "rewards/chosen": 3.4551871617635093, + "rewards/margins": 13.285532315572103, + "rewards/rejected": -9.830345153808594, + "step": 10431 + }, + { + "epoch": 0.953129282777524, + "grad_norm": 4.71875, + "kl": 0.0, + "learning_rate": 5.4517044461615544e-08, + "logits/chosen": 638883669.3333334, + "logits/rejected": 1018289920.0, + "logps/chosen": -388.1241048177083, + "logps/rejected": -682.19091796875, + "loss": 0.0296, + "rewards/chosen": 3.8048273722330728, + "rewards/margins": 14.95624033610026, + "rewards/rejected": -11.151412963867188, + "step": 10432 + }, + { + "epoch": 0.9532206486980356, + "grad_norm": 3.890625, + "kl": 0.0, + "learning_rate": 5.4305506690362495e-08, + "logits/chosen": 511371929.6, + "logits/rejected": 1574568618.6666667, + "logps/chosen": -188.5009033203125, + "logps/rejected": -583.9923909505209, + "loss": 0.0246, + "rewards/chosen": 3.9983943939208983, + "rewards/margins": 14.273275756835938, + "rewards/rejected": -10.274881362915039, + "step": 10433 + }, + { + "epoch": 0.9533120146185473, + "grad_norm": 3.671875, + "kl": 0.0, + "learning_rate": 5.409437788222061e-08, + "logits/chosen": 540127061.3333334, + "logits/rejected": 345739808.0, + "logps/chosen": -291.7515462239583, + "logps/rejected": -271.28521728515625, + "loss": 0.0215, + "rewards/chosen": 4.315381685892741, + "rewards/margins": 12.669158617655437, + "rewards/rejected": -8.353776931762695, + "step": 10434 + }, + { + "epoch": 0.9534033805390589, + "grad_norm": 2.71875, + "kl": 0.0, + "learning_rate": 5.3883658054649234e-08, + "logits/chosen": 1119687552.0, + "logits/rejected": 661597952.0, + "logps/chosen": -383.61981201171875, + "logps/rejected": -425.5401611328125, + "loss": 0.0192, + "rewards/chosen": 3.3667726516723633, + "rewards/margins": 12.221550941467285, + "rewards/rejected": -8.854778289794922, + "step": 10435 + }, + { + "epoch": 0.9534947464595706, + "grad_norm": 2.296875, + "kl": 0.0, + "learning_rate": 5.3673347225072227e-08, + "logits/chosen": 715539264.0, + "logits/rejected": 266041312.0, + "logps/chosen": -354.26617431640625, + "logps/rejected": -327.65606689453125, + "loss": 0.0147, + "rewards/chosen": 3.6675422191619873, + "rewards/margins": 11.371244668960571, + "rewards/rejected": -7.703702449798584, + "step": 10436 + }, + { + "epoch": 0.9535861123800823, + "grad_norm": 40.25, + "kl": 0.0, + "learning_rate": 5.3463445410881224e-08, + "logits/chosen": 539181129.1428572, + "logits/rejected": 187079648.0, + "logps/chosen": -263.15907505580356, + "logps/rejected": -295.4886474609375, + "loss": 0.1032, + "rewards/chosen": 3.154832567487444, + "rewards/margins": 12.797688211713519, + "rewards/rejected": -9.642855644226074, + "step": 10437 + }, + { + "epoch": 0.9536774783005939, + "grad_norm": 1.4921875, + "kl": 0.0, + "learning_rate": 5.325395262943234e-08, + "logits/chosen": 303150464.0, + "logits/rejected": 541318656.0, + "logps/chosen": -289.500244140625, + "logps/rejected": -433.62935965401783, + "loss": 0.0035, + "rewards/chosen": 4.889923095703125, + "rewards/margins": 13.905972072056361, + "rewards/rejected": -9.016048976353236, + "step": 10438 + }, + { + "epoch": 0.9537688442211055, + "grad_norm": 2.0625, + "kl": 0.0, + "learning_rate": 5.3044868898048386e-08, + "logits/chosen": 708124160.0, + "logits/rejected": 581964992.0, + "logps/chosen": -276.9765319824219, + "logps/rejected": -691.3013305664062, + "loss": 0.0163, + "rewards/chosen": 3.70810604095459, + "rewards/margins": 13.542986869812012, + "rewards/rejected": -9.834880828857422, + "step": 10439 + }, + { + "epoch": 0.9538602101416171, + "grad_norm": 1.3125, + "kl": 0.0, + "learning_rate": 5.2836194234019976e-08, + "logits/chosen": 735459225.6, + "logits/rejected": 307737621.3333333, + "logps/chosen": -510.2560546875, + "logps/rejected": -298.1742350260417, + "loss": 0.0092, + "rewards/chosen": 4.340514755249023, + "rewards/margins": 12.790072377522787, + "rewards/rejected": -8.449557622273764, + "step": 10440 + }, + { + "epoch": 0.9539515760621289, + "grad_norm": 2.6875, + "kl": 0.0, + "learning_rate": 5.262792865460164e-08, + "logits/chosen": 556216115.2, + "logits/rejected": 238043136.0, + "logps/chosen": -290.94619140625, + "logps/rejected": -358.6064046223958, + "loss": 0.0147, + "rewards/chosen": 4.287782669067383, + "rewards/margins": 13.410873413085938, + "rewards/rejected": -9.123090744018555, + "step": 10441 + }, + { + "epoch": 0.9540429419826405, + "grad_norm": 5.09375, + "kl": 0.0, + "learning_rate": 5.242007217701517e-08, + "logits/chosen": 605854336.0, + "logits/rejected": 441648896.0, + "logps/chosen": -292.13677978515625, + "logps/rejected": -381.82244873046875, + "loss": 0.131, + "rewards/chosen": 3.134183406829834, + "rewards/margins": 9.643335819244385, + "rewards/rejected": -6.509152412414551, + "step": 10442 + }, + { + "epoch": 0.9541343079031521, + "grad_norm": 5.03125, + "kl": 0.0, + "learning_rate": 5.221262481844846e-08, + "logits/chosen": 840851558.4, + "logits/rejected": 1364921856.0, + "logps/chosen": -309.240087890625, + "logps/rejected": -773.3818359375, + "loss": 0.0239, + "rewards/chosen": 4.132731246948242, + "rewards/margins": 13.956464258829751, + "rewards/rejected": -9.82373301188151, + "step": 10443 + }, + { + "epoch": 0.9542256738236637, + "grad_norm": 1.2890625, + "kl": 0.0, + "learning_rate": 5.2005586596055037e-08, + "logits/chosen": 621694592.0, + "logits/rejected": 492605568.0, + "logps/chosen": -491.0875549316406, + "logps/rejected": -657.1088053385416, + "loss": 0.0055, + "rewards/chosen": 3.8353333473205566, + "rewards/margins": 16.582893212636314, + "rewards/rejected": -12.747559865315756, + "step": 10444 + }, + { + "epoch": 0.9543170397441755, + "grad_norm": 4.0625, + "kl": 0.0, + "learning_rate": 5.179895752695563e-08, + "logits/chosen": 578577152.0, + "logits/rejected": 455667872.0, + "logps/chosen": -257.1180419921875, + "logps/rejected": -469.6089782714844, + "loss": 0.0274, + "rewards/chosen": 2.9462196826934814, + "rewards/margins": 11.157788515090942, + "rewards/rejected": -8.211568832397461, + "step": 10445 + }, + { + "epoch": 0.9544084056646871, + "grad_norm": 4.5, + "kl": 0.0, + "learning_rate": 5.159273762823658e-08, + "logits/chosen": 572097682.2857143, + "logits/rejected": 974516992.0, + "logps/chosen": -341.1766880580357, + "logps/rejected": -908.8357543945312, + "loss": 0.042, + "rewards/chosen": 3.401168005807059, + "rewards/margins": 13.42105974469866, + "rewards/rejected": -10.019891738891602, + "step": 10446 + }, + { + "epoch": 0.9544997715851987, + "grad_norm": 2.25, + "kl": 0.0, + "learning_rate": 5.13869269169498e-08, + "logits/chosen": 401734464.0, + "logits/rejected": 621129344.0, + "logps/chosen": -373.401123046875, + "logps/rejected": -622.813720703125, + "loss": 0.0104, + "rewards/chosen": 4.297918319702148, + "rewards/margins": 13.31690502166748, + "rewards/rejected": -9.018986701965332, + "step": 10447 + }, + { + "epoch": 0.9545911375057103, + "grad_norm": 34.75, + "kl": 0.0, + "learning_rate": 5.118152541011445e-08, + "logits/chosen": 336719392.0, + "logits/rejected": 404416365.71428573, + "logps/chosen": -68.57096862792969, + "logps/rejected": -341.56494140625, + "loss": 0.1012, + "rewards/chosen": -1.4245662689208984, + "rewards/margins": 6.9217570168631415, + "rewards/rejected": -8.34632328578404, + "step": 10448 + }, + { + "epoch": 0.9546825034262221, + "grad_norm": 1.3359375, + "kl": 0.0, + "learning_rate": 5.0976533124714736e-08, + "logits/chosen": 901886122.6666666, + "logits/rejected": 610564812.8, + "logps/chosen": -311.73288981119794, + "logps/rejected": -518.368359375, + "loss": 0.0058, + "rewards/chosen": 4.46607780456543, + "rewards/margins": 14.32488136291504, + "rewards/rejected": -9.85880355834961, + "step": 10449 + }, + { + "epoch": 0.9547738693467337, + "grad_norm": 1.8515625, + "kl": 0.0, + "learning_rate": 5.077195007770264e-08, + "logits/chosen": 701848128.0, + "logits/rejected": 1069908992.0, + "logps/chosen": -575.2426147460938, + "logps/rejected": -913.089111328125, + "loss": 0.0085, + "rewards/chosen": 4.555039405822754, + "rewards/margins": 14.527632713317871, + "rewards/rejected": -9.972593307495117, + "step": 10450 + }, + { + "epoch": 0.9548652352672453, + "grad_norm": 2.796875, + "kl": 0.0, + "learning_rate": 5.056777628599463e-08, + "logits/chosen": 817851392.0, + "logits/rejected": 846888277.3333334, + "logps/chosen": -414.82021484375, + "logps/rejected": -437.5514322916667, + "loss": 0.0162, + "rewards/chosen": 3.917279815673828, + "rewards/margins": 12.673814392089843, + "rewards/rejected": -8.756534576416016, + "step": 10451 + }, + { + "epoch": 0.9549566011877569, + "grad_norm": 2.234375, + "kl": 0.0, + "learning_rate": 5.036401176647332e-08, + "logits/chosen": 474491008.0, + "logits/rejected": 487974348.8, + "logps/chosen": -250.70538330078125, + "logps/rejected": -622.26875, + "loss": 0.0157, + "rewards/chosen": 3.1786152521769204, + "rewards/margins": 14.977213732401529, + "rewards/rejected": -11.798598480224609, + "step": 10452 + }, + { + "epoch": 0.9550479671082687, + "grad_norm": 22.375, + "kl": 0.0, + "learning_rate": 5.016065653598967e-08, + "logits/chosen": 417043626.6666667, + "logits/rejected": 654275584.0, + "logps/chosen": -291.33984375, + "logps/rejected": -565.97080078125, + "loss": 0.0818, + "rewards/chosen": 1.8050292332967122, + "rewards/margins": 11.112957700093588, + "rewards/rejected": -9.307928466796875, + "step": 10453 + }, + { + "epoch": 0.9551393330287803, + "grad_norm": 4.125, + "kl": 0.0, + "learning_rate": 4.995771061135801e-08, + "logits/chosen": 459834794.6666667, + "logits/rejected": 553374464.0, + "logps/chosen": -258.0391031901042, + "logps/rejected": -588.6165161132812, + "loss": 0.0332, + "rewards/chosen": 3.8483912150065103, + "rewards/margins": 13.498359362284342, + "rewards/rejected": -9.649968147277832, + "step": 10454 + }, + { + "epoch": 0.9552306989492919, + "grad_norm": 2.46875, + "kl": 0.0, + "learning_rate": 4.9755174009360475e-08, + "logits/chosen": 792123136.0, + "logits/rejected": 767090858.6666666, + "logps/chosen": -491.95355224609375, + "logps/rejected": -449.7610677083333, + "loss": 0.0114, + "rewards/chosen": 3.0854415893554688, + "rewards/margins": 12.28941790262858, + "rewards/rejected": -9.203976313273111, + "step": 10455 + }, + { + "epoch": 0.9553220648698035, + "grad_norm": 2.984375, + "kl": 0.0, + "learning_rate": 4.955304674674477e-08, + "logits/chosen": 469882410.6666667, + "logits/rejected": 601391923.2, + "logps/chosen": -180.1614990234375, + "logps/rejected": -549.3359375, + "loss": 0.0168, + "rewards/chosen": 3.7811902364095054, + "rewards/margins": 12.382950337727864, + "rewards/rejected": -8.601760101318359, + "step": 10456 + }, + { + "epoch": 0.9554134307903153, + "grad_norm": 13.9375, + "kl": 0.0, + "learning_rate": 4.935132884022587e-08, + "logits/chosen": 417368576.0, + "logits/rejected": 402966186.6666667, + "logps/chosen": -293.3873046875, + "logps/rejected": -308.0798746744792, + "loss": 0.0932, + "rewards/chosen": 3.7381317138671877, + "rewards/margins": 9.005567105611165, + "rewards/rejected": -5.267435391743978, + "step": 10457 + }, + { + "epoch": 0.9555047967108269, + "grad_norm": 2.03125, + "kl": 0.0, + "learning_rate": 4.915002030648264e-08, + "logits/chosen": 370428313.6, + "logits/rejected": 690287232.0, + "logps/chosen": -265.360498046875, + "logps/rejected": -521.0233154296875, + "loss": 0.0199, + "rewards/chosen": 4.523951721191406, + "rewards/margins": 12.761619440714519, + "rewards/rejected": -8.237667719523111, + "step": 10458 + }, + { + "epoch": 0.9555961626313385, + "grad_norm": 2.03125, + "kl": 0.0, + "learning_rate": 4.8949121162162326e-08, + "logits/chosen": 793504512.0, + "logits/rejected": 708574361.6, + "logps/chosen": -493.6368408203125, + "logps/rejected": -558.65673828125, + "loss": 0.0111, + "rewards/chosen": 3.5731423695882163, + "rewards/margins": 14.748479588826498, + "rewards/rejected": -11.175337219238282, + "step": 10459 + }, + { + "epoch": 0.9556875285518501, + "grad_norm": 1.671875, + "kl": 0.0, + "learning_rate": 4.8748631423877204e-08, + "logits/chosen": 570141184.0, + "logits/rejected": 373573120.0, + "logps/chosen": -393.25433349609375, + "logps/rejected": -556.388916015625, + "loss": 0.01, + "rewards/chosen": 4.027443885803223, + "rewards/margins": 14.29691219329834, + "rewards/rejected": -10.269468307495117, + "step": 10460 + }, + { + "epoch": 0.9557788944723619, + "grad_norm": 3.484375, + "kl": 0.0, + "learning_rate": 4.8548551108205666e-08, + "logits/chosen": 578210218.6666666, + "logits/rejected": 601193728.0, + "logps/chosen": -452.226806640625, + "logps/rejected": -535.0998046875, + "loss": 0.0173, + "rewards/chosen": 3.3938201268514, + "rewards/margins": 13.201138242085776, + "rewards/rejected": -9.807318115234375, + "step": 10461 + }, + { + "epoch": 0.9558702603928735, + "grad_norm": 48.5, + "kl": 0.0, + "learning_rate": 4.8348880231693375e-08, + "logits/chosen": 701227840.0, + "logps/chosen": -249.38388061523438, + "loss": 0.1157, + "rewards/chosen": 3.1591176986694336, + "step": 10462 + }, + { + "epoch": 0.9559616263133851, + "grad_norm": 4.75, + "kl": 0.0, + "learning_rate": 4.8149618810850454e-08, + "logits/chosen": 468029525.3333333, + "logits/rejected": 514874432.0, + "logps/chosen": -292.9755045572917, + "logps/rejected": -486.18951416015625, + "loss": 0.029, + "rewards/chosen": 3.7575785319010415, + "rewards/margins": 14.944544474283854, + "rewards/rejected": -11.186965942382812, + "step": 10463 + }, + { + "epoch": 0.9560529922338967, + "grad_norm": 0.73046875, + "kl": 0.0, + "learning_rate": 4.795076686215372e-08, + "logits/chosen": 772374144.0, + "logits/rejected": 573207296.0, + "logps/chosen": -412.72515869140625, + "logps/rejected": -408.7008056640625, + "loss": 0.0034, + "rewards/chosen": 4.364872932434082, + "rewards/margins": 13.796814918518066, + "rewards/rejected": -9.431941986083984, + "step": 10464 + }, + { + "epoch": 0.9561443581544085, + "grad_norm": 2.375, + "kl": 0.0, + "learning_rate": 4.775232440204669e-08, + "logits/chosen": 223905872.0, + "logits/rejected": 569615232.0, + "logps/chosen": -392.6051025390625, + "logps/rejected": -671.6572265625, + "loss": 0.0098, + "rewards/chosen": 4.302721977233887, + "rewards/margins": 13.199271202087402, + "rewards/rejected": -8.896549224853516, + "step": 10465 + }, + { + "epoch": 0.9562357240749201, + "grad_norm": 2.78125, + "kl": 0.0, + "learning_rate": 4.755429144693957e-08, + "logits/chosen": 635464499.2, + "logits/rejected": 438751914.6666667, + "logps/chosen": -290.402783203125, + "logps/rejected": -385.8149007161458, + "loss": 0.0208, + "rewards/chosen": 4.047807312011718, + "rewards/margins": 11.247454579671224, + "rewards/rejected": -7.199647267659505, + "step": 10466 + }, + { + "epoch": 0.9563270899954317, + "grad_norm": 1.5390625, + "kl": 0.0, + "learning_rate": 4.7356668013206486e-08, + "logits/chosen": 507871061.3333333, + "logits/rejected": 95911200.0, + "logps/chosen": -327.0758056640625, + "logps/rejected": -490.51177978515625, + "loss": 0.0123, + "rewards/chosen": 4.403344472249349, + "rewards/margins": 13.573676427205402, + "rewards/rejected": -9.170331954956055, + "step": 10467 + }, + { + "epoch": 0.9564184559159433, + "grad_norm": 2.890625, + "kl": 0.0, + "learning_rate": 4.715945411718992e-08, + "logits/chosen": 430555562.6666667, + "logits/rejected": 793921433.6, + "logps/chosen": -252.62479654947916, + "logps/rejected": -698.26298828125, + "loss": 0.0144, + "rewards/chosen": 3.824284235636393, + "rewards/margins": 14.729523340861002, + "rewards/rejected": -10.90523910522461, + "step": 10468 + }, + { + "epoch": 0.956509821836455, + "grad_norm": 3.828125, + "kl": 0.0, + "learning_rate": 4.696264977519738e-08, + "logits/chosen": 471855513.6, + "logits/rejected": 749216426.6666666, + "logps/chosen": -255.8511474609375, + "logps/rejected": -399.432373046875, + "loss": 0.0265, + "rewards/chosen": 3.745127868652344, + "rewards/margins": 13.98134028116862, + "rewards/rejected": -10.236212412516275, + "step": 10469 + }, + { + "epoch": 0.9566011877569667, + "grad_norm": 0.78515625, + "kl": 0.0, + "learning_rate": 4.676625500350307e-08, + "logits/chosen": 565749120.0, + "logits/rejected": 615562971.4285715, + "logps/chosen": -47.19227600097656, + "logps/rejected": -677.2505580357143, + "loss": 0.007, + "rewards/chosen": 2.8604142665863037, + "rewards/margins": 11.830746820994786, + "rewards/rejected": -8.970332554408483, + "step": 10470 + }, + { + "epoch": 0.9566925536774783, + "grad_norm": 3.4375, + "kl": 0.0, + "learning_rate": 4.657026981834623e-08, + "logits/chosen": 378812256.0, + "logits/rejected": 899219776.0, + "logps/chosen": -175.5996856689453, + "logps/rejected": -451.997314453125, + "loss": 0.0217, + "rewards/chosen": 3.6007232666015625, + "rewards/margins": 12.640783309936523, + "rewards/rejected": -9.040060043334961, + "step": 10471 + }, + { + "epoch": 0.9567839195979899, + "grad_norm": 2.40625, + "kl": 0.0, + "learning_rate": 4.637469423593388e-08, + "logits/chosen": 792496332.8, + "logits/rejected": 494092714.6666667, + "logps/chosen": -404.609814453125, + "logps/rejected": -481.9998372395833, + "loss": 0.0178, + "rewards/chosen": 4.023217010498047, + "rewards/margins": 11.422826894124348, + "rewards/rejected": -7.399609883626302, + "step": 10472 + }, + { + "epoch": 0.9568752855185017, + "grad_norm": 3.46875, + "kl": 1.7679405212402344, + "learning_rate": 4.617952827243755e-08, + "logits/chosen": 769033142.8571428, + "logits/rejected": 541893120.0, + "logps/chosen": -352.47119140625, + "logps/rejected": -821.171875, + "loss": 0.0189, + "rewards/chosen": 4.685488564627511, + "rewards/margins": 16.329177720206125, + "rewards/rejected": -11.643689155578613, + "step": 10473 + }, + { + "epoch": 0.9569666514390133, + "grad_norm": 3.34375, + "kl": 0.0, + "learning_rate": 4.598477194399653e-08, + "logits/chosen": 342558182.4, + "logits/rejected": 399659349.3333333, + "logps/chosen": -312.1136962890625, + "logps/rejected": -375.1988525390625, + "loss": 0.0298, + "rewards/chosen": 3.6708656311035157, + "rewards/margins": 12.387769317626953, + "rewards/rejected": -8.716903686523438, + "step": 10474 + }, + { + "epoch": 0.9570580173595249, + "grad_norm": 2.765625, + "kl": 0.0, + "learning_rate": 4.579042526671462e-08, + "logits/chosen": 364817664.0, + "logits/rejected": 383612902.4, + "logps/chosen": -344.229736328125, + "logps/rejected": -609.060205078125, + "loss": 0.0186, + "rewards/chosen": 3.0162566502889, + "rewards/margins": 11.625968869527181, + "rewards/rejected": -8.609712219238281, + "step": 10475 + }, + { + "epoch": 0.9571493832800365, + "grad_norm": 56.75, + "kl": 0.0, + "learning_rate": 4.559648825666341e-08, + "logits/chosen": 773330176.0, + "logits/rejected": 606904640.0, + "logps/chosen": -415.8604431152344, + "logps/rejected": -510.59429931640625, + "loss": 0.0961, + "rewards/chosen": 3.176818370819092, + "rewards/margins": 11.082385540008545, + "rewards/rejected": -7.905567169189453, + "step": 10476 + }, + { + "epoch": 0.9572407492005482, + "grad_norm": 3.46875, + "kl": 0.0, + "learning_rate": 4.5402960929878394e-08, + "logits/chosen": 611142451.2, + "logits/rejected": 461194154.6666667, + "logps/chosen": -429.117041015625, + "logps/rejected": -560.3935953776041, + "loss": 0.0201, + "rewards/chosen": 3.635764312744141, + "rewards/margins": 12.731026204427085, + "rewards/rejected": -9.095261891682943, + "step": 10477 + }, + { + "epoch": 0.9573321151210599, + "grad_norm": 4.8125, + "kl": 0.0, + "learning_rate": 4.5209843302363444e-08, + "logits/chosen": 371413888.0, + "logits/rejected": 303884736.0, + "logps/chosen": -330.0121765136719, + "logps/rejected": -517.8330688476562, + "loss": 0.0169, + "rewards/chosen": 3.8905694484710693, + "rewards/margins": 14.524522542953491, + "rewards/rejected": -10.633953094482422, + "step": 10478 + }, + { + "epoch": 0.9574234810415715, + "grad_norm": 1.6015625, + "kl": 0.0, + "learning_rate": 4.501713539008745e-08, + "logits/chosen": 605511360.0, + "logits/rejected": 454076320.0, + "logps/chosen": -473.4626770019531, + "logps/rejected": -490.42059326171875, + "loss": 0.0079, + "rewards/chosen": 4.663602352142334, + "rewards/margins": 13.792726039886475, + "rewards/rejected": -9.12912368774414, + "step": 10479 + }, + { + "epoch": 0.9575148469620831, + "grad_norm": 2.328125, + "kl": 0.0, + "learning_rate": 4.4824837208984896e-08, + "logits/chosen": 315308492.8, + "logits/rejected": 652096853.3333334, + "logps/chosen": -275.8699951171875, + "logps/rejected": -328.59710693359375, + "loss": 0.0165, + "rewards/chosen": 4.119009780883789, + "rewards/margins": 12.678217697143555, + "rewards/rejected": -8.559207916259766, + "step": 10480 + }, + { + "epoch": 0.9576062128825948, + "grad_norm": 2.546875, + "kl": 0.0, + "learning_rate": 4.4632948774957494e-08, + "logits/chosen": 519102880.0, + "logits/rejected": 471704928.0, + "logps/chosen": -380.56134033203125, + "logps/rejected": -484.87109375, + "loss": 0.0105, + "rewards/chosen": 4.196070671081543, + "rewards/margins": 16.313234329223633, + "rewards/rejected": -12.11716365814209, + "step": 10481 + }, + { + "epoch": 0.9576975788031065, + "grad_norm": 2.078125, + "kl": 0.0, + "learning_rate": 4.444147010387312e-08, + "logits/chosen": 480717397.3333333, + "logits/rejected": 400019840.0, + "logps/chosen": -416.5823567708333, + "logps/rejected": -447.0703125, + "loss": 0.0161, + "rewards/chosen": 4.118722279866536, + "rewards/margins": 16.70123227437337, + "rewards/rejected": -12.582509994506836, + "step": 10482 + }, + { + "epoch": 0.9577889447236181, + "grad_norm": 4.21875, + "kl": 0.0, + "learning_rate": 4.425040121156521e-08, + "logits/chosen": 487595861.3333333, + "logits/rejected": 1000178496.0, + "logps/chosen": -249.0943603515625, + "logps/rejected": -644.306396484375, + "loss": 0.027, + "rewards/chosen": 4.281078338623047, + "rewards/margins": 14.50307559967041, + "rewards/rejected": -10.221997261047363, + "step": 10483 + }, + { + "epoch": 0.9578803106441297, + "grad_norm": 1.890625, + "kl": 0.0, + "learning_rate": 4.4059742113832793e-08, + "logits/chosen": 466370150.4, + "logits/rejected": 928099157.3333334, + "logps/chosen": -287.270556640625, + "logps/rejected": -628.0337727864584, + "loss": 0.0131, + "rewards/chosen": 4.328129577636719, + "rewards/margins": 14.70281550089518, + "rewards/rejected": -10.374685923258463, + "step": 10484 + }, + { + "epoch": 0.9579716765646414, + "grad_norm": 13.1875, + "kl": 0.0, + "learning_rate": 4.3869492826441595e-08, + "logits/chosen": 586392704.0, + "logits/rejected": 334722496.0, + "logps/chosen": -308.60931396484375, + "logps/rejected": -621.6404418945312, + "loss": 0.1005, + "rewards/chosen": 3.0391321182250977, + "rewards/margins": 14.530686378479004, + "rewards/rejected": -11.491554260253906, + "step": 10485 + }, + { + "epoch": 0.9580630424851531, + "grad_norm": 1.3359375, + "kl": 0.0, + "learning_rate": 4.367965336512403e-08, + "logits/chosen": 839744614.4, + "logits/rejected": 334716693.3333333, + "logps/chosen": -299.150341796875, + "logps/rejected": -512.3865966796875, + "loss": 0.0097, + "rewards/chosen": 4.298246383666992, + "rewards/margins": 16.34961382548014, + "rewards/rejected": -12.05136744181315, + "step": 10486 + }, + { + "epoch": 0.9581544084056647, + "grad_norm": 2.5625, + "kl": 0.0, + "learning_rate": 4.349022374557754e-08, + "logits/chosen": 688154538.6666666, + "logits/rejected": 481276211.2, + "logps/chosen": -496.001953125, + "logps/rejected": -580.172265625, + "loss": 0.0138, + "rewards/chosen": 3.331712086995443, + "rewards/margins": 12.904916127522787, + "rewards/rejected": -9.573204040527344, + "step": 10487 + }, + { + "epoch": 0.9582457743261763, + "grad_norm": 1.265625, + "kl": 0.0, + "learning_rate": 4.330120398346682e-08, + "logits/chosen": 484393344.0, + "logits/rejected": 467537536.0, + "logps/chosen": -349.85809326171875, + "logps/rejected": -438.8257141113281, + "loss": 0.0063, + "rewards/chosen": 4.604502201080322, + "rewards/margins": 13.882648944854736, + "rewards/rejected": -9.278146743774414, + "step": 10488 + }, + { + "epoch": 0.958337140246688, + "grad_norm": 5.375, + "kl": 0.0, + "learning_rate": 4.311259409442159e-08, + "logits/chosen": 815241536.0, + "logits/rejected": 497694464.0, + "logps/chosen": -408.02294921875, + "logps/rejected": -499.588623046875, + "loss": 0.026, + "rewards/chosen": 3.2208805084228516, + "rewards/margins": 12.64848518371582, + "rewards/rejected": -9.427604675292969, + "step": 10489 + }, + { + "epoch": 0.9584285061671997, + "grad_norm": 0.6953125, + "kl": 0.0, + "learning_rate": 4.292439409403826e-08, + "logits/chosen": 721577856.0, + "logits/rejected": 312854250.6666667, + "logps/chosen": -469.1141357421875, + "logps/rejected": -370.6690266927083, + "loss": 0.0041, + "rewards/chosen": 4.454931735992432, + "rewards/margins": 12.930493513743082, + "rewards/rejected": -8.47556177775065, + "step": 10490 + }, + { + "epoch": 0.9585198720877113, + "grad_norm": 3.71875, + "kl": 0.0, + "learning_rate": 4.273660399787938e-08, + "logits/chosen": 319649331.2, + "logits/rejected": 495155541.3333333, + "logps/chosen": -251.2869384765625, + "logps/rejected": -651.1699625651041, + "loss": 0.0208, + "rewards/chosen": 4.08355712890625, + "rewards/margins": 13.30783716837565, + "rewards/rejected": -9.2242800394694, + "step": 10491 + }, + { + "epoch": 0.9586112380082229, + "grad_norm": 2.671875, + "kl": 0.0, + "learning_rate": 4.254922382147309e-08, + "logits/chosen": 325010912.0, + "logits/rejected": 374147008.0, + "logps/chosen": -247.385986328125, + "logps/rejected": -540.3477783203125, + "loss": 0.0155, + "rewards/chosen": 4.0934271812438965, + "rewards/margins": 13.46692419052124, + "rewards/rejected": -9.373497009277344, + "step": 10492 + }, + { + "epoch": 0.9587026039287346, + "grad_norm": 1.1640625, + "kl": 0.0, + "learning_rate": 4.236225358031421e-08, + "logits/chosen": 507257056.0, + "logits/rejected": 425620832.0, + "logps/chosen": -318.53997802734375, + "logps/rejected": -551.0929565429688, + "loss": 0.0064, + "rewards/chosen": 4.921039581298828, + "rewards/margins": 16.83294677734375, + "rewards/rejected": -11.911907196044922, + "step": 10493 + }, + { + "epoch": 0.9587939698492463, + "grad_norm": 5.3125, + "kl": 0.0, + "learning_rate": 4.217569328986315e-08, + "logits/chosen": 900697920.0, + "logits/rejected": 497393280.0, + "logps/chosen": -508.857421875, + "logps/rejected": -384.2493591308594, + "loss": 0.0089, + "rewards/chosen": 5.1032562255859375, + "rewards/margins": 13.496816635131836, + "rewards/rejected": -8.393560409545898, + "step": 10494 + }, + { + "epoch": 0.9588853357697579, + "grad_norm": 2.828125, + "kl": 0.0, + "learning_rate": 4.1989542965547025e-08, + "logits/chosen": 406574899.2, + "logits/rejected": 255453397.33333334, + "logps/chosen": -267.8406982421875, + "logps/rejected": -389.6572265625, + "loss": 0.019, + "rewards/chosen": 4.284654235839843, + "rewards/margins": 14.54164784749349, + "rewards/rejected": -10.256993611653646, + "step": 10495 + }, + { + "epoch": 0.9589767016902695, + "grad_norm": 1.53125, + "kl": 0.0, + "learning_rate": 4.180380262275907e-08, + "logits/chosen": 436260778.6666667, + "logits/rejected": 646923264.0, + "logps/chosen": -342.4488932291667, + "logps/rejected": -573.5845947265625, + "loss": 0.0103, + "rewards/chosen": 4.500617027282715, + "rewards/margins": 13.035725593566895, + "rewards/rejected": -8.53510856628418, + "step": 10496 + }, + { + "epoch": 0.9590680676107812, + "grad_norm": 2.328125, + "kl": 0.0, + "learning_rate": 4.161847227685811e-08, + "logits/chosen": 902327210.6666666, + "logits/rejected": 802815680.0, + "logps/chosen": -279.8087158203125, + "logps/rejected": -458.35986328125, + "loss": 0.0157, + "rewards/chosen": 4.4716746012369795, + "rewards/margins": 14.621569315592449, + "rewards/rejected": -10.149894714355469, + "step": 10497 + }, + { + "epoch": 0.9591594335312928, + "grad_norm": 2.203125, + "kl": 0.0, + "learning_rate": 4.143355194316856e-08, + "logits/chosen": 432374016.0, + "logits/rejected": 945220608.0, + "logps/chosen": -144.3617960611979, + "logps/rejected": -464.62841796875, + "loss": 0.0152, + "rewards/chosen": 3.2564919789632163, + "rewards/margins": 12.636499913533529, + "rewards/rejected": -9.380007934570312, + "step": 10498 + }, + { + "epoch": 0.9592507994518045, + "grad_norm": 3.46875, + "kl": 0.0, + "learning_rate": 4.124904163698207e-08, + "logits/chosen": 513114965.3333333, + "logits/rejected": 646516736.0, + "logps/chosen": -371.3455403645833, + "logps/rejected": -481.69794921875, + "loss": 0.0216, + "rewards/chosen": 2.8066558837890625, + "rewards/margins": 12.391709899902343, + "rewards/rejected": -9.585054016113281, + "step": 10499 + }, + { + "epoch": 0.9593421653723161, + "grad_norm": 3.140625, + "kl": 0.0, + "learning_rate": 4.1064941373556434e-08, + "logits/chosen": 439685589.3333333, + "logits/rejected": 595501977.6, + "logps/chosen": -284.39341227213544, + "logps/rejected": -426.70712890625, + "loss": 0.0131, + "rewards/chosen": 4.093270937601726, + "rewards/margins": 12.565966860453287, + "rewards/rejected": -8.472695922851562, + "step": 10500 + }, + { + "epoch": 0.9594335312928278, + "grad_norm": 1.4453125, + "kl": 0.0, + "learning_rate": 4.088125116811448e-08, + "logits/chosen": 655144789.3333334, + "logits/rejected": 859867955.2, + "logps/chosen": -469.805419921875, + "logps/rejected": -537.7314453125, + "loss": 0.0079, + "rewards/chosen": 3.8918050130208335, + "rewards/margins": 13.27753651936849, + "rewards/rejected": -9.385731506347657, + "step": 10501 + }, + { + "epoch": 0.9595248972133394, + "grad_norm": 4.65625, + "kl": 0.0, + "learning_rate": 4.069797103584572e-08, + "logits/chosen": 384461440.0, + "logits/rejected": 442712768.0, + "logps/chosen": -259.9921468098958, + "logps/rejected": -274.83740234375, + "loss": 0.031, + "rewards/chosen": 3.7049859364827475, + "rewards/margins": 13.496208508809408, + "rewards/rejected": -9.79122257232666, + "step": 10502 + }, + { + "epoch": 0.9596162631338511, + "grad_norm": 34.75, + "kl": 0.0, + "learning_rate": 4.051510099190581e-08, + "logits/chosen": 1210076928.0, + "logits/rejected": 488048347.4285714, + "logps/chosen": -278.13275146484375, + "logps/rejected": -530.0631277901786, + "loss": 0.0216, + "rewards/chosen": 4.701565742492676, + "rewards/margins": 15.31752382005964, + "rewards/rejected": -10.615958077566964, + "step": 10503 + }, + { + "epoch": 0.9597076290543627, + "grad_norm": 5.0, + "kl": 0.0, + "learning_rate": 4.033264105141599e-08, + "logits/chosen": 734295296.0, + "logits/rejected": 580832042.6666666, + "logps/chosen": -262.4197082519531, + "logps/rejected": -400.97119140625, + "loss": 0.0079, + "rewards/chosen": 4.283967018127441, + "rewards/margins": 12.429933230082193, + "rewards/rejected": -8.145966211954752, + "step": 10504 + }, + { + "epoch": 0.9597989949748744, + "grad_norm": 1.4609375, + "kl": 0.0, + "learning_rate": 4.0150591229465294e-08, + "logits/chosen": 1159761066.6666667, + "logits/rejected": 426092390.4, + "logps/chosen": -287.2714436848958, + "logps/rejected": -297.215478515625, + "loss": 0.0105, + "rewards/chosen": 3.651190439860026, + "rewards/margins": 12.962948099772134, + "rewards/rejected": -9.311757659912109, + "step": 10505 + }, + { + "epoch": 0.959890360895386, + "grad_norm": 3.046875, + "kl": 0.0, + "learning_rate": 3.996895154110614e-08, + "logits/chosen": 478331611.4285714, + "logits/rejected": 410079552.0, + "logps/chosen": -294.0532924107143, + "logps/rejected": -285.4367980957031, + "loss": 0.0211, + "rewards/chosen": 4.007805415562221, + "rewards/margins": 12.46457917349679, + "rewards/rejected": -8.45677375793457, + "step": 10506 + }, + { + "epoch": 0.9599817268158977, + "grad_norm": 3.390625, + "kl": 0.0, + "learning_rate": 3.978772200135927e-08, + "logits/chosen": 745786026.6666666, + "logits/rejected": 627773312.0, + "logps/chosen": -301.66827392578125, + "logps/rejected": -451.01385498046875, + "loss": 0.0276, + "rewards/chosen": 3.5340938568115234, + "rewards/margins": 13.595966339111328, + "rewards/rejected": -10.061872482299805, + "step": 10507 + }, + { + "epoch": 0.9600730927364093, + "grad_norm": 2.703125, + "kl": 0.0, + "learning_rate": 3.960690262520994e-08, + "logits/chosen": 544031061.3333334, + "logits/rejected": 487343552.0, + "logps/chosen": -336.00852457682294, + "logps/rejected": -759.439453125, + "loss": 0.0168, + "rewards/chosen": 3.9690465927124023, + "rewards/margins": 13.677538871765137, + "rewards/rejected": -9.708492279052734, + "step": 10508 + }, + { + "epoch": 0.960164458656921, + "grad_norm": 29.25, + "kl": 0.0, + "learning_rate": 3.9426493427611177e-08, + "logits/chosen": 544178022.4, + "logits/rejected": 461550592.0, + "logps/chosen": -252.9424072265625, + "logps/rejected": -447.8599446614583, + "loss": 0.0477, + "rewards/chosen": 2.9243133544921873, + "rewards/margins": 13.22144775390625, + "rewards/rejected": -10.297134399414062, + "step": 10509 + }, + { + "epoch": 0.9602558245774326, + "grad_norm": 5.71875, + "kl": 0.0, + "learning_rate": 3.92464944234805e-08, + "logits/chosen": 682923776.0, + "logits/rejected": 621608448.0, + "logps/chosen": -358.43939208984375, + "logps/rejected": -402.99639892578125, + "loss": 0.0284, + "rewards/chosen": 3.3684685230255127, + "rewards/margins": 10.778397798538208, + "rewards/rejected": -7.409929275512695, + "step": 10510 + }, + { + "epoch": 0.9603471904979443, + "grad_norm": 2.84375, + "kl": 0.0, + "learning_rate": 3.90669056277021e-08, + "logits/chosen": 501198720.0, + "logits/rejected": 319241568.0, + "logps/chosen": -347.34710693359375, + "logps/rejected": -509.47052001953125, + "loss": 0.0159, + "rewards/chosen": 3.9442856311798096, + "rewards/margins": 12.005680322647095, + "rewards/rejected": -8.061394691467285, + "step": 10511 + }, + { + "epoch": 0.9604385564184559, + "grad_norm": 1.796875, + "kl": 0.0, + "learning_rate": 3.888772705512633e-08, + "logits/chosen": 688920448.0, + "logits/rejected": 872784896.0, + "logps/chosen": -374.32861328125, + "logps/rejected": -440.8690592447917, + "loss": 0.0079, + "rewards/chosen": 4.20303201675415, + "rewards/margins": 12.048161665598553, + "rewards/rejected": -7.845129648844401, + "step": 10512 + }, + { + "epoch": 0.9605299223389676, + "grad_norm": 2.8125, + "kl": 0.0, + "learning_rate": 3.870895872057023e-08, + "logits/chosen": 1216837248.0, + "logits/rejected": 405554432.0, + "logps/chosen": -333.7275390625, + "logps/rejected": -632.669921875, + "loss": 0.0235, + "rewards/chosen": 3.1705546379089355, + "rewards/margins": 15.929324626922607, + "rewards/rejected": -12.758769989013672, + "step": 10513 + }, + { + "epoch": 0.9606212882594792, + "grad_norm": 2.140625, + "kl": 0.0, + "learning_rate": 3.8530600638815865e-08, + "logits/chosen": 573202304.0, + "logits/rejected": 528959414.85714287, + "logps/chosen": -396.432861328125, + "logps/rejected": -535.7498604910714, + "loss": 0.0068, + "rewards/chosen": 2.888397216796875, + "rewards/margins": 13.400314331054688, + "rewards/rejected": -10.511917114257812, + "step": 10514 + }, + { + "epoch": 0.9607126541799909, + "grad_norm": 0.84765625, + "kl": 0.0, + "learning_rate": 3.835265282461198e-08, + "logits/chosen": 513263104.0, + "logits/rejected": 281405152.0, + "logps/chosen": -380.8509216308594, + "logps/rejected": -404.63409423828125, + "loss": 0.0042, + "rewards/chosen": 5.236793518066406, + "rewards/margins": 15.28841495513916, + "rewards/rejected": -10.051621437072754, + "step": 10515 + }, + { + "epoch": 0.9608040201005025, + "grad_norm": 1.578125, + "kl": 0.0, + "learning_rate": 3.817511529267237e-08, + "logits/chosen": 438197043.2, + "logits/rejected": 797799424.0, + "logps/chosen": -247.1832275390625, + "logps/rejected": -700.477294921875, + "loss": 0.0105, + "rewards/chosen": 4.336761856079102, + "rewards/margins": 15.191257858276368, + "rewards/rejected": -10.854496002197266, + "step": 10516 + }, + { + "epoch": 0.9608953860210142, + "grad_norm": 2.1875, + "kl": 0.0, + "learning_rate": 3.799798805767918e-08, + "logits/chosen": 315185834.6666667, + "logits/rejected": 590315264.0, + "logps/chosen": -293.9723307291667, + "logps/rejected": -260.701171875, + "loss": 0.0235, + "rewards/chosen": 3.960972468058268, + "rewards/margins": 12.290420214335123, + "rewards/rejected": -8.329447746276855, + "step": 10517 + }, + { + "epoch": 0.9609867519415258, + "grad_norm": 4.8125, + "kl": 0.0, + "learning_rate": 3.782127113427791e-08, + "logits/chosen": 1039267635.2, + "logits/rejected": 575746005.3333334, + "logps/chosen": -274.4364501953125, + "logps/rejected": -500.3636067708333, + "loss": 0.0289, + "rewards/chosen": 3.6880844116210936, + "rewards/margins": 11.795583852132161, + "rewards/rejected": -8.107499440511068, + "step": 10518 + }, + { + "epoch": 0.9610781178620375, + "grad_norm": 1.84375, + "kl": 0.0, + "learning_rate": 3.764496453708244e-08, + "logits/chosen": 260110540.8, + "logits/rejected": 354396074.6666667, + "logps/chosen": -132.5595458984375, + "logps/rejected": -500.7499593098958, + "loss": 0.0314, + "rewards/chosen": 4.29595718383789, + "rewards/margins": 12.99273198445638, + "rewards/rejected": -8.69677480061849, + "step": 10519 + }, + { + "epoch": 0.9611694837825491, + "grad_norm": 1.2734375, + "kl": 0.0, + "learning_rate": 3.746906828067054e-08, + "logits/chosen": 414616064.0, + "logits/rejected": 395324342.85714287, + "logps/chosen": -301.20733642578125, + "logps/rejected": -491.11561802455356, + "loss": 0.0048, + "rewards/chosen": 3.285571336746216, + "rewards/margins": 11.942636592047554, + "rewards/rejected": -8.657065255301339, + "step": 10520 + }, + { + "epoch": 0.9612608497030608, + "grad_norm": 4.8125, + "kl": 0.0, + "learning_rate": 3.72935823795878e-08, + "logits/chosen": 634254272.0, + "logits/rejected": 352592896.0, + "logps/chosen": -334.8931579589844, + "logps/rejected": -481.4891662597656, + "loss": 0.0271, + "rewards/chosen": 3.0875353813171387, + "rewards/margins": 12.185445308685303, + "rewards/rejected": -9.097909927368164, + "step": 10521 + }, + { + "epoch": 0.9613522156235724, + "grad_norm": 3.53125, + "kl": 0.0, + "learning_rate": 3.711850684834539e-08, + "logits/chosen": 571435434.6666666, + "logits/rejected": 314456448.0, + "logps/chosen": -333.05678304036456, + "logps/rejected": -452.9047546386719, + "loss": 0.0244, + "rewards/chosen": 3.592151323954264, + "rewards/margins": 13.61470858256022, + "rewards/rejected": -10.022557258605957, + "step": 10522 + }, + { + "epoch": 0.961443581544084, + "grad_norm": 5.75, + "kl": 0.0, + "learning_rate": 3.694384170142063e-08, + "logits/chosen": 565616341.3333334, + "logits/rejected": 703841408.0, + "logps/chosen": -307.3462727864583, + "logps/rejected": -608.0414428710938, + "loss": 0.0468, + "rewards/chosen": 3.604877154032389, + "rewards/margins": 12.828255335489908, + "rewards/rejected": -9.22337818145752, + "step": 10523 + }, + { + "epoch": 0.9615349474645957, + "grad_norm": 0.80859375, + "kl": 0.0, + "learning_rate": 3.676958695325639e-08, + "logits/chosen": 792373568.0, + "logits/rejected": 831895296.0, + "logps/chosen": -307.5026550292969, + "logps/rejected": -565.3720703125, + "loss": 0.0037, + "rewards/chosen": 4.356683731079102, + "rewards/margins": 14.077959060668945, + "rewards/rejected": -9.721275329589844, + "step": 10524 + }, + { + "epoch": 0.9616263133851074, + "grad_norm": 0.64453125, + "kl": 0.0, + "learning_rate": 3.659574261826171e-08, + "logits/chosen": 313877717.3333333, + "logits/rejected": 355732096.0, + "logps/chosen": -341.349853515625, + "logps/rejected": -360.8333984375, + "loss": 0.0027, + "rewards/chosen": 5.6208852132161455, + "rewards/margins": 15.068964131673177, + "rewards/rejected": -9.448078918457032, + "step": 10525 + }, + { + "epoch": 0.961717679305619, + "grad_norm": 1.8828125, + "kl": 0.0, + "learning_rate": 3.6422308710812314e-08, + "logits/chosen": 536055142.4, + "logits/rejected": 343686890.6666667, + "logps/chosen": -325.493701171875, + "logps/rejected": -438.6185709635417, + "loss": 0.0114, + "rewards/chosen": 4.548580169677734, + "rewards/margins": 14.406094233194986, + "rewards/rejected": -9.857514063517252, + "step": 10526 + }, + { + "epoch": 0.9618090452261306, + "grad_norm": 2.15625, + "kl": 0.0, + "learning_rate": 3.624928524524951e-08, + "logits/chosen": 493379584.0, + "logits/rejected": 292291754.6666667, + "logps/chosen": -398.983056640625, + "logps/rejected": -320.01715087890625, + "loss": 0.0114, + "rewards/chosen": 4.323825073242188, + "rewards/margins": 12.50922826131185, + "rewards/rejected": -8.185403188069662, + "step": 10527 + }, + { + "epoch": 0.9619004111466423, + "grad_norm": 6.34375, + "kl": 0.0, + "learning_rate": 3.6076672235880737e-08, + "logits/chosen": 604287573.3333334, + "logits/rejected": 309053632.0, + "logps/chosen": -277.2236735026042, + "logps/rejected": -147.71499633789062, + "loss": 0.0447, + "rewards/chosen": 3.4729458491007485, + "rewards/margins": 9.062277475992838, + "rewards/rejected": -5.58933162689209, + "step": 10528 + }, + { + "epoch": 0.961991777067154, + "grad_norm": 0.875, + "kl": 0.0, + "learning_rate": 3.5904469696979025e-08, + "logits/chosen": 414318336.0, + "logits/rejected": 400510720.0, + "logps/chosen": -378.851318359375, + "logps/rejected": -602.41845703125, + "loss": 0.0046, + "rewards/chosen": 4.972770690917969, + "rewards/margins": 16.699617385864258, + "rewards/rejected": -11.726846694946289, + "step": 10529 + }, + { + "epoch": 0.9620831429876656, + "grad_norm": 5.09375, + "kl": 0.0, + "learning_rate": 3.573267764278465e-08, + "logits/chosen": 371569203.2, + "logits/rejected": 299339605.3333333, + "logps/chosen": -333.70390625, + "logps/rejected": -358.5301106770833, + "loss": 0.0397, + "rewards/chosen": 3.1991809844970702, + "rewards/margins": 11.511467361450196, + "rewards/rejected": -8.312286376953125, + "step": 10530 + }, + { + "epoch": 0.9621745089081772, + "grad_norm": 48.5, + "kl": 0.0, + "learning_rate": 3.5561296087502916e-08, + "logits/chosen": 555009484.8, + "logits/rejected": 645730218.6666666, + "logps/chosen": -311.069482421875, + "logps/rejected": -406.4978841145833, + "loss": 0.0673, + "rewards/chosen": 4.06023063659668, + "rewards/margins": 10.879825337727866, + "rewards/rejected": -6.819594701131185, + "step": 10531 + }, + { + "epoch": 0.9622658748286889, + "grad_norm": 0.90625, + "kl": 0.0, + "learning_rate": 3.5390325045304704e-08, + "logits/chosen": 589948842.6666666, + "logits/rejected": 610737459.2, + "logps/chosen": -495.4016927083333, + "logps/rejected": -580.72919921875, + "loss": 0.0054, + "rewards/chosen": 4.319041887919108, + "rewards/margins": 13.450008074442547, + "rewards/rejected": -9.130966186523438, + "step": 10532 + }, + { + "epoch": 0.9623572407492006, + "grad_norm": 2.921875, + "kl": 0.0, + "learning_rate": 3.521976453032927e-08, + "logits/chosen": 349537638.4, + "logits/rejected": 186433344.0, + "logps/chosen": -201.493896484375, + "logps/rejected": -409.2805989583333, + "loss": 0.0216, + "rewards/chosen": 3.828053283691406, + "rewards/margins": 15.84376220703125, + "rewards/rejected": -12.015708923339844, + "step": 10533 + }, + { + "epoch": 0.9624486066697122, + "grad_norm": 3.171875, + "kl": 0.0, + "learning_rate": 3.5049614556679214e-08, + "logits/chosen": 664265002.6666666, + "logits/rejected": 480292147.2, + "logps/chosen": -338.3268229166667, + "logps/rejected": -548.221484375, + "loss": 0.02, + "rewards/chosen": 3.263939539591471, + "rewards/margins": 13.079163233439127, + "rewards/rejected": -9.815223693847656, + "step": 10534 + }, + { + "epoch": 0.9625399725902238, + "grad_norm": 1.3359375, + "kl": 0.0, + "learning_rate": 3.4879875138424945e-08, + "logits/chosen": 627092416.0, + "logits/rejected": 460380384.0, + "logps/chosen": -365.947021484375, + "logps/rejected": -367.0208435058594, + "loss": 0.0103, + "rewards/chosen": 4.548826217651367, + "rewards/margins": 13.590526580810547, + "rewards/rejected": -9.04170036315918, + "step": 10535 + }, + { + "epoch": 0.9626313385107355, + "grad_norm": 5.875, + "kl": 0.0, + "learning_rate": 3.471054628960191e-08, + "logits/chosen": 660025958.4, + "logits/rejected": 434752896.0, + "logps/chosen": -261.819384765625, + "logps/rejected": -487.6164143880208, + "loss": 0.0511, + "rewards/chosen": 2.9831607818603514, + "rewards/margins": 10.053828048706055, + "rewards/rejected": -7.070667266845703, + "step": 10536 + }, + { + "epoch": 0.9627227044312472, + "grad_norm": 2.625, + "kl": 0.0, + "learning_rate": 3.4541628024212794e-08, + "logits/chosen": 766916266.6666666, + "logits/rejected": 604608409.6, + "logps/chosen": -606.5688883463541, + "logps/rejected": -417.1123046875, + "loss": 0.0135, + "rewards/chosen": 3.6310323079427085, + "rewards/margins": 11.914656575520834, + "rewards/rejected": -8.283624267578125, + "step": 10537 + }, + { + "epoch": 0.9628140703517588, + "grad_norm": 1.71875, + "kl": 0.0, + "learning_rate": 3.437312035622475e-08, + "logits/chosen": 894764714.6666666, + "logits/rejected": 1129045299.2, + "logps/chosen": -393.9976806640625, + "logps/rejected": -474.795947265625, + "loss": 0.0098, + "rewards/chosen": 3.935497283935547, + "rewards/margins": 13.159407043457032, + "rewards/rejected": -9.223909759521485, + "step": 10538 + }, + { + "epoch": 0.9629054362722704, + "grad_norm": 1.5234375, + "kl": 0.0, + "learning_rate": 3.4205023299572205e-08, + "logits/chosen": 896332608.0, + "logits/rejected": 586010154.6666666, + "logps/chosen": -478.74951171875, + "logps/rejected": -407.1598714192708, + "loss": 0.0062, + "rewards/chosen": 4.049708843231201, + "rewards/margins": 12.398082256317139, + "rewards/rejected": -8.348373413085938, + "step": 10539 + }, + { + "epoch": 0.962996802192782, + "grad_norm": 3.0, + "kl": 0.0, + "learning_rate": 3.403733686815458e-08, + "logits/chosen": 599935573.3333334, + "logits/rejected": 676972544.0, + "logps/chosen": -278.45074462890625, + "logps/rejected": -503.5076904296875, + "loss": 0.0163, + "rewards/chosen": 4.840924580891927, + "rewards/margins": 13.787247021993, + "rewards/rejected": -8.946322441101074, + "step": 10540 + }, + { + "epoch": 0.9630881681132938, + "grad_norm": 1.5546875, + "kl": 0.0, + "learning_rate": 3.387006107583912e-08, + "logits/chosen": 760239616.0, + "logits/rejected": 615207552.0, + "logps/chosen": -446.226806640625, + "logps/rejected": -638.3591918945312, + "loss": 0.0116, + "rewards/chosen": 4.273672103881836, + "rewards/margins": 12.760177612304688, + "rewards/rejected": -8.486505508422852, + "step": 10541 + }, + { + "epoch": 0.9631795340338054, + "grad_norm": 3.53125, + "kl": 0.0, + "learning_rate": 3.370319593645699e-08, + "logits/chosen": 534248755.2, + "logits/rejected": 819535360.0, + "logps/chosen": -334.88662109375, + "logps/rejected": -503.8780110677083, + "loss": 0.0198, + "rewards/chosen": 3.8770355224609374, + "rewards/margins": 12.775489552815756, + "rewards/rejected": -8.898454030354818, + "step": 10542 + }, + { + "epoch": 0.963270899954317, + "grad_norm": 2.578125, + "kl": 0.0, + "learning_rate": 3.353674146380714e-08, + "logits/chosen": 396796608.0, + "logits/rejected": 636082346.6666666, + "logps/chosen": -277.5914306640625, + "logps/rejected": -517.07421875, + "loss": 0.0111, + "rewards/chosen": 3.628535270690918, + "rewards/margins": 13.171632448832193, + "rewards/rejected": -9.543097178141275, + "step": 10543 + }, + { + "epoch": 0.9633622658748286, + "grad_norm": 2.796875, + "kl": 0.0, + "learning_rate": 3.337069767165357e-08, + "logits/chosen": 369915861.3333333, + "logits/rejected": 713826406.4, + "logps/chosen": -199.42683919270834, + "logps/rejected": -502.360107421875, + "loss": 0.0138, + "rewards/chosen": 4.178107897440593, + "rewards/margins": 14.882671801249188, + "rewards/rejected": -10.704563903808594, + "step": 10544 + }, + { + "epoch": 0.9634536317953404, + "grad_norm": 2.703125, + "kl": 0.0, + "learning_rate": 3.3205064573725855e-08, + "logits/chosen": 525792992.0, + "logits/rejected": 578025557.3333334, + "logps/chosen": -318.43994140625, + "logps/rejected": -465.7386067708333, + "loss": 0.0147, + "rewards/chosen": 2.8343827724456787, + "rewards/margins": 11.3171333471934, + "rewards/rejected": -8.48275057474772, + "step": 10545 + }, + { + "epoch": 0.963544997715852, + "grad_norm": 3.609375, + "kl": 0.0, + "learning_rate": 3.303984218372136e-08, + "logits/chosen": 942842675.2, + "logits/rejected": 746233002.6666666, + "logps/chosen": -335.940478515625, + "logps/rejected": -271.51243082682294, + "loss": 0.0189, + "rewards/chosen": 3.848943328857422, + "rewards/margins": 10.887773768107097, + "rewards/rejected": -7.038830439249675, + "step": 10546 + }, + { + "epoch": 0.9636363636363636, + "grad_norm": 1.9140625, + "kl": 0.0, + "learning_rate": 3.287503051530194e-08, + "logits/chosen": 412033024.0, + "logits/rejected": 612456512.0, + "logps/chosen": -374.3291931152344, + "logps/rejected": -431.28369140625, + "loss": 0.0094, + "rewards/chosen": 4.646425724029541, + "rewards/margins": 13.685981273651123, + "rewards/rejected": -9.039555549621582, + "step": 10547 + }, + { + "epoch": 0.9637277295568752, + "grad_norm": 2.65625, + "kl": 0.0, + "learning_rate": 3.271062958209614e-08, + "logits/chosen": 570265920.0, + "logits/rejected": 808848298.6666666, + "logps/chosen": -366.6859436035156, + "logps/rejected": -571.079833984375, + "loss": 0.0147, + "rewards/chosen": 3.1006522178649902, + "rewards/margins": 11.76848872502645, + "rewards/rejected": -8.667836507161459, + "step": 10548 + }, + { + "epoch": 0.963819095477387, + "grad_norm": 3.125, + "kl": 0.0, + "learning_rate": 3.254663939769809e-08, + "logits/chosen": 361535061.3333333, + "logits/rejected": 628212787.2, + "logps/chosen": -184.739013671875, + "logps/rejected": -541.829931640625, + "loss": 0.0213, + "rewards/chosen": 4.119653065999349, + "rewards/margins": 12.962479909261067, + "rewards/rejected": -8.842826843261719, + "step": 10549 + }, + { + "epoch": 0.9639104613978986, + "grad_norm": 1.0625, + "kl": 0.0, + "learning_rate": 3.2383059975668594e-08, + "logits/chosen": 250229248.0, + "logits/rejected": 377500825.6, + "logps/chosen": -208.670166015625, + "logps/rejected": -423.64287109375, + "loss": 0.0059, + "rewards/chosen": 4.780199368794759, + "rewards/margins": 13.638147672017414, + "rewards/rejected": -8.857948303222656, + "step": 10550 + }, + { + "epoch": 0.9640018273184102, + "grad_norm": 1.453125, + "kl": 0.0, + "learning_rate": 3.221989132953407e-08, + "logits/chosen": 859319500.8, + "logits/rejected": 606788309.3333334, + "logps/chosen": -496.913525390625, + "logps/rejected": -850.4905598958334, + "loss": 0.0079, + "rewards/chosen": 4.715273666381836, + "rewards/margins": 16.491874821980794, + "rewards/rejected": -11.776601155598959, + "step": 10551 + }, + { + "epoch": 0.9640931932389218, + "grad_norm": 1.1640625, + "kl": 0.0, + "learning_rate": 3.205713347278705e-08, + "logits/chosen": 311504384.0, + "logits/rejected": 270325034.6666667, + "logps/chosen": -249.7375, + "logps/rejected": -456.940673828125, + "loss": 0.0086, + "rewards/chosen": 4.499336624145508, + "rewards/margins": 15.20177116394043, + "rewards/rejected": -10.702434539794922, + "step": 10552 + }, + { + "epoch": 0.9641845591594336, + "grad_norm": 2.796875, + "kl": 0.0, + "learning_rate": 3.1894786418886214e-08, + "logits/chosen": 571557824.0, + "logits/rejected": 540092160.0, + "logps/chosen": -380.23529052734375, + "logps/rejected": -407.1143798828125, + "loss": 0.0109, + "rewards/chosen": 3.2401199340820312, + "rewards/margins": 11.689097086588541, + "rewards/rejected": -8.44897715250651, + "step": 10553 + }, + { + "epoch": 0.9642759250799452, + "grad_norm": 13.25, + "kl": 0.0, + "learning_rate": 3.173285018125527e-08, + "logits/chosen": 425694080.0, + "logits/rejected": 595166336.0, + "logps/chosen": -372.7644348144531, + "logps/rejected": -415.3883056640625, + "loss": 0.1233, + "rewards/chosen": 1.9899860620498657, + "rewards/margins": 10.373942017555237, + "rewards/rejected": -8.383955955505371, + "step": 10554 + }, + { + "epoch": 0.9643672910004568, + "grad_norm": 4.84375, + "kl": 0.0, + "learning_rate": 3.1571324773286284e-08, + "logits/chosen": 585210368.0, + "logits/rejected": 223236352.0, + "logps/chosen": -199.35035400390626, + "logps/rejected": -399.3884684244792, + "loss": 0.0327, + "rewards/chosen": 3.2579414367675783, + "rewards/margins": 12.745115152994792, + "rewards/rejected": -9.487173716227213, + "step": 10555 + }, + { + "epoch": 0.9644586569209684, + "grad_norm": 5.84375, + "kl": 0.0, + "learning_rate": 3.141021020833523e-08, + "logits/chosen": 526824896.0, + "logits/rejected": 620951808.0, + "logps/chosen": -262.4378662109375, + "logps/rejected": -445.2729187011719, + "loss": 0.0193, + "rewards/chosen": 4.4371442794799805, + "rewards/margins": 13.73543643951416, + "rewards/rejected": -9.29829216003418, + "step": 10556 + }, + { + "epoch": 0.9645500228414802, + "grad_norm": 38.75, + "kl": 0.0, + "learning_rate": 3.12495064997248e-08, + "logits/chosen": 1013951385.6, + "logits/rejected": 549180757.3333334, + "logps/chosen": -391.332666015625, + "logps/rejected": -357.6409505208333, + "loss": 0.0598, + "rewards/chosen": 3.685861587524414, + "rewards/margins": 11.984504699707031, + "rewards/rejected": -8.298643112182617, + "step": 10557 + }, + { + "epoch": 0.9646413887619918, + "grad_norm": 2.234375, + "kl": 0.0, + "learning_rate": 3.10892136607438e-08, + "logits/chosen": 530493344.0, + "logits/rejected": 564774464.0, + "logps/chosen": -242.52249145507812, + "logps/rejected": -458.6921081542969, + "loss": 0.0112, + "rewards/chosen": 4.002281188964844, + "rewards/margins": 13.828437805175781, + "rewards/rejected": -9.826156616210938, + "step": 10558 + }, + { + "epoch": 0.9647327546825034, + "grad_norm": 1.46875, + "kl": 0.0, + "learning_rate": 3.0929331704646624e-08, + "logits/chosen": 641171840.0, + "logits/rejected": 718145962.6666666, + "logps/chosen": -254.80392456054688, + "logps/rejected": -516.7136637369791, + "loss": 0.0079, + "rewards/chosen": 3.4344429969787598, + "rewards/margins": 13.201864083607992, + "rewards/rejected": -9.767421086629232, + "step": 10559 + }, + { + "epoch": 0.964824120603015, + "grad_norm": 39.0, + "kl": 0.0, + "learning_rate": 3.076986064465437e-08, + "logits/chosen": 390788053.3333333, + "logits/rejected": 562629836.8, + "logps/chosen": -244.48787434895834, + "logps/rejected": -445.39638671875, + "loss": 0.0418, + "rewards/chosen": 3.7731202443440757, + "rewards/margins": 11.774562962849936, + "rewards/rejected": -8.00144271850586, + "step": 10560 + }, + { + "epoch": 0.9649154865235268, + "grad_norm": 0.263671875, + "kl": 0.0, + "learning_rate": 3.0610800493953706e-08, + "logits/chosen": 299153344.0, + "logits/rejected": 446434048.0, + "logps/chosen": -285.5537109375, + "logps/rejected": -417.65269252232144, + "loss": 0.001, + "rewards/chosen": 5.812164306640625, + "rewards/margins": 14.526956285749163, + "rewards/rejected": -8.714791979108538, + "step": 10561 + }, + { + "epoch": 0.9650068524440384, + "grad_norm": 2.59375, + "kl": 0.0, + "learning_rate": 3.0452151265696897e-08, + "logits/chosen": 550095872.0, + "logits/rejected": 402444480.0, + "logps/chosen": -350.9690246582031, + "logps/rejected": -411.5827941894531, + "loss": 0.0145, + "rewards/chosen": 3.7494213581085205, + "rewards/margins": 12.178611040115356, + "rewards/rejected": -8.429189682006836, + "step": 10562 + }, + { + "epoch": 0.96509821836455, + "grad_norm": 1.71875, + "kl": 0.0, + "learning_rate": 3.029391297300399e-08, + "logits/chosen": 838496256.0, + "logits/rejected": 482685132.8, + "logps/chosen": -468.2865397135417, + "logps/rejected": -408.2045654296875, + "loss": 0.0096, + "rewards/chosen": 3.7051350275675454, + "rewards/margins": 13.312588183085124, + "rewards/rejected": -9.607453155517579, + "step": 10563 + }, + { + "epoch": 0.9651895842850616, + "grad_norm": 3.125, + "kl": 0.0, + "learning_rate": 3.013608562895842e-08, + "logits/chosen": 418839466.6666667, + "logits/rejected": 338549792.0, + "logps/chosen": -342.731689453125, + "logps/rejected": -446.9013671875, + "loss": 0.0291, + "rewards/chosen": 3.3373406728108725, + "rewards/margins": 13.619318326314291, + "rewards/rejected": -10.281977653503418, + "step": 10564 + }, + { + "epoch": 0.9652809502055734, + "grad_norm": 2.359375, + "kl": 0.0, + "learning_rate": 2.997866924661252e-08, + "logits/chosen": 522271456.0, + "logits/rejected": 532006613.3333333, + "logps/chosen": -316.7232360839844, + "logps/rejected": -368.6284993489583, + "loss": 0.0076, + "rewards/chosen": 3.9116151332855225, + "rewards/margins": 12.248214483261108, + "rewards/rejected": -8.336599349975586, + "step": 10565 + }, + { + "epoch": 0.965372316126085, + "grad_norm": 1.3828125, + "kl": 0.0, + "learning_rate": 2.9821663838981994e-08, + "logits/chosen": 207365077.33333334, + "logits/rejected": 414337689.6, + "logps/chosen": -111.71072387695312, + "logps/rejected": -465.292578125, + "loss": 0.014, + "rewards/chosen": 3.359511057535807, + "rewards/margins": 12.466157786051431, + "rewards/rejected": -9.106646728515624, + "step": 10566 + }, + { + "epoch": 0.9654636820465966, + "grad_norm": 3.421875, + "kl": 0.0, + "learning_rate": 2.9665069419049784e-08, + "logits/chosen": 757953280.0, + "logits/rejected": 846236964.5714285, + "logps/chosen": -335.9389343261719, + "logps/rejected": -597.5166713169643, + "loss": 0.0118, + "rewards/chosen": 2.7404448986053467, + "rewards/margins": 11.22978935922895, + "rewards/rejected": -8.489344460623604, + "step": 10567 + }, + { + "epoch": 0.9655550479671082, + "grad_norm": 3.578125, + "kl": 0.0, + "learning_rate": 2.950888599976609e-08, + "logits/chosen": 656214630.4, + "logits/rejected": 591714346.6666666, + "logps/chosen": -428.450390625, + "logps/rejected": -427.4046223958333, + "loss": 0.0224, + "rewards/chosen": 3.797198486328125, + "rewards/margins": 13.232655080159507, + "rewards/rejected": -9.43545659383138, + "step": 10568 + }, + { + "epoch": 0.96564641388762, + "grad_norm": 4.0, + "kl": 0.0, + "learning_rate": 2.935311359404447e-08, + "logits/chosen": 805797504.0, + "logits/rejected": 490548096.0, + "logps/chosen": -566.02587890625, + "logps/rejected": -613.0864868164062, + "loss": 0.0213, + "rewards/chosen": 3.716769218444824, + "rewards/margins": 11.827070236206055, + "rewards/rejected": -8.11030101776123, + "step": 10569 + }, + { + "epoch": 0.9657377798081316, + "grad_norm": 1.8046875, + "kl": 0.0, + "learning_rate": 2.9197752214766284e-08, + "logits/chosen": 574105024.0, + "logits/rejected": 437568512.0, + "logps/chosen": -445.666015625, + "logps/rejected": -608.3221435546875, + "loss": 0.0092, + "rewards/chosen": 4.313716411590576, + "rewards/margins": 17.377206325531006, + "rewards/rejected": -13.06348991394043, + "step": 10570 + }, + { + "epoch": 0.9658291457286432, + "grad_norm": 4.9375, + "kl": 0.0, + "learning_rate": 2.9042801874777925e-08, + "logits/chosen": 392800768.0, + "logits/rejected": 264679680.0, + "logps/chosen": -202.14267578125, + "logps/rejected": -238.5069376627604, + "loss": 0.1498, + "rewards/chosen": 3.1641517639160157, + "rewards/margins": 9.705962117513021, + "rewards/rejected": -6.541810353597005, + "step": 10571 + }, + { + "epoch": 0.9659205116491548, + "grad_norm": 4.1875, + "kl": 0.0, + "learning_rate": 2.888826258689359e-08, + "logits/chosen": 457573152.0, + "logits/rejected": 867161472.0, + "logps/chosen": -169.35018920898438, + "logps/rejected": -684.6129150390625, + "loss": 0.0242, + "rewards/chosen": 3.492575168609619, + "rewards/margins": 12.84231424331665, + "rewards/rejected": -9.349739074707031, + "step": 10572 + }, + { + "epoch": 0.9660118775696666, + "grad_norm": 2.84375, + "kl": 0.0, + "learning_rate": 2.8734134363891387e-08, + "logits/chosen": 402580070.4, + "logits/rejected": 473108394.6666667, + "logps/chosen": -283.6239013671875, + "logps/rejected": -715.5895182291666, + "loss": 0.0168, + "rewards/chosen": 4.14079704284668, + "rewards/margins": 15.526630783081055, + "rewards/rejected": -11.385833740234375, + "step": 10573 + }, + { + "epoch": 0.9661032434901782, + "grad_norm": 6.46875, + "kl": 0.0, + "learning_rate": 2.8580417218516675e-08, + "logits/chosen": 608192384.0, + "logits/rejected": 302062233.6, + "logps/chosen": -381.4781901041667, + "logps/rejected": -325.5196533203125, + "loss": 0.1011, + "rewards/chosen": 3.4355430603027344, + "rewards/margins": 10.290193939208985, + "rewards/rejected": -6.85465087890625, + "step": 10574 + }, + { + "epoch": 0.9661946094106898, + "grad_norm": 2.046875, + "kl": 0.0, + "learning_rate": 2.842711116347985e-08, + "logits/chosen": 587098931.2, + "logits/rejected": 482030165.3333333, + "logps/chosen": -165.4294677734375, + "logps/rejected": -396.5074055989583, + "loss": 0.0261, + "rewards/chosen": 3.2392414093017576, + "rewards/margins": 12.910376358032227, + "rewards/rejected": -9.671134948730469, + "step": 10575 + }, + { + "epoch": 0.9662859753312014, + "grad_norm": 2.234375, + "kl": 0.0, + "learning_rate": 2.8274216211458538e-08, + "logits/chosen": 510269440.0, + "logits/rejected": 329762752.0, + "logps/chosen": -405.24542236328125, + "logps/rejected": -469.7777099609375, + "loss": 0.0145, + "rewards/chosen": 3.7169432640075684, + "rewards/margins": 13.188089847564697, + "rewards/rejected": -9.471146583557129, + "step": 10576 + }, + { + "epoch": 0.9663773412517132, + "grad_norm": 5.5, + "kl": 0.0, + "learning_rate": 2.812173237509541e-08, + "logits/chosen": 553637802.6666666, + "logits/rejected": 758703616.0, + "logps/chosen": -309.9123128255208, + "logps/rejected": -520.49755859375, + "loss": 0.0322, + "rewards/chosen": 2.660332202911377, + "rewards/margins": 11.402011585235595, + "rewards/rejected": -8.741679382324218, + "step": 10577 + }, + { + "epoch": 0.9664687071722248, + "grad_norm": 37.0, + "kl": 0.0, + "learning_rate": 2.7969659666999273e-08, + "logits/chosen": 977521459.2, + "logits/rejected": 718163114.6666666, + "logps/chosen": -219.103564453125, + "logps/rejected": -627.5978190104166, + "loss": 0.0364, + "rewards/chosen": 3.326734924316406, + "rewards/margins": 13.340630594889323, + "rewards/rejected": -10.013895670572916, + "step": 10578 + }, + { + "epoch": 0.9665600730927364, + "grad_norm": 2.609375, + "kl": 0.0, + "learning_rate": 2.7817998099745615e-08, + "logits/chosen": 959177523.2, + "logits/rejected": 556112384.0, + "logps/chosen": -461.103515625, + "logps/rejected": -821.9889322916666, + "loss": 0.0135, + "rewards/chosen": 3.9916259765625, + "rewards/margins": 16.81415735880534, + "rewards/rejected": -12.822531382242838, + "step": 10579 + }, + { + "epoch": 0.966651439013248, + "grad_norm": 4.75, + "kl": 0.0, + "learning_rate": 2.7666747685875517e-08, + "logits/chosen": 1141853286.4, + "logits/rejected": 490446208.0, + "logps/chosen": -432.865966796875, + "logps/rejected": -355.254638671875, + "loss": 0.0282, + "rewards/chosen": 3.665599822998047, + "rewards/margins": 13.667928059895834, + "rewards/rejected": -10.002328236897787, + "step": 10580 + }, + { + "epoch": 0.9667428049337597, + "grad_norm": 1.609375, + "kl": 0.0, + "learning_rate": 2.751590843789509e-08, + "logits/chosen": 926028544.0, + "logits/rejected": 662233139.2, + "logps/chosen": -398.2100830078125, + "logps/rejected": -390.032958984375, + "loss": 0.0096, + "rewards/chosen": 3.6888205210367837, + "rewards/margins": 12.790071741739908, + "rewards/rejected": -9.101251220703125, + "step": 10581 + }, + { + "epoch": 0.9668341708542714, + "grad_norm": 4.84375, + "kl": 4.85382080078125, + "learning_rate": 2.7365480368278796e-08, + "logits/chosen": 1016884224.0, + "logits/rejected": 484092288.0, + "logps/chosen": -356.74550083705356, + "logps/rejected": -682.226806640625, + "loss": 0.0392, + "rewards/chosen": 3.7474283490862166, + "rewards/margins": 12.801248959132604, + "rewards/rejected": -9.053820610046387, + "step": 10582 + }, + { + "epoch": 0.966925536774783, + "grad_norm": 0.8984375, + "kl": 0.0, + "learning_rate": 2.7215463489463912e-08, + "logits/chosen": 465330752.0, + "logits/rejected": 591006400.0, + "logps/chosen": -148.26564025878906, + "logps/rejected": -407.0306701660156, + "loss": 0.0102, + "rewards/chosen": 4.531484603881836, + "rewards/margins": 14.325671195983887, + "rewards/rejected": -9.79418659210205, + "step": 10583 + }, + { + "epoch": 0.9670169026952946, + "grad_norm": 3.109375, + "kl": 0.0, + "learning_rate": 2.7065857813856622e-08, + "logits/chosen": 307634624.0, + "logits/rejected": 547545792.0, + "logps/chosen": -179.4710235595703, + "logps/rejected": -524.96142578125, + "loss": 0.0232, + "rewards/chosen": 3.7039077281951904, + "rewards/margins": 12.307133436203003, + "rewards/rejected": -8.603225708007812, + "step": 10584 + }, + { + "epoch": 0.9671082686158063, + "grad_norm": 6.3125, + "kl": 0.0, + "learning_rate": 2.6916663353828142e-08, + "logits/chosen": 602917845.3333334, + "logits/rejected": 262903264.0, + "logps/chosen": -385.7796223958333, + "logps/rejected": -420.8763122558594, + "loss": 0.0348, + "rewards/chosen": 3.7059408823649087, + "rewards/margins": 12.093969027201334, + "rewards/rejected": -8.388028144836426, + "step": 10585 + }, + { + "epoch": 0.967199634536318, + "grad_norm": 5.0, + "kl": 0.0, + "learning_rate": 2.6767880121714717e-08, + "logits/chosen": 429413504.0, + "logits/rejected": 617573248.0, + "logps/chosen": -336.14202880859375, + "logps/rejected": -266.46124267578125, + "loss": 0.0397, + "rewards/chosen": 3.2047220865885415, + "rewards/margins": 9.542855898539225, + "rewards/rejected": -6.338133811950684, + "step": 10586 + }, + { + "epoch": 0.9672910004568296, + "grad_norm": 2.3125, + "kl": 0.0, + "learning_rate": 2.6619508129819282e-08, + "logits/chosen": 600185130.6666666, + "logits/rejected": 530093363.2, + "logps/chosen": -391.6913655598958, + "logps/rejected": -430.193359375, + "loss": 0.0144, + "rewards/chosen": 3.2385025024414062, + "rewards/margins": 13.357339477539062, + "rewards/rejected": -10.118836975097656, + "step": 10587 + }, + { + "epoch": 0.9673823663773412, + "grad_norm": 2.0625, + "kl": 0.0, + "learning_rate": 2.6471547390410914e-08, + "logits/chosen": 605949866.6666666, + "logits/rejected": 818482944.0, + "logps/chosen": -271.15869140625, + "logps/rejected": -559.59130859375, + "loss": 0.0135, + "rewards/chosen": 3.4035043716430664, + "rewards/margins": 12.311243629455566, + "rewards/rejected": -8.9077392578125, + "step": 10588 + }, + { + "epoch": 0.9674737322978529, + "grad_norm": 1.3203125, + "kl": 0.0, + "learning_rate": 2.6323997915725374e-08, + "logits/chosen": 588568704.0, + "logits/rejected": 798641984.0, + "logps/chosen": -240.43533325195312, + "logps/rejected": -485.3114013671875, + "loss": 0.0091, + "rewards/chosen": 4.108287334442139, + "rewards/margins": 12.466639995574951, + "rewards/rejected": -8.358352661132812, + "step": 10589 + }, + { + "epoch": 0.9675650982183646, + "grad_norm": 1.609375, + "kl": 0.0, + "learning_rate": 2.617685971796291e-08, + "logits/chosen": 548204646.4, + "logits/rejected": 1113301333.3333333, + "logps/chosen": -547.713330078125, + "logps/rejected": -418.284423828125, + "loss": 0.0126, + "rewards/chosen": 4.293038177490234, + "rewards/margins": 11.002566146850587, + "rewards/rejected": -6.709527969360352, + "step": 10590 + }, + { + "epoch": 0.9676564641388762, + "grad_norm": 64.5, + "kl": 0.0, + "learning_rate": 2.6030132809290454e-08, + "logits/chosen": 247497504.0, + "logits/rejected": 358094528.0, + "logps/chosen": -268.83837890625, + "logps/rejected": -365.6777648925781, + "loss": 0.078, + "rewards/chosen": 3.387712240219116, + "rewards/margins": 12.003787755966187, + "rewards/rejected": -8.61607551574707, + "step": 10591 + }, + { + "epoch": 0.9677478300593878, + "grad_norm": 3.40625, + "kl": 0.0, + "learning_rate": 2.5883817201841076e-08, + "logits/chosen": 505619353.6, + "logits/rejected": 482317312.0, + "logps/chosen": -317.6123046875, + "logps/rejected": -370.9231770833333, + "loss": 0.0221, + "rewards/chosen": 3.988982391357422, + "rewards/margins": 13.041295878092448, + "rewards/rejected": -9.052313486735025, + "step": 10592 + }, + { + "epoch": 0.9678391959798995, + "grad_norm": 0.99609375, + "kl": 0.0, + "learning_rate": 2.5737912907714547e-08, + "logits/chosen": 369496128.0, + "logits/rejected": 364390400.0, + "logps/chosen": -318.9683532714844, + "logps/rejected": -494.48193359375, + "loss": 0.0049, + "rewards/chosen": 4.321893215179443, + "rewards/margins": 12.002454280853271, + "rewards/rejected": -7.680561065673828, + "step": 10593 + }, + { + "epoch": 0.9679305619004112, + "grad_norm": 2.984375, + "kl": 0.0, + "learning_rate": 2.5592419938974545e-08, + "logits/chosen": 484470826.6666667, + "logits/rejected": 516897126.4, + "logps/chosen": -369.0111083984375, + "logps/rejected": -427.4513671875, + "loss": 0.0179, + "rewards/chosen": 3.0033305486043296, + "rewards/margins": 11.795111401875815, + "rewards/rejected": -8.791780853271485, + "step": 10594 + }, + { + "epoch": 0.9680219278209228, + "grad_norm": 11.25, + "kl": 0.0, + "learning_rate": 2.544733830765256e-08, + "logits/chosen": 510709312.0, + "logits/rejected": 479444992.0, + "logps/chosen": -245.86094665527344, + "logps/rejected": -528.5142822265625, + "loss": 0.0563, + "rewards/chosen": 3.5922656059265137, + "rewards/margins": 13.358621756235758, + "rewards/rejected": -9.766356150309244, + "step": 10595 + }, + { + "epoch": 0.9681132937414344, + "grad_norm": 2.8125, + "kl": 0.0, + "learning_rate": 2.53026680257451e-08, + "logits/chosen": 529098137.6, + "logits/rejected": 509179562.6666667, + "logps/chosen": -390.4158447265625, + "logps/rejected": -621.3334147135416, + "loss": 0.0179, + "rewards/chosen": 3.7527137756347657, + "rewards/margins": 12.839738464355468, + "rewards/rejected": -9.087024688720703, + "step": 10596 + }, + { + "epoch": 0.9682046596619461, + "grad_norm": 16.625, + "kl": 0.0, + "learning_rate": 2.5158409105215942e-08, + "logits/chosen": 390602048.0, + "logits/rejected": 667751104.0, + "logps/chosen": -316.02130126953125, + "logps/rejected": -461.2359924316406, + "loss": 0.1288, + "rewards/chosen": 3.732048273086548, + "rewards/margins": 11.100374460220337, + "rewards/rejected": -7.368326187133789, + "step": 10597 + }, + { + "epoch": 0.9682960255824578, + "grad_norm": 30.25, + "kl": 0.0, + "learning_rate": 2.5014561557992756e-08, + "logits/chosen": 406661504.0, + "logits/rejected": 394952917.3333333, + "logps/chosen": -287.90167236328125, + "logps/rejected": -432.2491861979167, + "loss": 0.0976, + "rewards/chosen": 4.325587272644043, + "rewards/margins": 12.43478806813558, + "rewards/rejected": -8.109200795491537, + "step": 10598 + }, + { + "epoch": 0.9683873915029694, + "grad_norm": 46.0, + "kl": 0.0, + "learning_rate": 2.4871125395971586e-08, + "logits/chosen": 602892851.2, + "logits/rejected": 363912746.6666667, + "logps/chosen": -360.662158203125, + "logps/rejected": -306.0069173177083, + "loss": 0.074, + "rewards/chosen": 3.6026531219482423, + "rewards/margins": 9.964269892374675, + "rewards/rejected": -6.361616770426433, + "step": 10599 + }, + { + "epoch": 0.968478757423481, + "grad_norm": 3.453125, + "kl": 0.0, + "learning_rate": 2.472810063101294e-08, + "logits/chosen": 667333184.0, + "logits/rejected": 472501056.0, + "logps/chosen": -368.63946533203125, + "logps/rejected": -422.53118896484375, + "loss": 0.0205, + "rewards/chosen": 3.31158709526062, + "rewards/margins": 13.037761449813843, + "rewards/rejected": -9.726174354553223, + "step": 10600 + }, + { + "epoch": 0.9685701233439927, + "grad_norm": 1.171875, + "kl": 0.0, + "learning_rate": 2.4585487274942922e-08, + "logits/chosen": 350723584.0, + "logits/rejected": 778898496.0, + "logps/chosen": -214.46295166015625, + "logps/rejected": -501.2966003417969, + "loss": 0.0117, + "rewards/chosen": 4.563811302185059, + "rewards/margins": 12.711020469665527, + "rewards/rejected": -8.147209167480469, + "step": 10601 + }, + { + "epoch": 0.9686614892645043, + "grad_norm": 3.546875, + "kl": 0.0, + "learning_rate": 2.4443285339555422e-08, + "logits/chosen": 404134144.0, + "logits/rejected": 448114858.6666667, + "logps/chosen": -259.624853515625, + "logps/rejected": -587.4368896484375, + "loss": 0.0229, + "rewards/chosen": 3.513987350463867, + "rewards/margins": 14.831510035196938, + "rewards/rejected": -11.317522684733072, + "step": 10602 + }, + { + "epoch": 0.968752855185016, + "grad_norm": 4.78125, + "kl": 0.0, + "learning_rate": 2.4301494836608264e-08, + "logits/chosen": 774636864.0, + "logits/rejected": 978445056.0, + "logps/chosen": -365.53118896484375, + "logps/rejected": -449.43743896484375, + "loss": 0.026, + "rewards/chosen": 3.549691677093506, + "rewards/margins": 11.904306888580322, + "rewards/rejected": -8.354615211486816, + "step": 10603 + }, + { + "epoch": 0.9688442211055276, + "grad_norm": 2.453125, + "kl": 0.0, + "learning_rate": 2.416011577782651e-08, + "logits/chosen": 444540288.0, + "logits/rejected": 290772480.0, + "logps/chosen": -340.99871826171875, + "logps/rejected": -672.9534912109375, + "loss": 0.0143, + "rewards/chosen": 4.046316623687744, + "rewards/margins": 14.001196384429932, + "rewards/rejected": -9.954879760742188, + "step": 10604 + }, + { + "epoch": 0.9689355870260393, + "grad_norm": 3.03125, + "kl": 0.0, + "learning_rate": 2.4019148174901364e-08, + "logits/chosen": 481178060.8, + "logits/rejected": 389125589.3333333, + "logps/chosen": -404.874072265625, + "logps/rejected": -355.2320963541667, + "loss": 0.0163, + "rewards/chosen": 4.001393127441406, + "rewards/margins": 13.232080586751302, + "rewards/rejected": -9.230687459309896, + "step": 10605 + }, + { + "epoch": 0.969026952946551, + "grad_norm": 1.0390625, + "kl": 0.0, + "learning_rate": 2.3878592039489057e-08, + "logits/chosen": 814430208.0, + "logits/rejected": 673935872.0, + "logps/chosen": -389.4772135416667, + "logps/rejected": -600.425341796875, + "loss": 0.1092, + "rewards/chosen": 4.0725447336832685, + "rewards/margins": 14.452943293253583, + "rewards/rejected": -10.380398559570313, + "step": 10606 + }, + { + "epoch": 0.9691183188670626, + "grad_norm": 4.96875, + "kl": 0.0, + "learning_rate": 2.373844738321196e-08, + "logits/chosen": 340484394.6666667, + "logits/rejected": 394787200.0, + "logps/chosen": -235.17911783854166, + "logps/rejected": -389.979248046875, + "loss": 0.1055, + "rewards/chosen": 4.139395713806152, + "rewards/margins": 11.24278621673584, + "rewards/rejected": -7.103390502929687, + "step": 10607 + }, + { + "epoch": 0.9692096847875742, + "grad_norm": 3.140625, + "kl": 0.0, + "learning_rate": 2.359871421765969e-08, + "logits/chosen": 404592347.4285714, + "logits/rejected": 431877856.0, + "logps/chosen": -307.08726283482144, + "logps/rejected": -528.6660766601562, + "loss": 0.0242, + "rewards/chosen": 4.065164293561663, + "rewards/margins": 16.44342395237514, + "rewards/rejected": -12.378259658813477, + "step": 10608 + }, + { + "epoch": 0.9693010507080859, + "grad_norm": 1.65625, + "kl": 0.0, + "learning_rate": 2.345939255438634e-08, + "logits/chosen": 301076768.0, + "logits/rejected": 403952896.0, + "logps/chosen": -230.9138946533203, + "logps/rejected": -430.6164855957031, + "loss": 0.0124, + "rewards/chosen": 4.473899841308594, + "rewards/margins": 12.636526107788086, + "rewards/rejected": -8.162626266479492, + "step": 10609 + }, + { + "epoch": 0.9693924166285975, + "grad_norm": 19.625, + "kl": 0.0, + "learning_rate": 2.3320482404912137e-08, + "logits/chosen": 429590432.0, + "logits/rejected": 627307328.0, + "logps/chosen": -238.9678497314453, + "logps/rejected": -563.630615234375, + "loss": 0.1135, + "rewards/chosen": 2.587282180786133, + "rewards/margins": 13.012311935424805, + "rewards/rejected": -10.425029754638672, + "step": 10610 + }, + { + "epoch": 0.9694837825491092, + "grad_norm": 0.85546875, + "kl": 0.0, + "learning_rate": 2.318198378072456e-08, + "logits/chosen": 220519936.0, + "logits/rejected": 440122660.5714286, + "logps/chosen": -131.70445251464844, + "logps/rejected": -366.8897181919643, + "loss": 0.0043, + "rewards/chosen": 3.435411214828491, + "rewards/margins": 12.768841913768224, + "rewards/rejected": -9.333430698939733, + "step": 10611 + }, + { + "epoch": 0.9695751484696208, + "grad_norm": 6.46875, + "kl": 0.0, + "learning_rate": 2.3043896693276113e-08, + "logits/chosen": 324593868.8, + "logits/rejected": 403418709.3333333, + "logps/chosen": -257.5794921875, + "logps/rejected": -327.54958089192706, + "loss": 0.1295, + "rewards/chosen": 2.8821666717529295, + "rewards/margins": 10.732154337565104, + "rewards/rejected": -7.849987665812175, + "step": 10612 + }, + { + "epoch": 0.9696665143901325, + "grad_norm": 3.546875, + "kl": 0.0, + "learning_rate": 2.2906221153983775e-08, + "logits/chosen": 1280345344.0, + "logits/rejected": 895955840.0, + "logps/chosen": -349.9644470214844, + "logps/rejected": -920.2411499023438, + "loss": 0.022, + "rewards/chosen": 3.369901418685913, + "rewards/margins": 15.527362585067749, + "rewards/rejected": -12.157461166381836, + "step": 10613 + }, + { + "epoch": 0.9697578803106441, + "grad_norm": 3.015625, + "kl": 0.0, + "learning_rate": 2.2768957174233995e-08, + "logits/chosen": 376615840.0, + "logits/rejected": 419641376.0, + "logps/chosen": -200.64088439941406, + "logps/rejected": -371.90032958984375, + "loss": 0.0181, + "rewards/chosen": 3.7191359996795654, + "rewards/margins": 12.088206052780151, + "rewards/rejected": -8.369070053100586, + "step": 10614 + }, + { + "epoch": 0.9698492462311558, + "grad_norm": 1.203125, + "kl": 0.0, + "learning_rate": 2.2632104765376027e-08, + "logits/chosen": 535679424.0, + "logits/rejected": 688014592.0, + "logps/chosen": -265.5992736816406, + "logps/rejected": -567.957763671875, + "loss": 0.0049, + "rewards/chosen": 5.170942783355713, + "rewards/margins": 14.5325026512146, + "rewards/rejected": -9.361559867858887, + "step": 10615 + }, + { + "epoch": 0.9699406121516674, + "grad_norm": 0.01019287109375, + "kl": 0.0, + "learning_rate": 2.249566393872693e-08, + "logits/rejected": 416225472.0, + "logps/rejected": -573.7496337890625, + "loss": 0.0, + "rewards/rejected": -12.421392440795898, + "step": 10616 + }, + { + "epoch": 0.9700319780721791, + "grad_norm": 3.046875, + "kl": 0.0, + "learning_rate": 2.2359634705568233e-08, + "logits/chosen": 602391360.0, + "logits/rejected": 445082496.0, + "logps/chosen": -341.91290283203125, + "logps/rejected": -460.96832275390625, + "loss": 0.0178, + "rewards/chosen": 3.4774131774902344, + "rewards/margins": 12.954154968261719, + "rewards/rejected": -9.476741790771484, + "step": 10617 + }, + { + "epoch": 0.9701233439926907, + "grad_norm": 2.640625, + "kl": 0.0, + "learning_rate": 2.2224017077149273e-08, + "logits/chosen": 495968480.0, + "logits/rejected": 527134176.0, + "logps/chosen": -423.6741943359375, + "logps/rejected": -490.1221008300781, + "loss": 0.0147, + "rewards/chosen": 4.288758277893066, + "rewards/margins": 14.409071922302246, + "rewards/rejected": -10.12031364440918, + "step": 10618 + }, + { + "epoch": 0.9702147099132024, + "grad_norm": 2.578125, + "kl": 0.0, + "learning_rate": 2.2088811064683856e-08, + "logits/chosen": 341608960.0, + "logits/rejected": 497678016.0, + "logps/chosen": -279.5725911458333, + "logps/rejected": -374.71942138671875, + "loss": 0.0185, + "rewards/chosen": 4.254804611206055, + "rewards/margins": 11.245702743530273, + "rewards/rejected": -6.990898132324219, + "step": 10619 + }, + { + "epoch": 0.970306075833714, + "grad_norm": 2.46875, + "kl": 0.0, + "learning_rate": 2.1954016679352486e-08, + "logits/chosen": 439050035.2, + "logits/rejected": 447599274.6666667, + "logps/chosen": -267.602392578125, + "logps/rejected": -474.2550048828125, + "loss": 0.0232, + "rewards/chosen": 3.409682846069336, + "rewards/margins": 13.768416341145834, + "rewards/rejected": -10.358733495076498, + "step": 10620 + }, + { + "epoch": 0.9703974417542257, + "grad_norm": 1.3671875, + "kl": 0.0, + "learning_rate": 2.1819633932301797e-08, + "logits/chosen": 53673824.0, + "logits/rejected": 336798427.4285714, + "logps/chosen": -499.827880859375, + "logps/rejected": -497.87904575892856, + "loss": 0.0038, + "rewards/chosen": 3.469531297683716, + "rewards/margins": 14.316181557519096, + "rewards/rejected": -10.84665025983538, + "step": 10621 + }, + { + "epoch": 0.9704888076747373, + "grad_norm": 0.9140625, + "kl": 0.0, + "learning_rate": 2.1685662834642908e-08, + "logits/chosen": 238050201.6, + "logits/rejected": 460698794.6666667, + "logps/chosen": -192.92794189453124, + "logps/rejected": -299.89794921875, + "loss": 0.0076, + "rewards/chosen": 4.684587860107422, + "rewards/margins": 14.00353978474935, + "rewards/rejected": -9.318951924641928, + "step": 10622 + }, + { + "epoch": 0.970580173595249, + "grad_norm": 2.21875, + "kl": 0.0, + "learning_rate": 2.1552103397454725e-08, + "logits/chosen": 619472128.0, + "logits/rejected": 586118784.0, + "logps/chosen": -489.2547200520833, + "logps/rejected": -388.5945739746094, + "loss": 0.0137, + "rewards/chosen": 4.313330332438151, + "rewards/margins": 13.038208643595379, + "rewards/rejected": -8.724878311157227, + "step": 10623 + }, + { + "epoch": 0.9706715395157606, + "grad_norm": 3.03125, + "kl": 0.0, + "learning_rate": 2.1418955631781203e-08, + "logits/chosen": 588823296.0, + "logits/rejected": 721638144.0, + "logps/chosen": -339.9222412109375, + "logps/rejected": -731.7608032226562, + "loss": 0.0213, + "rewards/chosen": 3.4970178604125977, + "rewards/margins": 11.610139846801758, + "rewards/rejected": -8.11312198638916, + "step": 10624 + }, + { + "epoch": 0.9707629054362723, + "grad_norm": 5.96875, + "kl": 0.0, + "learning_rate": 2.1286219548632415e-08, + "logits/chosen": 635212672.0, + "logits/rejected": 390299538.28571427, + "logps/chosen": -527.192138671875, + "logps/rejected": -486.68777901785717, + "loss": 0.0206, + "rewards/chosen": 1.6480712890625, + "rewards/margins": 10.881783621651786, + "rewards/rejected": -9.233712332589286, + "step": 10625 + }, + { + "epoch": 0.9708542713567839, + "grad_norm": 2.078125, + "kl": 0.0, + "learning_rate": 2.1153895158984582e-08, + "logits/chosen": 429103872.0, + "logits/rejected": 417083008.0, + "logps/chosen": -261.7594807942708, + "logps/rejected": -463.52978515625, + "loss": 0.0165, + "rewards/chosen": 4.153896331787109, + "rewards/margins": 13.02295970916748, + "rewards/rejected": -8.869063377380371, + "step": 10626 + }, + { + "epoch": 0.9709456372772955, + "grad_norm": 2.859375, + "kl": 0.0, + "learning_rate": 2.1021982473779513e-08, + "logits/chosen": 572089792.0, + "logits/rejected": 418611136.0, + "logps/chosen": -261.77203369140625, + "logps/rejected": -318.58770751953125, + "loss": 0.0197, + "rewards/chosen": 3.774773120880127, + "rewards/margins": 12.551037311553955, + "rewards/rejected": -8.776264190673828, + "step": 10627 + }, + { + "epoch": 0.9710370031978072, + "grad_norm": 1.2109375, + "kl": 0.0, + "learning_rate": 2.089048150392514e-08, + "logits/chosen": 982783744.0, + "logits/rejected": 904614741.3333334, + "logps/chosen": -401.1472473144531, + "logps/rejected": -482.8687744140625, + "loss": 0.0055, + "rewards/chosen": 4.14105224609375, + "rewards/margins": 13.402461369832357, + "rewards/rejected": -9.261409123738607, + "step": 10628 + }, + { + "epoch": 0.9711283691183189, + "grad_norm": 5.40625, + "kl": 0.0, + "learning_rate": 2.075939226029555e-08, + "logits/chosen": 951648170.6666666, + "logits/rejected": 662033203.2, + "logps/chosen": -621.9098307291666, + "logps/rejected": -345.7832275390625, + "loss": 0.0166, + "rewards/chosen": 3.789844830830892, + "rewards/margins": 12.886762555440267, + "rewards/rejected": -9.096917724609375, + "step": 10629 + }, + { + "epoch": 0.9712197350388305, + "grad_norm": 39.25, + "kl": 0.0, + "learning_rate": 2.062871475373096e-08, + "logits/chosen": 289346368.0, + "logits/rejected": 229932000.0, + "logps/chosen": -161.82749938964844, + "logps/rejected": -311.5990905761719, + "loss": 0.104, + "rewards/chosen": 2.916623592376709, + "rewards/margins": 10.249557495117188, + "rewards/rejected": -7.3329339027404785, + "step": 10630 + }, + { + "epoch": 0.9713111009593421, + "grad_norm": 0.283203125, + "kl": 0.0, + "learning_rate": 2.0498448995036056e-08, + "logits/chosen": 636418730.6666666, + "logits/rejected": 556418918.4, + "logps/chosen": -349.8529052734375, + "logps/rejected": -495.466748046875, + "loss": 0.0018, + "rewards/chosen": 5.555385589599609, + "rewards/margins": 14.519257354736329, + "rewards/rejected": -8.96387176513672, + "step": 10631 + }, + { + "epoch": 0.9714024668798538, + "grad_norm": 54.25, + "kl": 0.0, + "learning_rate": 2.0368594994983893e-08, + "logits/chosen": 571766592.0, + "logits/rejected": 608915456.0, + "logps/chosen": -342.28424072265625, + "logps/rejected": -373.61492919921875, + "loss": 0.0529, + "rewards/chosen": 2.307002305984497, + "rewards/margins": 11.89876103401184, + "rewards/rejected": -9.591758728027344, + "step": 10632 + }, + { + "epoch": 0.9714938328003655, + "grad_norm": 0.271484375, + "kl": 0.0, + "learning_rate": 2.023915276431143e-08, + "logits/chosen": 244058080.0, + "logits/rejected": 473000618.6666667, + "logps/chosen": -196.2235870361328, + "logps/rejected": -523.6234537760416, + "loss": 0.0013, + "rewards/chosen": 5.34727668762207, + "rewards/margins": 15.172798156738281, + "rewards/rejected": -9.825521469116211, + "step": 10633 + }, + { + "epoch": 0.9715851987208771, + "grad_norm": 1.046875, + "kl": 0.0, + "learning_rate": 2.0110122313722334e-08, + "logits/chosen": 1092147882.6666667, + "logits/rejected": 454288793.6, + "logps/chosen": -424.81689453125, + "logps/rejected": -360.5064453125, + "loss": 0.0054, + "rewards/chosen": 4.5797913869222, + "rewards/margins": 12.725120671590169, + "rewards/rejected": -8.14532928466797, + "step": 10634 + }, + { + "epoch": 0.9716765646413887, + "grad_norm": 3.234375, + "kl": 0.0, + "learning_rate": 1.9981503653886404e-08, + "logits/chosen": 352853056.0, + "logits/rejected": 406344128.0, + "logps/chosen": -225.03134155273438, + "logps/rejected": -495.2811279296875, + "loss": 0.0203, + "rewards/chosen": 3.8125381469726562, + "rewards/margins": 13.209441184997559, + "rewards/rejected": -9.396903038024902, + "step": 10635 + }, + { + "epoch": 0.9717679305619004, + "grad_norm": 2.515625, + "kl": 0.0, + "learning_rate": 1.985329679543957e-08, + "logits/chosen": 524815296.0, + "logits/rejected": 494132160.0, + "logps/chosen": -405.43939208984375, + "logps/rejected": -616.5394897460938, + "loss": 0.014, + "rewards/chosen": 4.041851997375488, + "rewards/margins": 12.231255531311035, + "rewards/rejected": -8.189403533935547, + "step": 10636 + }, + { + "epoch": 0.9718592964824121, + "grad_norm": 4.5625, + "kl": 0.0, + "learning_rate": 1.97255017489828e-08, + "logits/chosen": 472884940.8, + "logits/rejected": 604448512.0, + "logps/chosen": -397.130078125, + "logps/rejected": -270.00124104817706, + "loss": 0.0272, + "rewards/chosen": 3.2409389495849608, + "rewards/margins": 10.720105616251628, + "rewards/rejected": -7.479166666666667, + "step": 10637 + }, + { + "epoch": 0.9719506624029237, + "grad_norm": 6.53125, + "kl": 0.0, + "learning_rate": 1.9598118525083753e-08, + "logits/chosen": 600843306.6666666, + "logits/rejected": 546559232.0, + "logps/chosen": -307.8158772786458, + "logps/rejected": -548.6495361328125, + "loss": 0.1554, + "rewards/chosen": 2.185612519582113, + "rewards/margins": 12.561469872792562, + "rewards/rejected": -10.37585735321045, + "step": 10638 + }, + { + "epoch": 0.9720420283234353, + "grad_norm": 3.8125, + "kl": 0.0, + "learning_rate": 1.9471147134276226e-08, + "logits/chosen": 576048554.6666666, + "logits/rejected": 204199014.4, + "logps/chosen": -398.0128580729167, + "logps/rejected": -330.889501953125, + "loss": 0.0187, + "rewards/chosen": 3.256061871846517, + "rewards/margins": 12.814174969991049, + "rewards/rejected": -9.558113098144531, + "step": 10639 + }, + { + "epoch": 0.972133394243947, + "grad_norm": 1.3203125, + "kl": 0.0, + "learning_rate": 1.9344587587059037e-08, + "logits/chosen": 514990489.6, + "logits/rejected": 533600256.0, + "logps/chosen": -270.2406005859375, + "logps/rejected": -400.0547281901042, + "loss": 0.0272, + "rewards/chosen": 3.910724639892578, + "rewards/margins": 11.784799575805664, + "rewards/rejected": -7.874074935913086, + "step": 10640 + }, + { + "epoch": 0.9722247601644587, + "grad_norm": 7.09375, + "kl": 0.0, + "learning_rate": 1.9218439893897157e-08, + "logits/chosen": 618024533.3333334, + "logits/rejected": 1758225024.0, + "logps/chosen": -498.9591878255208, + "logps/rejected": -510.83270263671875, + "loss": 0.0232, + "rewards/chosen": 4.247820536295573, + "rewards/margins": 13.004423777262371, + "rewards/rejected": -8.756603240966797, + "step": 10641 + }, + { + "epoch": 0.9723161260849703, + "grad_norm": 2.125, + "kl": 0.0, + "learning_rate": 1.909270406522279e-08, + "logits/chosen": 428395232.0, + "logits/rejected": 484693589.3333333, + "logps/chosen": -275.1864013671875, + "logps/rejected": -532.5513509114584, + "loss": 0.0097, + "rewards/chosen": 3.526303291320801, + "rewards/margins": 12.896879514058432, + "rewards/rejected": -9.37057622273763, + "step": 10642 + }, + { + "epoch": 0.9724074920054819, + "grad_norm": 3.78125, + "kl": 0.0, + "learning_rate": 1.896738011143262e-08, + "logits/chosen": 632206677.3333334, + "logits/rejected": 567186636.8, + "logps/chosen": -377.8003336588542, + "logps/rejected": -582.332421875, + "loss": 0.101, + "rewards/chosen": 4.384572347005208, + "rewards/margins": 12.150569661458334, + "rewards/rejected": -7.765997314453125, + "step": 10643 + }, + { + "epoch": 0.9724988579259936, + "grad_norm": 0.5078125, + "kl": 0.0, + "learning_rate": 1.884246804288947e-08, + "logits/chosen": 957371584.0, + "logits/rejected": 505451702.85714287, + "logps/chosen": -214.80783081054688, + "logps/rejected": -446.19384765625, + "loss": 0.0025, + "rewards/chosen": 4.250280857086182, + "rewards/margins": 12.973631381988525, + "rewards/rejected": -8.723350524902344, + "step": 10644 + }, + { + "epoch": 0.9725902238465053, + "grad_norm": 2.421875, + "kl": 0.0, + "learning_rate": 1.8717967869923416e-08, + "logits/chosen": 811318784.0, + "logits/rejected": 490051296.0, + "logps/chosen": -227.61795043945312, + "logps/rejected": -355.4944763183594, + "loss": 0.0197, + "rewards/chosen": 3.44578218460083, + "rewards/margins": 11.797670841217041, + "rewards/rejected": -8.351888656616211, + "step": 10645 + }, + { + "epoch": 0.9726815897670169, + "grad_norm": 0.33203125, + "kl": 0.0, + "learning_rate": 1.8593879602828434e-08, + "logits/rejected": 361333248.0, + "logps/rejected": -429.59161376953125, + "loss": 0.0009, + "rewards/rejected": -9.178425788879395, + "step": 10646 + }, + { + "epoch": 0.9727729556875285, + "grad_norm": 1.7734375, + "kl": 0.0, + "learning_rate": 1.847020325186577e-08, + "logits/chosen": 723643136.0, + "logits/rejected": 364134400.0, + "logps/chosen": -262.4683532714844, + "logps/rejected": -477.9107666015625, + "loss": 0.0078, + "rewards/chosen": 5.286115646362305, + "rewards/margins": 14.951602935791016, + "rewards/rejected": -9.665487289428711, + "step": 10647 + }, + { + "epoch": 0.9728643216080402, + "grad_norm": 2.109375, + "kl": 0.0, + "learning_rate": 1.834693882726224e-08, + "logits/chosen": 544979712.0, + "logits/rejected": 900580928.0, + "logps/chosen": -197.2678680419922, + "logps/rejected": -487.1224670410156, + "loss": 0.0235, + "rewards/chosen": 3.232581853866577, + "rewards/margins": 13.465768575668335, + "rewards/rejected": -10.233186721801758, + "step": 10648 + }, + { + "epoch": 0.9729556875285519, + "grad_norm": 1.8984375, + "kl": 0.0, + "learning_rate": 1.8224086339211356e-08, + "logits/chosen": 587068544.0, + "logits/rejected": 735287936.0, + "logps/chosen": -303.25579833984375, + "logps/rejected": -402.8236389160156, + "loss": 0.0102, + "rewards/chosen": 4.229612350463867, + "rewards/margins": 12.967750549316406, + "rewards/rejected": -8.738138198852539, + "step": 10649 + }, + { + "epoch": 0.9730470534490635, + "grad_norm": 20.25, + "kl": 0.0, + "learning_rate": 1.8101645797871104e-08, + "logits/chosen": 420741034.6666667, + "logits/rejected": 421503744.0, + "logps/chosen": -219.6939697265625, + "logps/rejected": -545.782958984375, + "loss": 0.1302, + "rewards/chosen": 3.112488110860189, + "rewards/margins": 12.179224332173666, + "rewards/rejected": -9.066736221313477, + "step": 10650 + }, + { + "epoch": 0.9731384193695751, + "grad_norm": 2.453125, + "kl": 0.0, + "learning_rate": 1.7979617213366163e-08, + "logits/chosen": 469365312.0, + "logits/rejected": 621538688.0, + "logps/chosen": -393.59698486328125, + "logps/rejected": -668.4759521484375, + "loss": 0.0107, + "rewards/chosen": 4.119868278503418, + "rewards/margins": 12.823250770568848, + "rewards/rejected": -8.70338249206543, + "step": 10651 + }, + { + "epoch": 0.9732297852900867, + "grad_norm": 3.140625, + "kl": 0.0, + "learning_rate": 1.7858000595787904e-08, + "logits/chosen": 470372915.2, + "logits/rejected": 796723968.0, + "logps/chosen": -380.630126953125, + "logps/rejected": -364.2058919270833, + "loss": 0.0158, + "rewards/chosen": 4.396006011962891, + "rewards/margins": 15.20251439412435, + "rewards/rejected": -10.806508382161459, + "step": 10652 + }, + { + "epoch": 0.9733211512105985, + "grad_norm": 1.65625, + "kl": 0.0, + "learning_rate": 1.7736795955192175e-08, + "logits/chosen": 765878579.2, + "logits/rejected": 1184096768.0, + "logps/chosen": -474.00615234375, + "logps/rejected": -500.9080403645833, + "loss": 0.0109, + "rewards/chosen": 4.243149185180664, + "rewards/margins": 14.81877301534017, + "rewards/rejected": -10.575623830159506, + "step": 10653 + }, + { + "epoch": 0.9734125171311101, + "grad_norm": 7.90625, + "kl": 0.0, + "learning_rate": 1.7616003301601514e-08, + "logits/chosen": 582506496.0, + "logits/rejected": 1137470037.3333333, + "logps/chosen": -277.107275390625, + "logps/rejected": -327.0732421875, + "loss": 0.0521, + "rewards/chosen": 3.4120792388916015, + "rewards/margins": 13.776593907674155, + "rewards/rejected": -10.364514668782553, + "step": 10654 + }, + { + "epoch": 0.9735038830516217, + "grad_norm": 2.4375, + "kl": 0.0, + "learning_rate": 1.749562264500515e-08, + "logits/chosen": 445264810.6666667, + "logits/rejected": 745026560.0, + "logps/chosen": -324.7149658203125, + "logps/rejected": -481.6122741699219, + "loss": 0.0169, + "rewards/chosen": 3.9672263463338218, + "rewards/margins": 11.041396458943685, + "rewards/rejected": -7.074170112609863, + "step": 10655 + }, + { + "epoch": 0.9735952489721333, + "grad_norm": 2.0, + "kl": 0.0, + "learning_rate": 1.7375653995356233e-08, + "logits/chosen": 666976358.4, + "logits/rejected": 324455914.6666667, + "logps/chosen": -237.3861083984375, + "logps/rejected": -465.5785725911458, + "loss": 0.0125, + "rewards/chosen": 4.35571517944336, + "rewards/margins": 12.5421750386556, + "rewards/rejected": -8.18645985921224, + "step": 10656 + }, + { + "epoch": 0.9736866148926451, + "grad_norm": 5.0, + "kl": 0.0, + "learning_rate": 1.7256097362576275e-08, + "logits/chosen": 636516147.2, + "logits/rejected": 566558122.6666666, + "logps/chosen": -299.9955078125, + "logps/rejected": -644.7464192708334, + "loss": 0.0286, + "rewards/chosen": 3.7498146057128907, + "rewards/margins": 15.822115834554037, + "rewards/rejected": -12.072301228841146, + "step": 10657 + }, + { + "epoch": 0.9737779808131567, + "grad_norm": 11.125, + "kl": 0.0, + "learning_rate": 1.7136952756550695e-08, + "logits/chosen": 381594282.6666667, + "logits/rejected": 334061376.0, + "logps/chosen": -428.4365641276042, + "logps/rejected": -398.54766845703125, + "loss": 0.0719, + "rewards/chosen": 5.4662831624348955, + "rewards/margins": 10.851207574208576, + "rewards/rejected": -5.384924411773682, + "step": 10658 + }, + { + "epoch": 0.9738693467336683, + "grad_norm": 2.21875, + "kl": 0.0, + "learning_rate": 1.701822018713162e-08, + "logits/chosen": 367231948.8, + "logits/rejected": 515842602.6666667, + "logps/chosen": -273.422412109375, + "logps/rejected": -472.2773844401042, + "loss": 0.0115, + "rewards/chosen": 4.931365966796875, + "rewards/margins": 13.376182556152344, + "rewards/rejected": -8.444816589355469, + "step": 10659 + }, + { + "epoch": 0.9739607126541799, + "grad_norm": 2.171875, + "kl": 0.0, + "learning_rate": 1.68998996641373e-08, + "logits/chosen": 733823872.0, + "logits/rejected": 705552768.0, + "logps/chosen": -308.68841552734375, + "logps/rejected": -466.61920166015625, + "loss": 0.0083, + "rewards/chosen": 4.592724800109863, + "rewards/margins": 14.118170738220215, + "rewards/rejected": -9.525445938110352, + "step": 10660 + }, + { + "epoch": 0.9740520785746917, + "grad_norm": 2.453125, + "kl": 0.0, + "learning_rate": 1.6781991197352133e-08, + "logits/chosen": 514187712.0, + "logits/rejected": 642583040.0, + "logps/chosen": -348.689208984375, + "logps/rejected": -539.1348876953125, + "loss": 0.0141, + "rewards/chosen": 4.387810707092285, + "rewards/margins": 12.93757152557373, + "rewards/rejected": -8.549760818481445, + "step": 10661 + }, + { + "epoch": 0.9741434444952033, + "grad_norm": 21.5, + "kl": 0.0, + "learning_rate": 1.6664494796526098e-08, + "logits/chosen": 490204569.6, + "logits/rejected": 680210090.6666666, + "logps/chosen": -332.8248046875, + "logps/rejected": -445.7330729166667, + "loss": 0.0338, + "rewards/chosen": 3.9554229736328126, + "rewards/margins": 11.698867416381836, + "rewards/rejected": -7.743444442749023, + "step": 10662 + }, + { + "epoch": 0.9742348104157149, + "grad_norm": 3.3125, + "kl": 0.0, + "learning_rate": 1.6547410471374203e-08, + "logits/chosen": 521191744.0, + "logits/rejected": 613672576.0, + "logps/chosen": -397.2563171386719, + "logps/rejected": -548.265380859375, + "loss": 0.0197, + "rewards/chosen": 3.2857232093811035, + "rewards/margins": 14.871336460113525, + "rewards/rejected": -11.585613250732422, + "step": 10663 + }, + { + "epoch": 0.9743261763362265, + "grad_norm": 1.671875, + "kl": 0.0, + "learning_rate": 1.64307382315787e-08, + "logits/chosen": 482592000.0, + "logits/rejected": 469188147.2, + "logps/chosen": -331.4702555338542, + "logps/rejected": -412.342919921875, + "loss": 0.0099, + "rewards/chosen": 3.9467363357543945, + "rewards/margins": 13.00784854888916, + "rewards/rejected": -9.061112213134766, + "step": 10664 + }, + { + "epoch": 0.9744175422567383, + "grad_norm": 4.34375, + "kl": 0.0, + "learning_rate": 1.6314478086787988e-08, + "logits/chosen": 767470592.0, + "logits/rejected": 1164965068.8, + "logps/chosen": -495.7783610026042, + "logps/rejected": -677.49169921875, + "loss": 0.0172, + "rewards/chosen": 4.063490867614746, + "rewards/margins": 15.19110927581787, + "rewards/rejected": -11.127618408203125, + "step": 10665 + }, + { + "epoch": 0.9745089081772499, + "grad_norm": 2.171875, + "kl": 0.0, + "learning_rate": 1.6198630046614373e-08, + "logits/chosen": 653388245.3333334, + "logits/rejected": 377044019.2, + "logps/chosen": -552.8368326822916, + "logps/rejected": -270.0489501953125, + "loss": 0.0121, + "rewards/chosen": 3.605463663736979, + "rewards/margins": 10.442863718668619, + "rewards/rejected": -6.83740005493164, + "step": 10666 + }, + { + "epoch": 0.9746002740977615, + "grad_norm": 3.4375, + "kl": 0.0, + "learning_rate": 1.6083194120638523e-08, + "logits/chosen": 872399701.3333334, + "logits/rejected": 830380902.4, + "logps/chosen": -357.8071695963542, + "logps/rejected": -538.14091796875, + "loss": 0.0196, + "rewards/chosen": 3.0377209981282554, + "rewards/margins": 13.531795247395834, + "rewards/rejected": -10.494074249267578, + "step": 10667 + }, + { + "epoch": 0.9746916400182731, + "grad_norm": 2.015625, + "kl": 0.0, + "learning_rate": 1.596817031840503e-08, + "logits/chosen": 376230176.0, + "logits/rejected": 431278912.0, + "logps/chosen": -262.2837219238281, + "logps/rejected": -643.1983032226562, + "loss": 0.0148, + "rewards/chosen": 3.638540267944336, + "rewards/margins": 14.535881996154785, + "rewards/rejected": -10.89734172821045, + "step": 10668 + }, + { + "epoch": 0.9747830059387849, + "grad_norm": 1.875, + "kl": 0.0, + "learning_rate": 1.585355864942628e-08, + "logits/chosen": 457045632.0, + "logits/rejected": 365131648.0, + "logps/chosen": -301.4932047526042, + "logps/rejected": -351.9215087890625, + "loss": 0.0133, + "rewards/chosen": 4.336318333943685, + "rewards/margins": 13.820990880330402, + "rewards/rejected": -9.484672546386719, + "step": 10669 + }, + { + "epoch": 0.9748743718592965, + "grad_norm": 2.296875, + "kl": 0.0, + "learning_rate": 1.5739359123178587e-08, + "logits/chosen": 815249792.0, + "logits/rejected": 707436288.0, + "logps/chosen": -396.1858215332031, + "logps/rejected": -252.06549072265625, + "loss": 0.0116, + "rewards/chosen": 4.172316074371338, + "rewards/margins": 13.498158931732178, + "rewards/rejected": -9.32584285736084, + "step": 10670 + }, + { + "epoch": 0.9749657377798081, + "grad_norm": 2.265625, + "kl": 0.0, + "learning_rate": 1.562557174910606e-08, + "logits/chosen": 589068288.0, + "logits/rejected": 583768960.0, + "logps/chosen": -324.8067626953125, + "logps/rejected": -552.6249389648438, + "loss": 0.0135, + "rewards/chosen": 3.950226306915283, + "rewards/margins": 13.365612506866455, + "rewards/rejected": -9.415386199951172, + "step": 10671 + }, + { + "epoch": 0.9750571037003197, + "grad_norm": 1.3125, + "kl": 0.0, + "learning_rate": 1.5512196536617286e-08, + "logits/chosen": 624911104.0, + "logits/rejected": 644675520.0, + "logps/chosen": -234.4912109375, + "logps/rejected": -383.0378723144531, + "loss": 0.0086, + "rewards/chosen": 4.6039299964904785, + "rewards/margins": 13.322767734527588, + "rewards/rejected": -8.71883773803711, + "step": 10672 + }, + { + "epoch": 0.9751484696208315, + "grad_norm": 2.96875, + "kl": 0.0, + "learning_rate": 1.5399233495088093e-08, + "logits/chosen": 464637696.0, + "logits/rejected": 174619552.0, + "logps/chosen": -205.11830357142858, + "logps/rejected": -348.7201843261719, + "loss": 0.0306, + "rewards/chosen": 3.69998414175851, + "rewards/margins": 14.981521197727748, + "rewards/rejected": -11.281537055969238, + "step": 10673 + }, + { + "epoch": 0.9752398355413431, + "grad_norm": 3.25, + "kl": 0.0, + "learning_rate": 1.528668263385824e-08, + "logits/chosen": 529101376.0, + "logits/rejected": 433849792.0, + "logps/chosen": -268.68280029296875, + "logps/rejected": -587.3692016601562, + "loss": 0.017, + "rewards/chosen": 4.0305938720703125, + "rewards/margins": 13.636014938354492, + "rewards/rejected": -9.60542106628418, + "step": 10674 + }, + { + "epoch": 0.9753312014618547, + "grad_norm": 1.65625, + "kl": 0.0, + "learning_rate": 1.5174543962235832e-08, + "logits/chosen": 593844394.6666666, + "logits/rejected": 346274816.0, + "logps/chosen": -309.2781982421875, + "logps/rejected": -323.54521484375, + "loss": 0.0083, + "rewards/chosen": 4.339382807413737, + "rewards/margins": 12.35396105448405, + "rewards/rejected": -8.014578247070313, + "step": 10675 + }, + { + "epoch": 0.9754225673823663, + "grad_norm": 1.875, + "kl": 0.0, + "learning_rate": 1.5062817489492897e-08, + "logits/chosen": 408919296.0, + "logits/rejected": 456213589.3333333, + "logps/chosen": -249.922119140625, + "logps/rejected": -658.061767578125, + "loss": 0.0104, + "rewards/chosen": 4.635231781005859, + "rewards/margins": 14.87818285624186, + "rewards/rejected": -10.242951075236002, + "step": 10676 + }, + { + "epoch": 0.9755139333028781, + "grad_norm": 4.25, + "kl": 0.0, + "learning_rate": 1.4951503224868713e-08, + "logits/chosen": 603505005.7142857, + "logits/rejected": 321873280.0, + "logps/chosen": -257.35689871651783, + "logps/rejected": -609.9335327148438, + "loss": 0.0455, + "rewards/chosen": 3.1403606959751675, + "rewards/margins": 17.99051938738142, + "rewards/rejected": -14.85015869140625, + "step": 10677 + }, + { + "epoch": 0.9756052992233897, + "grad_norm": 2.140625, + "kl": 0.0, + "learning_rate": 1.4840601177567582e-08, + "logits/chosen": 449745510.4, + "logits/rejected": 285550592.0, + "logps/chosen": -258.328466796875, + "logps/rejected": -496.5515950520833, + "loss": 0.0172, + "rewards/chosen": 4.3342548370361325, + "rewards/margins": 12.948671595255533, + "rewards/rejected": -8.6144167582194, + "step": 10678 + }, + { + "epoch": 0.9756966651439013, + "grad_norm": 1.265625, + "kl": 0.0, + "learning_rate": 1.4730111356759947e-08, + "logits/chosen": 588084224.0, + "logits/rejected": 411248128.0, + "logps/chosen": -423.1116129557292, + "logps/rejected": -582.177099609375, + "loss": 0.0059, + "rewards/chosen": 4.71646245320638, + "rewards/margins": 16.537521107991537, + "rewards/rejected": -11.821058654785157, + "step": 10679 + }, + { + "epoch": 0.9757880310644129, + "grad_norm": 5.65625, + "kl": 0.0, + "learning_rate": 1.4620033771582942e-08, + "logits/chosen": 769783808.0, + "logits/rejected": 540352384.0, + "logps/chosen": -446.383056640625, + "logps/rejected": -389.6985168457031, + "loss": 0.013, + "rewards/chosen": 3.740248918533325, + "rewards/margins": 11.57864785194397, + "rewards/rejected": -7.8383989334106445, + "step": 10680 + }, + { + "epoch": 0.9758793969849247, + "grad_norm": 2.640625, + "kl": 0.0, + "learning_rate": 1.4510368431138177e-08, + "logits/chosen": 359340330.6666667, + "logits/rejected": 402037312.0, + "logps/chosen": -264.7037353515625, + "logps/rejected": -577.0242919921875, + "loss": 0.0231, + "rewards/chosen": 3.8393478393554688, + "rewards/margins": 11.770614624023438, + "rewards/rejected": -7.931266784667969, + "step": 10681 + }, + { + "epoch": 0.9759707629054363, + "grad_norm": 2.0, + "kl": 0.0, + "learning_rate": 1.4401115344495064e-08, + "logits/chosen": 514082656.0, + "logits/rejected": 391727360.0, + "logps/chosen": -399.9326477050781, + "logps/rejected": -447.564453125, + "loss": 0.0128, + "rewards/chosen": 4.093703746795654, + "rewards/margins": 12.530642032623291, + "rewards/rejected": -8.436938285827637, + "step": 10682 + }, + { + "epoch": 0.9760621288259479, + "grad_norm": 3.140625, + "kl": 0.0, + "learning_rate": 1.4292274520686377e-08, + "logits/chosen": 588017664.0, + "logits/rejected": 608995328.0, + "logps/chosen": -417.52105712890625, + "logps/rejected": -721.8101806640625, + "loss": 0.0216, + "rewards/chosen": 3.1282734870910645, + "rewards/margins": 12.482578754425049, + "rewards/rejected": -9.354305267333984, + "step": 10683 + }, + { + "epoch": 0.9761534947464595, + "grad_norm": 7.96875, + "kl": 0.0, + "learning_rate": 1.418384596871325e-08, + "logits/chosen": 435243468.8, + "logits/rejected": 416281130.6666667, + "logps/chosen": -243.3220947265625, + "logps/rejected": -723.00048828125, + "loss": 0.0604, + "rewards/chosen": 2.380972480773926, + "rewards/margins": 14.69826119740804, + "rewards/rejected": -12.317288716634115, + "step": 10684 + }, + { + "epoch": 0.9762448606669712, + "grad_norm": 4.15625, + "kl": 0.0, + "learning_rate": 1.4075829697541843e-08, + "logits/chosen": 679081676.8, + "logits/rejected": 419834624.0, + "logps/chosen": -404.21865234375, + "logps/rejected": -734.4041341145834, + "loss": 0.0253, + "rewards/chosen": 4.005948638916015, + "rewards/margins": 15.195144907633463, + "rewards/rejected": -11.189196268717447, + "step": 10685 + }, + { + "epoch": 0.9763362265874829, + "grad_norm": 2.390625, + "kl": 0.0, + "learning_rate": 1.3968225716103346e-08, + "logits/chosen": 369051315.2, + "logits/rejected": 472400682.6666667, + "logps/chosen": -260.9388671875, + "logps/rejected": -536.5592854817709, + "loss": 0.0187, + "rewards/chosen": 3.9771133422851563, + "rewards/margins": 13.130811309814453, + "rewards/rejected": -9.153697967529297, + "step": 10686 + }, + { + "epoch": 0.9764275925079945, + "grad_norm": 36.25, + "kl": 0.0, + "learning_rate": 1.386103403329564e-08, + "logits/chosen": 369310816.0, + "logits/rejected": 270936672.0, + "logps/chosen": -160.27667236328125, + "logps/rejected": -597.0980834960938, + "loss": 0.0422, + "rewards/chosen": 2.7538564205169678, + "rewards/margins": 14.6333749294281, + "rewards/rejected": -11.879518508911133, + "step": 10687 + }, + { + "epoch": 0.9765189584285061, + "grad_norm": 2.6875, + "kl": 0.0, + "learning_rate": 1.3754254657983302e-08, + "logits/chosen": 726097600.0, + "logits/rejected": 1041541952.0, + "logps/chosen": -290.8126220703125, + "logps/rejected": -437.220458984375, + "loss": 0.0161, + "rewards/chosen": 3.6098570823669434, + "rewards/margins": 11.315499782562256, + "rewards/rejected": -7.7056427001953125, + "step": 10688 + }, + { + "epoch": 0.9766103243490178, + "grad_norm": 1.0546875, + "kl": 0.0, + "learning_rate": 1.3647887598995379e-08, + "logits/chosen": 514800384.0, + "logits/rejected": 641688883.2, + "logps/chosen": -362.9277750651042, + "logps/rejected": -545.0904296875, + "loss": 0.0072, + "rewards/chosen": 4.698378245035808, + "rewards/margins": 13.44832026163737, + "rewards/rejected": -8.749942016601562, + "step": 10689 + }, + { + "epoch": 0.9767016902695295, + "grad_norm": 2.390625, + "kl": 0.0, + "learning_rate": 1.3541932865127615e-08, + "logits/chosen": 361027968.0, + "logits/rejected": 331087808.0, + "logps/chosen": -210.9954833984375, + "logps/rejected": -336.62017822265625, + "loss": 0.0161, + "rewards/chosen": 3.6976170539855957, + "rewards/margins": 12.560976505279541, + "rewards/rejected": -8.863359451293945, + "step": 10690 + }, + { + "epoch": 0.9767930561900411, + "grad_norm": 1.890625, + "kl": 0.0, + "learning_rate": 1.3436390465141336e-08, + "logits/chosen": 1014088704.0, + "logits/rejected": 524205696.0, + "logps/chosen": -291.7818196614583, + "logps/rejected": -568.1505126953125, + "loss": 0.0128, + "rewards/chosen": 4.532192548116048, + "rewards/margins": 18.65431531270345, + "rewards/rejected": -14.122122764587402, + "step": 10691 + }, + { + "epoch": 0.9768844221105528, + "grad_norm": 2.140625, + "kl": 0.0, + "learning_rate": 1.3331260407764002e-08, + "logits/chosen": 367240672.0, + "logits/rejected": 595734976.0, + "logps/chosen": -202.26995849609375, + "logps/rejected": -353.767822265625, + "loss": 0.0238, + "rewards/chosen": 3.5774664878845215, + "rewards/margins": 12.965344905853271, + "rewards/rejected": -9.38787841796875, + "step": 10692 + }, + { + "epoch": 0.9769757880310644, + "grad_norm": 2.234375, + "kl": 0.0, + "learning_rate": 1.3226542701689215e-08, + "logits/chosen": 457234773.3333333, + "logits/rejected": 741996902.4, + "logps/chosen": -320.7176106770833, + "logps/rejected": -479.14482421875, + "loss": 0.0106, + "rewards/chosen": 4.252362569173177, + "rewards/margins": 12.744234212239583, + "rewards/rejected": -8.491871643066407, + "step": 10693 + }, + { + "epoch": 0.9770671539515761, + "grad_norm": 2.25, + "kl": 0.0, + "learning_rate": 1.3122237355575606e-08, + "logits/chosen": 587392409.6, + "logits/rejected": 803750400.0, + "logps/chosen": -477.3021484375, + "logps/rejected": -525.5635579427084, + "loss": 0.0151, + "rewards/chosen": 4.050897216796875, + "rewards/margins": 15.448111470540365, + "rewards/rejected": -11.39721425374349, + "step": 10694 + }, + { + "epoch": 0.9771585198720877, + "grad_norm": 2.453125, + "kl": 0.0, + "learning_rate": 1.3018344378048498e-08, + "logits/chosen": 559163456.0, + "logits/rejected": 431112704.0, + "logps/chosen": -151.41481018066406, + "logps/rejected": -460.5199279785156, + "loss": 0.0235, + "rewards/chosen": 3.184962034225464, + "rewards/margins": 11.575485944747925, + "rewards/rejected": -8.390523910522461, + "step": 10695 + }, + { + "epoch": 0.9772498857925994, + "grad_norm": 0.97265625, + "kl": 0.0, + "learning_rate": 1.2914863777698794e-08, + "logits/chosen": 580217216.0, + "logits/rejected": 1206651008.0, + "logps/chosen": -260.2424621582031, + "logps/rejected": -742.3724365234375, + "loss": 0.0076, + "rewards/chosen": 4.400968551635742, + "rewards/margins": 13.738879203796387, + "rewards/rejected": -9.337910652160645, + "step": 10696 + }, + { + "epoch": 0.977341251713111, + "grad_norm": 2.0, + "kl": 0.0, + "learning_rate": 1.281179556308354e-08, + "logits/chosen": 562016051.2, + "logits/rejected": 608138410.6666666, + "logps/chosen": -408.8214599609375, + "logps/rejected": -640.7061360677084, + "loss": 0.0153, + "rewards/chosen": 3.705562210083008, + "rewards/margins": 14.227641677856445, + "rewards/rejected": -10.522079467773438, + "step": 10697 + }, + { + "epoch": 0.9774326176336227, + "grad_norm": 8.75, + "kl": 17.46173095703125, + "learning_rate": 1.2709139742725918e-08, + "logits/chosen": 402363136.0, + "logits/rejected": 781369344.0, + "logps/chosen": -344.17665318080356, + "logps/rejected": -447.24517822265625, + "loss": 0.1068, + "rewards/chosen": 4.286223820277622, + "rewards/margins": 11.498913219996862, + "rewards/rejected": -7.212689399719238, + "step": 10698 + }, + { + "epoch": 0.9775239835541343, + "grad_norm": 3.0, + "kl": 0.0, + "learning_rate": 1.2606896325113582e-08, + "logits/chosen": 1086647091.2, + "logits/rejected": 777318912.0, + "logps/chosen": -358.04248046875, + "logps/rejected": -267.15728759765625, + "loss": 0.022, + "rewards/chosen": 3.6877674102783202, + "rewards/margins": 10.775317255655924, + "rewards/rejected": -7.0875498453776045, + "step": 10699 + }, + { + "epoch": 0.977615349474646, + "grad_norm": 4.9375, + "kl": 0.0, + "learning_rate": 1.2505065318701992e-08, + "logits/chosen": 416963925.3333333, + "logits/rejected": 369276352.0, + "logps/chosen": -234.2607625325521, + "logps/rejected": -689.4378051757812, + "loss": 0.0465, + "rewards/chosen": 2.978640874226888, + "rewards/margins": 14.982854207356771, + "rewards/rejected": -12.004213333129883, + "step": 10700 + }, + { + "epoch": 0.9777067153951576, + "grad_norm": 4.59375, + "kl": 0.0, + "learning_rate": 1.240364673191108e-08, + "logits/chosen": 526258944.0, + "logits/rejected": 1019636288.0, + "logps/chosen": -239.92840576171875, + "logps/rejected": -430.885986328125, + "loss": 0.0309, + "rewards/chosen": 3.2394556999206543, + "rewards/margins": 10.235888957977295, + "rewards/rejected": -6.996433258056641, + "step": 10701 + }, + { + "epoch": 0.9777980813156693, + "grad_norm": 23.75, + "kl": 0.0, + "learning_rate": 1.2302640573128023e-08, + "logits/chosen": 557337600.0, + "logits/rejected": 763067904.0, + "logps/chosen": -207.74267578125, + "logps/rejected": -734.7628580729166, + "loss": 0.037, + "rewards/chosen": 3.1413253784179687, + "rewards/margins": 11.649591827392578, + "rewards/rejected": -8.50826644897461, + "step": 10702 + }, + { + "epoch": 0.9778894472361809, + "grad_norm": 1.734375, + "kl": 0.0, + "learning_rate": 1.2202046850703919e-08, + "logits/chosen": 489716326.4, + "logits/rejected": 636610816.0, + "logps/chosen": -302.54853515625, + "logps/rejected": -525.5124918619791, + "loss": 0.0122, + "rewards/chosen": 4.212508010864258, + "rewards/margins": 12.443565495808919, + "rewards/rejected": -8.231057484944662, + "step": 10703 + }, + { + "epoch": 0.9779808131566926, + "grad_norm": 4.25, + "kl": 0.0, + "learning_rate": 1.2101865572957672e-08, + "logits/chosen": 532490444.8, + "logits/rejected": 326655957.3333333, + "logps/chosen": -376.68525390625, + "logps/rejected": -398.7880859375, + "loss": 0.0255, + "rewards/chosen": 3.5166366577148436, + "rewards/margins": 12.780271657307942, + "rewards/rejected": -9.2636349995931, + "step": 10704 + }, + { + "epoch": 0.9780721790772042, + "grad_norm": 1.2578125, + "kl": 0.0, + "learning_rate": 1.2002096748173763e-08, + "logits/chosen": 543508800.0, + "logits/rejected": 358923104.0, + "logps/chosen": -381.67633056640625, + "logps/rejected": -402.2178955078125, + "loss": 0.0073, + "rewards/chosen": 4.503678321838379, + "rewards/margins": 13.836492538452148, + "rewards/rejected": -9.33281421661377, + "step": 10705 + }, + { + "epoch": 0.9781635449977159, + "grad_norm": 5.3125, + "kl": 0.0, + "learning_rate": 1.1902740384601152e-08, + "logits/chosen": 348637056.0, + "logits/rejected": 336300416.0, + "logps/chosen": -263.3609212239583, + "logps/rejected": -409.75018310546875, + "loss": 0.0324, + "rewards/chosen": 4.006350835164388, + "rewards/margins": 10.784237225850422, + "rewards/rejected": -6.777886390686035, + "step": 10706 + }, + { + "epoch": 0.9782549109182275, + "grad_norm": 2.65625, + "kl": 0.0, + "learning_rate": 1.1803796490456044e-08, + "logits/chosen": 472786944.0, + "logits/rejected": 654087014.4, + "logps/chosen": -214.41861979166666, + "logps/rejected": -491.493603515625, + "loss": 0.0222, + "rewards/chosen": 2.816122055053711, + "rewards/margins": 13.344299697875977, + "rewards/rejected": -10.528177642822266, + "step": 10707 + }, + { + "epoch": 0.9783462768387392, + "grad_norm": 3.046875, + "kl": 0.0, + "learning_rate": 1.170526507392078e-08, + "logits/chosen": 579515136.0, + "logits/rejected": 657329766.4, + "logps/chosen": -335.6680908203125, + "logps/rejected": -547.341162109375, + "loss": 0.0154, + "rewards/chosen": 3.191783587137858, + "rewards/margins": 12.31566359202067, + "rewards/rejected": -9.123880004882812, + "step": 10708 + }, + { + "epoch": 0.9784376427592508, + "grad_norm": 3.640625, + "kl": 0.0, + "learning_rate": 1.1607146143142178e-08, + "logits/chosen": 436215193.6, + "logits/rejected": 536534784.0, + "logps/chosen": -310.571875, + "logps/rejected": -480.3048502604167, + "loss": 0.034, + "rewards/chosen": 2.969693183898926, + "rewards/margins": 12.178311983744303, + "rewards/rejected": -9.208618799845377, + "step": 10709 + }, + { + "epoch": 0.9785290086797624, + "grad_norm": 2.3125, + "kl": 0.0, + "learning_rate": 1.1509439706234305e-08, + "logits/chosen": 592220569.6, + "logits/rejected": 755997525.3333334, + "logps/chosen": -335.3213134765625, + "logps/rejected": -224.8331298828125, + "loss": 0.0164, + "rewards/chosen": 4.008668518066406, + "rewards/margins": 12.06044667561849, + "rewards/rejected": -8.051778157552084, + "step": 10710 + }, + { + "epoch": 0.9786203746002741, + "grad_norm": 3.046875, + "kl": 0.0, + "learning_rate": 1.1412145771276251e-08, + "logits/chosen": 486632277.3333333, + "logits/rejected": 465859968.0, + "logps/chosen": -261.59755452473956, + "logps/rejected": -555.7933959960938, + "loss": 0.0249, + "rewards/chosen": 4.016913414001465, + "rewards/margins": 11.65807819366455, + "rewards/rejected": -7.641164779663086, + "step": 10711 + }, + { + "epoch": 0.9787117405207858, + "grad_norm": 6.0, + "kl": 0.0, + "learning_rate": 1.1315264346313248e-08, + "logits/chosen": 967227596.8, + "logits/rejected": 606638848.0, + "logps/chosen": -322.044384765625, + "logps/rejected": -408.8252360026042, + "loss": 0.1467, + "rewards/chosen": 2.346469688415527, + "rewards/margins": 9.042043495178223, + "rewards/rejected": -6.695573806762695, + "step": 10712 + }, + { + "epoch": 0.9788031064412974, + "grad_norm": 1.3125, + "kl": 0.0, + "learning_rate": 1.1218795439357222e-08, + "logits/chosen": 502618976.0, + "logits/rejected": 302919232.0, + "logps/chosen": -428.92962646484375, + "logps/rejected": -351.0032552083333, + "loss": 0.0043, + "rewards/chosen": 4.557676792144775, + "rewards/margins": 13.287519931793213, + "rewards/rejected": -8.729843139648438, + "step": 10713 + }, + { + "epoch": 0.978894472361809, + "grad_norm": 1.5234375, + "kl": 0.0, + "learning_rate": 1.1122739058384569e-08, + "logits/chosen": 513369184.0, + "logits/rejected": 454286400.0, + "logps/chosen": -311.29339599609375, + "logps/rejected": -430.37493896484375, + "loss": 0.0071, + "rewards/chosen": 4.707214832305908, + "rewards/margins": 14.88266897201538, + "rewards/rejected": -10.175454139709473, + "step": 10714 + }, + { + "epoch": 0.9789858382823207, + "grad_norm": 3.03125, + "kl": 0.0, + "learning_rate": 1.1027095211337825e-08, + "logits/chosen": 686419882.6666666, + "logits/rejected": 259862576.0, + "logps/chosen": -470.334228515625, + "logps/rejected": -427.55462646484375, + "loss": 0.0235, + "rewards/chosen": 3.8500747680664062, + "rewards/margins": 13.514070510864258, + "rewards/rejected": -9.663995742797852, + "step": 10715 + }, + { + "epoch": 0.9790772042028324, + "grad_norm": 1.3671875, + "kl": 0.0, + "learning_rate": 1.0931863906127327e-08, + "logits/chosen": 797077504.0, + "logits/rejected": 833552076.8, + "logps/chosen": -624.595703125, + "logps/rejected": -502.04404296875, + "loss": 0.0064, + "rewards/chosen": 4.521805763244629, + "rewards/margins": 13.092847633361817, + "rewards/rejected": -8.571041870117188, + "step": 10716 + }, + { + "epoch": 0.979168570123344, + "grad_norm": 1.453125, + "kl": 0.0, + "learning_rate": 1.0837045150626224e-08, + "logits/chosen": 473185088.0, + "logits/rejected": 535265120.0, + "logps/chosen": -253.4704132080078, + "logps/rejected": -499.74273681640625, + "loss": 0.0101, + "rewards/chosen": 4.305079460144043, + "rewards/margins": 14.482026100158691, + "rewards/rejected": -10.176946640014648, + "step": 10717 + }, + { + "epoch": 0.9792599360438556, + "grad_norm": 3.84375, + "kl": 0.0, + "learning_rate": 1.0742638952676577e-08, + "logits/chosen": 420551520.0, + "logits/rejected": 342951232.0, + "logps/chosen": -239.9201202392578, + "logps/rejected": -310.0057373046875, + "loss": 0.0139, + "rewards/chosen": 4.805379867553711, + "rewards/margins": 12.266858100891113, + "rewards/rejected": -7.461478233337402, + "step": 10718 + }, + { + "epoch": 0.9793513019643673, + "grad_norm": 2.546875, + "kl": 0.0, + "learning_rate": 1.0648645320083806e-08, + "logits/chosen": 551266304.0, + "logits/rejected": 382550101.3333333, + "logps/chosen": -417.0771484375, + "logps/rejected": -245.2224324544271, + "loss": 0.0123, + "rewards/chosen": 4.226543807983399, + "rewards/margins": 11.91496950785319, + "rewards/rejected": -7.688425699869792, + "step": 10719 + }, + { + "epoch": 0.979442667884879, + "grad_norm": 1.265625, + "kl": 0.0, + "learning_rate": 1.0555064260620584e-08, + "logits/chosen": 834908723.2, + "logits/rejected": 471892608.0, + "logps/chosen": -236.2284912109375, + "logps/rejected": -394.5537109375, + "loss": 0.0106, + "rewards/chosen": 4.289234161376953, + "rewards/margins": 13.145923614501953, + "rewards/rejected": -8.856689453125, + "step": 10720 + }, + { + "epoch": 0.9795340338053906, + "grad_norm": 3.453125, + "kl": 0.0, + "learning_rate": 1.0461895782025166e-08, + "logits/chosen": 931284275.2, + "logits/rejected": 490142122.6666667, + "logps/chosen": -188.93560791015625, + "logps/rejected": -468.0557454427083, + "loss": 0.153, + "rewards/chosen": 1.5274866104125977, + "rewards/margins": 12.066380627950034, + "rewards/rejected": -10.538894017537435, + "step": 10721 + }, + { + "epoch": 0.9796253997259022, + "grad_norm": 0.46484375, + "kl": 0.0, + "learning_rate": 1.03691398920025e-08, + "logits/chosen": 437734272.0, + "logits/rejected": 384599552.0, + "logps/chosen": -221.0318145751953, + "logps/rejected": -385.20584542410717, + "loss": 0.0021, + "rewards/chosen": 4.289271831512451, + "rewards/margins": 13.412186826978411, + "rewards/rejected": -9.12291499546596, + "step": 10722 + }, + { + "epoch": 0.9797167656464139, + "grad_norm": 4.0, + "kl": 0.0, + "learning_rate": 1.027679659822145e-08, + "logits/chosen": 540036138.6666666, + "logits/rejected": 704761344.0, + "logps/chosen": -300.78607177734375, + "logps/rejected": -407.2595703125, + "loss": 0.043, + "rewards/chosen": 3.038366953531901, + "rewards/margins": 11.509183756510415, + "rewards/rejected": -8.470816802978515, + "step": 10723 + }, + { + "epoch": 0.9798081315669256, + "grad_norm": 2.15625, + "kl": 0.0, + "learning_rate": 1.0184865908318686e-08, + "logits/chosen": 427449770.6666667, + "logits/rejected": 407660928.0, + "logps/chosen": -385.1325276692708, + "logps/rejected": -485.09033203125, + "loss": 0.0104, + "rewards/chosen": 4.474759101867676, + "rewards/margins": 13.392398262023926, + "rewards/rejected": -8.91763916015625, + "step": 10724 + }, + { + "epoch": 0.9798994974874372, + "grad_norm": 3.5, + "kl": 0.0, + "learning_rate": 1.0093347829895905e-08, + "logits/chosen": 768905984.0, + "logits/rejected": 521644672.0, + "logps/chosen": -551.658837890625, + "logps/rejected": -456.5712076822917, + "loss": 0.0183, + "rewards/chosen": 3.736873245239258, + "rewards/margins": 13.612203852335611, + "rewards/rejected": -9.875330607096354, + "step": 10725 + }, + { + "epoch": 0.9799908634079488, + "grad_norm": 1.4609375, + "kl": 0.0, + "learning_rate": 1.0002242370520387e-08, + "logits/chosen": 397421440.0, + "logits/rejected": 471115622.4, + "logps/chosen": -284.5641276041667, + "logps/rejected": -538.6759765625, + "loss": 0.0101, + "rewards/chosen": 3.6696554819742837, + "rewards/margins": 14.052991358439128, + "rewards/rejected": -10.383335876464844, + "step": 10726 + }, + { + "epoch": 0.9800822293284605, + "grad_norm": 16.25, + "kl": 0.0, + "learning_rate": 9.91154953772666e-09, + "logits/chosen": 404147456.0, + "logits/rejected": 904263744.0, + "logps/chosen": -315.0833740234375, + "logps/rejected": -748.9434814453125, + "loss": 0.0544, + "rewards/chosen": 4.31962776184082, + "rewards/margins": 16.01670265197754, + "rewards/rejected": -11.697074890136719, + "step": 10727 + }, + { + "epoch": 0.9801735952489722, + "grad_norm": 1.7734375, + "kl": 0.0, + "learning_rate": 9.821269339013173e-09, + "logits/chosen": 1008508416.0, + "logits/rejected": 610335232.0, + "logps/chosen": -361.964306640625, + "logps/rejected": -683.1722005208334, + "loss": 0.0091, + "rewards/chosen": 4.690310287475586, + "rewards/margins": 15.498119481404622, + "rewards/rejected": -10.807809193929037, + "step": 10728 + }, + { + "epoch": 0.9802649611694838, + "grad_norm": 4.5625, + "kl": 0.0, + "learning_rate": 9.731401781845618e-09, + "logits/chosen": 369805491.2, + "logits/rejected": 918523818.6666666, + "logps/chosen": -244.924609375, + "logps/rejected": -418.7568766276042, + "loss": 0.0274, + "rewards/chosen": 3.4567947387695312, + "rewards/margins": 13.092987696329752, + "rewards/rejected": -9.63619295756022, + "step": 10729 + }, + { + "epoch": 0.9803563270899954, + "grad_norm": 6.03125, + "kl": 0.0, + "learning_rate": 9.641946873655273e-09, + "logits/chosen": 363695283.2, + "logits/rejected": 1153468757.3333333, + "logps/chosen": -162.03330078125, + "logps/rejected": -555.2523600260416, + "loss": 0.0545, + "rewards/chosen": 3.090488052368164, + "rewards/margins": 10.77330805460612, + "rewards/rejected": -7.682820002237956, + "step": 10730 + }, + { + "epoch": 0.980447693010507, + "grad_norm": 2.921875, + "kl": 0.0, + "learning_rate": 9.552904621839554e-09, + "logits/chosen": 755912265.1428572, + "logits/rejected": 933410560.0, + "logps/chosen": -226.07845633370536, + "logps/rejected": -501.7411193847656, + "loss": 0.0285, + "rewards/chosen": 3.7535547528948103, + "rewards/margins": 14.088854244777135, + "rewards/rejected": -10.335299491882324, + "step": 10731 + }, + { + "epoch": 0.9805390589310188, + "grad_norm": 2.453125, + "kl": 0.0, + "learning_rate": 9.464275033760906e-09, + "logits/chosen": 390904192.0, + "logits/rejected": 901892608.0, + "logps/chosen": -179.958154296875, + "logps/rejected": -585.6238606770834, + "loss": 0.023, + "rewards/chosen": 3.605040740966797, + "rewards/margins": 13.122069549560546, + "rewards/rejected": -9.51702880859375, + "step": 10732 + }, + { + "epoch": 0.9806304248515304, + "grad_norm": 5.25, + "kl": 0.0, + "learning_rate": 9.376058116748466e-09, + "logits/chosen": 476364458.6666667, + "logits/rejected": 985497856.0, + "logps/chosen": -349.3762613932292, + "logps/rejected": -364.5830078125, + "loss": 0.0339, + "rewards/chosen": 3.875155766805013, + "rewards/margins": 11.409640630086264, + "rewards/rejected": -7.53448486328125, + "step": 10733 + }, + { + "epoch": 0.980721790772042, + "grad_norm": 3.828125, + "kl": 0.0, + "learning_rate": 9.288253878096954e-09, + "logits/chosen": 594171968.0, + "logits/rejected": 549976064.0, + "logps/chosen": -332.0361022949219, + "logps/rejected": -584.7059936523438, + "loss": 0.0225, + "rewards/chosen": 3.6830811500549316, + "rewards/margins": 14.514882564544678, + "rewards/rejected": -10.831801414489746, + "step": 10734 + }, + { + "epoch": 0.9808131566925536, + "grad_norm": 2.125, + "kl": 0.0, + "learning_rate": 9.200862325066673e-09, + "logits/chosen": 405902250.6666667, + "logits/rejected": 428384563.2, + "logps/chosen": -367.8133138020833, + "logps/rejected": -604.16240234375, + "loss": 0.0134, + "rewards/chosen": 3.3449230194091797, + "rewards/margins": 14.50352439880371, + "rewards/rejected": -11.158601379394531, + "step": 10735 + }, + { + "epoch": 0.9809045226130654, + "grad_norm": 0.01531982421875, + "kl": 0.0, + "learning_rate": 9.113883464884066e-09, + "logits/rejected": 521604608.0, + "logps/rejected": -559.5018310546875, + "loss": 0.0001, + "rewards/rejected": -10.775012969970703, + "step": 10736 + }, + { + "epoch": 0.980995888533577, + "grad_norm": 2.703125, + "kl": 0.0, + "learning_rate": 9.027317304742267e-09, + "logits/chosen": 403072972.8, + "logits/rejected": 400159829.3333333, + "logps/chosen": -354.3354248046875, + "logps/rejected": -454.9934895833333, + "loss": 0.0152, + "rewards/chosen": 4.419598388671875, + "rewards/margins": 12.859276580810548, + "rewards/rejected": -8.439678192138672, + "step": 10737 + }, + { + "epoch": 0.9810872544540886, + "grad_norm": 3.5625, + "kl": 0.0, + "learning_rate": 8.94116385179833e-09, + "logits/chosen": 512802144.0, + "logits/rejected": 366476608.0, + "logps/chosen": -374.5394287109375, + "logps/rejected": -410.47882080078125, + "loss": 0.0227, + "rewards/chosen": 3.5009827613830566, + "rewards/margins": 13.013012409210205, + "rewards/rejected": -9.512029647827148, + "step": 10738 + }, + { + "epoch": 0.9811786203746002, + "grad_norm": 1.125, + "kl": 0.0, + "learning_rate": 8.855423113177664e-09, + "logits/chosen": 817565952.0, + "logits/rejected": 329493138.28571427, + "logps/chosen": -280.47314453125, + "logps/rejected": -375.01708984375, + "loss": 0.0043, + "rewards/chosen": 3.3932220935821533, + "rewards/margins": 12.718952553612846, + "rewards/rejected": -9.325730460030693, + "step": 10739 + }, + { + "epoch": 0.981269986295112, + "grad_norm": 8.25, + "kl": 0.0, + "learning_rate": 8.770095095969045e-09, + "logits/chosen": 585909973.3333334, + "logits/rejected": 490698598.4, + "logps/chosen": -329.28444417317706, + "logps/rejected": -403.2019287109375, + "loss": 0.0132, + "rewards/chosen": 4.085283915201823, + "rewards/margins": 12.694686381022137, + "rewards/rejected": -8.609402465820313, + "step": 10740 + }, + { + "epoch": 0.9813613522156236, + "grad_norm": 4.3125, + "kl": 0.0, + "learning_rate": 8.68517980722905e-09, + "logits/chosen": 440874666.6666667, + "logits/rejected": 358856448.0, + "logps/chosen": -272.18223063151044, + "logps/rejected": -722.1693115234375, + "loss": 0.0367, + "rewards/chosen": 3.2190732955932617, + "rewards/margins": 14.972244262695312, + "rewards/rejected": -11.75317096710205, + "step": 10741 + }, + { + "epoch": 0.9814527181361352, + "grad_norm": 4.0, + "kl": 0.0, + "learning_rate": 8.600677253978728e-09, + "logits/chosen": 356007808.0, + "logits/rejected": 471521962.6666667, + "logps/chosen": -314.00546875, + "logps/rejected": -458.7611897786458, + "loss": 0.1488, + "rewards/chosen": 1.6828241348266602, + "rewards/margins": 11.606558163960775, + "rewards/rejected": -9.923734029134115, + "step": 10742 + }, + { + "epoch": 0.9815440840566468, + "grad_norm": 4.125, + "kl": 0.0, + "learning_rate": 8.516587443206382e-09, + "logits/chosen": 700101632.0, + "logits/rejected": 678710336.0, + "logps/chosen": -422.1480712890625, + "logps/rejected": -500.0377502441406, + "loss": 0.0257, + "rewards/chosen": 3.742966651916504, + "rewards/margins": 10.29594373703003, + "rewards/rejected": -6.552977085113525, + "step": 10743 + }, + { + "epoch": 0.9816354499771586, + "grad_norm": 3.140625, + "kl": 0.0, + "learning_rate": 8.432910381865334e-09, + "logits/chosen": 478124501.3333333, + "logits/rejected": 446487552.0, + "logps/chosen": -316.4834391276042, + "logps/rejected": -469.700048828125, + "loss": 0.0158, + "rewards/chosen": 3.677881876627604, + "rewards/margins": 12.624017588297525, + "rewards/rejected": -8.946135711669921, + "step": 10744 + }, + { + "epoch": 0.9817268158976702, + "grad_norm": 4.25, + "kl": 0.0, + "learning_rate": 8.349646076873941e-09, + "logits/chosen": 611215411.2, + "logits/rejected": 447640106.6666667, + "logps/chosen": -326.56318359375, + "logps/rejected": -558.881591796875, + "loss": 0.0431, + "rewards/chosen": 3.0327762603759765, + "rewards/margins": 12.984075291951498, + "rewards/rejected": -9.951299031575521, + "step": 10745 + }, + { + "epoch": 0.9818181818181818, + "grad_norm": 3.5, + "kl": 0.0, + "learning_rate": 8.266794535118915e-09, + "logits/chosen": 559550310.4, + "logits/rejected": 415194197.3333333, + "logps/chosen": -316.4427001953125, + "logps/rejected": -630.4871419270834, + "loss": 0.0233, + "rewards/chosen": 3.693803405761719, + "rewards/margins": 15.901447550455728, + "rewards/rejected": -12.20764414469401, + "step": 10746 + }, + { + "epoch": 0.9819095477386934, + "grad_norm": 2.515625, + "kl": 0.0, + "learning_rate": 8.184355763450336e-09, + "logits/chosen": 587199424.0, + "logits/rejected": 322420288.0, + "logps/chosen": -479.63250732421875, + "logps/rejected": -490.8697509765625, + "loss": 0.0166, + "rewards/chosen": 3.5992279052734375, + "rewards/margins": 14.071654319763184, + "rewards/rejected": -10.472426414489746, + "step": 10747 + }, + { + "epoch": 0.9820009136592052, + "grad_norm": 77.5, + "kl": 0.0, + "learning_rate": 8.102329768685524e-09, + "logits/chosen": 698774118.4, + "logits/rejected": 715235072.0, + "logps/chosen": -263.19658203125, + "logps/rejected": -374.3976643880208, + "loss": 0.0548, + "rewards/chosen": 3.237223434448242, + "rewards/margins": 10.424349594116212, + "rewards/rejected": -7.187126159667969, + "step": 10748 + }, + { + "epoch": 0.9820922795797168, + "grad_norm": 1.9765625, + "kl": 0.0, + "learning_rate": 8.020716557606279e-09, + "logits/chosen": 461642598.4, + "logits/rejected": 485607125.3333333, + "logps/chosen": -267.4152587890625, + "logps/rejected": -474.1037190755208, + "loss": 0.0108, + "rewards/chosen": 4.601498413085937, + "rewards/margins": 13.935515848795571, + "rewards/rejected": -9.334017435709635, + "step": 10749 + }, + { + "epoch": 0.9821836455002284, + "grad_norm": 1.359375, + "kl": 0.0, + "learning_rate": 7.939516136962754e-09, + "logits/chosen": 745622954.6666666, + "logits/rejected": 547612262.4, + "logps/chosen": -350.3818359375, + "logps/rejected": -390.7807373046875, + "loss": 0.0057, + "rewards/chosen": 4.584433237711589, + "rewards/margins": 13.120519510904948, + "rewards/rejected": -8.536086273193359, + "step": 10750 + }, + { + "epoch": 0.98227501142074, + "grad_norm": 1.515625, + "kl": 0.0, + "learning_rate": 7.858728513469027e-09, + "logits/chosen": 1395538944.0, + "logits/rejected": 745731328.0, + "logps/chosen": -560.5675048828125, + "logps/rejected": -581.9759114583334, + "loss": 0.0063, + "rewards/chosen": 3.811418294906616, + "rewards/margins": 12.235862493515015, + "rewards/rejected": -8.424444198608398, + "step": 10751 + }, + { + "epoch": 0.9823663773412518, + "grad_norm": 3.15625, + "kl": 0.0, + "learning_rate": 7.778353693804752e-09, + "logits/chosen": 553955776.0, + "logits/rejected": 497082496.0, + "logps/chosen": -435.1543884277344, + "logps/rejected": -346.728271484375, + "loss": 0.0184, + "rewards/chosen": 3.9008781909942627, + "rewards/margins": 11.469431638717651, + "rewards/rejected": -7.568553447723389, + "step": 10752 + }, + { + "epoch": 0.9824577432617634, + "grad_norm": 2.5625, + "kl": 0.0, + "learning_rate": 7.69839168461628e-09, + "logits/chosen": 423426201.6, + "logits/rejected": 290724864.0, + "logps/chosen": -422.6859375, + "logps/rejected": -282.72796630859375, + "loss": 0.0108, + "rewards/chosen": 4.430970764160156, + "rewards/margins": 12.357881164550781, + "rewards/rejected": -7.926910400390625, + "step": 10753 + }, + { + "epoch": 0.982549109182275, + "grad_norm": 1.890625, + "kl": 0.0, + "learning_rate": 7.618842492516654e-09, + "logits/chosen": 384115916.8, + "logits/rejected": 614845738.6666666, + "logps/chosen": -272.38173828125, + "logps/rejected": -221.87548828125, + "loss": 0.0074, + "rewards/chosen": 5.199208450317383, + "rewards/margins": 11.583604049682616, + "rewards/rejected": -6.384395599365234, + "step": 10754 + }, + { + "epoch": 0.9826404751027866, + "grad_norm": 2.109375, + "kl": 0.0, + "learning_rate": 7.53970612408339e-09, + "logits/chosen": 1377941674.6666667, + "logits/rejected": 997060812.8, + "logps/chosen": -328.5625406901042, + "logps/rejected": -533.985693359375, + "loss": 0.0129, + "rewards/chosen": 3.800490379333496, + "rewards/margins": 13.171742820739746, + "rewards/rejected": -9.37125244140625, + "step": 10755 + }, + { + "epoch": 0.9827318410232984, + "grad_norm": 15.25, + "kl": 0.0, + "learning_rate": 7.460982585860144e-09, + "logits/chosen": 601777664.0, + "logits/rejected": 484024608.0, + "logps/chosen": -325.6581115722656, + "logps/rejected": -548.6939697265625, + "loss": 0.0171, + "rewards/chosen": 3.9342331886291504, + "rewards/margins": 12.878939151763916, + "rewards/rejected": -8.944705963134766, + "step": 10756 + }, + { + "epoch": 0.98282320694381, + "grad_norm": 3.109375, + "kl": 0.0, + "learning_rate": 7.382671884356707e-09, + "logits/chosen": 923035306.6666666, + "logits/rejected": 435437875.2, + "logps/chosen": -484.4687906901042, + "logps/rejected": -378.3480224609375, + "loss": 0.0161, + "rewards/chosen": 3.1855236689249673, + "rewards/margins": 12.520115343729655, + "rewards/rejected": -9.334591674804688, + "step": 10757 + }, + { + "epoch": 0.9829145728643216, + "grad_norm": 0.97265625, + "kl": 0.0, + "learning_rate": 7.304774026048456e-09, + "logits/chosen": 412036032.0, + "logits/rejected": 479074005.3333333, + "logps/chosen": -304.729248046875, + "logps/rejected": -475.0806477864583, + "loss": 0.0051, + "rewards/chosen": 3.9479050636291504, + "rewards/margins": 12.279825687408447, + "rewards/rejected": -8.331920623779297, + "step": 10758 + }, + { + "epoch": 0.9830059387848332, + "grad_norm": 2.484375, + "kl": 0.0, + "learning_rate": 7.227289017377459e-09, + "logits/chosen": 779008256.0, + "logits/rejected": 568602816.0, + "logps/chosen": -422.377685546875, + "logps/rejected": -441.1573486328125, + "loss": 0.0157, + "rewards/chosen": 3.532287836074829, + "rewards/margins": 13.93134331703186, + "rewards/rejected": -10.399055480957031, + "step": 10759 + }, + { + "epoch": 0.983097304705345, + "grad_norm": 5.0625, + "kl": 0.0, + "learning_rate": 7.150216864750259e-09, + "logits/chosen": 407928320.0, + "logits/rejected": 465231392.0, + "logps/chosen": -307.366455078125, + "logps/rejected": -488.4344177246094, + "loss": 0.0194, + "rewards/chosen": 4.207705974578857, + "rewards/margins": 11.578115940093994, + "rewards/rejected": -7.370409965515137, + "step": 10760 + }, + { + "epoch": 0.9831886706258566, + "grad_norm": 2.3125, + "kl": 0.0, + "learning_rate": 7.073557574540646e-09, + "logits/chosen": 817019136.0, + "logits/rejected": 624176469.3333334, + "logps/chosen": -764.6417236328125, + "logps/rejected": -709.1210123697916, + "loss": 0.0088, + "rewards/chosen": 3.529776096343994, + "rewards/margins": 13.376884937286377, + "rewards/rejected": -9.847108840942383, + "step": 10761 + }, + { + "epoch": 0.9832800365463682, + "grad_norm": 3.34375, + "kl": 0.0, + "learning_rate": 6.997311153086883e-09, + "logits/chosen": 549431808.0, + "logits/rejected": 570625472.0, + "logps/chosen": -172.3411407470703, + "logps/rejected": -888.0643310546875, + "loss": 0.0254, + "rewards/chosen": 3.2120144367218018, + "rewards/margins": 15.216704607009888, + "rewards/rejected": -12.004690170288086, + "step": 10762 + }, + { + "epoch": 0.9833714024668798, + "grad_norm": 4.375, + "kl": 0.0, + "learning_rate": 6.921477606695037e-09, + "logits/chosen": 724117248.0, + "logits/rejected": 930160768.0, + "logps/chosen": -328.6396891276042, + "logps/rejected": -371.72747802734375, + "loss": 0.0196, + "rewards/chosen": 4.347069422403972, + "rewards/margins": 14.18019739786784, + "rewards/rejected": -9.833127975463867, + "step": 10763 + }, + { + "epoch": 0.9834627683873916, + "grad_norm": 3.03125, + "kl": 0.0, + "learning_rate": 6.846056941634538e-09, + "logits/chosen": 710255680.0, + "logits/rejected": 448516480.0, + "logps/chosen": -399.7076416015625, + "logps/rejected": -431.39434814453125, + "loss": 0.0227, + "rewards/chosen": 3.116135597229004, + "rewards/margins": 12.020659446716309, + "rewards/rejected": -8.904523849487305, + "step": 10764 + }, + { + "epoch": 0.9835541343079032, + "grad_norm": 3.09375, + "kl": 0.0, + "learning_rate": 6.771049164142618e-09, + "logits/chosen": 532683946.6666667, + "logits/rejected": 705880422.4, + "logps/chosen": -263.7336832682292, + "logps/rejected": -568.114990234375, + "loss": 0.0132, + "rewards/chosen": 3.8326950073242188, + "rewards/margins": 16.235716247558592, + "rewards/rejected": -12.403021240234375, + "step": 10765 + }, + { + "epoch": 0.9836455002284148, + "grad_norm": 2.671875, + "kl": 0.0, + "learning_rate": 6.696454280421538e-09, + "logits/chosen": 1356737706.6666667, + "logits/rejected": 455424819.2, + "logps/chosen": -388.5149739583333, + "logps/rejected": -446.094775390625, + "loss": 0.0164, + "rewards/chosen": 3.1410694122314453, + "rewards/margins": 13.811679458618164, + "rewards/rejected": -10.670610046386718, + "step": 10766 + }, + { + "epoch": 0.9837368661489264, + "grad_norm": 2.03125, + "kl": 0.0, + "learning_rate": 6.622272296639698e-09, + "logits/chosen": 796890709.3333334, + "logits/rejected": 362331340.8, + "logps/chosen": -536.260498046875, + "logps/rejected": -585.92275390625, + "loss": 0.0084, + "rewards/chosen": 4.454978624979655, + "rewards/margins": 13.794735399882, + "rewards/rejected": -9.339756774902344, + "step": 10767 + }, + { + "epoch": 0.9838282320694381, + "grad_norm": 0.2294921875, + "kl": 0.0, + "learning_rate": 6.548503218931635e-09, + "logits/rejected": 430153408.0, + "logps/rejected": -390.23687744140625, + "loss": 0.0011, + "rewards/rejected": -9.4359130859375, + "step": 10768 + }, + { + "epoch": 0.9839195979899498, + "grad_norm": 4.5, + "kl": 0.0, + "learning_rate": 6.475147053396913e-09, + "logits/chosen": 340930483.2, + "logits/rejected": 363266304.0, + "logps/chosen": -359.152392578125, + "logps/rejected": -481.7122395833333, + "loss": 0.0329, + "rewards/chosen": 2.9478654861450195, + "rewards/margins": 12.690681139628092, + "rewards/rejected": -9.742815653483072, + "step": 10769 + }, + { + "epoch": 0.9840109639104614, + "grad_norm": 2.5, + "kl": 0.0, + "learning_rate": 6.402203806101237e-09, + "logits/chosen": 639119936.0, + "logits/rejected": 630762569.1428572, + "logps/chosen": -416.5889892578125, + "logps/rejected": -580.7691824776786, + "loss": 0.0084, + "rewards/chosen": 2.707479953765869, + "rewards/margins": 11.134904793330602, + "rewards/rejected": -8.427424839564733, + "step": 10770 + }, + { + "epoch": 0.984102329830973, + "grad_norm": 2.359375, + "kl": 0.0, + "learning_rate": 6.329673483076448e-09, + "logits/chosen": 548474624.0, + "logits/rejected": 596304256.0, + "logps/chosen": -429.1903076171875, + "logps/rejected": -785.6456909179688, + "loss": 0.0116, + "rewards/chosen": 4.006765842437744, + "rewards/margins": 13.046172618865967, + "rewards/rejected": -9.039406776428223, + "step": 10771 + }, + { + "epoch": 0.9841936957514847, + "grad_norm": 2.015625, + "kl": 0.0, + "learning_rate": 6.2575560903205265e-09, + "logits/chosen": 602092800.0, + "logits/rejected": 687937843.2, + "logps/chosen": -449.2965087890625, + "logps/rejected": -294.2270263671875, + "loss": 0.0086, + "rewards/chosen": 6.65231450398763, + "rewards/margins": 13.434512074788412, + "rewards/rejected": -6.782197570800781, + "step": 10772 + }, + { + "epoch": 0.9842850616719964, + "grad_norm": 3.71875, + "kl": 0.0, + "learning_rate": 6.185851633797035e-09, + "logits/chosen": 611351104.0, + "logits/rejected": 488987168.0, + "logps/chosen": -370.92938232421875, + "logps/rejected": -391.30804443359375, + "loss": 0.0181, + "rewards/chosen": 3.6516835689544678, + "rewards/margins": 11.844646215438843, + "rewards/rejected": -8.192962646484375, + "step": 10773 + }, + { + "epoch": 0.984376427592508, + "grad_norm": 38.5, + "kl": 0.0, + "learning_rate": 6.11456011943401e-09, + "logits/chosen": 500889386.6666667, + "logits/rejected": 434529228.8, + "logps/chosen": -386.8041585286458, + "logps/rejected": -625.3841796875, + "loss": 0.0964, + "rewards/chosen": 2.989525477091471, + "rewards/margins": 13.884304682413736, + "rewards/rejected": -10.894779205322266, + "step": 10774 + }, + { + "epoch": 0.9844677935130196, + "grad_norm": 5.53125, + "kl": 0.0, + "learning_rate": 6.0436815531284e-09, + "logits/chosen": 435881770.6666667, + "logits/rejected": 460890009.6, + "logps/chosen": -366.6250813802083, + "logps/rejected": -541.541552734375, + "loss": 0.0165, + "rewards/chosen": 3.277219772338867, + "rewards/margins": 13.101987838745117, + "rewards/rejected": -9.82476806640625, + "step": 10775 + }, + { + "epoch": 0.9845591594335313, + "grad_norm": 2.4375, + "kl": 0.0, + "learning_rate": 5.973215940739963e-09, + "logits/chosen": 464460544.0, + "logits/rejected": 242202512.0, + "logps/chosen": -278.55670166015625, + "logps/rejected": -334.8812255859375, + "loss": 0.0197, + "rewards/chosen": 3.300919771194458, + "rewards/margins": 12.241210222244263, + "rewards/rejected": -8.940290451049805, + "step": 10776 + }, + { + "epoch": 0.984650525354043, + "grad_norm": 1.953125, + "kl": 0.0, + "learning_rate": 5.903163288095704e-09, + "logits/chosen": 645901056.0, + "logits/rejected": 372623701.3333333, + "logps/chosen": -262.4646484375, + "logps/rejected": -565.4603678385416, + "loss": 0.0206, + "rewards/chosen": 3.5701416015625, + "rewards/margins": 18.039752451578774, + "rewards/rejected": -14.469610850016275, + "step": 10777 + }, + { + "epoch": 0.9847418912745546, + "grad_norm": 3.1875, + "kl": 0.0, + "learning_rate": 5.833523600988766e-09, + "logits/chosen": 419438592.0, + "logits/rejected": 517170176.0, + "logps/chosen": -263.32822672526044, + "logps/rejected": -437.8145446777344, + "loss": 0.0244, + "rewards/chosen": 3.935460408528646, + "rewards/margins": 12.630243619283041, + "rewards/rejected": -8.694783210754395, + "step": 10778 + }, + { + "epoch": 0.9848332571950662, + "grad_norm": 10.125, + "kl": 0.0, + "learning_rate": 5.764296885176768e-09, + "logits/chosen": 715150400.0, + "logits/rejected": 741745152.0, + "logps/chosen": -254.13035583496094, + "logps/rejected": -552.3876953125, + "loss": 0.01, + "rewards/chosen": 5.361259460449219, + "rewards/margins": 13.683427810668945, + "rewards/rejected": -8.322168350219727, + "step": 10779 + }, + { + "epoch": 0.9849246231155779, + "grad_norm": 3.15625, + "kl": 0.0, + "learning_rate": 5.695483146385683e-09, + "logits/chosen": 495597363.2, + "logits/rejected": 419637973.3333333, + "logps/chosen": -310.7904296875, + "logps/rejected": -540.1087239583334, + "loss": 0.0277, + "rewards/chosen": 3.403448486328125, + "rewards/margins": 14.495516204833985, + "rewards/rejected": -11.09206771850586, + "step": 10780 + }, + { + "epoch": 0.9850159890360896, + "grad_norm": 2.0, + "kl": 0.0, + "learning_rate": 5.627082390304295e-09, + "logits/chosen": 908472000.0, + "logits/rejected": 425704448.0, + "logps/chosen": -342.32940673828125, + "logps/rejected": -360.3221740722656, + "loss": 0.0115, + "rewards/chosen": 4.058490753173828, + "rewards/margins": 11.460312366485596, + "rewards/rejected": -7.401821613311768, + "step": 10781 + }, + { + "epoch": 0.9851073549566012, + "grad_norm": 1.625, + "kl": 0.0, + "learning_rate": 5.559094622589745e-09, + "logits/chosen": 568731477.3333334, + "logits/rejected": 736340633.6, + "logps/chosen": -351.7950439453125, + "logps/rejected": -418.415673828125, + "loss": 0.0085, + "rewards/chosen": 4.174853642781575, + "rewards/margins": 12.155242284138996, + "rewards/rejected": -7.980388641357422, + "step": 10782 + }, + { + "epoch": 0.9851987208771128, + "grad_norm": 1.8046875, + "kl": 0.0, + "learning_rate": 5.491519848863092e-09, + "logits/chosen": 624743253.3333334, + "logits/rejected": 471251763.2, + "logps/chosen": -399.6768391927083, + "logps/rejected": -385.329150390625, + "loss": 0.01, + "rewards/chosen": 3.835757573445638, + "rewards/margins": 12.552538426717122, + "rewards/rejected": -8.716780853271484, + "step": 10783 + }, + { + "epoch": 0.9852900867976245, + "grad_norm": 49.25, + "kl": 0.0, + "learning_rate": 5.424358074713198e-09, + "logits/chosen": 660541440.0, + "logits/rejected": 1129662720.0, + "logps/chosen": -409.06671142578125, + "logps/rejected": -322.08538818359375, + "loss": 0.0539, + "rewards/chosen": 3.8850882053375244, + "rewards/margins": 11.962708711624146, + "rewards/rejected": -8.077620506286621, + "step": 10784 + }, + { + "epoch": 0.9853814527181362, + "grad_norm": 1.3359375, + "kl": 0.0, + "learning_rate": 5.357609305692291e-09, + "logits/chosen": 472241472.0, + "logits/rejected": 524887360.0, + "logps/chosen": -234.33551025390625, + "logps/rejected": -374.70770263671875, + "loss": 0.0132, + "rewards/chosen": 4.035394668579102, + "rewards/margins": 12.079008102416992, + "rewards/rejected": -8.04361343383789, + "step": 10785 + }, + { + "epoch": 0.9854728186386478, + "grad_norm": 1.890625, + "kl": 0.0, + "learning_rate": 5.291273547321507e-09, + "logits/chosen": 496703146.6666667, + "logits/rejected": 473614438.4, + "logps/chosen": -312.8728434244792, + "logps/rejected": -461.060302734375, + "loss": 0.0094, + "rewards/chosen": 3.8760808308919272, + "rewards/margins": 13.214932759602865, + "rewards/rejected": -9.338851928710938, + "step": 10786 + }, + { + "epoch": 0.9855641845591594, + "grad_norm": 4.71875, + "kl": 0.0, + "learning_rate": 5.225350805084795e-09, + "logits/chosen": 583152384.0, + "logits/rejected": 610577024.0, + "logps/chosen": -385.4041442871094, + "logps/rejected": -480.7747497558594, + "loss": 0.0337, + "rewards/chosen": 3.395267963409424, + "rewards/margins": 12.52096700668335, + "rewards/rejected": -9.125699043273926, + "step": 10787 + }, + { + "epoch": 0.9856555504796711, + "grad_norm": 2.078125, + "kl": 0.0, + "learning_rate": 5.159841084434458e-09, + "logits/chosen": 508233984.0, + "logits/rejected": 336559744.0, + "logps/chosen": -356.3602701822917, + "logps/rejected": -478.71356201171875, + "loss": 0.014, + "rewards/chosen": 4.530545870463054, + "rewards/margins": 16.135757128397625, + "rewards/rejected": -11.60521125793457, + "step": 10788 + }, + { + "epoch": 0.9857469164001827, + "grad_norm": 2.078125, + "kl": 0.0, + "learning_rate": 5.094744390786721e-09, + "logits/chosen": 441542528.0, + "logits/rejected": 414548608.0, + "logps/chosen": -323.5369873046875, + "logps/rejected": -381.18756103515625, + "loss": 0.0129, + "rewards/chosen": 3.732759475708008, + "rewards/margins": 13.05725383758545, + "rewards/rejected": -9.324494361877441, + "step": 10789 + }, + { + "epoch": 0.9858382823206944, + "grad_norm": 1.8828125, + "kl": 0.0, + "learning_rate": 5.0300607295250544e-09, + "logits/chosen": 380774997.3333333, + "logits/rejected": 416505241.6, + "logps/chosen": -254.73592122395834, + "logps/rejected": -389.9375244140625, + "loss": 0.011, + "rewards/chosen": 3.9200560251871743, + "rewards/margins": 12.945589319864908, + "rewards/rejected": -9.025533294677734, + "step": 10790 + }, + { + "epoch": 0.985929648241206, + "grad_norm": 2.109375, + "kl": 0.0, + "learning_rate": 4.965790105997403e-09, + "logits/chosen": 1075006464.0, + "logits/rejected": 732247722.6666666, + "logps/chosen": -530.6852416992188, + "logps/rejected": -543.7709147135416, + "loss": 0.0084, + "rewards/chosen": 3.5891523361206055, + "rewards/margins": 13.223801612854004, + "rewards/rejected": -9.634649276733398, + "step": 10791 + }, + { + "epoch": 0.9860210141617177, + "grad_norm": 3.328125, + "kl": 0.0, + "learning_rate": 4.901932525519515e-09, + "logits/chosen": 642343296.0, + "logits/rejected": 520124128.0, + "logps/chosen": -473.1081848144531, + "logps/rejected": -589.43017578125, + "loss": 0.0286, + "rewards/chosen": 2.937396287918091, + "rewards/margins": 11.88766598701477, + "rewards/rejected": -8.95026969909668, + "step": 10792 + }, + { + "epoch": 0.9861123800822293, + "grad_norm": 5.1875, + "kl": 0.0, + "learning_rate": 4.838487993371055e-09, + "logits/chosen": 674549452.8, + "logits/rejected": 332773546.6666667, + "logps/chosen": -232.767431640625, + "logps/rejected": -437.6816813151042, + "loss": 0.0499, + "rewards/chosen": 2.6223628997802733, + "rewards/margins": 11.977448908487954, + "rewards/rejected": -9.355086008707682, + "step": 10793 + }, + { + "epoch": 0.986203746002741, + "grad_norm": 0.44140625, + "kl": 0.0, + "learning_rate": 4.7754565147983824e-09, + "logits/chosen": 192744864.0, + "logits/rejected": 453803477.3333333, + "logps/chosen": -182.05267333984375, + "logps/rejected": -562.7732747395834, + "loss": 0.0017, + "rewards/chosen": 5.795522689819336, + "rewards/margins": 15.794126510620117, + "rewards/rejected": -9.998603820800781, + "step": 10794 + }, + { + "epoch": 0.9862951119232526, + "grad_norm": 5.1875, + "kl": 0.0, + "learning_rate": 4.712838095013439e-09, + "logits/chosen": 462678186.6666667, + "logits/rejected": 411623936.0, + "logps/chosen": -342.7765299479167, + "logps/rejected": -241.56439208984375, + "loss": 0.0354, + "rewards/chosen": 3.5033143361409507, + "rewards/margins": 9.789571126302084, + "rewards/rejected": -6.286256790161133, + "step": 10795 + }, + { + "epoch": 0.9863864778437643, + "grad_norm": 1.4921875, + "kl": 0.0, + "learning_rate": 4.650632739194305e-09, + "logits/chosen": 433868352.0, + "logits/rejected": 465328768.0, + "logps/chosen": -335.30743408203125, + "logps/rejected": -452.8196716308594, + "loss": 0.009, + "rewards/chosen": 4.260980606079102, + "rewards/margins": 13.346122741699219, + "rewards/rejected": -9.085142135620117, + "step": 10796 + }, + { + "epoch": 0.9864778437642759, + "grad_norm": 2.5, + "kl": 0.0, + "learning_rate": 4.588840452485199e-09, + "logits/chosen": 475843712.0, + "logits/rejected": 379770368.0, + "logps/chosen": -212.0246124267578, + "logps/rejected": -447.3504638671875, + "loss": 0.0218, + "rewards/chosen": 3.1109657287597656, + "rewards/margins": 12.17757797241211, + "rewards/rejected": -9.066612243652344, + "step": 10797 + }, + { + "epoch": 0.9865692096847876, + "grad_norm": 2.171875, + "kl": 0.0, + "learning_rate": 4.5274612399959226e-09, + "logits/chosen": 270584576.0, + "logits/rejected": 677269546.6666666, + "logps/chosen": -277.8748779296875, + "logps/rejected": -607.9744873046875, + "loss": 0.01, + "rewards/chosen": 4.107302665710449, + "rewards/margins": 14.311453819274902, + "rewards/rejected": -10.204151153564453, + "step": 10798 + }, + { + "epoch": 0.9866605756052992, + "grad_norm": 1.5, + "kl": 0.0, + "learning_rate": 4.46649510680075e-09, + "logits/chosen": 591800960.0, + "logits/rejected": 375234528.0, + "logps/chosen": -295.2601318359375, + "logps/rejected": -601.1025390625, + "loss": 0.0074, + "rewards/chosen": 5.0136308670043945, + "rewards/margins": 13.817152976989746, + "rewards/rejected": -8.803522109985352, + "step": 10799 + }, + { + "epoch": 0.9867519415258109, + "grad_norm": 2.03125, + "kl": 0.0, + "learning_rate": 4.405942057942314e-09, + "logits/chosen": 906358976.0, + "logits/rejected": 680732608.0, + "logps/chosen": -385.38262939453125, + "logps/rejected": -624.3748779296875, + "loss": 0.0108, + "rewards/chosen": 4.245504379272461, + "rewards/margins": 12.79837417602539, + "rewards/rejected": -8.55286979675293, + "step": 10800 + }, + { + "epoch": 0.9868433074463225, + "grad_norm": 1.890625, + "kl": 0.0, + "learning_rate": 4.345802098427165e-09, + "logits/chosen": 572651008.0, + "logits/rejected": 372703328.0, + "logps/chosen": -210.1385040283203, + "logps/rejected": -399.3593444824219, + "loss": 0.0129, + "rewards/chosen": 3.921492338180542, + "rewards/margins": 13.535096883773804, + "rewards/rejected": -9.613604545593262, + "step": 10801 + }, + { + "epoch": 0.9869346733668342, + "grad_norm": 5.0, + "kl": 0.0, + "learning_rate": 4.286075233227993e-09, + "logits/chosen": 1034238171.4285715, + "logits/rejected": 472313728.0, + "logps/chosen": -278.80496651785717, + "logps/rejected": -445.21734619140625, + "loss": 0.0502, + "rewards/chosen": 2.855300085885184, + "rewards/margins": 13.226812498910085, + "rewards/rejected": -10.371512413024902, + "step": 10802 + }, + { + "epoch": 0.9870260392873458, + "grad_norm": 1.859375, + "kl": 0.0, + "learning_rate": 4.226761467284734e-09, + "logits/chosen": 712326912.0, + "logits/rejected": 677561984.0, + "logps/chosen": -295.25286865234375, + "logps/rejected": -267.3317565917969, + "loss": 0.0107, + "rewards/chosen": 4.227137565612793, + "rewards/margins": 12.506379127502441, + "rewards/rejected": -8.279241561889648, + "step": 10803 + }, + { + "epoch": 0.9871174052078575, + "grad_norm": 4.8125, + "kl": 0.0, + "learning_rate": 4.16786080550069e-09, + "logits/chosen": 375418965.3333333, + "logits/rejected": 300142668.8, + "logps/chosen": -246.61065673828125, + "logps/rejected": -491.12294921875, + "loss": 0.0144, + "rewards/chosen": 4.098008791605632, + "rewards/margins": 15.68370443979899, + "rewards/rejected": -11.58569564819336, + "step": 10804 + }, + { + "epoch": 0.9872087711283691, + "grad_norm": 2.671875, + "kl": 0.0, + "learning_rate": 4.109373252747517e-09, + "logits/chosen": 589444966.4, + "logits/rejected": 642433536.0, + "logps/chosen": -338.074609375, + "logps/rejected": -411.1377360026042, + "loss": 0.0221, + "rewards/chosen": 3.5953845977783203, + "rewards/margins": 12.999526977539062, + "rewards/rejected": -9.404142379760742, + "step": 10805 + }, + { + "epoch": 0.9873001370488808, + "grad_norm": 5.875, + "kl": 0.0, + "learning_rate": 4.051298813861349e-09, + "logits/chosen": 550081024.0, + "logits/rejected": 569329792.0, + "logps/chosen": -275.2796630859375, + "logps/rejected": -489.0577392578125, + "loss": 0.0403, + "rewards/chosen": 3.756019910176595, + "rewards/margins": 12.847745259602865, + "rewards/rejected": -9.09172534942627, + "step": 10806 + }, + { + "epoch": 0.9873915029693924, + "grad_norm": 2.1875, + "kl": 0.0, + "learning_rate": 3.993637493644453e-09, + "logits/chosen": 476053280.0, + "logits/rejected": 407389312.0, + "logps/chosen": -315.5900573730469, + "logps/rejected": -434.2979329427083, + "loss": 0.0086, + "rewards/chosen": 4.439738750457764, + "rewards/margins": 13.170693238576254, + "rewards/rejected": -8.73095448811849, + "step": 10807 + }, + { + "epoch": 0.9874828688899041, + "grad_norm": 4.4375, + "kl": 0.0, + "learning_rate": 3.936389296864129e-09, + "logits/chosen": 658251878.4, + "logits/rejected": 411153365.3333333, + "logps/chosen": -409.542578125, + "logps/rejected": -625.544921875, + "loss": 0.0229, + "rewards/chosen": 3.6018600463867188, + "rewards/margins": 17.997474670410156, + "rewards/rejected": -14.395614624023438, + "step": 10808 + }, + { + "epoch": 0.9875742348104157, + "grad_norm": 2.921875, + "kl": 0.0, + "learning_rate": 3.879554228255477e-09, + "logits/chosen": 711834176.0, + "logits/rejected": 438127872.0, + "logps/chosen": -568.7022705078125, + "logps/rejected": -517.2236735026041, + "loss": 0.0102, + "rewards/chosen": 3.314746141433716, + "rewards/margins": 14.42225956916809, + "rewards/rejected": -11.107513427734375, + "step": 10809 + }, + { + "epoch": 0.9876656007309274, + "grad_norm": 2.765625, + "kl": 0.0, + "learning_rate": 3.823132292516962e-09, + "logits/chosen": 370035712.0, + "logits/rejected": 564283904.0, + "logps/chosen": -200.65774536132812, + "logps/rejected": -519.79833984375, + "loss": 0.0282, + "rewards/chosen": 3.1305813789367676, + "rewards/margins": 11.190364360809326, + "rewards/rejected": -8.059782981872559, + "step": 10810 + }, + { + "epoch": 0.987756966651439, + "grad_norm": 3.609375, + "kl": 0.0, + "learning_rate": 3.767123494315406e-09, + "logits/chosen": 524912332.8, + "logits/rejected": 405216384.0, + "logps/chosen": -398.112109375, + "logps/rejected": -464.830322265625, + "loss": 0.0181, + "rewards/chosen": 4.111938858032227, + "rewards/margins": 14.06377067565918, + "rewards/rejected": -9.951831817626953, + "step": 10811 + }, + { + "epoch": 0.9878483325719507, + "grad_norm": 1.4921875, + "kl": 0.0, + "learning_rate": 3.7115278382815478e-09, + "logits/chosen": 408504320.0, + "logits/rejected": 333530453.3333333, + "logps/chosen": -280.4493408203125, + "logps/rejected": -519.6881917317709, + "loss": 0.0088, + "rewards/chosen": 3.4813027381896973, + "rewards/margins": 14.827033837636312, + "rewards/rejected": -11.345731099446615, + "step": 10812 + }, + { + "epoch": 0.9879396984924623, + "grad_norm": 2.46875, + "kl": 0.0, + "learning_rate": 3.6563453290122675e-09, + "logits/chosen": 404503072.0, + "logits/rejected": 313333664.0, + "logps/chosen": -261.33160400390625, + "logps/rejected": -472.0480651855469, + "loss": 0.0148, + "rewards/chosen": 3.8469715118408203, + "rewards/margins": 11.829949378967285, + "rewards/rejected": -7.982977867126465, + "step": 10813 + }, + { + "epoch": 0.988031064412974, + "grad_norm": 4.375, + "kl": 0.0, + "learning_rate": 3.6015759710711364e-09, + "logits/chosen": 567818197.3333334, + "logits/rejected": 551500928.0, + "logps/chosen": -301.78476969401044, + "logps/rejected": -476.3720703125, + "loss": 0.0296, + "rewards/chosen": 4.013612111409505, + "rewards/margins": 11.3545716603597, + "rewards/rejected": -7.340959548950195, + "step": 10814 + }, + { + "epoch": 0.9881224303334856, + "grad_norm": 4.9375, + "kl": 0.0, + "learning_rate": 3.547219768987309e-09, + "logits/chosen": 726186905.6, + "logits/rejected": 935382357.3333334, + "logps/chosen": -253.1125, + "logps/rejected": -609.4253743489584, + "loss": 0.1271, + "rewards/chosen": 3.0092559814453126, + "rewards/margins": 11.97436548868815, + "rewards/rejected": -8.965109507242838, + "step": 10815 + }, + { + "epoch": 0.9882137962539973, + "grad_norm": 0.890625, + "kl": 0.0, + "learning_rate": 3.4932767272549683e-09, + "logits/chosen": 406166528.0, + "logits/rejected": 404256102.4, + "logps/chosen": -265.7574462890625, + "logps/rejected": -350.0287353515625, + "loss": 0.0049, + "rewards/chosen": 4.716025670369466, + "rewards/margins": 13.572827275594076, + "rewards/rejected": -8.856801605224609, + "step": 10816 + }, + { + "epoch": 0.9883051621745089, + "grad_norm": 0.443359375, + "kl": 0.0, + "learning_rate": 3.439746850334991e-09, + "logits/chosen": 935327360.0, + "logits/rejected": 580713429.3333334, + "logps/chosen": -342.52972412109375, + "logps/rejected": -633.1764729817709, + "loss": 0.002, + "rewards/chosen": 4.884265899658203, + "rewards/margins": 14.430728276570639, + "rewards/rejected": -9.546462376912435, + "step": 10817 + }, + { + "epoch": 0.9883965280950205, + "grad_norm": 2.140625, + "kl": 0.0, + "learning_rate": 3.38663014265439e-09, + "logits/chosen": 547403264.0, + "logits/rejected": 498918688.0, + "logps/chosen": -265.8587646484375, + "logps/rejected": -326.9334411621094, + "loss": 0.0178, + "rewards/chosen": 3.649505615234375, + "rewards/margins": 12.128498077392578, + "rewards/rejected": -8.478992462158203, + "step": 10818 + }, + { + "epoch": 0.9884878940155322, + "grad_norm": 0.68359375, + "kl": 0.0, + "learning_rate": 3.333926608604099e-09, + "logits/chosen": 477771360.0, + "logits/rejected": 524457728.0, + "logps/chosen": -173.75250244140625, + "logps/rejected": -541.1956787109375, + "loss": 0.0047, + "rewards/chosen": 4.884716033935547, + "rewards/margins": 14.61922836303711, + "rewards/rejected": -9.734512329101562, + "step": 10819 + }, + { + "epoch": 0.9885792599360439, + "grad_norm": 3.0625, + "kl": 0.0, + "learning_rate": 3.2816362525434074e-09, + "logits/chosen": 777168298.6666666, + "logits/rejected": 141817968.0, + "logps/chosen": -483.8448893229167, + "logps/rejected": -264.1448669433594, + "loss": 0.0257, + "rewards/chosen": 3.498509089152018, + "rewards/margins": 10.236220041910807, + "rewards/rejected": -6.737710952758789, + "step": 10820 + }, + { + "epoch": 0.9886706258565555, + "grad_norm": 3.046875, + "kl": 0.0, + "learning_rate": 3.2297590787955248e-09, + "logits/chosen": 658455193.6, + "logits/rejected": 547001130.6666666, + "logps/chosen": -502.87021484375, + "logps/rejected": -616.9250081380209, + "loss": 0.018, + "rewards/chosen": 3.987104034423828, + "rewards/margins": 14.337495422363281, + "rewards/rejected": -10.350391387939453, + "step": 10821 + }, + { + "epoch": 0.9887619917770671, + "grad_norm": 1.921875, + "kl": 0.0, + "learning_rate": 3.178295091650352e-09, + "logits/chosen": 856982976.0, + "logits/rejected": 943925174.8571428, + "logps/chosen": -515.3511962890625, + "logps/rejected": -512.8210797991071, + "loss": 0.0042, + "rewards/chosen": 3.713726758956909, + "rewards/margins": 12.83167828832354, + "rewards/rejected": -9.11795152936663, + "step": 10822 + }, + { + "epoch": 0.9888533576975788, + "grad_norm": 2.109375, + "kl": 0.0, + "learning_rate": 3.127244295363929e-09, + "logits/chosen": 767518924.8, + "logits/rejected": 467584341.3333333, + "logps/chosen": -363.318310546875, + "logps/rejected": -392.643798828125, + "loss": 0.0172, + "rewards/chosen": 3.74488525390625, + "rewards/margins": 12.220164744059243, + "rewards/rejected": -8.475279490152994, + "step": 10823 + }, + { + "epoch": 0.9889447236180905, + "grad_norm": 1.984375, + "kl": 0.0, + "learning_rate": 3.0766066941567697e-09, + "logits/chosen": 583410304.0, + "logits/rejected": 439586090.6666667, + "logps/chosen": -429.2938537597656, + "logps/rejected": -486.5057779947917, + "loss": 0.0102, + "rewards/chosen": 3.3219940662384033, + "rewards/margins": 13.540738979975382, + "rewards/rejected": -10.218744913736979, + "step": 10824 + }, + { + "epoch": 0.9890360895386021, + "grad_norm": 2.71875, + "kl": 0.0, + "learning_rate": 3.026382292217189e-09, + "logits/chosen": 937649152.0, + "logits/rejected": 579121877.3333334, + "logps/chosen": -397.1337646484375, + "logps/rejected": -404.7473551432292, + "loss": 0.0154, + "rewards/chosen": 3.866414260864258, + "rewards/margins": 12.150896581013999, + "rewards/rejected": -8.28448232014974, + "step": 10825 + }, + { + "epoch": 0.9891274554591137, + "grad_norm": 2.28125, + "kl": 0.0, + "learning_rate": 2.9765710936979774e-09, + "logits/chosen": 555875712.0, + "logits/rejected": 798194752.0, + "logps/chosen": -282.6557312011719, + "logps/rejected": -538.715576171875, + "loss": 0.0122, + "rewards/chosen": 4.079258918762207, + "rewards/margins": 13.755463600158691, + "rewards/rejected": -9.676204681396484, + "step": 10826 + }, + { + "epoch": 0.9892188213796254, + "grad_norm": 0.455078125, + "kl": 0.0, + "learning_rate": 2.9271731027175064e-09, + "logits/rejected": 729366464.0, + "logps/rejected": -363.8713684082031, + "loss": 0.0005, + "rewards/rejected": -9.223320007324219, + "step": 10827 + }, + { + "epoch": 0.9893101873001371, + "grad_norm": 2.046875, + "kl": 0.0, + "learning_rate": 2.878188323360842e-09, + "logits/chosen": 691883477.3333334, + "logits/rejected": 534267238.4, + "logps/chosen": -422.0244140625, + "logps/rejected": -464.8138671875, + "loss": 0.0113, + "rewards/chosen": 3.5351521174112954, + "rewards/margins": 12.139830462137857, + "rewards/rejected": -8.604678344726562, + "step": 10828 + }, + { + "epoch": 0.9894015532206487, + "grad_norm": 3.15625, + "kl": 0.0, + "learning_rate": 2.829616759679188e-09, + "logits/chosen": 881087232.0, + "logits/rejected": 834482880.0, + "logps/chosen": -295.8057861328125, + "logps/rejected": -421.7851867675781, + "loss": 0.0232, + "rewards/chosen": 3.1253089904785156, + "rewards/margins": 12.352742195129395, + "rewards/rejected": -9.227433204650879, + "step": 10829 + }, + { + "epoch": 0.9894929191411603, + "grad_norm": 3.453125, + "kl": 0.0, + "learning_rate": 2.7814584156876655e-09, + "logits/chosen": 611409920.0, + "logits/rejected": 403833536.0, + "logps/chosen": -587.84423828125, + "logps/rejected": -406.74285888671875, + "loss": 0.0195, + "rewards/chosen": 3.428183078765869, + "rewards/margins": 12.7834153175354, + "rewards/rejected": -9.355232238769531, + "step": 10830 + }, + { + "epoch": 0.989584285061672, + "grad_norm": 1.90625, + "kl": 0.0, + "learning_rate": 2.7337132953697555e-09, + "logits/chosen": 415740032.0, + "logits/rejected": 528874432.0, + "logps/chosen": -160.28497314453125, + "logps/rejected": -679.7847290039062, + "loss": 0.0125, + "rewards/chosen": 4.021827220916748, + "rewards/margins": 11.967445850372314, + "rewards/rejected": -7.945618629455566, + "step": 10831 + }, + { + "epoch": 0.9896756509821837, + "grad_norm": 7.53125, + "kl": 0.0, + "learning_rate": 2.68638140267341e-09, + "logits/chosen": 441594368.0, + "logits/rejected": 407190144.0, + "logps/chosen": -258.0530700683594, + "logps/rejected": -391.697021484375, + "loss": 0.0119, + "rewards/chosen": 4.21354341506958, + "rewards/margins": 12.178386688232422, + "rewards/rejected": -7.964843273162842, + "step": 10832 + }, + { + "epoch": 0.9897670169026953, + "grad_norm": 4.53125, + "kl": 0.0, + "learning_rate": 2.6394627415116115e-09, + "logits/chosen": 655609651.2, + "logits/rejected": 390486186.6666667, + "logps/chosen": -262.5166015625, + "logps/rejected": -384.7544352213542, + "loss": 0.0311, + "rewards/chosen": 3.6653003692626953, + "rewards/margins": 11.061360677083332, + "rewards/rejected": -7.396060307820638, + "step": 10833 + }, + { + "epoch": 0.9898583828232069, + "grad_norm": 2.390625, + "kl": 0.0, + "learning_rate": 2.592957315764588e-09, + "logits/chosen": 454259626.6666667, + "logits/rejected": 494590566.4, + "logps/chosen": -333.7856038411458, + "logps/rejected": -391.412939453125, + "loss": 0.011, + "rewards/chosen": 3.945418357849121, + "rewards/margins": 12.326923942565918, + "rewards/rejected": -8.381505584716797, + "step": 10834 + }, + { + "epoch": 0.9899497487437185, + "grad_norm": 3.0625, + "kl": 0.0, + "learning_rate": 2.5468651292781533e-09, + "logits/chosen": 537067673.6, + "logits/rejected": 219140266.66666666, + "logps/chosen": -230.7611328125, + "logps/rejected": -572.97314453125, + "loss": 0.0245, + "rewards/chosen": 4.217464828491211, + "rewards/margins": 14.84603474934896, + "rewards/rejected": -10.628569920857748, + "step": 10835 + }, + { + "epoch": 0.9900411146642303, + "grad_norm": 29.0, + "kl": 0.0, + "learning_rate": 2.5011861858637023e-09, + "logits/chosen": 223572272.0, + "logits/rejected": 526676699.4285714, + "logps/chosen": -176.74087524414062, + "logps/rejected": -451.45326450892856, + "loss": 0.0884, + "rewards/chosen": 4.8711748123168945, + "rewards/margins": 12.99951103755406, + "rewards/rejected": -8.128336225237165, + "step": 10836 + }, + { + "epoch": 0.9901324805847419, + "grad_norm": 23.625, + "kl": 0.0, + "learning_rate": 2.4559204892982136e-09, + "logits/chosen": 453568320.0, + "logits/rejected": 672200320.0, + "logps/chosen": -305.4633483886719, + "logps/rejected": -409.11712646484375, + "loss": 0.0324, + "rewards/chosen": 4.359846115112305, + "rewards/margins": 11.212746620178223, + "rewards/rejected": -6.852900505065918, + "step": 10837 + }, + { + "epoch": 0.9902238465052535, + "grad_norm": 3.390625, + "kl": 0.0, + "learning_rate": 2.4110680433248045e-09, + "logits/chosen": 411798229.3333333, + "logits/rejected": 516411955.2, + "logps/chosen": -340.37754313151044, + "logps/rejected": -566.40205078125, + "loss": 0.0209, + "rewards/chosen": 2.905787150065104, + "rewards/margins": 14.152946726481119, + "rewards/rejected": -11.247159576416015, + "step": 10838 + }, + { + "epoch": 0.9903152124257651, + "grad_norm": 2.640625, + "kl": 0.0, + "learning_rate": 2.366628851652175e-09, + "logits/chosen": 507870293.3333333, + "logits/rejected": 573453926.4, + "logps/chosen": -344.1636555989583, + "logps/rejected": -648.1306640625, + "loss": 0.0156, + "rewards/chosen": 3.6122461954752603, + "rewards/margins": 14.468462626139322, + "rewards/rejected": -10.856216430664062, + "step": 10839 + }, + { + "epoch": 0.9904065783462769, + "grad_norm": 5.0625, + "kl": 0.0, + "learning_rate": 2.3226029179557187e-09, + "logits/chosen": 587026048.0, + "logits/rejected": 343247936.0, + "logps/chosen": -375.2685241699219, + "logps/rejected": -415.6147766113281, + "loss": 0.0281, + "rewards/chosen": 3.348416566848755, + "rewards/margins": 14.400532960891724, + "rewards/rejected": -11.052116394042969, + "step": 10840 + }, + { + "epoch": 0.9904979442667885, + "grad_norm": 2.09375, + "kl": 0.0, + "learning_rate": 2.278990245875301e-09, + "logits/chosen": 782155456.0, + "logits/rejected": 1057655424.0, + "logps/chosen": -124.39994049072266, + "logps/rejected": -332.25115966796875, + "loss": 0.0206, + "rewards/chosen": 3.8646159172058105, + "rewards/margins": 11.098484516143799, + "rewards/rejected": -7.233868598937988, + "step": 10841 + }, + { + "epoch": 0.9905893101873001, + "grad_norm": 2.015625, + "kl": 0.0, + "learning_rate": 2.235790839017482e-09, + "logits/chosen": 833367616.0, + "logits/rejected": 712400768.0, + "logps/chosen": -226.00177001953125, + "logps/rejected": -826.48583984375, + "loss": 0.0123, + "rewards/chosen": 4.133389472961426, + "rewards/margins": 13.81515884399414, + "rewards/rejected": -9.681769371032715, + "step": 10842 + }, + { + "epoch": 0.9906806761078117, + "grad_norm": 5.0, + "kl": 0.0, + "learning_rate": 2.1930047009538493e-09, + "logits/chosen": 959619925.3333334, + "logits/rejected": 644836659.2, + "logps/chosen": -183.29972330729166, + "logps/rejected": -521.76298828125, + "loss": 0.0347, + "rewards/chosen": 2.3468329111735025, + "rewards/margins": 11.720101038614908, + "rewards/rejected": -9.373268127441406, + "step": 10843 + }, + { + "epoch": 0.9907720420283235, + "grad_norm": 3.09375, + "kl": 0.0, + "learning_rate": 2.1506318352237934e-09, + "logits/chosen": 635602329.6, + "logits/rejected": 706733909.3333334, + "logps/chosen": -381.931298828125, + "logps/rejected": -331.36378987630206, + "loss": 0.0208, + "rewards/chosen": 3.5856761932373047, + "rewards/margins": 12.757685979207357, + "rewards/rejected": -9.172009785970053, + "step": 10844 + }, + { + "epoch": 0.9908634079488351, + "grad_norm": 4.21875, + "kl": 0.0, + "learning_rate": 2.1086722453300677e-09, + "logits/chosen": 498977484.8, + "logits/rejected": 447785728.0, + "logps/chosen": -238.16513671875, + "logps/rejected": -421.3687337239583, + "loss": 0.1372, + "rewards/chosen": 2.698760414123535, + "rewards/margins": 12.04360523223877, + "rewards/rejected": -9.344844818115234, + "step": 10845 + }, + { + "epoch": 0.9909547738693467, + "grad_norm": 2.90625, + "kl": 0.0, + "learning_rate": 2.067125934742675e-09, + "logits/chosen": 619146035.2, + "logits/rejected": 309698261.3333333, + "logps/chosen": -460.99580078125, + "logps/rejected": -453.8404947916667, + "loss": 0.0203, + "rewards/chosen": 3.4468128204345705, + "rewards/margins": 13.544892501831054, + "rewards/rejected": -10.098079681396484, + "step": 10846 + }, + { + "epoch": 0.9910461397898583, + "grad_norm": 1.8046875, + "kl": 0.0, + "learning_rate": 2.0259929068972007e-09, + "logits/chosen": 547734229.3333334, + "logits/rejected": 470131507.2, + "logps/chosen": -380.2335611979167, + "logps/rejected": -640.395849609375, + "loss": 0.0087, + "rewards/chosen": 4.184318542480469, + "rewards/margins": 14.596323394775391, + "rewards/rejected": -10.412004852294922, + "step": 10847 + }, + { + "epoch": 0.9911375057103701, + "grad_norm": 3.21875, + "kl": 0.0, + "learning_rate": 1.9852731651948122e-09, + "logits/chosen": 1394589184.0, + "logits/rejected": 686721962.6666666, + "logps/chosen": -359.3330993652344, + "logps/rejected": -564.4727376302084, + "loss": 0.0144, + "rewards/chosen": 3.030656337738037, + "rewards/margins": 12.533218542734781, + "rewards/rejected": -9.502562204996744, + "step": 10848 + }, + { + "epoch": 0.9912288716308817, + "grad_norm": 3.5625, + "kl": 0.0, + "learning_rate": 1.944966713002816e-09, + "logits/chosen": 590489685.3333334, + "logits/rejected": 989341081.6, + "logps/chosen": -335.8941650390625, + "logps/rejected": -459.88896484375, + "loss": 0.0246, + "rewards/chosen": 2.66507625579834, + "rewards/margins": 12.106265830993653, + "rewards/rejected": -9.441189575195313, + "step": 10849 + }, + { + "epoch": 0.9913202375513933, + "grad_norm": 3.296875, + "kl": 0.0, + "learning_rate": 1.905073553653547e-09, + "logits/chosen": 548382310.4, + "logits/rejected": 310808960.0, + "logps/chosen": -456.460400390625, + "logps/rejected": -223.44864908854166, + "loss": 0.0165, + "rewards/chosen": 4.176039505004883, + "rewards/margins": 11.945807266235352, + "rewards/rejected": -7.769767761230469, + "step": 10850 + }, + { + "epoch": 0.9914116034719049, + "grad_norm": 4.9375, + "kl": 0.0, + "learning_rate": 1.865593690446588e-09, + "logits/chosen": 587662182.4, + "logits/rejected": 586528853.3333334, + "logps/chosen": -243.035009765625, + "logps/rejected": -492.6333821614583, + "loss": 0.0329, + "rewards/chosen": 3.1745578765869142, + "rewards/margins": 10.636848958333333, + "rewards/rejected": -7.462291081746419, + "step": 10851 + }, + { + "epoch": 0.9915029693924167, + "grad_norm": 3.34375, + "kl": 0.0, + "learning_rate": 1.826527126646549e-09, + "logits/chosen": 777944256.0, + "logits/rejected": 466743232.0, + "logps/chosen": -516.3034057617188, + "logps/rejected": -562.0051879882812, + "loss": 0.0298, + "rewards/chosen": 3.604931354522705, + "rewards/margins": 13.056212902069092, + "rewards/rejected": -9.451281547546387, + "step": 10852 + }, + { + "epoch": 0.9915943353129283, + "grad_norm": 1.71875, + "kl": 0.0, + "learning_rate": 1.7878738654836247e-09, + "logits/chosen": 638250393.6, + "logits/rejected": 356831573.3333333, + "logps/chosen": -307.308935546875, + "logps/rejected": -382.4100341796875, + "loss": 0.0136, + "rewards/chosen": 4.001198577880859, + "rewards/margins": 14.708339182535806, + "rewards/rejected": -10.707140604654947, + "step": 10853 + }, + { + "epoch": 0.9916857012334399, + "grad_norm": 2.828125, + "kl": 0.0, + "learning_rate": 1.749633910153592e-09, + "logits/chosen": 418390848.0, + "logits/rejected": 672368832.0, + "logps/chosen": -272.5384216308594, + "logps/rejected": -546.3468627929688, + "loss": 0.0221, + "rewards/chosen": 3.2568745613098145, + "rewards/margins": 13.674965381622314, + "rewards/rejected": -10.4180908203125, + "step": 10854 + }, + { + "epoch": 0.9917770671539515, + "grad_norm": 1.53125, + "kl": 0.0, + "learning_rate": 1.711807263819476e-09, + "logits/chosen": 701115477.3333334, + "logits/rejected": 580802816.0, + "logps/chosen": -549.5394287109375, + "logps/rejected": -588.21572265625, + "loss": 0.0079, + "rewards/chosen": 3.839567184448242, + "rewards/margins": 14.328910446166992, + "rewards/rejected": -10.48934326171875, + "step": 10855 + }, + { + "epoch": 0.9918684330744633, + "grad_norm": 1.5546875, + "kl": 0.0, + "learning_rate": 1.6743939296087752e-09, + "logits/chosen": 658609194.6666666, + "logits/rejected": 414161356.8, + "logps/chosen": -376.5989990234375, + "logps/rejected": -376.883203125, + "loss": 0.0079, + "rewards/chosen": 4.389517466227214, + "rewards/margins": 13.137369028727214, + "rewards/rejected": -8.7478515625, + "step": 10856 + }, + { + "epoch": 0.9919597989949749, + "grad_norm": 7.25, + "kl": 0.0, + "learning_rate": 1.637393910614571e-09, + "logits/chosen": 452384896.0, + "logits/rejected": 1057196928.0, + "logps/chosen": -358.1728515625, + "logps/rejected": -365.515869140625, + "loss": 0.1073, + "rewards/chosen": 3.751551389694214, + "rewards/margins": 10.415956258773804, + "rewards/rejected": -6.66440486907959, + "step": 10857 + }, + { + "epoch": 0.9920511649154865, + "grad_norm": 1.859375, + "kl": 0.0, + "learning_rate": 1.6008072098977478e-09, + "logits/chosen": 358349312.0, + "logits/rejected": 354896448.0, + "logps/chosen": -337.666748046875, + "logps/rejected": -226.9588826497396, + "loss": 0.0092, + "rewards/chosen": 4.690799713134766, + "rewards/margins": 11.923632303873699, + "rewards/rejected": -7.232832590738933, + "step": 10858 + }, + { + "epoch": 0.9921425308359981, + "grad_norm": 3.21875, + "kl": 0.0, + "learning_rate": 1.5646338304825537e-09, + "logits/chosen": 806181427.2, + "logits/rejected": 737280170.6666666, + "logps/chosen": -311.1854248046875, + "logps/rejected": -414.6544596354167, + "loss": 0.0271, + "rewards/chosen": 3.3425220489501952, + "rewards/margins": 11.144802474975586, + "rewards/rejected": -7.802280426025391, + "step": 10859 + }, + { + "epoch": 0.9922338967565099, + "grad_norm": 26.125, + "kl": 0.0, + "learning_rate": 1.5288737753604844e-09, + "logits/chosen": 524416416.0, + "logits/rejected": 519858816.0, + "logps/chosen": -306.44940185546875, + "logps/rejected": -430.099609375, + "loss": 0.0791, + "rewards/chosen": 3.544541597366333, + "rewards/margins": 10.21396517753601, + "rewards/rejected": -6.669423580169678, + "step": 10860 + }, + { + "epoch": 0.9923252626770215, + "grad_norm": 3.9375, + "kl": 0.0, + "learning_rate": 1.493527047489174e-09, + "logits/chosen": 470524928.0, + "logits/rejected": 351615509.3333333, + "logps/chosen": -280.1211669921875, + "logps/rejected": -542.5432942708334, + "loss": 0.0272, + "rewards/chosen": 3.232582855224609, + "rewards/margins": 14.907379913330079, + "rewards/rejected": -11.674797058105469, + "step": 10861 + }, + { + "epoch": 0.9924166285975331, + "grad_norm": 2.25, + "kl": 0.0, + "learning_rate": 1.4585936497907295e-09, + "logits/chosen": 830206848.0, + "logits/rejected": 436471648.0, + "logps/chosen": -292.3181457519531, + "logps/rejected": -429.7603454589844, + "loss": 0.0144, + "rewards/chosen": 3.6248862743377686, + "rewards/margins": 12.726954221725464, + "rewards/rejected": -9.102067947387695, + "step": 10862 + }, + { + "epoch": 0.9925079945180447, + "grad_norm": 0.65625, + "kl": 0.0, + "learning_rate": 1.424073585153951e-09, + "logits/chosen": 1022587584.0, + "logits/rejected": 385748053.3333333, + "logps/chosen": -117.75898742675781, + "logps/rejected": -500.5340983072917, + "loss": 0.0069, + "rewards/chosen": 3.944660186767578, + "rewards/margins": 15.324593861897787, + "rewards/rejected": -11.379933675130209, + "step": 10863 + }, + { + "epoch": 0.9925993604385565, + "grad_norm": 2.703125, + "kl": 0.0, + "learning_rate": 1.3899668564337777e-09, + "logits/chosen": 370187712.0, + "logits/rejected": 488643552.0, + "logps/chosen": -380.9732666015625, + "logps/rejected": -529.1951293945312, + "loss": 0.0187, + "rewards/chosen": 3.2761447429656982, + "rewards/margins": 11.82477068901062, + "rewards/rejected": -8.548625946044922, + "step": 10864 + }, + { + "epoch": 0.9926907263590681, + "grad_norm": 3.34375, + "kl": 0.0, + "learning_rate": 1.3562734664496203e-09, + "logits/chosen": 1166416691.2, + "logits/rejected": 525932928.0, + "logps/chosen": -468.213671875, + "logps/rejected": -316.00921630859375, + "loss": 0.0249, + "rewards/chosen": 3.6533985137939453, + "rewards/margins": 10.340725580851238, + "rewards/rejected": -6.687327067057292, + "step": 10865 + }, + { + "epoch": 0.9927820922795797, + "grad_norm": 3.015625, + "kl": 0.0, + "learning_rate": 1.322993417988694e-09, + "logits/chosen": 412428202.6666667, + "logits/rejected": 397278182.4, + "logps/chosen": -277.05047607421875, + "logps/rejected": -480.759375, + "loss": 0.0157, + "rewards/chosen": 3.4103832244873047, + "rewards/margins": 13.117414474487305, + "rewards/rejected": -9.70703125, + "step": 10866 + }, + { + "epoch": 0.9928734582000913, + "grad_norm": 3.453125, + "kl": 0.0, + "learning_rate": 1.2901267138021311e-09, + "logits/chosen": 430975385.6, + "logits/rejected": 356642069.3333333, + "logps/chosen": -331.658544921875, + "logps/rejected": -509.3226725260417, + "loss": 0.021, + "rewards/chosen": 3.805677032470703, + "rewards/margins": 17.332782999674478, + "rewards/rejected": -13.527105967203775, + "step": 10867 + }, + { + "epoch": 0.992964824120603, + "grad_norm": 2.484375, + "kl": 0.0, + "learning_rate": 1.2576733566083133e-09, + "logits/chosen": 561898688.0, + "logits/rejected": 394498368.0, + "logps/chosen": -566.0498657226562, + "logps/rejected": -432.94158935546875, + "loss": 0.0138, + "rewards/chosen": 3.79030704498291, + "rewards/margins": 13.474844932556152, + "rewards/rejected": -9.684537887573242, + "step": 10868 + }, + { + "epoch": 0.9930561900411147, + "grad_norm": 4.5, + "kl": 0.0, + "learning_rate": 1.2256333490906492e-09, + "logits/chosen": 383772160.0, + "logits/rejected": 444473088.0, + "logps/chosen": -387.658203125, + "logps/rejected": -465.1248779296875, + "loss": 0.0288, + "rewards/chosen": 3.5477957725524902, + "rewards/margins": 12.07028341293335, + "rewards/rejected": -8.52248764038086, + "step": 10869 + }, + { + "epoch": 0.9931475559616263, + "grad_norm": 1.5703125, + "kl": 0.0, + "learning_rate": 1.1940066938981309e-09, + "logits/chosen": 565434432.0, + "logits/rejected": 538161408.0, + "logps/chosen": -316.2301025390625, + "logps/rejected": -405.38800048828125, + "loss": 0.0094, + "rewards/chosen": 4.191235542297363, + "rewards/margins": 12.857312202453613, + "rewards/rejected": -8.66607666015625, + "step": 10870 + }, + { + "epoch": 0.9932389218821379, + "grad_norm": 1.8125, + "kl": 0.0, + "learning_rate": 1.1627933936464442e-09, + "logits/chosen": 401270442.6666667, + "logits/rejected": 250920768.0, + "logps/chosen": -202.91162109375, + "logps/rejected": -566.9000244140625, + "loss": 0.0166, + "rewards/chosen": 4.167794545491536, + "rewards/margins": 16.452250798543293, + "rewards/rejected": -12.284456253051758, + "step": 10871 + }, + { + "epoch": 0.9933302878026496, + "grad_norm": 27.75, + "kl": 0.0, + "learning_rate": 1.1319934509163023e-09, + "logits/chosen": 763491925.3333334, + "logits/rejected": 759902003.2, + "logps/chosen": -317.21645100911456, + "logps/rejected": -717.33818359375, + "loss": 0.0319, + "rewards/chosen": 2.773434638977051, + "rewards/margins": 14.80693759918213, + "rewards/rejected": -12.033502960205078, + "step": 10872 + }, + { + "epoch": 0.9934216537231613, + "grad_norm": 4.9375, + "kl": 0.0, + "learning_rate": 1.1016068682545567e-09, + "logits/chosen": 568296576.0, + "logits/rejected": 557372160.0, + "logps/chosen": -445.21038818359375, + "logps/rejected": -647.4613037109375, + "loss": 0.0279, + "rewards/chosen": 3.1791069507598877, + "rewards/margins": 12.262356996536255, + "rewards/rejected": -9.083250045776367, + "step": 10873 + }, + { + "epoch": 0.9935130196436729, + "grad_norm": 3.875, + "kl": 0.0, + "learning_rate": 1.0716336481747524e-09, + "logits/chosen": 483894816.0, + "logits/rejected": 498835488.0, + "logps/chosen": -362.10430908203125, + "logps/rejected": -722.572021484375, + "loss": 0.0253, + "rewards/chosen": 3.4993810653686523, + "rewards/margins": 13.894671440124512, + "rewards/rejected": -10.39529037475586, + "step": 10874 + }, + { + "epoch": 0.9936043855641845, + "grad_norm": 1.1796875, + "kl": 0.0, + "learning_rate": 1.0420737931543524e-09, + "logits/chosen": 391563456.0, + "logits/rejected": 353714816.0, + "logps/chosen": -227.31277465820312, + "logps/rejected": -525.8721923828125, + "loss": 0.0125, + "rewards/chosen": 4.150826454162598, + "rewards/margins": 13.693831443786621, + "rewards/rejected": -9.543004989624023, + "step": 10875 + }, + { + "epoch": 0.9936957514846962, + "grad_norm": 5.6875, + "kl": 0.0, + "learning_rate": 1.0129273056380673e-09, + "logits/chosen": 919877248.0, + "logits/rejected": 604477988.5714285, + "logps/chosen": -718.2066650390625, + "logps/rejected": -454.72970145089283, + "loss": 0.0159, + "rewards/chosen": 1.9404724836349487, + "rewards/margins": 10.548172286578588, + "rewards/rejected": -8.607699802943639, + "step": 10876 + }, + { + "epoch": 0.9937871174052079, + "grad_norm": 1.96875, + "kl": 0.0, + "learning_rate": 9.841941880361917e-10, + "logits/chosen": 566282240.0, + "logits/rejected": 375683584.0, + "logps/chosen": -317.67738850911456, + "logps/rejected": -449.23896484375, + "loss": 0.0084, + "rewards/chosen": 4.233460108439128, + "rewards/margins": 13.785613123575846, + "rewards/rejected": -9.552153015136719, + "step": 10877 + }, + { + "epoch": 0.9938784833257195, + "grad_norm": 0.83984375, + "kl": 0.0, + "learning_rate": 9.558744427240475e-10, + "logits/chosen": 615202218.6666666, + "logits/rejected": 565039872.0, + "logps/chosen": -367.287353515625, + "logps/rejected": -313.23056640625, + "loss": 0.1092, + "rewards/chosen": 4.683500289916992, + "rewards/margins": 11.438267135620118, + "rewards/rejected": -6.754766845703125, + "step": 10878 + }, + { + "epoch": 0.9939698492462311, + "grad_norm": 3.265625, + "kl": 0.0, + "learning_rate": 9.279680720447603e-10, + "logits/chosen": 594461696.0, + "logits/rejected": 315596434.28571427, + "logps/chosen": -336.1072998046875, + "logps/rejected": -517.2253766741071, + "loss": 0.0128, + "rewards/chosen": 2.1856751441955566, + "rewards/margins": 11.80552966254098, + "rewards/rejected": -9.619854518345424, + "step": 10879 + }, + { + "epoch": 0.9940612151667428, + "grad_norm": 1.9140625, + "kl": 0.0, + "learning_rate": 9.004750783042637e-10, + "logits/chosen": 494518272.0, + "logits/rejected": 873438976.0, + "logps/chosen": -271.40765380859375, + "logps/rejected": -584.169677734375, + "loss": 0.1319, + "rewards/chosen": 2.3067750930786133, + "rewards/margins": 11.82005786895752, + "rewards/rejected": -9.513282775878906, + "step": 10880 + }, + { + "epoch": 0.9941525810872545, + "grad_norm": 2.515625, + "kl": 0.0, + "learning_rate": 8.733954637774045e-10, + "logits/chosen": 709518976.0, + "logits/rejected": 1174155008.0, + "logps/chosen": -498.2363586425781, + "logps/rejected": -574.2567749023438, + "loss": 0.0119, + "rewards/chosen": 4.227684020996094, + "rewards/margins": 13.004023551940918, + "rewards/rejected": -8.776339530944824, + "step": 10881 + }, + { + "epoch": 0.9942439470077661, + "grad_norm": 1.1484375, + "kl": 0.0, + "learning_rate": 8.467292307023923e-10, + "logits/chosen": 484521248.0, + "logits/rejected": 466102592.0, + "logps/chosen": -517.9752197265625, + "logps/rejected": -260.690673828125, + "loss": 0.0066, + "rewards/chosen": 4.512170314788818, + "rewards/margins": 11.266727924346924, + "rewards/rejected": -6.7545576095581055, + "step": 10882 + }, + { + "epoch": 0.9943353129282777, + "grad_norm": 72.0, + "kl": 0.0, + "learning_rate": 8.204763812852401e-10, + "logits/chosen": 521397811.2, + "logits/rejected": 580337493.3333334, + "logps/chosen": -270.969580078125, + "logps/rejected": -881.2543131510416, + "loss": 0.1028, + "rewards/chosen": 2.658442497253418, + "rewards/margins": 13.597076733907064, + "rewards/rejected": -10.938634236653646, + "step": 10883 + }, + { + "epoch": 0.9944266788487894, + "grad_norm": 1.6796875, + "kl": 0.0, + "learning_rate": 7.94636917695879e-10, + "logits/chosen": 602415232.0, + "logits/rejected": 538275413.3333334, + "logps/chosen": -310.864501953125, + "logps/rejected": -445.2305908203125, + "loss": 0.0094, + "rewards/chosen": 3.404501438140869, + "rewards/margins": 12.169158140818277, + "rewards/rejected": -8.764656702677408, + "step": 10884 + }, + { + "epoch": 0.9945180447693011, + "grad_norm": 0.55859375, + "kl": 0.0, + "learning_rate": 7.692108420714883e-10, + "logits/chosen": 418583082.6666667, + "logits/rejected": 467522713.6, + "logps/chosen": -384.9552001953125, + "logps/rejected": -450.759814453125, + "loss": 0.0027, + "rewards/chosen": 5.0715891520182295, + "rewards/margins": 14.429279581705728, + "rewards/rejected": -9.3576904296875, + "step": 10885 + }, + { + "epoch": 0.9946094106898127, + "grad_norm": 2.53125, + "kl": 0.0, + "learning_rate": 7.441981565142753e-10, + "logits/chosen": 731827541.3333334, + "logits/rejected": 1562078336.0, + "logps/chosen": -341.3983561197917, + "logps/rejected": -594.31103515625, + "loss": 0.0214, + "rewards/chosen": 3.6283222834269204, + "rewards/margins": 15.09191862742106, + "rewards/rejected": -11.46359634399414, + "step": 10886 + }, + { + "epoch": 0.9947007766103243, + "grad_norm": 3.671875, + "kl": 0.0, + "learning_rate": 7.195988630925854e-10, + "logits/chosen": 625082163.2, + "logits/rejected": 473890986.6666667, + "logps/chosen": -459.331640625, + "logps/rejected": -625.5091959635416, + "loss": 0.0196, + "rewards/chosen": 3.958761978149414, + "rewards/margins": 12.306862004597981, + "rewards/rejected": -8.348100026448568, + "step": 10887 + }, + { + "epoch": 0.994792142530836, + "grad_norm": 2.5625, + "kl": 0.0, + "learning_rate": 6.954129638409024e-10, + "logits/chosen": 679230668.8, + "logits/rejected": 480607232.0, + "logps/chosen": -273.019970703125, + "logps/rejected": -448.5733235677083, + "loss": 0.0161, + "rewards/chosen": 4.324533462524414, + "rewards/margins": 14.495829900105795, + "rewards/rejected": -10.17129643758138, + "step": 10888 + }, + { + "epoch": 0.9948835084513477, + "grad_norm": 2.125, + "kl": 0.0, + "learning_rate": 6.71640460759293e-10, + "logits/chosen": 814601536.0, + "logits/rejected": 408187904.0, + "logps/chosen": -362.6119384765625, + "logps/rejected": -522.6975708007812, + "loss": 0.0145, + "rewards/chosen": 3.8640565872192383, + "rewards/margins": 13.868284225463867, + "rewards/rejected": -10.004227638244629, + "step": 10889 + }, + { + "epoch": 0.9949748743718593, + "grad_norm": 0.5546875, + "kl": 0.0, + "learning_rate": 6.48281355812852e-10, + "logits/chosen": 645545152.0, + "logits/rejected": 794770285.7142857, + "logps/chosen": -535.9349365234375, + "logps/rejected": -627.5198102678571, + "loss": 0.002, + "rewards/chosen": 4.175140380859375, + "rewards/margins": 14.096629551478795, + "rewards/rejected": -9.92148917061942, + "step": 10890 + }, + { + "epoch": 0.9950662402923709, + "grad_norm": 1.171875, + "kl": 0.0, + "learning_rate": 6.253356509333674e-10, + "logits/chosen": 274265440.0, + "logits/rejected": 420331776.0, + "logps/chosen": -234.18264770507812, + "logps/rejected": -459.571044921875, + "loss": 0.0047, + "rewards/chosen": 4.364822864532471, + "rewards/margins": 13.439363956451416, + "rewards/rejected": -9.074541091918945, + "step": 10891 + }, + { + "epoch": 0.9951576062128826, + "grad_norm": 4.25, + "kl": 0.0, + "learning_rate": 6.028033480187656e-10, + "logits/chosen": 360720192.0, + "logits/rejected": 312540096.0, + "logps/chosen": -276.44464111328125, + "logps/rejected": -463.60968017578125, + "loss": 0.0196, + "rewards/chosen": 4.023262023925781, + "rewards/margins": 15.449639320373535, + "rewards/rejected": -11.426377296447754, + "step": 10892 + }, + { + "epoch": 0.9952489721333942, + "grad_norm": 0.75390625, + "kl": 0.0, + "learning_rate": 5.806844489320007e-10, + "logits/chosen": 765195776.0, + "logits/rejected": 850948437.3333334, + "logps/chosen": -342.7392883300781, + "logps/rejected": -687.0228678385416, + "loss": 0.0036, + "rewards/chosen": 4.3491010665893555, + "rewards/margins": 15.150383313496908, + "rewards/rejected": -10.801282246907553, + "step": 10893 + }, + { + "epoch": 0.9953403380539059, + "grad_norm": 5.03125, + "kl": 0.0, + "learning_rate": 5.5897895550161e-10, + "logits/chosen": 621828710.4, + "logits/rejected": 729748394.6666666, + "logps/chosen": -311.9311767578125, + "logps/rejected": -718.8854166666666, + "loss": 0.0254, + "rewards/chosen": 3.507537078857422, + "rewards/margins": 16.73739547729492, + "rewards/rejected": -13.2298583984375, + "step": 10894 + }, + { + "epoch": 0.9954317039744175, + "grad_norm": 49.25, + "kl": 0.0, + "learning_rate": 5.376868695228244e-10, + "logits/chosen": 520157132.8, + "logits/rejected": 382695424.0, + "logps/chosen": -306.1609619140625, + "logps/rejected": -494.1561279296875, + "loss": 0.1732, + "rewards/chosen": 1.7137994766235352, + "rewards/margins": 9.774823188781738, + "rewards/rejected": -8.061023712158203, + "step": 10895 + }, + { + "epoch": 0.9955230698949292, + "grad_norm": 7.96875, + "kl": 0.0, + "learning_rate": 5.168081927564572e-10, + "logits/chosen": 456334016.0, + "logits/rejected": 535724160.0, + "logps/chosen": -336.560302734375, + "logps/rejected": -675.5751342773438, + "loss": 0.0224, + "rewards/chosen": 3.44998836517334, + "rewards/margins": 14.414125442504883, + "rewards/rejected": -10.964137077331543, + "step": 10896 + }, + { + "epoch": 0.9956144358154408, + "grad_norm": 7.09375, + "kl": 0.0, + "learning_rate": 4.963429269289055e-10, + "logits/chosen": 484004096.0, + "logits/rejected": 367732138.6666667, + "logps/chosen": -290.1740478515625, + "logps/rejected": -380.6307779947917, + "loss": 0.0326, + "rewards/chosen": 3.6230628967285154, + "rewards/margins": 10.028457768758138, + "rewards/rejected": -6.405394872029622, + "step": 10897 + }, + { + "epoch": 0.9957058017359525, + "grad_norm": 3.625, + "kl": 0.0, + "learning_rate": 4.762910737321491e-10, + "logits/chosen": 571935360.0, + "logits/rejected": 1151928192.0, + "logps/chosen": -382.4433186848958, + "logps/rejected": -387.3099060058594, + "loss": 0.0278, + "rewards/chosen": 3.3652826944986978, + "rewards/margins": 11.326874415079752, + "rewards/rejected": -7.961591720581055, + "step": 10898 + }, + { + "epoch": 0.9957971676564641, + "grad_norm": 3.75, + "kl": 0.0, + "learning_rate": 4.566526348243061e-10, + "logits/chosen": 593996953.6, + "logits/rejected": 559256490.6666666, + "logps/chosen": -263.359228515625, + "logps/rejected": -574.4437255859375, + "loss": 0.0298, + "rewards/chosen": 3.06629638671875, + "rewards/margins": 13.359061686197915, + "rewards/rejected": -10.292765299479166, + "step": 10899 + }, + { + "epoch": 0.9958885335769758, + "grad_norm": 2.46875, + "kl": 0.0, + "learning_rate": 4.374276118301879e-10, + "logits/chosen": 667846400.0, + "logits/rejected": 911058124.8, + "logps/chosen": -205.5610148111979, + "logps/rejected": -630.49306640625, + "loss": 0.0161, + "rewards/chosen": 3.2324609756469727, + "rewards/margins": 14.088656044006347, + "rewards/rejected": -10.856195068359375, + "step": 10900 + }, + { + "epoch": 0.9959798994974874, + "grad_norm": 1.703125, + "kl": 0.0, + "learning_rate": 4.186160063379685e-10, + "logits/chosen": 1615566720.0, + "logits/rejected": 510655914.6666667, + "logps/chosen": -344.6315002441406, + "logps/rejected": -360.1452229817708, + "loss": 0.0168, + "rewards/chosen": 3.9575772285461426, + "rewards/margins": 12.961551507314047, + "rewards/rejected": -9.003974278767904, + "step": 10901 + }, + { + "epoch": 0.9960712654179991, + "grad_norm": 3.875, + "kl": 0.0, + "learning_rate": 4.002178199047357e-10, + "logits/chosen": 878619477.3333334, + "logits/rejected": 699988582.4, + "logps/chosen": -167.7545166015625, + "logps/rejected": -635.26884765625, + "loss": 0.1231, + "rewards/chosen": 2.040088971455892, + "rewards/margins": 10.768863995869955, + "rewards/rejected": -8.728775024414062, + "step": 10902 + }, + { + "epoch": 0.9961626313385107, + "grad_norm": 12.3125, + "kl": 0.0, + "learning_rate": 3.8223305405094e-10, + "logits/chosen": 494443690.6666667, + "logits/rejected": 396809408.0, + "logps/chosen": -332.5928548177083, + "logps/rejected": -419.400146484375, + "loss": 0.036, + "rewards/chosen": 3.327702204386393, + "rewards/margins": 11.755840937296549, + "rewards/rejected": -8.428138732910156, + "step": 10903 + }, + { + "epoch": 0.9962539972590224, + "grad_norm": 3.171875, + "kl": 0.0, + "learning_rate": 3.646617102637251e-10, + "logits/chosen": 566022784.0, + "logits/rejected": 472444800.0, + "logps/chosen": -319.5166015625, + "logps/rejected": -419.3880615234375, + "loss": 0.0212, + "rewards/chosen": 3.2695679664611816, + "rewards/margins": 11.876014232635498, + "rewards/rejected": -8.606446266174316, + "step": 10904 + }, + { + "epoch": 0.996345363179534, + "grad_norm": 2.875, + "kl": 0.0, + "learning_rate": 3.47503789996928e-10, + "logits/chosen": 892873523.2, + "logits/rejected": 588889557.3333334, + "logps/chosen": -431.96533203125, + "logps/rejected": -467.9733479817708, + "loss": 0.0146, + "rewards/chosen": 4.057660675048828, + "rewards/margins": 13.86898193359375, + "rewards/rejected": -9.811321258544922, + "step": 10905 + }, + { + "epoch": 0.9964367291000457, + "grad_norm": 2.109375, + "kl": 0.0, + "learning_rate": 3.307592946683036e-10, + "logits/chosen": 1111860633.6, + "logits/rejected": 510245973.3333333, + "logps/chosen": -331.3250244140625, + "logps/rejected": -461.5904541015625, + "loss": 0.0136, + "rewards/chosen": 4.0792236328125, + "rewards/margins": 15.420888264973959, + "rewards/rejected": -11.341664632161459, + "step": 10906 + }, + { + "epoch": 0.9965280950205573, + "grad_norm": 2.4375, + "kl": 0.0, + "learning_rate": 3.14428225662855e-10, + "logits/chosen": 489878698.6666667, + "logits/rejected": 535320000.0, + "logps/chosen": -252.18916829427084, + "logps/rejected": -423.39306640625, + "loss": 0.0181, + "rewards/chosen": 4.3530534108479815, + "rewards/margins": 14.669491132100422, + "rewards/rejected": -10.316437721252441, + "step": 10907 + }, + { + "epoch": 0.996619460941069, + "grad_norm": 1.75, + "kl": 0.0, + "learning_rate": 2.9851058433116865e-10, + "logits/chosen": 376403712.0, + "logits/rejected": 200937664.0, + "logps/chosen": -369.4158935546875, + "logps/rejected": -308.525390625, + "loss": 0.0089, + "rewards/chosen": 4.458553314208984, + "rewards/margins": 12.801469802856445, + "rewards/rejected": -8.342916488647461, + "step": 10908 + }, + { + "epoch": 0.9967108268615806, + "grad_norm": 1.5703125, + "kl": 0.0, + "learning_rate": 2.830063719899689e-10, + "logits/chosen": 789027264.0, + "logits/rejected": 987476821.3333334, + "logps/chosen": -399.4722900390625, + "logps/rejected": -591.3628336588541, + "loss": 0.0062, + "rewards/chosen": 3.69970703125, + "rewards/margins": 14.103943506876627, + "rewards/rejected": -10.404236475626627, + "step": 10909 + }, + { + "epoch": 0.9968021927820923, + "grad_norm": 2.09375, + "kl": 0.0, + "learning_rate": 2.679155899198982e-10, + "logits/chosen": 588526976.0, + "logits/rejected": 613053184.0, + "logps/chosen": -368.74554443359375, + "logps/rejected": -620.4444580078125, + "loss": 0.0174, + "rewards/chosen": 3.6956276893615723, + "rewards/margins": 11.716599941253662, + "rewards/rejected": -8.02097225189209, + "step": 10910 + }, + { + "epoch": 0.9968935587026039, + "grad_norm": 4.15625, + "kl": 0.0, + "learning_rate": 2.5323823936995727e-10, + "logits/chosen": 286717056.0, + "logits/rejected": 470167936.0, + "logps/chosen": -178.70742797851562, + "logps/rejected": -442.4405517578125, + "loss": 0.0298, + "rewards/chosen": 3.7074050903320312, + "rewards/margins": 12.13710880279541, + "rewards/rejected": -8.429703712463379, + "step": 10911 + }, + { + "epoch": 0.9969849246231156, + "grad_norm": 1.8203125, + "kl": 0.0, + "learning_rate": 2.3897432155362e-10, + "logits/chosen": 584866368.0, + "logits/rejected": 428592896.0, + "logps/chosen": -504.469482421875, + "logps/rejected": -375.2871398925781, + "loss": 0.0108, + "rewards/chosen": 3.8641014099121094, + "rewards/margins": 11.959134101867676, + "rewards/rejected": -8.095032691955566, + "step": 10912 + }, + { + "epoch": 0.9970762905436272, + "grad_norm": 4.96875, + "kl": 0.0, + "learning_rate": 2.2512383765049828e-10, + "logits/chosen": 1170719744.0, + "logits/rejected": 451140096.0, + "logps/chosen": -289.88031005859375, + "logps/rejected": -368.28204345703125, + "loss": 0.0256, + "rewards/chosen": 3.2477731704711914, + "rewards/margins": 13.141770362854004, + "rewards/rejected": -9.893997192382812, + "step": 10913 + }, + { + "epoch": 0.9971676564641389, + "grad_norm": 1.21875, + "kl": 0.0, + "learning_rate": 2.1168678880523207e-10, + "logits/chosen": 369235072.0, + "logits/rejected": 449956736.0, + "logps/chosen": -377.7593078613281, + "logps/rejected": -412.13446044921875, + "loss": 0.0071, + "rewards/chosen": 4.430161952972412, + "rewards/margins": 14.09308671951294, + "rewards/rejected": -9.662924766540527, + "step": 10914 + }, + { + "epoch": 0.9972590223846505, + "grad_norm": 3.625, + "kl": 0.0, + "learning_rate": 1.9866317612970977e-10, + "logits/chosen": 593191168.0, + "logits/rejected": 936009830.4, + "logps/chosen": -402.7305094401042, + "logps/rejected": -383.116796875, + "loss": 0.0212, + "rewards/chosen": 2.8700958887736, + "rewards/margins": 11.899664370218913, + "rewards/rejected": -9.029568481445313, + "step": 10915 + }, + { + "epoch": 0.9973503883051622, + "grad_norm": 1.46875, + "kl": 0.0, + "learning_rate": 1.8605300070029252e-10, + "logits/chosen": 486339669.3333333, + "logits/rejected": 452381696.0, + "logps/chosen": -273.86810302734375, + "logps/rejected": -305.797900390625, + "loss": 0.0073, + "rewards/chosen": 4.305181503295898, + "rewards/margins": 12.662987899780273, + "rewards/rejected": -8.357806396484374, + "step": 10916 + }, + { + "epoch": 0.9974417542256738, + "grad_norm": 2.609375, + "kl": 0.0, + "learning_rate": 1.7385626356003493e-10, + "logits/chosen": 572194240.0, + "logits/rejected": 285032000.0, + "logps/chosen": -225.62298583984375, + "logps/rejected": -466.95416259765625, + "loss": 0.0213, + "rewards/chosen": 3.212251663208008, + "rewards/margins": 13.465343475341797, + "rewards/rejected": -10.253091812133789, + "step": 10917 + }, + { + "epoch": 0.9975331201461854, + "grad_norm": 2.296875, + "kl": 0.0, + "learning_rate": 1.620729657175746e-10, + "logits/chosen": 450407744.0, + "logits/rejected": 699631104.0, + "logps/chosen": -237.34400939941406, + "logps/rejected": -468.7882080078125, + "loss": 0.0166, + "rewards/chosen": 3.586503744125366, + "rewards/margins": 11.60025954246521, + "rewards/rejected": -8.013755798339844, + "step": 10918 + }, + { + "epoch": 0.9976244860666971, + "grad_norm": 1.4765625, + "kl": 0.0, + "learning_rate": 1.5070310814713218e-10, + "logits/chosen": 403706154.6666667, + "logits/rejected": 515594444.8, + "logps/chosen": -278.0518798828125, + "logps/rejected": -707.7181640625, + "loss": 0.0099, + "rewards/chosen": 3.7648487091064453, + "rewards/margins": 14.694511795043946, + "rewards/rejected": -10.9296630859375, + "step": 10919 + }, + { + "epoch": 0.9977158519872088, + "grad_norm": 2.234375, + "kl": 0.0, + "learning_rate": 1.3974669178906663e-10, + "logits/chosen": 470475328.0, + "logits/rejected": 548866517.3333334, + "logps/chosen": -351.1092224121094, + "logps/rejected": -491.5936686197917, + "loss": 0.0078, + "rewards/chosen": 4.12393045425415, + "rewards/margins": 13.253190835316977, + "rewards/rejected": -9.129260381062826, + "step": 10920 + }, + { + "epoch": 0.9978072179077204, + "grad_norm": 3.90625, + "kl": 0.0, + "learning_rate": 1.2920371754931994e-10, + "logits/chosen": 472489472.0, + "logits/rejected": 803832320.0, + "logps/chosen": -226.829052734375, + "logps/rejected": -347.4892985026042, + "loss": 0.029, + "rewards/chosen": 3.7077903747558594, + "rewards/margins": 10.808071772257488, + "rewards/rejected": -7.100281397501628, + "step": 10921 + }, + { + "epoch": 0.997898583828232, + "grad_norm": 3.09375, + "kl": 0.0, + "learning_rate": 1.1907418629941713e-10, + "logits/chosen": 661637529.6, + "logits/rejected": 426109525.3333333, + "logps/chosen": -331.244482421875, + "logps/rejected": -423.5430908203125, + "loss": 0.0142, + "rewards/chosen": 4.287435913085938, + "rewards/margins": 13.34281260172526, + "rewards/rejected": -9.055376688639322, + "step": 10922 + }, + { + "epoch": 0.9979899497487437, + "grad_norm": 1.5859375, + "kl": 0.0, + "learning_rate": 1.0935809887702154e-10, + "logits/chosen": 1089144149.3333333, + "logits/rejected": 920696217.6, + "logps/chosen": -480.381103515625, + "logps/rejected": -553.978564453125, + "loss": 0.0079, + "rewards/chosen": 4.127374013264974, + "rewards/margins": 14.081086476643879, + "rewards/rejected": -9.953712463378906, + "step": 10923 + }, + { + "epoch": 0.9980813156692554, + "grad_norm": 2.78125, + "kl": 0.0, + "learning_rate": 1.0005545608537948e-10, + "logits/chosen": 526293862.4, + "logits/rejected": 236825984.0, + "logps/chosen": -342.70400390625, + "logps/rejected": -200.5101318359375, + "loss": 0.0199, + "rewards/chosen": 3.7227622985839846, + "rewards/margins": 13.219451141357421, + "rewards/rejected": -9.496688842773438, + "step": 10924 + }, + { + "epoch": 0.998172681589767, + "grad_norm": 2.453125, + "kl": 0.0, + "learning_rate": 9.116625869443063e-11, + "logits/chosen": 629030741.3333334, + "logits/rejected": 581305856.0, + "logps/chosen": -367.635986328125, + "logps/rejected": -528.449658203125, + "loss": 0.0108, + "rewards/chosen": 3.982672373453776, + "rewards/margins": 13.448504130045572, + "rewards/rejected": -9.465831756591797, + "step": 10925 + }, + { + "epoch": 0.9982640475102786, + "grad_norm": 0.83203125, + "kl": 0.0, + "learning_rate": 8.269050743914264e-11, + "logits/chosen": 843225216.0, + "logits/rejected": 466491776.0, + "logps/chosen": -260.8089599609375, + "logps/rejected": -454.95916748046875, + "loss": 0.0053, + "rewards/chosen": 4.6160664558410645, + "rewards/margins": 15.598762035369873, + "rewards/rejected": -10.982695579528809, + "step": 10926 + }, + { + "epoch": 0.9983554134307903, + "grad_norm": 2.3125, + "kl": 0.0, + "learning_rate": 7.462820301951113e-11, + "logits/chosen": 575411904.0, + "logits/rejected": 697800960.0, + "logps/chosen": -392.75006103515625, + "logps/rejected": -522.2471923828125, + "loss": 0.0088, + "rewards/chosen": 4.544703483581543, + "rewards/margins": 12.723122596740723, + "rewards/rejected": -8.17841911315918, + "step": 10927 + }, + { + "epoch": 0.998446779351302, + "grad_norm": 25.375, + "kl": 0.0, + "learning_rate": 6.697934610333524e-11, + "logits/chosen": 594869906.2857143, + "logits/rejected": 778155008.0, + "logps/chosen": -311.36390904017856, + "logps/rejected": -779.0820922851562, + "loss": 0.1114, + "rewards/chosen": 3.9288929530552457, + "rewards/margins": 15.947576386587961, + "rewards/rejected": -12.018683433532715, + "step": 10928 + }, + { + "epoch": 0.9985381452718136, + "grad_norm": 3.0625, + "kl": 0.0, + "learning_rate": 5.974393732233186e-11, + "logits/chosen": 1806584832.0, + "logits/rejected": 954899200.0, + "logps/chosen": -199.41842651367188, + "logps/rejected": -506.3128255208333, + "loss": 0.0102, + "rewards/chosen": 3.2016730308532715, + "rewards/margins": 12.207124869028727, + "rewards/rejected": -9.005451838175455, + "step": 10929 + }, + { + "epoch": 0.9986295111923252, + "grad_norm": 39.25, + "kl": 0.0, + "learning_rate": 5.2921977274911215e-11, + "logits/chosen": 410700006.4, + "logits/rejected": 295062250.6666667, + "logps/chosen": -230.8962890625, + "logps/rejected": -357.3699951171875, + "loss": 0.1733, + "rewards/chosen": 2.096054458618164, + "rewards/margins": 11.466287358601889, + "rewards/rejected": -9.370232899983725, + "step": 10930 + }, + { + "epoch": 0.9987208771128369, + "grad_norm": 1.09375, + "kl": 0.0, + "learning_rate": 4.651346652562172e-11, + "logits/chosen": 479112806.4, + "logits/rejected": 382351658.6666667, + "logps/chosen": -382.702197265625, + "logps/rejected": -433.9256591796875, + "loss": 0.0068, + "rewards/chosen": 4.784206390380859, + "rewards/margins": 12.286691665649414, + "rewards/rejected": -7.502485275268555, + "step": 10931 + }, + { + "epoch": 0.9988122430333486, + "grad_norm": 1.375, + "kl": 0.0, + "learning_rate": 4.051840560348464e-11, + "logits/chosen": 379157184.0, + "logits/rejected": 391579456.0, + "logps/chosen": -288.40765380859375, + "logps/rejected": -615.6447143554688, + "loss": 0.0086, + "rewards/chosen": 4.111031532287598, + "rewards/margins": 14.736379623413086, + "rewards/rejected": -10.625348091125488, + "step": 10932 + }, + { + "epoch": 0.9989036089538602, + "grad_norm": 25.25, + "kl": 0.0, + "learning_rate": 3.4936795005324764e-11, + "logits/chosen": 502241792.0, + "logits/rejected": 367664281.6, + "logps/chosen": -409.0040690104167, + "logps/rejected": -366.44951171875, + "loss": 0.0641, + "rewards/chosen": 2.8022419611612954, + "rewards/margins": 10.124339739481607, + "rewards/rejected": -7.3220977783203125, + "step": 10933 + }, + { + "epoch": 0.9989949748743718, + "grad_norm": 0.73046875, + "kl": 0.0, + "learning_rate": 2.9768635191884665e-11, + "logits/chosen": 332353024.0, + "logits/rejected": 390917312.0, + "logps/chosen": -313.70001220703125, + "logps/rejected": -458.54290771484375, + "loss": 0.0043, + "rewards/chosen": 5.55549955368042, + "rewards/margins": 14.579338550567627, + "rewards/rejected": -9.023838996887207, + "step": 10934 + }, + { + "epoch": 0.9990863407948835, + "grad_norm": 0.443359375, + "kl": 0.0, + "learning_rate": 2.5013926591155314e-11, + "logits/chosen": 839007744.0, + "logits/rejected": 1011104256.0, + "logps/chosen": -413.0757649739583, + "logps/rejected": -594.038330078125, + "loss": 0.0024, + "rewards/chosen": 5.347751617431641, + "rewards/margins": 15.433808135986329, + "rewards/rejected": -10.086056518554688, + "step": 10935 + }, + { + "epoch": 0.9991777067153952, + "grad_norm": 3.484375, + "kl": 0.0, + "learning_rate": 2.0672669595600547e-11, + "logits/chosen": 440365866.6666667, + "logits/rejected": 295908544.0, + "logps/chosen": -235.79193115234375, + "logps/rejected": -244.31149291992188, + "loss": 0.036, + "rewards/chosen": 3.4945319493611655, + "rewards/margins": 9.56579860051473, + "rewards/rejected": -6.0712666511535645, + "step": 10936 + }, + { + "epoch": 0.9992690726359068, + "grad_norm": 35.25, + "kl": 0.0, + "learning_rate": 1.6744864564932627e-11, + "logits/chosen": 600781312.0, + "logits/rejected": 457773098.6666667, + "logps/chosen": -296.1209411621094, + "logps/rejected": -515.2039388020834, + "loss": 0.0319, + "rewards/chosen": 4.422308444976807, + "rewards/margins": 13.400179386138916, + "rewards/rejected": -8.97787094116211, + "step": 10937 + }, + { + "epoch": 0.9993604385564184, + "grad_norm": 3.6875, + "kl": 0.0, + "learning_rate": 1.3230511823336679e-11, + "logits/chosen": 598039552.0, + "logits/rejected": 562866944.0, + "logps/chosen": -223.8560791015625, + "logps/rejected": -411.82244873046875, + "loss": 0.0347, + "rewards/chosen": 3.2492265701293945, + "rewards/margins": 10.78911018371582, + "rewards/rejected": -7.539883613586426, + "step": 10938 + }, + { + "epoch": 0.99945180447693, + "grad_norm": 2.0, + "kl": 0.0, + "learning_rate": 1.012961166169113e-11, + "logits/chosen": 538129408.0, + "logits/rejected": 782954880.0, + "logps/chosen": -336.8817443847656, + "logps/rejected": -535.1734619140625, + "loss": 0.0113, + "rewards/chosen": 4.052163600921631, + "rewards/margins": 13.88434362411499, + "rewards/rejected": -9.83218002319336, + "step": 10939 + }, + { + "epoch": 0.9995431703974418, + "grad_norm": 0.875, + "kl": 0.0, + "learning_rate": 7.442164336457502e-12, + "logits/chosen": 521691136.0, + "logits/rejected": 815112874.6666666, + "logps/chosen": -337.7614440917969, + "logps/rejected": -403.5771484375, + "loss": 0.0036, + "rewards/chosen": 4.565997123718262, + "rewards/margins": 13.738048871358236, + "rewards/rejected": -9.172051747639975, + "step": 10940 + }, + { + "epoch": 0.9996345363179534, + "grad_norm": 4.625, + "kl": 0.0, + "learning_rate": 5.1681700696804e-12, + "logits/chosen": 527464192.0, + "logits/rejected": 509284096.0, + "logps/chosen": -331.7568359375, + "logps/rejected": -402.6046447753906, + "loss": 0.0448, + "rewards/chosen": 3.1918986638387046, + "rewards/margins": 14.107736905415853, + "rewards/rejected": -10.915838241577148, + "step": 10941 + }, + { + "epoch": 0.999725902238465, + "grad_norm": 1.703125, + "kl": 0.0, + "learning_rate": 3.307629050097738e-12, + "logits/chosen": 1241676032.0, + "logits/rejected": 896666112.0, + "logps/chosen": -445.3505554199219, + "logps/rejected": -488.1236572265625, + "loss": 0.0107, + "rewards/chosen": 4.095582962036133, + "rewards/margins": 12.0648775100708, + "rewards/rejected": -7.969294548034668, + "step": 10942 + }, + { + "epoch": 0.9998172681589766, + "grad_norm": 1.9140625, + "kl": 0.0, + "learning_rate": 1.8605414303651816e-12, + "logits/chosen": 580590336.0, + "logits/rejected": 450417248.0, + "logps/chosen": -316.13226318359375, + "logps/rejected": -474.0411376953125, + "loss": 0.0146, + "rewards/chosen": 3.5688390731811523, + "rewards/margins": 14.267080307006836, + "rewards/rejected": -10.698241233825684, + "step": 10943 + }, + { + "epoch": 0.9999086340794884, + "grad_norm": 1.03125, + "kl": 0.0, + "learning_rate": 8.269073309419285e-13, + "logits/chosen": 605300906.6666666, + "logits/rejected": 1046651699.2, + "logps/chosen": -358.157958984375, + "logps/rejected": -922.93017578125, + "loss": 0.0053, + "rewards/chosen": 4.646478017171224, + "rewards/margins": 16.570284779866537, + "rewards/rejected": -11.923806762695312, + "step": 10944 + }, + { + "epoch": 1.0, + "grad_norm": 2.65625, + "kl": 0.0, + "learning_rate": 2.0672683676004058e-13, + "logits/chosen": 461755989.3333333, + "logits/rejected": 626413440.0, + "logps/chosen": -239.18096923828125, + "logps/rejected": -434.89349365234375, + "loss": 0.0202, + "rewards/chosen": 4.508140563964844, + "rewards/margins": 13.385554313659668, + "rewards/rejected": -8.877413749694824, + "step": 10945 + } + ], + "logging_steps": 1, + "max_steps": 10945, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}