{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9996889580093312, "eval_steps": 500, "global_step": 1607, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0006220839813374805, "grad_norm": 17.854297637939453, "learning_rate": 3.1055900621118015e-08, "logits/chosen": -1.1386702060699463, "logits/rejected": 3.748429775238037, "logps/chosen": -374.65234375, "logps/rejected": -596.816650390625, "loss": 1.9772, "rewards/accuracies": 0.375, "rewards/chosen": 2.9525623321533203, "rewards/margins": -0.7630168199539185, "rewards/rejected": 3.715578556060791, "step": 1 }, { "epoch": 0.001244167962674961, "grad_norm": 16.559560775756836, "learning_rate": 6.211180124223603e-08, "logits/chosen": 0.7572054266929626, "logits/rejected": 3.991549491882324, "logps/chosen": -356.044677734375, "logps/rejected": -567.2637939453125, "loss": 2.6049, "rewards/accuracies": 0.25, "rewards/chosen": 3.5207176208496094, "rewards/margins": -2.0924265384674072, "rewards/rejected": 5.6131439208984375, "step": 2 }, { "epoch": 0.0018662519440124418, "grad_norm": 15.50611400604248, "learning_rate": 9.316770186335405e-08, "logits/chosen": 0.8064584732055664, "logits/rejected": 2.735602855682373, "logps/chosen": -463.931640625, "logps/rejected": -522.663330078125, "loss": 1.5827, "rewards/accuracies": 0.5, "rewards/chosen": 4.610626220703125, "rewards/margins": -0.5110129117965698, "rewards/rejected": 5.121639251708984, "step": 3 }, { "epoch": 0.002488335925349922, "grad_norm": 15.971909523010254, "learning_rate": 1.2422360248447206e-07, "logits/chosen": -1.3895589113235474, "logits/rejected": 2.5874433517456055, "logps/chosen": -319.51019287109375, "logps/rejected": -532.197265625, "loss": 3.3314, "rewards/accuracies": 0.25, "rewards/chosen": 1.7460381984710693, "rewards/margins": -2.850506544113159, "rewards/rejected": 4.5965447425842285, "step": 4 }, { "epoch": 0.003110419906687403, "grad_norm": 13.186685562133789, "learning_rate": 1.5527950310559006e-07, "logits/chosen": 2.863335132598877, "logits/rejected": 3.063835859298706, "logps/chosen": -555.8003540039062, "logps/rejected": -562.5652465820312, "loss": 1.019, "rewards/accuracies": 0.5, "rewards/chosen": 4.623453140258789, "rewards/margins": 0.44969022274017334, "rewards/rejected": 4.173763275146484, "step": 5 }, { "epoch": 0.0037325038880248835, "grad_norm": 12.212003707885742, "learning_rate": 1.863354037267081e-07, "logits/chosen": -0.08458942174911499, "logits/rejected": 1.3292889595031738, "logps/chosen": -388.036376953125, "logps/rejected": -471.9739990234375, "loss": 0.9845, "rewards/accuracies": 0.375, "rewards/chosen": 3.888078451156616, "rewards/margins": -0.09828263521194458, "rewards/rejected": 3.986361503601074, "step": 6 }, { "epoch": 0.004354587869362364, "grad_norm": 16.828922271728516, "learning_rate": 2.173913043478261e-07, "logits/chosen": -1.60880708694458, "logits/rejected": 2.5346570014953613, "logps/chosen": -334.48052978515625, "logps/rejected": -651.7298583984375, "loss": 1.9882, "rewards/accuracies": 0.375, "rewards/chosen": 3.477569341659546, "rewards/margins": -0.5964668393135071, "rewards/rejected": 4.074036121368408, "step": 7 }, { "epoch": 0.004976671850699844, "grad_norm": 16.430715560913086, "learning_rate": 2.484472049689441e-07, "logits/chosen": 3.21341872215271, "logits/rejected": 3.1351938247680664, "logps/chosen": -516.5238647460938, "logps/rejected": -508.5950012207031, "loss": 1.7002, "rewards/accuracies": 0.25, "rewards/chosen": 3.4094736576080322, "rewards/margins": -1.2247729301452637, "rewards/rejected": 4.634246826171875, "step": 8 }, { "epoch": 0.005598755832037325, "grad_norm": 19.96339988708496, "learning_rate": 2.795031055900621e-07, "logits/chosen": -1.0087116956710815, "logits/rejected": 3.74387526512146, "logps/chosen": -309.85797119140625, "logps/rejected": -585.4224853515625, "loss": 2.907, "rewards/accuracies": 0.125, "rewards/chosen": 2.1543898582458496, "rewards/margins": -2.6042959690093994, "rewards/rejected": 4.758686542510986, "step": 9 }, { "epoch": 0.006220839813374806, "grad_norm": 15.722565650939941, "learning_rate": 3.1055900621118013e-07, "logits/chosen": -0.0343380868434906, "logits/rejected": 2.1398134231567383, "logps/chosen": -419.07177734375, "logps/rejected": -591.7615356445312, "loss": 2.2904, "rewards/accuracies": 0.25, "rewards/chosen": 3.0381715297698975, "rewards/margins": -1.6216652393341064, "rewards/rejected": 4.659837245941162, "step": 10 }, { "epoch": 0.006842923794712286, "grad_norm": 18.467546463012695, "learning_rate": 3.416149068322982e-07, "logits/chosen": 1.6262285709381104, "logits/rejected": 3.635580062866211, "logps/chosen": -471.30792236328125, "logps/rejected": -530.7711791992188, "loss": 1.5478, "rewards/accuracies": 0.375, "rewards/chosen": 3.3277509212493896, "rewards/margins": -0.7134326696395874, "rewards/rejected": 4.0411834716796875, "step": 11 }, { "epoch": 0.007465007776049767, "grad_norm": 15.044721603393555, "learning_rate": 3.726708074534162e-07, "logits/chosen": 1.9768428802490234, "logits/rejected": 3.80654239654541, "logps/chosen": -464.89288330078125, "logps/rejected": -578.0429077148438, "loss": 1.587, "rewards/accuracies": 0.375, "rewards/chosen": 4.207770824432373, "rewards/margins": -0.38001036643981934, "rewards/rejected": 4.5877814292907715, "step": 12 }, { "epoch": 0.008087091757387248, "grad_norm": 18.732120513916016, "learning_rate": 4.037267080745342e-07, "logits/chosen": 0.5272825956344604, "logits/rejected": 4.798799991607666, "logps/chosen": -431.8326416015625, "logps/rejected": -620.5444946289062, "loss": 2.6828, "rewards/accuracies": 0.25, "rewards/chosen": 4.246933460235596, "rewards/margins": -1.8957029581069946, "rewards/rejected": 6.142636299133301, "step": 13 }, { "epoch": 0.008709175738724729, "grad_norm": 21.405832290649414, "learning_rate": 4.347826086956522e-07, "logits/chosen": -2.1653008460998535, "logits/rejected": 2.2637858390808105, "logps/chosen": -285.3441162109375, "logps/rejected": -634.0145874023438, "loss": 3.0336, "rewards/accuracies": 0.25, "rewards/chosen": 1.6230576038360596, "rewards/margins": -2.416438102722168, "rewards/rejected": 4.039495944976807, "step": 14 }, { "epoch": 0.00933125972006221, "grad_norm": 16.034671783447266, "learning_rate": 4.658385093167702e-07, "logits/chosen": -0.58575439453125, "logits/rejected": 2.5629453659057617, "logps/chosen": -426.5467529296875, "logps/rejected": -548.9833374023438, "loss": 2.3319, "rewards/accuracies": 0.375, "rewards/chosen": 3.985501527786255, "rewards/margins": -1.0022425651550293, "rewards/rejected": 4.987744331359863, "step": 15 }, { "epoch": 0.009953343701399688, "grad_norm": 13.583885192871094, "learning_rate": 4.968944099378882e-07, "logits/chosen": -0.38820523023605347, "logits/rejected": 2.7751381397247314, "logps/chosen": -375.39630126953125, "logps/rejected": -531.240234375, "loss": 1.7631, "rewards/accuracies": 0.25, "rewards/chosen": 3.8793234825134277, "rewards/margins": -0.62348473072052, "rewards/rejected": 4.502808094024658, "step": 16 }, { "epoch": 0.010575427682737169, "grad_norm": 13.018609046936035, "learning_rate": 5.279503105590063e-07, "logits/chosen": -1.0484720468521118, "logits/rejected": -0.04507395625114441, "logps/chosen": -424.6312255859375, "logps/rejected": -485.1569519042969, "loss": 1.9984, "rewards/accuracies": 0.375, "rewards/chosen": 1.7034622430801392, "rewards/margins": -0.8553205728530884, "rewards/rejected": 2.5587825775146484, "step": 17 }, { "epoch": 0.01119751166407465, "grad_norm": 14.08587646484375, "learning_rate": 5.590062111801243e-07, "logits/chosen": 1.444870948791504, "logits/rejected": 2.4822463989257812, "logps/chosen": -474.28564453125, "logps/rejected": -600.8206176757812, "loss": 1.5272, "rewards/accuracies": 0.375, "rewards/chosen": 3.9322361946105957, "rewards/margins": -0.7070317268371582, "rewards/rejected": 4.639267921447754, "step": 18 }, { "epoch": 0.01181959564541213, "grad_norm": 16.62580108642578, "learning_rate": 5.900621118012423e-07, "logits/chosen": -0.8313573002815247, "logits/rejected": 3.1075665950775146, "logps/chosen": -424.1575012207031, "logps/rejected": -568.4471435546875, "loss": 1.5167, "rewards/accuracies": 0.125, "rewards/chosen": 2.8406872749328613, "rewards/margins": -1.174468994140625, "rewards/rejected": 4.015156269073486, "step": 19 }, { "epoch": 0.012441679626749611, "grad_norm": 14.394831657409668, "learning_rate": 6.211180124223603e-07, "logits/chosen": -0.5360622406005859, "logits/rejected": 2.539451837539673, "logps/chosen": -416.7165222167969, "logps/rejected": -544.8552856445312, "loss": 2.0467, "rewards/accuracies": 0.375, "rewards/chosen": 3.327951431274414, "rewards/margins": -1.1387128829956055, "rewards/rejected": 4.4666643142700195, "step": 20 }, { "epoch": 0.013063763608087092, "grad_norm": 12.934179306030273, "learning_rate": 6.521739130434783e-07, "logits/chosen": -0.038128167390823364, "logits/rejected": 2.756566047668457, "logps/chosen": -425.3929443359375, "logps/rejected": -589.950439453125, "loss": 1.8009, "rewards/accuracies": 0.5, "rewards/chosen": 2.2645719051361084, "rewards/margins": -0.6078423857688904, "rewards/rejected": 2.8724141120910645, "step": 21 }, { "epoch": 0.013685847589424573, "grad_norm": 14.910382270812988, "learning_rate": 6.832298136645964e-07, "logits/chosen": -1.0666587352752686, "logits/rejected": 2.34639310836792, "logps/chosen": -341.7932434082031, "logps/rejected": -554.8856811523438, "loss": 2.2826, "rewards/accuracies": 0.375, "rewards/chosen": 1.568525791168213, "rewards/margins": -1.6174895763397217, "rewards/rejected": 3.1860151290893555, "step": 22 }, { "epoch": 0.014307931570762053, "grad_norm": 15.411660194396973, "learning_rate": 7.142857142857143e-07, "logits/chosen": 0.22289246320724487, "logits/rejected": 3.1246914863586426, "logps/chosen": -430.3992919921875, "logps/rejected": -655.2235107421875, "loss": 1.8757, "rewards/accuracies": 0.125, "rewards/chosen": 4.319504261016846, "rewards/margins": -1.2606054544448853, "rewards/rejected": 5.580109119415283, "step": 23 }, { "epoch": 0.014930015552099534, "grad_norm": 23.20302963256836, "learning_rate": 7.453416149068324e-07, "logits/chosen": 0.483075350522995, "logits/rejected": 4.29481315612793, "logps/chosen": -363.7115783691406, "logps/rejected": -601.6412963867188, "loss": 3.2021, "rewards/accuracies": 0.0, "rewards/chosen": 3.3802592754364014, "rewards/margins": -3.024005174636841, "rewards/rejected": 6.404264450073242, "step": 24 }, { "epoch": 0.015552099533437015, "grad_norm": 13.197885513305664, "learning_rate": 7.763975155279503e-07, "logits/chosen": -1.6369664669036865, "logits/rejected": 1.334805965423584, "logps/chosen": -252.2938690185547, "logps/rejected": -440.203857421875, "loss": 1.7013, "rewards/accuracies": 0.375, "rewards/chosen": 2.9228367805480957, "rewards/margins": -1.150625228881836, "rewards/rejected": 4.073462009429932, "step": 25 }, { "epoch": 0.016174183514774496, "grad_norm": 8.859484672546387, "learning_rate": 8.074534161490684e-07, "logits/chosen": -0.5109778046607971, "logits/rejected": 1.9775269031524658, "logps/chosen": -282.95904541015625, "logps/rejected": -456.1385498046875, "loss": 1.5515, "rewards/accuracies": 0.625, "rewards/chosen": 2.640273094177246, "rewards/margins": -0.19814348220825195, "rewards/rejected": 2.838416576385498, "step": 26 }, { "epoch": 0.016796267496111975, "grad_norm": 11.10366439819336, "learning_rate": 8.385093167701864e-07, "logits/chosen": -0.20823854207992554, "logits/rejected": 0.14792311191558838, "logps/chosen": -432.61962890625, "logps/rejected": -496.79632568359375, "loss": 1.041, "rewards/accuracies": 0.5, "rewards/chosen": 2.725969076156616, "rewards/margins": 0.09552490711212158, "rewards/rejected": 2.630444049835205, "step": 27 }, { "epoch": 0.017418351477449457, "grad_norm": 18.775503158569336, "learning_rate": 8.695652173913044e-07, "logits/chosen": -0.9147471189498901, "logits/rejected": 3.377066135406494, "logps/chosen": -392.84100341796875, "logps/rejected": -640.43212890625, "loss": 2.3381, "rewards/accuracies": 0.25, "rewards/chosen": 2.873392343521118, "rewards/margins": -1.9523334503173828, "rewards/rejected": 4.825725555419922, "step": 28 }, { "epoch": 0.018040435458786936, "grad_norm": 18.300891876220703, "learning_rate": 9.006211180124224e-07, "logits/chosen": 0.9996336698532104, "logits/rejected": 2.9327127933502197, "logps/chosen": -549.57861328125, "logps/rejected": -630.7413330078125, "loss": 2.285, "rewards/accuracies": 0.5, "rewards/chosen": 4.136654853820801, "rewards/margins": -1.0646252632141113, "rewards/rejected": 5.201280117034912, "step": 29 }, { "epoch": 0.01866251944012442, "grad_norm": 20.41710090637207, "learning_rate": 9.316770186335404e-07, "logits/chosen": 1.769633173942566, "logits/rejected": 4.040060043334961, "logps/chosen": -504.2467041015625, "logps/rejected": -583.26611328125, "loss": 1.8408, "rewards/accuracies": 0.25, "rewards/chosen": 3.9186482429504395, "rewards/margins": -1.409982681274414, "rewards/rejected": 5.3286309242248535, "step": 30 }, { "epoch": 0.019284603421461897, "grad_norm": 7.4204864501953125, "learning_rate": 9.627329192546585e-07, "logits/chosen": 2.414787530899048, "logits/rejected": 4.342754364013672, "logps/chosen": -491.903564453125, "logps/rejected": -529.9178466796875, "loss": 0.5176, "rewards/accuracies": 0.875, "rewards/chosen": 4.258436679840088, "rewards/margins": 1.2984163761138916, "rewards/rejected": 2.9600205421447754, "step": 31 }, { "epoch": 0.019906687402799376, "grad_norm": 26.244693756103516, "learning_rate": 9.937888198757765e-07, "logits/chosen": -1.133277177810669, "logits/rejected": 2.9445366859436035, "logps/chosen": -334.9951171875, "logps/rejected": -539.3391723632812, "loss": 3.098, "rewards/accuracies": 0.0, "rewards/chosen": 2.445523262023926, "rewards/margins": -2.995908498764038, "rewards/rejected": 5.441431999206543, "step": 32 }, { "epoch": 0.02052877138413686, "grad_norm": 19.834293365478516, "learning_rate": 1.0248447204968944e-06, "logits/chosen": -0.28572332859039307, "logits/rejected": 3.181565284729004, "logps/chosen": -431.0903015136719, "logps/rejected": -584.8841552734375, "loss": 1.822, "rewards/accuracies": 0.25, "rewards/chosen": 2.7920920848846436, "rewards/margins": -1.1915936470031738, "rewards/rejected": 3.9836859703063965, "step": 33 }, { "epoch": 0.021150855365474338, "grad_norm": 18.630273818969727, "learning_rate": 1.0559006211180126e-06, "logits/chosen": 1.571908712387085, "logits/rejected": 2.8574092388153076, "logps/chosen": -516.75048828125, "logps/rejected": -569.5855712890625, "loss": 1.6673, "rewards/accuracies": 0.125, "rewards/chosen": 3.487802267074585, "rewards/margins": -1.2885650396347046, "rewards/rejected": 4.776367664337158, "step": 34 }, { "epoch": 0.02177293934681182, "grad_norm": 18.234106063842773, "learning_rate": 1.0869565217391306e-06, "logits/chosen": 0.4845390319824219, "logits/rejected": 1.6266993284225464, "logps/chosen": -487.19537353515625, "logps/rejected": -539.9108276367188, "loss": 2.8629, "rewards/accuracies": 0.5, "rewards/chosen": 3.638322114944458, "rewards/margins": -1.6813172101974487, "rewards/rejected": 5.319639682769775, "step": 35 }, { "epoch": 0.0223950233281493, "grad_norm": 16.096593856811523, "learning_rate": 1.1180124223602485e-06, "logits/chosen": -0.5536847114562988, "logits/rejected": 3.9143714904785156, "logps/chosen": -258.2462158203125, "logps/rejected": -522.1569213867188, "loss": 2.2642, "rewards/accuracies": 0.25, "rewards/chosen": 2.38403582572937, "rewards/margins": -1.6353323459625244, "rewards/rejected": 4.0193681716918945, "step": 36 }, { "epoch": 0.023017107309486782, "grad_norm": 13.231411933898926, "learning_rate": 1.1490683229813664e-06, "logits/chosen": 1.8502612113952637, "logits/rejected": 3.888587713241577, "logps/chosen": -527.3170166015625, "logps/rejected": -577.2305908203125, "loss": 1.2442, "rewards/accuracies": 0.5, "rewards/chosen": 4.5486650466918945, "rewards/margins": 0.6147650480270386, "rewards/rejected": 3.9339001178741455, "step": 37 }, { "epoch": 0.02363919129082426, "grad_norm": 18.222408294677734, "learning_rate": 1.1801242236024846e-06, "logits/chosen": 0.500670850276947, "logits/rejected": 2.7530484199523926, "logps/chosen": -414.5412292480469, "logps/rejected": -526.6549682617188, "loss": 2.13, "rewards/accuracies": 0.25, "rewards/chosen": 2.314025402069092, "rewards/margins": -1.5583739280700684, "rewards/rejected": 3.87239933013916, "step": 38 }, { "epoch": 0.024261275272161743, "grad_norm": 12.805335998535156, "learning_rate": 1.2111801242236026e-06, "logits/chosen": 1.4653871059417725, "logits/rejected": 2.587925910949707, "logps/chosen": -338.72613525390625, "logps/rejected": -392.6908874511719, "loss": 1.584, "rewards/accuracies": 0.375, "rewards/chosen": 1.6573638916015625, "rewards/margins": -0.5469708442687988, "rewards/rejected": 2.2043347358703613, "step": 39 }, { "epoch": 0.024883359253499222, "grad_norm": 22.01996421813965, "learning_rate": 1.2422360248447205e-06, "logits/chosen": 0.1965693235397339, "logits/rejected": 3.027219772338867, "logps/chosen": -460.9700012207031, "logps/rejected": -653.0487060546875, "loss": 4.2398, "rewards/accuracies": 0.0, "rewards/chosen": 2.268369436264038, "rewards/margins": -4.104291915893555, "rewards/rejected": 6.372661590576172, "step": 40 }, { "epoch": 0.0255054432348367, "grad_norm": 15.715839385986328, "learning_rate": 1.2732919254658385e-06, "logits/chosen": -1.6751973628997803, "logits/rejected": 3.080821990966797, "logps/chosen": -289.51800537109375, "logps/rejected": -501.493896484375, "loss": 1.7012, "rewards/accuracies": 0.5, "rewards/chosen": 2.6112499237060547, "rewards/margins": -0.7667644023895264, "rewards/rejected": 3.37801456451416, "step": 41 }, { "epoch": 0.026127527216174184, "grad_norm": 15.455327033996582, "learning_rate": 1.3043478260869566e-06, "logits/chosen": 1.4408159255981445, "logits/rejected": 1.8075611591339111, "logps/chosen": -563.5657958984375, "logps/rejected": -578.3375854492188, "loss": 1.3842, "rewards/accuracies": 0.5, "rewards/chosen": 4.628026485443115, "rewards/margins": 0.18713092803955078, "rewards/rejected": 4.440896034240723, "step": 42 }, { "epoch": 0.026749611197511663, "grad_norm": 12.377591133117676, "learning_rate": 1.3354037267080746e-06, "logits/chosen": 0.5772141814231873, "logits/rejected": 0.3179135322570801, "logps/chosen": -455.52734375, "logps/rejected": -461.08416748046875, "loss": 1.8421, "rewards/accuracies": 0.25, "rewards/chosen": 2.706965208053589, "rewards/margins": -1.241515874862671, "rewards/rejected": 3.9484810829162598, "step": 43 }, { "epoch": 0.027371695178849145, "grad_norm": 19.035192489624023, "learning_rate": 1.3664596273291927e-06, "logits/chosen": -1.3665052652359009, "logits/rejected": 4.01724100112915, "logps/chosen": -341.4517822265625, "logps/rejected": -710.14892578125, "loss": 2.5976, "rewards/accuracies": 0.125, "rewards/chosen": 2.2136545181274414, "rewards/margins": -2.051370143890381, "rewards/rejected": 4.265024662017822, "step": 44 }, { "epoch": 0.027993779160186624, "grad_norm": 18.47613525390625, "learning_rate": 1.3975155279503105e-06, "logits/chosen": -0.5310606956481934, "logits/rejected": 3.3563127517700195, "logps/chosen": -457.00341796875, "logps/rejected": -653.9691162109375, "loss": 2.435, "rewards/accuracies": 0.0, "rewards/chosen": 3.6806583404541016, "rewards/margins": -2.2114646434783936, "rewards/rejected": 5.892122268676758, "step": 45 }, { "epoch": 0.028615863141524107, "grad_norm": 14.408644676208496, "learning_rate": 1.4285714285714286e-06, "logits/chosen": -1.1961864233016968, "logits/rejected": 2.336334705352783, "logps/chosen": -304.5185546875, "logps/rejected": -482.330322265625, "loss": 1.6218, "rewards/accuracies": 0.25, "rewards/chosen": 2.4055614471435547, "rewards/margins": -0.5055615901947021, "rewards/rejected": 2.911123275756836, "step": 46 }, { "epoch": 0.029237947122861586, "grad_norm": 18.27861976623535, "learning_rate": 1.4596273291925466e-06, "logits/chosen": -0.07321079075336456, "logits/rejected": 3.3830206394195557, "logps/chosen": -390.31829833984375, "logps/rejected": -566.5939331054688, "loss": 1.5252, "rewards/accuracies": 0.375, "rewards/chosen": 2.949392795562744, "rewards/margins": -0.8621495962142944, "rewards/rejected": 3.81154203414917, "step": 47 }, { "epoch": 0.029860031104199068, "grad_norm": 20.510622024536133, "learning_rate": 1.4906832298136647e-06, "logits/chosen": -2.373002767562866, "logits/rejected": 1.7714519500732422, "logps/chosen": -348.08740234375, "logps/rejected": -532.2333984375, "loss": 2.3773, "rewards/accuracies": 0.125, "rewards/chosen": 2.341348648071289, "rewards/margins": -1.6777209043502808, "rewards/rejected": 4.019069671630859, "step": 48 }, { "epoch": 0.030482115085536547, "grad_norm": 16.575960159301758, "learning_rate": 1.521739130434783e-06, "logits/chosen": -3.976215124130249, "logits/rejected": 2.546246290206909, "logps/chosen": -251.80389404296875, "logps/rejected": -565.2742309570312, "loss": 2.1789, "rewards/accuracies": 0.375, "rewards/chosen": 2.2429680824279785, "rewards/margins": -1.1396911144256592, "rewards/rejected": 3.3826591968536377, "step": 49 }, { "epoch": 0.03110419906687403, "grad_norm": 17.0865535736084, "learning_rate": 1.5527950310559006e-06, "logits/chosen": -1.9030277729034424, "logits/rejected": 4.865078449249268, "logps/chosen": -276.42449951171875, "logps/rejected": -610.6055297851562, "loss": 3.1826, "rewards/accuracies": 0.125, "rewards/chosen": 1.672954797744751, "rewards/margins": -2.7304153442382812, "rewards/rejected": 4.403370380401611, "step": 50 }, { "epoch": 0.031726283048211505, "grad_norm": 20.365732192993164, "learning_rate": 1.5838509316770188e-06, "logits/chosen": -0.01571650803089142, "logits/rejected": 3.2704014778137207, "logps/chosen": -400.42431640625, "logps/rejected": -600.40234375, "loss": 2.3402, "rewards/accuracies": 0.25, "rewards/chosen": 2.8270480632781982, "rewards/margins": -1.8433575630187988, "rewards/rejected": 4.670405387878418, "step": 51 }, { "epoch": 0.03234836702954899, "grad_norm": 27.406963348388672, "learning_rate": 1.6149068322981367e-06, "logits/chosen": 1.9095484018325806, "logits/rejected": 3.6540932655334473, "logps/chosen": -543.3350830078125, "logps/rejected": -614.4552001953125, "loss": 2.9419, "rewards/accuracies": 0.25, "rewards/chosen": 4.5320329666137695, "rewards/margins": -2.3327999114990234, "rewards/rejected": 6.864832878112793, "step": 52 }, { "epoch": 0.03297045101088647, "grad_norm": 14.246295928955078, "learning_rate": 1.645962732919255e-06, "logits/chosen": 2.348184108734131, "logits/rejected": 4.586565017700195, "logps/chosen": -483.4414367675781, "logps/rejected": -554.7264404296875, "loss": 1.0814, "rewards/accuracies": 0.625, "rewards/chosen": 3.603303909301758, "rewards/margins": -0.0772642195224762, "rewards/rejected": 3.680567979812622, "step": 53 }, { "epoch": 0.03359253499222395, "grad_norm": 17.39348602294922, "learning_rate": 1.6770186335403729e-06, "logits/chosen": -2.882572650909424, "logits/rejected": 3.0221974849700928, "logps/chosen": -302.73358154296875, "logps/rejected": -678.2405395507812, "loss": 2.4495, "rewards/accuracies": 0.25, "rewards/chosen": 2.6320016384124756, "rewards/margins": -1.9446921348571777, "rewards/rejected": 4.576693534851074, "step": 54 }, { "epoch": 0.03421461897356143, "grad_norm": 18.54676055908203, "learning_rate": 1.7080745341614908e-06, "logits/chosen": 1.9731684923171997, "logits/rejected": 3.2952880859375, "logps/chosen": -610.639404296875, "logps/rejected": -663.1764526367188, "loss": 2.0717, "rewards/accuracies": 0.25, "rewards/chosen": 3.48779296875, "rewards/margins": -1.3251686096191406, "rewards/rejected": 4.812961578369141, "step": 55 }, { "epoch": 0.034836702954898914, "grad_norm": 19.899215698242188, "learning_rate": 1.7391304347826088e-06, "logits/chosen": 0.14407885074615479, "logits/rejected": 2.546194076538086, "logps/chosen": -546.3914184570312, "logps/rejected": -662.205322265625, "loss": 3.1558, "rewards/accuracies": 0.375, "rewards/chosen": 3.134453773498535, "rewards/margins": -2.702280282974243, "rewards/rejected": 5.836733818054199, "step": 56 }, { "epoch": 0.03545878693623639, "grad_norm": 14.970836639404297, "learning_rate": 1.770186335403727e-06, "logits/chosen": -0.33258992433547974, "logits/rejected": 2.144075393676758, "logps/chosen": -412.4896240234375, "logps/rejected": -556.417236328125, "loss": 1.3709, "rewards/accuracies": 0.25, "rewards/chosen": 3.347169876098633, "rewards/margins": -0.8939283490180969, "rewards/rejected": 4.241097927093506, "step": 57 }, { "epoch": 0.03608087091757387, "grad_norm": 18.21649742126465, "learning_rate": 1.8012422360248449e-06, "logits/chosen": -0.1760656088590622, "logits/rejected": 3.689772605895996, "logps/chosen": -341.3254089355469, "logps/rejected": -614.287841796875, "loss": 3.2395, "rewards/accuracies": 0.125, "rewards/chosen": 2.6227283477783203, "rewards/margins": -3.030301094055176, "rewards/rejected": 5.653029441833496, "step": 58 }, { "epoch": 0.03670295489891135, "grad_norm": 14.219995498657227, "learning_rate": 1.832298136645963e-06, "logits/chosen": -0.032842814922332764, "logits/rejected": 2.472367286682129, "logps/chosen": -412.5766906738281, "logps/rejected": -563.55322265625, "loss": 2.1787, "rewards/accuracies": 0.25, "rewards/chosen": 3.942850112915039, "rewards/margins": -0.7556142807006836, "rewards/rejected": 4.698464393615723, "step": 59 }, { "epoch": 0.03732503888024884, "grad_norm": 15.933598518371582, "learning_rate": 1.8633540372670808e-06, "logits/chosen": 0.32627394795417786, "logits/rejected": 3.500953197479248, "logps/chosen": -299.4095764160156, "logps/rejected": -494.85186767578125, "loss": 1.9864, "rewards/accuracies": 0.25, "rewards/chosen": 1.9913121461868286, "rewards/margins": -1.4114632606506348, "rewards/rejected": 3.402775526046753, "step": 60 }, { "epoch": 0.037947122861586316, "grad_norm": 15.214622497558594, "learning_rate": 1.894409937888199e-06, "logits/chosen": -2.144202709197998, "logits/rejected": 2.118504524230957, "logps/chosen": -376.8366394042969, "logps/rejected": -622.558349609375, "loss": 1.8904, "rewards/accuracies": 0.375, "rewards/chosen": 3.447816848754883, "rewards/margins": -0.9582694172859192, "rewards/rejected": 4.406085968017578, "step": 61 }, { "epoch": 0.038569206842923795, "grad_norm": 11.512938499450684, "learning_rate": 1.925465838509317e-06, "logits/chosen": 0.3032981753349304, "logits/rejected": 2.3385493755340576, "logps/chosen": -350.2100830078125, "logps/rejected": -490.7513122558594, "loss": 2.0141, "rewards/accuracies": 0.375, "rewards/chosen": 2.47409725189209, "rewards/margins": -0.39925241470336914, "rewards/rejected": 2.873349905014038, "step": 62 }, { "epoch": 0.039191290824261274, "grad_norm": 17.24321937561035, "learning_rate": 1.956521739130435e-06, "logits/chosen": -1.4495515823364258, "logits/rejected": 1.9896833896636963, "logps/chosen": -482.90753173828125, "logps/rejected": -580.9098510742188, "loss": 1.5194, "rewards/accuracies": 0.375, "rewards/chosen": 3.409691333770752, "rewards/margins": -0.645327627658844, "rewards/rejected": 4.055018901824951, "step": 63 }, { "epoch": 0.03981337480559875, "grad_norm": 18.76458168029785, "learning_rate": 1.987577639751553e-06, "logits/chosen": 0.4551563262939453, "logits/rejected": 2.7281529903411865, "logps/chosen": -526.1947021484375, "logps/rejected": -624.5258178710938, "loss": 1.6734, "rewards/accuracies": 0.375, "rewards/chosen": 3.615340232849121, "rewards/margins": 0.08302098512649536, "rewards/rejected": 3.5323190689086914, "step": 64 }, { "epoch": 0.04043545878693624, "grad_norm": 16.15471076965332, "learning_rate": 2.018633540372671e-06, "logits/chosen": 2.0971620082855225, "logits/rejected": 3.0833353996276855, "logps/chosen": -546.3640747070312, "logps/rejected": -593.1632690429688, "loss": 1.5654, "rewards/accuracies": 0.25, "rewards/chosen": 3.5291683673858643, "rewards/margins": -1.1733494997024536, "rewards/rejected": 4.702517509460449, "step": 65 }, { "epoch": 0.04105754276827372, "grad_norm": 16.818897247314453, "learning_rate": 2.049689440993789e-06, "logits/chosen": 0.07451218366622925, "logits/rejected": 3.51839017868042, "logps/chosen": -435.3553466796875, "logps/rejected": -592.5557250976562, "loss": 1.5626, "rewards/accuracies": 0.375, "rewards/chosen": 3.6512696743011475, "rewards/margins": -0.9154375791549683, "rewards/rejected": 4.566707611083984, "step": 66 }, { "epoch": 0.0416796267496112, "grad_norm": 17.558902740478516, "learning_rate": 2.0807453416149073e-06, "logits/chosen": 0.5168266892433167, "logits/rejected": 4.093328475952148, "logps/chosen": -445.5537109375, "logps/rejected": -667.9881591796875, "loss": 1.5334, "rewards/accuracies": 0.375, "rewards/chosen": 2.953523874282837, "rewards/margins": -0.19519448280334473, "rewards/rejected": 3.1487183570861816, "step": 67 }, { "epoch": 0.042301710730948676, "grad_norm": 11.986053466796875, "learning_rate": 2.111801242236025e-06, "logits/chosen": -0.3885886073112488, "logits/rejected": 1.2646914720535278, "logps/chosen": -403.715087890625, "logps/rejected": -491.381591796875, "loss": 0.8884, "rewards/accuracies": 0.625, "rewards/chosen": 2.808671474456787, "rewards/margins": 0.7170834541320801, "rewards/rejected": 2.091587781906128, "step": 68 }, { "epoch": 0.04292379471228616, "grad_norm": 14.44616413116455, "learning_rate": 2.1428571428571427e-06, "logits/chosen": 1.4018076658248901, "logits/rejected": 2.252667188644409, "logps/chosen": -545.3279418945312, "logps/rejected": -625.7229614257812, "loss": 2.008, "rewards/accuracies": 0.5, "rewards/chosen": 4.904933929443359, "rewards/margins": -0.13592761754989624, "rewards/rejected": 5.040862083435059, "step": 69 }, { "epoch": 0.04354587869362364, "grad_norm": 13.186211585998535, "learning_rate": 2.173913043478261e-06, "logits/chosen": -1.621128797531128, "logits/rejected": 2.2737505435943604, "logps/chosen": -209.21923828125, "logps/rejected": -359.25323486328125, "loss": 0.9306, "rewards/accuracies": 0.25, "rewards/chosen": 0.8242598176002502, "rewards/margins": -0.3118968605995178, "rewards/rejected": 1.136156678199768, "step": 70 }, { "epoch": 0.04416796267496112, "grad_norm": 19.368032455444336, "learning_rate": 2.204968944099379e-06, "logits/chosen": -0.7695877552032471, "logits/rejected": 3.2083444595336914, "logps/chosen": -291.3989562988281, "logps/rejected": -526.1988525390625, "loss": 2.4122, "rewards/accuracies": 0.5, "rewards/chosen": 2.067800760269165, "rewards/margins": -1.1661421060562134, "rewards/rejected": 3.233942985534668, "step": 71 }, { "epoch": 0.0447900466562986, "grad_norm": 8.097809791564941, "learning_rate": 2.236024844720497e-06, "logits/chosen": 1.9553561210632324, "logits/rejected": 3.8821897506713867, "logps/chosen": -459.226318359375, "logps/rejected": -545.0194091796875, "loss": 0.633, "rewards/accuracies": 0.625, "rewards/chosen": 2.997643232345581, "rewards/margins": 1.1405882835388184, "rewards/rejected": 1.8570547103881836, "step": 72 }, { "epoch": 0.04541213063763608, "grad_norm": 28.630084991455078, "learning_rate": 2.2670807453416154e-06, "logits/chosen": 0.5597367286682129, "logits/rejected": 3.062697172164917, "logps/chosen": -504.22442626953125, "logps/rejected": -628.9820556640625, "loss": 1.5965, "rewards/accuracies": 0.25, "rewards/chosen": 3.0181586742401123, "rewards/margins": -1.1085466146469116, "rewards/rejected": 4.126705169677734, "step": 73 }, { "epoch": 0.046034214618973564, "grad_norm": 5.517436981201172, "learning_rate": 2.298136645962733e-06, "logits/chosen": 1.4377832412719727, "logits/rejected": 1.949904203414917, "logps/chosen": -447.19549560546875, "logps/rejected": -454.1845397949219, "loss": 0.3698, "rewards/accuracies": 0.875, "rewards/chosen": 3.7997069358825684, "rewards/margins": 2.8889315128326416, "rewards/rejected": 0.910775363445282, "step": 74 }, { "epoch": 0.04665629860031104, "grad_norm": 18.70643424987793, "learning_rate": 2.3291925465838513e-06, "logits/chosen": 2.542583703994751, "logits/rejected": 2.7970423698425293, "logps/chosen": -474.21575927734375, "logps/rejected": -449.4528503417969, "loss": 1.4882, "rewards/accuracies": 0.125, "rewards/chosen": 2.2193827629089355, "rewards/margins": -0.8025726079940796, "rewards/rejected": 3.0219554901123047, "step": 75 }, { "epoch": 0.04727838258164852, "grad_norm": 15.783950805664062, "learning_rate": 2.3602484472049692e-06, "logits/chosen": 0.3996178209781647, "logits/rejected": 2.7533459663391113, "logps/chosen": -528.2157592773438, "logps/rejected": -648.2974243164062, "loss": 2.0542, "rewards/accuracies": 0.5, "rewards/chosen": 3.84883713722229, "rewards/margins": -1.1243318319320679, "rewards/rejected": 4.973168849945068, "step": 76 }, { "epoch": 0.047900466562986, "grad_norm": 13.404909133911133, "learning_rate": 2.391304347826087e-06, "logits/chosen": 0.44638746976852417, "logits/rejected": 2.7166848182678223, "logps/chosen": -529.7808227539062, "logps/rejected": -654.6124267578125, "loss": 1.5697, "rewards/accuracies": 0.625, "rewards/chosen": 3.7755274772644043, "rewards/margins": -0.07085922360420227, "rewards/rejected": 3.846386432647705, "step": 77 }, { "epoch": 0.04852255054432349, "grad_norm": 14.784281730651855, "learning_rate": 2.422360248447205e-06, "logits/chosen": 2.3771920204162598, "logits/rejected": 4.693896293640137, "logps/chosen": -545.78466796875, "logps/rejected": -650.681640625, "loss": 1.5861, "rewards/accuracies": 0.25, "rewards/chosen": 3.8369967937469482, "rewards/margins": -0.9954373836517334, "rewards/rejected": 4.832434177398682, "step": 78 }, { "epoch": 0.049144634525660966, "grad_norm": 14.938491821289062, "learning_rate": 2.453416149068323e-06, "logits/chosen": -0.4898468255996704, "logits/rejected": 2.740943431854248, "logps/chosen": -297.54254150390625, "logps/rejected": -495.09197998046875, "loss": 1.7831, "rewards/accuracies": 0.375, "rewards/chosen": 1.850579857826233, "rewards/margins": -1.2223342657089233, "rewards/rejected": 3.0729141235351562, "step": 79 }, { "epoch": 0.049766718506998445, "grad_norm": 17.010353088378906, "learning_rate": 2.484472049689441e-06, "logits/chosen": 1.1946790218353271, "logits/rejected": 3.110568046569824, "logps/chosen": -513.898681640625, "logps/rejected": -633.34716796875, "loss": 1.3248, "rewards/accuracies": 0.5, "rewards/chosen": 3.4981865882873535, "rewards/margins": -0.5352871417999268, "rewards/rejected": 4.033473968505859, "step": 80 }, { "epoch": 0.050388802488335924, "grad_norm": 14.813240051269531, "learning_rate": 2.515527950310559e-06, "logits/chosen": -1.7359651327133179, "logits/rejected": 2.413377523422241, "logps/chosen": -322.9862060546875, "logps/rejected": -501.1468505859375, "loss": 1.0467, "rewards/accuracies": 0.5, "rewards/chosen": 1.9165505170822144, "rewards/margins": -0.061617642641067505, "rewards/rejected": 1.9781681299209595, "step": 81 }, { "epoch": 0.0510108864696734, "grad_norm": 15.30895709991455, "learning_rate": 2.546583850931677e-06, "logits/chosen": -0.22585351765155792, "logits/rejected": 2.246863842010498, "logps/chosen": -455.3978576660156, "logps/rejected": -596.9251708984375, "loss": 1.9872, "rewards/accuracies": 0.375, "rewards/chosen": 3.1168882846832275, "rewards/margins": -0.8894947171211243, "rewards/rejected": 4.006382942199707, "step": 82 }, { "epoch": 0.05163297045101089, "grad_norm": 22.377033233642578, "learning_rate": 2.5776397515527953e-06, "logits/chosen": -0.3091513514518738, "logits/rejected": 4.61864709854126, "logps/chosen": -430.62103271484375, "logps/rejected": -679.9461669921875, "loss": 2.8942, "rewards/accuracies": 0.375, "rewards/chosen": 2.499563694000244, "rewards/margins": -2.1059954166412354, "rewards/rejected": 4.6055588722229, "step": 83 }, { "epoch": 0.05225505443234837, "grad_norm": 15.34240436553955, "learning_rate": 2.6086956521739132e-06, "logits/chosen": -1.816704511642456, "logits/rejected": 2.3454599380493164, "logps/chosen": -319.86236572265625, "logps/rejected": -584.1156005859375, "loss": 1.7868, "rewards/accuracies": 0.375, "rewards/chosen": 2.1711816787719727, "rewards/margins": -0.16983062028884888, "rewards/rejected": 2.341012477874756, "step": 84 }, { "epoch": 0.05287713841368585, "grad_norm": 17.875179290771484, "learning_rate": 2.639751552795031e-06, "logits/chosen": -1.5490080118179321, "logits/rejected": 2.6023802757263184, "logps/chosen": -371.1326904296875, "logps/rejected": -624.02099609375, "loss": 2.223, "rewards/accuracies": 0.125, "rewards/chosen": 2.910151958465576, "rewards/margins": -1.5838954448699951, "rewards/rejected": 4.49404764175415, "step": 85 }, { "epoch": 0.053499222395023326, "grad_norm": 18.851652145385742, "learning_rate": 2.670807453416149e-06, "logits/chosen": -0.87242591381073, "logits/rejected": 3.042553424835205, "logps/chosen": -484.6915283203125, "logps/rejected": -665.1118774414062, "loss": 3.2805, "rewards/accuracies": 0.375, "rewards/chosen": 2.4592511653900146, "rewards/margins": -2.4145312309265137, "rewards/rejected": 4.873782634735107, "step": 86 }, { "epoch": 0.05412130637636081, "grad_norm": 18.204647064208984, "learning_rate": 2.7018633540372675e-06, "logits/chosen": -2.306795358657837, "logits/rejected": 3.058910846710205, "logps/chosen": -228.07424926757812, "logps/rejected": -520.3849487304688, "loss": 1.8587, "rewards/accuracies": 0.375, "rewards/chosen": 2.028566598892212, "rewards/margins": -1.2790002822875977, "rewards/rejected": 3.3075666427612305, "step": 87 }, { "epoch": 0.05474339035769829, "grad_norm": 16.07090187072754, "learning_rate": 2.7329192546583855e-06, "logits/chosen": -0.4474494159221649, "logits/rejected": 1.9105031490325928, "logps/chosen": -451.0029296875, "logps/rejected": -509.9042053222656, "loss": 2.0134, "rewards/accuracies": 0.25, "rewards/chosen": 1.8902443647384644, "rewards/margins": -0.19919386506080627, "rewards/rejected": 2.0894381999969482, "step": 88 }, { "epoch": 0.05536547433903577, "grad_norm": 9.855167388916016, "learning_rate": 2.7639751552795034e-06, "logits/chosen": -1.10482656955719, "logits/rejected": 2.0926291942596436, "logps/chosen": -345.75262451171875, "logps/rejected": -517.8098754882812, "loss": 0.9753, "rewards/accuracies": 0.75, "rewards/chosen": 1.4932985305786133, "rewards/margins": 0.43307700753211975, "rewards/rejected": 1.060221552848816, "step": 89 }, { "epoch": 0.05598755832037325, "grad_norm": 16.939516067504883, "learning_rate": 2.795031055900621e-06, "logits/chosen": 0.19975408911705017, "logits/rejected": 3.647125005722046, "logps/chosen": -428.03192138671875, "logps/rejected": -624.6107788085938, "loss": 1.8622, "rewards/accuracies": 0.25, "rewards/chosen": 2.7307498455047607, "rewards/margins": -1.3953737020492554, "rewards/rejected": 4.126123428344727, "step": 90 }, { "epoch": 0.05660964230171073, "grad_norm": 13.369673728942871, "learning_rate": 2.8260869565217393e-06, "logits/chosen": -0.44384580850601196, "logits/rejected": 1.0906280279159546, "logps/chosen": -355.5784912109375, "logps/rejected": -454.5552062988281, "loss": 1.274, "rewards/accuracies": 0.125, "rewards/chosen": 1.3210275173187256, "rewards/margins": -0.6426465511322021, "rewards/rejected": 1.9636743068695068, "step": 91 }, { "epoch": 0.05723172628304821, "grad_norm": 14.67619800567627, "learning_rate": 2.8571428571428573e-06, "logits/chosen": 0.5666091442108154, "logits/rejected": 4.029051303863525, "logps/chosen": -470.02581787109375, "logps/rejected": -685.8119506835938, "loss": 1.9673, "rewards/accuracies": 0.5, "rewards/chosen": 4.457881450653076, "rewards/margins": -0.7246289849281311, "rewards/rejected": 5.1825103759765625, "step": 92 }, { "epoch": 0.05785381026438569, "grad_norm": 10.434638977050781, "learning_rate": 2.888198757763975e-06, "logits/chosen": 1.069718360900879, "logits/rejected": 2.851776123046875, "logps/chosen": -478.9818420410156, "logps/rejected": -548.6188354492188, "loss": 0.727, "rewards/accuracies": 0.75, "rewards/chosen": 3.8002443313598633, "rewards/margins": 1.7340096235275269, "rewards/rejected": 2.066234588623047, "step": 93 }, { "epoch": 0.05847589424572317, "grad_norm": 14.497246742248535, "learning_rate": 2.919254658385093e-06, "logits/chosen": -0.6864573955535889, "logits/rejected": 3.1548359394073486, "logps/chosen": -434.332275390625, "logps/rejected": -675.3155517578125, "loss": 1.6166, "rewards/accuracies": 0.375, "rewards/chosen": 1.7466373443603516, "rewards/margins": -0.7960720062255859, "rewards/rejected": 2.5427093505859375, "step": 94 }, { "epoch": 0.05909797822706065, "grad_norm": 16.510725021362305, "learning_rate": 2.9503105590062115e-06, "logits/chosen": -3.1882376670837402, "logits/rejected": 1.559233546257019, "logps/chosen": -414.3146667480469, "logps/rejected": -683.4759521484375, "loss": 1.3017, "rewards/accuracies": 0.5, "rewards/chosen": 2.7683920860290527, "rewards/margins": -0.1580268144607544, "rewards/rejected": 2.926419258117676, "step": 95 }, { "epoch": 0.059720062208398136, "grad_norm": 12.96300983428955, "learning_rate": 2.9813664596273295e-06, "logits/chosen": 1.2822189331054688, "logits/rejected": 2.211031436920166, "logps/chosen": -612.8045654296875, "logps/rejected": -641.1871948242188, "loss": 0.7385, "rewards/accuracies": 0.5, "rewards/chosen": 2.135913133621216, "rewards/margins": 0.4595198631286621, "rewards/rejected": 1.6763931512832642, "step": 96 }, { "epoch": 0.060342146189735615, "grad_norm": 18.275489807128906, "learning_rate": 3.0124223602484474e-06, "logits/chosen": 1.1116188764572144, "logits/rejected": 4.411532402038574, "logps/chosen": -346.83367919921875, "logps/rejected": -512.3101196289062, "loss": 2.2102, "rewards/accuracies": 0.125, "rewards/chosen": 1.5767388343811035, "rewards/margins": -1.908701777458191, "rewards/rejected": 3.485440731048584, "step": 97 }, { "epoch": 0.060964230171073094, "grad_norm": 16.191471099853516, "learning_rate": 3.043478260869566e-06, "logits/chosen": -0.47514790296554565, "logits/rejected": 2.6861495971679688, "logps/chosen": -350.1429748535156, "logps/rejected": -507.99169921875, "loss": 1.985, "rewards/accuracies": 0.25, "rewards/chosen": 1.6881792545318604, "rewards/margins": -1.5417598485946655, "rewards/rejected": 3.2299389839172363, "step": 98 }, { "epoch": 0.06158631415241057, "grad_norm": 17.9713191986084, "learning_rate": 3.0745341614906837e-06, "logits/chosen": 0.785693347454071, "logits/rejected": 2.4765233993530273, "logps/chosen": -517.0220336914062, "logps/rejected": -581.3613891601562, "loss": 2.1322, "rewards/accuracies": 0.375, "rewards/chosen": 1.7961797714233398, "rewards/margins": -1.5854010581970215, "rewards/rejected": 3.3815808296203613, "step": 99 }, { "epoch": 0.06220839813374806, "grad_norm": 15.961114883422852, "learning_rate": 3.1055900621118013e-06, "logits/chosen": 0.8884290456771851, "logits/rejected": 3.1704633235931396, "logps/chosen": -523.0325927734375, "logps/rejected": -628.36572265625, "loss": 1.6824, "rewards/accuracies": 0.375, "rewards/chosen": 2.544769763946533, "rewards/margins": -1.1001884937286377, "rewards/rejected": 3.64495849609375, "step": 100 }, { "epoch": 0.06283048211508553, "grad_norm": 17.8529052734375, "learning_rate": 3.1366459627329192e-06, "logits/chosen": -0.6043689846992493, "logits/rejected": 2.608380079269409, "logps/chosen": -394.1155090332031, "logps/rejected": -595.1231689453125, "loss": 2.2638, "rewards/accuracies": 0.5, "rewards/chosen": 1.9383867979049683, "rewards/margins": -1.5401040315628052, "rewards/rejected": 3.4784908294677734, "step": 101 }, { "epoch": 0.06345256609642301, "grad_norm": 17.48045539855957, "learning_rate": 3.1677018633540376e-06, "logits/chosen": -2.977806806564331, "logits/rejected": 4.158215045928955, "logps/chosen": -317.9443359375, "logps/rejected": -700.9193115234375, "loss": 1.9263, "rewards/accuracies": 0.375, "rewards/chosen": 1.4329849481582642, "rewards/margins": -0.973293662071228, "rewards/rejected": 2.406278610229492, "step": 102 }, { "epoch": 0.0640746500777605, "grad_norm": 12.750750541687012, "learning_rate": 3.1987577639751555e-06, "logits/chosen": -0.5801385641098022, "logits/rejected": 3.2421505451202393, "logps/chosen": -362.1575927734375, "logps/rejected": -635.2872314453125, "loss": 1.0094, "rewards/accuracies": 0.5, "rewards/chosen": 2.4802303314208984, "rewards/margins": -0.26808270812034607, "rewards/rejected": 2.7483131885528564, "step": 103 }, { "epoch": 0.06469673405909798, "grad_norm": 16.52475357055664, "learning_rate": 3.2298136645962735e-06, "logits/chosen": 0.8246936202049255, "logits/rejected": 2.3538520336151123, "logps/chosen": -491.8680725097656, "logps/rejected": -613.082763671875, "loss": 1.1785, "rewards/accuracies": 0.375, "rewards/chosen": 2.9013772010803223, "rewards/margins": -0.05259627103805542, "rewards/rejected": 2.9539732933044434, "step": 104 }, { "epoch": 0.06531881804043546, "grad_norm": 10.206239700317383, "learning_rate": 3.2608695652173914e-06, "logits/chosen": 0.03538012504577637, "logits/rejected": 1.4459773302078247, "logps/chosen": -424.48150634765625, "logps/rejected": -516.4052734375, "loss": 0.7333, "rewards/accuracies": 0.625, "rewards/chosen": 2.1903953552246094, "rewards/margins": 0.42389026284217834, "rewards/rejected": 1.766505241394043, "step": 105 }, { "epoch": 0.06594090202177294, "grad_norm": 18.53340721130371, "learning_rate": 3.29192546583851e-06, "logits/chosen": 0.17394961416721344, "logits/rejected": 3.564350128173828, "logps/chosen": -234.46054077148438, "logps/rejected": -455.53851318359375, "loss": 1.7133, "rewards/accuracies": 0.125, "rewards/chosen": 0.2630676031112671, "rewards/margins": -1.2879587411880493, "rewards/rejected": 1.5510263442993164, "step": 106 }, { "epoch": 0.06656298600311042, "grad_norm": 15.1698579788208, "learning_rate": 3.3229813664596278e-06, "logits/chosen": -0.4583001136779785, "logits/rejected": 0.537924587726593, "logps/chosen": -510.5384521484375, "logps/rejected": -595.7730102539062, "loss": 1.4217, "rewards/accuracies": 0.125, "rewards/chosen": 3.458120107650757, "rewards/margins": -0.7295631170272827, "rewards/rejected": 4.18768310546875, "step": 107 }, { "epoch": 0.0671850699844479, "grad_norm": 11.777779579162598, "learning_rate": 3.3540372670807457e-06, "logits/chosen": 1.580517292022705, "logits/rejected": 2.1494925022125244, "logps/chosen": -570.6925048828125, "logps/rejected": -570.509033203125, "loss": 0.9243, "rewards/accuracies": 0.5, "rewards/chosen": 2.904733419418335, "rewards/margins": 0.5720228552818298, "rewards/rejected": 2.3327105045318604, "step": 108 }, { "epoch": 0.06780715396578538, "grad_norm": 11.124746322631836, "learning_rate": 3.3850931677018632e-06, "logits/chosen": 2.283168315887451, "logits/rejected": 2.234805107116699, "logps/chosen": -635.3388671875, "logps/rejected": -641.2822875976562, "loss": 0.4695, "rewards/accuracies": 0.75, "rewards/chosen": 3.719472646713257, "rewards/margins": 1.2985727787017822, "rewards/rejected": 2.4208996295928955, "step": 109 }, { "epoch": 0.06842923794712286, "grad_norm": 10.408456802368164, "learning_rate": 3.4161490683229816e-06, "logits/chosen": 0.18637174367904663, "logits/rejected": 2.4890756607055664, "logps/chosen": -416.40740966796875, "logps/rejected": -548.4837646484375, "loss": 0.775, "rewards/accuracies": 0.625, "rewards/chosen": 3.542468309402466, "rewards/margins": 0.9549359083175659, "rewards/rejected": 2.5875320434570312, "step": 110 }, { "epoch": 0.06905132192846034, "grad_norm": 13.247396469116211, "learning_rate": 3.4472049689440996e-06, "logits/chosen": -0.6173598170280457, "logits/rejected": 3.1843693256378174, "logps/chosen": -425.20330810546875, "logps/rejected": -591.7823486328125, "loss": 0.8984, "rewards/accuracies": 0.625, "rewards/chosen": 1.6247044801712036, "rewards/margins": 0.22022280097007751, "rewards/rejected": 1.4044815301895142, "step": 111 }, { "epoch": 0.06967340590979783, "grad_norm": 11.37887954711914, "learning_rate": 3.4782608695652175e-06, "logits/chosen": 0.850402295589447, "logits/rejected": 3.167102098464966, "logps/chosen": -466.1937561035156, "logps/rejected": -625.3375244140625, "loss": 0.6892, "rewards/accuracies": 0.625, "rewards/chosen": 2.7119944095611572, "rewards/margins": 1.1932249069213867, "rewards/rejected": 1.5187699794769287, "step": 112 }, { "epoch": 0.07029548989113531, "grad_norm": 14.853896141052246, "learning_rate": 3.5093167701863355e-06, "logits/chosen": -3.9423811435699463, "logits/rejected": 1.371962308883667, "logps/chosen": -324.5252380371094, "logps/rejected": -580.9085693359375, "loss": 1.4503, "rewards/accuracies": 0.5, "rewards/chosen": 0.8935672044754028, "rewards/margins": -0.8176671266555786, "rewards/rejected": 1.7112343311309814, "step": 113 }, { "epoch": 0.07091757387247279, "grad_norm": 12.282033920288086, "learning_rate": 3.540372670807454e-06, "logits/chosen": -0.7895638942718506, "logits/rejected": 1.1006243228912354, "logps/chosen": -316.26910400390625, "logps/rejected": -413.5718078613281, "loss": 1.2336, "rewards/accuracies": 0.5, "rewards/chosen": 1.485090970993042, "rewards/margins": -0.3405936062335968, "rewards/rejected": 1.8256844282150269, "step": 114 }, { "epoch": 0.07153965785381027, "grad_norm": 16.173473358154297, "learning_rate": 3.5714285714285718e-06, "logits/chosen": -0.9362825155258179, "logits/rejected": 2.5523064136505127, "logps/chosen": -473.16552734375, "logps/rejected": -647.4202270507812, "loss": 1.0592, "rewards/accuracies": 0.625, "rewards/chosen": 2.0608103275299072, "rewards/margins": 0.36317819356918335, "rewards/rejected": 1.6976323127746582, "step": 115 }, { "epoch": 0.07216174183514774, "grad_norm": 16.98232650756836, "learning_rate": 3.6024844720496897e-06, "logits/chosen": -2.35032320022583, "logits/rejected": 0.12284666299819946, "logps/chosen": -298.3292236328125, "logps/rejected": -483.40484619140625, "loss": 0.9504, "rewards/accuracies": 0.5, "rewards/chosen": 1.6204333305358887, "rewards/margins": 0.2157682180404663, "rewards/rejected": 1.404665231704712, "step": 116 }, { "epoch": 0.07278382581648522, "grad_norm": 12.480202674865723, "learning_rate": 3.633540372670808e-06, "logits/chosen": -0.8433324098587036, "logits/rejected": 1.5265121459960938, "logps/chosen": -456.8272399902344, "logps/rejected": -571.0210571289062, "loss": 1.4491, "rewards/accuracies": 0.5, "rewards/chosen": 2.1732237339019775, "rewards/margins": -0.24429386854171753, "rewards/rejected": 2.4175174236297607, "step": 117 }, { "epoch": 0.0734059097978227, "grad_norm": 11.653485298156738, "learning_rate": 3.664596273291926e-06, "logits/chosen": -2.875009059906006, "logits/rejected": 2.4798736572265625, "logps/chosen": -412.5729675292969, "logps/rejected": -663.5484008789062, "loss": 0.6523, "rewards/accuracies": 0.625, "rewards/chosen": 2.6050448417663574, "rewards/margins": 0.7395725846290588, "rewards/rejected": 1.8654723167419434, "step": 118 }, { "epoch": 0.07402799377916018, "grad_norm": 15.659344673156738, "learning_rate": 3.6956521739130436e-06, "logits/chosen": -2.1049094200134277, "logits/rejected": 2.7573556900024414, "logps/chosen": -286.6668395996094, "logps/rejected": -604.6309814453125, "loss": 1.5757, "rewards/accuracies": 0.5, "rewards/chosen": -0.1442634016275406, "rewards/margins": -0.6718887090682983, "rewards/rejected": 0.527625322341919, "step": 119 }, { "epoch": 0.07465007776049767, "grad_norm": 13.147564888000488, "learning_rate": 3.7267080745341615e-06, "logits/chosen": -2.316929817199707, "logits/rejected": 1.6243261098861694, "logps/chosen": -293.188232421875, "logps/rejected": -576.5936889648438, "loss": 0.7928, "rewards/accuracies": 0.5, "rewards/chosen": 1.9691627025604248, "rewards/margins": 0.47387444972991943, "rewards/rejected": 1.4952882528305054, "step": 120 }, { "epoch": 0.07527216174183515, "grad_norm": 11.02121639251709, "learning_rate": 3.7577639751552795e-06, "logits/chosen": 2.07505464553833, "logits/rejected": 2.70845365524292, "logps/chosen": -563.6138916015625, "logps/rejected": -643.9354248046875, "loss": 0.4981, "rewards/accuracies": 0.75, "rewards/chosen": 3.3107151985168457, "rewards/margins": 1.3117496967315674, "rewards/rejected": 1.9989655017852783, "step": 121 }, { "epoch": 0.07589424572317263, "grad_norm": 8.774572372436523, "learning_rate": 3.788819875776398e-06, "logits/chosen": -0.26993322372436523, "logits/rejected": 0.7964863777160645, "logps/chosen": -417.2394104003906, "logps/rejected": -495.690673828125, "loss": 0.6027, "rewards/accuracies": 0.5, "rewards/chosen": 2.0526046752929688, "rewards/margins": 1.4780837297439575, "rewards/rejected": 0.5745209455490112, "step": 122 }, { "epoch": 0.07651632970451011, "grad_norm": 18.699556350708008, "learning_rate": 3.819875776397516e-06, "logits/chosen": 1.2862492799758911, "logits/rejected": 3.086623191833496, "logps/chosen": -456.5997009277344, "logps/rejected": -512.4847412109375, "loss": 1.6742, "rewards/accuracies": 0.25, "rewards/chosen": 0.932388424873352, "rewards/margins": -0.5664696097373962, "rewards/rejected": 1.4988579750061035, "step": 123 }, { "epoch": 0.07713841368584759, "grad_norm": 13.291274070739746, "learning_rate": 3.850931677018634e-06, "logits/chosen": -2.284844398498535, "logits/rejected": 2.227841854095459, "logps/chosen": -217.15032958984375, "logps/rejected": -524.59521484375, "loss": 1.1794, "rewards/accuracies": 0.625, "rewards/chosen": 1.0233690738677979, "rewards/margins": 0.13595056533813477, "rewards/rejected": 0.8874184489250183, "step": 124 }, { "epoch": 0.07776049766718507, "grad_norm": 13.975177764892578, "learning_rate": 3.881987577639752e-06, "logits/chosen": -0.2947637140750885, "logits/rejected": 1.340907096862793, "logps/chosen": -373.473876953125, "logps/rejected": -494.6077880859375, "loss": 0.9861, "rewards/accuracies": 0.625, "rewards/chosen": 1.3585331439971924, "rewards/margins": 0.3273778557777405, "rewards/rejected": 1.0311551094055176, "step": 125 }, { "epoch": 0.07838258164852255, "grad_norm": 15.512024879455566, "learning_rate": 3.91304347826087e-06, "logits/chosen": 0.03474271297454834, "logits/rejected": 3.1309218406677246, "logps/chosen": -391.5765380859375, "logps/rejected": -586.3619384765625, "loss": 1.8219, "rewards/accuracies": 0.375, "rewards/chosen": 1.0997841358184814, "rewards/margins": -0.6230330467224121, "rewards/rejected": 1.722817301750183, "step": 126 }, { "epoch": 0.07900466562986003, "grad_norm": 15.874641418457031, "learning_rate": 3.9440993788819884e-06, "logits/chosen": 2.055415153503418, "logits/rejected": 2.7052502632141113, "logps/chosen": -516.319580078125, "logps/rejected": -570.6796264648438, "loss": 1.7473, "rewards/accuracies": 0.5, "rewards/chosen": 2.117936611175537, "rewards/margins": -0.45919865369796753, "rewards/rejected": 2.5771350860595703, "step": 127 }, { "epoch": 0.0796267496111975, "grad_norm": 16.0906925201416, "learning_rate": 3.975155279503106e-06, "logits/chosen": 1.8315917253494263, "logits/rejected": 4.382080078125, "logps/chosen": -470.73553466796875, "logps/rejected": -577.845947265625, "loss": 1.2574, "rewards/accuracies": 0.375, "rewards/chosen": 2.3403124809265137, "rewards/margins": -0.38605883717536926, "rewards/rejected": 2.7263715267181396, "step": 128 }, { "epoch": 0.080248833592535, "grad_norm": 17.62135887145996, "learning_rate": 4.0062111801242235e-06, "logits/chosen": 0.17830497026443481, "logits/rejected": 2.5713090896606445, "logps/chosen": -494.3236083984375, "logps/rejected": -623.1253051757812, "loss": 1.746, "rewards/accuracies": 0.375, "rewards/chosen": 1.6026488542556763, "rewards/margins": -1.0111427307128906, "rewards/rejected": 2.6137917041778564, "step": 129 }, { "epoch": 0.08087091757387248, "grad_norm": 11.60129451751709, "learning_rate": 4.037267080745342e-06, "logits/chosen": -3.4386911392211914, "logits/rejected": 2.7574286460876465, "logps/chosen": -257.7047424316406, "logps/rejected": -627.53955078125, "loss": 0.796, "rewards/accuracies": 0.625, "rewards/chosen": 1.8960356712341309, "rewards/margins": 1.6405115127563477, "rewards/rejected": 0.2555241584777832, "step": 130 }, { "epoch": 0.08149300155520996, "grad_norm": 13.185992240905762, "learning_rate": 4.06832298136646e-06, "logits/chosen": -0.11298033595085144, "logits/rejected": 2.206801414489746, "logps/chosen": -425.2882385253906, "logps/rejected": -585.9595336914062, "loss": 1.0282, "rewards/accuracies": 0.5, "rewards/chosen": 1.9650180339813232, "rewards/margins": 0.008340716361999512, "rewards/rejected": 1.9566774368286133, "step": 131 }, { "epoch": 0.08211508553654744, "grad_norm": 18.31060218811035, "learning_rate": 4.099378881987578e-06, "logits/chosen": -0.1761866807937622, "logits/rejected": 4.089554786682129, "logps/chosen": -380.66754150390625, "logps/rejected": -706.0594482421875, "loss": 1.3029, "rewards/accuracies": 0.5, "rewards/chosen": 2.057408332824707, "rewards/margins": 0.4352431893348694, "rewards/rejected": 1.6221649646759033, "step": 132 }, { "epoch": 0.08273716951788491, "grad_norm": 8.656877517700195, "learning_rate": 4.130434782608696e-06, "logits/chosen": -2.470470666885376, "logits/rejected": 2.555208206176758, "logps/chosen": -157.4940185546875, "logps/rejected": -460.08282470703125, "loss": 0.8588, "rewards/accuracies": 0.75, "rewards/chosen": 1.223191738128662, "rewards/margins": 0.5577363967895508, "rewards/rejected": 0.6654552221298218, "step": 133 }, { "epoch": 0.0833592534992224, "grad_norm": 18.718704223632812, "learning_rate": 4.1614906832298145e-06, "logits/chosen": -0.0035840272903442383, "logits/rejected": 2.8037209510803223, "logps/chosen": -442.99755859375, "logps/rejected": -640.841796875, "loss": 1.8066, "rewards/accuracies": 0.5, "rewards/chosen": 1.6039692163467407, "rewards/margins": -0.202498197555542, "rewards/rejected": 1.8064675331115723, "step": 134 }, { "epoch": 0.08398133748055987, "grad_norm": 8.409269332885742, "learning_rate": 4.192546583850932e-06, "logits/chosen": 0.6121279001235962, "logits/rejected": 3.616358995437622, "logps/chosen": -475.4168701171875, "logps/rejected": -649.7413330078125, "loss": 0.3765, "rewards/accuracies": 0.875, "rewards/chosen": 1.1593486070632935, "rewards/margins": 1.5151525735855103, "rewards/rejected": -0.35580405592918396, "step": 135 }, { "epoch": 0.08460342146189735, "grad_norm": 15.40246295928955, "learning_rate": 4.22360248447205e-06, "logits/chosen": -0.3376844525337219, "logits/rejected": 2.6802141666412354, "logps/chosen": -415.5157470703125, "logps/rejected": -546.9631958007812, "loss": 1.1904, "rewards/accuracies": 0.5, "rewards/chosen": 0.4231369197368622, "rewards/margins": 0.5718798041343689, "rewards/rejected": -0.14874297380447388, "step": 136 }, { "epoch": 0.08522550544323483, "grad_norm": 13.815793991088867, "learning_rate": 4.254658385093168e-06, "logits/chosen": 0.4633938670158386, "logits/rejected": 3.006338596343994, "logps/chosen": -387.02801513671875, "logps/rejected": -560.1219482421875, "loss": 0.7988, "rewards/accuracies": 0.625, "rewards/chosen": 1.1222738027572632, "rewards/margins": 1.3678871393203735, "rewards/rejected": -0.24561332166194916, "step": 137 }, { "epoch": 0.08584758942457232, "grad_norm": 13.101861000061035, "learning_rate": 4.2857142857142855e-06, "logits/chosen": -0.850897490978241, "logits/rejected": 2.131795883178711, "logps/chosen": -514.452880859375, "logps/rejected": -722.8804321289062, "loss": 0.6463, "rewards/accuracies": 0.75, "rewards/chosen": 1.8776485919952393, "rewards/margins": 1.0855976343154907, "rewards/rejected": 0.7920509576797485, "step": 138 }, { "epoch": 0.0864696734059098, "grad_norm": 11.072582244873047, "learning_rate": 4.316770186335404e-06, "logits/chosen": -1.0411274433135986, "logits/rejected": 1.7559654712677002, "logps/chosen": -414.7952575683594, "logps/rejected": -646.4049072265625, "loss": 0.9405, "rewards/accuracies": 0.625, "rewards/chosen": 0.5711038112640381, "rewards/margins": 1.2949190139770508, "rewards/rejected": -0.7238152027130127, "step": 139 }, { "epoch": 0.08709175738724728, "grad_norm": 15.693482398986816, "learning_rate": 4.347826086956522e-06, "logits/chosen": -1.0289599895477295, "logits/rejected": 2.5186283588409424, "logps/chosen": -497.6200866699219, "logps/rejected": -742.9861450195312, "loss": 0.9332, "rewards/accuracies": 0.5, "rewards/chosen": 1.6734346151351929, "rewards/margins": 1.1827820539474487, "rewards/rejected": 0.49065250158309937, "step": 140 }, { "epoch": 0.08771384136858476, "grad_norm": 7.858894348144531, "learning_rate": 4.37888198757764e-06, "logits/chosen": -0.7212369441986084, "logits/rejected": 2.2959365844726562, "logps/chosen": -403.8475341796875, "logps/rejected": -642.38671875, "loss": 0.217, "rewards/accuracies": 0.875, "rewards/chosen": 1.161757469177246, "rewards/margins": 2.856011152267456, "rewards/rejected": -1.6942535638809204, "step": 141 }, { "epoch": 0.08833592534992224, "grad_norm": 14.733773231506348, "learning_rate": 4.409937888198758e-06, "logits/chosen": 0.1742795705795288, "logits/rejected": 3.587676763534546, "logps/chosen": -384.32366943359375, "logps/rejected": -565.3023681640625, "loss": 0.921, "rewards/accuracies": 0.75, "rewards/chosen": 0.8741454482078552, "rewards/margins": 0.5108630657196045, "rewards/rejected": 0.36328238248825073, "step": 142 }, { "epoch": 0.08895800933125972, "grad_norm": 11.316405296325684, "learning_rate": 4.4409937888198765e-06, "logits/chosen": -0.8727976083755493, "logits/rejected": 1.323494791984558, "logps/chosen": -451.0741882324219, "logps/rejected": -636.7959594726562, "loss": 0.5144, "rewards/accuracies": 0.75, "rewards/chosen": 2.957265853881836, "rewards/margins": 1.7889721393585205, "rewards/rejected": 1.168293833732605, "step": 143 }, { "epoch": 0.0895800933125972, "grad_norm": 12.712139129638672, "learning_rate": 4.472049689440994e-06, "logits/chosen": -1.3119699954986572, "logits/rejected": 2.372183322906494, "logps/chosen": -407.4181823730469, "logps/rejected": -595.0040893554688, "loss": 0.6512, "rewards/accuracies": 0.75, "rewards/chosen": 0.17220136523246765, "rewards/margins": 2.198647975921631, "rewards/rejected": -2.026446580886841, "step": 144 }, { "epoch": 0.09020217729393468, "grad_norm": 11.397430419921875, "learning_rate": 4.503105590062112e-06, "logits/chosen": -2.4750256538391113, "logits/rejected": 3.215958595275879, "logps/chosen": -212.1233673095703, "logps/rejected": -545.7522583007812, "loss": 1.017, "rewards/accuracies": 0.625, "rewards/chosen": 0.6188298463821411, "rewards/margins": 0.5848968029022217, "rewards/rejected": 0.03393308073282242, "step": 145 }, { "epoch": 0.09082426127527216, "grad_norm": 10.805316925048828, "learning_rate": 4.534161490683231e-06, "logits/chosen": -0.4496426582336426, "logits/rejected": 1.0504510402679443, "logps/chosen": -374.22808837890625, "logps/rejected": -502.6123352050781, "loss": 0.4702, "rewards/accuracies": 0.75, "rewards/chosen": 0.19181381165981293, "rewards/margins": 2.4179444313049316, "rewards/rejected": -2.226130485534668, "step": 146 }, { "epoch": 0.09144634525660965, "grad_norm": 10.55547046661377, "learning_rate": 4.565217391304348e-06, "logits/chosen": -1.8848826885223389, "logits/rejected": 1.359985589981079, "logps/chosen": -299.04547119140625, "logps/rejected": -568.8724365234375, "loss": 0.6872, "rewards/accuracies": 0.625, "rewards/chosen": -1.0375633239746094, "rewards/margins": 1.69275963306427, "rewards/rejected": -2.730323076248169, "step": 147 }, { "epoch": 0.09206842923794713, "grad_norm": 21.383508682250977, "learning_rate": 4.596273291925466e-06, "logits/chosen": -0.8560746908187866, "logits/rejected": 2.637990951538086, "logps/chosen": -386.06591796875, "logps/rejected": -598.1448974609375, "loss": 1.7864, "rewards/accuracies": 0.25, "rewards/chosen": 0.4232664704322815, "rewards/margins": -1.181652307510376, "rewards/rejected": 1.6049187183380127, "step": 148 }, { "epoch": 0.0926905132192846, "grad_norm": 14.351357460021973, "learning_rate": 4.627329192546584e-06, "logits/chosen": -1.2320556640625, "logits/rejected": 2.324796676635742, "logps/chosen": -422.949462890625, "logps/rejected": -599.7335205078125, "loss": 1.1846, "rewards/accuracies": 0.625, "rewards/chosen": 0.42700445652008057, "rewards/margins": 1.3264269828796387, "rewards/rejected": -0.8994225263595581, "step": 149 }, { "epoch": 0.09331259720062209, "grad_norm": 12.767666816711426, "learning_rate": 4.6583850931677025e-06, "logits/chosen": -0.617587685585022, "logits/rejected": 2.7065675258636475, "logps/chosen": -476.66656494140625, "logps/rejected": -669.7945556640625, "loss": 0.686, "rewards/accuracies": 0.5, "rewards/chosen": 1.266343593597412, "rewards/margins": 0.8541361689567566, "rewards/rejected": 0.4122074246406555, "step": 150 }, { "epoch": 0.09393468118195956, "grad_norm": 11.05392074584961, "learning_rate": 4.68944099378882e-06, "logits/chosen": 0.2834520936012268, "logits/rejected": 0.576147198677063, "logps/chosen": -526.546142578125, "logps/rejected": -558.4017944335938, "loss": 0.7869, "rewards/accuracies": 0.75, "rewards/chosen": 1.6659679412841797, "rewards/margins": 2.798304319381714, "rewards/rejected": -1.1323366165161133, "step": 151 }, { "epoch": 0.09455676516329704, "grad_norm": 8.572049140930176, "learning_rate": 4.7204968944099384e-06, "logits/chosen": 0.27595990896224976, "logits/rejected": 3.129425287246704, "logps/chosen": -365.931640625, "logps/rejected": -558.4929809570312, "loss": 0.3039, "rewards/accuracies": 0.75, "rewards/chosen": 1.4017826318740845, "rewards/margins": 2.737191677093506, "rewards/rejected": -1.335409164428711, "step": 152 }, { "epoch": 0.09517884914463452, "grad_norm": 13.241445541381836, "learning_rate": 4.751552795031056e-06, "logits/chosen": -1.7318871021270752, "logits/rejected": 0.8788809180259705, "logps/chosen": -382.02313232421875, "logps/rejected": -560.0665893554688, "loss": 0.8126, "rewards/accuracies": 0.625, "rewards/chosen": -0.083403080701828, "rewards/margins": 1.9525655508041382, "rewards/rejected": -2.035968780517578, "step": 153 }, { "epoch": 0.095800933125972, "grad_norm": 17.070524215698242, "learning_rate": 4.782608695652174e-06, "logits/chosen": 1.6011037826538086, "logits/rejected": 3.153493881225586, "logps/chosen": -567.4342651367188, "logps/rejected": -710.6505737304688, "loss": 1.5266, "rewards/accuracies": 0.625, "rewards/chosen": 1.424606442451477, "rewards/margins": 1.378676176071167, "rewards/rejected": 0.04593047499656677, "step": 154 }, { "epoch": 0.09642301710730948, "grad_norm": 12.96839427947998, "learning_rate": 4.813664596273293e-06, "logits/chosen": -2.5613062381744385, "logits/rejected": 0.49821412563323975, "logps/chosen": -292.32916259765625, "logps/rejected": -497.0234375, "loss": 0.8663, "rewards/accuracies": 0.75, "rewards/chosen": -0.8609590530395508, "rewards/margins": 0.8152419924736023, "rewards/rejected": -1.6762011051177979, "step": 155 }, { "epoch": 0.09704510108864697, "grad_norm": 10.087762832641602, "learning_rate": 4.84472049689441e-06, "logits/chosen": -0.324648380279541, "logits/rejected": 3.1282596588134766, "logps/chosen": -515.1690063476562, "logps/rejected": -732.888427734375, "loss": 0.3387, "rewards/accuracies": 0.75, "rewards/chosen": 3.212400436401367, "rewards/margins": 4.046263694763184, "rewards/rejected": -0.8338631391525269, "step": 156 }, { "epoch": 0.09766718506998445, "grad_norm": 10.375144958496094, "learning_rate": 4.875776397515528e-06, "logits/chosen": -1.9850226640701294, "logits/rejected": 2.7046704292297363, "logps/chosen": -309.62445068359375, "logps/rejected": -625.557861328125, "loss": 0.5774, "rewards/accuracies": 0.75, "rewards/chosen": 1.2270424365997314, "rewards/margins": 1.1084333658218384, "rewards/rejected": 0.11860904097557068, "step": 157 }, { "epoch": 0.09828926905132193, "grad_norm": 23.09259605407715, "learning_rate": 4.906832298136646e-06, "logits/chosen": -2.143054962158203, "logits/rejected": 0.9800859093666077, "logps/chosen": -454.3524169921875, "logps/rejected": -680.2357788085938, "loss": 2.6416, "rewards/accuracies": 0.25, "rewards/chosen": 0.20148468017578125, "rewards/margins": -1.6109024286270142, "rewards/rejected": 1.8123871088027954, "step": 158 }, { "epoch": 0.09891135303265941, "grad_norm": 6.472715854644775, "learning_rate": 4.9378881987577645e-06, "logits/chosen": -0.7787084579467773, "logits/rejected": 2.5501129627227783, "logps/chosen": -261.9610595703125, "logps/rejected": -528.9261474609375, "loss": 0.1461, "rewards/accuracies": 0.875, "rewards/chosen": 0.6717526912689209, "rewards/margins": 2.9916112422943115, "rewards/rejected": -2.3198585510253906, "step": 159 }, { "epoch": 0.09953343701399689, "grad_norm": 5.089502334594727, "learning_rate": 4.968944099378882e-06, "logits/chosen": -0.957655131816864, "logits/rejected": 1.6360690593719482, "logps/chosen": -411.20196533203125, "logps/rejected": -601.5455932617188, "loss": 0.1636, "rewards/accuracies": 0.875, "rewards/chosen": 0.9276119470596313, "rewards/margins": 4.392796993255615, "rewards/rejected": -3.4651849269866943, "step": 160 }, { "epoch": 0.10015552099533437, "grad_norm": 15.581966400146484, "learning_rate": 5e-06, "logits/chosen": -1.0357143878936768, "logits/rejected": 0.9435085654258728, "logps/chosen": -558.873779296875, "logps/rejected": -674.39794921875, "loss": 1.0018, "rewards/accuracies": 0.625, "rewards/chosen": -0.002629697322845459, "rewards/margins": 1.4850234985351562, "rewards/rejected": -1.4876528978347778, "step": 161 }, { "epoch": 0.10077760497667185, "grad_norm": 22.235620498657227, "learning_rate": 4.996542185338866e-06, "logits/chosen": 1.3053479194641113, "logits/rejected": 3.2917628288269043, "logps/chosen": -562.901611328125, "logps/rejected": -725.4825439453125, "loss": 1.0941, "rewards/accuracies": 0.75, "rewards/chosen": 2.026945114135742, "rewards/margins": 4.661256790161133, "rewards/rejected": -2.6343116760253906, "step": 162 }, { "epoch": 0.10139968895800933, "grad_norm": 16.269826889038086, "learning_rate": 4.993084370677732e-06, "logits/chosen": -0.13778042793273926, "logits/rejected": 1.7080318927764893, "logps/chosen": -560.675537109375, "logps/rejected": -732.7532348632812, "loss": 0.987, "rewards/accuracies": 0.625, "rewards/chosen": 1.0941566228866577, "rewards/margins": 1.1310991048812866, "rewards/rejected": -0.03694245219230652, "step": 163 }, { "epoch": 0.1020217729393468, "grad_norm": 9.44483470916748, "learning_rate": 4.989626556016598e-06, "logits/chosen": 0.11922720074653625, "logits/rejected": 1.5642731189727783, "logps/chosen": -399.0444641113281, "logps/rejected": -523.547607421875, "loss": 0.2493, "rewards/accuracies": 0.875, "rewards/chosen": 0.4019642472267151, "rewards/margins": 4.293483257293701, "rewards/rejected": -3.8915188312530518, "step": 164 }, { "epoch": 0.1026438569206843, "grad_norm": 11.563282012939453, "learning_rate": 4.986168741355464e-06, "logits/chosen": -2.694180727005005, "logits/rejected": -0.5539831519126892, "logps/chosen": -391.9232177734375, "logps/rejected": -514.771728515625, "loss": 0.469, "rewards/accuracies": 0.75, "rewards/chosen": -0.039527103304862976, "rewards/margins": 3.2702486515045166, "rewards/rejected": -3.3097758293151855, "step": 165 }, { "epoch": 0.10326594090202178, "grad_norm": 11.740907669067383, "learning_rate": 4.98271092669433e-06, "logits/chosen": -1.3099642992019653, "logits/rejected": 3.243171453475952, "logps/chosen": -395.3979797363281, "logps/rejected": -695.2596435546875, "loss": 0.7001, "rewards/accuracies": 0.5, "rewards/chosen": 0.45124369859695435, "rewards/margins": 1.1800510883331299, "rewards/rejected": -0.7288073301315308, "step": 166 }, { "epoch": 0.10388802488335926, "grad_norm": 20.14325523376465, "learning_rate": 4.979253112033195e-06, "logits/chosen": -1.3297227621078491, "logits/rejected": 2.5647099018096924, "logps/chosen": -491.02099609375, "logps/rejected": -732.8243408203125, "loss": 1.6841, "rewards/accuracies": 0.5, "rewards/chosen": -1.8422999382019043, "rewards/margins": -0.32352250814437866, "rewards/rejected": -1.51877760887146, "step": 167 }, { "epoch": 0.10451010886469674, "grad_norm": 10.54570198059082, "learning_rate": 4.975795297372061e-06, "logits/chosen": -0.0797119140625, "logits/rejected": 1.8367505073547363, "logps/chosen": -460.3702697753906, "logps/rejected": -640.3387451171875, "loss": 0.8308, "rewards/accuracies": 0.875, "rewards/chosen": -0.462015837430954, "rewards/margins": 4.400640487670898, "rewards/rejected": -4.862656116485596, "step": 168 }, { "epoch": 0.10513219284603421, "grad_norm": 9.544731140136719, "learning_rate": 4.9723374827109275e-06, "logits/chosen": -1.1494858264923096, "logits/rejected": 1.6733912229537964, "logps/chosen": -423.3375244140625, "logps/rejected": -623.7045288085938, "loss": 0.626, "rewards/accuracies": 0.875, "rewards/chosen": 0.3805038332939148, "rewards/margins": 3.891188144683838, "rewards/rejected": -3.5106844902038574, "step": 169 }, { "epoch": 0.1057542768273717, "grad_norm": 4.27135705947876, "learning_rate": 4.968879668049793e-06, "logits/chosen": 0.2885769009590149, "logits/rejected": 0.2936728000640869, "logps/chosen": -563.3328247070312, "logps/rejected": -575.6583251953125, "loss": 0.146, "rewards/accuracies": 1.0, "rewards/chosen": 0.7351505756378174, "rewards/margins": 3.960043430328369, "rewards/rejected": -3.2248926162719727, "step": 170 }, { "epoch": 0.10637636080870917, "grad_norm": 4.6370368003845215, "learning_rate": 4.965421853388659e-06, "logits/chosen": -3.333397626876831, "logits/rejected": 1.3425174951553345, "logps/chosen": -408.08197021484375, "logps/rejected": -709.070556640625, "loss": 0.1404, "rewards/accuracies": 1.0, "rewards/chosen": 0.24222341179847717, "rewards/margins": 3.377171516418457, "rewards/rejected": -3.1349480152130127, "step": 171 }, { "epoch": 0.10699844479004665, "grad_norm": 8.306254386901855, "learning_rate": 4.9619640387275245e-06, "logits/chosen": 0.13803964853286743, "logits/rejected": 2.2443158626556396, "logps/chosen": -437.39947509765625, "logps/rejected": -626.3837280273438, "loss": 0.2762, "rewards/accuracies": 0.875, "rewards/chosen": 0.4506424069404602, "rewards/margins": 5.467344284057617, "rewards/rejected": -5.016701698303223, "step": 172 }, { "epoch": 0.10762052877138413, "grad_norm": 0.49179601669311523, "learning_rate": 4.95850622406639e-06, "logits/chosen": 0.8681771755218506, "logits/rejected": 1.6467424631118774, "logps/chosen": -629.63671875, "logps/rejected": -713.7977294921875, "loss": 0.0075, "rewards/accuracies": 1.0, "rewards/chosen": 1.161713719367981, "rewards/margins": 7.064383506774902, "rewards/rejected": -5.902669429779053, "step": 173 }, { "epoch": 0.10824261275272162, "grad_norm": 14.40332317352295, "learning_rate": 4.955048409405257e-06, "logits/chosen": -2.794297456741333, "logits/rejected": 1.9084908962249756, "logps/chosen": -373.3389892578125, "logps/rejected": -710.1351928710938, "loss": 0.5471, "rewards/accuracies": 0.75, "rewards/chosen": 0.26065561175346375, "rewards/margins": 2.1540377140045166, "rewards/rejected": -1.8933820724487305, "step": 174 }, { "epoch": 0.1088646967340591, "grad_norm": 11.906153678894043, "learning_rate": 4.951590594744122e-06, "logits/chosen": -2.1527581214904785, "logits/rejected": 2.1399941444396973, "logps/chosen": -204.360595703125, "logps/rejected": -560.0633544921875, "loss": 1.0301, "rewards/accuracies": 0.75, "rewards/chosen": 0.39860033988952637, "rewards/margins": 3.320188522338867, "rewards/rejected": -2.921588182449341, "step": 175 }, { "epoch": 0.10948678071539658, "grad_norm": 15.793335914611816, "learning_rate": 4.948132780082988e-06, "logits/chosen": -1.0387338399887085, "logits/rejected": 0.03949415683746338, "logps/chosen": -533.9537963867188, "logps/rejected": -646.6304321289062, "loss": 1.1681, "rewards/accuracies": 0.75, "rewards/chosen": -0.05117526650428772, "rewards/margins": 1.1503212451934814, "rewards/rejected": -1.2014964818954468, "step": 176 }, { "epoch": 0.11010886469673406, "grad_norm": 14.857304573059082, "learning_rate": 4.944674965421854e-06, "logits/chosen": -0.4052245616912842, "logits/rejected": 2.6427154541015625, "logps/chosen": -468.0205078125, "logps/rejected": -594.4483642578125, "loss": 0.581, "rewards/accuracies": 0.75, "rewards/chosen": 0.29150068759918213, "rewards/margins": 4.175938606262207, "rewards/rejected": -3.8844382762908936, "step": 177 }, { "epoch": 0.11073094867807154, "grad_norm": 6.707697868347168, "learning_rate": 4.94121715076072e-06, "logits/chosen": -2.06303071975708, "logits/rejected": 2.870866298675537, "logps/chosen": -211.53378295898438, "logps/rejected": -536.5917358398438, "loss": 0.2274, "rewards/accuracies": 0.875, "rewards/chosen": 0.23200228810310364, "rewards/margins": 3.0072736740112305, "rewards/rejected": -2.775271415710449, "step": 178 }, { "epoch": 0.11135303265940902, "grad_norm": 10.638102531433105, "learning_rate": 4.937759336099586e-06, "logits/chosen": 1.2498904466629028, "logits/rejected": 3.804900646209717, "logps/chosen": -438.4925842285156, "logps/rejected": -646.5635375976562, "loss": 0.4775, "rewards/accuracies": 0.875, "rewards/chosen": 0.23360222578048706, "rewards/margins": 2.6722140312194824, "rewards/rejected": -2.438612222671509, "step": 179 }, { "epoch": 0.1119751166407465, "grad_norm": 20.042150497436523, "learning_rate": 4.934301521438452e-06, "logits/chosen": -2.7955029010772705, "logits/rejected": 0.9656684398651123, "logps/chosen": -365.96697998046875, "logps/rejected": -628.2977294921875, "loss": 1.1672, "rewards/accuracies": 0.5, "rewards/chosen": 1.254376769065857, "rewards/margins": 0.8498780131340027, "rewards/rejected": 0.4044986963272095, "step": 180 }, { "epoch": 0.11259720062208398, "grad_norm": 7.192320823669434, "learning_rate": 4.930843706777317e-06, "logits/chosen": 0.3417157232761383, "logits/rejected": 2.7342491149902344, "logps/chosen": -524.7584838867188, "logps/rejected": -713.7581176757812, "loss": 0.2595, "rewards/accuracies": 0.875, "rewards/chosen": 0.3835466504096985, "rewards/margins": 4.255807876586914, "rewards/rejected": -3.8722615242004395, "step": 181 }, { "epoch": 0.11321928460342146, "grad_norm": 12.767531394958496, "learning_rate": 4.927385892116183e-06, "logits/chosen": -1.8989653587341309, "logits/rejected": 1.1872360706329346, "logps/chosen": -377.1787414550781, "logps/rejected": -687.2525634765625, "loss": 0.3572, "rewards/accuracies": 0.875, "rewards/chosen": -1.268122673034668, "rewards/margins": 3.9153528213500977, "rewards/rejected": -5.183475494384766, "step": 182 }, { "epoch": 0.11384136858475895, "grad_norm": 1.044492483139038, "learning_rate": 4.9239280774550495e-06, "logits/chosen": -0.3749435842037201, "logits/rejected": 1.9665062427520752, "logps/chosen": -397.6271667480469, "logps/rejected": -655.5355224609375, "loss": 0.0314, "rewards/accuracies": 1.0, "rewards/chosen": 0.6718639731407166, "rewards/margins": 4.1324567794799805, "rewards/rejected": -3.4605932235717773, "step": 183 }, { "epoch": 0.11446345256609643, "grad_norm": 4.992862224578857, "learning_rate": 4.920470262793914e-06, "logits/chosen": 0.6102297306060791, "logits/rejected": 2.988198757171631, "logps/chosen": -443.219970703125, "logps/rejected": -631.0574951171875, "loss": 0.1264, "rewards/accuracies": 1.0, "rewards/chosen": 0.27231788635253906, "rewards/margins": 4.997542381286621, "rewards/rejected": -4.725224018096924, "step": 184 }, { "epoch": 0.1150855365474339, "grad_norm": 10.217452049255371, "learning_rate": 4.91701244813278e-06, "logits/chosen": -0.2010265588760376, "logits/rejected": 2.3122220039367676, "logps/chosen": -429.9006652832031, "logps/rejected": -635.1914672851562, "loss": 0.7406, "rewards/accuracies": 0.75, "rewards/chosen": 0.5990985631942749, "rewards/margins": 1.060754418373108, "rewards/rejected": -0.46165579557418823, "step": 185 }, { "epoch": 0.11570762052877138, "grad_norm": 9.368711471557617, "learning_rate": 4.913554633471646e-06, "logits/chosen": 0.16227489709854126, "logits/rejected": 2.135777473449707, "logps/chosen": -487.630859375, "logps/rejected": -654.1341552734375, "loss": 0.3744, "rewards/accuracies": 0.875, "rewards/chosen": 0.08759135007858276, "rewards/margins": 4.625253200531006, "rewards/rejected": -4.537662029266357, "step": 186 }, { "epoch": 0.11632970451010886, "grad_norm": 11.765965461730957, "learning_rate": 4.910096818810512e-06, "logits/chosen": -1.6820456981658936, "logits/rejected": 1.7127372026443481, "logps/chosen": -383.8602600097656, "logps/rejected": -607.8425903320312, "loss": 0.6451, "rewards/accuracies": 0.875, "rewards/chosen": -1.9297001361846924, "rewards/margins": 3.5949037075042725, "rewards/rejected": -5.524603843688965, "step": 187 }, { "epoch": 0.11695178849144634, "grad_norm": 6.174111366271973, "learning_rate": 4.906639004149378e-06, "logits/chosen": -1.045624017715454, "logits/rejected": 2.223209857940674, "logps/chosen": -421.2188415527344, "logps/rejected": -642.3206787109375, "loss": 0.2512, "rewards/accuracies": 0.875, "rewards/chosen": 0.5733550786972046, "rewards/margins": 3.576486587524414, "rewards/rejected": -3.00313138961792, "step": 188 }, { "epoch": 0.11757387247278382, "grad_norm": 9.631282806396484, "learning_rate": 4.9031811894882435e-06, "logits/chosen": 1.1736887693405151, "logits/rejected": 2.6334545612335205, "logps/chosen": -497.10406494140625, "logps/rejected": -629.0310668945312, "loss": 0.3164, "rewards/accuracies": 0.875, "rewards/chosen": -3.5284926891326904, "rewards/margins": 3.3808839321136475, "rewards/rejected": -6.909377098083496, "step": 189 }, { "epoch": 0.1181959564541213, "grad_norm": 15.640495300292969, "learning_rate": 4.899723374827109e-06, "logits/chosen": 1.6573617458343506, "logits/rejected": 1.9541635513305664, "logps/chosen": -663.0643920898438, "logps/rejected": -642.950927734375, "loss": 0.597, "rewards/accuracies": 0.625, "rewards/chosen": -0.5703495144844055, "rewards/margins": 3.1099233627319336, "rewards/rejected": -3.6802730560302734, "step": 190 }, { "epoch": 0.1188180404354588, "grad_norm": 20.173229217529297, "learning_rate": 4.896265560165976e-06, "logits/chosen": -1.9917049407958984, "logits/rejected": 2.3866207599639893, "logps/chosen": -322.85882568359375, "logps/rejected": -655.4404296875, "loss": 1.1819, "rewards/accuracies": 0.625, "rewards/chosen": 0.21187257766723633, "rewards/margins": 2.4619815349578857, "rewards/rejected": -2.2501091957092285, "step": 191 }, { "epoch": 0.11944012441679627, "grad_norm": 13.255785942077637, "learning_rate": 4.892807745504841e-06, "logits/chosen": -2.89833927154541, "logits/rejected": 1.6764167547225952, "logps/chosen": -475.44830322265625, "logps/rejected": -808.4315185546875, "loss": 0.6981, "rewards/accuracies": 0.625, "rewards/chosen": -1.3834810256958008, "rewards/margins": 1.547243356704712, "rewards/rejected": -2.9307243824005127, "step": 192 }, { "epoch": 0.12006220839813375, "grad_norm": 7.948220252990723, "learning_rate": 4.889349930843707e-06, "logits/chosen": -0.6844046115875244, "logits/rejected": 2.8930535316467285, "logps/chosen": -392.8423767089844, "logps/rejected": -679.4092407226562, "loss": 0.2273, "rewards/accuracies": 0.875, "rewards/chosen": -0.9696107506752014, "rewards/margins": 3.070042133331299, "rewards/rejected": -4.0396528244018555, "step": 193 }, { "epoch": 0.12068429237947123, "grad_norm": 9.143976211547852, "learning_rate": 4.885892116182573e-06, "logits/chosen": -0.8472806811332703, "logits/rejected": 1.000624418258667, "logps/chosen": -500.81158447265625, "logps/rejected": -794.472900390625, "loss": 0.7064, "rewards/accuracies": 0.875, "rewards/chosen": -0.474944144487381, "rewards/margins": 6.372384071350098, "rewards/rejected": -6.847327709197998, "step": 194 }, { "epoch": 0.12130637636080871, "grad_norm": 18.40662956237793, "learning_rate": 4.882434301521438e-06, "logits/chosen": -2.3656044006347656, "logits/rejected": -0.2138967216014862, "logps/chosen": -432.4219665527344, "logps/rejected": -618.6937866210938, "loss": 1.3552, "rewards/accuracies": 0.5, "rewards/chosen": -0.8293434381484985, "rewards/margins": 0.6811577677726746, "rewards/rejected": -1.5105011463165283, "step": 195 }, { "epoch": 0.12192846034214619, "grad_norm": 8.236791610717773, "learning_rate": 4.878976486860305e-06, "logits/chosen": 0.004116415977478027, "logits/rejected": 1.8698561191558838, "logps/chosen": -594.337646484375, "logps/rejected": -689.307861328125, "loss": 0.2515, "rewards/accuracies": 0.875, "rewards/chosen": -2.322143316268921, "rewards/margins": 2.3369507789611816, "rewards/rejected": -4.659093856811523, "step": 196 }, { "epoch": 0.12255054432348367, "grad_norm": 3.7814929485321045, "learning_rate": 4.875518672199171e-06, "logits/chosen": 0.23666934669017792, "logits/rejected": 3.122429847717285, "logps/chosen": -402.2366027832031, "logps/rejected": -565.245361328125, "loss": 0.1755, "rewards/accuracies": 1.0, "rewards/chosen": -0.054704517126083374, "rewards/margins": 3.465261220932007, "rewards/rejected": -3.5199661254882812, "step": 197 }, { "epoch": 0.12317262830482115, "grad_norm": 5.948967933654785, "learning_rate": 4.872060857538036e-06, "logits/chosen": 1.4310553073883057, "logits/rejected": 0.11027556657791138, "logps/chosen": -610.373779296875, "logps/rejected": -614.8685302734375, "loss": 0.1099, "rewards/accuracies": 1.0, "rewards/chosen": -0.19167746603488922, "rewards/margins": 5.161076545715332, "rewards/rejected": -5.35275411605835, "step": 198 }, { "epoch": 0.12379471228615863, "grad_norm": 18.564851760864258, "learning_rate": 4.868603042876902e-06, "logits/chosen": -0.3558012545108795, "logits/rejected": 1.9788715839385986, "logps/chosen": -550.2830200195312, "logps/rejected": -730.8470458984375, "loss": 2.1051, "rewards/accuracies": 0.625, "rewards/chosen": -0.35308414697647095, "rewards/margins": 0.11213564872741699, "rewards/rejected": -0.4652198553085327, "step": 199 }, { "epoch": 0.12441679626749612, "grad_norm": 2.9045567512512207, "learning_rate": 4.865145228215768e-06, "logits/chosen": -0.9682158827781677, "logits/rejected": 1.2349064350128174, "logps/chosen": -468.053955078125, "logps/rejected": -628.5726928710938, "loss": 0.0486, "rewards/accuracies": 1.0, "rewards/chosen": -1.8281490802764893, "rewards/margins": 5.6922807693481445, "rewards/rejected": -7.520429611206055, "step": 200 }, { "epoch": 0.12503888024883358, "grad_norm": 15.407296180725098, "learning_rate": 4.861687413554634e-06, "logits/chosen": 0.3468204736709595, "logits/rejected": 3.4497952461242676, "logps/chosen": -475.4268798828125, "logps/rejected": -657.09375, "loss": 1.6377, "rewards/accuracies": 0.75, "rewards/chosen": -1.3248292207717896, "rewards/margins": 3.176119327545166, "rewards/rejected": -4.500948429107666, "step": 201 }, { "epoch": 0.12566096423017106, "grad_norm": 9.55247974395752, "learning_rate": 4.8582295988935e-06, "logits/chosen": -2.791116714477539, "logits/rejected": 0.8328390717506409, "logps/chosen": -429.3715515136719, "logps/rejected": -756.6092529296875, "loss": 0.3853, "rewards/accuracies": 0.75, "rewards/chosen": -2.547234535217285, "rewards/margins": 6.244791030883789, "rewards/rejected": -8.792025566101074, "step": 202 }, { "epoch": 0.12628304821150854, "grad_norm": 18.725902557373047, "learning_rate": 4.8547717842323655e-06, "logits/chosen": 0.3372895419597626, "logits/rejected": 1.680553913116455, "logps/chosen": -549.9654541015625, "logps/rejected": -614.7357177734375, "loss": 0.8746, "rewards/accuracies": 0.625, "rewards/chosen": -2.104962110519409, "rewards/margins": 2.173964023590088, "rewards/rejected": -4.278926372528076, "step": 203 }, { "epoch": 0.12690513219284602, "grad_norm": 9.936744689941406, "learning_rate": 4.851313969571231e-06, "logits/chosen": -2.9147789478302, "logits/rejected": 2.3378725051879883, "logps/chosen": -428.87200927734375, "logps/rejected": -869.8580322265625, "loss": 0.2994, "rewards/accuracies": 0.875, "rewards/chosen": -1.0714854001998901, "rewards/margins": 4.843810081481934, "rewards/rejected": -5.915295600891113, "step": 204 }, { "epoch": 0.12752721617418353, "grad_norm": 1.7339391708374023, "learning_rate": 4.847856154910097e-06, "logits/chosen": -0.47420066595077515, "logits/rejected": 1.1462591886520386, "logps/chosen": -414.355712890625, "logps/rejected": -561.3446655273438, "loss": 0.0358, "rewards/accuracies": 1.0, "rewards/chosen": -2.2788870334625244, "rewards/margins": 4.748244285583496, "rewards/rejected": -7.027131080627441, "step": 205 }, { "epoch": 0.128149300155521, "grad_norm": 3.1085612773895264, "learning_rate": 4.844398340248963e-06, "logits/chosen": -0.716675341129303, "logits/rejected": 0.7956051826477051, "logps/chosen": -445.4654235839844, "logps/rejected": -544.9368896484375, "loss": 0.1565, "rewards/accuracies": 0.875, "rewards/chosen": -1.7581572532653809, "rewards/margins": 3.530287742614746, "rewards/rejected": -5.288445472717285, "step": 206 }, { "epoch": 0.12877138413685849, "grad_norm": 9.75969123840332, "learning_rate": 4.840940525587829e-06, "logits/chosen": 0.7754518985748291, "logits/rejected": 3.400249481201172, "logps/chosen": -555.78125, "logps/rejected": -776.49560546875, "loss": 0.421, "rewards/accuracies": 0.875, "rewards/chosen": -3.540207862854004, "rewards/margins": 3.8135573863983154, "rewards/rejected": -7.353765487670898, "step": 207 }, { "epoch": 0.12939346811819596, "grad_norm": 15.840627670288086, "learning_rate": 4.837482710926695e-06, "logits/chosen": -0.13957789540290833, "logits/rejected": 3.3315048217773438, "logps/chosen": -355.3377685546875, "logps/rejected": -520.0291748046875, "loss": 1.5657, "rewards/accuracies": 0.5, "rewards/chosen": -2.917501926422119, "rewards/margins": 1.4613392353057861, "rewards/rejected": -4.378841400146484, "step": 208 }, { "epoch": 0.13001555209953344, "grad_norm": 12.050606727600098, "learning_rate": 4.83402489626556e-06, "logits/chosen": -2.1472315788269043, "logits/rejected": 0.3481689691543579, "logps/chosen": -424.21826171875, "logps/rejected": -628.8707885742188, "loss": 0.6088, "rewards/accuracies": 0.875, "rewards/chosen": -3.3607163429260254, "rewards/margins": 3.531402587890625, "rewards/rejected": -6.89211893081665, "step": 209 }, { "epoch": 0.13063763608087092, "grad_norm": 10.779645919799805, "learning_rate": 4.830567081604426e-06, "logits/chosen": -0.9528994560241699, "logits/rejected": 0.31826186180114746, "logps/chosen": -609.6506958007812, "logps/rejected": -694.06591796875, "loss": 0.2971, "rewards/accuracies": 0.875, "rewards/chosen": -2.4385464191436768, "rewards/margins": 6.031463623046875, "rewards/rejected": -8.470009803771973, "step": 210 }, { "epoch": 0.1312597200622084, "grad_norm": 10.583514213562012, "learning_rate": 4.827109266943293e-06, "logits/chosen": -0.07246696949005127, "logits/rejected": 2.1670680046081543, "logps/chosen": -526.3909301757812, "logps/rejected": -745.4481811523438, "loss": 0.3566, "rewards/accuracies": 0.875, "rewards/chosen": -1.604605793952942, "rewards/margins": 6.669104099273682, "rewards/rejected": -8.273710250854492, "step": 211 }, { "epoch": 0.13188180404354588, "grad_norm": 9.765030860900879, "learning_rate": 4.823651452282158e-06, "logits/chosen": 0.07162562012672424, "logits/rejected": 0.8539394736289978, "logps/chosen": -586.5045166015625, "logps/rejected": -691.008544921875, "loss": 0.4951, "rewards/accuracies": 0.875, "rewards/chosen": -0.9089323282241821, "rewards/margins": 2.843183755874634, "rewards/rejected": -3.7521159648895264, "step": 212 }, { "epoch": 0.13250388802488336, "grad_norm": 15.420610427856445, "learning_rate": 4.820193637621024e-06, "logits/chosen": -0.3238777816295624, "logits/rejected": 0.050200819969177246, "logps/chosen": -523.7664794921875, "logps/rejected": -530.4090576171875, "loss": 0.8864, "rewards/accuracies": 0.625, "rewards/chosen": -1.9911940097808838, "rewards/margins": 1.9262293577194214, "rewards/rejected": -3.9174232482910156, "step": 213 }, { "epoch": 0.13312597200622084, "grad_norm": 12.0591459274292, "learning_rate": 4.81673582295989e-06, "logits/chosen": 0.23275530338287354, "logits/rejected": 2.638697624206543, "logps/chosen": -521.8024291992188, "logps/rejected": -711.636962890625, "loss": 0.6297, "rewards/accuracies": 0.625, "rewards/chosen": -2.2837510108947754, "rewards/margins": 2.9486207962036133, "rewards/rejected": -5.232371807098389, "step": 214 }, { "epoch": 0.13374805598755832, "grad_norm": 12.24421215057373, "learning_rate": 4.813278008298755e-06, "logits/chosen": -1.943312644958496, "logits/rejected": 1.404650092124939, "logps/chosen": -481.767578125, "logps/rejected": -747.7747192382812, "loss": 0.3337, "rewards/accuracies": 0.875, "rewards/chosen": -1.4595973491668701, "rewards/margins": 4.230775356292725, "rewards/rejected": -5.690372467041016, "step": 215 }, { "epoch": 0.1343701399688958, "grad_norm": 7.274106025695801, "learning_rate": 4.809820193637622e-06, "logits/chosen": -2.304079055786133, "logits/rejected": -0.4958508610725403, "logps/chosen": -419.9479675292969, "logps/rejected": -549.1765747070312, "loss": 0.3539, "rewards/accuracies": 0.875, "rewards/chosen": -0.7894752025604248, "rewards/margins": 4.034345626831055, "rewards/rejected": -4.8238205909729, "step": 216 }, { "epoch": 0.13499222395023328, "grad_norm": 16.901384353637695, "learning_rate": 4.8063623789764875e-06, "logits/chosen": -2.287665605545044, "logits/rejected": 1.6443270444869995, "logps/chosen": -359.76348876953125, "logps/rejected": -691.4766845703125, "loss": 1.8537, "rewards/accuracies": 0.625, "rewards/chosen": -1.9978723526000977, "rewards/margins": 0.8887101411819458, "rewards/rejected": -2.886582612991333, "step": 217 }, { "epoch": 0.13561430793157075, "grad_norm": 9.45565128326416, "learning_rate": 4.802904564315353e-06, "logits/chosen": 0.05842161178588867, "logits/rejected": 1.6737098693847656, "logps/chosen": -347.2401123046875, "logps/rejected": -496.21124267578125, "loss": 0.3585, "rewards/accuracies": 0.875, "rewards/chosen": -2.969522714614868, "rewards/margins": 4.467916488647461, "rewards/rejected": -7.437439918518066, "step": 218 }, { "epoch": 0.13623639191290823, "grad_norm": 12.00910758972168, "learning_rate": 4.799446749654219e-06, "logits/chosen": -0.757892370223999, "logits/rejected": 1.0919244289398193, "logps/chosen": -550.14892578125, "logps/rejected": -798.8545532226562, "loss": 0.4321, "rewards/accuracies": 0.875, "rewards/chosen": -2.940156936645508, "rewards/margins": 3.870500326156616, "rewards/rejected": -6.810657501220703, "step": 219 }, { "epoch": 0.1368584758942457, "grad_norm": 8.28059196472168, "learning_rate": 4.795988934993085e-06, "logits/chosen": -1.6422317028045654, "logits/rejected": 0.04340037703514099, "logps/chosen": -492.779052734375, "logps/rejected": -613.814453125, "loss": 0.3409, "rewards/accuracies": 0.875, "rewards/chosen": -2.1332297325134277, "rewards/margins": 3.6222167015075684, "rewards/rejected": -5.755446434020996, "step": 220 }, { "epoch": 0.1374805598755832, "grad_norm": 13.990933418273926, "learning_rate": 4.792531120331951e-06, "logits/chosen": -1.0089961290359497, "logits/rejected": 2.138538360595703, "logps/chosen": -459.6906433105469, "logps/rejected": -716.27294921875, "loss": 1.0147, "rewards/accuracies": 0.75, "rewards/chosen": -1.4709913730621338, "rewards/margins": 2.49360990524292, "rewards/rejected": -3.964601516723633, "step": 221 }, { "epoch": 0.13810264385692067, "grad_norm": 0.4851381182670593, "learning_rate": 4.789073305670817e-06, "logits/chosen": -2.0382261276245117, "logits/rejected": 1.7673249244689941, "logps/chosen": -383.02984619140625, "logps/rejected": -771.320068359375, "loss": 0.0104, "rewards/accuracies": 1.0, "rewards/chosen": -0.8758177757263184, "rewards/margins": 8.565235137939453, "rewards/rejected": -9.44105339050293, "step": 222 }, { "epoch": 0.13872472783825818, "grad_norm": 4.004894733428955, "learning_rate": 4.785615491009682e-06, "logits/chosen": -1.2374968528747559, "logits/rejected": 1.8880839347839355, "logps/chosen": -326.47869873046875, "logps/rejected": -581.1654052734375, "loss": 0.1495, "rewards/accuracies": 0.875, "rewards/chosen": -0.41444605588912964, "rewards/margins": 4.205626010894775, "rewards/rejected": -4.6200714111328125, "step": 223 }, { "epoch": 0.13934681181959566, "grad_norm": 1.1451832056045532, "learning_rate": 4.782157676348548e-06, "logits/chosen": -5.820475101470947, "logits/rejected": -0.3580701947212219, "logps/chosen": -207.29710388183594, "logps/rejected": -623.8426513671875, "loss": 0.0265, "rewards/accuracies": 1.0, "rewards/chosen": 0.22989436984062195, "rewards/margins": 7.097637176513672, "rewards/rejected": -6.867743015289307, "step": 224 }, { "epoch": 0.13996889580093314, "grad_norm": 11.990436553955078, "learning_rate": 4.7786998616874146e-06, "logits/chosen": -2.4406657218933105, "logits/rejected": 2.968722343444824, "logps/chosen": -335.2978210449219, "logps/rejected": -802.3515014648438, "loss": 0.5448, "rewards/accuracies": 0.875, "rewards/chosen": -1.0841412544250488, "rewards/margins": 6.818935871124268, "rewards/rejected": -7.903077125549316, "step": 225 }, { "epoch": 0.14059097978227061, "grad_norm": 21.280698776245117, "learning_rate": 4.77524204702628e-06, "logits/chosen": 0.15480676293373108, "logits/rejected": 1.2043757438659668, "logps/chosen": -564.95849609375, "logps/rejected": -643.2623901367188, "loss": 1.3647, "rewards/accuracies": 0.625, "rewards/chosen": -3.0344529151916504, "rewards/margins": 1.3274271488189697, "rewards/rejected": -4.361879825592041, "step": 226 }, { "epoch": 0.1412130637636081, "grad_norm": 5.853566646575928, "learning_rate": 4.771784232365146e-06, "logits/chosen": -1.7373931407928467, "logits/rejected": 1.2674592733383179, "logps/chosen": -387.7467956542969, "logps/rejected": -606.992431640625, "loss": 0.1472, "rewards/accuracies": 1.0, "rewards/chosen": -0.9129789471626282, "rewards/margins": 5.40875768661499, "rewards/rejected": -6.321736812591553, "step": 227 }, { "epoch": 0.14183514774494557, "grad_norm": 1.8564268350601196, "learning_rate": 4.768326417704012e-06, "logits/chosen": -1.6329180002212524, "logits/rejected": 0.033710233867168427, "logps/chosen": -285.3167419433594, "logps/rejected": -468.65020751953125, "loss": 0.0637, "rewards/accuracies": 1.0, "rewards/chosen": -1.4949727058410645, "rewards/margins": 6.504868507385254, "rewards/rejected": -7.999841690063477, "step": 228 }, { "epoch": 0.14245723172628305, "grad_norm": 3.348978042602539, "learning_rate": 4.764868603042877e-06, "logits/chosen": -1.9804818630218506, "logits/rejected": 1.4478893280029297, "logps/chosen": -404.0279846191406, "logps/rejected": -704.047607421875, "loss": 0.0667, "rewards/accuracies": 1.0, "rewards/chosen": -1.4428744316101074, "rewards/margins": 6.107295036315918, "rewards/rejected": -7.550169944763184, "step": 229 }, { "epoch": 0.14307931570762053, "grad_norm": 0.3754306435585022, "learning_rate": 4.761410788381743e-06, "logits/chosen": -0.3902093172073364, "logits/rejected": 1.649421215057373, "logps/chosen": -459.04998779296875, "logps/rejected": -676.79345703125, "loss": 0.0065, "rewards/accuracies": 1.0, "rewards/chosen": -2.128260612487793, "rewards/margins": 9.550674438476562, "rewards/rejected": -11.678936004638672, "step": 230 }, { "epoch": 0.143701399688958, "grad_norm": 9.059954643249512, "learning_rate": 4.757952973720609e-06, "logits/chosen": -2.208346366882324, "logits/rejected": 0.9578112959861755, "logps/chosen": -422.8230285644531, "logps/rejected": -666.8082885742188, "loss": 0.2499, "rewards/accuracies": 0.875, "rewards/chosen": -0.6198028326034546, "rewards/margins": 4.260037422180176, "rewards/rejected": -4.87984037399292, "step": 231 }, { "epoch": 0.1443234836702955, "grad_norm": 9.908191680908203, "learning_rate": 4.754495159059474e-06, "logits/chosen": -2.3181965351104736, "logits/rejected": 0.49289166927337646, "logps/chosen": -455.92803955078125, "logps/rejected": -676.4917602539062, "loss": 0.314, "rewards/accuracies": 0.75, "rewards/chosen": -2.8289620876312256, "rewards/margins": 3.4120523929595947, "rewards/rejected": -6.24101448059082, "step": 232 }, { "epoch": 0.14494556765163297, "grad_norm": 3.7947685718536377, "learning_rate": 4.751037344398341e-06, "logits/chosen": -1.4946904182434082, "logits/rejected": 2.0026485919952393, "logps/chosen": -255.35311889648438, "logps/rejected": -590.746337890625, "loss": 0.0658, "rewards/accuracies": 1.0, "rewards/chosen": -1.5952775478363037, "rewards/margins": 7.172544479370117, "rewards/rejected": -8.767822265625, "step": 233 }, { "epoch": 0.14556765163297045, "grad_norm": 10.014151573181152, "learning_rate": 4.7475795297372065e-06, "logits/chosen": -0.30448538064956665, "logits/rejected": 1.48101007938385, "logps/chosen": -518.90625, "logps/rejected": -672.8104248046875, "loss": 0.3052, "rewards/accuracies": 0.75, "rewards/chosen": -2.9174964427948, "rewards/margins": 4.945224761962891, "rewards/rejected": -7.8627214431762695, "step": 234 }, { "epoch": 0.14618973561430793, "grad_norm": 7.496145725250244, "learning_rate": 4.744121715076072e-06, "logits/chosen": 0.6570045948028564, "logits/rejected": 1.0923982858657837, "logps/chosen": -575.4700317382812, "logps/rejected": -633.54833984375, "loss": 0.1505, "rewards/accuracies": 1.0, "rewards/chosen": -2.260350227355957, "rewards/margins": 3.887481927871704, "rewards/rejected": -6.14783239364624, "step": 235 }, { "epoch": 0.1468118195956454, "grad_norm": 5.038055896759033, "learning_rate": 4.740663900414938e-06, "logits/chosen": -1.0094008445739746, "logits/rejected": 1.2427308559417725, "logps/chosen": -408.53704833984375, "logps/rejected": -594.5377197265625, "loss": 0.2033, "rewards/accuracies": 1.0, "rewards/chosen": -1.003747820854187, "rewards/margins": 3.9065873622894287, "rewards/rejected": -4.910335063934326, "step": 236 }, { "epoch": 0.14743390357698288, "grad_norm": 18.20849609375, "learning_rate": 4.7372060857538035e-06, "logits/chosen": -2.474367141723633, "logits/rejected": 0.8985600471496582, "logps/chosen": -464.19317626953125, "logps/rejected": -716.3055419921875, "loss": 1.0149, "rewards/accuracies": 0.625, "rewards/chosen": -2.5598015785217285, "rewards/margins": 1.7782859802246094, "rewards/rejected": -4.338088035583496, "step": 237 }, { "epoch": 0.14805598755832036, "grad_norm": 21.325502395629883, "learning_rate": 4.73374827109267e-06, "logits/chosen": -2.02020263671875, "logits/rejected": 1.5686231851577759, "logps/chosen": -462.01263427734375, "logps/rejected": -695.46044921875, "loss": 1.2723, "rewards/accuracies": 0.625, "rewards/chosen": -2.9080989360809326, "rewards/margins": 3.5409862995147705, "rewards/rejected": -6.449085235595703, "step": 238 }, { "epoch": 0.14867807153965784, "grad_norm": 12.398721694946289, "learning_rate": 4.730290456431536e-06, "logits/chosen": -0.4202864170074463, "logits/rejected": 1.9297188520431519, "logps/chosen": -656.5068969726562, "logps/rejected": -834.7481079101562, "loss": 0.6996, "rewards/accuracies": 0.875, "rewards/chosen": -3.811037540435791, "rewards/margins": 5.296793460845947, "rewards/rejected": -9.107830047607422, "step": 239 }, { "epoch": 0.14930015552099535, "grad_norm": 13.179778099060059, "learning_rate": 4.726832641770401e-06, "logits/chosen": -3.60860538482666, "logits/rejected": -0.9935283660888672, "logps/chosen": -332.74676513671875, "logps/rejected": -586.2742309570312, "loss": 0.5438, "rewards/accuracies": 0.875, "rewards/chosen": -1.4346678256988525, "rewards/margins": 4.114687919616699, "rewards/rejected": -5.549355506896973, "step": 240 }, { "epoch": 0.14992223950233283, "grad_norm": 15.419675827026367, "learning_rate": 4.723374827109267e-06, "logits/chosen": -1.5409873723983765, "logits/rejected": 0.47452038526535034, "logps/chosen": -438.4156799316406, "logps/rejected": -593.5914916992188, "loss": 0.9437, "rewards/accuracies": 0.75, "rewards/chosen": -3.658177375793457, "rewards/margins": 3.3200583457946777, "rewards/rejected": -6.978235721588135, "step": 241 }, { "epoch": 0.1505443234836703, "grad_norm": 21.20749282836914, "learning_rate": 4.719917012448133e-06, "logits/chosen": 0.6541311144828796, "logits/rejected": 0.3939392864704132, "logps/chosen": -576.1435546875, "logps/rejected": -566.4190063476562, "loss": 0.9267, "rewards/accuracies": 0.625, "rewards/chosen": -4.892140865325928, "rewards/margins": 1.976912498474121, "rewards/rejected": -6.869053363800049, "step": 242 }, { "epoch": 0.15116640746500778, "grad_norm": 2.3396997451782227, "learning_rate": 4.716459197786999e-06, "logits/chosen": -3.1188087463378906, "logits/rejected": 1.228123426437378, "logps/chosen": -470.78814697265625, "logps/rejected": -822.14697265625, "loss": 0.0548, "rewards/accuracies": 1.0, "rewards/chosen": -2.5003788471221924, "rewards/margins": 6.14935827255249, "rewards/rejected": -8.649737358093262, "step": 243 }, { "epoch": 0.15178849144634526, "grad_norm": 12.74854850769043, "learning_rate": 4.713001383125865e-06, "logits/chosen": 0.6285867094993591, "logits/rejected": 1.72483491897583, "logps/chosen": -644.8883056640625, "logps/rejected": -750.4856567382812, "loss": 0.3696, "rewards/accuracies": 0.75, "rewards/chosen": -3.9861502647399902, "rewards/margins": 5.273034572601318, "rewards/rejected": -9.259184837341309, "step": 244 }, { "epoch": 0.15241057542768274, "grad_norm": 9.774956703186035, "learning_rate": 4.709543568464731e-06, "logits/chosen": 0.43542414903640747, "logits/rejected": 2.8364152908325195, "logps/chosen": -584.3109130859375, "logps/rejected": -809.8812255859375, "loss": 0.3079, "rewards/accuracies": 0.875, "rewards/chosen": -1.9937427043914795, "rewards/margins": 7.057479381561279, "rewards/rejected": -9.05122184753418, "step": 245 }, { "epoch": 0.15303265940902022, "grad_norm": 15.009865760803223, "learning_rate": 4.706085753803596e-06, "logits/chosen": -2.090878963470459, "logits/rejected": -0.052771300077438354, "logps/chosen": -413.6505126953125, "logps/rejected": -598.9315185546875, "loss": 0.8066, "rewards/accuracies": 0.875, "rewards/chosen": -4.910340785980225, "rewards/margins": 4.829494953155518, "rewards/rejected": -9.739836692810059, "step": 246 }, { "epoch": 0.1536547433903577, "grad_norm": 10.674094200134277, "learning_rate": 4.702627939142462e-06, "logits/chosen": -0.32696330547332764, "logits/rejected": 1.627031683921814, "logps/chosen": -525.27734375, "logps/rejected": -689.9846801757812, "loss": 0.2823, "rewards/accuracies": 0.75, "rewards/chosen": -3.7172186374664307, "rewards/margins": 4.163555145263672, "rewards/rejected": -7.880773544311523, "step": 247 }, { "epoch": 0.15427682737169518, "grad_norm": 9.639894485473633, "learning_rate": 4.6991701244813285e-06, "logits/chosen": -0.22154799103736877, "logits/rejected": 1.388524055480957, "logps/chosen": -458.9217529296875, "logps/rejected": -612.6981201171875, "loss": 0.3998, "rewards/accuracies": 0.875, "rewards/chosen": -2.4425270557403564, "rewards/margins": 4.899855136871338, "rewards/rejected": -7.342381954193115, "step": 248 }, { "epoch": 0.15489891135303266, "grad_norm": 7.550816059112549, "learning_rate": 4.695712309820194e-06, "logits/chosen": -2.1596360206604004, "logits/rejected": 1.4292323589324951, "logps/chosen": -413.95355224609375, "logps/rejected": -710.8802490234375, "loss": 0.1226, "rewards/accuracies": 0.875, "rewards/chosen": -0.9096390604972839, "rewards/margins": 6.5312180519104, "rewards/rejected": -7.44085693359375, "step": 249 }, { "epoch": 0.15552099533437014, "grad_norm": 5.2478461265563965, "learning_rate": 4.69225449515906e-06, "logits/chosen": -0.28928232192993164, "logits/rejected": 2.048895835876465, "logps/chosen": -414.8948974609375, "logps/rejected": -603.7171630859375, "loss": 0.1166, "rewards/accuracies": 0.875, "rewards/chosen": -1.2157189846038818, "rewards/margins": 7.377256870269775, "rewards/rejected": -8.592975616455078, "step": 250 }, { "epoch": 0.15614307931570762, "grad_norm": 3.202226400375366, "learning_rate": 4.6887966804979255e-06, "logits/chosen": 0.13262879848480225, "logits/rejected": 1.8512684106826782, "logps/chosen": -609.9318237304688, "logps/rejected": -798.5213623046875, "loss": 0.0501, "rewards/accuracies": 1.0, "rewards/chosen": -4.749973773956299, "rewards/margins": 9.838062286376953, "rewards/rejected": -14.58803653717041, "step": 251 }, { "epoch": 0.1567651632970451, "grad_norm": 0.5320392847061157, "learning_rate": 4.685338865836791e-06, "logits/chosen": -1.4125306606292725, "logits/rejected": 0.7084207534790039, "logps/chosen": -464.9007568359375, "logps/rejected": -626.1807861328125, "loss": 0.0093, "rewards/accuracies": 1.0, "rewards/chosen": -2.450713872909546, "rewards/margins": 7.482511043548584, "rewards/rejected": -9.93322467803955, "step": 252 }, { "epoch": 0.15738724727838257, "grad_norm": 10.76895809173584, "learning_rate": 4.681881051175658e-06, "logits/chosen": 0.5094907879829407, "logits/rejected": 2.1551709175109863, "logps/chosen": -564.8487548828125, "logps/rejected": -781.2236328125, "loss": 0.2508, "rewards/accuracies": 0.875, "rewards/chosen": -1.7215559482574463, "rewards/margins": 8.452046394348145, "rewards/rejected": -10.173601150512695, "step": 253 }, { "epoch": 0.15800933125972005, "grad_norm": 1.3702670335769653, "learning_rate": 4.678423236514523e-06, "logits/chosen": -0.6126558780670166, "logits/rejected": 1.668062448501587, "logps/chosen": -473.4520263671875, "logps/rejected": -722.7353515625, "loss": 0.0195, "rewards/accuracies": 1.0, "rewards/chosen": -3.63812255859375, "rewards/margins": 6.2835469245910645, "rewards/rejected": -9.921669006347656, "step": 254 }, { "epoch": 0.15863141524105753, "grad_norm": 6.207293510437012, "learning_rate": 4.674965421853389e-06, "logits/chosen": -3.646106243133545, "logits/rejected": 0.1938755214214325, "logps/chosen": -246.47653198242188, "logps/rejected": -580.8173828125, "loss": 0.1, "rewards/accuracies": 0.875, "rewards/chosen": -0.47361141443252563, "rewards/margins": 8.135469436645508, "rewards/rejected": -8.609081268310547, "step": 255 }, { "epoch": 0.159253499222395, "grad_norm": 7.103205680847168, "learning_rate": 4.671507607192255e-06, "logits/chosen": -0.79026198387146, "logits/rejected": 1.6442009210586548, "logps/chosen": -511.1326904296875, "logps/rejected": -667.1676025390625, "loss": 0.1399, "rewards/accuracies": 0.875, "rewards/chosen": -1.811009407043457, "rewards/margins": 6.868584632873535, "rewards/rejected": -8.679594039916992, "step": 256 }, { "epoch": 0.1598755832037325, "grad_norm": 22.603649139404297, "learning_rate": 4.66804979253112e-06, "logits/chosen": 0.7336915731430054, "logits/rejected": 1.8532986640930176, "logps/chosen": -598.635009765625, "logps/rejected": -742.918212890625, "loss": 0.767, "rewards/accuracies": 0.5, "rewards/chosen": -6.134716987609863, "rewards/margins": 2.285792112350464, "rewards/rejected": -8.420509338378906, "step": 257 }, { "epoch": 0.16049766718507, "grad_norm": 12.186392784118652, "learning_rate": 4.664591977869987e-06, "logits/chosen": -2.326719045639038, "logits/rejected": 0.9775432348251343, "logps/chosen": -415.0740966796875, "logps/rejected": -654.5527954101562, "loss": 0.5815, "rewards/accuracies": 0.875, "rewards/chosen": -2.270674228668213, "rewards/margins": 3.5440921783447266, "rewards/rejected": -5.814765930175781, "step": 258 }, { "epoch": 0.16111975116640748, "grad_norm": 4.330519199371338, "learning_rate": 4.661134163208853e-06, "logits/chosen": -1.787456750869751, "logits/rejected": 1.2638174295425415, "logps/chosen": -357.8271179199219, "logps/rejected": -643.799072265625, "loss": 0.0538, "rewards/accuracies": 1.0, "rewards/chosen": -1.6819653511047363, "rewards/margins": 7.315193176269531, "rewards/rejected": -8.997159004211426, "step": 259 }, { "epoch": 0.16174183514774496, "grad_norm": 3.946247100830078, "learning_rate": 4.657676348547718e-06, "logits/chosen": -1.253071665763855, "logits/rejected": 0.4437389373779297, "logps/chosen": -473.892822265625, "logps/rejected": -644.9855346679688, "loss": 0.0783, "rewards/accuracies": 1.0, "rewards/chosen": -3.8910157680511475, "rewards/margins": 6.748340606689453, "rewards/rejected": -10.63935661315918, "step": 260 }, { "epoch": 0.16236391912908243, "grad_norm": 0.07657834142446518, "learning_rate": 4.654218533886584e-06, "logits/chosen": -0.533647894859314, "logits/rejected": 1.3923450708389282, "logps/chosen": -548.9664306640625, "logps/rejected": -713.9586181640625, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -2.6573634147644043, "rewards/margins": 9.294958114624023, "rewards/rejected": -11.952322006225586, "step": 261 }, { "epoch": 0.1629860031104199, "grad_norm": 8.643094062805176, "learning_rate": 4.6507607192254504e-06, "logits/chosen": -1.8945211172103882, "logits/rejected": 2.138239860534668, "logps/chosen": -444.17584228515625, "logps/rejected": -741.0984497070312, "loss": 0.1695, "rewards/accuracies": 0.875, "rewards/chosen": -3.1956851482391357, "rewards/margins": 5.546382904052734, "rewards/rejected": -8.74206829071045, "step": 262 }, { "epoch": 0.1636080870917574, "grad_norm": 2.7890069484710693, "learning_rate": 4.647302904564316e-06, "logits/chosen": 0.04565951228141785, "logits/rejected": 1.8965015411376953, "logps/chosen": -537.900390625, "logps/rejected": -715.051513671875, "loss": 0.0227, "rewards/accuracies": 1.0, "rewards/chosen": -4.380223751068115, "rewards/margins": 7.408870220184326, "rewards/rejected": -11.789094924926758, "step": 263 }, { "epoch": 0.16423017107309487, "grad_norm": 15.174590110778809, "learning_rate": 4.643845089903182e-06, "logits/chosen": -0.8846542835235596, "logits/rejected": -2.10271954536438, "logps/chosen": -485.0090637207031, "logps/rejected": -502.6370849609375, "loss": 1.5033, "rewards/accuracies": 0.75, "rewards/chosen": -3.5849950313568115, "rewards/margins": 3.411698818206787, "rewards/rejected": -6.996694087982178, "step": 264 }, { "epoch": 0.16485225505443235, "grad_norm": 1.2516266107559204, "learning_rate": 4.6403872752420475e-06, "logits/chosen": -2.401421546936035, "logits/rejected": 1.8238811492919922, "logps/chosen": -306.5718994140625, "logps/rejected": -580.1861572265625, "loss": 0.023, "rewards/accuracies": 1.0, "rewards/chosen": -1.5649440288543701, "rewards/margins": 7.7721428871154785, "rewards/rejected": -9.337087631225586, "step": 265 }, { "epoch": 0.16547433903576983, "grad_norm": 5.465749263763428, "learning_rate": 4.636929460580913e-06, "logits/chosen": -2.8539490699768066, "logits/rejected": 1.1325531005859375, "logps/chosen": -408.11053466796875, "logps/rejected": -805.772216796875, "loss": 0.0901, "rewards/accuracies": 1.0, "rewards/chosen": -3.0690557956695557, "rewards/margins": 8.872047424316406, "rewards/rejected": -11.9411039352417, "step": 266 }, { "epoch": 0.1660964230171073, "grad_norm": 7.398975849151611, "learning_rate": 4.63347164591978e-06, "logits/chosen": -0.35056596994400024, "logits/rejected": 2.5457687377929688, "logps/chosen": -460.634521484375, "logps/rejected": -717.3281860351562, "loss": 0.2496, "rewards/accuracies": 0.875, "rewards/chosen": -0.2680894434452057, "rewards/margins": 5.694334983825684, "rewards/rejected": -5.962425231933594, "step": 267 }, { "epoch": 0.1667185069984448, "grad_norm": 12.573168754577637, "learning_rate": 4.630013831258645e-06, "logits/chosen": -0.3991909623146057, "logits/rejected": 2.4069125652313232, "logps/chosen": -614.88427734375, "logps/rejected": -873.7958374023438, "loss": 0.4442, "rewards/accuracies": 0.875, "rewards/chosen": -4.085229396820068, "rewards/margins": 4.987785339355469, "rewards/rejected": -9.073014259338379, "step": 268 }, { "epoch": 0.16734059097978227, "grad_norm": 9.359966278076172, "learning_rate": 4.626556016597511e-06, "logits/chosen": -0.803552508354187, "logits/rejected": 3.0195677280426025, "logps/chosen": -539.8129272460938, "logps/rejected": -838.1019897460938, "loss": 0.2417, "rewards/accuracies": 1.0, "rewards/chosen": -2.7998533248901367, "rewards/margins": 4.9789581298828125, "rewards/rejected": -7.778811454772949, "step": 269 }, { "epoch": 0.16796267496111975, "grad_norm": 1.462587833404541, "learning_rate": 4.623098201936377e-06, "logits/chosen": -0.718620777130127, "logits/rejected": 2.003183126449585, "logps/chosen": -478.9626159667969, "logps/rejected": -684.8155517578125, "loss": 0.0169, "rewards/accuracies": 1.0, "rewards/chosen": -1.216932773590088, "rewards/margins": 8.260567665100098, "rewards/rejected": -9.477499961853027, "step": 270 }, { "epoch": 0.16858475894245722, "grad_norm": 8.03159236907959, "learning_rate": 4.619640387275242e-06, "logits/chosen": -0.7762271165847778, "logits/rejected": 2.736423969268799, "logps/chosen": -359.5603942871094, "logps/rejected": -576.3167114257812, "loss": 0.3011, "rewards/accuracies": 0.875, "rewards/chosen": -3.7644598484039307, "rewards/margins": 3.182647943496704, "rewards/rejected": -6.947108268737793, "step": 271 }, { "epoch": 0.1692068429237947, "grad_norm": 9.998476028442383, "learning_rate": 4.616182572614109e-06, "logits/chosen": -0.735232949256897, "logits/rejected": 3.2213311195373535, "logps/chosen": -433.61907958984375, "logps/rejected": -773.2107543945312, "loss": 0.2285, "rewards/accuracies": 0.875, "rewards/chosen": -1.9446868896484375, "rewards/margins": 6.077488422393799, "rewards/rejected": -8.022174835205078, "step": 272 }, { "epoch": 0.16982892690513218, "grad_norm": 0.8653458952903748, "learning_rate": 4.6127247579529746e-06, "logits/chosen": -2.379232883453369, "logits/rejected": 1.3507249355316162, "logps/chosen": -360.03863525390625, "logps/rejected": -687.9876708984375, "loss": 0.0273, "rewards/accuracies": 1.0, "rewards/chosen": -0.7321940064430237, "rewards/margins": 9.980026245117188, "rewards/rejected": -10.71221923828125, "step": 273 }, { "epoch": 0.17045101088646966, "grad_norm": 8.043740272521973, "learning_rate": 4.609266943291839e-06, "logits/chosen": -0.6892336010932922, "logits/rejected": 0.85450679063797, "logps/chosen": -384.5489196777344, "logps/rejected": -600.2506713867188, "loss": 0.1616, "rewards/accuracies": 0.875, "rewards/chosen": -3.9526164531707764, "rewards/margins": 5.305567264556885, "rewards/rejected": -9.258183479309082, "step": 274 }, { "epoch": 0.17107309486780714, "grad_norm": 13.354936599731445, "learning_rate": 4.605809128630706e-06, "logits/chosen": 0.008952975273132324, "logits/rejected": 3.021392583847046, "logps/chosen": -460.06591796875, "logps/rejected": -738.4031982421875, "loss": 0.5289, "rewards/accuracies": 0.75, "rewards/chosen": -3.8850395679473877, "rewards/margins": 4.3622026443481445, "rewards/rejected": -8.247241973876953, "step": 275 }, { "epoch": 0.17169517884914465, "grad_norm": 6.681543350219727, "learning_rate": 4.602351313969572e-06, "logits/chosen": -2.176787853240967, "logits/rejected": 0.2526744604110718, "logps/chosen": -570.1998291015625, "logps/rejected": -777.09912109375, "loss": 0.1043, "rewards/accuracies": 0.875, "rewards/chosen": -4.824416637420654, "rewards/margins": 6.9663987159729, "rewards/rejected": -11.790815353393555, "step": 276 }, { "epoch": 0.17231726283048213, "grad_norm": 6.965024471282959, "learning_rate": 4.598893499308437e-06, "logits/chosen": 2.1363282203674316, "logits/rejected": 3.34602689743042, "logps/chosen": -737.9760131835938, "logps/rejected": -812.06005859375, "loss": 0.1165, "rewards/accuracies": 1.0, "rewards/chosen": -2.594902753829956, "rewards/margins": 5.333373069763184, "rewards/rejected": -7.928276062011719, "step": 277 }, { "epoch": 0.1729393468118196, "grad_norm": 0.255916029214859, "learning_rate": 4.595435684647303e-06, "logits/chosen": -0.5340772867202759, "logits/rejected": 1.5482295751571655, "logps/chosen": -408.4273376464844, "logps/rejected": -671.912353515625, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": -1.8431624174118042, "rewards/margins": 9.538469314575195, "rewards/rejected": -11.381630897521973, "step": 278 }, { "epoch": 0.17356143079315708, "grad_norm": 6.751749038696289, "learning_rate": 4.591977869986169e-06, "logits/chosen": -2.509678602218628, "logits/rejected": 0.3515166640281677, "logps/chosen": -307.0176086425781, "logps/rejected": -566.617431640625, "loss": 0.1329, "rewards/accuracies": 0.875, "rewards/chosen": -2.548825740814209, "rewards/margins": 6.478213310241699, "rewards/rejected": -9.027037620544434, "step": 279 }, { "epoch": 0.17418351477449456, "grad_norm": 15.989509582519531, "learning_rate": 4.588520055325035e-06, "logits/chosen": 0.601439356803894, "logits/rejected": 1.2552647590637207, "logps/chosen": -615.57275390625, "logps/rejected": -688.3743896484375, "loss": 1.6145, "rewards/accuracies": 0.75, "rewards/chosen": -5.439153671264648, "rewards/margins": 4.911909103393555, "rewards/rejected": -10.351062774658203, "step": 280 }, { "epoch": 0.17480559875583204, "grad_norm": 2.588643789291382, "learning_rate": 4.585062240663901e-06, "logits/chosen": -1.4265761375427246, "logits/rejected": 0.3371986150741577, "logps/chosen": -505.3673095703125, "logps/rejected": -693.4109497070312, "loss": 0.039, "rewards/accuracies": 1.0, "rewards/chosen": -2.790975570678711, "rewards/margins": 8.05557632446289, "rewards/rejected": -10.846551895141602, "step": 281 }, { "epoch": 0.17542768273716952, "grad_norm": 14.38697624206543, "learning_rate": 4.5816044260027665e-06, "logits/chosen": 0.9437137246131897, "logits/rejected": 0.9775892496109009, "logps/chosen": -657.5091552734375, "logps/rejected": -668.2377319335938, "loss": 1.1068, "rewards/accuracies": 0.875, "rewards/chosen": -3.1414573192596436, "rewards/margins": 5.252272605895996, "rewards/rejected": -8.393729209899902, "step": 282 }, { "epoch": 0.176049766718507, "grad_norm": 13.082420349121094, "learning_rate": 4.578146611341632e-06, "logits/chosen": -3.306584119796753, "logits/rejected": 2.4480957984924316, "logps/chosen": -244.163818359375, "logps/rejected": -728.885986328125, "loss": 0.8388, "rewards/accuracies": 0.75, "rewards/chosen": -2.1342248916625977, "rewards/margins": 5.663808345794678, "rewards/rejected": -7.798033714294434, "step": 283 }, { "epoch": 0.17667185069984448, "grad_norm": 14.865607261657715, "learning_rate": 4.574688796680498e-06, "logits/chosen": -0.4154762029647827, "logits/rejected": 0.172011137008667, "logps/chosen": -617.131103515625, "logps/rejected": -671.433349609375, "loss": 0.6575, "rewards/accuracies": 0.875, "rewards/chosen": -3.411029100418091, "rewards/margins": 5.586822509765625, "rewards/rejected": -8.997851371765137, "step": 284 }, { "epoch": 0.17729393468118196, "grad_norm": 0.04570363089442253, "learning_rate": 4.571230982019364e-06, "logits/chosen": -2.9767088890075684, "logits/rejected": 1.552046775817871, "logps/chosen": -405.144775390625, "logps/rejected": -809.362548828125, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -0.24537521600723267, "rewards/margins": 8.153364181518555, "rewards/rejected": -8.398738861083984, "step": 285 }, { "epoch": 0.17791601866251944, "grad_norm": 7.76615047454834, "learning_rate": 4.56777316735823e-06, "logits/chosen": -1.0276365280151367, "logits/rejected": 0.2857726216316223, "logps/chosen": -424.6601867675781, "logps/rejected": -513.98095703125, "loss": 0.1717, "rewards/accuracies": 0.875, "rewards/chosen": -3.9996285438537598, "rewards/margins": 5.563259124755859, "rewards/rejected": -9.562887191772461, "step": 286 }, { "epoch": 0.17853810264385692, "grad_norm": 0.09615608304738998, "learning_rate": 4.564315352697096e-06, "logits/chosen": -2.578230142593384, "logits/rejected": 0.31838667392730713, "logps/chosen": -347.80859375, "logps/rejected": -604.3917236328125, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -1.9627995491027832, "rewards/margins": 6.481869220733643, "rewards/rejected": -8.444669723510742, "step": 287 }, { "epoch": 0.1791601866251944, "grad_norm": 1.1004307270050049, "learning_rate": 4.560857538035961e-06, "logits/chosen": -2.0265450477600098, "logits/rejected": 1.3413851261138916, "logps/chosen": -457.03778076171875, "logps/rejected": -757.3197021484375, "loss": 0.0227, "rewards/accuracies": 1.0, "rewards/chosen": -1.1656559705734253, "rewards/margins": 6.532405853271484, "rewards/rejected": -7.698061943054199, "step": 288 }, { "epoch": 0.17978227060653187, "grad_norm": 5.090839862823486, "learning_rate": 4.557399723374827e-06, "logits/chosen": 0.17114977538585663, "logits/rejected": 1.8369534015655518, "logps/chosen": -459.56829833984375, "logps/rejected": -653.49072265625, "loss": 0.2061, "rewards/accuracies": 0.875, "rewards/chosen": -4.236117362976074, "rewards/margins": 6.491977214813232, "rewards/rejected": -10.728094100952148, "step": 289 }, { "epoch": 0.18040435458786935, "grad_norm": 8.679951667785645, "learning_rate": 4.5539419087136936e-06, "logits/chosen": -4.507697105407715, "logits/rejected": 1.2680081129074097, "logps/chosen": -194.2377471923828, "logps/rejected": -649.5841064453125, "loss": 0.3803, "rewards/accuracies": 0.875, "rewards/chosen": -1.0905375480651855, "rewards/margins": 9.107283592224121, "rewards/rejected": -10.197820663452148, "step": 290 }, { "epoch": 0.18102643856920683, "grad_norm": 0.31311678886413574, "learning_rate": 4.550484094052559e-06, "logits/chosen": -2.9908690452575684, "logits/rejected": 0.37682318687438965, "logps/chosen": -331.86395263671875, "logps/rejected": -663.374267578125, "loss": 0.0052, "rewards/accuracies": 1.0, "rewards/chosen": -2.1786234378814697, "rewards/margins": 8.731188774108887, "rewards/rejected": -10.909812927246094, "step": 291 }, { "epoch": 0.1816485225505443, "grad_norm": 10.737432479858398, "learning_rate": 4.547026279391425e-06, "logits/chosen": -2.534325122833252, "logits/rejected": 2.4271931648254395, "logps/chosen": -313.55267333984375, "logps/rejected": -707.08056640625, "loss": 0.2901, "rewards/accuracies": 0.875, "rewards/chosen": -2.2728538513183594, "rewards/margins": 6.743180274963379, "rewards/rejected": -9.016034126281738, "step": 292 }, { "epoch": 0.1822706065318818, "grad_norm": 13.969117164611816, "learning_rate": 4.543568464730291e-06, "logits/chosen": -1.7838499546051025, "logits/rejected": 2.2653260231018066, "logps/chosen": -471.333251953125, "logps/rejected": -797.5987548828125, "loss": 0.7472, "rewards/accuracies": 0.75, "rewards/chosen": -4.759395122528076, "rewards/margins": 5.9565582275390625, "rewards/rejected": -10.71595287322998, "step": 293 }, { "epoch": 0.1828926905132193, "grad_norm": 0.15050821006298065, "learning_rate": 4.540110650069156e-06, "logits/chosen": -1.522807002067566, "logits/rejected": 3.106743335723877, "logps/chosen": -399.6716003417969, "logps/rejected": -800.451904296875, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -1.77045476436615, "rewards/margins": 9.06578254699707, "rewards/rejected": -10.836236953735352, "step": 294 }, { "epoch": 0.18351477449455678, "grad_norm": 3.63210129737854, "learning_rate": 4.536652835408023e-06, "logits/chosen": -0.29420968890190125, "logits/rejected": 1.5497915744781494, "logps/chosen": -565.326171875, "logps/rejected": -753.0763549804688, "loss": 0.071, "rewards/accuracies": 1.0, "rewards/chosen": -3.1053409576416016, "rewards/margins": 7.338171482086182, "rewards/rejected": -10.443511962890625, "step": 295 }, { "epoch": 0.18413685847589426, "grad_norm": 0.32757797837257385, "learning_rate": 4.5331950207468885e-06, "logits/chosen": -2.481358289718628, "logits/rejected": 1.1847180128097534, "logps/chosen": -362.6833801269531, "logps/rejected": -679.8433837890625, "loss": 0.0051, "rewards/accuracies": 1.0, "rewards/chosen": -2.0038769245147705, "rewards/margins": 8.584901809692383, "rewards/rejected": -10.588778495788574, "step": 296 }, { "epoch": 0.18475894245723173, "grad_norm": 16.18773651123047, "learning_rate": 4.529737206085754e-06, "logits/chosen": 0.6282098889350891, "logits/rejected": 0.3807709515094757, "logps/chosen": -597.3126220703125, "logps/rejected": -606.06103515625, "loss": 0.8778, "rewards/accuracies": 0.625, "rewards/chosen": -4.498796463012695, "rewards/margins": 3.43064546585083, "rewards/rejected": -7.929442405700684, "step": 297 }, { "epoch": 0.1853810264385692, "grad_norm": 15.155533790588379, "learning_rate": 4.52627939142462e-06, "logits/chosen": 1.0437029600143433, "logits/rejected": 1.6269657611846924, "logps/chosen": -648.49609375, "logps/rejected": -684.3096923828125, "loss": 0.9009, "rewards/accuracies": 0.75, "rewards/chosen": -2.88853120803833, "rewards/margins": 4.9692702293396, "rewards/rejected": -7.85780143737793, "step": 298 }, { "epoch": 0.1860031104199067, "grad_norm": 8.603474617004395, "learning_rate": 4.5228215767634855e-06, "logits/chosen": -1.3771677017211914, "logits/rejected": 0.2585065960884094, "logps/chosen": -532.3855590820312, "logps/rejected": -741.0958251953125, "loss": 0.1724, "rewards/accuracies": 0.875, "rewards/chosen": -3.696711540222168, "rewards/margins": 7.2699689865112305, "rewards/rejected": -10.966680526733398, "step": 299 }, { "epoch": 0.18662519440124417, "grad_norm": 1.3010094165802002, "learning_rate": 4.519363762102352e-06, "logits/chosen": -2.380284309387207, "logits/rejected": 0.8280973434448242, "logps/chosen": -293.75286865234375, "logps/rejected": -636.5174560546875, "loss": 0.045, "rewards/accuracies": 1.0, "rewards/chosen": -1.3322535753250122, "rewards/margins": 8.445556640625, "rewards/rejected": -9.777810096740723, "step": 300 }, { "epoch": 0.18724727838258165, "grad_norm": 1.341991662979126, "learning_rate": 4.515905947441218e-06, "logits/chosen": -2.4323348999023438, "logits/rejected": 1.060656189918518, "logps/chosen": -281.36224365234375, "logps/rejected": -603.6076049804688, "loss": 0.0323, "rewards/accuracies": 1.0, "rewards/chosen": 0.172649085521698, "rewards/margins": 9.819293022155762, "rewards/rejected": -9.64664363861084, "step": 301 }, { "epoch": 0.18786936236391913, "grad_norm": 4.605799198150635, "learning_rate": 4.512448132780083e-06, "logits/chosen": 1.091361403465271, "logits/rejected": 1.1667273044586182, "logps/chosen": -581.6498413085938, "logps/rejected": -621.510498046875, "loss": 0.1783, "rewards/accuracies": 1.0, "rewards/chosen": -0.3479340076446533, "rewards/margins": 3.68593168258667, "rewards/rejected": -4.033865451812744, "step": 302 }, { "epoch": 0.1884914463452566, "grad_norm": 3.7075376510620117, "learning_rate": 4.508990318118949e-06, "logits/chosen": -0.09788064658641815, "logits/rejected": 0.6345605850219727, "logps/chosen": -589.946533203125, "logps/rejected": -766.7717895507812, "loss": 0.0746, "rewards/accuracies": 1.0, "rewards/chosen": -2.34889817237854, "rewards/margins": 8.657028198242188, "rewards/rejected": -11.005925178527832, "step": 303 }, { "epoch": 0.1891135303265941, "grad_norm": 9.629120826721191, "learning_rate": 4.5055325034578155e-06, "logits/chosen": -2.320014715194702, "logits/rejected": 0.9263827204704285, "logps/chosen": -339.7724609375, "logps/rejected": -626.5345458984375, "loss": 0.1093, "rewards/accuracies": 1.0, "rewards/chosen": -3.134546995162964, "rewards/margins": 6.445633888244629, "rewards/rejected": -9.580181121826172, "step": 304 }, { "epoch": 0.18973561430793157, "grad_norm": 0.0878031849861145, "learning_rate": 4.502074688796681e-06, "logits/chosen": -3.6537017822265625, "logits/rejected": 0.8235005736351013, "logps/chosen": -229.49295043945312, "logps/rejected": -686.2156982421875, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -0.4192272424697876, "rewards/margins": 10.958316802978516, "rewards/rejected": -11.377544403076172, "step": 305 }, { "epoch": 0.19035769828926905, "grad_norm": 15.320277214050293, "learning_rate": 4.498616874135547e-06, "logits/chosen": -0.07280959188938141, "logits/rejected": 1.213287115097046, "logps/chosen": -559.4677124023438, "logps/rejected": -682.7451782226562, "loss": 0.9861, "rewards/accuracies": 0.75, "rewards/chosen": -3.7915196418762207, "rewards/margins": 4.222086429595947, "rewards/rejected": -8.013605117797852, "step": 306 }, { "epoch": 0.19097978227060652, "grad_norm": 8.978217124938965, "learning_rate": 4.4951590594744126e-06, "logits/chosen": -0.7856301665306091, "logits/rejected": 1.5798273086547852, "logps/chosen": -533.0154418945312, "logps/rejected": -706.099365234375, "loss": 0.6997, "rewards/accuracies": 0.875, "rewards/chosen": -2.8133723735809326, "rewards/margins": 6.156833171844482, "rewards/rejected": -8.970205307006836, "step": 307 }, { "epoch": 0.191601866251944, "grad_norm": 1.4583317041397095, "learning_rate": 4.491701244813278e-06, "logits/chosen": -1.3197203874588013, "logits/rejected": 0.767465353012085, "logps/chosen": -489.6077880859375, "logps/rejected": -708.9853515625, "loss": 0.0176, "rewards/accuracies": 1.0, "rewards/chosen": -3.2082359790802, "rewards/margins": 10.25620174407959, "rewards/rejected": -13.464438438415527, "step": 308 }, { "epoch": 0.19222395023328148, "grad_norm": 10.43602466583252, "learning_rate": 4.488243430152145e-06, "logits/chosen": -0.5446723699569702, "logits/rejected": 2.0502994060516357, "logps/chosen": -397.3498229980469, "logps/rejected": -591.723876953125, "loss": 0.26, "rewards/accuracies": 0.875, "rewards/chosen": -3.8332438468933105, "rewards/margins": 4.534789085388184, "rewards/rejected": -8.368032455444336, "step": 309 }, { "epoch": 0.19284603421461896, "grad_norm": 2.021714448928833, "learning_rate": 4.4847856154910104e-06, "logits/chosen": -0.022031545639038086, "logits/rejected": 3.144045829772949, "logps/chosen": -473.75726318359375, "logps/rejected": -728.9859619140625, "loss": 0.0521, "rewards/accuracies": 1.0, "rewards/chosen": -2.875516891479492, "rewards/margins": 8.01228141784668, "rewards/rejected": -10.887797355651855, "step": 310 }, { "epoch": 0.19346811819595647, "grad_norm": 0.2227923423051834, "learning_rate": 4.481327800829876e-06, "logits/chosen": -4.273709297180176, "logits/rejected": 0.8016602396965027, "logps/chosen": -360.2322692871094, "logps/rejected": -753.0556030273438, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": -0.8820947408676147, "rewards/margins": 11.81657600402832, "rewards/rejected": -12.698671340942383, "step": 311 }, { "epoch": 0.19409020217729395, "grad_norm": 11.25919246673584, "learning_rate": 4.477869986168742e-06, "logits/chosen": -0.7706673741340637, "logits/rejected": 0.3160168528556824, "logps/chosen": -534.1943359375, "logps/rejected": -618.6209106445312, "loss": 0.3645, "rewards/accuracies": 0.875, "rewards/chosen": -3.824570417404175, "rewards/margins": 4.815325736999512, "rewards/rejected": -8.639896392822266, "step": 312 }, { "epoch": 0.19471228615863143, "grad_norm": 18.701457977294922, "learning_rate": 4.4744121715076075e-06, "logits/chosen": 0.26434850692749023, "logits/rejected": 1.9613255262374878, "logps/chosen": -628.41064453125, "logps/rejected": -716.9277954101562, "loss": 2.1107, "rewards/accuracies": 0.75, "rewards/chosen": -6.078549861907959, "rewards/margins": 3.6997904777526855, "rewards/rejected": -9.778340339660645, "step": 313 }, { "epoch": 0.1953343701399689, "grad_norm": 10.838750839233398, "learning_rate": 4.470954356846474e-06, "logits/chosen": -0.10986679792404175, "logits/rejected": 0.06669248640537262, "logps/chosen": -546.940673828125, "logps/rejected": -649.124755859375, "loss": 0.475, "rewards/accuracies": 0.875, "rewards/chosen": -2.6605281829833984, "rewards/margins": 5.670535564422607, "rewards/rejected": -8.331063270568848, "step": 314 }, { "epoch": 0.19595645412130638, "grad_norm": 2.817345380783081, "learning_rate": 4.46749654218534e-06, "logits/chosen": 1.9279276132583618, "logits/rejected": 2.1342928409576416, "logps/chosen": -667.2313232421875, "logps/rejected": -757.1134643554688, "loss": 0.0497, "rewards/accuracies": 1.0, "rewards/chosen": -5.024358749389648, "rewards/margins": 5.17492151260376, "rewards/rejected": -10.19927978515625, "step": 315 }, { "epoch": 0.19657853810264386, "grad_norm": 10.432488441467285, "learning_rate": 4.464038727524205e-06, "logits/chosen": -2.2854340076446533, "logits/rejected": -0.010273575782775879, "logps/chosen": -442.29052734375, "logps/rejected": -622.2279052734375, "loss": 0.4361, "rewards/accuracies": 0.875, "rewards/chosen": -3.6225013732910156, "rewards/margins": 4.12770938873291, "rewards/rejected": -7.750210762023926, "step": 316 }, { "epoch": 0.19720062208398134, "grad_norm": 12.273526191711426, "learning_rate": 4.460580912863071e-06, "logits/chosen": -1.980130672454834, "logits/rejected": 0.2232046276330948, "logps/chosen": -473.36077880859375, "logps/rejected": -722.2630004882812, "loss": 0.7692, "rewards/accuracies": 0.875, "rewards/chosen": -3.44732666015625, "rewards/margins": 5.5961594581604, "rewards/rejected": -9.043485641479492, "step": 317 }, { "epoch": 0.19782270606531882, "grad_norm": 2.9009850025177, "learning_rate": 4.457123098201937e-06, "logits/chosen": -1.7809903621673584, "logits/rejected": 1.912184476852417, "logps/chosen": -410.5981140136719, "logps/rejected": -789.62353515625, "loss": 0.06, "rewards/accuracies": 1.0, "rewards/chosen": -1.9977567195892334, "rewards/margins": 6.327420234680176, "rewards/rejected": -8.325177192687988, "step": 318 }, { "epoch": 0.1984447900466563, "grad_norm": 15.945998191833496, "learning_rate": 4.453665283540803e-06, "logits/chosen": 0.22853931784629822, "logits/rejected": 0.057748615741729736, "logps/chosen": -470.5281982421875, "logps/rejected": -534.501220703125, "loss": 1.2058, "rewards/accuracies": 0.75, "rewards/chosen": -3.9762463569641113, "rewards/margins": 4.095469951629639, "rewards/rejected": -8.07171630859375, "step": 319 }, { "epoch": 0.19906687402799378, "grad_norm": 4.801665782928467, "learning_rate": 4.450207468879668e-06, "logits/chosen": -1.0848209857940674, "logits/rejected": 2.9608421325683594, "logps/chosen": -440.13043212890625, "logps/rejected": -769.8974609375, "loss": 0.1198, "rewards/accuracies": 0.875, "rewards/chosen": -3.172727108001709, "rewards/margins": 4.850074291229248, "rewards/rejected": -8.022801399230957, "step": 320 }, { "epoch": 0.19968895800933126, "grad_norm": 10.37282657623291, "learning_rate": 4.446749654218534e-06, "logits/chosen": 0.07172119617462158, "logits/rejected": 2.123762845993042, "logps/chosen": -482.71063232421875, "logps/rejected": -741.4547729492188, "loss": 0.4479, "rewards/accuracies": 0.875, "rewards/chosen": -4.179274559020996, "rewards/margins": 6.421230316162109, "rewards/rejected": -10.600503921508789, "step": 321 }, { "epoch": 0.20031104199066874, "grad_norm": 10.727299690246582, "learning_rate": 4.4432918395574e-06, "logits/chosen": -2.4296586513519287, "logits/rejected": 1.3786389827728271, "logps/chosen": -405.37939453125, "logps/rejected": -724.1179809570312, "loss": 0.1677, "rewards/accuracies": 0.875, "rewards/chosen": -3.8129231929779053, "rewards/margins": 6.650745391845703, "rewards/rejected": -10.463668823242188, "step": 322 }, { "epoch": 0.20093312597200622, "grad_norm": 4.164663791656494, "learning_rate": 4.439834024896266e-06, "logits/chosen": -0.399936705827713, "logits/rejected": 0.8989405632019043, "logps/chosen": -470.7669982910156, "logps/rejected": -657.80126953125, "loss": 0.1199, "rewards/accuracies": 0.875, "rewards/chosen": -0.937808632850647, "rewards/margins": 5.925185680389404, "rewards/rejected": -6.862994194030762, "step": 323 }, { "epoch": 0.2015552099533437, "grad_norm": 11.699934959411621, "learning_rate": 4.4363762102351316e-06, "logits/chosen": -0.8834425210952759, "logits/rejected": 1.2995545864105225, "logps/chosen": -407.9013671875, "logps/rejected": -597.3431396484375, "loss": 0.2833, "rewards/accuracies": 0.875, "rewards/chosen": -2.3537039756774902, "rewards/margins": 4.3665032386779785, "rewards/rejected": -6.720207214355469, "step": 324 }, { "epoch": 0.20217729393468117, "grad_norm": 8.346322059631348, "learning_rate": 4.432918395573997e-06, "logits/chosen": -1.6465966701507568, "logits/rejected": 0.3020745515823364, "logps/chosen": -403.94146728515625, "logps/rejected": -675.7890625, "loss": 0.4454, "rewards/accuracies": 0.75, "rewards/chosen": -2.4034719467163086, "rewards/margins": 6.286895751953125, "rewards/rejected": -8.69036865234375, "step": 325 }, { "epoch": 0.20279937791601865, "grad_norm": 1.0281274318695068, "learning_rate": 4.429460580912863e-06, "logits/chosen": -0.17888861894607544, "logits/rejected": 2.0089051723480225, "logps/chosen": -501.64215087890625, "logps/rejected": -710.72509765625, "loss": 0.0124, "rewards/accuracies": 1.0, "rewards/chosen": -1.9193187952041626, "rewards/margins": 9.336112976074219, "rewards/rejected": -11.25543212890625, "step": 326 }, { "epoch": 0.20342146189735613, "grad_norm": 4.884328365325928, "learning_rate": 4.4260027662517294e-06, "logits/chosen": -0.9429863691329956, "logits/rejected": 1.918354868888855, "logps/chosen": -455.91998291015625, "logps/rejected": -646.3065185546875, "loss": 0.1416, "rewards/accuracies": 1.0, "rewards/chosen": -1.0959547758102417, "rewards/margins": 4.100955486297607, "rewards/rejected": -5.1969099044799805, "step": 327 }, { "epoch": 0.2040435458786936, "grad_norm": 1.553388237953186, "learning_rate": 4.422544951590595e-06, "logits/chosen": -0.8443880081176758, "logits/rejected": 1.81691575050354, "logps/chosen": -270.8441467285156, "logps/rejected": -569.8352661132812, "loss": 0.0866, "rewards/accuracies": 1.0, "rewards/chosen": -1.1609303951263428, "rewards/margins": 8.997220993041992, "rewards/rejected": -10.158150672912598, "step": 328 }, { "epoch": 0.20466562986003112, "grad_norm": 10.504783630371094, "learning_rate": 4.419087136929461e-06, "logits/chosen": 1.433431625366211, "logits/rejected": 1.5410677194595337, "logps/chosen": -775.3016357421875, "logps/rejected": -827.853271484375, "loss": 0.4406, "rewards/accuracies": 0.875, "rewards/chosen": -6.494405746459961, "rewards/margins": 6.968729019165039, "rewards/rejected": -13.463134765625, "step": 329 }, { "epoch": 0.2052877138413686, "grad_norm": 3.2079076766967773, "learning_rate": 4.4156293222683265e-06, "logits/chosen": -1.5707435607910156, "logits/rejected": 2.8474361896514893, "logps/chosen": -269.7269287109375, "logps/rejected": -542.3631591796875, "loss": 0.0931, "rewards/accuracies": 1.0, "rewards/chosen": 0.29627853631973267, "rewards/margins": 4.609586715698242, "rewards/rejected": -4.313308238983154, "step": 330 }, { "epoch": 0.20590979782270608, "grad_norm": 12.789731979370117, "learning_rate": 4.412171507607192e-06, "logits/chosen": -0.17676740884780884, "logits/rejected": 1.3701732158660889, "logps/chosen": -625.8714599609375, "logps/rejected": -686.5986328125, "loss": 0.72, "rewards/accuracies": 0.875, "rewards/chosen": -3.107220411300659, "rewards/margins": 4.961567401885986, "rewards/rejected": -8.068787574768066, "step": 331 }, { "epoch": 0.20653188180404355, "grad_norm": 7.869114398956299, "learning_rate": 4.408713692946059e-06, "logits/chosen": -2.1783716678619385, "logits/rejected": 0.5606423020362854, "logps/chosen": -393.09136962890625, "logps/rejected": -628.9302978515625, "loss": 0.1568, "rewards/accuracies": 0.875, "rewards/chosen": -1.4047056436538696, "rewards/margins": 5.342854976654053, "rewards/rejected": -6.747560501098633, "step": 332 }, { "epoch": 0.20715396578538103, "grad_norm": 11.267909049987793, "learning_rate": 4.405255878284924e-06, "logits/chosen": 1.4840877056121826, "logits/rejected": 2.9914982318878174, "logps/chosen": -609.139892578125, "logps/rejected": -756.2391357421875, "loss": 0.4872, "rewards/accuracies": 0.875, "rewards/chosen": -2.634385108947754, "rewards/margins": 2.8338968753814697, "rewards/rejected": -5.4682817459106445, "step": 333 }, { "epoch": 0.2077760497667185, "grad_norm": 10.5465726852417, "learning_rate": 4.40179806362379e-06, "logits/chosen": 2.0508370399475098, "logits/rejected": 2.819303512573242, "logps/chosen": -604.7826538085938, "logps/rejected": -693.8785400390625, "loss": 0.6558, "rewards/accuracies": 0.75, "rewards/chosen": -2.798710823059082, "rewards/margins": 5.256593704223633, "rewards/rejected": -8.055304527282715, "step": 334 }, { "epoch": 0.208398133748056, "grad_norm": 7.096404552459717, "learning_rate": 4.398340248962656e-06, "logits/chosen": -0.8837496638298035, "logits/rejected": 1.574455738067627, "logps/chosen": -442.5992431640625, "logps/rejected": -671.376953125, "loss": 0.2213, "rewards/accuracies": 0.875, "rewards/chosen": -1.811486005783081, "rewards/margins": 7.157325744628906, "rewards/rejected": -8.968811988830566, "step": 335 }, { "epoch": 0.20902021772939347, "grad_norm": 29.052284240722656, "learning_rate": 4.394882434301521e-06, "logits/chosen": -1.6337785720825195, "logits/rejected": 1.8117969036102295, "logps/chosen": -468.60821533203125, "logps/rejected": -697.0687255859375, "loss": 2.4599, "rewards/accuracies": 0.5, "rewards/chosen": -5.068663597106934, "rewards/margins": 1.6524235010147095, "rewards/rejected": -6.721087455749512, "step": 336 }, { "epoch": 0.20964230171073095, "grad_norm": 7.647010803222656, "learning_rate": 4.391424619640388e-06, "logits/chosen": -2.101735830307007, "logits/rejected": 1.4360787868499756, "logps/chosen": -362.33514404296875, "logps/rejected": -629.942138671875, "loss": 0.1064, "rewards/accuracies": 1.0, "rewards/chosen": -3.990177631378174, "rewards/margins": 5.600926399230957, "rewards/rejected": -9.591104507446289, "step": 337 }, { "epoch": 0.21026438569206843, "grad_norm": 9.697122573852539, "learning_rate": 4.3879668049792536e-06, "logits/chosen": -0.4014623165130615, "logits/rejected": 0.5092198252677917, "logps/chosen": -561.0017700195312, "logps/rejected": -683.2088623046875, "loss": 0.2896, "rewards/accuracies": 0.875, "rewards/chosen": -2.4962167739868164, "rewards/margins": 5.576139450073242, "rewards/rejected": -8.072355270385742, "step": 338 }, { "epoch": 0.2108864696734059, "grad_norm": 1.8784066438674927, "learning_rate": 4.384508990318119e-06, "logits/chosen": 0.5263804197311401, "logits/rejected": 2.394399404525757, "logps/chosen": -414.29254150390625, "logps/rejected": -612.1961059570312, "loss": 0.0462, "rewards/accuracies": 1.0, "rewards/chosen": -2.134528160095215, "rewards/margins": 7.594970226287842, "rewards/rejected": -9.729498863220215, "step": 339 }, { "epoch": 0.2115085536547434, "grad_norm": 15.18982219696045, "learning_rate": 4.381051175656985e-06, "logits/chosen": -1.109804391860962, "logits/rejected": 1.8843512535095215, "logps/chosen": -412.0243835449219, "logps/rejected": -624.362060546875, "loss": 0.7618, "rewards/accuracies": 0.75, "rewards/chosen": -2.518000602722168, "rewards/margins": 6.094438552856445, "rewards/rejected": -8.61244010925293, "step": 340 }, { "epoch": 0.21213063763608087, "grad_norm": 15.113494873046875, "learning_rate": 4.3775933609958506e-06, "logits/chosen": 0.7939980030059814, "logits/rejected": 2.067089557647705, "logps/chosen": -659.59814453125, "logps/rejected": -798.05126953125, "loss": 1.1232, "rewards/accuracies": 0.875, "rewards/chosen": -2.6654255390167236, "rewards/margins": 6.175906181335449, "rewards/rejected": -8.841331481933594, "step": 341 }, { "epoch": 0.21275272161741834, "grad_norm": 1.8084304332733154, "learning_rate": 4.374135546334717e-06, "logits/chosen": -2.7187037467956543, "logits/rejected": 1.670148253440857, "logps/chosen": -277.630859375, "logps/rejected": -637.5504760742188, "loss": 0.0229, "rewards/accuracies": 1.0, "rewards/chosen": -0.5235689878463745, "rewards/margins": 8.402307510375977, "rewards/rejected": -8.925875663757324, "step": 342 }, { "epoch": 0.21337480559875582, "grad_norm": 7.96411657333374, "learning_rate": 4.370677731673583e-06, "logits/chosen": -0.8248151540756226, "logits/rejected": 3.7894716262817383, "logps/chosen": -408.2667236328125, "logps/rejected": -749.1912841796875, "loss": 0.2153, "rewards/accuracies": 0.875, "rewards/chosen": -1.8039324283599854, "rewards/margins": 4.657402515411377, "rewards/rejected": -6.461335182189941, "step": 343 }, { "epoch": 0.2139968895800933, "grad_norm": 0.18026278913021088, "learning_rate": 4.3672199170124484e-06, "logits/chosen": -1.1616530418395996, "logits/rejected": 2.3355207443237305, "logps/chosen": -419.3733215332031, "logps/rejected": -722.0364990234375, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -4.174919605255127, "rewards/margins": 7.349852561950684, "rewards/rejected": -11.524772644042969, "step": 344 }, { "epoch": 0.21461897356143078, "grad_norm": 17.10386848449707, "learning_rate": 4.363762102351314e-06, "logits/chosen": -0.8862954378128052, "logits/rejected": 2.630567789077759, "logps/chosen": -391.5943603515625, "logps/rejected": -639.7130737304688, "loss": 0.9271, "rewards/accuracies": 0.75, "rewards/chosen": -1.9701530933380127, "rewards/margins": 3.56577205657959, "rewards/rejected": -5.535924911499023, "step": 345 }, { "epoch": 0.21524105754276826, "grad_norm": 18.30052947998047, "learning_rate": 4.360304287690181e-06, "logits/chosen": -0.8496010303497314, "logits/rejected": 1.4629822969436646, "logps/chosen": -550.462158203125, "logps/rejected": -730.0476684570312, "loss": 0.6754, "rewards/accuracies": 0.625, "rewards/chosen": -3.0293831825256348, "rewards/margins": 6.317649841308594, "rewards/rejected": -9.347033500671387, "step": 346 }, { "epoch": 0.21586314152410577, "grad_norm": 0.10825525969266891, "learning_rate": 4.356846473029046e-06, "logits/chosen": -0.3806982636451721, "logits/rejected": 1.7053426504135132, "logps/chosen": -591.6561279296875, "logps/rejected": -804.8829345703125, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -3.5087711811065674, "rewards/margins": 7.476545333862305, "rewards/rejected": -10.985316276550293, "step": 347 }, { "epoch": 0.21648522550544325, "grad_norm": 1.612794041633606, "learning_rate": 4.353388658367912e-06, "logits/chosen": -3.04921293258667, "logits/rejected": -1.265383243560791, "logps/chosen": -402.2282409667969, "logps/rejected": -615.1707763671875, "loss": 0.0401, "rewards/accuracies": 1.0, "rewards/chosen": -2.6701643466949463, "rewards/margins": 8.872179985046387, "rewards/rejected": -11.54234504699707, "step": 348 }, { "epoch": 0.21710730948678073, "grad_norm": 11.167433738708496, "learning_rate": 4.349930843706778e-06, "logits/chosen": -1.4469276666641235, "logits/rejected": 3.033637046813965, "logps/chosen": -404.5251770019531, "logps/rejected": -786.6861572265625, "loss": 0.414, "rewards/accuracies": 0.875, "rewards/chosen": -3.028228282928467, "rewards/margins": 9.016996383666992, "rewards/rejected": -12.045225143432617, "step": 349 }, { "epoch": 0.2177293934681182, "grad_norm": 0.26764681935310364, "learning_rate": 4.346473029045643e-06, "logits/chosen": -3.342344045639038, "logits/rejected": 0.4614518880844116, "logps/chosen": -272.1787109375, "logps/rejected": -643.1485595703125, "loss": 0.0092, "rewards/accuracies": 1.0, "rewards/chosen": -2.5632357597351074, "rewards/margins": 8.322427749633789, "rewards/rejected": -10.885663032531738, "step": 350 }, { "epoch": 0.21835147744945568, "grad_norm": 13.985143661499023, "learning_rate": 4.34301521438451e-06, "logits/chosen": -1.0913925170898438, "logits/rejected": -0.050348103046417236, "logps/chosen": -506.35552978515625, "logps/rejected": -670.3692626953125, "loss": 0.9299, "rewards/accuracies": 0.75, "rewards/chosen": -3.857954502105713, "rewards/margins": 3.8564233779907227, "rewards/rejected": -7.714378356933594, "step": 351 }, { "epoch": 0.21897356143079316, "grad_norm": 0.3018890917301178, "learning_rate": 4.3395573997233755e-06, "logits/chosen": -3.0925559997558594, "logits/rejected": 0.9945049285888672, "logps/chosen": -246.6046142578125, "logps/rejected": -592.920166015625, "loss": 0.0066, "rewards/accuracies": 1.0, "rewards/chosen": -1.4772467613220215, "rewards/margins": 6.9012346267700195, "rewards/rejected": -8.378480911254883, "step": 352 }, { "epoch": 0.21959564541213064, "grad_norm": 8.058971405029297, "learning_rate": 4.336099585062241e-06, "logits/chosen": -1.9562466144561768, "logits/rejected": 1.7471662759780884, "logps/chosen": -502.2210693359375, "logps/rejected": -780.571044921875, "loss": 0.353, "rewards/accuracies": 0.875, "rewards/chosen": -2.2709994316101074, "rewards/margins": 8.68028450012207, "rewards/rejected": -10.951284408569336, "step": 353 }, { "epoch": 0.22021772939346812, "grad_norm": 17.590124130249023, "learning_rate": 4.332641770401107e-06, "logits/chosen": -1.236997365951538, "logits/rejected": 1.3847992420196533, "logps/chosen": -423.5195007324219, "logps/rejected": -656.462890625, "loss": 0.1645, "rewards/accuracies": 0.875, "rewards/chosen": -3.840425491333008, "rewards/margins": 6.396016597747803, "rewards/rejected": -10.236442565917969, "step": 354 }, { "epoch": 0.2208398133748056, "grad_norm": 4.438150405883789, "learning_rate": 4.3291839557399726e-06, "logits/chosen": -1.3621207475662231, "logits/rejected": 0.6476567387580872, "logps/chosen": -338.8612060546875, "logps/rejected": -543.4127197265625, "loss": 0.1077, "rewards/accuracies": 0.875, "rewards/chosen": -1.1995749473571777, "rewards/margins": 7.936796188354492, "rewards/rejected": -9.136371612548828, "step": 355 }, { "epoch": 0.22146189735614308, "grad_norm": 4.211209774017334, "learning_rate": 4.325726141078839e-06, "logits/chosen": -0.40472009778022766, "logits/rejected": 1.751699447631836, "logps/chosen": -538.4315795898438, "logps/rejected": -751.3974609375, "loss": 0.0791, "rewards/accuracies": 1.0, "rewards/chosen": -1.2301788330078125, "rewards/margins": 7.5867695808410645, "rewards/rejected": -8.816947937011719, "step": 356 }, { "epoch": 0.22208398133748056, "grad_norm": 8.917506217956543, "learning_rate": 4.322268326417705e-06, "logits/chosen": 1.960310459136963, "logits/rejected": 2.4075114727020264, "logps/chosen": -670.6586303710938, "logps/rejected": -691.6295776367188, "loss": 0.2585, "rewards/accuracies": 0.875, "rewards/chosen": -3.862544059753418, "rewards/margins": 4.000208854675293, "rewards/rejected": -7.862753391265869, "step": 357 }, { "epoch": 0.22270606531881804, "grad_norm": 4.473892688751221, "learning_rate": 4.31881051175657e-06, "logits/chosen": -0.6828299760818481, "logits/rejected": 2.0431294441223145, "logps/chosen": -395.7737121582031, "logps/rejected": -563.7492065429688, "loss": 0.0982, "rewards/accuracies": 1.0, "rewards/chosen": -1.7389918565750122, "rewards/margins": 5.518940448760986, "rewards/rejected": -7.257932186126709, "step": 358 }, { "epoch": 0.22332814930015552, "grad_norm": 1.4848108291625977, "learning_rate": 4.315352697095436e-06, "logits/chosen": -0.020970165729522705, "logits/rejected": 2.3290677070617676, "logps/chosen": -582.4356079101562, "logps/rejected": -762.418212890625, "loss": 0.0281, "rewards/accuracies": 1.0, "rewards/chosen": -0.9743327498435974, "rewards/margins": 6.712645530700684, "rewards/rejected": -7.686977863311768, "step": 359 }, { "epoch": 0.223950233281493, "grad_norm": 7.887370586395264, "learning_rate": 4.311894882434302e-06, "logits/chosen": -0.7089744210243225, "logits/rejected": 1.7297426462173462, "logps/chosen": -536.74072265625, "logps/rejected": -753.6356201171875, "loss": 0.1974, "rewards/accuracies": 0.875, "rewards/chosen": -1.743908166885376, "rewards/margins": 8.169610977172852, "rewards/rejected": -9.913518905639648, "step": 360 }, { "epoch": 0.22457231726283047, "grad_norm": 9.539219856262207, "learning_rate": 4.308437067773168e-06, "logits/chosen": -0.46145376563072205, "logits/rejected": 0.2776165306568146, "logps/chosen": -526.5408935546875, "logps/rejected": -613.58349609375, "loss": 0.2857, "rewards/accuracies": 0.875, "rewards/chosen": -3.2432940006256104, "rewards/margins": 5.199886322021484, "rewards/rejected": -8.443180084228516, "step": 361 }, { "epoch": 0.22519440124416795, "grad_norm": 0.06694504618644714, "learning_rate": 4.304979253112034e-06, "logits/chosen": -0.37969791889190674, "logits/rejected": 3.0709779262542725, "logps/chosen": -438.948486328125, "logps/rejected": -742.4027099609375, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -3.8926944732666016, "rewards/margins": 9.088038444519043, "rewards/rejected": -12.980732917785645, "step": 362 }, { "epoch": 0.22581648522550543, "grad_norm": 7.626397609710693, "learning_rate": 4.3015214384509e-06, "logits/chosen": -1.738087773323059, "logits/rejected": 0.3525831401348114, "logps/chosen": -423.63946533203125, "logps/rejected": -605.2889404296875, "loss": 0.3046, "rewards/accuracies": 0.75, "rewards/chosen": -3.132819414138794, "rewards/margins": 3.7709484100341797, "rewards/rejected": -6.903768062591553, "step": 363 }, { "epoch": 0.2264385692068429, "grad_norm": 7.3116984367370605, "learning_rate": 4.298063623789765e-06, "logits/chosen": -0.29621705412864685, "logits/rejected": 1.8098219633102417, "logps/chosen": -570.6257934570312, "logps/rejected": -752.5721435546875, "loss": 0.1201, "rewards/accuracies": 0.875, "rewards/chosen": -4.132789611816406, "rewards/margins": 7.6176605224609375, "rewards/rejected": -11.750450134277344, "step": 364 }, { "epoch": 0.22706065318818042, "grad_norm": 0.26016244292259216, "learning_rate": 4.294605809128631e-06, "logits/chosen": -1.8099310398101807, "logits/rejected": 2.8905587196350098, "logps/chosen": -412.2474365234375, "logps/rejected": -789.8032836914062, "loss": 0.0054, "rewards/accuracies": 1.0, "rewards/chosen": -1.8394465446472168, "rewards/margins": 7.5067548751831055, "rewards/rejected": -9.346200942993164, "step": 365 }, { "epoch": 0.2276827371695179, "grad_norm": 0.9489650130271912, "learning_rate": 4.291147994467497e-06, "logits/chosen": -1.8214383125305176, "logits/rejected": 2.382387638092041, "logps/chosen": -416.0285339355469, "logps/rejected": -765.469970703125, "loss": 0.0092, "rewards/accuracies": 1.0, "rewards/chosen": -3.25539493560791, "rewards/margins": 9.41097640991211, "rewards/rejected": -12.66637134552002, "step": 366 }, { "epoch": 0.22830482115085537, "grad_norm": 2.543398380279541, "learning_rate": 4.287690179806362e-06, "logits/chosen": -0.3245619535446167, "logits/rejected": 3.1671793460845947, "logps/chosen": -449.5426025390625, "logps/rejected": -816.8492431640625, "loss": 0.0356, "rewards/accuracies": 1.0, "rewards/chosen": -2.7615981101989746, "rewards/margins": 9.901832580566406, "rewards/rejected": -12.663431167602539, "step": 367 }, { "epoch": 0.22892690513219285, "grad_norm": 0.5387323498725891, "learning_rate": 4.284232365145228e-06, "logits/chosen": -1.5377726554870605, "logits/rejected": 1.5982156991958618, "logps/chosen": -464.56610107421875, "logps/rejected": -756.6425170898438, "loss": 0.0101, "rewards/accuracies": 1.0, "rewards/chosen": -1.7165791988372803, "rewards/margins": 7.706698894500732, "rewards/rejected": -9.42327880859375, "step": 368 }, { "epoch": 0.22954898911353033, "grad_norm": 3.8628933429718018, "learning_rate": 4.2807745504840945e-06, "logits/chosen": -1.3087849617004395, "logits/rejected": 1.1342480182647705, "logps/chosen": -436.21728515625, "logps/rejected": -687.9305419921875, "loss": 0.0857, "rewards/accuracies": 1.0, "rewards/chosen": -1.67164146900177, "rewards/margins": 6.515338897705078, "rewards/rejected": -8.186980247497559, "step": 369 }, { "epoch": 0.2301710730948678, "grad_norm": 26.755435943603516, "learning_rate": 4.27731673582296e-06, "logits/chosen": -3.1240649223327637, "logits/rejected": -0.18282252550125122, "logps/chosen": -346.0662841796875, "logps/rejected": -646.2769775390625, "loss": 1.6972, "rewards/accuracies": 0.625, "rewards/chosen": -2.92862868309021, "rewards/margins": 3.9491708278656006, "rewards/rejected": -6.8777995109558105, "step": 370 }, { "epoch": 0.2307931570762053, "grad_norm": 0.7069016098976135, "learning_rate": 4.273858921161826e-06, "logits/chosen": -2.081191301345825, "logits/rejected": 2.9470438957214355, "logps/chosen": -409.5639953613281, "logps/rejected": -753.1138305664062, "loss": 0.0147, "rewards/accuracies": 1.0, "rewards/chosen": -2.1220028400421143, "rewards/margins": 7.609243392944336, "rewards/rejected": -9.731245994567871, "step": 371 }, { "epoch": 0.23141524105754277, "grad_norm": 8.341381072998047, "learning_rate": 4.2704011065006916e-06, "logits/chosen": -2.3618288040161133, "logits/rejected": 1.5979560613632202, "logps/chosen": -342.6532287597656, "logps/rejected": -651.988037109375, "loss": 0.3736, "rewards/accuracies": 0.75, "rewards/chosen": -3.0355944633483887, "rewards/margins": 4.0681962966918945, "rewards/rejected": -7.103790283203125, "step": 372 }, { "epoch": 0.23203732503888025, "grad_norm": 8.408726692199707, "learning_rate": 4.266943291839557e-06, "logits/chosen": 0.2767166495323181, "logits/rejected": 2.3922011852264404, "logps/chosen": -376.4231872558594, "logps/rejected": -503.5641174316406, "loss": 0.3779, "rewards/accuracies": 0.875, "rewards/chosen": -2.140350341796875, "rewards/margins": 4.458551406860352, "rewards/rejected": -6.598901748657227, "step": 373 }, { "epoch": 0.23265940902021773, "grad_norm": 9.788992881774902, "learning_rate": 4.263485477178424e-06, "logits/chosen": -1.7231264114379883, "logits/rejected": 0.5569015145301819, "logps/chosen": -416.5295104980469, "logps/rejected": -577.4541625976562, "loss": 0.3228, "rewards/accuracies": 0.75, "rewards/chosen": -3.496561050415039, "rewards/margins": 4.96079158782959, "rewards/rejected": -8.457352638244629, "step": 374 }, { "epoch": 0.2332814930015552, "grad_norm": 3.076627731323242, "learning_rate": 4.2600276625172894e-06, "logits/chosen": -2.7858710289001465, "logits/rejected": 2.4313647747039795, "logps/chosen": -431.3386535644531, "logps/rejected": -830.5308227539062, "loss": 0.0607, "rewards/accuracies": 1.0, "rewards/chosen": -2.62522292137146, "rewards/margins": 8.715339660644531, "rewards/rejected": -11.34056282043457, "step": 375 }, { "epoch": 0.23390357698289269, "grad_norm": 9.840999603271484, "learning_rate": 4.256569847856155e-06, "logits/chosen": -0.6546146869659424, "logits/rejected": 2.979585886001587, "logps/chosen": -485.5546875, "logps/rejected": -811.9015502929688, "loss": 0.3016, "rewards/accuracies": 0.875, "rewards/chosen": -3.141195297241211, "rewards/margins": 5.539736270904541, "rewards/rejected": -8.68093204498291, "step": 376 }, { "epoch": 0.23452566096423016, "grad_norm": 1.2640222311019897, "learning_rate": 4.253112033195021e-06, "logits/chosen": -3.195009469985962, "logits/rejected": 1.8034260272979736, "logps/chosen": -276.50323486328125, "logps/rejected": -640.6697998046875, "loss": 0.0388, "rewards/accuracies": 1.0, "rewards/chosen": -0.27514615654945374, "rewards/margins": 5.312612533569336, "rewards/rejected": -5.587759017944336, "step": 377 }, { "epoch": 0.23514774494556764, "grad_norm": 0.3993472456932068, "learning_rate": 4.2496542185338864e-06, "logits/chosen": -0.13083088397979736, "logits/rejected": 1.8090416193008423, "logps/chosen": -468.075439453125, "logps/rejected": -705.4761352539062, "loss": 0.0074, "rewards/accuracies": 1.0, "rewards/chosen": -2.493317127227783, "rewards/margins": 9.042068481445312, "rewards/rejected": -11.535385131835938, "step": 378 }, { "epoch": 0.23576982892690512, "grad_norm": 1.4940111637115479, "learning_rate": 4.246196403872753e-06, "logits/chosen": -0.3571454882621765, "logits/rejected": 2.180593967437744, "logps/chosen": -323.8885803222656, "logps/rejected": -589.1732177734375, "loss": 0.0392, "rewards/accuracies": 1.0, "rewards/chosen": -2.7886757850646973, "rewards/margins": 6.285494804382324, "rewards/rejected": -9.07417106628418, "step": 379 }, { "epoch": 0.2363919129082426, "grad_norm": 15.417763710021973, "learning_rate": 4.242738589211619e-06, "logits/chosen": -1.4868839979171753, "logits/rejected": 1.1371216773986816, "logps/chosen": -523.4039916992188, "logps/rejected": -721.033203125, "loss": 1.0604, "rewards/accuracies": 0.75, "rewards/chosen": -4.2446513175964355, "rewards/margins": 6.161581516265869, "rewards/rejected": -10.406232833862305, "step": 380 }, { "epoch": 0.23701399688958008, "grad_norm": 4.814235210418701, "learning_rate": 4.239280774550484e-06, "logits/chosen": -0.4791300296783447, "logits/rejected": 2.120157480239868, "logps/chosen": -418.82208251953125, "logps/rejected": -647.6483154296875, "loss": 0.1771, "rewards/accuracies": 0.875, "rewards/chosen": -2.348912239074707, "rewards/margins": 5.31494665145874, "rewards/rejected": -7.663858890533447, "step": 381 }, { "epoch": 0.2376360808709176, "grad_norm": 13.02233600616455, "learning_rate": 4.23582295988935e-06, "logits/chosen": -3.1909708976745605, "logits/rejected": 0.8097673654556274, "logps/chosen": -354.4810485839844, "logps/rejected": -639.628173828125, "loss": 0.6952, "rewards/accuracies": 0.875, "rewards/chosen": -2.326988697052002, "rewards/margins": 6.82171106338501, "rewards/rejected": -9.148699760437012, "step": 382 }, { "epoch": 0.23825816485225507, "grad_norm": 10.776427268981934, "learning_rate": 4.232365145228216e-06, "logits/chosen": 0.8315434455871582, "logits/rejected": 1.6661361455917358, "logps/chosen": -483.113037109375, "logps/rejected": -576.11181640625, "loss": 0.3513, "rewards/accuracies": 0.875, "rewards/chosen": -2.1431586742401123, "rewards/margins": 4.914085865020752, "rewards/rejected": -7.057245254516602, "step": 383 }, { "epoch": 0.23888024883359255, "grad_norm": 8.599739074707031, "learning_rate": 4.228907330567082e-06, "logits/chosen": -3.410144329071045, "logits/rejected": 1.924527645111084, "logps/chosen": -182.75747680664062, "logps/rejected": -582.1602172851562, "loss": 0.1865, "rewards/accuracies": 0.875, "rewards/chosen": -0.8336835503578186, "rewards/margins": 4.1925201416015625, "rewards/rejected": -5.026203155517578, "step": 384 }, { "epoch": 0.23950233281493002, "grad_norm": 10.483428955078125, "learning_rate": 4.225449515905948e-06, "logits/chosen": -0.25581762194633484, "logits/rejected": 2.4798948764801025, "logps/chosen": -380.013427734375, "logps/rejected": -641.7288208007812, "loss": 0.2774, "rewards/accuracies": 0.875, "rewards/chosen": -2.6057300567626953, "rewards/margins": 4.600205421447754, "rewards/rejected": -7.205935478210449, "step": 385 }, { "epoch": 0.2401244167962675, "grad_norm": 3.814098834991455, "learning_rate": 4.2219917012448135e-06, "logits/chosen": -3.175114154815674, "logits/rejected": 2.2965762615203857, "logps/chosen": -299.7555847167969, "logps/rejected": -755.8237915039062, "loss": 0.0626, "rewards/accuracies": 1.0, "rewards/chosen": -2.373093605041504, "rewards/margins": 9.553924560546875, "rewards/rejected": -11.927020072937012, "step": 386 }, { "epoch": 0.24074650077760498, "grad_norm": 15.095627784729004, "learning_rate": 4.218533886583679e-06, "logits/chosen": -0.12323138117790222, "logits/rejected": 1.7435098886489868, "logps/chosen": -624.6740112304688, "logps/rejected": -790.91455078125, "loss": 0.9417, "rewards/accuracies": 0.625, "rewards/chosen": -4.97245454788208, "rewards/margins": 3.9849815368652344, "rewards/rejected": -8.957435607910156, "step": 387 }, { "epoch": 0.24136858475894246, "grad_norm": 1.5037442445755005, "learning_rate": 4.215076071922546e-06, "logits/chosen": -0.5295828580856323, "logits/rejected": 1.5840624570846558, "logps/chosen": -380.03424072265625, "logps/rejected": -645.4027099609375, "loss": 0.0371, "rewards/accuracies": 1.0, "rewards/chosen": -2.0629801750183105, "rewards/margins": 9.36235523223877, "rewards/rejected": -11.425335884094238, "step": 388 }, { "epoch": 0.24199066874027994, "grad_norm": 11.285853385925293, "learning_rate": 4.211618257261411e-06, "logits/chosen": -1.9761393070220947, "logits/rejected": 1.48415207862854, "logps/chosen": -500.66143798828125, "logps/rejected": -800.345703125, "loss": 0.5824, "rewards/accuracies": 0.875, "rewards/chosen": -3.866258144378662, "rewards/margins": 7.1595001220703125, "rewards/rejected": -11.025758743286133, "step": 389 }, { "epoch": 0.24261275272161742, "grad_norm": 10.775274276733398, "learning_rate": 4.208160442600277e-06, "logits/chosen": 3.013396739959717, "logits/rejected": 3.5874881744384766, "logps/chosen": -683.9315795898438, "logps/rejected": -819.1393432617188, "loss": 0.3348, "rewards/accuracies": 0.875, "rewards/chosen": -2.1036128997802734, "rewards/margins": 6.154485702514648, "rewards/rejected": -8.258098602294922, "step": 390 }, { "epoch": 0.2432348367029549, "grad_norm": 13.624896049499512, "learning_rate": 4.204702627939143e-06, "logits/chosen": 0.3594086468219757, "logits/rejected": 0.552801787853241, "logps/chosen": -527.83203125, "logps/rejected": -519.905029296875, "loss": 0.6742, "rewards/accuracies": 0.625, "rewards/chosen": -3.453556776046753, "rewards/margins": 2.08355712890625, "rewards/rejected": -5.537114143371582, "step": 391 }, { "epoch": 0.24385692068429238, "grad_norm": 0.3872527778148651, "learning_rate": 4.2012448132780084e-06, "logits/chosen": -1.3638038635253906, "logits/rejected": 2.390726327896118, "logps/chosen": -448.9990234375, "logps/rejected": -807.5977783203125, "loss": 0.0053, "rewards/accuracies": 1.0, "rewards/chosen": -1.969994306564331, "rewards/margins": 12.557906150817871, "rewards/rejected": -14.527899742126465, "step": 392 }, { "epoch": 0.24447900466562986, "grad_norm": 1.7327483892440796, "learning_rate": 4.197786998616875e-06, "logits/chosen": 1.0285428762435913, "logits/rejected": 2.0675604343414307, "logps/chosen": -541.8523559570312, "logps/rejected": -640.2545166015625, "loss": 0.0392, "rewards/accuracies": 1.0, "rewards/chosen": -5.347561836242676, "rewards/margins": 5.852660179138184, "rewards/rejected": -11.20022201538086, "step": 393 }, { "epoch": 0.24510108864696734, "grad_norm": 9.486278533935547, "learning_rate": 4.194329183955741e-06, "logits/chosen": -0.3046323359012604, "logits/rejected": 1.8420014381408691, "logps/chosen": -493.2582092285156, "logps/rejected": -659.5086059570312, "loss": 0.3556, "rewards/accuracies": 0.75, "rewards/chosen": -3.393686294555664, "rewards/margins": 5.061479568481445, "rewards/rejected": -8.45516586303711, "step": 394 }, { "epoch": 0.24572317262830481, "grad_norm": 13.542400360107422, "learning_rate": 4.190871369294606e-06, "logits/chosen": -0.34797245264053345, "logits/rejected": 1.3726774454116821, "logps/chosen": -553.9444580078125, "logps/rejected": -791.171630859375, "loss": 0.5749, "rewards/accuracies": 0.75, "rewards/chosen": -5.178127765655518, "rewards/margins": 5.395973205566406, "rewards/rejected": -10.574100494384766, "step": 395 }, { "epoch": 0.2463452566096423, "grad_norm": 13.488784790039062, "learning_rate": 4.187413554633472e-06, "logits/chosen": -1.592177152633667, "logits/rejected": 2.669046640396118, "logps/chosen": -462.88580322265625, "logps/rejected": -808.7183837890625, "loss": 0.4073, "rewards/accuracies": 0.875, "rewards/chosen": -3.7671890258789062, "rewards/margins": 6.608315944671631, "rewards/rejected": -10.375504493713379, "step": 396 }, { "epoch": 0.24696734059097977, "grad_norm": 5.2294158935546875, "learning_rate": 4.183955739972338e-06, "logits/chosen": 0.39239072799682617, "logits/rejected": 2.5534796714782715, "logps/chosen": -574.8798217773438, "logps/rejected": -798.0393676757812, "loss": 0.065, "rewards/accuracies": 1.0, "rewards/chosen": -4.108112335205078, "rewards/margins": 8.44875431060791, "rewards/rejected": -12.556866645812988, "step": 397 }, { "epoch": 0.24758942457231725, "grad_norm": 4.3387298583984375, "learning_rate": 4.180497925311204e-06, "logits/chosen": -3.1214184761047363, "logits/rejected": 0.19533616304397583, "logps/chosen": -431.1092529296875, "logps/rejected": -762.7857055664062, "loss": 0.0595, "rewards/accuracies": 1.0, "rewards/chosen": -3.425445079803467, "rewards/margins": 8.505861282348633, "rewards/rejected": -11.931305885314941, "step": 398 }, { "epoch": 0.24821150855365473, "grad_norm": 16.391197204589844, "learning_rate": 4.17704011065007e-06, "logits/chosen": -1.1457401514053345, "logits/rejected": 2.1615147590637207, "logps/chosen": -474.6165771484375, "logps/rejected": -795.545654296875, "loss": 0.5204, "rewards/accuracies": 0.875, "rewards/chosen": -4.87215518951416, "rewards/margins": 7.079375267028809, "rewards/rejected": -11.951530456542969, "step": 399 }, { "epoch": 0.24883359253499224, "grad_norm": 15.576864242553711, "learning_rate": 4.1735822959889355e-06, "logits/chosen": -2.6615185737609863, "logits/rejected": 1.806326150894165, "logps/chosen": -378.64788818359375, "logps/rejected": -701.8564453125, "loss": 0.6825, "rewards/accuracies": 0.75, "rewards/chosen": -2.1429715156555176, "rewards/margins": 5.234518527984619, "rewards/rejected": -7.377490043640137, "step": 400 }, { "epoch": 0.24945567651632972, "grad_norm": 4.522780418395996, "learning_rate": 4.170124481327801e-06, "logits/chosen": -3.047952890396118, "logits/rejected": -0.42208606004714966, "logps/chosen": -431.545166015625, "logps/rejected": -713.1854248046875, "loss": 0.1216, "rewards/accuracies": 1.0, "rewards/chosen": -4.081017017364502, "rewards/margins": 4.864312648773193, "rewards/rejected": -8.945329666137695, "step": 401 }, { "epoch": 0.25007776049766717, "grad_norm": 0.7030394673347473, "learning_rate": 4.166666666666667e-06, "logits/chosen": -2.086683988571167, "logits/rejected": 0.7389193773269653, "logps/chosen": -524.8357543945312, "logps/rejected": -753.16650390625, "loss": 0.0078, "rewards/accuracies": 1.0, "rewards/chosen": -3.1568245887756348, "rewards/margins": 9.150629997253418, "rewards/rejected": -12.307455062866211, "step": 402 }, { "epoch": 0.2506998444790047, "grad_norm": 3.6948044300079346, "learning_rate": 4.163208852005533e-06, "logits/chosen": 0.1012871265411377, "logits/rejected": 2.384535312652588, "logps/chosen": -299.6929016113281, "logps/rejected": -493.1545104980469, "loss": 0.3402, "rewards/accuracies": 0.75, "rewards/chosen": -3.2360973358154297, "rewards/margins": 4.704954624176025, "rewards/rejected": -7.941051959991455, "step": 403 }, { "epoch": 0.2513219284603421, "grad_norm": 0.9489257335662842, "learning_rate": 4.159751037344399e-06, "logits/chosen": -2.9379568099975586, "logits/rejected": 0.08956009149551392, "logps/chosen": -327.6954040527344, "logps/rejected": -554.350830078125, "loss": 0.0229, "rewards/accuracies": 1.0, "rewards/chosen": -2.7191359996795654, "rewards/margins": 6.988147258758545, "rewards/rejected": -9.707283973693848, "step": 404 }, { "epoch": 0.25194401244167963, "grad_norm": 14.700804710388184, "learning_rate": 4.156293222683265e-06, "logits/chosen": -0.4967370927333832, "logits/rejected": 1.8412714004516602, "logps/chosen": -524.108642578125, "logps/rejected": -698.3900146484375, "loss": 0.8405, "rewards/accuracies": 0.875, "rewards/chosen": -4.975113868713379, "rewards/margins": 5.8642778396606445, "rewards/rejected": -10.839391708374023, "step": 405 }, { "epoch": 0.2525660964230171, "grad_norm": 13.401819229125977, "learning_rate": 4.15283540802213e-06, "logits/chosen": -2.302302837371826, "logits/rejected": 2.548823118209839, "logps/chosen": -499.7018127441406, "logps/rejected": -824.0438232421875, "loss": 0.5382, "rewards/accuracies": 0.75, "rewards/chosen": -4.208133697509766, "rewards/margins": 4.3818440437316895, "rewards/rejected": -8.589977264404297, "step": 406 }, { "epoch": 0.2531881804043546, "grad_norm": 8.136439323425293, "learning_rate": 4.149377593360996e-06, "logits/chosen": 1.4156560897827148, "logits/rejected": 2.992307424545288, "logps/chosen": -481.8917236328125, "logps/rejected": -661.4564819335938, "loss": 0.2518, "rewards/accuracies": 0.75, "rewards/chosen": -1.1836870908737183, "rewards/margins": 3.5187997817993164, "rewards/rejected": -4.702486991882324, "step": 407 }, { "epoch": 0.25381026438569204, "grad_norm": 0.3760748505592346, "learning_rate": 4.145919778699863e-06, "logits/chosen": -1.6649041175842285, "logits/rejected": 2.378730535507202, "logps/chosen": -389.6162414550781, "logps/rejected": -756.239990234375, "loss": 0.0068, "rewards/accuracies": 1.0, "rewards/chosen": -2.758556604385376, "rewards/margins": 9.28575611114502, "rewards/rejected": -12.044313430786133, "step": 408 }, { "epoch": 0.25443234836702955, "grad_norm": 0.05014246329665184, "learning_rate": 4.142461964038728e-06, "logits/chosen": -3.648674488067627, "logits/rejected": 0.3448592722415924, "logps/chosen": -396.2196960449219, "logps/rejected": -748.8673706054688, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -0.8835111856460571, "rewards/margins": 9.461939811706543, "rewards/rejected": -10.345451354980469, "step": 409 }, { "epoch": 0.25505443234836706, "grad_norm": 8.239520072937012, "learning_rate": 4.139004149377593e-06, "logits/chosen": -0.6901922225952148, "logits/rejected": 2.993799924850464, "logps/chosen": -402.48504638671875, "logps/rejected": -728.810302734375, "loss": 0.2027, "rewards/accuracies": 0.75, "rewards/chosen": -2.949723720550537, "rewards/margins": 7.873325347900391, "rewards/rejected": -10.823049545288086, "step": 410 }, { "epoch": 0.2556765163297045, "grad_norm": 11.380658149719238, "learning_rate": 4.13554633471646e-06, "logits/chosen": -0.25970789790153503, "logits/rejected": 0.6348339319229126, "logps/chosen": -662.3466186523438, "logps/rejected": -814.6033935546875, "loss": 0.4004, "rewards/accuracies": 0.875, "rewards/chosen": -2.5980660915374756, "rewards/margins": 8.731612205505371, "rewards/rejected": -11.32967758178711, "step": 411 }, { "epoch": 0.256298600311042, "grad_norm": 7.356199741363525, "learning_rate": 4.132088520055325e-06, "logits/chosen": 1.1254838705062866, "logits/rejected": 0.6603316068649292, "logps/chosen": -567.7224731445312, "logps/rejected": -561.0299072265625, "loss": 0.2132, "rewards/accuracies": 0.875, "rewards/chosen": -2.272413492202759, "rewards/margins": 3.912168502807617, "rewards/rejected": -6.184581756591797, "step": 412 }, { "epoch": 0.25692068429237946, "grad_norm": 11.53735637664795, "learning_rate": 4.128630705394191e-06, "logits/chosen": -2.9751806259155273, "logits/rejected": -1.0732980966567993, "logps/chosen": -467.6898193359375, "logps/rejected": -690.5264892578125, "loss": 0.9944, "rewards/accuracies": 0.75, "rewards/chosen": -2.4858570098876953, "rewards/margins": 6.43565034866333, "rewards/rejected": -8.921506881713867, "step": 413 }, { "epoch": 0.25754276827371697, "grad_norm": 0.44377601146698, "learning_rate": 4.125172890733057e-06, "logits/chosen": -1.0966317653656006, "logits/rejected": 1.5773104429244995, "logps/chosen": -481.7767333984375, "logps/rejected": -797.425537109375, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": -2.5956127643585205, "rewards/margins": 12.305105209350586, "rewards/rejected": -14.900718688964844, "step": 414 }, { "epoch": 0.2581648522550544, "grad_norm": 14.836285591125488, "learning_rate": 4.121715076071922e-06, "logits/chosen": 0.6843374371528625, "logits/rejected": 1.082850694656372, "logps/chosen": -695.25146484375, "logps/rejected": -758.0120849609375, "loss": 1.2462, "rewards/accuracies": 0.875, "rewards/chosen": -5.546614170074463, "rewards/margins": 3.1979928016662598, "rewards/rejected": -8.744606971740723, "step": 415 }, { "epoch": 0.25878693623639193, "grad_norm": 1.1331825256347656, "learning_rate": 4.118257261410789e-06, "logits/chosen": -2.1234140396118164, "logits/rejected": 1.0490177869796753, "logps/chosen": -434.031494140625, "logps/rejected": -815.7230224609375, "loss": 0.0144, "rewards/accuracies": 1.0, "rewards/chosen": -2.217557430267334, "rewards/margins": 8.078803062438965, "rewards/rejected": -10.29636001586914, "step": 416 }, { "epoch": 0.2594090202177294, "grad_norm": 2.6942391395568848, "learning_rate": 4.1147994467496545e-06, "logits/chosen": -3.345094680786133, "logits/rejected": 0.49213069677352905, "logps/chosen": -347.250732421875, "logps/rejected": -708.26123046875, "loss": 0.0752, "rewards/accuracies": 1.0, "rewards/chosen": -2.685945510864258, "rewards/margins": 10.211959838867188, "rewards/rejected": -12.897904396057129, "step": 417 }, { "epoch": 0.2600311041990669, "grad_norm": 16.95615577697754, "learning_rate": 4.11134163208852e-06, "logits/chosen": -1.4867557287216187, "logits/rejected": 1.36226487159729, "logps/chosen": -515.5904541015625, "logps/rejected": -764.18212890625, "loss": 1.0031, "rewards/accuracies": 0.625, "rewards/chosen": -2.0072038173675537, "rewards/margins": 4.820002555847168, "rewards/rejected": -6.827206134796143, "step": 418 }, { "epoch": 0.26065318818040434, "grad_norm": 11.480805397033691, "learning_rate": 4.107883817427386e-06, "logits/chosen": -3.164961338043213, "logits/rejected": 0.7501891255378723, "logps/chosen": -319.4563903808594, "logps/rejected": -719.282958984375, "loss": 0.2541, "rewards/accuracies": 0.875, "rewards/chosen": -2.7740638256073, "rewards/margins": 9.042076110839844, "rewards/rejected": -11.816139221191406, "step": 419 }, { "epoch": 0.26127527216174184, "grad_norm": 1.732620358467102, "learning_rate": 4.1044260027662515e-06, "logits/chosen": -2.917612075805664, "logits/rejected": 0.12771211564540863, "logps/chosen": -353.97210693359375, "logps/rejected": -618.5401611328125, "loss": 0.051, "rewards/accuracies": 1.0, "rewards/chosen": -3.280808448791504, "rewards/margins": 8.01445484161377, "rewards/rejected": -11.295263290405273, "step": 420 }, { "epoch": 0.2618973561430793, "grad_norm": 10.094254493713379, "learning_rate": 4.100968188105118e-06, "logits/chosen": -0.3429168462753296, "logits/rejected": 2.639591693878174, "logps/chosen": -557.6719360351562, "logps/rejected": -862.3924560546875, "loss": 0.3045, "rewards/accuracies": 0.875, "rewards/chosen": -3.3474009037017822, "rewards/margins": 4.699325084686279, "rewards/rejected": -8.04672622680664, "step": 421 }, { "epoch": 0.2625194401244168, "grad_norm": 1.3658376932144165, "learning_rate": 4.097510373443984e-06, "logits/chosen": -1.0132036209106445, "logits/rejected": 1.2111165523529053, "logps/chosen": -490.0577392578125, "logps/rejected": -694.4006958007812, "loss": 0.0235, "rewards/accuracies": 1.0, "rewards/chosen": -1.9512234926223755, "rewards/margins": 7.13218879699707, "rewards/rejected": -9.083412170410156, "step": 422 }, { "epoch": 0.26314152410575425, "grad_norm": 8.24437427520752, "learning_rate": 4.094052558782849e-06, "logits/chosen": -0.7562248706817627, "logits/rejected": 1.9322937726974487, "logps/chosen": -353.6072998046875, "logps/rejected": -592.757080078125, "loss": 0.2691, "rewards/accuracies": 0.875, "rewards/chosen": -2.5112948417663574, "rewards/margins": 8.17378044128418, "rewards/rejected": -10.685075759887695, "step": 423 }, { "epoch": 0.26376360808709176, "grad_norm": 0.015072043053805828, "learning_rate": 4.090594744121715e-06, "logits/chosen": -0.7485297918319702, "logits/rejected": 2.1606249809265137, "logps/chosen": -440.0586853027344, "logps/rejected": -752.1514282226562, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -1.8396202325820923, "rewards/margins": 12.187982559204102, "rewards/rejected": -14.027603149414062, "step": 424 }, { "epoch": 0.2643856920684292, "grad_norm": 0.22790521383285522, "learning_rate": 4.087136929460581e-06, "logits/chosen": -1.6275590658187866, "logits/rejected": 1.2756794691085815, "logps/chosen": -302.22283935546875, "logps/rejected": -642.9537963867188, "loss": 0.0052, "rewards/accuracies": 1.0, "rewards/chosen": -2.2839369773864746, "rewards/margins": 8.674171447753906, "rewards/rejected": -10.958106994628906, "step": 425 }, { "epoch": 0.2650077760497667, "grad_norm": 11.433948516845703, "learning_rate": 4.083679114799447e-06, "logits/chosen": -0.7767620086669922, "logits/rejected": 1.9830691814422607, "logps/chosen": -322.4564208984375, "logps/rejected": -589.6417236328125, "loss": 0.2574, "rewards/accuracies": 0.875, "rewards/chosen": -0.6957510113716125, "rewards/margins": 6.553627014160156, "rewards/rejected": -7.249377727508545, "step": 426 }, { "epoch": 0.2656298600311042, "grad_norm": 0.07029318064451218, "learning_rate": 4.080221300138313e-06, "logits/chosen": -3.4179720878601074, "logits/rejected": 0.5506851673126221, "logps/chosen": -388.44989013671875, "logps/rejected": -682.4453125, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -2.119952917098999, "rewards/margins": 8.431827545166016, "rewards/rejected": -10.551780700683594, "step": 427 }, { "epoch": 0.2662519440124417, "grad_norm": 9.503891944885254, "learning_rate": 4.076763485477179e-06, "logits/chosen": 0.5065744519233704, "logits/rejected": 2.163261890411377, "logps/chosen": -428.0357666015625, "logps/rejected": -591.0986938476562, "loss": 0.3505, "rewards/accuracies": 0.875, "rewards/chosen": -1.54045569896698, "rewards/margins": 6.531261444091797, "rewards/rejected": -8.071717262268066, "step": 428 }, { "epoch": 0.2668740279937792, "grad_norm": 12.091681480407715, "learning_rate": 4.073305670816044e-06, "logits/chosen": -2.1132264137268066, "logits/rejected": 0.3465748727321625, "logps/chosen": -579.4651489257812, "logps/rejected": -822.1565551757812, "loss": 0.7986, "rewards/accuracies": 0.75, "rewards/chosen": -4.084692478179932, "rewards/margins": 6.425690174102783, "rewards/rejected": -10.510381698608398, "step": 429 }, { "epoch": 0.26749611197511663, "grad_norm": 4.998798370361328, "learning_rate": 4.06984785615491e-06, "logits/chosen": -0.37712445855140686, "logits/rejected": 0.7267013788223267, "logps/chosen": -548.6878051757812, "logps/rejected": -742.9112548828125, "loss": 0.0799, "rewards/accuracies": 1.0, "rewards/chosen": -2.1420111656188965, "rewards/margins": 8.191718101501465, "rewards/rejected": -10.33372974395752, "step": 430 }, { "epoch": 0.26811819595645414, "grad_norm": 4.494533538818359, "learning_rate": 4.0663900414937765e-06, "logits/chosen": -0.53819739818573, "logits/rejected": 0.5718010663986206, "logps/chosen": -384.4983825683594, "logps/rejected": -549.7811279296875, "loss": 0.1581, "rewards/accuracies": 0.875, "rewards/chosen": -2.1365249156951904, "rewards/margins": 5.834264755249023, "rewards/rejected": -7.970789909362793, "step": 431 }, { "epoch": 0.2687402799377916, "grad_norm": 0.8543798327445984, "learning_rate": 4.062932226832642e-06, "logits/chosen": -3.539583683013916, "logits/rejected": 1.636909008026123, "logps/chosen": -291.42694091796875, "logps/rejected": -692.2765502929688, "loss": 0.0188, "rewards/accuracies": 1.0, "rewards/chosen": -2.0390732288360596, "rewards/margins": 8.574129104614258, "rewards/rejected": -10.613202095031738, "step": 432 }, { "epoch": 0.2693623639191291, "grad_norm": 1.0238242149353027, "learning_rate": 4.059474412171508e-06, "logits/chosen": -3.1173903942108154, "logits/rejected": 0.869249701499939, "logps/chosen": -315.09735107421875, "logps/rejected": -682.9987182617188, "loss": 0.0195, "rewards/accuracies": 1.0, "rewards/chosen": -1.1753504276275635, "rewards/margins": 9.408401489257812, "rewards/rejected": -10.583751678466797, "step": 433 }, { "epoch": 0.26998444790046655, "grad_norm": 15.087620735168457, "learning_rate": 4.0560165975103735e-06, "logits/chosen": -0.9387578368186951, "logits/rejected": 1.8587124347686768, "logps/chosen": -425.2380065917969, "logps/rejected": -665.5792846679688, "loss": 0.679, "rewards/accuracies": 0.75, "rewards/chosen": -4.090317726135254, "rewards/margins": 7.1099748611450195, "rewards/rejected": -11.200292587280273, "step": 434 }, { "epoch": 0.27060653188180406, "grad_norm": 0.6596493124961853, "learning_rate": 4.05255878284924e-06, "logits/chosen": -1.2613496780395508, "logits/rejected": 2.446895122528076, "logps/chosen": -356.5009765625, "logps/rejected": -750.2744750976562, "loss": 0.0099, "rewards/accuracies": 1.0, "rewards/chosen": -1.4184625148773193, "rewards/margins": 10.808187484741211, "rewards/rejected": -12.22665023803711, "step": 435 }, { "epoch": 0.2712286158631415, "grad_norm": 7.993477821350098, "learning_rate": 4.049100968188106e-06, "logits/chosen": -0.28957557678222656, "logits/rejected": 1.2249037027359009, "logps/chosen": -606.7642822265625, "logps/rejected": -719.6381225585938, "loss": 0.1861, "rewards/accuracies": 0.875, "rewards/chosen": -4.034971237182617, "rewards/margins": 3.704296112060547, "rewards/rejected": -7.739267349243164, "step": 436 }, { "epoch": 0.271850699844479, "grad_norm": 0.38945409655570984, "learning_rate": 4.045643153526971e-06, "logits/chosen": -1.2714331150054932, "logits/rejected": 1.0209203958511353, "logps/chosen": -405.7791748046875, "logps/rejected": -700.2265014648438, "loss": 0.0132, "rewards/accuracies": 1.0, "rewards/chosen": -2.6918892860412598, "rewards/margins": 10.451664924621582, "rewards/rejected": -13.1435546875, "step": 437 }, { "epoch": 0.27247278382581647, "grad_norm": 1.1840221881866455, "learning_rate": 4.042185338865837e-06, "logits/chosen": -1.3968441486358643, "logits/rejected": 1.3399296998977661, "logps/chosen": -567.6121826171875, "logps/rejected": -735.8326416015625, "loss": 0.0124, "rewards/accuracies": 1.0, "rewards/chosen": -3.0136945247650146, "rewards/margins": 8.472586631774902, "rewards/rejected": -11.48628044128418, "step": 438 }, { "epoch": 0.273094867807154, "grad_norm": 10.056412696838379, "learning_rate": 4.038727524204703e-06, "logits/chosen": -2.4012250900268555, "logits/rejected": 0.6289938688278198, "logps/chosen": -331.75128173828125, "logps/rejected": -601.293212890625, "loss": 0.4348, "rewards/accuracies": 0.875, "rewards/chosen": -3.425358533859253, "rewards/margins": 7.584898948669434, "rewards/rejected": -11.010257720947266, "step": 439 }, { "epoch": 0.2737169517884914, "grad_norm": 1.4584107398986816, "learning_rate": 4.035269709543569e-06, "logits/chosen": -0.7807049751281738, "logits/rejected": 1.8506962060928345, "logps/chosen": -411.1268310546875, "logps/rejected": -655.4127197265625, "loss": 0.0471, "rewards/accuracies": 1.0, "rewards/chosen": -2.1555051803588867, "rewards/margins": 7.151162147521973, "rewards/rejected": -9.30666732788086, "step": 440 }, { "epoch": 0.27433903576982893, "grad_norm": 0.5005113482475281, "learning_rate": 4.031811894882435e-06, "logits/chosen": -0.7958648800849915, "logits/rejected": 0.24272465705871582, "logps/chosen": -481.47491455078125, "logps/rejected": -698.90234375, "loss": 0.0063, "rewards/accuracies": 1.0, "rewards/chosen": -2.40390682220459, "rewards/margins": 10.91330623626709, "rewards/rejected": -13.31721305847168, "step": 441 }, { "epoch": 0.2749611197511664, "grad_norm": 0.07642216980457306, "learning_rate": 4.028354080221301e-06, "logits/chosen": -2.5948753356933594, "logits/rejected": 3.3095362186431885, "logps/chosen": -222.05628967285156, "logps/rejected": -626.8717041015625, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -1.4834938049316406, "rewards/margins": 10.585895538330078, "rewards/rejected": -12.069390296936035, "step": 442 }, { "epoch": 0.2755832037325039, "grad_norm": 0.2830372154712677, "learning_rate": 4.024896265560166e-06, "logits/chosen": -1.8057490587234497, "logits/rejected": 2.8892433643341064, "logps/chosen": -384.0581359863281, "logps/rejected": -784.9635009765625, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": -1.0637799501419067, "rewards/margins": 9.079426765441895, "rewards/rejected": -10.143206596374512, "step": 443 }, { "epoch": 0.27620528771384134, "grad_norm": 0.7886137962341309, "learning_rate": 4.021438450899032e-06, "logits/chosen": -0.05486345291137695, "logits/rejected": 1.9790093898773193, "logps/chosen": -463.2649841308594, "logps/rejected": -708.4942626953125, "loss": 0.0108, "rewards/accuracies": 1.0, "rewards/chosen": -3.1040961742401123, "rewards/margins": 7.975215911865234, "rewards/rejected": -11.079312324523926, "step": 444 }, { "epoch": 0.27682737169517885, "grad_norm": 1.711745262145996, "learning_rate": 4.0179806362378985e-06, "logits/chosen": -1.203722357749939, "logits/rejected": 0.737490713596344, "logps/chosen": -282.9237060546875, "logps/rejected": -563.2532348632812, "loss": 0.0556, "rewards/accuracies": 1.0, "rewards/chosen": -1.721040964126587, "rewards/margins": 8.542791366577148, "rewards/rejected": -10.263832092285156, "step": 445 }, { "epoch": 0.27744945567651635, "grad_norm": 4.483963966369629, "learning_rate": 4.014522821576764e-06, "logits/chosen": -3.314371109008789, "logits/rejected": 1.1260671615600586, "logps/chosen": -364.2464294433594, "logps/rejected": -886.88232421875, "loss": 0.1068, "rewards/accuracies": 0.875, "rewards/chosen": -1.7224769592285156, "rewards/margins": 10.505866050720215, "rewards/rejected": -12.228343963623047, "step": 446 }, { "epoch": 0.2780715396578538, "grad_norm": 8.890092849731445, "learning_rate": 4.01106500691563e-06, "logits/chosen": -0.5041534900665283, "logits/rejected": 1.0522918701171875, "logps/chosen": -539.2713623046875, "logps/rejected": -678.0741577148438, "loss": 0.2558, "rewards/accuracies": 0.875, "rewards/chosen": -3.2883405685424805, "rewards/margins": 6.841994285583496, "rewards/rejected": -10.130334854125977, "step": 447 }, { "epoch": 0.2786936236391913, "grad_norm": 12.893355369567871, "learning_rate": 4.0076071922544955e-06, "logits/chosen": -1.932800531387329, "logits/rejected": 0.6356517672538757, "logps/chosen": -551.1233520507812, "logps/rejected": -833.0950317382812, "loss": 0.6238, "rewards/accuracies": 0.875, "rewards/chosen": -6.605610370635986, "rewards/margins": 7.4469194412231445, "rewards/rejected": -14.052530288696289, "step": 448 }, { "epoch": 0.27931570762052876, "grad_norm": 13.502975463867188, "learning_rate": 4.004149377593361e-06, "logits/chosen": -0.8044370412826538, "logits/rejected": -0.18658232688903809, "logps/chosen": -580.7380981445312, "logps/rejected": -696.0023193359375, "loss": 0.4707, "rewards/accuracies": 0.875, "rewards/chosen": -4.33844518661499, "rewards/margins": 6.502445220947266, "rewards/rejected": -10.840889930725098, "step": 449 }, { "epoch": 0.27993779160186627, "grad_norm": 0.07770481705665588, "learning_rate": 4.000691562932228e-06, "logits/chosen": -2.4881503582000732, "logits/rejected": -0.1700318455696106, "logps/chosen": -310.66546630859375, "logps/rejected": -640.001220703125, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -1.8170865774154663, "rewards/margins": 8.564971923828125, "rewards/rejected": -10.382059097290039, "step": 450 }, { "epoch": 0.2805598755832037, "grad_norm": 11.578554153442383, "learning_rate": 3.997233748271093e-06, "logits/chosen": -1.588904857635498, "logits/rejected": 0.789355993270874, "logps/chosen": -503.2755432128906, "logps/rejected": -716.3681640625, "loss": 0.3534, "rewards/accuracies": 0.875, "rewards/chosen": -2.636223793029785, "rewards/margins": 6.8417840003967285, "rewards/rejected": -9.478007316589355, "step": 451 }, { "epoch": 0.28118195956454123, "grad_norm": 0.2162892073392868, "learning_rate": 3.993775933609959e-06, "logits/chosen": -0.07021141052246094, "logits/rejected": 1.3150216341018677, "logps/chosen": -571.6189575195312, "logps/rejected": -750.8739624023438, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": -2.911626100540161, "rewards/margins": 10.442081451416016, "rewards/rejected": -13.353708267211914, "step": 452 }, { "epoch": 0.2818040435458787, "grad_norm": 1.998069167137146, "learning_rate": 3.990318118948825e-06, "logits/chosen": -1.7878814935684204, "logits/rejected": 2.6255545616149902, "logps/chosen": -383.6265869140625, "logps/rejected": -772.0843505859375, "loss": 0.0271, "rewards/accuracies": 1.0, "rewards/chosen": -4.030620098114014, "rewards/margins": 10.12645435333252, "rewards/rejected": -14.157074928283691, "step": 453 }, { "epoch": 0.2824261275272162, "grad_norm": 0.7413366436958313, "learning_rate": 3.98686030428769e-06, "logits/chosen": -1.6085729598999023, "logits/rejected": 2.655160427093506, "logps/chosen": -418.6631774902344, "logps/rejected": -795.3058471679688, "loss": 0.0084, "rewards/accuracies": 1.0, "rewards/chosen": -2.593167781829834, "rewards/margins": 9.13766098022461, "rewards/rejected": -11.730828285217285, "step": 454 }, { "epoch": 0.28304821150855364, "grad_norm": 0.18858017027378082, "learning_rate": 3.983402489626556e-06, "logits/chosen": -1.523180365562439, "logits/rejected": 0.9859204292297363, "logps/chosen": -464.97259521484375, "logps/rejected": -761.6781616210938, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -4.403613090515137, "rewards/margins": 9.982501983642578, "rewards/rejected": -14.386116027832031, "step": 455 }, { "epoch": 0.28367029548989114, "grad_norm": 1.1683863401412964, "learning_rate": 3.979944674965422e-06, "logits/chosen": 0.08320140838623047, "logits/rejected": 2.2921228408813477, "logps/chosen": -465.6105651855469, "logps/rejected": -612.6300048828125, "loss": 0.0169, "rewards/accuracies": 1.0, "rewards/chosen": -2.9610838890075684, "rewards/margins": 7.3763885498046875, "rewards/rejected": -10.337471961975098, "step": 456 }, { "epoch": 0.2842923794712286, "grad_norm": 1.2775192260742188, "learning_rate": 3.976486860304287e-06, "logits/chosen": -0.8554607629776001, "logits/rejected": 2.6879236698150635, "logps/chosen": -378.83544921875, "logps/rejected": -718.6917724609375, "loss": 0.0147, "rewards/accuracies": 1.0, "rewards/chosen": -0.5337707996368408, "rewards/margins": 9.611761093139648, "rewards/rejected": -10.145532608032227, "step": 457 }, { "epoch": 0.2849144634525661, "grad_norm": 0.23516592383384705, "learning_rate": 3.973029045643154e-06, "logits/chosen": 1.334816813468933, "logits/rejected": 2.982363224029541, "logps/chosen": -592.3729858398438, "logps/rejected": -712.529052734375, "loss": 0.0051, "rewards/accuracies": 1.0, "rewards/chosen": -1.9227317571640015, "rewards/margins": 7.585987091064453, "rewards/rejected": -9.508718490600586, "step": 458 }, { "epoch": 0.28553654743390355, "grad_norm": 6.093252182006836, "learning_rate": 3.96957123098202e-06, "logits/chosen": -2.267732620239258, "logits/rejected": 1.8570624589920044, "logps/chosen": -317.57806396484375, "logps/rejected": -698.8861694335938, "loss": 0.1445, "rewards/accuracies": 0.875, "rewards/chosen": -2.073878765106201, "rewards/margins": 11.141674041748047, "rewards/rejected": -13.21555233001709, "step": 459 }, { "epoch": 0.28615863141524106, "grad_norm": 3.540588855743408, "learning_rate": 3.966113416320885e-06, "logits/chosen": -1.906139850616455, "logits/rejected": 1.127426266670227, "logps/chosen": -413.3196716308594, "logps/rejected": -777.0026245117188, "loss": 0.0499, "rewards/accuracies": 1.0, "rewards/chosen": -3.8630237579345703, "rewards/margins": 8.190145492553711, "rewards/rejected": -12.053170204162598, "step": 460 }, { "epoch": 0.2867807153965785, "grad_norm": 10.090738296508789, "learning_rate": 3.962655601659751e-06, "logits/chosen": -0.19460391998291016, "logits/rejected": 2.142103433609009, "logps/chosen": -512.8101806640625, "logps/rejected": -728.7470703125, "loss": 0.3405, "rewards/accuracies": 0.875, "rewards/chosen": -4.451417922973633, "rewards/margins": 6.140223026275635, "rewards/rejected": -10.59164047241211, "step": 461 }, { "epoch": 0.287402799377916, "grad_norm": 11.949613571166992, "learning_rate": 3.959197786998617e-06, "logits/chosen": -1.1713415384292603, "logits/rejected": 0.7081327438354492, "logps/chosen": -442.0212097167969, "logps/rejected": -589.53662109375, "loss": 0.7857, "rewards/accuracies": 0.875, "rewards/chosen": -3.9480443000793457, "rewards/margins": 3.228327751159668, "rewards/rejected": -7.176372051239014, "step": 462 }, { "epoch": 0.2880248833592535, "grad_norm": 10.914491653442383, "learning_rate": 3.955739972337483e-06, "logits/chosen": -1.4142849445343018, "logits/rejected": 0.4412750005722046, "logps/chosen": -494.719482421875, "logps/rejected": -697.0148315429688, "loss": 0.3814, "rewards/accuracies": 0.75, "rewards/chosen": -4.63057804107666, "rewards/margins": 9.217939376831055, "rewards/rejected": -13.848516464233398, "step": 463 }, { "epoch": 0.288646967340591, "grad_norm": 0.5689676403999329, "learning_rate": 3.952282157676349e-06, "logits/chosen": -0.8130858540534973, "logits/rejected": 1.5666877031326294, "logps/chosen": -548.7515258789062, "logps/rejected": -841.9982299804688, "loss": 0.009, "rewards/accuracies": 1.0, "rewards/chosen": -4.7955403327941895, "rewards/margins": 9.551549911499023, "rewards/rejected": -14.347089767456055, "step": 464 }, { "epoch": 0.2892690513219285, "grad_norm": 7.0283122062683105, "learning_rate": 3.9488243430152145e-06, "logits/chosen": 0.40010160207748413, "logits/rejected": 2.0079169273376465, "logps/chosen": -400.7889404296875, "logps/rejected": -566.4635620117188, "loss": 0.2719, "rewards/accuracies": 0.875, "rewards/chosen": -3.2027175426483154, "rewards/margins": 5.441956520080566, "rewards/rejected": -8.644674301147461, "step": 465 }, { "epoch": 0.28989113530326593, "grad_norm": 0.8471835255622864, "learning_rate": 3.94536652835408e-06, "logits/chosen": -3.039276599884033, "logits/rejected": 1.3814703226089478, "logps/chosen": -428.572998046875, "logps/rejected": -784.4898681640625, "loss": 0.0172, "rewards/accuracies": 1.0, "rewards/chosen": -2.8789889812469482, "rewards/margins": 7.9259514808654785, "rewards/rejected": -10.804940223693848, "step": 466 }, { "epoch": 0.29051321928460344, "grad_norm": 12.962052345275879, "learning_rate": 3.941908713692946e-06, "logits/chosen": -0.18432247638702393, "logits/rejected": 1.407023549079895, "logps/chosen": -603.5198974609375, "logps/rejected": -804.68017578125, "loss": 0.8628, "rewards/accuracies": 0.875, "rewards/chosen": -3.4652230739593506, "rewards/margins": 4.353940010070801, "rewards/rejected": -7.8191633224487305, "step": 467 }, { "epoch": 0.2911353032659409, "grad_norm": 9.906501770019531, "learning_rate": 3.938450899031812e-06, "logits/chosen": -0.8367794752120972, "logits/rejected": 1.9189621210098267, "logps/chosen": -479.49273681640625, "logps/rejected": -744.3640747070312, "loss": 0.1965, "rewards/accuracies": 0.875, "rewards/chosen": -5.292819499969482, "rewards/margins": 5.747117042541504, "rewards/rejected": -11.039936065673828, "step": 468 }, { "epoch": 0.2917573872472784, "grad_norm": 14.459101676940918, "learning_rate": 3.934993084370678e-06, "logits/chosen": -3.0206830501556396, "logits/rejected": -0.1902725100517273, "logps/chosen": -477.8424987792969, "logps/rejected": -755.1578369140625, "loss": 0.9431, "rewards/accuracies": 0.875, "rewards/chosen": -5.0963945388793945, "rewards/margins": 8.342973709106445, "rewards/rejected": -13.43936824798584, "step": 469 }, { "epoch": 0.29237947122861585, "grad_norm": 0.2950341999530792, "learning_rate": 3.931535269709544e-06, "logits/chosen": -2.4391674995422363, "logits/rejected": 2.6565160751342773, "logps/chosen": -277.8940124511719, "logps/rejected": -807.7977294921875, "loss": 0.0063, "rewards/accuracies": 1.0, "rewards/chosen": -2.009671688079834, "rewards/margins": 14.544286727905273, "rewards/rejected": -16.553958892822266, "step": 470 }, { "epoch": 0.29300155520995336, "grad_norm": 1.3387963771820068, "learning_rate": 3.928077455048409e-06, "logits/chosen": -2.361771583557129, "logits/rejected": 1.9186456203460693, "logps/chosen": -361.03387451171875, "logps/rejected": -750.7638549804688, "loss": 0.0297, "rewards/accuracies": 1.0, "rewards/chosen": -0.7042896747589111, "rewards/margins": 8.28709602355957, "rewards/rejected": -8.991385459899902, "step": 471 }, { "epoch": 0.2936236391912908, "grad_norm": 8.258716583251953, "learning_rate": 3.924619640387275e-06, "logits/chosen": 2.502845287322998, "logits/rejected": 2.428725242614746, "logps/chosen": -604.6146240234375, "logps/rejected": -652.1255493164062, "loss": 0.1729, "rewards/accuracies": 0.875, "rewards/chosen": -5.31337833404541, "rewards/margins": 6.448269844055176, "rewards/rejected": -11.761649131774902, "step": 472 }, { "epoch": 0.2942457231726283, "grad_norm": 1.2534071207046509, "learning_rate": 3.921161825726142e-06, "logits/chosen": -0.8689680099487305, "logits/rejected": 2.3463034629821777, "logps/chosen": -471.45550537109375, "logps/rejected": -760.4877319335938, "loss": 0.0169, "rewards/accuracies": 1.0, "rewards/chosen": -4.637989521026611, "rewards/margins": 9.407652854919434, "rewards/rejected": -14.045642852783203, "step": 473 }, { "epoch": 0.29486780715396577, "grad_norm": 6.944911479949951, "learning_rate": 3.917704011065007e-06, "logits/chosen": -0.7757019400596619, "logits/rejected": 0.4289734959602356, "logps/chosen": -602.4799194335938, "logps/rejected": -734.2423706054688, "loss": 0.1163, "rewards/accuracies": 0.875, "rewards/chosen": -3.0475997924804688, "rewards/margins": 8.179630279541016, "rewards/rejected": -11.227230072021484, "step": 474 }, { "epoch": 0.2954898911353033, "grad_norm": 10.916691780090332, "learning_rate": 3.914246196403873e-06, "logits/chosen": -2.7627809047698975, "logits/rejected": 0.9899921417236328, "logps/chosen": -263.61322021484375, "logps/rejected": -622.9185791015625, "loss": 0.5213, "rewards/accuracies": 0.875, "rewards/chosen": -2.696796178817749, "rewards/margins": 11.151111602783203, "rewards/rejected": -13.847908020019531, "step": 475 }, { "epoch": 0.2961119751166407, "grad_norm": 3.8994550704956055, "learning_rate": 3.910788381742739e-06, "logits/chosen": -1.2498345375061035, "logits/rejected": 1.8041307926177979, "logps/chosen": -475.386962890625, "logps/rejected": -746.247802734375, "loss": 0.0615, "rewards/accuracies": 1.0, "rewards/chosen": -4.496153354644775, "rewards/margins": 6.838209629058838, "rewards/rejected": -11.334362983703613, "step": 476 }, { "epoch": 0.29673405909797823, "grad_norm": 3.7805731296539307, "learning_rate": 3.907330567081605e-06, "logits/chosen": -2.201625347137451, "logits/rejected": 0.4619130492210388, "logps/chosen": -528.6572875976562, "logps/rejected": -822.131591796875, "loss": 0.071, "rewards/accuracies": 1.0, "rewards/chosen": -3.146613121032715, "rewards/margins": 6.900338172912598, "rewards/rejected": -10.046952247619629, "step": 477 }, { "epoch": 0.2973561430793157, "grad_norm": 2.2408125400543213, "learning_rate": 3.903872752420471e-06, "logits/chosen": 1.6627415418624878, "logits/rejected": 3.6632914543151855, "logps/chosen": -547.361572265625, "logps/rejected": -752.7223510742188, "loss": 0.0399, "rewards/accuracies": 1.0, "rewards/chosen": -2.452974796295166, "rewards/margins": 6.904186248779297, "rewards/rejected": -9.357161521911621, "step": 478 }, { "epoch": 0.2979782270606532, "grad_norm": 1.1198952198028564, "learning_rate": 3.9004149377593365e-06, "logits/chosen": -1.0383861064910889, "logits/rejected": 2.2763967514038086, "logps/chosen": -541.2638549804688, "logps/rejected": -869.6572265625, "loss": 0.012, "rewards/accuracies": 1.0, "rewards/chosen": -3.372606039047241, "rewards/margins": 11.192401885986328, "rewards/rejected": -14.565008163452148, "step": 479 }, { "epoch": 0.2986003110419907, "grad_norm": 1.5390928983688354, "learning_rate": 3.896957123098202e-06, "logits/chosen": -1.9217121601104736, "logits/rejected": 2.2289812564849854, "logps/chosen": -408.04461669921875, "logps/rejected": -702.4415283203125, "loss": 0.0424, "rewards/accuracies": 1.0, "rewards/chosen": -3.5822510719299316, "rewards/margins": 8.85588264465332, "rewards/rejected": -12.438133239746094, "step": 480 }, { "epoch": 0.29922239502332815, "grad_norm": 11.019429206848145, "learning_rate": 3.893499308437068e-06, "logits/chosen": -0.7463717460632324, "logits/rejected": 2.1760520935058594, "logps/chosen": -527.68994140625, "logps/rejected": -847.1583251953125, "loss": 0.5631, "rewards/accuracies": 0.875, "rewards/chosen": -5.363083839416504, "rewards/margins": 9.204263687133789, "rewards/rejected": -14.567347526550293, "step": 481 }, { "epoch": 0.29984447900466565, "grad_norm": 0.20692849159240723, "learning_rate": 3.890041493775934e-06, "logits/chosen": -1.8672786951065063, "logits/rejected": 2.0884463787078857, "logps/chosen": -312.6790771484375, "logps/rejected": -653.47900390625, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": -1.4208874702453613, "rewards/margins": 8.734264373779297, "rewards/rejected": -10.155152320861816, "step": 482 }, { "epoch": 0.3004665629860031, "grad_norm": 0.2765861451625824, "learning_rate": 3.8865836791148e-06, "logits/chosen": -0.5949499607086182, "logits/rejected": 0.9601081609725952, "logps/chosen": -537.5997314453125, "logps/rejected": -654.1016845703125, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": -1.979041576385498, "rewards/margins": 8.73823356628418, "rewards/rejected": -10.71727466583252, "step": 483 }, { "epoch": 0.3010886469673406, "grad_norm": 15.945658683776855, "learning_rate": 3.883125864453666e-06, "logits/chosen": -1.8941237926483154, "logits/rejected": 0.7894805669784546, "logps/chosen": -498.7026062011719, "logps/rejected": -738.259521484375, "loss": 0.6536, "rewards/accuracies": 0.75, "rewards/chosen": -3.848419189453125, "rewards/margins": 4.970498085021973, "rewards/rejected": -8.818918228149414, "step": 484 }, { "epoch": 0.30171073094867806, "grad_norm": 1.472067952156067, "learning_rate": 3.879668049792531e-06, "logits/chosen": -0.10084740072488785, "logits/rejected": 1.6048977375030518, "logps/chosen": -557.6298217773438, "logps/rejected": -729.519775390625, "loss": 0.0274, "rewards/accuracies": 1.0, "rewards/chosen": -4.258955001831055, "rewards/margins": 9.993483543395996, "rewards/rejected": -14.25243854522705, "step": 485 }, { "epoch": 0.30233281493001557, "grad_norm": 2.4296748638153076, "learning_rate": 3.876210235131397e-06, "logits/chosen": -0.5961840152740479, "logits/rejected": 1.6344832181930542, "logps/chosen": -499.2541809082031, "logps/rejected": -753.8890380859375, "loss": 0.0287, "rewards/accuracies": 1.0, "rewards/chosen": -1.510149359703064, "rewards/margins": 8.928215026855469, "rewards/rejected": -10.438364028930664, "step": 486 }, { "epoch": 0.302954898911353, "grad_norm": 5.7762064933776855, "learning_rate": 3.872752420470264e-06, "logits/chosen": -0.22475498914718628, "logits/rejected": 2.688466787338257, "logps/chosen": -336.8910217285156, "logps/rejected": -573.825927734375, "loss": 0.1065, "rewards/accuracies": 0.875, "rewards/chosen": -2.7646427154541016, "rewards/margins": 6.289383411407471, "rewards/rejected": -9.054025650024414, "step": 487 }, { "epoch": 0.30357698289269053, "grad_norm": 9.958910942077637, "learning_rate": 3.869294605809129e-06, "logits/chosen": -0.8089866638183594, "logits/rejected": 1.3415088653564453, "logps/chosen": -500.0926818847656, "logps/rejected": -708.5562744140625, "loss": 0.2587, "rewards/accuracies": 0.875, "rewards/chosen": -2.3457374572753906, "rewards/margins": 7.336460113525391, "rewards/rejected": -9.682197570800781, "step": 488 }, { "epoch": 0.304199066874028, "grad_norm": 1.2014095783233643, "learning_rate": 3.865836791147995e-06, "logits/chosen": -1.683600902557373, "logits/rejected": 1.3398773670196533, "logps/chosen": -455.2821044921875, "logps/rejected": -812.8273315429688, "loss": 0.0265, "rewards/accuracies": 1.0, "rewards/chosen": -2.5893898010253906, "rewards/margins": 8.638779640197754, "rewards/rejected": -11.228168487548828, "step": 489 }, { "epoch": 0.3048211508553655, "grad_norm": 0.9360506534576416, "learning_rate": 3.862378976486861e-06, "logits/chosen": -1.5041589736938477, "logits/rejected": 1.7288331985473633, "logps/chosen": -347.6612243652344, "logps/rejected": -655.9993896484375, "loss": 0.0288, "rewards/accuracies": 1.0, "rewards/chosen": -2.4517176151275635, "rewards/margins": 8.994513511657715, "rewards/rejected": -11.4462308883667, "step": 490 }, { "epoch": 0.30544323483670294, "grad_norm": 0.18220064043998718, "learning_rate": 3.858921161825726e-06, "logits/chosen": -4.617347240447998, "logits/rejected": 1.248840570449829, "logps/chosen": -290.69775390625, "logps/rejected": -877.3275146484375, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -2.456120729446411, "rewards/margins": 11.55953598022461, "rewards/rejected": -14.015657424926758, "step": 491 }, { "epoch": 0.30606531881804044, "grad_norm": 0.06468349695205688, "learning_rate": 3.855463347164593e-06, "logits/chosen": -2.0343096256256104, "logits/rejected": 2.152452230453491, "logps/chosen": -342.56512451171875, "logps/rejected": -741.280029296875, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -1.5884348154067993, "rewards/margins": 13.071104049682617, "rewards/rejected": -14.659539222717285, "step": 492 }, { "epoch": 0.3066874027993779, "grad_norm": 2.300629138946533, "learning_rate": 3.8520055325034585e-06, "logits/chosen": -0.23151543736457825, "logits/rejected": 1.7338509559631348, "logps/chosen": -496.5455322265625, "logps/rejected": -752.541748046875, "loss": 0.1549, "rewards/accuracies": 0.875, "rewards/chosen": -2.53190541267395, "rewards/margins": 11.853023529052734, "rewards/rejected": -14.384928703308105, "step": 493 }, { "epoch": 0.3073094867807154, "grad_norm": 9.339274406433105, "learning_rate": 3.848547717842324e-06, "logits/chosen": -2.331122398376465, "logits/rejected": 1.897793173789978, "logps/chosen": -282.5298156738281, "logps/rejected": -705.448974609375, "loss": 0.2534, "rewards/accuracies": 0.875, "rewards/chosen": -2.434752941131592, "rewards/margins": 8.95523738861084, "rewards/rejected": -11.389989852905273, "step": 494 }, { "epoch": 0.30793157076205285, "grad_norm": 0.7328986525535583, "learning_rate": 3.84508990318119e-06, "logits/chosen": -1.3002982139587402, "logits/rejected": 1.9528226852416992, "logps/chosen": -491.26318359375, "logps/rejected": -786.1280517578125, "loss": 0.0131, "rewards/accuracies": 1.0, "rewards/chosen": -3.162794589996338, "rewards/margins": 9.359591484069824, "rewards/rejected": -12.522385597229004, "step": 495 }, { "epoch": 0.30855365474339036, "grad_norm": 0.018282007426023483, "learning_rate": 3.8416320885200555e-06, "logits/chosen": -0.3110625743865967, "logits/rejected": 1.5113799571990967, "logps/chosen": -575.611328125, "logps/rejected": -820.1544799804688, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -4.139358997344971, "rewards/margins": 9.63023567199707, "rewards/rejected": -13.7695951461792, "step": 496 }, { "epoch": 0.3091757387247278, "grad_norm": 0.02998008392751217, "learning_rate": 3.838174273858922e-06, "logits/chosen": -2.100130558013916, "logits/rejected": 1.3824923038482666, "logps/chosen": -410.3341369628906, "logps/rejected": -741.41650390625, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -3.641108512878418, "rewards/margins": 10.878056526184082, "rewards/rejected": -14.519164085388184, "step": 497 }, { "epoch": 0.3097978227060653, "grad_norm": 12.575135231018066, "learning_rate": 3.834716459197788e-06, "logits/chosen": -1.9138693809509277, "logits/rejected": -0.5079290270805359, "logps/chosen": -452.5340270996094, "logps/rejected": -563.9846801757812, "loss": 0.5736, "rewards/accuracies": 0.75, "rewards/chosen": -3.5396173000335693, "rewards/margins": 5.1526384353637695, "rewards/rejected": -8.692255973815918, "step": 498 }, { "epoch": 0.3104199066874028, "grad_norm": 14.566615104675293, "learning_rate": 3.831258644536653e-06, "logits/chosen": 1.34714674949646, "logits/rejected": 2.492173194885254, "logps/chosen": -466.7180480957031, "logps/rejected": -609.304443359375, "loss": 0.4464, "rewards/accuracies": 0.875, "rewards/chosen": -4.704555511474609, "rewards/margins": 6.536179065704346, "rewards/rejected": -11.240734100341797, "step": 499 }, { "epoch": 0.3110419906687403, "grad_norm": 11.887626647949219, "learning_rate": 3.827800829875519e-06, "logits/chosen": 0.07731494307518005, "logits/rejected": 1.488052248954773, "logps/chosen": -642.4278564453125, "logps/rejected": -806.1688232421875, "loss": 0.7587, "rewards/accuracies": 0.875, "rewards/chosen": -3.1446902751922607, "rewards/margins": 4.086873531341553, "rewards/rejected": -7.231564521789551, "step": 500 }, { "epoch": 0.3116640746500778, "grad_norm": 3.402850866317749, "learning_rate": 3.824343015214385e-06, "logits/chosen": -1.8811198472976685, "logits/rejected": 2.8428235054016113, "logps/chosen": -296.9568176269531, "logps/rejected": -648.8358154296875, "loss": 0.0547, "rewards/accuracies": 1.0, "rewards/chosen": -2.247802734375, "rewards/margins": 7.176663875579834, "rewards/rejected": -9.424467086791992, "step": 501 }, { "epoch": 0.31228615863141523, "grad_norm": 0.5330920815467834, "learning_rate": 3.82088520055325e-06, "logits/chosen": -1.3517630100250244, "logits/rejected": 0.3058786988258362, "logps/chosen": -671.101806640625, "logps/rejected": -889.9093017578125, "loss": 0.0054, "rewards/accuracies": 1.0, "rewards/chosen": -4.337190628051758, "rewards/margins": 9.099037170410156, "rewards/rejected": -13.436226844787598, "step": 502 }, { "epoch": 0.31290824261275274, "grad_norm": 14.542479515075684, "learning_rate": 3.817427385892116e-06, "logits/chosen": -0.8564655780792236, "logits/rejected": 2.5220537185668945, "logps/chosen": -461.59002685546875, "logps/rejected": -748.9902954101562, "loss": 0.3539, "rewards/accuracies": 0.875, "rewards/chosen": -2.565673828125, "rewards/margins": 7.084969997406006, "rewards/rejected": -9.650644302368164, "step": 503 }, { "epoch": 0.3135303265940902, "grad_norm": 1.0605134963989258, "learning_rate": 3.813969571230982e-06, "logits/chosen": -0.6156041026115417, "logits/rejected": 1.9469146728515625, "logps/chosen": -472.7379150390625, "logps/rejected": -766.7308959960938, "loss": 0.0265, "rewards/accuracies": 1.0, "rewards/chosen": -0.2958894371986389, "rewards/margins": 9.338570594787598, "rewards/rejected": -9.634459495544434, "step": 504 }, { "epoch": 0.3141524105754277, "grad_norm": 16.123199462890625, "learning_rate": 3.810511756569848e-06, "logits/chosen": -3.274966239929199, "logits/rejected": 0.5726606845855713, "logps/chosen": -411.7179260253906, "logps/rejected": -775.1240844726562, "loss": 0.6741, "rewards/accuracies": 0.625, "rewards/chosen": -4.182135581970215, "rewards/margins": 8.18381118774414, "rewards/rejected": -12.365947723388672, "step": 505 }, { "epoch": 0.31477449455676515, "grad_norm": 1.260725975036621, "learning_rate": 3.807053941908714e-06, "logits/chosen": -1.6766161918640137, "logits/rejected": 1.5115097761154175, "logps/chosen": -538.420654296875, "logps/rejected": -822.8496704101562, "loss": 0.024, "rewards/accuracies": 1.0, "rewards/chosen": -4.350383281707764, "rewards/margins": 7.9692888259887695, "rewards/rejected": -12.319673538208008, "step": 506 }, { "epoch": 0.31539657853810266, "grad_norm": 0.22190283238887787, "learning_rate": 3.8035961272475796e-06, "logits/chosen": -3.3360042572021484, "logits/rejected": -0.38557305932044983, "logps/chosen": -427.2237548828125, "logps/rejected": -678.9574584960938, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": -3.252829074859619, "rewards/margins": 8.173727035522461, "rewards/rejected": -11.426556587219238, "step": 507 }, { "epoch": 0.3160186625194401, "grad_norm": 0.20153078436851501, "learning_rate": 3.8001383125864457e-06, "logits/chosen": -2.166616201400757, "logits/rejected": 2.374972105026245, "logps/chosen": -334.0227355957031, "logps/rejected": -797.1873779296875, "loss": 0.0067, "rewards/accuracies": 1.0, "rewards/chosen": -3.405191659927368, "rewards/margins": 8.168095588684082, "rewards/rejected": -11.573287963867188, "step": 508 }, { "epoch": 0.3166407465007776, "grad_norm": 5.924962520599365, "learning_rate": 3.7966804979253114e-06, "logits/chosen": -1.59109365940094, "logits/rejected": 1.32582688331604, "logps/chosen": -438.9134521484375, "logps/rejected": -722.93115234375, "loss": 0.116, "rewards/accuracies": 0.875, "rewards/chosen": -3.9520082473754883, "rewards/margins": 8.584980010986328, "rewards/rejected": -12.536988258361816, "step": 509 }, { "epoch": 0.31726283048211507, "grad_norm": 0.2738174796104431, "learning_rate": 3.793222683264177e-06, "logits/chosen": -1.457834005355835, "logits/rejected": 1.2790567874908447, "logps/chosen": -340.47283935546875, "logps/rejected": -730.49365234375, "loss": 0.0058, "rewards/accuracies": 1.0, "rewards/chosen": -2.006382942199707, "rewards/margins": 13.690092086791992, "rewards/rejected": -15.696475982666016, "step": 510 }, { "epoch": 0.3178849144634526, "grad_norm": 11.29454231262207, "learning_rate": 3.789764868603043e-06, "logits/chosen": -1.2486698627471924, "logits/rejected": 1.699683666229248, "logps/chosen": -439.69573974609375, "logps/rejected": -803.3811645507812, "loss": 0.6416, "rewards/accuracies": 0.875, "rewards/chosen": -4.530969619750977, "rewards/margins": 6.18777322769165, "rewards/rejected": -10.718742370605469, "step": 511 }, { "epoch": 0.31850699844479, "grad_norm": 0.2264927327632904, "learning_rate": 3.786307053941909e-06, "logits/chosen": -2.2076144218444824, "logits/rejected": 0.48863470554351807, "logps/chosen": -339.59478759765625, "logps/rejected": -668.9453125, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": -3.214912176132202, "rewards/margins": 9.423866271972656, "rewards/rejected": -12.638778686523438, "step": 512 }, { "epoch": 0.31912908242612753, "grad_norm": 2.104602098464966, "learning_rate": 3.782849239280775e-06, "logits/chosen": -2.6314961910247803, "logits/rejected": 1.2773984670639038, "logps/chosen": -303.3700866699219, "logps/rejected": -633.6930541992188, "loss": 0.0727, "rewards/accuracies": 1.0, "rewards/chosen": -2.414785861968994, "rewards/margins": 7.669083595275879, "rewards/rejected": -10.083869934082031, "step": 513 }, { "epoch": 0.319751166407465, "grad_norm": 0.842685878276825, "learning_rate": 3.7793914246196406e-06, "logits/chosen": 0.19934800267219543, "logits/rejected": 1.76789128780365, "logps/chosen": -613.8941650390625, "logps/rejected": -859.8875732421875, "loss": 0.0122, "rewards/accuracies": 1.0, "rewards/chosen": -2.8613290786743164, "rewards/margins": 11.134199142456055, "rewards/rejected": -13.995528221130371, "step": 514 }, { "epoch": 0.3203732503888025, "grad_norm": 4.6138763427734375, "learning_rate": 3.7759336099585063e-06, "logits/chosen": -1.4380714893341064, "logits/rejected": 0.7945225238800049, "logps/chosen": -495.5836486816406, "logps/rejected": -802.527587890625, "loss": 0.0479, "rewards/accuracies": 1.0, "rewards/chosen": -3.6859843730926514, "rewards/margins": 10.857086181640625, "rewards/rejected": -14.543071746826172, "step": 515 }, { "epoch": 0.32099533437014, "grad_norm": 10.480985641479492, "learning_rate": 3.7724757952973724e-06, "logits/chosen": -0.5240859389305115, "logits/rejected": -0.2100089192390442, "logps/chosen": -607.87353515625, "logps/rejected": -731.7286987304688, "loss": 0.2647, "rewards/accuracies": 0.875, "rewards/chosen": -5.5673112869262695, "rewards/margins": 9.236555099487305, "rewards/rejected": -14.803866386413574, "step": 516 }, { "epoch": 0.32161741835147745, "grad_norm": 0.3771163821220398, "learning_rate": 3.769017980636238e-06, "logits/chosen": 0.947868287563324, "logits/rejected": 3.064948558807373, "logps/chosen": -546.0231323242188, "logps/rejected": -837.14697265625, "loss": 0.0049, "rewards/accuracies": 1.0, "rewards/chosen": -4.727848052978516, "rewards/margins": 12.068022727966309, "rewards/rejected": -16.79587173461914, "step": 517 }, { "epoch": 0.32223950233281495, "grad_norm": 6.13236665725708, "learning_rate": 3.765560165975104e-06, "logits/chosen": -0.7655460834503174, "logits/rejected": 2.2249984741210938, "logps/chosen": -499.16827392578125, "logps/rejected": -827.1525268554688, "loss": 0.0883, "rewards/accuracies": 0.875, "rewards/chosen": -2.575669765472412, "rewards/margins": 9.424173355102539, "rewards/rejected": -11.99984359741211, "step": 518 }, { "epoch": 0.3228615863141524, "grad_norm": 12.605459213256836, "learning_rate": 3.76210235131397e-06, "logits/chosen": -0.7498146295547485, "logits/rejected": 1.4736452102661133, "logps/chosen": -419.17236328125, "logps/rejected": -619.1251831054688, "loss": 0.655, "rewards/accuracies": 0.75, "rewards/chosen": -3.038458824157715, "rewards/margins": 7.873093605041504, "rewards/rejected": -10.911552429199219, "step": 519 }, { "epoch": 0.3234836702954899, "grad_norm": 8.781877517700195, "learning_rate": 3.7586445366528355e-06, "logits/chosen": -0.07907867431640625, "logits/rejected": 2.3776438236236572, "logps/chosen": -529.3690795898438, "logps/rejected": -800.8932495117188, "loss": 0.3022, "rewards/accuracies": 0.875, "rewards/chosen": -3.7072157859802246, "rewards/margins": 10.501382827758789, "rewards/rejected": -14.208599090576172, "step": 520 }, { "epoch": 0.32410575427682736, "grad_norm": 14.659026145935059, "learning_rate": 3.7551867219917016e-06, "logits/chosen": -1.6655278205871582, "logits/rejected": 1.5477590560913086, "logps/chosen": -510.2761535644531, "logps/rejected": -869.8839111328125, "loss": 0.4752, "rewards/accuracies": 0.75, "rewards/chosen": -4.096023082733154, "rewards/margins": 7.374521732330322, "rewards/rejected": -11.470544815063477, "step": 521 }, { "epoch": 0.32472783825816487, "grad_norm": 2.508837938308716, "learning_rate": 3.7517289073305673e-06, "logits/chosen": -2.2263786792755127, "logits/rejected": 0.19350892305374146, "logps/chosen": -504.9029235839844, "logps/rejected": -803.1685791015625, "loss": 0.0367, "rewards/accuracies": 1.0, "rewards/chosen": -3.4993441104888916, "rewards/margins": 7.087221622467041, "rewards/rejected": -10.586565971374512, "step": 522 }, { "epoch": 0.3253499222395023, "grad_norm": 0.0033622144255787134, "learning_rate": 3.7482710926694334e-06, "logits/chosen": 0.6291271448135376, "logits/rejected": 3.1799750328063965, "logps/chosen": -466.9697265625, "logps/rejected": -729.0017700195312, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -2.998875856399536, "rewards/margins": 11.616988182067871, "rewards/rejected": -14.615863800048828, "step": 523 }, { "epoch": 0.3259720062208398, "grad_norm": 0.37844160199165344, "learning_rate": 3.744813278008299e-06, "logits/chosen": -1.9292895793914795, "logits/rejected": 1.039460301399231, "logps/chosen": -502.3077392578125, "logps/rejected": -743.5958862304688, "loss": 0.0071, "rewards/accuracies": 1.0, "rewards/chosen": -3.240077495574951, "rewards/margins": 11.387954711914062, "rewards/rejected": -14.628032684326172, "step": 524 }, { "epoch": 0.3265940902021773, "grad_norm": 2.326622486114502, "learning_rate": 3.741355463347165e-06, "logits/chosen": -1.195765733718872, "logits/rejected": -0.33443552255630493, "logps/chosen": -565.4251708984375, "logps/rejected": -670.3046875, "loss": 0.0348, "rewards/accuracies": 1.0, "rewards/chosen": -4.506586074829102, "rewards/margins": 8.512721061706543, "rewards/rejected": -13.019308090209961, "step": 525 }, { "epoch": 0.3272161741835148, "grad_norm": 3.7585842609405518, "learning_rate": 3.737897648686031e-06, "logits/chosen": 0.8944180011749268, "logits/rejected": 2.3861947059631348, "logps/chosen": -641.789794921875, "logps/rejected": -848.3570556640625, "loss": 0.0744, "rewards/accuracies": 1.0, "rewards/chosen": -2.9837448596954346, "rewards/margins": 7.714084625244141, "rewards/rejected": -10.697829246520996, "step": 526 }, { "epoch": 0.32783825816485224, "grad_norm": 8.261795997619629, "learning_rate": 3.7344398340248965e-06, "logits/chosen": -0.5846983194351196, "logits/rejected": 1.9336605072021484, "logps/chosen": -457.1588134765625, "logps/rejected": -730.480712890625, "loss": 0.1265, "rewards/accuracies": 0.875, "rewards/chosen": -4.185298442840576, "rewards/margins": 7.148239612579346, "rewards/rejected": -11.333538055419922, "step": 527 }, { "epoch": 0.32846034214618974, "grad_norm": 0.5636158585548401, "learning_rate": 3.7309820193637626e-06, "logits/chosen": -0.9886958599090576, "logits/rejected": 0.4569053053855896, "logps/chosen": -553.538330078125, "logps/rejected": -709.1605224609375, "loss": 0.0297, "rewards/accuracies": 1.0, "rewards/chosen": -4.610180854797363, "rewards/margins": 9.002449989318848, "rewards/rejected": -13.612629890441895, "step": 528 }, { "epoch": 0.3290824261275272, "grad_norm": 14.509989738464355, "learning_rate": 3.7275242047026282e-06, "logits/chosen": -1.2141422033309937, "logits/rejected": 2.026411533355713, "logps/chosen": -471.1565856933594, "logps/rejected": -768.511474609375, "loss": 0.4289, "rewards/accuracies": 0.875, "rewards/chosen": -3.50103497505188, "rewards/margins": 7.898999214172363, "rewards/rejected": -11.400033950805664, "step": 529 }, { "epoch": 0.3297045101088647, "grad_norm": 0.17644979059696198, "learning_rate": 3.7240663900414943e-06, "logits/chosen": 1.3656128644943237, "logits/rejected": 1.8121857643127441, "logps/chosen": -622.8464965820312, "logps/rejected": -735.2416381835938, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -2.719810724258423, "rewards/margins": 10.886140823364258, "rewards/rejected": -13.605951309204102, "step": 530 }, { "epoch": 0.33032659409020215, "grad_norm": 5.698550701141357, "learning_rate": 3.72060857538036e-06, "logits/chosen": -0.7565411925315857, "logits/rejected": 2.330198049545288, "logps/chosen": -498.67913818359375, "logps/rejected": -824.9256591796875, "loss": 0.2143, "rewards/accuracies": 0.875, "rewards/chosen": -2.509312629699707, "rewards/margins": 8.484278678894043, "rewards/rejected": -10.99359130859375, "step": 531 }, { "epoch": 0.33094867807153966, "grad_norm": 1.5997439622879028, "learning_rate": 3.7171507607192257e-06, "logits/chosen": -0.7979077696800232, "logits/rejected": 1.0519280433654785, "logps/chosen": -528.5811157226562, "logps/rejected": -668.3094482421875, "loss": 0.0485, "rewards/accuracies": 1.0, "rewards/chosen": -4.973896026611328, "rewards/margins": 6.520534515380859, "rewards/rejected": -11.494430541992188, "step": 532 }, { "epoch": 0.33157076205287717, "grad_norm": 0.010816266760230064, "learning_rate": 3.713692946058092e-06, "logits/chosen": -0.38788050413131714, "logits/rejected": 1.4854451417922974, "logps/chosen": -638.2506103515625, "logps/rejected": -872.4563598632812, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.358449697494507, "rewards/margins": 12.082305908203125, "rewards/rejected": -15.440754890441895, "step": 533 }, { "epoch": 0.3321928460342146, "grad_norm": 0.12525595724582672, "learning_rate": 3.7102351313969575e-06, "logits/chosen": -1.6281386613845825, "logits/rejected": 1.0596306324005127, "logps/chosen": -436.7447814941406, "logps/rejected": -694.02978515625, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -2.996511936187744, "rewards/margins": 12.11928939819336, "rewards/rejected": -15.115800857543945, "step": 534 }, { "epoch": 0.3328149300155521, "grad_norm": 11.31616497039795, "learning_rate": 3.7067773167358236e-06, "logits/chosen": -1.3993080854415894, "logits/rejected": 1.7973817586898804, "logps/chosen": -475.78228759765625, "logps/rejected": -700.8096923828125, "loss": 0.6227, "rewards/accuracies": 0.875, "rewards/chosen": -3.8711557388305664, "rewards/margins": 4.232905387878418, "rewards/rejected": -8.104061126708984, "step": 535 }, { "epoch": 0.3334370139968896, "grad_norm": 5.261045932769775, "learning_rate": 3.7033195020746892e-06, "logits/chosen": -3.0142016410827637, "logits/rejected": -0.2594364881515503, "logps/chosen": -335.7835388183594, "logps/rejected": -602.826416015625, "loss": 0.4442, "rewards/accuracies": 0.875, "rewards/chosen": -3.0942542552948, "rewards/margins": 8.805808067321777, "rewards/rejected": -11.90006160736084, "step": 536 }, { "epoch": 0.3340590979782271, "grad_norm": 10.502153396606445, "learning_rate": 3.699861687413555e-06, "logits/chosen": -1.7810969352722168, "logits/rejected": -0.25532037019729614, "logps/chosen": -350.3209228515625, "logps/rejected": -535.1605224609375, "loss": 0.4754, "rewards/accuracies": 0.875, "rewards/chosen": -2.0644099712371826, "rewards/margins": 5.07130241394043, "rewards/rejected": -7.135712623596191, "step": 537 }, { "epoch": 0.33468118195956453, "grad_norm": 9.949618339538574, "learning_rate": 3.696403872752421e-06, "logits/chosen": -2.5537962913513184, "logits/rejected": -1.8489055633544922, "logps/chosen": -394.30810546875, "logps/rejected": -518.0142211914062, "loss": 0.2172, "rewards/accuracies": 0.875, "rewards/chosen": -3.6885135173797607, "rewards/margins": 5.3411173820495605, "rewards/rejected": -9.029630661010742, "step": 538 }, { "epoch": 0.33530326594090204, "grad_norm": 1.0713616609573364, "learning_rate": 3.6929460580912867e-06, "logits/chosen": -1.440812587738037, "logits/rejected": 0.14064207673072815, "logps/chosen": -376.575439453125, "logps/rejected": -539.2940673828125, "loss": 0.1056, "rewards/accuracies": 0.875, "rewards/chosen": -5.115077018737793, "rewards/margins": 8.650476455688477, "rewards/rejected": -13.765554428100586, "step": 539 }, { "epoch": 0.3359253499222395, "grad_norm": 6.506432056427002, "learning_rate": 3.6894882434301528e-06, "logits/chosen": -1.5465037822723389, "logits/rejected": 0.576355516910553, "logps/chosen": -417.2869873046875, "logps/rejected": -665.4422607421875, "loss": 0.1585, "rewards/accuracies": 0.875, "rewards/chosen": -3.5581021308898926, "rewards/margins": 8.063090324401855, "rewards/rejected": -11.621192932128906, "step": 540 }, { "epoch": 0.336547433903577, "grad_norm": 0.6153415441513062, "learning_rate": 3.6860304287690185e-06, "logits/chosen": -3.3456976413726807, "logits/rejected": 0.1794271022081375, "logps/chosen": -356.18096923828125, "logps/rejected": -713.1289672851562, "loss": 0.0071, "rewards/accuracies": 1.0, "rewards/chosen": -3.143134355545044, "rewards/margins": 10.847480773925781, "rewards/rejected": -13.990614891052246, "step": 541 }, { "epoch": 0.33716951788491445, "grad_norm": 0.7708340287208557, "learning_rate": 3.6825726141078846e-06, "logits/chosen": -2.3884637355804443, "logits/rejected": 1.0711272954940796, "logps/chosen": -449.93011474609375, "logps/rejected": -780.32861328125, "loss": 0.0142, "rewards/accuracies": 1.0, "rewards/chosen": -4.251334190368652, "rewards/margins": 8.545787811279297, "rewards/rejected": -12.797122955322266, "step": 542 }, { "epoch": 0.33779160186625196, "grad_norm": 18.22048568725586, "learning_rate": 3.6791147994467502e-06, "logits/chosen": -1.0135997533798218, "logits/rejected": 2.25089693069458, "logps/chosen": -531.9568481445312, "logps/rejected": -814.524658203125, "loss": 0.7637, "rewards/accuracies": 0.75, "rewards/chosen": -5.448249816894531, "rewards/margins": 8.08499526977539, "rewards/rejected": -13.533244132995605, "step": 543 }, { "epoch": 0.3384136858475894, "grad_norm": 9.034468650817871, "learning_rate": 3.675656984785616e-06, "logits/chosen": -4.084107875823975, "logits/rejected": 0.5394338369369507, "logps/chosen": -307.9772033691406, "logps/rejected": -713.9752197265625, "loss": 0.265, "rewards/accuracies": 0.875, "rewards/chosen": -3.4247775077819824, "rewards/margins": 6.039836406707764, "rewards/rejected": -9.464613914489746, "step": 544 }, { "epoch": 0.3390357698289269, "grad_norm": 0.45329979062080383, "learning_rate": 3.672199170124482e-06, "logits/chosen": -2.136702537536621, "logits/rejected": 1.7119501829147339, "logps/chosen": -376.43389892578125, "logps/rejected": -682.7272338867188, "loss": 0.0097, "rewards/accuracies": 1.0, "rewards/chosen": -1.495503306388855, "rewards/margins": 8.205409049987793, "rewards/rejected": -9.700912475585938, "step": 545 }, { "epoch": 0.33965785381026437, "grad_norm": 0.9496540427207947, "learning_rate": 3.6687413554633473e-06, "logits/chosen": -1.8971904516220093, "logits/rejected": 1.8361307382583618, "logps/chosen": -359.90472412109375, "logps/rejected": -726.06689453125, "loss": 0.0118, "rewards/accuracies": 1.0, "rewards/chosen": -2.106149911880493, "rewards/margins": 12.358431816101074, "rewards/rejected": -14.464581489562988, "step": 546 }, { "epoch": 0.34027993779160187, "grad_norm": 2.36594820022583, "learning_rate": 3.665283540802213e-06, "logits/chosen": -2.150912284851074, "logits/rejected": 1.0601011514663696, "logps/chosen": -483.8594665527344, "logps/rejected": -866.1015014648438, "loss": 0.0248, "rewards/accuracies": 1.0, "rewards/chosen": -2.78558349609375, "rewards/margins": 13.141801834106445, "rewards/rejected": -15.927384376525879, "step": 547 }, { "epoch": 0.3409020217729393, "grad_norm": 0.3629215955734253, "learning_rate": 3.661825726141079e-06, "logits/chosen": -2.493523120880127, "logits/rejected": 1.7038516998291016, "logps/chosen": -320.1181945800781, "logps/rejected": -732.8024291992188, "loss": 0.007, "rewards/accuracies": 1.0, "rewards/chosen": -2.1839752197265625, "rewards/margins": 10.164045333862305, "rewards/rejected": -12.348020553588867, "step": 548 }, { "epoch": 0.34152410575427683, "grad_norm": 0.27537983655929565, "learning_rate": 3.6583679114799447e-06, "logits/chosen": 0.41598427295684814, "logits/rejected": 2.662325382232666, "logps/chosen": -503.9861755371094, "logps/rejected": -829.7061157226562, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": -3.10223388671875, "rewards/margins": 12.926291465759277, "rewards/rejected": -16.028526306152344, "step": 549 }, { "epoch": 0.3421461897356143, "grad_norm": 0.30883321166038513, "learning_rate": 3.654910096818811e-06, "logits/chosen": -3.015754222869873, "logits/rejected": 0.4883459806442261, "logps/chosen": -354.55609130859375, "logps/rejected": -730.2498779296875, "loss": 0.0072, "rewards/accuracies": 1.0, "rewards/chosen": -2.242436170578003, "rewards/margins": 12.09792423248291, "rewards/rejected": -14.340360641479492, "step": 550 }, { "epoch": 0.3427682737169518, "grad_norm": 0.13555297255516052, "learning_rate": 3.6514522821576765e-06, "logits/chosen": -1.287082314491272, "logits/rejected": 1.1191844940185547, "logps/chosen": -488.5567626953125, "logps/rejected": -741.3524169921875, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": -4.8953022956848145, "rewards/margins": 10.395913124084473, "rewards/rejected": -15.291213989257812, "step": 551 }, { "epoch": 0.3433903576982893, "grad_norm": 9.899476051330566, "learning_rate": 3.647994467496542e-06, "logits/chosen": -2.6751604080200195, "logits/rejected": 1.8612481355667114, "logps/chosen": -393.312744140625, "logps/rejected": -820.8170776367188, "loss": 0.3608, "rewards/accuracies": 0.875, "rewards/chosen": -3.23514986038208, "rewards/margins": 8.356083869934082, "rewards/rejected": -11.591233253479004, "step": 552 }, { "epoch": 0.34401244167962675, "grad_norm": 8.679098129272461, "learning_rate": 3.6445366528354082e-06, "logits/chosen": 0.21109431982040405, "logits/rejected": 2.2748284339904785, "logps/chosen": -471.2346496582031, "logps/rejected": -642.8697509765625, "loss": 0.3117, "rewards/accuracies": 0.75, "rewards/chosen": -4.967227935791016, "rewards/margins": 7.156339168548584, "rewards/rejected": -12.123567581176758, "step": 553 }, { "epoch": 0.34463452566096425, "grad_norm": 10.258907318115234, "learning_rate": 3.641078838174274e-06, "logits/chosen": 0.08735692501068115, "logits/rejected": 0.5046429634094238, "logps/chosen": -606.5663452148438, "logps/rejected": -667.1451416015625, "loss": 0.3109, "rewards/accuracies": 0.875, "rewards/chosen": -4.395124435424805, "rewards/margins": 4.925813674926758, "rewards/rejected": -9.320938110351562, "step": 554 }, { "epoch": 0.3452566096423017, "grad_norm": 1.9337437152862549, "learning_rate": 3.63762102351314e-06, "logits/chosen": 0.23395615816116333, "logits/rejected": 1.4251956939697266, "logps/chosen": -549.6815795898438, "logps/rejected": -714.3658447265625, "loss": 0.0373, "rewards/accuracies": 1.0, "rewards/chosen": -4.2584381103515625, "rewards/margins": 10.403396606445312, "rewards/rejected": -14.661834716796875, "step": 555 }, { "epoch": 0.3458786936236392, "grad_norm": 0.5803667902946472, "learning_rate": 3.6341632088520057e-06, "logits/chosen": -0.9337853789329529, "logits/rejected": 2.008923292160034, "logps/chosen": -211.2215576171875, "logps/rejected": -567.2344360351562, "loss": 0.0153, "rewards/accuracies": 1.0, "rewards/chosen": -1.0153312683105469, "rewards/margins": 8.598520278930664, "rewards/rejected": -9.613851547241211, "step": 556 }, { "epoch": 0.34650077760497666, "grad_norm": 3.5623018741607666, "learning_rate": 3.6307053941908714e-06, "logits/chosen": -0.7898664474487305, "logits/rejected": 0.7845414280891418, "logps/chosen": -645.83544921875, "logps/rejected": -847.7581787109375, "loss": 0.0463, "rewards/accuracies": 1.0, "rewards/chosen": -6.258962631225586, "rewards/margins": 9.613384246826172, "rewards/rejected": -15.872346878051758, "step": 557 }, { "epoch": 0.34712286158631417, "grad_norm": 1.8435920476913452, "learning_rate": 3.6272475795297375e-06, "logits/chosen": -2.8293209075927734, "logits/rejected": -0.5850628018379211, "logps/chosen": -386.9139404296875, "logps/rejected": -572.6917724609375, "loss": 0.0779, "rewards/accuracies": 1.0, "rewards/chosen": -3.8048408031463623, "rewards/margins": 7.017304420471191, "rewards/rejected": -10.822145462036133, "step": 558 }, { "epoch": 0.3477449455676516, "grad_norm": 11.568678855895996, "learning_rate": 3.623789764868603e-06, "logits/chosen": -1.1972723007202148, "logits/rejected": -0.25566229224205017, "logps/chosen": -499.0116271972656, "logps/rejected": -610.2222900390625, "loss": 0.362, "rewards/accuracies": 0.75, "rewards/chosen": -3.608293056488037, "rewards/margins": 6.391110897064209, "rewards/rejected": -9.999403953552246, "step": 559 }, { "epoch": 0.3483670295489891, "grad_norm": 0.05713814124464989, "learning_rate": 3.6203319502074692e-06, "logits/chosen": -1.3840405941009521, "logits/rejected": 1.7053200006484985, "logps/chosen": -447.9930419921875, "logps/rejected": -788.389404296875, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -2.782646894454956, "rewards/margins": 14.257875442504883, "rewards/rejected": -17.0405216217041, "step": 560 }, { "epoch": 0.3489891135303266, "grad_norm": 9.388616561889648, "learning_rate": 3.616874135546335e-06, "logits/chosen": -0.5354636907577515, "logits/rejected": 2.170872211456299, "logps/chosen": -426.4405517578125, "logps/rejected": -754.2432861328125, "loss": 0.1855, "rewards/accuracies": 0.875, "rewards/chosen": -3.2466554641723633, "rewards/margins": 10.772565841674805, "rewards/rejected": -14.019221305847168, "step": 561 }, { "epoch": 0.3496111975116641, "grad_norm": 12.227457046508789, "learning_rate": 3.6134163208852006e-06, "logits/chosen": -0.02214604616165161, "logits/rejected": 0.2253643274307251, "logps/chosen": -606.9898681640625, "logps/rejected": -757.9022216796875, "loss": 0.5969, "rewards/accuracies": 0.75, "rewards/chosen": -5.836087226867676, "rewards/margins": 9.282434463500977, "rewards/rejected": -15.118520736694336, "step": 562 }, { "epoch": 0.35023328149300154, "grad_norm": 0.2231379747390747, "learning_rate": 3.6099585062240667e-06, "logits/chosen": -1.499894380569458, "logits/rejected": 1.6338140964508057, "logps/chosen": -276.1911315917969, "logps/rejected": -573.9856567382812, "loss": 0.0056, "rewards/accuracies": 1.0, "rewards/chosen": -1.8737728595733643, "rewards/margins": 8.081971168518066, "rewards/rejected": -9.955743789672852, "step": 563 }, { "epoch": 0.35085536547433904, "grad_norm": 2.303009271621704, "learning_rate": 3.6065006915629324e-06, "logits/chosen": 0.8011282086372375, "logits/rejected": 1.9684299230575562, "logps/chosen": -665.9449462890625, "logps/rejected": -817.1199951171875, "loss": 0.0274, "rewards/accuracies": 1.0, "rewards/chosen": -7.446137428283691, "rewards/margins": 7.944962978363037, "rewards/rejected": -15.391101837158203, "step": 564 }, { "epoch": 0.3514774494556765, "grad_norm": 0.04641938954591751, "learning_rate": 3.6030428769017985e-06, "logits/chosen": -3.346287965774536, "logits/rejected": 1.9346427917480469, "logps/chosen": -335.15081787109375, "logps/rejected": -796.55419921875, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -1.27957284450531, "rewards/margins": 13.62586784362793, "rewards/rejected": -14.905441284179688, "step": 565 }, { "epoch": 0.352099533437014, "grad_norm": 0.15563420951366425, "learning_rate": 3.599585062240664e-06, "logits/chosen": -4.27918815612793, "logits/rejected": 0.7543269991874695, "logps/chosen": -432.3263244628906, "logps/rejected": -873.99658203125, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -4.287291049957275, "rewards/margins": 14.165328979492188, "rewards/rejected": -18.452621459960938, "step": 566 }, { "epoch": 0.35272161741835145, "grad_norm": 0.5281693935394287, "learning_rate": 3.5961272475795302e-06, "logits/chosen": 0.0041316598653793335, "logits/rejected": 1.9111206531524658, "logps/chosen": -442.0358581542969, "logps/rejected": -665.741943359375, "loss": 0.0076, "rewards/accuracies": 1.0, "rewards/chosen": -5.391294956207275, "rewards/margins": 8.12633228302002, "rewards/rejected": -13.517627716064453, "step": 567 }, { "epoch": 0.35334370139968896, "grad_norm": 0.10786192119121552, "learning_rate": 3.592669432918396e-06, "logits/chosen": -1.3294379711151123, "logits/rejected": 1.5982346534729004, "logps/chosen": -359.5247802734375, "logps/rejected": -723.9915771484375, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -3.146484375, "rewards/margins": 13.74747085571289, "rewards/rejected": -16.893953323364258, "step": 568 }, { "epoch": 0.35396578538102647, "grad_norm": 10.130666732788086, "learning_rate": 3.5892116182572616e-06, "logits/chosen": -0.8906326293945312, "logits/rejected": 1.1300044059753418, "logps/chosen": -465.86029052734375, "logps/rejected": -712.7401123046875, "loss": 0.3458, "rewards/accuracies": 0.875, "rewards/chosen": -6.319815635681152, "rewards/margins": 6.184998512268066, "rewards/rejected": -12.504813194274902, "step": 569 }, { "epoch": 0.3545878693623639, "grad_norm": 10.674747467041016, "learning_rate": 3.5857538035961277e-06, "logits/chosen": 0.02811729907989502, "logits/rejected": 2.2917301654815674, "logps/chosen": -606.5626220703125, "logps/rejected": -827.6113891601562, "loss": 0.4431, "rewards/accuracies": 0.875, "rewards/chosen": -5.6529221534729, "rewards/margins": 8.299846649169922, "rewards/rejected": -13.952768325805664, "step": 570 }, { "epoch": 0.3552099533437014, "grad_norm": 4.188703536987305, "learning_rate": 3.5822959889349933e-06, "logits/chosen": -1.2129759788513184, "logits/rejected": 0.35576122999191284, "logps/chosen": -458.3021545410156, "logps/rejected": -602.6893310546875, "loss": 0.1278, "rewards/accuracies": 0.875, "rewards/chosen": -3.9469237327575684, "rewards/margins": 6.4665045738220215, "rewards/rejected": -10.41342830657959, "step": 571 }, { "epoch": 0.3558320373250389, "grad_norm": 0.6425372958183289, "learning_rate": 3.5788381742738594e-06, "logits/chosen": -3.271240711212158, "logits/rejected": 0.7445456981658936, "logps/chosen": -400.28741455078125, "logps/rejected": -735.446044921875, "loss": 0.0067, "rewards/accuracies": 1.0, "rewards/chosen": -1.4091039896011353, "rewards/margins": 13.016305923461914, "rewards/rejected": -14.425411224365234, "step": 572 }, { "epoch": 0.3564541213063764, "grad_norm": 10.220561981201172, "learning_rate": 3.575380359612725e-06, "logits/chosen": 0.5601696372032166, "logits/rejected": 2.070058822631836, "logps/chosen": -590.73876953125, "logps/rejected": -807.1383666992188, "loss": 0.2437, "rewards/accuracies": 0.875, "rewards/chosen": -4.419074535369873, "rewards/margins": 6.512131690979004, "rewards/rejected": -10.931205749511719, "step": 573 }, { "epoch": 0.35707620528771383, "grad_norm": 12.59350299835205, "learning_rate": 3.571922544951591e-06, "logits/chosen": -0.3144303262233734, "logits/rejected": 0.3778662085533142, "logps/chosen": -523.6019897460938, "logps/rejected": -636.61376953125, "loss": 0.4461, "rewards/accuracies": 0.875, "rewards/chosen": -4.5513505935668945, "rewards/margins": 6.49387264251709, "rewards/rejected": -11.045223236083984, "step": 574 }, { "epoch": 0.35769828926905134, "grad_norm": 0.08127178996801376, "learning_rate": 3.568464730290457e-06, "logits/chosen": -2.8584237098693848, "logits/rejected": 2.1938881874084473, "logps/chosen": -370.448486328125, "logps/rejected": -802.3171997070312, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -2.845527410507202, "rewards/margins": 12.984360694885254, "rewards/rejected": -15.829889297485352, "step": 575 }, { "epoch": 0.3583203732503888, "grad_norm": 4.162471294403076, "learning_rate": 3.5650069156293226e-06, "logits/chosen": -2.323119878768921, "logits/rejected": 1.5105665922164917, "logps/chosen": -318.3956298828125, "logps/rejected": -663.8530883789062, "loss": 0.0643, "rewards/accuracies": 1.0, "rewards/chosen": -1.511013388633728, "rewards/margins": 10.598156929016113, "rewards/rejected": -12.109170913696289, "step": 576 }, { "epoch": 0.3589424572317263, "grad_norm": 8.931647300720215, "learning_rate": 3.5615491009681887e-06, "logits/chosen": -2.188537120819092, "logits/rejected": 0.8879374265670776, "logps/chosen": -289.7794494628906, "logps/rejected": -564.8424682617188, "loss": 0.1795, "rewards/accuracies": 0.875, "rewards/chosen": -2.117579221725464, "rewards/margins": 8.888975143432617, "rewards/rejected": -11.006553649902344, "step": 577 }, { "epoch": 0.35956454121306375, "grad_norm": 0.19958320260047913, "learning_rate": 3.5580912863070543e-06, "logits/chosen": -0.07127795368432999, "logits/rejected": 2.1829771995544434, "logps/chosen": -580.6832885742188, "logps/rejected": -832.9295043945312, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": -3.4673566818237305, "rewards/margins": 9.339287757873535, "rewards/rejected": -12.806644439697266, "step": 578 }, { "epoch": 0.36018662519440126, "grad_norm": 2.4940741062164307, "learning_rate": 3.55463347164592e-06, "logits/chosen": -1.362723708152771, "logits/rejected": -0.6218916177749634, "logps/chosen": -544.7763671875, "logps/rejected": -706.6228637695312, "loss": 0.0422, "rewards/accuracies": 1.0, "rewards/chosen": -5.22332763671875, "rewards/margins": 8.091087341308594, "rewards/rejected": -13.314414978027344, "step": 579 }, { "epoch": 0.3608087091757387, "grad_norm": 13.138229370117188, "learning_rate": 3.551175656984786e-06, "logits/chosen": -3.6776175498962402, "logits/rejected": -0.90312659740448, "logps/chosen": -371.3901062011719, "logps/rejected": -612.3235473632812, "loss": 0.3667, "rewards/accuracies": 0.875, "rewards/chosen": -2.0644450187683105, "rewards/margins": 9.278863906860352, "rewards/rejected": -11.343308448791504, "step": 580 }, { "epoch": 0.3614307931570762, "grad_norm": 4.825253486633301, "learning_rate": 3.5477178423236518e-06, "logits/chosen": 0.47648248076438904, "logits/rejected": 0.8091368675231934, "logps/chosen": -651.9102783203125, "logps/rejected": -759.71240234375, "loss": 0.0447, "rewards/accuracies": 1.0, "rewards/chosen": -3.543034076690674, "rewards/margins": 9.73138427734375, "rewards/rejected": -13.274417877197266, "step": 581 }, { "epoch": 0.36205287713841366, "grad_norm": 16.05032730102539, "learning_rate": 3.544260027662518e-06, "logits/chosen": -1.3461394309997559, "logits/rejected": 0.3981730043888092, "logps/chosen": -546.50732421875, "logps/rejected": -755.5911865234375, "loss": 0.4652, "rewards/accuracies": 0.75, "rewards/chosen": -5.663764953613281, "rewards/margins": 7.335277080535889, "rewards/rejected": -12.999042510986328, "step": 582 }, { "epoch": 0.36267496111975117, "grad_norm": 9.22971248626709, "learning_rate": 3.5408022130013836e-06, "logits/chosen": -1.8521195650100708, "logits/rejected": -0.6690396666526794, "logps/chosen": -654.583251953125, "logps/rejected": -754.9990234375, "loss": 0.2637, "rewards/accuracies": 0.875, "rewards/chosen": -6.476754665374756, "rewards/margins": 8.86435604095459, "rewards/rejected": -15.34111213684082, "step": 583 }, { "epoch": 0.3632970451010886, "grad_norm": 2.485956907272339, "learning_rate": 3.5373443983402496e-06, "logits/chosen": -0.7629578709602356, "logits/rejected": 2.0757927894592285, "logps/chosen": -502.5543212890625, "logps/rejected": -753.355224609375, "loss": 0.0308, "rewards/accuracies": 1.0, "rewards/chosen": -4.870576858520508, "rewards/margins": 8.276368141174316, "rewards/rejected": -13.146944046020508, "step": 584 }, { "epoch": 0.36391912908242613, "grad_norm": 18.756977081298828, "learning_rate": 3.5338865836791153e-06, "logits/chosen": -0.1907253861427307, "logits/rejected": 2.7296154499053955, "logps/chosen": -681.9442749023438, "logps/rejected": -892.1580810546875, "loss": 1.1382, "rewards/accuracies": 0.625, "rewards/chosen": -6.803852558135986, "rewards/margins": 5.605984210968018, "rewards/rejected": -12.409836769104004, "step": 585 }, { "epoch": 0.3645412130637636, "grad_norm": 0.13891826570034027, "learning_rate": 3.530428769017981e-06, "logits/chosen": 0.43008822202682495, "logits/rejected": 1.9625842571258545, "logps/chosen": -513.066162109375, "logps/rejected": -692.7510986328125, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -4.2135009765625, "rewards/margins": 9.864789962768555, "rewards/rejected": -14.078290939331055, "step": 586 }, { "epoch": 0.3651632970451011, "grad_norm": 0.10694268345832825, "learning_rate": 3.526970954356847e-06, "logits/chosen": -2.66204833984375, "logits/rejected": -0.035879313945770264, "logps/chosen": -443.89642333984375, "logps/rejected": -697.0347900390625, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -4.310057640075684, "rewards/margins": 10.101310729980469, "rewards/rejected": -14.411367416381836, "step": 587 }, { "epoch": 0.3657853810264386, "grad_norm": 2.841848850250244, "learning_rate": 3.5235131396957128e-06, "logits/chosen": -0.43357720971107483, "logits/rejected": 1.6553572416305542, "logps/chosen": -521.71337890625, "logps/rejected": -709.72119140625, "loss": 0.137, "rewards/accuracies": 1.0, "rewards/chosen": -3.428434371948242, "rewards/margins": 4.903413772583008, "rewards/rejected": -8.33184814453125, "step": 588 }, { "epoch": 0.36640746500777605, "grad_norm": 4.603222370147705, "learning_rate": 3.520055325034579e-06, "logits/chosen": -2.4097042083740234, "logits/rejected": -0.660600483417511, "logps/chosen": -412.7869567871094, "logps/rejected": -595.8972778320312, "loss": 0.1871, "rewards/accuracies": 0.875, "rewards/chosen": -3.1279032230377197, "rewards/margins": 7.688718795776367, "rewards/rejected": -10.816621780395508, "step": 589 }, { "epoch": 0.36702954898911355, "grad_norm": 7.155322551727295, "learning_rate": 3.5165975103734445e-06, "logits/chosen": -0.57789146900177, "logits/rejected": 2.2653307914733887, "logps/chosen": -649.032470703125, "logps/rejected": -905.6093139648438, "loss": 0.1561, "rewards/accuracies": 0.875, "rewards/chosen": -4.389533519744873, "rewards/margins": 5.931771278381348, "rewards/rejected": -10.321305274963379, "step": 590 }, { "epoch": 0.367651632970451, "grad_norm": 1.660971999168396, "learning_rate": 3.51313969571231e-06, "logits/chosen": -1.1013941764831543, "logits/rejected": 1.913456678390503, "logps/chosen": -399.48492431640625, "logps/rejected": -754.8251342773438, "loss": 0.0209, "rewards/accuracies": 1.0, "rewards/chosen": -3.572141647338867, "rewards/margins": 11.750726699829102, "rewards/rejected": -15.322869300842285, "step": 591 }, { "epoch": 0.3682737169517885, "grad_norm": 9.850090980529785, "learning_rate": 3.509681881051176e-06, "logits/chosen": -3.5999162197113037, "logits/rejected": 0.8830776214599609, "logps/chosen": -319.4024353027344, "logps/rejected": -676.4735107421875, "loss": 0.5008, "rewards/accuracies": 0.875, "rewards/chosen": -3.6058483123779297, "rewards/margins": 7.200740814208984, "rewards/rejected": -10.806589126586914, "step": 592 }, { "epoch": 0.36889580093312596, "grad_norm": 9.681914329528809, "learning_rate": 3.5062240663900416e-06, "logits/chosen": -0.25577443838119507, "logits/rejected": 0.5999268293380737, "logps/chosen": -614.1806030273438, "logps/rejected": -719.989013671875, "loss": 0.2491, "rewards/accuracies": 0.875, "rewards/chosen": -7.469549179077148, "rewards/margins": 7.456684112548828, "rewards/rejected": -14.926233291625977, "step": 593 }, { "epoch": 0.36951788491446347, "grad_norm": 5.425186634063721, "learning_rate": 3.5027662517289072e-06, "logits/chosen": 0.6980199813842773, "logits/rejected": 2.567262887954712, "logps/chosen": -510.6280822753906, "logps/rejected": -763.8569946289062, "loss": 0.1217, "rewards/accuracies": 0.875, "rewards/chosen": -4.351706027984619, "rewards/margins": 8.793228149414062, "rewards/rejected": -13.14493465423584, "step": 594 }, { "epoch": 0.3701399688958009, "grad_norm": 14.428213119506836, "learning_rate": 3.4993084370677733e-06, "logits/chosen": 0.33437275886535645, "logits/rejected": 0.2007305920124054, "logps/chosen": -572.3868408203125, "logps/rejected": -618.7791137695312, "loss": 0.4857, "rewards/accuracies": 0.875, "rewards/chosen": -3.1451921463012695, "rewards/margins": 6.1744890213012695, "rewards/rejected": -9.319682121276855, "step": 595 }, { "epoch": 0.3707620528771384, "grad_norm": 0.6404609084129333, "learning_rate": 3.495850622406639e-06, "logits/chosen": -1.6780277490615845, "logits/rejected": 2.2958567142486572, "logps/chosen": -502.4979248046875, "logps/rejected": -811.8475341796875, "loss": 0.0083, "rewards/accuracies": 1.0, "rewards/chosen": -3.7590198516845703, "rewards/margins": 9.367961883544922, "rewards/rejected": -13.126981735229492, "step": 596 }, { "epoch": 0.3713841368584759, "grad_norm": 10.088691711425781, "learning_rate": 3.492392807745505e-06, "logits/chosen": 0.10908472537994385, "logits/rejected": 2.108067274093628, "logps/chosen": -598.8475341796875, "logps/rejected": -826.8762817382812, "loss": 0.1818, "rewards/accuracies": 0.875, "rewards/chosen": -4.634230613708496, "rewards/margins": 10.756006240844727, "rewards/rejected": -15.390235900878906, "step": 597 }, { "epoch": 0.3720062208398134, "grad_norm": 9.540902137756348, "learning_rate": 3.4889349930843708e-06, "logits/chosen": -1.3241510391235352, "logits/rejected": 0.23328295350074768, "logps/chosen": -427.7373352050781, "logps/rejected": -588.4275512695312, "loss": 0.3211, "rewards/accuracies": 0.875, "rewards/chosen": -2.8973727226257324, "rewards/margins": 7.729485988616943, "rewards/rejected": -10.626858711242676, "step": 598 }, { "epoch": 0.37262830482115084, "grad_norm": 0.08032742142677307, "learning_rate": 3.4854771784232365e-06, "logits/chosen": -2.312735080718994, "logits/rejected": 1.8967992067337036, "logps/chosen": -302.22308349609375, "logps/rejected": -685.40185546875, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -1.7433247566223145, "rewards/margins": 9.569781303405762, "rewards/rejected": -11.313105583190918, "step": 599 }, { "epoch": 0.37325038880248834, "grad_norm": 0.3465149402618408, "learning_rate": 3.4820193637621026e-06, "logits/chosen": 1.166282057762146, "logits/rejected": 2.296764612197876, "logps/chosen": -549.34130859375, "logps/rejected": -804.96484375, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": -3.6653378009796143, "rewards/margins": 11.625070571899414, "rewards/rejected": -15.290409088134766, "step": 600 }, { "epoch": 0.3738724727838258, "grad_norm": 3.32840633392334, "learning_rate": 3.4785615491009682e-06, "logits/chosen": 0.8780882358551025, "logits/rejected": 2.3796565532684326, "logps/chosen": -610.108642578125, "logps/rejected": -718.9921875, "loss": 0.057, "rewards/accuracies": 1.0, "rewards/chosen": -3.9912304878234863, "rewards/margins": 7.084120750427246, "rewards/rejected": -11.07535171508789, "step": 601 }, { "epoch": 0.3744945567651633, "grad_norm": 8.247523307800293, "learning_rate": 3.4751037344398343e-06, "logits/chosen": -0.3969684839248657, "logits/rejected": 1.5738637447357178, "logps/chosen": -614.709716796875, "logps/rejected": -781.9828491210938, "loss": 0.2636, "rewards/accuracies": 0.875, "rewards/chosen": -4.695312023162842, "rewards/margins": 6.795304775238037, "rewards/rejected": -11.490617752075195, "step": 602 }, { "epoch": 0.37511664074650075, "grad_norm": 0.2280217856168747, "learning_rate": 3.4716459197787e-06, "logits/chosen": -3.126319408416748, "logits/rejected": 0.5818637013435364, "logps/chosen": -400.0958251953125, "logps/rejected": -724.36083984375, "loss": 0.0075, "rewards/accuracies": 1.0, "rewards/chosen": -2.3570053577423096, "rewards/margins": 7.459135055541992, "rewards/rejected": -9.816141128540039, "step": 603 }, { "epoch": 0.37573872472783826, "grad_norm": 6.304959297180176, "learning_rate": 3.4681881051175657e-06, "logits/chosen": -0.439656138420105, "logits/rejected": 2.232123374938965, "logps/chosen": -400.6800537109375, "logps/rejected": -685.3218994140625, "loss": 0.1682, "rewards/accuracies": 0.875, "rewards/chosen": -1.5219635963439941, "rewards/margins": 8.997200965881348, "rewards/rejected": -10.5191650390625, "step": 604 }, { "epoch": 0.37636080870917576, "grad_norm": 5.485596179962158, "learning_rate": 3.4647302904564318e-06, "logits/chosen": 0.9790081977844238, "logits/rejected": 0.47594118118286133, "logps/chosen": -607.5552978515625, "logps/rejected": -596.5540771484375, "loss": 0.1478, "rewards/accuracies": 0.875, "rewards/chosen": -3.246467113494873, "rewards/margins": 4.4095964431762695, "rewards/rejected": -7.656063079833984, "step": 605 }, { "epoch": 0.3769828926905132, "grad_norm": 0.3173864185810089, "learning_rate": 3.4612724757952974e-06, "logits/chosen": -0.0989447832107544, "logits/rejected": 0.6339715719223022, "logps/chosen": -512.7080688476562, "logps/rejected": -689.3731689453125, "loss": 0.008, "rewards/accuracies": 1.0, "rewards/chosen": -2.911623001098633, "rewards/margins": 8.461206436157227, "rewards/rejected": -11.37282943725586, "step": 606 }, { "epoch": 0.3776049766718507, "grad_norm": 12.235793113708496, "learning_rate": 3.4578146611341635e-06, "logits/chosen": -1.4652231931686401, "logits/rejected": 0.750752329826355, "logps/chosen": -465.0700378417969, "logps/rejected": -669.3615112304688, "loss": 0.3429, "rewards/accuracies": 0.875, "rewards/chosen": -4.025668621063232, "rewards/margins": 7.724769115447998, "rewards/rejected": -11.75043773651123, "step": 607 }, { "epoch": 0.3782270606531882, "grad_norm": 4.71773624420166, "learning_rate": 3.4543568464730292e-06, "logits/chosen": -0.9253799915313721, "logits/rejected": 1.3774627447128296, "logps/chosen": -500.6837158203125, "logps/rejected": -703.6849365234375, "loss": 0.1067, "rewards/accuracies": 0.875, "rewards/chosen": -4.101426124572754, "rewards/margins": 6.542513847351074, "rewards/rejected": -10.643939971923828, "step": 608 }, { "epoch": 0.3788491446345257, "grad_norm": 0.012396390549838543, "learning_rate": 3.4508990318118953e-06, "logits/chosen": -2.927866220474243, "logits/rejected": 1.1039155721664429, "logps/chosen": -366.47821044921875, "logps/rejected": -759.7095336914062, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -1.9519448280334473, "rewards/margins": 11.533233642578125, "rewards/rejected": -13.485177993774414, "step": 609 }, { "epoch": 0.37947122861586313, "grad_norm": 0.7829126119613647, "learning_rate": 3.447441217150761e-06, "logits/chosen": -0.870071291923523, "logits/rejected": 1.4296960830688477, "logps/chosen": -547.1006469726562, "logps/rejected": -800.30810546875, "loss": 0.0115, "rewards/accuracies": 1.0, "rewards/chosen": -2.688765048980713, "rewards/margins": 9.182876586914062, "rewards/rejected": -11.871642112731934, "step": 610 }, { "epoch": 0.38009331259720064, "grad_norm": 0.6450961232185364, "learning_rate": 3.4439834024896267e-06, "logits/chosen": -2.6679694652557373, "logits/rejected": 0.2706039845943451, "logps/chosen": -308.663330078125, "logps/rejected": -586.9254150390625, "loss": 0.0146, "rewards/accuracies": 1.0, "rewards/chosen": -2.3235578536987305, "rewards/margins": 8.911864280700684, "rewards/rejected": -11.235421180725098, "step": 611 }, { "epoch": 0.3807153965785381, "grad_norm": 1.8323769569396973, "learning_rate": 3.4405255878284928e-06, "logits/chosen": -1.716994047164917, "logits/rejected": 0.4489266872406006, "logps/chosen": -453.8514099121094, "logps/rejected": -687.9720458984375, "loss": 0.0344, "rewards/accuracies": 1.0, "rewards/chosen": -4.9780426025390625, "rewards/margins": 8.454078674316406, "rewards/rejected": -13.432121276855469, "step": 612 }, { "epoch": 0.3813374805598756, "grad_norm": 3.5956695079803467, "learning_rate": 3.4370677731673584e-06, "logits/chosen": -1.049445390701294, "logits/rejected": 0.3866933286190033, "logps/chosen": -518.9500122070312, "logps/rejected": -705.39208984375, "loss": 0.0653, "rewards/accuracies": 1.0, "rewards/chosen": -2.675973415374756, "rewards/margins": 10.559874534606934, "rewards/rejected": -13.235847473144531, "step": 613 }, { "epoch": 0.38195956454121305, "grad_norm": 0.013759827241301537, "learning_rate": 3.4336099585062245e-06, "logits/chosen": -2.6414260864257812, "logits/rejected": 0.9212683439254761, "logps/chosen": -323.85552978515625, "logps/rejected": -719.2473754882812, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.227468729019165, "rewards/margins": 11.276058197021484, "rewards/rejected": -13.50352668762207, "step": 614 }, { "epoch": 0.38258164852255055, "grad_norm": 9.476208686828613, "learning_rate": 3.43015214384509e-06, "logits/chosen": 0.8636153936386108, "logits/rejected": 1.4760973453521729, "logps/chosen": -534.2947998046875, "logps/rejected": -687.3189697265625, "loss": 0.1864, "rewards/accuracies": 0.875, "rewards/chosen": -4.719242572784424, "rewards/margins": 8.637039184570312, "rewards/rejected": -13.356282234191895, "step": 615 }, { "epoch": 0.383203732503888, "grad_norm": 4.835727691650391, "learning_rate": 3.426694329183956e-06, "logits/chosen": -0.138904869556427, "logits/rejected": 2.0865416526794434, "logps/chosen": -478.1963806152344, "logps/rejected": -780.6767578125, "loss": 0.0597, "rewards/accuracies": 1.0, "rewards/chosen": -2.921337842941284, "rewards/margins": 11.439977645874023, "rewards/rejected": -14.361315727233887, "step": 616 }, { "epoch": 0.3838258164852255, "grad_norm": 0.3291782736778259, "learning_rate": 3.423236514522822e-06, "logits/chosen": 0.7893213033676147, "logits/rejected": 2.3378148078918457, "logps/chosen": -557.59814453125, "logps/rejected": -759.2935791015625, "loss": 0.0069, "rewards/accuracies": 1.0, "rewards/chosen": -3.7330162525177, "rewards/margins": 12.447348594665527, "rewards/rejected": -16.18036460876465, "step": 617 }, { "epoch": 0.38444790046656296, "grad_norm": 0.1337420642375946, "learning_rate": 3.4197786998616877e-06, "logits/chosen": 0.838657021522522, "logits/rejected": 2.8171944618225098, "logps/chosen": -491.823974609375, "logps/rejected": -749.5892333984375, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -1.4607564210891724, "rewards/margins": 11.052667617797852, "rewards/rejected": -12.51342487335205, "step": 618 }, { "epoch": 0.38506998444790047, "grad_norm": 0.13039371371269226, "learning_rate": 3.4163208852005538e-06, "logits/chosen": -0.4781593978404999, "logits/rejected": 1.8465954065322876, "logps/chosen": -512.5775146484375, "logps/rejected": -744.61865234375, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -0.8931097388267517, "rewards/margins": 10.159296035766602, "rewards/rejected": -11.052406311035156, "step": 619 }, { "epoch": 0.3856920684292379, "grad_norm": 10.752327919006348, "learning_rate": 3.4128630705394194e-06, "logits/chosen": -1.6839624643325806, "logits/rejected": 0.2860128879547119, "logps/chosen": -431.9507751464844, "logps/rejected": -674.1161499023438, "loss": 0.5541, "rewards/accuracies": 0.875, "rewards/chosen": -3.7423267364501953, "rewards/margins": 8.271852493286133, "rewards/rejected": -12.014179229736328, "step": 620 }, { "epoch": 0.38631415241057543, "grad_norm": 1.2350847721099854, "learning_rate": 3.409405255878285e-06, "logits/chosen": -2.0917627811431885, "logits/rejected": 2.1637320518493652, "logps/chosen": -298.88629150390625, "logps/rejected": -691.8558349609375, "loss": 0.0207, "rewards/accuracies": 1.0, "rewards/chosen": -2.245750904083252, "rewards/margins": 8.862000465393066, "rewards/rejected": -11.107751846313477, "step": 621 }, { "epoch": 0.38693623639191294, "grad_norm": 1.6924008131027222, "learning_rate": 3.405947441217151e-06, "logits/chosen": 0.6900919675827026, "logits/rejected": 2.304530382156372, "logps/chosen": -602.36181640625, "logps/rejected": -836.4732666015625, "loss": 0.0244, "rewards/accuracies": 1.0, "rewards/chosen": -4.097162246704102, "rewards/margins": 9.744330406188965, "rewards/rejected": -13.841492652893066, "step": 622 }, { "epoch": 0.3875583203732504, "grad_norm": 3.527686357498169, "learning_rate": 3.402489626556017e-06, "logits/chosen": 0.2741889953613281, "logits/rejected": 2.6344118118286133, "logps/chosen": -505.8238525390625, "logps/rejected": -794.1734008789062, "loss": 0.0476, "rewards/accuracies": 1.0, "rewards/chosen": -2.780407190322876, "rewards/margins": 9.670248031616211, "rewards/rejected": -12.450654983520508, "step": 623 }, { "epoch": 0.3881804043545879, "grad_norm": 0.06536146253347397, "learning_rate": 3.399031811894883e-06, "logits/chosen": -3.9738125801086426, "logits/rejected": 1.4645802974700928, "logps/chosen": -277.276611328125, "logps/rejected": -761.3095703125, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -0.8830457925796509, "rewards/margins": 11.07553768157959, "rewards/rejected": -11.958582878112793, "step": 624 }, { "epoch": 0.38880248833592534, "grad_norm": 7.360811233520508, "learning_rate": 3.3955739972337486e-06, "logits/chosen": -2.614194393157959, "logits/rejected": 1.265906810760498, "logps/chosen": -441.8808898925781, "logps/rejected": -836.4769287109375, "loss": 0.1801, "rewards/accuracies": 0.875, "rewards/chosen": -3.7520408630371094, "rewards/margins": 8.943009376525879, "rewards/rejected": -12.695051193237305, "step": 625 }, { "epoch": 0.38942457231726285, "grad_norm": 0.5498335361480713, "learning_rate": 3.3921161825726147e-06, "logits/chosen": 0.47069358825683594, "logits/rejected": 1.683117151260376, "logps/chosen": -426.58795166015625, "logps/rejected": -663.201904296875, "loss": 0.0113, "rewards/accuracies": 1.0, "rewards/chosen": -3.2837743759155273, "rewards/margins": 10.185606002807617, "rewards/rejected": -13.469379425048828, "step": 626 }, { "epoch": 0.3900466562986003, "grad_norm": 0.33489876985549927, "learning_rate": 3.3886583679114804e-06, "logits/chosen": 1.3098037242889404, "logits/rejected": 1.8481377363204956, "logps/chosen": -729.124267578125, "logps/rejected": -847.1063232421875, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": -4.406599998474121, "rewards/margins": 8.989585876464844, "rewards/rejected": -13.396186828613281, "step": 627 }, { "epoch": 0.3906687402799378, "grad_norm": 0.3350321352481842, "learning_rate": 3.385200553250346e-06, "logits/chosen": -0.9219592809677124, "logits/rejected": 1.6338512897491455, "logps/chosen": -482.4681701660156, "logps/rejected": -762.8888549804688, "loss": 0.007, "rewards/accuracies": 1.0, "rewards/chosen": -6.556829452514648, "rewards/margins": 8.05724048614502, "rewards/rejected": -14.614070892333984, "step": 628 }, { "epoch": 0.39129082426127526, "grad_norm": 0.5914133191108704, "learning_rate": 3.381742738589212e-06, "logits/chosen": -3.1034719944000244, "logits/rejected": 2.0504133701324463, "logps/chosen": -184.3067626953125, "logps/rejected": -645.0467529296875, "loss": 0.0093, "rewards/accuracies": 1.0, "rewards/chosen": -0.8897975087165833, "rewards/margins": 12.707173347473145, "rewards/rejected": -13.59697151184082, "step": 629 }, { "epoch": 0.39191290824261277, "grad_norm": 0.3379708528518677, "learning_rate": 3.378284923928078e-06, "logits/chosen": -1.8138878345489502, "logits/rejected": 2.2676093578338623, "logps/chosen": -444.2542419433594, "logps/rejected": -805.8871459960938, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": -1.3795870542526245, "rewards/margins": 12.329050064086914, "rewards/rejected": -13.708637237548828, "step": 630 }, { "epoch": 0.3925349922239502, "grad_norm": 0.19982405006885529, "learning_rate": 3.374827109266944e-06, "logits/chosen": -0.49219873547554016, "logits/rejected": 1.3186544179916382, "logps/chosen": -441.41485595703125, "logps/rejected": -677.80908203125, "loss": 0.0048, "rewards/accuracies": 1.0, "rewards/chosen": -4.186324119567871, "rewards/margins": 9.863755226135254, "rewards/rejected": -14.050078392028809, "step": 631 }, { "epoch": 0.3931570762052877, "grad_norm": 2.6581051349639893, "learning_rate": 3.3713692946058096e-06, "logits/chosen": -1.6293195486068726, "logits/rejected": 1.8264111280441284, "logps/chosen": -481.6075744628906, "logps/rejected": -792.4683837890625, "loss": 0.0304, "rewards/accuracies": 1.0, "rewards/chosen": -2.976865291595459, "rewards/margins": 8.871871948242188, "rewards/rejected": -11.848735809326172, "step": 632 }, { "epoch": 0.3937791601866252, "grad_norm": 10.728352546691895, "learning_rate": 3.3679114799446753e-06, "logits/chosen": 0.6592696905136108, "logits/rejected": 0.32174065709114075, "logps/chosen": -543.1920166015625, "logps/rejected": -591.5214233398438, "loss": 0.3952, "rewards/accuracies": 0.875, "rewards/chosen": -3.338846445083618, "rewards/margins": 5.663122177124023, "rewards/rejected": -9.001968383789062, "step": 633 }, { "epoch": 0.3944012441679627, "grad_norm": 0.14632946252822876, "learning_rate": 3.3644536652835414e-06, "logits/chosen": 0.12235406041145325, "logits/rejected": 1.50004243850708, "logps/chosen": -404.7801513671875, "logps/rejected": -564.116455078125, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": -2.130303382873535, "rewards/margins": 8.51298999786377, "rewards/rejected": -10.643293380737305, "step": 634 }, { "epoch": 0.39502332814930013, "grad_norm": 0.010982617735862732, "learning_rate": 3.360995850622407e-06, "logits/chosen": -0.8696433305740356, "logits/rejected": 2.04046368598938, "logps/chosen": -405.26544189453125, "logps/rejected": -737.1810302734375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -2.0155234336853027, "rewards/margins": 11.292312622070312, "rewards/rejected": -13.307836532592773, "step": 635 }, { "epoch": 0.39564541213063764, "grad_norm": 0.008551651611924171, "learning_rate": 3.3575380359612723e-06, "logits/chosen": -1.0029269456863403, "logits/rejected": 2.016935348510742, "logps/chosen": -402.2550048828125, "logps/rejected": -808.053955078125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -2.901893138885498, "rewards/margins": 14.855788230895996, "rewards/rejected": -17.757680892944336, "step": 636 }, { "epoch": 0.3962674961119751, "grad_norm": 10.20107364654541, "learning_rate": 3.3540802213001384e-06, "logits/chosen": 0.1482783555984497, "logits/rejected": 1.983933925628662, "logps/chosen": -486.354248046875, "logps/rejected": -715.10888671875, "loss": 0.5624, "rewards/accuracies": 0.75, "rewards/chosen": -3.7223360538482666, "rewards/margins": 7.077658176422119, "rewards/rejected": -10.799993515014648, "step": 637 }, { "epoch": 0.3968895800933126, "grad_norm": 9.907259941101074, "learning_rate": 3.350622406639004e-06, "logits/chosen": 0.43466904759407043, "logits/rejected": 1.9535962343215942, "logps/chosen": -463.94183349609375, "logps/rejected": -611.06591796875, "loss": 0.1748, "rewards/accuracies": 0.875, "rewards/chosen": -3.0628433227539062, "rewards/margins": 8.665346145629883, "rewards/rejected": -11.728190422058105, "step": 638 }, { "epoch": 0.39751166407465005, "grad_norm": 0.08071774244308472, "learning_rate": 3.34716459197787e-06, "logits/chosen": -2.1568260192871094, "logits/rejected": 1.1422406435012817, "logps/chosen": -358.576904296875, "logps/rejected": -684.7677001953125, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -2.0779173374176025, "rewards/margins": 12.04561710357666, "rewards/rejected": -14.12353515625, "step": 639 }, { "epoch": 0.39813374805598756, "grad_norm": 3.7608110904693604, "learning_rate": 3.343706777316736e-06, "logits/chosen": -1.308854579925537, "logits/rejected": 2.2386837005615234, "logps/chosen": -519.892822265625, "logps/rejected": -830.5930786132812, "loss": 0.0707, "rewards/accuracies": 1.0, "rewards/chosen": -3.429892063140869, "rewards/margins": 13.151208877563477, "rewards/rejected": -16.581100463867188, "step": 640 }, { "epoch": 0.39875583203732506, "grad_norm": 0.00955191534012556, "learning_rate": 3.3402489626556016e-06, "logits/chosen": 1.3519172668457031, "logits/rejected": 2.5916457176208496, "logps/chosen": -578.5109252929688, "logps/rejected": -806.9434814453125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -4.251760482788086, "rewards/margins": 12.449394226074219, "rewards/rejected": -16.701156616210938, "step": 641 }, { "epoch": 0.3993779160186625, "grad_norm": 2.7427241802215576, "learning_rate": 3.3367911479944676e-06, "logits/chosen": -2.505798578262329, "logits/rejected": 2.1072421073913574, "logps/chosen": -397.797607421875, "logps/rejected": -726.3534545898438, "loss": 0.038, "rewards/accuracies": 1.0, "rewards/chosen": -3.953930377960205, "rewards/margins": 8.82105827331543, "rewards/rejected": -12.774989128112793, "step": 642 }, { "epoch": 0.4, "grad_norm": 0.9937102794647217, "learning_rate": 3.3333333333333333e-06, "logits/chosen": -1.7306221723556519, "logits/rejected": 2.230940818786621, "logps/chosen": -393.63934326171875, "logps/rejected": -729.7666015625, "loss": 0.0306, "rewards/accuracies": 1.0, "rewards/chosen": -2.313868284225464, "rewards/margins": 11.42313003540039, "rewards/rejected": -13.736998558044434, "step": 643 }, { "epoch": 0.4006220839813375, "grad_norm": 0.0032642753794789314, "learning_rate": 3.3298755186721994e-06, "logits/chosen": -1.7426426410675049, "logits/rejected": 1.5063778162002563, "logps/chosen": -356.5050354003906, "logps/rejected": -697.7349853515625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -1.8635871410369873, "rewards/margins": 11.682292938232422, "rewards/rejected": -13.545880317687988, "step": 644 }, { "epoch": 0.401244167962675, "grad_norm": 0.07923304289579391, "learning_rate": 3.326417704011065e-06, "logits/chosen": -0.8975597620010376, "logits/rejected": 1.3860437870025635, "logps/chosen": -442.74053955078125, "logps/rejected": -723.7567138671875, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -1.4147753715515137, "rewards/margins": 12.292531967163086, "rewards/rejected": -13.707306861877441, "step": 645 }, { "epoch": 0.40186625194401243, "grad_norm": 14.276515007019043, "learning_rate": 3.3229598893499308e-06, "logits/chosen": -3.1505322456359863, "logits/rejected": 2.158036708831787, "logps/chosen": -312.1207275390625, "logps/rejected": -768.1331787109375, "loss": 0.2472, "rewards/accuracies": 0.875, "rewards/chosen": -3.6551642417907715, "rewards/margins": 8.738365173339844, "rewards/rejected": -12.393529891967773, "step": 646 }, { "epoch": 0.40248833592534994, "grad_norm": 7.409354209899902, "learning_rate": 3.319502074688797e-06, "logits/chosen": -1.052610993385315, "logits/rejected": 1.4959089756011963, "logps/chosen": -493.83843994140625, "logps/rejected": -754.4874267578125, "loss": 0.1806, "rewards/accuracies": 1.0, "rewards/chosen": -3.238936424255371, "rewards/margins": 5.7519941329956055, "rewards/rejected": -8.990930557250977, "step": 647 }, { "epoch": 0.4031104199066874, "grad_norm": 1.6471129655838013, "learning_rate": 3.3160442600276625e-06, "logits/chosen": -0.6442841291427612, "logits/rejected": 1.4412320852279663, "logps/chosen": -485.85205078125, "logps/rejected": -771.9279174804688, "loss": 0.0203, "rewards/accuracies": 1.0, "rewards/chosen": -4.235671043395996, "rewards/margins": 9.86855697631836, "rewards/rejected": -14.104228019714355, "step": 648 }, { "epoch": 0.4037325038880249, "grad_norm": 0.09166280180215836, "learning_rate": 3.3125864453665286e-06, "logits/chosen": -0.5237993001937866, "logits/rejected": 2.456305503845215, "logps/chosen": -485.4988098144531, "logps/rejected": -766.07568359375, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -1.3998935222625732, "rewards/margins": 10.237448692321777, "rewards/rejected": -11.63734245300293, "step": 649 }, { "epoch": 0.40435458786936235, "grad_norm": 4.0653510093688965, "learning_rate": 3.3091286307053943e-06, "logits/chosen": -0.23118279874324799, "logits/rejected": 0.7203813791275024, "logps/chosen": -446.2249450683594, "logps/rejected": -642.3449096679688, "loss": 0.1779, "rewards/accuracies": 0.875, "rewards/chosen": -3.4704999923706055, "rewards/margins": 10.632834434509277, "rewards/rejected": -14.103334426879883, "step": 650 }, { "epoch": 0.40497667185069985, "grad_norm": 6.155623912811279, "learning_rate": 3.3056708160442604e-06, "logits/chosen": -1.9541127681732178, "logits/rejected": 1.3531765937805176, "logps/chosen": -433.037841796875, "logps/rejected": -725.62548828125, "loss": 0.1054, "rewards/accuracies": 1.0, "rewards/chosen": -3.8255207538604736, "rewards/margins": 8.994915962219238, "rewards/rejected": -12.820436477661133, "step": 651 }, { "epoch": 0.4055987558320373, "grad_norm": 11.532463073730469, "learning_rate": 3.302213001383126e-06, "logits/chosen": 0.44987526535987854, "logits/rejected": 2.840175151824951, "logps/chosen": -422.11566162109375, "logps/rejected": -635.8391723632812, "loss": 0.6355, "rewards/accuracies": 0.875, "rewards/chosen": -3.0207712650299072, "rewards/margins": 9.024643898010254, "rewards/rejected": -12.045415878295898, "step": 652 }, { "epoch": 0.4062208398133748, "grad_norm": 6.931861877441406, "learning_rate": 3.2987551867219918e-06, "logits/chosen": -1.7386195659637451, "logits/rejected": 1.8302826881408691, "logps/chosen": -408.3453369140625, "logps/rejected": -669.2549438476562, "loss": 0.1751, "rewards/accuracies": 0.875, "rewards/chosen": -3.9353740215301514, "rewards/margins": 7.6817240715026855, "rewards/rejected": -11.617098808288574, "step": 653 }, { "epoch": 0.40684292379471226, "grad_norm": 10.324091911315918, "learning_rate": 3.295297372060858e-06, "logits/chosen": -3.9733428955078125, "logits/rejected": 1.7603663206100464, "logps/chosen": -415.6043701171875, "logps/rejected": -912.5545043945312, "loss": 0.5934, "rewards/accuracies": 0.875, "rewards/chosen": -1.8949611186981201, "rewards/margins": 15.61867904663086, "rewards/rejected": -17.513639450073242, "step": 654 }, { "epoch": 0.40746500777604977, "grad_norm": 1.9727427959442139, "learning_rate": 3.2918395573997235e-06, "logits/chosen": 1.6869640350341797, "logits/rejected": 1.757272720336914, "logps/chosen": -694.1307983398438, "logps/rejected": -868.170654296875, "loss": 0.0293, "rewards/accuracies": 1.0, "rewards/chosen": -3.939452648162842, "rewards/margins": 10.84363079071045, "rewards/rejected": -14.78308391571045, "step": 655 }, { "epoch": 0.4080870917573872, "grad_norm": 0.9235299229621887, "learning_rate": 3.2883817427385896e-06, "logits/chosen": -2.2736637592315674, "logits/rejected": -0.09766936302185059, "logps/chosen": -301.520263671875, "logps/rejected": -528.9983520507812, "loss": 0.0331, "rewards/accuracies": 1.0, "rewards/chosen": -2.1056957244873047, "rewards/margins": 7.543599605560303, "rewards/rejected": -9.649295806884766, "step": 656 }, { "epoch": 0.40870917573872473, "grad_norm": 0.5496695637702942, "learning_rate": 3.2849239280774553e-06, "logits/chosen": 0.44226568937301636, "logits/rejected": 1.7233555316925049, "logps/chosen": -558.8419189453125, "logps/rejected": -779.1488037109375, "loss": 0.0075, "rewards/accuracies": 1.0, "rewards/chosen": -4.686925888061523, "rewards/margins": 7.51154899597168, "rewards/rejected": -12.198474884033203, "step": 657 }, { "epoch": 0.40933125972006223, "grad_norm": 0.19928692281246185, "learning_rate": 3.281466113416321e-06, "logits/chosen": -2.6942951679229736, "logits/rejected": 1.5296684503555298, "logps/chosen": -357.0150146484375, "logps/rejected": -693.2840576171875, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -2.054616689682007, "rewards/margins": 9.228242874145508, "rewards/rejected": -11.282859802246094, "step": 658 }, { "epoch": 0.4099533437013997, "grad_norm": 0.2332155704498291, "learning_rate": 3.278008298755187e-06, "logits/chosen": -1.3595741987228394, "logits/rejected": 2.147449016571045, "logps/chosen": -477.114501953125, "logps/rejected": -865.1211547851562, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -3.952955722808838, "rewards/margins": 10.84054183959961, "rewards/rejected": -14.793497085571289, "step": 659 }, { "epoch": 0.4105754276827372, "grad_norm": 11.046630859375, "learning_rate": 3.2745504840940528e-06, "logits/chosen": -1.9563723802566528, "logits/rejected": 1.2342233657836914, "logps/chosen": -384.26904296875, "logps/rejected": -685.3289794921875, "loss": 0.3145, "rewards/accuracies": 0.875, "rewards/chosen": -5.211310863494873, "rewards/margins": 8.171344757080078, "rewards/rejected": -13.382655143737793, "step": 660 }, { "epoch": 0.41119751166407464, "grad_norm": 0.175026535987854, "learning_rate": 3.271092669432919e-06, "logits/chosen": -1.295689582824707, "logits/rejected": 2.76332950592041, "logps/chosen": -325.86407470703125, "logps/rejected": -678.9881591796875, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": -3.0810530185699463, "rewards/margins": 8.375470161437988, "rewards/rejected": -11.456521987915039, "step": 661 }, { "epoch": 0.41181959564541215, "grad_norm": 5.545994758605957, "learning_rate": 3.2676348547717845e-06, "logits/chosen": 0.27531301975250244, "logits/rejected": 1.683681607246399, "logps/chosen": -550.1492919921875, "logps/rejected": -778.548095703125, "loss": 0.1445, "rewards/accuracies": 0.875, "rewards/chosen": -4.033134937286377, "rewards/margins": 8.30805778503418, "rewards/rejected": -12.341192245483398, "step": 662 }, { "epoch": 0.4124416796267496, "grad_norm": 2.88179349899292, "learning_rate": 3.26417704011065e-06, "logits/chosen": 0.9971730709075928, "logits/rejected": 1.7520931959152222, "logps/chosen": -578.3229370117188, "logps/rejected": -708.9879760742188, "loss": 0.0434, "rewards/accuracies": 1.0, "rewards/chosen": -5.013568878173828, "rewards/margins": 7.123456954956055, "rewards/rejected": -12.137025833129883, "step": 663 }, { "epoch": 0.4130637636080871, "grad_norm": 13.079041481018066, "learning_rate": 3.2607192254495163e-06, "logits/chosen": 0.30747872591018677, "logits/rejected": 2.0404927730560303, "logps/chosen": -530.911376953125, "logps/rejected": -700.7384033203125, "loss": 0.5205, "rewards/accuracies": 0.875, "rewards/chosen": -6.095573425292969, "rewards/margins": 6.448812961578369, "rewards/rejected": -12.544386863708496, "step": 664 }, { "epoch": 0.41368584758942456, "grad_norm": 6.013885021209717, "learning_rate": 3.257261410788382e-06, "logits/chosen": -1.8929672241210938, "logits/rejected": 0.21973100304603577, "logps/chosen": -284.0830993652344, "logps/rejected": -498.4759521484375, "loss": 0.1698, "rewards/accuracies": 0.875, "rewards/chosen": -1.4939789772033691, "rewards/margins": 9.740009307861328, "rewards/rejected": -11.233988761901855, "step": 665 }, { "epoch": 0.41430793157076207, "grad_norm": 4.243404865264893, "learning_rate": 3.253803596127248e-06, "logits/chosen": 0.4065598249435425, "logits/rejected": 1.6949890851974487, "logps/chosen": -667.820068359375, "logps/rejected": -825.6172485351562, "loss": 0.0848, "rewards/accuracies": 1.0, "rewards/chosen": -2.675194263458252, "rewards/margins": 9.292121887207031, "rewards/rejected": -11.967315673828125, "step": 666 }, { "epoch": 0.4149300155520995, "grad_norm": 8.427876472473145, "learning_rate": 3.2503457814661137e-06, "logits/chosen": 0.38863468170166016, "logits/rejected": 1.513594150543213, "logps/chosen": -443.318603515625, "logps/rejected": -661.3179931640625, "loss": 0.5181, "rewards/accuracies": 0.875, "rewards/chosen": -3.784163475036621, "rewards/margins": 8.133288383483887, "rewards/rejected": -11.917451858520508, "step": 667 }, { "epoch": 0.415552099533437, "grad_norm": 4.463931560516357, "learning_rate": 3.24688796680498e-06, "logits/chosen": 0.586110532283783, "logits/rejected": 0.6465033888816833, "logps/chosen": -705.3720703125, "logps/rejected": -812.8052368164062, "loss": 0.0442, "rewards/accuracies": 1.0, "rewards/chosen": -6.306274890899658, "rewards/margins": 7.507619857788086, "rewards/rejected": -13.813894271850586, "step": 668 }, { "epoch": 0.4161741835147745, "grad_norm": 0.03926939144730568, "learning_rate": 3.2434301521438455e-06, "logits/chosen": -1.8786470890045166, "logits/rejected": 1.2086478471755981, "logps/chosen": -510.6563720703125, "logps/rejected": -868.0140380859375, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.5629770755767822, "rewards/margins": 15.31007194519043, "rewards/rejected": -18.873050689697266, "step": 669 }, { "epoch": 0.416796267496112, "grad_norm": 0.346242219209671, "learning_rate": 3.239972337482711e-06, "logits/chosen": -2.0090367794036865, "logits/rejected": 1.8708921670913696, "logps/chosen": -343.2098083496094, "logps/rejected": -729.623291015625, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": -1.690193772315979, "rewards/margins": 11.829962730407715, "rewards/rejected": -13.520155906677246, "step": 670 }, { "epoch": 0.41741835147744943, "grad_norm": 1.4998290538787842, "learning_rate": 3.2365145228215773e-06, "logits/chosen": -1.6840519905090332, "logits/rejected": 0.8073079586029053, "logps/chosen": -370.5256652832031, "logps/rejected": -660.1571044921875, "loss": 0.0601, "rewards/accuracies": 1.0, "rewards/chosen": -2.5298609733581543, "rewards/margins": 10.871633529663086, "rewards/rejected": -13.401494979858398, "step": 671 }, { "epoch": 0.41804043545878694, "grad_norm": 9.775925636291504, "learning_rate": 3.233056708160443e-06, "logits/chosen": -2.1028800010681152, "logits/rejected": 2.2222957611083984, "logps/chosen": -444.228271484375, "logps/rejected": -841.1094970703125, "loss": 0.2433, "rewards/accuracies": 0.875, "rewards/chosen": -4.033388137817383, "rewards/margins": 10.472757339477539, "rewards/rejected": -14.506145477294922, "step": 672 }, { "epoch": 0.4186625194401244, "grad_norm": 0.22538970410823822, "learning_rate": 3.229598893499309e-06, "logits/chosen": -5.363725662231445, "logits/rejected": 1.5873980522155762, "logps/chosen": -215.72366333007812, "logps/rejected": -752.394775390625, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": -2.562450885772705, "rewards/margins": 10.679720878601074, "rewards/rejected": -13.242172241210938, "step": 673 }, { "epoch": 0.4192846034214619, "grad_norm": 11.903820037841797, "learning_rate": 3.2261410788381747e-06, "logits/chosen": 0.5031086206436157, "logits/rejected": 1.5423349142074585, "logps/chosen": -592.8444213867188, "logps/rejected": -737.1952514648438, "loss": 0.6264, "rewards/accuracies": 0.75, "rewards/chosen": -5.330404758453369, "rewards/margins": 5.104585647583008, "rewards/rejected": -10.434989929199219, "step": 674 }, { "epoch": 0.4199066874027994, "grad_norm": 11.08259105682373, "learning_rate": 3.2226832641770404e-06, "logits/chosen": -1.1205742359161377, "logits/rejected": 0.9305554628372192, "logps/chosen": -397.8836669921875, "logps/rejected": -724.8209228515625, "loss": 0.6319, "rewards/accuracies": 0.875, "rewards/chosen": -2.0175395011901855, "rewards/margins": 9.977502822875977, "rewards/rejected": -11.99504280090332, "step": 675 }, { "epoch": 0.42052877138413686, "grad_norm": 10.631376266479492, "learning_rate": 3.2192254495159065e-06, "logits/chosen": -2.171011209487915, "logits/rejected": 1.2756054401397705, "logps/chosen": -503.49969482421875, "logps/rejected": -801.864501953125, "loss": 0.3474, "rewards/accuracies": 0.875, "rewards/chosen": -3.3229598999023438, "rewards/margins": 10.785900115966797, "rewards/rejected": -14.10886001586914, "step": 676 }, { "epoch": 0.42115085536547436, "grad_norm": 6.985343933105469, "learning_rate": 3.215767634854772e-06, "logits/chosen": -1.799622654914856, "logits/rejected": 0.653231680393219, "logps/chosen": -459.76947021484375, "logps/rejected": -729.4825439453125, "loss": 0.177, "rewards/accuracies": 0.875, "rewards/chosen": -3.7904040813446045, "rewards/margins": 9.216272354125977, "rewards/rejected": -13.00667667388916, "step": 677 }, { "epoch": 0.4217729393468118, "grad_norm": 3.563753366470337, "learning_rate": 3.2123098201936383e-06, "logits/chosen": -1.5398551225662231, "logits/rejected": 1.9844664335250854, "logps/chosen": -382.30657958984375, "logps/rejected": -689.4683837890625, "loss": 0.0327, "rewards/accuracies": 1.0, "rewards/chosen": -2.0840530395507812, "rewards/margins": 9.961143493652344, "rewards/rejected": -12.045196533203125, "step": 678 }, { "epoch": 0.4223950233281493, "grad_norm": 9.403151512145996, "learning_rate": 3.208852005532504e-06, "logits/chosen": -1.2714649438858032, "logits/rejected": 1.8569127321243286, "logps/chosen": -438.8021240234375, "logps/rejected": -810.9757690429688, "loss": 0.3888, "rewards/accuracies": 0.875, "rewards/chosen": -3.917652130126953, "rewards/margins": 9.64877986907959, "rewards/rejected": -13.566431999206543, "step": 679 }, { "epoch": 0.4230171073094868, "grad_norm": 11.618382453918457, "learning_rate": 3.2053941908713696e-06, "logits/chosen": -1.2261033058166504, "logits/rejected": 2.25425386428833, "logps/chosen": -391.17877197265625, "logps/rejected": -704.4224243164062, "loss": 0.3948, "rewards/accuracies": 0.875, "rewards/chosen": -3.7610974311828613, "rewards/margins": 6.7278265953063965, "rewards/rejected": -10.488924980163574, "step": 680 }, { "epoch": 0.4236391912908243, "grad_norm": 0.20979534089565277, "learning_rate": 3.2019363762102353e-06, "logits/chosen": 0.27849024534225464, "logits/rejected": 2.715374231338501, "logps/chosen": -510.5223388671875, "logps/rejected": -851.9589233398438, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -4.393610954284668, "rewards/margins": 13.354633331298828, "rewards/rejected": -17.748245239257812, "step": 681 }, { "epoch": 0.42426127527216173, "grad_norm": 0.8542295098304749, "learning_rate": 3.198478561549101e-06, "logits/chosen": -0.7644785642623901, "logits/rejected": 0.45673567056655884, "logps/chosen": -349.2669372558594, "logps/rejected": -521.93505859375, "loss": 0.0297, "rewards/accuracies": 1.0, "rewards/chosen": -1.6909774541854858, "rewards/margins": 10.30357837677002, "rewards/rejected": -11.99455451965332, "step": 682 }, { "epoch": 0.42488335925349924, "grad_norm": 0.01458861492574215, "learning_rate": 3.1950207468879666e-06, "logits/chosen": -1.65714693069458, "logits/rejected": 2.468562602996826, "logps/chosen": -485.73907470703125, "logps/rejected": -816.0733642578125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -1.868625521659851, "rewards/margins": 13.75146484375, "rewards/rejected": -15.620089530944824, "step": 683 }, { "epoch": 0.4255054432348367, "grad_norm": 0.16329510509967804, "learning_rate": 3.1915629322268327e-06, "logits/chosen": -2.704524517059326, "logits/rejected": 2.3724300861358643, "logps/chosen": -350.6132507324219, "logps/rejected": -782.5008544921875, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -2.5192220211029053, "rewards/margins": 13.39132308959961, "rewards/rejected": -15.910545349121094, "step": 684 }, { "epoch": 0.4261275272161742, "grad_norm": 3.075333833694458, "learning_rate": 3.1881051175656984e-06, "logits/chosen": 0.09814734756946564, "logits/rejected": 2.5649566650390625, "logps/chosen": -435.3100280761719, "logps/rejected": -710.2622680664062, "loss": 0.0325, "rewards/accuracies": 1.0, "rewards/chosen": -3.2551212310791016, "rewards/margins": 13.05367374420166, "rewards/rejected": -16.308795928955078, "step": 685 }, { "epoch": 0.42674961119751165, "grad_norm": 4.477731704711914, "learning_rate": 3.1846473029045645e-06, "logits/chosen": -0.5426784753799438, "logits/rejected": 2.770493745803833, "logps/chosen": -437.3888244628906, "logps/rejected": -719.5175170898438, "loss": 0.0793, "rewards/accuracies": 1.0, "rewards/chosen": -3.0361671447753906, "rewards/margins": 7.626326084136963, "rewards/rejected": -10.662492752075195, "step": 686 }, { "epoch": 0.42737169517884915, "grad_norm": 12.728575706481934, "learning_rate": 3.18118948824343e-06, "logits/chosen": -0.3973678946495056, "logits/rejected": 1.5318927764892578, "logps/chosen": -614.48876953125, "logps/rejected": -800.8861694335938, "loss": 0.915, "rewards/accuracies": 0.875, "rewards/chosen": -4.877591609954834, "rewards/margins": 7.510666847229004, "rewards/rejected": -12.38825798034668, "step": 687 }, { "epoch": 0.4279937791601866, "grad_norm": 10.613053321838379, "learning_rate": 3.177731673582296e-06, "logits/chosen": -3.200235366821289, "logits/rejected": 0.0967845618724823, "logps/chosen": -402.4068298339844, "logps/rejected": -743.91552734375, "loss": 0.4169, "rewards/accuracies": 0.875, "rewards/chosen": -3.119838237762451, "rewards/margins": 9.32817268371582, "rewards/rejected": -12.448010444641113, "step": 688 }, { "epoch": 0.4286158631415241, "grad_norm": 0.1688246726989746, "learning_rate": 3.174273858921162e-06, "logits/chosen": -1.1192599534988403, "logits/rejected": 1.6919562816619873, "logps/chosen": -317.35858154296875, "logps/rejected": -616.4557495117188, "loss": 0.0079, "rewards/accuracies": 1.0, "rewards/chosen": -3.227689504623413, "rewards/margins": 8.496843338012695, "rewards/rejected": -11.724532127380371, "step": 689 }, { "epoch": 0.42923794712286156, "grad_norm": 10.537223815917969, "learning_rate": 3.1708160442600276e-06, "logits/chosen": -2.1871795654296875, "logits/rejected": 0.5226501226425171, "logps/chosen": -387.8138427734375, "logps/rejected": -594.34228515625, "loss": 0.4599, "rewards/accuracies": 0.875, "rewards/chosen": -3.1344966888427734, "rewards/margins": 6.28641414642334, "rewards/rejected": -9.420909881591797, "step": 690 }, { "epoch": 0.42986003110419907, "grad_norm": 14.032938957214355, "learning_rate": 3.1673582295988937e-06, "logits/chosen": -2.1785640716552734, "logits/rejected": 2.0351693630218506, "logps/chosen": -394.87017822265625, "logps/rejected": -723.1807861328125, "loss": 0.5494, "rewards/accuracies": 0.75, "rewards/chosen": -5.515456199645996, "rewards/margins": 6.703310966491699, "rewards/rejected": -12.218767166137695, "step": 691 }, { "epoch": 0.4304821150855365, "grad_norm": 6.9674811363220215, "learning_rate": 3.1639004149377594e-06, "logits/chosen": -1.6190794706344604, "logits/rejected": 1.2349615097045898, "logps/chosen": -403.44091796875, "logps/rejected": -673.282470703125, "loss": 0.1543, "rewards/accuracies": 0.875, "rewards/chosen": -4.28837776184082, "rewards/margins": 6.536858558654785, "rewards/rejected": -10.825236320495605, "step": 692 }, { "epoch": 0.431104199066874, "grad_norm": 14.982398986816406, "learning_rate": 3.1604426002766255e-06, "logits/chosen": 2.256753921508789, "logits/rejected": 3.7474100589752197, "logps/chosen": -663.500244140625, "logps/rejected": -776.5855712890625, "loss": 0.6059, "rewards/accuracies": 0.75, "rewards/chosen": -5.132218360900879, "rewards/margins": 4.795773506164551, "rewards/rejected": -9.92799186706543, "step": 693 }, { "epoch": 0.43172628304821153, "grad_norm": 0.47392502427101135, "learning_rate": 3.156984785615491e-06, "logits/chosen": -0.23346151411533356, "logits/rejected": -0.31524187326431274, "logps/chosen": -559.4511108398438, "logps/rejected": -601.6775512695312, "loss": 0.0078, "rewards/accuracies": 1.0, "rewards/chosen": -4.320326805114746, "rewards/margins": 10.536422729492188, "rewards/rejected": -14.856748580932617, "step": 694 }, { "epoch": 0.432348367029549, "grad_norm": 1.6567822694778442, "learning_rate": 3.153526970954357e-06, "logits/chosen": -1.3000528812408447, "logits/rejected": 2.235485315322876, "logps/chosen": -334.5820617675781, "logps/rejected": -678.7525634765625, "loss": 0.0183, "rewards/accuracies": 1.0, "rewards/chosen": -3.799971103668213, "rewards/margins": 11.104300498962402, "rewards/rejected": -14.90427017211914, "step": 695 }, { "epoch": 0.4329704510108865, "grad_norm": 0.014551909640431404, "learning_rate": 3.150069156293223e-06, "logits/chosen": -2.5936341285705566, "logits/rejected": 2.0377118587493896, "logps/chosen": -374.3807678222656, "logps/rejected": -886.470703125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -1.8306432962417603, "rewards/margins": 13.963092803955078, "rewards/rejected": -15.793737411499023, "step": 696 }, { "epoch": 0.43359253499222394, "grad_norm": 2.012770414352417, "learning_rate": 3.1466113416320886e-06, "logits/chosen": 0.5872510075569153, "logits/rejected": 2.227753162384033, "logps/chosen": -581.9154052734375, "logps/rejected": -734.0648803710938, "loss": 0.0358, "rewards/accuracies": 1.0, "rewards/chosen": -2.7733664512634277, "rewards/margins": 7.580231189727783, "rewards/rejected": -10.353597640991211, "step": 697 }, { "epoch": 0.43421461897356145, "grad_norm": 5.446049690246582, "learning_rate": 3.1431535269709547e-06, "logits/chosen": -0.752666175365448, "logits/rejected": -0.42898476123809814, "logps/chosen": -566.1244506835938, "logps/rejected": -664.2681884765625, "loss": 0.1064, "rewards/accuracies": 0.875, "rewards/chosen": -4.451830863952637, "rewards/margins": 6.750617980957031, "rewards/rejected": -11.202448844909668, "step": 698 }, { "epoch": 0.4348367029548989, "grad_norm": 3.0100889205932617, "learning_rate": 3.1396957123098204e-06, "logits/chosen": -0.9783447980880737, "logits/rejected": 2.5343995094299316, "logps/chosen": -311.4595642089844, "logps/rejected": -667.2244262695312, "loss": 0.1135, "rewards/accuracies": 0.875, "rewards/chosen": -2.1098272800445557, "rewards/margins": 9.217432975769043, "rewards/rejected": -11.32726001739502, "step": 699 }, { "epoch": 0.4354587869362364, "grad_norm": 0.08864054828882217, "learning_rate": 3.136237897648686e-06, "logits/chosen": -1.035918951034546, "logits/rejected": 2.5011227130889893, "logps/chosen": -406.7484130859375, "logps/rejected": -790.5151977539062, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -2.589345932006836, "rewards/margins": 11.779244422912598, "rewards/rejected": -14.368590354919434, "step": 700 }, { "epoch": 0.43608087091757386, "grad_norm": 10.84559440612793, "learning_rate": 3.132780082987552e-06, "logits/chosen": -2.094726085662842, "logits/rejected": 1.918656349182129, "logps/chosen": -385.4011535644531, "logps/rejected": -745.6541137695312, "loss": 0.5302, "rewards/accuracies": 0.75, "rewards/chosen": -3.678028106689453, "rewards/margins": 8.90552043914795, "rewards/rejected": -12.583548545837402, "step": 701 }, { "epoch": 0.43670295489891137, "grad_norm": 0.029643505811691284, "learning_rate": 3.129322268326418e-06, "logits/chosen": -1.365593671798706, "logits/rejected": 2.027764320373535, "logps/chosen": -446.7522277832031, "logps/rejected": -780.4906005859375, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -4.691719055175781, "rewards/margins": 9.797523498535156, "rewards/rejected": -14.489242553710938, "step": 702 }, { "epoch": 0.4373250388802488, "grad_norm": 0.034818943589925766, "learning_rate": 3.125864453665284e-06, "logits/chosen": -2.489362955093384, "logits/rejected": 2.157773494720459, "logps/chosen": -248.21383666992188, "logps/rejected": -742.4315795898438, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -0.8762140870094299, "rewards/margins": 13.77511978149414, "rewards/rejected": -14.651333808898926, "step": 703 }, { "epoch": 0.4379471228615863, "grad_norm": 3.0396409034729004, "learning_rate": 3.1224066390041496e-06, "logits/chosen": 0.5511890053749084, "logits/rejected": 2.797116279602051, "logps/chosen": -588.095703125, "logps/rejected": -828.099365234375, "loss": 0.0505, "rewards/accuracies": 1.0, "rewards/chosen": -5.251364707946777, "rewards/margins": 7.736400127410889, "rewards/rejected": -12.987764358520508, "step": 704 }, { "epoch": 0.4385692068429238, "grad_norm": 7.324325084686279, "learning_rate": 3.1189488243430153e-06, "logits/chosen": -1.0149247646331787, "logits/rejected": 0.8560662865638733, "logps/chosen": -441.1534423828125, "logps/rejected": -678.6373901367188, "loss": 0.0869, "rewards/accuracies": 1.0, "rewards/chosen": -5.113785743713379, "rewards/margins": 8.92860221862793, "rewards/rejected": -14.042387008666992, "step": 705 }, { "epoch": 0.4391912908242613, "grad_norm": 6.743057727813721, "learning_rate": 3.1154910096818814e-06, "logits/chosen": -0.7383509278297424, "logits/rejected": 0.15974515676498413, "logps/chosen": -404.7921447753906, "logps/rejected": -522.1397094726562, "loss": 0.1585, "rewards/accuracies": 0.875, "rewards/chosen": -2.208122491836548, "rewards/margins": 8.250724792480469, "rewards/rejected": -10.458847045898438, "step": 706 }, { "epoch": 0.43981337480559873, "grad_norm": 0.2867901921272278, "learning_rate": 3.112033195020747e-06, "logits/chosen": 0.014238402247428894, "logits/rejected": 2.236502170562744, "logps/chosen": -504.8896484375, "logps/rejected": -780.6438598632812, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -4.396859169006348, "rewards/margins": 12.214816093444824, "rewards/rejected": -16.611675262451172, "step": 707 }, { "epoch": 0.44043545878693624, "grad_norm": 0.05427764356136322, "learning_rate": 3.108575380359613e-06, "logits/chosen": -1.5963308811187744, "logits/rejected": 1.8816163539886475, "logps/chosen": -299.90447998046875, "logps/rejected": -635.202392578125, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -2.732032299041748, "rewards/margins": 11.46823787689209, "rewards/rejected": -14.200270652770996, "step": 708 }, { "epoch": 0.4410575427682737, "grad_norm": 2.9901983737945557, "learning_rate": 3.105117565698479e-06, "logits/chosen": -4.364803791046143, "logits/rejected": 0.19566810131072998, "logps/chosen": -361.5037536621094, "logps/rejected": -761.5491333007812, "loss": 0.0874, "rewards/accuracies": 1.0, "rewards/chosen": -4.059610843658447, "rewards/margins": 9.0576810836792, "rewards/rejected": -13.117291450500488, "step": 709 }, { "epoch": 0.4416796267496112, "grad_norm": 5.741774082183838, "learning_rate": 3.101659751037345e-06, "logits/chosen": 0.6587420701980591, "logits/rejected": 2.1166939735412598, "logps/chosen": -504.3636779785156, "logps/rejected": -716.9227294921875, "loss": 0.1106, "rewards/accuracies": 0.875, "rewards/chosen": -2.365894317626953, "rewards/margins": 9.8243408203125, "rewards/rejected": -12.190235137939453, "step": 710 }, { "epoch": 0.4423017107309487, "grad_norm": 4.629523754119873, "learning_rate": 3.0982019363762106e-06, "logits/chosen": -0.34517642855644226, "logits/rejected": 2.3793282508850098, "logps/chosen": -483.5632019042969, "logps/rejected": -749.6613159179688, "loss": 0.1452, "rewards/accuracies": 0.875, "rewards/chosen": -3.045201063156128, "rewards/margins": 9.88872241973877, "rewards/rejected": -12.933923721313477, "step": 711 }, { "epoch": 0.44292379471228616, "grad_norm": 6.142333030700684, "learning_rate": 3.0947441217150763e-06, "logits/chosen": -0.11096763610839844, "logits/rejected": 3.2173640727996826, "logps/chosen": -455.15771484375, "logps/rejected": -739.3885498046875, "loss": 0.1334, "rewards/accuracies": 0.875, "rewards/chosen": -2.7734031677246094, "rewards/margins": 10.121636390686035, "rewards/rejected": -12.895039558410645, "step": 712 }, { "epoch": 0.44354587869362366, "grad_norm": 3.326890468597412, "learning_rate": 3.0912863070539424e-06, "logits/chosen": -3.5047852993011475, "logits/rejected": -0.0872359499335289, "logps/chosen": -357.24896240234375, "logps/rejected": -694.9989624023438, "loss": 0.0491, "rewards/accuracies": 1.0, "rewards/chosen": -2.7858734130859375, "rewards/margins": 12.651700019836426, "rewards/rejected": -15.43757438659668, "step": 713 }, { "epoch": 0.4441679626749611, "grad_norm": 0.019906649366021156, "learning_rate": 3.087828492392808e-06, "logits/chosen": -0.9687420129776001, "logits/rejected": 1.860982894897461, "logps/chosen": -362.2194519042969, "logps/rejected": -659.05908203125, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.25382661819458, "rewards/margins": 10.760574340820312, "rewards/rejected": -13.014400482177734, "step": 714 }, { "epoch": 0.4447900466562986, "grad_norm": 2.237276315689087, "learning_rate": 3.084370677731674e-06, "logits/chosen": -3.206549644470215, "logits/rejected": 0.2995449900627136, "logps/chosen": -349.06060791015625, "logps/rejected": -708.20263671875, "loss": 0.0253, "rewards/accuracies": 1.0, "rewards/chosen": -2.9651365280151367, "rewards/margins": 9.084753036499023, "rewards/rejected": -12.04988956451416, "step": 715 }, { "epoch": 0.4454121306376361, "grad_norm": 0.28772684931755066, "learning_rate": 3.08091286307054e-06, "logits/chosen": 0.44255098700523376, "logits/rejected": 1.7277225255966187, "logps/chosen": -557.5095825195312, "logps/rejected": -723.0860595703125, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": -5.008021354675293, "rewards/margins": 8.728983879089355, "rewards/rejected": -13.737004280090332, "step": 716 }, { "epoch": 0.4460342146189736, "grad_norm": 0.0050316182896494865, "learning_rate": 3.0774550484094055e-06, "logits/chosen": -3.459505081176758, "logits/rejected": 2.6668598651885986, "logps/chosen": -208.3125457763672, "logps/rejected": -712.3590087890625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -1.7346527576446533, "rewards/margins": 14.146993637084961, "rewards/rejected": -15.881645202636719, "step": 717 }, { "epoch": 0.44665629860031103, "grad_norm": 17.51116371154785, "learning_rate": 3.0739972337482716e-06, "logits/chosen": -1.1133579015731812, "logits/rejected": 2.061647891998291, "logps/chosen": -472.7833251953125, "logps/rejected": -761.3068237304688, "loss": 0.9351, "rewards/accuracies": 0.625, "rewards/chosen": -2.1725363731384277, "rewards/margins": 5.186452865600586, "rewards/rejected": -7.358989238739014, "step": 718 }, { "epoch": 0.44727838258164854, "grad_norm": 0.029986217617988586, "learning_rate": 3.0705394190871373e-06, "logits/chosen": -2.2766194343566895, "logits/rejected": 1.52131986618042, "logps/chosen": -416.427978515625, "logps/rejected": -806.0741577148438, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -2.4974558353424072, "rewards/margins": 13.082174301147461, "rewards/rejected": -15.579630851745605, "step": 719 }, { "epoch": 0.447900466562986, "grad_norm": 1.7208304405212402, "learning_rate": 3.0670816044260034e-06, "logits/chosen": -0.42871132493019104, "logits/rejected": 1.369104027748108, "logps/chosen": -454.74053955078125, "logps/rejected": -667.4635009765625, "loss": 0.0594, "rewards/accuracies": 1.0, "rewards/chosen": -3.3207778930664062, "rewards/margins": 6.532223701477051, "rewards/rejected": -9.853001594543457, "step": 720 }, { "epoch": 0.4485225505443235, "grad_norm": 7.669965744018555, "learning_rate": 3.063623789764869e-06, "logits/chosen": 0.43128448724746704, "logits/rejected": 2.2884891033172607, "logps/chosen": -685.76318359375, "logps/rejected": -860.8486328125, "loss": 0.1853, "rewards/accuracies": 0.875, "rewards/chosen": -1.7041456699371338, "rewards/margins": 10.447870254516602, "rewards/rejected": -12.152015686035156, "step": 721 }, { "epoch": 0.44914463452566095, "grad_norm": 6.198464393615723, "learning_rate": 3.0601659751037347e-06, "logits/chosen": -0.7020193934440613, "logits/rejected": 1.5930724143981934, "logps/chosen": -430.36273193359375, "logps/rejected": -638.633544921875, "loss": 0.185, "rewards/accuracies": 1.0, "rewards/chosen": -2.259809732437134, "rewards/margins": 6.790175437927246, "rewards/rejected": -9.049985885620117, "step": 722 }, { "epoch": 0.44976671850699845, "grad_norm": 11.20889949798584, "learning_rate": 3.056708160442601e-06, "logits/chosen": 1.4442417621612549, "logits/rejected": 3.9642751216888428, "logps/chosen": -575.2060546875, "logps/rejected": -832.597412109375, "loss": 0.3485, "rewards/accuracies": 0.75, "rewards/chosen": -3.273742437362671, "rewards/margins": 7.68789005279541, "rewards/rejected": -10.96163272857666, "step": 723 }, { "epoch": 0.4503888024883359, "grad_norm": 0.08017276227474213, "learning_rate": 3.0532503457814665e-06, "logits/chosen": -1.691275954246521, "logits/rejected": 0.8510454893112183, "logps/chosen": -474.8917541503906, "logps/rejected": -743.270263671875, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -1.9573373794555664, "rewards/margins": 11.011128425598145, "rewards/rejected": -12.968465805053711, "step": 724 }, { "epoch": 0.4510108864696734, "grad_norm": 7.945643424987793, "learning_rate": 3.0497925311203326e-06, "logits/chosen": -0.9796038866043091, "logits/rejected": 1.3270384073257446, "logps/chosen": -661.9572143554688, "logps/rejected": -817.2179565429688, "loss": 0.1882, "rewards/accuracies": 0.875, "rewards/chosen": -4.118674278259277, "rewards/margins": 7.844497203826904, "rewards/rejected": -11.963171005249023, "step": 725 }, { "epoch": 0.45163297045101086, "grad_norm": 7.258907794952393, "learning_rate": 3.046334716459198e-06, "logits/chosen": -2.8530001640319824, "logits/rejected": 0.3892638385295868, "logps/chosen": -444.63116455078125, "logps/rejected": -806.1827392578125, "loss": 0.1195, "rewards/accuracies": 0.875, "rewards/chosen": -3.3728339672088623, "rewards/margins": 11.749553680419922, "rewards/rejected": -15.122386932373047, "step": 726 }, { "epoch": 0.45225505443234837, "grad_norm": 8.815637588500977, "learning_rate": 3.0428769017980635e-06, "logits/chosen": 0.0035632923245429993, "logits/rejected": 2.1954150199890137, "logps/chosen": -514.3666381835938, "logps/rejected": -769.7069702148438, "loss": 0.322, "rewards/accuracies": 0.875, "rewards/chosen": -3.152292490005493, "rewards/margins": 8.501354217529297, "rewards/rejected": -11.653646469116211, "step": 727 }, { "epoch": 0.4528771384136858, "grad_norm": 11.27776050567627, "learning_rate": 3.0394190871369296e-06, "logits/chosen": 0.5696735382080078, "logits/rejected": 2.3467257022857666, "logps/chosen": -713.609375, "logps/rejected": -889.744384765625, "loss": 0.439, "rewards/accuracies": 0.75, "rewards/chosen": -4.995273590087891, "rewards/margins": 8.260981559753418, "rewards/rejected": -13.256254196166992, "step": 728 }, { "epoch": 0.4534992223950233, "grad_norm": 2.2402448654174805, "learning_rate": 3.0359612724757953e-06, "logits/chosen": -1.8808625936508179, "logits/rejected": 1.043796181678772, "logps/chosen": -423.27142333984375, "logps/rejected": -716.957275390625, "loss": 0.097, "rewards/accuracies": 0.875, "rewards/chosen": -1.2840181589126587, "rewards/margins": 8.761134147644043, "rewards/rejected": -10.04515266418457, "step": 729 }, { "epoch": 0.45412130637636083, "grad_norm": 0.4544994533061981, "learning_rate": 3.032503457814661e-06, "logits/chosen": -2.1568350791931152, "logits/rejected": 1.1574361324310303, "logps/chosen": -532.4506225585938, "logps/rejected": -825.7210693359375, "loss": 0.007, "rewards/accuracies": 1.0, "rewards/chosen": -4.8567585945129395, "rewards/margins": 11.81627082824707, "rewards/rejected": -16.67302894592285, "step": 730 }, { "epoch": 0.4547433903576983, "grad_norm": 0.32847049832344055, "learning_rate": 3.029045643153527e-06, "logits/chosen": -2.7892959117889404, "logits/rejected": 1.853650689125061, "logps/chosen": -454.8009338378906, "logps/rejected": -831.5844116210938, "loss": 0.0051, "rewards/accuracies": 1.0, "rewards/chosen": -4.958532333374023, "rewards/margins": 12.624985694885254, "rewards/rejected": -17.583518981933594, "step": 731 }, { "epoch": 0.4553654743390358, "grad_norm": 0.03330639749765396, "learning_rate": 3.0255878284923927e-06, "logits/chosen": -2.3786582946777344, "logits/rejected": 2.1897826194763184, "logps/chosen": -288.08331298828125, "logps/rejected": -786.6109008789062, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.0537655353546143, "rewards/margins": 12.041199684143066, "rewards/rejected": -14.094964981079102, "step": 732 }, { "epoch": 0.45598755832037324, "grad_norm": 0.06069202348589897, "learning_rate": 3.022130013831259e-06, "logits/chosen": -0.2544625997543335, "logits/rejected": 2.753920555114746, "logps/chosen": -477.0604248046875, "logps/rejected": -811.161865234375, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -3.119234323501587, "rewards/margins": 13.048916816711426, "rewards/rejected": -16.16815185546875, "step": 733 }, { "epoch": 0.45660964230171075, "grad_norm": 3.922071933746338, "learning_rate": 3.0186721991701245e-06, "logits/chosen": -2.0510897636413574, "logits/rejected": 2.5488157272338867, "logps/chosen": -420.5107421875, "logps/rejected": -824.8824462890625, "loss": 0.0584, "rewards/accuracies": 1.0, "rewards/chosen": -2.0876171588897705, "rewards/margins": 11.934255599975586, "rewards/rejected": -14.021873474121094, "step": 734 }, { "epoch": 0.4572317262830482, "grad_norm": 0.6180808544158936, "learning_rate": 3.0152143845089906e-06, "logits/chosen": -1.5253479480743408, "logits/rejected": 1.1228837966918945, "logps/chosen": -278.7316589355469, "logps/rejected": -580.3323974609375, "loss": 0.0177, "rewards/accuracies": 1.0, "rewards/chosen": -2.349078893661499, "rewards/margins": 10.124811172485352, "rewards/rejected": -12.473889350891113, "step": 735 }, { "epoch": 0.4578538102643857, "grad_norm": 1.72667396068573, "learning_rate": 3.0117565698478563e-06, "logits/chosen": -1.0331193208694458, "logits/rejected": 1.3170900344848633, "logps/chosen": -523.2300415039062, "logps/rejected": -743.0133056640625, "loss": 0.0397, "rewards/accuracies": 1.0, "rewards/chosen": -3.4451475143432617, "rewards/margins": 10.577926635742188, "rewards/rejected": -14.023075103759766, "step": 736 }, { "epoch": 0.45847589424572316, "grad_norm": 1.6295750141143799, "learning_rate": 3.008298755186722e-06, "logits/chosen": -0.22234046459197998, "logits/rejected": 1.7111839056015015, "logps/chosen": -469.24554443359375, "logps/rejected": -746.6301879882812, "loss": 0.0176, "rewards/accuracies": 1.0, "rewards/chosen": -4.926995277404785, "rewards/margins": 10.80273723602295, "rewards/rejected": -15.729732513427734, "step": 737 }, { "epoch": 0.45909797822706067, "grad_norm": 9.630468368530273, "learning_rate": 3.004840940525588e-06, "logits/chosen": -2.7610280513763428, "logits/rejected": 2.4612324237823486, "logps/chosen": -363.037353515625, "logps/rejected": -791.9073486328125, "loss": 0.282, "rewards/accuracies": 0.875, "rewards/chosen": -2.991577625274658, "rewards/margins": 10.19937515258789, "rewards/rejected": -13.190953254699707, "step": 738 }, { "epoch": 0.4597200622083981, "grad_norm": 6.337337970733643, "learning_rate": 3.0013831258644537e-06, "logits/chosen": -1.2793059349060059, "logits/rejected": 2.3154563903808594, "logps/chosen": -356.61883544921875, "logps/rejected": -615.3483276367188, "loss": 0.1525, "rewards/accuracies": 0.875, "rewards/chosen": -2.316521167755127, "rewards/margins": 9.875509262084961, "rewards/rejected": -12.19202995300293, "step": 739 }, { "epoch": 0.4603421461897356, "grad_norm": 1.9553436040878296, "learning_rate": 2.99792531120332e-06, "logits/chosen": -0.3469884991645813, "logits/rejected": 1.415436029434204, "logps/chosen": -529.076904296875, "logps/rejected": -647.0120239257812, "loss": 0.0328, "rewards/accuracies": 1.0, "rewards/chosen": -3.657111406326294, "rewards/margins": 8.383366584777832, "rewards/rejected": -12.040477752685547, "step": 740 }, { "epoch": 0.4609642301710731, "grad_norm": 0.013914545066654682, "learning_rate": 2.9944674965421855e-06, "logits/chosen": -3.1489317417144775, "logits/rejected": 2.13686466217041, "logps/chosen": -320.142822265625, "logps/rejected": -838.8778686523438, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.323423385620117, "rewards/margins": 14.666143417358398, "rewards/rejected": -16.989566802978516, "step": 741 }, { "epoch": 0.4615863141524106, "grad_norm": 10.756217002868652, "learning_rate": 2.991009681881051e-06, "logits/chosen": -1.5489249229431152, "logits/rejected": 1.7386950254440308, "logps/chosen": -459.6607666015625, "logps/rejected": -749.91748046875, "loss": 0.4838, "rewards/accuracies": 0.875, "rewards/chosen": -3.4601821899414062, "rewards/margins": 7.2551374435424805, "rewards/rejected": -10.715320587158203, "step": 742 }, { "epoch": 0.46220839813374803, "grad_norm": 7.462672233581543, "learning_rate": 2.9875518672199173e-06, "logits/chosen": -1.6180301904678345, "logits/rejected": 0.646544337272644, "logps/chosen": -389.6840515136719, "logps/rejected": -666.617431640625, "loss": 0.102, "rewards/accuracies": 0.875, "rewards/chosen": -2.907453775405884, "rewards/margins": 10.085267066955566, "rewards/rejected": -12.992721557617188, "step": 743 }, { "epoch": 0.46283048211508554, "grad_norm": 0.026175174862146378, "learning_rate": 2.984094052558783e-06, "logits/chosen": -0.9026546478271484, "logits/rejected": 2.0860936641693115, "logps/chosen": -449.5962829589844, "logps/rejected": -766.5008544921875, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -3.387544870376587, "rewards/margins": 15.6162748336792, "rewards/rejected": -19.00381851196289, "step": 744 }, { "epoch": 0.463452566096423, "grad_norm": 7.469945907592773, "learning_rate": 2.980636237897649e-06, "logits/chosen": -2.028024196624756, "logits/rejected": 1.9433135986328125, "logps/chosen": -334.4049377441406, "logps/rejected": -668.7412109375, "loss": 0.2449, "rewards/accuracies": 0.875, "rewards/chosen": -2.1763999462127686, "rewards/margins": 13.228769302368164, "rewards/rejected": -15.405168533325195, "step": 745 }, { "epoch": 0.4640746500777605, "grad_norm": 3.606426239013672, "learning_rate": 2.9771784232365147e-06, "logits/chosen": -3.4823436737060547, "logits/rejected": -0.6158607602119446, "logps/chosen": -434.3836669921875, "logps/rejected": -720.673583984375, "loss": 0.092, "rewards/accuracies": 1.0, "rewards/chosen": -3.1005947589874268, "rewards/margins": 8.966793060302734, "rewards/rejected": -12.067388534545898, "step": 746 }, { "epoch": 0.464696734059098, "grad_norm": 0.8679505586624146, "learning_rate": 2.9737206085753804e-06, "logits/chosen": -0.306864857673645, "logits/rejected": 2.0049102306365967, "logps/chosen": -322.28228759765625, "logps/rejected": -608.8792114257812, "loss": 0.009, "rewards/accuracies": 1.0, "rewards/chosen": -3.2067694664001465, "rewards/margins": 11.397992134094238, "rewards/rejected": -14.604761123657227, "step": 747 }, { "epoch": 0.46531881804043546, "grad_norm": 0.013280616141855717, "learning_rate": 2.9702627939142465e-06, "logits/chosen": -2.4182138442993164, "logits/rejected": 1.0155261754989624, "logps/chosen": -374.8432312011719, "logps/rejected": -750.22705078125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.9369893074035645, "rewards/margins": 12.454451560974121, "rewards/rejected": -15.391441345214844, "step": 748 }, { "epoch": 0.46594090202177296, "grad_norm": 3.646437644958496, "learning_rate": 2.966804979253112e-06, "logits/chosen": 0.25956860184669495, "logits/rejected": 2.7566702365875244, "logps/chosen": -430.9888916015625, "logps/rejected": -744.0919189453125, "loss": 0.1087, "rewards/accuracies": 1.0, "rewards/chosen": -2.7358927726745605, "rewards/margins": 7.419732093811035, "rewards/rejected": -10.155624389648438, "step": 749 }, { "epoch": 0.4665629860031104, "grad_norm": 14.62672233581543, "learning_rate": 2.9633471645919783e-06, "logits/chosen": 0.5327641367912292, "logits/rejected": 1.4669005870819092, "logps/chosen": -532.407958984375, "logps/rejected": -746.432373046875, "loss": 0.6932, "rewards/accuracies": 0.875, "rewards/chosen": -4.417051315307617, "rewards/margins": 8.366933822631836, "rewards/rejected": -12.783985137939453, "step": 750 }, { "epoch": 0.4671850699844479, "grad_norm": 6.82889461517334, "learning_rate": 2.959889349930844e-06, "logits/chosen": -0.9305851459503174, "logits/rejected": 1.021018624305725, "logps/chosen": -457.11724853515625, "logps/rejected": -701.9202270507812, "loss": 0.2929, "rewards/accuracies": 0.875, "rewards/chosen": -2.87644624710083, "rewards/margins": 13.102041244506836, "rewards/rejected": -15.978487968444824, "step": 751 }, { "epoch": 0.46780715396578537, "grad_norm": 0.18674100935459137, "learning_rate": 2.95643153526971e-06, "logits/chosen": 0.9670653939247131, "logits/rejected": 1.3385288715362549, "logps/chosen": -474.38201904296875, "logps/rejected": -608.8443603515625, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": -2.7832205295562744, "rewards/margins": 9.81285285949707, "rewards/rejected": -12.596074104309082, "step": 752 }, { "epoch": 0.4684292379471229, "grad_norm": 8.648846626281738, "learning_rate": 2.9529737206085757e-06, "logits/chosen": -0.6550443172454834, "logits/rejected": 1.6410958766937256, "logps/chosen": -551.1663208007812, "logps/rejected": -754.9622802734375, "loss": 0.3162, "rewards/accuracies": 0.875, "rewards/chosen": -2.4769089221954346, "rewards/margins": 8.597972869873047, "rewards/rejected": -11.074882507324219, "step": 753 }, { "epoch": 0.46905132192846033, "grad_norm": 12.919448852539062, "learning_rate": 2.9495159059474414e-06, "logits/chosen": -3.1753220558166504, "logits/rejected": 0.27937957644462585, "logps/chosen": -330.33514404296875, "logps/rejected": -591.4677734375, "loss": 1.2465, "rewards/accuracies": 0.875, "rewards/chosen": -2.3241162300109863, "rewards/margins": 8.910348892211914, "rewards/rejected": -11.234464645385742, "step": 754 }, { "epoch": 0.46967340590979784, "grad_norm": 12.515019416809082, "learning_rate": 2.9460580912863075e-06, "logits/chosen": -1.3804985284805298, "logits/rejected": 0.24964234232902527, "logps/chosen": -497.03936767578125, "logps/rejected": -736.6233520507812, "loss": 1.0217, "rewards/accuracies": 0.875, "rewards/chosen": -3.451972007751465, "rewards/margins": 8.669477462768555, "rewards/rejected": -12.121448516845703, "step": 755 }, { "epoch": 0.4702954898911353, "grad_norm": 18.31880760192871, "learning_rate": 2.942600276625173e-06, "logits/chosen": 0.12169390916824341, "logits/rejected": 2.4365978240966797, "logps/chosen": -543.4795532226562, "logps/rejected": -736.1213989257812, "loss": 1.416, "rewards/accuracies": 0.625, "rewards/chosen": -4.9577436447143555, "rewards/margins": 10.255131721496582, "rewards/rejected": -15.212875366210938, "step": 756 }, { "epoch": 0.4709175738724728, "grad_norm": 0.1776166707277298, "learning_rate": 2.9391424619640392e-06, "logits/chosen": 2.812436103820801, "logits/rejected": 2.4633126258850098, "logps/chosen": -682.6735229492188, "logps/rejected": -735.7775268554688, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -6.131606101989746, "rewards/margins": 8.689409255981445, "rewards/rejected": -14.821015357971191, "step": 757 }, { "epoch": 0.47153965785381025, "grad_norm": 5.808371543884277, "learning_rate": 2.935684647302905e-06, "logits/chosen": 0.2484622299671173, "logits/rejected": 0.2521892786026001, "logps/chosen": -679.6835327148438, "logps/rejected": -700.8118286132812, "loss": 0.1655, "rewards/accuracies": 0.875, "rewards/chosen": -3.858510732650757, "rewards/margins": 7.597812175750732, "rewards/rejected": -11.456321716308594, "step": 758 }, { "epoch": 0.47216174183514775, "grad_norm": 7.215651988983154, "learning_rate": 2.9322268326417706e-06, "logits/chosen": -3.2874770164489746, "logits/rejected": -0.7016294598579407, "logps/chosen": -435.744384765625, "logps/rejected": -689.0988159179688, "loss": 0.1333, "rewards/accuracies": 0.875, "rewards/chosen": -1.6870291233062744, "rewards/margins": 9.981165885925293, "rewards/rejected": -11.668194770812988, "step": 759 }, { "epoch": 0.4727838258164852, "grad_norm": 14.464906692504883, "learning_rate": 2.9287690179806367e-06, "logits/chosen": 0.1948835849761963, "logits/rejected": 0.7579115629196167, "logps/chosen": -592.904296875, "logps/rejected": -690.428466796875, "loss": 1.0619, "rewards/accuracies": 0.75, "rewards/chosen": -5.188655376434326, "rewards/margins": 6.476170539855957, "rewards/rejected": -11.664826393127441, "step": 760 }, { "epoch": 0.4734059097978227, "grad_norm": 11.2656888961792, "learning_rate": 2.9253112033195024e-06, "logits/chosen": -1.4519120454788208, "logits/rejected": 2.472182512283325, "logps/chosen": -476.26702880859375, "logps/rejected": -749.05322265625, "loss": 0.385, "rewards/accuracies": 0.875, "rewards/chosen": -3.652994155883789, "rewards/margins": 6.868544101715088, "rewards/rejected": -10.521537780761719, "step": 761 }, { "epoch": 0.47402799377916016, "grad_norm": 0.9834259152412415, "learning_rate": 2.9218533886583685e-06, "logits/chosen": 0.5894418358802795, "logits/rejected": 3.5646252632141113, "logps/chosen": -428.6927490234375, "logps/rejected": -713.3218994140625, "loss": 0.0291, "rewards/accuracies": 1.0, "rewards/chosen": -2.3402018547058105, "rewards/margins": 9.103296279907227, "rewards/rejected": -11.443498611450195, "step": 762 }, { "epoch": 0.47465007776049767, "grad_norm": 0.0012660648208111525, "learning_rate": 2.918395573997234e-06, "logits/chosen": -0.9826505184173584, "logits/rejected": 2.3840858936309814, "logps/chosen": -527.2155151367188, "logps/rejected": -889.6929321289062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.6804518699645996, "rewards/margins": 15.893087387084961, "rewards/rejected": -18.57353973388672, "step": 763 }, { "epoch": 0.4752721617418352, "grad_norm": 6.876689910888672, "learning_rate": 2.9149377593361e-06, "logits/chosen": 0.250545859336853, "logits/rejected": 2.471944570541382, "logps/chosen": -527.92578125, "logps/rejected": -708.1130981445312, "loss": 0.1495, "rewards/accuracies": 0.875, "rewards/chosen": -2.4136877059936523, "rewards/margins": 6.974236965179443, "rewards/rejected": -9.387924194335938, "step": 764 }, { "epoch": 0.4758942457231726, "grad_norm": 7.7720818519592285, "learning_rate": 2.911479944674966e-06, "logits/chosen": 1.602400779724121, "logits/rejected": 1.8558259010314941, "logps/chosen": -644.9960327148438, "logps/rejected": -683.6669311523438, "loss": 0.2496, "rewards/accuracies": 0.875, "rewards/chosen": -2.9434196949005127, "rewards/margins": 7.4508867263793945, "rewards/rejected": -10.394306182861328, "step": 765 }, { "epoch": 0.47651632970451013, "grad_norm": 6.905212879180908, "learning_rate": 2.9080221300138316e-06, "logits/chosen": -0.5166137218475342, "logits/rejected": 1.9035359621047974, "logps/chosen": -551.2916259765625, "logps/rejected": -827.5794677734375, "loss": 0.1113, "rewards/accuracies": 0.875, "rewards/chosen": -2.7271530628204346, "rewards/margins": 11.749853134155273, "rewards/rejected": -14.477006912231445, "step": 766 }, { "epoch": 0.4771384136858476, "grad_norm": 18.120454788208008, "learning_rate": 2.9045643153526977e-06, "logits/chosen": -2.3964920043945312, "logits/rejected": -0.24302253127098083, "logps/chosen": -464.3152160644531, "logps/rejected": -675.7804565429688, "loss": 0.8181, "rewards/accuracies": 0.75, "rewards/chosen": -3.9169716835021973, "rewards/margins": 7.095893859863281, "rewards/rejected": -11.012866020202637, "step": 767 }, { "epoch": 0.4777604976671851, "grad_norm": 0.325296014547348, "learning_rate": 2.9011065006915634e-06, "logits/chosen": -0.6314485669136047, "logits/rejected": 2.870893955230713, "logps/chosen": -399.208984375, "logps/rejected": -672.84326171875, "loss": 0.0062, "rewards/accuracies": 1.0, "rewards/chosen": -1.7347712516784668, "rewards/margins": 8.93397331237793, "rewards/rejected": -10.668744087219238, "step": 768 }, { "epoch": 0.47838258164852254, "grad_norm": 0.5934820771217346, "learning_rate": 2.8976486860304295e-06, "logits/chosen": -1.2716343402862549, "logits/rejected": 0.9507853388786316, "logps/chosen": -287.3315124511719, "logps/rejected": -522.9884643554688, "loss": 0.0315, "rewards/accuracies": 1.0, "rewards/chosen": -1.8230342864990234, "rewards/margins": 7.966280937194824, "rewards/rejected": -9.789314270019531, "step": 769 }, { "epoch": 0.47900466562986005, "grad_norm": 7.945781707763672, "learning_rate": 2.894190871369295e-06, "logits/chosen": -0.3926021456718445, "logits/rejected": 0.8242150545120239, "logps/chosen": -604.8637084960938, "logps/rejected": -674.1680908203125, "loss": 0.1982, "rewards/accuracies": 0.875, "rewards/chosen": -2.8679234981536865, "rewards/margins": 7.818727493286133, "rewards/rejected": -10.686651229858398, "step": 770 }, { "epoch": 0.4796267496111975, "grad_norm": 8.589969635009766, "learning_rate": 2.890733056708161e-06, "logits/chosen": -2.6945464611053467, "logits/rejected": 0.5001758337020874, "logps/chosen": -419.3684387207031, "logps/rejected": -736.1264038085938, "loss": 0.2118, "rewards/accuracies": 0.875, "rewards/chosen": -2.588325023651123, "rewards/margins": 9.23220443725586, "rewards/rejected": -11.82052993774414, "step": 771 }, { "epoch": 0.480248833592535, "grad_norm": 2.422610282897949, "learning_rate": 2.887275242047026e-06, "logits/chosen": -0.7853689193725586, "logits/rejected": 1.656580924987793, "logps/chosen": -409.4538269042969, "logps/rejected": -717.7423095703125, "loss": 0.1157, "rewards/accuracies": 0.875, "rewards/chosen": -2.000337600708008, "rewards/margins": 9.804020881652832, "rewards/rejected": -11.80435848236084, "step": 772 }, { "epoch": 0.48087091757387246, "grad_norm": 6.5073041915893555, "learning_rate": 2.883817427385892e-06, "logits/chosen": 1.2587249279022217, "logits/rejected": 0.8521639108657837, "logps/chosen": -619.0385131835938, "logps/rejected": -687.81640625, "loss": 0.2582, "rewards/accuracies": 0.875, "rewards/chosen": -3.604969024658203, "rewards/margins": 9.394414901733398, "rewards/rejected": -12.999383926391602, "step": 773 }, { "epoch": 0.48149300155520997, "grad_norm": 0.9350523352622986, "learning_rate": 2.880359612724758e-06, "logits/chosen": -0.5437452793121338, "logits/rejected": 3.0648488998413086, "logps/chosen": -445.6175842285156, "logps/rejected": -771.0081787109375, "loss": 0.02, "rewards/accuracies": 1.0, "rewards/chosen": -3.3415884971618652, "rewards/margins": 11.5773344039917, "rewards/rejected": -14.918924331665039, "step": 774 }, { "epoch": 0.4821150855365474, "grad_norm": 2.7919063568115234, "learning_rate": 2.876901798063624e-06, "logits/chosen": -1.0898349285125732, "logits/rejected": 1.9837009906768799, "logps/chosen": -524.8995971679688, "logps/rejected": -828.731689453125, "loss": 0.0406, "rewards/accuracies": 1.0, "rewards/chosen": -2.133089065551758, "rewards/margins": 7.766391277313232, "rewards/rejected": -9.899479866027832, "step": 775 }, { "epoch": 0.4827371695178849, "grad_norm": 8.589869499206543, "learning_rate": 2.8734439834024896e-06, "logits/chosen": 0.18621765077114105, "logits/rejected": 1.8180525302886963, "logps/chosen": -616.3333740234375, "logps/rejected": -841.8905029296875, "loss": 0.1454, "rewards/accuracies": 0.875, "rewards/chosen": -5.160834789276123, "rewards/margins": 10.548736572265625, "rewards/rejected": -15.709571838378906, "step": 776 }, { "epoch": 0.4833592534992224, "grad_norm": 10.158196449279785, "learning_rate": 2.8699861687413557e-06, "logits/chosen": -0.5885164737701416, "logits/rejected": 1.4068222045898438, "logps/chosen": -518.8425903320312, "logps/rejected": -749.74755859375, "loss": 0.2502, "rewards/accuracies": 0.875, "rewards/chosen": -4.375752925872803, "rewards/margins": 9.246931076049805, "rewards/rejected": -13.622684478759766, "step": 777 }, { "epoch": 0.4839813374805599, "grad_norm": 9.828699111938477, "learning_rate": 2.8665283540802214e-06, "logits/chosen": -2.80334734916687, "logits/rejected": -0.6855421662330627, "logps/chosen": -415.15399169921875, "logps/rejected": -608.0079956054688, "loss": 0.5089, "rewards/accuracies": 0.875, "rewards/chosen": -2.1512112617492676, "rewards/margins": 8.332806587219238, "rewards/rejected": -10.484017372131348, "step": 778 }, { "epoch": 0.48460342146189733, "grad_norm": 0.020104996860027313, "learning_rate": 2.863070539419087e-06, "logits/chosen": -0.26311540603637695, "logits/rejected": 1.6968138217926025, "logps/chosen": -548.4723510742188, "logps/rejected": -806.7774047851562, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -3.5867953300476074, "rewards/margins": 12.3662109375, "rewards/rejected": -15.953006744384766, "step": 779 }, { "epoch": 0.48522550544323484, "grad_norm": 1.6705970764160156, "learning_rate": 2.859612724757953e-06, "logits/chosen": -2.711519956588745, "logits/rejected": 1.2743061780929565, "logps/chosen": -221.45947265625, "logps/rejected": -569.3951416015625, "loss": 0.0642, "rewards/accuracies": 1.0, "rewards/chosen": -2.608492851257324, "rewards/margins": 9.633746147155762, "rewards/rejected": -12.24223804473877, "step": 780 }, { "epoch": 0.4858475894245723, "grad_norm": 15.589944839477539, "learning_rate": 2.856154910096819e-06, "logits/chosen": -3.2950150966644287, "logits/rejected": -0.21658968925476074, "logps/chosen": -430.476318359375, "logps/rejected": -726.1863403320312, "loss": 0.8515, "rewards/accuracies": 0.75, "rewards/chosen": -2.9831349849700928, "rewards/margins": 7.939026355743408, "rewards/rejected": -10.922162055969238, "step": 781 }, { "epoch": 0.4864696734059098, "grad_norm": 4.748611927032471, "learning_rate": 2.852697095435685e-06, "logits/chosen": -1.3626803159713745, "logits/rejected": -0.0053140223026275635, "logps/chosen": -393.5351867675781, "logps/rejected": -619.867919921875, "loss": 0.0529, "rewards/accuracies": 1.0, "rewards/chosen": -4.194700241088867, "rewards/margins": 10.407540321350098, "rewards/rejected": -14.602241516113281, "step": 782 }, { "epoch": 0.4870917573872473, "grad_norm": 4.190652370452881, "learning_rate": 2.8492392807745506e-06, "logits/chosen": -0.6324089765548706, "logits/rejected": 1.4899488687515259, "logps/chosen": -555.5341796875, "logps/rejected": -828.612548828125, "loss": 0.0485, "rewards/accuracies": 1.0, "rewards/chosen": -3.7724788188934326, "rewards/margins": 11.813754081726074, "rewards/rejected": -15.586231231689453, "step": 783 }, { "epoch": 0.48771384136858476, "grad_norm": 8.359914779663086, "learning_rate": 2.8457814661134163e-06, "logits/chosen": -0.8883829712867737, "logits/rejected": -0.06685575842857361, "logps/chosen": -490.02667236328125, "logps/rejected": -592.2490234375, "loss": 0.2264, "rewards/accuracies": 0.875, "rewards/chosen": -1.3829407691955566, "rewards/margins": 6.736806392669678, "rewards/rejected": -8.119747161865234, "step": 784 }, { "epoch": 0.48833592534992226, "grad_norm": 3.7186858654022217, "learning_rate": 2.8423236514522824e-06, "logits/chosen": -1.1268842220306396, "logits/rejected": 2.134249687194824, "logps/chosen": -386.8497009277344, "logps/rejected": -650.5824584960938, "loss": 0.0572, "rewards/accuracies": 1.0, "rewards/chosen": -1.8763618469238281, "rewards/margins": 9.1633939743042, "rewards/rejected": -11.039754867553711, "step": 785 }, { "epoch": 0.4889580093312597, "grad_norm": 0.5301943421363831, "learning_rate": 2.838865836791148e-06, "logits/chosen": -0.7772079706192017, "logits/rejected": 2.2791929244995117, "logps/chosen": -533.16357421875, "logps/rejected": -863.53564453125, "loss": 0.011, "rewards/accuracies": 1.0, "rewards/chosen": -4.438951015472412, "rewards/margins": 12.406270027160645, "rewards/rejected": -16.8452205657959, "step": 786 }, { "epoch": 0.4895800933125972, "grad_norm": 6.3787031173706055, "learning_rate": 2.835408022130014e-06, "logits/chosen": -2.0595245361328125, "logits/rejected": 0.7076666355133057, "logps/chosen": -410.786865234375, "logps/rejected": -756.104248046875, "loss": 0.1526, "rewards/accuracies": 0.875, "rewards/chosen": -4.039379596710205, "rewards/margins": 9.901930809020996, "rewards/rejected": -13.94131088256836, "step": 787 }, { "epoch": 0.49020217729393467, "grad_norm": 8.942523002624512, "learning_rate": 2.83195020746888e-06, "logits/chosen": 0.44868871569633484, "logits/rejected": 1.887967824935913, "logps/chosen": -528.84765625, "logps/rejected": -766.2671508789062, "loss": 0.4412, "rewards/accuracies": 0.875, "rewards/chosen": -2.996544122695923, "rewards/margins": 10.245563507080078, "rewards/rejected": -13.242107391357422, "step": 788 }, { "epoch": 0.4908242612752722, "grad_norm": 0.5475978255271912, "learning_rate": 2.8284923928077455e-06, "logits/chosen": -1.6582798957824707, "logits/rejected": 1.4793846607208252, "logps/chosen": -411.8215637207031, "logps/rejected": -714.6986083984375, "loss": 0.0106, "rewards/accuracies": 1.0, "rewards/chosen": -3.391521692276001, "rewards/margins": 6.855010032653809, "rewards/rejected": -10.24653148651123, "step": 789 }, { "epoch": 0.49144634525660963, "grad_norm": 4.045506477355957, "learning_rate": 2.8250345781466116e-06, "logits/chosen": -1.488791584968567, "logits/rejected": 1.132124662399292, "logps/chosen": -430.41253662109375, "logps/rejected": -642.586669921875, "loss": 0.0691, "rewards/accuracies": 1.0, "rewards/chosen": -2.0482382774353027, "rewards/margins": 6.873682975769043, "rewards/rejected": -8.921920776367188, "step": 790 }, { "epoch": 0.49206842923794714, "grad_norm": 7.579257965087891, "learning_rate": 2.8215767634854773e-06, "logits/chosen": -1.8691233396530151, "logits/rejected": 1.362309217453003, "logps/chosen": -409.72991943359375, "logps/rejected": -721.4398193359375, "loss": 0.396, "rewards/accuracies": 0.875, "rewards/chosen": -3.7302517890930176, "rewards/margins": 10.259403228759766, "rewards/rejected": -13.989654541015625, "step": 791 }, { "epoch": 0.4926905132192846, "grad_norm": 0.5340853929519653, "learning_rate": 2.8181189488243434e-06, "logits/chosen": -1.4742560386657715, "logits/rejected": 1.7601102590560913, "logps/chosen": -431.6163330078125, "logps/rejected": -751.2813720703125, "loss": 0.0064, "rewards/accuracies": 1.0, "rewards/chosen": -1.5576989650726318, "rewards/margins": 12.938612937927246, "rewards/rejected": -14.496312141418457, "step": 792 }, { "epoch": 0.4933125972006221, "grad_norm": 0.1092253029346466, "learning_rate": 2.814661134163209e-06, "logits/chosen": -1.6014957427978516, "logits/rejected": 2.109055519104004, "logps/chosen": -504.5906982421875, "logps/rejected": -825.5761108398438, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -3.674488067626953, "rewards/margins": 11.491287231445312, "rewards/rejected": -15.165776252746582, "step": 793 }, { "epoch": 0.49393468118195955, "grad_norm": 11.238272666931152, "learning_rate": 2.811203319502075e-06, "logits/chosen": 0.309939444065094, "logits/rejected": 1.5785530805587769, "logps/chosen": -622.827392578125, "logps/rejected": -785.8017578125, "loss": 0.2583, "rewards/accuracies": 0.875, "rewards/chosen": -3.5898451805114746, "rewards/margins": 8.61373519897461, "rewards/rejected": -12.203580856323242, "step": 794 }, { "epoch": 0.49455676516329705, "grad_norm": 9.126264572143555, "learning_rate": 2.807745504840941e-06, "logits/chosen": -0.623596727848053, "logits/rejected": 0.754486083984375, "logps/chosen": -558.5323486328125, "logps/rejected": -683.68017578125, "loss": 0.1748, "rewards/accuracies": 0.875, "rewards/chosen": -5.036941051483154, "rewards/margins": 7.129949569702148, "rewards/rejected": -12.166891098022461, "step": 795 }, { "epoch": 0.4951788491446345, "grad_norm": 3.4543616771698, "learning_rate": 2.8042876901798065e-06, "logits/chosen": -0.9819114208221436, "logits/rejected": 2.404608964920044, "logps/chosen": -561.914306640625, "logps/rejected": -845.6032104492188, "loss": 0.0462, "rewards/accuracies": 1.0, "rewards/chosen": -2.8010685443878174, "rewards/margins": 9.483663558959961, "rewards/rejected": -12.284732818603516, "step": 796 }, { "epoch": 0.495800933125972, "grad_norm": 0.570750892162323, "learning_rate": 2.8008298755186726e-06, "logits/chosen": 0.6256065964698792, "logits/rejected": 2.7131636142730713, "logps/chosen": -457.27850341796875, "logps/rejected": -782.610595703125, "loss": 0.0121, "rewards/accuracies": 1.0, "rewards/chosen": -3.076456069946289, "rewards/margins": 9.601327896118164, "rewards/rejected": -12.677783966064453, "step": 797 }, { "epoch": 0.49642301710730946, "grad_norm": 0.427304208278656, "learning_rate": 2.7973720608575382e-06, "logits/chosen": -1.1039800643920898, "logits/rejected": -0.006636232137680054, "logps/chosen": -526.756103515625, "logps/rejected": -747.43896484375, "loss": 0.0111, "rewards/accuracies": 1.0, "rewards/chosen": -4.508908271789551, "rewards/margins": 6.627223968505859, "rewards/rejected": -11.13613224029541, "step": 798 }, { "epoch": 0.49704510108864697, "grad_norm": 0.00059564906405285, "learning_rate": 2.7939142461964043e-06, "logits/chosen": -2.549201011657715, "logits/rejected": 2.146068811416626, "logps/chosen": -322.14947509765625, "logps/rejected": -767.48779296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.4301600456237793, "rewards/margins": 12.822122573852539, "rewards/rejected": -15.252283096313477, "step": 799 }, { "epoch": 0.4976671850699845, "grad_norm": 0.7203413844108582, "learning_rate": 2.79045643153527e-06, "logits/chosen": 0.7414020299911499, "logits/rejected": 1.9987093210220337, "logps/chosen": -565.46435546875, "logps/rejected": -656.9459228515625, "loss": 0.0116, "rewards/accuracies": 1.0, "rewards/chosen": -1.9139114618301392, "rewards/margins": 8.487006187438965, "rewards/rejected": -10.400918006896973, "step": 800 }, { "epoch": 0.4982892690513219, "grad_norm": 7.429201126098633, "learning_rate": 2.7869986168741357e-06, "logits/chosen": 1.6278167963027954, "logits/rejected": 2.699859142303467, "logps/chosen": -628.646240234375, "logps/rejected": -835.98291015625, "loss": 0.1307, "rewards/accuracies": 0.875, "rewards/chosen": -3.4962618350982666, "rewards/margins": 9.729076385498047, "rewards/rejected": -13.225337982177734, "step": 801 }, { "epoch": 0.49891135303265943, "grad_norm": 11.731837272644043, "learning_rate": 2.7835408022130018e-06, "logits/chosen": -1.9982085227966309, "logits/rejected": 0.4577261209487915, "logps/chosen": -594.568359375, "logps/rejected": -803.7784423828125, "loss": 0.3556, "rewards/accuracies": 0.875, "rewards/chosen": -4.50863790512085, "rewards/margins": 7.490248680114746, "rewards/rejected": -11.998886108398438, "step": 802 }, { "epoch": 0.4995334370139969, "grad_norm": 2.473478317260742, "learning_rate": 2.7800829875518675e-06, "logits/chosen": -3.159956693649292, "logits/rejected": 1.165968894958496, "logps/chosen": -349.6506042480469, "logps/rejected": -766.5775756835938, "loss": 0.0273, "rewards/accuracies": 1.0, "rewards/chosen": -2.1038320064544678, "rewards/margins": 9.515385627746582, "rewards/rejected": -11.619217872619629, "step": 803 }, { "epoch": 0.5001555209953343, "grad_norm": 1.9162797927856445, "learning_rate": 2.7766251728907336e-06, "logits/chosen": -2.3587303161621094, "logits/rejected": 1.609816551208496, "logps/chosen": -485.0361328125, "logps/rejected": -871.1732177734375, "loss": 0.0282, "rewards/accuracies": 1.0, "rewards/chosen": -1.919492483139038, "rewards/margins": 12.516715049743652, "rewards/rejected": -14.43620777130127, "step": 804 }, { "epoch": 0.5007776049766719, "grad_norm": 6.1779656410217285, "learning_rate": 2.7731673582295992e-06, "logits/chosen": 0.05069756507873535, "logits/rejected": 1.1996898651123047, "logps/chosen": -558.810791015625, "logps/rejected": -653.5657348632812, "loss": 0.1226, "rewards/accuracies": 1.0, "rewards/chosen": -3.332866907119751, "rewards/margins": 7.561882972717285, "rewards/rejected": -10.89474868774414, "step": 805 }, { "epoch": 0.5013996889580093, "grad_norm": 1.0146476030349731, "learning_rate": 2.769709543568465e-06, "logits/chosen": -1.699363350868225, "logits/rejected": 1.2182319164276123, "logps/chosen": -485.5576171875, "logps/rejected": -809.6729125976562, "loss": 0.0109, "rewards/accuracies": 1.0, "rewards/chosen": -4.574933052062988, "rewards/margins": 10.753710746765137, "rewards/rejected": -15.328643798828125, "step": 806 }, { "epoch": 0.5020217729393468, "grad_norm": 2.711418628692627, "learning_rate": 2.766251728907331e-06, "logits/chosen": -0.4189668893814087, "logits/rejected": 1.5042582750320435, "logps/chosen": -536.84619140625, "logps/rejected": -708.64453125, "loss": 0.0541, "rewards/accuracies": 1.0, "rewards/chosen": -4.095189094543457, "rewards/margins": 6.650837421417236, "rewards/rejected": -10.746026992797852, "step": 807 }, { "epoch": 0.5026438569206843, "grad_norm": 8.901546478271484, "learning_rate": 2.7627939142461967e-06, "logits/chosen": 2.3976192474365234, "logits/rejected": 3.605376958847046, "logps/chosen": -573.814453125, "logps/rejected": -738.7477416992188, "loss": 0.2412, "rewards/accuracies": 0.875, "rewards/chosen": -2.7594799995422363, "rewards/margins": 8.838155746459961, "rewards/rejected": -11.597636222839355, "step": 808 }, { "epoch": 0.5032659409020218, "grad_norm": 0.5574727058410645, "learning_rate": 2.7593360995850628e-06, "logits/chosen": -2.5843729972839355, "logits/rejected": 1.1001543998718262, "logps/chosen": -294.521240234375, "logps/rejected": -644.4393310546875, "loss": 0.007, "rewards/accuracies": 1.0, "rewards/chosen": -2.826786518096924, "rewards/margins": 9.965702056884766, "rewards/rejected": -12.792489051818848, "step": 809 }, { "epoch": 0.5038880248833593, "grad_norm": 0.1372232884168625, "learning_rate": 2.7558782849239285e-06, "logits/chosen": 1.056994080543518, "logits/rejected": 2.3178138732910156, "logps/chosen": -457.7174072265625, "logps/rejected": -593.1788940429688, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -3.552973747253418, "rewards/margins": 9.540002822875977, "rewards/rejected": -13.092977523803711, "step": 810 }, { "epoch": 0.5045101088646967, "grad_norm": 6.776548385620117, "learning_rate": 2.7524204702627945e-06, "logits/chosen": 1.1314094066619873, "logits/rejected": 3.2914435863494873, "logps/chosen": -508.5345153808594, "logps/rejected": -773.8414306640625, "loss": 0.0637, "rewards/accuracies": 1.0, "rewards/chosen": -2.3169822692871094, "rewards/margins": 7.499330520629883, "rewards/rejected": -9.816312789916992, "step": 811 }, { "epoch": 0.5051321928460342, "grad_norm": 0.07891335338354111, "learning_rate": 2.7489626556016602e-06, "logits/chosen": -2.199852705001831, "logits/rejected": 2.191945791244507, "logps/chosen": -414.4356384277344, "logps/rejected": -819.935791015625, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -1.6840360164642334, "rewards/margins": 12.948060989379883, "rewards/rejected": -14.632099151611328, "step": 812 }, { "epoch": 0.5057542768273717, "grad_norm": 0.008467848412692547, "learning_rate": 2.745504840940526e-06, "logits/chosen": 0.09926819801330566, "logits/rejected": 2.0967259407043457, "logps/chosen": -471.5517272949219, "logps/rejected": -796.74560546875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.6652488708496094, "rewards/margins": 13.726076126098633, "rewards/rejected": -17.391324996948242, "step": 813 }, { "epoch": 0.5063763608087092, "grad_norm": 0.000800020236056298, "learning_rate": 2.742047026279392e-06, "logits/chosen": -1.5399856567382812, "logits/rejected": 1.1752564907073975, "logps/chosen": -357.0132141113281, "logps/rejected": -669.0941162109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.916733741760254, "rewards/margins": 13.277435302734375, "rewards/rejected": -15.194169044494629, "step": 814 }, { "epoch": 0.5069984447900466, "grad_norm": 9.437519073486328, "learning_rate": 2.7385892116182577e-06, "logits/chosen": -0.48492613434791565, "logits/rejected": 2.44058895111084, "logps/chosen": -497.6107177734375, "logps/rejected": -754.484375, "loss": 0.1833, "rewards/accuracies": 0.875, "rewards/chosen": -2.773843765258789, "rewards/margins": 9.508381843566895, "rewards/rejected": -12.282225608825684, "step": 815 }, { "epoch": 0.5076205287713841, "grad_norm": 15.652241706848145, "learning_rate": 2.7351313969571238e-06, "logits/chosen": -0.3112896680831909, "logits/rejected": 1.3046232461929321, "logps/chosen": -602.7271728515625, "logps/rejected": -873.5218505859375, "loss": 0.5127, "rewards/accuracies": 0.875, "rewards/chosen": -4.92423677444458, "rewards/margins": 12.000670433044434, "rewards/rejected": -16.924907684326172, "step": 816 }, { "epoch": 0.5082426127527216, "grad_norm": 3.9567806720733643, "learning_rate": 2.731673582295989e-06, "logits/chosen": -3.552175521850586, "logits/rejected": 0.9606392979621887, "logps/chosen": -361.5174560546875, "logps/rejected": -808.1981201171875, "loss": 0.065, "rewards/accuracies": 1.0, "rewards/chosen": -2.6411123275756836, "rewards/margins": 10.58466625213623, "rewards/rejected": -13.225777626037598, "step": 817 }, { "epoch": 0.5088646967340591, "grad_norm": 7.375206470489502, "learning_rate": 2.7282157676348547e-06, "logits/chosen": -0.39027291536331177, "logits/rejected": 1.7532305717468262, "logps/chosen": -550.9006958007812, "logps/rejected": -785.936279296875, "loss": 0.1531, "rewards/accuracies": 0.875, "rewards/chosen": -2.7712013721466064, "rewards/margins": 10.460086822509766, "rewards/rejected": -13.23128890991211, "step": 818 }, { "epoch": 0.5094867807153965, "grad_norm": 4.345978260040283, "learning_rate": 2.724757952973721e-06, "logits/chosen": 0.1867896020412445, "logits/rejected": 1.9367432594299316, "logps/chosen": -536.241455078125, "logps/rejected": -747.6674194335938, "loss": 0.0621, "rewards/accuracies": 1.0, "rewards/chosen": -5.238348007202148, "rewards/margins": 10.422866821289062, "rewards/rejected": -15.661214828491211, "step": 819 }, { "epoch": 0.5101088646967341, "grad_norm": 0.014316629618406296, "learning_rate": 2.7213001383125865e-06, "logits/chosen": 1.7345190048217773, "logits/rejected": 2.815479278564453, "logps/chosen": -717.8068237304688, "logps/rejected": -872.4273681640625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -2.637908935546875, "rewards/margins": 12.473396301269531, "rewards/rejected": -15.111305236816406, "step": 820 }, { "epoch": 0.5107309486780716, "grad_norm": 0.28350508213043213, "learning_rate": 2.717842323651452e-06, "logits/chosen": -0.010654203593730927, "logits/rejected": 1.3988823890686035, "logps/chosen": -535.4199829101562, "logps/rejected": -760.4036865234375, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": -4.253536701202393, "rewards/margins": 8.09384536743164, "rewards/rejected": -12.347381591796875, "step": 821 }, { "epoch": 0.511353032659409, "grad_norm": 0.046211905777454376, "learning_rate": 2.7143845089903182e-06, "logits/chosen": -2.913323402404785, "logits/rejected": 0.12961331009864807, "logps/chosen": -335.9776306152344, "logps/rejected": -664.2984619140625, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -2.7074403762817383, "rewards/margins": 12.56165885925293, "rewards/rejected": -15.269099235534668, "step": 822 }, { "epoch": 0.5119751166407465, "grad_norm": 0.2673683166503906, "learning_rate": 2.710926694329184e-06, "logits/chosen": -1.3703092336654663, "logits/rejected": 1.3017255067825317, "logps/chosen": -471.1501770019531, "logps/rejected": -743.62646484375, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": -1.9759536981582642, "rewards/margins": 12.309216499328613, "rewards/rejected": -14.28516960144043, "step": 823 }, { "epoch": 0.512597200622084, "grad_norm": 5.155447959899902, "learning_rate": 2.70746887966805e-06, "logits/chosen": -0.7723738551139832, "logits/rejected": 2.415383815765381, "logps/chosen": -553.797119140625, "logps/rejected": -785.2659912109375, "loss": 0.0811, "rewards/accuracies": 1.0, "rewards/chosen": -3.648770332336426, "rewards/margins": 8.483132362365723, "rewards/rejected": -12.131902694702148, "step": 824 }, { "epoch": 0.5132192846034215, "grad_norm": 4.68203067779541, "learning_rate": 2.7040110650069157e-06, "logits/chosen": -0.7407625913619995, "logits/rejected": 1.3362473249435425, "logps/chosen": -485.90423583984375, "logps/rejected": -665.4288940429688, "loss": 0.0833, "rewards/accuracies": 1.0, "rewards/chosen": -3.0578763484954834, "rewards/margins": 7.216861724853516, "rewards/rejected": -10.274737358093262, "step": 825 }, { "epoch": 0.5138413685847589, "grad_norm": 0.4478921592235565, "learning_rate": 2.7005532503457814e-06, "logits/chosen": -0.5786160230636597, "logits/rejected": 2.3705639839172363, "logps/chosen": -381.42755126953125, "logps/rejected": -725.095947265625, "loss": 0.0073, "rewards/accuracies": 1.0, "rewards/chosen": -2.703498363494873, "rewards/margins": 7.871484279632568, "rewards/rejected": -10.574983596801758, "step": 826 }, { "epoch": 0.5144634525660964, "grad_norm": 3.3229613304138184, "learning_rate": 2.6970954356846475e-06, "logits/chosen": -2.104731321334839, "logits/rejected": 1.7328236103057861, "logps/chosen": -441.4781494140625, "logps/rejected": -749.4255981445312, "loss": 0.1355, "rewards/accuracies": 0.875, "rewards/chosen": -1.9731550216674805, "rewards/margins": 9.722774505615234, "rewards/rejected": -11.695928573608398, "step": 827 }, { "epoch": 0.5150855365474339, "grad_norm": 0.9683803915977478, "learning_rate": 2.693637621023513e-06, "logits/chosen": 0.022723138332366943, "logits/rejected": 3.365255832672119, "logps/chosen": -473.0911865234375, "logps/rejected": -787.535400390625, "loss": 0.0135, "rewards/accuracies": 1.0, "rewards/chosen": -2.1985089778900146, "rewards/margins": 10.764823913574219, "rewards/rejected": -12.963332176208496, "step": 828 }, { "epoch": 0.5157076205287714, "grad_norm": 0.2308468073606491, "learning_rate": 2.6901798063623792e-06, "logits/chosen": -2.1939449310302734, "logits/rejected": 0.38754504919052124, "logps/chosen": -365.70513916015625, "logps/rejected": -651.8352661132812, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/chosen": -3.5250089168548584, "rewards/margins": 10.350025177001953, "rewards/rejected": -13.875033378601074, "step": 829 }, { "epoch": 0.5163297045101088, "grad_norm": 4.103233814239502, "learning_rate": 2.686721991701245e-06, "logits/chosen": -0.6309167146682739, "logits/rejected": 2.2770798206329346, "logps/chosen": -623.141845703125, "logps/rejected": -918.5201416015625, "loss": 0.0986, "rewards/accuracies": 0.875, "rewards/chosen": -3.1182034015655518, "rewards/margins": 8.902615547180176, "rewards/rejected": -12.020819664001465, "step": 830 }, { "epoch": 0.5169517884914463, "grad_norm": 1.2431247234344482, "learning_rate": 2.6832641770401106e-06, "logits/chosen": -3.549687385559082, "logits/rejected": 1.8573200702667236, "logps/chosen": -289.8443298339844, "logps/rejected": -666.53271484375, "loss": 0.0142, "rewards/accuracies": 1.0, "rewards/chosen": -1.1627719402313232, "rewards/margins": 12.153861999511719, "rewards/rejected": -13.316635131835938, "step": 831 }, { "epoch": 0.5175738724727839, "grad_norm": 0.9992474913597107, "learning_rate": 2.6798063623789767e-06, "logits/chosen": -1.2823498249053955, "logits/rejected": 2.66428279876709, "logps/chosen": -345.2854309082031, "logps/rejected": -754.6093139648438, "loss": 0.0166, "rewards/accuracies": 1.0, "rewards/chosen": -2.7585716247558594, "rewards/margins": 9.262018203735352, "rewards/rejected": -12.020590782165527, "step": 832 }, { "epoch": 0.5181959564541213, "grad_norm": 6.244182586669922, "learning_rate": 2.6763485477178423e-06, "logits/chosen": -0.21039354801177979, "logits/rejected": 1.3505587577819824, "logps/chosen": -500.20703125, "logps/rejected": -751.662353515625, "loss": 0.0866, "rewards/accuracies": 0.875, "rewards/chosen": -2.2685859203338623, "rewards/margins": 13.37126350402832, "rewards/rejected": -15.639849662780762, "step": 833 }, { "epoch": 0.5188180404354588, "grad_norm": 0.06849804520606995, "learning_rate": 2.6728907330567084e-06, "logits/chosen": -1.825751781463623, "logits/rejected": 1.9243922233581543, "logps/chosen": -398.87762451171875, "logps/rejected": -791.7412719726562, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -2.2089247703552246, "rewards/margins": 9.484277725219727, "rewards/rejected": -11.693202018737793, "step": 834 }, { "epoch": 0.5194401244167963, "grad_norm": 1.5089011192321777, "learning_rate": 2.669432918395574e-06, "logits/chosen": -2.7231011390686035, "logits/rejected": 0.5460342168807983, "logps/chosen": -343.23443603515625, "logps/rejected": -621.1883544921875, "loss": 0.0231, "rewards/accuracies": 1.0, "rewards/chosen": -2.675900459289551, "rewards/margins": 9.028830528259277, "rewards/rejected": -11.704730987548828, "step": 835 }, { "epoch": 0.5200622083981338, "grad_norm": 6.8844804763793945, "learning_rate": 2.6659751037344402e-06, "logits/chosen": 0.4192070960998535, "logits/rejected": 1.4707965850830078, "logps/chosen": -548.3880615234375, "logps/rejected": -748.7999267578125, "loss": 0.1626, "rewards/accuracies": 0.875, "rewards/chosen": -4.771030426025391, "rewards/margins": 12.450023651123047, "rewards/rejected": -17.221054077148438, "step": 836 }, { "epoch": 0.5206842923794712, "grad_norm": 0.09701990336179733, "learning_rate": 2.662517289073306e-06, "logits/chosen": -1.566151738166809, "logits/rejected": 0.31144726276397705, "logps/chosen": -483.73626708984375, "logps/rejected": -778.809814453125, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -1.827867031097412, "rewards/margins": 14.15554428100586, "rewards/rejected": -15.98341178894043, "step": 837 }, { "epoch": 0.5213063763608087, "grad_norm": 12.408698081970215, "learning_rate": 2.6590594744121716e-06, "logits/chosen": -0.27712106704711914, "logits/rejected": 1.919643521308899, "logps/chosen": -461.52276611328125, "logps/rejected": -627.791259765625, "loss": 0.5104, "rewards/accuracies": 0.875, "rewards/chosen": -4.413634300231934, "rewards/margins": 5.872857570648193, "rewards/rejected": -10.286492347717285, "step": 838 }, { "epoch": 0.5219284603421462, "grad_norm": 0.00045895209768787026, "learning_rate": 2.6556016597510377e-06, "logits/chosen": -1.4491337537765503, "logits/rejected": 3.076049327850342, "logps/chosen": -378.77294921875, "logps/rejected": -831.9737548828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.050514578819275, "rewards/margins": 17.44437599182129, "rewards/rejected": -18.494890213012695, "step": 839 }, { "epoch": 0.5225505443234837, "grad_norm": 14.928807258605957, "learning_rate": 2.6521438450899033e-06, "logits/chosen": 0.9747679829597473, "logits/rejected": 2.411466598510742, "logps/chosen": -541.9779052734375, "logps/rejected": -713.09130859375, "loss": 0.5398, "rewards/accuracies": 0.875, "rewards/chosen": -4.0559468269348145, "rewards/margins": 8.678203582763672, "rewards/rejected": -12.734149932861328, "step": 840 }, { "epoch": 0.5231726283048211, "grad_norm": 5.6416850090026855, "learning_rate": 2.6486860304287694e-06, "logits/chosen": 0.17006933689117432, "logits/rejected": 1.9439977407455444, "logps/chosen": -456.9559020996094, "logps/rejected": -633.7489624023438, "loss": 0.2057, "rewards/accuracies": 0.875, "rewards/chosen": -3.2416391372680664, "rewards/margins": 6.754123210906982, "rewards/rejected": -9.99576187133789, "step": 841 }, { "epoch": 0.5237947122861586, "grad_norm": 7.3606486320495605, "learning_rate": 2.645228215767635e-06, "logits/chosen": -1.4521063566207886, "logits/rejected": 1.2681409120559692, "logps/chosen": -501.98138427734375, "logps/rejected": -745.1168823242188, "loss": 0.1333, "rewards/accuracies": 0.875, "rewards/chosen": -3.713022232055664, "rewards/margins": 8.915688514709473, "rewards/rejected": -12.62870979309082, "step": 842 }, { "epoch": 0.5244167962674962, "grad_norm": 12.82140064239502, "learning_rate": 2.6417704011065008e-06, "logits/chosen": 1.06523859500885, "logits/rejected": 1.8307665586471558, "logps/chosen": -653.0620727539062, "logps/rejected": -770.46875, "loss": 0.9752, "rewards/accuracies": 0.875, "rewards/chosen": -2.875339984893799, "rewards/margins": 8.946435928344727, "rewards/rejected": -11.821775436401367, "step": 843 }, { "epoch": 0.5250388802488336, "grad_norm": 9.69031047821045, "learning_rate": 2.638312586445367e-06, "logits/chosen": -0.11086753010749817, "logits/rejected": 2.0056729316711426, "logps/chosen": -570.9319458007812, "logps/rejected": -779.7575073242188, "loss": 0.2135, "rewards/accuracies": 0.875, "rewards/chosen": -3.8729212284088135, "rewards/margins": 9.544912338256836, "rewards/rejected": -13.41783332824707, "step": 844 }, { "epoch": 0.5256609642301711, "grad_norm": 13.867210388183594, "learning_rate": 2.6348547717842326e-06, "logits/chosen": -0.5404865145683289, "logits/rejected": 2.7610225677490234, "logps/chosen": -467.103759765625, "logps/rejected": -711.236083984375, "loss": 0.7646, "rewards/accuracies": 0.75, "rewards/chosen": -4.120180130004883, "rewards/margins": 4.436974048614502, "rewards/rejected": -8.557153701782227, "step": 845 }, { "epoch": 0.5262830482115085, "grad_norm": 0.048817217350006104, "learning_rate": 2.6313969571230987e-06, "logits/chosen": -2.87780499458313, "logits/rejected": 0.2316511571407318, "logps/chosen": -372.46282958984375, "logps/rejected": -712.667236328125, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -2.56717586517334, "rewards/margins": 12.469561576843262, "rewards/rejected": -15.036737442016602, "step": 846 }, { "epoch": 0.5269051321928461, "grad_norm": 0.4714060127735138, "learning_rate": 2.6279391424619643e-06, "logits/chosen": -1.1185495853424072, "logits/rejected": -0.8957788944244385, "logps/chosen": -569.531494140625, "logps/rejected": -704.3427734375, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": -4.398099899291992, "rewards/margins": 11.039609909057617, "rewards/rejected": -15.437707901000977, "step": 847 }, { "epoch": 0.5275272161741835, "grad_norm": 0.008700737729668617, "learning_rate": 2.62448132780083e-06, "logits/chosen": -2.9681105613708496, "logits/rejected": 0.9155920743942261, "logps/chosen": -414.8067626953125, "logps/rejected": -866.390380859375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -2.7064781188964844, "rewards/margins": 10.940406799316406, "rewards/rejected": -13.64688491821289, "step": 848 }, { "epoch": 0.528149300155521, "grad_norm": 6.896342754364014, "learning_rate": 2.621023513139696e-06, "logits/chosen": 1.28227698802948, "logits/rejected": 2.7109546661376953, "logps/chosen": -530.7550048828125, "logps/rejected": -707.4141845703125, "loss": 0.1451, "rewards/accuracies": 0.875, "rewards/chosen": -2.8157849311828613, "rewards/margins": 11.002253532409668, "rewards/rejected": -13.818038940429688, "step": 849 }, { "epoch": 0.5287713841368584, "grad_norm": 5.201210975646973, "learning_rate": 2.6175656984785618e-06, "logits/chosen": -3.4094748497009277, "logits/rejected": 1.0206577777862549, "logps/chosen": -313.08587646484375, "logps/rejected": -691.5407104492188, "loss": 0.0796, "rewards/accuracies": 1.0, "rewards/chosen": -3.367981433868408, "rewards/margins": 10.737176895141602, "rewards/rejected": -14.105157852172852, "step": 850 }, { "epoch": 0.529393468118196, "grad_norm": 7.505495548248291, "learning_rate": 2.614107883817428e-06, "logits/chosen": -0.42962461709976196, "logits/rejected": 1.3082517385482788, "logps/chosen": -547.2723388671875, "logps/rejected": -762.7313232421875, "loss": 0.139, "rewards/accuracies": 0.875, "rewards/chosen": -2.9786899089813232, "rewards/margins": 7.981723785400391, "rewards/rejected": -10.960413932800293, "step": 851 }, { "epoch": 0.5300155520995334, "grad_norm": 8.54387378692627, "learning_rate": 2.6106500691562935e-06, "logits/chosen": -1.2395621538162231, "logits/rejected": 0.3543500304222107, "logps/chosen": -351.17572021484375, "logps/rejected": -489.43084716796875, "loss": 0.2457, "rewards/accuracies": 0.875, "rewards/chosen": -3.139695644378662, "rewards/margins": 4.403896331787109, "rewards/rejected": -7.54359245300293, "step": 852 }, { "epoch": 0.5306376360808709, "grad_norm": 0.07706073671579361, "learning_rate": 2.6071922544951596e-06, "logits/chosen": -0.3109931945800781, "logits/rejected": 2.3791139125823975, "logps/chosen": -429.270263671875, "logps/rejected": -738.830810546875, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -2.6534030437469482, "rewards/margins": 11.84909439086914, "rewards/rejected": -14.502498626708984, "step": 853 }, { "epoch": 0.5312597200622085, "grad_norm": 5.891592025756836, "learning_rate": 2.6037344398340253e-06, "logits/chosen": -3.6216237545013428, "logits/rejected": 1.3730096817016602, "logps/chosen": -319.1530456542969, "logps/rejected": -783.1572265625, "loss": 0.7166, "rewards/accuracies": 0.875, "rewards/chosen": -2.449763774871826, "rewards/margins": 12.064699172973633, "rewards/rejected": -14.514463424682617, "step": 854 }, { "epoch": 0.5318818040435459, "grad_norm": 2.1987998485565186, "learning_rate": 2.600276625172891e-06, "logits/chosen": -0.5740342140197754, "logits/rejected": 2.157291889190674, "logps/chosen": -501.46893310546875, "logps/rejected": -728.6790161132812, "loss": 0.0326, "rewards/accuracies": 1.0, "rewards/chosen": -5.783341407775879, "rewards/margins": 6.7000579833984375, "rewards/rejected": -12.483400344848633, "step": 855 }, { "epoch": 0.5325038880248834, "grad_norm": 0.6568763852119446, "learning_rate": 2.596818810511757e-06, "logits/chosen": -3.2539010047912598, "logits/rejected": 0.30289357900619507, "logps/chosen": -359.56201171875, "logps/rejected": -730.9524536132812, "loss": 0.009, "rewards/accuracies": 1.0, "rewards/chosen": -2.252196788787842, "rewards/margins": 10.027812957763672, "rewards/rejected": -12.280010223388672, "step": 856 }, { "epoch": 0.5331259720062208, "grad_norm": 0.3170051872730255, "learning_rate": 2.5933609958506228e-06, "logits/chosen": -0.21149927377700806, "logits/rejected": 2.0923194885253906, "logps/chosen": -459.89569091796875, "logps/rejected": -696.9242553710938, "loss": 0.0054, "rewards/accuracies": 1.0, "rewards/chosen": -4.000885963439941, "rewards/margins": 8.608548164367676, "rewards/rejected": -12.6094331741333, "step": 857 }, { "epoch": 0.5337480559875584, "grad_norm": 6.321094512939453, "learning_rate": 2.589903181189489e-06, "logits/chosen": -3.521068811416626, "logits/rejected": 1.0143897533416748, "logps/chosen": -350.3503723144531, "logps/rejected": -828.6209716796875, "loss": 0.1277, "rewards/accuracies": 0.875, "rewards/chosen": -1.6417231559753418, "rewards/margins": 10.194904327392578, "rewards/rejected": -11.836627960205078, "step": 858 }, { "epoch": 0.5343701399688958, "grad_norm": 0.9227604269981384, "learning_rate": 2.5864453665283545e-06, "logits/chosen": -2.747382402420044, "logits/rejected": 0.81141197681427, "logps/chosen": -450.5257873535156, "logps/rejected": -781.0877685546875, "loss": 0.0159, "rewards/accuracies": 1.0, "rewards/chosen": -3.2049224376678467, "rewards/margins": 12.344651222229004, "rewards/rejected": -15.549572944641113, "step": 859 }, { "epoch": 0.5349922239502333, "grad_norm": 0.646002471446991, "learning_rate": 2.58298755186722e-06, "logits/chosen": -0.34163740277290344, "logits/rejected": 1.6976463794708252, "logps/chosen": -425.6351623535156, "logps/rejected": -671.9130859375, "loss": 0.015, "rewards/accuracies": 1.0, "rewards/chosen": -1.9945714473724365, "rewards/margins": 10.540374755859375, "rewards/rejected": -12.534945487976074, "step": 860 }, { "epoch": 0.5356143079315707, "grad_norm": 12.107315063476562, "learning_rate": 2.5795297372060863e-06, "logits/chosen": -0.1147310733795166, "logits/rejected": 0.5523694753646851, "logps/chosen": -568.417236328125, "logps/rejected": -681.3090209960938, "loss": 0.731, "rewards/accuracies": 0.875, "rewards/chosen": -3.572232723236084, "rewards/margins": 6.6808576583862305, "rewards/rejected": -10.253089904785156, "step": 861 }, { "epoch": 0.5362363919129083, "grad_norm": 0.06052294746041298, "learning_rate": 2.5760719225449516e-06, "logits/chosen": -1.6789189577102661, "logits/rejected": 1.4593360424041748, "logps/chosen": -537.2366333007812, "logps/rejected": -811.0751953125, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -3.784329414367676, "rewards/margins": 10.294975280761719, "rewards/rejected": -14.079305648803711, "step": 862 }, { "epoch": 0.5368584758942457, "grad_norm": 0.09776905179023743, "learning_rate": 2.5726141078838172e-06, "logits/chosen": -0.8672104477882385, "logits/rejected": 2.042964458465576, "logps/chosen": -443.274658203125, "logps/rejected": -795.3909912109375, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -4.659024715423584, "rewards/margins": 11.183073043823242, "rewards/rejected": -15.842098236083984, "step": 863 }, { "epoch": 0.5374805598755832, "grad_norm": 0.33768558502197266, "learning_rate": 2.5691562932226833e-06, "logits/chosen": -1.0540237426757812, "logits/rejected": 1.0383448600769043, "logps/chosen": -492.3240966796875, "logps/rejected": -716.623291015625, "loss": 0.0067, "rewards/accuracies": 1.0, "rewards/chosen": -3.312061071395874, "rewards/margins": 10.526165008544922, "rewards/rejected": -13.838226318359375, "step": 864 }, { "epoch": 0.5381026438569206, "grad_norm": 1.926538348197937, "learning_rate": 2.565698478561549e-06, "logits/chosen": -2.2433245182037354, "logits/rejected": 1.6043426990509033, "logps/chosen": -480.3562927246094, "logps/rejected": -820.168701171875, "loss": 0.0243, "rewards/accuracies": 1.0, "rewards/chosen": -3.0641226768493652, "rewards/margins": 9.685659408569336, "rewards/rejected": -12.74978256225586, "step": 865 }, { "epoch": 0.5387247278382582, "grad_norm": 0.011444531381130219, "learning_rate": 2.562240663900415e-06, "logits/chosen": -1.7229702472686768, "logits/rejected": 1.2502331733703613, "logps/chosen": -456.4857177734375, "logps/rejected": -811.71337890625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.344876527786255, "rewards/margins": 11.691463470458984, "rewards/rejected": -15.03633975982666, "step": 866 }, { "epoch": 0.5393468118195957, "grad_norm": 6.426474094390869, "learning_rate": 2.5587828492392808e-06, "logits/chosen": -1.3821085691452026, "logits/rejected": 1.5565470457077026, "logps/chosen": -505.5169982910156, "logps/rejected": -775.5040283203125, "loss": 0.1333, "rewards/accuracies": 0.875, "rewards/chosen": -2.230504035949707, "rewards/margins": 11.363288879394531, "rewards/rejected": -13.593793869018555, "step": 867 }, { "epoch": 0.5399688958009331, "grad_norm": 9.2164945602417, "learning_rate": 2.5553250345781465e-06, "logits/chosen": -1.1115256547927856, "logits/rejected": 0.9305959343910217, "logps/chosen": -500.7325439453125, "logps/rejected": -704.6104736328125, "loss": 0.3637, "rewards/accuracies": 0.875, "rewards/chosen": -4.713256359100342, "rewards/margins": 8.120781898498535, "rewards/rejected": -12.834038734436035, "step": 868 }, { "epoch": 0.5405909797822706, "grad_norm": 1.1244754791259766, "learning_rate": 2.5518672199170125e-06, "logits/chosen": -0.3649643659591675, "logits/rejected": 1.9490447044372559, "logps/chosen": -514.34228515625, "logps/rejected": -746.3378295898438, "loss": 0.017, "rewards/accuracies": 1.0, "rewards/chosen": -1.323508858680725, "rewards/margins": 10.549452781677246, "rewards/rejected": -11.87296199798584, "step": 869 }, { "epoch": 0.5412130637636081, "grad_norm": 5.828649044036865, "learning_rate": 2.5484094052558782e-06, "logits/chosen": -0.8659449815750122, "logits/rejected": 0.6496329307556152, "logps/chosen": -494.5976257324219, "logps/rejected": -729.111572265625, "loss": 0.1109, "rewards/accuracies": 1.0, "rewards/chosen": -4.437461853027344, "rewards/margins": 9.776254653930664, "rewards/rejected": -14.213716506958008, "step": 870 }, { "epoch": 0.5418351477449456, "grad_norm": 5.77534818649292, "learning_rate": 2.5449515905947443e-06, "logits/chosen": -0.6894445419311523, "logits/rejected": 1.7411448955535889, "logps/chosen": -574.1994018554688, "logps/rejected": -786.432861328125, "loss": 0.0876, "rewards/accuracies": 1.0, "rewards/chosen": -5.043036460876465, "rewards/margins": 7.578890323638916, "rewards/rejected": -12.621925354003906, "step": 871 }, { "epoch": 0.542457231726283, "grad_norm": 3.227311849594116, "learning_rate": 2.54149377593361e-06, "logits/chosen": -0.23890534043312073, "logits/rejected": 1.4069501161575317, "logps/chosen": -462.05804443359375, "logps/rejected": -714.2733154296875, "loss": 0.1329, "rewards/accuracies": 0.875, "rewards/chosen": -2.9602527618408203, "rewards/margins": 9.44040298461914, "rewards/rejected": -12.400655746459961, "step": 872 }, { "epoch": 0.5430793157076206, "grad_norm": 2.8627521991729736, "learning_rate": 2.5380359612724757e-06, "logits/chosen": -4.590663909912109, "logits/rejected": 0.10111960768699646, "logps/chosen": -287.8867492675781, "logps/rejected": -679.6129760742188, "loss": 0.0433, "rewards/accuracies": 1.0, "rewards/chosen": -1.8607137203216553, "rewards/margins": 13.212772369384766, "rewards/rejected": -15.073486328125, "step": 873 }, { "epoch": 0.543701399688958, "grad_norm": 0.5004326701164246, "learning_rate": 2.5345781466113418e-06, "logits/chosen": -2.824276924133301, "logits/rejected": 0.1807805299758911, "logps/chosen": -371.7115173339844, "logps/rejected": -623.52783203125, "loss": 0.0086, "rewards/accuracies": 1.0, "rewards/chosen": -1.6533328294754028, "rewards/margins": 8.8392333984375, "rewards/rejected": -10.492565155029297, "step": 874 }, { "epoch": 0.5443234836702955, "grad_norm": 5.830672740936279, "learning_rate": 2.5311203319502074e-06, "logits/chosen": -0.15256795287132263, "logits/rejected": 2.1012825965881348, "logps/chosen": -352.7666015625, "logps/rejected": -575.0555419921875, "loss": 0.1312, "rewards/accuracies": 0.875, "rewards/chosen": -2.337549924850464, "rewards/margins": 6.870055675506592, "rewards/rejected": -9.207605361938477, "step": 875 }, { "epoch": 0.5449455676516329, "grad_norm": 13.315792083740234, "learning_rate": 2.5276625172890735e-06, "logits/chosen": -1.0077438354492188, "logits/rejected": 2.0275118350982666, "logps/chosen": -463.268310546875, "logps/rejected": -760.4005737304688, "loss": 0.5088, "rewards/accuracies": 0.875, "rewards/chosen": -3.1664927005767822, "rewards/margins": 8.820655822753906, "rewards/rejected": -11.98714828491211, "step": 876 }, { "epoch": 0.5455676516329705, "grad_norm": 0.2639257609844208, "learning_rate": 2.5242047026279392e-06, "logits/chosen": 1.2172627449035645, "logits/rejected": 2.52079701423645, "logps/chosen": -578.4287719726562, "logps/rejected": -832.2799072265625, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": -1.6825138330459595, "rewards/margins": 11.32406997680664, "rewards/rejected": -13.006584167480469, "step": 877 }, { "epoch": 0.546189735614308, "grad_norm": 8.06668758392334, "learning_rate": 2.5207468879668053e-06, "logits/chosen": -2.826172351837158, "logits/rejected": 0.775820255279541, "logps/chosen": -428.5758056640625, "logps/rejected": -856.8690795898438, "loss": 0.1286, "rewards/accuracies": 0.875, "rewards/chosen": -3.64652156829834, "rewards/margins": 12.721778869628906, "rewards/rejected": -16.368301391601562, "step": 878 }, { "epoch": 0.5468118195956454, "grad_norm": 4.591684341430664, "learning_rate": 2.517289073305671e-06, "logits/chosen": 0.5131932497024536, "logits/rejected": 2.220463514328003, "logps/chosen": -591.7940673828125, "logps/rejected": -819.0051879882812, "loss": 0.0488, "rewards/accuracies": 1.0, "rewards/chosen": -2.937981605529785, "rewards/margins": 12.01667308807373, "rewards/rejected": -14.954654693603516, "step": 879 }, { "epoch": 0.5474339035769828, "grad_norm": 0.06477294117212296, "learning_rate": 2.5138312586445367e-06, "logits/chosen": -0.206301748752594, "logits/rejected": 0.819780707359314, "logps/chosen": -518.1549072265625, "logps/rejected": -717.0284423828125, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -2.507136821746826, "rewards/margins": 12.646249771118164, "rewards/rejected": -15.153387069702148, "step": 880 }, { "epoch": 0.5480559875583204, "grad_norm": 0.11177181452512741, "learning_rate": 2.5103734439834028e-06, "logits/chosen": -1.1164665222167969, "logits/rejected": 1.3287062644958496, "logps/chosen": -471.8994140625, "logps/rejected": -736.11962890625, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -2.8170151710510254, "rewards/margins": 12.250486373901367, "rewards/rejected": -15.067501068115234, "step": 881 }, { "epoch": 0.5486780715396579, "grad_norm": 7.308872699737549, "learning_rate": 2.5069156293222684e-06, "logits/chosen": -1.803208827972412, "logits/rejected": 0.13363532721996307, "logps/chosen": -390.02667236328125, "logps/rejected": -569.3712768554688, "loss": 0.1831, "rewards/accuracies": 0.875, "rewards/chosen": -1.5463545322418213, "rewards/margins": 5.758317470550537, "rewards/rejected": -7.304671764373779, "step": 882 }, { "epoch": 0.5493001555209953, "grad_norm": 0.11784283071756363, "learning_rate": 2.5034578146611345e-06, "logits/chosen": 0.04986089468002319, "logits/rejected": 1.6071568727493286, "logps/chosen": -552.369140625, "logps/rejected": -666.7308959960938, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -1.4538521766662598, "rewards/margins": 11.05957317352295, "rewards/rejected": -12.513423919677734, "step": 883 }, { "epoch": 0.5499222395023328, "grad_norm": 2.948369264602661, "learning_rate": 2.5e-06, "logits/chosen": 1.4801064729690552, "logits/rejected": 2.4523019790649414, "logps/chosen": -635.5760498046875, "logps/rejected": -740.2680053710938, "loss": 0.0419, "rewards/accuracies": 1.0, "rewards/chosen": -3.980482339859009, "rewards/margins": 10.87419319152832, "rewards/rejected": -14.85467529296875, "step": 884 }, { "epoch": 0.5505443234836703, "grad_norm": 0.08998950570821762, "learning_rate": 2.496542185338866e-06, "logits/chosen": -3.0749266147613525, "logits/rejected": 0.651315450668335, "logps/chosen": -404.6993103027344, "logps/rejected": -823.4277954101562, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -1.666795015335083, "rewards/margins": 13.710150718688965, "rewards/rejected": -15.376945495605469, "step": 885 }, { "epoch": 0.5511664074650078, "grad_norm": 0.035672981292009354, "learning_rate": 2.493084370677732e-06, "logits/chosen": -1.1393064260482788, "logits/rejected": 2.1426377296447754, "logps/chosen": -485.33123779296875, "logps/rejected": -869.30712890625, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -2.454486846923828, "rewards/margins": 13.281171798706055, "rewards/rejected": -15.735658645629883, "step": 886 }, { "epoch": 0.5517884914463452, "grad_norm": 17.292354583740234, "learning_rate": 2.4896265560165977e-06, "logits/chosen": -0.9754741191864014, "logits/rejected": 0.3162088990211487, "logps/chosen": -560.6351318359375, "logps/rejected": -738.3533935546875, "loss": 1.1643, "rewards/accuracies": 0.875, "rewards/chosen": -6.016690731048584, "rewards/margins": 7.822524070739746, "rewards/rejected": -13.839214324951172, "step": 887 }, { "epoch": 0.5524105754276827, "grad_norm": 0.02428634651005268, "learning_rate": 2.4861687413554637e-06, "logits/chosen": -0.15613193809986115, "logits/rejected": 1.2891194820404053, "logps/chosen": -534.3314819335938, "logps/rejected": -782.0521240234375, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -3.5727314949035645, "rewards/margins": 11.879975318908691, "rewards/rejected": -15.452707290649414, "step": 888 }, { "epoch": 0.5530326594090202, "grad_norm": 1.5632928609848022, "learning_rate": 2.4827109266943294e-06, "logits/chosen": -2.433758497238159, "logits/rejected": 2.1800589561462402, "logps/chosen": -446.082275390625, "logps/rejected": -807.4172973632812, "loss": 0.0199, "rewards/accuracies": 1.0, "rewards/chosen": -4.2988739013671875, "rewards/margins": 8.319612503051758, "rewards/rejected": -12.618485450744629, "step": 889 }, { "epoch": 0.5536547433903577, "grad_norm": 0.27046501636505127, "learning_rate": 2.479253112033195e-06, "logits/chosen": -1.328334093093872, "logits/rejected": 1.9604742527008057, "logps/chosen": -403.48675537109375, "logps/rejected": -742.0716552734375, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": -0.22146163880825043, "rewards/margins": 10.216853141784668, "rewards/rejected": -10.438314437866211, "step": 890 }, { "epoch": 0.5542768273716951, "grad_norm": 4.106342315673828, "learning_rate": 2.475795297372061e-06, "logits/chosen": -1.0387437343597412, "logits/rejected": -0.37412524223327637, "logps/chosen": -549.7869262695312, "logps/rejected": -696.0029296875, "loss": 0.0779, "rewards/accuracies": 1.0, "rewards/chosen": -2.4418954849243164, "rewards/margins": 9.401920318603516, "rewards/rejected": -11.843816757202148, "step": 891 }, { "epoch": 0.5548989113530327, "grad_norm": 0.16096417605876923, "learning_rate": 2.472337482710927e-06, "logits/chosen": 0.5760812163352966, "logits/rejected": 2.233522891998291, "logps/chosen": -566.0213012695312, "logps/rejected": -812.1668701171875, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -3.0801172256469727, "rewards/margins": 12.648391723632812, "rewards/rejected": -15.728508949279785, "step": 892 }, { "epoch": 0.5555209953343702, "grad_norm": 10.417119979858398, "learning_rate": 2.468879668049793e-06, "logits/chosen": -0.49470365047454834, "logits/rejected": 1.0202226638793945, "logps/chosen": -552.6075439453125, "logps/rejected": -738.5296020507812, "loss": 0.5363, "rewards/accuracies": 0.875, "rewards/chosen": -3.7938928604125977, "rewards/margins": 10.599638938903809, "rewards/rejected": -14.393532752990723, "step": 893 }, { "epoch": 0.5561430793157076, "grad_norm": 9.355098724365234, "learning_rate": 2.4654218533886586e-06, "logits/chosen": -0.35855603218078613, "logits/rejected": 1.0310938358306885, "logps/chosen": -527.212646484375, "logps/rejected": -698.5621948242188, "loss": 0.2663, "rewards/accuracies": 0.875, "rewards/chosen": -2.820709228515625, "rewards/margins": 6.412031650543213, "rewards/rejected": -9.23274040222168, "step": 894 }, { "epoch": 0.5567651632970451, "grad_norm": 4.323993682861328, "learning_rate": 2.4619640387275247e-06, "logits/chosen": 1.1287932395935059, "logits/rejected": 2.0206797122955322, "logps/chosen": -575.0306396484375, "logps/rejected": -678.2618408203125, "loss": 0.126, "rewards/accuracies": 0.875, "rewards/chosen": -5.9033942222595215, "rewards/margins": 6.85048770904541, "rewards/rejected": -12.753881454467773, "step": 895 }, { "epoch": 0.5573872472783826, "grad_norm": 0.01519110519438982, "learning_rate": 2.45850622406639e-06, "logits/chosen": -0.03569376468658447, "logits/rejected": 0.7555198669433594, "logps/chosen": -560.6253662109375, "logps/rejected": -858.6680908203125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -3.9908604621887207, "rewards/margins": 15.717243194580078, "rewards/rejected": -19.70810317993164, "step": 896 }, { "epoch": 0.5580093312597201, "grad_norm": 0.49905604124069214, "learning_rate": 2.455048409405256e-06, "logits/chosen": -1.075939416885376, "logits/rejected": 1.8443858623504639, "logps/chosen": -492.8347473144531, "logps/rejected": -776.7230224609375, "loss": 0.0074, "rewards/accuracies": 1.0, "rewards/chosen": -2.3185954093933105, "rewards/margins": 11.480352401733398, "rewards/rejected": -13.798949241638184, "step": 897 }, { "epoch": 0.5586314152410575, "grad_norm": 0.9247255325317383, "learning_rate": 2.4515905947441218e-06, "logits/chosen": -1.3443939685821533, "logits/rejected": 1.2595183849334717, "logps/chosen": -386.6767272949219, "logps/rejected": -607.3983154296875, "loss": 0.0217, "rewards/accuracies": 1.0, "rewards/chosen": -1.7378156185150146, "rewards/margins": 12.297340393066406, "rewards/rejected": -14.03515625, "step": 898 }, { "epoch": 0.559253499222395, "grad_norm": 0.4893006682395935, "learning_rate": 2.448132780082988e-06, "logits/chosen": -2.4501609802246094, "logits/rejected": 1.5746978521347046, "logps/chosen": -334.2552490234375, "logps/rejected": -776.4136962890625, "loss": 0.0069, "rewards/accuracies": 1.0, "rewards/chosen": -2.1820054054260254, "rewards/margins": 13.61904239654541, "rewards/rejected": -15.801048278808594, "step": 899 }, { "epoch": 0.5598755832037325, "grad_norm": 0.35524627566337585, "learning_rate": 2.4446749654218535e-06, "logits/chosen": -1.3581421375274658, "logits/rejected": 1.503446102142334, "logps/chosen": -350.68756103515625, "logps/rejected": -698.3187866210938, "loss": 0.0053, "rewards/accuracies": 1.0, "rewards/chosen": -2.8667004108428955, "rewards/margins": 9.694417953491211, "rewards/rejected": -12.561119079589844, "step": 900 }, { "epoch": 0.56049766718507, "grad_norm": 0.8566383123397827, "learning_rate": 2.441217150760719e-06, "logits/chosen": -1.4916125535964966, "logits/rejected": 1.7033089399337769, "logps/chosen": -432.06591796875, "logps/rejected": -785.703857421875, "loss": 0.0097, "rewards/accuracies": 1.0, "rewards/chosen": -1.8376469612121582, "rewards/margins": 13.935076713562012, "rewards/rejected": -15.772723197937012, "step": 901 }, { "epoch": 0.5611197511664074, "grad_norm": 9.154833793640137, "learning_rate": 2.4377593360995853e-06, "logits/chosen": -0.6568223834037781, "logits/rejected": 2.0007829666137695, "logps/chosen": -429.5390930175781, "logps/rejected": -747.906982421875, "loss": 0.2345, "rewards/accuracies": 0.875, "rewards/chosen": -2.6756162643432617, "rewards/margins": 7.854908466339111, "rewards/rejected": -10.530524253845215, "step": 902 }, { "epoch": 0.5617418351477449, "grad_norm": 0.15060357749462128, "learning_rate": 2.434301521438451e-06, "logits/chosen": -1.8662853240966797, "logits/rejected": 2.805867910385132, "logps/chosen": -415.5635070800781, "logps/rejected": -842.9158935546875, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -3.0693788528442383, "rewards/margins": 13.07082748413086, "rewards/rejected": -16.140207290649414, "step": 903 }, { "epoch": 0.5623639191290825, "grad_norm": 0.10793473571538925, "learning_rate": 2.430843706777317e-06, "logits/chosen": -2.8588995933532715, "logits/rejected": 1.5638978481292725, "logps/chosen": -282.64971923828125, "logps/rejected": -643.3683471679688, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -2.078824996948242, "rewards/margins": 11.031987190246582, "rewards/rejected": -13.110812187194824, "step": 904 }, { "epoch": 0.5629860031104199, "grad_norm": 0.3075536787509918, "learning_rate": 2.4273858921161828e-06, "logits/chosen": -0.21243011951446533, "logits/rejected": 1.7905488014221191, "logps/chosen": -541.974365234375, "logps/rejected": -724.8560791015625, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": -2.68098783493042, "rewards/margins": 9.14254093170166, "rewards/rejected": -11.823529243469238, "step": 905 }, { "epoch": 0.5636080870917574, "grad_norm": 2.950749397277832, "learning_rate": 2.4239280774550484e-06, "logits/chosen": -0.7267728447914124, "logits/rejected": 0.6746105551719666, "logps/chosen": -449.3104248046875, "logps/rejected": -653.4066162109375, "loss": 0.0465, "rewards/accuracies": 1.0, "rewards/chosen": -3.4746270179748535, "rewards/margins": 7.832556247711182, "rewards/rejected": -11.307183265686035, "step": 906 }, { "epoch": 0.5642301710730949, "grad_norm": 0.024075526744127274, "learning_rate": 2.4204702627939145e-06, "logits/chosen": -2.045548915863037, "logits/rejected": 0.9274657368659973, "logps/chosen": -476.3931884765625, "logps/rejected": -829.8742065429688, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -0.04834801331162453, "rewards/margins": 13.953317642211914, "rewards/rejected": -14.001667022705078, "step": 907 }, { "epoch": 0.5648522550544324, "grad_norm": 11.3624267578125, "learning_rate": 2.41701244813278e-06, "logits/chosen": 0.06974712014198303, "logits/rejected": 2.414491653442383, "logps/chosen": -648.5630493164062, "logps/rejected": -851.5100708007812, "loss": 0.2901, "rewards/accuracies": 0.875, "rewards/chosen": -4.853083610534668, "rewards/margins": 9.798460006713867, "rewards/rejected": -14.651543617248535, "step": 908 }, { "epoch": 0.5654743390357698, "grad_norm": 1.0540317296981812, "learning_rate": 2.4135546334716463e-06, "logits/chosen": -2.1795003414154053, "logits/rejected": 1.2089951038360596, "logps/chosen": -350.14599609375, "logps/rejected": -720.2377319335938, "loss": 0.0151, "rewards/accuracies": 1.0, "rewards/chosen": -2.2329182624816895, "rewards/margins": 9.904064178466797, "rewards/rejected": -12.136981964111328, "step": 909 }, { "epoch": 0.5660964230171073, "grad_norm": 8.097391128540039, "learning_rate": 2.410096818810512e-06, "logits/chosen": 1.1211110353469849, "logits/rejected": 2.235106945037842, "logps/chosen": -638.3709716796875, "logps/rejected": -825.66845703125, "loss": 0.1978, "rewards/accuracies": 0.875, "rewards/chosen": -3.5421481132507324, "rewards/margins": 10.459745407104492, "rewards/rejected": -14.001893997192383, "step": 910 }, { "epoch": 0.5667185069984448, "grad_norm": 11.999796867370605, "learning_rate": 2.4066390041493776e-06, "logits/chosen": -0.8166235089302063, "logits/rejected": 1.3394523859024048, "logps/chosen": -460.5404052734375, "logps/rejected": -646.5728759765625, "loss": 0.3837, "rewards/accuracies": 0.875, "rewards/chosen": -3.3040881156921387, "rewards/margins": 9.247632026672363, "rewards/rejected": -12.551719665527344, "step": 911 }, { "epoch": 0.5673405909797823, "grad_norm": 0.41875317692756653, "learning_rate": 2.4031811894882437e-06, "logits/chosen": -1.4202826023101807, "logits/rejected": 1.2137165069580078, "logps/chosen": -486.0645446777344, "logps/rejected": -723.7294921875, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": -2.864779233932495, "rewards/margins": 11.179669380187988, "rewards/rejected": -14.044448852539062, "step": 912 }, { "epoch": 0.5679626749611197, "grad_norm": 0.7444563508033752, "learning_rate": 2.3997233748271094e-06, "logits/chosen": -0.30865833163261414, "logits/rejected": 2.4436957836151123, "logps/chosen": -403.3447265625, "logps/rejected": -669.7191162109375, "loss": 0.0075, "rewards/accuracies": 1.0, "rewards/chosen": -2.6807522773742676, "rewards/margins": 11.850944519042969, "rewards/rejected": -14.531697273254395, "step": 913 }, { "epoch": 0.5685847589424572, "grad_norm": 0.2764438986778259, "learning_rate": 2.3962655601659755e-06, "logits/chosen": -2.432281494140625, "logits/rejected": 2.463907241821289, "logps/chosen": -394.365966796875, "logps/rejected": -823.5703125, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": -1.666998267173767, "rewards/margins": 9.656478881835938, "rewards/rejected": -11.323477745056152, "step": 914 }, { "epoch": 0.5692068429237948, "grad_norm": 9.435851097106934, "learning_rate": 2.392807745504841e-06, "logits/chosen": 0.5731032490730286, "logits/rejected": 0.5877424478530884, "logps/chosen": -634.8724975585938, "logps/rejected": -734.4451904296875, "loss": 0.2176, "rewards/accuracies": 0.875, "rewards/chosen": -6.02477502822876, "rewards/margins": 8.32628345489502, "rewards/rejected": -14.351058959960938, "step": 915 }, { "epoch": 0.5698289269051322, "grad_norm": 1.024430513381958, "learning_rate": 2.3893499308437073e-06, "logits/chosen": -0.8742019534111023, "logits/rejected": 1.6324412822723389, "logps/chosen": -520.28857421875, "logps/rejected": -813.2391967773438, "loss": 0.0146, "rewards/accuracies": 1.0, "rewards/chosen": -3.1112823486328125, "rewards/margins": 14.393203735351562, "rewards/rejected": -17.504486083984375, "step": 916 }, { "epoch": 0.5704510108864697, "grad_norm": 2.020036220550537, "learning_rate": 2.385892116182573e-06, "logits/chosen": -2.9983139038085938, "logits/rejected": 0.779320478439331, "logps/chosen": -278.71044921875, "logps/rejected": -614.6688842773438, "loss": 0.0897, "rewards/accuracies": 1.0, "rewards/chosen": -2.8076725006103516, "rewards/margins": 10.896881103515625, "rewards/rejected": -13.704553604125977, "step": 917 }, { "epoch": 0.5710730948678071, "grad_norm": 1.5248711109161377, "learning_rate": 2.3824343015214386e-06, "logits/chosen": -2.7555947303771973, "logits/rejected": -0.41062498092651367, "logps/chosen": -386.8077087402344, "logps/rejected": -660.2780151367188, "loss": 0.0257, "rewards/accuracies": 1.0, "rewards/chosen": -2.5401782989501953, "rewards/margins": 10.47075080871582, "rewards/rejected": -13.0109281539917, "step": 918 }, { "epoch": 0.5716951788491447, "grad_norm": 0.25369125604629517, "learning_rate": 2.3789764868603043e-06, "logits/chosen": 0.30348581075668335, "logits/rejected": 1.256783127784729, "logps/chosen": -553.3196411132812, "logps/rejected": -723.69580078125, "loss": 0.0052, "rewards/accuracies": 1.0, "rewards/chosen": -1.670626163482666, "rewards/margins": 11.499987602233887, "rewards/rejected": -13.170614242553711, "step": 919 }, { "epoch": 0.5723172628304821, "grad_norm": 6.726269721984863, "learning_rate": 2.3755186721991704e-06, "logits/chosen": -0.8130156993865967, "logits/rejected": 1.6541866064071655, "logps/chosen": -435.51348876953125, "logps/rejected": -713.2994384765625, "loss": 0.1022, "rewards/accuracies": 0.875, "rewards/chosen": -3.384213447570801, "rewards/margins": 5.489146709442139, "rewards/rejected": -8.873360633850098, "step": 920 }, { "epoch": 0.5729393468118196, "grad_norm": 0.0059307897463440895, "learning_rate": 2.372060857538036e-06, "logits/chosen": -0.8209717869758606, "logits/rejected": 2.44582462310791, "logps/chosen": -556.219482421875, "logps/rejected": -900.2086181640625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.3726115226745605, "rewards/margins": 13.122572898864746, "rewards/rejected": -16.49518585205078, "step": 921 }, { "epoch": 0.573561430793157, "grad_norm": 1.0664347410202026, "learning_rate": 2.3686030428769018e-06, "logits/chosen": 0.48277348279953003, "logits/rejected": 2.3433890342712402, "logps/chosen": -566.5703125, "logps/rejected": -795.5602416992188, "loss": 0.0142, "rewards/accuracies": 1.0, "rewards/chosen": -3.159879207611084, "rewards/margins": 9.724858283996582, "rewards/rejected": -12.884737968444824, "step": 922 }, { "epoch": 0.5741835147744946, "grad_norm": 11.237595558166504, "learning_rate": 2.365145228215768e-06, "logits/chosen": -0.32899242639541626, "logits/rejected": 2.189133644104004, "logps/chosen": -568.7750244140625, "logps/rejected": -775.7718505859375, "loss": 0.3179, "rewards/accuracies": 0.875, "rewards/chosen": -5.363801956176758, "rewards/margins": 9.511669158935547, "rewards/rejected": -14.875471115112305, "step": 923 }, { "epoch": 0.574805598755832, "grad_norm": 4.856917858123779, "learning_rate": 2.3616874135546335e-06, "logits/chosen": 0.054648905992507935, "logits/rejected": 1.371339201927185, "logps/chosen": -578.970458984375, "logps/rejected": -760.6804809570312, "loss": 0.0566, "rewards/accuracies": 1.0, "rewards/chosen": -4.0121331214904785, "rewards/margins": 11.35140323638916, "rewards/rejected": -15.363536834716797, "step": 924 }, { "epoch": 0.5754276827371695, "grad_norm": 0.10243432968854904, "learning_rate": 2.3582295988934996e-06, "logits/chosen": -1.1398417949676514, "logits/rejected": 2.9489400386810303, "logps/chosen": -275.157958984375, "logps/rejected": -667.268798828125, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -2.0676698684692383, "rewards/margins": 11.796116828918457, "rewards/rejected": -13.863786697387695, "step": 925 }, { "epoch": 0.576049766718507, "grad_norm": 5.961414337158203, "learning_rate": 2.3547717842323653e-06, "logits/chosen": -1.2913240194320679, "logits/rejected": 0.24492591619491577, "logps/chosen": -463.61517333984375, "logps/rejected": -646.5625, "loss": 0.1049, "rewards/accuracies": 0.875, "rewards/chosen": -2.948385000228882, "rewards/margins": 9.325738906860352, "rewards/rejected": -12.274124145507812, "step": 926 }, { "epoch": 0.5766718506998445, "grad_norm": 0.024064254015684128, "learning_rate": 2.351313969571231e-06, "logits/chosen": -0.04731544852256775, "logits/rejected": 2.088932991027832, "logps/chosen": -504.44732666015625, "logps/rejected": -830.69189453125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -3.8135764598846436, "rewards/margins": 12.456151008605957, "rewards/rejected": -16.26972770690918, "step": 927 }, { "epoch": 0.577293934681182, "grad_norm": 0.13935111463069916, "learning_rate": 2.347856154910097e-06, "logits/chosen": -0.8524416089057922, "logits/rejected": 0.5246989130973816, "logps/chosen": -470.7748107910156, "logps/rejected": -705.3029174804688, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -2.855940818786621, "rewards/margins": 12.054895401000977, "rewards/rejected": -14.910837173461914, "step": 928 }, { "epoch": 0.5779160186625194, "grad_norm": 0.02382746711373329, "learning_rate": 2.3443983402489627e-06, "logits/chosen": -0.22496792674064636, "logits/rejected": 0.7089909315109253, "logps/chosen": -624.9590454101562, "logps/rejected": -833.75634765625, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -4.654544353485107, "rewards/margins": 13.828263282775879, "rewards/rejected": -18.482807159423828, "step": 929 }, { "epoch": 0.578538102643857, "grad_norm": 1.6458019018173218, "learning_rate": 2.340940525587829e-06, "logits/chosen": 0.247907817363739, "logits/rejected": 2.3140435218811035, "logps/chosen": -475.192138671875, "logps/rejected": -664.3414306640625, "loss": 0.0242, "rewards/accuracies": 1.0, "rewards/chosen": -4.8816938400268555, "rewards/margins": 8.780475616455078, "rewards/rejected": -13.662169456481934, "step": 930 }, { "epoch": 0.5791601866251944, "grad_norm": 1.0713306665420532, "learning_rate": 2.3374827109266945e-06, "logits/chosen": -3.176450252532959, "logits/rejected": -0.21672849357128143, "logps/chosen": -316.8189697265625, "logps/rejected": -654.0693969726562, "loss": 0.0221, "rewards/accuracies": 1.0, "rewards/chosen": -2.0057554244995117, "rewards/margins": 9.46002197265625, "rewards/rejected": -11.465777397155762, "step": 931 }, { "epoch": 0.5797822706065319, "grad_norm": 0.5485743880271912, "learning_rate": 2.33402489626556e-06, "logits/chosen": -1.9021055698394775, "logits/rejected": 2.378045082092285, "logps/chosen": -399.4111328125, "logps/rejected": -713.87646484375, "loss": 0.0108, "rewards/accuracies": 1.0, "rewards/chosen": -1.736467957496643, "rewards/margins": 8.696184158325195, "rewards/rejected": -10.43265151977539, "step": 932 }, { "epoch": 0.5804043545878693, "grad_norm": 7.9954986572265625, "learning_rate": 2.3305670816044263e-06, "logits/chosen": -0.5097438097000122, "logits/rejected": 1.9277666807174683, "logps/chosen": -491.9041442871094, "logps/rejected": -783.8505859375, "loss": 0.2031, "rewards/accuracies": 0.875, "rewards/chosen": -2.7609102725982666, "rewards/margins": 10.342706680297852, "rewards/rejected": -13.103616714477539, "step": 933 }, { "epoch": 0.5810264385692069, "grad_norm": 3.811490297317505, "learning_rate": 2.327109266943292e-06, "logits/chosen": 0.05574220418930054, "logits/rejected": 2.0727696418762207, "logps/chosen": -595.0008544921875, "logps/rejected": -777.3809204101562, "loss": 0.049, "rewards/accuracies": 1.0, "rewards/chosen": -3.8693017959594727, "rewards/margins": 9.177431106567383, "rewards/rejected": -13.046733856201172, "step": 934 }, { "epoch": 0.5816485225505443, "grad_norm": 18.61717987060547, "learning_rate": 2.323651452282158e-06, "logits/chosen": -2.7799882888793945, "logits/rejected": 0.4935357868671417, "logps/chosen": -395.6555480957031, "logps/rejected": -703.665771484375, "loss": 1.1513, "rewards/accuracies": 0.75, "rewards/chosen": -3.463102102279663, "rewards/margins": 5.696136474609375, "rewards/rejected": -9.159238815307617, "step": 935 }, { "epoch": 0.5822706065318818, "grad_norm": 0.5068051815032959, "learning_rate": 2.3201936376210237e-06, "logits/chosen": -0.7079392671585083, "logits/rejected": 1.3873798847198486, "logps/chosen": -456.39666748046875, "logps/rejected": -738.1734008789062, "loss": 0.0069, "rewards/accuracies": 1.0, "rewards/chosen": -2.9629251956939697, "rewards/margins": 10.879646301269531, "rewards/rejected": -13.842572212219238, "step": 936 }, { "epoch": 0.5828926905132192, "grad_norm": 4.450121879577637, "learning_rate": 2.31673582295989e-06, "logits/chosen": 3.0996994972229004, "logits/rejected": 3.69953989982605, "logps/chosen": -619.3745727539062, "logps/rejected": -881.6199951171875, "loss": 0.0644, "rewards/accuracies": 1.0, "rewards/chosen": -3.0106167793273926, "rewards/margins": 11.582401275634766, "rewards/rejected": -14.593017578125, "step": 937 }, { "epoch": 0.5835147744945568, "grad_norm": 0.5499042272567749, "learning_rate": 2.3132780082987555e-06, "logits/chosen": -3.336367130279541, "logits/rejected": 2.218440055847168, "logps/chosen": -299.4913330078125, "logps/rejected": -760.2181396484375, "loss": 0.0053, "rewards/accuracies": 1.0, "rewards/chosen": -2.346615791320801, "rewards/margins": 11.962839126586914, "rewards/rejected": -14.309455871582031, "step": 938 }, { "epoch": 0.5841368584758942, "grad_norm": 5.769629955291748, "learning_rate": 2.309820193637621e-06, "logits/chosen": -1.503235936164856, "logits/rejected": 0.5956352949142456, "logps/chosen": -508.25152587890625, "logps/rejected": -684.7520141601562, "loss": 0.0867, "rewards/accuracies": 1.0, "rewards/chosen": -5.800031661987305, "rewards/margins": 6.9889631271362305, "rewards/rejected": -12.788993835449219, "step": 939 }, { "epoch": 0.5847589424572317, "grad_norm": 1.5535073280334473, "learning_rate": 2.3063623789764873e-06, "logits/chosen": -1.3261394500732422, "logits/rejected": 1.3361835479736328, "logps/chosen": -488.88885498046875, "logps/rejected": -773.4171142578125, "loss": 0.0425, "rewards/accuracies": 1.0, "rewards/chosen": -3.6209731101989746, "rewards/margins": 9.77686882019043, "rewards/rejected": -13.397842407226562, "step": 940 }, { "epoch": 0.5853810264385692, "grad_norm": 7.691009521484375, "learning_rate": 2.302904564315353e-06, "logits/chosen": -0.5740677118301392, "logits/rejected": 0.75951087474823, "logps/chosen": -618.6478271484375, "logps/rejected": -711.2984008789062, "loss": 0.1657, "rewards/accuracies": 0.875, "rewards/chosen": -7.6014251708984375, "rewards/margins": 6.063532829284668, "rewards/rejected": -13.664958000183105, "step": 941 }, { "epoch": 0.5860031104199067, "grad_norm": 0.618105411529541, "learning_rate": 2.2994467496542186e-06, "logits/chosen": -1.7741047143936157, "logits/rejected": 0.3460093140602112, "logps/chosen": -491.9883117675781, "logps/rejected": -784.67578125, "loss": 0.0051, "rewards/accuracies": 1.0, "rewards/chosen": -5.751840114593506, "rewards/margins": 12.326622009277344, "rewards/rejected": -18.078462600708008, "step": 942 }, { "epoch": 0.5866251944012442, "grad_norm": 0.04738277941942215, "learning_rate": 2.2959889349930843e-06, "logits/chosen": -1.1980443000793457, "logits/rejected": 0.8390300273895264, "logps/chosen": -496.7899475097656, "logps/rejected": -709.0966796875, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -4.968389511108398, "rewards/margins": 9.99521541595459, "rewards/rejected": -14.963605880737305, "step": 943 }, { "epoch": 0.5872472783825816, "grad_norm": 9.651183128356934, "learning_rate": 2.2925311203319504e-06, "logits/chosen": -2.7339141368865967, "logits/rejected": -0.8426005840301514, "logps/chosen": -433.9680480957031, "logps/rejected": -612.5326538085938, "loss": 0.2819, "rewards/accuracies": 0.875, "rewards/chosen": -2.3640666007995605, "rewards/margins": 9.043968200683594, "rewards/rejected": -11.408034324645996, "step": 944 }, { "epoch": 0.5878693623639192, "grad_norm": 0.006478056777268648, "learning_rate": 2.289073305670816e-06, "logits/chosen": -3.4326508045196533, "logits/rejected": 0.8170045018196106, "logps/chosen": -360.94140625, "logps/rejected": -822.6298828125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.845106363296509, "rewards/margins": 13.344867706298828, "rewards/rejected": -17.189973831176758, "step": 945 }, { "epoch": 0.5884914463452566, "grad_norm": 3.6866798400878906, "learning_rate": 2.285615491009682e-06, "logits/chosen": -0.8533228039741516, "logits/rejected": 0.8241989612579346, "logps/chosen": -363.2625732421875, "logps/rejected": -512.415771484375, "loss": 0.0836, "rewards/accuracies": 1.0, "rewards/chosen": -2.0122947692871094, "rewards/margins": 6.548659801483154, "rewards/rejected": -8.560955047607422, "step": 946 }, { "epoch": 0.5891135303265941, "grad_norm": 0.12914109230041504, "learning_rate": 2.282157676348548e-06, "logits/chosen": -0.5658924579620361, "logits/rejected": 1.7090320587158203, "logps/chosen": -451.58453369140625, "logps/rejected": -733.0187377929688, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -2.3764700889587402, "rewards/margins": 11.311630249023438, "rewards/rejected": -13.688100814819336, "step": 947 }, { "epoch": 0.5897356143079315, "grad_norm": 0.013812885619699955, "learning_rate": 2.2786998616874135e-06, "logits/chosen": -2.1015982627868652, "logits/rejected": 3.0763354301452637, "logps/chosen": -358.0718078613281, "logps/rejected": -888.750244140625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -1.399985432624817, "rewards/margins": 14.732809066772461, "rewards/rejected": -16.132795333862305, "step": 948 }, { "epoch": 0.5903576982892691, "grad_norm": 5.073156833648682, "learning_rate": 2.2752420470262796e-06, "logits/chosen": -1.2231485843658447, "logits/rejected": 1.2260279655456543, "logps/chosen": -478.8951721191406, "logps/rejected": -714.345458984375, "loss": 0.0558, "rewards/accuracies": 1.0, "rewards/chosen": -2.0262672901153564, "rewards/margins": 8.799802780151367, "rewards/rejected": -10.826070785522461, "step": 949 }, { "epoch": 0.5909797822706065, "grad_norm": 0.0022258388344198465, "learning_rate": 2.2717842323651453e-06, "logits/chosen": -0.5866354703903198, "logits/rejected": 3.036163568496704, "logps/chosen": -527.0118408203125, "logps/rejected": -915.2420043945312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.9588887691497803, "rewards/margins": 14.165103912353516, "rewards/rejected": -18.123992919921875, "step": 950 }, { "epoch": 0.591601866251944, "grad_norm": 1.224341869354248, "learning_rate": 2.2683264177040114e-06, "logits/chosen": -2.1586315631866455, "logits/rejected": 0.9640223979949951, "logps/chosen": -379.2501220703125, "logps/rejected": -654.7781982421875, "loss": 0.0286, "rewards/accuracies": 1.0, "rewards/chosen": -1.9618911743164062, "rewards/margins": 9.96478271484375, "rewards/rejected": -11.926673889160156, "step": 951 }, { "epoch": 0.5922239502332814, "grad_norm": 0.7204467058181763, "learning_rate": 2.264868603042877e-06, "logits/chosen": 1.3590327501296997, "logits/rejected": 2.024003267288208, "logps/chosen": -716.2952270507812, "logps/rejected": -827.584716796875, "loss": 0.0125, "rewards/accuracies": 1.0, "rewards/chosen": -4.825126647949219, "rewards/margins": 9.794397354125977, "rewards/rejected": -14.619523048400879, "step": 952 }, { "epoch": 0.592846034214619, "grad_norm": 0.035169266164302826, "learning_rate": 2.2614107883817427e-06, "logits/chosen": -0.12697458267211914, "logits/rejected": 0.9327027797698975, "logps/chosen": -597.2501220703125, "logps/rejected": -859.7117919921875, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -3.918300151824951, "rewards/margins": 14.60717487335205, "rewards/rejected": -18.525474548339844, "step": 953 }, { "epoch": 0.5934681181959565, "grad_norm": 7.961827278137207, "learning_rate": 2.257952973720609e-06, "logits/chosen": 0.40050652623176575, "logits/rejected": 2.3102993965148926, "logps/chosen": -546.6370239257812, "logps/rejected": -797.568115234375, "loss": 0.6173, "rewards/accuracies": 0.875, "rewards/chosen": -2.323857545852661, "rewards/margins": 10.235958099365234, "rewards/rejected": -12.559815406799316, "step": 954 }, { "epoch": 0.5940902021772939, "grad_norm": 7.086859703063965, "learning_rate": 2.2544951590594745e-06, "logits/chosen": -2.7076306343078613, "logits/rejected": 0.6084806323051453, "logps/chosen": -422.71832275390625, "logps/rejected": -751.5383911132812, "loss": 0.1656, "rewards/accuracies": 0.875, "rewards/chosen": -2.637875556945801, "rewards/margins": 10.727333068847656, "rewards/rejected": -13.365208625793457, "step": 955 }, { "epoch": 0.5947122861586314, "grad_norm": 7.9145827293396, "learning_rate": 2.2510373443983406e-06, "logits/chosen": -0.059936195611953735, "logits/rejected": 2.4507839679718018, "logps/chosen": -518.3193359375, "logps/rejected": -800.3818969726562, "loss": 0.1211, "rewards/accuracies": 0.875, "rewards/chosen": -4.473962306976318, "rewards/margins": 11.378662109375, "rewards/rejected": -15.85262393951416, "step": 956 }, { "epoch": 0.5953343701399689, "grad_norm": 0.10670798271894455, "learning_rate": 2.2475795297372063e-06, "logits/chosen": -1.1489874124526978, "logits/rejected": 3.4704463481903076, "logps/chosen": -357.69329833984375, "logps/rejected": -868.149169921875, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -1.782266616821289, "rewards/margins": 14.906902313232422, "rewards/rejected": -16.689167022705078, "step": 957 }, { "epoch": 0.5959564541213064, "grad_norm": 5.263230323791504, "learning_rate": 2.2441217150760724e-06, "logits/chosen": -0.21247714757919312, "logits/rejected": -0.16034066677093506, "logps/chosen": -479.7233581542969, "logps/rejected": -602.3734741210938, "loss": 0.1894, "rewards/accuracies": 0.875, "rewards/chosen": -4.4400553703308105, "rewards/margins": 8.499614715576172, "rewards/rejected": -12.939669609069824, "step": 958 }, { "epoch": 0.5965785381026438, "grad_norm": 0.2797870337963104, "learning_rate": 2.240663900414938e-06, "logits/chosen": -1.7176613807678223, "logits/rejected": 1.0218979120254517, "logps/chosen": -394.3291015625, "logps/rejected": -776.91650390625, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": -3.557638645172119, "rewards/margins": 10.68807601928711, "rewards/rejected": -14.24571418762207, "step": 959 }, { "epoch": 0.5972006220839814, "grad_norm": 0.34161534905433655, "learning_rate": 2.2372060857538037e-06, "logits/chosen": -2.0510098934173584, "logits/rejected": 0.6918720006942749, "logps/chosen": -503.19793701171875, "logps/rejected": -831.8006591796875, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": -3.7114973068237305, "rewards/margins": 14.321964263916016, "rewards/rejected": -18.03346061706543, "step": 960 }, { "epoch": 0.5978227060653188, "grad_norm": 4.349493980407715, "learning_rate": 2.23374827109267e-06, "logits/chosen": -0.9193814992904663, "logits/rejected": 2.831984281539917, "logps/chosen": -510.60418701171875, "logps/rejected": -864.6718139648438, "loss": 0.0772, "rewards/accuracies": 1.0, "rewards/chosen": -4.5065765380859375, "rewards/margins": 13.197561264038086, "rewards/rejected": -17.70413589477539, "step": 961 }, { "epoch": 0.5984447900466563, "grad_norm": 0.23090516030788422, "learning_rate": 2.2302904564315355e-06, "logits/chosen": -1.1777265071868896, "logits/rejected": 1.5459868907928467, "logps/chosen": -442.1393737792969, "logps/rejected": -738.650390625, "loss": 0.0068, "rewards/accuracies": 1.0, "rewards/chosen": -3.9812586307525635, "rewards/margins": 10.279169082641602, "rewards/rejected": -14.26042652130127, "step": 962 }, { "epoch": 0.5990668740279937, "grad_norm": 0.6581725478172302, "learning_rate": 2.2268326417704016e-06, "logits/chosen": -0.7561290264129639, "logits/rejected": 1.4954062700271606, "logps/chosen": -509.436767578125, "logps/rejected": -729.263916015625, "loss": 0.0061, "rewards/accuracies": 1.0, "rewards/chosen": -4.434615612030029, "rewards/margins": 9.962506294250488, "rewards/rejected": -14.397120475769043, "step": 963 }, { "epoch": 0.5996889580093313, "grad_norm": 0.04395288601517677, "learning_rate": 2.223374827109267e-06, "logits/chosen": -1.868870735168457, "logits/rejected": 2.168858051300049, "logps/chosen": -338.48150634765625, "logps/rejected": -712.5068359375, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -2.2085118293762207, "rewards/margins": 13.5519380569458, "rewards/rejected": -15.760448455810547, "step": 964 }, { "epoch": 0.6003110419906688, "grad_norm": 0.17721351981163025, "learning_rate": 2.219917012448133e-06, "logits/chosen": 2.095607042312622, "logits/rejected": 2.9207186698913574, "logps/chosen": -562.4119873046875, "logps/rejected": -786.907958984375, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -2.4818902015686035, "rewards/margins": 11.37049674987793, "rewards/rejected": -13.852387428283691, "step": 965 }, { "epoch": 0.6009331259720062, "grad_norm": 11.799866676330566, "learning_rate": 2.2164591977869986e-06, "logits/chosen": 0.31343090534210205, "logits/rejected": 1.316147804260254, "logps/chosen": -584.7515869140625, "logps/rejected": -739.6664428710938, "loss": 0.3049, "rewards/accuracies": 0.875, "rewards/chosen": -4.386332988739014, "rewards/margins": 9.294452667236328, "rewards/rejected": -13.680784225463867, "step": 966 }, { "epoch": 0.6015552099533437, "grad_norm": 1.7393101453781128, "learning_rate": 2.2130013831258647e-06, "logits/chosen": -2.0455565452575684, "logits/rejected": 1.3957821130752563, "logps/chosen": -344.50164794921875, "logps/rejected": -689.9654541015625, "loss": 0.0381, "rewards/accuracies": 1.0, "rewards/chosen": -1.6856659650802612, "rewards/margins": 12.55105209350586, "rewards/rejected": -14.236719131469727, "step": 967 }, { "epoch": 0.6021772939346812, "grad_norm": 0.5300350189208984, "learning_rate": 2.2095435684647304e-06, "logits/chosen": -3.2944722175598145, "logits/rejected": 1.2742276191711426, "logps/chosen": -381.88140869140625, "logps/rejected": -842.5942993164062, "loss": 0.0071, "rewards/accuracies": 1.0, "rewards/chosen": -1.6086153984069824, "rewards/margins": 14.660472869873047, "rewards/rejected": -16.269088745117188, "step": 968 }, { "epoch": 0.6027993779160187, "grad_norm": 0.4497907757759094, "learning_rate": 2.206085753803596e-06, "logits/chosen": 0.09757590293884277, "logits/rejected": 2.2122931480407715, "logps/chosen": -455.0145568847656, "logps/rejected": -642.0513916015625, "loss": 0.0056, "rewards/accuracies": 1.0, "rewards/chosen": -2.022597551345825, "rewards/margins": 9.553382873535156, "rewards/rejected": -11.575981140136719, "step": 969 }, { "epoch": 0.6034214618973561, "grad_norm": 4.407611846923828, "learning_rate": 2.202627939142462e-06, "logits/chosen": -0.13510632514953613, "logits/rejected": 1.6411466598510742, "logps/chosen": -561.638671875, "logps/rejected": -777.2831420898438, "loss": 0.062, "rewards/accuracies": 1.0, "rewards/chosen": -3.822789430618286, "rewards/margins": 12.667469024658203, "rewards/rejected": -16.490259170532227, "step": 970 }, { "epoch": 0.6040435458786936, "grad_norm": 3.6543426513671875, "learning_rate": 2.199170124481328e-06, "logits/chosen": -2.57094669342041, "logits/rejected": 0.4322543740272522, "logps/chosen": -354.1568603515625, "logps/rejected": -594.9813842773438, "loss": 0.0506, "rewards/accuracies": 1.0, "rewards/chosen": -2.272228956222534, "rewards/margins": 9.327291488647461, "rewards/rejected": -11.599519729614258, "step": 971 }, { "epoch": 0.6046656298600311, "grad_norm": 8.426532745361328, "learning_rate": 2.195712309820194e-06, "logits/chosen": -2.2171292304992676, "logits/rejected": 1.6522676944732666, "logps/chosen": -510.5018310546875, "logps/rejected": -809.6028442382812, "loss": 0.1341, "rewards/accuracies": 0.875, "rewards/chosen": -2.431938409805298, "rewards/margins": 11.221406936645508, "rewards/rejected": -13.653345108032227, "step": 972 }, { "epoch": 0.6052877138413686, "grad_norm": 0.2562905550003052, "learning_rate": 2.1922544951590596e-06, "logits/chosen": -2.513306140899658, "logits/rejected": 0.5795740485191345, "logps/chosen": -402.89227294921875, "logps/rejected": -700.706787109375, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": -2.148787021636963, "rewards/margins": 12.809677124023438, "rewards/rejected": -14.958464622497559, "step": 973 }, { "epoch": 0.605909797822706, "grad_norm": 4.476144313812256, "learning_rate": 2.1887966804979253e-06, "logits/chosen": -1.592576026916504, "logits/rejected": 1.5608631372451782, "logps/chosen": -469.10711669921875, "logps/rejected": -875.3367919921875, "loss": 0.0734, "rewards/accuracies": 1.0, "rewards/chosen": -2.9314446449279785, "rewards/margins": 11.062034606933594, "rewards/rejected": -13.993478775024414, "step": 974 }, { "epoch": 0.6065318818040435, "grad_norm": 2.120291233062744, "learning_rate": 2.1853388658367914e-06, "logits/chosen": -4.368592262268066, "logits/rejected": 1.390554428100586, "logps/chosen": -401.1041259765625, "logps/rejected": -922.1307983398438, "loss": 0.063, "rewards/accuracies": 1.0, "rewards/chosen": -3.5193734169006348, "rewards/margins": 10.934623718261719, "rewards/rejected": -14.453995704650879, "step": 975 }, { "epoch": 0.6071539657853811, "grad_norm": 0.0020141329150646925, "learning_rate": 2.181881051175657e-06, "logits/chosen": -0.02676105499267578, "logits/rejected": 2.5707831382751465, "logps/chosen": -556.2714233398438, "logps/rejected": -852.64013671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.4323601722717285, "rewards/margins": 13.307117462158203, "rewards/rejected": -16.739479064941406, "step": 976 }, { "epoch": 0.6077760497667185, "grad_norm": 0.03325255587697029, "learning_rate": 2.178423236514523e-06, "logits/chosen": 0.4494156241416931, "logits/rejected": 2.6505439281463623, "logps/chosen": -504.2938537597656, "logps/rejected": -801.7711791992188, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -3.076378107070923, "rewards/margins": 10.20052719116211, "rewards/rejected": -13.276905059814453, "step": 977 }, { "epoch": 0.608398133748056, "grad_norm": 11.712307929992676, "learning_rate": 2.174965421853389e-06, "logits/chosen": 1.1390999555587769, "logits/rejected": 2.3368515968322754, "logps/chosen": -549.4978637695312, "logps/rejected": -802.3196411132812, "loss": 0.4684, "rewards/accuracies": 0.875, "rewards/chosen": -5.414333820343018, "rewards/margins": 11.2832670211792, "rewards/rejected": -16.697599411010742, "step": 978 }, { "epoch": 0.6090202177293935, "grad_norm": 2.6091272830963135, "learning_rate": 2.171507607192255e-06, "logits/chosen": -3.7168078422546387, "logits/rejected": 0.8818256855010986, "logps/chosen": -328.77923583984375, "logps/rejected": -720.1170043945312, "loss": 0.0472, "rewards/accuracies": 1.0, "rewards/chosen": -2.4112210273742676, "rewards/margins": 10.801239013671875, "rewards/rejected": -13.2124605178833, "step": 979 }, { "epoch": 0.609642301710731, "grad_norm": 10.744643211364746, "learning_rate": 2.1680497925311206e-06, "logits/chosen": -3.72502064704895, "logits/rejected": -0.37451475858688354, "logps/chosen": -356.48126220703125, "logps/rejected": -676.3464965820312, "loss": 0.2687, "rewards/accuracies": 0.875, "rewards/chosen": -2.409928321838379, "rewards/margins": 13.429485321044922, "rewards/rejected": -15.839412689208984, "step": 980 }, { "epoch": 0.6102643856920684, "grad_norm": 12.171833992004395, "learning_rate": 2.1645919778699863e-06, "logits/chosen": 0.8939465880393982, "logits/rejected": 3.534095287322998, "logps/chosen": -618.3985595703125, "logps/rejected": -903.4392700195312, "loss": 0.276, "rewards/accuracies": 0.875, "rewards/chosen": -4.15968656539917, "rewards/margins": 8.485418319702148, "rewards/rejected": -12.64510440826416, "step": 981 }, { "epoch": 0.6108864696734059, "grad_norm": 0.015898270532488823, "learning_rate": 2.1611341632088524e-06, "logits/chosen": -3.235353469848633, "logits/rejected": 1.9901905059814453, "logps/chosen": -420.8085632324219, "logps/rejected": -961.6116943359375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -1.8485291004180908, "rewards/margins": 18.401851654052734, "rewards/rejected": -20.25037956237793, "step": 982 }, { "epoch": 0.6115085536547434, "grad_norm": 0.14086174964904785, "learning_rate": 2.157676348547718e-06, "logits/chosen": -2.6031675338745117, "logits/rejected": -0.18891924619674683, "logps/chosen": -290.99957275390625, "logps/rejected": -572.3098754882812, "loss": 0.0058, "rewards/accuracies": 1.0, "rewards/chosen": -2.729489803314209, "rewards/margins": 8.723758697509766, "rewards/rejected": -11.453248977661133, "step": 983 }, { "epoch": 0.6121306376360809, "grad_norm": 5.122599124908447, "learning_rate": 2.154218533886584e-06, "logits/chosen": -1.8256328105926514, "logits/rejected": 0.9196978211402893, "logps/chosen": -452.5506591796875, "logps/rejected": -799.994873046875, "loss": 0.0585, "rewards/accuracies": 1.0, "rewards/chosen": -2.1646595001220703, "rewards/margins": 14.538305282592773, "rewards/rejected": -16.702964782714844, "step": 984 }, { "epoch": 0.6127527216174183, "grad_norm": 14.85488510131836, "learning_rate": 2.15076071922545e-06, "logits/chosen": -0.7954829931259155, "logits/rejected": 1.5028237104415894, "logps/chosen": -611.6622924804688, "logps/rejected": -878.7798461914062, "loss": 0.4962, "rewards/accuracies": 0.75, "rewards/chosen": -6.676080226898193, "rewards/margins": 12.448995590209961, "rewards/rejected": -19.125076293945312, "step": 985 }, { "epoch": 0.6133748055987558, "grad_norm": 0.023373287171125412, "learning_rate": 2.1473029045643155e-06, "logits/chosen": -0.24543839693069458, "logits/rejected": 3.1470518112182617, "logps/chosen": -404.5933837890625, "logps/rejected": -788.4996948242188, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.287032127380371, "rewards/margins": 16.27682876586914, "rewards/rejected": -18.563861846923828, "step": 986 }, { "epoch": 0.6139968895800934, "grad_norm": 0.08637022972106934, "learning_rate": 2.143845089903181e-06, "logits/chosen": -1.6670886278152466, "logits/rejected": 0.874323308467865, "logps/chosen": -416.7777099609375, "logps/rejected": -664.5081787109375, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -3.6998684406280518, "rewards/margins": 11.14942455291748, "rewards/rejected": -14.849292755126953, "step": 987 }, { "epoch": 0.6146189735614308, "grad_norm": 0.008582806214690208, "learning_rate": 2.1403872752420473e-06, "logits/chosen": -1.2615329027175903, "logits/rejected": 1.1603541374206543, "logps/chosen": -587.0240478515625, "logps/rejected": -862.7952270507812, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -4.335999488830566, "rewards/margins": 12.948976516723633, "rewards/rejected": -17.284976959228516, "step": 988 }, { "epoch": 0.6152410575427683, "grad_norm": 1.9921743869781494, "learning_rate": 2.136929460580913e-06, "logits/chosen": -0.5984097719192505, "logits/rejected": 1.6723387241363525, "logps/chosen": -395.90380859375, "logps/rejected": -695.7029418945312, "loss": 0.0289, "rewards/accuracies": 1.0, "rewards/chosen": -3.595017194747925, "rewards/margins": 11.64875316619873, "rewards/rejected": -15.243770599365234, "step": 989 }, { "epoch": 0.6158631415241057, "grad_norm": 3.9478540420532227, "learning_rate": 2.1334716459197786e-06, "logits/chosen": -0.9224528074264526, "logits/rejected": 0.9235408306121826, "logps/chosen": -551.9451293945312, "logps/rejected": -683.4635620117188, "loss": 0.0605, "rewards/accuracies": 1.0, "rewards/chosen": -2.5700254440307617, "rewards/margins": 9.46286392211914, "rewards/rejected": -12.032890319824219, "step": 990 }, { "epoch": 0.6164852255054433, "grad_norm": 14.989389419555664, "learning_rate": 2.1300138312586447e-06, "logits/chosen": 1.1094504594802856, "logits/rejected": 1.8903937339782715, "logps/chosen": -534.196533203125, "logps/rejected": -656.464111328125, "loss": 0.7482, "rewards/accuracies": 0.75, "rewards/chosen": -3.1404001712799072, "rewards/margins": 7.9133758544921875, "rewards/rejected": -11.053775787353516, "step": 991 }, { "epoch": 0.6171073094867807, "grad_norm": 1.576110601425171, "learning_rate": 2.1265560165975104e-06, "logits/chosen": -2.946510076522827, "logits/rejected": 1.484669804573059, "logps/chosen": -301.953857421875, "logps/rejected": -740.6759033203125, "loss": 0.0403, "rewards/accuracies": 1.0, "rewards/chosen": -3.1644287109375, "rewards/margins": 11.269078254699707, "rewards/rejected": -14.433506965637207, "step": 992 }, { "epoch": 0.6177293934681182, "grad_norm": 4.128149032592773, "learning_rate": 2.1230982019363765e-06, "logits/chosen": 1.161936640739441, "logits/rejected": 1.4278016090393066, "logps/chosen": -592.1834716796875, "logps/rejected": -737.8643188476562, "loss": 0.0818, "rewards/accuracies": 1.0, "rewards/chosen": -3.338054895401001, "rewards/margins": 9.058060646057129, "rewards/rejected": -12.39611530303955, "step": 993 }, { "epoch": 0.6183514774494556, "grad_norm": 0.0003885370970238, "learning_rate": 2.119640387275242e-06, "logits/chosen": -1.6989907026290894, "logits/rejected": 1.5001914501190186, "logps/chosen": -456.3538513183594, "logps/rejected": -885.2760620117188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.3309340476989746, "rewards/margins": 14.357995986938477, "rewards/rejected": -16.68893051147461, "step": 994 }, { "epoch": 0.6189735614307932, "grad_norm": 7.151096343994141, "learning_rate": 2.116182572614108e-06, "logits/chosen": -2.300100803375244, "logits/rejected": 0.296688050031662, "logps/chosen": -509.4205017089844, "logps/rejected": -834.5781860351562, "loss": 0.1397, "rewards/accuracies": 0.875, "rewards/chosen": -2.87115478515625, "rewards/margins": 11.924084663391113, "rewards/rejected": -14.795238494873047, "step": 995 }, { "epoch": 0.6195956454121306, "grad_norm": 0.24201366305351257, "learning_rate": 2.112724757952974e-06, "logits/chosen": -2.4004197120666504, "logits/rejected": 0.46452146768569946, "logps/chosen": -427.5924987792969, "logps/rejected": -672.8216552734375, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": -1.258427619934082, "rewards/margins": 11.206836700439453, "rewards/rejected": -12.465263366699219, "step": 996 }, { "epoch": 0.6202177293934681, "grad_norm": 0.3841325342655182, "learning_rate": 2.1092669432918396e-06, "logits/chosen": -2.237016201019287, "logits/rejected": 1.8048465251922607, "logps/chosen": -477.4273376464844, "logps/rejected": -778.448486328125, "loss": 0.0072, "rewards/accuracies": 1.0, "rewards/chosen": -2.511019706726074, "rewards/margins": 11.273201942443848, "rewards/rejected": -13.784222602844238, "step": 997 }, { "epoch": 0.6208398133748056, "grad_norm": 0.0020036548376083374, "learning_rate": 2.1058091286307057e-06, "logits/chosen": -1.206825852394104, "logits/rejected": 2.503096103668213, "logps/chosen": -477.86981201171875, "logps/rejected": -807.9362182617188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.312061309814453, "rewards/margins": 13.152772903442383, "rewards/rejected": -15.464834213256836, "step": 998 }, { "epoch": 0.6214618973561431, "grad_norm": 8.395869255065918, "learning_rate": 2.1023513139695714e-06, "logits/chosen": -0.3179383873939514, "logits/rejected": 1.5632257461547852, "logps/chosen": -405.28253173828125, "logps/rejected": -675.0516967773438, "loss": 0.1803, "rewards/accuracies": 0.875, "rewards/chosen": -2.990370750427246, "rewards/margins": 11.327940940856934, "rewards/rejected": -14.31831169128418, "step": 999 }, { "epoch": 0.6220839813374806, "grad_norm": 11.235270500183105, "learning_rate": 2.0988934993084375e-06, "logits/chosen": 0.12897466123104095, "logits/rejected": 0.990669846534729, "logps/chosen": -517.72412109375, "logps/rejected": -650.8975830078125, "loss": 1.1586, "rewards/accuracies": 0.875, "rewards/chosen": -5.291604518890381, "rewards/margins": 10.53437614440918, "rewards/rejected": -15.825981140136719, "step": 1000 }, { "epoch": 0.622706065318818, "grad_norm": 0.04516918212175369, "learning_rate": 2.095435684647303e-06, "logits/chosen": 0.5018932223320007, "logits/rejected": 1.036027431488037, "logps/chosen": -626.29833984375, "logps/rejected": -764.4306640625, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -6.939140319824219, "rewards/margins": 11.391695976257324, "rewards/rejected": -18.33083724975586, "step": 1001 }, { "epoch": 0.6233281493001556, "grad_norm": 0.003115558298304677, "learning_rate": 2.091977869986169e-06, "logits/chosen": -1.7589203119277954, "logits/rejected": 1.435057520866394, "logps/chosen": -382.2220458984375, "logps/rejected": -769.798583984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.516915798187256, "rewards/margins": 14.0879545211792, "rewards/rejected": -16.604869842529297, "step": 1002 }, { "epoch": 0.623950233281493, "grad_norm": 10.660874366760254, "learning_rate": 2.088520055325035e-06, "logits/chosen": -0.7018440961837769, "logits/rejected": 0.9751147627830505, "logps/chosen": -560.6834106445312, "logps/rejected": -728.4033203125, "loss": 0.3339, "rewards/accuracies": 0.75, "rewards/chosen": -4.111453533172607, "rewards/margins": 5.908658981323242, "rewards/rejected": -10.020112991333008, "step": 1003 }, { "epoch": 0.6245723172628305, "grad_norm": 9.585150718688965, "learning_rate": 2.0850622406639006e-06, "logits/chosen": 0.5304701924324036, "logits/rejected": 0.9541545510292053, "logps/chosen": -628.6130981445312, "logps/rejected": -779.0594482421875, "loss": 0.2953, "rewards/accuracies": 0.875, "rewards/chosen": -5.631964683532715, "rewards/margins": 8.24686336517334, "rewards/rejected": -13.878828048706055, "step": 1004 }, { "epoch": 0.6251944012441679, "grad_norm": 0.21276047825813293, "learning_rate": 2.0816044260027667e-06, "logits/chosen": 1.6849368810653687, "logits/rejected": 2.5698063373565674, "logps/chosen": -551.0446166992188, "logps/rejected": -742.5975341796875, "loss": 0.0049, "rewards/accuracies": 1.0, "rewards/chosen": -2.8569495677948, "rewards/margins": 12.745777130126953, "rewards/rejected": -15.602726936340332, "step": 1005 }, { "epoch": 0.6258164852255055, "grad_norm": 0.25935444235801697, "learning_rate": 2.0781466113416324e-06, "logits/chosen": -1.1086622476577759, "logits/rejected": 0.79587721824646, "logps/chosen": -487.7467041015625, "logps/rejected": -708.9419555664062, "loss": 0.007, "rewards/accuracies": 1.0, "rewards/chosen": -2.9280004501342773, "rewards/margins": 10.164024353027344, "rewards/rejected": -13.092023849487305, "step": 1006 }, { "epoch": 0.6264385692068429, "grad_norm": 3.0840532779693604, "learning_rate": 2.074688796680498e-06, "logits/chosen": 0.41001245379447937, "logits/rejected": 0.3385047912597656, "logps/chosen": -525.9742431640625, "logps/rejected": -601.5859375, "loss": 0.1023, "rewards/accuracies": 1.0, "rewards/chosen": -4.685369491577148, "rewards/margins": 5.445958137512207, "rewards/rejected": -10.131326675415039, "step": 1007 }, { "epoch": 0.6270606531881804, "grad_norm": 3.8512637615203857, "learning_rate": 2.071230982019364e-06, "logits/chosen": -1.886763334274292, "logits/rejected": 0.4452914297580719, "logps/chosen": -443.632568359375, "logps/rejected": -783.9464721679688, "loss": 0.0453, "rewards/accuracies": 1.0, "rewards/chosen": -4.340965270996094, "rewards/margins": 13.199124336242676, "rewards/rejected": -17.540088653564453, "step": 1008 }, { "epoch": 0.6276827371695178, "grad_norm": 4.226061820983887, "learning_rate": 2.06777316735823e-06, "logits/chosen": -1.2665694952011108, "logits/rejected": 3.2109904289245605, "logps/chosen": -485.752197265625, "logps/rejected": -874.17724609375, "loss": 0.0859, "rewards/accuracies": 1.0, "rewards/chosen": -2.3707189559936523, "rewards/margins": 11.38040828704834, "rewards/rejected": -13.751127243041992, "step": 1009 }, { "epoch": 0.6283048211508554, "grad_norm": 14.14891242980957, "learning_rate": 2.0643153526970955e-06, "logits/chosen": -0.028902024030685425, "logits/rejected": 1.553565263748169, "logps/chosen": -659.6243896484375, "logps/rejected": -820.82177734375, "loss": 0.3156, "rewards/accuracies": 0.75, "rewards/chosen": -4.610879421234131, "rewards/margins": 9.103914260864258, "rewards/rejected": -13.714794158935547, "step": 1010 }, { "epoch": 0.6289269051321928, "grad_norm": 1.3959747552871704, "learning_rate": 2.060857538035961e-06, "logits/chosen": -0.5028128623962402, "logits/rejected": 0.40239813923835754, "logps/chosen": -450.92218017578125, "logps/rejected": -581.0523681640625, "loss": 0.0255, "rewards/accuracies": 1.0, "rewards/chosen": -2.8122398853302, "rewards/margins": 9.81031608581543, "rewards/rejected": -12.62255573272705, "step": 1011 }, { "epoch": 0.6295489891135303, "grad_norm": 13.474916458129883, "learning_rate": 2.0573997233748273e-06, "logits/chosen": -2.119572639465332, "logits/rejected": 1.345198154449463, "logps/chosen": -532.615478515625, "logps/rejected": -771.0291748046875, "loss": 0.7716, "rewards/accuracies": 0.75, "rewards/chosen": -2.9069406986236572, "rewards/margins": 10.948951721191406, "rewards/rejected": -13.855892181396484, "step": 1012 }, { "epoch": 0.6301710730948679, "grad_norm": 0.047208670526742935, "learning_rate": 2.053941908713693e-06, "logits/chosen": 0.18464431166648865, "logits/rejected": 1.5313124656677246, "logps/chosen": -586.4111328125, "logps/rejected": -790.73291015625, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -3.2165184020996094, "rewards/margins": 12.601154327392578, "rewards/rejected": -15.817671775817871, "step": 1013 }, { "epoch": 0.6307931570762053, "grad_norm": 4.660248756408691, "learning_rate": 2.050484094052559e-06, "logits/chosen": -4.489312648773193, "logits/rejected": 0.8564904928207397, "logps/chosen": -283.9761962890625, "logps/rejected": -767.8623657226562, "loss": 0.0624, "rewards/accuracies": 1.0, "rewards/chosen": -3.3246588706970215, "rewards/margins": 12.73607349395752, "rewards/rejected": -16.060733795166016, "step": 1014 }, { "epoch": 0.6314152410575428, "grad_norm": 3.807382345199585, "learning_rate": 2.0470262793914247e-06, "logits/chosen": 0.5082501173019409, "logits/rejected": 1.695326566696167, "logps/chosen": -561.3466186523438, "logps/rejected": -740.3934326171875, "loss": 0.0668, "rewards/accuracies": 1.0, "rewards/chosen": -5.820663928985596, "rewards/margins": 8.159883499145508, "rewards/rejected": -13.980546951293945, "step": 1015 }, { "epoch": 0.6320373250388802, "grad_norm": 2.7073705196380615, "learning_rate": 2.0435684647302904e-06, "logits/chosen": -1.758224368095398, "logits/rejected": 2.666069269180298, "logps/chosen": -312.75970458984375, "logps/rejected": -714.1618041992188, "loss": 0.0341, "rewards/accuracies": 1.0, "rewards/chosen": -2.2081661224365234, "rewards/margins": 11.59206771850586, "rewards/rejected": -13.800233840942383, "step": 1016 }, { "epoch": 0.6326594090202178, "grad_norm": 6.559261798858643, "learning_rate": 2.0401106500691565e-06, "logits/chosen": 0.7781310081481934, "logits/rejected": 1.0883543491363525, "logps/chosen": -736.278564453125, "logps/rejected": -815.175048828125, "loss": 0.089, "rewards/accuracies": 0.875, "rewards/chosen": -4.678498268127441, "rewards/margins": 8.405096054077148, "rewards/rejected": -13.08359432220459, "step": 1017 }, { "epoch": 0.6332814930015552, "grad_norm": 0.03576594963669777, "learning_rate": 2.036652835408022e-06, "logits/chosen": 1.8591142892837524, "logits/rejected": 1.7278776168823242, "logps/chosen": -765.4885864257812, "logps/rejected": -855.8074340820312, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -5.717234134674072, "rewards/margins": 12.289228439331055, "rewards/rejected": -18.00646209716797, "step": 1018 }, { "epoch": 0.6339035769828927, "grad_norm": 0.06353059411048889, "learning_rate": 2.0331950207468883e-06, "logits/chosen": 1.6117891073226929, "logits/rejected": 1.6115003824234009, "logps/chosen": -743.84423828125, "logps/rejected": -862.3128662109375, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -6.461956024169922, "rewards/margins": 12.450166702270508, "rewards/rejected": -18.91212272644043, "step": 1019 }, { "epoch": 0.6345256609642301, "grad_norm": 1.6241285800933838, "learning_rate": 2.029737206085754e-06, "logits/chosen": -0.6900770664215088, "logits/rejected": 1.1067521572113037, "logps/chosen": -521.3330078125, "logps/rejected": -833.4386596679688, "loss": 0.025, "rewards/accuracies": 1.0, "rewards/chosen": -3.41713285446167, "rewards/margins": 11.109548568725586, "rewards/rejected": -14.526679992675781, "step": 1020 }, { "epoch": 0.6351477449455677, "grad_norm": 1.4660099744796753, "learning_rate": 2.02627939142462e-06, "logits/chosen": -1.5786712169647217, "logits/rejected": 0.793453574180603, "logps/chosen": -415.57525634765625, "logps/rejected": -646.1599731445312, "loss": 0.0205, "rewards/accuracies": 1.0, "rewards/chosen": -3.2418465614318848, "rewards/margins": 12.172435760498047, "rewards/rejected": -15.414281845092773, "step": 1021 }, { "epoch": 0.6357698289269051, "grad_norm": 1.5944817066192627, "learning_rate": 2.0228215767634857e-06, "logits/chosen": -0.8222180604934692, "logits/rejected": -0.12083059549331665, "logps/chosen": -498.1280212402344, "logps/rejected": -682.919921875, "loss": 0.0168, "rewards/accuracies": 1.0, "rewards/chosen": -4.966358184814453, "rewards/margins": 9.738667488098145, "rewards/rejected": -14.705025672912598, "step": 1022 }, { "epoch": 0.6363919129082426, "grad_norm": 12.734455108642578, "learning_rate": 2.0193637621023514e-06, "logits/chosen": 0.5811883211135864, "logits/rejected": 2.617615222930908, "logps/chosen": -571.2073974609375, "logps/rejected": -826.748291015625, "loss": 0.247, "rewards/accuracies": 0.875, "rewards/chosen": -3.0289077758789062, "rewards/margins": 9.473535537719727, "rewards/rejected": -12.502443313598633, "step": 1023 }, { "epoch": 0.63701399688958, "grad_norm": 5.989953994750977, "learning_rate": 2.0159059474412175e-06, "logits/chosen": -0.5756019949913025, "logits/rejected": 1.9505432844161987, "logps/chosen": -474.55731201171875, "logps/rejected": -796.0640869140625, "loss": 0.1362, "rewards/accuracies": 0.875, "rewards/chosen": -3.5359718799591064, "rewards/margins": 9.325483322143555, "rewards/rejected": -12.861454963684082, "step": 1024 }, { "epoch": 0.6376360808709176, "grad_norm": 0.023319177329540253, "learning_rate": 2.012448132780083e-06, "logits/chosen": -0.10281744599342346, "logits/rejected": 2.37847900390625, "logps/chosen": -512.6126708984375, "logps/rejected": -771.48828125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -3.500169038772583, "rewards/margins": 13.292675971984863, "rewards/rejected": -16.792844772338867, "step": 1025 }, { "epoch": 0.6382581648522551, "grad_norm": 16.506500244140625, "learning_rate": 2.0089903181189492e-06, "logits/chosen": 1.7582335472106934, "logits/rejected": 0.8570284843444824, "logps/chosen": -611.6314697265625, "logps/rejected": -681.496826171875, "loss": 0.6524, "rewards/accuracies": 0.75, "rewards/chosen": -5.613626956939697, "rewards/margins": 8.154857635498047, "rewards/rejected": -13.768485069274902, "step": 1026 }, { "epoch": 0.6388802488335925, "grad_norm": 3.3379263877868652, "learning_rate": 2.005532503457815e-06, "logits/chosen": -2.4246559143066406, "logits/rejected": 1.0628880262374878, "logps/chosen": -456.07904052734375, "logps/rejected": -846.11083984375, "loss": 0.052, "rewards/accuracies": 1.0, "rewards/chosen": -4.807278633117676, "rewards/margins": 12.449106216430664, "rewards/rejected": -17.256385803222656, "step": 1027 }, { "epoch": 0.63950233281493, "grad_norm": 0.826388418674469, "learning_rate": 2.0020746887966806e-06, "logits/chosen": 0.002698175609111786, "logits/rejected": 1.5106563568115234, "logps/chosen": -490.64617919921875, "logps/rejected": -671.0999145507812, "loss": 0.0109, "rewards/accuracies": 1.0, "rewards/chosen": -1.99764084815979, "rewards/margins": 10.739684104919434, "rewards/rejected": -12.737324714660645, "step": 1028 }, { "epoch": 0.6401244167962675, "grad_norm": 0.11519481986761093, "learning_rate": 1.9986168741355467e-06, "logits/chosen": -0.4384591579437256, "logits/rejected": 1.5224398374557495, "logps/chosen": -469.76971435546875, "logps/rejected": -728.1442260742188, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -2.772770881652832, "rewards/margins": 15.150619506835938, "rewards/rejected": -17.923389434814453, "step": 1029 }, { "epoch": 0.640746500777605, "grad_norm": 2.1558008193969727, "learning_rate": 1.9951590594744124e-06, "logits/chosen": -4.33176326751709, "logits/rejected": 0.2484569251537323, "logps/chosen": -334.58660888671875, "logps/rejected": -777.5438232421875, "loss": 0.0544, "rewards/accuracies": 1.0, "rewards/chosen": -1.2476811408996582, "rewards/margins": 12.536150932312012, "rewards/rejected": -13.783832550048828, "step": 1030 }, { "epoch": 0.6413685847589424, "grad_norm": 0.7711590528488159, "learning_rate": 1.991701244813278e-06, "logits/chosen": 0.6645958423614502, "logits/rejected": 2.059553861618042, "logps/chosen": -590.2259521484375, "logps/rejected": -841.4727783203125, "loss": 0.0124, "rewards/accuracies": 1.0, "rewards/chosen": -2.808763027191162, "rewards/margins": 10.857090950012207, "rewards/rejected": -13.665853500366211, "step": 1031 }, { "epoch": 0.64199066874028, "grad_norm": 6.598252773284912, "learning_rate": 1.9882434301521437e-06, "logits/chosen": 0.6372920870780945, "logits/rejected": 2.0033369064331055, "logps/chosen": -647.0784912109375, "logps/rejected": -855.994140625, "loss": 0.1387, "rewards/accuracies": 0.875, "rewards/chosen": -4.65526008605957, "rewards/margins": 10.399763107299805, "rewards/rejected": -15.055023193359375, "step": 1032 }, { "epoch": 0.6426127527216174, "grad_norm": 1.496707797050476, "learning_rate": 1.98478561549101e-06, "logits/chosen": -1.1858296394348145, "logits/rejected": 1.7196447849273682, "logps/chosen": -413.4046936035156, "logps/rejected": -723.0321655273438, "loss": 0.017, "rewards/accuracies": 1.0, "rewards/chosen": -3.6104660034179688, "rewards/margins": 11.506616592407227, "rewards/rejected": -15.117082595825195, "step": 1033 }, { "epoch": 0.6432348367029549, "grad_norm": 0.03092067688703537, "learning_rate": 1.9813278008298755e-06, "logits/chosen": -2.103832483291626, "logits/rejected": 2.0937297344207764, "logps/chosen": -301.41778564453125, "logps/rejected": -775.4114990234375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -1.9782029390335083, "rewards/margins": 15.77208423614502, "rewards/rejected": -17.750286102294922, "step": 1034 }, { "epoch": 0.6438569206842923, "grad_norm": 0.05091284215450287, "learning_rate": 1.9778699861687416e-06, "logits/chosen": -0.74751877784729, "logits/rejected": 1.9809004068374634, "logps/chosen": -311.16357421875, "logps/rejected": -635.2210693359375, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -2.50241756439209, "rewards/margins": 11.734981536865234, "rewards/rejected": -14.237399101257324, "step": 1035 }, { "epoch": 0.6444790046656299, "grad_norm": 11.258312225341797, "learning_rate": 1.9744121715076073e-06, "logits/chosen": -0.7430388927459717, "logits/rejected": 1.3374900817871094, "logps/chosen": -537.7489624023438, "logps/rejected": -808.2744750976562, "loss": 0.4795, "rewards/accuracies": 0.75, "rewards/chosen": -3.963845729827881, "rewards/margins": 10.04364013671875, "rewards/rejected": -14.007485389709473, "step": 1036 }, { "epoch": 0.6451010886469674, "grad_norm": 0.370720773935318, "learning_rate": 1.970954356846473e-06, "logits/chosen": 0.806008517742157, "logits/rejected": 2.227003812789917, "logps/chosen": -613.8025512695312, "logps/rejected": -801.310546875, "loss": 0.008, "rewards/accuracies": 1.0, "rewards/chosen": -2.9764208793640137, "rewards/margins": 9.135719299316406, "rewards/rejected": -12.112139701843262, "step": 1037 }, { "epoch": 0.6457231726283048, "grad_norm": 0.21426904201507568, "learning_rate": 1.967496542185339e-06, "logits/chosen": -2.2818212509155273, "logits/rejected": -0.38423052430152893, "logps/chosen": -301.31884765625, "logps/rejected": -579.7852783203125, "loss": 0.0062, "rewards/accuracies": 1.0, "rewards/chosen": -1.9693481922149658, "rewards/margins": 11.621204376220703, "rewards/rejected": -13.590551376342773, "step": 1038 }, { "epoch": 0.6463452566096423, "grad_norm": 0.3064160943031311, "learning_rate": 1.9640387275242047e-06, "logits/chosen": -0.47753292322158813, "logits/rejected": 2.235549211502075, "logps/chosen": -478.419677734375, "logps/rejected": -810.7544555664062, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -4.309533596038818, "rewards/margins": 12.654848098754883, "rewards/rejected": -16.96438217163086, "step": 1039 }, { "epoch": 0.6469673405909798, "grad_norm": 16.278154373168945, "learning_rate": 1.960580912863071e-06, "logits/chosen": 2.1310782432556152, "logits/rejected": 2.7688515186309814, "logps/chosen": -686.7191162109375, "logps/rejected": -848.0804443359375, "loss": 0.973, "rewards/accuracies": 0.75, "rewards/chosen": -4.569868087768555, "rewards/margins": 9.041950225830078, "rewards/rejected": -13.611818313598633, "step": 1040 }, { "epoch": 0.6475894245723173, "grad_norm": 10.152179718017578, "learning_rate": 1.9571230982019365e-06, "logits/chosen": -1.0120471715927124, "logits/rejected": 1.0672812461853027, "logps/chosen": -492.72589111328125, "logps/rejected": -678.5810546875, "loss": 0.1645, "rewards/accuracies": 0.875, "rewards/chosen": -4.950984954833984, "rewards/margins": 9.779288291931152, "rewards/rejected": -14.73027229309082, "step": 1041 }, { "epoch": 0.6482115085536547, "grad_norm": 4.811702251434326, "learning_rate": 1.9536652835408026e-06, "logits/chosen": -0.7813871502876282, "logits/rejected": 0.9111435413360596, "logps/chosen": -637.8029174804688, "logps/rejected": -845.5501708984375, "loss": 0.0643, "rewards/accuracies": 1.0, "rewards/chosen": -3.0409328937530518, "rewards/margins": 12.065515518188477, "rewards/rejected": -15.10644817352295, "step": 1042 }, { "epoch": 0.6488335925349922, "grad_norm": 12.039648056030273, "learning_rate": 1.9502074688796682e-06, "logits/chosen": -0.48023903369903564, "logits/rejected": 1.1338739395141602, "logps/chosen": -573.4889526367188, "logps/rejected": -727.9575805664062, "loss": 1.0683, "rewards/accuracies": 0.75, "rewards/chosen": -5.742473602294922, "rewards/margins": 8.082014083862305, "rewards/rejected": -13.824487686157227, "step": 1043 }, { "epoch": 0.6494556765163297, "grad_norm": 0.48438093066215515, "learning_rate": 1.946749654218534e-06, "logits/chosen": -3.7791428565979004, "logits/rejected": 2.259133815765381, "logps/chosen": -363.3900146484375, "logps/rejected": -789.5064697265625, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": -2.1658499240875244, "rewards/margins": 13.440871238708496, "rewards/rejected": -15.606721878051758, "step": 1044 }, { "epoch": 0.6500777604976672, "grad_norm": 2.003484010696411, "learning_rate": 1.9432918395574e-06, "logits/chosen": 0.12645182013511658, "logits/rejected": 2.551563262939453, "logps/chosen": -463.9303894042969, "logps/rejected": -697.5665893554688, "loss": 0.0258, "rewards/accuracies": 1.0, "rewards/chosen": -2.0740773677825928, "rewards/margins": 9.715627670288086, "rewards/rejected": -11.789705276489258, "step": 1045 }, { "epoch": 0.6506998444790046, "grad_norm": 0.21789084374904633, "learning_rate": 1.9398340248962657e-06, "logits/chosen": -3.5167763233184814, "logits/rejected": 1.4612488746643066, "logps/chosen": -243.96316528320312, "logps/rejected": -593.94384765625, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": -1.6443634033203125, "rewards/margins": 8.8632173538208, "rewards/rejected": -10.507580757141113, "step": 1046 }, { "epoch": 0.6513219284603421, "grad_norm": 10.110757827758789, "learning_rate": 1.936376210235132e-06, "logits/chosen": 0.6530430316925049, "logits/rejected": 1.6738708019256592, "logps/chosen": -599.3795166015625, "logps/rejected": -744.84716796875, "loss": 0.3057, "rewards/accuracies": 0.875, "rewards/chosen": -3.708158254623413, "rewards/margins": 7.704784393310547, "rewards/rejected": -11.412942886352539, "step": 1047 }, { "epoch": 0.6519440124416797, "grad_norm": 1.8253889083862305, "learning_rate": 1.9329183955739975e-06, "logits/chosen": 1.172483205795288, "logits/rejected": 2.689832925796509, "logps/chosen": -550.9580688476562, "logps/rejected": -702.495849609375, "loss": 0.0293, "rewards/accuracies": 1.0, "rewards/chosen": -3.298330545425415, "rewards/margins": 7.814839839935303, "rewards/rejected": -11.113170623779297, "step": 1048 }, { "epoch": 0.6525660964230171, "grad_norm": 7.1217474937438965, "learning_rate": 1.929460580912863e-06, "logits/chosen": -0.24819883704185486, "logits/rejected": 2.263699769973755, "logps/chosen": -452.10089111328125, "logps/rejected": -773.5488891601562, "loss": 0.133, "rewards/accuracies": 0.875, "rewards/chosen": -4.7288618087768555, "rewards/margins": 8.212913513183594, "rewards/rejected": -12.94177532196045, "step": 1049 }, { "epoch": 0.6531881804043546, "grad_norm": 0.6879950165748596, "learning_rate": 1.9260027662517292e-06, "logits/chosen": -1.6932296752929688, "logits/rejected": 1.4742567539215088, "logps/chosen": -439.53533935546875, "logps/rejected": -747.8695068359375, "loss": 0.0085, "rewards/accuracies": 1.0, "rewards/chosen": -2.9899792671203613, "rewards/margins": 9.645586967468262, "rewards/rejected": -12.635564804077148, "step": 1050 }, { "epoch": 0.6538102643856921, "grad_norm": 1.5503251552581787, "learning_rate": 1.922544951590595e-06, "logits/chosen": 0.29990726709365845, "logits/rejected": 2.6180996894836426, "logps/chosen": -559.8137817382812, "logps/rejected": -871.88720703125, "loss": 0.0407, "rewards/accuracies": 1.0, "rewards/chosen": -6.564856052398682, "rewards/margins": 9.437233924865723, "rewards/rejected": -16.002090454101562, "step": 1051 }, { "epoch": 0.6544323483670296, "grad_norm": 0.059440597891807556, "learning_rate": 1.919087136929461e-06, "logits/chosen": -0.5005910396575928, "logits/rejected": 2.2705631256103516, "logps/chosen": -506.644775390625, "logps/rejected": -770.611083984375, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -4.935871601104736, "rewards/margins": 9.507561683654785, "rewards/rejected": -14.44343376159668, "step": 1052 }, { "epoch": 0.655054432348367, "grad_norm": 4.024876594543457, "learning_rate": 1.9156293222683267e-06, "logits/chosen": -0.15780453383922577, "logits/rejected": 2.288727045059204, "logps/chosen": -476.67431640625, "logps/rejected": -791.0120849609375, "loss": 0.0663, "rewards/accuracies": 1.0, "rewards/chosen": -3.1946725845336914, "rewards/margins": 12.331758499145508, "rewards/rejected": -15.5264310836792, "step": 1053 }, { "epoch": 0.6556765163297045, "grad_norm": 0.3611021935939789, "learning_rate": 1.9121715076071924e-06, "logits/chosen": -0.8191186785697937, "logits/rejected": 1.408942699432373, "logps/chosen": -490.61505126953125, "logps/rejected": -735.5208740234375, "loss": 0.0061, "rewards/accuracies": 1.0, "rewards/chosen": -2.098384141921997, "rewards/margins": 11.237668991088867, "rewards/rejected": -13.336053848266602, "step": 1054 }, { "epoch": 0.656298600311042, "grad_norm": 0.34494248032569885, "learning_rate": 1.908713692946058e-06, "logits/chosen": 0.19897377490997314, "logits/rejected": 2.2970542907714844, "logps/chosen": -547.977294921875, "logps/rejected": -805.5403442382812, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": -3.1150412559509277, "rewards/margins": 15.187854766845703, "rewards/rejected": -18.302894592285156, "step": 1055 }, { "epoch": 0.6569206842923795, "grad_norm": 6.628651142120361, "learning_rate": 1.905255878284924e-06, "logits/chosen": -3.3367063999176025, "logits/rejected": -0.07507431507110596, "logps/chosen": -225.01573181152344, "logps/rejected": -531.7635498046875, "loss": 0.2799, "rewards/accuracies": 0.875, "rewards/chosen": -0.7402350902557373, "rewards/margins": 9.08974838256836, "rewards/rejected": -9.829983711242676, "step": 1056 }, { "epoch": 0.6575427682737169, "grad_norm": 0.042386919260025024, "learning_rate": 1.9017980636237898e-06, "logits/chosen": -2.9269347190856934, "logits/rejected": 1.5136233568191528, "logps/chosen": -284.03973388671875, "logps/rejected": -691.5225830078125, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -1.5467650890350342, "rewards/margins": 12.708568572998047, "rewards/rejected": -14.255332946777344, "step": 1057 }, { "epoch": 0.6581648522550544, "grad_norm": 0.11074140667915344, "learning_rate": 1.8983402489626557e-06, "logits/chosen": -1.558821678161621, "logits/rejected": 1.1062127351760864, "logps/chosen": -457.4238586425781, "logps/rejected": -799.7845458984375, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -3.970675230026245, "rewards/margins": 12.692710876464844, "rewards/rejected": -16.663387298583984, "step": 1058 }, { "epoch": 0.658786936236392, "grad_norm": 1.1652216911315918, "learning_rate": 1.8948824343015216e-06, "logits/chosen": -1.6260493993759155, "logits/rejected": 1.3686001300811768, "logps/chosen": -457.19598388671875, "logps/rejected": -791.5732421875, "loss": 0.0114, "rewards/accuracies": 1.0, "rewards/chosen": -4.9548821449279785, "rewards/margins": 11.611612319946289, "rewards/rejected": -16.566495895385742, "step": 1059 }, { "epoch": 0.6594090202177294, "grad_norm": 4.605720043182373, "learning_rate": 1.8914246196403875e-06, "logits/chosen": 0.35489967465400696, "logits/rejected": 1.2143810987472534, "logps/chosen": -563.5030517578125, "logps/rejected": -730.0916748046875, "loss": 0.0586, "rewards/accuracies": 1.0, "rewards/chosen": -5.5228590965271, "rewards/margins": 10.185713768005371, "rewards/rejected": -15.708572387695312, "step": 1060 }, { "epoch": 0.6600311041990669, "grad_norm": 2.7395095825195312, "learning_rate": 1.8879668049792531e-06, "logits/chosen": -1.245688557624817, "logits/rejected": 1.4819563627243042, "logps/chosen": -584.6234741210938, "logps/rejected": -917.8807373046875, "loss": 0.0377, "rewards/accuracies": 1.0, "rewards/chosen": -5.2972187995910645, "rewards/margins": 12.06915283203125, "rewards/rejected": -17.366371154785156, "step": 1061 }, { "epoch": 0.6606531881804043, "grad_norm": 16.45001983642578, "learning_rate": 1.884508990318119e-06, "logits/chosen": 0.2940090596675873, "logits/rejected": 2.6731133460998535, "logps/chosen": -519.51171875, "logps/rejected": -850.6254272460938, "loss": 0.6898, "rewards/accuracies": 0.875, "rewards/chosen": -4.458443641662598, "rewards/margins": 13.66417121887207, "rewards/rejected": -18.122615814208984, "step": 1062 }, { "epoch": 0.6612752721617419, "grad_norm": 10.427206039428711, "learning_rate": 1.881051175656985e-06, "logits/chosen": 1.077031135559082, "logits/rejected": 2.2125847339630127, "logps/chosen": -634.8086547851562, "logps/rejected": -751.8177490234375, "loss": 0.1634, "rewards/accuracies": 0.875, "rewards/chosen": -5.3660407066345215, "rewards/margins": 4.572610855102539, "rewards/rejected": -9.938652038574219, "step": 1063 }, { "epoch": 0.6618973561430793, "grad_norm": 0.4396788775920868, "learning_rate": 1.8775933609958508e-06, "logits/chosen": 1.2623273134231567, "logits/rejected": 3.368971824645996, "logps/chosen": -648.06640625, "logps/rejected": -881.6256103515625, "loss": 0.0065, "rewards/accuracies": 1.0, "rewards/chosen": -4.502276420593262, "rewards/margins": 10.512121200561523, "rewards/rejected": -15.014396667480469, "step": 1064 }, { "epoch": 0.6625194401244168, "grad_norm": 1.726747751235962, "learning_rate": 1.8741355463347167e-06, "logits/chosen": -0.11991359293460846, "logits/rejected": 2.6458427906036377, "logps/chosen": -516.3016357421875, "logps/rejected": -859.52392578125, "loss": 0.0187, "rewards/accuracies": 1.0, "rewards/chosen": -3.300980806350708, "rewards/margins": 13.026629447937012, "rewards/rejected": -16.32761001586914, "step": 1065 }, { "epoch": 0.6631415241057543, "grad_norm": 1.1693699359893799, "learning_rate": 1.8706777316735826e-06, "logits/chosen": -0.49305635690689087, "logits/rejected": 1.5675594806671143, "logps/chosen": -591.2108154296875, "logps/rejected": -911.1290893554688, "loss": 0.0152, "rewards/accuracies": 1.0, "rewards/chosen": -4.294110298156738, "rewards/margins": 12.678876876831055, "rewards/rejected": -16.97298812866211, "step": 1066 }, { "epoch": 0.6637636080870918, "grad_norm": 10.80457878112793, "learning_rate": 1.8672199170124482e-06, "logits/chosen": -1.5988669395446777, "logits/rejected": 0.8705654144287109, "logps/chosen": -512.2140502929688, "logps/rejected": -806.09765625, "loss": 0.4485, "rewards/accuracies": 0.875, "rewards/chosen": -3.551145315170288, "rewards/margins": 12.93505859375, "rewards/rejected": -16.486204147338867, "step": 1067 }, { "epoch": 0.6643856920684292, "grad_norm": 2.5615625381469727, "learning_rate": 1.8637621023513141e-06, "logits/chosen": -2.4821906089782715, "logits/rejected": 1.2083396911621094, "logps/chosen": -422.328369140625, "logps/rejected": -760.3848876953125, "loss": 0.0529, "rewards/accuracies": 1.0, "rewards/chosen": -3.064101219177246, "rewards/margins": 10.793773651123047, "rewards/rejected": -13.85787582397461, "step": 1068 }, { "epoch": 0.6650077760497667, "grad_norm": 0.003601853968575597, "learning_rate": 1.86030428769018e-06, "logits/chosen": 0.09036673605442047, "logits/rejected": 3.169595718383789, "logps/chosen": -468.8918151855469, "logps/rejected": -858.6209716796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.408201217651367, "rewards/margins": 14.999587059020996, "rewards/rejected": -19.40778923034668, "step": 1069 }, { "epoch": 0.6656298600311042, "grad_norm": 12.976881980895996, "learning_rate": 1.856846473029046e-06, "logits/chosen": 0.09823396801948547, "logits/rejected": 1.9355928897857666, "logps/chosen": -529.9743041992188, "logps/rejected": -749.6134033203125, "loss": 0.5726, "rewards/accuracies": 0.75, "rewards/chosen": -5.049110412597656, "rewards/margins": 7.191032409667969, "rewards/rejected": -12.240142822265625, "step": 1070 }, { "epoch": 0.6662519440124417, "grad_norm": 10.563179016113281, "learning_rate": 1.8533886583679118e-06, "logits/chosen": -2.427809715270996, "logits/rejected": 1.8756613731384277, "logps/chosen": -380.4163513183594, "logps/rejected": -728.9166870117188, "loss": 0.2192, "rewards/accuracies": 0.875, "rewards/chosen": -4.603232383728027, "rewards/margins": 11.378522872924805, "rewards/rejected": -15.981756210327148, "step": 1071 }, { "epoch": 0.6668740279937792, "grad_norm": 0.14788885414600372, "learning_rate": 1.8499308437067775e-06, "logits/chosen": -0.277435302734375, "logits/rejected": 1.3089938163757324, "logps/chosen": -547.5185546875, "logps/rejected": -791.9442749023438, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -3.8885748386383057, "rewards/margins": 11.341448783874512, "rewards/rejected": -15.230024337768555, "step": 1072 }, { "epoch": 0.6674961119751166, "grad_norm": 3.188408136367798, "learning_rate": 1.8464730290456433e-06, "logits/chosen": -2.2596986293792725, "logits/rejected": 0.8410656452178955, "logps/chosen": -324.6328430175781, "logps/rejected": -598.1616821289062, "loss": 0.0859, "rewards/accuracies": 1.0, "rewards/chosen": -1.4935029745101929, "rewards/margins": 5.752256870269775, "rewards/rejected": -7.245759963989258, "step": 1073 }, { "epoch": 0.6681181959564542, "grad_norm": 0.03675384446978569, "learning_rate": 1.8430152143845092e-06, "logits/chosen": -0.8693726062774658, "logits/rejected": 1.5161409378051758, "logps/chosen": -498.406494140625, "logps/rejected": -752.718505859375, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -2.566410541534424, "rewards/margins": 13.583147048950195, "rewards/rejected": -16.14955711364746, "step": 1074 }, { "epoch": 0.6687402799377916, "grad_norm": 0.0020973007194697857, "learning_rate": 1.8395573997233751e-06, "logits/chosen": -0.10269686579704285, "logits/rejected": 2.631887435913086, "logps/chosen": -404.41241455078125, "logps/rejected": -735.026123046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.638554573059082, "rewards/margins": 13.470610618591309, "rewards/rejected": -17.10916519165039, "step": 1075 }, { "epoch": 0.6693623639191291, "grad_norm": 0.3651786744594574, "learning_rate": 1.836099585062241e-06, "logits/chosen": -0.8673775792121887, "logits/rejected": 1.531367301940918, "logps/chosen": -450.03778076171875, "logps/rejected": -695.3096923828125, "loss": 0.0054, "rewards/accuracies": 1.0, "rewards/chosen": -3.878875255584717, "rewards/margins": 10.878089904785156, "rewards/rejected": -14.756965637207031, "step": 1076 }, { "epoch": 0.6699844479004665, "grad_norm": 7.3607988357543945, "learning_rate": 1.8326417704011065e-06, "logits/chosen": -0.5112236738204956, "logits/rejected": 0.7025718688964844, "logps/chosen": -499.9620056152344, "logps/rejected": -668.3945922851562, "loss": 0.1258, "rewards/accuracies": 0.875, "rewards/chosen": -5.150419235229492, "rewards/margins": 8.551305770874023, "rewards/rejected": -13.7017240524292, "step": 1077 }, { "epoch": 0.6706065318818041, "grad_norm": 4.611263751983643, "learning_rate": 1.8291839557399723e-06, "logits/chosen": -0.9964049458503723, "logits/rejected": 0.9579240083694458, "logps/chosen": -524.4741821289062, "logps/rejected": -764.2987670898438, "loss": 0.1185, "rewards/accuracies": 0.875, "rewards/chosen": -5.831058502197266, "rewards/margins": 10.16044807434082, "rewards/rejected": -15.991506576538086, "step": 1078 }, { "epoch": 0.6712286158631415, "grad_norm": 4.2436203956604, "learning_rate": 1.8257261410788382e-06, "logits/chosen": -2.2074825763702393, "logits/rejected": 1.4451165199279785, "logps/chosen": -440.4949951171875, "logps/rejected": -806.1306762695312, "loss": 0.0807, "rewards/accuracies": 1.0, "rewards/chosen": -2.3167564868927, "rewards/margins": 15.035575866699219, "rewards/rejected": -17.352333068847656, "step": 1079 }, { "epoch": 0.671850699844479, "grad_norm": 7.689334869384766, "learning_rate": 1.8222683264177041e-06, "logits/chosen": -0.030307695269584656, "logits/rejected": 0.6843103170394897, "logps/chosen": -490.571044921875, "logps/rejected": -626.142578125, "loss": 0.1538, "rewards/accuracies": 0.875, "rewards/chosen": -6.0009050369262695, "rewards/margins": 8.979663848876953, "rewards/rejected": -14.980569839477539, "step": 1080 }, { "epoch": 0.6724727838258164, "grad_norm": 3.292104959487915, "learning_rate": 1.81881051175657e-06, "logits/chosen": -1.7974581718444824, "logits/rejected": 2.301703929901123, "logps/chosen": -381.3676452636719, "logps/rejected": -728.9878540039062, "loss": 0.0681, "rewards/accuracies": 1.0, "rewards/chosen": -3.4869728088378906, "rewards/margins": 8.751140594482422, "rewards/rejected": -12.238113403320312, "step": 1081 }, { "epoch": 0.673094867807154, "grad_norm": 0.0011872448958456516, "learning_rate": 1.8153526970954357e-06, "logits/chosen": -1.0084781646728516, "logits/rejected": 1.5871435403823853, "logps/chosen": -431.7565612792969, "logps/rejected": -799.1888427734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.9928975105285645, "rewards/margins": 16.138193130493164, "rewards/rejected": -19.13109016418457, "step": 1082 }, { "epoch": 0.6737169517884914, "grad_norm": 0.006312511395663023, "learning_rate": 1.8118948824343016e-06, "logits/chosen": 0.21030393242835999, "logits/rejected": 1.755011796951294, "logps/chosen": -559.6727294921875, "logps/rejected": -852.0909423828125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -5.775818824768066, "rewards/margins": 14.454465866088867, "rewards/rejected": -20.23028564453125, "step": 1083 }, { "epoch": 0.6743390357698289, "grad_norm": 0.2854938507080078, "learning_rate": 1.8084370677731675e-06, "logits/chosen": -3.0373058319091797, "logits/rejected": 1.504826307296753, "logps/chosen": -197.9976806640625, "logps/rejected": -599.3348388671875, "loss": 0.0076, "rewards/accuracies": 1.0, "rewards/chosen": -0.6667191982269287, "rewards/margins": 11.379118919372559, "rewards/rejected": -12.04583740234375, "step": 1084 }, { "epoch": 0.6749611197511665, "grad_norm": 0.0021677310578525066, "learning_rate": 1.8049792531120333e-06, "logits/chosen": -2.9376912117004395, "logits/rejected": 2.2050387859344482, "logps/chosen": -242.47576904296875, "logps/rejected": -780.513671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.6308200359344482, "rewards/margins": 17.769222259521484, "rewards/rejected": -20.400039672851562, "step": 1085 }, { "epoch": 0.6755832037325039, "grad_norm": 0.08663962781429291, "learning_rate": 1.8015214384508992e-06, "logits/chosen": -3.3391573429107666, "logits/rejected": 1.1729381084442139, "logps/chosen": -355.6374816894531, "logps/rejected": -777.2959594726562, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -1.2463706731796265, "rewards/margins": 12.833768844604492, "rewards/rejected": -14.080141067504883, "step": 1086 }, { "epoch": 0.6762052877138414, "grad_norm": 0.032826051115989685, "learning_rate": 1.7980636237897651e-06, "logits/chosen": 0.39126479625701904, "logits/rejected": 2.8599305152893066, "logps/chosen": -305.1252136230469, "logps/rejected": -547.9595947265625, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -1.5152595043182373, "rewards/margins": 10.486005783081055, "rewards/rejected": -12.001264572143555, "step": 1087 }, { "epoch": 0.6768273716951788, "grad_norm": 4.602621555328369, "learning_rate": 1.7946058091286308e-06, "logits/chosen": -3.169222354888916, "logits/rejected": 1.2877531051635742, "logps/chosen": -382.23504638671875, "logps/rejected": -832.298095703125, "loss": 0.0562, "rewards/accuracies": 1.0, "rewards/chosen": -3.868680477142334, "rewards/margins": 13.968950271606445, "rewards/rejected": -17.837631225585938, "step": 1088 }, { "epoch": 0.6774494556765164, "grad_norm": 6.954931735992432, "learning_rate": 1.7911479944674967e-06, "logits/chosen": -0.34693098068237305, "logits/rejected": 0.678508996963501, "logps/chosen": -571.1856689453125, "logps/rejected": -713.855224609375, "loss": 0.1059, "rewards/accuracies": 1.0, "rewards/chosen": -3.4686484336853027, "rewards/margins": 9.241241455078125, "rewards/rejected": -12.70988941192627, "step": 1089 }, { "epoch": 0.6780715396578538, "grad_norm": 11.540175437927246, "learning_rate": 1.7876901798063626e-06, "logits/chosen": 0.07278554141521454, "logits/rejected": -0.1789189726114273, "logps/chosen": -671.7821044921875, "logps/rejected": -676.2683715820312, "loss": 0.4492, "rewards/accuracies": 0.625, "rewards/chosen": -1.632270097732544, "rewards/margins": 6.603313446044922, "rewards/rejected": -8.235583305358887, "step": 1090 }, { "epoch": 0.6786936236391913, "grad_norm": 1.190700888633728, "learning_rate": 1.7842323651452284e-06, "logits/chosen": -0.6940876245498657, "logits/rejected": 2.1496429443359375, "logps/chosen": -476.2362060546875, "logps/rejected": -791.1129760742188, "loss": 0.0165, "rewards/accuracies": 1.0, "rewards/chosen": -4.178332328796387, "rewards/margins": 14.140974998474121, "rewards/rejected": -18.319307327270508, "step": 1091 }, { "epoch": 0.6793157076205287, "grad_norm": 0.021951347589492798, "learning_rate": 1.7807745504840943e-06, "logits/chosen": -1.7036783695220947, "logits/rejected": 2.2257513999938965, "logps/chosen": -287.741455078125, "logps/rejected": -660.2666625976562, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -1.2365105152130127, "rewards/margins": 11.053616523742676, "rewards/rejected": -12.29012680053711, "step": 1092 }, { "epoch": 0.6799377916018663, "grad_norm": 0.9538974165916443, "learning_rate": 1.77731673582296e-06, "logits/chosen": -3.931630849838257, "logits/rejected": 1.2709847688674927, "logps/chosen": -400.5167236328125, "logps/rejected": -983.2349243164062, "loss": 0.0108, "rewards/accuracies": 1.0, "rewards/chosen": -2.3013153076171875, "rewards/margins": 17.429447174072266, "rewards/rejected": -19.730762481689453, "step": 1093 }, { "epoch": 0.6805598755832037, "grad_norm": 2.336557149887085, "learning_rate": 1.7738589211618259e-06, "logits/chosen": -2.258193016052246, "logits/rejected": 1.8008806705474854, "logps/chosen": -470.6827697753906, "logps/rejected": -883.7933349609375, "loss": 0.0311, "rewards/accuracies": 1.0, "rewards/chosen": -5.316094398498535, "rewards/margins": 10.857684135437012, "rewards/rejected": -16.173778533935547, "step": 1094 }, { "epoch": 0.6811819595645412, "grad_norm": 1.0904747247695923, "learning_rate": 1.7704011065006918e-06, "logits/chosen": -2.023524045944214, "logits/rejected": 2.252817392349243, "logps/chosen": -420.70611572265625, "logps/rejected": -805.8851318359375, "loss": 0.0113, "rewards/accuracies": 1.0, "rewards/chosen": -3.2677054405212402, "rewards/margins": 9.925800323486328, "rewards/rejected": -13.193506240844727, "step": 1095 }, { "epoch": 0.6818040435458786, "grad_norm": 2.1821858882904053, "learning_rate": 1.7669432918395577e-06, "logits/chosen": -3.492037534713745, "logits/rejected": 2.052449941635132, "logps/chosen": -219.43270874023438, "logps/rejected": -680.7523193359375, "loss": 0.0312, "rewards/accuracies": 1.0, "rewards/chosen": -0.689826250076294, "rewards/margins": 14.058903694152832, "rewards/rejected": -14.748729705810547, "step": 1096 }, { "epoch": 0.6824261275272162, "grad_norm": 10.808926582336426, "learning_rate": 1.7634854771784235e-06, "logits/chosen": 0.4890906810760498, "logits/rejected": 1.5305228233337402, "logps/chosen": -512.201416015625, "logps/rejected": -716.216796875, "loss": 0.2832, "rewards/accuracies": 0.875, "rewards/chosen": -5.416069984436035, "rewards/margins": 11.228296279907227, "rewards/rejected": -16.644367218017578, "step": 1097 }, { "epoch": 0.6830482115085537, "grad_norm": 5.210123538970947, "learning_rate": 1.7600276625172894e-06, "logits/chosen": -1.7565345764160156, "logits/rejected": 0.4324713945388794, "logps/chosen": -429.02490234375, "logps/rejected": -704.7572021484375, "loss": 0.0822, "rewards/accuracies": 1.0, "rewards/chosen": -1.6888118982315063, "rewards/margins": 11.8233060836792, "rewards/rejected": -13.512117385864258, "step": 1098 }, { "epoch": 0.6836702954898911, "grad_norm": 8.820576667785645, "learning_rate": 1.756569847856155e-06, "logits/chosen": -0.4208769202232361, "logits/rejected": 0.5820061564445496, "logps/chosen": -640.795654296875, "logps/rejected": -790.4542236328125, "loss": 0.1952, "rewards/accuracies": 0.875, "rewards/chosen": -1.618236780166626, "rewards/margins": 10.99665641784668, "rewards/rejected": -12.614892959594727, "step": 1099 }, { "epoch": 0.6842923794712286, "grad_norm": 0.17468155920505524, "learning_rate": 1.7531120331950208e-06, "logits/chosen": -1.2055456638336182, "logits/rejected": 2.4186465740203857, "logps/chosen": -453.8619384765625, "logps/rejected": -856.1904907226562, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -3.5742030143737793, "rewards/margins": 15.24583625793457, "rewards/rejected": -18.820039749145508, "step": 1100 }, { "epoch": 0.6849144634525661, "grad_norm": 3.2464711666107178, "learning_rate": 1.7496542185338867e-06, "logits/chosen": -0.9557915329933167, "logits/rejected": 1.173177719116211, "logps/chosen": -469.0939636230469, "logps/rejected": -723.173828125, "loss": 0.0392, "rewards/accuracies": 1.0, "rewards/chosen": -3.0151262283325195, "rewards/margins": 10.941130638122559, "rewards/rejected": -13.956256866455078, "step": 1101 }, { "epoch": 0.6855365474339036, "grad_norm": 1.6630734205245972, "learning_rate": 1.7461964038727526e-06, "logits/chosen": -0.8412757515907288, "logits/rejected": 0.7623392343521118, "logps/chosen": -550.8668823242188, "logps/rejected": -740.575439453125, "loss": 0.0226, "rewards/accuracies": 1.0, "rewards/chosen": -4.915467262268066, "rewards/margins": 12.158658981323242, "rewards/rejected": -17.074125289916992, "step": 1102 }, { "epoch": 0.686158631415241, "grad_norm": 0.8297721743583679, "learning_rate": 1.7427385892116182e-06, "logits/chosen": -1.976477026939392, "logits/rejected": -0.41926461458206177, "logps/chosen": -565.2374877929688, "logps/rejected": -710.3485717773438, "loss": 0.0094, "rewards/accuracies": 1.0, "rewards/chosen": -4.9878997802734375, "rewards/margins": 8.000348091125488, "rewards/rejected": -12.988248825073242, "step": 1103 }, { "epoch": 0.6867807153965786, "grad_norm": 0.9236965775489807, "learning_rate": 1.7392807745504841e-06, "logits/chosen": -0.30539172887802124, "logits/rejected": 2.2200779914855957, "logps/chosen": -574.3755493164062, "logps/rejected": -886.7408447265625, "loss": 0.0075, "rewards/accuracies": 1.0, "rewards/chosen": -6.807694435119629, "rewards/margins": 12.771883010864258, "rewards/rejected": -19.579578399658203, "step": 1104 }, { "epoch": 0.687402799377916, "grad_norm": 2.293628454208374, "learning_rate": 1.73582295988935e-06, "logits/chosen": -1.0992774963378906, "logits/rejected": 0.5938657522201538, "logps/chosen": -572.6663208007812, "logps/rejected": -842.9981689453125, "loss": 0.0275, "rewards/accuracies": 1.0, "rewards/chosen": -5.3925251960754395, "rewards/margins": 14.370223999023438, "rewards/rejected": -19.76274871826172, "step": 1105 }, { "epoch": 0.6880248833592535, "grad_norm": 0.7085720896720886, "learning_rate": 1.7323651452282159e-06, "logits/chosen": -2.385364055633545, "logits/rejected": 1.4980586767196655, "logps/chosen": -298.02276611328125, "logps/rejected": -672.7850341796875, "loss": 0.0091, "rewards/accuracies": 1.0, "rewards/chosen": -2.2075164318084717, "rewards/margins": 13.089982986450195, "rewards/rejected": -15.297500610351562, "step": 1106 }, { "epoch": 0.6886469673405909, "grad_norm": 0.011442109942436218, "learning_rate": 1.7289073305670818e-06, "logits/chosen": 0.9927573800086975, "logits/rejected": 2.6192076206207275, "logps/chosen": -563.0386962890625, "logps/rejected": -824.0861206054688, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -4.1668782234191895, "rewards/margins": 13.66159439086914, "rewards/rejected": -17.828472137451172, "step": 1107 }, { "epoch": 0.6892690513219285, "grad_norm": 10.182701110839844, "learning_rate": 1.7254495159059477e-06, "logits/chosen": -1.753928303718567, "logits/rejected": 3.248563528060913, "logps/chosen": -381.5770263671875, "logps/rejected": -861.3785400390625, "loss": 0.3277, "rewards/accuracies": 0.875, "rewards/chosen": -2.628838062286377, "rewards/margins": 13.924395561218262, "rewards/rejected": -16.553232192993164, "step": 1108 }, { "epoch": 0.689891135303266, "grad_norm": 18.342693328857422, "learning_rate": 1.7219917012448133e-06, "logits/chosen": -1.3735618591308594, "logits/rejected": 2.132735013961792, "logps/chosen": -593.2496337890625, "logps/rejected": -937.80810546875, "loss": 0.6541, "rewards/accuracies": 0.75, "rewards/chosen": -8.129141807556152, "rewards/margins": 12.264921188354492, "rewards/rejected": -20.39406394958496, "step": 1109 }, { "epoch": 0.6905132192846034, "grad_norm": 0.01916677877306938, "learning_rate": 1.7185338865836792e-06, "logits/chosen": 0.6682705879211426, "logits/rejected": 2.327928304672241, "logps/chosen": -501.89013671875, "logps/rejected": -681.25439453125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -3.5737478733062744, "rewards/margins": 13.154827117919922, "rewards/rejected": -16.728574752807617, "step": 1110 }, { "epoch": 0.6911353032659409, "grad_norm": 0.1780615746974945, "learning_rate": 1.715076071922545e-06, "logits/chosen": 0.956490159034729, "logits/rejected": 2.7623844146728516, "logps/chosen": -473.90185546875, "logps/rejected": -729.6797485351562, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -3.528240442276001, "rewards/margins": 13.174820899963379, "rewards/rejected": -16.703062057495117, "step": 1111 }, { "epoch": 0.6917573872472784, "grad_norm": 8.397978782653809, "learning_rate": 1.711618257261411e-06, "logits/chosen": -1.4854750633239746, "logits/rejected": 1.973698377609253, "logps/chosen": -410.59954833984375, "logps/rejected": -766.8997192382812, "loss": 0.124, "rewards/accuracies": 0.875, "rewards/chosen": -4.497822284698486, "rewards/margins": 11.80315113067627, "rewards/rejected": -16.300973892211914, "step": 1112 }, { "epoch": 0.6923794712286159, "grad_norm": 2.9602229595184326, "learning_rate": 1.7081604426002769e-06, "logits/chosen": -2.0298004150390625, "logits/rejected": 0.5238627195358276, "logps/chosen": -420.08233642578125, "logps/rejected": -646.8685302734375, "loss": 0.0354, "rewards/accuracies": 1.0, "rewards/chosen": -5.485627174377441, "rewards/margins": 8.015793800354004, "rewards/rejected": -13.501420974731445, "step": 1113 }, { "epoch": 0.6930015552099533, "grad_norm": 5.770056247711182, "learning_rate": 1.7047026279391426e-06, "logits/chosen": -1.4492266178131104, "logits/rejected": 2.720872640609741, "logps/chosen": -278.65765380859375, "logps/rejected": -714.165771484375, "loss": 0.1403, "rewards/accuracies": 0.875, "rewards/chosen": -2.6103525161743164, "rewards/margins": 13.180394172668457, "rewards/rejected": -15.790748596191406, "step": 1114 }, { "epoch": 0.6936236391912908, "grad_norm": 0.570961058139801, "learning_rate": 1.7012448132780084e-06, "logits/chosen": -2.1260085105895996, "logits/rejected": 0.6073211431503296, "logps/chosen": -520.1668701171875, "logps/rejected": -818.4796142578125, "loss": 0.0092, "rewards/accuracies": 1.0, "rewards/chosen": -4.7443037033081055, "rewards/margins": 13.07916259765625, "rewards/rejected": -17.823467254638672, "step": 1115 }, { "epoch": 0.6942457231726283, "grad_norm": 0.044903095811605453, "learning_rate": 1.6977869986168743e-06, "logits/chosen": -1.058962345123291, "logits/rejected": 1.8916115760803223, "logps/chosen": -370.6036071777344, "logps/rejected": -723.4937744140625, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -1.3580021858215332, "rewards/margins": 13.278923988342285, "rewards/rejected": -14.636927604675293, "step": 1116 }, { "epoch": 0.6948678071539658, "grad_norm": 0.3557204604148865, "learning_rate": 1.6943291839557402e-06, "logits/chosen": -2.9158737659454346, "logits/rejected": 2.2547740936279297, "logps/chosen": -332.3334655761719, "logps/rejected": -814.16748046875, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/chosen": -2.1664812564849854, "rewards/margins": 11.296545028686523, "rewards/rejected": -13.46302604675293, "step": 1117 }, { "epoch": 0.6954898911353032, "grad_norm": 4.956876754760742, "learning_rate": 1.690871369294606e-06, "logits/chosen": -0.14785760641098022, "logits/rejected": 0.9937933683395386, "logps/chosen": -481.3138427734375, "logps/rejected": -637.0747680664062, "loss": 0.1928, "rewards/accuracies": 0.875, "rewards/chosen": -2.6469762325286865, "rewards/margins": 7.194599628448486, "rewards/rejected": -9.841575622558594, "step": 1118 }, { "epoch": 0.6961119751166407, "grad_norm": 6.928215980529785, "learning_rate": 1.687413554633472e-06, "logits/chosen": -1.4179723262786865, "logits/rejected": 2.667884111404419, "logps/chosen": -415.31768798828125, "logps/rejected": -771.015380859375, "loss": 0.1118, "rewards/accuracies": 0.875, "rewards/chosen": -2.110477924346924, "rewards/margins": 11.644866943359375, "rewards/rejected": -13.755345344543457, "step": 1119 }, { "epoch": 0.6967340590979783, "grad_norm": 0.9137997627258301, "learning_rate": 1.6839557399723377e-06, "logits/chosen": -2.7979722023010254, "logits/rejected": 1.8717658519744873, "logps/chosen": -295.9502258300781, "logps/rejected": -706.8751220703125, "loss": 0.0334, "rewards/accuracies": 1.0, "rewards/chosen": -2.192861795425415, "rewards/margins": 13.385395050048828, "rewards/rejected": -15.57825756072998, "step": 1120 }, { "epoch": 0.6973561430793157, "grad_norm": 0.00018486895714886487, "learning_rate": 1.6804979253112035e-06, "logits/chosen": 0.403113454580307, "logits/rejected": 2.1745870113372803, "logps/chosen": -677.9456787109375, "logps/rejected": -906.7767333984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.1944336891174316, "rewards/margins": 15.367548942565918, "rewards/rejected": -17.561981201171875, "step": 1121 }, { "epoch": 0.6979782270606532, "grad_norm": 0.032116781920194626, "learning_rate": 1.6770401106500692e-06, "logits/chosen": 0.0689801573753357, "logits/rejected": 1.6815273761749268, "logps/chosen": -553.7499389648438, "logps/rejected": -789.6849975585938, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -5.846442222595215, "rewards/margins": 11.27425765991211, "rewards/rejected": -17.12070083618164, "step": 1122 }, { "epoch": 0.6986003110419907, "grad_norm": 0.3950237035751343, "learning_rate": 1.673582295988935e-06, "logits/chosen": 0.2665238082408905, "logits/rejected": 1.1012974977493286, "logps/chosen": -512.2118530273438, "logps/rejected": -735.1558227539062, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": -4.364822864532471, "rewards/margins": 12.899419784545898, "rewards/rejected": -17.26424217224121, "step": 1123 }, { "epoch": 0.6992223950233282, "grad_norm": 9.305964469909668, "learning_rate": 1.6701244813278008e-06, "logits/chosen": 0.05828496813774109, "logits/rejected": 1.5214295387268066, "logps/chosen": -448.3961181640625, "logps/rejected": -658.0900268554688, "loss": 0.2315, "rewards/accuracies": 0.75, "rewards/chosen": -1.895863652229309, "rewards/margins": 8.475996017456055, "rewards/rejected": -10.371859550476074, "step": 1124 }, { "epoch": 0.6998444790046656, "grad_norm": 2.1334726810455322, "learning_rate": 1.6666666666666667e-06, "logits/chosen": -3.159213066101074, "logits/rejected": 0.04908277094364166, "logps/chosen": -397.90032958984375, "logps/rejected": -726.68994140625, "loss": 0.0448, "rewards/accuracies": 1.0, "rewards/chosen": -3.7977190017700195, "rewards/margins": 11.092351913452148, "rewards/rejected": -14.890070915222168, "step": 1125 }, { "epoch": 0.7004665629860031, "grad_norm": 0.011673110537230968, "learning_rate": 1.6632088520055325e-06, "logits/chosen": 0.6486165523529053, "logits/rejected": 2.0914106369018555, "logps/chosen": -475.814697265625, "logps/rejected": -649.7374877929688, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -3.836455821990967, "rewards/margins": 11.970232963562012, "rewards/rejected": -15.80668830871582, "step": 1126 }, { "epoch": 0.7010886469673406, "grad_norm": 12.183952331542969, "learning_rate": 1.6597510373443984e-06, "logits/chosen": -0.10396319627761841, "logits/rejected": 3.231382131576538, "logps/chosen": -546.6926879882812, "logps/rejected": -909.4453735351562, "loss": 0.3461, "rewards/accuracies": 0.875, "rewards/chosen": -4.467593669891357, "rewards/margins": 14.48413372039795, "rewards/rejected": -18.95172691345215, "step": 1127 }, { "epoch": 0.7017107309486781, "grad_norm": 0.2624814510345459, "learning_rate": 1.6562932226832643e-06, "logits/chosen": -1.2583670616149902, "logits/rejected": -0.031533755362033844, "logps/chosen": -414.7713623046875, "logps/rejected": -609.9779052734375, "loss": 0.0062, "rewards/accuracies": 1.0, "rewards/chosen": -3.7613840103149414, "rewards/margins": 10.085761070251465, "rewards/rejected": -13.84714412689209, "step": 1128 }, { "epoch": 0.7023328149300155, "grad_norm": 0.014767914079129696, "learning_rate": 1.6528354080221302e-06, "logits/chosen": -1.8991444110870361, "logits/rejected": 1.3560476303100586, "logps/chosen": -395.64764404296875, "logps/rejected": -734.5914306640625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -3.010532855987549, "rewards/margins": 14.391338348388672, "rewards/rejected": -17.401870727539062, "step": 1129 }, { "epoch": 0.702954898911353, "grad_norm": 5.673132419586182, "learning_rate": 1.6493775933609959e-06, "logits/chosen": 0.4276261329650879, "logits/rejected": 2.2243545055389404, "logps/chosen": -603.572265625, "logps/rejected": -876.7821044921875, "loss": 0.113, "rewards/accuracies": 0.875, "rewards/chosen": -4.423176288604736, "rewards/margins": 10.759359359741211, "rewards/rejected": -15.182535171508789, "step": 1130 }, { "epoch": 0.7035769828926906, "grad_norm": 0.0113848727196455, "learning_rate": 1.6459197786998618e-06, "logits/chosen": -0.4110603630542755, "logits/rejected": 0.5374962091445923, "logps/chosen": -521.80419921875, "logps/rejected": -659.2481689453125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -3.362962007522583, "rewards/margins": 10.852470397949219, "rewards/rejected": -14.215433120727539, "step": 1131 }, { "epoch": 0.704199066874028, "grad_norm": 8.364277839660645, "learning_rate": 1.6424619640387277e-06, "logits/chosen": -3.4958910942077637, "logits/rejected": 0.49035927653312683, "logps/chosen": -375.2361755371094, "logps/rejected": -797.0616455078125, "loss": 0.1427, "rewards/accuracies": 0.875, "rewards/chosen": -2.2565126419067383, "rewards/margins": 15.421339988708496, "rewards/rejected": -17.6778507232666, "step": 1132 }, { "epoch": 0.7048211508553655, "grad_norm": 9.506075859069824, "learning_rate": 1.6390041493775935e-06, "logits/chosen": -1.719527006149292, "logits/rejected": 2.152451992034912, "logps/chosen": -439.39910888671875, "logps/rejected": -857.5626220703125, "loss": 0.2284, "rewards/accuracies": 0.875, "rewards/chosen": -3.988229751586914, "rewards/margins": 10.314420700073242, "rewards/rejected": -14.30264949798584, "step": 1133 }, { "epoch": 0.7054432348367029, "grad_norm": 0.951762855052948, "learning_rate": 1.6355463347164594e-06, "logits/chosen": -0.8138654232025146, "logits/rejected": 1.3860888481140137, "logps/chosen": -505.2121887207031, "logps/rejected": -714.35400390625, "loss": 0.0136, "rewards/accuracies": 1.0, "rewards/chosen": -2.314666748046875, "rewards/margins": 9.325754165649414, "rewards/rejected": -11.640420913696289, "step": 1134 }, { "epoch": 0.7060653188180405, "grad_norm": 1.0158611536026, "learning_rate": 1.632088520055325e-06, "logits/chosen": -2.618993043899536, "logits/rejected": 1.1887083053588867, "logps/chosen": -388.04931640625, "logps/rejected": -760.1138916015625, "loss": 0.0123, "rewards/accuracies": 1.0, "rewards/chosen": -3.77988862991333, "rewards/margins": 10.658020973205566, "rewards/rejected": -14.437910079956055, "step": 1135 }, { "epoch": 0.7066874027993779, "grad_norm": 4.441841125488281, "learning_rate": 1.628630705394191e-06, "logits/chosen": -3.430938243865967, "logits/rejected": 0.6349539756774902, "logps/chosen": -338.98529052734375, "logps/rejected": -766.0817260742188, "loss": 0.0612, "rewards/accuracies": 1.0, "rewards/chosen": -3.28741717338562, "rewards/margins": 10.986239433288574, "rewards/rejected": -14.273656845092773, "step": 1136 }, { "epoch": 0.7073094867807154, "grad_norm": 3.352576732635498, "learning_rate": 1.6251728907330569e-06, "logits/chosen": 0.11717858910560608, "logits/rejected": 2.080000162124634, "logps/chosen": -538.0499267578125, "logps/rejected": -719.8656005859375, "loss": 0.0398, "rewards/accuracies": 1.0, "rewards/chosen": -3.96099853515625, "rewards/margins": 6.785627365112305, "rewards/rejected": -10.746625900268555, "step": 1137 }, { "epoch": 0.7079315707620529, "grad_norm": 1.8192871809005737, "learning_rate": 1.6217150760719228e-06, "logits/chosen": -2.6265740394592285, "logits/rejected": 1.5469976663589478, "logps/chosen": -377.98468017578125, "logps/rejected": -754.6124267578125, "loss": 0.0232, "rewards/accuracies": 1.0, "rewards/chosen": -3.4503889083862305, "rewards/margins": 11.081504821777344, "rewards/rejected": -14.53189468383789, "step": 1138 }, { "epoch": 0.7085536547433904, "grad_norm": 5.786879539489746, "learning_rate": 1.6182572614107886e-06, "logits/chosen": -0.760535717010498, "logits/rejected": 0.8460087776184082, "logps/chosen": -546.8169555664062, "logps/rejected": -749.6250610351562, "loss": 0.2398, "rewards/accuracies": 0.875, "rewards/chosen": -3.8657684326171875, "rewards/margins": 9.522623062133789, "rewards/rejected": -13.388391494750977, "step": 1139 }, { "epoch": 0.7091757387247278, "grad_norm": 1.0393327474594116, "learning_rate": 1.6147994467496545e-06, "logits/chosen": 0.056744277477264404, "logits/rejected": 2.234447717666626, "logps/chosen": -520.0684814453125, "logps/rejected": -817.3350219726562, "loss": 0.012, "rewards/accuracies": 1.0, "rewards/chosen": -4.211949348449707, "rewards/margins": 12.301795959472656, "rewards/rejected": -16.513744354248047, "step": 1140 }, { "epoch": 0.7097978227060653, "grad_norm": 1.2862509489059448, "learning_rate": 1.6113416320885202e-06, "logits/chosen": -1.269035816192627, "logits/rejected": 1.7601583003997803, "logps/chosen": -503.9523620605469, "logps/rejected": -882.49169921875, "loss": 0.0195, "rewards/accuracies": 1.0, "rewards/chosen": -4.358367443084717, "rewards/margins": 14.955290794372559, "rewards/rejected": -19.313657760620117, "step": 1141 }, { "epoch": 0.7104199066874028, "grad_norm": 8.040385246276855, "learning_rate": 1.607883817427386e-06, "logits/chosen": 0.43578076362609863, "logits/rejected": 2.28662109375, "logps/chosen": -458.2288513183594, "logps/rejected": -682.0508422851562, "loss": 0.1584, "rewards/accuracies": 0.875, "rewards/chosen": -2.6926610469818115, "rewards/margins": 8.59858512878418, "rewards/rejected": -11.291245460510254, "step": 1142 }, { "epoch": 0.7110419906687403, "grad_norm": 3.8463568687438965, "learning_rate": 1.604426002766252e-06, "logits/chosen": -0.20588147640228271, "logits/rejected": 2.2962594032287598, "logps/chosen": -528.2001953125, "logps/rejected": -824.1529541015625, "loss": 0.0511, "rewards/accuracies": 1.0, "rewards/chosen": -5.612648010253906, "rewards/margins": 10.496711730957031, "rewards/rejected": -16.109359741210938, "step": 1143 }, { "epoch": 0.7116640746500777, "grad_norm": 2.1915981769561768, "learning_rate": 1.6009681881051176e-06, "logits/chosen": -4.549889087677002, "logits/rejected": 0.3215818405151367, "logps/chosen": -311.5777587890625, "logps/rejected": -877.8465576171875, "loss": 0.0211, "rewards/accuracies": 1.0, "rewards/chosen": -2.6836485862731934, "rewards/margins": 12.616403579711914, "rewards/rejected": -15.300052642822266, "step": 1144 }, { "epoch": 0.7122861586314152, "grad_norm": 0.05165935680270195, "learning_rate": 1.5975103734439833e-06, "logits/chosen": -1.6857300996780396, "logits/rejected": 0.6297056078910828, "logps/chosen": -528.4012451171875, "logps/rejected": -804.4505004882812, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -4.800350666046143, "rewards/margins": 13.275749206542969, "rewards/rejected": -18.076099395751953, "step": 1145 }, { "epoch": 0.7129082426127528, "grad_norm": 0.011632733047008514, "learning_rate": 1.5940525587828492e-06, "logits/chosen": -3.3893883228302, "logits/rejected": 1.394503116607666, "logps/chosen": -238.62149047851562, "logps/rejected": -758.5062866210938, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.607045888900757, "rewards/margins": 16.020538330078125, "rewards/rejected": -18.627586364746094, "step": 1146 }, { "epoch": 0.7135303265940902, "grad_norm": 5.158133506774902, "learning_rate": 1.590594744121715e-06, "logits/chosen": 0.8793729543685913, "logits/rejected": 2.5050103664398193, "logps/chosen": -512.16748046875, "logps/rejected": -746.3818359375, "loss": 0.0692, "rewards/accuracies": 1.0, "rewards/chosen": -4.257543563842773, "rewards/margins": 10.564891815185547, "rewards/rejected": -14.822434425354004, "step": 1147 }, { "epoch": 0.7141524105754277, "grad_norm": 0.01271029468625784, "learning_rate": 1.587136929460581e-06, "logits/chosen": -2.7259342670440674, "logits/rejected": -0.10426785051822662, "logps/chosen": -421.90106201171875, "logps/rejected": -790.1093139648438, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -5.0931925773620605, "rewards/margins": 15.133329391479492, "rewards/rejected": -20.226520538330078, "step": 1148 }, { "epoch": 0.7147744945567651, "grad_norm": 0.09410641342401505, "learning_rate": 1.5836791147994469e-06, "logits/chosen": -1.5038176774978638, "logits/rejected": 0.8887146711349487, "logps/chosen": -433.1878662109375, "logps/rejected": -683.9208374023438, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -2.4951891899108887, "rewards/margins": 10.006722450256348, "rewards/rejected": -12.501911163330078, "step": 1149 }, { "epoch": 0.7153965785381027, "grad_norm": 1.4610326290130615, "learning_rate": 1.5802213001383128e-06, "logits/chosen": -1.3296780586242676, "logits/rejected": 2.161696195602417, "logps/chosen": -354.413330078125, "logps/rejected": -704.2504272460938, "loss": 0.0371, "rewards/accuracies": 1.0, "rewards/chosen": -2.9581923484802246, "rewards/margins": 12.033364295959473, "rewards/rejected": -14.991556167602539, "step": 1150 }, { "epoch": 0.7160186625194401, "grad_norm": 0.22405344247817993, "learning_rate": 1.5767634854771784e-06, "logits/chosen": -1.4017596244812012, "logits/rejected": 1.5163604021072388, "logps/chosen": -520.4783325195312, "logps/rejected": -791.8408203125, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -3.712601661682129, "rewards/margins": 13.67676830291748, "rewards/rejected": -17.38936996459961, "step": 1151 }, { "epoch": 0.7166407465007776, "grad_norm": 5.207883358001709, "learning_rate": 1.5733056708160443e-06, "logits/chosen": -0.18849143385887146, "logits/rejected": 1.4558908939361572, "logps/chosen": -468.4718017578125, "logps/rejected": -723.547607421875, "loss": 0.0633, "rewards/accuracies": 1.0, "rewards/chosen": -6.326025009155273, "rewards/margins": 10.292337417602539, "rewards/rejected": -16.618362426757812, "step": 1152 }, { "epoch": 0.717262830482115, "grad_norm": 1.580371379852295, "learning_rate": 1.5698478561549102e-06, "logits/chosen": -2.420145034790039, "logits/rejected": 2.7650609016418457, "logps/chosen": -432.6175231933594, "logps/rejected": -932.2700805664062, "loss": 0.0156, "rewards/accuracies": 1.0, "rewards/chosen": -6.1257829666137695, "rewards/margins": 14.688467979431152, "rewards/rejected": -20.814250946044922, "step": 1153 }, { "epoch": 0.7178849144634526, "grad_norm": 0.00026385500677861273, "learning_rate": 1.566390041493776e-06, "logits/chosen": -2.582540988922119, "logits/rejected": 2.4641976356506348, "logps/chosen": -273.9823303222656, "logps/rejected": -806.62158203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.9919500350952148, "rewards/margins": 17.37290382385254, "rewards/rejected": -19.364852905273438, "step": 1154 }, { "epoch": 0.71850699844479, "grad_norm": 0.006291497033089399, "learning_rate": 1.562932226832642e-06, "logits/chosen": 0.9827436804771423, "logits/rejected": 2.800081491470337, "logps/chosen": -551.9671020507812, "logps/rejected": -796.28564453125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.7596194744110107, "rewards/margins": 12.890427589416504, "rewards/rejected": -16.650047302246094, "step": 1155 }, { "epoch": 0.7191290824261275, "grad_norm": 12.246211051940918, "learning_rate": 1.5594744121715076e-06, "logits/chosen": 0.577423095703125, "logits/rejected": 1.1642279624938965, "logps/chosen": -515.6978149414062, "logps/rejected": -720.38037109375, "loss": 1.1458, "rewards/accuracies": 0.75, "rewards/chosen": -3.65787410736084, "rewards/margins": 6.078922748565674, "rewards/rejected": -9.736796379089355, "step": 1156 }, { "epoch": 0.7197511664074651, "grad_norm": 0.16573543846607208, "learning_rate": 1.5560165975103735e-06, "logits/chosen": -2.861276626586914, "logits/rejected": 1.6824699640274048, "logps/chosen": -366.15740966796875, "logps/rejected": -789.825927734375, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -4.3959503173828125, "rewards/margins": 11.087020874023438, "rewards/rejected": -15.48297119140625, "step": 1157 }, { "epoch": 0.7203732503888025, "grad_norm": 14.14643669128418, "learning_rate": 1.5525587828492394e-06, "logits/chosen": 0.7038554549217224, "logits/rejected": 1.5344527959823608, "logps/chosen": -534.3895874023438, "logps/rejected": -713.1590576171875, "loss": 0.2072, "rewards/accuracies": 0.875, "rewards/chosen": -4.204388618469238, "rewards/margins": 10.015838623046875, "rewards/rejected": -14.220227241516113, "step": 1158 }, { "epoch": 0.72099533437014, "grad_norm": 9.952912330627441, "learning_rate": 1.5491009681881053e-06, "logits/chosen": 0.6553945541381836, "logits/rejected": 2.4073400497436523, "logps/chosen": -479.4063720703125, "logps/rejected": -737.9957275390625, "loss": 0.3882, "rewards/accuracies": 0.75, "rewards/chosen": -4.027148723602295, "rewards/margins": 11.115606307983398, "rewards/rejected": -15.142754554748535, "step": 1159 }, { "epoch": 0.7216174183514774, "grad_norm": 12.956445693969727, "learning_rate": 1.5456431535269712e-06, "logits/chosen": -2.223792552947998, "logits/rejected": 2.3121845722198486, "logps/chosen": -406.50689697265625, "logps/rejected": -702.6038818359375, "loss": 0.9655, "rewards/accuracies": 0.875, "rewards/chosen": -3.8724143505096436, "rewards/margins": 9.82856559753418, "rewards/rejected": -13.700980186462402, "step": 1160 }, { "epoch": 0.722239502332815, "grad_norm": 1.4429672956466675, "learning_rate": 1.542185338865837e-06, "logits/chosen": -0.04577261209487915, "logits/rejected": 1.5161032676696777, "logps/chosen": -522.6357421875, "logps/rejected": -780.0931396484375, "loss": 0.0231, "rewards/accuracies": 1.0, "rewards/chosen": -5.561697959899902, "rewards/margins": 8.892523765563965, "rewards/rejected": -14.45422077178955, "step": 1161 }, { "epoch": 0.7228615863141524, "grad_norm": 6.408003807067871, "learning_rate": 1.5387275242047028e-06, "logits/chosen": -0.9005637168884277, "logits/rejected": 0.4727382957935333, "logps/chosen": -513.85302734375, "logps/rejected": -753.517333984375, "loss": 0.3459, "rewards/accuracies": 0.875, "rewards/chosen": -4.203969955444336, "rewards/margins": 11.437922477722168, "rewards/rejected": -15.641891479492188, "step": 1162 }, { "epoch": 0.7234836702954899, "grad_norm": 2.938655138015747, "learning_rate": 1.5352697095435686e-06, "logits/chosen": -2.383744478225708, "logits/rejected": 0.6305767893791199, "logps/chosen": -408.9247741699219, "logps/rejected": -768.7081909179688, "loss": 0.0462, "rewards/accuracies": 1.0, "rewards/chosen": -4.102982997894287, "rewards/margins": 12.922101974487305, "rewards/rejected": -17.02508544921875, "step": 1163 }, { "epoch": 0.7241057542768273, "grad_norm": 10.900776863098145, "learning_rate": 1.5318118948824345e-06, "logits/chosen": -2.163926601409912, "logits/rejected": 0.9711679220199585, "logps/chosen": -433.38897705078125, "logps/rejected": -730.8160400390625, "loss": 0.2675, "rewards/accuracies": 0.875, "rewards/chosen": -3.962512254714966, "rewards/margins": 11.079669952392578, "rewards/rejected": -15.042181968688965, "step": 1164 }, { "epoch": 0.7247278382581649, "grad_norm": 10.95413589477539, "learning_rate": 1.5283540802213004e-06, "logits/chosen": -0.8407635688781738, "logits/rejected": 1.9003974199295044, "logps/chosen": -415.8904113769531, "logps/rejected": -703.5606689453125, "loss": 0.1816, "rewards/accuracies": 0.875, "rewards/chosen": -4.042840003967285, "rewards/margins": 11.096967697143555, "rewards/rejected": -15.139809608459473, "step": 1165 }, { "epoch": 0.7253499222395023, "grad_norm": 8.489850044250488, "learning_rate": 1.5248962655601663e-06, "logits/chosen": -1.3141114711761475, "logits/rejected": 1.2827707529067993, "logps/chosen": -473.940185546875, "logps/rejected": -651.896484375, "loss": 0.0926, "rewards/accuracies": 0.875, "rewards/chosen": -6.029555320739746, "rewards/margins": 11.222504615783691, "rewards/rejected": -17.252059936523438, "step": 1166 }, { "epoch": 0.7259720062208398, "grad_norm": 0.45654332637786865, "learning_rate": 1.5214384508990318e-06, "logits/chosen": 0.9646785259246826, "logits/rejected": 1.731150507926941, "logps/chosen": -603.2440185546875, "logps/rejected": -732.8345947265625, "loss": 0.0069, "rewards/accuracies": 1.0, "rewards/chosen": -3.3700270652770996, "rewards/margins": 10.47246265411377, "rewards/rejected": -13.842489242553711, "step": 1167 }, { "epoch": 0.7265940902021772, "grad_norm": 0.11650560796260834, "learning_rate": 1.5179806362378976e-06, "logits/chosen": 0.506636917591095, "logits/rejected": 1.0808582305908203, "logps/chosen": -586.63330078125, "logps/rejected": -732.0504150390625, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -6.385423183441162, "rewards/margins": 11.841510772705078, "rewards/rejected": -18.226932525634766, "step": 1168 }, { "epoch": 0.7272161741835148, "grad_norm": 5.31183385848999, "learning_rate": 1.5145228215767635e-06, "logits/chosen": -1.230210542678833, "logits/rejected": 2.2017931938171387, "logps/chosen": -406.00408935546875, "logps/rejected": -829.4962768554688, "loss": 0.0762, "rewards/accuracies": 1.0, "rewards/chosen": -3.5606205463409424, "rewards/margins": 12.957470893859863, "rewards/rejected": -16.518091201782227, "step": 1169 }, { "epoch": 0.7278382581648523, "grad_norm": 7.4230732917785645, "learning_rate": 1.5110650069156294e-06, "logits/chosen": -1.0817581415176392, "logits/rejected": 0.9864222407341003, "logps/chosen": -527.3875732421875, "logps/rejected": -813.91943359375, "loss": 0.0906, "rewards/accuracies": 1.0, "rewards/chosen": -6.438192367553711, "rewards/margins": 10.822029113769531, "rewards/rejected": -17.260223388671875, "step": 1170 }, { "epoch": 0.7284603421461897, "grad_norm": 0.6591944694519043, "learning_rate": 1.5076071922544953e-06, "logits/chosen": -1.7693991661071777, "logits/rejected": 1.491078495979309, "logps/chosen": -450.1063232421875, "logps/rejected": -815.431640625, "loss": 0.0085, "rewards/accuracies": 1.0, "rewards/chosen": -4.752045631408691, "rewards/margins": 12.34725284576416, "rewards/rejected": -17.09929847717285, "step": 1171 }, { "epoch": 0.7290824261275272, "grad_norm": 5.738795280456543, "learning_rate": 1.504149377593361e-06, "logits/chosen": -1.250427007675171, "logits/rejected": 1.7064287662506104, "logps/chosen": -471.4356689453125, "logps/rejected": -743.809814453125, "loss": 0.1009, "rewards/accuracies": 1.0, "rewards/chosen": -3.600668430328369, "rewards/margins": 9.99831771850586, "rewards/rejected": -13.598987579345703, "step": 1172 }, { "epoch": 0.7297045101088647, "grad_norm": 3.1799190044403076, "learning_rate": 1.5006915629322269e-06, "logits/chosen": -0.561762273311615, "logits/rejected": 1.9553264379501343, "logps/chosen": -408.5714111328125, "logps/rejected": -672.8405151367188, "loss": 0.0498, "rewards/accuracies": 1.0, "rewards/chosen": -4.673155784606934, "rewards/margins": 12.669715881347656, "rewards/rejected": -17.342872619628906, "step": 1173 }, { "epoch": 0.7303265940902022, "grad_norm": 1.1645147800445557, "learning_rate": 1.4972337482710927e-06, "logits/chosen": -1.1560033559799194, "logits/rejected": 2.790388345718384, "logps/chosen": -320.25909423828125, "logps/rejected": -690.1944580078125, "loss": 0.0226, "rewards/accuracies": 1.0, "rewards/chosen": -4.178570747375488, "rewards/margins": 13.431252479553223, "rewards/rejected": -17.609821319580078, "step": 1174 }, { "epoch": 0.7309486780715396, "grad_norm": 1.3553528785705566, "learning_rate": 1.4937759336099586e-06, "logits/chosen": -0.9110782742500305, "logits/rejected": 2.138676643371582, "logps/chosen": -595.1483764648438, "logps/rejected": -933.8458862304688, "loss": 0.0131, "rewards/accuracies": 1.0, "rewards/chosen": -4.9914469718933105, "rewards/margins": 10.823322296142578, "rewards/rejected": -15.814767837524414, "step": 1175 }, { "epoch": 0.7315707620528772, "grad_norm": 0.006906880997121334, "learning_rate": 1.4903181189488245e-06, "logits/chosen": -4.405219078063965, "logits/rejected": 0.5938413739204407, "logps/chosen": -341.02899169921875, "logps/rejected": -842.62841796875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -1.9540653228759766, "rewards/margins": 15.0906982421875, "rewards/rejected": -17.044761657714844, "step": 1176 }, { "epoch": 0.7321928460342146, "grad_norm": 0.2723273038864136, "learning_rate": 1.4868603042876902e-06, "logits/chosen": -1.9786365032196045, "logits/rejected": 1.7471815347671509, "logps/chosen": -348.24658203125, "logps/rejected": -717.231201171875, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": -3.5691189765930176, "rewards/margins": 11.468405723571777, "rewards/rejected": -15.037525177001953, "step": 1177 }, { "epoch": 0.7328149300155521, "grad_norm": 0.18075311183929443, "learning_rate": 1.483402489626556e-06, "logits/chosen": -2.7961831092834473, "logits/rejected": 1.2335789203643799, "logps/chosen": -317.2989196777344, "logps/rejected": -716.1901245117188, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": -1.4208152294158936, "rewards/margins": 13.70668888092041, "rewards/rejected": -15.127504348754883, "step": 1178 }, { "epoch": 0.7334370139968895, "grad_norm": 2.8972949981689453, "learning_rate": 1.479944674965422e-06, "logits/chosen": -0.7941773533821106, "logits/rejected": 0.5161871314048767, "logps/chosen": -632.91650390625, "logps/rejected": -810.6824340820312, "loss": 0.0336, "rewards/accuracies": 1.0, "rewards/chosen": -5.0617523193359375, "rewards/margins": 12.403446197509766, "rewards/rejected": -17.465198516845703, "step": 1179 }, { "epoch": 0.7340590979782271, "grad_norm": 9.28952693939209, "learning_rate": 1.4764868603042879e-06, "logits/chosen": 1.601052165031433, "logits/rejected": 2.922710418701172, "logps/chosen": -671.0143432617188, "logps/rejected": -821.1246337890625, "loss": 0.3607, "rewards/accuracies": 0.75, "rewards/chosen": -3.616194009780884, "rewards/margins": 8.66051959991455, "rewards/rejected": -12.276714324951172, "step": 1180 }, { "epoch": 0.7346811819595646, "grad_norm": 0.009370475076138973, "learning_rate": 1.4730290456431537e-06, "logits/chosen": -2.292464256286621, "logits/rejected": 1.1543776988983154, "logps/chosen": -484.7025451660156, "logps/rejected": -853.7625732421875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -6.74615478515625, "rewards/margins": 16.83014488220215, "rewards/rejected": -23.5762996673584, "step": 1181 }, { "epoch": 0.735303265940902, "grad_norm": 0.004507301840931177, "learning_rate": 1.4695712309820196e-06, "logits/chosen": -1.7925007343292236, "logits/rejected": 0.7398731708526611, "logps/chosen": -297.6701965332031, "logps/rejected": -645.4959716796875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -2.7627081871032715, "rewards/margins": 14.718207359313965, "rewards/rejected": -17.480915069580078, "step": 1182 }, { "epoch": 0.7359253499222395, "grad_norm": 2.3407843112945557, "learning_rate": 1.4661134163208853e-06, "logits/chosen": -1.3881579637527466, "logits/rejected": 1.2950356006622314, "logps/chosen": -572.4093627929688, "logps/rejected": -770.6861572265625, "loss": 0.0275, "rewards/accuracies": 1.0, "rewards/chosen": -5.865146160125732, "rewards/margins": 10.927026748657227, "rewards/rejected": -16.792173385620117, "step": 1183 }, { "epoch": 0.736547433903577, "grad_norm": 0.1942383050918579, "learning_rate": 1.4626556016597512e-06, "logits/chosen": -1.4892041683197021, "logits/rejected": 1.2450973987579346, "logps/chosen": -423.97662353515625, "logps/rejected": -753.7246704101562, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -4.169781684875488, "rewards/margins": 15.80416488647461, "rewards/rejected": -19.97394561767578, "step": 1184 }, { "epoch": 0.7371695178849145, "grad_norm": 0.017992381006479263, "learning_rate": 1.459197786998617e-06, "logits/chosen": -0.6590406894683838, "logits/rejected": 0.9250731468200684, "logps/chosen": -574.4696044921875, "logps/rejected": -850.411376953125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -5.879560947418213, "rewards/margins": 13.470085144042969, "rewards/rejected": -19.349645614624023, "step": 1185 }, { "epoch": 0.7377916018662519, "grad_norm": 12.010857582092285, "learning_rate": 1.455739972337483e-06, "logits/chosen": 0.6955028772354126, "logits/rejected": 1.3440337181091309, "logps/chosen": -685.5115356445312, "logps/rejected": -782.8362426757812, "loss": 0.2584, "rewards/accuracies": 0.75, "rewards/chosen": -4.31658935546875, "rewards/margins": 6.152716159820557, "rewards/rejected": -10.469305038452148, "step": 1186 }, { "epoch": 0.7384136858475894, "grad_norm": 1.1073112487792969, "learning_rate": 1.4522821576763488e-06, "logits/chosen": -0.14232800900936127, "logits/rejected": 2.376594305038452, "logps/chosen": -571.20947265625, "logps/rejected": -923.0261840820312, "loss": 0.0123, "rewards/accuracies": 1.0, "rewards/chosen": -5.747735977172852, "rewards/margins": 14.872795104980469, "rewards/rejected": -20.62053108215332, "step": 1187 }, { "epoch": 0.7390357698289269, "grad_norm": 0.11369112879037857, "learning_rate": 1.4488243430152147e-06, "logits/chosen": 0.07817887514829636, "logits/rejected": 3.6601195335388184, "logps/chosen": -461.837890625, "logps/rejected": -765.26611328125, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -2.3528988361358643, "rewards/margins": 16.428327560424805, "rewards/rejected": -18.781227111816406, "step": 1188 }, { "epoch": 0.7396578538102644, "grad_norm": 1.7109262943267822, "learning_rate": 1.4453665283540804e-06, "logits/chosen": -3.0373752117156982, "logits/rejected": 2.0595664978027344, "logps/chosen": -300.2303161621094, "logps/rejected": -816.4912109375, "loss": 0.0178, "rewards/accuracies": 1.0, "rewards/chosen": -3.3775954246520996, "rewards/margins": 16.370357513427734, "rewards/rejected": -19.747953414916992, "step": 1189 }, { "epoch": 0.7402799377916018, "grad_norm": 4.212658882141113, "learning_rate": 1.441908713692946e-06, "logits/chosen": -3.5580215454101562, "logits/rejected": 1.232956886291504, "logps/chosen": -389.397216796875, "logps/rejected": -762.8502197265625, "loss": 0.0443, "rewards/accuracies": 1.0, "rewards/chosen": -3.4514522552490234, "rewards/margins": 11.305686950683594, "rewards/rejected": -14.757139205932617, "step": 1190 }, { "epoch": 0.7409020217729394, "grad_norm": 38.220577239990234, "learning_rate": 1.438450899031812e-06, "logits/chosen": -0.2926905155181885, "logits/rejected": 1.493470311164856, "logps/chosen": -559.235107421875, "logps/rejected": -744.3903198242188, "loss": 0.3641, "rewards/accuracies": 0.875, "rewards/chosen": -6.159960746765137, "rewards/margins": 6.5682220458984375, "rewards/rejected": -12.728182792663574, "step": 1191 }, { "epoch": 0.7415241057542769, "grad_norm": 0.11958660185337067, "learning_rate": 1.4349930843706778e-06, "logits/chosen": -4.171557903289795, "logits/rejected": 0.7387654185295105, "logps/chosen": -422.1143493652344, "logps/rejected": -902.22216796875, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -4.2943949699401855, "rewards/margins": 14.411043167114258, "rewards/rejected": -18.70543670654297, "step": 1192 }, { "epoch": 0.7421461897356143, "grad_norm": 12.561365127563477, "learning_rate": 1.4315352697095435e-06, "logits/chosen": -1.5998972654342651, "logits/rejected": -0.6377407312393188, "logps/chosen": -521.8119506835938, "logps/rejected": -774.1421508789062, "loss": 0.4011, "rewards/accuracies": 0.75, "rewards/chosen": -4.50105094909668, "rewards/margins": 10.003253936767578, "rewards/rejected": -14.504304885864258, "step": 1193 }, { "epoch": 0.7427682737169518, "grad_norm": 0.3583850562572479, "learning_rate": 1.4280774550484094e-06, "logits/chosen": 1.0509757995605469, "logits/rejected": 0.9459468722343445, "logps/chosen": -627.3001708984375, "logps/rejected": -736.188232421875, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": -4.605869770050049, "rewards/margins": 10.999320983886719, "rewards/rejected": -15.60519027709961, "step": 1194 }, { "epoch": 0.7433903576982893, "grad_norm": 3.186988115310669, "learning_rate": 1.4246196403872753e-06, "logits/chosen": -2.2397305965423584, "logits/rejected": 1.1159934997558594, "logps/chosen": -502.5550537109375, "logps/rejected": -881.2744140625, "loss": 0.0492, "rewards/accuracies": 1.0, "rewards/chosen": -5.385392189025879, "rewards/margins": 13.956791877746582, "rewards/rejected": -19.342182159423828, "step": 1195 }, { "epoch": 0.7440124416796268, "grad_norm": 0.7348478436470032, "learning_rate": 1.4211618257261412e-06, "logits/chosen": 0.5990583896636963, "logits/rejected": -0.4085114598274231, "logps/chosen": -603.859130859375, "logps/rejected": -760.9273071289062, "loss": 0.0122, "rewards/accuracies": 1.0, "rewards/chosen": -3.5537610054016113, "rewards/margins": 11.546674728393555, "rewards/rejected": -15.100435256958008, "step": 1196 }, { "epoch": 0.7446345256609642, "grad_norm": 1.1328163146972656, "learning_rate": 1.417704011065007e-06, "logits/chosen": -1.3089392185211182, "logits/rejected": 2.147690773010254, "logps/chosen": -422.98590087890625, "logps/rejected": -758.39306640625, "loss": 0.0112, "rewards/accuracies": 1.0, "rewards/chosen": -3.400853157043457, "rewards/margins": 13.991851806640625, "rewards/rejected": -17.3927059173584, "step": 1197 }, { "epoch": 0.7452566096423017, "grad_norm": 0.021472284570336342, "learning_rate": 1.4142461964038727e-06, "logits/chosen": -0.41622021794319153, "logits/rejected": 1.8125836849212646, "logps/chosen": -471.09478759765625, "logps/rejected": -749.3162841796875, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -4.3504791259765625, "rewards/margins": 11.26353645324707, "rewards/rejected": -15.614015579223633, "step": 1198 }, { "epoch": 0.7458786936236392, "grad_norm": 10.588460922241211, "learning_rate": 1.4107883817427386e-06, "logits/chosen": -1.0321152210235596, "logits/rejected": 0.8235028386116028, "logps/chosen": -470.266845703125, "logps/rejected": -690.5302124023438, "loss": 0.2698, "rewards/accuracies": 0.875, "rewards/chosen": -5.260035991668701, "rewards/margins": 9.501744270324707, "rewards/rejected": -14.761780738830566, "step": 1199 }, { "epoch": 0.7465007776049767, "grad_norm": 0.021125737577676773, "learning_rate": 1.4073305670816045e-06, "logits/chosen": -0.991514265537262, "logits/rejected": 0.3483167588710785, "logps/chosen": -466.12066650390625, "logps/rejected": -761.2283325195312, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -3.5139639377593994, "rewards/margins": 16.28211784362793, "rewards/rejected": -19.79608154296875, "step": 1200 }, { "epoch": 0.7471228615863141, "grad_norm": 0.9355553388595581, "learning_rate": 1.4038727524204704e-06, "logits/chosen": -2.556581974029541, "logits/rejected": 2.1694085597991943, "logps/chosen": -420.676513671875, "logps/rejected": -838.3154296875, "loss": 0.0089, "rewards/accuracies": 1.0, "rewards/chosen": -3.9424405097961426, "rewards/margins": 14.14123821258545, "rewards/rejected": -18.08367919921875, "step": 1201 }, { "epoch": 0.7477449455676516, "grad_norm": 0.04760259389877319, "learning_rate": 1.4004149377593363e-06, "logits/chosen": -4.169564247131348, "logits/rejected": 1.3665965795516968, "logps/chosen": -265.7261047363281, "logps/rejected": -806.938720703125, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.327432632446289, "rewards/margins": 17.277137756347656, "rewards/rejected": -20.604570388793945, "step": 1202 }, { "epoch": 0.7483670295489891, "grad_norm": 0.1427813619375229, "learning_rate": 1.3969571230982022e-06, "logits/chosen": -5.401397228240967, "logits/rejected": -0.9636671543121338, "logps/chosen": -314.22515869140625, "logps/rejected": -746.95751953125, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -4.765713691711426, "rewards/margins": 15.49448299407959, "rewards/rejected": -20.26019859313965, "step": 1203 }, { "epoch": 0.7489891135303266, "grad_norm": 9.485186576843262, "learning_rate": 1.3934993084370678e-06, "logits/chosen": -3.664107322692871, "logits/rejected": 1.6542599201202393, "logps/chosen": -322.4087829589844, "logps/rejected": -744.4974975585938, "loss": 0.3205, "rewards/accuracies": 0.875, "rewards/chosen": -1.2432278394699097, "rewards/margins": 9.54750919342041, "rewards/rejected": -10.79073715209961, "step": 1204 }, { "epoch": 0.749611197511664, "grad_norm": 10.001455307006836, "learning_rate": 1.3900414937759337e-06, "logits/chosen": -1.1418333053588867, "logits/rejected": -0.4370773434638977, "logps/chosen": -452.611572265625, "logps/rejected": -616.7849731445312, "loss": 0.2438, "rewards/accuracies": 0.875, "rewards/chosen": -3.5377511978149414, "rewards/margins": 6.775075912475586, "rewards/rejected": -10.312826156616211, "step": 1205 }, { "epoch": 0.7502332814930015, "grad_norm": 9.17621898651123, "learning_rate": 1.3865836791147996e-06, "logits/chosen": -0.7163441777229309, "logits/rejected": 1.8633050918579102, "logps/chosen": -566.4046020507812, "logps/rejected": -853.783935546875, "loss": 0.4393, "rewards/accuracies": 0.875, "rewards/chosen": -7.099792003631592, "rewards/margins": 13.496251106262207, "rewards/rejected": -20.59604263305664, "step": 1206 }, { "epoch": 0.7508553654743391, "grad_norm": 0.009980311617255211, "learning_rate": 1.3831258644536655e-06, "logits/chosen": -3.675764322280884, "logits/rejected": -0.1759061962366104, "logps/chosen": -416.973388671875, "logps/rejected": -762.86279296875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -4.011529445648193, "rewards/margins": 13.293500900268555, "rewards/rejected": -17.305030822753906, "step": 1207 }, { "epoch": 0.7514774494556765, "grad_norm": 1.4973187446594238, "learning_rate": 1.3796680497925314e-06, "logits/chosen": 0.28539103269577026, "logits/rejected": 1.5906161069869995, "logps/chosen": -481.5164489746094, "logps/rejected": -628.2119140625, "loss": 0.016, "rewards/accuracies": 1.0, "rewards/chosen": -4.556268692016602, "rewards/margins": 10.959025382995605, "rewards/rejected": -15.515295028686523, "step": 1208 }, { "epoch": 0.752099533437014, "grad_norm": 0.07746069133281708, "learning_rate": 1.3762102351313973e-06, "logits/chosen": -2.8204095363616943, "logits/rejected": 2.270554542541504, "logps/chosen": -470.9994812011719, "logps/rejected": -926.6956176757812, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -3.972426176071167, "rewards/margins": 13.719223976135254, "rewards/rejected": -17.691650390625, "step": 1209 }, { "epoch": 0.7527216174183515, "grad_norm": 2.3078672885894775, "learning_rate": 1.372752420470263e-06, "logits/chosen": -3.7805495262145996, "logits/rejected": -1.9660223722457886, "logps/chosen": -327.2149658203125, "logps/rejected": -497.6336975097656, "loss": 0.1468, "rewards/accuracies": 0.875, "rewards/chosen": -2.9098477363586426, "rewards/margins": 9.45797061920166, "rewards/rejected": -12.367820739746094, "step": 1210 }, { "epoch": 0.753343701399689, "grad_norm": 12.615619659423828, "learning_rate": 1.3692946058091288e-06, "logits/chosen": -2.6664481163024902, "logits/rejected": 1.8136637210845947, "logps/chosen": -424.7005310058594, "logps/rejected": -811.2325439453125, "loss": 0.4002, "rewards/accuracies": 0.875, "rewards/chosen": -2.156242609024048, "rewards/margins": 9.788275718688965, "rewards/rejected": -11.944518089294434, "step": 1211 }, { "epoch": 0.7539657853810264, "grad_norm": 8.353232383728027, "learning_rate": 1.3658367911479945e-06, "logits/chosen": 0.4568924307823181, "logits/rejected": 2.795104503631592, "logps/chosen": -627.4772338867188, "logps/rejected": -900.2474365234375, "loss": 0.1939, "rewards/accuracies": 0.875, "rewards/chosen": -4.603178977966309, "rewards/margins": 14.265107154846191, "rewards/rejected": -18.8682861328125, "step": 1212 }, { "epoch": 0.7545878693623639, "grad_norm": 0.004080959130078554, "learning_rate": 1.3623789764868604e-06, "logits/chosen": -0.9833959937095642, "logits/rejected": 1.9703147411346436, "logps/chosen": -412.8741455078125, "logps/rejected": -809.86474609375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -2.029632091522217, "rewards/margins": 17.01657485961914, "rewards/rejected": -19.046207427978516, "step": 1213 }, { "epoch": 0.7552099533437014, "grad_norm": 0.007544025778770447, "learning_rate": 1.358921161825726e-06, "logits/chosen": -3.814112424850464, "logits/rejected": 0.6169370412826538, "logps/chosen": -373.6060791015625, "logps/rejected": -840.278564453125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -4.721683502197266, "rewards/margins": 15.925864219665527, "rewards/rejected": -20.64754867553711, "step": 1214 }, { "epoch": 0.7558320373250389, "grad_norm": 0.26863396167755127, "learning_rate": 1.355463347164592e-06, "logits/chosen": -0.8833622336387634, "logits/rejected": 1.7980408668518066, "logps/chosen": -499.93865966796875, "logps/rejected": -755.989013671875, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": -4.626401424407959, "rewards/margins": 12.69153881072998, "rewards/rejected": -17.31793975830078, "step": 1215 }, { "epoch": 0.7564541213063763, "grad_norm": 11.663763046264648, "learning_rate": 1.3520055325034578e-06, "logits/chosen": 0.4759812355041504, "logits/rejected": 2.527010917663574, "logps/chosen": -558.2933959960938, "logps/rejected": -777.4957275390625, "loss": 0.3, "rewards/accuracies": 0.875, "rewards/chosen": -3.2170376777648926, "rewards/margins": 10.90041732788086, "rewards/rejected": -14.117454528808594, "step": 1216 }, { "epoch": 0.7570762052877138, "grad_norm": 16.229276657104492, "learning_rate": 1.3485477178423237e-06, "logits/chosen": -1.2370485067367554, "logits/rejected": 1.344881296157837, "logps/chosen": -440.12994384765625, "logps/rejected": -695.6482543945312, "loss": 0.6872, "rewards/accuracies": 0.75, "rewards/chosen": -6.70377254486084, "rewards/margins": 9.777131080627441, "rewards/rejected": -16.48090362548828, "step": 1217 }, { "epoch": 0.7576982892690514, "grad_norm": 0.0007094276952557266, "learning_rate": 1.3450899031811896e-06, "logits/chosen": -2.0160562992095947, "logits/rejected": 2.680678129196167, "logps/chosen": -447.72650146484375, "logps/rejected": -873.8001098632812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.313559532165527, "rewards/margins": 16.264726638793945, "rewards/rejected": -20.57828712463379, "step": 1218 }, { "epoch": 0.7583203732503888, "grad_norm": 9.134259223937988, "learning_rate": 1.3416320885200553e-06, "logits/chosen": -0.5711604356765747, "logits/rejected": 0.7800401449203491, "logps/chosen": -609.3330078125, "logps/rejected": -851.1119995117188, "loss": 0.2485, "rewards/accuracies": 0.875, "rewards/chosen": -6.801517009735107, "rewards/margins": 14.295014381408691, "rewards/rejected": -21.09653091430664, "step": 1219 }, { "epoch": 0.7589424572317263, "grad_norm": 18.67616081237793, "learning_rate": 1.3381742738589212e-06, "logits/chosen": -3.0102040767669678, "logits/rejected": 0.6589641571044922, "logps/chosen": -455.55926513671875, "logps/rejected": -802.17919921875, "loss": 1.1897, "rewards/accuracies": 0.75, "rewards/chosen": -6.220843315124512, "rewards/margins": 9.706228256225586, "rewards/rejected": -15.927070617675781, "step": 1220 }, { "epoch": 0.7595645412130637, "grad_norm": 1.291364312171936, "learning_rate": 1.334716459197787e-06, "logits/chosen": 0.44359689950942993, "logits/rejected": 1.9575949907302856, "logps/chosen": -552.631103515625, "logps/rejected": -757.189453125, "loss": 0.0161, "rewards/accuracies": 1.0, "rewards/chosen": -4.587469577789307, "rewards/margins": 12.438658714294434, "rewards/rejected": -17.0261287689209, "step": 1221 }, { "epoch": 0.7601866251944013, "grad_norm": 0.07958652079105377, "learning_rate": 1.331258644536653e-06, "logits/chosen": -0.27534377574920654, "logits/rejected": 1.5600539445877075, "logps/chosen": -539.5811157226562, "logps/rejected": -827.995849609375, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -6.306221961975098, "rewards/margins": 13.944232940673828, "rewards/rejected": -20.25045394897461, "step": 1222 }, { "epoch": 0.7608087091757387, "grad_norm": 0.0033131849486380816, "learning_rate": 1.3278008298755188e-06, "logits/chosen": -1.7156676054000854, "logits/rejected": 0.25685980916023254, "logps/chosen": -375.489501953125, "logps/rejected": -580.051025390625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -1.6700351238250732, "rewards/margins": 11.827942848205566, "rewards/rejected": -13.497978210449219, "step": 1223 }, { "epoch": 0.7614307931570762, "grad_norm": 0.205499067902565, "learning_rate": 1.3243430152143847e-06, "logits/chosen": -1.1460318565368652, "logits/rejected": 0.47781801223754883, "logps/chosen": -513.9737548828125, "logps/rejected": -748.2268676757812, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": -4.303009033203125, "rewards/margins": 12.19505786895752, "rewards/rejected": -16.49806785583496, "step": 1224 }, { "epoch": 0.7620528771384136, "grad_norm": 11.021306037902832, "learning_rate": 1.3208852005532504e-06, "logits/chosen": -0.01636466383934021, "logits/rejected": 0.9181078672409058, "logps/chosen": -647.0047607421875, "logps/rejected": -800.19140625, "loss": 0.171, "rewards/accuracies": 0.875, "rewards/chosen": -7.944740295410156, "rewards/margins": 8.797403335571289, "rewards/rejected": -16.742143630981445, "step": 1225 }, { "epoch": 0.7626749611197512, "grad_norm": 3.762831449508667, "learning_rate": 1.3174273858921163e-06, "logits/chosen": -1.0666617155075073, "logits/rejected": 1.905846357345581, "logps/chosen": -542.5707397460938, "logps/rejected": -838.490234375, "loss": 0.0758, "rewards/accuracies": 1.0, "rewards/chosen": -3.2831673622131348, "rewards/margins": 12.123820304870605, "rewards/rejected": -15.406988143920898, "step": 1226 }, { "epoch": 0.7632970451010886, "grad_norm": 0.040536340326070786, "learning_rate": 1.3139695712309822e-06, "logits/chosen": 0.6988925933837891, "logits/rejected": 1.6499724388122559, "logps/chosen": -521.4002075195312, "logps/rejected": -694.7281494140625, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -4.943024635314941, "rewards/margins": 10.342194557189941, "rewards/rejected": -15.2852201461792, "step": 1227 }, { "epoch": 0.7639191290824261, "grad_norm": 3.608476161956787, "learning_rate": 1.310511756569848e-06, "logits/chosen": -1.7452962398529053, "logits/rejected": 1.3442888259887695, "logps/chosen": -568.9551391601562, "logps/rejected": -891.3764038085938, "loss": 0.0489, "rewards/accuracies": 1.0, "rewards/chosen": -4.5682454109191895, "rewards/margins": 9.736841201782227, "rewards/rejected": -14.30508804321289, "step": 1228 }, { "epoch": 0.7645412130637637, "grad_norm": 6.79505729675293, "learning_rate": 1.307053941908714e-06, "logits/chosen": -2.503175735473633, "logits/rejected": 1.7010334730148315, "logps/chosen": -382.5596008300781, "logps/rejected": -710.754150390625, "loss": 0.1121, "rewards/accuracies": 0.875, "rewards/chosen": -3.5294764041900635, "rewards/margins": 9.117036819458008, "rewards/rejected": -12.646513938903809, "step": 1229 }, { "epoch": 0.7651632970451011, "grad_norm": 2.1455252170562744, "learning_rate": 1.3035961272475798e-06, "logits/chosen": -1.5811262130737305, "logits/rejected": 1.3630297183990479, "logps/chosen": -488.22247314453125, "logps/rejected": -818.0501708984375, "loss": 0.0235, "rewards/accuracies": 1.0, "rewards/chosen": -3.6386780738830566, "rewards/margins": 13.792112350463867, "rewards/rejected": -17.4307918548584, "step": 1230 }, { "epoch": 0.7657853810264386, "grad_norm": 13.21014404296875, "learning_rate": 1.3001383125864455e-06, "logits/chosen": 0.6425709128379822, "logits/rejected": 1.8632869720458984, "logps/chosen": -647.7941284179688, "logps/rejected": -859.2852172851562, "loss": 0.3407, "rewards/accuracies": 0.875, "rewards/chosen": -7.200288772583008, "rewards/margins": 9.280425071716309, "rewards/rejected": -16.480712890625, "step": 1231 }, { "epoch": 0.766407465007776, "grad_norm": 2.462399482727051, "learning_rate": 1.2966804979253114e-06, "logits/chosen": -3.33858323097229, "logits/rejected": 1.5956666469573975, "logps/chosen": -291.7204895019531, "logps/rejected": -663.83203125, "loss": 0.0417, "rewards/accuracies": 1.0, "rewards/chosen": -4.981064796447754, "rewards/margins": 9.732316970825195, "rewards/rejected": -14.713380813598633, "step": 1232 }, { "epoch": 0.7670295489891136, "grad_norm": 12.445592880249023, "learning_rate": 1.2932226832641773e-06, "logits/chosen": -1.7012206315994263, "logits/rejected": 1.3045161962509155, "logps/chosen": -458.1094970703125, "logps/rejected": -795.281982421875, "loss": 0.6303, "rewards/accuracies": 0.875, "rewards/chosen": -6.478001117706299, "rewards/margins": 11.540486335754395, "rewards/rejected": -18.01848602294922, "step": 1233 }, { "epoch": 0.767651632970451, "grad_norm": 13.630520820617676, "learning_rate": 1.2897648686030432e-06, "logits/chosen": -1.5486005544662476, "logits/rejected": -0.3552760183811188, "logps/chosen": -539.357177734375, "logps/rejected": -681.2017211914062, "loss": 1.2499, "rewards/accuracies": 0.875, "rewards/chosen": -6.4164276123046875, "rewards/margins": 7.0912628173828125, "rewards/rejected": -13.5076904296875, "step": 1234 }, { "epoch": 0.7682737169517885, "grad_norm": 0.09812422841787338, "learning_rate": 1.2863070539419086e-06, "logits/chosen": 0.6744793653488159, "logits/rejected": 2.5875658988952637, "logps/chosen": -654.3623657226562, "logps/rejected": -857.6705322265625, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -4.817391872406006, "rewards/margins": 10.678570747375488, "rewards/rejected": -15.495963096618652, "step": 1235 }, { "epoch": 0.7688958009331259, "grad_norm": 11.619990348815918, "learning_rate": 1.2828492392807745e-06, "logits/chosen": -2.3428971767425537, "logits/rejected": -0.10660946369171143, "logps/chosen": -492.30914306640625, "logps/rejected": -731.184814453125, "loss": 0.5899, "rewards/accuracies": 0.875, "rewards/chosen": -4.722409248352051, "rewards/margins": 11.779485702514648, "rewards/rejected": -16.501895904541016, "step": 1236 }, { "epoch": 0.7695178849144635, "grad_norm": 1.2990361452102661, "learning_rate": 1.2793914246196404e-06, "logits/chosen": -1.9259202480316162, "logits/rejected": 1.1245478391647339, "logps/chosen": -475.1590576171875, "logps/rejected": -744.08935546875, "loss": 0.0115, "rewards/accuracies": 1.0, "rewards/chosen": -5.481305122375488, "rewards/margins": 14.680113792419434, "rewards/rejected": -20.161418914794922, "step": 1237 }, { "epoch": 0.7701399688958009, "grad_norm": 9.901052474975586, "learning_rate": 1.2759336099585063e-06, "logits/chosen": -2.4025797843933105, "logits/rejected": 0.8231541514396667, "logps/chosen": -344.82305908203125, "logps/rejected": -715.8572387695312, "loss": 0.1913, "rewards/accuracies": 0.875, "rewards/chosen": -3.471104383468628, "rewards/margins": 13.314970016479492, "rewards/rejected": -16.786075592041016, "step": 1238 }, { "epoch": 0.7707620528771384, "grad_norm": 13.503618240356445, "learning_rate": 1.2724757952973722e-06, "logits/chosen": 0.29361653327941895, "logits/rejected": 2.4575533866882324, "logps/chosen": -544.5206298828125, "logps/rejected": -777.5345458984375, "loss": 0.6374, "rewards/accuracies": 0.875, "rewards/chosen": -4.469199180603027, "rewards/margins": 10.223840713500977, "rewards/rejected": -14.69304084777832, "step": 1239 }, { "epoch": 0.7713841368584758, "grad_norm": 9.739383697509766, "learning_rate": 1.2690179806362378e-06, "logits/chosen": -2.3776655197143555, "logits/rejected": 1.2286568880081177, "logps/chosen": -311.035888671875, "logps/rejected": -644.0858154296875, "loss": 0.1549, "rewards/accuracies": 0.875, "rewards/chosen": -1.858437418937683, "rewards/margins": 13.944459915161133, "rewards/rejected": -15.802896499633789, "step": 1240 }, { "epoch": 0.7720062208398134, "grad_norm": 10.045507431030273, "learning_rate": 1.2655601659751037e-06, "logits/chosen": 0.5416525602340698, "logits/rejected": 3.425825357437134, "logps/chosen": -508.592041015625, "logps/rejected": -872.87890625, "loss": 0.3479, "rewards/accuracies": 0.875, "rewards/chosen": -4.747520446777344, "rewards/margins": 13.007368087768555, "rewards/rejected": -17.75489044189453, "step": 1241 }, { "epoch": 0.7726283048211509, "grad_norm": 16.17880630493164, "learning_rate": 1.2621023513139696e-06, "logits/chosen": -0.9377941489219666, "logits/rejected": 0.7408784627914429, "logps/chosen": -624.0111694335938, "logps/rejected": -838.9092407226562, "loss": 0.2327, "rewards/accuracies": 0.875, "rewards/chosen": -6.344542503356934, "rewards/margins": 9.446586608886719, "rewards/rejected": -15.791128158569336, "step": 1242 }, { "epoch": 0.7732503888024883, "grad_norm": 0.053557138890028, "learning_rate": 1.2586445366528355e-06, "logits/chosen": -0.1588079333305359, "logits/rejected": 2.2508528232574463, "logps/chosen": -592.96435546875, "logps/rejected": -907.7225341796875, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -3.8415682315826416, "rewards/margins": 13.709413528442383, "rewards/rejected": -17.550981521606445, "step": 1243 }, { "epoch": 0.7738724727838259, "grad_norm": 16.56268310546875, "learning_rate": 1.2551867219917014e-06, "logits/chosen": 1.1482330560684204, "logits/rejected": 2.6718950271606445, "logps/chosen": -622.3114624023438, "logps/rejected": -793.6072387695312, "loss": 0.6092, "rewards/accuracies": 0.875, "rewards/chosen": -4.467023849487305, "rewards/margins": 8.309226989746094, "rewards/rejected": -12.776250839233398, "step": 1244 }, { "epoch": 0.7744945567651633, "grad_norm": 0.03017052449285984, "learning_rate": 1.2517289073305673e-06, "logits/chosen": 1.5661126375198364, "logits/rejected": 1.5264503955841064, "logps/chosen": -664.9033203125, "logps/rejected": -827.547119140625, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -6.003208637237549, "rewards/margins": 12.028064727783203, "rewards/rejected": -18.031272888183594, "step": 1245 }, { "epoch": 0.7751166407465008, "grad_norm": 1.2538882493972778, "learning_rate": 1.248271092669433e-06, "logits/chosen": -1.1989738941192627, "logits/rejected": 2.7826502323150635, "logps/chosen": -304.14129638671875, "logps/rejected": -698.9002685546875, "loss": 0.0225, "rewards/accuracies": 1.0, "rewards/chosen": -3.037710666656494, "rewards/margins": 12.748371124267578, "rewards/rejected": -15.78608226776123, "step": 1246 }, { "epoch": 0.7757387247278382, "grad_norm": 9.3817720413208, "learning_rate": 1.2448132780082988e-06, "logits/chosen": -2.781756639480591, "logits/rejected": 0.5088878273963928, "logps/chosen": -515.9522705078125, "logps/rejected": -770.4732666015625, "loss": 0.2774, "rewards/accuracies": 0.875, "rewards/chosen": -5.124436855316162, "rewards/margins": 11.259809494018555, "rewards/rejected": -16.384246826171875, "step": 1247 }, { "epoch": 0.7763608087091758, "grad_norm": 0.03160645440220833, "learning_rate": 1.2413554633471647e-06, "logits/chosen": -2.9802052974700928, "logits/rejected": 1.5844905376434326, "logps/chosen": -317.1291198730469, "logps/rejected": -754.8489990234375, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.7082672119140625, "rewards/margins": 11.826979637145996, "rewards/rejected": -14.535247802734375, "step": 1248 }, { "epoch": 0.7769828926905132, "grad_norm": 0.36448320746421814, "learning_rate": 1.2378976486860306e-06, "logits/chosen": -1.905295968055725, "logits/rejected": 1.6722726821899414, "logps/chosen": -474.4230651855469, "logps/rejected": -823.7603149414062, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": -5.620505332946777, "rewards/margins": 14.636770248413086, "rewards/rejected": -20.25727653503418, "step": 1249 }, { "epoch": 0.7776049766718507, "grad_norm": 8.436664581298828, "learning_rate": 1.2344398340248965e-06, "logits/chosen": -1.4560916423797607, "logits/rejected": 1.5818103551864624, "logps/chosen": -471.5996398925781, "logps/rejected": -833.1858520507812, "loss": 0.1939, "rewards/accuracies": 0.875, "rewards/chosen": -4.227414131164551, "rewards/margins": 13.265769958496094, "rewards/rejected": -17.493183135986328, "step": 1250 }, { "epoch": 0.7782270606531881, "grad_norm": 5.063947677612305, "learning_rate": 1.2309820193637624e-06, "logits/chosen": -1.677452802658081, "logits/rejected": 1.239478588104248, "logps/chosen": -458.9784240722656, "logps/rejected": -767.5501708984375, "loss": 0.0904, "rewards/accuracies": 1.0, "rewards/chosen": -3.7733404636383057, "rewards/margins": 12.038658142089844, "rewards/rejected": -15.81199836730957, "step": 1251 }, { "epoch": 0.7788491446345257, "grad_norm": 0.46015191078186035, "learning_rate": 1.227524204702628e-06, "logits/chosen": -0.4238088130950928, "logits/rejected": 1.6729496717453003, "logps/chosen": -539.281494140625, "logps/rejected": -776.490478515625, "loss": 0.0071, "rewards/accuracies": 1.0, "rewards/chosen": -4.720961093902588, "rewards/margins": 13.203474044799805, "rewards/rejected": -17.924434661865234, "step": 1252 }, { "epoch": 0.7794712286158632, "grad_norm": 14.37130355834961, "learning_rate": 1.224066390041494e-06, "logits/chosen": -1.6803243160247803, "logits/rejected": 1.5848281383514404, "logps/chosen": -409.61572265625, "logps/rejected": -710.7590942382812, "loss": 1.1181, "rewards/accuracies": 0.75, "rewards/chosen": -4.122034072875977, "rewards/margins": 10.838115692138672, "rewards/rejected": -14.960150718688965, "step": 1253 }, { "epoch": 0.7800933125972006, "grad_norm": 0.3019509017467499, "learning_rate": 1.2206085753803596e-06, "logits/chosen": -0.6016656756401062, "logits/rejected": 1.2408396005630493, "logps/chosen": -392.5195617675781, "logps/rejected": -610.5278930664062, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": -3.1815550327301025, "rewards/margins": 12.500040054321289, "rewards/rejected": -15.681594848632812, "step": 1254 }, { "epoch": 0.7807153965785381, "grad_norm": 0.6098394393920898, "learning_rate": 1.2171507607192255e-06, "logits/chosen": 1.4270235300064087, "logits/rejected": 2.6003313064575195, "logps/chosen": -705.7081298828125, "logps/rejected": -864.791748046875, "loss": 0.0075, "rewards/accuracies": 1.0, "rewards/chosen": -6.10780668258667, "rewards/margins": 8.447437286376953, "rewards/rejected": -14.555244445800781, "step": 1255 }, { "epoch": 0.7813374805598756, "grad_norm": 0.31633761525154114, "learning_rate": 1.2136929460580914e-06, "logits/chosen": -2.7459030151367188, "logits/rejected": 1.4790310859680176, "logps/chosen": -377.26593017578125, "logps/rejected": -741.345458984375, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": -1.7852559089660645, "rewards/margins": 13.360594749450684, "rewards/rejected": -15.145851135253906, "step": 1256 }, { "epoch": 0.7819595645412131, "grad_norm": 0.0065107326954603195, "learning_rate": 1.2102351313969573e-06, "logits/chosen": -2.504744529724121, "logits/rejected": 0.8796257972717285, "logps/chosen": -461.1879577636719, "logps/rejected": -828.2766723632812, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.698330879211426, "rewards/margins": 14.3059720993042, "rewards/rejected": -18.004302978515625, "step": 1257 }, { "epoch": 0.7825816485225505, "grad_norm": 14.154812812805176, "learning_rate": 1.2067773167358231e-06, "logits/chosen": -1.1623008251190186, "logits/rejected": 1.522471308708191, "logps/chosen": -457.2164001464844, "logps/rejected": -810.2890625, "loss": 0.9732, "rewards/accuracies": 0.75, "rewards/chosen": -3.192183494567871, "rewards/margins": 12.859447479248047, "rewards/rejected": -16.051631927490234, "step": 1258 }, { "epoch": 0.783203732503888, "grad_norm": 0.18512770533561707, "learning_rate": 1.2033195020746888e-06, "logits/chosen": -1.1726185083389282, "logits/rejected": 2.6997671127319336, "logps/chosen": -466.0726623535156, "logps/rejected": -869.9688110351562, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -3.9260566234588623, "rewards/margins": 12.859270095825195, "rewards/rejected": -16.785327911376953, "step": 1259 }, { "epoch": 0.7838258164852255, "grad_norm": 0.004519362468272448, "learning_rate": 1.1998616874135547e-06, "logits/chosen": 0.32994189858436584, "logits/rejected": 2.133074998855591, "logps/chosen": -621.4857177734375, "logps/rejected": -950.90869140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.4449687004089355, "rewards/margins": 18.821636199951172, "rewards/rejected": -23.266603469848633, "step": 1260 }, { "epoch": 0.784447900466563, "grad_norm": 0.5271346569061279, "learning_rate": 1.1964038727524206e-06, "logits/chosen": -1.3863641023635864, "logits/rejected": 2.049956798553467, "logps/chosen": -384.109375, "logps/rejected": -701.3455200195312, "loss": 0.0084, "rewards/accuracies": 1.0, "rewards/chosen": -1.4092161655426025, "rewards/margins": 11.275032043457031, "rewards/rejected": -12.684247970581055, "step": 1261 }, { "epoch": 0.7850699844479004, "grad_norm": 0.14567671716213226, "learning_rate": 1.1929460580912865e-06, "logits/chosen": -0.7990472316741943, "logits/rejected": 3.1854379177093506, "logps/chosen": -364.2894592285156, "logps/rejected": -781.602294921875, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -3.442189931869507, "rewards/margins": 13.423539161682129, "rewards/rejected": -16.86573028564453, "step": 1262 }, { "epoch": 0.785692068429238, "grad_norm": 4.41299295425415, "learning_rate": 1.1894882434301522e-06, "logits/chosen": -2.3691468238830566, "logits/rejected": 1.6543501615524292, "logps/chosen": -435.0484313964844, "logps/rejected": -792.427978515625, "loss": 0.0574, "rewards/accuracies": 1.0, "rewards/chosen": -5.514183044433594, "rewards/margins": 14.88491439819336, "rewards/rejected": -20.399097442626953, "step": 1263 }, { "epoch": 0.7863141524105755, "grad_norm": 6.8701348304748535, "learning_rate": 1.186030428769018e-06, "logits/chosen": -0.3183002173900604, "logits/rejected": 1.025041103363037, "logps/chosen": -585.76904296875, "logps/rejected": -758.4483642578125, "loss": 0.1226, "rewards/accuracies": 0.875, "rewards/chosen": -6.006821632385254, "rewards/margins": 10.571466445922852, "rewards/rejected": -16.578289031982422, "step": 1264 }, { "epoch": 0.7869362363919129, "grad_norm": 0.27814653515815735, "learning_rate": 1.182572614107884e-06, "logits/chosen": -2.7144711017608643, "logits/rejected": 1.7591458559036255, "logps/chosen": -447.02203369140625, "logps/rejected": -925.0657958984375, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": -3.780238389968872, "rewards/margins": 14.965193748474121, "rewards/rejected": -18.745431900024414, "step": 1265 }, { "epoch": 0.7875583203732504, "grad_norm": 12.514571189880371, "learning_rate": 1.1791147994467498e-06, "logits/chosen": -0.42276933789253235, "logits/rejected": 2.2784008979797363, "logps/chosen": -514.2608642578125, "logps/rejected": -754.9779052734375, "loss": 0.3902, "rewards/accuracies": 0.75, "rewards/chosen": -4.290280818939209, "rewards/margins": 10.469820022583008, "rewards/rejected": -14.760101318359375, "step": 1266 }, { "epoch": 0.7881804043545879, "grad_norm": 16.221914291381836, "learning_rate": 1.1756569847856155e-06, "logits/chosen": -0.34458398818969727, "logits/rejected": 1.298403263092041, "logps/chosen": -585.9843139648438, "logps/rejected": -778.9339599609375, "loss": 0.3923, "rewards/accuracies": 0.75, "rewards/chosen": -5.669952392578125, "rewards/margins": 9.533330917358398, "rewards/rejected": -15.203283309936523, "step": 1267 }, { "epoch": 0.7888024883359254, "grad_norm": 0.302496075630188, "learning_rate": 1.1721991701244814e-06, "logits/chosen": -2.8124585151672363, "logits/rejected": 0.9311859011650085, "logps/chosen": -410.50128173828125, "logps/rejected": -776.2421264648438, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": -3.987764358520508, "rewards/margins": 15.948014259338379, "rewards/rejected": -19.93577766418457, "step": 1268 }, { "epoch": 0.7894245723172628, "grad_norm": 9.933697700500488, "learning_rate": 1.1687413554633473e-06, "logits/chosen": -1.4593498706817627, "logits/rejected": 1.284300446510315, "logps/chosen": -517.1085815429688, "logps/rejected": -812.63525390625, "loss": 0.251, "rewards/accuracies": 0.75, "rewards/chosen": -6.0775933265686035, "rewards/margins": 12.881742477416992, "rewards/rejected": -18.959335327148438, "step": 1269 }, { "epoch": 0.7900466562986003, "grad_norm": 0.005133127328008413, "learning_rate": 1.1652835408022131e-06, "logits/chosen": -0.38240477442741394, "logits/rejected": 1.3849914073944092, "logps/chosen": -618.1557006835938, "logps/rejected": -895.3968505859375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -2.9069201946258545, "rewards/margins": 17.311471939086914, "rewards/rejected": -20.218393325805664, "step": 1270 }, { "epoch": 0.7906687402799378, "grad_norm": 1.9683564901351929, "learning_rate": 1.161825726141079e-06, "logits/chosen": 0.595526933670044, "logits/rejected": 2.5383753776550293, "logps/chosen": -288.782470703125, "logps/rejected": -565.8150634765625, "loss": 0.0547, "rewards/accuracies": 1.0, "rewards/chosen": -1.9905784130096436, "rewards/margins": 10.492025375366211, "rewards/rejected": -12.482603073120117, "step": 1271 }, { "epoch": 0.7912908242612753, "grad_norm": 7.188394069671631, "learning_rate": 1.158367911479945e-06, "logits/chosen": 0.8621138334274292, "logits/rejected": 1.9355257749557495, "logps/chosen": -577.1099853515625, "logps/rejected": -725.8158569335938, "loss": 0.097, "rewards/accuracies": 0.875, "rewards/chosen": -6.557938575744629, "rewards/margins": 8.737264633178711, "rewards/rejected": -15.29520320892334, "step": 1272 }, { "epoch": 0.7919129082426127, "grad_norm": 5.68092679977417, "learning_rate": 1.1549100968188106e-06, "logits/chosen": 1.7704967260360718, "logits/rejected": 1.2525421380996704, "logps/chosen": -671.2696533203125, "logps/rejected": -736.071044921875, "loss": 0.0793, "rewards/accuracies": 1.0, "rewards/chosen": -8.051796913146973, "rewards/margins": 10.41525650024414, "rewards/rejected": -18.467052459716797, "step": 1273 }, { "epoch": 0.7925349922239502, "grad_norm": 7.622984886169434, "learning_rate": 1.1514522821576765e-06, "logits/chosen": -2.1167595386505127, "logits/rejected": -0.2704860270023346, "logps/chosen": -449.6957092285156, "logps/rejected": -724.13330078125, "loss": 0.797, "rewards/accuracies": 0.875, "rewards/chosen": -5.844965934753418, "rewards/margins": 7.980135440826416, "rewards/rejected": -13.825101852416992, "step": 1274 }, { "epoch": 0.7931570762052877, "grad_norm": 8.087628364562988, "learning_rate": 1.1479944674965422e-06, "logits/chosen": -0.4186546206474304, "logits/rejected": 2.8974690437316895, "logps/chosen": -493.057373046875, "logps/rejected": -796.52734375, "loss": 0.1468, "rewards/accuracies": 0.875, "rewards/chosen": -6.826409816741943, "rewards/margins": 8.553226470947266, "rewards/rejected": -15.37963581085205, "step": 1275 }, { "epoch": 0.7937791601866252, "grad_norm": 0.05863155052065849, "learning_rate": 1.144536652835408e-06, "logits/chosen": -0.08510121703147888, "logits/rejected": 2.23657488822937, "logps/chosen": -529.562255859375, "logps/rejected": -816.55810546875, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -4.831898212432861, "rewards/margins": 14.778359413146973, "rewards/rejected": -19.61025619506836, "step": 1276 }, { "epoch": 0.7944012441679627, "grad_norm": 0.19495946168899536, "learning_rate": 1.141078838174274e-06, "logits/chosen": 3.020155906677246, "logits/rejected": 3.668471336364746, "logps/chosen": -780.5364990234375, "logps/rejected": -859.7434692382812, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -6.99791145324707, "rewards/margins": 10.319195747375488, "rewards/rejected": -17.317108154296875, "step": 1277 }, { "epoch": 0.7950233281493001, "grad_norm": 0.07374799251556396, "learning_rate": 1.1376210235131398e-06, "logits/chosen": -0.6702171564102173, "logits/rejected": 0.6911787986755371, "logps/chosen": -391.6767578125, "logps/rejected": -598.7886962890625, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -3.914581060409546, "rewards/margins": 10.474151611328125, "rewards/rejected": -14.38873291015625, "step": 1278 }, { "epoch": 0.7956454121306377, "grad_norm": 0.006767085287719965, "learning_rate": 1.1341632088520057e-06, "logits/chosen": -0.3825370669364929, "logits/rejected": 3.294511318206787, "logps/chosen": -499.09185791015625, "logps/rejected": -877.4091796875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.7367520332336426, "rewards/margins": 12.806866645812988, "rewards/rejected": -16.54361915588379, "step": 1279 }, { "epoch": 0.7962674961119751, "grad_norm": 4.501291751861572, "learning_rate": 1.1307053941908714e-06, "logits/chosen": -3.6046981811523438, "logits/rejected": 0.822663426399231, "logps/chosen": -259.84881591796875, "logps/rejected": -608.688232421875, "loss": 0.1802, "rewards/accuracies": 0.875, "rewards/chosen": -2.040160655975342, "rewards/margins": 10.838726043701172, "rewards/rejected": -12.878885269165039, "step": 1280 }, { "epoch": 0.7968895800933126, "grad_norm": 0.785681426525116, "learning_rate": 1.1272475795297373e-06, "logits/chosen": -0.5754196047782898, "logits/rejected": 2.9377012252807617, "logps/chosen": -510.323974609375, "logps/rejected": -870.332275390625, "loss": 0.0086, "rewards/accuracies": 1.0, "rewards/chosen": -3.9798543453216553, "rewards/margins": 13.631769180297852, "rewards/rejected": -17.611621856689453, "step": 1281 }, { "epoch": 0.7975116640746501, "grad_norm": 1.5109750032424927, "learning_rate": 1.1237897648686031e-06, "logits/chosen": -0.3876732587814331, "logits/rejected": 1.722317099571228, "logps/chosen": -523.6480712890625, "logps/rejected": -830.8994750976562, "loss": 0.0227, "rewards/accuracies": 1.0, "rewards/chosen": -3.3159546852111816, "rewards/margins": 12.533428192138672, "rewards/rejected": -15.849382400512695, "step": 1282 }, { "epoch": 0.7981337480559876, "grad_norm": 0.524721086025238, "learning_rate": 1.120331950207469e-06, "logits/chosen": -1.540387749671936, "logits/rejected": 1.6148297786712646, "logps/chosen": -495.429931640625, "logps/rejected": -794.56103515625, "loss": 0.0051, "rewards/accuracies": 1.0, "rewards/chosen": -5.509489059448242, "rewards/margins": 12.464055061340332, "rewards/rejected": -17.973543167114258, "step": 1283 }, { "epoch": 0.798755832037325, "grad_norm": 7.370581150054932, "learning_rate": 1.116874135546335e-06, "logits/chosen": -1.5582247972488403, "logits/rejected": 0.31504201889038086, "logps/chosen": -604.5326538085938, "logps/rejected": -836.63134765625, "loss": 0.1274, "rewards/accuracies": 0.875, "rewards/chosen": -3.806576728820801, "rewards/margins": 11.222973823547363, "rewards/rejected": -15.02955150604248, "step": 1284 }, { "epoch": 0.7993779160186625, "grad_norm": 11.710280418395996, "learning_rate": 1.1134163208852008e-06, "logits/chosen": 0.8185760378837585, "logits/rejected": 0.42767781019210815, "logps/chosen": -477.73016357421875, "logps/rejected": -662.5391845703125, "loss": 0.6685, "rewards/accuracies": 0.875, "rewards/chosen": -4.8998517990112305, "rewards/margins": 10.24984073638916, "rewards/rejected": -15.14969253540039, "step": 1285 }, { "epoch": 0.8, "grad_norm": 3.3164381980895996, "learning_rate": 1.1099585062240665e-06, "logits/chosen": -2.7505786418914795, "logits/rejected": 1.2307448387145996, "logps/chosen": -304.9454345703125, "logps/rejected": -676.33935546875, "loss": 0.1052, "rewards/accuracies": 0.875, "rewards/chosen": -3.4857301712036133, "rewards/margins": 9.555858612060547, "rewards/rejected": -13.04158878326416, "step": 1286 }, { "epoch": 0.8006220839813375, "grad_norm": 5.184134006500244, "learning_rate": 1.1065006915629324e-06, "logits/chosen": -2.62157940864563, "logits/rejected": 1.8134069442749023, "logps/chosen": -307.93731689453125, "logps/rejected": -787.3015747070312, "loss": 0.1249, "rewards/accuracies": 0.875, "rewards/chosen": -3.6794114112854004, "rewards/margins": 15.299356460571289, "rewards/rejected": -18.978769302368164, "step": 1287 }, { "epoch": 0.801244167962675, "grad_norm": 0.06692773103713989, "learning_rate": 1.103042876901798e-06, "logits/chosen": 0.08008898794651031, "logits/rejected": 3.056776523590088, "logps/chosen": -444.55059814453125, "logps/rejected": -745.1697998046875, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -4.291983604431152, "rewards/margins": 12.97140121459961, "rewards/rejected": -17.263383865356445, "step": 1288 }, { "epoch": 0.8018662519440124, "grad_norm": 0.4232214391231537, "learning_rate": 1.099585062240664e-06, "logits/chosen": -0.5476794838905334, "logits/rejected": 2.1280994415283203, "logps/chosen": -465.5563659667969, "logps/rejected": -755.6217041015625, "loss": 0.0045, "rewards/accuracies": 1.0, "rewards/chosen": -4.094665050506592, "rewards/margins": 12.352485656738281, "rewards/rejected": -16.44715118408203, "step": 1289 }, { "epoch": 0.80248833592535, "grad_norm": 2.704054594039917, "learning_rate": 1.0961272475795298e-06, "logits/chosen": -3.748836040496826, "logits/rejected": 0.3109694719314575, "logps/chosen": -348.151123046875, "logps/rejected": -840.7587890625, "loss": 0.0359, "rewards/accuracies": 1.0, "rewards/chosen": -3.840470314025879, "rewards/margins": 12.600534439086914, "rewards/rejected": -16.44100570678711, "step": 1290 }, { "epoch": 0.8031104199066874, "grad_norm": 1.1417760848999023, "learning_rate": 1.0926694329183957e-06, "logits/chosen": -2.137065887451172, "logits/rejected": 1.0439426898956299, "logps/chosen": -585.0442504882812, "logps/rejected": -884.7874755859375, "loss": 0.0161, "rewards/accuracies": 1.0, "rewards/chosen": -3.698413133621216, "rewards/margins": 12.743579864501953, "rewards/rejected": -16.441993713378906, "step": 1291 }, { "epoch": 0.8037325038880249, "grad_norm": 0.004825478885322809, "learning_rate": 1.0892116182572616e-06, "logits/chosen": -1.714672327041626, "logits/rejected": 1.3089693784713745, "logps/chosen": -407.80908203125, "logps/rejected": -856.5692749023438, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -2.3106226921081543, "rewards/margins": 14.46967601776123, "rewards/rejected": -16.780298233032227, "step": 1292 }, { "epoch": 0.8043545878693623, "grad_norm": 0.01949823647737503, "learning_rate": 1.0857538035961275e-06, "logits/chosen": -1.1258214712142944, "logits/rejected": 1.144755244255066, "logps/chosen": -578.2625122070312, "logps/rejected": -842.7352294921875, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -5.710251331329346, "rewards/margins": 14.136610984802246, "rewards/rejected": -19.84686279296875, "step": 1293 }, { "epoch": 0.8049766718506999, "grad_norm": 1.2861297130584717, "learning_rate": 1.0822959889349931e-06, "logits/chosen": -1.2053552865982056, "logits/rejected": 2.420220375061035, "logps/chosen": -470.58294677734375, "logps/rejected": -867.7210083007812, "loss": 0.0184, "rewards/accuracies": 1.0, "rewards/chosen": -3.44787859916687, "rewards/margins": 11.996654510498047, "rewards/rejected": -15.444533348083496, "step": 1294 }, { "epoch": 0.8055987558320373, "grad_norm": 10.353989601135254, "learning_rate": 1.078838174273859e-06, "logits/chosen": -2.8751564025878906, "logits/rejected": 0.46566465497016907, "logps/chosen": -361.47259521484375, "logps/rejected": -679.950439453125, "loss": 0.2196, "rewards/accuracies": 0.875, "rewards/chosen": -3.396026372909546, "rewards/margins": 9.097265243530273, "rewards/rejected": -12.493290901184082, "step": 1295 }, { "epoch": 0.8062208398133748, "grad_norm": 9.722496032714844, "learning_rate": 1.075380359612725e-06, "logits/chosen": 0.5857282280921936, "logits/rejected": 2.8099350929260254, "logps/chosen": -478.2886962890625, "logps/rejected": -762.066162109375, "loss": 0.153, "rewards/accuracies": 0.875, "rewards/chosen": -4.069216728210449, "rewards/margins": 11.362369537353516, "rewards/rejected": -15.431585311889648, "step": 1296 }, { "epoch": 0.8068429237947123, "grad_norm": 0.7801067233085632, "learning_rate": 1.0719225449515906e-06, "logits/chosen": -1.3304216861724854, "logits/rejected": 1.6216132640838623, "logps/chosen": -281.18994140625, "logps/rejected": -642.2678833007812, "loss": 0.0646, "rewards/accuracies": 1.0, "rewards/chosen": -2.774709701538086, "rewards/margins": 12.139242172241211, "rewards/rejected": -14.91395092010498, "step": 1297 }, { "epoch": 0.8074650077760498, "grad_norm": 11.8325777053833, "learning_rate": 1.0684647302904565e-06, "logits/chosen": -2.368720054626465, "logits/rejected": 0.8762595057487488, "logps/chosen": -502.75604248046875, "logps/rejected": -777.464111328125, "loss": 1.0046, "rewards/accuracies": 0.875, "rewards/chosen": -4.743224620819092, "rewards/margins": 10.081891059875488, "rewards/rejected": -14.825115203857422, "step": 1298 }, { "epoch": 0.8080870917573872, "grad_norm": 0.7602382898330688, "learning_rate": 1.0650069156293224e-06, "logits/chosen": -0.08203822374343872, "logits/rejected": 2.053806781768799, "logps/chosen": -565.6295166015625, "logps/rejected": -823.6919555664062, "loss": 0.0083, "rewards/accuracies": 1.0, "rewards/chosen": -3.777017593383789, "rewards/margins": 12.923656463623047, "rewards/rejected": -16.700672149658203, "step": 1299 }, { "epoch": 0.8087091757387247, "grad_norm": 10.297561645507812, "learning_rate": 1.0615491009681882e-06, "logits/chosen": -1.213075876235962, "logits/rejected": 1.1128454208374023, "logps/chosen": -603.1849975585938, "logps/rejected": -854.880126953125, "loss": 0.193, "rewards/accuracies": 0.875, "rewards/chosen": -3.194856643676758, "rewards/margins": 10.730962753295898, "rewards/rejected": -13.925819396972656, "step": 1300 }, { "epoch": 0.8093312597200623, "grad_norm": 0.42701926827430725, "learning_rate": 1.058091286307054e-06, "logits/chosen": -2.766136884689331, "logits/rejected": 0.7224398255348206, "logps/chosen": -322.7425537109375, "logps/rejected": -621.508544921875, "loss": 0.0282, "rewards/accuracies": 1.0, "rewards/chosen": -1.5349202156066895, "rewards/margins": 13.846423149108887, "rewards/rejected": -15.381343841552734, "step": 1301 }, { "epoch": 0.8099533437013997, "grad_norm": 0.2938283383846283, "learning_rate": 1.0546334716459198e-06, "logits/chosen": -1.1893504858016968, "logits/rejected": 2.0147435665130615, "logps/chosen": -349.83514404296875, "logps/rejected": -759.466796875, "loss": 0.0054, "rewards/accuracies": 1.0, "rewards/chosen": -3.246273994445801, "rewards/margins": 12.938138008117676, "rewards/rejected": -16.18441390991211, "step": 1302 }, { "epoch": 0.8105754276827372, "grad_norm": 0.3250730037689209, "learning_rate": 1.0511756569847857e-06, "logits/chosen": -1.8163785934448242, "logits/rejected": 0.9539297223091125, "logps/chosen": -394.477783203125, "logps/rejected": -730.7607421875, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -4.29082727432251, "rewards/margins": 12.300640106201172, "rewards/rejected": -16.591468811035156, "step": 1303 }, { "epoch": 0.8111975116640746, "grad_norm": 0.02391306683421135, "learning_rate": 1.0477178423236516e-06, "logits/chosen": -1.2628096342086792, "logits/rejected": 2.4500246047973633, "logps/chosen": -457.04949951171875, "logps/rejected": -771.25341796875, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -4.32575798034668, "rewards/margins": 11.56831169128418, "rewards/rejected": -15.89406967163086, "step": 1304 }, { "epoch": 0.8118195956454122, "grad_norm": 0.031947068870067596, "learning_rate": 1.0442600276625175e-06, "logits/chosen": -1.9339940547943115, "logits/rejected": 0.876315712928772, "logps/chosen": -347.9347229003906, "logps/rejected": -665.5185546875, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -1.9112515449523926, "rewards/margins": 12.05921745300293, "rewards/rejected": -13.97046947479248, "step": 1305 }, { "epoch": 0.8124416796267496, "grad_norm": 0.05162962153553963, "learning_rate": 1.0408022130013833e-06, "logits/chosen": -1.2402105331420898, "logits/rejected": 3.781574010848999, "logps/chosen": -432.3200378417969, "logps/rejected": -952.7854614257812, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -2.4947969913482666, "rewards/margins": 15.0423583984375, "rewards/rejected": -17.537155151367188, "step": 1306 }, { "epoch": 0.8130637636080871, "grad_norm": 0.005692610517144203, "learning_rate": 1.037344398340249e-06, "logits/chosen": -2.272217273712158, "logits/rejected": 0.5706815123558044, "logps/chosen": -368.5299072265625, "logps/rejected": -756.9783935546875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -4.232287883758545, "rewards/margins": 15.321781158447266, "rewards/rejected": -19.55406951904297, "step": 1307 }, { "epoch": 0.8136858475894245, "grad_norm": 1.271409034729004, "learning_rate": 1.033886583679115e-06, "logits/chosen": -2.4349374771118164, "logits/rejected": 0.7994560599327087, "logps/chosen": -515.5551147460938, "logps/rejected": -822.0625, "loss": 0.016, "rewards/accuracies": 1.0, "rewards/chosen": -5.113642692565918, "rewards/margins": 10.517034530639648, "rewards/rejected": -15.630677223205566, "step": 1308 }, { "epoch": 0.8143079315707621, "grad_norm": 11.72814655303955, "learning_rate": 1.0304287690179806e-06, "logits/chosen": -0.47221148014068604, "logits/rejected": 1.8670220375061035, "logps/chosen": -558.9076538085938, "logps/rejected": -788.3414916992188, "loss": 0.6816, "rewards/accuracies": 0.875, "rewards/chosen": -6.855216026306152, "rewards/margins": 9.39391803741455, "rewards/rejected": -16.249134063720703, "step": 1309 }, { "epoch": 0.8149300155520995, "grad_norm": 0.5180342197418213, "learning_rate": 1.0269709543568465e-06, "logits/chosen": 0.5426323413848877, "logits/rejected": 2.470148801803589, "logps/chosen": -543.4552612304688, "logps/rejected": -844.589111328125, "loss": 0.0088, "rewards/accuracies": 1.0, "rewards/chosen": -4.5039520263671875, "rewards/margins": 14.556987762451172, "rewards/rejected": -19.06093978881836, "step": 1310 }, { "epoch": 0.815552099533437, "grad_norm": 0.39070653915405273, "learning_rate": 1.0235131396957124e-06, "logits/chosen": -1.8013322353363037, "logits/rejected": 0.5506967306137085, "logps/chosen": -507.18560791015625, "logps/rejected": -820.3834228515625, "loss": 0.0054, "rewards/accuracies": 1.0, "rewards/chosen": -4.499456405639648, "rewards/margins": 15.212932586669922, "rewards/rejected": -19.712387084960938, "step": 1311 }, { "epoch": 0.8161741835147744, "grad_norm": 4.566457748413086, "learning_rate": 1.0200553250345782e-06, "logits/chosen": -2.3922038078308105, "logits/rejected": 0.33956682682037354, "logps/chosen": -482.93975830078125, "logps/rejected": -735.2327880859375, "loss": 0.0957, "rewards/accuracies": 1.0, "rewards/chosen": -3.824122428894043, "rewards/margins": 9.538813591003418, "rewards/rejected": -13.362935066223145, "step": 1312 }, { "epoch": 0.816796267496112, "grad_norm": 8.939526557922363, "learning_rate": 1.0165975103734441e-06, "logits/chosen": -0.6893076300621033, "logits/rejected": 0.546741247177124, "logps/chosen": -450.5965881347656, "logps/rejected": -586.162353515625, "loss": 0.2328, "rewards/accuracies": 0.875, "rewards/chosen": -5.1088128089904785, "rewards/margins": 11.080528259277344, "rewards/rejected": -16.189340591430664, "step": 1313 }, { "epoch": 0.8174183514774495, "grad_norm": 0.2698509991168976, "learning_rate": 1.01313969571231e-06, "logits/chosen": -2.5487189292907715, "logits/rejected": 0.7299424409866333, "logps/chosen": -329.8349304199219, "logps/rejected": -663.2611083984375, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -2.69761061668396, "rewards/margins": 10.053991317749023, "rewards/rejected": -12.751602172851562, "step": 1314 }, { "epoch": 0.8180404354587869, "grad_norm": 0.0007212011842057109, "learning_rate": 1.0096818810511757e-06, "logits/chosen": -3.341907501220703, "logits/rejected": 1.8579503297805786, "logps/chosen": -357.162353515625, "logps/rejected": -859.2657470703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.7435417175292969, "rewards/margins": 17.116003036499023, "rewards/rejected": -18.859542846679688, "step": 1315 }, { "epoch": 0.8186625194401245, "grad_norm": 2.0347092151641846, "learning_rate": 1.0062240663900416e-06, "logits/chosen": -3.720144033432007, "logits/rejected": -0.3171952962875366, "logps/chosen": -364.93670654296875, "logps/rejected": -735.1425170898438, "loss": 0.0189, "rewards/accuracies": 1.0, "rewards/chosen": -3.7706174850463867, "rewards/margins": 13.37431812286377, "rewards/rejected": -17.144935607910156, "step": 1316 }, { "epoch": 0.8192846034214619, "grad_norm": 4.0687360763549805, "learning_rate": 1.0027662517289075e-06, "logits/chosen": 1.1685250997543335, "logits/rejected": 1.2853538990020752, "logps/chosen": -576.25927734375, "logps/rejected": -721.3223876953125, "loss": 0.0738, "rewards/accuracies": 1.0, "rewards/chosen": -5.922859191894531, "rewards/margins": 11.187511444091797, "rewards/rejected": -17.110370635986328, "step": 1317 }, { "epoch": 0.8199066874027994, "grad_norm": 1.8853086233139038, "learning_rate": 9.993084370677733e-07, "logits/chosen": -1.2846870422363281, "logits/rejected": 1.328723669052124, "logps/chosen": -379.6622314453125, "logps/rejected": -643.0596923828125, "loss": 0.0326, "rewards/accuracies": 1.0, "rewards/chosen": -3.2331957817077637, "rewards/margins": 10.447683334350586, "rewards/rejected": -13.680879592895508, "step": 1318 }, { "epoch": 0.8205287713841368, "grad_norm": 18.38396644592285, "learning_rate": 9.95850622406639e-07, "logits/chosen": -0.29736393690109253, "logits/rejected": 1.1460208892822266, "logps/chosen": -606.738525390625, "logps/rejected": -754.93798828125, "loss": 0.8473, "rewards/accuracies": 0.625, "rewards/chosen": -6.546107769012451, "rewards/margins": 8.931340217590332, "rewards/rejected": -15.477449417114258, "step": 1319 }, { "epoch": 0.8211508553654744, "grad_norm": 3.149142265319824, "learning_rate": 9.92392807745505e-07, "logits/chosen": -1.1778290271759033, "logits/rejected": 1.1719555854797363, "logps/chosen": -462.60906982421875, "logps/rejected": -714.6094360351562, "loss": 0.0337, "rewards/accuracies": 1.0, "rewards/chosen": -3.5274267196655273, "rewards/margins": 11.816180229187012, "rewards/rejected": -15.343606948852539, "step": 1320 }, { "epoch": 0.8217729393468118, "grad_norm": 0.5285313129425049, "learning_rate": 9.889349930843708e-07, "logits/chosen": 0.26200050115585327, "logits/rejected": 2.6836585998535156, "logps/chosen": -545.5755004882812, "logps/rejected": -843.5029296875, "loss": 0.0071, "rewards/accuracies": 1.0, "rewards/chosen": -3.87269926071167, "rewards/margins": 15.639558792114258, "rewards/rejected": -19.512258529663086, "step": 1321 }, { "epoch": 0.8223950233281493, "grad_norm": 0.861649751663208, "learning_rate": 9.854771784232365e-07, "logits/chosen": -1.1205819845199585, "logits/rejected": 0.627934455871582, "logps/chosen": -458.62451171875, "logps/rejected": -694.7573852539062, "loss": 0.0107, "rewards/accuracies": 1.0, "rewards/chosen": -2.7938039302825928, "rewards/margins": 10.764945030212402, "rewards/rejected": -13.558748245239258, "step": 1322 }, { "epoch": 0.8230171073094867, "grad_norm": 12.45577335357666, "learning_rate": 9.820193637621024e-07, "logits/chosen": -1.5794190168380737, "logits/rejected": 0.5801475644111633, "logps/chosen": -496.46734619140625, "logps/rejected": -772.162109375, "loss": 0.4255, "rewards/accuracies": 0.875, "rewards/chosen": -5.992701053619385, "rewards/margins": 10.512711524963379, "rewards/rejected": -16.50541114807129, "step": 1323 }, { "epoch": 0.8236391912908243, "grad_norm": 8.178467750549316, "learning_rate": 9.785615491009682e-07, "logits/chosen": -0.24439431726932526, "logits/rejected": 1.1344170570373535, "logps/chosen": -456.52197265625, "logps/rejected": -655.1531372070312, "loss": 0.304, "rewards/accuracies": 0.875, "rewards/chosen": -3.7074806690216064, "rewards/margins": 9.550355911254883, "rewards/rejected": -13.25783634185791, "step": 1324 }, { "epoch": 0.8242612752721618, "grad_norm": 0.032792650163173676, "learning_rate": 9.751037344398341e-07, "logits/chosen": -1.4394570589065552, "logits/rejected": 1.340558409690857, "logps/chosen": -483.6750183105469, "logps/rejected": -847.97412109375, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.3498167991638184, "rewards/margins": 12.444425582885742, "rewards/rejected": -15.794242858886719, "step": 1325 }, { "epoch": 0.8248833592534992, "grad_norm": 0.19337275624275208, "learning_rate": 9.716459197787e-07, "logits/chosen": 0.14591360092163086, "logits/rejected": 1.4384404420852661, "logps/chosen": -484.7488708496094, "logps/rejected": -675.6716918945312, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -2.910926103591919, "rewards/margins": 11.327999114990234, "rewards/rejected": -14.238924026489258, "step": 1326 }, { "epoch": 0.8255054432348367, "grad_norm": 0.04562671482563019, "learning_rate": 9.68188105117566e-07, "logits/chosen": 0.5693418979644775, "logits/rejected": 1.430376648902893, "logps/chosen": -556.4989013671875, "logps/rejected": -732.6883544921875, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -4.044366836547852, "rewards/margins": 11.590433120727539, "rewards/rejected": -15.634800910949707, "step": 1327 }, { "epoch": 0.8261275272161742, "grad_norm": 0.0025146733969449997, "learning_rate": 9.647302904564316e-07, "logits/chosen": -0.48970693349838257, "logits/rejected": 0.9110351800918579, "logps/chosen": -478.7364196777344, "logps/rejected": -673.9733276367188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.531184196472168, "rewards/margins": 14.097567558288574, "rewards/rejected": -17.628751754760742, "step": 1328 }, { "epoch": 0.8267496111975117, "grad_norm": 0.3600122034549713, "learning_rate": 9.612724757952975e-07, "logits/chosen": -1.2722668647766113, "logits/rejected": 1.3349355459213257, "logps/chosen": -489.1748352050781, "logps/rejected": -865.2108154296875, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": -4.505204200744629, "rewards/margins": 14.515049934387207, "rewards/rejected": -19.020252227783203, "step": 1329 }, { "epoch": 0.8273716951788491, "grad_norm": 5.293062686920166, "learning_rate": 9.578146611341633e-07, "logits/chosen": -1.0846469402313232, "logits/rejected": 2.003509998321533, "logps/chosen": -557.578125, "logps/rejected": -823.9796142578125, "loss": 0.112, "rewards/accuracies": 1.0, "rewards/chosen": -5.489738464355469, "rewards/margins": 9.96224594116211, "rewards/rejected": -15.451984405517578, "step": 1330 }, { "epoch": 0.8279937791601866, "grad_norm": 1.3970305919647217, "learning_rate": 9.54356846473029e-07, "logits/chosen": -1.0385339260101318, "logits/rejected": 2.27046537399292, "logps/chosen": -382.1994323730469, "logps/rejected": -752.9017333984375, "loss": 0.0187, "rewards/accuracies": 1.0, "rewards/chosen": -3.319052219390869, "rewards/margins": 12.103020668029785, "rewards/rejected": -15.422073364257812, "step": 1331 }, { "epoch": 0.8286158631415241, "grad_norm": 0.225614532828331, "learning_rate": 9.508990318118949e-07, "logits/chosen": 0.20837068557739258, "logits/rejected": 2.383972644805908, "logps/chosen": -529.0086669921875, "logps/rejected": -880.7655639648438, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": -4.22091007232666, "rewards/margins": 12.746540069580078, "rewards/rejected": -16.967451095581055, "step": 1332 }, { "epoch": 0.8292379471228616, "grad_norm": 0.005054804030805826, "learning_rate": 9.474412171507608e-07, "logits/chosen": 0.7382409572601318, "logits/rejected": 2.282156229019165, "logps/chosen": -639.5353393554688, "logps/rejected": -887.4595336914062, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -5.553280830383301, "rewards/margins": 14.08661937713623, "rewards/rejected": -19.63990020751953, "step": 1333 }, { "epoch": 0.829860031104199, "grad_norm": 12.920382499694824, "learning_rate": 9.439834024896266e-07, "logits/chosen": -1.1334662437438965, "logits/rejected": 0.44019484519958496, "logps/chosen": -537.5182495117188, "logps/rejected": -690.8375244140625, "loss": 0.3812, "rewards/accuracies": 0.875, "rewards/chosen": -5.264579772949219, "rewards/margins": 7.730589866638184, "rewards/rejected": -12.995169639587402, "step": 1334 }, { "epoch": 0.8304821150855366, "grad_norm": 14.718012809753418, "learning_rate": 9.405255878284925e-07, "logits/chosen": -1.3425624370574951, "logits/rejected": 2.144822359085083, "logps/chosen": -485.8384094238281, "logps/rejected": -788.0764770507812, "loss": 0.7119, "rewards/accuracies": 0.875, "rewards/chosen": -4.001196384429932, "rewards/margins": 11.216999053955078, "rewards/rejected": -15.218194961547852, "step": 1335 }, { "epoch": 0.831104199066874, "grad_norm": 2.900662422180176, "learning_rate": 9.370677731673583e-07, "logits/chosen": 0.2593168020248413, "logits/rejected": 1.9643827676773071, "logps/chosen": -451.28277587890625, "logps/rejected": -668.6449584960938, "loss": 0.1076, "rewards/accuracies": 0.875, "rewards/chosen": -3.0676589012145996, "rewards/margins": 8.528879165649414, "rewards/rejected": -11.596537590026855, "step": 1336 }, { "epoch": 0.8317262830482115, "grad_norm": 6.341903209686279, "learning_rate": 9.336099585062241e-07, "logits/chosen": -3.362675666809082, "logits/rejected": -0.3063855469226837, "logps/chosen": -392.78509521484375, "logps/rejected": -721.7537841796875, "loss": 0.1062, "rewards/accuracies": 0.875, "rewards/chosen": -3.4929919242858887, "rewards/margins": 11.770377159118652, "rewards/rejected": -15.2633695602417, "step": 1337 }, { "epoch": 0.832348367029549, "grad_norm": 0.1403876692056656, "learning_rate": 9.3015214384509e-07, "logits/chosen": 1.3362776041030884, "logits/rejected": 2.666769504547119, "logps/chosen": -524.3467407226562, "logps/rejected": -734.7777099609375, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -3.006568431854248, "rewards/margins": 13.599903106689453, "rewards/rejected": -16.60647201538086, "step": 1338 }, { "epoch": 0.8329704510108865, "grad_norm": 0.10948459804058075, "learning_rate": 9.266943291839559e-07, "logits/chosen": 0.4632435441017151, "logits/rejected": -0.47776299715042114, "logps/chosen": -590.8575439453125, "logps/rejected": -567.9110107421875, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -3.0416877269744873, "rewards/margins": 9.136734008789062, "rewards/rejected": -12.178421020507812, "step": 1339 }, { "epoch": 0.833592534992224, "grad_norm": 0.30080172419548035, "learning_rate": 9.232365145228217e-07, "logits/chosen": -0.7094452381134033, "logits/rejected": 1.7144113779067993, "logps/chosen": -418.07952880859375, "logps/rejected": -697.9678955078125, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -2.6920619010925293, "rewards/margins": 13.438688278198242, "rewards/rejected": -16.130748748779297, "step": 1340 }, { "epoch": 0.8342146189735614, "grad_norm": 3.7305452823638916, "learning_rate": 9.197786998616876e-07, "logits/chosen": 0.6945167779922485, "logits/rejected": 1.5903186798095703, "logps/chosen": -590.7051391601562, "logps/rejected": -705.3280639648438, "loss": 0.0603, "rewards/accuracies": 1.0, "rewards/chosen": -4.753786563873291, "rewards/margins": 9.778225898742676, "rewards/rejected": -14.532012939453125, "step": 1341 }, { "epoch": 0.8348367029548989, "grad_norm": 9.471783638000488, "learning_rate": 9.163208852005532e-07, "logits/chosen": -1.955011010169983, "logits/rejected": 0.48641395568847656, "logps/chosen": -524.670166015625, "logps/rejected": -781.0850830078125, "loss": 0.3763, "rewards/accuracies": 0.875, "rewards/chosen": -3.608844757080078, "rewards/margins": 8.302030563354492, "rewards/rejected": -11.910876274108887, "step": 1342 }, { "epoch": 0.8354587869362364, "grad_norm": 0.0064768255688250065, "learning_rate": 9.128630705394191e-07, "logits/chosen": -3.000549554824829, "logits/rejected": 0.9102052450180054, "logps/chosen": -389.06732177734375, "logps/rejected": -747.5743408203125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.9797048568725586, "rewards/margins": 13.444413185119629, "rewards/rejected": -17.424118041992188, "step": 1343 }, { "epoch": 0.8360808709175739, "grad_norm": 13.225577354431152, "learning_rate": 9.09405255878285e-07, "logits/chosen": -0.17028212547302246, "logits/rejected": 1.4072377681732178, "logps/chosen": -558.7680053710938, "logps/rejected": -785.1991577148438, "loss": 0.5216, "rewards/accuracies": 0.875, "rewards/chosen": -5.453101634979248, "rewards/margins": 9.274470329284668, "rewards/rejected": -14.727571487426758, "step": 1344 }, { "epoch": 0.8367029548989113, "grad_norm": 0.2019207775592804, "learning_rate": 9.059474412171508e-07, "logits/chosen": -3.2381317615509033, "logits/rejected": 1.5668666362762451, "logps/chosen": -321.7070007324219, "logps/rejected": -723.7025756835938, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -4.104261875152588, "rewards/margins": 13.174274444580078, "rewards/rejected": -17.278535842895508, "step": 1345 }, { "epoch": 0.8373250388802488, "grad_norm": 1.266997218132019, "learning_rate": 9.024896265560167e-07, "logits/chosen": 0.8543847799301147, "logits/rejected": 0.8320592045783997, "logps/chosen": -539.4231567382812, "logps/rejected": -633.588623046875, "loss": 0.0231, "rewards/accuracies": 1.0, "rewards/chosen": -2.302938222885132, "rewards/margins": 8.438183784484863, "rewards/rejected": -10.741122245788574, "step": 1346 }, { "epoch": 0.8379471228615863, "grad_norm": 2.9454104900360107, "learning_rate": 8.990318118948826e-07, "logits/chosen": 0.0013751983642578125, "logits/rejected": 3.272942066192627, "logps/chosen": -535.0616455078125, "logps/rejected": -850.158203125, "loss": 0.062, "rewards/accuracies": 1.0, "rewards/chosen": -3.918030261993408, "rewards/margins": 10.071514129638672, "rewards/rejected": -13.989544868469238, "step": 1347 }, { "epoch": 0.8385692068429238, "grad_norm": 0.005711916368454695, "learning_rate": 8.955739972337483e-07, "logits/chosen": -2.4882516860961914, "logits/rejected": 1.6005373001098633, "logps/chosen": -254.40719604492188, "logps/rejected": -684.0638427734375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -2.706209421157837, "rewards/margins": 14.920669555664062, "rewards/rejected": -17.62687873840332, "step": 1348 }, { "epoch": 0.8391912908242612, "grad_norm": 0.10940305143594742, "learning_rate": 8.921161825726142e-07, "logits/chosen": -3.2102601528167725, "logits/rejected": 0.9551908373832703, "logps/chosen": -263.2800598144531, "logps/rejected": -700.904296875, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -2.0797934532165527, "rewards/margins": 14.57675552368164, "rewards/rejected": -16.65654754638672, "step": 1349 }, { "epoch": 0.8398133748055988, "grad_norm": 0.0058289929293096066, "learning_rate": 8.8865836791148e-07, "logits/chosen": 0.03441344201564789, "logits/rejected": 0.7335182428359985, "logps/chosen": -549.462890625, "logps/rejected": -830.7279052734375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -4.383805751800537, "rewards/margins": 14.987558364868164, "rewards/rejected": -19.37136459350586, "step": 1350 }, { "epoch": 0.8404354587869363, "grad_norm": 0.028143687173724174, "learning_rate": 8.852005532503459e-07, "logits/chosen": -3.870971202850342, "logits/rejected": 1.6191153526306152, "logps/chosen": -272.2510681152344, "logps/rejected": -750.7105102539062, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -2.586188554763794, "rewards/margins": 12.034337997436523, "rewards/rejected": -14.620525360107422, "step": 1351 }, { "epoch": 0.8410575427682737, "grad_norm": 0.018001124262809753, "learning_rate": 8.817427385892118e-07, "logits/chosen": -1.1499505043029785, "logits/rejected": 1.8359794616699219, "logps/chosen": -384.08251953125, "logps/rejected": -729.1475219726562, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -4.245957851409912, "rewards/margins": 12.060300827026367, "rewards/rejected": -16.306257247924805, "step": 1352 }, { "epoch": 0.8416796267496112, "grad_norm": 6.862741470336914, "learning_rate": 8.782849239280774e-07, "logits/chosen": -2.006868839263916, "logits/rejected": 1.562628149986267, "logps/chosen": -501.603271484375, "logps/rejected": -863.5671997070312, "loss": 0.1077, "rewards/accuracies": 0.875, "rewards/chosen": -3.4207043647766113, "rewards/margins": 17.213699340820312, "rewards/rejected": -20.634403228759766, "step": 1353 }, { "epoch": 0.8423017107309487, "grad_norm": 1.333160161972046, "learning_rate": 8.748271092669433e-07, "logits/chosen": 1.5362548828125, "logits/rejected": 2.6612493991851807, "logps/chosen": -660.3275146484375, "logps/rejected": -835.5550537109375, "loss": 0.0173, "rewards/accuracies": 1.0, "rewards/chosen": -4.851016998291016, "rewards/margins": 11.205677032470703, "rewards/rejected": -16.05669593811035, "step": 1354 }, { "epoch": 0.8429237947122862, "grad_norm": 5.248742309049703e-05, "learning_rate": 8.713692946058091e-07, "logits/chosen": -4.058653354644775, "logits/rejected": 1.7126109600067139, "logps/chosen": -288.18414306640625, "logps/rejected": -854.5462036132812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.216399073600769, "rewards/margins": 19.67736053466797, "rewards/rejected": -20.893760681152344, "step": 1355 }, { "epoch": 0.8435458786936236, "grad_norm": 13.230230331420898, "learning_rate": 8.67911479944675e-07, "logits/chosen": -4.1434173583984375, "logits/rejected": 0.5164927244186401, "logps/chosen": -237.41200256347656, "logps/rejected": -641.2313232421875, "loss": 0.417, "rewards/accuracies": 0.875, "rewards/chosen": -2.782505750656128, "rewards/margins": 11.066420555114746, "rewards/rejected": -13.848926544189453, "step": 1356 }, { "epoch": 0.8441679626749611, "grad_norm": 0.003207965288311243, "learning_rate": 8.644536652835409e-07, "logits/chosen": -2.3563225269317627, "logits/rejected": 2.1720571517944336, "logps/chosen": -272.1138916015625, "logps/rejected": -683.6136474609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.5066637992858887, "rewards/margins": 14.406944274902344, "rewards/rejected": -15.91360855102539, "step": 1357 }, { "epoch": 0.8447900466562986, "grad_norm": 0.7036660313606262, "learning_rate": 8.609958506224067e-07, "logits/chosen": -1.394829511642456, "logits/rejected": 0.881989598274231, "logps/chosen": -516.1932983398438, "logps/rejected": -888.181640625, "loss": 0.0068, "rewards/accuracies": 1.0, "rewards/chosen": -5.468153953552246, "rewards/margins": 14.007284164428711, "rewards/rejected": -19.475439071655273, "step": 1358 }, { "epoch": 0.8454121306376361, "grad_norm": 0.011394154280424118, "learning_rate": 8.575380359612726e-07, "logits/chosen": -3.0608134269714355, "logits/rejected": 1.6058084964752197, "logps/chosen": -266.5459899902344, "logps/rejected": -744.260986328125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -2.872096061706543, "rewards/margins": 14.397649765014648, "rewards/rejected": -17.269744873046875, "step": 1359 }, { "epoch": 0.8460342146189735, "grad_norm": 0.07397133111953735, "learning_rate": 8.540802213001384e-07, "logits/chosen": 1.049421787261963, "logits/rejected": 3.0411643981933594, "logps/chosen": -414.5494384765625, "logps/rejected": -830.6416015625, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -1.3990598917007446, "rewards/margins": 12.567740440368652, "rewards/rejected": -13.966800689697266, "step": 1360 }, { "epoch": 0.846656298600311, "grad_norm": 7.310213565826416, "learning_rate": 8.506224066390042e-07, "logits/chosen": -2.226365327835083, "logits/rejected": -0.41856732964515686, "logps/chosen": -543.5997924804688, "logps/rejected": -758.863037109375, "loss": 0.2589, "rewards/accuracies": 0.875, "rewards/chosen": -3.990732192993164, "rewards/margins": 11.986639022827148, "rewards/rejected": -15.977372169494629, "step": 1361 }, { "epoch": 0.8472783825816486, "grad_norm": 0.07070755213499069, "learning_rate": 8.471645919778701e-07, "logits/chosen": -4.999320983886719, "logits/rejected": -0.378592848777771, "logps/chosen": -319.30267333984375, "logps/rejected": -716.9548950195312, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -2.3934333324432373, "rewards/margins": 11.307397842407227, "rewards/rejected": -13.700831413269043, "step": 1362 }, { "epoch": 0.847900466562986, "grad_norm": 0.04150617867708206, "learning_rate": 8.43706777316736e-07, "logits/chosen": 1.017225742340088, "logits/rejected": 2.564547538757324, "logps/chosen": -649.9676513671875, "logps/rejected": -868.9345092773438, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -4.901700019836426, "rewards/margins": 12.61135482788086, "rewards/rejected": -17.5130558013916, "step": 1363 }, { "epoch": 0.8485225505443235, "grad_norm": 11.709734916687012, "learning_rate": 8.402489626556018e-07, "logits/chosen": -1.2615110874176025, "logits/rejected": 1.6257734298706055, "logps/chosen": -533.0253295898438, "logps/rejected": -793.2706298828125, "loss": 0.2694, "rewards/accuracies": 0.875, "rewards/chosen": -6.151252746582031, "rewards/margins": 11.598294258117676, "rewards/rejected": -17.74954605102539, "step": 1364 }, { "epoch": 0.8491446345256609, "grad_norm": 0.0007369054364971817, "learning_rate": 8.367911479944676e-07, "logits/chosen": -1.0436058044433594, "logits/rejected": 1.38249933719635, "logps/chosen": -527.0166015625, "logps/rejected": -792.3524169921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.5765233039855957, "rewards/margins": 14.381919860839844, "rewards/rejected": -17.95844268798828, "step": 1365 }, { "epoch": 0.8497667185069985, "grad_norm": 5.157384872436523, "learning_rate": 8.333333333333333e-07, "logits/chosen": -1.1935375928878784, "logits/rejected": 1.0840433835983276, "logps/chosen": -477.71282958984375, "logps/rejected": -776.1897583007812, "loss": 0.0667, "rewards/accuracies": 1.0, "rewards/chosen": -6.438204288482666, "rewards/margins": 12.184131622314453, "rewards/rejected": -18.62233543395996, "step": 1366 }, { "epoch": 0.8503888024883359, "grad_norm": 0.08662021905183792, "learning_rate": 8.298755186721992e-07, "logits/chosen": -0.22443029284477234, "logits/rejected": 1.3784890174865723, "logps/chosen": -581.3570556640625, "logps/rejected": -854.7089233398438, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -3.905120611190796, "rewards/margins": 13.831296920776367, "rewards/rejected": -17.73641586303711, "step": 1367 }, { "epoch": 0.8510108864696734, "grad_norm": 0.0001285905163967982, "learning_rate": 8.264177040110651e-07, "logits/chosen": -0.5906904339790344, "logits/rejected": 2.230081081390381, "logps/chosen": -606.8558959960938, "logps/rejected": -874.8109741210938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.215940475463867, "rewards/margins": 16.946250915527344, "rewards/rejected": -20.162193298339844, "step": 1368 }, { "epoch": 0.8516329704510109, "grad_norm": 12.074705123901367, "learning_rate": 8.229598893499309e-07, "logits/chosen": 0.9988129138946533, "logits/rejected": 1.6083152294158936, "logps/chosen": -736.2765502929688, "logps/rejected": -822.7845458984375, "loss": 0.2783, "rewards/accuracies": 0.875, "rewards/chosen": -7.024781703948975, "rewards/margins": 8.590043067932129, "rewards/rejected": -15.614825248718262, "step": 1369 }, { "epoch": 0.8522550544323484, "grad_norm": 11.134469032287598, "learning_rate": 8.195020746887968e-07, "logits/chosen": 0.5310500860214233, "logits/rejected": 1.646470069885254, "logps/chosen": -611.5623779296875, "logps/rejected": -725.1040649414062, "loss": 0.295, "rewards/accuracies": 0.875, "rewards/chosen": -3.576406955718994, "rewards/margins": 5.060890197753906, "rewards/rejected": -8.637297630310059, "step": 1370 }, { "epoch": 0.8528771384136858, "grad_norm": 9.671308517456055, "learning_rate": 8.160442600276625e-07, "logits/chosen": -0.26062333583831787, "logits/rejected": 2.7353827953338623, "logps/chosen": -460.89898681640625, "logps/rejected": -784.552734375, "loss": 0.2607, "rewards/accuracies": 0.875, "rewards/chosen": -5.218446731567383, "rewards/margins": 9.869426727294922, "rewards/rejected": -15.087873458862305, "step": 1371 }, { "epoch": 0.8534992223950233, "grad_norm": 0.4493100047111511, "learning_rate": 8.125864453665284e-07, "logits/chosen": -3.580364227294922, "logits/rejected": 1.163841962814331, "logps/chosen": -364.869140625, "logps/rejected": -784.8856811523438, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": -2.201347589492798, "rewards/margins": 14.422802925109863, "rewards/rejected": -16.624149322509766, "step": 1372 }, { "epoch": 0.8541213063763609, "grad_norm": 5.212620735168457, "learning_rate": 8.091286307053943e-07, "logits/chosen": -1.2408788204193115, "logits/rejected": 1.1992793083190918, "logps/chosen": -345.6400146484375, "logps/rejected": -635.1298828125, "loss": 0.0642, "rewards/accuracies": 1.0, "rewards/chosen": -2.8372135162353516, "rewards/margins": 10.374082565307617, "rewards/rejected": -13.211296081542969, "step": 1373 }, { "epoch": 0.8547433903576983, "grad_norm": 8.535562515258789, "learning_rate": 8.056708160442601e-07, "logits/chosen": 0.30715852975845337, "logits/rejected": 2.0107920169830322, "logps/chosen": -643.1803588867188, "logps/rejected": -872.908203125, "loss": 0.1797, "rewards/accuracies": 0.875, "rewards/chosen": -4.986381530761719, "rewards/margins": 10.883899688720703, "rewards/rejected": -15.870281219482422, "step": 1374 }, { "epoch": 0.8553654743390358, "grad_norm": 1.3095946311950684, "learning_rate": 8.02213001383126e-07, "logits/chosen": -1.2480816841125488, "logits/rejected": 2.046372175216675, "logps/chosen": -407.83673095703125, "logps/rejected": -736.2703247070312, "loss": 0.038, "rewards/accuracies": 1.0, "rewards/chosen": -4.496979713439941, "rewards/margins": 9.761946678161621, "rewards/rejected": -14.258926391601562, "step": 1375 }, { "epoch": 0.8559875583203732, "grad_norm": 0.009364476427435875, "learning_rate": 7.987551867219917e-07, "logits/chosen": -1.8134479522705078, "logits/rejected": 3.4571945667266846, "logps/chosen": -432.1346740722656, "logps/rejected": -920.4662475585938, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -2.685150146484375, "rewards/margins": 17.970428466796875, "rewards/rejected": -20.655576705932617, "step": 1376 }, { "epoch": 0.8566096423017108, "grad_norm": 0.22083696722984314, "learning_rate": 7.952973720608575e-07, "logits/chosen": 0.1405980885028839, "logits/rejected": 1.5262184143066406, "logps/chosen": -473.7147521972656, "logps/rejected": -679.0631103515625, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": -3.756345510482788, "rewards/margins": 9.944561004638672, "rewards/rejected": -13.700905799865723, "step": 1377 }, { "epoch": 0.8572317262830482, "grad_norm": 3.816192150115967, "learning_rate": 7.918395573997234e-07, "logits/chosen": 0.7515852451324463, "logits/rejected": 1.9171783924102783, "logps/chosen": -498.9930114746094, "logps/rejected": -662.2570190429688, "loss": 0.0584, "rewards/accuracies": 1.0, "rewards/chosen": -3.1550369262695312, "rewards/margins": 9.093594551086426, "rewards/rejected": -12.248631477355957, "step": 1378 }, { "epoch": 0.8578538102643857, "grad_norm": 8.978050231933594, "learning_rate": 7.883817427385892e-07, "logits/chosen": -1.4691927433013916, "logits/rejected": 0.75522381067276, "logps/chosen": -543.03076171875, "logps/rejected": -805.3604125976562, "loss": 0.331, "rewards/accuracies": 0.875, "rewards/chosen": -4.248265266418457, "rewards/margins": 10.700071334838867, "rewards/rejected": -14.94833755493164, "step": 1379 }, { "epoch": 0.8584758942457231, "grad_norm": 2.4178318977355957, "learning_rate": 7.849239280774551e-07, "logits/chosen": -1.6253913640975952, "logits/rejected": 0.8435375690460205, "logps/chosen": -479.9702453613281, "logps/rejected": -767.1831665039062, "loss": 0.029, "rewards/accuracies": 1.0, "rewards/chosen": -4.441349506378174, "rewards/margins": 11.86941146850586, "rewards/rejected": -16.310762405395508, "step": 1380 }, { "epoch": 0.8590979782270607, "grad_norm": 0.48978862166404724, "learning_rate": 7.81466113416321e-07, "logits/chosen": -2.6471264362335205, "logits/rejected": 0.6247217655181885, "logps/chosen": -344.5264892578125, "logps/rejected": -612.7557983398438, "loss": 0.0069, "rewards/accuracies": 1.0, "rewards/chosen": -4.714255332946777, "rewards/margins": 10.312060356140137, "rewards/rejected": -15.026315689086914, "step": 1381 }, { "epoch": 0.8597200622083981, "grad_norm": 6.182023048400879, "learning_rate": 7.780082987551868e-07, "logits/chosen": -0.1082909107208252, "logits/rejected": 1.5570793151855469, "logps/chosen": -601.014892578125, "logps/rejected": -910.1640625, "loss": 0.0918, "rewards/accuracies": 0.875, "rewards/chosen": -5.905216217041016, "rewards/margins": 13.815837860107422, "rewards/rejected": -19.721054077148438, "step": 1382 }, { "epoch": 0.8603421461897356, "grad_norm": 0.024595655500888824, "learning_rate": 7.745504840940527e-07, "logits/chosen": -2.6741578578948975, "logits/rejected": 1.1215143203735352, "logps/chosen": -341.64605712890625, "logps/rejected": -820.6256103515625, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.2337241172790527, "rewards/margins": 17.776105880737305, "rewards/rejected": -20.009830474853516, "step": 1383 }, { "epoch": 0.860964230171073, "grad_norm": 2.9983041286468506, "learning_rate": 7.710926694329185e-07, "logits/chosen": 0.018856346607208252, "logits/rejected": 2.3888607025146484, "logps/chosen": -463.0792236328125, "logps/rejected": -819.1570434570312, "loss": 0.0594, "rewards/accuracies": 1.0, "rewards/chosen": -3.785944700241089, "rewards/margins": 11.84388542175293, "rewards/rejected": -15.629829406738281, "step": 1384 }, { "epoch": 0.8615863141524106, "grad_norm": 5.210484027862549, "learning_rate": 7.676348547717843e-07, "logits/chosen": -0.46806907653808594, "logits/rejected": 0.0537090003490448, "logps/chosen": -569.5161743164062, "logps/rejected": -714.6705322265625, "loss": 0.0902, "rewards/accuracies": 1.0, "rewards/chosen": -3.8570396900177, "rewards/margins": 12.085654258728027, "rewards/rejected": -15.942694664001465, "step": 1385 }, { "epoch": 0.862208398133748, "grad_norm": 0.012987494468688965, "learning_rate": 7.641770401106502e-07, "logits/chosen": -1.5736701488494873, "logits/rejected": 1.6592588424682617, "logps/chosen": -484.045166015625, "logps/rejected": -854.8268432617188, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.8748068809509277, "rewards/margins": 15.584949493408203, "rewards/rejected": -19.45975685119629, "step": 1386 }, { "epoch": 0.8628304821150855, "grad_norm": 0.03192966803908348, "learning_rate": 7.607192254495159e-07, "logits/chosen": -1.372562050819397, "logits/rejected": 0.44019877910614014, "logps/chosen": -548.3515625, "logps/rejected": -854.657470703125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -3.667433977127075, "rewards/margins": 17.325468063354492, "rewards/rejected": -20.992904663085938, "step": 1387 }, { "epoch": 0.8634525660964231, "grad_norm": 1.1009788513183594, "learning_rate": 7.572614107883818e-07, "logits/chosen": -0.8460559844970703, "logits/rejected": -0.1236199140548706, "logps/chosen": -598.9733276367188, "logps/rejected": -733.540771484375, "loss": 0.0106, "rewards/accuracies": 1.0, "rewards/chosen": -7.199460506439209, "rewards/margins": 10.72243881225586, "rewards/rejected": -17.921899795532227, "step": 1388 }, { "epoch": 0.8640746500777605, "grad_norm": 10.737199783325195, "learning_rate": 7.538035961272477e-07, "logits/chosen": -2.96671724319458, "logits/rejected": 0.8957569003105164, "logps/chosen": -366.66534423828125, "logps/rejected": -768.0054321289062, "loss": 0.2642, "rewards/accuracies": 0.875, "rewards/chosen": -4.470893859863281, "rewards/margins": 11.980895042419434, "rewards/rejected": -16.45178985595703, "step": 1389 }, { "epoch": 0.864696734059098, "grad_norm": 1.6479535102844238, "learning_rate": 7.503457814661134e-07, "logits/chosen": -2.05588960647583, "logits/rejected": 1.945601224899292, "logps/chosen": -327.6200256347656, "logps/rejected": -705.5081787109375, "loss": 0.1305, "rewards/accuracies": 0.875, "rewards/chosen": -2.5512354373931885, "rewards/margins": 15.873303413391113, "rewards/rejected": -18.42453956604004, "step": 1390 }, { "epoch": 0.8653188180404354, "grad_norm": 0.0028443310875445604, "learning_rate": 7.468879668049793e-07, "logits/chosen": -0.9263023138046265, "logits/rejected": 1.1838053464889526, "logps/chosen": -560.9407958984375, "logps/rejected": -874.5242919921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.8259623050689697, "rewards/margins": 16.26862907409668, "rewards/rejected": -19.094593048095703, "step": 1391 }, { "epoch": 0.865940902021773, "grad_norm": 10.601418495178223, "learning_rate": 7.434301521438451e-07, "logits/chosen": 2.6268818378448486, "logits/rejected": 4.109800815582275, "logps/chosen": -755.13916015625, "logps/rejected": -989.1985473632812, "loss": 0.2051, "rewards/accuracies": 0.875, "rewards/chosen": -6.594049453735352, "rewards/margins": 9.29098892211914, "rewards/rejected": -15.885038375854492, "step": 1392 }, { "epoch": 0.8665629860031104, "grad_norm": 0.38172370195388794, "learning_rate": 7.39972337482711e-07, "logits/chosen": -1.5836894512176514, "logits/rejected": 1.798182725906372, "logps/chosen": -518.0062255859375, "logps/rejected": -822.2635498046875, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": -5.466948509216309, "rewards/margins": 13.988347053527832, "rewards/rejected": -19.45529556274414, "step": 1393 }, { "epoch": 0.8671850699844479, "grad_norm": 5.430830001831055, "learning_rate": 7.365145228215769e-07, "logits/chosen": -2.41754150390625, "logits/rejected": 1.1162140369415283, "logps/chosen": -355.3067932128906, "logps/rejected": -634.556640625, "loss": 0.0711, "rewards/accuracies": 1.0, "rewards/chosen": -3.560699701309204, "rewards/margins": 10.378538131713867, "rewards/rejected": -13.939236640930176, "step": 1394 }, { "epoch": 0.8678071539657853, "grad_norm": 0.019041309133172035, "learning_rate": 7.330567081604426e-07, "logits/chosen": -0.2181365042924881, "logits/rejected": 2.9022388458251953, "logps/chosen": -443.5439147949219, "logps/rejected": -732.521728515625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -3.8353159427642822, "rewards/margins": 13.631763458251953, "rewards/rejected": -17.467079162597656, "step": 1395 }, { "epoch": 0.8684292379471229, "grad_norm": 9.69019889831543, "learning_rate": 7.295988934993085e-07, "logits/chosen": 0.6469730138778687, "logits/rejected": 3.232738971710205, "logps/chosen": -529.0391845703125, "logps/rejected": -796.0674438476562, "loss": 0.2223, "rewards/accuracies": 0.875, "rewards/chosen": -4.051665306091309, "rewards/margins": 8.79385757446289, "rewards/rejected": -12.8455228805542, "step": 1396 }, { "epoch": 0.8690513219284604, "grad_norm": 17.920333862304688, "learning_rate": 7.261410788381744e-07, "logits/chosen": -2.646892547607422, "logits/rejected": 0.15039432048797607, "logps/chosen": -514.814453125, "logps/rejected": -778.33642578125, "loss": 0.7821, "rewards/accuracies": 0.875, "rewards/chosen": -4.745429992675781, "rewards/margins": 9.697530746459961, "rewards/rejected": -14.442960739135742, "step": 1397 }, { "epoch": 0.8696734059097978, "grad_norm": 11.870722770690918, "learning_rate": 7.226832641770402e-07, "logits/chosen": -1.017686128616333, "logits/rejected": 2.1543354988098145, "logps/chosen": -417.48486328125, "logps/rejected": -786.0995483398438, "loss": 0.9964, "rewards/accuracies": 0.875, "rewards/chosen": -2.801851511001587, "rewards/margins": 12.55488395690918, "rewards/rejected": -15.356735229492188, "step": 1398 }, { "epoch": 0.8702954898911353, "grad_norm": 8.021544456481934, "learning_rate": 7.19225449515906e-07, "logits/chosen": 0.08194294571876526, "logits/rejected": 2.7336831092834473, "logps/chosen": -538.41015625, "logps/rejected": -836.6657104492188, "loss": 0.1994, "rewards/accuracies": 0.875, "rewards/chosen": -2.0980591773986816, "rewards/margins": 11.115087509155273, "rewards/rejected": -13.21314811706543, "step": 1399 }, { "epoch": 0.8709175738724728, "grad_norm": 1.0668821334838867, "learning_rate": 7.157676348547718e-07, "logits/chosen": -0.5868552923202515, "logits/rejected": 0.6949260234832764, "logps/chosen": -513.611083984375, "logps/rejected": -708.9735107421875, "loss": 0.0148, "rewards/accuracies": 1.0, "rewards/chosen": -3.4980578422546387, "rewards/margins": 9.969182014465332, "rewards/rejected": -13.467239379882812, "step": 1400 }, { "epoch": 0.8715396578538103, "grad_norm": 8.87119197845459, "learning_rate": 7.123098201936376e-07, "logits/chosen": -0.17994895577430725, "logits/rejected": 2.1608054637908936, "logps/chosen": -556.9896240234375, "logps/rejected": -639.0247192382812, "loss": 0.4497, "rewards/accuracies": 0.875, "rewards/chosen": -3.5011186599731445, "rewards/margins": 6.547773361206055, "rewards/rejected": -10.0488920211792, "step": 1401 }, { "epoch": 0.8721617418351477, "grad_norm": 10.818016052246094, "learning_rate": 7.088520055325035e-07, "logits/chosen": 0.28853344917297363, "logits/rejected": 2.017643928527832, "logps/chosen": -589.4359741210938, "logps/rejected": -783.9228515625, "loss": 0.3757, "rewards/accuracies": 0.875, "rewards/chosen": -3.7464447021484375, "rewards/margins": 13.00349235534668, "rewards/rejected": -16.74993896484375, "step": 1402 }, { "epoch": 0.8727838258164852, "grad_norm": 1.0852562189102173, "learning_rate": 7.053941908713693e-07, "logits/chosen": -2.258829116821289, "logits/rejected": 1.1972754001617432, "logps/chosen": -372.9857482910156, "logps/rejected": -744.4384765625, "loss": 0.0158, "rewards/accuracies": 1.0, "rewards/chosen": -2.3523507118225098, "rewards/margins": 10.927803039550781, "rewards/rejected": -13.28015422821045, "step": 1403 }, { "epoch": 0.8734059097978227, "grad_norm": 1.5867135524749756, "learning_rate": 7.019363762102352e-07, "logits/chosen": -1.815974235534668, "logits/rejected": 0.7950289249420166, "logps/chosen": -384.2825012207031, "logps/rejected": -641.048828125, "loss": 0.0304, "rewards/accuracies": 1.0, "rewards/chosen": -1.8861889839172363, "rewards/margins": 7.233743190765381, "rewards/rejected": -9.119932174682617, "step": 1404 }, { "epoch": 0.8740279937791602, "grad_norm": 0.7310445308685303, "learning_rate": 6.984785615491011e-07, "logits/chosen": -2.6081879138946533, "logits/rejected": 3.058281660079956, "logps/chosen": -395.30548095703125, "logps/rejected": -908.789794921875, "loss": 0.0074, "rewards/accuracies": 1.0, "rewards/chosen": -3.690181255340576, "rewards/margins": 14.369673728942871, "rewards/rejected": -18.05985450744629, "step": 1405 }, { "epoch": 0.8746500777604976, "grad_norm": 11.29207992553711, "learning_rate": 6.950207468879669e-07, "logits/chosen": -0.00781300663948059, "logits/rejected": 1.1955921649932861, "logps/chosen": -533.467529296875, "logps/rejected": -698.3003540039062, "loss": 0.2666, "rewards/accuracies": 0.875, "rewards/chosen": -4.7238945960998535, "rewards/margins": 7.854576587677002, "rewards/rejected": -12.578471183776855, "step": 1406 }, { "epoch": 0.8752721617418352, "grad_norm": 0.0835161805152893, "learning_rate": 6.915629322268328e-07, "logits/chosen": -2.086176633834839, "logits/rejected": 1.9443038702011108, "logps/chosen": -401.334228515625, "logps/rejected": -739.4104614257812, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -3.937685251235962, "rewards/margins": 14.385418891906738, "rewards/rejected": -18.323104858398438, "step": 1407 }, { "epoch": 0.8758942457231726, "grad_norm": 0.009362341836094856, "learning_rate": 6.881051175656986e-07, "logits/chosen": -2.498250961303711, "logits/rejected": 0.8302533030509949, "logps/chosen": -300.707763671875, "logps/rejected": -647.570068359375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -2.50215482711792, "rewards/margins": 14.151586532592773, "rewards/rejected": -16.65374183654785, "step": 1408 }, { "epoch": 0.8765163297045101, "grad_norm": 14.93471908569336, "learning_rate": 6.846473029045644e-07, "logits/chosen": -0.6871500015258789, "logits/rejected": 2.1881706714630127, "logps/chosen": -523.37841796875, "logps/rejected": -820.8674926757812, "loss": 0.3771, "rewards/accuracies": 0.875, "rewards/chosen": -4.21358585357666, "rewards/margins": 13.172981262207031, "rewards/rejected": -17.38656997680664, "step": 1409 }, { "epoch": 0.8771384136858476, "grad_norm": 0.003472434589639306, "learning_rate": 6.811894882434302e-07, "logits/chosen": -1.9387335777282715, "logits/rejected": 2.243098020553589, "logps/chosen": -322.2276306152344, "logps/rejected": -802.04931640625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.294560194015503, "rewards/margins": 15.571359634399414, "rewards/rejected": -18.86591911315918, "step": 1410 }, { "epoch": 0.8777604976671851, "grad_norm": 1.1506764888763428, "learning_rate": 6.77731673582296e-07, "logits/chosen": -0.5779008865356445, "logits/rejected": 1.769566297531128, "logps/chosen": -358.8513488769531, "logps/rejected": -578.7428588867188, "loss": 0.0246, "rewards/accuracies": 1.0, "rewards/chosen": -1.9300750494003296, "rewards/margins": 10.129292488098145, "rewards/rejected": -12.059367179870605, "step": 1411 }, { "epoch": 0.8783825816485226, "grad_norm": 0.6167739629745483, "learning_rate": 6.742738589211619e-07, "logits/chosen": 0.2576761841773987, "logits/rejected": 1.1397533416748047, "logps/chosen": -624.6177978515625, "logps/rejected": -777.5796508789062, "loss": 0.0096, "rewards/accuracies": 1.0, "rewards/chosen": -4.530205726623535, "rewards/margins": 9.51661491394043, "rewards/rejected": -14.046819686889648, "step": 1412 }, { "epoch": 0.87900466562986, "grad_norm": 7.0976128578186035, "learning_rate": 6.708160442600276e-07, "logits/chosen": 0.16638028621673584, "logits/rejected": 1.976906657218933, "logps/chosen": -570.1187744140625, "logps/rejected": -846.4263916015625, "loss": 0.1637, "rewards/accuracies": 0.875, "rewards/chosen": -3.0160248279571533, "rewards/margins": 13.625778198242188, "rewards/rejected": -16.641801834106445, "step": 1413 }, { "epoch": 0.8796267496111975, "grad_norm": 0.02713729999959469, "learning_rate": 6.673582295988935e-07, "logits/chosen": -2.6876535415649414, "logits/rejected": 1.369954228401184, "logps/chosen": -399.1125793457031, "logps/rejected": -837.5007934570312, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.556065082550049, "rewards/margins": 16.69470977783203, "rewards/rejected": -19.250774383544922, "step": 1414 }, { "epoch": 0.880248833592535, "grad_norm": 0.035688366740942, "learning_rate": 6.639004149377594e-07, "logits/chosen": -1.1252596378326416, "logits/rejected": 1.9453790187835693, "logps/chosen": -460.26165771484375, "logps/rejected": -812.6611328125, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -2.6670496463775635, "rewards/margins": 11.58353042602539, "rewards/rejected": -14.250579833984375, "step": 1415 }, { "epoch": 0.8808709175738725, "grad_norm": 0.23432131111621857, "learning_rate": 6.604426002766252e-07, "logits/chosen": -1.4184788465499878, "logits/rejected": 0.8993440866470337, "logps/chosen": -405.36468505859375, "logps/rejected": -725.34130859375, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": -2.782729387283325, "rewards/margins": 13.66423225402832, "rewards/rejected": -16.446962356567383, "step": 1416 }, { "epoch": 0.8814930015552099, "grad_norm": 5.003701210021973, "learning_rate": 6.569847856154911e-07, "logits/chosen": -0.17210662364959717, "logits/rejected": 0.3148987591266632, "logps/chosen": -607.8765869140625, "logps/rejected": -811.6129150390625, "loss": 0.0569, "rewards/accuracies": 1.0, "rewards/chosen": -6.158517360687256, "rewards/margins": 11.490523338317871, "rewards/rejected": -17.64904022216797, "step": 1417 }, { "epoch": 0.8821150855365474, "grad_norm": 12.458187103271484, "learning_rate": 6.53526970954357e-07, "logits/chosen": 1.687244176864624, "logits/rejected": 1.638144612312317, "logps/chosen": -630.2857055664062, "logps/rejected": -725.8136596679688, "loss": 0.3976, "rewards/accuracies": 0.875, "rewards/chosen": -5.348006248474121, "rewards/margins": 11.518725395202637, "rewards/rejected": -16.866731643676758, "step": 1418 }, { "epoch": 0.882737169517885, "grad_norm": 9.221601486206055, "learning_rate": 6.500691562932227e-07, "logits/chosen": -0.5628889799118042, "logits/rejected": 0.5146147608757019, "logps/chosen": -531.3296508789062, "logps/rejected": -714.38671875, "loss": 0.3953, "rewards/accuracies": 0.875, "rewards/chosen": -1.9627403020858765, "rewards/margins": 11.386821746826172, "rewards/rejected": -13.34956169128418, "step": 1419 }, { "epoch": 0.8833592534992224, "grad_norm": 0.1647167056798935, "learning_rate": 6.466113416320886e-07, "logits/chosen": -1.7690558433532715, "logits/rejected": 0.28362834453582764, "logps/chosen": -488.35125732421875, "logps/rejected": -772.2354736328125, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -1.6709685325622559, "rewards/margins": 13.972212791442871, "rewards/rejected": -15.643180847167969, "step": 1420 }, { "epoch": 0.8839813374805598, "grad_norm": 13.848608016967773, "learning_rate": 6.431535269709543e-07, "logits/chosen": 0.9614359736442566, "logits/rejected": 0.7644275426864624, "logps/chosen": -615.85595703125, "logps/rejected": -771.2326049804688, "loss": 0.4733, "rewards/accuracies": 0.875, "rewards/chosen": -4.622065544128418, "rewards/margins": 13.143218994140625, "rewards/rejected": -17.765283584594727, "step": 1421 }, { "epoch": 0.8846034214618974, "grad_norm": 3.3489890098571777, "learning_rate": 6.396957123098202e-07, "logits/chosen": -0.32018807530403137, "logits/rejected": 1.1269984245300293, "logps/chosen": -611.63818359375, "logps/rejected": -694.8058471679688, "loss": 0.0638, "rewards/accuracies": 1.0, "rewards/chosen": -3.8444929122924805, "rewards/margins": 6.3532819747924805, "rewards/rejected": -10.197774887084961, "step": 1422 }, { "epoch": 0.8852255054432349, "grad_norm": 6.687376976013184, "learning_rate": 6.362378976486861e-07, "logits/chosen": 0.1935296654701233, "logits/rejected": 2.671964645385742, "logps/chosen": -480.5648193359375, "logps/rejected": -723.6080322265625, "loss": 0.0968, "rewards/accuracies": 0.875, "rewards/chosen": -3.217930793762207, "rewards/margins": 10.29656982421875, "rewards/rejected": -13.514500617980957, "step": 1423 }, { "epoch": 0.8858475894245723, "grad_norm": 1.536763310432434, "learning_rate": 6.327800829875519e-07, "logits/chosen": -1.2438335418701172, "logits/rejected": 1.0853835344314575, "logps/chosen": -460.01898193359375, "logps/rejected": -794.2838745117188, "loss": 0.025, "rewards/accuracies": 1.0, "rewards/chosen": -5.035848140716553, "rewards/margins": 11.029908180236816, "rewards/rejected": -16.065757751464844, "step": 1424 }, { "epoch": 0.8864696734059098, "grad_norm": 1.486583948135376, "learning_rate": 6.293222683264177e-07, "logits/chosen": 1.4703075885772705, "logits/rejected": 3.612927198410034, "logps/chosen": -518.6131591796875, "logps/rejected": -824.2154541015625, "loss": 0.0163, "rewards/accuracies": 1.0, "rewards/chosen": -4.4929609298706055, "rewards/margins": 16.49998664855957, "rewards/rejected": -20.99294662475586, "step": 1425 }, { "epoch": 0.8870917573872473, "grad_norm": 0.16098709404468536, "learning_rate": 6.258644536652836e-07, "logits/chosen": -1.2929434776306152, "logits/rejected": 1.3138850927352905, "logps/chosen": -513.4607543945312, "logps/rejected": -848.7319946289062, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -2.835465669631958, "rewards/margins": 12.721563339233398, "rewards/rejected": -15.557029724121094, "step": 1426 }, { "epoch": 0.8877138413685848, "grad_norm": 1.4064867496490479, "learning_rate": 6.224066390041494e-07, "logits/chosen": -0.20151236653327942, "logits/rejected": 1.5906782150268555, "logps/chosen": -475.8955078125, "logps/rejected": -727.2015991210938, "loss": 0.0446, "rewards/accuracies": 1.0, "rewards/chosen": -3.164332866668701, "rewards/margins": 11.431944847106934, "rewards/rejected": -14.596277236938477, "step": 1427 }, { "epoch": 0.8883359253499222, "grad_norm": 0.5028932094573975, "learning_rate": 6.189488243430153e-07, "logits/chosen": -2.3301663398742676, "logits/rejected": 1.4567216634750366, "logps/chosen": -292.1645202636719, "logps/rejected": -751.6553955078125, "loss": 0.0115, "rewards/accuracies": 1.0, "rewards/chosen": -3.365790843963623, "rewards/margins": 14.16955280303955, "rewards/rejected": -17.535343170166016, "step": 1428 }, { "epoch": 0.8889580093312597, "grad_norm": 0.17805232107639313, "learning_rate": 6.154910096818812e-07, "logits/chosen": -3.3790149688720703, "logits/rejected": 1.7472096681594849, "logps/chosen": -432.0624084472656, "logps/rejected": -920.7142333984375, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -3.85337495803833, "rewards/margins": 15.761892318725586, "rewards/rejected": -19.61526870727539, "step": 1429 }, { "epoch": 0.8895800933125972, "grad_norm": 0.0022782618179917336, "learning_rate": 6.12033195020747e-07, "logits/chosen": 0.8022010922431946, "logits/rejected": 1.7169420719146729, "logps/chosen": -604.2806396484375, "logps/rejected": -817.2542114257812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.843914031982422, "rewards/margins": 14.365386962890625, "rewards/rejected": -19.209300994873047, "step": 1430 }, { "epoch": 0.8902021772939347, "grad_norm": 0.1656440794467926, "learning_rate": 6.085753803596127e-07, "logits/chosen": -1.4316818714141846, "logits/rejected": 0.6714645028114319, "logps/chosen": -562.2213134765625, "logps/rejected": -780.10986328125, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": -4.258724689483643, "rewards/margins": 12.647088050842285, "rewards/rejected": -16.905813217163086, "step": 1431 }, { "epoch": 0.8908242612752721, "grad_norm": 0.14361347258090973, "learning_rate": 6.051175656984786e-07, "logits/chosen": 0.6119691133499146, "logits/rejected": 1.5473650693893433, "logps/chosen": -560.6784057617188, "logps/rejected": -745.4575805664062, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -4.155247688293457, "rewards/margins": 12.619548797607422, "rewards/rejected": -16.774797439575195, "step": 1432 }, { "epoch": 0.8914463452566096, "grad_norm": 3.3019142150878906, "learning_rate": 6.016597510373444e-07, "logits/chosen": -2.8109922409057617, "logits/rejected": 0.740748405456543, "logps/chosen": -457.2281494140625, "logps/rejected": -779.3369140625, "loss": 0.0496, "rewards/accuracies": 1.0, "rewards/chosen": -3.839186429977417, "rewards/margins": 8.805957794189453, "rewards/rejected": -12.64514446258545, "step": 1433 }, { "epoch": 0.8920684292379472, "grad_norm": 5.038474082946777, "learning_rate": 5.982019363762103e-07, "logits/chosen": -1.8050143718719482, "logits/rejected": 1.7200336456298828, "logps/chosen": -550.4028930664062, "logps/rejected": -824.8980712890625, "loss": 0.0834, "rewards/accuracies": 1.0, "rewards/chosen": -4.575119972229004, "rewards/margins": 10.620918273925781, "rewards/rejected": -15.196037292480469, "step": 1434 }, { "epoch": 0.8926905132192846, "grad_norm": 0.40341225266456604, "learning_rate": 5.947441217150761e-07, "logits/chosen": -0.5748858451843262, "logits/rejected": 1.6418057680130005, "logps/chosen": -275.563720703125, "logps/rejected": -426.9505920410156, "loss": 0.0107, "rewards/accuracies": 1.0, "rewards/chosen": -2.9301748275756836, "rewards/margins": 6.426263809204102, "rewards/rejected": -9.356438636779785, "step": 1435 }, { "epoch": 0.8933125972006221, "grad_norm": 0.1546129286289215, "learning_rate": 5.91286307053942e-07, "logits/chosen": 0.208055317401886, "logits/rejected": -0.695311963558197, "logps/chosen": -571.74951171875, "logps/rejected": -575.35302734375, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -3.958728790283203, "rewards/margins": 9.173510551452637, "rewards/rejected": -13.132240295410156, "step": 1436 }, { "epoch": 0.8939346811819595, "grad_norm": 11.871170043945312, "learning_rate": 5.878284923928077e-07, "logits/chosen": -0.32613325119018555, "logits/rejected": 2.5185444355010986, "logps/chosen": -519.622314453125, "logps/rejected": -783.6097412109375, "loss": 0.6411, "rewards/accuracies": 0.875, "rewards/chosen": -5.563169479370117, "rewards/margins": 9.484539031982422, "rewards/rejected": -15.047709465026855, "step": 1437 }, { "epoch": 0.8945567651632971, "grad_norm": 0.01453971303999424, "learning_rate": 5.843706777316736e-07, "logits/chosen": -1.1340763568878174, "logits/rejected": 0.8874017000198364, "logps/chosen": -410.6207580566406, "logps/rejected": -663.9677734375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -3.3984804153442383, "rewards/margins": 10.593545913696289, "rewards/rejected": -13.992025375366211, "step": 1438 }, { "epoch": 0.8951788491446345, "grad_norm": 9.183199882507324, "learning_rate": 5.809128630705395e-07, "logits/chosen": -3.3522768020629883, "logits/rejected": 1.414174199104309, "logps/chosen": -428.20281982421875, "logps/rejected": -776.6268310546875, "loss": 0.2685, "rewards/accuracies": 0.875, "rewards/chosen": -3.992354393005371, "rewards/margins": 8.217218399047852, "rewards/rejected": -12.209571838378906, "step": 1439 }, { "epoch": 0.895800933125972, "grad_norm": 10.225379943847656, "learning_rate": 5.774550484094053e-07, "logits/chosen": 0.11867895722389221, "logits/rejected": 3.4438436031341553, "logps/chosen": -461.0334167480469, "logps/rejected": -762.928466796875, "loss": 0.4123, "rewards/accuracies": 0.875, "rewards/chosen": -3.6588401794433594, "rewards/margins": 9.946977615356445, "rewards/rejected": -13.605817794799805, "step": 1440 }, { "epoch": 0.8964230171073095, "grad_norm": 1.4720853567123413, "learning_rate": 5.739972337482711e-07, "logits/chosen": 0.19453346729278564, "logits/rejected": 2.920020580291748, "logps/chosen": -555.24609375, "logps/rejected": -864.2928466796875, "loss": 0.022, "rewards/accuracies": 1.0, "rewards/chosen": -3.572134494781494, "rewards/margins": 11.39036750793457, "rewards/rejected": -14.962503433227539, "step": 1441 }, { "epoch": 0.897045101088647, "grad_norm": 0.9293034076690674, "learning_rate": 5.70539419087137e-07, "logits/chosen": 0.4277433753013611, "logits/rejected": 2.0681934356689453, "logps/chosen": -660.9112548828125, "logps/rejected": -859.9409790039062, "loss": 0.011, "rewards/accuracies": 1.0, "rewards/chosen": -2.917433738708496, "rewards/margins": 7.717751979827881, "rewards/rejected": -10.635185241699219, "step": 1442 }, { "epoch": 0.8976671850699844, "grad_norm": 1.7794396877288818, "learning_rate": 5.670816044260028e-07, "logits/chosen": -1.1579705476760864, "logits/rejected": 0.7546205520629883, "logps/chosen": -473.51007080078125, "logps/rejected": -664.9842529296875, "loss": 0.0261, "rewards/accuracies": 1.0, "rewards/chosen": -2.6328747272491455, "rewards/margins": 9.706864356994629, "rewards/rejected": -12.339738845825195, "step": 1443 }, { "epoch": 0.8982892690513219, "grad_norm": 0.09491566568613052, "learning_rate": 5.636237897648686e-07, "logits/chosen": -2.8767292499542236, "logits/rejected": 2.1472840309143066, "logps/chosen": -320.4462890625, "logps/rejected": -799.0859375, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -3.7896580696105957, "rewards/margins": 14.235816955566406, "rewards/rejected": -18.025474548339844, "step": 1444 }, { "epoch": 0.8989113530326595, "grad_norm": 1.6215115785598755, "learning_rate": 5.601659751037345e-07, "logits/chosen": -2.9174933433532715, "logits/rejected": 0.8685494661331177, "logps/chosen": -331.8122863769531, "logps/rejected": -798.7713623046875, "loss": 0.0293, "rewards/accuracies": 1.0, "rewards/chosen": -3.8779218196868896, "rewards/margins": 13.897171020507812, "rewards/rejected": -17.77509307861328, "step": 1445 }, { "epoch": 0.8995334370139969, "grad_norm": 3.2274699211120605, "learning_rate": 5.567081604426004e-07, "logits/chosen": 1.1807541847229004, "logits/rejected": 2.1711924076080322, "logps/chosen": -571.8338623046875, "logps/rejected": -759.391845703125, "loss": 0.0375, "rewards/accuracies": 1.0, "rewards/chosen": -5.260222434997559, "rewards/margins": 10.59776496887207, "rewards/rejected": -15.857988357543945, "step": 1446 }, { "epoch": 0.9001555209953344, "grad_norm": 15.007549285888672, "learning_rate": 5.532503457814662e-07, "logits/chosen": 1.9797394275665283, "logits/rejected": 3.3568010330200195, "logps/chosen": -553.0282592773438, "logps/rejected": -746.5451049804688, "loss": 0.7117, "rewards/accuracies": 0.875, "rewards/chosen": -4.2186174392700195, "rewards/margins": 10.341630935668945, "rewards/rejected": -14.560248374938965, "step": 1447 }, { "epoch": 0.9007776049766718, "grad_norm": 0.6334243416786194, "learning_rate": 5.49792531120332e-07, "logits/chosen": -2.648848056793213, "logits/rejected": 2.0881552696228027, "logps/chosen": -337.37078857421875, "logps/rejected": -759.9859008789062, "loss": 0.0077, "rewards/accuracies": 1.0, "rewards/chosen": -3.150609016418457, "rewards/margins": 14.992120742797852, "rewards/rejected": -18.142728805541992, "step": 1448 }, { "epoch": 0.9013996889580094, "grad_norm": 1.7155299186706543, "learning_rate": 5.463347164591978e-07, "logits/chosen": -0.2594708502292633, "logits/rejected": 1.4908413887023926, "logps/chosen": -558.3201904296875, "logps/rejected": -874.2298583984375, "loss": 0.0178, "rewards/accuracies": 1.0, "rewards/chosen": -3.998410701751709, "rewards/margins": 12.76402473449707, "rewards/rejected": -16.762435913085938, "step": 1449 }, { "epoch": 0.9020217729393468, "grad_norm": 0.11685359477996826, "learning_rate": 5.428769017980637e-07, "logits/chosen": -0.3029470443725586, "logits/rejected": 1.535508394241333, "logps/chosen": -347.53961181640625, "logps/rejected": -558.446044921875, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": -1.5834128856658936, "rewards/margins": 7.711281776428223, "rewards/rejected": -9.294694900512695, "step": 1450 }, { "epoch": 0.9026438569206843, "grad_norm": 0.007481109816581011, "learning_rate": 5.394190871369295e-07, "logits/chosen": -1.6014339923858643, "logits/rejected": 1.9459352493286133, "logps/chosen": -435.97735595703125, "logps/rejected": -795.7625122070312, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -2.6368167400360107, "rewards/margins": 15.289512634277344, "rewards/rejected": -17.92633056640625, "step": 1451 }, { "epoch": 0.9032659409020217, "grad_norm": 9.042195320129395, "learning_rate": 5.359612724757953e-07, "logits/chosen": -0.4747803509235382, "logits/rejected": 2.9576404094696045, "logps/chosen": -492.944091796875, "logps/rejected": -804.94482421875, "loss": 0.1653, "rewards/accuracies": 0.875, "rewards/chosen": -5.554488182067871, "rewards/margins": 10.259726524353027, "rewards/rejected": -15.814214706420898, "step": 1452 }, { "epoch": 0.9038880248833593, "grad_norm": 7.078778266906738, "learning_rate": 5.325034578146612e-07, "logits/chosen": -2.033644914627075, "logits/rejected": 0.956307053565979, "logps/chosen": -314.28656005859375, "logps/rejected": -657.49072265625, "loss": 0.7885, "rewards/accuracies": 0.875, "rewards/chosen": -2.766798496246338, "rewards/margins": 12.712461471557617, "rewards/rejected": -15.479260444641113, "step": 1453 }, { "epoch": 0.9045101088646967, "grad_norm": 7.745594024658203, "learning_rate": 5.29045643153527e-07, "logits/chosen": -1.9259412288665771, "logits/rejected": 1.697417974472046, "logps/chosen": -529.8311157226562, "logps/rejected": -833.0335693359375, "loss": 0.1916, "rewards/accuracies": 0.875, "rewards/chosen": -5.933501243591309, "rewards/margins": 10.662010192871094, "rewards/rejected": -16.59551239013672, "step": 1454 }, { "epoch": 0.9051321928460342, "grad_norm": 4.846780300140381, "learning_rate": 5.255878284923928e-07, "logits/chosen": -1.1878126859664917, "logits/rejected": 1.876765251159668, "logps/chosen": -356.77783203125, "logps/rejected": -631.2303466796875, "loss": 0.0701, "rewards/accuracies": 1.0, "rewards/chosen": -2.1937432289123535, "rewards/margins": 12.510306358337402, "rewards/rejected": -14.704049110412598, "step": 1455 }, { "epoch": 0.9057542768273716, "grad_norm": 7.957666397094727, "learning_rate": 5.221300138312587e-07, "logits/chosen": -0.3681551218032837, "logits/rejected": 1.7987143993377686, "logps/chosen": -532.7719116210938, "logps/rejected": -751.0033569335938, "loss": 0.3457, "rewards/accuracies": 0.875, "rewards/chosen": -3.5109124183654785, "rewards/margins": 6.914301872253418, "rewards/rejected": -10.425214767456055, "step": 1456 }, { "epoch": 0.9063763608087092, "grad_norm": 3.3493337631225586, "learning_rate": 5.186721991701245e-07, "logits/chosen": -2.598339796066284, "logits/rejected": 0.8983297348022461, "logps/chosen": -390.8603820800781, "logps/rejected": -716.1077880859375, "loss": 0.0547, "rewards/accuracies": 1.0, "rewards/chosen": -3.1072566509246826, "rewards/margins": 12.92424488067627, "rewards/rejected": -16.03150177001953, "step": 1457 }, { "epoch": 0.9069984447900467, "grad_norm": 0.3975439667701721, "learning_rate": 5.152143845089903e-07, "logits/chosen": -1.5385003089904785, "logits/rejected": 1.7762432098388672, "logps/chosen": -360.1870422363281, "logps/rejected": -676.0075073242188, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": -2.083003520965576, "rewards/margins": 9.356901168823242, "rewards/rejected": -11.439903259277344, "step": 1458 }, { "epoch": 0.9076205287713841, "grad_norm": 2.8431878089904785, "learning_rate": 5.117565698478562e-07, "logits/chosen": -2.9596781730651855, "logits/rejected": 1.3715825080871582, "logps/chosen": -323.9750061035156, "logps/rejected": -791.4835205078125, "loss": 0.0266, "rewards/accuracies": 1.0, "rewards/chosen": -3.7252724170684814, "rewards/margins": 13.341262817382812, "rewards/rejected": -17.06653594970703, "step": 1459 }, { "epoch": 0.9082426127527217, "grad_norm": 0.023707645013928413, "learning_rate": 5.082987551867221e-07, "logits/chosen": -2.7472968101501465, "logits/rejected": 1.660959005355835, "logps/chosen": -315.56829833984375, "logps/rejected": -725.6936645507812, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -1.3748574256896973, "rewards/margins": 11.816588401794434, "rewards/rejected": -13.191445350646973, "step": 1460 }, { "epoch": 0.9088646967340591, "grad_norm": 0.08443538844585419, "learning_rate": 5.048409405255878e-07, "logits/chosen": -0.3013702630996704, "logits/rejected": 2.61268949508667, "logps/chosen": -367.6524353027344, "logps/rejected": -683.3698120117188, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -3.1470658779144287, "rewards/margins": 11.85451889038086, "rewards/rejected": -15.001585006713867, "step": 1461 }, { "epoch": 0.9094867807153966, "grad_norm": 4.5101399421691895, "learning_rate": 5.013831258644537e-07, "logits/chosen": -4.378769397735596, "logits/rejected": 0.6312875747680664, "logps/chosen": -314.7661437988281, "logps/rejected": -768.94677734375, "loss": 0.0607, "rewards/accuracies": 1.0, "rewards/chosen": -2.185915946960449, "rewards/margins": 14.008753776550293, "rewards/rejected": -16.194669723510742, "step": 1462 }, { "epoch": 0.910108864696734, "grad_norm": 1.1714937686920166, "learning_rate": 4.979253112033195e-07, "logits/chosen": -0.7367267608642578, "logits/rejected": 1.561671495437622, "logps/chosen": -553.3486938476562, "logps/rejected": -846.4337158203125, "loss": 0.0154, "rewards/accuracies": 1.0, "rewards/chosen": -3.656877040863037, "rewards/margins": 14.870012283325195, "rewards/rejected": -18.52688980102539, "step": 1463 }, { "epoch": 0.9107309486780716, "grad_norm": 10.674068450927734, "learning_rate": 4.944674965421854e-07, "logits/chosen": 1.3820637464523315, "logits/rejected": 3.106642246246338, "logps/chosen": -605.3300170898438, "logps/rejected": -783.963623046875, "loss": 0.3482, "rewards/accuracies": 0.875, "rewards/chosen": -2.2258996963500977, "rewards/margins": 8.71350383758545, "rewards/rejected": -10.93940258026123, "step": 1464 }, { "epoch": 0.911353032659409, "grad_norm": 7.689693927764893, "learning_rate": 4.910096818810512e-07, "logits/chosen": -1.892621397972107, "logits/rejected": 2.1019606590270996, "logps/chosen": -513.6094360351562, "logps/rejected": -954.5704345703125, "loss": 0.1868, "rewards/accuracies": 0.875, "rewards/chosen": -6.04621696472168, "rewards/margins": 13.769493103027344, "rewards/rejected": -19.815710067749023, "step": 1465 }, { "epoch": 0.9119751166407465, "grad_norm": 12.045949935913086, "learning_rate": 4.875518672199171e-07, "logits/chosen": -1.77559494972229, "logits/rejected": 1.0355404615402222, "logps/chosen": -540.2722778320312, "logps/rejected": -846.2046508789062, "loss": 0.4532, "rewards/accuracies": 0.875, "rewards/chosen": -5.070008754730225, "rewards/margins": 12.437443733215332, "rewards/rejected": -17.50745391845703, "step": 1466 }, { "epoch": 0.9125972006220839, "grad_norm": 0.003177879611030221, "learning_rate": 4.84094052558783e-07, "logits/chosen": -0.4620545506477356, "logits/rejected": 1.5154417753219604, "logps/chosen": -416.4202880859375, "logps/rejected": -660.5771484375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -4.0061445236206055, "rewards/margins": 12.807272911071777, "rewards/rejected": -16.813417434692383, "step": 1467 }, { "epoch": 0.9132192846034215, "grad_norm": 7.89936637878418, "learning_rate": 4.806362378976487e-07, "logits/chosen": -0.5486880540847778, "logits/rejected": 2.730628252029419, "logps/chosen": -450.9072265625, "logps/rejected": -873.876220703125, "loss": 0.0814, "rewards/accuracies": 1.0, "rewards/chosen": -4.074484825134277, "rewards/margins": 14.339876174926758, "rewards/rejected": -18.41436195373535, "step": 1468 }, { "epoch": 0.913841368584759, "grad_norm": 10.738627433776855, "learning_rate": 4.771784232365145e-07, "logits/chosen": -0.5453152060508728, "logits/rejected": 2.8190457820892334, "logps/chosen": -406.93426513671875, "logps/rejected": -760.6094970703125, "loss": 0.678, "rewards/accuracies": 0.875, "rewards/chosen": -3.8559494018554688, "rewards/margins": 10.396709442138672, "rewards/rejected": -14.252659797668457, "step": 1469 }, { "epoch": 0.9144634525660964, "grad_norm": 6.856982707977295, "learning_rate": 4.737206085753804e-07, "logits/chosen": -0.05188632011413574, "logits/rejected": 0.4159351587295532, "logps/chosen": -513.7274169921875, "logps/rejected": -717.6285400390625, "loss": 0.094, "rewards/accuracies": 0.875, "rewards/chosen": -4.865007400512695, "rewards/margins": 11.267074584960938, "rewards/rejected": -16.132080078125, "step": 1470 }, { "epoch": 0.9150855365474339, "grad_norm": 1.559171438217163, "learning_rate": 4.702627939142462e-07, "logits/chosen": -1.8055647611618042, "logits/rejected": 1.2176705598831177, "logps/chosen": -422.52899169921875, "logps/rejected": -754.9882202148438, "loss": 0.1096, "rewards/accuracies": 0.875, "rewards/chosen": -3.0819449424743652, "rewards/margins": 11.390767097473145, "rewards/rejected": -14.472711563110352, "step": 1471 }, { "epoch": 0.9157076205287714, "grad_norm": 1.1758536100387573, "learning_rate": 4.6680497925311206e-07, "logits/chosen": -2.975848436355591, "logits/rejected": 0.033451199531555176, "logps/chosen": -371.422607421875, "logps/rejected": -692.0443115234375, "loss": 0.0142, "rewards/accuracies": 1.0, "rewards/chosen": -2.640578269958496, "rewards/margins": 11.881332397460938, "rewards/rejected": -14.521910667419434, "step": 1472 }, { "epoch": 0.9163297045101089, "grad_norm": 0.00011615510447882116, "learning_rate": 4.6334716459197795e-07, "logits/chosen": -2.4863955974578857, "logits/rejected": 2.968559741973877, "logps/chosen": -360.88311767578125, "logps/rejected": -962.9573364257812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.2309863567352295, "rewards/margins": 17.27608871459961, "rewards/rejected": -19.5070743560791, "step": 1473 }, { "epoch": 0.9169517884914463, "grad_norm": 8.304816246032715, "learning_rate": 4.598893499308438e-07, "logits/chosen": 0.6510341763496399, "logits/rejected": 2.1869540214538574, "logps/chosen": -620.340087890625, "logps/rejected": -804.0247802734375, "loss": 0.111, "rewards/accuracies": 0.875, "rewards/chosen": -3.776372194290161, "rewards/margins": 10.560134887695312, "rewards/rejected": -14.336506843566895, "step": 1474 }, { "epoch": 0.9175738724727839, "grad_norm": 0.8452247381210327, "learning_rate": 4.5643153526970956e-07, "logits/chosen": -1.1516263484954834, "logits/rejected": 1.39576256275177, "logps/chosen": -509.9710388183594, "logps/rejected": -804.6676025390625, "loss": 0.0098, "rewards/accuracies": 1.0, "rewards/chosen": -3.6628024578094482, "rewards/margins": 10.969822883605957, "rewards/rejected": -14.632623672485352, "step": 1475 }, { "epoch": 0.9181959564541213, "grad_norm": 13.252323150634766, "learning_rate": 4.529737206085754e-07, "logits/chosen": -1.1855800151824951, "logits/rejected": 1.276323914527893, "logps/chosen": -509.5328063964844, "logps/rejected": -785.7333984375, "loss": 0.3595, "rewards/accuracies": 0.875, "rewards/chosen": -4.520510673522949, "rewards/margins": 10.389047622680664, "rewards/rejected": -14.909557342529297, "step": 1476 }, { "epoch": 0.9188180404354588, "grad_norm": 0.00013192665937822312, "learning_rate": 4.495159059474413e-07, "logits/chosen": -4.287675380706787, "logits/rejected": 1.2201006412506104, "logps/chosen": -275.25482177734375, "logps/rejected": -778.931884765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.5836806297302246, "rewards/margins": 16.143505096435547, "rewards/rejected": -17.727184295654297, "step": 1477 }, { "epoch": 0.9194401244167962, "grad_norm": 5.446385383605957, "learning_rate": 4.460580912863071e-07, "logits/chosen": -1.846035122871399, "logits/rejected": 3.0661497116088867, "logps/chosen": -318.20269775390625, "logps/rejected": -732.0631713867188, "loss": 0.0965, "rewards/accuracies": 0.875, "rewards/chosen": -2.7287187576293945, "rewards/margins": 10.47070598602295, "rewards/rejected": -13.199424743652344, "step": 1478 }, { "epoch": 0.9200622083981338, "grad_norm": 2.3951303958892822, "learning_rate": 4.4260027662517294e-07, "logits/chosen": -3.0589780807495117, "logits/rejected": 1.4286210536956787, "logps/chosen": -487.0649719238281, "logps/rejected": -784.7760009765625, "loss": 0.0338, "rewards/accuracies": 1.0, "rewards/chosen": -4.506537437438965, "rewards/margins": 9.907398223876953, "rewards/rejected": -14.413934707641602, "step": 1479 }, { "epoch": 0.9206842923794712, "grad_norm": 4.840212821960449, "learning_rate": 4.391424619640387e-07, "logits/chosen": -0.4159850478172302, "logits/rejected": 1.3065357208251953, "logps/chosen": -546.5531616210938, "logps/rejected": -800.9768676757812, "loss": 0.0577, "rewards/accuracies": 1.0, "rewards/chosen": -4.232282638549805, "rewards/margins": 9.402071952819824, "rewards/rejected": -13.634354591369629, "step": 1480 }, { "epoch": 0.9213063763608087, "grad_norm": 10.157102584838867, "learning_rate": 4.3568464730290456e-07, "logits/chosen": 0.3726942539215088, "logits/rejected": 0.9195795059204102, "logps/chosen": -357.4367980957031, "logps/rejected": -517.5038452148438, "loss": 0.5689, "rewards/accuracies": 0.875, "rewards/chosen": -2.6168298721313477, "rewards/margins": 8.07404613494873, "rewards/rejected": -10.690876007080078, "step": 1481 }, { "epoch": 0.9219284603421461, "grad_norm": 0.27228304743766785, "learning_rate": 4.3222683264177044e-07, "logits/chosen": -2.2999091148376465, "logits/rejected": -0.6475305557250977, "logps/chosen": -567.763916015625, "logps/rejected": -903.051025390625, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": -4.824256896972656, "rewards/margins": 11.655745506286621, "rewards/rejected": -16.480003356933594, "step": 1482 }, { "epoch": 0.9225505443234837, "grad_norm": 0.0006143409991636872, "learning_rate": 4.287690179806363e-07, "logits/chosen": -2.988739013671875, "logits/rejected": 1.471644639968872, "logps/chosen": -384.5038146972656, "logps/rejected": -847.5885009765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.053596019744873, "rewards/margins": 18.16282081604004, "rewards/rejected": -22.21641731262207, "step": 1483 }, { "epoch": 0.9231726283048212, "grad_norm": 0.14876942336559296, "learning_rate": 4.253112033195021e-07, "logits/chosen": 1.1200125217437744, "logits/rejected": 2.812039613723755, "logps/chosen": -576.5919189453125, "logps/rejected": -787.423583984375, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -4.0661139488220215, "rewards/margins": 13.413179397583008, "rewards/rejected": -17.479293823242188, "step": 1484 }, { "epoch": 0.9237947122861586, "grad_norm": 0.009984598495066166, "learning_rate": 4.21853388658368e-07, "logits/chosen": -0.03465063124895096, "logits/rejected": 0.4710065722465515, "logps/chosen": -654.781982421875, "logps/rejected": -833.243408203125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.719283103942871, "rewards/margins": 15.339999198913574, "rewards/rejected": -19.059282302856445, "step": 1485 }, { "epoch": 0.9244167962674961, "grad_norm": 12.444994926452637, "learning_rate": 4.183955739972338e-07, "logits/chosen": -0.5390093326568604, "logits/rejected": 1.3384099006652832, "logps/chosen": -443.9561462402344, "logps/rejected": -664.459228515625, "loss": 0.2756, "rewards/accuracies": 0.75, "rewards/chosen": -6.994746685028076, "rewards/margins": 6.36599063873291, "rewards/rejected": -13.360736846923828, "step": 1486 }, { "epoch": 0.9250388802488336, "grad_norm": 0.07635397464036942, "learning_rate": 4.149377593360996e-07, "logits/chosen": -0.5679030418395996, "logits/rejected": 3.5354416370391846, "logps/chosen": -437.87677001953125, "logps/rejected": -864.7110595703125, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -2.8465821743011475, "rewards/margins": 16.09186363220215, "rewards/rejected": -18.938446044921875, "step": 1487 }, { "epoch": 0.9256609642301711, "grad_norm": 1.7386114597320557, "learning_rate": 4.1147994467496544e-07, "logits/chosen": -1.1519631147384644, "logits/rejected": 2.2489707469940186, "logps/chosen": -504.7139892578125, "logps/rejected": -918.5, "loss": 0.0212, "rewards/accuracies": 1.0, "rewards/chosen": -4.884095668792725, "rewards/margins": 17.8101863861084, "rewards/rejected": -22.69428253173828, "step": 1488 }, { "epoch": 0.9262830482115085, "grad_norm": 0.09362678229808807, "learning_rate": 4.080221300138313e-07, "logits/chosen": -1.8258460760116577, "logits/rejected": 2.0662150382995605, "logps/chosen": -343.0851135253906, "logps/rejected": -795.84716796875, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -3.263042449951172, "rewards/margins": 14.03846549987793, "rewards/rejected": -17.301509857177734, "step": 1489 }, { "epoch": 0.926905132192846, "grad_norm": 1.4076992273330688, "learning_rate": 4.0456431535269716e-07, "logits/chosen": -0.8916260004043579, "logits/rejected": 2.236344814300537, "logps/chosen": -422.2193908691406, "logps/rejected": -728.3214111328125, "loss": 0.0252, "rewards/accuracies": 1.0, "rewards/chosen": -3.0439491271972656, "rewards/margins": 13.015901565551758, "rewards/rejected": -16.059852600097656, "step": 1490 }, { "epoch": 0.9275272161741835, "grad_norm": 0.6712805032730103, "learning_rate": 4.01106500691563e-07, "logits/chosen": -1.2574436664581299, "logits/rejected": 1.138153076171875, "logps/chosen": -428.27679443359375, "logps/rejected": -799.3668212890625, "loss": 0.0077, "rewards/accuracies": 1.0, "rewards/chosen": -3.6606407165527344, "rewards/margins": 14.857869148254395, "rewards/rejected": -18.518508911132812, "step": 1491 }, { "epoch": 0.928149300155521, "grad_norm": 0.00181769288610667, "learning_rate": 3.976486860304288e-07, "logits/chosen": -2.49100399017334, "logits/rejected": 0.7510606050491333, "logps/chosen": -383.1665954589844, "logps/rejected": -729.90234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.4801323413848877, "rewards/margins": 11.920439720153809, "rewards/rejected": -14.400572776794434, "step": 1492 }, { "epoch": 0.9287713841368584, "grad_norm": 0.01834547519683838, "learning_rate": 3.941908713692946e-07, "logits/chosen": -2.0182347297668457, "logits/rejected": 0.9906941652297974, "logps/chosen": -399.16192626953125, "logps/rejected": -806.47412109375, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -1.9304661750793457, "rewards/margins": 15.606968879699707, "rewards/rejected": -17.53743553161621, "step": 1493 }, { "epoch": 0.929393468118196, "grad_norm": 0.09516987949609756, "learning_rate": 3.907330567081605e-07, "logits/chosen": -2.988079786300659, "logits/rejected": 0.840813159942627, "logps/chosen": -341.9148864746094, "logps/rejected": -698.49755859375, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -2.404346227645874, "rewards/margins": 10.514049530029297, "rewards/rejected": -12.918395042419434, "step": 1494 }, { "epoch": 0.9300155520995335, "grad_norm": 7.899932384490967, "learning_rate": 3.872752420470263e-07, "logits/chosen": 0.19899821281433105, "logits/rejected": 1.687016487121582, "logps/chosen": -511.45849609375, "logps/rejected": -816.2158813476562, "loss": 0.1743, "rewards/accuracies": 0.875, "rewards/chosen": -7.760147571563721, "rewards/margins": 10.689716339111328, "rewards/rejected": -18.44986343383789, "step": 1495 }, { "epoch": 0.9306376360808709, "grad_norm": 12.452661514282227, "learning_rate": 3.8381742738589216e-07, "logits/chosen": -2.7329719066619873, "logits/rejected": 0.3853139877319336, "logps/chosen": -472.5107421875, "logps/rejected": -757.3126220703125, "loss": 0.3532, "rewards/accuracies": 0.75, "rewards/chosen": -4.014552593231201, "rewards/margins": 6.1559247970581055, "rewards/rejected": -10.170476913452148, "step": 1496 }, { "epoch": 0.9312597200622084, "grad_norm": 2.1367621421813965, "learning_rate": 3.8035961272475794e-07, "logits/chosen": -1.171846866607666, "logits/rejected": 2.2882866859436035, "logps/chosen": -486.2050476074219, "logps/rejected": -812.8958740234375, "loss": 0.0253, "rewards/accuracies": 1.0, "rewards/chosen": -4.927613258361816, "rewards/margins": 10.403205871582031, "rewards/rejected": -15.330819129943848, "step": 1497 }, { "epoch": 0.9318818040435459, "grad_norm": 0.3108869791030884, "learning_rate": 3.769017980636238e-07, "logits/chosen": -2.39148211479187, "logits/rejected": 2.4167563915252686, "logps/chosen": -398.72698974609375, "logps/rejected": -872.3837890625, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": -4.665180206298828, "rewards/margins": 12.407148361206055, "rewards/rejected": -17.072328567504883, "step": 1498 }, { "epoch": 0.9325038880248834, "grad_norm": 2.2684152126312256, "learning_rate": 3.7344398340248966e-07, "logits/chosen": 0.26399895548820496, "logits/rejected": 0.994949221611023, "logps/chosen": -606.6917114257812, "logps/rejected": -763.152099609375, "loss": 0.0246, "rewards/accuracies": 1.0, "rewards/chosen": -4.9805803298950195, "rewards/margins": 11.517979621887207, "rewards/rejected": -16.498559951782227, "step": 1499 }, { "epoch": 0.9331259720062208, "grad_norm": 0.02908671274781227, "learning_rate": 3.699861687413555e-07, "logits/chosen": -2.991191864013672, "logits/rejected": 2.323463201522827, "logps/chosen": -421.55401611328125, "logps/rejected": -928.364990234375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -3.364017963409424, "rewards/margins": 12.341465950012207, "rewards/rejected": -15.705484390258789, "step": 1500 }, { "epoch": 0.9337480559875583, "grad_norm": 5.878055572509766, "learning_rate": 3.665283540802213e-07, "logits/chosen": -0.42002198100090027, "logits/rejected": 1.5352654457092285, "logps/chosen": -448.0211486816406, "logps/rejected": -607.1275634765625, "loss": 0.1938, "rewards/accuracies": 0.875, "rewards/chosen": -5.31156063079834, "rewards/margins": 7.6229166984558105, "rewards/rejected": -12.934476852416992, "step": 1501 }, { "epoch": 0.9343701399688958, "grad_norm": 0.12541088461875916, "learning_rate": 3.630705394190872e-07, "logits/chosen": 0.28969892859458923, "logits/rejected": 2.1697070598602295, "logps/chosen": -554.0612182617188, "logps/rejected": -761.7791137695312, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -2.604984998703003, "rewards/margins": 10.402400016784668, "rewards/rejected": -13.00738525390625, "step": 1502 }, { "epoch": 0.9349922239502333, "grad_norm": 0.02767164260149002, "learning_rate": 3.59612724757953e-07, "logits/chosen": -1.0063825845718384, "logits/rejected": 0.9170354604721069, "logps/chosen": -443.04693603515625, "logps/rejected": -697.2694091796875, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -4.0314249992370605, "rewards/margins": 13.77879524230957, "rewards/rejected": -17.810218811035156, "step": 1503 }, { "epoch": 0.9356143079315707, "grad_norm": 0.020736703649163246, "learning_rate": 3.561549100968188e-07, "logits/chosen": -2.894665241241455, "logits/rejected": 1.8047807216644287, "logps/chosen": -203.73631286621094, "logps/rejected": -677.605712890625, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -0.8841956853866577, "rewards/margins": 17.116321563720703, "rewards/rejected": -18.000518798828125, "step": 1504 }, { "epoch": 0.9362363919129082, "grad_norm": 15.067363739013672, "learning_rate": 3.5269709543568466e-07, "logits/chosen": -0.6422203183174133, "logits/rejected": 1.5079469680786133, "logps/chosen": -616.841552734375, "logps/rejected": -877.3644409179688, "loss": 0.3851, "rewards/accuracies": 0.875, "rewards/chosen": -6.7121663093566895, "rewards/margins": 9.755440711975098, "rewards/rejected": -16.467607498168945, "step": 1505 }, { "epoch": 0.9368584758942458, "grad_norm": 6.540612697601318, "learning_rate": 3.4923928077455054e-07, "logits/chosen": -0.8412160277366638, "logits/rejected": 0.8433480858802795, "logps/chosen": -484.1073303222656, "logps/rejected": -655.442138671875, "loss": 0.1485, "rewards/accuracies": 0.875, "rewards/chosen": -4.170140266418457, "rewards/margins": 9.867389678955078, "rewards/rejected": -14.037528991699219, "step": 1506 }, { "epoch": 0.9374805598755832, "grad_norm": 0.0829324796795845, "learning_rate": 3.457814661134164e-07, "logits/chosen": -0.43071043491363525, "logits/rejected": 0.7872965931892395, "logps/chosen": -545.3976440429688, "logps/rejected": -773.6107788085938, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -4.097855091094971, "rewards/margins": 14.32176399230957, "rewards/rejected": -18.419618606567383, "step": 1507 }, { "epoch": 0.9381026438569207, "grad_norm": 9.86187744140625, "learning_rate": 3.423236514522822e-07, "logits/chosen": -0.2670916020870209, "logits/rejected": 0.9901536703109741, "logps/chosen": -668.8111572265625, "logps/rejected": -960.8660278320312, "loss": 0.2206, "rewards/accuracies": 0.875, "rewards/chosen": -4.077305793762207, "rewards/margins": 13.150307655334473, "rewards/rejected": -17.227611541748047, "step": 1508 }, { "epoch": 0.9387247278382581, "grad_norm": 0.19724632799625397, "learning_rate": 3.38865836791148e-07, "logits/chosen": -3.1104323863983154, "logits/rejected": -0.17572087049484253, "logps/chosen": -402.1249084472656, "logps/rejected": -659.510009765625, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": -2.625974416732788, "rewards/margins": 8.815404891967773, "rewards/rejected": -11.441378593444824, "step": 1509 }, { "epoch": 0.9393468118195957, "grad_norm": 12.546536445617676, "learning_rate": 3.354080221300138e-07, "logits/chosen": -1.5293309688568115, "logits/rejected": 1.2697745561599731, "logps/chosen": -391.48162841796875, "logps/rejected": -667.7274169921875, "loss": 0.4493, "rewards/accuracies": 0.875, "rewards/chosen": -4.018593788146973, "rewards/margins": 8.212364196777344, "rewards/rejected": -12.230957984924316, "step": 1510 }, { "epoch": 0.9399688958009331, "grad_norm": 0.052367690950632095, "learning_rate": 3.319502074688797e-07, "logits/chosen": -0.444234162569046, "logits/rejected": 2.2842347621917725, "logps/chosen": -374.25872802734375, "logps/rejected": -665.2982788085938, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -4.056352138519287, "rewards/margins": 12.294376373291016, "rewards/rejected": -16.350727081298828, "step": 1511 }, { "epoch": 0.9405909797822706, "grad_norm": 6.366074085235596, "learning_rate": 3.2849239280774554e-07, "logits/chosen": 0.5913575887680054, "logits/rejected": 2.0309362411499023, "logps/chosen": -496.16552734375, "logps/rejected": -719.8499145507812, "loss": 0.0965, "rewards/accuracies": 0.875, "rewards/chosen": -3.6162874698638916, "rewards/margins": 11.064377784729004, "rewards/rejected": -14.680665969848633, "step": 1512 }, { "epoch": 0.9412130637636081, "grad_norm": 0.11548803001642227, "learning_rate": 3.250345781466114e-07, "logits/chosen": -1.865290641784668, "logits/rejected": 1.4083446264266968, "logps/chosen": -437.8604431152344, "logps/rejected": -831.4605712890625, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -4.72156286239624, "rewards/margins": 16.175809860229492, "rewards/rejected": -20.897371292114258, "step": 1513 }, { "epoch": 0.9418351477449456, "grad_norm": 9.291182518005371, "learning_rate": 3.2157676348547715e-07, "logits/chosen": 1.103979229927063, "logits/rejected": 2.116345167160034, "logps/chosen": -666.323486328125, "logps/rejected": -821.7340087890625, "loss": 0.2193, "rewards/accuracies": 0.875, "rewards/chosen": -3.8898463249206543, "rewards/margins": 12.274580001831055, "rewards/rejected": -16.164426803588867, "step": 1514 }, { "epoch": 0.942457231726283, "grad_norm": 11.197491645812988, "learning_rate": 3.1811894882434304e-07, "logits/chosen": -2.0354256629943848, "logits/rejected": 2.820720911026001, "logps/chosen": -341.85601806640625, "logps/rejected": -802.88330078125, "loss": 0.4042, "rewards/accuracies": 0.875, "rewards/chosen": -2.524524211883545, "rewards/margins": 9.693910598754883, "rewards/rejected": -12.218435287475586, "step": 1515 }, { "epoch": 0.9430793157076205, "grad_norm": 0.00014515525253955275, "learning_rate": 3.1466113416320887e-07, "logits/chosen": -0.3794043958187103, "logits/rejected": 2.158768653869629, "logps/chosen": -530.6067504882812, "logps/rejected": -850.1831665039062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.4419612884521484, "rewards/margins": 16.420076370239258, "rewards/rejected": -19.862037658691406, "step": 1516 }, { "epoch": 0.943701399688958, "grad_norm": 0.04613490775227547, "learning_rate": 3.112033195020747e-07, "logits/chosen": -2.270944595336914, "logits/rejected": -0.0015374720096588135, "logps/chosen": -346.1475830078125, "logps/rejected": -623.0887451171875, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -3.2340760231018066, "rewards/margins": 12.253314971923828, "rewards/rejected": -15.487390518188477, "step": 1517 }, { "epoch": 0.9443234836702955, "grad_norm": 10.157634735107422, "learning_rate": 3.077455048409406e-07, "logits/chosen": -2.2936642169952393, "logits/rejected": 1.5570006370544434, "logps/chosen": -424.56396484375, "logps/rejected": -712.8133544921875, "loss": 0.2692, "rewards/accuracies": 0.875, "rewards/chosen": -2.1034915447235107, "rewards/margins": 8.809778213500977, "rewards/rejected": -10.91326904296875, "step": 1518 }, { "epoch": 0.944945567651633, "grad_norm": 12.773795127868652, "learning_rate": 3.0428769017980637e-07, "logits/chosen": -0.18564078211784363, "logits/rejected": 0.12225285172462463, "logps/chosen": -558.9400634765625, "logps/rejected": -744.89990234375, "loss": 0.326, "rewards/accuracies": 0.875, "rewards/chosen": -4.826589107513428, "rewards/margins": 10.65787410736084, "rewards/rejected": -15.484461784362793, "step": 1519 }, { "epoch": 0.9455676516329704, "grad_norm": 0.5052871108055115, "learning_rate": 3.008298755186722e-07, "logits/chosen": -1.9988384246826172, "logits/rejected": 0.3945826590061188, "logps/chosen": -500.2262268066406, "logps/rejected": -814.4442749023438, "loss": 0.0051, "rewards/accuracies": 1.0, "rewards/chosen": -4.11403751373291, "rewards/margins": 17.001861572265625, "rewards/rejected": -21.11590003967285, "step": 1520 }, { "epoch": 0.946189735614308, "grad_norm": 0.009612703695893288, "learning_rate": 2.9737206085753804e-07, "logits/chosen": -0.15359750390052795, "logits/rejected": 1.5777616500854492, "logps/chosen": -475.3687438964844, "logps/rejected": -706.9713134765625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.513751268386841, "rewards/margins": 12.457344055175781, "rewards/rejected": -15.97109603881836, "step": 1521 }, { "epoch": 0.9468118195956454, "grad_norm": 0.0008137205149978399, "learning_rate": 2.9391424619640387e-07, "logits/chosen": -3.5792646408081055, "logits/rejected": 2.3332266807556152, "logps/chosen": -298.8536376953125, "logps/rejected": -892.1405029296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.9347429275512695, "rewards/margins": 18.78467559814453, "rewards/rejected": -21.719419479370117, "step": 1522 }, { "epoch": 0.9474339035769829, "grad_norm": 0.002397279255092144, "learning_rate": 2.9045643153526976e-07, "logits/chosen": -1.2490248680114746, "logits/rejected": 2.205015182495117, "logps/chosen": -519.5060424804688, "logps/rejected": -873.041748046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.879042625427246, "rewards/margins": 14.990987777709961, "rewards/rejected": -20.87002944946289, "step": 1523 }, { "epoch": 0.9480559875583203, "grad_norm": 0.20828041434288025, "learning_rate": 2.8699861687413554e-07, "logits/chosen": -1.4127881526947021, "logits/rejected": 0.9738033413887024, "logps/chosen": -481.376220703125, "logps/rejected": -733.7556762695312, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": -3.3263745307922363, "rewards/margins": 12.925792694091797, "rewards/rejected": -16.252166748046875, "step": 1524 }, { "epoch": 0.9486780715396579, "grad_norm": 13.801078796386719, "learning_rate": 2.835408022130014e-07, "logits/chosen": -1.2401267290115356, "logits/rejected": 1.1230303049087524, "logps/chosen": -596.0855102539062, "logps/rejected": -931.1829223632812, "loss": 0.5556, "rewards/accuracies": 0.875, "rewards/chosen": -7.94588565826416, "rewards/margins": 12.015524864196777, "rewards/rejected": -19.961410522460938, "step": 1525 }, { "epoch": 0.9493001555209953, "grad_norm": 3.323911666870117, "learning_rate": 2.8008298755186726e-07, "logits/chosen": -2.2632949352264404, "logits/rejected": 2.410269021987915, "logps/chosen": -307.24285888671875, "logps/rejected": -753.9850463867188, "loss": 0.039, "rewards/accuracies": 1.0, "rewards/chosen": -3.5417468547821045, "rewards/margins": 12.931978225708008, "rewards/rejected": -16.473726272583008, "step": 1526 }, { "epoch": 0.9499222395023328, "grad_norm": 8.782593727111816, "learning_rate": 2.766251728907331e-07, "logits/chosen": -1.5890283584594727, "logits/rejected": 2.0673530101776123, "logps/chosen": -332.4222717285156, "logps/rejected": -756.3360595703125, "loss": 0.1436, "rewards/accuracies": 0.875, "rewards/chosen": -1.7534908056259155, "rewards/margins": 14.890535354614258, "rewards/rejected": -16.644027709960938, "step": 1527 }, { "epoch": 0.9505443234836704, "grad_norm": 0.29920321702957153, "learning_rate": 2.731673582295989e-07, "logits/chosen": -0.9150745868682861, "logits/rejected": 2.4671788215637207, "logps/chosen": -503.2266845703125, "logps/rejected": -846.8163452148438, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": -5.103777885437012, "rewards/margins": 14.150735855102539, "rewards/rejected": -19.254512786865234, "step": 1528 }, { "epoch": 0.9511664074650078, "grad_norm": 0.022912031039595604, "learning_rate": 2.6970954356846476e-07, "logits/chosen": -0.6790182590484619, "logits/rejected": 2.2809972763061523, "logps/chosen": -486.34222412109375, "logps/rejected": -792.0262451171875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -3.8332505226135254, "rewards/margins": 14.360332489013672, "rewards/rejected": -18.193584442138672, "step": 1529 }, { "epoch": 0.9517884914463453, "grad_norm": 0.08280880004167557, "learning_rate": 2.662517289073306e-07, "logits/chosen": -0.21404844522476196, "logits/rejected": 2.1036627292633057, "logps/chosen": -546.2064208984375, "logps/rejected": -801.6506958007812, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -4.5713043212890625, "rewards/margins": 9.85201644897461, "rewards/rejected": -14.423320770263672, "step": 1530 }, { "epoch": 0.9524105754276827, "grad_norm": 1.2304530143737793, "learning_rate": 2.627939142461964e-07, "logits/chosen": 0.7650445699691772, "logits/rejected": 2.394505500793457, "logps/chosen": -585.1988525390625, "logps/rejected": -852.7142333984375, "loss": 0.0117, "rewards/accuracies": 1.0, "rewards/chosen": -3.451056718826294, "rewards/margins": 15.720792770385742, "rewards/rejected": -19.171850204467773, "step": 1531 }, { "epoch": 0.9530326594090203, "grad_norm": 0.018275698646903038, "learning_rate": 2.5933609958506226e-07, "logits/chosen": -2.3823533058166504, "logits/rejected": 1.0826777219772339, "logps/chosen": -503.57940673828125, "logps/rejected": -874.535400390625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -3.612125873565674, "rewards/margins": 12.302767753601074, "rewards/rejected": -15.914894104003906, "step": 1532 }, { "epoch": 0.9536547433903577, "grad_norm": 4.723761081695557, "learning_rate": 2.558782849239281e-07, "logits/chosen": 0.3223056495189667, "logits/rejected": 3.870847702026367, "logps/chosen": -414.83172607421875, "logps/rejected": -703.7427978515625, "loss": 0.064, "rewards/accuracies": 1.0, "rewards/chosen": -3.606599807739258, "rewards/margins": 11.333426475524902, "rewards/rejected": -14.94002628326416, "step": 1533 }, { "epoch": 0.9542768273716952, "grad_norm": 0.01829158328473568, "learning_rate": 2.524204702627939e-07, "logits/chosen": -0.3641936182975769, "logits/rejected": 2.3222568035125732, "logps/chosen": -555.9605712890625, "logps/rejected": -861.835693359375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -4.06725549697876, "rewards/margins": 14.788288116455078, "rewards/rejected": -18.85554313659668, "step": 1534 }, { "epoch": 0.9548989113530326, "grad_norm": 0.1500415951013565, "learning_rate": 2.4896265560165975e-07, "logits/chosen": 0.4342190623283386, "logits/rejected": 1.165311574935913, "logps/chosen": -587.0590209960938, "logps/rejected": -775.3698120117188, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -2.6179704666137695, "rewards/margins": 13.51778793334961, "rewards/rejected": -16.135757446289062, "step": 1535 }, { "epoch": 0.9555209953343702, "grad_norm": 0.031078308820724487, "learning_rate": 2.455048409405256e-07, "logits/chosen": -1.7139317989349365, "logits/rejected": 0.8549051880836487, "logps/chosen": -426.7078857421875, "logps/rejected": -670.10888671875, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.2191972732543945, "rewards/margins": 13.253702163696289, "rewards/rejected": -16.472900390625, "step": 1536 }, { "epoch": 0.9561430793157076, "grad_norm": 10.57620906829834, "learning_rate": 2.420470262793915e-07, "logits/chosen": -1.224377155303955, "logits/rejected": 1.0194950103759766, "logps/chosen": -432.858642578125, "logps/rejected": -656.4475708007812, "loss": 0.5292, "rewards/accuracies": 0.875, "rewards/chosen": -4.1023993492126465, "rewards/margins": 10.600327491760254, "rewards/rejected": -14.702727317810059, "step": 1537 }, { "epoch": 0.9567651632970451, "grad_norm": 0.29772108793258667, "learning_rate": 2.3858921161825725e-07, "logits/chosen": -2.537130355834961, "logits/rejected": 1.2822535037994385, "logps/chosen": -330.567138671875, "logps/rejected": -684.5046997070312, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": -3.5440661907196045, "rewards/margins": 12.692886352539062, "rewards/rejected": -16.23695182800293, "step": 1538 }, { "epoch": 0.9573872472783825, "grad_norm": 0.19515375792980194, "learning_rate": 2.351313969571231e-07, "logits/chosen": -2.651651620864868, "logits/rejected": -0.6324683427810669, "logps/chosen": -414.35906982421875, "logps/rejected": -654.8836669921875, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -3.1223702430725098, "rewards/margins": 10.706483840942383, "rewards/rejected": -13.828853607177734, "step": 1539 }, { "epoch": 0.9580093312597201, "grad_norm": 0.00010136785567738116, "learning_rate": 2.3167358229598897e-07, "logits/chosen": -1.9493086338043213, "logits/rejected": 2.0097298622131348, "logps/chosen": -503.27490234375, "logps/rejected": -968.0970458984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.2895309925079346, "rewards/margins": 17.02383804321289, "rewards/rejected": -19.313369750976562, "step": 1540 }, { "epoch": 0.9586314152410575, "grad_norm": 0.011455756612122059, "learning_rate": 2.2821576763485478e-07, "logits/chosen": -1.8426775932312012, "logits/rejected": 2.9476571083068848, "logps/chosen": -256.5271911621094, "logps/rejected": -751.1910400390625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -1.2290189266204834, "rewards/margins": 15.71557331085205, "rewards/rejected": -16.944591522216797, "step": 1541 }, { "epoch": 0.959253499222395, "grad_norm": 11.869149208068848, "learning_rate": 2.2475795297372064e-07, "logits/chosen": -1.0521043539047241, "logits/rejected": 0.6745296120643616, "logps/chosen": -555.755859375, "logps/rejected": -686.623046875, "loss": 0.9045, "rewards/accuracies": 0.875, "rewards/chosen": -5.145055294036865, "rewards/margins": 9.310370445251465, "rewards/rejected": -14.455427169799805, "step": 1542 }, { "epoch": 0.9598755832037325, "grad_norm": 6.648984909057617, "learning_rate": 2.2130013831258647e-07, "logits/chosen": -4.095139503479004, "logits/rejected": 0.7199077606201172, "logps/chosen": -352.61834716796875, "logps/rejected": -838.4937744140625, "loss": 0.0908, "rewards/accuracies": 1.0, "rewards/chosen": -2.1716346740722656, "rewards/margins": 14.20274543762207, "rewards/rejected": -16.374380111694336, "step": 1543 }, { "epoch": 0.96049766718507, "grad_norm": 11.940282821655273, "learning_rate": 2.1784232365145228e-07, "logits/chosen": -0.5905404686927795, "logits/rejected": 1.2449567317962646, "logps/chosen": -536.0224609375, "logps/rejected": -712.2678833007812, "loss": 0.7575, "rewards/accuracies": 0.875, "rewards/chosen": -3.746596336364746, "rewards/margins": 8.702506065368652, "rewards/rejected": -12.449102401733398, "step": 1544 }, { "epoch": 0.9611197511664075, "grad_norm": 0.007170742843300104, "learning_rate": 2.1438450899031814e-07, "logits/chosen": -1.2545244693756104, "logits/rejected": 2.082170248031616, "logps/chosen": -324.8861389160156, "logps/rejected": -632.4902954101562, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -1.113025426864624, "rewards/margins": 16.253585815429688, "rewards/rejected": -17.366613388061523, "step": 1545 }, { "epoch": 0.9617418351477449, "grad_norm": 0.0035321838222444057, "learning_rate": 2.10926694329184e-07, "logits/chosen": -0.7418175935745239, "logits/rejected": 2.863408327102661, "logps/chosen": -379.0358581542969, "logps/rejected": -730.5086059570312, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -2.403069257736206, "rewards/margins": 14.533123970031738, "rewards/rejected": -16.936195373535156, "step": 1546 }, { "epoch": 0.9623639191290825, "grad_norm": 0.6311712861061096, "learning_rate": 2.074688796680498e-07, "logits/chosen": -2.021195411682129, "logits/rejected": 0.4675508141517639, "logps/chosen": -313.4454345703125, "logps/rejected": -594.7869873046875, "loss": 0.0194, "rewards/accuracies": 1.0, "rewards/chosen": -1.4394303560256958, "rewards/margins": 9.133974075317383, "rewards/rejected": -10.573404312133789, "step": 1547 }, { "epoch": 0.9629860031104199, "grad_norm": 0.03309885412454605, "learning_rate": 2.0401106500691564e-07, "logits/chosen": -1.6929857730865479, "logits/rejected": 1.8000214099884033, "logps/chosen": -364.95758056640625, "logps/rejected": -746.5244140625, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -1.5106658935546875, "rewards/margins": 14.32687759399414, "rewards/rejected": -15.837543487548828, "step": 1548 }, { "epoch": 0.9636080870917574, "grad_norm": 0.04415616765618324, "learning_rate": 2.005532503457815e-07, "logits/chosen": 0.8085697293281555, "logits/rejected": 2.484827995300293, "logps/chosen": -682.8201904296875, "logps/rejected": -821.82666015625, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -3.8033266067504883, "rewards/margins": 11.43174934387207, "rewards/rejected": -15.235076904296875, "step": 1549 }, { "epoch": 0.9642301710730948, "grad_norm": 0.0844784751534462, "learning_rate": 1.970954356846473e-07, "logits/chosen": -0.19745740294456482, "logits/rejected": 1.803208827972412, "logps/chosen": -529.65673828125, "logps/rejected": -823.01025390625, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -3.970705270767212, "rewards/margins": 13.859357833862305, "rewards/rejected": -17.830062866210938, "step": 1550 }, { "epoch": 0.9648522550544324, "grad_norm": 2.1351161003112793, "learning_rate": 1.9363762102351316e-07, "logits/chosen": 0.011476993560791016, "logits/rejected": 2.1366004943847656, "logps/chosen": -514.6240234375, "logps/rejected": -786.8828735351562, "loss": 0.0277, "rewards/accuracies": 1.0, "rewards/chosen": -3.639784812927246, "rewards/margins": 14.652302742004395, "rewards/rejected": -18.292089462280273, "step": 1551 }, { "epoch": 0.9654743390357698, "grad_norm": 2.9624438285827637, "learning_rate": 1.9017980636237897e-07, "logits/chosen": -3.663362503051758, "logits/rejected": 0.6506034135818481, "logps/chosen": -413.92218017578125, "logps/rejected": -859.7926025390625, "loss": 0.0383, "rewards/accuracies": 1.0, "rewards/chosen": -3.923753261566162, "rewards/margins": 15.381858825683594, "rewards/rejected": -19.30561065673828, "step": 1552 }, { "epoch": 0.9660964230171073, "grad_norm": 5.205399513244629, "learning_rate": 1.8672199170124483e-07, "logits/chosen": 0.19705966114997864, "logits/rejected": 1.1380702257156372, "logps/chosen": -514.0706176757812, "logps/rejected": -730.266845703125, "loss": 0.1066, "rewards/accuracies": 0.875, "rewards/chosen": -5.549016952514648, "rewards/margins": 9.77924633026123, "rewards/rejected": -15.328262329101562, "step": 1553 }, { "epoch": 0.9667185069984447, "grad_norm": 1.4268311262130737, "learning_rate": 1.8326417704011066e-07, "logits/chosen": 0.7183610796928406, "logits/rejected": 2.714831590652466, "logps/chosen": -547.2266845703125, "logps/rejected": -830.1661376953125, "loss": 0.0155, "rewards/accuracies": 1.0, "rewards/chosen": -3.8605234622955322, "rewards/margins": 11.226141929626465, "rewards/rejected": -15.086665153503418, "step": 1554 }, { "epoch": 0.9673405909797823, "grad_norm": 1.2064564228057861, "learning_rate": 1.798063623789765e-07, "logits/chosen": -0.40805429220199585, "logits/rejected": 1.9281114339828491, "logps/chosen": -540.417236328125, "logps/rejected": -863.123779296875, "loss": 0.0132, "rewards/accuracies": 1.0, "rewards/chosen": -4.08026123046875, "rewards/margins": 14.170088768005371, "rewards/rejected": -18.250350952148438, "step": 1555 }, { "epoch": 0.9679626749611198, "grad_norm": 1.9970312118530273, "learning_rate": 1.7634854771784233e-07, "logits/chosen": -0.3715158700942993, "logits/rejected": 1.944554328918457, "logps/chosen": -448.35150146484375, "logps/rejected": -716.911376953125, "loss": 0.0197, "rewards/accuracies": 1.0, "rewards/chosen": -5.452033996582031, "rewards/margins": 12.853092193603516, "rewards/rejected": -18.305126190185547, "step": 1556 }, { "epoch": 0.9685847589424572, "grad_norm": 0.017970247194170952, "learning_rate": 1.728907330567082e-07, "logits/chosen": -2.6948776245117188, "logits/rejected": 0.7367205619812012, "logps/chosen": -437.25787353515625, "logps/rejected": -781.0352783203125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -3.7750802040100098, "rewards/margins": 13.180065155029297, "rewards/rejected": -16.95514488220215, "step": 1557 }, { "epoch": 0.9692068429237947, "grad_norm": 11.161453247070312, "learning_rate": 1.69432918395574e-07, "logits/chosen": 1.9523109197616577, "logits/rejected": 3.260286331176758, "logps/chosen": -611.6287841796875, "logps/rejected": -707.8623046875, "loss": 0.4281, "rewards/accuracies": 0.875, "rewards/chosen": -4.9549784660339355, "rewards/margins": 8.105986595153809, "rewards/rejected": -13.060964584350586, "step": 1558 }, { "epoch": 0.9698289269051322, "grad_norm": 0.8565696477890015, "learning_rate": 1.6597510373443985e-07, "logits/chosen": -0.6643438339233398, "logits/rejected": 1.50982666015625, "logps/chosen": -519.898193359375, "logps/rejected": -835.9902954101562, "loss": 0.0297, "rewards/accuracies": 1.0, "rewards/chosen": -5.655638217926025, "rewards/margins": 13.74305534362793, "rewards/rejected": -19.398693084716797, "step": 1559 }, { "epoch": 0.9704510108864697, "grad_norm": 2.831205368041992, "learning_rate": 1.625172890733057e-07, "logits/chosen": 1.7786824703216553, "logits/rejected": 2.3559446334838867, "logps/chosen": -634.5929565429688, "logps/rejected": -845.9027099609375, "loss": 0.0388, "rewards/accuracies": 1.0, "rewards/chosen": -4.328904151916504, "rewards/margins": 12.371463775634766, "rewards/rejected": -16.700368881225586, "step": 1560 }, { "epoch": 0.9710730948678071, "grad_norm": 5.573691368103027, "learning_rate": 1.5905947441217152e-07, "logits/chosen": -0.9601710438728333, "logits/rejected": -1.0674189329147339, "logps/chosen": -493.5607604980469, "logps/rejected": -603.0784912109375, "loss": 0.0688, "rewards/accuracies": 1.0, "rewards/chosen": -2.0458645820617676, "rewards/margins": 8.517045974731445, "rewards/rejected": -10.562911033630371, "step": 1561 }, { "epoch": 0.9716951788491446, "grad_norm": 0.2648980915546417, "learning_rate": 1.5560165975103735e-07, "logits/chosen": -1.2928415536880493, "logits/rejected": 1.9937264919281006, "logps/chosen": -388.36419677734375, "logps/rejected": -803.8955078125, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": -2.4528422355651855, "rewards/margins": 17.0137882232666, "rewards/rejected": -19.466630935668945, "step": 1562 }, { "epoch": 0.9723172628304821, "grad_norm": 2.7700612545013428, "learning_rate": 1.5214384508990319e-07, "logits/chosen": -1.268303632736206, "logits/rejected": 0.031665682792663574, "logps/chosen": -561.8787231445312, "logps/rejected": -712.3067016601562, "loss": 0.046, "rewards/accuracies": 1.0, "rewards/chosen": -6.764484405517578, "rewards/margins": 9.113414764404297, "rewards/rejected": -15.877898216247559, "step": 1563 }, { "epoch": 0.9729393468118196, "grad_norm": 1.6701796054840088, "learning_rate": 1.4868603042876902e-07, "logits/chosen": 0.02167147397994995, "logits/rejected": 2.425312042236328, "logps/chosen": -568.426025390625, "logps/rejected": -816.6539306640625, "loss": 0.0135, "rewards/accuracies": 1.0, "rewards/chosen": -5.132253646850586, "rewards/margins": 11.070293426513672, "rewards/rejected": -16.202547073364258, "step": 1564 }, { "epoch": 0.973561430793157, "grad_norm": 2.234860420227051, "learning_rate": 1.4522821576763488e-07, "logits/chosen": -5.306347846984863, "logits/rejected": 0.6736952066421509, "logps/chosen": -249.33102416992188, "logps/rejected": -723.729736328125, "loss": 0.0498, "rewards/accuracies": 1.0, "rewards/chosen": -1.262890100479126, "rewards/margins": 13.001165390014648, "rewards/rejected": -14.264055252075195, "step": 1565 }, { "epoch": 0.9741835147744946, "grad_norm": 0.12099715322256088, "learning_rate": 1.417704011065007e-07, "logits/chosen": 0.05978900194168091, "logits/rejected": 1.8876545429229736, "logps/chosen": -444.42218017578125, "logps/rejected": -767.0238037109375, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -3.7884368896484375, "rewards/margins": 11.69379997253418, "rewards/rejected": -15.482236862182617, "step": 1566 }, { "epoch": 0.9748055987558321, "grad_norm": 12.424419403076172, "learning_rate": 1.3831258644536654e-07, "logits/chosen": 0.49236994981765747, "logits/rejected": 1.3416082859039307, "logps/chosen": -658.4081420898438, "logps/rejected": -909.7670288085938, "loss": 0.2286, "rewards/accuracies": 0.875, "rewards/chosen": -7.0918474197387695, "rewards/margins": 10.68513298034668, "rewards/rejected": -17.776979446411133, "step": 1567 }, { "epoch": 0.9754276827371695, "grad_norm": 0.016544483602046967, "learning_rate": 1.3485477178423238e-07, "logits/chosen": 0.034759461879730225, "logits/rejected": 1.862891435623169, "logps/chosen": -519.8224487304688, "logps/rejected": -792.83740234375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -3.4151742458343506, "rewards/margins": 13.368707656860352, "rewards/rejected": -16.78388214111328, "step": 1568 }, { "epoch": 0.976049766718507, "grad_norm": 0.023624040186405182, "learning_rate": 1.313969571230982e-07, "logits/chosen": -0.10885626077651978, "logits/rejected": 0.9151186943054199, "logps/chosen": -567.5103149414062, "logps/rejected": -715.1940307617188, "loss": 0.0868, "rewards/accuracies": 1.0, "rewards/chosen": -4.4652533531188965, "rewards/margins": 9.383955001831055, "rewards/rejected": -13.849209785461426, "step": 1569 }, { "epoch": 0.9766718506998445, "grad_norm": 1.0062243938446045, "learning_rate": 1.2793914246196404e-07, "logits/chosen": -2.9531264305114746, "logits/rejected": -0.4524778127670288, "logps/chosen": -392.9182434082031, "logps/rejected": -666.7723999023438, "loss": 0.0129, "rewards/accuracies": 1.0, "rewards/chosen": -1.774857521057129, "rewards/margins": 7.732518672943115, "rewards/rejected": -9.507376670837402, "step": 1570 }, { "epoch": 0.977293934681182, "grad_norm": 3.988088846206665, "learning_rate": 1.2448132780082988e-07, "logits/chosen": -2.1142637729644775, "logits/rejected": 0.7047243118286133, "logps/chosen": -490.88800048828125, "logps/rejected": -774.3189697265625, "loss": 0.0564, "rewards/accuracies": 1.0, "rewards/chosen": -4.174463748931885, "rewards/margins": 11.02910041809082, "rewards/rejected": -15.203563690185547, "step": 1571 }, { "epoch": 0.9779160186625194, "grad_norm": 2.212214469909668, "learning_rate": 1.2102351313969574e-07, "logits/chosen": -1.1277285814285278, "logits/rejected": 1.7097554206848145, "logps/chosen": -368.8323059082031, "logps/rejected": -644.16357421875, "loss": 0.0361, "rewards/accuracies": 1.0, "rewards/chosen": -2.851954460144043, "rewards/margins": 7.968801498413086, "rewards/rejected": -10.820755958557129, "step": 1572 }, { "epoch": 0.9785381026438569, "grad_norm": 5.499125003814697, "learning_rate": 1.1756569847856156e-07, "logits/chosen": -4.320676326751709, "logits/rejected": 2.5876269340515137, "logps/chosen": -291.15020751953125, "logps/rejected": -928.403564453125, "loss": 0.079, "rewards/accuracies": 1.0, "rewards/chosen": -2.836251735687256, "rewards/margins": 18.819536209106445, "rewards/rejected": -21.65578842163086, "step": 1573 }, { "epoch": 0.9791601866251944, "grad_norm": 6.047277927398682, "learning_rate": 1.1410788381742739e-07, "logits/chosen": 0.5327462553977966, "logits/rejected": 1.1916804313659668, "logps/chosen": -507.224609375, "logps/rejected": -649.6524658203125, "loss": 0.3106, "rewards/accuracies": 0.875, "rewards/chosen": -4.399947643280029, "rewards/margins": 12.195821762084961, "rewards/rejected": -16.59576988220215, "step": 1574 }, { "epoch": 0.9797822706065319, "grad_norm": 0.9368109107017517, "learning_rate": 1.1065006915629324e-07, "logits/chosen": -1.6990697383880615, "logits/rejected": 1.1483949422836304, "logps/chosen": -508.99200439453125, "logps/rejected": -776.5418701171875, "loss": 0.0199, "rewards/accuracies": 1.0, "rewards/chosen": -3.7406609058380127, "rewards/margins": 10.541295051574707, "rewards/rejected": -14.281956672668457, "step": 1575 }, { "epoch": 0.9804043545878693, "grad_norm": 9.66633415222168, "learning_rate": 1.0719225449515907e-07, "logits/chosen": 0.44794347882270813, "logits/rejected": 0.5682996511459351, "logps/chosen": -593.1841430664062, "logps/rejected": -766.03466796875, "loss": 0.1964, "rewards/accuracies": 0.875, "rewards/chosen": -6.2166748046875, "rewards/margins": 8.318780899047852, "rewards/rejected": -14.535455703735352, "step": 1576 }, { "epoch": 0.9810264385692068, "grad_norm": 2.6417572498321533, "learning_rate": 1.037344398340249e-07, "logits/chosen": -1.888482928276062, "logits/rejected": 0.507079541683197, "logps/chosen": -251.20663452148438, "logps/rejected": -534.8086547851562, "loss": 0.0538, "rewards/accuracies": 1.0, "rewards/chosen": -1.218980312347412, "rewards/margins": 14.062263488769531, "rewards/rejected": -15.281245231628418, "step": 1577 }, { "epoch": 0.9816485225505444, "grad_norm": 0.03893828019499779, "learning_rate": 1.0027662517289075e-07, "logits/chosen": -1.948725938796997, "logits/rejected": 1.3928252458572388, "logps/chosen": -300.5382080078125, "logps/rejected": -669.07275390625, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -2.5580976009368896, "rewards/margins": 15.937835693359375, "rewards/rejected": -18.495935440063477, "step": 1578 }, { "epoch": 0.9822706065318818, "grad_norm": 0.021250003948807716, "learning_rate": 9.681881051175658e-08, "logits/chosen": -3.2927842140197754, "logits/rejected": 1.0782612562179565, "logps/chosen": -523.6760864257812, "logps/rejected": -907.0556640625, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.5903611183166504, "rewards/margins": 13.556657791137695, "rewards/rejected": -16.147018432617188, "step": 1579 }, { "epoch": 0.9828926905132193, "grad_norm": 0.3931594491004944, "learning_rate": 9.336099585062241e-08, "logits/chosen": -1.237513542175293, "logits/rejected": 1.2071993350982666, "logps/chosen": -440.05194091796875, "logps/rejected": -806.3787231445312, "loss": 0.0058, "rewards/accuracies": 1.0, "rewards/chosen": -2.7245163917541504, "rewards/margins": 12.151735305786133, "rewards/rejected": -14.876253128051758, "step": 1580 }, { "epoch": 0.9835147744945568, "grad_norm": 0.6283904910087585, "learning_rate": 8.990318118948825e-08, "logits/chosen": -1.5360066890716553, "logits/rejected": 1.2831445932388306, "logps/chosen": -434.362548828125, "logps/rejected": -659.1688842773438, "loss": 0.0072, "rewards/accuracies": 1.0, "rewards/chosen": -2.4294538497924805, "rewards/margins": 11.354024887084961, "rewards/rejected": -13.783478736877441, "step": 1581 }, { "epoch": 0.9841368584758943, "grad_norm": 7.694677352905273, "learning_rate": 8.64453665283541e-08, "logits/chosen": -0.07682466506958008, "logits/rejected": 1.3886399269104004, "logps/chosen": -484.074462890625, "logps/rejected": -689.6983642578125, "loss": 0.2003, "rewards/accuracies": 0.875, "rewards/chosen": -4.543185710906982, "rewards/margins": 10.547569274902344, "rewards/rejected": -15.090754508972168, "step": 1582 }, { "epoch": 0.9847589424572317, "grad_norm": 0.31465888023376465, "learning_rate": 8.298755186721993e-08, "logits/chosen": 0.4256591498851776, "logits/rejected": 1.1971001625061035, "logps/chosen": -598.690673828125, "logps/rejected": -858.3633422851562, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": -4.389150619506836, "rewards/margins": 12.232458114624023, "rewards/rejected": -16.62160873413086, "step": 1583 }, { "epoch": 0.9853810264385692, "grad_norm": 3.761500597000122, "learning_rate": 7.952973720608576e-08, "logits/chosen": -0.7873997092247009, "logits/rejected": 1.4089903831481934, "logps/chosen": -493.55615234375, "logps/rejected": -770.2498168945312, "loss": 0.0768, "rewards/accuracies": 1.0, "rewards/chosen": -4.764012813568115, "rewards/margins": 11.77115249633789, "rewards/rejected": -16.535165786743164, "step": 1584 }, { "epoch": 0.9860031104199067, "grad_norm": 0.027463171631097794, "learning_rate": 7.607192254495159e-08, "logits/chosen": -2.4276371002197266, "logits/rejected": 1.2843549251556396, "logps/chosen": -276.3843078613281, "logps/rejected": -705.26953125, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -2.2226333618164062, "rewards/margins": 14.640146255493164, "rewards/rejected": -16.86277961730957, "step": 1585 }, { "epoch": 0.9866251944012442, "grad_norm": 0.43335676193237305, "learning_rate": 7.261410788381744e-08, "logits/chosen": -1.1822329759597778, "logits/rejected": 2.245922565460205, "logps/chosen": -423.20159912109375, "logps/rejected": -844.0552368164062, "loss": 0.0064, "rewards/accuracies": 1.0, "rewards/chosen": -3.269690752029419, "rewards/margins": 12.975107192993164, "rewards/rejected": -16.244796752929688, "step": 1586 }, { "epoch": 0.9872472783825816, "grad_norm": 0.003726461436599493, "learning_rate": 6.915629322268327e-08, "logits/chosen": -1.8683693408966064, "logits/rejected": 1.236525535583496, "logps/chosen": -295.48199462890625, "logps/rejected": -656.7084350585938, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -2.1296887397766113, "rewards/margins": 14.68983268737793, "rewards/rejected": -16.819520950317383, "step": 1587 }, { "epoch": 0.9878693623639191, "grad_norm": 0.5636923313140869, "learning_rate": 6.56984785615491e-08, "logits/chosen": -2.750426769256592, "logits/rejected": 2.2429304122924805, "logps/chosen": -405.189697265625, "logps/rejected": -908.8010864257812, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/chosen": -3.451976776123047, "rewards/margins": 16.424623489379883, "rewards/rejected": -19.876602172851562, "step": 1588 }, { "epoch": 0.9884914463452567, "grad_norm": 0.005049501080065966, "learning_rate": 6.224066390041494e-08, "logits/chosen": -2.2462644577026367, "logits/rejected": 1.1538283824920654, "logps/chosen": -407.2684326171875, "logps/rejected": -777.2849731445312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.622037410736084, "rewards/margins": 15.885320663452148, "rewards/rejected": -20.50735855102539, "step": 1589 }, { "epoch": 0.9891135303265941, "grad_norm": 0.059935737401247025, "learning_rate": 5.878284923928078e-08, "logits/chosen": -1.0047647953033447, "logits/rejected": 0.8983525633811951, "logps/chosen": -514.8770141601562, "logps/rejected": -776.4554443359375, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -4.368917942047119, "rewards/margins": 13.796619415283203, "rewards/rejected": -18.165538787841797, "step": 1590 }, { "epoch": 0.9897356143079316, "grad_norm": 0.001928742160089314, "learning_rate": 5.532503457814662e-08, "logits/chosen": -2.6652493476867676, "logits/rejected": 1.954436182975769, "logps/chosen": -374.34832763671875, "logps/rejected": -854.2645263671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.7334938049316406, "rewards/margins": 18.311786651611328, "rewards/rejected": -21.04528045654297, "step": 1591 }, { "epoch": 0.990357698289269, "grad_norm": 0.025692729279398918, "learning_rate": 5.186721991701245e-08, "logits/chosen": 1.3157975673675537, "logits/rejected": 2.1941699981689453, "logps/chosen": -583.7099609375, "logps/rejected": -770.6248168945312, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.3850526809692383, "rewards/margins": 13.637388229370117, "rewards/rejected": -17.022441864013672, "step": 1592 }, { "epoch": 0.9909797822706066, "grad_norm": 41.99399948120117, "learning_rate": 4.840940525587829e-08, "logits/chosen": -1.2653218507766724, "logits/rejected": 0.7471568584442139, "logps/chosen": -467.0107421875, "logps/rejected": -706.9974975585938, "loss": 0.4885, "rewards/accuracies": 0.75, "rewards/chosen": -5.72601842880249, "rewards/margins": 9.918241500854492, "rewards/rejected": -15.644259452819824, "step": 1593 }, { "epoch": 0.991601866251944, "grad_norm": 0.14947909116744995, "learning_rate": 4.4951590594744124e-08, "logits/chosen": 0.9293948411941528, "logits/rejected": 2.489546060562134, "logps/chosen": -674.1929931640625, "logps/rejected": -898.6318359375, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -5.4879984855651855, "rewards/margins": 13.477157592773438, "rewards/rejected": -18.96515464782715, "step": 1594 }, { "epoch": 0.9922239502332815, "grad_norm": 0.038526613265275955, "learning_rate": 4.1493775933609963e-08, "logits/chosen": -2.769157648086548, "logits/rejected": 2.3426613807678223, "logps/chosen": -307.521728515625, "logps/rejected": -765.708740234375, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -2.8973703384399414, "rewards/margins": 15.384183883666992, "rewards/rejected": -18.281553268432617, "step": 1595 }, { "epoch": 0.9928460342146189, "grad_norm": 7.8814496994018555, "learning_rate": 3.8035961272475797e-08, "logits/chosen": 0.42720240354537964, "logits/rejected": 1.5588419437408447, "logps/chosen": -665.358642578125, "logps/rejected": -868.8599853515625, "loss": 0.199, "rewards/accuracies": 0.875, "rewards/chosen": -4.4054856300354, "rewards/margins": 7.876606464385986, "rewards/rejected": -12.28209114074707, "step": 1596 }, { "epoch": 0.9934681181959565, "grad_norm": 0.001687605632469058, "learning_rate": 3.4578146611341636e-08, "logits/chosen": -1.5893828868865967, "logits/rejected": 0.42079970240592957, "logps/chosen": -475.39031982421875, "logps/rejected": -775.4556884765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.463973522186279, "rewards/margins": 13.781604766845703, "rewards/rejected": -18.24557876586914, "step": 1597 }, { "epoch": 0.9940902021772939, "grad_norm": 4.156440734863281, "learning_rate": 3.112033195020747e-08, "logits/chosen": -1.1414874792099, "logits/rejected": 0.8766258955001831, "logps/chosen": -505.50885009765625, "logps/rejected": -735.5494995117188, "loss": 0.0591, "rewards/accuracies": 1.0, "rewards/chosen": -3.2051823139190674, "rewards/margins": 12.84793472290039, "rewards/rejected": -16.053117752075195, "step": 1598 }, { "epoch": 0.9947122861586314, "grad_norm": 10.47283935546875, "learning_rate": 2.766251728907331e-08, "logits/chosen": -0.1731289029121399, "logits/rejected": 1.4308111667633057, "logps/chosen": -484.865478515625, "logps/rejected": -640.674072265625, "loss": 0.2338, "rewards/accuracies": 0.875, "rewards/chosen": -4.504328727722168, "rewards/margins": 8.184914588928223, "rewards/rejected": -12.68924331665039, "step": 1599 }, { "epoch": 0.995334370139969, "grad_norm": 2.055359125137329, "learning_rate": 2.4204702627939145e-08, "logits/chosen": -1.6867294311523438, "logits/rejected": 1.9117958545684814, "logps/chosen": -318.17303466796875, "logps/rejected": -749.297607421875, "loss": 0.0167, "rewards/accuracies": 1.0, "rewards/chosen": -2.68890643119812, "rewards/margins": 12.414870262145996, "rewards/rejected": -15.103776931762695, "step": 1600 }, { "epoch": 0.9959564541213064, "grad_norm": 13.99416732788086, "learning_rate": 2.0746887966804982e-08, "logits/chosen": -0.4514329731464386, "logits/rejected": 1.2991617918014526, "logps/chosen": -588.4736328125, "logps/rejected": -853.2936401367188, "loss": 0.8047, "rewards/accuracies": 0.875, "rewards/chosen": -4.038499355316162, "rewards/margins": 13.580487251281738, "rewards/rejected": -17.618988037109375, "step": 1601 }, { "epoch": 0.9965785381026439, "grad_norm": 9.918309211730957, "learning_rate": 1.7289073305670818e-08, "logits/chosen": 1.885743260383606, "logits/rejected": 2.4654700756073, "logps/chosen": -644.1548461914062, "logps/rejected": -736.1688232421875, "loss": 0.2252, "rewards/accuracies": 0.875, "rewards/chosen": -2.613771915435791, "rewards/margins": 7.047847747802734, "rewards/rejected": -9.661619186401367, "step": 1602 }, { "epoch": 0.9972006220839813, "grad_norm": 1.9691005945205688, "learning_rate": 1.3831258644536654e-08, "logits/chosen": -1.3531639575958252, "logits/rejected": 2.3963005542755127, "logps/chosen": -425.22833251953125, "logps/rejected": -794.0748291015625, "loss": 0.0334, "rewards/accuracies": 1.0, "rewards/chosen": -3.3977909088134766, "rewards/margins": 10.881702423095703, "rewards/rejected": -14.279492378234863, "step": 1603 }, { "epoch": 0.9978227060653189, "grad_norm": 1.0110143423080444, "learning_rate": 1.0373443983402491e-08, "logits/chosen": 0.6420966386795044, "logits/rejected": 2.6212522983551025, "logps/chosen": -599.81982421875, "logps/rejected": -861.3963012695312, "loss": 0.0114, "rewards/accuracies": 1.0, "rewards/chosen": -3.304236650466919, "rewards/margins": 10.048107147216797, "rewards/rejected": -13.352343559265137, "step": 1604 }, { "epoch": 0.9984447900466563, "grad_norm": 9.439444541931152, "learning_rate": 6.915629322268327e-09, "logits/chosen": -0.18070551753044128, "logits/rejected": 2.4111857414245605, "logps/chosen": -480.1266174316406, "logps/rejected": -759.507568359375, "loss": 0.229, "rewards/accuracies": 0.875, "rewards/chosen": -4.10009241104126, "rewards/margins": 7.733498573303223, "rewards/rejected": -11.83359146118164, "step": 1605 }, { "epoch": 0.9990668740279938, "grad_norm": 6.697127342224121, "learning_rate": 3.4578146611341636e-09, "logits/chosen": -1.0914044380187988, "logits/rejected": 0.5531443357467651, "logps/chosen": -383.8687438964844, "logps/rejected": -613.2139282226562, "loss": 0.1228, "rewards/accuracies": 0.875, "rewards/chosen": -3.927517890930176, "rewards/margins": 9.896014213562012, "rewards/rejected": -13.823532104492188, "step": 1606 }, { "epoch": 0.9996889580093312, "grad_norm": 0.0001396771549480036, "learning_rate": 0.0, "logits/chosen": -2.0554685592651367, "logits/rejected": 2.366708278656006, "logps/chosen": -428.7611389160156, "logps/rejected": -868.82080078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.8558599948883057, "rewards/margins": 18.02965545654297, "rewards/rejected": -20.885513305664062, "step": 1607 } ], "logging_steps": 1, "max_steps": 1607, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }