{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9993482511405605, "eval_steps": 10000, "global_step": 1150, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0017379969585053225, "grad_norm": 204.86322966456535, "learning_rate": 8e-09, "logits/chosen": 0.1418704390525818, "logits/rejected": 0.2809927761554718, "logps/chosen": -477.3938293457031, "logps/rejected": -431.13787841796875, "loss": 0.6996, "nll_loss": 0.44202908873558044, "rewards/accuracies": 0.3125, "rewards/chosen": -0.01593799516558647, "rewards/margins": -0.04110307618975639, "rewards/rejected": 0.025165079161524773, "step": 2 }, { "epoch": 0.003475993917010645, "grad_norm": 150.00820365668775, "learning_rate": 1.6e-08, "logits/chosen": 0.36849209666252136, "logits/rejected": 0.4490591883659363, "logps/chosen": -482.40179443359375, "logps/rejected": -446.5460510253906, "loss": 0.6997, "nll_loss": 0.4239627420902252, "rewards/accuracies": 0.5, "rewards/chosen": -0.041008852422237396, "rewards/margins": -0.017010685056447983, "rewards/rejected": -0.023998167365789413, "step": 4 }, { "epoch": 0.005213990875515968, "grad_norm": 193.7012862807946, "learning_rate": 2.3999999999999997e-08, "logits/chosen": 0.1794443428516388, "logits/rejected": 0.3568742275238037, "logps/chosen": -519.8365478515625, "logps/rejected": -510.6430969238281, "loss": 0.7099, "nll_loss": 0.46996381878852844, "rewards/accuracies": 0.4375, "rewards/chosen": -0.037707142531871796, "rewards/margins": -0.050306886434555054, "rewards/rejected": 0.01259975228458643, "step": 6 }, { "epoch": 0.00695198783402129, "grad_norm": 205.73843329421297, "learning_rate": 3.2e-08, "logits/chosen": 0.44375962018966675, "logits/rejected": 0.3268170952796936, "logps/chosen": -543.103759765625, "logps/rejected": -496.75482177734375, "loss": 0.6864, "nll_loss": 0.5108887553215027, "rewards/accuracies": 0.625, "rewards/chosen": 0.058286286890506744, "rewards/margins": 0.06145687401294708, "rewards/rejected": -0.0031705868896096945, "step": 8 }, { "epoch": 0.008689984792526613, "grad_norm": 200.84076796614073, "learning_rate": 4e-08, "logits/chosen": 0.35375142097473145, "logits/rejected": 0.7080434560775757, "logps/chosen": -415.6636962890625, "logps/rejected": -469.8753662109375, "loss": 0.7101, "nll_loss": 0.398910790681839, "rewards/accuracies": 0.375, "rewards/chosen": -0.062073614448308945, "rewards/margins": -0.04637365788221359, "rewards/rejected": -0.01569996029138565, "step": 10 }, { "epoch": 0.010427981751031936, "grad_norm": 197.54313125975258, "learning_rate": 4.799999999999999e-08, "logits/chosen": 0.35080885887145996, "logits/rejected": 0.41447973251342773, "logps/chosen": -541.642822265625, "logps/rejected": -553.8869018554688, "loss": 0.7093, "nll_loss": 0.4971155524253845, "rewards/accuracies": 0.5625, "rewards/chosen": 0.01922778971493244, "rewards/margins": 0.019500732421875, "rewards/rejected": -0.00027294084429740906, "step": 12 }, { "epoch": 0.012165978709537258, "grad_norm": 238.36601819149988, "learning_rate": 5.6000000000000005e-08, "logits/chosen": 0.2947372794151306, "logits/rejected": 0.23111987113952637, "logps/chosen": -556.4337158203125, "logps/rejected": -469.99188232421875, "loss": 0.6999, "nll_loss": 0.503655195236206, "rewards/accuracies": 0.625, "rewards/chosen": -0.004339606035500765, "rewards/margins": 0.020965958014130592, "rewards/rejected": -0.025305552408099174, "step": 14 }, { "epoch": 0.01390397566804258, "grad_norm": 185.56933262588134, "learning_rate": 6.4e-08, "logits/chosen": 0.44016793370246887, "logits/rejected": 0.40880700945854187, "logps/chosen": -454.967529296875, "logps/rejected": -460.52435302734375, "loss": 0.6806, "nll_loss": 0.4614264667034149, "rewards/accuracies": 0.625, "rewards/chosen": -0.0023229592479765415, "rewards/margins": 0.07889167219400406, "rewards/rejected": -0.08121462166309357, "step": 16 }, { "epoch": 0.015641972626547904, "grad_norm": 147.57386401300792, "learning_rate": 7.2e-08, "logits/chosen": 0.6430061459541321, "logits/rejected": 0.6047405004501343, "logps/chosen": -488.2435607910156, "logps/rejected": -495.81756591796875, "loss": 0.6886, "nll_loss": 0.46323227882385254, "rewards/accuracies": 0.75, "rewards/chosen": 0.0426841676235199, "rewards/margins": 0.06148987263441086, "rewards/rejected": -0.018805695697665215, "step": 18 }, { "epoch": 0.017379969585053227, "grad_norm": 190.85823681589352, "learning_rate": 8e-08, "logits/chosen": 0.6724389791488647, "logits/rejected": 0.7834637761116028, "logps/chosen": -481.4169616699219, "logps/rejected": -496.19061279296875, "loss": 0.7134, "nll_loss": 0.4593673050403595, "rewards/accuracies": 0.5, "rewards/chosen": -0.014772225171327591, "rewards/margins": 0.004669668152928352, "rewards/rejected": -0.019441891461610794, "step": 20 }, { "epoch": 0.01911796654355855, "grad_norm": 188.80442020206598, "learning_rate": 8.8e-08, "logits/chosen": 0.4140382409095764, "logits/rejected": 0.46662065386772156, "logps/chosen": -430.3562927246094, "logps/rejected": -462.67132568359375, "loss": 0.7065, "nll_loss": 0.4172236919403076, "rewards/accuracies": 0.5625, "rewards/chosen": -0.032254599034786224, "rewards/margins": 0.02239227294921875, "rewards/rejected": -0.054646871984004974, "step": 22 }, { "epoch": 0.02085596350206387, "grad_norm": 228.2165299617552, "learning_rate": 9.599999999999999e-08, "logits/chosen": 0.45874932408332825, "logits/rejected": 0.3751963973045349, "logps/chosen": -530.1676025390625, "logps/rejected": -504.3202819824219, "loss": 0.7172, "nll_loss": 0.5031083822250366, "rewards/accuracies": 0.4375, "rewards/chosen": 0.005689051002264023, "rewards/margins": -0.030557062476873398, "rewards/rejected": 0.03624610975384712, "step": 24 }, { "epoch": 0.022593960460569194, "grad_norm": 265.02955682128356, "learning_rate": 1.04e-07, "logits/chosen": 0.5117384791374207, "logits/rejected": 0.4469182789325714, "logps/chosen": -551.666748046875, "logps/rejected": -496.4570007324219, "loss": 0.7189, "nll_loss": 0.5280334949493408, "rewards/accuracies": 0.5, "rewards/chosen": -0.06254005432128906, "rewards/margins": -0.009257127530872822, "rewards/rejected": -0.053282931447029114, "step": 26 }, { "epoch": 0.024331957419074516, "grad_norm": 161.93619304345123, "learning_rate": 1.1200000000000001e-07, "logits/chosen": 0.3897124230861664, "logits/rejected": 0.45780280232429504, "logps/chosen": -481.9251403808594, "logps/rejected": -518.1923828125, "loss": 0.7133, "nll_loss": 0.4771922826766968, "rewards/accuracies": 0.625, "rewards/chosen": -0.036876581609249115, "rewards/margins": 0.03441982343792915, "rewards/rejected": -0.07129640877246857, "step": 28 }, { "epoch": 0.02606995437757984, "grad_norm": 225.11472685922325, "learning_rate": 1.2e-07, "logits/chosen": 0.5148178339004517, "logits/rejected": 0.5700947642326355, "logps/chosen": -473.7587585449219, "logps/rejected": -545.1907958984375, "loss": 0.6845, "nll_loss": 0.4589259624481201, "rewards/accuracies": 0.625, "rewards/chosen": 0.04277763515710831, "rewards/margins": 0.042406272143125534, "rewards/rejected": 0.0003713611513376236, "step": 30 }, { "epoch": 0.02780795133608516, "grad_norm": 317.47335723089736, "learning_rate": 1.28e-07, "logits/chosen": 0.27863508462905884, "logits/rejected": 0.368697851896286, "logps/chosen": -477.912353515625, "logps/rejected": -506.5833435058594, "loss": 0.7088, "nll_loss": 0.4421365559101105, "rewards/accuracies": 0.5, "rewards/chosen": -0.014108658768236637, "rewards/margins": -0.04797716438770294, "rewards/rejected": 0.03386850655078888, "step": 32 }, { "epoch": 0.029545948294590483, "grad_norm": 210.72668411395, "learning_rate": 1.36e-07, "logits/chosen": 0.10042007267475128, "logits/rejected": 0.028779903426766396, "logps/chosen": -548.9990844726562, "logps/rejected": -481.5511779785156, "loss": 0.7029, "nll_loss": 0.4736991822719574, "rewards/accuracies": 0.4375, "rewards/chosen": -0.0562286376953125, "rewards/margins": -0.02881174348294735, "rewards/rejected": -0.0274168960750103, "step": 34 }, { "epoch": 0.03128394525309581, "grad_norm": 209.72814001229338, "learning_rate": 1.44e-07, "logits/chosen": 0.6066944003105164, "logits/rejected": 0.5373053550720215, "logps/chosen": -528.7098999023438, "logps/rejected": -506.41632080078125, "loss": 0.6697, "nll_loss": 0.49464890360832214, "rewards/accuracies": 0.8125, "rewards/chosen": 0.09787749499082565, "rewards/margins": 0.1495254635810852, "rewards/rejected": -0.05164794996380806, "step": 36 }, { "epoch": 0.03302194221160113, "grad_norm": 190.89703407210013, "learning_rate": 1.5199999999999998e-07, "logits/chosen": 0.46255195140838623, "logits/rejected": 0.5460544228553772, "logps/chosen": -482.3389587402344, "logps/rejected": -484.7127990722656, "loss": 0.6884, "nll_loss": 0.4956420958042145, "rewards/accuracies": 0.6875, "rewards/chosen": 0.06698399037122726, "rewards/margins": 0.08448342978954315, "rewards/rejected": -0.017499446868896484, "step": 38 }, { "epoch": 0.03475993917010645, "grad_norm": 199.4593307406834, "learning_rate": 1.6e-07, "logits/chosen": 0.22364471852779388, "logits/rejected": 0.31248578429222107, "logps/chosen": -453.6600341796875, "logps/rejected": -450.4988708496094, "loss": 0.7112, "nll_loss": 0.4363557696342468, "rewards/accuracies": 0.5, "rewards/chosen": -0.05961894989013672, "rewards/margins": -0.03394022583961487, "rewards/rejected": -0.0256787296384573, "step": 40 }, { "epoch": 0.036497936128611776, "grad_norm": 239.19190157790425, "learning_rate": 1.68e-07, "logits/chosen": 0.34523797035217285, "logits/rejected": 0.2734060287475586, "logps/chosen": -522.078857421875, "logps/rejected": -544.6460571289062, "loss": 0.6942, "nll_loss": 0.5025855302810669, "rewards/accuracies": 0.625, "rewards/chosen": -0.013720512390136719, "rewards/margins": 0.032335568219423294, "rewards/rejected": -0.046056076884269714, "step": 42 }, { "epoch": 0.0382359330871171, "grad_norm": 309.03061373800654, "learning_rate": 1.76e-07, "logits/chosen": 0.38464105129241943, "logits/rejected": 0.40568751096725464, "logps/chosen": -469.6867370605469, "logps/rejected": -469.88507080078125, "loss": 0.7049, "nll_loss": 0.4352942705154419, "rewards/accuracies": 0.3125, "rewards/chosen": -0.0630766823887825, "rewards/margins": -0.039861395955085754, "rewards/rejected": -0.023215293884277344, "step": 44 }, { "epoch": 0.03997393004562242, "grad_norm": 288.0039980121617, "learning_rate": 1.84e-07, "logits/chosen": 0.39405491948127747, "logits/rejected": 0.37233734130859375, "logps/chosen": -533.5667114257812, "logps/rejected": -514.7755126953125, "loss": 0.6918, "nll_loss": 0.4772227108478546, "rewards/accuracies": 0.375, "rewards/chosen": 0.031320951879024506, "rewards/margins": 0.005975722335278988, "rewards/rejected": 0.025345228612422943, "step": 46 }, { "epoch": 0.04171192700412774, "grad_norm": 263.27022905858814, "learning_rate": 1.9199999999999997e-07, "logits/chosen": 0.5493794679641724, "logits/rejected": 0.5213139057159424, "logps/chosen": -439.7628173828125, "logps/rejected": -462.7975158691406, "loss": 0.6906, "nll_loss": 0.41747790575027466, "rewards/accuracies": 0.25, "rewards/chosen": 0.03803615644574165, "rewards/margins": -0.06382055580615997, "rewards/rejected": 0.10185670852661133, "step": 48 }, { "epoch": 0.043449923962633065, "grad_norm": 190.65381482431846, "learning_rate": 2e-07, "logits/chosen": 0.6704590320587158, "logits/rejected": 0.5686213374137878, "logps/chosen": -457.4028625488281, "logps/rejected": -493.1829833984375, "loss": 0.6826, "nll_loss": 0.43137940764427185, "rewards/accuracies": 0.6875, "rewards/chosen": 0.05906439200043678, "rewards/margins": 0.08382105827331543, "rewards/rejected": -0.024756668135523796, "step": 50 }, { "epoch": 0.04518792092113839, "grad_norm": 248.201816470774, "learning_rate": 1.99998368664865e-07, "logits/chosen": 0.5073150992393494, "logits/rejected": 0.519672155380249, "logps/chosen": -532.7998046875, "logps/rejected": -534.9505615234375, "loss": 0.6769, "nll_loss": 0.5028703212738037, "rewards/accuracies": 0.625, "rewards/chosen": 0.001774406060576439, "rewards/margins": 0.07039718329906464, "rewards/rejected": -0.06862278282642365, "step": 52 }, { "epoch": 0.04692591787964371, "grad_norm": 211.00552194335916, "learning_rate": 1.9999347471268514e-07, "logits/chosen": 0.6019046902656555, "logits/rejected": 0.6082484126091003, "logps/chosen": -472.231689453125, "logps/rejected": -471.65594482421875, "loss": 0.7054, "nll_loss": 0.4506027400493622, "rewards/accuracies": 0.625, "rewards/chosen": -0.05322122946381569, "rewards/margins": 0.033222489058971405, "rewards/rejected": -0.0864437147974968, "step": 54 }, { "epoch": 0.04866391483814903, "grad_norm": 189.41970732319982, "learning_rate": 1.9998531830313392e-07, "logits/chosen": 0.6955520510673523, "logits/rejected": 0.4593581259250641, "logps/chosen": -482.1744384765625, "logps/rejected": -411.509521484375, "loss": 0.6807, "nll_loss": 0.4741944968700409, "rewards/accuracies": 0.375, "rewards/chosen": 0.03270740434527397, "rewards/margins": -0.01927652209997177, "rewards/rejected": 0.05198393017053604, "step": 56 }, { "epoch": 0.050401911796654354, "grad_norm": 200.24361432233923, "learning_rate": 1.9997389970232808e-07, "logits/chosen": 0.3443059027194977, "logits/rejected": 0.4937664568424225, "logps/chosen": -467.9244384765625, "logps/rejected": -454.7276916503906, "loss": 0.7053, "nll_loss": 0.4400884211063385, "rewards/accuracies": 0.375, "rewards/chosen": -0.06452464312314987, "rewards/margins": -0.08429832011461258, "rewards/rejected": 0.01977367326617241, "step": 58 }, { "epoch": 0.05213990875515968, "grad_norm": 163.032159459724, "learning_rate": 1.9995921928281893e-07, "logits/chosen": 0.422140896320343, "logits/rejected": 0.5024916529655457, "logps/chosen": -464.2950439453125, "logps/rejected": -526.7002563476562, "loss": 0.6961, "nll_loss": 0.4522109031677246, "rewards/accuracies": 0.375, "rewards/chosen": -0.04293403401970863, "rewards/margins": -0.04488658532500267, "rewards/rejected": 0.0019525480456650257, "step": 60 }, { "epoch": 0.053877905713665, "grad_norm": 163.5730877723102, "learning_rate": 1.9994127752358013e-07, "logits/chosen": 0.6417545080184937, "logits/rejected": 0.5685979723930359, "logps/chosen": -535.2261962890625, "logps/rejected": -503.78277587890625, "loss": 0.6931, "nll_loss": 0.5154188871383667, "rewards/accuracies": 0.6875, "rewards/chosen": 0.01960907131433487, "rewards/margins": 0.09830178320407867, "rewards/rejected": -0.0786927193403244, "step": 62 }, { "epoch": 0.05561590267217032, "grad_norm": 164.19734726606004, "learning_rate": 1.9992007500999212e-07, "logits/chosen": 0.2852614223957062, "logits/rejected": 0.309150367975235, "logps/chosen": -434.3333435058594, "logps/rejected": -432.8827209472656, "loss": 0.6697, "nll_loss": 0.42650461196899414, "rewards/accuracies": 0.75, "rewards/chosen": 0.06457091122865677, "rewards/margins": 0.11283140629529953, "rewards/rejected": -0.04826049506664276, "step": 64 }, { "epoch": 0.057353899630675644, "grad_norm": 208.2379517632823, "learning_rate": 1.998956124338231e-07, "logits/chosen": 0.6129292845726013, "logits/rejected": 0.5678445100784302, "logps/chosen": -449.71087646484375, "logps/rejected": -409.4271240234375, "loss": 0.7017, "nll_loss": 0.43394580483436584, "rewards/accuracies": 0.6875, "rewards/chosen": 0.03126039355993271, "rewards/margins": 0.05173855274915695, "rewards/rejected": -0.020478159189224243, "step": 66 }, { "epoch": 0.059091896589180966, "grad_norm": 293.2951894044055, "learning_rate": 1.9986789059320613e-07, "logits/chosen": 0.5414038896560669, "logits/rejected": 0.3374328017234802, "logps/chosen": -512.413818359375, "logps/rejected": -405.919677734375, "loss": 0.6938, "nll_loss": 0.492817223072052, "rewards/accuracies": 0.75, "rewards/chosen": 0.17506714165210724, "rewards/margins": 0.13103190064430237, "rewards/rejected": 0.04403524845838547, "step": 68 }, { "epoch": 0.06082989354768629, "grad_norm": 196.4370462875855, "learning_rate": 1.9983691039261354e-07, "logits/chosen": 0.34781157970428467, "logits/rejected": 0.24695612490177155, "logps/chosen": -475.7244567871094, "logps/rejected": -478.4767761230469, "loss": 0.6567, "nll_loss": 0.4392806589603424, "rewards/accuracies": 0.75, "rewards/chosen": 0.07889433950185776, "rewards/margins": 0.14110715687274933, "rewards/rejected": -0.06221282482147217, "step": 70 }, { "epoch": 0.06256789050619162, "grad_norm": 163.4462233111686, "learning_rate": 1.9980267284282714e-07, "logits/chosen": 0.5121564269065857, "logits/rejected": 0.5738804340362549, "logps/chosen": -440.2140808105469, "logps/rejected": -519.8204345703125, "loss": 0.6778, "nll_loss": 0.42939573526382446, "rewards/accuracies": 0.4375, "rewards/chosen": 0.09369020164012909, "rewards/margins": 0.0274474136531353, "rewards/rejected": 0.06624279171228409, "step": 72 }, { "epoch": 0.06430588746469694, "grad_norm": 226.25500851573472, "learning_rate": 1.9976517906090527e-07, "logits/chosen": 0.482619047164917, "logits/rejected": 0.45125705003738403, "logps/chosen": -459.4217529296875, "logps/rejected": -491.68072509765625, "loss": 0.6939, "nll_loss": 0.46240657567977905, "rewards/accuracies": 0.375, "rewards/chosen": 0.07695970684289932, "rewards/margins": -0.011097146198153496, "rewards/rejected": 0.08805684745311737, "step": 74 }, { "epoch": 0.06604388442320226, "grad_norm": 187.52225346061883, "learning_rate": 1.9972443027014636e-07, "logits/chosen": 0.37846773862838745, "logits/rejected": 0.28516721725463867, "logps/chosen": -477.3094482421875, "logps/rejected": -492.2334899902344, "loss": 0.669, "nll_loss": 0.4478228986263275, "rewards/accuracies": 0.5, "rewards/chosen": 0.18834134936332703, "rewards/margins": -0.018651390448212624, "rewards/rejected": 0.2069927304983139, "step": 76 }, { "epoch": 0.06778188138170758, "grad_norm": 286.4122926243434, "learning_rate": 1.9968042780004915e-07, "logits/chosen": 0.5606168508529663, "logits/rejected": 0.7590384483337402, "logps/chosen": -471.123779296875, "logps/rejected": -469.21038818359375, "loss": 0.6712, "nll_loss": 0.46313852071762085, "rewards/accuracies": 0.5625, "rewards/chosen": 0.18222695589065552, "rewards/margins": 0.03420209884643555, "rewards/rejected": 0.14802484214305878, "step": 78 }, { "epoch": 0.0695198783402129, "grad_norm": 157.28755514832162, "learning_rate": 1.9963317308626913e-07, "logits/chosen": 0.49132657051086426, "logits/rejected": 0.6132655739784241, "logps/chosen": -426.67108154296875, "logps/rejected": -420.4914245605469, "loss": 0.7003, "nll_loss": 0.412084698677063, "rewards/accuracies": 0.5625, "rewards/chosen": 0.16370698809623718, "rewards/margins": -0.004933936521410942, "rewards/rejected": 0.16864091157913208, "step": 80 }, { "epoch": 0.07125787529871823, "grad_norm": 151.4016129399313, "learning_rate": 1.995826676705718e-07, "logits/chosen": 0.5328776836395264, "logits/rejected": 0.45003530383110046, "logps/chosen": -461.80194091796875, "logps/rejected": -497.72930908203125, "loss": 0.6519, "nll_loss": 0.46158552169799805, "rewards/accuracies": 0.6875, "rewards/chosen": 0.260319322347641, "rewards/margins": 0.15180225670337677, "rewards/rejected": 0.10851707309484482, "step": 82 }, { "epoch": 0.07299587225722355, "grad_norm": 245.99900032176603, "learning_rate": 1.9952891320078235e-07, "logits/chosen": 0.2650616765022278, "logits/rejected": 0.2826390862464905, "logps/chosen": -517.3169555664062, "logps/rejected": -564.4183349609375, "loss": 0.6991, "nll_loss": 0.49650338292121887, "rewards/accuracies": 0.4375, "rewards/chosen": 0.193414106965065, "rewards/margins": -0.06524582207202911, "rewards/rejected": 0.2586599588394165, "step": 84 }, { "epoch": 0.07473386921572887, "grad_norm": 336.95168542352815, "learning_rate": 1.9947191143073184e-07, "logits/chosen": 0.3787465989589691, "logits/rejected": 0.31394198536872864, "logps/chosen": -476.78155517578125, "logps/rejected": -523.172607421875, "loss": 0.7061, "nll_loss": 0.48117950558662415, "rewards/accuracies": 0.5625, "rewards/chosen": 0.2214973419904709, "rewards/margins": 0.1059291809797287, "rewards/rejected": 0.11556817591190338, "step": 86 }, { "epoch": 0.0764718661742342, "grad_norm": 154.50191840037084, "learning_rate": 1.9941166422020012e-07, "logits/chosen": 0.5573416352272034, "logits/rejected": 0.5087465643882751, "logps/chosen": -487.1637268066406, "logps/rejected": -495.28753662109375, "loss": 0.6828, "nll_loss": 0.5002313852310181, "rewards/accuracies": 0.5625, "rewards/chosen": 0.14835377037525177, "rewards/margins": -0.005983538925647736, "rewards/rejected": 0.1543373167514801, "step": 88 }, { "epoch": 0.07820986313273952, "grad_norm": 166.90828372159802, "learning_rate": 1.99348173534855e-07, "logits/chosen": 0.4055403769016266, "logits/rejected": 0.5100260972976685, "logps/chosen": -398.7601318359375, "logps/rejected": -449.6722106933594, "loss": 0.648, "nll_loss": 0.3796139061450958, "rewards/accuracies": 0.625, "rewards/chosen": 0.08756943047046661, "rewards/margins": 0.035093024373054504, "rewards/rejected": 0.05247640609741211, "step": 90 }, { "epoch": 0.07994786009124484, "grad_norm": 195.34860330922328, "learning_rate": 1.9928144144618822e-07, "logits/chosen": 0.35453662276268005, "logits/rejected": 0.17467136681079865, "logps/chosen": -527.5667114257812, "logps/rejected": -457.3980407714844, "loss": 0.6638, "nll_loss": 0.4913034439086914, "rewards/accuracies": 0.625, "rewards/chosen": 0.05260029435157776, "rewards/margins": 0.0027903541922569275, "rewards/rejected": 0.04980994015932083, "step": 92 }, { "epoch": 0.08168585704975016, "grad_norm": 193.22023003323437, "learning_rate": 1.992114701314478e-07, "logits/chosen": 0.43743571639060974, "logits/rejected": 0.4072650074958801, "logps/chosen": -459.80718994140625, "logps/rejected": -480.4713439941406, "loss": 0.6549, "nll_loss": 0.442990243434906, "rewards/accuracies": 0.5625, "rewards/chosen": 0.11542568355798721, "rewards/margins": 0.06230240315198898, "rewards/rejected": 0.053123295307159424, "step": 94 }, { "epoch": 0.08342385400825549, "grad_norm": 152.59229490030327, "learning_rate": 1.9913826187356696e-07, "logits/chosen": 0.2101035714149475, "logits/rejected": 0.2910143733024597, "logps/chosen": -496.73968505859375, "logps/rejected": -562.0128784179688, "loss": 0.693, "nll_loss": 0.4974033236503601, "rewards/accuracies": 0.5, "rewards/chosen": 0.035814739763736725, "rewards/margins": -0.005833083763718605, "rewards/rejected": 0.04164781793951988, "step": 96 }, { "epoch": 0.08516185096676081, "grad_norm": 197.87306458058632, "learning_rate": 1.990618190610898e-07, "logits/chosen": 0.6426812410354614, "logits/rejected": 0.6269667148590088, "logps/chosen": -482.47320556640625, "logps/rejected": -498.030517578125, "loss": 0.6742, "nll_loss": 0.45372655987739563, "rewards/accuracies": 0.6875, "rewards/chosen": 0.1103857010602951, "rewards/margins": 0.13235224783420563, "rewards/rejected": -0.02196655422449112, "step": 98 }, { "epoch": 0.08689984792526613, "grad_norm": 162.0131801417083, "learning_rate": 1.9898214418809327e-07, "logits/chosen": 0.30567625164985657, "logits/rejected": 0.3604564666748047, "logps/chosen": -491.9207458496094, "logps/rejected": -483.3408508300781, "loss": 0.6603, "nll_loss": 0.45867347717285156, "rewards/accuracies": 0.4375, "rewards/chosen": -0.022484881803393364, "rewards/margins": -0.0881936103105545, "rewards/rejected": 0.06570873409509659, "step": 100 }, { "epoch": 0.08863784488377145, "grad_norm": 196.02128008024215, "learning_rate": 1.9889923985410573e-07, "logits/chosen": 0.3616268038749695, "logits/rejected": 0.5539577603340149, "logps/chosen": -507.529541015625, "logps/rejected": -502.7597961425781, "loss": 0.6621, "nll_loss": 0.502344012260437, "rewards/accuracies": 0.75, "rewards/chosen": 0.13264751434326172, "rewards/margins": 0.15221844613552094, "rewards/rejected": -0.01957092434167862, "step": 102 }, { "epoch": 0.09037584184227677, "grad_norm": 218.21958324850453, "learning_rate": 1.9881310876402223e-07, "logits/chosen": 0.38627803325653076, "logits/rejected": 0.3380679786205292, "logps/chosen": -490.83709716796875, "logps/rejected": -445.6140441894531, "loss": 0.708, "nll_loss": 0.4629852771759033, "rewards/accuracies": 0.5625, "rewards/chosen": 0.02814798429608345, "rewards/margins": 0.07472963631153107, "rewards/rejected": -0.04658164829015732, "step": 104 }, { "epoch": 0.0921138388007821, "grad_norm": 177.62809187588084, "learning_rate": 1.9872375372801627e-07, "logits/chosen": 0.48396801948547363, "logits/rejected": 0.7172459363937378, "logps/chosen": -467.0228576660156, "logps/rejected": -553.0386352539062, "loss": 0.6736, "nll_loss": 0.44019949436187744, "rewards/accuracies": 0.5, "rewards/chosen": 0.11023005843162537, "rewards/margins": 0.025584600865840912, "rewards/rejected": 0.08464546501636505, "step": 106 }, { "epoch": 0.09385183575928742, "grad_norm": 133.9957527520213, "learning_rate": 1.9863117766144804e-07, "logits/chosen": 0.6896294355392456, "logits/rejected": 0.5460060834884644, "logps/chosen": -519.9830322265625, "logps/rejected": -495.1474304199219, "loss": 0.6545, "nll_loss": 0.5037875771522522, "rewards/accuracies": 0.8125, "rewards/chosen": 0.04275064915418625, "rewards/margins": 0.25670328736305237, "rewards/rejected": -0.21395263075828552, "step": 108 }, { "epoch": 0.09558983271779274, "grad_norm": 251.08859259838198, "learning_rate": 1.985353835847693e-07, "logits/chosen": 0.5879981517791748, "logits/rejected": 0.3931455612182617, "logps/chosen": -512.4330444335938, "logps/rejected": -445.7779541015625, "loss": 0.6627, "nll_loss": 0.49814072251319885, "rewards/accuracies": 0.875, "rewards/chosen": 0.01979961059987545, "rewards/margins": 0.12016735970973969, "rewards/rejected": -0.10036774724721909, "step": 110 }, { "epoch": 0.09732782967629806, "grad_norm": 251.4984918048756, "learning_rate": 1.9843637462342496e-07, "logits/chosen": 0.6077919602394104, "logits/rejected": 0.6450572609901428, "logps/chosen": -452.0365295410156, "logps/rejected": -490.2518310546875, "loss": 0.6746, "nll_loss": 0.44132208824157715, "rewards/accuracies": 0.5, "rewards/chosen": 0.06424455344676971, "rewards/margins": 0.1958976835012436, "rewards/rejected": -0.13165313005447388, "step": 112 }, { "epoch": 0.09906582663480339, "grad_norm": 176.76300504083784, "learning_rate": 1.9833415400775092e-07, "logits/chosen": 0.8745415210723877, "logits/rejected": 0.7276170253753662, "logps/chosen": -474.9569396972656, "logps/rejected": -453.19403076171875, "loss": 0.6874, "nll_loss": 0.45759472250938416, "rewards/accuracies": 0.5, "rewards/chosen": 0.04741678386926651, "rewards/margins": 0.028619293123483658, "rewards/rejected": 0.018797490745782852, "step": 114 }, { "epoch": 0.10080382359330871, "grad_norm": 195.6725076422546, "learning_rate": 1.9822872507286887e-07, "logits/chosen": 0.25701722502708435, "logits/rejected": 0.27016139030456543, "logps/chosen": -468.56292724609375, "logps/rejected": -488.0750732421875, "loss": 0.659, "nll_loss": 0.4411785900592804, "rewards/accuracies": 0.625, "rewards/chosen": 0.141248419880867, "rewards/margins": 0.12923917174339294, "rewards/rejected": 0.012009241618216038, "step": 116 }, { "epoch": 0.10254182055181403, "grad_norm": 223.97345705886528, "learning_rate": 1.9812009125857728e-07, "logits/chosen": 0.24996408820152283, "logits/rejected": 0.18013660609722137, "logps/chosen": -454.4783935546875, "logps/rejected": -452.7356872558594, "loss": 0.6587, "nll_loss": 0.4465644955635071, "rewards/accuracies": 0.5625, "rewards/chosen": 0.20268402993679047, "rewards/margins": 0.1341714859008789, "rewards/rejected": 0.06851252913475037, "step": 118 }, { "epoch": 0.10427981751031935, "grad_norm": 259.9580968692604, "learning_rate": 1.9800825610923934e-07, "logits/chosen": 0.4408469498157501, "logits/rejected": 0.41384851932525635, "logps/chosen": -448.9534912109375, "logps/rejected": -466.5663146972656, "loss": 0.7332, "nll_loss": 0.4316788911819458, "rewards/accuracies": 0.625, "rewards/chosen": 0.009281730279326439, "rewards/margins": 0.08196325600147247, "rewards/rejected": -0.07268151640892029, "step": 120 }, { "epoch": 0.10601781446882468, "grad_norm": 166.16660946467783, "learning_rate": 1.9789322327366719e-07, "logits/chosen": 0.4468734562397003, "logits/rejected": 0.2030533403158188, "logps/chosen": -529.8795776367188, "logps/rejected": -479.37957763671875, "loss": 0.6549, "nll_loss": 0.4917633831501007, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0391303114593029, "rewards/margins": 0.13398800790309906, "rewards/rejected": -0.17311832308769226, "step": 122 }, { "epoch": 0.10775581142733, "grad_norm": 223.08401763734403, "learning_rate": 1.97774996505003e-07, "logits/chosen": 0.5849189758300781, "logits/rejected": 0.4860993027687073, "logps/chosen": -477.4208984375, "logps/rejected": -426.2466125488281, "loss": 0.6735, "nll_loss": 0.4537978768348694, "rewards/accuracies": 0.6875, "rewards/chosen": 0.07169342041015625, "rewards/margins": 0.189275324344635, "rewards/rejected": -0.11758189648389816, "step": 124 }, { "epoch": 0.10949380838583532, "grad_norm": 214.45226686525862, "learning_rate": 1.9765357966059635e-07, "logits/chosen": 0.3300391137599945, "logits/rejected": 0.3301427662372589, "logps/chosen": -495.9730224609375, "logps/rejected": -525.8510131835938, "loss": 0.6301, "nll_loss": 0.4744013547897339, "rewards/accuracies": 0.6875, "rewards/chosen": 0.016232872381806374, "rewards/margins": 0.2226220965385437, "rewards/rejected": -0.20638923346996307, "step": 126 }, { "epoch": 0.11123180534434064, "grad_norm": 196.16284112503507, "learning_rate": 1.975289767018786e-07, "logits/chosen": 0.383301705121994, "logits/rejected": 0.3108516037464142, "logps/chosen": -477.2252502441406, "logps/rejected": -413.8544921875, "loss": 0.6543, "nll_loss": 0.4823342263698578, "rewards/accuracies": 0.5625, "rewards/chosen": 0.07313746213912964, "rewards/margins": -0.002835087478160858, "rewards/rejected": 0.0759725570678711, "step": 128 }, { "epoch": 0.11296980230284596, "grad_norm": 172.7848965773789, "learning_rate": 1.9740119169423335e-07, "logits/chosen": 0.41243690252304077, "logits/rejected": 0.500731885433197, "logps/chosen": -560.973876953125, "logps/rejected": -556.9948120117188, "loss": 0.6745, "nll_loss": 0.537980854511261, "rewards/accuracies": 0.8125, "rewards/chosen": 0.018784141167998314, "rewards/margins": 0.18619613349437714, "rewards/rejected": -0.16741199791431427, "step": 130 }, { "epoch": 0.11470779926135129, "grad_norm": 107.09619636786023, "learning_rate": 1.972702288068641e-07, "logits/chosen": 0.5883492827415466, "logits/rejected": 0.6014207601547241, "logps/chosen": -492.56719970703125, "logps/rejected": -571.8795166015625, "loss": 0.6591, "nll_loss": 0.4626290500164032, "rewards/accuracies": 0.75, "rewards/chosen": 0.03946494683623314, "rewards/margins": 0.3372575640678406, "rewards/rejected": -0.29779261350631714, "step": 132 }, { "epoch": 0.11644579621985661, "grad_norm": 450.5904425564273, "learning_rate": 1.9713609231265803e-07, "logits/chosen": 0.2925584614276886, "logits/rejected": 0.2094520628452301, "logps/chosen": -466.8580322265625, "logps/rejected": -448.7713928222656, "loss": 0.7074, "nll_loss": 0.4871719181537628, "rewards/accuracies": 0.625, "rewards/chosen": 0.10256430506706238, "rewards/margins": -0.13159841299057007, "rewards/rejected": 0.23416268825531006, "step": 134 }, { "epoch": 0.11818379317836193, "grad_norm": 230.5018212403494, "learning_rate": 1.969987865880467e-07, "logits/chosen": 0.471775084733963, "logits/rejected": 0.591442883014679, "logps/chosen": -499.01312255859375, "logps/rejected": -505.15234375, "loss": 0.6955, "nll_loss": 0.4720504581928253, "rewards/accuracies": 0.375, "rewards/chosen": -0.08871403336524963, "rewards/margins": -0.016466330736875534, "rewards/rejected": -0.0722476989030838, "step": 136 }, { "epoch": 0.11992179013686725, "grad_norm": 140.2346299082145, "learning_rate": 1.968583161128631e-07, "logits/chosen": 0.3392627239227295, "logits/rejected": 0.4022282660007477, "logps/chosen": -459.1971740722656, "logps/rejected": -466.5972595214844, "loss": 0.6409, "nll_loss": 0.4637642502784729, "rewards/accuracies": 0.5, "rewards/chosen": 0.10886505246162415, "rewards/margins": 0.055580608546733856, "rewards/rejected": 0.05328445881605148, "step": 138 }, { "epoch": 0.12165978709537258, "grad_norm": 231.29684861071243, "learning_rate": 1.967146854701957e-07, "logits/chosen": 0.5594834685325623, "logits/rejected": 0.39609870314598083, "logps/chosen": -528.64453125, "logps/rejected": -495.0397033691406, "loss": 0.7025, "nll_loss": 0.49201473593711853, "rewards/accuracies": 0.5, "rewards/chosen": 0.022034645080566406, "rewards/margins": 0.07131557166576385, "rewards/rejected": -0.04928094893693924, "step": 140 }, { "epoch": 0.12339778405387791, "grad_norm": 149.25121625290038, "learning_rate": 1.965678993462388e-07, "logits/chosen": 0.3449370265007019, "logits/rejected": 0.48329412937164307, "logps/chosen": -487.4235534667969, "logps/rejected": -518.1926879882812, "loss": 0.6558, "nll_loss": 0.4604225754737854, "rewards/accuracies": 0.5625, "rewards/chosen": -0.023956965655088425, "rewards/margins": 0.1537911593914032, "rewards/rejected": -0.17774812877178192, "step": 142 }, { "epoch": 0.12513578101238323, "grad_norm": 378.4646764352743, "learning_rate": 1.9641796253013955e-07, "logits/chosen": 0.40967145562171936, "logits/rejected": 0.7070332169532776, "logps/chosen": -441.1578063964844, "logps/rejected": -520.5462646484375, "loss": 0.6765, "nll_loss": 0.4579017758369446, "rewards/accuracies": 0.75, "rewards/chosen": 0.0898236334323883, "rewards/margins": 0.07469739019870758, "rewards/rejected": 0.015126226469874382, "step": 144 }, { "epoch": 0.12687377797088856, "grad_norm": 173.82766602111045, "learning_rate": 1.9626487991384193e-07, "logits/chosen": 0.7006528377532959, "logits/rejected": 0.6683908700942993, "logps/chosen": -466.95941162109375, "logps/rejected": -433.3965759277344, "loss": 0.6957, "nll_loss": 0.4601879417896271, "rewards/accuracies": 0.625, "rewards/chosen": 0.00033187679946422577, "rewards/margins": 0.07793959975242615, "rewards/rejected": -0.07760772109031677, "step": 146 }, { "epoch": 0.12861177492939388, "grad_norm": 186.16920415747254, "learning_rate": 1.9610865649192693e-07, "logits/chosen": 0.5055305361747742, "logits/rejected": 0.34722912311553955, "logps/chosen": -526.1424560546875, "logps/rejected": -392.083740234375, "loss": 0.7046, "nll_loss": 0.4694558382034302, "rewards/accuracies": 0.5625, "rewards/chosen": -0.25952208042144775, "rewards/margins": -0.08689786493778229, "rewards/rejected": -0.17262420058250427, "step": 148 }, { "epoch": 0.1303497718878992, "grad_norm": 140.55582904451313, "learning_rate": 1.9594929736144973e-07, "logits/chosen": 0.6043753623962402, "logits/rejected": 0.5178260803222656, "logps/chosen": -484.9609375, "logps/rejected": -496.45562744140625, "loss": 0.6977, "nll_loss": 0.4659056067466736, "rewards/accuracies": 0.5625, "rewards/chosen": -0.002574533224105835, "rewards/margins": 0.12758272886276245, "rewards/rejected": -0.13015729188919067, "step": 150 }, { "epoch": 0.13208776884640452, "grad_norm": 144.45587301387354, "learning_rate": 1.9578680772177326e-07, "logits/chosen": 0.6586911082267761, "logits/rejected": 0.6202735304832458, "logps/chosen": -486.1122741699219, "logps/rejected": -505.4885559082031, "loss": 0.6498, "nll_loss": 0.45736804604530334, "rewards/accuracies": 0.4375, "rewards/chosen": -0.034781839698553085, "rewards/margins": -0.023356247693300247, "rewards/rejected": -0.011425594799220562, "step": 152 }, { "epoch": 0.13382576580490985, "grad_norm": 168.4370224409038, "learning_rate": 1.956211928743987e-07, "logits/chosen": 0.3550521731376648, "logits/rejected": 0.37308141589164734, "logps/chosen": -452.2578430175781, "logps/rejected": -458.32623291015625, "loss": 0.7033, "nll_loss": 0.44661641120910645, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0856170654296875, "rewards/margins": 0.16778725385665894, "rewards/rejected": -0.08217020332813263, "step": 154 }, { "epoch": 0.13556376276341517, "grad_norm": 141.91361126736982, "learning_rate": 1.9545245822279242e-07, "logits/chosen": 0.6770870685577393, "logits/rejected": 0.6404905915260315, "logps/chosen": -462.7332763671875, "logps/rejected": -408.6764831542969, "loss": 0.6552, "nll_loss": 0.4415394067764282, "rewards/accuracies": 0.625, "rewards/chosen": -0.07933865487575531, "rewards/margins": 0.038278963416814804, "rewards/rejected": -0.11761761456727982, "step": 156 }, { "epoch": 0.1373017597219205, "grad_norm": 230.74705101230725, "learning_rate": 1.9528060927220979e-07, "logits/chosen": 0.5159198045730591, "logits/rejected": 0.6130585074424744, "logps/chosen": -481.01568603515625, "logps/rejected": -521.5822143554688, "loss": 0.6778, "nll_loss": 0.47335052490234375, "rewards/accuracies": 0.4375, "rewards/chosen": -0.14715270698070526, "rewards/margins": 0.11525917053222656, "rewards/rejected": -0.262411892414093, "step": 158 }, { "epoch": 0.1390397566804258, "grad_norm": 202.01845557198166, "learning_rate": 1.9510565162951537e-07, "logits/chosen": 0.7327361702919006, "logits/rejected": 0.7527709603309631, "logps/chosen": -461.3598937988281, "logps/rejected": -520.5428466796875, "loss": 0.6427, "nll_loss": 0.4450530409812927, "rewards/accuracies": 0.4375, "rewards/chosen": 0.05635242164134979, "rewards/margins": 0.10485029220581055, "rewards/rejected": -0.048497870564460754, "step": 160 }, { "epoch": 0.14077775363893114, "grad_norm": 195.27141708111617, "learning_rate": 1.9492759100300015e-07, "logits/chosen": 0.35761919617652893, "logits/rejected": 0.39191699028015137, "logps/chosen": -457.64892578125, "logps/rejected": -505.98175048828125, "loss": 0.6494, "nll_loss": 0.4446874260902405, "rewards/accuracies": 0.4375, "rewards/chosen": -0.0755319595336914, "rewards/margins": 0.2582038938999176, "rewards/rejected": -0.3337358832359314, "step": 162 }, { "epoch": 0.14251575059743646, "grad_norm": 156.30006812274797, "learning_rate": 1.947464332021953e-07, "logits/chosen": 0.4818640947341919, "logits/rejected": 0.49132969975471497, "logps/chosen": -516.2056274414062, "logps/rejected": -585.5494995117188, "loss": 0.6678, "nll_loss": 0.46864089369773865, "rewards/accuracies": 0.625, "rewards/chosen": 0.0640132874250412, "rewards/margins": 0.2289627343416214, "rewards/rejected": -0.1649494171142578, "step": 164 }, { "epoch": 0.14425374755594178, "grad_norm": 240.13788888052562, "learning_rate": 1.9456218413768248e-07, "logits/chosen": 0.3798179030418396, "logits/rejected": 0.5492743253707886, "logps/chosen": -508.6365051269531, "logps/rejected": -582.9490966796875, "loss": 0.6732, "nll_loss": 0.4977247714996338, "rewards/accuracies": 0.5625, "rewards/chosen": -0.009553920477628708, "rewards/margins": 0.3013032376766205, "rewards/rejected": -0.3108571171760559, "step": 166 }, { "epoch": 0.1459917445144471, "grad_norm": 204.66513708394191, "learning_rate": 1.9437484982090119e-07, "logits/chosen": 0.22984851896762848, "logits/rejected": 0.21164825558662415, "logps/chosen": -551.8775024414062, "logps/rejected": -499.84112548828125, "loss": 0.6811, "nll_loss": 0.5149954557418823, "rewards/accuracies": 0.75, "rewards/chosen": 0.07367496192455292, "rewards/margins": 0.2555732727050781, "rewards/rejected": -0.1818983256816864, "step": 168 }, { "epoch": 0.14772974147295242, "grad_norm": 207.99963697618364, "learning_rate": 1.941844363639525e-07, "logits/chosen": 0.26835981011390686, "logits/rejected": 0.3549097776412964, "logps/chosen": -486.7258605957031, "logps/rejected": -511.31573486328125, "loss": 0.6888, "nll_loss": 0.48714739084243774, "rewards/accuracies": 0.5, "rewards/chosen": 0.2133750021457672, "rewards/margins": -0.16475935280323029, "rewards/rejected": 0.3781343698501587, "step": 170 }, { "epoch": 0.14946773843145775, "grad_norm": 196.09540219875737, "learning_rate": 1.9399094997939956e-07, "logits/chosen": 0.6336839199066162, "logits/rejected": 0.5925776958465576, "logps/chosen": -450.0120544433594, "logps/rejected": -426.1235046386719, "loss": 0.6385, "nll_loss": 0.41613397002220154, "rewards/accuracies": 0.625, "rewards/chosen": 0.3170146942138672, "rewards/margins": 0.191303551197052, "rewards/rejected": 0.12571117281913757, "step": 172 }, { "epoch": 0.15120573538996307, "grad_norm": 240.86095927305658, "learning_rate": 1.937943969800652e-07, "logits/chosen": 0.2430100440979004, "logits/rejected": 0.46853843331336975, "logps/chosen": -448.8415222167969, "logps/rejected": -533.2725219726562, "loss": 0.6814, "nll_loss": 0.4187857210636139, "rewards/accuracies": 0.6875, "rewards/chosen": 0.17961281538009644, "rewards/margins": 0.09986276179552078, "rewards/rejected": 0.07975006848573685, "step": 174 }, { "epoch": 0.1529437323484684, "grad_norm": 166.34961492388385, "learning_rate": 1.9359478377882566e-07, "logits/chosen": 0.6643735766410828, "logits/rejected": 0.6006796360015869, "logps/chosen": -500.4281311035156, "logps/rejected": -508.8857421875, "loss": 0.6521, "nll_loss": 0.46919527649879456, "rewards/accuracies": 0.6875, "rewards/chosen": 0.2530907392501831, "rewards/margins": 0.1539958119392395, "rewards/rejected": 0.09909495711326599, "step": 176 }, { "epoch": 0.15468172930697371, "grad_norm": 157.1484708841, "learning_rate": 1.9339211688840155e-07, "logits/chosen": 0.48778361082077026, "logits/rejected": 0.5057650804519653, "logps/chosen": -546.6986694335938, "logps/rejected": -591.006591796875, "loss": 0.6418, "nll_loss": 0.5191956162452698, "rewards/accuracies": 0.5625, "rewards/chosen": 0.2293836623430252, "rewards/margins": 0.06616592407226562, "rewards/rejected": 0.16321773827075958, "step": 178 }, { "epoch": 0.15641972626547904, "grad_norm": 179.7526837035682, "learning_rate": 1.9318640292114523e-07, "logits/chosen": 0.14600294828414917, "logits/rejected": 0.18026244640350342, "logps/chosen": -515.8131103515625, "logps/rejected": -492.68585205078125, "loss": 0.6491, "nll_loss": 0.49840864539146423, "rewards/accuracies": 0.75, "rewards/chosen": 0.3661031723022461, "rewards/margins": 0.4645713269710541, "rewards/rejected": -0.09846819937229156, "step": 180 }, { "epoch": 0.15815772322398436, "grad_norm": 156.21169756335783, "learning_rate": 1.9297764858882513e-07, "logits/chosen": 0.2845636308193207, "logits/rejected": 0.3478243052959442, "logps/chosen": -433.43304443359375, "logps/rejected": -502.88873291015625, "loss": 0.7059, "nll_loss": 0.4498353600502014, "rewards/accuracies": 0.5625, "rewards/chosen": 0.032468315213918686, "rewards/margins": 0.11264065653085709, "rewards/rejected": -0.0801723524928093, "step": 182 }, { "epoch": 0.15989572018248968, "grad_norm": 114.38837551366716, "learning_rate": 1.9276586070240682e-07, "logits/chosen": 0.5281504988670349, "logits/rejected": 0.5060007572174072, "logps/chosen": -533.6888427734375, "logps/rejected": -484.2074279785156, "loss": 0.6467, "nll_loss": 0.5050809383392334, "rewards/accuracies": 0.4375, "rewards/chosen": 0.07081624865531921, "rewards/margins": -0.019865036010742188, "rewards/rejected": 0.0906812772154808, "step": 184 }, { "epoch": 0.161633717140995, "grad_norm": 190.3601062924143, "learning_rate": 1.9255104617183066e-07, "logits/chosen": 0.42118650674819946, "logits/rejected": 0.551128089427948, "logps/chosen": -447.9131774902344, "logps/rejected": -466.6624755859375, "loss": 0.6818, "nll_loss": 0.42739805579185486, "rewards/accuracies": 0.625, "rewards/chosen": 0.13303765654563904, "rewards/margins": 0.11362534016370773, "rewards/rejected": 0.019412323832511902, "step": 186 }, { "epoch": 0.16337171409950033, "grad_norm": 174.81457999255366, "learning_rate": 1.9233321200578657e-07, "logits/chosen": 0.6645621061325073, "logits/rejected": 0.7099349498748779, "logps/chosen": -479.3392028808594, "logps/rejected": -501.9243469238281, "loss": 0.7046, "nll_loss": 0.45945340394973755, "rewards/accuracies": 0.5625, "rewards/chosen": 0.11817818135023117, "rewards/margins": 0.06987819075584412, "rewards/rejected": 0.04829998314380646, "step": 188 }, { "epoch": 0.16510971105800565, "grad_norm": 214.65093516136065, "learning_rate": 1.92112365311485e-07, "logits/chosen": 0.488328754901886, "logits/rejected": 0.4507230818271637, "logps/chosen": -542.5758056640625, "logps/rejected": -518.2319946289062, "loss": 0.6863, "nll_loss": 0.5090084075927734, "rewards/accuracies": 0.5, "rewards/chosen": -0.04686184227466583, "rewards/margins": -0.026256389915943146, "rewards/rejected": -0.02060546912252903, "step": 190 }, { "epoch": 0.16684770801651097, "grad_norm": 175.7065914966413, "learning_rate": 1.9188851329442546e-07, "logits/chosen": 0.3762364089488983, "logits/rejected": 0.42694732546806335, "logps/chosen": -578.720703125, "logps/rejected": -550.4427490234375, "loss": 0.7033, "nll_loss": 0.5272043943405151, "rewards/accuracies": 0.5625, "rewards/chosen": 0.031178677454590797, "rewards/margins": 0.00896589457988739, "rewards/rejected": 0.02221280336380005, "step": 192 }, { "epoch": 0.1685857049750163, "grad_norm": 243.86672474625263, "learning_rate": 1.9166166325816117e-07, "logits/chosen": 0.43783852458000183, "logits/rejected": 0.41702184081077576, "logps/chosen": -455.8414001464844, "logps/rejected": -523.8115234375, "loss": 0.6369, "nll_loss": 0.45409929752349854, "rewards/accuracies": 0.4375, "rewards/chosen": 0.2518928647041321, "rewards/margins": 0.3256605863571167, "rewards/rejected": -0.07376771420240402, "step": 194 }, { "epoch": 0.17032370193352162, "grad_norm": 169.63122858450672, "learning_rate": 1.9143182260406076e-07, "logits/chosen": 0.32926619052886963, "logits/rejected": 0.3965323567390442, "logps/chosen": -518.8862915039062, "logps/rejected": -475.6866760253906, "loss": 0.6773, "nll_loss": 0.49594274163246155, "rewards/accuracies": 0.4375, "rewards/chosen": 0.010239500552415848, "rewards/margins": -0.037561044096946716, "rewards/rejected": 0.04780054837465286, "step": 196 }, { "epoch": 0.17206169889202694, "grad_norm": 239.31635495177088, "learning_rate": 1.91198998831067e-07, "logits/chosen": 0.2399873584508896, "logits/rejected": 0.3778996467590332, "logps/chosen": -429.89898681640625, "logps/rejected": -490.7498779296875, "loss": 0.6738, "nll_loss": 0.4365084767341614, "rewards/accuracies": 0.5625, "rewards/chosen": 0.49901849031448364, "rewards/margins": 0.2948850393295288, "rewards/rejected": 0.20413342118263245, "step": 198 }, { "epoch": 0.17379969585053226, "grad_norm": 442.23377481682337, "learning_rate": 1.9096319953545185e-07, "logits/chosen": 0.2612355947494507, "logits/rejected": 0.21945816278457642, "logps/chosen": -421.193359375, "logps/rejected": -504.9490966796875, "loss": 0.6253, "nll_loss": 0.41177114844322205, "rewards/accuracies": 0.6875, "rewards/chosen": 0.4579131007194519, "rewards/margins": 0.3565528094768524, "rewards/rejected": 0.10136031359434128, "step": 200 }, { "epoch": 0.17553769280903758, "grad_norm": 183.97075955828043, "learning_rate": 1.9072443241056882e-07, "logits/chosen": 0.37889376282691956, "logits/rejected": 0.3554326593875885, "logps/chosen": -549.0498657226562, "logps/rejected": -564.4247436523438, "loss": 0.6834, "nll_loss": 0.5282487869262695, "rewards/accuracies": 0.5625, "rewards/chosen": 0.3155590295791626, "rewards/margins": 0.1674705594778061, "rewards/rejected": 0.1480884701013565, "step": 202 }, { "epoch": 0.1772756897675429, "grad_norm": 212.7149629473822, "learning_rate": 1.9048270524660196e-07, "logits/chosen": 0.4008958041667938, "logits/rejected": 0.5223476886749268, "logps/chosen": -499.8232116699219, "logps/rejected": -517.2408447265625, "loss": 0.6574, "nll_loss": 0.4931824207305908, "rewards/accuracies": 0.5625, "rewards/chosen": 0.2734750509262085, "rewards/margins": -0.03305815905332565, "rewards/rejected": 0.3065332770347595, "step": 204 }, { "epoch": 0.17901368672604823, "grad_norm": 123.98091350420249, "learning_rate": 1.9023802593031153e-07, "logits/chosen": 0.678055465221405, "logits/rejected": 0.49451562762260437, "logps/chosen": -457.2571716308594, "logps/rejected": -453.9916076660156, "loss": 0.6521, "nll_loss": 0.45098334550857544, "rewards/accuracies": 0.625, "rewards/chosen": 0.20749235153198242, "rewards/margins": 0.07694320380687714, "rewards/rejected": 0.13054914772510529, "step": 206 }, { "epoch": 0.18075168368455355, "grad_norm": 122.84485776556372, "learning_rate": 1.899904024447769e-07, "logits/chosen": 0.4941789507865906, "logits/rejected": 0.4557379186153412, "logps/chosen": -494.46221923828125, "logps/rejected": -435.7831115722656, "loss": 0.5938, "nll_loss": 0.4718150496482849, "rewards/accuracies": 0.6875, "rewards/chosen": 0.15648937225341797, "rewards/margins": 0.09144438803195953, "rewards/rejected": 0.06504497677087784, "step": 208 }, { "epoch": 0.18248968064305887, "grad_norm": 171.76865319699047, "learning_rate": 1.8973984286913583e-07, "logits/chosen": 0.3957821726799011, "logits/rejected": 0.45247000455856323, "logps/chosen": -439.8001708984375, "logps/rejected": -512.7393798828125, "loss": 0.6688, "nll_loss": 0.4396136999130249, "rewards/accuracies": 0.6875, "rewards/chosen": 0.4999141991138458, "rewards/margins": 0.47802239656448364, "rewards/rejected": 0.02189178764820099, "step": 210 }, { "epoch": 0.1842276776015642, "grad_norm": 160.6284900707324, "learning_rate": 1.8948635537832118e-07, "logits/chosen": 0.8007611036300659, "logits/rejected": 0.8054407835006714, "logps/chosen": -540.0386352539062, "logps/rejected": -521.6272583007812, "loss": 0.6516, "nll_loss": 0.5008341073989868, "rewards/accuracies": 0.625, "rewards/chosen": 0.3313617706298828, "rewards/margins": 0.06706114113330841, "rewards/rejected": 0.2643006443977356, "step": 212 }, { "epoch": 0.18596567456006952, "grad_norm": 274.1780536054067, "learning_rate": 1.8922994824279393e-07, "logits/chosen": 0.4809880256652832, "logits/rejected": 0.5513718128204346, "logps/chosen": -487.8301696777344, "logps/rejected": -520.1082763671875, "loss": 0.6354, "nll_loss": 0.47460833191871643, "rewards/accuracies": 0.5, "rewards/chosen": 0.3392031490802765, "rewards/margins": 0.3161538243293762, "rewards/rejected": 0.023049363866448402, "step": 214 }, { "epoch": 0.18770367151857484, "grad_norm": 231.93646441377587, "learning_rate": 1.8897062982827343e-07, "logits/chosen": 0.3585096001625061, "logits/rejected": 0.5087136030197144, "logps/chosen": -442.2716064453125, "logps/rejected": -525.7360229492188, "loss": 0.652, "nll_loss": 0.4545535743236542, "rewards/accuracies": 0.6875, "rewards/chosen": 0.47251102328300476, "rewards/margins": 0.3076295256614685, "rewards/rejected": 0.16488152742385864, "step": 216 }, { "epoch": 0.18944166847708016, "grad_norm": 149.52722877002068, "learning_rate": 1.8870840859546453e-07, "logits/chosen": 0.5101938247680664, "logits/rejected": 0.45677730441093445, "logps/chosen": -560.6141967773438, "logps/rejected": -477.9551696777344, "loss": 0.6815, "nll_loss": 0.4999621510505676, "rewards/accuracies": 0.5, "rewards/chosen": 0.3215439021587372, "rewards/margins": 0.1319688856601715, "rewards/rejected": 0.18957501649856567, "step": 218 }, { "epoch": 0.19117966543558548, "grad_norm": 153.20737081906722, "learning_rate": 1.8844329309978143e-07, "logits/chosen": 0.24976252019405365, "logits/rejected": 0.14467594027519226, "logps/chosen": -422.8174743652344, "logps/rejected": -497.1708984375, "loss": 0.6218, "nll_loss": 0.42366060614585876, "rewards/accuracies": 0.6875, "rewards/chosen": 0.4763959050178528, "rewards/margins": 0.3291512727737427, "rewards/rejected": 0.1472446471452713, "step": 220 }, { "epoch": 0.1929176623940908, "grad_norm": 312.09631072055066, "learning_rate": 1.8817529199106857e-07, "logits/chosen": 0.4518931806087494, "logits/rejected": 0.4724608063697815, "logps/chosen": -480.308837890625, "logps/rejected": -486.3529968261719, "loss": 0.6416, "nll_loss": 0.4695611000061035, "rewards/accuracies": 0.75, "rewards/chosen": 0.3075574040412903, "rewards/margins": 0.2838830053806305, "rewards/rejected": 0.023674391210079193, "step": 222 }, { "epoch": 0.19465565935259613, "grad_norm": 291.3739480103135, "learning_rate": 1.8790441401331843e-07, "logits/chosen": 0.42263418436050415, "logits/rejected": 0.2607260048389435, "logps/chosen": -553.7242431640625, "logps/rejected": -509.631103515625, "loss": 0.616, "nll_loss": 0.5167679786682129, "rewards/accuracies": 0.75, "rewards/chosen": 0.40777724981307983, "rewards/margins": 0.2215692400932312, "rewards/rejected": 0.18620796501636505, "step": 224 }, { "epoch": 0.19639365631110145, "grad_norm": 135.6469032071128, "learning_rate": 1.8763066800438634e-07, "logits/chosen": 0.11122694611549377, "logits/rejected": 0.23442375659942627, "logps/chosen": -451.4000244140625, "logps/rejected": -476.5044860839844, "loss": 0.6749, "nll_loss": 0.4101489782333374, "rewards/accuracies": 0.75, "rewards/chosen": 0.4101632833480835, "rewards/margins": 0.17843975126743317, "rewards/rejected": 0.23172350227832794, "step": 226 }, { "epoch": 0.19813165326960677, "grad_norm": 233.63954629459604, "learning_rate": 1.873540628957019e-07, "logits/chosen": 0.3454923629760742, "logits/rejected": 0.31212350726127625, "logps/chosen": -476.87188720703125, "logps/rejected": -442.26727294921875, "loss": 0.6688, "nll_loss": 0.46371322870254517, "rewards/accuracies": 0.625, "rewards/chosen": 0.2975081205368042, "rewards/margins": 0.16047295928001404, "rewards/rejected": 0.13703517615795135, "step": 228 }, { "epoch": 0.1998696502281121, "grad_norm": 175.03944828947417, "learning_rate": 1.8707460771197773e-07, "logits/chosen": 0.5401207208633423, "logits/rejected": 0.46543338894844055, "logps/chosen": -408.17254638671875, "logps/rejected": -425.7143859863281, "loss": 0.653, "nll_loss": 0.3989378809928894, "rewards/accuracies": 0.625, "rewards/chosen": 0.22502900660037994, "rewards/margins": 0.1482989490032196, "rewards/rejected": 0.07673005759716034, "step": 230 }, { "epoch": 0.20160764718661742, "grad_norm": 163.9379882296477, "learning_rate": 1.8679231157091504e-07, "logits/chosen": 0.276883989572525, "logits/rejected": 0.21855014562606812, "logps/chosen": -479.3912353515625, "logps/rejected": -444.1050109863281, "loss": 0.7128, "nll_loss": 0.46482211351394653, "rewards/accuracies": 0.4375, "rewards/chosen": 0.21144676208496094, "rewards/margins": 0.10382099449634552, "rewards/rejected": 0.10762576013803482, "step": 232 }, { "epoch": 0.20334564414512274, "grad_norm": 148.0762245669831, "learning_rate": 1.865071836829061e-07, "logits/chosen": 0.5332834124565125, "logits/rejected": 0.6724749207496643, "logps/chosen": -514.0674438476562, "logps/rejected": -577.0899658203125, "loss": 0.6766, "nll_loss": 0.513380229473114, "rewards/accuracies": 0.375, "rewards/chosen": -0.020297817885875702, "rewards/margins": 0.03417910635471344, "rewards/rejected": -0.05447692424058914, "step": 234 }, { "epoch": 0.20508364110362806, "grad_norm": 321.381717812454, "learning_rate": 1.8621923335073374e-07, "logits/chosen": 0.41222482919692993, "logits/rejected": 0.49463099241256714, "logps/chosen": -458.3787841796875, "logps/rejected": -498.8921203613281, "loss": 0.6443, "nll_loss": 0.4330087900161743, "rewards/accuracies": 0.6875, "rewards/chosen": 0.3461689054965973, "rewards/margins": 0.3577164113521576, "rewards/rejected": -0.01154746487736702, "step": 236 }, { "epoch": 0.20682163806213338, "grad_norm": 142.48431563998605, "learning_rate": 1.859284699692679e-07, "logits/chosen": 0.31183576583862305, "logits/rejected": 0.3098088204860687, "logps/chosen": -494.8736877441406, "logps/rejected": -479.8580322265625, "loss": 0.6803, "nll_loss": 0.44527143239974976, "rewards/accuracies": 0.6875, "rewards/chosen": 0.39485302567481995, "rewards/margins": 0.4110240936279297, "rewards/rejected": -0.01617107354104519, "step": 238 }, { "epoch": 0.2085596350206387, "grad_norm": 180.45494632724726, "learning_rate": 1.856349030251589e-07, "logits/chosen": 0.352733314037323, "logits/rejected": 0.4097552001476288, "logps/chosen": -500.9227600097656, "logps/rejected": -464.92645263671875, "loss": 0.624, "nll_loss": 0.47565579414367676, "rewards/accuracies": 0.3125, "rewards/chosen": -0.10063820332288742, "rewards/margins": -0.12395801395177841, "rewards/rejected": 0.023319821804761887, "step": 240 }, { "epoch": 0.21029763197914403, "grad_norm": 254.73609406952772, "learning_rate": 1.8533854209652816e-07, "logits/chosen": 0.5989456176757812, "logits/rejected": 0.6028575897216797, "logps/chosen": -549.3515014648438, "logps/rejected": -577.7095336914062, "loss": 0.6406, "nll_loss": 0.5223766565322876, "rewards/accuracies": 0.5625, "rewards/chosen": -0.07228565216064453, "rewards/margins": 0.14688776433467865, "rewards/rejected": -0.21917343139648438, "step": 242 }, { "epoch": 0.21203562893764935, "grad_norm": 149.56967633528575, "learning_rate": 1.8503939685265566e-07, "logits/chosen": 0.508268415927887, "logits/rejected": 0.5876305103302002, "logps/chosen": -490.4624328613281, "logps/rejected": -507.3514099121094, "loss": 0.6362, "nll_loss": 0.46683698892593384, "rewards/accuracies": 0.5, "rewards/chosen": 0.1168375164270401, "rewards/margins": 0.04881094768643379, "rewards/rejected": 0.06802655756473541, "step": 244 }, { "epoch": 0.21377362589615467, "grad_norm": 170.4776083017974, "learning_rate": 1.8473747705366425e-07, "logits/chosen": 0.5282714366912842, "logits/rejected": 0.4593840539455414, "logps/chosen": -501.4341125488281, "logps/rejected": -465.8924865722656, "loss": 0.6624, "nll_loss": 0.4600476324558258, "rewards/accuracies": 0.5625, "rewards/chosen": -0.015180783346295357, "rewards/margins": 0.0027478188276290894, "rewards/rejected": -0.017928607761859894, "step": 246 }, { "epoch": 0.21551162285466, "grad_norm": 219.96275433334395, "learning_rate": 1.844327925502015e-07, "logits/chosen": 0.5558596253395081, "logits/rejected": 0.49703449010849, "logps/chosen": -519.3560180664062, "logps/rejected": -495.0567626953125, "loss": 0.6754, "nll_loss": 0.5084698796272278, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0020979903638362885, "rewards/margins": -0.023166075348854065, "rewards/rejected": 0.025264078751206398, "step": 248 }, { "epoch": 0.21724961981316532, "grad_norm": 147.08443939240553, "learning_rate": 1.8412535328311812e-07, "logits/chosen": 0.3448954224586487, "logits/rejected": 0.5552514791488647, "logps/chosen": -468.8410339355469, "logps/rejected": -556.8023681640625, "loss": 0.6558, "nll_loss": 0.49719932675361633, "rewards/accuracies": 0.6875, "rewards/chosen": 0.1901819258928299, "rewards/margins": 0.39644375443458557, "rewards/rejected": -0.20626182854175568, "step": 250 }, { "epoch": 0.21898761677167064, "grad_norm": 114.32403562795194, "learning_rate": 1.8381516928314365e-07, "logits/chosen": 0.40455546975135803, "logits/rejected": 0.4251713156700134, "logps/chosen": -513.6260986328125, "logps/rejected": -538.343505859375, "loss": 0.6049, "nll_loss": 0.4971601963043213, "rewards/accuracies": 0.625, "rewards/chosen": 0.06984252482652664, "rewards/margins": 0.26666852831840515, "rewards/rejected": -0.19682598114013672, "step": 252 }, { "epoch": 0.22072561373017596, "grad_norm": 205.23489490998628, "learning_rate": 1.8350225067055925e-07, "logits/chosen": 0.3792281150817871, "logits/rejected": 0.4229609966278076, "logps/chosen": -504.125732421875, "logps/rejected": -526.7474975585938, "loss": 0.6086, "nll_loss": 0.4739050269126892, "rewards/accuracies": 0.625, "rewards/chosen": 0.09776955097913742, "rewards/margins": 0.28990498185157776, "rewards/rejected": -0.19213542342185974, "step": 254 }, { "epoch": 0.22246361068868128, "grad_norm": 188.55359487032592, "learning_rate": 1.8318660765486747e-07, "logits/chosen": 0.545383095741272, "logits/rejected": 0.46527981758117676, "logps/chosen": -475.6557312011719, "logps/rejected": -476.0321350097656, "loss": 0.6352, "nll_loss": 0.47079068422317505, "rewards/accuracies": 0.5625, "rewards/chosen": 0.13502216339111328, "rewards/margins": 0.1221860870718956, "rewards/rejected": 0.012836072593927383, "step": 256 }, { "epoch": 0.2242016076471866, "grad_norm": 148.2343414448246, "learning_rate": 1.8286825053445916e-07, "logits/chosen": 0.34472447633743286, "logits/rejected": 0.2841368317604065, "logps/chosen": -480.28387451171875, "logps/rejected": -527.3165893554688, "loss": 0.6399, "nll_loss": 0.4767462909221649, "rewards/accuracies": 0.8125, "rewards/chosen": 0.17577162384986877, "rewards/margins": 0.5391324162483215, "rewards/rejected": -0.36336082220077515, "step": 258 }, { "epoch": 0.22593960460569193, "grad_norm": 177.1228089021279, "learning_rate": 1.8254718969627739e-07, "logits/chosen": 0.38141781091690063, "logits/rejected": 0.5402300953865051, "logps/chosen": -436.6199645996094, "logps/rejected": -474.4032287597656, "loss": 0.6239, "nll_loss": 0.4355260133743286, "rewards/accuracies": 0.5625, "rewards/chosen": 0.036749646067619324, "rewards/margins": 0.2768980860710144, "rewards/rejected": -0.24014845490455627, "step": 260 }, { "epoch": 0.22767760156419725, "grad_norm": 234.61717429643062, "learning_rate": 1.8222343561547872e-07, "logits/chosen": 0.5197017788887024, "logits/rejected": 0.43862244486808777, "logps/chosen": -532.12158203125, "logps/rejected": -473.5322265625, "loss": 0.6838, "nll_loss": 0.4841741621494293, "rewards/accuracies": 0.5625, "rewards/chosen": -0.04262780770659447, "rewards/margins": 0.08237245678901672, "rewards/rejected": -0.12500028312206268, "step": 262 }, { "epoch": 0.22941559852270257, "grad_norm": 127.58154940984313, "learning_rate": 1.8189699885509127e-07, "logits/chosen": 0.2861557900905609, "logits/rejected": 0.18688474595546722, "logps/chosen": -466.5064392089844, "logps/rejected": -457.0404052734375, "loss": 0.5996, "nll_loss": 0.4528616964817047, "rewards/accuracies": 0.625, "rewards/chosen": 0.14327269792556763, "rewards/margins": 0.2772614657878876, "rewards/rejected": -0.13398876786231995, "step": 264 }, { "epoch": 0.2311535954812079, "grad_norm": 197.96923520461218, "learning_rate": 1.8156789006567017e-07, "logits/chosen": 0.4333341121673584, "logits/rejected": 0.34512364864349365, "logps/chosen": -526.6718139648438, "logps/rejected": -497.8265380859375, "loss": 0.6638, "nll_loss": 0.5166666507720947, "rewards/accuracies": 0.6875, "rewards/chosen": 0.2582641839981079, "rewards/margins": 0.13742609322071075, "rewards/rejected": 0.12083806842565536, "step": 266 }, { "epoch": 0.23289159243971322, "grad_norm": 129.3864196211325, "learning_rate": 1.8123611998495006e-07, "logits/chosen": 0.4436326026916504, "logits/rejected": 0.3769025206565857, "logps/chosen": -409.8662414550781, "logps/rejected": -483.3121032714844, "loss": 0.5929, "nll_loss": 0.40314382314682007, "rewards/accuracies": 0.625, "rewards/chosen": 0.2736284136772156, "rewards/margins": 0.7211030721664429, "rewards/rejected": -0.4474746882915497, "step": 268 }, { "epoch": 0.23462958939821854, "grad_norm": 153.36158996693672, "learning_rate": 1.8090169943749475e-07, "logits/chosen": 0.37940219044685364, "logits/rejected": 0.47512713074684143, "logps/chosen": -486.993408203125, "logps/rejected": -486.84765625, "loss": 0.6207, "nll_loss": 0.443195104598999, "rewards/accuracies": 0.625, "rewards/chosen": 0.23140813410282135, "rewards/margins": 0.31433334946632385, "rewards/rejected": -0.0829252228140831, "step": 270 }, { "epoch": 0.23636758635672386, "grad_norm": 192.5367921969003, "learning_rate": 1.8056463933434396e-07, "logits/chosen": 0.11284930258989334, "logits/rejected": 0.253543496131897, "logps/chosen": -473.71661376953125, "logps/rejected": -574.8849487304688, "loss": 0.6702, "nll_loss": 0.4648784399032593, "rewards/accuracies": 0.5625, "rewards/chosen": 0.3114520311355591, "rewards/margins": 0.43774574995040894, "rewards/rejected": -0.12629374861717224, "step": 272 }, { "epoch": 0.23810558331522919, "grad_norm": 194.30911480270703, "learning_rate": 1.802249506726575e-07, "logits/chosen": 0.7151396870613098, "logits/rejected": 0.6841371059417725, "logps/chosen": -485.168212890625, "logps/rejected": -480.3174133300781, "loss": 0.6563, "nll_loss": 0.46294671297073364, "rewards/accuracies": 0.6875, "rewards/chosen": 0.06923913955688477, "rewards/margins": -0.021972376853227615, "rewards/rejected": 0.09121151268482208, "step": 274 }, { "epoch": 0.2398435802737345, "grad_norm": 368.64577683366, "learning_rate": 1.7988264453535638e-07, "logits/chosen": 0.26056328415870667, "logits/rejected": 0.4656585454940796, "logps/chosen": -528.419189453125, "logps/rejected": -503.22540283203125, "loss": 0.6334, "nll_loss": 0.519210159778595, "rewards/accuracies": 0.5, "rewards/chosen": 0.10040701925754547, "rewards/margins": 0.06605224311351776, "rewards/rejected": 0.03435477986931801, "step": 276 }, { "epoch": 0.24158157723223983, "grad_norm": 213.32577873138953, "learning_rate": 1.7953773209076107e-07, "logits/chosen": 0.582006573677063, "logits/rejected": 0.4061691164970398, "logps/chosen": -549.0662841796875, "logps/rejected": -487.228759765625, "loss": 0.6662, "nll_loss": 0.5219871401786804, "rewards/accuracies": 0.6875, "rewards/chosen": 0.2095928192138672, "rewards/margins": 0.15073338150978088, "rewards/rejected": 0.0588594488799572, "step": 278 }, { "epoch": 0.24331957419074515, "grad_norm": 197.41029523602992, "learning_rate": 1.7919022459222751e-07, "logits/chosen": 0.5505284070968628, "logits/rejected": 0.5329501032829285, "logps/chosen": -446.39239501953125, "logps/rejected": -451.7146301269531, "loss": 0.629, "nll_loss": 0.44017502665519714, "rewards/accuracies": 0.75, "rewards/chosen": 0.3613952100276947, "rewards/margins": 0.33678293228149414, "rewards/rejected": 0.02461223118007183, "step": 280 }, { "epoch": 0.2450575711492505, "grad_norm": 243.5627781827318, "learning_rate": 1.788401333777794e-07, "logits/chosen": 0.6572059988975525, "logits/rejected": 0.6510287523269653, "logps/chosen": -483.048583984375, "logps/rejected": -568.91357421875, "loss": 0.6825, "nll_loss": 0.50696861743927, "rewards/accuracies": 0.625, "rewards/chosen": 0.3178718686103821, "rewards/margins": 0.37436118721961975, "rewards/rejected": -0.05648936703801155, "step": 282 }, { "epoch": 0.24679556810775582, "grad_norm": 217.6697464767649, "learning_rate": 1.784874698697388e-07, "logits/chosen": 0.054942190647125244, "logits/rejected": 0.23091381788253784, "logps/chosen": -436.7183837890625, "logps/rejected": -494.40802001953125, "loss": 0.6891, "nll_loss": 0.4455685019493103, "rewards/accuracies": 0.625, "rewards/chosen": 0.24671289324760437, "rewards/margins": 0.219809427857399, "rewards/rejected": 0.026903443038463593, "step": 284 }, { "epoch": 0.24853356506626115, "grad_norm": 136.79160723167686, "learning_rate": 1.7813224557435312e-07, "logits/chosen": 0.38629719614982605, "logits/rejected": 0.4690326750278473, "logps/chosen": -432.0810852050781, "logps/rejected": -431.36181640625, "loss": 0.6467, "nll_loss": 0.41893959045410156, "rewards/accuracies": 0.5625, "rewards/chosen": 0.2837182879447937, "rewards/margins": 0.09321805089712143, "rewards/rejected": 0.19050025939941406, "step": 286 }, { "epoch": 0.25027156202476647, "grad_norm": 222.0196942780997, "learning_rate": 1.7777447208141978e-07, "logits/chosen": 0.24580180644989014, "logits/rejected": 0.2410634458065033, "logps/chosen": -480.53411865234375, "logps/rejected": -448.2469787597656, "loss": 0.6514, "nll_loss": 0.43868565559387207, "rewards/accuracies": 0.4375, "rewards/chosen": 0.4032495319843292, "rewards/margins": 0.11636673659086227, "rewards/rejected": 0.28688275814056396, "step": 288 }, { "epoch": 0.25200955898327176, "grad_norm": 139.84237516124608, "learning_rate": 1.7741416106390824e-07, "logits/chosen": 0.3706471025943756, "logits/rejected": 0.4256122410297394, "logps/chosen": -485.7464294433594, "logps/rejected": -518.0656127929688, "loss": 0.6718, "nll_loss": 0.48721954226493835, "rewards/accuracies": 0.4375, "rewards/chosen": 0.3327837884426117, "rewards/margins": 0.07422256469726562, "rewards/rejected": 0.25856122374534607, "step": 290 }, { "epoch": 0.2537475559417771, "grad_norm": 155.130400096225, "learning_rate": 1.7705132427757892e-07, "logits/chosen": 0.10826490819454193, "logits/rejected": 0.20488005876541138, "logps/chosen": -476.7195739746094, "logps/rejected": -534.486083984375, "loss": 0.6035, "nll_loss": 0.4442020058631897, "rewards/accuracies": 0.6875, "rewards/chosen": 0.5589705109596252, "rewards/margins": 0.3643781840801239, "rewards/rejected": 0.19459228217601776, "step": 292 }, { "epoch": 0.2554855529002824, "grad_norm": 139.1832762554709, "learning_rate": 1.7668597356059976e-07, "logits/chosen": 0.45618000626564026, "logits/rejected": 0.33908048272132874, "logps/chosen": -432.85546875, "logps/rejected": -436.5083923339844, "loss": 0.6213, "nll_loss": 0.42752811312675476, "rewards/accuracies": 0.625, "rewards/chosen": 0.4356747269630432, "rewards/margins": 0.17109422385692596, "rewards/rejected": 0.26458045840263367, "step": 294 }, { "epoch": 0.25722354985878776, "grad_norm": 295.1345977616581, "learning_rate": 1.7631812083316002e-07, "logits/chosen": 0.3745659589767456, "logits/rejected": 0.19597268104553223, "logps/chosen": -512.7102661132812, "logps/rejected": -494.8677673339844, "loss": 0.5835, "nll_loss": 0.5069454312324524, "rewards/accuracies": 0.75, "rewards/chosen": 0.3890928030014038, "rewards/margins": 0.4358100891113281, "rewards/rejected": -0.046717267483472824, "step": 296 }, { "epoch": 0.25896154681729305, "grad_norm": 274.1732165312349, "learning_rate": 1.7594777809708125e-07, "logits/chosen": 0.7644711136817932, "logits/rejected": 0.5483787059783936, "logps/chosen": -485.9484558105469, "logps/rejected": -411.6966857910156, "loss": 0.6717, "nll_loss": 0.46617794036865234, "rewards/accuracies": 0.4375, "rewards/chosen": 0.3494039475917816, "rewards/margins": 0.08869829773902893, "rewards/rejected": 0.2607056796550751, "step": 298 }, { "epoch": 0.2606995437757984, "grad_norm": 144.10098828420033, "learning_rate": 1.7557495743542582e-07, "logits/chosen": 0.38872650265693665, "logits/rejected": 0.40973731875419617, "logps/chosen": -418.7256774902344, "logps/rejected": -484.5044860839844, "loss": 0.6249, "nll_loss": 0.4207072854042053, "rewards/accuracies": 0.6875, "rewards/chosen": 0.38238725066185, "rewards/margins": 0.35693395137786865, "rewards/rejected": 0.025453299283981323, "step": 300 }, { "epoch": 0.2624375407343037, "grad_norm": 131.5081786745525, "learning_rate": 1.751996710121026e-07, "logits/chosen": 0.47483396530151367, "logits/rejected": 0.6197552680969238, "logps/chosen": -517.57080078125, "logps/rejected": -512.1171264648438, "loss": 0.6765, "nll_loss": 0.48550546169281006, "rewards/accuracies": 0.625, "rewards/chosen": 0.3952723741531372, "rewards/margins": 0.048422060906887054, "rewards/rejected": 0.34685030579566956, "step": 302 }, { "epoch": 0.26417553769280905, "grad_norm": 122.21498248416636, "learning_rate": 1.7482193107147012e-07, "logits/chosen": 0.3328361511230469, "logits/rejected": 0.4578433632850647, "logps/chosen": -443.4249267578125, "logps/rejected": -480.59283447265625, "loss": 0.605, "nll_loss": 0.3879072368144989, "rewards/accuracies": 0.6875, "rewards/chosen": 0.3531206250190735, "rewards/margins": 0.3702266812324524, "rewards/rejected": -0.0171060748398304, "step": 304 }, { "epoch": 0.26591353465131434, "grad_norm": 158.87287212409825, "learning_rate": 1.744417499379372e-07, "logits/chosen": 0.24786585569381714, "logits/rejected": 0.30546700954437256, "logps/chosen": -477.049560546875, "logps/rejected": -483.1158447265625, "loss": 0.6181, "nll_loss": 0.43729695677757263, "rewards/accuracies": 0.6875, "rewards/chosen": 0.54217529296875, "rewards/margins": 0.22286450862884521, "rewards/rejected": 0.3193108141422272, "step": 306 }, { "epoch": 0.2676515316098197, "grad_norm": 238.43769366296277, "learning_rate": 1.7405914001556057e-07, "logits/chosen": 0.180156871676445, "logits/rejected": 0.15245738625526428, "logps/chosen": -454.74737548828125, "logps/rejected": -483.5639343261719, "loss": 0.6133, "nll_loss": 0.4567978084087372, "rewards/accuracies": 0.75, "rewards/chosen": 0.490037739276886, "rewards/margins": 0.5519703030586243, "rewards/rejected": -0.06193256005644798, "step": 308 }, { "epoch": 0.269389528568325, "grad_norm": 177.99780939922397, "learning_rate": 1.7367411378764046e-07, "logits/chosen": 0.6208648085594177, "logits/rejected": 0.6263742446899414, "logps/chosen": -455.6944580078125, "logps/rejected": -451.45758056640625, "loss": 0.5971, "nll_loss": 0.4412953853607178, "rewards/accuracies": 0.625, "rewards/chosen": 0.3768067955970764, "rewards/margins": 0.09082222729921341, "rewards/rejected": 0.285984605550766, "step": 310 }, { "epoch": 0.27112752552683034, "grad_norm": 212.07380967509832, "learning_rate": 1.7328668381631318e-07, "logits/chosen": 0.330152302980423, "logits/rejected": 0.21931253373622894, "logps/chosen": -490.38299560546875, "logps/rejected": -454.47613525390625, "loss": 0.6657, "nll_loss": 0.5036713480949402, "rewards/accuracies": 0.5625, "rewards/chosen": 0.1547100991010666, "rewards/margins": -0.04359874129295349, "rewards/rejected": 0.19830884039402008, "step": 312 }, { "epoch": 0.27286552248533563, "grad_norm": 138.10076491846291, "learning_rate": 1.7289686274214114e-07, "logits/chosen": 0.18456527590751648, "logits/rejected": 0.25792205333709717, "logps/chosen": -454.43865966796875, "logps/rejected": -411.9728088378906, "loss": 0.6191, "nll_loss": 0.42922234535217285, "rewards/accuracies": 0.6875, "rewards/chosen": 0.4555009603500366, "rewards/margins": 0.13302592933177948, "rewards/rejected": 0.32247504591941833, "step": 314 }, { "epoch": 0.274603519443841, "grad_norm": 141.35852524297098, "learning_rate": 1.7250466328370068e-07, "logits/chosen": 0.5290156602859497, "logits/rejected": 0.3320920467376709, "logps/chosen": -477.7109375, "logps/rejected": -503.2304382324219, "loss": 0.6023, "nll_loss": 0.4656658172607422, "rewards/accuracies": 0.9375, "rewards/chosen": 0.772936999797821, "rewards/margins": 0.5684661865234375, "rewards/rejected": 0.20447082817554474, "step": 316 }, { "epoch": 0.2763415164023463, "grad_norm": 316.8106303332851, "learning_rate": 1.7211009823716693e-07, "logits/chosen": 0.1151304543018341, "logits/rejected": 0.12682472169399261, "logps/chosen": -487.7864990234375, "logps/rejected": -439.44549560546875, "loss": 0.6177, "nll_loss": 0.47066113352775574, "rewards/accuracies": 0.75, "rewards/chosen": 0.49904000759124756, "rewards/margins": 0.1463506519794464, "rewards/rejected": 0.35268935561180115, "step": 318 }, { "epoch": 0.2780795133608516, "grad_norm": 235.51591779073905, "learning_rate": 1.7171318047589637e-07, "logits/chosen": 0.4797361493110657, "logits/rejected": 0.5132091045379639, "logps/chosen": -405.1888427734375, "logps/rejected": -472.8603820800781, "loss": 0.6178, "nll_loss": 0.3952030539512634, "rewards/accuracies": 0.625, "rewards/chosen": 0.5869803428649902, "rewards/margins": 0.20901460945606232, "rewards/rejected": 0.3779657483100891, "step": 320 }, { "epoch": 0.2798175103193569, "grad_norm": 289.2881197970101, "learning_rate": 1.7131392295000672e-07, "logits/chosen": 0.5913805961608887, "logits/rejected": 0.5887272357940674, "logps/chosen": -423.531005859375, "logps/rejected": -460.1776428222656, "loss": 0.6779, "nll_loss": 0.4493826925754547, "rewards/accuracies": 0.5, "rewards/chosen": 0.44546252489089966, "rewards/margins": 0.2519988715648651, "rewards/rejected": 0.19346359372138977, "step": 322 }, { "epoch": 0.28155550727786227, "grad_norm": 190.5144705146162, "learning_rate": 1.7091233868595465e-07, "logits/chosen": 0.5008837580680847, "logits/rejected": 0.4548027217388153, "logps/chosen": -460.7749328613281, "logps/rejected": -466.6297912597656, "loss": 0.611, "nll_loss": 0.43400996923446655, "rewards/accuracies": 0.5625, "rewards/chosen": 0.39121007919311523, "rewards/margins": 0.06272707879543304, "rewards/rejected": 0.3284830152988434, "step": 324 }, { "epoch": 0.28329350423636757, "grad_norm": 207.28223953117717, "learning_rate": 1.7050844078611054e-07, "logits/chosen": 0.31017956137657166, "logits/rejected": 0.324398934841156, "logps/chosen": -488.0496826171875, "logps/rejected": -508.5542297363281, "loss": 0.6418, "nll_loss": 0.4621453285217285, "rewards/accuracies": 0.5, "rewards/chosen": 0.328339546918869, "rewards/margins": 0.13088390231132507, "rewards/rejected": 0.19745570421218872, "step": 326 }, { "epoch": 0.2850315011948729, "grad_norm": 195.5379731252921, "learning_rate": 1.7010224242833106e-07, "logits/chosen": 0.7597770094871521, "logits/rejected": 0.6047714352607727, "logps/chosen": -529.1282958984375, "logps/rejected": -497.84375, "loss": 0.6742, "nll_loss": 0.4810156226158142, "rewards/accuracies": 0.6875, "rewards/chosen": 0.06996631622314453, "rewards/margins": -0.1163499504327774, "rewards/rejected": 0.18631629645824432, "step": 328 }, { "epoch": 0.2867694981533782, "grad_norm": 158.75613524542254, "learning_rate": 1.6969375686552937e-07, "logits/chosen": 0.5037192702293396, "logits/rejected": 0.49104276299476624, "logps/chosen": -542.0267333984375, "logps/rejected": -481.6624755859375, "loss": 0.6433, "nll_loss": 0.49876895546913147, "rewards/accuracies": 0.4375, "rewards/chosen": 0.22195303440093994, "rewards/margins": 0.008428195491433144, "rewards/rejected": 0.21352483332157135, "step": 330 }, { "epoch": 0.28850749511188356, "grad_norm": 179.9369850212301, "learning_rate": 1.6928299742524231e-07, "logits/chosen": 0.4871769845485687, "logits/rejected": 0.5428123474121094, "logps/chosen": -483.11773681640625, "logps/rejected": -457.1769104003906, "loss": 0.6557, "nll_loss": 0.4703104794025421, "rewards/accuracies": 0.6875, "rewards/chosen": 0.37591439485549927, "rewards/margins": 0.0965644121170044, "rewards/rejected": 0.2793499529361725, "step": 332 }, { "epoch": 0.29024549207038886, "grad_norm": 169.22992683207792, "learning_rate": 1.6886997750919616e-07, "logits/chosen": 0.41568654775619507, "logits/rejected": 0.640016496181488, "logps/chosen": -419.57464599609375, "logps/rejected": -440.6019287109375, "loss": 0.6416, "nll_loss": 0.4042522609233856, "rewards/accuracies": 0.5, "rewards/chosen": 0.21573497354984283, "rewards/margins": -0.052317921072244644, "rewards/rejected": 0.268052875995636, "step": 334 }, { "epoch": 0.2919834890288942, "grad_norm": 537.549948640332, "learning_rate": 1.6845471059286887e-07, "logits/chosen": 0.2585112452507019, "logits/rejected": 0.40234676003456116, "logps/chosen": -469.12744140625, "logps/rejected": -407.4085388183594, "loss": 0.6697, "nll_loss": 0.43277740478515625, "rewards/accuracies": 0.6875, "rewards/chosen": 0.4075109362602234, "rewards/margins": -0.004766283556818962, "rewards/rejected": 0.4122772216796875, "step": 336 }, { "epoch": 0.2937214859873995, "grad_norm": 143.88283356900885, "learning_rate": 1.6803721022505065e-07, "logits/chosen": 0.2862776815891266, "logits/rejected": 0.5013993382453918, "logps/chosen": -419.91168212890625, "logps/rejected": -475.68621826171875, "loss": 0.6574, "nll_loss": 0.4124099910259247, "rewards/accuracies": 0.625, "rewards/chosen": 0.5104472637176514, "rewards/margins": 0.1771218478679657, "rewards/rejected": 0.3333253860473633, "step": 338 }, { "epoch": 0.29545948294590485, "grad_norm": 277.79529042691826, "learning_rate": 1.6761749002740193e-07, "logits/chosen": 0.46205711364746094, "logits/rejected": 0.46211087703704834, "logps/chosen": -460.75201416015625, "logps/rejected": -521.1112060546875, "loss": 0.5561, "nll_loss": 0.4728389382362366, "rewards/accuracies": 0.8125, "rewards/chosen": 0.6726080775260925, "rewards/margins": 0.5335967540740967, "rewards/rejected": 0.1390111893415451, "step": 340 }, { "epoch": 0.29719747990441014, "grad_norm": 211.81872281009484, "learning_rate": 1.6719556369400878e-07, "logits/chosen": 0.31371983885765076, "logits/rejected": 0.3987221121788025, "logps/chosen": -508.58856201171875, "logps/rejected": -506.62841796875, "loss": 0.6018, "nll_loss": 0.5143205523490906, "rewards/accuracies": 0.5, "rewards/chosen": 0.5597350001335144, "rewards/margins": 0.17063526809215546, "rewards/rejected": 0.38909977674484253, "step": 342 }, { "epoch": 0.2989354768629155, "grad_norm": 202.01733690053692, "learning_rate": 1.6677144499093625e-07, "logits/chosen": 0.3940061628818512, "logits/rejected": 0.5614907145500183, "logps/chosen": -493.75067138671875, "logps/rejected": -516.7576293945312, "loss": 0.6558, "nll_loss": 0.47246384620666504, "rewards/accuracies": 0.5625, "rewards/chosen": 0.5523098111152649, "rewards/margins": 0.15993386507034302, "rewards/rejected": 0.39237597584724426, "step": 344 }, { "epoch": 0.3006734738214208, "grad_norm": 233.30784637715007, "learning_rate": 1.6634514775577918e-07, "logits/chosen": 0.22863087058067322, "logits/rejected": 0.4731798768043518, "logps/chosen": -481.202392578125, "logps/rejected": -556.5811157226562, "loss": 0.6281, "nll_loss": 0.46664902567863464, "rewards/accuracies": 0.625, "rewards/chosen": 0.8687335252761841, "rewards/margins": 0.3619176149368286, "rewards/rejected": 0.5068159103393555, "step": 346 }, { "epoch": 0.30241147077992614, "grad_norm": 157.2977888244691, "learning_rate": 1.659166858972107e-07, "logits/chosen": 0.5143783092498779, "logits/rejected": 0.5790827870368958, "logps/chosen": -468.9595947265625, "logps/rejected": -502.225341796875, "loss": 0.6525, "nll_loss": 0.46870219707489014, "rewards/accuracies": 0.8125, "rewards/chosen": 0.5990110039710999, "rewards/margins": 0.23531866073608398, "rewards/rejected": 0.3636922836303711, "step": 348 }, { "epoch": 0.30414946773843143, "grad_norm": 163.6646025660051, "learning_rate": 1.6548607339452852e-07, "logits/chosen": 0.3285616338253021, "logits/rejected": 0.4201197028160095, "logps/chosen": -502.6363525390625, "logps/rejected": -548.9605712890625, "loss": 0.6349, "nll_loss": 0.4952363669872284, "rewards/accuracies": 0.875, "rewards/chosen": 0.9033224582672119, "rewards/margins": 0.7526147961616516, "rewards/rejected": 0.15070763230323792, "step": 350 }, { "epoch": 0.3058874646969368, "grad_norm": 130.2464033824859, "learning_rate": 1.650533242971987e-07, "logits/chosen": 0.11030253022909164, "logits/rejected": 0.20594020187854767, "logps/chosen": -430.72265625, "logps/rejected": -448.17144775390625, "loss": 0.564, "nll_loss": 0.4082512855529785, "rewards/accuracies": 0.6875, "rewards/chosen": 0.643363356590271, "rewards/margins": 0.14355525374412537, "rewards/rejected": 0.499808132648468, "step": 352 }, { "epoch": 0.3076254616554421, "grad_norm": 185.75442858935745, "learning_rate": 1.646184527243974e-07, "logits/chosen": 0.3853408694267273, "logits/rejected": 0.325025737285614, "logps/chosen": -499.373291015625, "logps/rejected": -458.1291198730469, "loss": 0.732, "nll_loss": 0.4488787353038788, "rewards/accuracies": 0.625, "rewards/chosen": 0.49317529797554016, "rewards/margins": 0.06869325041770935, "rewards/rejected": 0.4244820773601532, "step": 354 }, { "epoch": 0.30936345861394743, "grad_norm": 145.25722335543503, "learning_rate": 1.6418147286455017e-07, "logits/chosen": 0.6253547668457031, "logits/rejected": 0.570826530456543, "logps/chosen": -466.4505920410156, "logps/rejected": -448.43341064453125, "loss": 0.5772, "nll_loss": 0.4469682276248932, "rewards/accuracies": 0.6875, "rewards/chosen": 0.5305427312850952, "rewards/margins": 0.28476476669311523, "rewards/rejected": 0.24577800929546356, "step": 356 }, { "epoch": 0.3111014555724527, "grad_norm": 273.4568869901171, "learning_rate": 1.6374239897486896e-07, "logits/chosen": 0.419254869222641, "logits/rejected": 0.4319905638694763, "logps/chosen": -460.3009033203125, "logps/rejected": -478.2269592285156, "loss": 0.6748, "nll_loss": 0.4449344575405121, "rewards/accuracies": 0.5, "rewards/chosen": 0.4517764449119568, "rewards/margins": 0.13741624355316162, "rewards/rejected": 0.3143601417541504, "step": 358 }, { "epoch": 0.3128394525309581, "grad_norm": 138.77165017718346, "learning_rate": 1.6330124538088703e-07, "logits/chosen": 0.5652934908866882, "logits/rejected": 0.639899730682373, "logps/chosen": -475.1581726074219, "logps/rejected": -452.1554870605469, "loss": 0.5829, "nll_loss": 0.46334606409072876, "rewards/accuracies": 0.625, "rewards/chosen": 0.5346073508262634, "rewards/margins": 0.22912371158599854, "rewards/rejected": 0.3054836094379425, "step": 360 }, { "epoch": 0.31457744948946337, "grad_norm": 207.1164704179335, "learning_rate": 1.6285802647599154e-07, "logits/chosen": 0.30453720688819885, "logits/rejected": 0.2256930023431778, "logps/chosen": -553.0308227539062, "logps/rejected": -515.8295288085938, "loss": 0.6774, "nll_loss": 0.49242475628852844, "rewards/accuracies": 0.625, "rewards/chosen": 0.3522112965583801, "rewards/margins": 0.2572094798088074, "rewards/rejected": 0.09500180184841156, "step": 362 }, { "epoch": 0.3163154464479687, "grad_norm": 167.9520774877879, "learning_rate": 1.6241275672095395e-07, "logits/chosen": 0.3028857111930847, "logits/rejected": 0.3282950818538666, "logps/chosen": -465.1825256347656, "logps/rejected": -501.9438171386719, "loss": 0.6284, "nll_loss": 0.4515576958656311, "rewards/accuracies": 0.625, "rewards/chosen": 0.3405414819717407, "rewards/margins": 0.2630569636821747, "rewards/rejected": 0.07748451828956604, "step": 364 }, { "epoch": 0.318053443406474, "grad_norm": 127.49701641606168, "learning_rate": 1.619654506434581e-07, "logits/chosen": 0.524957537651062, "logits/rejected": 0.43854832649230957, "logps/chosen": -540.3823852539062, "logps/rejected": -550.18701171875, "loss": 0.5756, "nll_loss": 0.5099707245826721, "rewards/accuracies": 0.5, "rewards/chosen": 0.5246202349662781, "rewards/margins": 0.42196738719940186, "rewards/rejected": 0.10265286266803741, "step": 366 }, { "epoch": 0.31979144036497936, "grad_norm": 189.5220473685797, "learning_rate": 1.615161228376265e-07, "logits/chosen": 0.383512943983078, "logits/rejected": 0.44961869716644287, "logps/chosen": -460.8105163574219, "logps/rejected": -469.990478515625, "loss": 0.6288, "nll_loss": 0.4280567765235901, "rewards/accuracies": 0.875, "rewards/chosen": 0.5916939377784729, "rewards/margins": 0.3190990686416626, "rewards/rejected": 0.2725948393344879, "step": 368 }, { "epoch": 0.32152943732348466, "grad_norm": 145.17291751298188, "learning_rate": 1.6106478796354383e-07, "logits/chosen": 0.42558369040489197, "logits/rejected": 0.6559165716171265, "logps/chosen": -410.0570983886719, "logps/rejected": -533.5880126953125, "loss": 0.5969, "nll_loss": 0.43435239791870117, "rewards/accuracies": 0.6875, "rewards/chosen": 0.5680219531059265, "rewards/margins": 0.5121182799339294, "rewards/rejected": 0.05590362474322319, "step": 370 }, { "epoch": 0.32326743428199, "grad_norm": 166.70070172002627, "learning_rate": 1.6061146074677882e-07, "logits/chosen": 0.3405163586139679, "logits/rejected": 0.3031487762928009, "logps/chosen": -470.0008544921875, "logps/rejected": -466.5913391113281, "loss": 0.582, "nll_loss": 0.4428296387195587, "rewards/accuracies": 0.6875, "rewards/chosen": 0.6726614236831665, "rewards/margins": 0.19486792385578156, "rewards/rejected": 0.47779348492622375, "step": 372 }, { "epoch": 0.3250054312404953, "grad_norm": 167.16311879860288, "learning_rate": 1.6015615597790385e-07, "logits/chosen": 0.5631195902824402, "logits/rejected": 0.46143069863319397, "logps/chosen": -536.484130859375, "logps/rejected": -510.6805419921875, "loss": 0.6317, "nll_loss": 0.507157564163208, "rewards/accuracies": 0.625, "rewards/chosen": 0.4220741391181946, "rewards/margins": 0.30352985858917236, "rewards/rejected": 0.11854429543018341, "step": 374 }, { "epoch": 0.32674342819900065, "grad_norm": 161.6759360651135, "learning_rate": 1.5969888851201225e-07, "logits/chosen": 0.6711764931678772, "logits/rejected": 0.597443699836731, "logps/chosen": -476.9120178222656, "logps/rejected": -482.572998046875, "loss": 0.6185, "nll_loss": 0.47335314750671387, "rewards/accuracies": 0.8125, "rewards/chosen": 0.5396869778633118, "rewards/margins": 0.584469199180603, "rewards/rejected": -0.044782258570194244, "step": 376 }, { "epoch": 0.328481425157506, "grad_norm": 417.54699929930405, "learning_rate": 1.5923967326823368e-07, "logits/chosen": 0.33243659138679504, "logits/rejected": 0.5092381238937378, "logps/chosen": -475.91656494140625, "logps/rejected": -518.292236328125, "loss": 0.6348, "nll_loss": 0.46231335401535034, "rewards/accuracies": 0.625, "rewards/chosen": 0.49808621406555176, "rewards/margins": 0.240754634141922, "rewards/rejected": 0.25733163952827454, "step": 378 }, { "epoch": 0.3302194221160113, "grad_norm": 175.66639079342582, "learning_rate": 1.5877852522924732e-07, "logits/chosen": 0.6205189228057861, "logits/rejected": 0.48265188932418823, "logps/chosen": -543.364013671875, "logps/rejected": -542.3245239257812, "loss": 0.5893, "nll_loss": 0.5013877749443054, "rewards/accuracies": 0.6875, "rewards/chosen": 0.7062149047851562, "rewards/margins": 0.5146379470825195, "rewards/rejected": 0.19157695770263672, "step": 380 }, { "epoch": 0.33195741907451665, "grad_norm": 138.56625803103043, "learning_rate": 1.583154594407932e-07, "logits/chosen": 0.4925762414932251, "logits/rejected": 0.2916370630264282, "logps/chosen": -441.38153076171875, "logps/rejected": -441.0870666503906, "loss": 0.5991, "nll_loss": 0.40977394580841064, "rewards/accuracies": 0.75, "rewards/chosen": 0.4514932632446289, "rewards/margins": 0.4026234745979309, "rewards/rejected": 0.048869796097278595, "step": 382 }, { "epoch": 0.33369541603302194, "grad_norm": 148.55643375008012, "learning_rate": 1.5785049101118108e-07, "logits/chosen": 0.11955571174621582, "logits/rejected": 0.03210335969924927, "logps/chosen": -517.887451171875, "logps/rejected": -560.5753173828125, "loss": 0.6206, "nll_loss": 0.4803450107574463, "rewards/accuracies": 0.5, "rewards/chosen": 0.4510236978530884, "rewards/margins": 0.289794921875, "rewards/rejected": 0.161228746175766, "step": 384 }, { "epoch": 0.3354334129915273, "grad_norm": 216.1495030153299, "learning_rate": 1.5738363511079773e-07, "logits/chosen": 0.356658935546875, "logits/rejected": 0.5617293119430542, "logps/chosen": -469.86114501953125, "logps/rejected": -553.1449584960938, "loss": 0.6355, "nll_loss": 0.45090147852897644, "rewards/accuracies": 0.875, "rewards/chosen": 0.6025825142860413, "rewards/margins": 0.5343748331069946, "rewards/rejected": 0.068207748234272, "step": 386 }, { "epoch": 0.3371714099500326, "grad_norm": 116.90273062908595, "learning_rate": 1.569149069716118e-07, "logits/chosen": 0.36243703961372375, "logits/rejected": 0.3176945745944977, "logps/chosen": -473.4244079589844, "logps/rejected": -524.5529174804688, "loss": 0.6079, "nll_loss": 0.48573052883148193, "rewards/accuracies": 0.625, "rewards/chosen": 0.34765779972076416, "rewards/margins": 0.26835328340530396, "rewards/rejected": 0.07930450141429901, "step": 388 }, { "epoch": 0.33890940690853794, "grad_norm": 166.76293201719108, "learning_rate": 1.5644432188667694e-07, "logits/chosen": 0.44438979029655457, "logits/rejected": 0.5965635776519775, "logps/chosen": -505.97113037109375, "logps/rejected": -534.6060180664062, "loss": 0.6398, "nll_loss": 0.4930253028869629, "rewards/accuracies": 0.5625, "rewards/chosen": 0.3729788064956665, "rewards/margins": 0.09345950931310654, "rewards/rejected": 0.27951928973197937, "step": 390 }, { "epoch": 0.34064740386704323, "grad_norm": 110.02770232107177, "learning_rate": 1.5597189520963274e-07, "logits/chosen": 0.6275568008422852, "logits/rejected": 0.6534876823425293, "logps/chosen": -489.2644348144531, "logps/rejected": -464.12115478515625, "loss": 0.6371, "nll_loss": 0.4506080746650696, "rewards/accuracies": 0.625, "rewards/chosen": 0.267756849527359, "rewards/margins": 0.11544962972402573, "rewards/rejected": 0.15230722725391388, "step": 392 }, { "epoch": 0.3423854008255486, "grad_norm": 125.7814206886685, "learning_rate": 1.5549764235420404e-07, "logits/chosen": 0.36072108149528503, "logits/rejected": 0.42248329520225525, "logps/chosen": -470.70941162109375, "logps/rejected": -500.63494873046875, "loss": 0.5711, "nll_loss": 0.4840225279331207, "rewards/accuracies": 0.75, "rewards/chosen": 0.4070228338241577, "rewards/margins": 0.5793428421020508, "rewards/rejected": -0.17231999337673187, "step": 394 }, { "epoch": 0.3441233977840539, "grad_norm": 131.40847630338484, "learning_rate": 1.550215787936977e-07, "logits/chosen": 0.6912932395935059, "logits/rejected": 0.5712380409240723, "logps/chosen": -495.5016174316406, "logps/rejected": -454.8415832519531, "loss": 0.6201, "nll_loss": 0.4484928250312805, "rewards/accuracies": 0.625, "rewards/chosen": 0.5907407999038696, "rewards/margins": 0.10202912986278534, "rewards/rejected": 0.4887116253376007, "step": 396 }, { "epoch": 0.3458613947425592, "grad_norm": 178.1175808193889, "learning_rate": 1.54543720060498e-07, "logits/chosen": 0.3643982708454132, "logits/rejected": 0.37503209710121155, "logps/chosen": -422.71673583984375, "logps/rejected": -456.2169189453125, "loss": 0.5849, "nll_loss": 0.4056944251060486, "rewards/accuracies": 0.75, "rewards/chosen": 0.5937540531158447, "rewards/margins": 0.3985627293586731, "rewards/rejected": 0.19519129395484924, "step": 398 }, { "epoch": 0.3475993917010645, "grad_norm": 171.13756750638748, "learning_rate": 1.5406408174555975e-07, "logits/chosen": 0.27237066626548767, "logits/rejected": 0.1704144924879074, "logps/chosen": -402.96527099609375, "logps/rejected": -468.3968505859375, "loss": 0.5572, "nll_loss": 0.39846640825271606, "rewards/accuracies": 0.8125, "rewards/chosen": 0.546933650970459, "rewards/margins": 0.40997716784477234, "rewards/rejected": 0.13695651292800903, "step": 400 }, { "epoch": 0.34933738865956987, "grad_norm": 184.01278443708705, "learning_rate": 1.5358267949789966e-07, "logits/chosen": 0.35213690996170044, "logits/rejected": 0.380864679813385, "logps/chosen": -490.4032287597656, "logps/rejected": -485.1588134765625, "loss": 0.6594, "nll_loss": 0.47046926617622375, "rewards/accuracies": 0.625, "rewards/chosen": 0.8484295010566711, "rewards/margins": 0.26358741521835327, "rewards/rejected": 0.5848420858383179, "step": 402 }, { "epoch": 0.35107538561807516, "grad_norm": 219.69038077247276, "learning_rate": 1.5309952902408573e-07, "logits/chosen": 0.06591632217168808, "logits/rejected": 0.10977748036384583, "logps/chosen": -491.1231689453125, "logps/rejected": -453.5677490234375, "loss": 0.6137, "nll_loss": 0.4714672267436981, "rewards/accuracies": 0.75, "rewards/chosen": 0.7929648160934448, "rewards/margins": 0.27987584471702576, "rewards/rejected": 0.5130888819694519, "step": 404 }, { "epoch": 0.3528133825765805, "grad_norm": 192.96528830282867, "learning_rate": 1.5261464608772485e-07, "logits/chosen": 0.31062737107276917, "logits/rejected": 0.18132522702217102, "logps/chosen": -458.4087829589844, "logps/rejected": -395.68212890625, "loss": 0.605, "nll_loss": 0.46351706981658936, "rewards/accuracies": 0.8125, "rewards/chosen": 0.7857881188392639, "rewards/margins": 0.40948447585105896, "rewards/rejected": 0.3763035833835602, "step": 406 }, { "epoch": 0.3545513795350858, "grad_norm": 138.57973896271315, "learning_rate": 1.5212804650894838e-07, "logits/chosen": 0.5299434661865234, "logits/rejected": 0.5789748430252075, "logps/chosen": -506.7216796875, "logps/rejected": -518.6016845703125, "loss": 0.6216, "nll_loss": 0.4909980893135071, "rewards/accuracies": 0.75, "rewards/chosen": 0.5659937858581543, "rewards/margins": 0.21054793894290924, "rewards/rejected": 0.35544586181640625, "step": 408 }, { "epoch": 0.35628937649359116, "grad_norm": 135.43117518574306, "learning_rate": 1.516397461638962e-07, "logits/chosen": 0.16620784997940063, "logits/rejected": 0.02259686216711998, "logps/chosen": -496.93560791015625, "logps/rejected": -462.2376708984375, "loss": 0.5299, "nll_loss": 0.4484490752220154, "rewards/accuracies": 0.75, "rewards/chosen": 0.9257678389549255, "rewards/margins": 0.42007893323898315, "rewards/rejected": 0.5056889057159424, "step": 410 }, { "epoch": 0.35802737345209645, "grad_norm": 146.69116581523053, "learning_rate": 1.511497609841984e-07, "logits/chosen": 0.4435105621814728, "logits/rejected": 0.28421181440353394, "logps/chosen": -503.27984619140625, "logps/rejected": -554.2760620117188, "loss": 0.5986, "nll_loss": 0.4445800185203552, "rewards/accuracies": 0.75, "rewards/chosen": 0.7452791929244995, "rewards/margins": 0.675082802772522, "rewards/rejected": 0.07019642740488052, "step": 412 }, { "epoch": 0.3597653704106018, "grad_norm": 179.0729044371433, "learning_rate": 1.5065810695645583e-07, "logits/chosen": 0.16551180183887482, "logits/rejected": 0.1484622061252594, "logps/chosen": -449.44976806640625, "logps/rejected": -505.42425537109375, "loss": 0.579, "nll_loss": 0.42006924748420715, "rewards/accuracies": 0.6875, "rewards/chosen": 1.0582197904586792, "rewards/margins": 0.6258028149604797, "rewards/rejected": 0.4324168264865875, "step": 414 }, { "epoch": 0.3615033673691071, "grad_norm": 216.75622432526953, "learning_rate": 1.5016480012171825e-07, "logits/chosen": 0.6741234660148621, "logits/rejected": 0.7510842084884644, "logps/chosen": -485.66668701171875, "logps/rejected": -454.1160888671875, "loss": 0.6953, "nll_loss": 0.46377602219581604, "rewards/accuracies": 0.5625, "rewards/chosen": 0.6574243307113647, "rewards/margins": 0.11631478369235992, "rewards/rejected": 0.541109561920166, "step": 416 }, { "epoch": 0.36324136432761245, "grad_norm": 202.74502581658425, "learning_rate": 1.4966985657496112e-07, "logits/chosen": 0.5062256455421448, "logits/rejected": 0.5332990884780884, "logps/chosen": -482.54144287109375, "logps/rejected": -508.75897216796875, "loss": 0.73, "nll_loss": 0.46353021264076233, "rewards/accuracies": 0.6875, "rewards/chosen": 0.7008476853370667, "rewards/margins": -0.03729201853275299, "rewards/rejected": 0.7381397485733032, "step": 418 }, { "epoch": 0.36497936128611774, "grad_norm": 145.0881914686186, "learning_rate": 1.491732924645604e-07, "logits/chosen": 0.2643652558326721, "logits/rejected": 0.5007381439208984, "logps/chosen": -470.0370178222656, "logps/rejected": -493.4355163574219, "loss": 0.6496, "nll_loss": 0.44312015175819397, "rewards/accuracies": 0.4375, "rewards/chosen": 0.7855318784713745, "rewards/margins": -0.0823485404253006, "rewards/rejected": 0.8678804636001587, "step": 420 }, { "epoch": 0.3667173582446231, "grad_norm": 185.84119843255152, "learning_rate": 1.4867512399176562e-07, "logits/chosen": 0.23039156198501587, "logits/rejected": 0.35220447182655334, "logps/chosen": -505.6122741699219, "logps/rejected": -561.8280029296875, "loss": 0.688, "nll_loss": 0.47781142592430115, "rewards/accuracies": 0.8125, "rewards/chosen": 0.8302431106567383, "rewards/margins": 0.5848037600517273, "rewards/rejected": 0.24543944001197815, "step": 422 }, { "epoch": 0.3684553552031284, "grad_norm": 115.85134944328905, "learning_rate": 1.4817536741017152e-07, "logits/chosen": 0.5234343409538269, "logits/rejected": 0.46782320737838745, "logps/chosen": -483.80517578125, "logps/rejected": -459.695556640625, "loss": 0.632, "nll_loss": 0.4559612572193146, "rewards/accuracies": 0.6875, "rewards/chosen": 0.7901848554611206, "rewards/margins": 0.22588977217674255, "rewards/rejected": 0.5642951130867004, "step": 424 }, { "epoch": 0.37019335216163374, "grad_norm": 186.22651924677427, "learning_rate": 1.476740390251875e-07, "logits/chosen": 0.6512681245803833, "logits/rejected": 0.6760206818580627, "logps/chosen": -533.1652221679688, "logps/rejected": -559.06103515625, "loss": 0.6956, "nll_loss": 0.505631148815155, "rewards/accuracies": 0.6875, "rewards/chosen": 0.9320828318595886, "rewards/margins": 0.308445006608963, "rewards/rejected": 0.6236377954483032, "step": 426 }, { "epoch": 0.37193134912013903, "grad_norm": 141.6956282085058, "learning_rate": 1.4717115519350568e-07, "logits/chosen": 0.31024739146232605, "logits/rejected": 0.40230733156204224, "logps/chosen": -506.8525390625, "logps/rejected": -467.3164978027344, "loss": 0.575, "nll_loss": 0.4690600633621216, "rewards/accuracies": 0.6875, "rewards/chosen": 1.2685997486114502, "rewards/margins": 0.598934531211853, "rewards/rejected": 0.6696651577949524, "step": 428 }, { "epoch": 0.3736693460786444, "grad_norm": 259.2302220113569, "learning_rate": 1.4666673232256736e-07, "logits/chosen": 0.3985351324081421, "logits/rejected": 0.45808982849121094, "logps/chosen": -430.1258850097656, "logps/rejected": -482.2582092285156, "loss": 0.614, "nll_loss": 0.3921167552471161, "rewards/accuracies": 0.4375, "rewards/chosen": 0.8013219237327576, "rewards/margins": -0.042902953922748566, "rewards/rejected": 0.8442248106002808, "step": 430 }, { "epoch": 0.3754073430371497, "grad_norm": 155.46043813901522, "learning_rate": 1.461607868700276e-07, "logits/chosen": 0.425076961517334, "logits/rejected": 0.3554433286190033, "logps/chosen": -502.87225341796875, "logps/rejected": -452.2707824707031, "loss": 0.6385, "nll_loss": 0.45759153366088867, "rewards/accuracies": 0.6875, "rewards/chosen": 0.7368870973587036, "rewards/margins": 0.3387683928012848, "rewards/rejected": 0.3981187343597412, "step": 432 }, { "epoch": 0.377145339995655, "grad_norm": 379.83578925167035, "learning_rate": 1.4565333534321824e-07, "logits/chosen": 0.2826644778251648, "logits/rejected": 0.5096839070320129, "logps/chosen": -476.4322509765625, "logps/rejected": -443.9337463378906, "loss": 0.5913, "nll_loss": 0.4821905195713043, "rewards/accuracies": 0.625, "rewards/chosen": 0.7236399054527283, "rewards/margins": 0.4021589457988739, "rewards/rejected": 0.32148098945617676, "step": 434 }, { "epoch": 0.3788833369541603, "grad_norm": 173.5884492355553, "learning_rate": 1.4514439429860941e-07, "logits/chosen": 0.07746727764606476, "logits/rejected": 0.1786593645811081, "logps/chosen": -447.15380859375, "logps/rejected": -424.5791931152344, "loss": 0.575, "nll_loss": 0.43714332580566406, "rewards/accuracies": 0.625, "rewards/chosen": 0.8142626881599426, "rewards/margins": 0.2937620282173157, "rewards/rejected": 0.520500659942627, "step": 436 }, { "epoch": 0.38062133391266567, "grad_norm": 189.44734705913655, "learning_rate": 1.4463398034126918e-07, "logits/chosen": 0.21691399812698364, "logits/rejected": 0.1062842532992363, "logps/chosen": -470.154296875, "logps/rejected": -515.6326293945312, "loss": 0.5866, "nll_loss": 0.4814315438270569, "rewards/accuracies": 0.6875, "rewards/chosen": 0.932270884513855, "rewards/margins": 0.4025800824165344, "rewards/rejected": 0.5296907424926758, "step": 438 }, { "epoch": 0.38235933087117097, "grad_norm": 180.5203885167642, "learning_rate": 1.4412211012432212e-07, "logits/chosen": 0.6636056303977966, "logits/rejected": 0.7748678922653198, "logps/chosen": -545.3409423828125, "logps/rejected": -541.329833984375, "loss": 0.6366, "nll_loss": 0.5039398074150085, "rewards/accuracies": 0.5625, "rewards/chosen": 0.4772527813911438, "rewards/margins": 0.14066831767559052, "rewards/rejected": 0.3365844786167145, "step": 440 }, { "epoch": 0.3840973278296763, "grad_norm": 145.78613263537494, "learning_rate": 1.4360880034840552e-07, "logits/chosen": 0.698187530040741, "logits/rejected": 0.5469500422477722, "logps/chosen": -516.9849853515625, "logps/rejected": -449.78717041015625, "loss": 0.6094, "nll_loss": 0.4981119930744171, "rewards/accuracies": 0.4375, "rewards/chosen": 0.33729931712150574, "rewards/margins": 0.2149997502565384, "rewards/rejected": 0.12229958176612854, "step": 442 }, { "epoch": 0.3858353247881816, "grad_norm": 226.70536629048328, "learning_rate": 1.4309406776112488e-07, "logits/chosen": 0.2957366108894348, "logits/rejected": 0.28157639503479004, "logps/chosen": -534.7179565429688, "logps/rejected": -508.509521484375, "loss": 0.7085, "nll_loss": 0.4948302209377289, "rewards/accuracies": 0.625, "rewards/chosen": 0.23636344075202942, "rewards/margins": -0.018209829926490784, "rewards/rejected": 0.254573255777359, "step": 444 }, { "epoch": 0.38757332174668696, "grad_norm": 141.01878995081393, "learning_rate": 1.4257792915650726e-07, "logits/chosen": 0.5431017875671387, "logits/rejected": 0.4987832009792328, "logps/chosen": -474.10003662109375, "logps/rejected": -469.18994140625, "loss": 0.6611, "nll_loss": 0.44907432794570923, "rewards/accuracies": 0.625, "rewards/chosen": 0.5094869136810303, "rewards/margins": 0.28341084718704224, "rewards/rejected": 0.22607611119747162, "step": 446 }, { "epoch": 0.38931131870519226, "grad_norm": 143.73168116686412, "learning_rate": 1.4206040137445348e-07, "logits/chosen": 0.6262254118919373, "logits/rejected": 0.6136873364448547, "logps/chosen": -458.2574768066406, "logps/rejected": -475.05126953125, "loss": 0.5773, "nll_loss": 0.4416995048522949, "rewards/accuracies": 0.625, "rewards/chosen": 0.4745006561279297, "rewards/margins": 0.11526194214820862, "rewards/rejected": 0.35923871397972107, "step": 448 }, { "epoch": 0.3910493156636976, "grad_norm": 204.39195717062336, "learning_rate": 1.4154150130018864e-07, "logits/chosen": 0.17188166081905365, "logits/rejected": 0.12855087220668793, "logps/chosen": -493.6163635253906, "logps/rejected": -493.07989501953125, "loss": 0.6267, "nll_loss": 0.4598374366760254, "rewards/accuracies": 0.5625, "rewards/chosen": 0.4715019762516022, "rewards/margins": 0.11144667863845825, "rewards/rejected": 0.36005526781082153, "step": 450 }, { "epoch": 0.3927873126222029, "grad_norm": 136.99517525674915, "learning_rate": 1.4102124586371118e-07, "logits/chosen": 0.3407544791698456, "logits/rejected": 0.2230420559644699, "logps/chosen": -551.1911010742188, "logps/rejected": -530.1219482421875, "loss": 0.6281, "nll_loss": 0.5207258462905884, "rewards/accuracies": 0.8125, "rewards/chosen": 0.6534353494644165, "rewards/margins": 0.42499563097953796, "rewards/rejected": 0.22843971848487854, "step": 452 }, { "epoch": 0.39452530958070825, "grad_norm": 171.20329280679755, "learning_rate": 1.4049965203924052e-07, "logits/chosen": 0.40859052538871765, "logits/rejected": 0.2568287253379822, "logps/chosen": -500.0255126953125, "logps/rejected": -481.6701354980469, "loss": 0.5857, "nll_loss": 0.49866920709609985, "rewards/accuracies": 0.875, "rewards/chosen": 0.8634217977523804, "rewards/margins": 0.5302352905273438, "rewards/rejected": 0.3331865072250366, "step": 454 }, { "epoch": 0.39626330653921354, "grad_norm": 138.55579601924867, "learning_rate": 1.3997673684466338e-07, "logits/chosen": 0.5448041558265686, "logits/rejected": 0.5181457996368408, "logps/chosen": -452.5978698730469, "logps/rejected": -492.1902770996094, "loss": 0.5857, "nll_loss": 0.44351333379745483, "rewards/accuracies": 0.625, "rewards/chosen": 0.6340304017066956, "rewards/margins": 0.3788342773914337, "rewards/rejected": 0.25519609451293945, "step": 456 }, { "epoch": 0.3980013034977189, "grad_norm": 121.89047583858445, "learning_rate": 1.3945251734097827e-07, "logits/chosen": 0.2940325438976288, "logits/rejected": 0.15135368704795837, "logps/chosen": -520.0460205078125, "logps/rejected": -528.7132568359375, "loss": 0.6594, "nll_loss": 0.49102744460105896, "rewards/accuracies": 0.625, "rewards/chosen": 0.4439469575881958, "rewards/margins": 0.5450565814971924, "rewards/rejected": -0.10110970586538315, "step": 458 }, { "epoch": 0.3997393004562242, "grad_norm": 128.66033221606006, "learning_rate": 1.3892701063173916e-07, "logits/chosen": 0.556334376335144, "logits/rejected": 0.5508618354797363, "logps/chosen": -466.281005859375, "logps/rejected": -486.53106689453125, "loss": 0.5992, "nll_loss": 0.4581470787525177, "rewards/accuracies": 0.75, "rewards/chosen": 0.49654558300971985, "rewards/margins": 0.3212546706199646, "rewards/rejected": 0.17529097199440002, "step": 460 }, { "epoch": 0.40147729741472954, "grad_norm": 131.50974327446443, "learning_rate": 1.3840023386249714e-07, "logits/chosen": 0.4506450891494751, "logits/rejected": 0.47021448612213135, "logps/chosen": -501.6130676269531, "logps/rejected": -513.97021484375, "loss": 0.6474, "nll_loss": 0.49557405710220337, "rewards/accuracies": 0.625, "rewards/chosen": 0.7399130463600159, "rewards/margins": 0.15451927483081818, "rewards/rejected": 0.5853937864303589, "step": 462 }, { "epoch": 0.40321529437323483, "grad_norm": 196.4458970816872, "learning_rate": 1.3787220422024133e-07, "logits/chosen": 0.2543451488018036, "logits/rejected": 0.23989835381507874, "logps/chosen": -510.9331970214844, "logps/rejected": -540.4846801757812, "loss": 0.6912, "nll_loss": 0.4922424554824829, "rewards/accuracies": 0.5625, "rewards/chosen": 0.43558579683303833, "rewards/margins": 0.13976748287677765, "rewards/rejected": 0.2958183288574219, "step": 464 }, { "epoch": 0.4049532913317402, "grad_norm": 146.01531397672778, "learning_rate": 1.373429389328378e-07, "logits/chosen": 0.5709559917449951, "logits/rejected": 0.7091385722160339, "logps/chosen": -508.1383361816406, "logps/rejected": -539.6366577148438, "loss": 0.6281, "nll_loss": 0.4904443919658661, "rewards/accuracies": 0.75, "rewards/chosen": 0.6073122024536133, "rewards/margins": 0.2943267822265625, "rewards/rejected": 0.3129854202270508, "step": 466 }, { "epoch": 0.4066912882902455, "grad_norm": 192.90454062378998, "learning_rate": 1.368124552684678e-07, "logits/chosen": 0.5518777370452881, "logits/rejected": 0.49578994512557983, "logps/chosen": -467.0284423828125, "logps/rejected": -500.19354248046875, "loss": 0.5714, "nll_loss": 0.4271387755870819, "rewards/accuracies": 0.5625, "rewards/chosen": 0.7035149931907654, "rewards/margins": 0.17197123169898987, "rewards/rejected": 0.5315437316894531, "step": 468 }, { "epoch": 0.40842928524875083, "grad_norm": 229.61216495684394, "learning_rate": 1.3628077053506408e-07, "logits/chosen": 0.4732482433319092, "logits/rejected": 0.6402056217193604, "logps/chosen": -454.22943115234375, "logps/rejected": -464.133544921875, "loss": 0.6222, "nll_loss": 0.4393461346626282, "rewards/accuracies": 0.4375, "rewards/chosen": 0.4335966408252716, "rewards/margins": 0.0941920354962349, "rewards/rejected": 0.3394045829772949, "step": 470 }, { "epoch": 0.4101672822072561, "grad_norm": 166.9812610820794, "learning_rate": 1.3574790207974645e-07, "logits/chosen": 0.40805256366729736, "logits/rejected": 0.3065450191497803, "logps/chosen": -404.9720458984375, "logps/rejected": -471.55340576171875, "loss": 0.5696, "nll_loss": 0.3998640775680542, "rewards/accuracies": 0.6875, "rewards/chosen": 0.6120911836624146, "rewards/margins": 0.5119709372520447, "rewards/rejected": 0.10012024641036987, "step": 472 }, { "epoch": 0.4119052791657615, "grad_norm": 321.0367764864034, "learning_rate": 1.352138672882555e-07, "logits/chosen": 0.2997012734413147, "logits/rejected": 0.31295493245124817, "logps/chosen": -485.71783447265625, "logps/rejected": -512.8876342773438, "loss": 0.6176, "nll_loss": 0.46717286109924316, "rewards/accuracies": 0.625, "rewards/chosen": 0.7201789617538452, "rewards/margins": 0.24554875493049622, "rewards/rejected": 0.47463029623031616, "step": 474 }, { "epoch": 0.41364327612426677, "grad_norm": 146.69964221813686, "learning_rate": 1.346786835843856e-07, "logits/chosen": 0.4254930317401886, "logits/rejected": 0.5058972239494324, "logps/chosen": -480.9587707519531, "logps/rejected": -475.7575988769531, "loss": 0.6482, "nll_loss": 0.46980273723602295, "rewards/accuracies": 0.75, "rewards/chosen": 0.37536314129829407, "rewards/margins": 0.38812559843063354, "rewards/rejected": -0.012762445025146008, "step": 476 }, { "epoch": 0.4153812730827721, "grad_norm": 228.93125805195882, "learning_rate": 1.3414236842941642e-07, "logits/chosen": 0.5630742311477661, "logits/rejected": 0.667353093624115, "logps/chosen": -516.9129028320312, "logps/rejected": -573.7291259765625, "loss": 0.6823, "nll_loss": 0.49215757846832275, "rewards/accuracies": 0.75, "rewards/chosen": 0.4989810883998871, "rewards/margins": 0.3567728102207184, "rewards/rejected": 0.1422082781791687, "step": 478 }, { "epoch": 0.4171192700412774, "grad_norm": 194.58681668955893, "learning_rate": 1.33604939321543e-07, "logits/chosen": 0.6577449440956116, "logits/rejected": 0.5744633674621582, "logps/chosen": -491.93792724609375, "logps/rejected": -494.926513671875, "loss": 0.6405, "nll_loss": 0.4891810417175293, "rewards/accuracies": 0.4375, "rewards/chosen": 0.275167852640152, "rewards/margins": 0.14811164140701294, "rewards/rejected": 0.12705622613430023, "step": 480 }, { "epoch": 0.41885726699978276, "grad_norm": 186.33475326894825, "learning_rate": 1.3306641379530512e-07, "logits/chosen": 0.2660292983055115, "logits/rejected": 0.3376016914844513, "logps/chosen": -442.4875793457031, "logps/rejected": -430.53155517578125, "loss": 0.5902, "nll_loss": 0.43695008754730225, "rewards/accuracies": 0.6875, "rewards/chosen": 0.5159265398979187, "rewards/margins": 0.4518337547779083, "rewards/rejected": 0.06409282982349396, "step": 482 }, { "epoch": 0.42059526395828806, "grad_norm": 161.1545478220916, "learning_rate": 1.3252680942101498e-07, "logits/chosen": 0.468216210603714, "logits/rejected": 0.6578527092933655, "logps/chosen": -402.4339599609375, "logps/rejected": -512.015380859375, "loss": 0.5966, "nll_loss": 0.40780964493751526, "rewards/accuracies": 0.625, "rewards/chosen": 0.41949987411499023, "rewards/margins": 0.2544829249382019, "rewards/rejected": 0.16501693427562714, "step": 484 }, { "epoch": 0.4223332609167934, "grad_norm": 268.6378897364134, "learning_rate": 1.3198614380418408e-07, "logits/chosen": 0.29518118500709534, "logits/rejected": 0.3277493417263031, "logps/chosen": -544.3590087890625, "logps/rejected": -565.6707153320312, "loss": 0.5767, "nll_loss": 0.5259649157524109, "rewards/accuracies": 0.6875, "rewards/chosen": 0.5081907510757446, "rewards/margins": 0.5027738213539124, "rewards/rejected": 0.005416871979832649, "step": 486 }, { "epoch": 0.4240712578752987, "grad_norm": 222.5034932088444, "learning_rate": 1.314444345849488e-07, "logits/chosen": 0.2683011293411255, "logits/rejected": 0.40357792377471924, "logps/chosen": -436.8942565917969, "logps/rejected": -519.6742553710938, "loss": 0.6378, "nll_loss": 0.4412961006164551, "rewards/accuracies": 0.6875, "rewards/chosen": 0.44710198044776917, "rewards/margins": 0.21643924713134766, "rewards/rejected": 0.2306627333164215, "step": 488 }, { "epoch": 0.42580925483380405, "grad_norm": 121.21919863292835, "learning_rate": 1.3090169943749475e-07, "logits/chosen": 0.395599365234375, "logits/rejected": 0.5225737690925598, "logps/chosen": -448.34661865234375, "logps/rejected": -471.4576416015625, "loss": 0.5857, "nll_loss": 0.4481515884399414, "rewards/accuracies": 0.5625, "rewards/chosen": 0.4304274022579193, "rewards/margins": 0.4703535735607147, "rewards/rejected": -0.039926156401634216, "step": 490 }, { "epoch": 0.42754725179230935, "grad_norm": 151.93906740828345, "learning_rate": 1.3035795606948021e-07, "logits/chosen": 0.4577603042125702, "logits/rejected": 0.3892001807689667, "logps/chosen": -455.62261962890625, "logps/rejected": -465.81658935546875, "loss": 0.5716, "nll_loss": 0.4209946393966675, "rewards/accuracies": 0.5625, "rewards/chosen": 0.4607146382331848, "rewards/margins": 0.20319023728370667, "rewards/rejected": 0.25752440094947815, "step": 492 }, { "epoch": 0.4292852487508147, "grad_norm": 213.10071130507995, "learning_rate": 1.2981322222145844e-07, "logits/chosen": 0.4598763585090637, "logits/rejected": 0.32266587018966675, "logps/chosen": -484.8750915527344, "logps/rejected": -435.47308349609375, "loss": 0.6497, "nll_loss": 0.4658326804637909, "rewards/accuracies": 0.625, "rewards/chosen": 0.3073493242263794, "rewards/margins": 0.3369705080986023, "rewards/rejected": -0.029621221125125885, "step": 494 }, { "epoch": 0.43102324570932, "grad_norm": 158.91277416486358, "learning_rate": 1.2926751566629875e-07, "logits/chosen": 0.23220184445381165, "logits/rejected": 0.3059692084789276, "logps/chosen": -456.1237487792969, "logps/rejected": -553.1903686523438, "loss": 0.6046, "nll_loss": 0.447183221578598, "rewards/accuracies": 0.8125, "rewards/chosen": 0.6733037829399109, "rewards/margins": 0.8882058262825012, "rewards/rejected": -0.2149021029472351, "step": 496 }, { "epoch": 0.43276124266782534, "grad_norm": 125.90740115164517, "learning_rate": 1.2872085420860664e-07, "logits/chosen": 0.23051905632019043, "logits/rejected": 0.19383393228054047, "logps/chosen": -480.57135009765625, "logps/rejected": -472.86993408203125, "loss": 0.577, "nll_loss": 0.46721819043159485, "rewards/accuracies": 0.8125, "rewards/chosen": 1.0732322931289673, "rewards/margins": 0.78470778465271, "rewards/rejected": 0.28852444887161255, "step": 498 }, { "epoch": 0.43449923962633064, "grad_norm": 211.1551689127061, "learning_rate": 1.2817325568414297e-07, "logits/chosen": 0.05189153552055359, "logits/rejected": 0.02408377081155777, "logps/chosen": -425.1849365234375, "logps/rejected": -487.15814208984375, "loss": 0.6655, "nll_loss": 0.4015406668186188, "rewards/accuracies": 0.5625, "rewards/chosen": 0.4995962083339691, "rewards/margins": 0.27894628047943115, "rewards/rejected": 0.22064992785453796, "step": 500 }, { "epoch": 0.436237236584836, "grad_norm": 186.1345026565842, "learning_rate": 1.2762473795924203e-07, "logits/chosen": 0.6582891941070557, "logits/rejected": 0.6144505739212036, "logps/chosen": -484.273193359375, "logps/rejected": -491.8813171386719, "loss": 0.6492, "nll_loss": 0.4544438123703003, "rewards/accuracies": 0.625, "rewards/chosen": 0.533101499080658, "rewards/margins": 0.2257290929555893, "rewards/rejected": 0.3073723614215851, "step": 502 }, { "epoch": 0.4379752335433413, "grad_norm": 160.01923268108317, "learning_rate": 1.2707531893022853e-07, "logits/chosen": 0.2920701205730438, "logits/rejected": 0.3716092109680176, "logps/chosen": -456.92645263671875, "logps/rejected": -524.2435302734375, "loss": 0.5366, "nll_loss": 0.4609117805957794, "rewards/accuracies": 0.6875, "rewards/chosen": 0.8777877688407898, "rewards/margins": 0.4967181384563446, "rewards/rejected": 0.3810696303844452, "step": 504 }, { "epoch": 0.43971323050184663, "grad_norm": 300.5169860605167, "learning_rate": 1.2652501652283377e-07, "logits/chosen": 0.2937391996383667, "logits/rejected": 0.3704921007156372, "logps/chosen": -476.27325439453125, "logps/rejected": -471.0837707519531, "loss": 0.6354, "nll_loss": 0.4966070353984833, "rewards/accuracies": 0.6875, "rewards/chosen": 0.5227044224739075, "rewards/margins": 0.35230910778045654, "rewards/rejected": 0.17039528489112854, "step": 506 }, { "epoch": 0.4414512274603519, "grad_norm": 164.91548254513887, "learning_rate": 1.2597384869161084e-07, "logits/chosen": 0.5253309011459351, "logits/rejected": 0.4753131568431854, "logps/chosen": -454.3218994140625, "logps/rejected": -472.0747985839844, "loss": 0.6169, "nll_loss": 0.43823111057281494, "rewards/accuracies": 0.6875, "rewards/chosen": 0.2607538402080536, "rewards/margins": 0.1995864063501358, "rewards/rejected": 0.06116743013262749, "step": 508 }, { "epoch": 0.4431892244188573, "grad_norm": 179.30441921156591, "learning_rate": 1.254218334193487e-07, "logits/chosen": 0.2752171754837036, "logits/rejected": 0.3763906955718994, "logps/chosen": -520.6530151367188, "logps/rejected": -562.1898193359375, "loss": 0.6609, "nll_loss": 0.525636613368988, "rewards/accuracies": 0.625, "rewards/chosen": 0.4572681486606598, "rewards/margins": 0.32413142919540405, "rewards/rejected": 0.13313673436641693, "step": 510 }, { "epoch": 0.44492722137736257, "grad_norm": 243.18039368915072, "learning_rate": 1.248689887164855e-07, "logits/chosen": 0.35667744278907776, "logits/rejected": 0.43777772784233093, "logps/chosen": -563.6502075195312, "logps/rejected": -510.6645812988281, "loss": 0.7224, "nll_loss": 0.5323516130447388, "rewards/accuracies": 0.5625, "rewards/chosen": 0.286058247089386, "rewards/margins": -0.11358994990587234, "rewards/rejected": 0.3996482193470001, "step": 512 }, { "epoch": 0.4466652183358679, "grad_norm": 181.5612438218865, "learning_rate": 1.2431533262052096e-07, "logits/chosen": 0.3660877048969269, "logits/rejected": 0.43879637122154236, "logps/chosen": -479.750244140625, "logps/rejected": -486.38720703125, "loss": 0.5727, "nll_loss": 0.4494829773902893, "rewards/accuracies": 0.8125, "rewards/chosen": 0.5008564591407776, "rewards/margins": 0.5270312428474426, "rewards/rejected": -0.02617482841014862, "step": 514 }, { "epoch": 0.4484032152943732, "grad_norm": 145.89465511544117, "learning_rate": 1.23760883195428e-07, "logits/chosen": 0.4242178797721863, "logits/rejected": 0.321554571390152, "logps/chosen": -525.2330932617188, "logps/rejected": -488.3087158203125, "loss": 0.5768, "nll_loss": 0.5024228096008301, "rewards/accuracies": 0.75, "rewards/chosen": 0.41718295216560364, "rewards/margins": 0.37589991092681885, "rewards/rejected": 0.04128303378820419, "step": 516 }, { "epoch": 0.45014121225287856, "grad_norm": 143.8495286961736, "learning_rate": 1.2320565853106316e-07, "logits/chosen": 0.22734031081199646, "logits/rejected": 0.05379747971892357, "logps/chosen": -566.68603515625, "logps/rejected": -463.33013916015625, "loss": 0.5937, "nll_loss": 0.5050527453422546, "rewards/accuracies": 0.6875, "rewards/chosen": 0.1884550154209137, "rewards/margins": 0.32553404569625854, "rewards/rejected": -0.13707906007766724, "step": 518 }, { "epoch": 0.45187920921138386, "grad_norm": 146.48961796321032, "learning_rate": 1.2264967674257644e-07, "logits/chosen": 0.302290141582489, "logits/rejected": 0.1225215271115303, "logps/chosen": -571.464111328125, "logps/rejected": -472.727783203125, "loss": 0.6082, "nll_loss": 0.506357729434967, "rewards/accuracies": 0.75, "rewards/chosen": 0.434457391500473, "rewards/margins": 0.5556737780570984, "rewards/rejected": -0.12121641635894775, "step": 520 }, { "epoch": 0.4536172061698892, "grad_norm": 196.9459868837508, "learning_rate": 1.220929559698204e-07, "logits/chosen": 0.3795379102230072, "logits/rejected": 0.3557262718677521, "logps/chosen": -468.3785400390625, "logps/rejected": -464.5234375, "loss": 0.6234, "nll_loss": 0.4429006576538086, "rewards/accuracies": 0.8125, "rewards/chosen": 0.4287263751029968, "rewards/margins": 0.4739219844341278, "rewards/rejected": -0.045195575803518295, "step": 522 }, { "epoch": 0.4553552031283945, "grad_norm": 150.5136352508016, "learning_rate": 1.2153551437675818e-07, "logits/chosen": 0.6236774325370789, "logits/rejected": 0.625653862953186, "logps/chosen": -456.63250732421875, "logps/rejected": -428.4668884277344, "loss": 0.6435, "nll_loss": 0.4418932795524597, "rewards/accuracies": 0.8125, "rewards/chosen": 0.4233695864677429, "rewards/margins": 0.24205005168914795, "rewards/rejected": 0.18131951987743378, "step": 524 }, { "epoch": 0.45709320008689985, "grad_norm": 129.14095160067973, "learning_rate": 1.2097737015087092e-07, "logits/chosen": 0.45213156938552856, "logits/rejected": 0.4116738438606262, "logps/chosen": -487.0008239746094, "logps/rejected": -510.26116943359375, "loss": 0.5883, "nll_loss": 0.4597417414188385, "rewards/accuracies": 0.625, "rewards/chosen": 0.3893871307373047, "rewards/margins": 0.24991761147975922, "rewards/rejected": 0.13946953415870667, "step": 526 }, { "epoch": 0.45883119704540515, "grad_norm": 159.8229027766876, "learning_rate": 1.2041854150256433e-07, "logits/chosen": 0.11591125279664993, "logits/rejected": 0.2681584060192108, "logps/chosen": -506.5346984863281, "logps/rejected": -575.7164916992188, "loss": 0.6024, "nll_loss": 0.474510133266449, "rewards/accuracies": 0.5625, "rewards/chosen": 0.5386300086975098, "rewards/margins": 0.23797369003295898, "rewards/rejected": 0.3006563186645508, "step": 528 }, { "epoch": 0.4605691940039105, "grad_norm": 129.70492097075226, "learning_rate": 1.1985904666457453e-07, "logits/chosen": 0.30116310715675354, "logits/rejected": 0.11452615261077881, "logps/chosen": -484.81524658203125, "logps/rejected": -478.3340148925781, "loss": 0.5955, "nll_loss": 0.480922132730484, "rewards/accuracies": 0.8125, "rewards/chosen": 0.7380178570747375, "rewards/margins": 0.6249958872795105, "rewards/rejected": 0.11302205920219421, "step": 530 }, { "epoch": 0.4623071909624158, "grad_norm": 151.28619497251262, "learning_rate": 1.1929890389137336e-07, "logits/chosen": 0.8342644572257996, "logits/rejected": 0.655091404914856, "logps/chosen": -472.6812438964844, "logps/rejected": -445.85791015625, "loss": 0.5429, "nll_loss": 0.45103174448013306, "rewards/accuracies": 0.6875, "rewards/chosen": 0.5036472678184509, "rewards/margins": 0.3290826082229614, "rewards/rejected": 0.1745646446943283, "step": 532 }, { "epoch": 0.46404518792092114, "grad_norm": 170.35845348035963, "learning_rate": 1.1873813145857248e-07, "logits/chosen": 0.28095149993896484, "logits/rejected": 0.2095193713903427, "logps/chosen": -461.6504821777344, "logps/rejected": -511.35821533203125, "loss": 0.5845, "nll_loss": 0.4316372573375702, "rewards/accuracies": 0.75, "rewards/chosen": 0.6642789244651794, "rewards/margins": 0.5230391621589661, "rewards/rejected": 0.14123974740505219, "step": 534 }, { "epoch": 0.46578318487942644, "grad_norm": 208.15877393203985, "learning_rate": 1.1817674766232732e-07, "logits/chosen": 0.2754078507423401, "logits/rejected": 0.2587903141975403, "logps/chosen": -457.4371337890625, "logps/rejected": -485.6224365234375, "loss": 0.6047, "nll_loss": 0.44060018658638, "rewards/accuracies": 0.6875, "rewards/chosen": 0.30926474928855896, "rewards/margins": 0.10683069378137589, "rewards/rejected": 0.20243406295776367, "step": 536 }, { "epoch": 0.4675211818379318, "grad_norm": 157.39469864638943, "learning_rate": 1.1761477081874014e-07, "logits/chosen": 0.30831941962242126, "logits/rejected": 0.2548404335975647, "logps/chosen": -510.2038269042969, "logps/rejected": -530.1707153320312, "loss": 0.6053, "nll_loss": 0.495877206325531, "rewards/accuracies": 0.6875, "rewards/chosen": 0.9113235473632812, "rewards/margins": 0.38738253712654114, "rewards/rejected": 0.5239410400390625, "step": 538 }, { "epoch": 0.4692591787964371, "grad_norm": 174.35032866933201, "learning_rate": 1.1705221926326239e-07, "logits/chosen": 0.8459619283676147, "logits/rejected": 0.6783990859985352, "logps/chosen": -508.0318908691406, "logps/rejected": -472.6733703613281, "loss": 0.6481, "nll_loss": 0.4781099557876587, "rewards/accuracies": 0.75, "rewards/chosen": 0.5047274231910706, "rewards/margins": 0.45192253589630127, "rewards/rejected": 0.0528048537671566, "step": 540 }, { "epoch": 0.47099717575494243, "grad_norm": 134.3749169975391, "learning_rate": 1.1648911135009633e-07, "logits/chosen": 0.2638280391693115, "logits/rejected": 0.32197749614715576, "logps/chosen": -502.83843994140625, "logps/rejected": -538.831787109375, "loss": 0.5802, "nll_loss": 0.4893040359020233, "rewards/accuracies": 0.8125, "rewards/chosen": 1.065590262413025, "rewards/margins": 0.8750788569450378, "rewards/rejected": 0.19051150977611542, "step": 542 }, { "epoch": 0.4727351727134477, "grad_norm": 145.88486424397541, "learning_rate": 1.1592546545159644e-07, "logits/chosen": 0.22533680498600006, "logits/rejected": 0.2013210654258728, "logps/chosen": -454.45269775390625, "logps/rejected": -471.0827941894531, "loss": 0.5753, "nll_loss": 0.4502761960029602, "rewards/accuracies": 0.5625, "rewards/chosen": 0.9982298016548157, "rewards/margins": 0.5919008851051331, "rewards/rejected": 0.40632885694503784, "step": 544 }, { "epoch": 0.4744731696719531, "grad_norm": 137.1178337205799, "learning_rate": 1.1536129995766994e-07, "logits/chosen": 0.35270434617996216, "logits/rejected": 0.29984721541404724, "logps/chosen": -538.6884765625, "logps/rejected": -470.1349792480469, "loss": 0.6014, "nll_loss": 0.4900573790073395, "rewards/accuracies": 0.5625, "rewards/chosen": 0.3891260325908661, "rewards/margins": -0.021922580897808075, "rewards/rejected": 0.41104862093925476, "step": 546 }, { "epoch": 0.47621116663045837, "grad_norm": 230.3870356848052, "learning_rate": 1.1479663327517666e-07, "logits/chosen": 0.44097521901130676, "logits/rejected": 0.223893404006958, "logps/chosen": -431.5259704589844, "logps/rejected": -425.232177734375, "loss": 0.6691, "nll_loss": 0.41176140308380127, "rewards/accuracies": 0.6875, "rewards/chosen": 0.251619815826416, "rewards/margins": 0.28884509205818176, "rewards/rejected": -0.03722524642944336, "step": 548 }, { "epoch": 0.4779491635889637, "grad_norm": 155.67191376098575, "learning_rate": 1.1423148382732852e-07, "logits/chosen": 0.4796018600463867, "logits/rejected": 0.38772526383399963, "logps/chosen": -464.15447998046875, "logps/rejected": -450.4916687011719, "loss": 0.5792, "nll_loss": 0.44901058077812195, "rewards/accuracies": 0.8125, "rewards/chosen": 0.4397455155849457, "rewards/margins": 0.4938681125640869, "rewards/rejected": -0.05412254482507706, "step": 550 }, { "epoch": 0.479687160547469, "grad_norm": 228.9713106548911, "learning_rate": 1.1366587005308857e-07, "logits/chosen": 0.339149534702301, "logits/rejected": 0.4987982511520386, "logps/chosen": -453.7382507324219, "logps/rejected": -449.76123046875, "loss": 0.5801, "nll_loss": 0.44411030411720276, "rewards/accuracies": 0.6875, "rewards/chosen": 0.3493182063102722, "rewards/margins": 0.255526602268219, "rewards/rejected": 0.09379158169031143, "step": 552 }, { "epoch": 0.48142515750597437, "grad_norm": 115.06327317172317, "learning_rate": 1.1309981040656929e-07, "logits/chosen": 0.34733298420906067, "logits/rejected": 0.48480936884880066, "logps/chosen": -473.6231994628906, "logps/rejected": -590.5986938476562, "loss": 0.6056, "nll_loss": 0.4536553919315338, "rewards/accuracies": 0.6875, "rewards/chosen": 0.29592639207839966, "rewards/margins": 0.6725630760192871, "rewards/rejected": -0.37663671374320984, "step": 554 }, { "epoch": 0.48316315446447966, "grad_norm": 169.34703454184748, "learning_rate": 1.1253332335643042e-07, "logits/chosen": 0.37676888704299927, "logits/rejected": 0.3015158772468567, "logps/chosen": -512.7796630859375, "logps/rejected": -499.06634521484375, "loss": 0.5603, "nll_loss": 0.4944426715373993, "rewards/accuracies": 0.875, "rewards/chosen": 0.7520829439163208, "rewards/margins": 0.9439869523048401, "rewards/rejected": -0.1919039934873581, "step": 556 }, { "epoch": 0.484901151422985, "grad_norm": 161.22134816206415, "learning_rate": 1.1196642738527657e-07, "logits/chosen": 0.23896373808383942, "logits/rejected": 0.19408001005649567, "logps/chosen": -426.60546875, "logps/rejected": -434.4737854003906, "loss": 0.5932, "nll_loss": 0.39752912521362305, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6780611872673035, "rewards/margins": 0.5683779120445251, "rewards/rejected": 0.1096833273768425, "step": 558 }, { "epoch": 0.4866391483814903, "grad_norm": 127.96582961604234, "learning_rate": 1.1139914098905406e-07, "logits/chosen": 0.5050298571586609, "logits/rejected": 0.3574158251285553, "logps/chosen": -476.66937255859375, "logps/rejected": -472.6229553222656, "loss": 0.4979, "nll_loss": 0.46670016646385193, "rewards/accuracies": 0.6875, "rewards/chosen": 0.2314346432685852, "rewards/margins": 0.6994683742523193, "rewards/rejected": -0.46803373098373413, "step": 560 }, { "epoch": 0.48837714533999566, "grad_norm": 223.9780474943852, "learning_rate": 1.1083148267644747e-07, "logits/chosen": 0.39628350734710693, "logits/rejected": 0.6703532338142395, "logps/chosen": -488.671142578125, "logps/rejected": -539.9094848632812, "loss": 0.6061, "nll_loss": 0.46648550033569336, "rewards/accuracies": 0.5, "rewards/chosen": 0.5457121133804321, "rewards/margins": 0.30177631974220276, "rewards/rejected": 0.24393577873706818, "step": 562 }, { "epoch": 0.490115142298501, "grad_norm": 158.92810889604107, "learning_rate": 1.1026347096827577e-07, "logits/chosen": 0.21636277437210083, "logits/rejected": 0.13212069869041443, "logps/chosen": -502.3656921386719, "logps/rejected": -433.3890686035156, "loss": 0.5821, "nll_loss": 0.47475725412368774, "rewards/accuracies": 0.8125, "rewards/chosen": 0.3836136758327484, "rewards/margins": 0.23296299576759338, "rewards/rejected": 0.15065070986747742, "step": 564 }, { "epoch": 0.4918531392570063, "grad_norm": 143.877679933503, "learning_rate": 1.0969512439688814e-07, "logits/chosen": 0.4711693227291107, "logits/rejected": 0.4417217969894409, "logps/chosen": -496.0475769042969, "logps/rejected": -466.2301940917969, "loss": 0.633, "nll_loss": 0.44707292318344116, "rewards/accuracies": 0.5625, "rewards/chosen": 0.36830979585647583, "rewards/margins": 0.029814153909683228, "rewards/rejected": 0.3384956419467926, "step": 566 }, { "epoch": 0.49359113621551165, "grad_norm": 142.79523446099884, "learning_rate": 1.0912646150555917e-07, "logits/chosen": 0.47549864649772644, "logits/rejected": 0.5703907608985901, "logps/chosen": -439.390625, "logps/rejected": -507.3600158691406, "loss": 0.6273, "nll_loss": 0.4341786503791809, "rewards/accuracies": 0.625, "rewards/chosen": 0.22542604804039001, "rewards/margins": 0.32407084107398987, "rewards/rejected": -0.09864482283592224, "step": 568 }, { "epoch": 0.49532913317401694, "grad_norm": 168.53592808344612, "learning_rate": 1.0855750084788397e-07, "logits/chosen": 0.13578006625175476, "logits/rejected": 0.22005786001682281, "logps/chosen": -495.08270263671875, "logps/rejected": -540.0595092773438, "loss": 0.6059, "nll_loss": 0.49262234568595886, "rewards/accuracies": 0.625, "rewards/chosen": 0.7732293605804443, "rewards/margins": 0.5452542304992676, "rewards/rejected": 0.22797507047653198, "step": 570 }, { "epoch": 0.4970671301325223, "grad_norm": 158.41658922638484, "learning_rate": 1.0798826098717275e-07, "logits/chosen": 0.18378670513629913, "logits/rejected": 0.14254061877727509, "logps/chosen": -484.80621337890625, "logps/rejected": -501.53265380859375, "loss": 0.6449, "nll_loss": 0.4623822271823883, "rewards/accuracies": 0.375, "rewards/chosen": 0.1718544214963913, "rewards/margins": 0.07776500284671783, "rewards/rejected": 0.09408941119909286, "step": 572 }, { "epoch": 0.4988051270910276, "grad_norm": 118.95912857829893, "learning_rate": 1.0741876049584522e-07, "logits/chosen": 0.46760082244873047, "logits/rejected": 0.6172095537185669, "logps/chosen": -450.90899658203125, "logps/rejected": -485.8258056640625, "loss": 0.6145, "nll_loss": 0.4425750970840454, "rewards/accuracies": 0.8125, "rewards/chosen": 0.4866788983345032, "rewards/margins": 0.5924919247627258, "rewards/rejected": -0.10581303387880325, "step": 574 }, { "epoch": 0.5005431240495329, "grad_norm": 149.05944822937784, "learning_rate": 1.0684901795482455e-07, "logits/chosen": 0.20234175026416779, "logits/rejected": 0.1904958337545395, "logps/chosen": -478.34521484375, "logps/rejected": -462.22735595703125, "loss": 0.6229, "nll_loss": 0.4496627151966095, "rewards/accuracies": 0.6875, "rewards/chosen": 0.3698406219482422, "rewards/margins": 0.26504114270210266, "rewards/rejected": 0.10479945689439774, "step": 576 }, { "epoch": 0.5022811210080382, "grad_norm": 191.23054812610573, "learning_rate": 1.0627905195293134e-07, "logits/chosen": 0.5189734697341919, "logits/rejected": 0.40094074606895447, "logps/chosen": -569.7152709960938, "logps/rejected": -532.6102905273438, "loss": 0.6364, "nll_loss": 0.5307385921478271, "rewards/accuracies": 0.75, "rewards/chosen": 0.7689107656478882, "rewards/margins": 0.6044713854789734, "rewards/rejected": 0.16443948447704315, "step": 578 }, { "epoch": 0.5040191179665435, "grad_norm": 138.79983096839496, "learning_rate": 1.057088810862768e-07, "logits/chosen": 0.4737722873687744, "logits/rejected": 0.5332717895507812, "logps/chosen": -517.5496215820312, "logps/rejected": -510.6952819824219, "loss": 0.6599, "nll_loss": 0.45416009426116943, "rewards/accuracies": 0.5625, "rewards/chosen": 0.5199866890907288, "rewards/margins": 0.19118711352348328, "rewards/rejected": 0.32879963517189026, "step": 580 }, { "epoch": 0.5057571149250489, "grad_norm": 164.12647001236164, "learning_rate": 1.051385239576563e-07, "logits/chosen": 0.7203247547149658, "logits/rejected": 0.5283986330032349, "logps/chosen": -516.7804565429688, "logps/rejected": -512.832275390625, "loss": 0.6016, "nll_loss": 0.4893644452095032, "rewards/accuracies": 0.75, "rewards/chosen": 0.4361743927001953, "rewards/margins": 0.40919187664985657, "rewards/rejected": 0.02698250487446785, "step": 582 }, { "epoch": 0.5074951118835542, "grad_norm": 183.72101725659599, "learning_rate": 1.0456799917594232e-07, "logits/chosen": 0.40470677614212036, "logits/rejected": 0.35310330986976624, "logps/chosen": -465.427978515625, "logps/rejected": -491.4359130859375, "loss": 0.624, "nll_loss": 0.44728025794029236, "rewards/accuracies": 0.625, "rewards/chosen": 0.47867053747177124, "rewards/margins": 0.2610442340373993, "rewards/rejected": 0.21762628853321075, "step": 584 }, { "epoch": 0.5092331088420595, "grad_norm": 155.45934240025235, "learning_rate": 1.0399732535547734e-07, "logits/chosen": 0.057142481207847595, "logits/rejected": 0.040528830140829086, "logps/chosen": -399.9873046875, "logps/rejected": -442.05029296875, "loss": 0.6404, "nll_loss": 0.42519205808639526, "rewards/accuracies": 0.8125, "rewards/chosen": 0.6752683520317078, "rewards/margins": 0.7227709889411926, "rewards/rejected": -0.04750242829322815, "step": 586 }, { "epoch": 0.5109711058005648, "grad_norm": 166.46533111008497, "learning_rate": 1.0342652111546635e-07, "logits/chosen": 0.2565877139568329, "logits/rejected": 0.3365154266357422, "logps/chosen": -478.84820556640625, "logps/rejected": -534.953857421875, "loss": 0.6102, "nll_loss": 0.4510883092880249, "rewards/accuracies": 0.6875, "rewards/chosen": 0.8399749398231506, "rewards/margins": 0.3826572000980377, "rewards/rejected": 0.4573177099227905, "step": 588 }, { "epoch": 0.5127091027590702, "grad_norm": 122.85256715814496, "learning_rate": 1.0285560507936961e-07, "logits/chosen": 0.4950277805328369, "logits/rejected": 0.5933247804641724, "logps/chosen": -484.6195068359375, "logps/rejected": -523.1106567382812, "loss": 0.6011, "nll_loss": 0.4561443626880646, "rewards/accuracies": 0.5625, "rewards/chosen": 0.41678088903427124, "rewards/margins": 0.32378941774368286, "rewards/rejected": 0.09299144148826599, "step": 590 }, { "epoch": 0.5144470997175755, "grad_norm": 190.64666153846562, "learning_rate": 1.0228459587429496e-07, "logits/chosen": 0.30040597915649414, "logits/rejected": 0.4183442294597626, "logps/chosen": -459.64776611328125, "logps/rejected": -459.33148193359375, "loss": 0.6286, "nll_loss": 0.4211972951889038, "rewards/accuracies": 0.5, "rewards/chosen": 0.7403571009635925, "rewards/margins": 0.3292871117591858, "rewards/rejected": 0.41106992959976196, "step": 592 }, { "epoch": 0.5161850966760808, "grad_norm": 128.71076382897667, "learning_rate": 1.0171351213038992e-07, "logits/chosen": 0.5232183337211609, "logits/rejected": 0.4865014851093292, "logps/chosen": -468.09246826171875, "logps/rejected": -496.3455505371094, "loss": 0.6034, "nll_loss": 0.45981934666633606, "rewards/accuracies": 0.6875, "rewards/chosen": 0.43860453367233276, "rewards/margins": 0.31035470962524414, "rewards/rejected": 0.128249853849411, "step": 594 }, { "epoch": 0.5179230936345861, "grad_norm": 213.50640814106367, "learning_rate": 1.0114237248023403e-07, "logits/chosen": 0.5549350380897522, "logits/rejected": 0.6062289476394653, "logps/chosen": -448.6082458496094, "logps/rejected": -485.8319396972656, "loss": 0.6498, "nll_loss": 0.4462430477142334, "rewards/accuracies": 0.75, "rewards/chosen": 0.49667686223983765, "rewards/margins": 0.1833028942346573, "rewards/rejected": 0.31337395310401917, "step": 596 }, { "epoch": 0.5196610905930915, "grad_norm": 185.87793569991007, "learning_rate": 1.0057119555823083e-07, "logits/chosen": 0.2625095248222351, "logits/rejected": 0.3052242398262024, "logps/chosen": -490.8760986328125, "logps/rejected": -475.52947998046875, "loss": 0.5803, "nll_loss": 0.4620268940925598, "rewards/accuracies": 0.75, "rewards/chosen": 0.6914440989494324, "rewards/margins": 0.6549934148788452, "rewards/rejected": 0.03645067289471626, "step": 598 }, { "epoch": 0.5213990875515968, "grad_norm": 263.05843652050453, "learning_rate": 1e-07, "logits/chosen": 0.5347599983215332, "logits/rejected": 0.6004009246826172, "logps/chosen": -468.2330017089844, "logps/rejected": -488.34033203125, "loss": 0.5876, "nll_loss": 0.44898730516433716, "rewards/accuracies": 0.8125, "rewards/chosen": 0.29759863018989563, "rewards/margins": 0.2130270004272461, "rewards/rejected": 0.08457164466381073, "step": 600 }, { "epoch": 0.5231370845101021, "grad_norm": 334.5831268901687, "learning_rate": 9.942880444176916e-08, "logits/chosen": 0.5655696988105774, "logits/rejected": 0.4727866053581238, "logps/chosen": -519.6710205078125, "logps/rejected": -482.19970703125, "loss": 0.6295, "nll_loss": 0.48799240589141846, "rewards/accuracies": 0.75, "rewards/chosen": 0.7050300240516663, "rewards/margins": 0.6107040643692017, "rewards/rejected": 0.09432603418827057, "step": 602 }, { "epoch": 0.5248750814686074, "grad_norm": 109.64482292970844, "learning_rate": 9.885762751976599e-08, "logits/chosen": 0.32183387875556946, "logits/rejected": 0.334905743598938, "logps/chosen": -468.80841064453125, "logps/rejected": -527.1265258789062, "loss": 0.5201, "nll_loss": 0.442211776971817, "rewards/accuracies": 0.6875, "rewards/chosen": 0.38873061537742615, "rewards/margins": 0.5435060858726501, "rewards/rejected": -0.154775470495224, "step": 604 }, { "epoch": 0.5266130784271128, "grad_norm": 186.41721141455656, "learning_rate": 9.828648786961007e-08, "logits/chosen": 0.3031446933746338, "logits/rejected": 0.3322385549545288, "logps/chosen": -510.8680114746094, "logps/rejected": -531.59521484375, "loss": 0.5985, "nll_loss": 0.5014962553977966, "rewards/accuracies": 0.5625, "rewards/chosen": 0.2036452293395996, "rewards/margins": 0.3037644624710083, "rewards/rejected": -0.1001192182302475, "step": 606 }, { "epoch": 0.5283510753856181, "grad_norm": 274.3454262806401, "learning_rate": 9.771540412570503e-08, "logits/chosen": 0.5674399733543396, "logits/rejected": 0.37741467356681824, "logps/chosen": -507.96551513671875, "logps/rejected": -557.3289184570312, "loss": 0.6339, "nll_loss": 0.4965820908546448, "rewards/accuracies": 0.75, "rewards/chosen": 0.540863037109375, "rewards/margins": 0.5740821957588196, "rewards/rejected": -0.033219143748283386, "step": 608 }, { "epoch": 0.5300890723441234, "grad_norm": 163.62183547762456, "learning_rate": 9.714439492063038e-08, "logits/chosen": 0.2575898766517639, "logits/rejected": 0.1513080596923828, "logps/chosen": -489.66778564453125, "logps/rejected": -457.0095520019531, "loss": 0.5504, "nll_loss": 0.44628310203552246, "rewards/accuracies": 0.75, "rewards/chosen": 0.3893031179904938, "rewards/margins": 0.4137801229953766, "rewards/rejected": -0.02447700686752796, "step": 610 }, { "epoch": 0.5318270693026287, "grad_norm": 125.74716419736642, "learning_rate": 9.657347888453366e-08, "logits/chosen": 0.6081162095069885, "logits/rejected": 0.40688732266426086, "logps/chosen": -536.954833984375, "logps/rejected": -493.84674072265625, "loss": 0.5403, "nll_loss": 0.5140741467475891, "rewards/accuracies": 0.625, "rewards/chosen": 0.22217179834842682, "rewards/margins": 0.21463699638843536, "rewards/rejected": 0.007534794509410858, "step": 612 }, { "epoch": 0.5335650662611341, "grad_norm": 151.84445184165773, "learning_rate": 9.600267464452268e-08, "logits/chosen": 0.4383659362792969, "logits/rejected": 0.3102107644081116, "logps/chosen": -534.6839599609375, "logps/rejected": -535.601318359375, "loss": 0.6385, "nll_loss": 0.4828362762928009, "rewards/accuracies": 0.6875, "rewards/chosen": 0.4167993366718292, "rewards/margins": 0.35233569145202637, "rewards/rejected": 0.06446360796689987, "step": 614 }, { "epoch": 0.5353030632196394, "grad_norm": 165.9235457287381, "learning_rate": 9.543200082405767e-08, "logits/chosen": 0.6091023683547974, "logits/rejected": 0.5879953503608704, "logps/chosen": -553.71142578125, "logps/rejected": -550.5123291015625, "loss": 0.5878, "nll_loss": 0.5072000622749329, "rewards/accuracies": 0.75, "rewards/chosen": 0.5500698089599609, "rewards/margins": 0.4482544958591461, "rewards/rejected": 0.10181531310081482, "step": 616 }, { "epoch": 0.5370410601781447, "grad_norm": 126.54308707325764, "learning_rate": 9.48614760423437e-08, "logits/chosen": 0.18392491340637207, "logits/rejected": 0.3804677128791809, "logps/chosen": -381.7693786621094, "logps/rejected": -473.5242919921875, "loss": 0.5744, "nll_loss": 0.39874348044395447, "rewards/accuracies": 0.75, "rewards/chosen": 0.4918847680091858, "rewards/margins": 0.5275743007659912, "rewards/rejected": -0.03568943962454796, "step": 618 }, { "epoch": 0.53877905713665, "grad_norm": 171.08443280226848, "learning_rate": 9.429111891372318e-08, "logits/chosen": 0.2717643678188324, "logits/rejected": 0.18191061913967133, "logps/chosen": -497.6204833984375, "logps/rejected": -487.7861022949219, "loss": 0.6364, "nll_loss": 0.45864543318748474, "rewards/accuracies": 0.75, "rewards/chosen": 0.40872687101364136, "rewards/margins": 0.272097647190094, "rewards/rejected": 0.13662928342819214, "step": 620 }, { "epoch": 0.5405170540951554, "grad_norm": 353.14665428046374, "learning_rate": 9.372094804706866e-08, "logits/chosen": 0.34724730253219604, "logits/rejected": 0.542032778263092, "logps/chosen": -481.6340026855469, "logps/rejected": -492.6419982910156, "loss": 0.5841, "nll_loss": 0.4751821458339691, "rewards/accuracies": 0.625, "rewards/chosen": 0.437094122171402, "rewards/margins": 0.47972115874290466, "rewards/rejected": -0.042627010494470596, "step": 622 }, { "epoch": 0.5422550510536607, "grad_norm": 181.9942371418194, "learning_rate": 9.315098204517542e-08, "logits/chosen": 0.5911327600479126, "logits/rejected": 0.686689019203186, "logps/chosen": -493.7080383300781, "logps/rejected": -526.79443359375, "loss": 0.6313, "nll_loss": 0.46785080432891846, "rewards/accuracies": 0.6875, "rewards/chosen": 0.5453550815582275, "rewards/margins": 0.2460251748561859, "rewards/rejected": 0.29932984709739685, "step": 624 }, { "epoch": 0.543993048012166, "grad_norm": 132.5432904481579, "learning_rate": 9.258123950415478e-08, "logits/chosen": 0.2856918275356293, "logits/rejected": 0.2954900860786438, "logps/chosen": -457.42034912109375, "logps/rejected": -460.74700927734375, "loss": 0.5104, "nll_loss": 0.4443623423576355, "rewards/accuracies": 0.75, "rewards/chosen": 0.6252665519714355, "rewards/margins": 0.5157196521759033, "rewards/rejected": 0.10954684019088745, "step": 626 }, { "epoch": 0.5457310449706713, "grad_norm": 162.89205752421685, "learning_rate": 9.201173901282723e-08, "logits/chosen": 0.27452006936073303, "logits/rejected": 0.3245624899864197, "logps/chosen": -469.73541259765625, "logps/rejected": -475.74957275390625, "loss": 0.6022, "nll_loss": 0.45277708768844604, "rewards/accuracies": 0.5, "rewards/chosen": 0.48880985379219055, "rewards/margins": 0.12186084687709808, "rewards/rejected": 0.3669489920139313, "step": 628 }, { "epoch": 0.5474690419291767, "grad_norm": 253.42365459394915, "learning_rate": 9.144249915211604e-08, "logits/chosen": 0.2531193494796753, "logits/rejected": 0.2789984941482544, "logps/chosen": -463.0240478515625, "logps/rejected": -504.46575927734375, "loss": 0.6221, "nll_loss": 0.49072834849357605, "rewards/accuracies": 0.6875, "rewards/chosen": 1.0013586282730103, "rewards/margins": 0.46089407801628113, "rewards/rejected": 0.540464460849762, "step": 630 }, { "epoch": 0.549207038887682, "grad_norm": 125.5789962477866, "learning_rate": 9.087353849444083e-08, "logits/chosen": 0.4191408157348633, "logits/rejected": 0.30525293946266174, "logps/chosen": -530.7289428710938, "logps/rejected": -490.19366455078125, "loss": 0.5603, "nll_loss": 0.5064207315444946, "rewards/accuracies": 0.625, "rewards/chosen": 0.2410431057214737, "rewards/margins": 0.24200935661792755, "rewards/rejected": -0.0009662678348831832, "step": 632 }, { "epoch": 0.5509450358461873, "grad_norm": 175.99284235626237, "learning_rate": 9.030487560311185e-08, "logits/chosen": 0.6235304474830627, "logits/rejected": 0.5203417539596558, "logps/chosen": -483.08404541015625, "logps/rejected": -489.843994140625, "loss": 0.5813, "nll_loss": 0.47011032700538635, "rewards/accuracies": 0.625, "rewards/chosen": 0.5235908627510071, "rewards/margins": 0.4106482267379761, "rewards/rejected": 0.11294259130954742, "step": 634 }, { "epoch": 0.5526830328046926, "grad_norm": 198.90559585429537, "learning_rate": 8.973652903172422e-08, "logits/chosen": 0.47140440344810486, "logits/rejected": 0.5593248605728149, "logps/chosen": -518.4012451171875, "logps/rejected": -533.6356201171875, "loss": 0.5951, "nll_loss": 0.4733196496963501, "rewards/accuracies": 0.6875, "rewards/chosen": 0.6831684112548828, "rewards/margins": 0.2774263620376587, "rewards/rejected": 0.4057420790195465, "step": 636 }, { "epoch": 0.554421029763198, "grad_norm": 141.98294190025055, "learning_rate": 8.916851732355253e-08, "logits/chosen": 0.415526419878006, "logits/rejected": 0.14838898181915283, "logps/chosen": -462.7410583496094, "logps/rejected": -434.5695495605469, "loss": 0.5801, "nll_loss": 0.43723177909851074, "rewards/accuracies": 0.6875, "rewards/chosen": 0.6801949143409729, "rewards/margins": 0.39234715700149536, "rewards/rejected": 0.28784769773483276, "step": 638 }, { "epoch": 0.5561590267217033, "grad_norm": 183.53135339200935, "learning_rate": 8.860085901094593e-08, "logits/chosen": 0.4905831813812256, "logits/rejected": 0.45162349939346313, "logps/chosen": -513.9711303710938, "logps/rejected": -480.98150634765625, "loss": 0.5924, "nll_loss": 0.4619283080101013, "rewards/accuracies": 0.5625, "rewards/chosen": 0.5028316378593445, "rewards/margins": 0.22538775205612183, "rewards/rejected": 0.27744388580322266, "step": 640 }, { "epoch": 0.5578970236802085, "grad_norm": 165.1047438993026, "learning_rate": 8.803357261472343e-08, "logits/chosen": 0.4632202386856079, "logits/rejected": 0.40927284955978394, "logps/chosen": -527.6860961914062, "logps/rejected": -524.5245361328125, "loss": 0.587, "nll_loss": 0.48514413833618164, "rewards/accuracies": 0.6875, "rewards/chosen": 0.688771665096283, "rewards/margins": 0.652374267578125, "rewards/rejected": 0.03639736771583557, "step": 642 }, { "epoch": 0.5596350206387138, "grad_norm": 144.25736006644212, "learning_rate": 8.746667664356956e-08, "logits/chosen": -0.019076313823461533, "logits/rejected": -0.005755473859608173, "logps/chosen": -463.5662536621094, "logps/rejected": -538.6168212890625, "loss": 0.5763, "nll_loss": 0.45522579550743103, "rewards/accuracies": 0.75, "rewards/chosen": 0.6912392377853394, "rewards/margins": 0.3665103018283844, "rewards/rejected": 0.32472896575927734, "step": 644 }, { "epoch": 0.5613730175972192, "grad_norm": 124.41344478131262, "learning_rate": 8.69001895934307e-08, "logits/chosen": 0.4203868806362152, "logits/rejected": 0.4716566205024719, "logps/chosen": -449.85791015625, "logps/rejected": -419.0743408203125, "loss": 0.5856, "nll_loss": 0.4255959987640381, "rewards/accuracies": 0.6875, "rewards/chosen": 0.426087349653244, "rewards/margins": 0.16484859585762024, "rewards/rejected": 0.2612387537956238, "step": 646 }, { "epoch": 0.5631110145557245, "grad_norm": 133.22640371919314, "learning_rate": 8.633412994691143e-08, "logits/chosen": 0.7127283215522766, "logits/rejected": 0.6688105463981628, "logps/chosen": -516.0383911132812, "logps/rejected": -493.16217041015625, "loss": 0.6169, "nll_loss": 0.489109069108963, "rewards/accuracies": 0.6875, "rewards/chosen": 0.5604671239852905, "rewards/margins": 0.549665629863739, "rewards/rejected": 0.010801505297422409, "step": 648 }, { "epoch": 0.5648490115142298, "grad_norm": 254.8501288063932, "learning_rate": 8.576851617267149e-08, "logits/chosen": 0.6733088493347168, "logits/rejected": 0.7629974484443665, "logps/chosen": -494.3887939453125, "logps/rejected": -501.49029541015625, "loss": 0.5545, "nll_loss": 0.47174495458602905, "rewards/accuracies": 0.5625, "rewards/chosen": 0.3747645318508148, "rewards/margins": 0.24593639373779297, "rewards/rejected": 0.12882815301418304, "step": 650 }, { "epoch": 0.5665870084727351, "grad_norm": 254.4364667759312, "learning_rate": 8.520336672482337e-08, "logits/chosen": 0.3818691074848175, "logits/rejected": 0.34968656301498413, "logps/chosen": -503.2013854980469, "logps/rejected": -476.4749755859375, "loss": 0.6643, "nll_loss": 0.44002267718315125, "rewards/accuracies": 0.5, "rewards/chosen": 0.4178128242492676, "rewards/margins": 0.0481499545276165, "rewards/rejected": 0.369662880897522, "step": 652 }, { "epoch": 0.5683250054312405, "grad_norm": 145.54967811889972, "learning_rate": 8.463870004233007e-08, "logits/chosen": 0.2866371273994446, "logits/rejected": 0.3002834618091583, "logps/chosen": -455.74786376953125, "logps/rejected": -458.2134704589844, "loss": 0.5675, "nll_loss": 0.46567031741142273, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6578887701034546, "rewards/margins": 0.5248188376426697, "rewards/rejected": 0.1330699920654297, "step": 654 }, { "epoch": 0.5700630023897458, "grad_norm": 187.06363223184863, "learning_rate": 8.407453454840356e-08, "logits/chosen": 0.5434020161628723, "logits/rejected": 0.4297879934310913, "logps/chosen": -485.2026062011719, "logps/rejected": -492.1148681640625, "loss": 0.5795, "nll_loss": 0.44286367297172546, "rewards/accuracies": 0.6875, "rewards/chosen": 0.6396047472953796, "rewards/margins": 0.44584396481513977, "rewards/rejected": 0.19376078248023987, "step": 656 }, { "epoch": 0.5718009993482511, "grad_norm": 150.6278217076987, "learning_rate": 8.351088864990367e-08, "logits/chosen": 0.23380723595619202, "logits/rejected": 0.4935187101364136, "logps/chosen": -443.7547607421875, "logps/rejected": -480.4429626464844, "loss": 0.5463, "nll_loss": 0.4307003319263458, "rewards/accuracies": 0.6875, "rewards/chosen": 0.7835605144500732, "rewards/margins": 0.7314401268959045, "rewards/rejected": 0.05212049186229706, "step": 658 }, { "epoch": 0.5735389963067564, "grad_norm": 152.2047609267045, "learning_rate": 8.294778073673762e-08, "logits/chosen": 0.33786603808403015, "logits/rejected": 0.34570395946502686, "logps/chosen": -413.5665588378906, "logps/rejected": -484.5573425292969, "loss": 0.5945, "nll_loss": 0.41921356320381165, "rewards/accuracies": 0.5625, "rewards/chosen": 0.41442328691482544, "rewards/margins": 0.3553313612937927, "rewards/rejected": 0.0590919554233551, "step": 660 }, { "epoch": 0.5752769932652618, "grad_norm": 98.79951835870942, "learning_rate": 8.238522918125983e-08, "logits/chosen": 0.5606307983398438, "logits/rejected": 0.5547394156455994, "logps/chosen": -492.9490966796875, "logps/rejected": -486.5016174316406, "loss": 0.5391, "nll_loss": 0.47447332739830017, "rewards/accuracies": 0.6875, "rewards/chosen": 0.7773496508598328, "rewards/margins": 0.3164869546890259, "rewards/rejected": 0.46086278557777405, "step": 662 }, { "epoch": 0.5770149902237671, "grad_norm": 139.0633092591687, "learning_rate": 8.182325233767267e-08, "logits/chosen": 0.4253976047039032, "logits/rejected": 0.46992579102516174, "logps/chosen": -450.5101623535156, "logps/rejected": -494.90667724609375, "loss": 0.571, "nll_loss": 0.45393383502960205, "rewards/accuracies": 0.75, "rewards/chosen": 0.4154530167579651, "rewards/margins": 0.47700321674346924, "rewards/rejected": -0.061550240963697433, "step": 664 }, { "epoch": 0.5787529871822724, "grad_norm": 173.14859519488306, "learning_rate": 8.126186854142751e-08, "logits/chosen": 0.8982985615730286, "logits/rejected": 0.8473838567733765, "logps/chosen": -465.6043701171875, "logps/rejected": -483.4315185546875, "loss": 0.6207, "nll_loss": 0.4442789554595947, "rewards/accuracies": 0.75, "rewards/chosen": 0.3262258768081665, "rewards/margins": 0.155735582113266, "rewards/rejected": 0.17049026489257812, "step": 666 }, { "epoch": 0.5804909841407777, "grad_norm": 128.41420443111352, "learning_rate": 8.070109610862667e-08, "logits/chosen": 0.16398797929286957, "logits/rejected": 0.08619170635938644, "logps/chosen": -456.3326721191406, "logps/rejected": -419.8201904296875, "loss": 0.5671, "nll_loss": 0.4239170253276825, "rewards/accuracies": 0.75, "rewards/chosen": 0.6617186665534973, "rewards/margins": 0.5650426745414734, "rewards/rejected": 0.09667597711086273, "step": 668 }, { "epoch": 0.5822289810992831, "grad_norm": 198.70109611531015, "learning_rate": 8.014095333542547e-08, "logits/chosen": 0.005634918808937073, "logits/rejected": -0.006585095077753067, "logps/chosen": -424.5329284667969, "logps/rejected": -469.5064392089844, "loss": 0.5497, "nll_loss": 0.43806880712509155, "rewards/accuracies": 0.9375, "rewards/chosen": 1.148248314857483, "rewards/margins": 0.7581798434257507, "rewards/rejected": 0.3900684714317322, "step": 670 }, { "epoch": 0.5839669780577884, "grad_norm": 200.8765720947795, "learning_rate": 7.958145849743569e-08, "logits/chosen": 0.38725346326828003, "logits/rejected": 0.328239381313324, "logps/chosen": -499.67547607421875, "logps/rejected": -477.58050537109375, "loss": 0.5503, "nll_loss": 0.44405287504196167, "rewards/accuracies": 0.75, "rewards/chosen": 0.4755365550518036, "rewards/margins": 0.31502342224121094, "rewards/rejected": 0.16051313281059265, "step": 672 }, { "epoch": 0.5857049750162937, "grad_norm": 138.10866585007568, "learning_rate": 7.902262984912909e-08, "logits/chosen": 0.5126460790634155, "logits/rejected": 0.457733154296875, "logps/chosen": -431.8909606933594, "logps/rejected": -441.8907775878906, "loss": 0.6765, "nll_loss": 0.42958134412765503, "rewards/accuracies": 0.75, "rewards/chosen": 0.5870010852813721, "rewards/margins": -0.019699443131685257, "rewards/rejected": 0.6067005395889282, "step": 674 }, { "epoch": 0.587442971974799, "grad_norm": 153.65852416714486, "learning_rate": 7.846448562324182e-08, "logits/chosen": 0.4234592318534851, "logits/rejected": 0.4593704640865326, "logps/chosen": -462.70745849609375, "logps/rejected": -477.0972900390625, "loss": 0.5945, "nll_loss": 0.45969316363334656, "rewards/accuracies": 0.6875, "rewards/chosen": 0.7346555590629578, "rewards/margins": 0.3743089735507965, "rewards/rejected": 0.36034661531448364, "step": 676 }, { "epoch": 0.5891809689333044, "grad_norm": 220.09425556349453, "learning_rate": 7.79070440301796e-08, "logits/chosen": 0.5279329419136047, "logits/rejected": 0.5622657537460327, "logps/chosen": -479.4163513183594, "logps/rejected": -521.3204956054688, "loss": 0.5814, "nll_loss": 0.43853023648262024, "rewards/accuracies": 0.5625, "rewards/chosen": 0.529964804649353, "rewards/margins": 0.25607913732528687, "rewards/rejected": 0.27388572692871094, "step": 678 }, { "epoch": 0.5909189658918097, "grad_norm": 114.49030691842641, "learning_rate": 7.735032325742354e-08, "logits/chosen": 0.414661169052124, "logits/rejected": 0.4720441997051239, "logps/chosen": -470.0740661621094, "logps/rejected": -428.37396240234375, "loss": 0.5653, "nll_loss": 0.44829848408699036, "rewards/accuracies": 0.75, "rewards/chosen": 0.6440432071685791, "rewards/margins": 0.4521419107913971, "rewards/rejected": 0.19190125167369843, "step": 680 }, { "epoch": 0.592656962850315, "grad_norm": 157.55052864927137, "learning_rate": 7.679434146893684e-08, "logits/chosen": 0.4875819683074951, "logits/rejected": 0.5448699593544006, "logps/chosen": -486.230712890625, "logps/rejected": -494.6131591796875, "loss": 0.657, "nll_loss": 0.4658043384552002, "rewards/accuracies": 0.6875, "rewards/chosen": 0.47913867235183716, "rewards/margins": 0.0011176057159900665, "rewards/rejected": 0.4780210852622986, "step": 682 }, { "epoch": 0.5943949598088203, "grad_norm": 106.99123250882965, "learning_rate": 7.623911680457198e-08, "logits/chosen": 0.538242518901825, "logits/rejected": 0.4498624801635742, "logps/chosen": -469.2276916503906, "logps/rejected": -468.6518859863281, "loss": 0.5517, "nll_loss": 0.42697572708129883, "rewards/accuracies": 0.625, "rewards/chosen": 0.4259525537490845, "rewards/margins": 0.28813114762306213, "rewards/rejected": 0.13782137632369995, "step": 684 }, { "epoch": 0.5961329567673257, "grad_norm": 136.23994501024254, "learning_rate": 7.568466737947903e-08, "logits/chosen": 0.48528510332107544, "logits/rejected": 0.5946138501167297, "logps/chosen": -511.0415954589844, "logps/rejected": -517.5130004882812, "loss": 0.5923, "nll_loss": 0.47626355290412903, "rewards/accuracies": 0.6875, "rewards/chosen": 0.5737752914428711, "rewards/margins": 0.3134174048900604, "rewards/rejected": 0.2603578567504883, "step": 686 }, { "epoch": 0.597870953725831, "grad_norm": 310.86088435708143, "learning_rate": 7.513101128351454e-08, "logits/chosen": 0.5474853515625, "logits/rejected": 0.589713454246521, "logps/chosen": -534.6466674804688, "logps/rejected": -520.8831787109375, "loss": 0.5814, "nll_loss": 0.5208612084388733, "rewards/accuracies": 0.6875, "rewards/chosen": 0.5015422701835632, "rewards/margins": 0.14514771103858948, "rewards/rejected": 0.35639458894729614, "step": 688 }, { "epoch": 0.5996089506843363, "grad_norm": 161.18501660847068, "learning_rate": 7.457816658065133e-08, "logits/chosen": 0.31886252760887146, "logits/rejected": 0.07821323722600937, "logps/chosen": -555.0598754882812, "logps/rejected": -465.5038146972656, "loss": 0.5469, "nll_loss": 0.4690946340560913, "rewards/accuracies": 0.8125, "rewards/chosen": 0.1816483438014984, "rewards/margins": 0.35304969549179077, "rewards/rejected": -0.17140133678913116, "step": 690 }, { "epoch": 0.6013469476428416, "grad_norm": 150.85271265312244, "learning_rate": 7.402615130838917e-08, "logits/chosen": 0.4454071521759033, "logits/rejected": 0.36207619309425354, "logps/chosen": -468.035888671875, "logps/rejected": -451.74267578125, "loss": 0.5798, "nll_loss": 0.4426875710487366, "rewards/accuracies": 0.8125, "rewards/chosen": 0.5169916152954102, "rewards/margins": 0.34321069717407227, "rewards/rejected": 0.1737809032201767, "step": 692 }, { "epoch": 0.603084944601347, "grad_norm": 182.8319718934074, "learning_rate": 7.347498347716624e-08, "logits/chosen": 0.4354170858860016, "logits/rejected": 0.30550816655158997, "logps/chosen": -432.18499755859375, "logps/rejected": -492.991455078125, "loss": 0.6565, "nll_loss": 0.4345720410346985, "rewards/accuracies": 0.625, "rewards/chosen": 0.3578968048095703, "rewards/margins": 0.11001966893672943, "rewards/rejected": 0.2478771209716797, "step": 694 }, { "epoch": 0.6048229415598523, "grad_norm": 343.8851119586624, "learning_rate": 7.292468106977147e-08, "logits/chosen": 0.5165270566940308, "logits/rejected": 0.6715250611305237, "logps/chosen": -415.6692810058594, "logps/rejected": -441.96710205078125, "loss": 0.5699, "nll_loss": 0.3966309428215027, "rewards/accuracies": 0.625, "rewards/chosen": 0.22180873155593872, "rewards/margins": 0.2036515325307846, "rewards/rejected": 0.018157199025154114, "step": 696 }, { "epoch": 0.6065609385183576, "grad_norm": 125.7461795731971, "learning_rate": 7.237526204075796e-08, "logits/chosen": 0.20688635110855103, "logits/rejected": 0.2710256576538086, "logps/chosen": -510.6375427246094, "logps/rejected": -526.5493774414062, "loss": 0.6146, "nll_loss": 0.4774847626686096, "rewards/accuracies": 0.6875, "rewards/chosen": 0.3681733012199402, "rewards/margins": 0.3653341829776764, "rewards/rejected": 0.002839110791683197, "step": 698 }, { "epoch": 0.6082989354768629, "grad_norm": 121.43572798576211, "learning_rate": 7.182674431585702e-08, "logits/chosen": 0.46929383277893066, "logits/rejected": 0.57563716173172, "logps/chosen": -447.2082824707031, "logps/rejected": -463.76959228515625, "loss": 0.571, "nll_loss": 0.4302436113357544, "rewards/accuracies": 0.8125, "rewards/chosen": 0.6159959435462952, "rewards/margins": 0.4504932761192322, "rewards/rejected": 0.16550272703170776, "step": 700 }, { "epoch": 0.6100369324353683, "grad_norm": 131.79731355818942, "learning_rate": 7.127914579139337e-08, "logits/chosen": 0.40889453887939453, "logits/rejected": 0.3319603204727173, "logps/chosen": -495.09344482421875, "logps/rejected": -504.8100891113281, "loss": 0.5684, "nll_loss": 0.47828787565231323, "rewards/accuracies": 0.75, "rewards/chosen": 0.6092842817306519, "rewards/margins": 0.6394984722137451, "rewards/rejected": -0.030214110389351845, "step": 702 }, { "epoch": 0.6117749293938736, "grad_norm": 155.45328672697394, "learning_rate": 7.073248433370124e-08, "logits/chosen": 0.49587368965148926, "logits/rejected": 0.5953259468078613, "logps/chosen": -512.6905517578125, "logps/rejected": -542.0206298828125, "loss": 0.6345, "nll_loss": 0.5072119832038879, "rewards/accuracies": 0.5625, "rewards/chosen": 0.3826178312301636, "rewards/margins": 0.1317063719034195, "rewards/rejected": 0.25091153383255005, "step": 704 }, { "epoch": 0.6135129263523789, "grad_norm": 205.7696417880005, "learning_rate": 7.018677777854158e-08, "logits/chosen": 0.6558414101600647, "logits/rejected": 0.5301958322525024, "logps/chosen": -487.62066650390625, "logps/rejected": -480.21832275390625, "loss": 0.5238, "nll_loss": 0.46764057874679565, "rewards/accuracies": 0.75, "rewards/chosen": 0.11372433602809906, "rewards/margins": 0.3404094874858856, "rewards/rejected": -0.22668513655662537, "step": 706 }, { "epoch": 0.6152509233108842, "grad_norm": 226.38982186124755, "learning_rate": 6.96420439305198e-08, "logits/chosen": 0.2674558460712433, "logits/rejected": 0.37489739060401917, "logps/chosen": -514.8885498046875, "logps/rejected": -553.451416015625, "loss": 0.6391, "nll_loss": 0.5216849446296692, "rewards/accuracies": 0.75, "rewards/chosen": 0.7858709096908569, "rewards/margins": 0.7879164814949036, "rewards/rejected": -0.00204562209546566, "step": 708 }, { "epoch": 0.6169889202693896, "grad_norm": 147.1625577529555, "learning_rate": 6.909830056250527e-08, "logits/chosen": 0.29546013474464417, "logits/rejected": 0.37205877900123596, "logps/chosen": -499.74700927734375, "logps/rejected": -556.3563232421875, "loss": 0.604, "nll_loss": 0.5126159191131592, "rewards/accuracies": 0.75, "rewards/chosen": 0.3689521551132202, "rewards/margins": 0.6727516055107117, "rewards/rejected": -0.30379945039749146, "step": 710 }, { "epoch": 0.6187269172278949, "grad_norm": 159.3173874474768, "learning_rate": 6.85555654150512e-08, "logits/chosen": 0.5298061370849609, "logits/rejected": 0.4939400553703308, "logps/chosen": -493.6993713378906, "logps/rejected": -460.180908203125, "loss": 0.6112, "nll_loss": 0.4606616795063019, "rewards/accuracies": 0.8125, "rewards/chosen": 0.23729343712329865, "rewards/margins": 0.29073992371559143, "rewards/rejected": -0.053446486592292786, "step": 712 }, { "epoch": 0.6204649141864002, "grad_norm": 139.47914568218025, "learning_rate": 6.801385619581591e-08, "logits/chosen": 0.3952076733112335, "logits/rejected": 0.41778475046157837, "logps/chosen": -437.45758056640625, "logps/rejected": -474.8389587402344, "loss": 0.5748, "nll_loss": 0.4331091046333313, "rewards/accuracies": 0.75, "rewards/chosen": 0.5372213125228882, "rewards/margins": 0.5590240359306335, "rewards/rejected": -0.021802805364131927, "step": 714 }, { "epoch": 0.6222029111449054, "grad_norm": 148.32834601314627, "learning_rate": 6.747319057898502e-08, "logits/chosen": 0.3858318626880646, "logits/rejected": 0.33923962712287903, "logps/chosen": -475.5868225097656, "logps/rejected": -507.0733947753906, "loss": 0.6073, "nll_loss": 0.48082295060157776, "rewards/accuracies": 0.6875, "rewards/chosen": 0.5214846134185791, "rewards/margins": 0.41246843338012695, "rewards/rejected": 0.10901622474193573, "step": 716 }, { "epoch": 0.6239409081034109, "grad_norm": 120.86725662420797, "learning_rate": 6.693358620469487e-08, "logits/chosen": 0.6276645660400391, "logits/rejected": 0.5396835803985596, "logps/chosen": -492.626708984375, "logps/rejected": -497.15673828125, "loss": 0.5857, "nll_loss": 0.45979419350624084, "rewards/accuracies": 0.75, "rewards/chosen": 0.10214034467935562, "rewards/margins": 0.233620747923851, "rewards/rejected": -0.1314803957939148, "step": 718 }, { "epoch": 0.6256789050619161, "grad_norm": 153.69685391073264, "learning_rate": 6.639506067845697e-08, "logits/chosen": 0.4492928683757782, "logits/rejected": 0.47506198287010193, "logps/chosen": -513.918701171875, "logps/rejected": -455.6263122558594, "loss": 0.5852, "nll_loss": 0.4903804361820221, "rewards/accuracies": 0.8125, "rewards/chosen": 0.22701072692871094, "rewards/margins": 0.4086184501647949, "rewards/rejected": -0.18160772323608398, "step": 720 }, { "epoch": 0.6274169020204214, "grad_norm": 144.74291732818253, "learning_rate": 6.585763157058357e-08, "logits/chosen": 0.24743977189064026, "logits/rejected": 0.24426200985908508, "logps/chosen": -513.9275512695312, "logps/rejected": -548.3668823242188, "loss": 0.6136, "nll_loss": 0.46442505717277527, "rewards/accuracies": 0.75, "rewards/chosen": 0.46324920654296875, "rewards/margins": 0.36793631315231323, "rewards/rejected": 0.09531288594007492, "step": 722 }, { "epoch": 0.6291548989789267, "grad_norm": 168.84797722833102, "learning_rate": 6.53213164156144e-08, "logits/chosen": 0.24420535564422607, "logits/rejected": 0.34088706970214844, "logps/chosen": -465.83038330078125, "logps/rejected": -463.24041748046875, "loss": 0.6246, "nll_loss": 0.4556388854980469, "rewards/accuracies": 0.75, "rewards/chosen": 0.6123493909835815, "rewards/margins": 0.12421969324350357, "rewards/rejected": 0.4881296157836914, "step": 724 }, { "epoch": 0.6308928959374321, "grad_norm": 145.95898719291733, "learning_rate": 6.478613271174452e-08, "logits/chosen": 0.46754446625709534, "logits/rejected": 0.21353721618652344, "logps/chosen": -462.1207275390625, "logps/rejected": -503.33172607421875, "loss": 0.5881, "nll_loss": 0.4296815097332001, "rewards/accuracies": 0.6875, "rewards/chosen": 0.552665650844574, "rewards/margins": 0.21230097115039825, "rewards/rejected": 0.34036463499069214, "step": 726 }, { "epoch": 0.6326308928959374, "grad_norm": 156.916802373185, "learning_rate": 6.425209792025357e-08, "logits/chosen": 0.34340569376945496, "logits/rejected": 0.42395177483558655, "logps/chosen": -422.3741455078125, "logps/rejected": -461.89788818359375, "loss": 0.5971, "nll_loss": 0.4330331087112427, "rewards/accuracies": 0.625, "rewards/chosen": 0.564942479133606, "rewards/margins": 0.31727665662765503, "rewards/rejected": 0.24766579270362854, "step": 728 }, { "epoch": 0.6343688898544427, "grad_norm": 163.70515499675665, "learning_rate": 6.371922946493591e-08, "logits/chosen": 0.5420715808868408, "logits/rejected": 0.6046915054321289, "logps/chosen": -519.7154541015625, "logps/rejected": -503.3428955078125, "loss": 0.6052, "nll_loss": 0.4930592477321625, "rewards/accuracies": 0.625, "rewards/chosen": 0.45887356996536255, "rewards/margins": 0.41354089975357056, "rewards/rejected": 0.04533272236585617, "step": 730 }, { "epoch": 0.636106886812948, "grad_norm": 173.00562897007376, "learning_rate": 6.31875447315322e-08, "logits/chosen": 0.5808181762695312, "logits/rejected": 0.5133619904518127, "logps/chosen": -490.7048645019531, "logps/rejected": -418.19482421875, "loss": 0.6095, "nll_loss": 0.45788317918777466, "rewards/accuracies": 0.625, "rewards/chosen": 0.42479199171066284, "rewards/margins": 0.07730991393327713, "rewards/rejected": 0.3474821150302887, "step": 732 }, { "epoch": 0.6378448837714534, "grad_norm": 160.63823512356848, "learning_rate": 6.26570610671622e-08, "logits/chosen": 0.14066378772258759, "logits/rejected": 0.2091757357120514, "logps/chosen": -434.40643310546875, "logps/rejected": -478.67877197265625, "loss": 0.5721, "nll_loss": 0.454807311296463, "rewards/accuracies": 0.875, "rewards/chosen": 0.7642690539360046, "rewards/margins": 0.6635603308677673, "rewards/rejected": 0.10070877522230148, "step": 734 }, { "epoch": 0.6395828807299587, "grad_norm": 264.0674555161591, "learning_rate": 6.212779577975869e-08, "logits/chosen": 0.3542450964450836, "logits/rejected": 0.41409796476364136, "logps/chosen": -478.570068359375, "logps/rejected": -481.2341613769531, "loss": 0.5887, "nll_loss": 0.4426587224006653, "rewards/accuracies": 0.9375, "rewards/chosen": 0.5770342350006104, "rewards/margins": 0.4736230969429016, "rewards/rejected": 0.10341110080480576, "step": 736 }, { "epoch": 0.641320877688464, "grad_norm": 180.89113715298822, "learning_rate": 6.159976613750286e-08, "logits/chosen": 0.10223013162612915, "logits/rejected": 0.2627410292625427, "logps/chosen": -476.6172180175781, "logps/rejected": -510.81207275390625, "loss": 0.5827, "nll_loss": 0.4346134662628174, "rewards/accuracies": 0.625, "rewards/chosen": 0.6227002739906311, "rewards/margins": 0.48594045639038086, "rewards/rejected": 0.13675986230373383, "step": 738 }, { "epoch": 0.6430588746469693, "grad_norm": 293.2245154471447, "learning_rate": 6.107298936826086e-08, "logits/chosen": 0.35400518774986267, "logits/rejected": 0.4170874357223511, "logps/chosen": -445.3726806640625, "logps/rejected": -474.6842346191406, "loss": 0.6212, "nll_loss": 0.43134331703186035, "rewards/accuracies": 0.625, "rewards/chosen": 0.33985626697540283, "rewards/margins": 0.22458821535110474, "rewards/rejected": 0.1152680367231369, "step": 740 }, { "epoch": 0.6447968716054747, "grad_norm": 131.0312166378181, "learning_rate": 6.05474826590217e-08, "logits/chosen": 0.29119187593460083, "logits/rejected": 0.3434115946292877, "logps/chosen": -446.6615295410156, "logps/rejected": -455.331298828125, "loss": 0.6125, "nll_loss": 0.4392196238040924, "rewards/accuracies": 0.8125, "rewards/chosen": 0.4968715310096741, "rewards/margins": 0.3695334494113922, "rewards/rejected": 0.12733803689479828, "step": 742 }, { "epoch": 0.64653486856398, "grad_norm": 162.16655463240252, "learning_rate": 6.002326315533664e-08, "logits/chosen": 0.3683479130268097, "logits/rejected": 0.37617531418800354, "logps/chosen": -577.240234375, "logps/rejected": -548.419189453125, "loss": 0.6629, "nll_loss": 0.49988922476768494, "rewards/accuracies": 0.6875, "rewards/chosen": 0.9272751212120056, "rewards/margins": 0.266749769449234, "rewards/rejected": 0.6605253219604492, "step": 744 }, { "epoch": 0.6482728655224853, "grad_norm": 195.13382275325625, "learning_rate": 5.950034796075947e-08, "logits/chosen": 0.45707762241363525, "logits/rejected": 0.3717419505119324, "logps/chosen": -519.5899047851562, "logps/rejected": -537.0008544921875, "loss": 0.5515, "nll_loss": 0.4879206418991089, "rewards/accuracies": 0.625, "rewards/chosen": 0.7020326256752014, "rewards/margins": 0.2489035725593567, "rewards/rejected": 0.45312899351119995, "step": 746 }, { "epoch": 0.6500108624809906, "grad_norm": 130.69654593671132, "learning_rate": 5.8978754136288835e-08, "logits/chosen": 0.6327035427093506, "logits/rejected": 0.49619409441947937, "logps/chosen": -479.45111083984375, "logps/rejected": -498.4273376464844, "loss": 0.5984, "nll_loss": 0.48115187883377075, "rewards/accuracies": 0.8125, "rewards/chosen": 0.6721794605255127, "rewards/margins": 0.6595268249511719, "rewards/rejected": 0.012652596458792686, "step": 748 }, { "epoch": 0.651748859439496, "grad_norm": 207.68261302866335, "learning_rate": 5.845849869981137e-08, "logits/chosen": 0.3563077449798584, "logits/rejected": 0.4740470051765442, "logps/chosen": -421.8052978515625, "logps/rejected": -470.5953674316406, "loss": 0.6263, "nll_loss": 0.41041111946105957, "rewards/accuracies": 0.5625, "rewards/chosen": 0.5872929096221924, "rewards/margins": 0.30345726013183594, "rewards/rejected": 0.28383558988571167, "step": 750 }, { "epoch": 0.6534868563980013, "grad_norm": 124.08937279199534, "learning_rate": 5.7939598625546516e-08, "logits/chosen": 0.4799199104309082, "logits/rejected": 0.43483075499534607, "logps/chosen": -505.39410400390625, "logps/rejected": -542.4945068359375, "loss": 0.5873, "nll_loss": 0.4946918785572052, "rewards/accuracies": 0.875, "rewards/chosen": 0.719025731086731, "rewards/margins": 0.553241491317749, "rewards/rejected": 0.16578426957130432, "step": 752 }, { "epoch": 0.6552248533565066, "grad_norm": 177.62311751052303, "learning_rate": 5.742207084349273e-08, "logits/chosen": 0.4672154486179352, "logits/rejected": 0.3542248606681824, "logps/chosen": -440.687744140625, "logps/rejected": -423.16729736328125, "loss": 0.6154, "nll_loss": 0.4290880858898163, "rewards/accuracies": 0.8125, "rewards/chosen": 0.3816303014755249, "rewards/margins": 0.4123232066631317, "rewards/rejected": -0.03069286048412323, "step": 754 }, { "epoch": 0.656962850315012, "grad_norm": 127.05979762459806, "learning_rate": 5.690593223887512e-08, "logits/chosen": 0.4858136773109436, "logits/rejected": 0.4847185015678406, "logps/chosen": -431.63433837890625, "logps/rejected": -431.2800598144531, "loss": 0.5734, "nll_loss": 0.4098186492919922, "rewards/accuracies": 0.6875, "rewards/chosen": 0.26508915424346924, "rewards/margins": 0.1980241984128952, "rewards/rejected": 0.06706495583057404, "step": 756 }, { "epoch": 0.6587008472735173, "grad_norm": 279.8735412918544, "learning_rate": 5.6391199651594454e-08, "logits/chosen": 0.5232665538787842, "logits/rejected": 0.6117486357688904, "logps/chosen": -460.03594970703125, "logps/rejected": -461.8345947265625, "loss": 0.7045, "nll_loss": 0.4505896270275116, "rewards/accuracies": 0.4375, "rewards/chosen": 0.1962445229291916, "rewards/margins": -0.15768270194530487, "rewards/rejected": 0.35392722487449646, "step": 758 }, { "epoch": 0.6604388442320226, "grad_norm": 300.2555521627166, "learning_rate": 5.587788987567784e-08, "logits/chosen": 0.3533879518508911, "logits/rejected": 0.43673956394195557, "logps/chosen": -450.2116394042969, "logps/rejected": -511.2435302734375, "loss": 0.6252, "nll_loss": 0.44547367095947266, "rewards/accuracies": 0.8125, "rewards/chosen": 0.4659990072250366, "rewards/margins": 0.5369496941566467, "rewards/rejected": -0.07095059752464294, "step": 760 }, { "epoch": 0.6621768411905279, "grad_norm": 188.81668805180584, "learning_rate": 5.536601965873082e-08, "logits/chosen": 0.3259649872779846, "logits/rejected": 0.4583315849304199, "logps/chosen": -417.64495849609375, "logps/rejected": -486.90435791015625, "loss": 0.6134, "nll_loss": 0.421085387468338, "rewards/accuracies": 0.875, "rewards/chosen": 0.5350974798202515, "rewards/margins": 0.56300950050354, "rewards/rejected": -0.027912046760320663, "step": 762 }, { "epoch": 0.6639148381490333, "grad_norm": 122.29581285365549, "learning_rate": 5.48556057013906e-08, "logits/chosen": 0.38365134596824646, "logits/rejected": 0.33106014132499695, "logps/chosen": -470.62261962890625, "logps/rejected": -456.76239013671875, "loss": 0.5941, "nll_loss": 0.4603612720966339, "rewards/accuracies": 0.6875, "rewards/chosen": 0.9042216539382935, "rewards/margins": 0.49135351181030273, "rewards/rejected": 0.4128681421279907, "step": 764 }, { "epoch": 0.6656528351075386, "grad_norm": 127.22170192937084, "learning_rate": 5.4346664656781746e-08, "logits/chosen": 0.1824665665626526, "logits/rejected": 0.241348996758461, "logps/chosen": -418.4898681640625, "logps/rejected": -481.45306396484375, "loss": 0.6468, "nll_loss": 0.4060194492340088, "rewards/accuracies": 0.9375, "rewards/chosen": 0.5272243618965149, "rewards/margins": 0.5564711689949036, "rewards/rejected": -0.029246818274259567, "step": 766 }, { "epoch": 0.6673908320660439, "grad_norm": 158.84153776593297, "learning_rate": 5.3839213129972416e-08, "logits/chosen": 0.6094731092453003, "logits/rejected": 0.5983626842498779, "logps/chosen": -471.9709777832031, "logps/rejected": -527.959228515625, "loss": 0.5736, "nll_loss": 0.45386606454849243, "rewards/accuracies": 0.75, "rewards/chosen": 0.6506131887435913, "rewards/margins": 0.5279781222343445, "rewards/rejected": 0.12263509631156921, "step": 768 }, { "epoch": 0.6691288290245492, "grad_norm": 211.77180871420688, "learning_rate": 5.3333267677432626e-08, "logits/chosen": 0.09929066896438599, "logits/rejected": 0.1294572353363037, "logps/chosen": -417.3568115234375, "logps/rejected": -521.1983032226562, "loss": 0.6173, "nll_loss": 0.38731706142425537, "rewards/accuracies": 0.75, "rewards/chosen": 0.46828895807266235, "rewards/margins": 0.2709873616695404, "rewards/rejected": 0.19730158150196075, "step": 770 }, { "epoch": 0.6708668259830546, "grad_norm": 142.68132898222117, "learning_rate": 5.282884480649435e-08, "logits/chosen": 0.40837544202804565, "logits/rejected": 0.5099937915802002, "logps/chosen": -450.0708923339844, "logps/rejected": -520.635986328125, "loss": 0.5429, "nll_loss": 0.4542206823825836, "rewards/accuracies": 0.875, "rewards/chosen": 0.79877108335495, "rewards/margins": 1.0893309116363525, "rewards/rejected": -0.290559858083725, "step": 772 }, { "epoch": 0.6726048229415599, "grad_norm": 177.9137771236567, "learning_rate": 5.232596097481251e-08, "logits/chosen": 0.7409548163414001, "logits/rejected": 0.6623491644859314, "logps/chosen": -542.9630126953125, "logps/rejected": -529.6650390625, "loss": 0.6094, "nll_loss": 0.4900684952735901, "rewards/accuracies": 0.6875, "rewards/chosen": 0.3313789367675781, "rewards/margins": 0.26346588134765625, "rewards/rejected": 0.06791305541992188, "step": 774 }, { "epoch": 0.6743428199000652, "grad_norm": 277.7385675447199, "learning_rate": 5.182463258982845e-08, "logits/chosen": 0.6070559620857239, "logits/rejected": 0.6075460910797119, "logps/chosen": -458.2026672363281, "logps/rejected": -470.3449401855469, "loss": 0.6355, "nll_loss": 0.45458412170410156, "rewards/accuracies": 0.625, "rewards/chosen": 0.22362647950649261, "rewards/margins": 0.20613884925842285, "rewards/rejected": 0.01748763769865036, "step": 776 }, { "epoch": 0.6760808168585705, "grad_norm": 155.68239224526974, "learning_rate": 5.1324876008234376e-08, "logits/chosen": 0.4934028387069702, "logits/rejected": 0.5032480955123901, "logps/chosen": -477.3890380859375, "logps/rejected": -495.6091613769531, "loss": 0.6027, "nll_loss": 0.4526577293872833, "rewards/accuracies": 0.75, "rewards/chosen": 0.3062657117843628, "rewards/margins": 0.4403870701789856, "rewards/rejected": -0.13412132859230042, "step": 778 }, { "epoch": 0.6778188138170759, "grad_norm": 235.05765374401005, "learning_rate": 5.082670753543961e-08, "logits/chosen": 0.29999101161956787, "logits/rejected": 0.34913432598114014, "logps/chosen": -572.618896484375, "logps/rejected": -585.3377075195312, "loss": 0.5941, "nll_loss": 0.5501593351364136, "rewards/accuracies": 0.6875, "rewards/chosen": 0.35720503330230713, "rewards/margins": 0.20051269233226776, "rewards/rejected": 0.15669232606887817, "step": 780 }, { "epoch": 0.6795568107755812, "grad_norm": 141.4371871807222, "learning_rate": 5.033014342503889e-08, "logits/chosen": 0.44081294536590576, "logits/rejected": 0.2912307679653168, "logps/chosen": -505.85687255859375, "logps/rejected": -521.6779174804688, "loss": 0.5362, "nll_loss": 0.5228245854377747, "rewards/accuracies": 0.625, "rewards/chosen": 0.3681758940219879, "rewards/margins": 0.6614881157875061, "rewards/rejected": -0.2933122515678406, "step": 782 }, { "epoch": 0.6812948077340865, "grad_norm": 188.6420459555021, "learning_rate": 4.983519987828176e-08, "logits/chosen": 0.4042893350124359, "logits/rejected": 0.2071767896413803, "logps/chosen": -557.4622802734375, "logps/rejected": -541.4765625, "loss": 0.579, "nll_loss": 0.5044111609458923, "rewards/accuracies": 0.625, "rewards/chosen": 0.31110334396362305, "rewards/margins": 0.44057944416999817, "rewards/rejected": -0.12947607040405273, "step": 784 }, { "epoch": 0.6830328046925918, "grad_norm": 471.248521332878, "learning_rate": 4.934189304354418e-08, "logits/chosen": 0.014058850705623627, "logits/rejected": 0.23235078155994415, "logps/chosen": -570.7021484375, "logps/rejected": -485.6100158691406, "loss": 0.5428, "nll_loss": 0.5105115175247192, "rewards/accuracies": 0.625, "rewards/chosen": 0.4729238748550415, "rewards/margins": 0.4060804843902588, "rewards/rejected": 0.0668434202671051, "step": 786 }, { "epoch": 0.6847708016510972, "grad_norm": 300.8320888757264, "learning_rate": 4.885023901580162e-08, "logits/chosen": 0.49230802059173584, "logits/rejected": 0.3687041401863098, "logps/chosen": -457.49017333984375, "logps/rejected": -393.0660705566406, "loss": 0.5854, "nll_loss": 0.4349403381347656, "rewards/accuracies": 0.8125, "rewards/chosen": 0.3268558382987976, "rewards/margins": 0.344466894865036, "rewards/rejected": -0.017611034214496613, "step": 788 }, { "epoch": 0.6865087986096025, "grad_norm": 134.60815880505996, "learning_rate": 4.8360253836103816e-08, "logits/chosen": 0.7422909736633301, "logits/rejected": 0.8082624673843384, "logps/chosen": -480.2214660644531, "logps/rejected": -494.79583740234375, "loss": 0.6049, "nll_loss": 0.46801847219467163, "rewards/accuracies": 0.6875, "rewards/chosen": 0.17905193567276, "rewards/margins": 0.21437975764274597, "rewards/rejected": -0.035327814519405365, "step": 790 }, { "epoch": 0.6882467955681077, "grad_norm": 236.02343374540018, "learning_rate": 4.7871953491051583e-08, "logits/chosen": 0.23191796243190765, "logits/rejected": 0.28292810916900635, "logps/chosen": -495.27935791015625, "logps/rejected": -456.4753112792969, "loss": 0.5821, "nll_loss": 0.4527099132537842, "rewards/accuracies": 0.6875, "rewards/chosen": 0.267295241355896, "rewards/margins": 0.5129619240760803, "rewards/rejected": -0.24566669762134552, "step": 792 }, { "epoch": 0.689984792526613, "grad_norm": 158.10216644449264, "learning_rate": 4.7385353912275164e-08, "logits/chosen": 0.43008583784103394, "logits/rejected": 0.5059869289398193, "logps/chosen": -479.2068176269531, "logps/rejected": -489.1475830078125, "loss": 0.6079, "nll_loss": 0.4439893960952759, "rewards/accuracies": 0.6875, "rewards/chosen": 0.11055446416139603, "rewards/margins": 0.3065342903137207, "rewards/rejected": -0.1959797739982605, "step": 794 }, { "epoch": 0.6917227894851184, "grad_norm": 340.4845104687888, "learning_rate": 4.6900470975914265e-08, "logits/chosen": 0.21972504258155823, "logits/rejected": 0.27979522943496704, "logps/chosen": -546.1092529296875, "logps/rejected": -536.8622436523438, "loss": 0.627, "nll_loss": 0.5137719511985779, "rewards/accuracies": 0.5, "rewards/chosen": 0.04987996444106102, "rewards/margins": 0.212679922580719, "rewards/rejected": -0.16280002892017365, "step": 796 }, { "epoch": 0.6934607864436237, "grad_norm": 243.2800060075489, "learning_rate": 4.641732050210031e-08, "logits/chosen": 0.42562809586524963, "logits/rejected": 0.3147584795951843, "logps/chosen": -463.6742858886719, "logps/rejected": -476.229248046875, "loss": 0.659, "nll_loss": 0.4361433982849121, "rewards/accuracies": 0.625, "rewards/chosen": 0.34175100922584534, "rewards/margins": 0.2627665400505066, "rewards/rejected": 0.07898445427417755, "step": 798 }, { "epoch": 0.695198783402129, "grad_norm": 114.92187652057805, "learning_rate": 4.5935918254440276e-08, "logits/chosen": 0.4270586669445038, "logits/rejected": 0.3975781798362732, "logps/chosen": -464.14251708984375, "logps/rejected": -462.9993591308594, "loss": 0.6274, "nll_loss": 0.4429488182067871, "rewards/accuracies": 0.75, "rewards/chosen": 0.21685649454593658, "rewards/margins": 0.5437389612197876, "rewards/rejected": -0.3268824815750122, "step": 800 }, { "epoch": 0.6969367803606343, "grad_norm": 217.41859804183937, "learning_rate": 4.5456279939502005e-08, "logits/chosen": 0.19082283973693848, "logits/rejected": 0.20748263597488403, "logps/chosen": -423.4697265625, "logps/rejected": -454.7181701660156, "loss": 0.564, "nll_loss": 0.42925646901130676, "rewards/accuracies": 0.75, "rewards/chosen": 0.28616124391555786, "rewards/margins": 0.2787553668022156, "rewards/rejected": 0.007405860349535942, "step": 802 }, { "epoch": 0.6986747773191397, "grad_norm": 236.65494312703478, "learning_rate": 4.4978421206302285e-08, "logits/chosen": 0.40193361043930054, "logits/rejected": 0.536785364151001, "logps/chosen": -477.677734375, "logps/rejected": -549.7373657226562, "loss": 0.6586, "nll_loss": 0.4303903579711914, "rewards/accuracies": 0.75, "rewards/chosen": 0.38318753242492676, "rewards/margins": 0.38266754150390625, "rewards/rejected": 0.0005199350416660309, "step": 804 }, { "epoch": 0.700412774277645, "grad_norm": 161.9548130973149, "learning_rate": 4.450235764579597e-08, "logits/chosen": 0.22347937524318695, "logits/rejected": 0.19105751812458038, "logps/chosen": -544.6937866210938, "logps/rejected": -513.24951171875, "loss": 0.5947, "nll_loss": 0.5026200413703918, "rewards/accuracies": 0.75, "rewards/chosen": 0.39928218722343445, "rewards/margins": 0.5983918309211731, "rewards/rejected": -0.19910964369773865, "step": 806 }, { "epoch": 0.7021507712361503, "grad_norm": 132.58473669179077, "learning_rate": 4.4028104790367246e-08, "logits/chosen": 0.4145227372646332, "logits/rejected": 0.3501809537410736, "logps/chosen": -469.0726318359375, "logps/rejected": -397.6184997558594, "loss": 0.553, "nll_loss": 0.43358322978019714, "rewards/accuracies": 0.625, "rewards/chosen": 0.21624755859375, "rewards/margins": 0.23603084683418274, "rewards/rejected": -0.019783303141593933, "step": 808 }, { "epoch": 0.7038887681946556, "grad_norm": 144.276213446629, "learning_rate": 4.35556781133231e-08, "logits/chosen": 0.31530264019966125, "logits/rejected": 0.5416185855865479, "logps/chosen": -465.4701843261719, "logps/rejected": -487.7655334472656, "loss": 0.576, "nll_loss": 0.4773712754249573, "rewards/accuracies": 0.8125, "rewards/chosen": 0.39206960797309875, "rewards/margins": 0.3798760175704956, "rewards/rejected": 0.012193584814667702, "step": 810 }, { "epoch": 0.705626765153161, "grad_norm": 201.39592042775223, "learning_rate": 4.3085093028388195e-08, "logits/chosen": 0.6023300886154175, "logits/rejected": 0.7180359363555908, "logps/chosen": -445.44024658203125, "logps/rejected": -480.2512512207031, "loss": 0.6235, "nll_loss": 0.44196388125419617, "rewards/accuracies": 0.8125, "rewards/chosen": 0.44572457671165466, "rewards/margins": 0.4922065734863281, "rewards/rejected": -0.04648199677467346, "step": 812 }, { "epoch": 0.7073647621116663, "grad_norm": 130.59523911447587, "learning_rate": 4.261636488920225e-08, "logits/chosen": 0.4116157293319702, "logits/rejected": 0.44143983721733093, "logps/chosen": -465.00439453125, "logps/rejected": -561.2508544921875, "loss": 0.5782, "nll_loss": 0.4379756450653076, "rewards/accuracies": 0.875, "rewards/chosen": 0.4334089457988739, "rewards/margins": 0.5009133815765381, "rewards/rejected": -0.06750450283288956, "step": 814 }, { "epoch": 0.7091027590701716, "grad_norm": 143.79978339467263, "learning_rate": 4.2149508988818916e-08, "logits/chosen": 0.130857452750206, "logits/rejected": 0.16600145399570465, "logps/chosen": -519.40234375, "logps/rejected": -530.9063720703125, "loss": 0.5812, "nll_loss": 0.49456048011779785, "rewards/accuracies": 0.6875, "rewards/chosen": 0.5553606748580933, "rewards/margins": 0.29760900139808655, "rewards/rejected": 0.2577517032623291, "step": 816 }, { "epoch": 0.7108407560286769, "grad_norm": 121.04385603346864, "learning_rate": 4.16845405592068e-08, "logits/chosen": 0.2572818100452423, "logits/rejected": 0.22149065136909485, "logps/chosen": -507.42364501953125, "logps/rejected": -510.28228759765625, "loss": 0.6038, "nll_loss": 0.4678362011909485, "rewards/accuracies": 0.5625, "rewards/chosen": 0.5002222061157227, "rewards/margins": 0.47260046005249023, "rewards/rejected": 0.027621760964393616, "step": 818 }, { "epoch": 0.7125787529871823, "grad_norm": 172.5337885064375, "learning_rate": 4.1221474770752695e-08, "logits/chosen": 0.44652074575424194, "logits/rejected": 0.506121814250946, "logps/chosen": -452.7835998535156, "logps/rejected": -468.171630859375, "loss": 0.5583, "nll_loss": 0.4320453405380249, "rewards/accuracies": 0.6875, "rewards/chosen": 0.23992222547531128, "rewards/margins": 0.2754116952419281, "rewards/rejected": -0.03548946604132652, "step": 820 }, { "epoch": 0.7143167499456876, "grad_norm": 149.04453299710758, "learning_rate": 4.076032673176637e-08, "logits/chosen": 0.12217839062213898, "logits/rejected": 0.15243466198444366, "logps/chosen": -455.9956359863281, "logps/rejected": -495.3397216796875, "loss": 0.5185, "nll_loss": 0.4414156675338745, "rewards/accuracies": 0.6875, "rewards/chosen": 0.5716139078140259, "rewards/margins": 0.510877788066864, "rewards/rejected": 0.060736075043678284, "step": 822 }, { "epoch": 0.7160547469041929, "grad_norm": 176.89851298361629, "learning_rate": 4.030111148798775e-08, "logits/chosen": 0.48356354236602783, "logits/rejected": 0.7054731249809265, "logps/chosen": -448.03448486328125, "logps/rejected": -508.13665771484375, "loss": 0.5923, "nll_loss": 0.44542497396469116, "rewards/accuracies": 0.625, "rewards/chosen": 0.30406779050827026, "rewards/margins": 0.30130958557128906, "rewards/rejected": 0.002758212387561798, "step": 824 }, { "epoch": 0.7177927438626982, "grad_norm": 146.8343812764771, "learning_rate": 3.984384402209613e-08, "logits/chosen": 0.4624355435371399, "logits/rejected": 0.5660164952278137, "logps/chosen": -412.0571594238281, "logps/rejected": -487.960693359375, "loss": 0.6451, "nll_loss": 0.4038446247577667, "rewards/accuracies": 0.625, "rewards/chosen": 0.3133462071418762, "rewards/margins": 0.08846140652894974, "rewards/rejected": 0.22488482296466827, "step": 826 }, { "epoch": 0.7195307408212036, "grad_norm": 111.15354720327095, "learning_rate": 3.938853925322117e-08, "logits/chosen": 0.6324234008789062, "logits/rejected": 0.6697845458984375, "logps/chosen": -496.340576171875, "logps/rejected": -503.02191162109375, "loss": 0.5639, "nll_loss": 0.48350197076797485, "rewards/accuracies": 0.8125, "rewards/chosen": 0.5550703406333923, "rewards/margins": 0.5358677506446838, "rewards/rejected": 0.019202616065740585, "step": 828 }, { "epoch": 0.7212687377797089, "grad_norm": 163.63438307124966, "learning_rate": 3.893521203645618e-08, "logits/chosen": 0.42804092168807983, "logits/rejected": 0.4747631549835205, "logps/chosen": -456.4293518066406, "logps/rejected": -470.9934997558594, "loss": 0.6167, "nll_loss": 0.4335935711860657, "rewards/accuracies": 0.5625, "rewards/chosen": 0.34340953826904297, "rewards/margins": 0.13007965683937073, "rewards/rejected": 0.21332989633083344, "step": 830 }, { "epoch": 0.7230067347382142, "grad_norm": 131.0966963549401, "learning_rate": 3.848387716237352e-08, "logits/chosen": 0.271542489528656, "logits/rejected": 0.29016950726509094, "logps/chosen": -370.72216796875, "logps/rejected": -441.3465881347656, "loss": 0.5434, "nll_loss": 0.3498722314834595, "rewards/accuracies": 0.5625, "rewards/chosen": 0.21746429800987244, "rewards/margins": 0.160241037607193, "rewards/rejected": 0.057223230600357056, "step": 832 }, { "epoch": 0.7247447316967195, "grad_norm": 142.61239618535805, "learning_rate": 3.803454935654189e-08, "logits/chosen": 0.4528628885746002, "logits/rejected": 0.4226664900779724, "logps/chosen": -575.0054321289062, "logps/rejected": -570.5166625976562, "loss": 0.6139, "nll_loss": 0.5149663090705872, "rewards/accuracies": 0.5, "rewards/chosen": 0.36920759081840515, "rewards/margins": 0.13114717602729797, "rewards/rejected": 0.2380603700876236, "step": 834 }, { "epoch": 0.7264827286552249, "grad_norm": 147.70343775287222, "learning_rate": 3.758724327904606e-08, "logits/chosen": 0.3783852756023407, "logits/rejected": 0.4000176191329956, "logps/chosen": -484.54962158203125, "logps/rejected": -538.7274169921875, "loss": 0.5264, "nll_loss": 0.47790783643722534, "rewards/accuracies": 0.8125, "rewards/chosen": 0.26448196172714233, "rewards/margins": 0.39010438323020935, "rewards/rejected": -0.1256224662065506, "step": 836 }, { "epoch": 0.7282207256137302, "grad_norm": 159.27478507683435, "learning_rate": 3.7141973524008486e-08, "logits/chosen": 0.5152249932289124, "logits/rejected": 0.44237661361694336, "logps/chosen": -493.9826354980469, "logps/rejected": -461.91424560546875, "loss": 0.6011, "nll_loss": 0.4731515944004059, "rewards/accuracies": 0.5625, "rewards/chosen": 0.03008108027279377, "rewards/margins": 0.2286914736032486, "rewards/rejected": -0.19861041009426117, "step": 838 }, { "epoch": 0.7299587225722355, "grad_norm": 160.37372417445502, "learning_rate": 3.669875461911297e-08, "logits/chosen": 0.38272225856781006, "logits/rejected": 0.32334208488464355, "logps/chosen": -453.45343017578125, "logps/rejected": -481.0965576171875, "loss": 0.591, "nll_loss": 0.46896225214004517, "rewards/accuracies": 0.6875, "rewards/chosen": 0.47221946716308594, "rewards/margins": 0.4862200915813446, "rewards/rejected": -0.014000609517097473, "step": 840 }, { "epoch": 0.7316967195307408, "grad_norm": 106.48540922978027, "learning_rate": 3.6257601025131026e-08, "logits/chosen": 0.36537909507751465, "logits/rejected": 0.4610544741153717, "logps/chosen": -495.3701171875, "logps/rejected": -536.6589965820312, "loss": 0.5434, "nll_loss": 0.49130648374557495, "rewards/accuracies": 0.75, "rewards/chosen": 0.46169281005859375, "rewards/margins": 0.5839080214500427, "rewards/rejected": -0.12221517413854599, "step": 842 }, { "epoch": 0.7334347164892462, "grad_norm": 184.64521993366662, "learning_rate": 3.581852713544983e-08, "logits/chosen": 0.6350924372673035, "logits/rejected": 0.6400173902511597, "logps/chosen": -572.7459716796875, "logps/rejected": -505.0904235839844, "loss": 0.6132, "nll_loss": 0.5641618967056274, "rewards/accuracies": 0.75, "rewards/chosen": 0.4518885910511017, "rewards/margins": 0.26884880661964417, "rewards/rejected": 0.18303976953029633, "step": 844 }, { "epoch": 0.7351727134477515, "grad_norm": 174.76319754971257, "learning_rate": 3.538154727560259e-08, "logits/chosen": 0.2232353538274765, "logits/rejected": 0.25405293703079224, "logps/chosen": -457.76251220703125, "logps/rejected": -487.740234375, "loss": 0.5885, "nll_loss": 0.44223538041114807, "rewards/accuracies": 0.625, "rewards/chosen": 0.3765263557434082, "rewards/margins": 0.2363949865102768, "rewards/rejected": 0.1401313841342926, "step": 846 }, { "epoch": 0.7369107104062568, "grad_norm": 157.0768940202763, "learning_rate": 3.494667570280132e-08, "logits/chosen": -0.08307046443223953, "logits/rejected": 0.09646856039762497, "logps/chosen": -433.7715759277344, "logps/rejected": -426.2318115234375, "loss": 0.5631, "nll_loss": 0.42717698216438293, "rewards/accuracies": 0.5, "rewards/chosen": 0.3394044041633606, "rewards/margins": 0.4069460332393646, "rewards/rejected": -0.06754161417484283, "step": 848 }, { "epoch": 0.7386487073647621, "grad_norm": 137.81378973300215, "learning_rate": 3.45139266054715e-08, "logits/chosen": 0.35806044936180115, "logits/rejected": 0.3277757167816162, "logps/chosen": -560.1826171875, "logps/rejected": -511.9388427734375, "loss": 0.5852, "nll_loss": 0.5097828507423401, "rewards/accuracies": 0.75, "rewards/chosen": 0.41566944122314453, "rewards/margins": 0.37488117814064026, "rewards/rejected": 0.04078827053308487, "step": 850 }, { "epoch": 0.7403867043232675, "grad_norm": 119.84840717683835, "learning_rate": 3.4083314102789284e-08, "logits/chosen": 0.35003048181533813, "logits/rejected": 0.371743768453598, "logps/chosen": -479.0580749511719, "logps/rejected": -506.8682861328125, "loss": 0.5487, "nll_loss": 0.4575774371623993, "rewards/accuracies": 0.75, "rewards/chosen": 0.3738093376159668, "rewards/margins": 0.49029579758644104, "rewards/rejected": -0.11648646742105484, "step": 852 }, { "epoch": 0.7421247012817728, "grad_norm": 182.48903338659457, "learning_rate": 3.365485224422082e-08, "logits/chosen": 0.18598990142345428, "logits/rejected": 0.40627986192703247, "logps/chosen": -486.7777099609375, "logps/rejected": -535.242919921875, "loss": 0.6485, "nll_loss": 0.48537081480026245, "rewards/accuracies": 0.5625, "rewards/chosen": 0.236476331949234, "rewards/margins": 0.49137353897094727, "rewards/rejected": -0.25489723682403564, "step": 854 }, { "epoch": 0.7438626982402781, "grad_norm": 148.23074057937546, "learning_rate": 3.322855500906373e-08, "logits/chosen": 0.30834490060806274, "logits/rejected": 0.46750015020370483, "logps/chosen": -521.8851928710938, "logps/rejected": -524.574462890625, "loss": 0.5825, "nll_loss": 0.4600994884967804, "rewards/accuracies": 0.6875, "rewards/chosen": 0.4887966215610504, "rewards/margins": 0.13246317207813263, "rewards/rejected": 0.3563334345817566, "step": 856 }, { "epoch": 0.7456006951987834, "grad_norm": 121.1714678839193, "learning_rate": 3.2804436305991215e-08, "logits/chosen": 0.4985821843147278, "logits/rejected": 0.4725096821784973, "logps/chosen": -417.9498291015625, "logps/rejected": -445.34869384765625, "loss": 0.5765, "nll_loss": 0.427943617105484, "rewards/accuracies": 0.6875, "rewards/chosen": 0.40432649850845337, "rewards/margins": 0.29832470417022705, "rewards/rejected": 0.10600186139345169, "step": 858 }, { "epoch": 0.7473386921572888, "grad_norm": 146.2456732980509, "learning_rate": 3.238250997259808e-08, "logits/chosen": 0.5070160627365112, "logits/rejected": 0.5410177707672119, "logps/chosen": -468.70452880859375, "logps/rejected": -484.58331298828125, "loss": 0.5232, "nll_loss": 0.4641054570674896, "rewards/accuracies": 1.0, "rewards/chosen": 0.548501193523407, "rewards/margins": 0.87884122133255, "rewards/rejected": -0.33034002780914307, "step": 860 }, { "epoch": 0.7490766891157941, "grad_norm": 243.85595162080867, "learning_rate": 3.196278977494934e-08, "logits/chosen": 0.47585529088974, "logits/rejected": 0.4665442407131195, "logps/chosen": -477.505126953125, "logps/rejected": -466.41522216796875, "loss": 0.5864, "nll_loss": 0.4726523756980896, "rewards/accuracies": 0.75, "rewards/chosen": 0.31335777044296265, "rewards/margins": 0.23028318583965302, "rewards/rejected": 0.08307457715272903, "step": 862 }, { "epoch": 0.7508146860742994, "grad_norm": 136.52590152347898, "learning_rate": 3.154528940713113e-08, "logits/chosen": 0.38702118396759033, "logits/rejected": 0.31475353240966797, "logps/chosen": -579.3184814453125, "logps/rejected": -510.0700378417969, "loss": 0.543, "nll_loss": 0.5211628079414368, "rewards/accuracies": 0.6875, "rewards/chosen": 0.3748409152030945, "rewards/margins": 0.3414745032787323, "rewards/rejected": 0.033366404473781586, "step": 864 }, { "epoch": 0.7525526830328046, "grad_norm": 215.66091059214304, "learning_rate": 3.113002249080385e-08, "logits/chosen": 0.5306642055511475, "logits/rejected": 0.6336250901222229, "logps/chosen": -506.5212707519531, "logps/rejected": -468.5569152832031, "loss": 0.5612, "nll_loss": 0.49830642342567444, "rewards/accuracies": 0.75, "rewards/chosen": 0.5931942462921143, "rewards/margins": 0.5846770405769348, "rewards/rejected": 0.008517175912857056, "step": 866 }, { "epoch": 0.75429067999131, "grad_norm": 192.10728454750495, "learning_rate": 3.071700257475768e-08, "logits/chosen": 0.28933534026145935, "logits/rejected": 0.4013511538505554, "logps/chosen": -544.5047607421875, "logps/rejected": -606.4002685546875, "loss": 0.589, "nll_loss": 0.5018710494041443, "rewards/accuracies": 0.9375, "rewards/chosen": 0.44215142726898193, "rewards/margins": 0.5153141617774963, "rewards/rejected": -0.07316265255212784, "step": 868 }, { "epoch": 0.7560286769498153, "grad_norm": 124.3982879280996, "learning_rate": 3.0306243134470664e-08, "logits/chosen": 0.40043580532073975, "logits/rejected": 0.3937510848045349, "logps/chosen": -486.595947265625, "logps/rejected": -502.1639099121094, "loss": 0.5233, "nll_loss": 0.4753631353378296, "rewards/accuracies": 0.9375, "rewards/chosen": 0.5435855388641357, "rewards/margins": 0.7956272959709167, "rewards/rejected": -0.25204169750213623, "step": 870 }, { "epoch": 0.7577666739083206, "grad_norm": 128.19135238725113, "learning_rate": 2.98977575716689e-08, "logits/chosen": 0.48589569330215454, "logits/rejected": 0.5582183003425598, "logps/chosen": -542.2434692382812, "logps/rejected": -537.6492919921875, "loss": 0.5619, "nll_loss": 0.536037027835846, "rewards/accuracies": 0.6875, "rewards/chosen": 0.1533937007188797, "rewards/margins": 0.3188681900501251, "rewards/rejected": -0.16547450423240662, "step": 872 }, { "epoch": 0.7595046708668259, "grad_norm": 151.61893281292282, "learning_rate": 2.9491559213889427e-08, "logits/chosen": 0.36808544397354126, "logits/rejected": 0.4860031306743622, "logps/chosen": -485.0714111328125, "logps/rejected": -491.4736328125, "loss": 0.5109, "nll_loss": 0.4829983711242676, "rewards/accuracies": 0.75, "rewards/chosen": 0.5121357440948486, "rewards/margins": 0.62371826171875, "rewards/rejected": -0.11158255487680435, "step": 874 }, { "epoch": 0.7612426678253313, "grad_norm": 127.68226210602406, "learning_rate": 2.9087661314045363e-08, "logits/chosen": 0.6009732484817505, "logits/rejected": 0.6443039774894714, "logps/chosen": -533.9920654296875, "logps/rejected": -519.9286499023438, "loss": 0.5595, "nll_loss": 0.5222257971763611, "rewards/accuracies": 0.8125, "rewards/chosen": 0.4094102680683136, "rewards/margins": 0.5059654712677002, "rewards/rejected": -0.09655513614416122, "step": 876 }, { "epoch": 0.7629806647838366, "grad_norm": 110.78779283676273, "learning_rate": 2.8686077049993285e-08, "logits/chosen": 0.33771321177482605, "logits/rejected": 0.21091528236865997, "logps/chosen": -484.62060546875, "logps/rejected": -501.81146240234375, "loss": 0.5437, "nll_loss": 0.4520571529865265, "rewards/accuracies": 0.75, "rewards/chosen": 0.4446027874946594, "rewards/margins": 0.4708292484283447, "rewards/rejected": -0.02622641623020172, "step": 878 }, { "epoch": 0.7647186617423419, "grad_norm": 211.63665958435504, "learning_rate": 2.8286819524103657e-08, "logits/chosen": -0.004198629409074783, "logits/rejected": 0.06533518433570862, "logps/chosen": -491.1811218261719, "logps/rejected": -475.9295349121094, "loss": 0.544, "nll_loss": 0.4399957060813904, "rewards/accuracies": 0.6875, "rewards/chosen": 0.7543421983718872, "rewards/margins": 0.9796266555786133, "rewards/rejected": -0.22528459131717682, "step": 880 }, { "epoch": 0.7664566587008472, "grad_norm": 126.89748945729015, "learning_rate": 2.788990176283308e-08, "logits/chosen": 0.22177600860595703, "logits/rejected": 0.21223066747188568, "logps/chosen": -427.07330322265625, "logps/rejected": -480.98004150390625, "loss": 0.5663, "nll_loss": 0.41825437545776367, "rewards/accuracies": 0.6875, "rewards/chosen": 0.5697661638259888, "rewards/margins": 0.44856926798820496, "rewards/rejected": 0.12119686603546143, "step": 882 }, { "epoch": 0.7681946556593526, "grad_norm": 135.4485548010094, "learning_rate": 2.749533671629931e-08, "logits/chosen": 0.26252371072769165, "logits/rejected": 0.3525621294975281, "logps/chosen": -411.34228515625, "logps/rejected": -412.50360107421875, "loss": 0.5851, "nll_loss": 0.4299415349960327, "rewards/accuracies": 0.625, "rewards/chosen": 0.5434117317199707, "rewards/margins": 0.4031442403793335, "rewards/rejected": 0.1402675211429596, "step": 884 }, { "epoch": 0.7699326526178579, "grad_norm": 143.65403593274095, "learning_rate": 2.7103137257858867e-08, "logits/chosen": 0.29264941811561584, "logits/rejected": 0.38839760422706604, "logps/chosen": -497.6834411621094, "logps/rejected": -466.5719909667969, "loss": 0.5404, "nll_loss": 0.470731258392334, "rewards/accuracies": 0.75, "rewards/chosen": 0.546400785446167, "rewards/margins": 0.5197809338569641, "rewards/rejected": 0.026619907468557358, "step": 886 }, { "epoch": 0.7716706495763632, "grad_norm": 241.2602615833436, "learning_rate": 2.6713316183686818e-08, "logits/chosen": 0.25704315304756165, "logits/rejected": 0.3444461524486542, "logps/chosen": -430.9483642578125, "logps/rejected": -454.0907287597656, "loss": 0.618, "nll_loss": 0.40751364827156067, "rewards/accuracies": 0.8125, "rewards/chosen": 0.5406338572502136, "rewards/margins": 0.3775385022163391, "rewards/rejected": 0.1630953848361969, "step": 888 }, { "epoch": 0.7734086465348685, "grad_norm": 136.35912359364139, "learning_rate": 2.6325886212359495e-08, "logits/chosen": 0.20821627974510193, "logits/rejected": 0.455917090177536, "logps/chosen": -482.72119140625, "logps/rejected": -532.3282470703125, "loss": 0.6048, "nll_loss": 0.47263607382774353, "rewards/accuracies": 0.875, "rewards/chosen": 0.4809028208255768, "rewards/margins": 0.5942395329475403, "rewards/rejected": -0.11333665251731873, "step": 890 }, { "epoch": 0.7751466434933739, "grad_norm": 204.0859099203338, "learning_rate": 2.594085998443942e-08, "logits/chosen": 0.19795550405979156, "logits/rejected": 0.26651012897491455, "logps/chosen": -402.7255554199219, "logps/rejected": -369.5584411621094, "loss": 0.5931, "nll_loss": 0.3987264037132263, "rewards/accuracies": 0.6875, "rewards/chosen": 0.39684203267097473, "rewards/margins": 0.32805705070495605, "rewards/rejected": 0.06878500431776047, "step": 892 }, { "epoch": 0.7768846404518792, "grad_norm": 298.79923705100447, "learning_rate": 2.5558250062062825e-08, "logits/chosen": 0.37592050433158875, "logits/rejected": 0.4358353912830353, "logps/chosen": -431.3149108886719, "logps/rejected": -449.6175537109375, "loss": 0.5963, "nll_loss": 0.4203760623931885, "rewards/accuracies": 0.5, "rewards/chosen": 0.22639721632003784, "rewards/margins": 0.13411150872707367, "rewards/rejected": 0.09228573739528656, "step": 894 }, { "epoch": 0.7786226374103845, "grad_norm": 141.8941048183387, "learning_rate": 2.5178068928529862e-08, "logits/chosen": 0.4105270206928253, "logits/rejected": 0.3826986253261566, "logps/chosen": -477.2858581542969, "logps/rejected": -447.1794128417969, "loss": 0.5449, "nll_loss": 0.4311331510543823, "rewards/accuracies": 0.75, "rewards/chosen": 0.37648773193359375, "rewards/margins": 0.42232033610343933, "rewards/rejected": -0.04583262652158737, "step": 896 }, { "epoch": 0.7803606343688898, "grad_norm": 112.16343660566953, "learning_rate": 2.4800328987897424e-08, "logits/chosen": 0.30149951577186584, "logits/rejected": 0.30680081248283386, "logps/chosen": -499.3365478515625, "logps/rejected": -523.8584594726562, "loss": 0.5376, "nll_loss": 0.4783164858818054, "rewards/accuracies": 0.8125, "rewards/chosen": 0.4257345199584961, "rewards/margins": 0.6091545820236206, "rewards/rejected": -0.1834200918674469, "step": 898 }, { "epoch": 0.7820986313273952, "grad_norm": 159.67498326737996, "learning_rate": 2.4425042564574183e-08, "logits/chosen": 0.5600635409355164, "logits/rejected": 0.5486436486244202, "logps/chosen": -550.2770385742188, "logps/rejected": -540.26708984375, "loss": 0.5726, "nll_loss": 0.5308613777160645, "rewards/accuracies": 0.6875, "rewards/chosen": 0.30320072174072266, "rewards/margins": 0.46356379985809326, "rewards/rejected": -0.1603630930185318, "step": 900 }, { "epoch": 0.7838366282859005, "grad_norm": 145.85928653023169, "learning_rate": 2.4052221902918722e-08, "logits/chosen": 0.2439267635345459, "logits/rejected": 0.27589860558509827, "logps/chosen": -450.7904968261719, "logps/rejected": -479.41851806640625, "loss": 0.5508, "nll_loss": 0.4363137185573578, "rewards/accuracies": 0.8125, "rewards/chosen": 0.478568971157074, "rewards/margins": 0.4524759352207184, "rewards/rejected": 0.026093004271388054, "step": 902 }, { "epoch": 0.7855746252444058, "grad_norm": 142.8997584918822, "learning_rate": 2.3681879166839968e-08, "logits/chosen": 0.29706910252571106, "logits/rejected": 0.4273567497730255, "logps/chosen": -420.8601379394531, "logps/rejected": -473.61578369140625, "loss": 0.5634, "nll_loss": 0.4160003364086151, "rewards/accuracies": 0.625, "rewards/chosen": 0.29417192935943604, "rewards/margins": 0.25704747438430786, "rewards/rejected": 0.03712444752454758, "step": 904 }, { "epoch": 0.7873126222029111, "grad_norm": 135.7032714707856, "learning_rate": 2.3314026439400215e-08, "logits/chosen": 0.42345553636550903, "logits/rejected": 0.11951038241386414, "logps/chosen": -567.097412109375, "logps/rejected": -528.78857421875, "loss": 0.6175, "nll_loss": 0.5222797393798828, "rewards/accuracies": 0.5, "rewards/chosen": 0.2827926278114319, "rewards/margins": 0.14894521236419678, "rewards/rejected": 0.1338474154472351, "step": 906 }, { "epoch": 0.7890506191614165, "grad_norm": 191.37433067018546, "learning_rate": 2.2948675722421085e-08, "logits/chosen": 0.38801103830337524, "logits/rejected": 0.19866381585597992, "logps/chosen": -453.0460205078125, "logps/rejected": -493.8358459472656, "loss": 0.539, "nll_loss": 0.45135483145713806, "rewards/accuracies": 0.8125, "rewards/chosen": 0.5596899390220642, "rewards/margins": 0.6123340725898743, "rewards/rejected": -0.05264415964484215, "step": 908 }, { "epoch": 0.7907886161199218, "grad_norm": 152.5312135549706, "learning_rate": 2.258583893609175e-08, "logits/chosen": 0.3808496594429016, "logits/rejected": 0.5132696032524109, "logps/chosen": -510.3198547363281, "logps/rejected": -533.71923828125, "loss": 0.561, "nll_loss": 0.48955419659614563, "rewards/accuracies": 0.8125, "rewards/chosen": 0.43191957473754883, "rewards/margins": 0.5486711263656616, "rewards/rejected": -0.11675149202346802, "step": 910 }, { "epoch": 0.7925266130784271, "grad_norm": 286.1777750377432, "learning_rate": 2.22255279185802e-08, "logits/chosen": 0.38537973165512085, "logits/rejected": 0.5917662382125854, "logps/chosen": -465.35479736328125, "logps/rejected": -465.9227294921875, "loss": 0.607, "nll_loss": 0.44577518105506897, "rewards/accuracies": 0.5, "rewards/chosen": 0.3478468060493469, "rewards/margins": 0.02361936867237091, "rewards/rejected": 0.3242274224758148, "step": 912 }, { "epoch": 0.7942646100369324, "grad_norm": 197.55059622588414, "learning_rate": 2.1867754425646922e-08, "logits/chosen": 0.34537068009376526, "logits/rejected": 0.22164693474769592, "logps/chosen": -520.771728515625, "logps/rejected": -554.5394897460938, "loss": 0.5538, "nll_loss": 0.5084543228149414, "rewards/accuracies": 0.75, "rewards/chosen": 0.501037061214447, "rewards/margins": 0.5809021592140198, "rewards/rejected": -0.07986507564783096, "step": 914 }, { "epoch": 0.7960026069954378, "grad_norm": 132.4136998604499, "learning_rate": 2.1512530130261208e-08, "logits/chosen": 0.2082512080669403, "logits/rejected": 0.25798577070236206, "logps/chosen": -421.710693359375, "logps/rejected": -446.8322448730469, "loss": 0.5382, "nll_loss": 0.41912373900413513, "rewards/accuracies": 0.75, "rewards/chosen": 0.8833374381065369, "rewards/margins": 0.6493025422096252, "rewards/rejected": 0.23403486609458923, "step": 916 }, { "epoch": 0.7977406039539431, "grad_norm": 163.69318648505143, "learning_rate": 2.115986662222058e-08, "logits/chosen": 0.6591068506240845, "logits/rejected": 0.4656745195388794, "logps/chosen": -488.3570251464844, "logps/rejected": -459.34912109375, "loss": 0.5925, "nll_loss": 0.46167752146720886, "rewards/accuracies": 0.5, "rewards/chosen": 0.23973506689071655, "rewards/margins": 0.22639313340187073, "rewards/rejected": 0.013341886922717094, "step": 918 }, { "epoch": 0.7994786009124484, "grad_norm": 125.35380400867658, "learning_rate": 2.08097754077725e-08, "logits/chosen": 0.26784810423851013, "logits/rejected": 0.31632742285728455, "logps/chosen": -488.3423156738281, "logps/rejected": -538.6160888671875, "loss": 0.5601, "nll_loss": 0.4677424132823944, "rewards/accuracies": 0.75, "rewards/chosen": 0.5197944641113281, "rewards/margins": 0.5179188251495361, "rewards/rejected": 0.0018756985664367676, "step": 920 }, { "epoch": 0.8012165978709537, "grad_norm": 122.65155967179511, "learning_rate": 2.0462267909238895e-08, "logits/chosen": 0.4603986144065857, "logits/rejected": 0.4427664279937744, "logps/chosen": -501.7173767089844, "logps/rejected": -440.8707580566406, "loss": 0.5563, "nll_loss": 0.4742586314678192, "rewards/accuracies": 0.625, "rewards/chosen": 0.3994339108467102, "rewards/margins": 0.303816020488739, "rewards/rejected": 0.0956178605556488, "step": 922 }, { "epoch": 0.8029545948294591, "grad_norm": 181.17211501585948, "learning_rate": 2.0117355464643647e-08, "logits/chosen": 0.2570352554321289, "logits/rejected": 0.364309161901474, "logps/chosen": -520.6018676757812, "logps/rejected": -540.8889770507812, "loss": 0.5815, "nll_loss": 0.473418653011322, "rewards/accuracies": 0.75, "rewards/chosen": 0.8413310647010803, "rewards/margins": 0.6333141326904297, "rewards/rejected": 0.20801697671413422, "step": 924 }, { "epoch": 0.8046925917879644, "grad_norm": 148.36579795161384, "learning_rate": 1.9775049327342486e-08, "logits/chosen": 0.3666941225528717, "logits/rejected": 0.26878297328948975, "logps/chosen": -418.0342102050781, "logps/rejected": -489.3192443847656, "loss": 0.6069, "nll_loss": 0.40862399339675903, "rewards/accuracies": 0.6875, "rewards/chosen": 0.539859414100647, "rewards/margins": 0.32572096586227417, "rewards/rejected": 0.2141384333372116, "step": 926 }, { "epoch": 0.8064305887464697, "grad_norm": 233.14469000909418, "learning_rate": 1.9435360665656033e-08, "logits/chosen": 0.26835137605667114, "logits/rejected": 0.22353360056877136, "logps/chosen": -529.213134765625, "logps/rejected": -506.2857360839844, "loss": 0.5749, "nll_loss": 0.5105596780776978, "rewards/accuracies": 0.75, "rewards/chosen": 0.7156704664230347, "rewards/margins": 0.6623274087905884, "rewards/rejected": 0.053343020379543304, "step": 928 }, { "epoch": 0.808168585704975, "grad_norm": 177.62086603981126, "learning_rate": 1.9098300562505266e-08, "logits/chosen": 0.5697555541992188, "logits/rejected": 0.5495641827583313, "logps/chosen": -498.977783203125, "logps/rejected": -495.72662353515625, "loss": 0.6068, "nll_loss": 0.5025020241737366, "rewards/accuracies": 0.5625, "rewards/chosen": 0.5968002080917358, "rewards/margins": 0.5186977982521057, "rewards/rejected": 0.07810239493846893, "step": 930 }, { "epoch": 0.8099065826634804, "grad_norm": 147.90397895312702, "learning_rate": 1.876388001504995e-08, "logits/chosen": 0.6278223395347595, "logits/rejected": 0.6403146386146545, "logps/chosen": -466.76763916015625, "logps/rejected": -463.7005310058594, "loss": 0.5613, "nll_loss": 0.4527406096458435, "rewards/accuracies": 0.6875, "rewards/chosen": 0.2836538255214691, "rewards/margins": 0.3506113886833191, "rewards/rejected": -0.06695757806301117, "step": 932 }, { "epoch": 0.8116445796219857, "grad_norm": 154.72657331184226, "learning_rate": 1.843210993432983e-08, "logits/chosen": 0.4768508970737457, "logits/rejected": 0.4194488823413849, "logps/chosen": -484.4299621582031, "logps/rejected": -480.65234375, "loss": 0.5582, "nll_loss": 0.49351003766059875, "rewards/accuracies": 0.8125, "rewards/chosen": 0.5547365546226501, "rewards/margins": 0.6049469113349915, "rewards/rejected": -0.0502103790640831, "step": 934 }, { "epoch": 0.813382576580491, "grad_norm": 187.5308682243756, "learning_rate": 1.8103001144908746e-08, "logits/chosen": 0.406520813703537, "logits/rejected": 0.56972736120224, "logps/chosen": -411.3998107910156, "logps/rejected": -491.790771484375, "loss": 0.6176, "nll_loss": 0.4119413495063782, "rewards/accuracies": 0.875, "rewards/chosen": 0.6432211399078369, "rewards/margins": 0.5553202629089355, "rewards/rejected": 0.08790083229541779, "step": 936 }, { "epoch": 0.8151205735389963, "grad_norm": 904.3954964527115, "learning_rate": 1.7776564384521288e-08, "logits/chosen": 0.6471278667449951, "logits/rejected": 0.6938129663467407, "logps/chosen": -505.6937255859375, "logps/rejected": -459.9547119140625, "loss": 0.5964, "nll_loss": 0.508669912815094, "rewards/accuracies": 0.8125, "rewards/chosen": 0.6563972234725952, "rewards/margins": 0.5751644968986511, "rewards/rejected": 0.08123274147510529, "step": 938 }, { "epoch": 0.8168585704975017, "grad_norm": 153.79334054038893, "learning_rate": 1.74528103037226e-08, "logits/chosen": 0.3130883276462555, "logits/rejected": 0.46651285886764526, "logps/chosen": -411.9516296386719, "logps/rejected": -480.91595458984375, "loss": 0.5442, "nll_loss": 0.40169090032577515, "rewards/accuracies": 0.6875, "rewards/chosen": 0.8183580636978149, "rewards/margins": 0.6252126097679138, "rewards/rejected": 0.19314545392990112, "step": 940 }, { "epoch": 0.818596567456007, "grad_norm": 242.12651784061373, "learning_rate": 1.7131749465540855e-08, "logits/chosen": 0.7213436365127563, "logits/rejected": 0.5559303760528564, "logps/chosen": -505.56903076171875, "logps/rejected": -454.2501220703125, "loss": 0.6202, "nll_loss": 0.49371591210365295, "rewards/accuracies": 0.75, "rewards/chosen": 0.6225160360336304, "rewards/margins": 0.43361109495162964, "rewards/rejected": 0.18890495598316193, "step": 942 }, { "epoch": 0.8203345644145122, "grad_norm": 128.95760203041075, "learning_rate": 1.6813392345132517e-08, "logits/chosen": 0.507114589214325, "logits/rejected": 0.4971063435077667, "logps/chosen": -468.1333923339844, "logps/rejected": -486.2199401855469, "loss": 0.5707, "nll_loss": 0.44967854022979736, "rewards/accuracies": 0.875, "rewards/chosen": 0.4457395672798157, "rewards/margins": 0.386365681886673, "rewards/rejected": 0.05937386304140091, "step": 944 }, { "epoch": 0.8220725613730177, "grad_norm": 147.48992803946854, "learning_rate": 1.6497749329440745e-08, "logits/chosen": 0.3423297703266144, "logits/rejected": 0.4964262545108795, "logps/chosen": -421.44964599609375, "logps/rejected": -438.1751403808594, "loss": 0.614, "nll_loss": 0.4068484604358673, "rewards/accuracies": 0.375, "rewards/chosen": 0.5428711175918579, "rewards/margins": 0.02361917495727539, "rewards/rejected": 0.5192519426345825, "step": 946 }, { "epoch": 0.823810558331523, "grad_norm": 158.19990079831206, "learning_rate": 1.6184830716856346e-08, "logits/chosen": 0.24282848834991455, "logits/rejected": 0.3862345218658447, "logps/chosen": -417.93035888671875, "logps/rejected": -480.74530029296875, "loss": 0.5395, "nll_loss": 0.4029342532157898, "rewards/accuracies": 0.8125, "rewards/chosen": 0.6654102206230164, "rewards/margins": 0.5100387334823608, "rewards/rejected": 0.15537136793136597, "step": 948 }, { "epoch": 0.8255485552900282, "grad_norm": 205.78174442372523, "learning_rate": 1.5874646716881866e-08, "logits/chosen": 0.36575302481651306, "logits/rejected": 0.3649991750717163, "logps/chosen": -526.206298828125, "logps/rejected": -542.56494140625, "loss": 0.6185, "nll_loss": 0.48127463459968567, "rewards/accuracies": 0.625, "rewards/chosen": 0.5322883129119873, "rewards/margins": 0.5123573541641235, "rewards/rejected": 0.019931048154830933, "step": 950 }, { "epoch": 0.8272865522485335, "grad_norm": 171.145227632994, "learning_rate": 1.5567207449798513e-08, "logits/chosen": 0.5767173767089844, "logits/rejected": 0.4821607172489166, "logps/chosen": -530.916259765625, "logps/rejected": -499.0142822265625, "loss": 0.658, "nll_loss": 0.512601912021637, "rewards/accuracies": 0.625, "rewards/chosen": 0.3559614419937134, "rewards/margins": 0.09974518418312073, "rewards/rejected": 0.25621622800827026, "step": 952 }, { "epoch": 0.8290245492070389, "grad_norm": 176.07720507279737, "learning_rate": 1.5262522946335754e-08, "logits/chosen": 0.46107804775238037, "logits/rejected": 0.4554142951965332, "logps/chosen": -436.38909912109375, "logps/rejected": -498.63470458984375, "loss": 0.51, "nll_loss": 0.4370454251766205, "rewards/accuracies": 0.875, "rewards/chosen": 0.6604627370834351, "rewards/margins": 0.7786142230033875, "rewards/rejected": -0.11815138161182404, "step": 954 }, { "epoch": 0.8307625461655442, "grad_norm": 309.2601198340963, "learning_rate": 1.4960603147344342e-08, "logits/chosen": 0.3789171278476715, "logits/rejected": 0.4542412757873535, "logps/chosen": -534.3204345703125, "logps/rejected": -537.3842163085938, "loss": 0.571, "nll_loss": 0.4980693459510803, "rewards/accuracies": 0.6875, "rewards/chosen": 0.5330474972724915, "rewards/margins": 0.32546788454055786, "rewards/rejected": 0.2075796127319336, "step": 956 }, { "epoch": 0.8325005431240495, "grad_norm": 132.0934271644858, "learning_rate": 1.466145790347183e-08, "logits/chosen": 0.3661070764064789, "logits/rejected": 0.23429855704307556, "logps/chosen": -479.94927978515625, "logps/rejected": -361.9775085449219, "loss": 0.6086, "nll_loss": 0.43883123993873596, "rewards/accuracies": 0.8125, "rewards/chosen": 0.5983450412750244, "rewards/margins": 0.5256334543228149, "rewards/rejected": 0.0727115273475647, "step": 958 }, { "epoch": 0.8342385400825548, "grad_norm": 141.19361438240026, "learning_rate": 1.4365096974841106e-08, "logits/chosen": 0.4359699785709381, "logits/rejected": 0.5905839204788208, "logps/chosen": -572.3399047851562, "logps/rejected": -490.7935791015625, "loss": 0.5727, "nll_loss": 0.5214511156082153, "rewards/accuracies": 0.6875, "rewards/chosen": 0.6746166348457336, "rewards/margins": 0.6807842254638672, "rewards/rejected": -0.006167605519294739, "step": 960 }, { "epoch": 0.8359765370410602, "grad_norm": 134.2041368142193, "learning_rate": 1.4071530030732093e-08, "logits/chosen": 0.36992889642715454, "logits/rejected": 0.4493410587310791, "logps/chosen": -398.4943542480469, "logps/rejected": -490.66265869140625, "loss": 0.596, "nll_loss": 0.41504624485969543, "rewards/accuracies": 0.6875, "rewards/chosen": 0.5691247582435608, "rewards/margins": 0.5313467383384705, "rewards/rejected": 0.03777790069580078, "step": 962 }, { "epoch": 0.8377145339995655, "grad_norm": 111.93946541938594, "learning_rate": 1.378076664926624e-08, "logits/chosen": 0.44248145818710327, "logits/rejected": 0.3701130747795105, "logps/chosen": -467.9688720703125, "logps/rejected": -451.6510314941406, "loss": 0.544, "nll_loss": 0.4465351700782776, "rewards/accuracies": 0.6875, "rewards/chosen": 0.504805862903595, "rewards/margins": 0.6661850214004517, "rewards/rejected": -0.1613791435956955, "step": 964 }, { "epoch": 0.8394525309580708, "grad_norm": 143.9430178806754, "learning_rate": 1.349281631709389e-08, "logits/chosen": 0.23045340180397034, "logits/rejected": 0.16618621349334717, "logps/chosen": -428.14093017578125, "logps/rejected": -468.91717529296875, "loss": 0.5326, "nll_loss": 0.4211942255496979, "rewards/accuracies": 0.4375, "rewards/chosen": 0.6767243146896362, "rewards/margins": 0.11299353092908859, "rewards/rejected": 0.5637306571006775, "step": 966 }, { "epoch": 0.8411905279165761, "grad_norm": 103.62673108372952, "learning_rate": 1.3207688429084974e-08, "logits/chosen": 0.46591299772262573, "logits/rejected": 0.3822897672653198, "logps/chosen": -483.8509521484375, "logps/rejected": -498.87005615234375, "loss": 0.537, "nll_loss": 0.46457740664482117, "rewards/accuracies": 0.75, "rewards/chosen": 0.6259498596191406, "rewards/margins": 0.6032807230949402, "rewards/rejected": 0.022669125348329544, "step": 968 }, { "epoch": 0.8429285248750815, "grad_norm": 234.96317478690582, "learning_rate": 1.2925392288022297e-08, "logits/chosen": 0.15632614493370056, "logits/rejected": 0.20441731810569763, "logps/chosen": -476.6795349121094, "logps/rejected": -508.18792724609375, "loss": 0.5673, "nll_loss": 0.46259137988090515, "rewards/accuracies": 0.75, "rewards/chosen": 0.8507680296897888, "rewards/margins": 0.6741021275520325, "rewards/rejected": 0.17666588723659515, "step": 970 }, { "epoch": 0.8446665218335868, "grad_norm": 148.50514932068978, "learning_rate": 1.264593710429811e-08, "logits/chosen": 0.3417684733867645, "logits/rejected": 0.35431018471717834, "logps/chosen": -480.681396484375, "logps/rejected": -478.01416015625, "loss": 0.5814, "nll_loss": 0.48108553886413574, "rewards/accuracies": 0.875, "rewards/chosen": 0.5715519189834595, "rewards/margins": 0.683962881565094, "rewards/rejected": -0.11241091787815094, "step": 972 }, { "epoch": 0.8464045187920921, "grad_norm": 129.62124450547134, "learning_rate": 1.2369331995613663e-08, "logits/chosen": 0.040659140795469284, "logits/rejected": 0.054739244282245636, "logps/chosen": -445.49310302734375, "logps/rejected": -479.6976318359375, "loss": 0.5619, "nll_loss": 0.4347701668739319, "rewards/accuracies": 0.9375, "rewards/chosen": 1.047504186630249, "rewards/margins": 0.8733047842979431, "rewards/rejected": 0.17419950664043427, "step": 974 }, { "epoch": 0.8481425157505974, "grad_norm": 153.51628913591082, "learning_rate": 1.2095585986681533e-08, "logits/chosen": 0.7045519948005676, "logits/rejected": 0.67140132188797, "logps/chosen": -594.0953369140625, "logps/rejected": -604.9121704101562, "loss": 0.5705, "nll_loss": 0.5499407052993774, "rewards/accuracies": 0.875, "rewards/chosen": 0.4179590344429016, "rewards/margins": 0.40337541699409485, "rewards/rejected": 0.014583582058548927, "step": 976 }, { "epoch": 0.8498805127091028, "grad_norm": 123.40507981871161, "learning_rate": 1.1824708008931416e-08, "logits/chosen": 0.718404233455658, "logits/rejected": 0.6372251510620117, "logps/chosen": -495.7720642089844, "logps/rejected": -462.13214111328125, "loss": 0.5864, "nll_loss": 0.4798401892185211, "rewards/accuracies": 0.5625, "rewards/chosen": 0.31270793080329895, "rewards/margins": 0.2564069926738739, "rewards/rejected": 0.05630092695355415, "step": 978 }, { "epoch": 0.8516185096676081, "grad_norm": 149.48370997361744, "learning_rate": 1.155670690021857e-08, "logits/chosen": 0.845966637134552, "logits/rejected": 0.7787639498710632, "logps/chosen": -498.9972229003906, "logps/rejected": -514.4071655273438, "loss": 0.5707, "nll_loss": 0.4815356135368347, "rewards/accuracies": 0.75, "rewards/chosen": 0.4916130304336548, "rewards/margins": 0.3846202790737152, "rewards/rejected": 0.10699271410703659, "step": 980 }, { "epoch": 0.8533565066261134, "grad_norm": 283.6581431830632, "learning_rate": 1.1291591404535461e-08, "logits/chosen": 0.691591739654541, "logits/rejected": 0.38823819160461426, "logps/chosen": -521.052001953125, "logps/rejected": -443.2837829589844, "loss": 0.5708, "nll_loss": 0.4971870183944702, "rewards/accuracies": 0.75, "rewards/chosen": 0.3442355990409851, "rewards/margins": 0.4594818651676178, "rewards/rejected": -0.11524628847837448, "step": 982 }, { "epoch": 0.8550945035846187, "grad_norm": 142.05231100131795, "learning_rate": 1.1029370171726571e-08, "logits/chosen": 0.20724141597747803, "logits/rejected": 0.45057153701782227, "logps/chosen": -526.8379516601562, "logps/rejected": -588.6925659179688, "loss": 0.5284, "nll_loss": 0.5008586645126343, "rewards/accuracies": 0.8125, "rewards/chosen": 0.8295645713806152, "rewards/margins": 0.8359251022338867, "rewards/rejected": -0.0063606202602386475, "step": 984 }, { "epoch": 0.8568325005431241, "grad_norm": 240.52585530952192, "learning_rate": 1.0770051757206077e-08, "logits/chosen": 0.20337149500846863, "logits/rejected": 0.4355597198009491, "logps/chosen": -495.91021728515625, "logps/rejected": -542.2305297851562, "loss": 0.5244, "nll_loss": 0.4666140377521515, "rewards/accuracies": 0.75, "rewards/chosen": 0.742421567440033, "rewards/margins": 0.6228967308998108, "rewards/rejected": 0.11952477693557739, "step": 986 }, { "epoch": 0.8585704975016294, "grad_norm": 161.81705766680054, "learning_rate": 1.0513644621678807e-08, "logits/chosen": 0.6418431997299194, "logits/rejected": 0.8085691928863525, "logps/chosen": -457.9586486816406, "logps/rejected": -536.1529541015625, "loss": 0.6134, "nll_loss": 0.4475609064102173, "rewards/accuracies": 0.5625, "rewards/chosen": 0.40928417444229126, "rewards/margins": 0.32167717814445496, "rewards/rejected": 0.0876070037484169, "step": 988 }, { "epoch": 0.8603084944601347, "grad_norm": 209.83037790632198, "learning_rate": 1.0260157130864178e-08, "logits/chosen": 0.3358425796031952, "logits/rejected": 0.14016635715961456, "logps/chosen": -464.33392333984375, "logps/rejected": -476.4942626953125, "loss": 0.5479, "nll_loss": 0.4349672794342041, "rewards/accuracies": 0.75, "rewards/chosen": 0.4732265770435333, "rewards/margins": 0.3230219781398773, "rewards/rejected": 0.15020456910133362, "step": 990 }, { "epoch": 0.86204649141864, "grad_norm": 131.501360542472, "learning_rate": 1.0009597555223126e-08, "logits/chosen": 0.23200947046279907, "logits/rejected": 0.32851237058639526, "logps/chosen": -446.45947265625, "logps/rejected": -463.9974060058594, "loss": 0.513, "nll_loss": 0.4259781539440155, "rewards/accuracies": 0.8125, "rewards/chosen": 0.6944409608840942, "rewards/margins": 0.42177268862724304, "rewards/rejected": 0.2726683020591736, "step": 992 }, { "epoch": 0.8637844883771454, "grad_norm": 127.65009818745983, "learning_rate": 9.761974069688461e-09, "logits/chosen": 0.5669763088226318, "logits/rejected": 0.5903551578521729, "logps/chosen": -515.50390625, "logps/rejected": -482.6619873046875, "loss": 0.6124, "nll_loss": 0.5234907269477844, "rewards/accuracies": 0.8125, "rewards/chosen": 0.4056873321533203, "rewards/margins": 0.38572657108306885, "rewards/rejected": 0.019960783421993256, "step": 994 }, { "epoch": 0.8655224853356507, "grad_norm": 139.3600385512985, "learning_rate": 9.517294753398064e-09, "logits/chosen": 0.23763130605220795, "logits/rejected": 0.1550990492105484, "logps/chosen": -402.9676513671875, "logps/rejected": -437.53692626953125, "loss": 0.5845, "nll_loss": 0.398506760597229, "rewards/accuracies": 0.75, "rewards/chosen": 0.7281893491744995, "rewards/margins": 0.3145132064819336, "rewards/rejected": 0.4136761724948883, "step": 996 }, { "epoch": 0.867260482294156, "grad_norm": 151.6763492595599, "learning_rate": 9.275567589431177e-09, "logits/chosen": 0.32628533244132996, "logits/rejected": 0.3804038166999817, "logps/chosen": -496.0785827636719, "logps/rejected": -505.3673095703125, "loss": 0.6, "nll_loss": 0.47465142607688904, "rewards/accuracies": 0.75, "rewards/chosen": 0.9524919986724854, "rewards/margins": 0.6615516543388367, "rewards/rejected": 0.2909402847290039, "step": 998 }, { "epoch": 0.8689984792526613, "grad_norm": 149.6705914399637, "learning_rate": 9.036800464548155e-09, "logits/chosen": 0.3488670587539673, "logits/rejected": 0.26245802640914917, "logps/chosen": -450.0097961425781, "logps/rejected": -507.2135314941406, "loss": 0.6198, "nll_loss": 0.45576491951942444, "rewards/accuracies": 0.5625, "rewards/chosen": 0.5999775528907776, "rewards/margins": 0.3929904103279114, "rewards/rejected": 0.20698711276054382, "step": 1000 }, { "epoch": 0.8707364762111667, "grad_norm": 214.22049086541324, "learning_rate": 8.80100116893301e-09, "logits/chosen": -0.08340902626514435, "logits/rejected": 0.002378493547439575, "logps/chosen": -538.5593872070312, "logps/rejected": -618.6040649414062, "loss": 0.584, "nll_loss": 0.4630267322063446, "rewards/accuracies": 0.6875, "rewards/chosen": 0.7095227837562561, "rewards/margins": 0.5523332357406616, "rewards/rejected": 0.15718956291675568, "step": 1002 }, { "epoch": 0.872474473169672, "grad_norm": 135.51290099719628, "learning_rate": 8.568177395939213e-09, "logits/chosen": 0.5445169806480408, "logits/rejected": 0.5261526703834534, "logps/chosen": -519.8292846679688, "logps/rejected": -467.5902099609375, "loss": 0.6387, "nll_loss": 0.48102161288261414, "rewards/accuracies": 0.6875, "rewards/chosen": 0.5225428342819214, "rewards/margins": 0.13977020978927612, "rewards/rejected": 0.3827725350856781, "step": 1004 }, { "epoch": 0.8742124701281773, "grad_norm": 149.90853918610964, "learning_rate": 8.338336741838836e-09, "logits/chosen": 0.2194560170173645, "logits/rejected": 0.1848369538784027, "logps/chosen": -410.1855773925781, "logps/rejected": -393.67950439453125, "loss": 0.548, "nll_loss": 0.3803246319293976, "rewards/accuracies": 0.5625, "rewards/chosen": 0.32082509994506836, "rewards/margins": 0.14545764029026031, "rewards/rejected": 0.17536745965480804, "step": 1006 }, { "epoch": 0.8759504670866826, "grad_norm": 137.29548301434852, "learning_rate": 8.111486705574533e-09, "logits/chosen": 0.4672020673751831, "logits/rejected": 0.48324400186538696, "logps/chosen": -502.46441650390625, "logps/rejected": -493.8004455566406, "loss": 0.5571, "nll_loss": 0.49147966504096985, "rewards/accuracies": 0.875, "rewards/chosen": 0.6543054580688477, "rewards/margins": 0.6119914054870605, "rewards/rejected": 0.042314041405916214, "step": 1008 }, { "epoch": 0.877688464045188, "grad_norm": 107.18011497907976, "learning_rate": 7.887634688515e-09, "logits/chosen": 0.49401983618736267, "logits/rejected": 0.562633216381073, "logps/chosen": -450.69854736328125, "logps/rejected": -484.1754150390625, "loss": 0.5953, "nll_loss": 0.4330693185329437, "rewards/accuracies": 0.75, "rewards/chosen": 0.5111261606216431, "rewards/margins": 0.2732935845851898, "rewards/rejected": 0.23783257603645325, "step": 1010 }, { "epoch": 0.8794264610036933, "grad_norm": 223.40815488567432, "learning_rate": 7.666787994213453e-09, "logits/chosen": 0.3347271978855133, "logits/rejected": 0.34065520763397217, "logps/chosen": -508.6999206542969, "logps/rejected": -517.7572631835938, "loss": 0.5512, "nll_loss": 0.49927687644958496, "rewards/accuracies": 0.8125, "rewards/chosen": 0.6774803400039673, "rewards/margins": 0.42118340730667114, "rewards/rejected": 0.25629690289497375, "step": 1012 }, { "epoch": 0.8811644579621986, "grad_norm": 108.51949876750757, "learning_rate": 7.4489538281693136e-09, "logits/chosen": 0.6547769904136658, "logits/rejected": 0.6195938587188721, "logps/chosen": -480.0023498535156, "logps/rejected": -542.1754760742188, "loss": 0.5535, "nll_loss": 0.4586215913295746, "rewards/accuracies": 0.6875, "rewards/chosen": 0.3463646173477173, "rewards/margins": 0.5637027025222778, "rewards/rejected": -0.21733808517456055, "step": 1014 }, { "epoch": 0.8829024549207038, "grad_norm": 171.38546762683748, "learning_rate": 7.234139297593178e-09, "logits/chosen": 0.36383742094039917, "logits/rejected": 0.30936628580093384, "logps/chosen": -482.3176574707031, "logps/rejected": -511.5982360839844, "loss": 0.5968, "nll_loss": 0.4618416726589203, "rewards/accuracies": 0.875, "rewards/chosen": 0.3494030237197876, "rewards/margins": 0.3414415717124939, "rewards/rejected": 0.007961463183164597, "step": 1016 }, { "epoch": 0.8846404518792093, "grad_norm": 130.31979725651428, "learning_rate": 7.022351411174865e-09, "logits/chosen": 0.6526001691818237, "logits/rejected": 0.650229811668396, "logps/chosen": -440.3011779785156, "logps/rejected": -459.21234130859375, "loss": 0.5771, "nll_loss": 0.43394389748573303, "rewards/accuracies": 0.5625, "rewards/chosen": 0.28458622097969055, "rewards/margins": 0.1846170425415039, "rewards/rejected": 0.09996921569108963, "step": 1018 }, { "epoch": 0.8863784488377145, "grad_norm": 146.85783391797733, "learning_rate": 6.813597078854772e-09, "logits/chosen": 0.3538188636302948, "logits/rejected": 0.5154883861541748, "logps/chosen": -405.7217102050781, "logps/rejected": -519.734130859375, "loss": 0.617, "nll_loss": 0.41503557562828064, "rewards/accuracies": 0.8125, "rewards/chosen": 0.507857084274292, "rewards/margins": 0.3474844992160797, "rewards/rejected": 0.16037264466285706, "step": 1020 }, { "epoch": 0.8881164457962198, "grad_norm": 178.76099417376233, "learning_rate": 6.607883111598445e-09, "logits/chosen": 0.6064735054969788, "logits/rejected": 0.6442840099334717, "logps/chosen": -539.7501831054688, "logps/rejected": -526.06884765625, "loss": 0.5727, "nll_loss": 0.5150287747383118, "rewards/accuracies": 0.5, "rewards/chosen": 0.11767234653234482, "rewards/margins": 0.13286437094211578, "rewards/rejected": -0.015192030929028988, "step": 1022 }, { "epoch": 0.8898544427547251, "grad_norm": 143.87254174590024, "learning_rate": 6.405216221174325e-09, "logits/chosen": 0.19760847091674805, "logits/rejected": 0.17889335751533508, "logps/chosen": -476.1910705566406, "logps/rejected": -545.6195068359375, "loss": 0.5429, "nll_loss": 0.4657999575138092, "rewards/accuracies": 0.875, "rewards/chosen": 0.7912313342094421, "rewards/margins": 0.7614498138427734, "rewards/rejected": 0.029781535267829895, "step": 1024 }, { "epoch": 0.8915924397132305, "grad_norm": 142.72070270620054, "learning_rate": 6.205603019934791e-09, "logits/chosen": 0.44221293926239014, "logits/rejected": 0.4288594424724579, "logps/chosen": -456.22088623046875, "logps/rejected": -469.8916320800781, "loss": 0.5815, "nll_loss": 0.4313237965106964, "rewards/accuracies": 0.8125, "rewards/chosen": 0.26495635509490967, "rewards/margins": 0.35365360975265503, "rewards/rejected": -0.08869723975658417, "step": 1026 }, { "epoch": 0.8933304366717358, "grad_norm": 143.40703061220435, "learning_rate": 6.009050020600459e-09, "logits/chosen": 0.2621181011199951, "logits/rejected": 0.3539845645427704, "logps/chosen": -484.30810546875, "logps/rejected": -515.847900390625, "loss": 0.5573, "nll_loss": 0.45568376779556274, "rewards/accuracies": 0.8125, "rewards/chosen": 0.8613646030426025, "rewards/margins": 0.5282778143882751, "rewards/rejected": 0.3330867886543274, "step": 1028 }, { "epoch": 0.8950684336302411, "grad_norm": 228.1216526051892, "learning_rate": 5.815563636047538e-09, "logits/chosen": 0.2763429284095764, "logits/rejected": 0.30502286553382874, "logps/chosen": -490.53564453125, "logps/rejected": -529.5304565429688, "loss": 0.5394, "nll_loss": 0.4875333905220032, "rewards/accuracies": 0.6875, "rewards/chosen": 0.8852298259735107, "rewards/margins": 0.7269324660301208, "rewards/rejected": 0.1582973450422287, "step": 1030 }, { "epoch": 0.8968064305887464, "grad_norm": 119.41955809926453, "learning_rate": 5.625150179098803e-09, "logits/chosen": 0.7187064290046692, "logits/rejected": 0.6024725437164307, "logps/chosen": -527.1694946289062, "logps/rejected": -551.8734130859375, "loss": 0.5891, "nll_loss": 0.5098748207092285, "rewards/accuracies": 0.625, "rewards/chosen": 0.296517550945282, "rewards/margins": 0.18804368376731873, "rewards/rejected": 0.10847387462854385, "step": 1032 }, { "epoch": 0.8985444275472518, "grad_norm": 218.4987606965274, "learning_rate": 5.437815862317519e-09, "logits/chosen": 0.2036760449409485, "logits/rejected": 0.27650704979896545, "logps/chosen": -431.7545166015625, "logps/rejected": -486.6964111328125, "loss": 0.5661, "nll_loss": 0.4158436059951782, "rewards/accuracies": 0.75, "rewards/chosen": 0.6781007051467896, "rewards/margins": 0.7105196714401245, "rewards/rejected": -0.03241892158985138, "step": 1034 }, { "epoch": 0.9002824245057571, "grad_norm": 150.28933801097338, "learning_rate": 5.253566797804709e-09, "logits/chosen": 0.24965474009513855, "logits/rejected": 0.28565654158592224, "logps/chosen": -492.99859619140625, "logps/rejected": -555.1309814453125, "loss": 0.581, "nll_loss": 0.47916826605796814, "rewards/accuracies": 0.8125, "rewards/chosen": 0.8634170889854431, "rewards/margins": 0.5948573350906372, "rewards/rejected": 0.2685597240924835, "step": 1036 }, { "epoch": 0.9020204214642624, "grad_norm": 136.55772838494232, "learning_rate": 5.072408996999844e-09, "logits/chosen": 0.43632686138153076, "logits/rejected": 0.39454224705696106, "logps/chosen": -494.273193359375, "logps/rejected": -499.689697265625, "loss": 0.5438, "nll_loss": 0.4602780044078827, "rewards/accuracies": 0.625, "rewards/chosen": 0.5984913110733032, "rewards/margins": 0.25624755024909973, "rewards/rejected": 0.3422437310218811, "step": 1038 }, { "epoch": 0.9037584184227677, "grad_norm": 178.70590958523087, "learning_rate": 4.8943483704846465e-09, "logits/chosen": 0.22367754578590393, "logits/rejected": 0.3362473249435425, "logps/chosen": -454.59967041015625, "logps/rejected": -488.4815368652344, "loss": 0.5952, "nll_loss": 0.44791561365127563, "rewards/accuracies": 0.75, "rewards/chosen": 0.5820288062095642, "rewards/margins": 0.25380000472068787, "rewards/rejected": 0.32822877168655396, "step": 1040 }, { "epoch": 0.9054964153812731, "grad_norm": 185.45213748058183, "learning_rate": 4.7193907277902175e-09, "logits/chosen": 0.5410184860229492, "logits/rejected": 0.5517024397850037, "logps/chosen": -508.5509338378906, "logps/rejected": -491.8945007324219, "loss": 0.5455, "nll_loss": 0.518214225769043, "rewards/accuracies": 0.8125, "rewards/chosen": 0.39927348494529724, "rewards/margins": 0.3506126403808594, "rewards/rejected": 0.048660848289728165, "step": 1042 }, { "epoch": 0.9072344123397784, "grad_norm": 139.23192133772244, "learning_rate": 4.547541777207564e-09, "logits/chosen": 0.17857961356639862, "logits/rejected": 0.33340996503829956, "logps/chosen": -414.42791748046875, "logps/rejected": -408.1433410644531, "loss": 0.7357, "nll_loss": 0.3875848054885864, "rewards/accuracies": 0.4375, "rewards/chosen": 0.5422300100326538, "rewards/margins": -0.007215976715087891, "rewards/rejected": 0.5494458675384521, "step": 1044 }, { "epoch": 0.9089724092982837, "grad_norm": 147.38015327862638, "learning_rate": 4.3788071256013024e-09, "logits/chosen": 0.6263728737831116, "logits/rejected": 0.581895649433136, "logps/chosen": -490.2805480957031, "logps/rejected": -471.88385009765625, "loss": 0.5973, "nll_loss": 0.47902175784111023, "rewards/accuracies": 0.8125, "rewards/chosen": 0.5375036597251892, "rewards/margins": 0.5982069969177246, "rewards/rejected": -0.06070336699485779, "step": 1046 }, { "epoch": 0.910710406256789, "grad_norm": 106.89389378580692, "learning_rate": 4.2131922782267405e-09, "logits/chosen": 0.11579008400440216, "logits/rejected": 0.08525022119283676, "logps/chosen": -438.53094482421875, "logps/rejected": -441.49969482421875, "loss": 0.5276, "nll_loss": 0.4182903468608856, "rewards/accuracies": 0.8125, "rewards/chosen": 0.5620919466018677, "rewards/margins": 0.9903548955917358, "rewards/rejected": -0.4282629191875458, "step": 1048 }, { "epoch": 0.9124484032152944, "grad_norm": 183.72771679991544, "learning_rate": 4.050702638550274e-09, "logits/chosen": 0.535997748374939, "logits/rejected": 0.4335247278213501, "logps/chosen": -538.29296875, "logps/rejected": -476.898193359375, "loss": 0.6073, "nll_loss": 0.5014994740486145, "rewards/accuracies": 0.4375, "rewards/chosen": 0.19083280861377716, "rewards/margins": -0.13114146888256073, "rewards/rejected": 0.3219743072986603, "step": 1050 }, { "epoch": 0.9141864001737997, "grad_norm": 158.5516983187171, "learning_rate": 3.891343508073053e-09, "logits/chosen": 0.32029953598976135, "logits/rejected": 0.3168344795703888, "logps/chosen": -449.4055480957031, "logps/rejected": -465.4240417480469, "loss": 0.5593, "nll_loss": 0.43291229009628296, "rewards/accuracies": 0.875, "rewards/chosen": 0.5993573665618896, "rewards/margins": 0.6672921776771545, "rewards/rejected": -0.0679347962141037, "step": 1052 }, { "epoch": 0.915924397132305, "grad_norm": 206.91428322699906, "learning_rate": 3.735120086158061e-09, "logits/chosen": 0.3795467019081116, "logits/rejected": 0.4101225733757019, "logps/chosen": -493.1979675292969, "logps/rejected": -489.7401123046875, "loss": 0.5397, "nll_loss": 0.46552711725234985, "rewards/accuracies": 0.9375, "rewards/chosen": 1.0511305332183838, "rewards/margins": 0.9409102201461792, "rewards/rejected": 0.11022023856639862, "step": 1054 }, { "epoch": 0.9176623940908103, "grad_norm": 131.1638807054813, "learning_rate": 3.582037469860455e-09, "logits/chosen": 0.3941403031349182, "logits/rejected": 0.2784467041492462, "logps/chosen": -564.7503051757812, "logps/rejected": -472.0733642578125, "loss": 0.5457, "nll_loss": 0.5184577703475952, "rewards/accuracies": 0.875, "rewards/chosen": 0.8316090106964111, "rewards/margins": 0.6089596748352051, "rewards/rejected": 0.22264929115772247, "step": 1056 }, { "epoch": 0.9194003910493157, "grad_norm": 194.9239008622977, "learning_rate": 3.4321006537612163e-09, "logits/chosen": 0.13808123767375946, "logits/rejected": 0.21899640560150146, "logps/chosen": -441.20404052734375, "logps/rejected": -475.18524169921875, "loss": 0.5258, "nll_loss": 0.4570828080177307, "rewards/accuracies": 0.875, "rewards/chosen": 0.6368840336799622, "rewards/margins": 0.8989419937133789, "rewards/rejected": -0.26205796003341675, "step": 1058 }, { "epoch": 0.921138388007821, "grad_norm": 115.43391845382065, "learning_rate": 3.285314529804295e-09, "logits/chosen": 0.4206714630126953, "logits/rejected": 0.40312278270721436, "logps/chosen": -460.8715515136719, "logps/rejected": -518.5413208007812, "loss": 0.5333, "nll_loss": 0.4465717077255249, "rewards/accuracies": 1.0, "rewards/chosen": 0.6075332760810852, "rewards/margins": 0.5724822282791138, "rewards/rejected": 0.03505106270313263, "step": 1060 }, { "epoch": 0.9228763849663263, "grad_norm": 202.93417822517483, "learning_rate": 3.141683887136892e-09, "logits/chosen": 0.486500084400177, "logits/rejected": 0.4569099545478821, "logps/chosen": -442.55816650390625, "logps/rejected": -431.1639709472656, "loss": 0.5329, "nll_loss": 0.4247971773147583, "rewards/accuracies": 0.75, "rewards/chosen": 0.43177396059036255, "rewards/margins": 0.2917194366455078, "rewards/rejected": 0.14005452394485474, "step": 1062 }, { "epoch": 0.9246143819248316, "grad_norm": 119.40749558394226, "learning_rate": 3.001213411953296e-09, "logits/chosen": 0.4011079668998718, "logits/rejected": 0.46269315481185913, "logps/chosen": -487.8550720214844, "logps/rejected": -531.1329956054688, "loss": 0.5007, "nll_loss": 0.4475144147872925, "rewards/accuracies": 0.8125, "rewards/chosen": 0.5412551760673523, "rewards/margins": 0.5304111242294312, "rewards/rejected": 0.01084403321146965, "step": 1064 }, { "epoch": 0.926352378883337, "grad_norm": 130.26377320406488, "learning_rate": 2.8639076873419487e-09, "logits/chosen": 0.14351217448711395, "logits/rejected": 0.25187253952026367, "logps/chosen": -538.365234375, "logps/rejected": -633.5084838867188, "loss": 0.6178, "nll_loss": 0.4882996380329132, "rewards/accuracies": 0.8125, "rewards/chosen": 0.6392820477485657, "rewards/margins": 0.613286554813385, "rewards/rejected": 0.025995440781116486, "step": 1066 }, { "epoch": 0.9280903758418423, "grad_norm": 184.52491422760545, "learning_rate": 2.729771193135899e-09, "logits/chosen": 0.5797444581985474, "logits/rejected": 0.2687540650367737, "logps/chosen": -522.3880615234375, "logps/rejected": -471.7548828125, "loss": 0.6481, "nll_loss": 0.48741546273231506, "rewards/accuracies": 0.6875, "rewards/chosen": 0.23231476545333862, "rewards/margins": 0.5791104435920715, "rewards/rejected": -0.34679561853408813, "step": 1068 }, { "epoch": 0.9298283728003476, "grad_norm": 177.24645530115882, "learning_rate": 2.598808305766653e-09, "logits/chosen": 0.43344372510910034, "logits/rejected": 0.4523126482963562, "logps/chosen": -467.32916259765625, "logps/rejected": -519.6317749023438, "loss": 0.5094, "nll_loss": 0.4872402548789978, "rewards/accuracies": 0.875, "rewards/chosen": 0.4050002992153168, "rewards/margins": 0.6784630417823792, "rewards/rejected": -0.27346280217170715, "step": 1070 }, { "epoch": 0.9315663697588529, "grad_norm": 197.39750151518055, "learning_rate": 2.4710232981214218e-09, "logits/chosen": 0.3327729403972626, "logits/rejected": 0.14955325424671173, "logps/chosen": -550.6797485351562, "logps/rejected": -475.5610046386719, "loss": 0.6603, "nll_loss": 0.46252796053886414, "rewards/accuracies": 0.4375, "rewards/chosen": 0.3012423515319824, "rewards/margins": -0.07680274546146393, "rewards/rejected": 0.37804505228996277, "step": 1072 }, { "epoch": 0.9333043667173583, "grad_norm": 174.39443484205302, "learning_rate": 2.346420339403632e-09, "logits/chosen": 0.4824119508266449, "logits/rejected": 0.3831688165664673, "logps/chosen": -473.789794921875, "logps/rejected": -436.1537780761719, "loss": 0.6016, "nll_loss": 0.43823355436325073, "rewards/accuracies": 0.6875, "rewards/chosen": 0.41553688049316406, "rewards/margins": 0.1545083224773407, "rewards/rejected": 0.26102858781814575, "step": 1074 }, { "epoch": 0.9350423636758636, "grad_norm": 124.99709040890798, "learning_rate": 2.2250034949969907e-09, "logits/chosen": 0.5162658095359802, "logits/rejected": 0.3049306273460388, "logps/chosen": -509.45013427734375, "logps/rejected": -532.9105224609375, "loss": 0.5665, "nll_loss": 0.5037564039230347, "rewards/accuracies": 0.625, "rewards/chosen": 0.3767814636230469, "rewards/margins": 0.46063491702079773, "rewards/rejected": -0.08385343104600906, "step": 1076 }, { "epoch": 0.9367803606343689, "grad_norm": 350.65235066069096, "learning_rate": 2.106776726332793e-09, "logits/chosen": 0.2661568820476532, "logits/rejected": 0.25111711025238037, "logps/chosen": -503.4539794921875, "logps/rejected": -484.7874450683594, "loss": 0.599, "nll_loss": 0.491374671459198, "rewards/accuracies": 0.625, "rewards/chosen": 0.3265320062637329, "rewards/margins": 0.33517876267433167, "rewards/rejected": -0.008646775037050247, "step": 1078 }, { "epoch": 0.9385183575928742, "grad_norm": 144.21117524237232, "learning_rate": 1.9917438907606556e-09, "logits/chosen": 0.5326768159866333, "logits/rejected": 0.4657231867313385, "logps/chosen": -451.84820556640625, "logps/rejected": -391.3930969238281, "loss": 0.5676, "nll_loss": 0.4326724410057068, "rewards/accuracies": 0.875, "rewards/chosen": 0.33496278524398804, "rewards/margins": 0.5200725793838501, "rewards/rejected": -0.18510979413986206, "step": 1080 }, { "epoch": 0.9402563545513796, "grad_norm": 131.1131442940977, "learning_rate": 1.8799087414227198e-09, "logits/chosen": 0.34258347749710083, "logits/rejected": 0.34085893630981445, "logps/chosen": -539.6632080078125, "logps/rejected": -504.9268798828125, "loss": 0.5598, "nll_loss": 0.5036740303039551, "rewards/accuracies": 0.8125, "rewards/chosen": 0.6515111923217773, "rewards/margins": 0.5885885953903198, "rewards/rejected": 0.06292267143726349, "step": 1082 }, { "epoch": 0.9419943515098849, "grad_norm": 143.6646151289232, "learning_rate": 1.771274927131139e-09, "logits/chosen": 0.2275387942790985, "logits/rejected": 0.22034448385238647, "logps/chosen": -422.7888488769531, "logps/rejected": -435.2667236328125, "loss": 0.5547, "nll_loss": 0.3884378969669342, "rewards/accuracies": 0.875, "rewards/chosen": 0.7469679117202759, "rewards/margins": 0.4667326509952545, "rewards/rejected": 0.280235230922699, "step": 1084 }, { "epoch": 0.9437323484683902, "grad_norm": 163.27269333598966, "learning_rate": 1.665845992249071e-09, "logits/chosen": 0.4115934371948242, "logits/rejected": 0.44389423727989197, "logps/chosen": -516.2249145507812, "logps/rejected": -510.037109375, "loss": 0.5604, "nll_loss": 0.4815514087677002, "rewards/accuracies": 0.6875, "rewards/chosen": 0.4112775921821594, "rewards/margins": 0.4651253819465637, "rewards/rejected": -0.05384781211614609, "step": 1086 }, { "epoch": 0.9454703454268955, "grad_norm": 129.88097141896537, "learning_rate": 1.5636253765750506e-09, "logits/chosen": 0.6956306099891663, "logits/rejected": 0.47814223170280457, "logps/chosen": -505.50189208984375, "logps/rejected": -472.5733642578125, "loss": 0.5726, "nll_loss": 0.46664267778396606, "rewards/accuracies": 0.6875, "rewards/chosen": 0.39985811710357666, "rewards/margins": 0.3242230713367462, "rewards/rejected": 0.07563506066799164, "step": 1088 }, { "epoch": 0.9472083423854009, "grad_norm": 221.92257915634556, "learning_rate": 1.4646164152307016e-09, "logits/chosen": 0.41604483127593994, "logits/rejected": 0.49654048681259155, "logps/chosen": -442.07861328125, "logps/rejected": -480.72576904296875, "loss": 0.5722, "nll_loss": 0.41331538558006287, "rewards/accuracies": 0.625, "rewards/chosen": 0.35311394929885864, "rewards/margins": 0.3139072358608246, "rewards/rejected": 0.03920670226216316, "step": 1090 }, { "epoch": 0.9489463393439062, "grad_norm": 152.98735410952526, "learning_rate": 1.3688223385519671e-09, "logits/chosen": 0.25502362847328186, "logits/rejected": 0.18617622554302216, "logps/chosen": -481.209228515625, "logps/rejected": -505.02813720703125, "loss": 0.5983, "nll_loss": 0.4872613847255707, "rewards/accuracies": 0.75, "rewards/chosen": 1.016000747680664, "rewards/margins": 0.4449235200881958, "rewards/rejected": 0.5710772275924683, "step": 1092 }, { "epoch": 0.9506843363024114, "grad_norm": 233.20407395608177, "learning_rate": 1.2762462719837275e-09, "logits/chosen": 0.3264160752296448, "logits/rejected": 0.3496069312095642, "logps/chosen": -506.10308837890625, "logps/rejected": -505.58001708984375, "loss": 0.5874, "nll_loss": 0.4658485949039459, "rewards/accuracies": 0.75, "rewards/chosen": 0.473180890083313, "rewards/margins": 0.39049264788627625, "rewards/rejected": 0.08268821239471436, "step": 1094 }, { "epoch": 0.9524223332609167, "grad_norm": 198.02450972875945, "learning_rate": 1.1868912359777606e-09, "logits/chosen": 0.44856521487236023, "logits/rejected": 0.27867019176483154, "logps/chosen": -482.5740661621094, "logps/rejected": -509.0614318847656, "loss": 0.5837, "nll_loss": 0.5008357763290405, "rewards/accuracies": 0.75, "rewards/chosen": 0.526940643787384, "rewards/margins": 0.39634713530540466, "rewards/rejected": 0.13059350848197937, "step": 1096 }, { "epoch": 0.9541603302194221, "grad_norm": 177.08877803633223, "learning_rate": 1.100760145894275e-09, "logits/chosen": 0.4941718578338623, "logits/rejected": 0.5006747245788574, "logps/chosen": -522.516357421875, "logps/rejected": -600.25341796875, "loss": 0.5911, "nll_loss": 0.46976977586746216, "rewards/accuracies": 0.8125, "rewards/chosen": 0.6517099142074585, "rewards/margins": 0.5950245261192322, "rewards/rejected": 0.056685447692871094, "step": 1098 }, { "epoch": 0.9558983271779274, "grad_norm": 134.79426613808798, "learning_rate": 1.0178558119067315e-09, "logits/chosen": 0.6318961381912231, "logits/rejected": 0.4834882616996765, "logps/chosen": -530.4765625, "logps/rejected": -498.7202453613281, "loss": 0.535, "nll_loss": 0.5174931287765503, "rewards/accuracies": 0.8125, "rewards/chosen": 0.7026293277740479, "rewards/margins": 0.4758650064468384, "rewards/rejected": 0.22676429152488708, "step": 1100 }, { "epoch": 0.9576363241364327, "grad_norm": 185.20080681256496, "learning_rate": 9.381809389101825e-10, "logits/chosen": 0.1996326446533203, "logits/rejected": 0.31951069831848145, "logps/chosen": -524.9970092773438, "logps/rejected": -477.78997802734375, "loss": 0.5776, "nll_loss": 0.49288156628608704, "rewards/accuracies": 0.6875, "rewards/chosen": 0.8350135087966919, "rewards/margins": 0.5012381672859192, "rewards/rejected": 0.3337753415107727, "step": 1102 }, { "epoch": 0.959374321094938, "grad_norm": 194.81847727305524, "learning_rate": 8.617381264330425e-10, "logits/chosen": 0.3599563539028168, "logits/rejected": 0.5130894184112549, "logps/chosen": -449.61578369140625, "logps/rejected": -472.8209228515625, "loss": 0.634, "nll_loss": 0.44445574283599854, "rewards/accuracies": 0.625, "rewards/chosen": 0.5263224840164185, "rewards/margins": 0.36976680159568787, "rewards/rejected": 0.1565556526184082, "step": 1104 }, { "epoch": 0.9611123180534434, "grad_norm": 163.61216324705128, "learning_rate": 7.885298685522235e-10, "logits/chosen": 0.6613024473190308, "logits/rejected": 0.5108257532119751, "logps/chosen": -490.9361877441406, "logps/rejected": -469.5235595703125, "loss": 0.5668, "nll_loss": 0.4810979664325714, "rewards/accuracies": 0.6875, "rewards/chosen": 0.41238242387771606, "rewards/margins": 0.48797228932380676, "rewards/rejected": -0.0755898505449295, "step": 1106 }, { "epoch": 0.9628503150119487, "grad_norm": 225.76619571629445, "learning_rate": 7.185585538117655e-10, "logits/chosen": 0.19790863990783691, "logits/rejected": 0.16061121225357056, "logps/chosen": -456.2121276855469, "logps/rejected": -423.03363037109375, "loss": 0.6116, "nll_loss": 0.4229794442653656, "rewards/accuracies": 0.8125, "rewards/chosen": 0.2296188473701477, "rewards/margins": 0.2883566915988922, "rewards/rejected": -0.0587378591299057, "step": 1108 }, { "epoch": 0.964588311970454, "grad_norm": 126.30922488191435, "learning_rate": 6.518264651449779e-10, "logits/chosen": 0.3472726047039032, "logits/rejected": 0.2529224753379822, "logps/chosen": -464.9158630371094, "logps/rejected": -484.1787109375, "loss": 0.5457, "nll_loss": 0.4409720003604889, "rewards/accuracies": 0.875, "rewards/chosen": 0.5092476606369019, "rewards/margins": 0.6528787016868591, "rewards/rejected": -0.14363110065460205, "step": 1110 }, { "epoch": 0.9663263089289593, "grad_norm": 233.22023305294198, "learning_rate": 5.883357797998756e-10, "logits/chosen": 0.3325151205062866, "logits/rejected": 0.36989954113960266, "logps/chosen": -468.9339294433594, "logps/rejected": -413.1026611328125, "loss": 0.5261, "nll_loss": 0.4565078616142273, "rewards/accuracies": 0.6875, "rewards/chosen": 0.5924686789512634, "rewards/margins": 0.5389618873596191, "rewards/rejected": 0.05350681021809578, "step": 1112 }, { "epoch": 0.9680643058874647, "grad_norm": 129.53738085733303, "learning_rate": 5.280885692681591e-10, "logits/chosen": 0.4145568907260895, "logits/rejected": 0.5612493753433228, "logps/chosen": -485.72955322265625, "logps/rejected": -496.85687255859375, "loss": 0.5857, "nll_loss": 0.4751755893230438, "rewards/accuracies": 0.75, "rewards/chosen": 0.6921634674072266, "rewards/margins": 0.440424382686615, "rewards/rejected": 0.25173911452293396, "step": 1114 }, { "epoch": 0.96980230284597, "grad_norm": 181.23747613565504, "learning_rate": 4.710867992176682e-10, "logits/chosen": 0.3197920620441437, "logits/rejected": 0.3367622494697571, "logps/chosen": -459.18145751953125, "logps/rejected": -447.7521057128906, "loss": 0.557, "nll_loss": 0.4460861384868622, "rewards/accuracies": 0.75, "rewards/chosen": 0.6104297041893005, "rewards/margins": 0.2446718066930771, "rewards/rejected": 0.36575785279273987, "step": 1116 }, { "epoch": 0.9715402998044753, "grad_norm": 156.1161949619465, "learning_rate": 4.173323294281994e-10, "logits/chosen": 0.35465008020401, "logits/rejected": 0.2980070412158966, "logps/chosen": -550.6026611328125, "logps/rejected": -532.0971069335938, "loss": 0.6142, "nll_loss": 0.5108888745307922, "rewards/accuracies": 0.75, "rewards/chosen": 0.1261121928691864, "rewards/margins": 0.1764545440673828, "rewards/rejected": -0.0503423698246479, "step": 1118 }, { "epoch": 0.9732782967629806, "grad_norm": 162.83002983060376, "learning_rate": 3.668269137308666e-10, "logits/chosen": 0.5446602702140808, "logits/rejected": 0.38520336151123047, "logps/chosen": -495.03192138671875, "logps/rejected": -444.5371398925781, "loss": 0.5832, "nll_loss": 0.4564196467399597, "rewards/accuracies": 0.875, "rewards/chosen": 0.3598203659057617, "rewards/margins": 0.3889120817184448, "rewards/rejected": -0.029091738164424896, "step": 1120 }, { "epoch": 0.975016293721486, "grad_norm": 179.94563561598702, "learning_rate": 3.195721999508461e-10, "logits/chosen": 0.6219773292541504, "logits/rejected": 0.6260953545570374, "logps/chosen": -502.2096252441406, "logps/rejected": -499.4460754394531, "loss": 0.5885, "nll_loss": 0.4690307080745697, "rewards/accuracies": 0.625, "rewards/chosen": 0.24832192063331604, "rewards/margins": 0.1444612741470337, "rewards/rejected": 0.10386066138744354, "step": 1122 }, { "epoch": 0.9767542906799913, "grad_norm": 126.248993284488, "learning_rate": 2.755697298536308e-10, "logits/chosen": 0.2676442563533783, "logits/rejected": 0.2535380721092224, "logps/chosen": -447.32073974609375, "logps/rejected": -412.70037841796875, "loss": 0.6036, "nll_loss": 0.43544793128967285, "rewards/accuracies": 0.625, "rewards/chosen": 0.5221117734909058, "rewards/margins": 0.3314821422100067, "rewards/rejected": 0.19062969088554382, "step": 1124 }, { "epoch": 0.9784922876384966, "grad_norm": 185.6732907788729, "learning_rate": 2.3482093909473754e-10, "logits/chosen": 0.2254175990819931, "logits/rejected": 0.30273178219795227, "logps/chosen": -470.2763671875, "logps/rejected": -531.9412841796875, "loss": 0.5924, "nll_loss": 0.46973299980163574, "rewards/accuracies": 0.6875, "rewards/chosen": 0.7814911007881165, "rewards/margins": 0.4317186176776886, "rewards/rejected": 0.34977248311042786, "step": 1126 }, { "epoch": 0.980230284597002, "grad_norm": 211.43430486296467, "learning_rate": 1.973271571728441e-10, "logits/chosen": 0.33309152722358704, "logits/rejected": 0.3509593605995178, "logps/chosen": -422.4731750488281, "logps/rejected": -448.80963134765625, "loss": 0.6191, "nll_loss": 0.39795997738838196, "rewards/accuracies": 0.5625, "rewards/chosen": 0.48975372314453125, "rewards/margins": 0.18807843327522278, "rewards/rejected": 0.30167531967163086, "step": 1128 }, { "epoch": 0.9819682815555073, "grad_norm": 158.7187074514785, "learning_rate": 1.6308960738643517e-10, "logits/chosen": 0.3980328142642975, "logits/rejected": 0.22989094257354736, "logps/chosen": -450.3606262207031, "logps/rejected": -461.4617919921875, "loss": 0.5816, "nll_loss": 0.4547494351863861, "rewards/accuracies": 0.6875, "rewards/chosen": 0.3753512501716614, "rewards/margins": 0.4223685562610626, "rewards/rejected": -0.047017283737659454, "step": 1130 }, { "epoch": 0.9837062785140126, "grad_norm": 124.14642305553735, "learning_rate": 1.3210940679385664e-10, "logits/chosen": 0.384761780500412, "logits/rejected": 0.43383389711380005, "logps/chosen": -482.262451171875, "logps/rejected": -534.4568481445312, "loss": 0.5631, "nll_loss": 0.4623926281929016, "rewards/accuracies": 0.875, "rewards/chosen": 0.4161040484905243, "rewards/margins": 0.5258000493049622, "rewards/rejected": -0.10969601571559906, "step": 1132 }, { "epoch": 0.9854442754725179, "grad_norm": 147.1639416823873, "learning_rate": 1.0438756617691114e-10, "logits/chosen": 0.23561853170394897, "logits/rejected": 0.38304686546325684, "logps/chosen": -498.98089599609375, "logps/rejected": -500.87750244140625, "loss": 0.5604, "nll_loss": 0.47494742274284363, "rewards/accuracies": 0.75, "rewards/chosen": 0.6560043096542358, "rewards/margins": 0.3133293092250824, "rewards/rejected": 0.34267503023147583, "step": 1134 }, { "epoch": 0.9871822724310233, "grad_norm": 121.0117229059167, "learning_rate": 7.992499000785136e-11, "logits/chosen": 0.2498825490474701, "logits/rejected": 0.36798134446144104, "logps/chosen": -382.92962646484375, "logps/rejected": -492.08172607421875, "loss": 0.5732, "nll_loss": 0.3937234580516815, "rewards/accuracies": 0.8125, "rewards/chosen": 0.8302962779998779, "rewards/margins": 0.47974082827568054, "rewards/rejected": 0.350555419921875, "step": 1136 }, { "epoch": 0.9889202693895286, "grad_norm": 140.92643204865465, "learning_rate": 5.872247641987016e-11, "logits/chosen": 0.2667164206504822, "logits/rejected": 0.3866446614265442, "logps/chosen": -523.2333374023438, "logps/rejected": -505.80340576171875, "loss": 0.5751, "nll_loss": 0.4769275188446045, "rewards/accuracies": 0.875, "rewards/chosen": 0.6794913411140442, "rewards/margins": 0.47867727279663086, "rewards/rejected": 0.20081406831741333, "step": 1138 }, { "epoch": 0.9906582663480339, "grad_norm": 181.8205709739755, "learning_rate": 4.078071718107701e-11, "logits/chosen": 0.21165776252746582, "logits/rejected": 0.08558028936386108, "logps/chosen": -440.87237548828125, "logps/rejected": -425.84716796875, "loss": 0.6279, "nll_loss": 0.412685751914978, "rewards/accuracies": 0.5625, "rewards/chosen": 0.667019248008728, "rewards/margins": 0.12272016704082489, "rewards/rejected": 0.5442991256713867, "step": 1140 }, { "epoch": 0.9923962633065392, "grad_norm": 106.67423477208695, "learning_rate": 2.6100297671916016e-11, "logits/chosen": 0.2591753900051117, "logits/rejected": 0.35262227058410645, "logps/chosen": -502.3717956542969, "logps/rejected": -539.2123413085938, "loss": 0.5405, "nll_loss": 0.4731524884700775, "rewards/accuracies": 0.875, "rewards/chosen": 0.6413518190383911, "rewards/margins": 0.619159460067749, "rewards/rejected": 0.02219228446483612, "step": 1142 }, { "epoch": 0.9941342602650446, "grad_norm": 123.59587852601393, "learning_rate": 1.4681696866081228e-11, "logits/chosen": 0.4699622094631195, "logits/rejected": 0.380214661359787, "logps/chosen": -515.212646484375, "logps/rejected": -449.09814453125, "loss": 0.6179, "nll_loss": 0.4764086902141571, "rewards/accuracies": 0.6875, "rewards/chosen": 0.3445565402507782, "rewards/margins": 0.3155711889266968, "rewards/rejected": 0.02898530103266239, "step": 1144 }, { "epoch": 0.9958722572235499, "grad_norm": 156.96054694461495, "learning_rate": 6.5252873148513574e-12, "logits/chosen": 0.21996259689331055, "logits/rejected": 0.23748603463172913, "logps/chosen": -400.7960510253906, "logps/rejected": -439.2467956542969, "loss": 0.5645, "nll_loss": 0.40228787064552307, "rewards/accuracies": 0.6875, "rewards/chosen": 0.2994455397129059, "rewards/margins": 0.2858263850212097, "rewards/rejected": 0.013619126752018929, "step": 1146 }, { "epoch": 0.9976102541820552, "grad_norm": 171.74798960847323, "learning_rate": 1.6313351349883652e-12, "logits/chosen": -0.033592335879802704, "logits/rejected": 0.09655453264713287, "logps/chosen": -386.03948974609375, "logps/rejected": -446.1105041503906, "loss": 0.531, "nll_loss": 0.41557371616363525, "rewards/accuracies": 0.8125, "rewards/chosen": 0.8122502565383911, "rewards/margins": 0.526740550994873, "rewards/rejected": 0.2855096757411957, "step": 1148 }, { "epoch": 0.9993482511405605, "grad_norm": 160.448811199801, "learning_rate": 0.0, "logits/chosen": 0.5092004537582397, "logits/rejected": 0.5407735109329224, "logps/chosen": -538.6259155273438, "logps/rejected": -522.8944091796875, "loss": 0.5529, "nll_loss": 0.5061779618263245, "rewards/accuracies": 0.75, "rewards/chosen": 0.5825742483139038, "rewards/margins": 0.24803534150123596, "rewards/rejected": 0.33453893661499023, "step": 1150 } ], "logging_steps": 2, "max_steps": 1150, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }