diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,9233 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9993482511405605, + "eval_steps": 10000, + "global_step": 1150, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0017379969585053225, + "grad_norm": 204.86322966456535, + "learning_rate": 8e-09, + "logits/chosen": 0.1418704390525818, + "logits/rejected": 0.2809927761554718, + "logps/chosen": -477.3938293457031, + "logps/rejected": -431.13787841796875, + "loss": 0.6996, + "nll_loss": 0.44202908873558044, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.01593799516558647, + "rewards/margins": -0.04110307618975639, + "rewards/rejected": 0.025165079161524773, + "step": 2 + }, + { + "epoch": 0.003475993917010645, + "grad_norm": 150.00820365668775, + "learning_rate": 1.6e-08, + "logits/chosen": 0.36849209666252136, + "logits/rejected": 0.4490591883659363, + "logps/chosen": -482.40179443359375, + "logps/rejected": -446.5460510253906, + "loss": 0.6997, + "nll_loss": 0.4239627420902252, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.041008852422237396, + "rewards/margins": -0.017010685056447983, + "rewards/rejected": -0.023998167365789413, + "step": 4 + }, + { + "epoch": 0.005213990875515968, + "grad_norm": 193.7012862807946, + "learning_rate": 2.3999999999999997e-08, + "logits/chosen": 0.1794443428516388, + "logits/rejected": 0.3568742275238037, + "logps/chosen": -519.8365478515625, + "logps/rejected": -510.6430969238281, + "loss": 0.7099, + "nll_loss": 0.46996381878852844, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.037707142531871796, + "rewards/margins": -0.050306886434555054, + "rewards/rejected": 0.01259975228458643, + "step": 6 + }, + { + "epoch": 0.00695198783402129, + "grad_norm": 205.73843329421297, + "learning_rate": 3.2e-08, + "logits/chosen": 0.44375962018966675, + "logits/rejected": 0.3268170952796936, + "logps/chosen": -543.103759765625, + "logps/rejected": -496.75482177734375, + "loss": 0.6864, + "nll_loss": 0.5108887553215027, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.058286286890506744, + "rewards/margins": 0.06145687401294708, + "rewards/rejected": -0.0031705868896096945, + "step": 8 + }, + { + "epoch": 0.008689984792526613, + "grad_norm": 200.84076796614073, + "learning_rate": 4e-08, + "logits/chosen": 0.35375142097473145, + "logits/rejected": 0.7080434560775757, + "logps/chosen": -415.6636962890625, + "logps/rejected": -469.8753662109375, + "loss": 0.7101, + "nll_loss": 0.398910790681839, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.062073614448308945, + "rewards/margins": -0.04637365788221359, + "rewards/rejected": -0.01569996029138565, + "step": 10 + }, + { + "epoch": 0.010427981751031936, + "grad_norm": 197.54313125975258, + "learning_rate": 4.799999999999999e-08, + "logits/chosen": 0.35080885887145996, + "logits/rejected": 0.41447973251342773, + "logps/chosen": -541.642822265625, + "logps/rejected": -553.8869018554688, + "loss": 0.7093, + "nll_loss": 0.4971155524253845, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.01922778971493244, + "rewards/margins": 0.019500732421875, + "rewards/rejected": -0.00027294084429740906, + "step": 12 + }, + { + "epoch": 0.012165978709537258, + "grad_norm": 238.36601819149988, + "learning_rate": 5.6000000000000005e-08, + "logits/chosen": 0.2947372794151306, + "logits/rejected": 0.23111987113952637, + "logps/chosen": -556.4337158203125, + "logps/rejected": -469.99188232421875, + "loss": 0.6999, + "nll_loss": 0.503655195236206, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.004339606035500765, + "rewards/margins": 0.020965958014130592, + "rewards/rejected": -0.025305552408099174, + "step": 14 + }, + { + "epoch": 0.01390397566804258, + "grad_norm": 185.56933262588134, + "learning_rate": 6.4e-08, + "logits/chosen": 0.44016793370246887, + "logits/rejected": 0.40880700945854187, + "logps/chosen": -454.967529296875, + "logps/rejected": -460.52435302734375, + "loss": 0.6806, + "nll_loss": 0.4614264667034149, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0023229592479765415, + "rewards/margins": 0.07889167219400406, + "rewards/rejected": -0.08121462166309357, + "step": 16 + }, + { + "epoch": 0.015641972626547904, + "grad_norm": 147.57386401300792, + "learning_rate": 7.2e-08, + "logits/chosen": 0.6430061459541321, + "logits/rejected": 0.6047405004501343, + "logps/chosen": -488.2435607910156, + "logps/rejected": -495.81756591796875, + "loss": 0.6886, + "nll_loss": 0.46323227882385254, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0426841676235199, + "rewards/margins": 0.06148987263441086, + "rewards/rejected": -0.018805695697665215, + "step": 18 + }, + { + "epoch": 0.017379969585053227, + "grad_norm": 190.85823681589352, + "learning_rate": 8e-08, + "logits/chosen": 0.6724389791488647, + "logits/rejected": 0.7834637761116028, + "logps/chosen": -481.4169616699219, + "logps/rejected": -496.19061279296875, + "loss": 0.7134, + "nll_loss": 0.4593673050403595, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.014772225171327591, + "rewards/margins": 0.004669668152928352, + "rewards/rejected": -0.019441891461610794, + "step": 20 + }, + { + "epoch": 0.01911796654355855, + "grad_norm": 188.80442020206598, + "learning_rate": 8.8e-08, + "logits/chosen": 0.4140382409095764, + "logits/rejected": 0.46662065386772156, + "logps/chosen": -430.3562927246094, + "logps/rejected": -462.67132568359375, + "loss": 0.7065, + "nll_loss": 0.4172236919403076, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.032254599034786224, + "rewards/margins": 0.02239227294921875, + "rewards/rejected": -0.054646871984004974, + "step": 22 + }, + { + "epoch": 0.02085596350206387, + "grad_norm": 228.2165299617552, + "learning_rate": 9.599999999999999e-08, + "logits/chosen": 0.45874932408332825, + "logits/rejected": 0.3751963973045349, + "logps/chosen": -530.1676025390625, + "logps/rejected": -504.3202819824219, + "loss": 0.7172, + "nll_loss": 0.5031083822250366, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.005689051002264023, + "rewards/margins": -0.030557062476873398, + "rewards/rejected": 0.03624610975384712, + "step": 24 + }, + { + "epoch": 0.022593960460569194, + "grad_norm": 265.02955682128356, + "learning_rate": 1.04e-07, + "logits/chosen": 0.5117384791374207, + "logits/rejected": 0.4469182789325714, + "logps/chosen": -551.666748046875, + "logps/rejected": -496.4570007324219, + "loss": 0.7189, + "nll_loss": 0.5280334949493408, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.06254005432128906, + "rewards/margins": -0.009257127530872822, + "rewards/rejected": -0.053282931447029114, + "step": 26 + }, + { + "epoch": 0.024331957419074516, + "grad_norm": 161.93619304345123, + "learning_rate": 1.1200000000000001e-07, + "logits/chosen": 0.3897124230861664, + "logits/rejected": 0.45780280232429504, + "logps/chosen": -481.9251403808594, + "logps/rejected": -518.1923828125, + "loss": 0.7133, + "nll_loss": 0.4771922826766968, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.036876581609249115, + "rewards/margins": 0.03441982343792915, + "rewards/rejected": -0.07129640877246857, + "step": 28 + }, + { + "epoch": 0.02606995437757984, + "grad_norm": 225.11472685922325, + "learning_rate": 1.2e-07, + "logits/chosen": 0.5148178339004517, + "logits/rejected": 0.5700947642326355, + "logps/chosen": -473.7587585449219, + "logps/rejected": -545.1907958984375, + "loss": 0.6845, + "nll_loss": 0.4589259624481201, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.04277763515710831, + "rewards/margins": 0.042406272143125534, + "rewards/rejected": 0.0003713611513376236, + "step": 30 + }, + { + "epoch": 0.02780795133608516, + "grad_norm": 317.47335723089736, + "learning_rate": 1.28e-07, + "logits/chosen": 0.27863508462905884, + "logits/rejected": 0.368697851896286, + "logps/chosen": -477.912353515625, + "logps/rejected": -506.5833435058594, + "loss": 0.7088, + "nll_loss": 0.4421365559101105, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.014108658768236637, + "rewards/margins": -0.04797716438770294, + "rewards/rejected": 0.03386850655078888, + "step": 32 + }, + { + "epoch": 0.029545948294590483, + "grad_norm": 210.72668411395, + "learning_rate": 1.36e-07, + "logits/chosen": 0.10042007267475128, + "logits/rejected": 0.028779903426766396, + "logps/chosen": -548.9990844726562, + "logps/rejected": -481.5511779785156, + "loss": 0.7029, + "nll_loss": 0.4736991822719574, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.0562286376953125, + "rewards/margins": -0.02881174348294735, + "rewards/rejected": -0.0274168960750103, + "step": 34 + }, + { + "epoch": 0.03128394525309581, + "grad_norm": 209.72814001229338, + "learning_rate": 1.44e-07, + "logits/chosen": 0.6066944003105164, + "logits/rejected": 0.5373053550720215, + "logps/chosen": -528.7098999023438, + "logps/rejected": -506.41632080078125, + "loss": 0.6697, + "nll_loss": 0.49464890360832214, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.09787749499082565, + "rewards/margins": 0.1495254635810852, + "rewards/rejected": -0.05164794996380806, + "step": 36 + }, + { + "epoch": 0.03302194221160113, + "grad_norm": 190.89703407210013, + "learning_rate": 1.5199999999999998e-07, + "logits/chosen": 0.46255195140838623, + "logits/rejected": 0.5460544228553772, + "logps/chosen": -482.3389587402344, + "logps/rejected": -484.7127990722656, + "loss": 0.6884, + "nll_loss": 0.4956420958042145, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.06698399037122726, + "rewards/margins": 0.08448342978954315, + "rewards/rejected": -0.017499446868896484, + "step": 38 + }, + { + "epoch": 0.03475993917010645, + "grad_norm": 199.4593307406834, + "learning_rate": 1.6e-07, + "logits/chosen": 0.22364471852779388, + "logits/rejected": 0.31248578429222107, + "logps/chosen": -453.6600341796875, + "logps/rejected": -450.4988708496094, + "loss": 0.7112, + "nll_loss": 0.4363557696342468, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.05961894989013672, + "rewards/margins": -0.03394022583961487, + "rewards/rejected": -0.0256787296384573, + "step": 40 + }, + { + "epoch": 0.036497936128611776, + "grad_norm": 239.19190157790425, + "learning_rate": 1.68e-07, + "logits/chosen": 0.34523797035217285, + "logits/rejected": 0.2734060287475586, + "logps/chosen": -522.078857421875, + "logps/rejected": -544.6460571289062, + "loss": 0.6942, + "nll_loss": 0.5025855302810669, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.013720512390136719, + "rewards/margins": 0.032335568219423294, + "rewards/rejected": -0.046056076884269714, + "step": 42 + }, + { + "epoch": 0.0382359330871171, + "grad_norm": 309.03061373800654, + "learning_rate": 1.76e-07, + "logits/chosen": 0.38464105129241943, + "logits/rejected": 0.40568751096725464, + "logps/chosen": -469.6867370605469, + "logps/rejected": -469.88507080078125, + "loss": 0.7049, + "nll_loss": 0.4352942705154419, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.0630766823887825, + "rewards/margins": -0.039861395955085754, + "rewards/rejected": -0.023215293884277344, + "step": 44 + }, + { + "epoch": 0.03997393004562242, + "grad_norm": 288.0039980121617, + "learning_rate": 1.84e-07, + "logits/chosen": 0.39405491948127747, + "logits/rejected": 0.37233734130859375, + "logps/chosen": -533.5667114257812, + "logps/rejected": -514.7755126953125, + "loss": 0.6918, + "nll_loss": 0.4772227108478546, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.031320951879024506, + "rewards/margins": 0.005975722335278988, + "rewards/rejected": 0.025345228612422943, + "step": 46 + }, + { + "epoch": 0.04171192700412774, + "grad_norm": 263.27022905858814, + "learning_rate": 1.9199999999999997e-07, + "logits/chosen": 0.5493794679641724, + "logits/rejected": 0.5213139057159424, + "logps/chosen": -439.7628173828125, + "logps/rejected": -462.7975158691406, + "loss": 0.6906, + "nll_loss": 0.41747790575027466, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.03803615644574165, + "rewards/margins": -0.06382055580615997, + "rewards/rejected": 0.10185670852661133, + "step": 48 + }, + { + "epoch": 0.043449923962633065, + "grad_norm": 190.65381482431846, + "learning_rate": 2e-07, + "logits/chosen": 0.6704590320587158, + "logits/rejected": 0.5686213374137878, + "logps/chosen": -457.4028625488281, + "logps/rejected": -493.1829833984375, + "loss": 0.6826, + "nll_loss": 0.43137940764427185, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.05906439200043678, + "rewards/margins": 0.08382105827331543, + "rewards/rejected": -0.024756668135523796, + "step": 50 + }, + { + "epoch": 0.04518792092113839, + "grad_norm": 248.201816470774, + "learning_rate": 1.99998368664865e-07, + "logits/chosen": 0.5073150992393494, + "logits/rejected": 0.519672155380249, + "logps/chosen": -532.7998046875, + "logps/rejected": -534.9505615234375, + "loss": 0.6769, + "nll_loss": 0.5028703212738037, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.001774406060576439, + "rewards/margins": 0.07039718329906464, + "rewards/rejected": -0.06862278282642365, + "step": 52 + }, + { + "epoch": 0.04692591787964371, + "grad_norm": 211.00552194335916, + "learning_rate": 1.9999347471268514e-07, + "logits/chosen": 0.6019046902656555, + "logits/rejected": 0.6082484126091003, + "logps/chosen": -472.231689453125, + "logps/rejected": -471.65594482421875, + "loss": 0.7054, + "nll_loss": 0.4506027400493622, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.05322122946381569, + "rewards/margins": 0.033222489058971405, + "rewards/rejected": -0.0864437147974968, + "step": 54 + }, + { + "epoch": 0.04866391483814903, + "grad_norm": 189.41970732319982, + "learning_rate": 1.9998531830313392e-07, + "logits/chosen": 0.6955520510673523, + "logits/rejected": 0.4593581259250641, + "logps/chosen": -482.1744384765625, + "logps/rejected": -411.509521484375, + "loss": 0.6807, + "nll_loss": 0.4741944968700409, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.03270740434527397, + "rewards/margins": -0.01927652209997177, + "rewards/rejected": 0.05198393017053604, + "step": 56 + }, + { + "epoch": 0.050401911796654354, + "grad_norm": 200.24361432233923, + "learning_rate": 1.9997389970232808e-07, + "logits/chosen": 0.3443059027194977, + "logits/rejected": 0.4937664568424225, + "logps/chosen": -467.9244384765625, + "logps/rejected": -454.7276916503906, + "loss": 0.7053, + "nll_loss": 0.4400884211063385, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.06452464312314987, + "rewards/margins": -0.08429832011461258, + "rewards/rejected": 0.01977367326617241, + "step": 58 + }, + { + "epoch": 0.05213990875515968, + "grad_norm": 163.032159459724, + "learning_rate": 1.9995921928281893e-07, + "logits/chosen": 0.422140896320343, + "logits/rejected": 0.5024916529655457, + "logps/chosen": -464.2950439453125, + "logps/rejected": -526.7002563476562, + "loss": 0.6961, + "nll_loss": 0.4522109031677246, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.04293403401970863, + "rewards/margins": -0.04488658532500267, + "rewards/rejected": 0.0019525480456650257, + "step": 60 + }, + { + "epoch": 0.053877905713665, + "grad_norm": 163.5730877723102, + "learning_rate": 1.9994127752358013e-07, + "logits/chosen": 0.6417545080184937, + "logits/rejected": 0.5685979723930359, + "logps/chosen": -535.2261962890625, + "logps/rejected": -503.78277587890625, + "loss": 0.6931, + "nll_loss": 0.5154188871383667, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.01960907131433487, + "rewards/margins": 0.09830178320407867, + "rewards/rejected": -0.0786927193403244, + "step": 62 + }, + { + "epoch": 0.05561590267217032, + "grad_norm": 164.19734726606004, + "learning_rate": 1.9992007500999212e-07, + "logits/chosen": 0.2852614223957062, + "logits/rejected": 0.309150367975235, + "logps/chosen": -434.3333435058594, + "logps/rejected": -432.8827209472656, + "loss": 0.6697, + "nll_loss": 0.42650461196899414, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.06457091122865677, + "rewards/margins": 0.11283140629529953, + "rewards/rejected": -0.04826049506664276, + "step": 64 + }, + { + "epoch": 0.057353899630675644, + "grad_norm": 208.2379517632823, + "learning_rate": 1.998956124338231e-07, + "logits/chosen": 0.6129292845726013, + "logits/rejected": 0.5678445100784302, + "logps/chosen": -449.71087646484375, + "logps/rejected": -409.4271240234375, + "loss": 0.7017, + "nll_loss": 0.43394580483436584, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.03126039355993271, + "rewards/margins": 0.05173855274915695, + "rewards/rejected": -0.020478159189224243, + "step": 66 + }, + { + "epoch": 0.059091896589180966, + "grad_norm": 293.2951894044055, + "learning_rate": 1.9986789059320613e-07, + "logits/chosen": 0.5414038896560669, + "logits/rejected": 0.3374328017234802, + "logps/chosen": -512.413818359375, + "logps/rejected": -405.919677734375, + "loss": 0.6938, + "nll_loss": 0.492817223072052, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.17506714165210724, + "rewards/margins": 0.13103190064430237, + "rewards/rejected": 0.04403524845838547, + "step": 68 + }, + { + "epoch": 0.06082989354768629, + "grad_norm": 196.4370462875855, + "learning_rate": 1.9983691039261354e-07, + "logits/chosen": 0.34781157970428467, + "logits/rejected": 0.24695612490177155, + "logps/chosen": -475.7244567871094, + "logps/rejected": -478.4767761230469, + "loss": 0.6567, + "nll_loss": 0.4392806589603424, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.07889433950185776, + "rewards/margins": 0.14110715687274933, + "rewards/rejected": -0.06221282482147217, + "step": 70 + }, + { + "epoch": 0.06256789050619162, + "grad_norm": 163.4462233111686, + "learning_rate": 1.9980267284282714e-07, + "logits/chosen": 0.5121564269065857, + "logits/rejected": 0.5738804340362549, + "logps/chosen": -440.2140808105469, + "logps/rejected": -519.8204345703125, + "loss": 0.6778, + "nll_loss": 0.42939573526382446, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.09369020164012909, + "rewards/margins": 0.0274474136531353, + "rewards/rejected": 0.06624279171228409, + "step": 72 + }, + { + "epoch": 0.06430588746469694, + "grad_norm": 226.25500851573472, + "learning_rate": 1.9976517906090527e-07, + "logits/chosen": 0.482619047164917, + "logits/rejected": 0.45125705003738403, + "logps/chosen": -459.4217529296875, + "logps/rejected": -491.68072509765625, + "loss": 0.6939, + "nll_loss": 0.46240657567977905, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.07695970684289932, + "rewards/margins": -0.011097146198153496, + "rewards/rejected": 0.08805684745311737, + "step": 74 + }, + { + "epoch": 0.06604388442320226, + "grad_norm": 187.52225346061883, + "learning_rate": 1.9972443027014636e-07, + "logits/chosen": 0.37846773862838745, + "logits/rejected": 0.28516721725463867, + "logps/chosen": -477.3094482421875, + "logps/rejected": -492.2334899902344, + "loss": 0.669, + "nll_loss": 0.4478228986263275, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.18834134936332703, + "rewards/margins": -0.018651390448212624, + "rewards/rejected": 0.2069927304983139, + "step": 76 + }, + { + "epoch": 0.06778188138170758, + "grad_norm": 286.4122926243434, + "learning_rate": 1.9968042780004915e-07, + "logits/chosen": 0.5606168508529663, + "logits/rejected": 0.7590384483337402, + "logps/chosen": -471.123779296875, + "logps/rejected": -469.21038818359375, + "loss": 0.6712, + "nll_loss": 0.46313852071762085, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.18222695589065552, + "rewards/margins": 0.03420209884643555, + "rewards/rejected": 0.14802484214305878, + "step": 78 + }, + { + "epoch": 0.0695198783402129, + "grad_norm": 157.28755514832162, + "learning_rate": 1.9963317308626913e-07, + "logits/chosen": 0.49132657051086426, + "logits/rejected": 0.6132655739784241, + "logps/chosen": -426.67108154296875, + "logps/rejected": -420.4914245605469, + "loss": 0.7003, + "nll_loss": 0.412084698677063, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.16370698809623718, + "rewards/margins": -0.004933936521410942, + "rewards/rejected": 0.16864091157913208, + "step": 80 + }, + { + "epoch": 0.07125787529871823, + "grad_norm": 151.4016129399313, + "learning_rate": 1.995826676705718e-07, + "logits/chosen": 0.5328776836395264, + "logits/rejected": 0.45003530383110046, + "logps/chosen": -461.80194091796875, + "logps/rejected": -497.72930908203125, + "loss": 0.6519, + "nll_loss": 0.46158552169799805, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.260319322347641, + "rewards/margins": 0.15180225670337677, + "rewards/rejected": 0.10851707309484482, + "step": 82 + }, + { + "epoch": 0.07299587225722355, + "grad_norm": 245.99900032176603, + "learning_rate": 1.9952891320078235e-07, + "logits/chosen": 0.2650616765022278, + "logits/rejected": 0.2826390862464905, + "logps/chosen": -517.3169555664062, + "logps/rejected": -564.4183349609375, + "loss": 0.6991, + "nll_loss": 0.49650338292121887, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.193414106965065, + "rewards/margins": -0.06524582207202911, + "rewards/rejected": 0.2586599588394165, + "step": 84 + }, + { + "epoch": 0.07473386921572887, + "grad_norm": 336.95168542352815, + "learning_rate": 1.9947191143073184e-07, + "logits/chosen": 0.3787465989589691, + "logits/rejected": 0.31394198536872864, + "logps/chosen": -476.78155517578125, + "logps/rejected": -523.172607421875, + "loss": 0.7061, + "nll_loss": 0.48117950558662415, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.2214973419904709, + "rewards/margins": 0.1059291809797287, + "rewards/rejected": 0.11556817591190338, + "step": 86 + }, + { + "epoch": 0.0764718661742342, + "grad_norm": 154.50191840037084, + "learning_rate": 1.9941166422020012e-07, + "logits/chosen": 0.5573416352272034, + "logits/rejected": 0.5087465643882751, + "logps/chosen": -487.1637268066406, + "logps/rejected": -495.28753662109375, + "loss": 0.6828, + "nll_loss": 0.5002313852310181, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.14835377037525177, + "rewards/margins": -0.005983538925647736, + "rewards/rejected": 0.1543373167514801, + "step": 88 + }, + { + "epoch": 0.07820986313273952, + "grad_norm": 166.90828372159802, + "learning_rate": 1.99348173534855e-07, + "logits/chosen": 0.4055403769016266, + "logits/rejected": 0.5100260972976685, + "logps/chosen": -398.7601318359375, + "logps/rejected": -449.6722106933594, + "loss": 0.648, + "nll_loss": 0.3796139061450958, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.08756943047046661, + "rewards/margins": 0.035093024373054504, + "rewards/rejected": 0.05247640609741211, + "step": 90 + }, + { + "epoch": 0.07994786009124484, + "grad_norm": 195.34860330922328, + "learning_rate": 1.9928144144618822e-07, + "logits/chosen": 0.35453662276268005, + "logits/rejected": 0.17467136681079865, + "logps/chosen": -527.5667114257812, + "logps/rejected": -457.3980407714844, + "loss": 0.6638, + "nll_loss": 0.4913034439086914, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.05260029435157776, + "rewards/margins": 0.0027903541922569275, + "rewards/rejected": 0.04980994015932083, + "step": 92 + }, + { + "epoch": 0.08168585704975016, + "grad_norm": 193.22023003323437, + "learning_rate": 1.992114701314478e-07, + "logits/chosen": 0.43743571639060974, + "logits/rejected": 0.4072650074958801, + "logps/chosen": -459.80718994140625, + "logps/rejected": -480.4713439941406, + "loss": 0.6549, + "nll_loss": 0.442990243434906, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.11542568355798721, + "rewards/margins": 0.06230240315198898, + "rewards/rejected": 0.053123295307159424, + "step": 94 + }, + { + "epoch": 0.08342385400825549, + "grad_norm": 152.59229490030327, + "learning_rate": 1.9913826187356696e-07, + "logits/chosen": 0.2101035714149475, + "logits/rejected": 0.2910143733024597, + "logps/chosen": -496.73968505859375, + "logps/rejected": -562.0128784179688, + "loss": 0.693, + "nll_loss": 0.4974033236503601, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.035814739763736725, + "rewards/margins": -0.005833083763718605, + "rewards/rejected": 0.04164781793951988, + "step": 96 + }, + { + "epoch": 0.08516185096676081, + "grad_norm": 197.87306458058632, + "learning_rate": 1.990618190610898e-07, + "logits/chosen": 0.6426812410354614, + "logits/rejected": 0.6269667148590088, + "logps/chosen": -482.47320556640625, + "logps/rejected": -498.030517578125, + "loss": 0.6742, + "nll_loss": 0.45372655987739563, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.1103857010602951, + "rewards/margins": 0.13235224783420563, + "rewards/rejected": -0.02196655422449112, + "step": 98 + }, + { + "epoch": 0.08689984792526613, + "grad_norm": 162.0131801417083, + "learning_rate": 1.9898214418809327e-07, + "logits/chosen": 0.30567625164985657, + "logits/rejected": 0.3604564666748047, + "logps/chosen": -491.9207458496094, + "logps/rejected": -483.3408508300781, + "loss": 0.6603, + "nll_loss": 0.45867347717285156, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.022484881803393364, + "rewards/margins": -0.0881936103105545, + "rewards/rejected": 0.06570873409509659, + "step": 100 + }, + { + "epoch": 0.08863784488377145, + "grad_norm": 196.02128008024215, + "learning_rate": 1.9889923985410573e-07, + "logits/chosen": 0.3616268038749695, + "logits/rejected": 0.5539577603340149, + "logps/chosen": -507.529541015625, + "logps/rejected": -502.7597961425781, + "loss": 0.6621, + "nll_loss": 0.502344012260437, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.13264751434326172, + "rewards/margins": 0.15221844613552094, + "rewards/rejected": -0.01957092434167862, + "step": 102 + }, + { + "epoch": 0.09037584184227677, + "grad_norm": 218.21958324850453, + "learning_rate": 1.9881310876402223e-07, + "logits/chosen": 0.38627803325653076, + "logits/rejected": 0.3380679786205292, + "logps/chosen": -490.83709716796875, + "logps/rejected": -445.6140441894531, + "loss": 0.708, + "nll_loss": 0.4629852771759033, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.02814798429608345, + "rewards/margins": 0.07472963631153107, + "rewards/rejected": -0.04658164829015732, + "step": 104 + }, + { + "epoch": 0.0921138388007821, + "grad_norm": 177.62809187588084, + "learning_rate": 1.9872375372801627e-07, + "logits/chosen": 0.48396801948547363, + "logits/rejected": 0.7172459363937378, + "logps/chosen": -467.0228576660156, + "logps/rejected": -553.0386352539062, + "loss": 0.6736, + "nll_loss": 0.44019949436187744, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.11023005843162537, + "rewards/margins": 0.025584600865840912, + "rewards/rejected": 0.08464546501636505, + "step": 106 + }, + { + "epoch": 0.09385183575928742, + "grad_norm": 133.9957527520213, + "learning_rate": 1.9863117766144804e-07, + "logits/chosen": 0.6896294355392456, + "logits/rejected": 0.5460060834884644, + "logps/chosen": -519.9830322265625, + "logps/rejected": -495.1474304199219, + "loss": 0.6545, + "nll_loss": 0.5037875771522522, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.04275064915418625, + "rewards/margins": 0.25670328736305237, + "rewards/rejected": -0.21395263075828552, + "step": 108 + }, + { + "epoch": 0.09558983271779274, + "grad_norm": 251.08859259838198, + "learning_rate": 1.985353835847693e-07, + "logits/chosen": 0.5879981517791748, + "logits/rejected": 0.3931455612182617, + "logps/chosen": -512.4330444335938, + "logps/rejected": -445.7779541015625, + "loss": 0.6627, + "nll_loss": 0.49814072251319885, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.01979961059987545, + "rewards/margins": 0.12016735970973969, + "rewards/rejected": -0.10036774724721909, + "step": 110 + }, + { + "epoch": 0.09732782967629806, + "grad_norm": 251.4984918048756, + "learning_rate": 1.9843637462342496e-07, + "logits/chosen": 0.6077919602394104, + "logits/rejected": 0.6450572609901428, + "logps/chosen": -452.0365295410156, + "logps/rejected": -490.2518310546875, + "loss": 0.6746, + "nll_loss": 0.44132208824157715, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.06424455344676971, + "rewards/margins": 0.1958976835012436, + "rewards/rejected": -0.13165313005447388, + "step": 112 + }, + { + "epoch": 0.09906582663480339, + "grad_norm": 176.76300504083784, + "learning_rate": 1.9833415400775092e-07, + "logits/chosen": 0.8745415210723877, + "logits/rejected": 0.7276170253753662, + "logps/chosen": -474.9569396972656, + "logps/rejected": -453.19403076171875, + "loss": 0.6874, + "nll_loss": 0.45759472250938416, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.04741678386926651, + "rewards/margins": 0.028619293123483658, + "rewards/rejected": 0.018797490745782852, + "step": 114 + }, + { + "epoch": 0.10080382359330871, + "grad_norm": 195.6725076422546, + "learning_rate": 1.9822872507286887e-07, + "logits/chosen": 0.25701722502708435, + "logits/rejected": 0.27016139030456543, + "logps/chosen": -468.56292724609375, + "logps/rejected": -488.0750732421875, + "loss": 0.659, + "nll_loss": 0.4411785900592804, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.141248419880867, + "rewards/margins": 0.12923917174339294, + "rewards/rejected": 0.012009241618216038, + "step": 116 + }, + { + "epoch": 0.10254182055181403, + "grad_norm": 223.97345705886528, + "learning_rate": 1.9812009125857728e-07, + "logits/chosen": 0.24996408820152283, + "logits/rejected": 0.18013660609722137, + "logps/chosen": -454.4783935546875, + "logps/rejected": -452.7356872558594, + "loss": 0.6587, + "nll_loss": 0.4465644955635071, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.20268402993679047, + "rewards/margins": 0.1341714859008789, + "rewards/rejected": 0.06851252913475037, + "step": 118 + }, + { + "epoch": 0.10427981751031935, + "grad_norm": 259.9580968692604, + "learning_rate": 1.9800825610923934e-07, + "logits/chosen": 0.4408469498157501, + "logits/rejected": 0.41384851932525635, + "logps/chosen": -448.9534912109375, + "logps/rejected": -466.5663146972656, + "loss": 0.7332, + "nll_loss": 0.4316788911819458, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.009281730279326439, + "rewards/margins": 0.08196325600147247, + "rewards/rejected": -0.07268151640892029, + "step": 120 + }, + { + "epoch": 0.10601781446882468, + "grad_norm": 166.16660946467783, + "learning_rate": 1.9789322327366719e-07, + "logits/chosen": 0.4468734562397003, + "logits/rejected": 0.2030533403158188, + "logps/chosen": -529.8795776367188, + "logps/rejected": -479.37957763671875, + "loss": 0.6549, + "nll_loss": 0.4917633831501007, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.0391303114593029, + "rewards/margins": 0.13398800790309906, + "rewards/rejected": -0.17311832308769226, + "step": 122 + }, + { + "epoch": 0.10775581142733, + "grad_norm": 223.08401763734403, + "learning_rate": 1.97774996505003e-07, + "logits/chosen": 0.5849189758300781, + "logits/rejected": 0.4860993027687073, + "logps/chosen": -477.4208984375, + "logps/rejected": -426.2466125488281, + "loss": 0.6735, + "nll_loss": 0.4537978768348694, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.07169342041015625, + "rewards/margins": 0.189275324344635, + "rewards/rejected": -0.11758189648389816, + "step": 124 + }, + { + "epoch": 0.10949380838583532, + "grad_norm": 214.45226686525862, + "learning_rate": 1.9765357966059635e-07, + "logits/chosen": 0.3300391137599945, + "logits/rejected": 0.3301427662372589, + "logps/chosen": -495.9730224609375, + "logps/rejected": -525.8510131835938, + "loss": 0.6301, + "nll_loss": 0.4744013547897339, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.016232872381806374, + "rewards/margins": 0.2226220965385437, + "rewards/rejected": -0.20638923346996307, + "step": 126 + }, + { + "epoch": 0.11123180534434064, + "grad_norm": 196.16284112503507, + "learning_rate": 1.975289767018786e-07, + "logits/chosen": 0.383301705121994, + "logits/rejected": 0.3108516037464142, + "logps/chosen": -477.2252502441406, + "logps/rejected": -413.8544921875, + "loss": 0.6543, + "nll_loss": 0.4823342263698578, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.07313746213912964, + "rewards/margins": -0.002835087478160858, + "rewards/rejected": 0.0759725570678711, + "step": 128 + }, + { + "epoch": 0.11296980230284596, + "grad_norm": 172.7848965773789, + "learning_rate": 1.9740119169423335e-07, + "logits/chosen": 0.41243690252304077, + "logits/rejected": 0.500731885433197, + "logps/chosen": -560.973876953125, + "logps/rejected": -556.9948120117188, + "loss": 0.6745, + "nll_loss": 0.537980854511261, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.018784141167998314, + "rewards/margins": 0.18619613349437714, + "rewards/rejected": -0.16741199791431427, + "step": 130 + }, + { + "epoch": 0.11470779926135129, + "grad_norm": 107.09619636786023, + "learning_rate": 1.972702288068641e-07, + "logits/chosen": 0.5883492827415466, + "logits/rejected": 0.6014207601547241, + "logps/chosen": -492.56719970703125, + "logps/rejected": -571.8795166015625, + "loss": 0.6591, + "nll_loss": 0.4626290500164032, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.03946494683623314, + "rewards/margins": 0.3372575640678406, + "rewards/rejected": -0.29779261350631714, + "step": 132 + }, + { + "epoch": 0.11644579621985661, + "grad_norm": 450.5904425564273, + "learning_rate": 1.9713609231265803e-07, + "logits/chosen": 0.2925584614276886, + "logits/rejected": 0.2094520628452301, + "logps/chosen": -466.8580322265625, + "logps/rejected": -448.7713928222656, + "loss": 0.7074, + "nll_loss": 0.4871719181537628, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.10256430506706238, + "rewards/margins": -0.13159841299057007, + "rewards/rejected": 0.23416268825531006, + "step": 134 + }, + { + "epoch": 0.11818379317836193, + "grad_norm": 230.5018212403494, + "learning_rate": 1.969987865880467e-07, + "logits/chosen": 0.471775084733963, + "logits/rejected": 0.591442883014679, + "logps/chosen": -499.01312255859375, + "logps/rejected": -505.15234375, + "loss": 0.6955, + "nll_loss": 0.4720504581928253, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.08871403336524963, + "rewards/margins": -0.016466330736875534, + "rewards/rejected": -0.0722476989030838, + "step": 136 + }, + { + "epoch": 0.11992179013686725, + "grad_norm": 140.2346299082145, + "learning_rate": 1.968583161128631e-07, + "logits/chosen": 0.3392627239227295, + "logits/rejected": 0.4022282660007477, + "logps/chosen": -459.1971740722656, + "logps/rejected": -466.5972595214844, + "loss": 0.6409, + "nll_loss": 0.4637642502784729, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.10886505246162415, + "rewards/margins": 0.055580608546733856, + "rewards/rejected": 0.05328445881605148, + "step": 138 + }, + { + "epoch": 0.12165978709537258, + "grad_norm": 231.29684861071243, + "learning_rate": 1.967146854701957e-07, + "logits/chosen": 0.5594834685325623, + "logits/rejected": 0.39609870314598083, + "logps/chosen": -528.64453125, + "logps/rejected": -495.0397033691406, + "loss": 0.7025, + "nll_loss": 0.49201473593711853, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.022034645080566406, + "rewards/margins": 0.07131557166576385, + "rewards/rejected": -0.04928094893693924, + "step": 140 + }, + { + "epoch": 0.12339778405387791, + "grad_norm": 149.25121625290038, + "learning_rate": 1.965678993462388e-07, + "logits/chosen": 0.3449370265007019, + "logits/rejected": 0.48329412937164307, + "logps/chosen": -487.4235534667969, + "logps/rejected": -518.1926879882812, + "loss": 0.6558, + "nll_loss": 0.4604225754737854, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.023956965655088425, + "rewards/margins": 0.1537911593914032, + "rewards/rejected": -0.17774812877178192, + "step": 142 + }, + { + "epoch": 0.12513578101238323, + "grad_norm": 378.4646764352743, + "learning_rate": 1.9641796253013955e-07, + "logits/chosen": 0.40967145562171936, + "logits/rejected": 0.7070332169532776, + "logps/chosen": -441.1578063964844, + "logps/rejected": -520.5462646484375, + "loss": 0.6765, + "nll_loss": 0.4579017758369446, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0898236334323883, + "rewards/margins": 0.07469739019870758, + "rewards/rejected": 0.015126226469874382, + "step": 144 + }, + { + "epoch": 0.12687377797088856, + "grad_norm": 173.82766602111045, + "learning_rate": 1.9626487991384193e-07, + "logits/chosen": 0.7006528377532959, + "logits/rejected": 0.6683908700942993, + "logps/chosen": -466.95941162109375, + "logps/rejected": -433.3965759277344, + "loss": 0.6957, + "nll_loss": 0.4601879417896271, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.00033187679946422577, + "rewards/margins": 0.07793959975242615, + "rewards/rejected": -0.07760772109031677, + "step": 146 + }, + { + "epoch": 0.12861177492939388, + "grad_norm": 186.16920415747254, + "learning_rate": 1.9610865649192693e-07, + "logits/chosen": 0.5055305361747742, + "logits/rejected": 0.34722912311553955, + "logps/chosen": -526.1424560546875, + "logps/rejected": -392.083740234375, + "loss": 0.7046, + "nll_loss": 0.4694558382034302, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.25952208042144775, + "rewards/margins": -0.08689786493778229, + "rewards/rejected": -0.17262420058250427, + "step": 148 + }, + { + "epoch": 0.1303497718878992, + "grad_norm": 140.55582904451313, + "learning_rate": 1.9594929736144973e-07, + "logits/chosen": 0.6043753623962402, + "logits/rejected": 0.5178260803222656, + "logps/chosen": -484.9609375, + "logps/rejected": -496.45562744140625, + "loss": 0.6977, + "nll_loss": 0.4659056067466736, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.002574533224105835, + "rewards/margins": 0.12758272886276245, + "rewards/rejected": -0.13015729188919067, + "step": 150 + }, + { + "epoch": 0.13208776884640452, + "grad_norm": 144.45587301387354, + "learning_rate": 1.9578680772177326e-07, + "logits/chosen": 0.6586911082267761, + "logits/rejected": 0.6202735304832458, + "logps/chosen": -486.1122741699219, + "logps/rejected": -505.4885559082031, + "loss": 0.6498, + "nll_loss": 0.45736804604530334, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.034781839698553085, + "rewards/margins": -0.023356247693300247, + "rewards/rejected": -0.011425594799220562, + "step": 152 + }, + { + "epoch": 0.13382576580490985, + "grad_norm": 168.4370224409038, + "learning_rate": 1.956211928743987e-07, + "logits/chosen": 0.3550521731376648, + "logits/rejected": 0.37308141589164734, + "logps/chosen": -452.2578430175781, + "logps/rejected": -458.32623291015625, + "loss": 0.7033, + "nll_loss": 0.44661641120910645, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.0856170654296875, + "rewards/margins": 0.16778725385665894, + "rewards/rejected": -0.08217020332813263, + "step": 154 + }, + { + "epoch": 0.13556376276341517, + "grad_norm": 141.91361126736982, + "learning_rate": 1.9545245822279242e-07, + "logits/chosen": 0.6770870685577393, + "logits/rejected": 0.6404905915260315, + "logps/chosen": -462.7332763671875, + "logps/rejected": -408.6764831542969, + "loss": 0.6552, + "nll_loss": 0.4415394067764282, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07933865487575531, + "rewards/margins": 0.038278963416814804, + "rewards/rejected": -0.11761761456727982, + "step": 156 + }, + { + "epoch": 0.1373017597219205, + "grad_norm": 230.74705101230725, + "learning_rate": 1.9528060927220979e-07, + "logits/chosen": 0.5159198045730591, + "logits/rejected": 0.6130585074424744, + "logps/chosen": -481.01568603515625, + "logps/rejected": -521.5822143554688, + "loss": 0.6778, + "nll_loss": 0.47335052490234375, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.14715270698070526, + "rewards/margins": 0.11525917053222656, + "rewards/rejected": -0.262411892414093, + "step": 158 + }, + { + "epoch": 0.1390397566804258, + "grad_norm": 202.01845557198166, + "learning_rate": 1.9510565162951537e-07, + "logits/chosen": 0.7327361702919006, + "logits/rejected": 0.7527709603309631, + "logps/chosen": -461.3598937988281, + "logps/rejected": -520.5428466796875, + "loss": 0.6427, + "nll_loss": 0.4450530409812927, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.05635242164134979, + "rewards/margins": 0.10485029220581055, + "rewards/rejected": -0.048497870564460754, + "step": 160 + }, + { + "epoch": 0.14077775363893114, + "grad_norm": 195.27141708111617, + "learning_rate": 1.9492759100300015e-07, + "logits/chosen": 0.35761919617652893, + "logits/rejected": 0.39191699028015137, + "logps/chosen": -457.64892578125, + "logps/rejected": -505.98175048828125, + "loss": 0.6494, + "nll_loss": 0.4446874260902405, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.0755319595336914, + "rewards/margins": 0.2582038938999176, + "rewards/rejected": -0.3337358832359314, + "step": 162 + }, + { + "epoch": 0.14251575059743646, + "grad_norm": 156.30006812274797, + "learning_rate": 1.947464332021953e-07, + "logits/chosen": 0.4818640947341919, + "logits/rejected": 0.49132969975471497, + "logps/chosen": -516.2056274414062, + "logps/rejected": -585.5494995117188, + "loss": 0.6678, + "nll_loss": 0.46864089369773865, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0640132874250412, + "rewards/margins": 0.2289627343416214, + "rewards/rejected": -0.1649494171142578, + "step": 164 + }, + { + "epoch": 0.14425374755594178, + "grad_norm": 240.13788888052562, + "learning_rate": 1.9456218413768248e-07, + "logits/chosen": 0.3798179030418396, + "logits/rejected": 0.5492743253707886, + "logps/chosen": -508.6365051269531, + "logps/rejected": -582.9490966796875, + "loss": 0.6732, + "nll_loss": 0.4977247714996338, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.009553920477628708, + "rewards/margins": 0.3013032376766205, + "rewards/rejected": -0.3108571171760559, + "step": 166 + }, + { + "epoch": 0.1459917445144471, + "grad_norm": 204.66513708394191, + "learning_rate": 1.9437484982090119e-07, + "logits/chosen": 0.22984851896762848, + "logits/rejected": 0.21164825558662415, + "logps/chosen": -551.8775024414062, + "logps/rejected": -499.84112548828125, + "loss": 0.6811, + "nll_loss": 0.5149954557418823, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.07367496192455292, + "rewards/margins": 0.2555732727050781, + "rewards/rejected": -0.1818983256816864, + "step": 168 + }, + { + "epoch": 0.14772974147295242, + "grad_norm": 207.99963697618364, + "learning_rate": 1.941844363639525e-07, + "logits/chosen": 0.26835981011390686, + "logits/rejected": 0.3549097776412964, + "logps/chosen": -486.7258605957031, + "logps/rejected": -511.31573486328125, + "loss": 0.6888, + "nll_loss": 0.48714739084243774, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2133750021457672, + "rewards/margins": -0.16475935280323029, + "rewards/rejected": 0.3781343698501587, + "step": 170 + }, + { + "epoch": 0.14946773843145775, + "grad_norm": 196.09540219875737, + "learning_rate": 1.9399094997939956e-07, + "logits/chosen": 0.6336839199066162, + "logits/rejected": 0.5925776958465576, + "logps/chosen": -450.0120544433594, + "logps/rejected": -426.1235046386719, + "loss": 0.6385, + "nll_loss": 0.41613397002220154, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3170146942138672, + "rewards/margins": 0.191303551197052, + "rewards/rejected": 0.12571117281913757, + "step": 172 + }, + { + "epoch": 0.15120573538996307, + "grad_norm": 240.86095927305658, + "learning_rate": 1.937943969800652e-07, + "logits/chosen": 0.2430100440979004, + "logits/rejected": 0.46853843331336975, + "logps/chosen": -448.8415222167969, + "logps/rejected": -533.2725219726562, + "loss": 0.6814, + "nll_loss": 0.4187857210636139, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.17961281538009644, + "rewards/margins": 0.09986276179552078, + "rewards/rejected": 0.07975006848573685, + "step": 174 + }, + { + "epoch": 0.1529437323484684, + "grad_norm": 166.34961492388385, + "learning_rate": 1.9359478377882566e-07, + "logits/chosen": 0.6643735766410828, + "logits/rejected": 0.6006796360015869, + "logps/chosen": -500.4281311035156, + "logps/rejected": -508.8857421875, + "loss": 0.6521, + "nll_loss": 0.46919527649879456, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.2530907392501831, + "rewards/margins": 0.1539958119392395, + "rewards/rejected": 0.09909495711326599, + "step": 176 + }, + { + "epoch": 0.15468172930697371, + "grad_norm": 157.1484708841, + "learning_rate": 1.9339211688840155e-07, + "logits/chosen": 0.48778361082077026, + "logits/rejected": 0.5057650804519653, + "logps/chosen": -546.6986694335938, + "logps/rejected": -591.006591796875, + "loss": 0.6418, + "nll_loss": 0.5191956162452698, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.2293836623430252, + "rewards/margins": 0.06616592407226562, + "rewards/rejected": 0.16321773827075958, + "step": 178 + }, + { + "epoch": 0.15641972626547904, + "grad_norm": 179.7526837035682, + "learning_rate": 1.9318640292114523e-07, + "logits/chosen": 0.14600294828414917, + "logits/rejected": 0.18026244640350342, + "logps/chosen": -515.8131103515625, + "logps/rejected": -492.68585205078125, + "loss": 0.6491, + "nll_loss": 0.49840864539146423, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3661031723022461, + "rewards/margins": 0.4645713269710541, + "rewards/rejected": -0.09846819937229156, + "step": 180 + }, + { + "epoch": 0.15815772322398436, + "grad_norm": 156.21169756335783, + "learning_rate": 1.9297764858882513e-07, + "logits/chosen": 0.2845636308193207, + "logits/rejected": 0.3478243052959442, + "logps/chosen": -433.43304443359375, + "logps/rejected": -502.88873291015625, + "loss": 0.7059, + "nll_loss": 0.4498353600502014, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.032468315213918686, + "rewards/margins": 0.11264065653085709, + "rewards/rejected": -0.0801723524928093, + "step": 182 + }, + { + "epoch": 0.15989572018248968, + "grad_norm": 114.38837551366716, + "learning_rate": 1.9276586070240682e-07, + "logits/chosen": 0.5281504988670349, + "logits/rejected": 0.5060007572174072, + "logps/chosen": -533.6888427734375, + "logps/rejected": -484.2074279785156, + "loss": 0.6467, + "nll_loss": 0.5050809383392334, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.07081624865531921, + "rewards/margins": -0.019865036010742188, + "rewards/rejected": 0.0906812772154808, + "step": 184 + }, + { + "epoch": 0.161633717140995, + "grad_norm": 190.3601062924143, + "learning_rate": 1.9255104617183066e-07, + "logits/chosen": 0.42118650674819946, + "logits/rejected": 0.551128089427948, + "logps/chosen": -447.9131774902344, + "logps/rejected": -466.6624755859375, + "loss": 0.6818, + "nll_loss": 0.42739805579185486, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.13303765654563904, + "rewards/margins": 0.11362534016370773, + "rewards/rejected": 0.019412323832511902, + "step": 186 + }, + { + "epoch": 0.16337171409950033, + "grad_norm": 174.81457999255366, + "learning_rate": 1.9233321200578657e-07, + "logits/chosen": 0.6645621061325073, + "logits/rejected": 0.7099349498748779, + "logps/chosen": -479.3392028808594, + "logps/rejected": -501.9243469238281, + "loss": 0.7046, + "nll_loss": 0.45945340394973755, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.11817818135023117, + "rewards/margins": 0.06987819075584412, + "rewards/rejected": 0.04829998314380646, + "step": 188 + }, + { + "epoch": 0.16510971105800565, + "grad_norm": 214.65093516136065, + "learning_rate": 1.92112365311485e-07, + "logits/chosen": 0.488328754901886, + "logits/rejected": 0.4507230818271637, + "logps/chosen": -542.5758056640625, + "logps/rejected": -518.2319946289062, + "loss": 0.6863, + "nll_loss": 0.5090084075927734, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.04686184227466583, + "rewards/margins": -0.026256389915943146, + "rewards/rejected": -0.02060546912252903, + "step": 190 + }, + { + "epoch": 0.16684770801651097, + "grad_norm": 175.7065914966413, + "learning_rate": 1.9188851329442546e-07, + "logits/chosen": 0.3762364089488983, + "logits/rejected": 0.42694732546806335, + "logps/chosen": -578.720703125, + "logps/rejected": -550.4427490234375, + "loss": 0.7033, + "nll_loss": 0.5272043943405151, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.031178677454590797, + "rewards/margins": 0.00896589457988739, + "rewards/rejected": 0.02221280336380005, + "step": 192 + }, + { + "epoch": 0.1685857049750163, + "grad_norm": 243.86672474625263, + "learning_rate": 1.9166166325816117e-07, + "logits/chosen": 0.43783852458000183, + "logits/rejected": 0.41702184081077576, + "logps/chosen": -455.8414001464844, + "logps/rejected": -523.8115234375, + "loss": 0.6369, + "nll_loss": 0.45409929752349854, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.2518928647041321, + "rewards/margins": 0.3256605863571167, + "rewards/rejected": -0.07376771420240402, + "step": 194 + }, + { + "epoch": 0.17032370193352162, + "grad_norm": 169.63122858450672, + "learning_rate": 1.9143182260406076e-07, + "logits/chosen": 0.32926619052886963, + "logits/rejected": 0.3965323567390442, + "logps/chosen": -518.8862915039062, + "logps/rejected": -475.6866760253906, + "loss": 0.6773, + "nll_loss": 0.49594274163246155, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.010239500552415848, + "rewards/margins": -0.037561044096946716, + "rewards/rejected": 0.04780054837465286, + "step": 196 + }, + { + "epoch": 0.17206169889202694, + "grad_norm": 239.31635495177088, + "learning_rate": 1.91198998831067e-07, + "logits/chosen": 0.2399873584508896, + "logits/rejected": 0.3778996467590332, + "logps/chosen": -429.89898681640625, + "logps/rejected": -490.7498779296875, + "loss": 0.6738, + "nll_loss": 0.4365084767341614, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.49901849031448364, + "rewards/margins": 0.2948850393295288, + "rewards/rejected": 0.20413342118263245, + "step": 198 + }, + { + "epoch": 0.17379969585053226, + "grad_norm": 442.23377481682337, + "learning_rate": 1.9096319953545185e-07, + "logits/chosen": 0.2612355947494507, + "logits/rejected": 0.21945816278457642, + "logps/chosen": -421.193359375, + "logps/rejected": -504.9490966796875, + "loss": 0.6253, + "nll_loss": 0.41177114844322205, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.4579131007194519, + "rewards/margins": 0.3565528094768524, + "rewards/rejected": 0.10136031359434128, + "step": 200 + }, + { + "epoch": 0.17553769280903758, + "grad_norm": 183.97075955828043, + "learning_rate": 1.9072443241056882e-07, + "logits/chosen": 0.37889376282691956, + "logits/rejected": 0.3554326593875885, + "logps/chosen": -549.0498657226562, + "logps/rejected": -564.4247436523438, + "loss": 0.6834, + "nll_loss": 0.5282487869262695, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.3155590295791626, + "rewards/margins": 0.1674705594778061, + "rewards/rejected": 0.1480884701013565, + "step": 202 + }, + { + "epoch": 0.1772756897675429, + "grad_norm": 212.7149629473822, + "learning_rate": 1.9048270524660196e-07, + "logits/chosen": 0.4008958041667938, + "logits/rejected": 0.5223476886749268, + "logps/chosen": -499.8232116699219, + "logps/rejected": -517.2408447265625, + "loss": 0.6574, + "nll_loss": 0.4931824207305908, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.2734750509262085, + "rewards/margins": -0.03305815905332565, + "rewards/rejected": 0.3065332770347595, + "step": 204 + }, + { + "epoch": 0.17901368672604823, + "grad_norm": 123.98091350420249, + "learning_rate": 1.9023802593031153e-07, + "logits/chosen": 0.678055465221405, + "logits/rejected": 0.49451562762260437, + "logps/chosen": -457.2571716308594, + "logps/rejected": -453.9916076660156, + "loss": 0.6521, + "nll_loss": 0.45098334550857544, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.20749235153198242, + "rewards/margins": 0.07694320380687714, + "rewards/rejected": 0.13054914772510529, + "step": 206 + }, + { + "epoch": 0.18075168368455355, + "grad_norm": 122.84485776556372, + "learning_rate": 1.899904024447769e-07, + "logits/chosen": 0.4941789507865906, + "logits/rejected": 0.4557379186153412, + "logps/chosen": -494.46221923828125, + "logps/rejected": -435.7831115722656, + "loss": 0.5938, + "nll_loss": 0.4718150496482849, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.15648937225341797, + "rewards/margins": 0.09144438803195953, + "rewards/rejected": 0.06504497677087784, + "step": 208 + }, + { + "epoch": 0.18248968064305887, + "grad_norm": 171.76865319699047, + "learning_rate": 1.8973984286913583e-07, + "logits/chosen": 0.3957821726799011, + "logits/rejected": 0.45247000455856323, + "logps/chosen": -439.8001708984375, + "logps/rejected": -512.7393798828125, + "loss": 0.6688, + "nll_loss": 0.4396136999130249, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.4999141991138458, + "rewards/margins": 0.47802239656448364, + "rewards/rejected": 0.02189178764820099, + "step": 210 + }, + { + "epoch": 0.1842276776015642, + "grad_norm": 160.6284900707324, + "learning_rate": 1.8948635537832118e-07, + "logits/chosen": 0.8007611036300659, + "logits/rejected": 0.8054407835006714, + "logps/chosen": -540.0386352539062, + "logps/rejected": -521.6272583007812, + "loss": 0.6516, + "nll_loss": 0.5008341073989868, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3313617706298828, + "rewards/margins": 0.06706114113330841, + "rewards/rejected": 0.2643006443977356, + "step": 212 + }, + { + "epoch": 0.18596567456006952, + "grad_norm": 274.1780536054067, + "learning_rate": 1.8922994824279393e-07, + "logits/chosen": 0.4809880256652832, + "logits/rejected": 0.5513718128204346, + "logps/chosen": -487.8301696777344, + "logps/rejected": -520.1082763671875, + "loss": 0.6354, + "nll_loss": 0.47460833191871643, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3392031490802765, + "rewards/margins": 0.3161538243293762, + "rewards/rejected": 0.023049363866448402, + "step": 214 + }, + { + "epoch": 0.18770367151857484, + "grad_norm": 231.93646441377587, + "learning_rate": 1.8897062982827343e-07, + "logits/chosen": 0.3585096001625061, + "logits/rejected": 0.5087136030197144, + "logps/chosen": -442.2716064453125, + "logps/rejected": -525.7360229492188, + "loss": 0.652, + "nll_loss": 0.4545535743236542, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.47251102328300476, + "rewards/margins": 0.3076295256614685, + "rewards/rejected": 0.16488152742385864, + "step": 216 + }, + { + "epoch": 0.18944166847708016, + "grad_norm": 149.52722877002068, + "learning_rate": 1.8870840859546453e-07, + "logits/chosen": 0.5101938247680664, + "logits/rejected": 0.45677730441093445, + "logps/chosen": -560.6141967773438, + "logps/rejected": -477.9551696777344, + "loss": 0.6815, + "nll_loss": 0.4999621510505676, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3215439021587372, + "rewards/margins": 0.1319688856601715, + "rewards/rejected": 0.18957501649856567, + "step": 218 + }, + { + "epoch": 0.19117966543558548, + "grad_norm": 153.20737081906722, + "learning_rate": 1.8844329309978143e-07, + "logits/chosen": 0.24976252019405365, + "logits/rejected": 0.14467594027519226, + "logps/chosen": -422.8174743652344, + "logps/rejected": -497.1708984375, + "loss": 0.6218, + "nll_loss": 0.42366060614585876, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.4763959050178528, + "rewards/margins": 0.3291512727737427, + "rewards/rejected": 0.1472446471452713, + "step": 220 + }, + { + "epoch": 0.1929176623940908, + "grad_norm": 312.09631072055066, + "learning_rate": 1.8817529199106857e-07, + "logits/chosen": 0.4518931806087494, + "logits/rejected": 0.4724608063697815, + "logps/chosen": -480.308837890625, + "logps/rejected": -486.3529968261719, + "loss": 0.6416, + "nll_loss": 0.4695611000061035, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3075574040412903, + "rewards/margins": 0.2838830053806305, + "rewards/rejected": 0.023674391210079193, + "step": 222 + }, + { + "epoch": 0.19465565935259613, + "grad_norm": 291.3739480103135, + "learning_rate": 1.8790441401331843e-07, + "logits/chosen": 0.42263418436050415, + "logits/rejected": 0.2607260048389435, + "logps/chosen": -553.7242431640625, + "logps/rejected": -509.631103515625, + "loss": 0.616, + "nll_loss": 0.5167679786682129, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.40777724981307983, + "rewards/margins": 0.2215692400932312, + "rewards/rejected": 0.18620796501636505, + "step": 224 + }, + { + "epoch": 0.19639365631110145, + "grad_norm": 135.6469032071128, + "learning_rate": 1.8763066800438634e-07, + "logits/chosen": 0.11122694611549377, + "logits/rejected": 0.23442375659942627, + "logps/chosen": -451.4000244140625, + "logps/rejected": -476.5044860839844, + "loss": 0.6749, + "nll_loss": 0.4101489782333374, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4101632833480835, + "rewards/margins": 0.17843975126743317, + "rewards/rejected": 0.23172350227832794, + "step": 226 + }, + { + "epoch": 0.19813165326960677, + "grad_norm": 233.63954629459604, + "learning_rate": 1.873540628957019e-07, + "logits/chosen": 0.3454923629760742, + "logits/rejected": 0.31212350726127625, + "logps/chosen": -476.87188720703125, + "logps/rejected": -442.26727294921875, + "loss": 0.6688, + "nll_loss": 0.46371322870254517, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2975081205368042, + "rewards/margins": 0.16047295928001404, + "rewards/rejected": 0.13703517615795135, + "step": 228 + }, + { + "epoch": 0.1998696502281121, + "grad_norm": 175.03944828947417, + "learning_rate": 1.8707460771197773e-07, + "logits/chosen": 0.5401207208633423, + "logits/rejected": 0.46543338894844055, + "logps/chosen": -408.17254638671875, + "logps/rejected": -425.7143859863281, + "loss": 0.653, + "nll_loss": 0.3989378809928894, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.22502900660037994, + "rewards/margins": 0.1482989490032196, + "rewards/rejected": 0.07673005759716034, + "step": 230 + }, + { + "epoch": 0.20160764718661742, + "grad_norm": 163.9379882296477, + "learning_rate": 1.8679231157091504e-07, + "logits/chosen": 0.276883989572525, + "logits/rejected": 0.21855014562606812, + "logps/chosen": -479.3912353515625, + "logps/rejected": -444.1050109863281, + "loss": 0.7128, + "nll_loss": 0.46482211351394653, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.21144676208496094, + "rewards/margins": 0.10382099449634552, + "rewards/rejected": 0.10762576013803482, + "step": 232 + }, + { + "epoch": 0.20334564414512274, + "grad_norm": 148.0762245669831, + "learning_rate": 1.865071836829061e-07, + "logits/chosen": 0.5332834124565125, + "logits/rejected": 0.6724749207496643, + "logps/chosen": -514.0674438476562, + "logps/rejected": -577.0899658203125, + "loss": 0.6766, + "nll_loss": 0.513380229473114, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.020297817885875702, + "rewards/margins": 0.03417910635471344, + "rewards/rejected": -0.05447692424058914, + "step": 234 + }, + { + "epoch": 0.20508364110362806, + "grad_norm": 321.381717812454, + "learning_rate": 1.8621923335073374e-07, + "logits/chosen": 0.41222482919692993, + "logits/rejected": 0.49463099241256714, + "logps/chosen": -458.3787841796875, + "logps/rejected": -498.8921203613281, + "loss": 0.6443, + "nll_loss": 0.4330087900161743, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.3461689054965973, + "rewards/margins": 0.3577164113521576, + "rewards/rejected": -0.01154746487736702, + "step": 236 + }, + { + "epoch": 0.20682163806213338, + "grad_norm": 142.48431563998605, + "learning_rate": 1.859284699692679e-07, + "logits/chosen": 0.31183576583862305, + "logits/rejected": 0.3098088204860687, + "logps/chosen": -494.8736877441406, + "logps/rejected": -479.8580322265625, + "loss": 0.6803, + "nll_loss": 0.44527143239974976, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.39485302567481995, + "rewards/margins": 0.4110240936279297, + "rewards/rejected": -0.01617107354104519, + "step": 238 + }, + { + "epoch": 0.2085596350206387, + "grad_norm": 180.45494632724726, + "learning_rate": 1.856349030251589e-07, + "logits/chosen": 0.352733314037323, + "logits/rejected": 0.4097552001476288, + "logps/chosen": -500.9227600097656, + "logps/rejected": -464.92645263671875, + "loss": 0.624, + "nll_loss": 0.47565579414367676, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.10063820332288742, + "rewards/margins": -0.12395801395177841, + "rewards/rejected": 0.023319821804761887, + "step": 240 + }, + { + "epoch": 0.21029763197914403, + "grad_norm": 254.73609406952772, + "learning_rate": 1.8533854209652816e-07, + "logits/chosen": 0.5989456176757812, + "logits/rejected": 0.6028575897216797, + "logps/chosen": -549.3515014648438, + "logps/rejected": -577.7095336914062, + "loss": 0.6406, + "nll_loss": 0.5223766565322876, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.07228565216064453, + "rewards/margins": 0.14688776433467865, + "rewards/rejected": -0.21917343139648438, + "step": 242 + }, + { + "epoch": 0.21203562893764935, + "grad_norm": 149.56967633528575, + "learning_rate": 1.8503939685265566e-07, + "logits/chosen": 0.508268415927887, + "logits/rejected": 0.5876305103302002, + "logps/chosen": -490.4624328613281, + "logps/rejected": -507.3514099121094, + "loss": 0.6362, + "nll_loss": 0.46683698892593384, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.1168375164270401, + "rewards/margins": 0.04881094768643379, + "rewards/rejected": 0.06802655756473541, + "step": 244 + }, + { + "epoch": 0.21377362589615467, + "grad_norm": 170.4776083017974, + "learning_rate": 1.8473747705366425e-07, + "logits/chosen": 0.5282714366912842, + "logits/rejected": 0.4593840539455414, + "logps/chosen": -501.4341125488281, + "logps/rejected": -465.8924865722656, + "loss": 0.6624, + "nll_loss": 0.4600476324558258, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.015180783346295357, + "rewards/margins": 0.0027478188276290894, + "rewards/rejected": -0.017928607761859894, + "step": 246 + }, + { + "epoch": 0.21551162285466, + "grad_norm": 219.96275433334395, + "learning_rate": 1.844327925502015e-07, + "logits/chosen": 0.5558596253395081, + "logits/rejected": 0.49703449010849, + "logps/chosen": -519.3560180664062, + "logps/rejected": -495.0567626953125, + "loss": 0.6754, + "nll_loss": 0.5084698796272278, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.0020979903638362885, + "rewards/margins": -0.023166075348854065, + "rewards/rejected": 0.025264078751206398, + "step": 248 + }, + { + "epoch": 0.21724961981316532, + "grad_norm": 147.08443939240553, + "learning_rate": 1.8412535328311812e-07, + "logits/chosen": 0.3448954224586487, + "logits/rejected": 0.5552514791488647, + "logps/chosen": -468.8410339355469, + "logps/rejected": -556.8023681640625, + "loss": 0.6558, + "nll_loss": 0.49719932675361633, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.1901819258928299, + "rewards/margins": 0.39644375443458557, + "rewards/rejected": -0.20626182854175568, + "step": 250 + }, + { + "epoch": 0.21898761677167064, + "grad_norm": 114.32403562795194, + "learning_rate": 1.8381516928314365e-07, + "logits/chosen": 0.40455546975135803, + "logits/rejected": 0.4251713156700134, + "logps/chosen": -513.6260986328125, + "logps/rejected": -538.343505859375, + "loss": 0.6049, + "nll_loss": 0.4971601963043213, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.06984252482652664, + "rewards/margins": 0.26666852831840515, + "rewards/rejected": -0.19682598114013672, + "step": 252 + }, + { + "epoch": 0.22072561373017596, + "grad_norm": 205.23489490998628, + "learning_rate": 1.8350225067055925e-07, + "logits/chosen": 0.3792281150817871, + "logits/rejected": 0.4229609966278076, + "logps/chosen": -504.125732421875, + "logps/rejected": -526.7474975585938, + "loss": 0.6086, + "nll_loss": 0.4739050269126892, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.09776955097913742, + "rewards/margins": 0.28990498185157776, + "rewards/rejected": -0.19213542342185974, + "step": 254 + }, + { + "epoch": 0.22246361068868128, + "grad_norm": 188.55359487032592, + "learning_rate": 1.8318660765486747e-07, + "logits/chosen": 0.545383095741272, + "logits/rejected": 0.46527981758117676, + "logps/chosen": -475.6557312011719, + "logps/rejected": -476.0321350097656, + "loss": 0.6352, + "nll_loss": 0.47079068422317505, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.13502216339111328, + "rewards/margins": 0.1221860870718956, + "rewards/rejected": 0.012836072593927383, + "step": 256 + }, + { + "epoch": 0.2242016076471866, + "grad_norm": 148.2343414448246, + "learning_rate": 1.8286825053445916e-07, + "logits/chosen": 0.34472447633743286, + "logits/rejected": 0.2841368317604065, + "logps/chosen": -480.28387451171875, + "logps/rejected": -527.3165893554688, + "loss": 0.6399, + "nll_loss": 0.4767462909221649, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.17577162384986877, + "rewards/margins": 0.5391324162483215, + "rewards/rejected": -0.36336082220077515, + "step": 258 + }, + { + "epoch": 0.22593960460569193, + "grad_norm": 177.1228089021279, + "learning_rate": 1.8254718969627739e-07, + "logits/chosen": 0.38141781091690063, + "logits/rejected": 0.5402300953865051, + "logps/chosen": -436.6199645996094, + "logps/rejected": -474.4032287597656, + "loss": 0.6239, + "nll_loss": 0.4355260133743286, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.036749646067619324, + "rewards/margins": 0.2768980860710144, + "rewards/rejected": -0.24014845490455627, + "step": 260 + }, + { + "epoch": 0.22767760156419725, + "grad_norm": 234.61717429643062, + "learning_rate": 1.8222343561547872e-07, + "logits/chosen": 0.5197017788887024, + "logits/rejected": 0.43862244486808777, + "logps/chosen": -532.12158203125, + "logps/rejected": -473.5322265625, + "loss": 0.6838, + "nll_loss": 0.4841741621494293, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.04262780770659447, + "rewards/margins": 0.08237245678901672, + "rewards/rejected": -0.12500028312206268, + "step": 262 + }, + { + "epoch": 0.22941559852270257, + "grad_norm": 127.58154940984313, + "learning_rate": 1.8189699885509127e-07, + "logits/chosen": 0.2861557900905609, + "logits/rejected": 0.18688474595546722, + "logps/chosen": -466.5064392089844, + "logps/rejected": -457.0404052734375, + "loss": 0.5996, + "nll_loss": 0.4528616964817047, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.14327269792556763, + "rewards/margins": 0.2772614657878876, + "rewards/rejected": -0.13398876786231995, + "step": 264 + }, + { + "epoch": 0.2311535954812079, + "grad_norm": 197.96923520461218, + "learning_rate": 1.8156789006567017e-07, + "logits/chosen": 0.4333341121673584, + "logits/rejected": 0.34512364864349365, + "logps/chosen": -526.6718139648438, + "logps/rejected": -497.8265380859375, + "loss": 0.6638, + "nll_loss": 0.5166666507720947, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.2582641839981079, + "rewards/margins": 0.13742609322071075, + "rewards/rejected": 0.12083806842565536, + "step": 266 + }, + { + "epoch": 0.23289159243971322, + "grad_norm": 129.3864196211325, + "learning_rate": 1.8123611998495006e-07, + "logits/chosen": 0.4436326026916504, + "logits/rejected": 0.3769025206565857, + "logps/chosen": -409.8662414550781, + "logps/rejected": -483.3121032714844, + "loss": 0.5929, + "nll_loss": 0.40314382314682007, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2736284136772156, + "rewards/margins": 0.7211030721664429, + "rewards/rejected": -0.4474746882915497, + "step": 268 + }, + { + "epoch": 0.23462958939821854, + "grad_norm": 153.36158996693672, + "learning_rate": 1.8090169943749475e-07, + "logits/chosen": 0.37940219044685364, + "logits/rejected": 0.47512713074684143, + "logps/chosen": -486.993408203125, + "logps/rejected": -486.84765625, + "loss": 0.6207, + "nll_loss": 0.443195104598999, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.23140813410282135, + "rewards/margins": 0.31433334946632385, + "rewards/rejected": -0.0829252228140831, + "step": 270 + }, + { + "epoch": 0.23636758635672386, + "grad_norm": 192.5367921969003, + "learning_rate": 1.8056463933434396e-07, + "logits/chosen": 0.11284930258989334, + "logits/rejected": 0.253543496131897, + "logps/chosen": -473.71661376953125, + "logps/rejected": -574.8849487304688, + "loss": 0.6702, + "nll_loss": 0.4648784399032593, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.3114520311355591, + "rewards/margins": 0.43774574995040894, + "rewards/rejected": -0.12629374861717224, + "step": 272 + }, + { + "epoch": 0.23810558331522919, + "grad_norm": 194.30911480270703, + "learning_rate": 1.802249506726575e-07, + "logits/chosen": 0.7151396870613098, + "logits/rejected": 0.6841371059417725, + "logps/chosen": -485.168212890625, + "logps/rejected": -480.3174133300781, + "loss": 0.6563, + "nll_loss": 0.46294671297073364, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.06923913955688477, + "rewards/margins": -0.021972376853227615, + "rewards/rejected": 0.09121151268482208, + "step": 274 + }, + { + "epoch": 0.2398435802737345, + "grad_norm": 368.64577683366, + "learning_rate": 1.7988264453535638e-07, + "logits/chosen": 0.26056328415870667, + "logits/rejected": 0.4656585454940796, + "logps/chosen": -528.419189453125, + "logps/rejected": -503.22540283203125, + "loss": 0.6334, + "nll_loss": 0.519210159778595, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.10040701925754547, + "rewards/margins": 0.06605224311351776, + "rewards/rejected": 0.03435477986931801, + "step": 276 + }, + { + "epoch": 0.24158157723223983, + "grad_norm": 213.32577873138953, + "learning_rate": 1.7953773209076107e-07, + "logits/chosen": 0.582006573677063, + "logits/rejected": 0.4061691164970398, + "logps/chosen": -549.0662841796875, + "logps/rejected": -487.228759765625, + "loss": 0.6662, + "nll_loss": 0.5219871401786804, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.2095928192138672, + "rewards/margins": 0.15073338150978088, + "rewards/rejected": 0.0588594488799572, + "step": 278 + }, + { + "epoch": 0.24331957419074515, + "grad_norm": 197.41029523602992, + "learning_rate": 1.7919022459222751e-07, + "logits/chosen": 0.5505284070968628, + "logits/rejected": 0.5329501032829285, + "logps/chosen": -446.39239501953125, + "logps/rejected": -451.7146301269531, + "loss": 0.629, + "nll_loss": 0.44017502665519714, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3613952100276947, + "rewards/margins": 0.33678293228149414, + "rewards/rejected": 0.02461223118007183, + "step": 280 + }, + { + "epoch": 0.2450575711492505, + "grad_norm": 243.5627781827318, + "learning_rate": 1.788401333777794e-07, + "logits/chosen": 0.6572059988975525, + "logits/rejected": 0.6510287523269653, + "logps/chosen": -483.048583984375, + "logps/rejected": -568.91357421875, + "loss": 0.6825, + "nll_loss": 0.50696861743927, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3178718686103821, + "rewards/margins": 0.37436118721961975, + "rewards/rejected": -0.05648936703801155, + "step": 282 + }, + { + "epoch": 0.24679556810775582, + "grad_norm": 217.6697464767649, + "learning_rate": 1.784874698697388e-07, + "logits/chosen": 0.054942190647125244, + "logits/rejected": 0.23091381788253784, + "logps/chosen": -436.7183837890625, + "logps/rejected": -494.40802001953125, + "loss": 0.6891, + "nll_loss": 0.4455685019493103, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.24671289324760437, + "rewards/margins": 0.219809427857399, + "rewards/rejected": 0.026903443038463593, + "step": 284 + }, + { + "epoch": 0.24853356506626115, + "grad_norm": 136.79160723167686, + "learning_rate": 1.7813224557435312e-07, + "logits/chosen": 0.38629719614982605, + "logits/rejected": 0.4690326750278473, + "logps/chosen": -432.0810852050781, + "logps/rejected": -431.36181640625, + "loss": 0.6467, + "nll_loss": 0.41893959045410156, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.2837182879447937, + "rewards/margins": 0.09321805089712143, + "rewards/rejected": 0.19050025939941406, + "step": 286 + }, + { + "epoch": 0.25027156202476647, + "grad_norm": 222.0196942780997, + "learning_rate": 1.7777447208141978e-07, + "logits/chosen": 0.24580180644989014, + "logits/rejected": 0.2410634458065033, + "logps/chosen": -480.53411865234375, + "logps/rejected": -448.2469787597656, + "loss": 0.6514, + "nll_loss": 0.43868565559387207, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.4032495319843292, + "rewards/margins": 0.11636673659086227, + "rewards/rejected": 0.28688275814056396, + "step": 288 + }, + { + "epoch": 0.25200955898327176, + "grad_norm": 139.84237516124608, + "learning_rate": 1.7741416106390824e-07, + "logits/chosen": 0.3706471025943756, + "logits/rejected": 0.4256122410297394, + "logps/chosen": -485.7464294433594, + "logps/rejected": -518.0656127929688, + "loss": 0.6718, + "nll_loss": 0.48721954226493835, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.3327837884426117, + "rewards/margins": 0.07422256469726562, + "rewards/rejected": 0.25856122374534607, + "step": 290 + }, + { + "epoch": 0.2537475559417771, + "grad_norm": 155.130400096225, + "learning_rate": 1.7705132427757892e-07, + "logits/chosen": 0.10826490819454193, + "logits/rejected": 0.20488005876541138, + "logps/chosen": -476.7195739746094, + "logps/rejected": -534.486083984375, + "loss": 0.6035, + "nll_loss": 0.4442020058631897, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.5589705109596252, + "rewards/margins": 0.3643781840801239, + "rewards/rejected": 0.19459228217601776, + "step": 292 + }, + { + "epoch": 0.2554855529002824, + "grad_norm": 139.1832762554709, + "learning_rate": 1.7668597356059976e-07, + "logits/chosen": 0.45618000626564026, + "logits/rejected": 0.33908048272132874, + "logps/chosen": -432.85546875, + "logps/rejected": -436.5083923339844, + "loss": 0.6213, + "nll_loss": 0.42752811312675476, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4356747269630432, + "rewards/margins": 0.17109422385692596, + "rewards/rejected": 0.26458045840263367, + "step": 294 + }, + { + "epoch": 0.25722354985878776, + "grad_norm": 295.1345977616581, + "learning_rate": 1.7631812083316002e-07, + "logits/chosen": 0.3745659589767456, + "logits/rejected": 0.19597268104553223, + "logps/chosen": -512.7102661132812, + "logps/rejected": -494.8677673339844, + "loss": 0.5835, + "nll_loss": 0.5069454312324524, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3890928030014038, + "rewards/margins": 0.4358100891113281, + "rewards/rejected": -0.046717267483472824, + "step": 296 + }, + { + "epoch": 0.25896154681729305, + "grad_norm": 274.1732165312349, + "learning_rate": 1.7594777809708125e-07, + "logits/chosen": 0.7644711136817932, + "logits/rejected": 0.5483787059783936, + "logps/chosen": -485.9484558105469, + "logps/rejected": -411.6966857910156, + "loss": 0.6717, + "nll_loss": 0.46617794036865234, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.3494039475917816, + "rewards/margins": 0.08869829773902893, + "rewards/rejected": 0.2607056796550751, + "step": 298 + }, + { + "epoch": 0.2606995437757984, + "grad_norm": 144.10098828420033, + "learning_rate": 1.7557495743542582e-07, + "logits/chosen": 0.38872650265693665, + "logits/rejected": 0.40973731875419617, + "logps/chosen": -418.7256774902344, + "logps/rejected": -484.5044860839844, + "loss": 0.6249, + "nll_loss": 0.4207072854042053, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.38238725066185, + "rewards/margins": 0.35693395137786865, + "rewards/rejected": 0.025453299283981323, + "step": 300 + }, + { + "epoch": 0.2624375407343037, + "grad_norm": 131.5081786745525, + "learning_rate": 1.751996710121026e-07, + "logits/chosen": 0.47483396530151367, + "logits/rejected": 0.6197552680969238, + "logps/chosen": -517.57080078125, + "logps/rejected": -512.1171264648438, + "loss": 0.6765, + "nll_loss": 0.48550546169281006, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3952723741531372, + "rewards/margins": 0.048422060906887054, + "rewards/rejected": 0.34685030579566956, + "step": 302 + }, + { + "epoch": 0.26417553769280905, + "grad_norm": 122.21498248416636, + "learning_rate": 1.7482193107147012e-07, + "logits/chosen": 0.3328361511230469, + "logits/rejected": 0.4578433632850647, + "logps/chosen": -443.4249267578125, + "logps/rejected": -480.59283447265625, + "loss": 0.605, + "nll_loss": 0.3879072368144989, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.3531206250190735, + "rewards/margins": 0.3702266812324524, + "rewards/rejected": -0.0171060748398304, + "step": 304 + }, + { + "epoch": 0.26591353465131434, + "grad_norm": 158.87287212409825, + "learning_rate": 1.744417499379372e-07, + "logits/chosen": 0.24786585569381714, + "logits/rejected": 0.30546700954437256, + "logps/chosen": -477.049560546875, + "logps/rejected": -483.1158447265625, + "loss": 0.6181, + "nll_loss": 0.43729695677757263, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.54217529296875, + "rewards/margins": 0.22286450862884521, + "rewards/rejected": 0.3193108141422272, + "step": 306 + }, + { + "epoch": 0.2676515316098197, + "grad_norm": 238.43769366296277, + "learning_rate": 1.7405914001556057e-07, + "logits/chosen": 0.180156871676445, + "logits/rejected": 0.15245738625526428, + "logps/chosen": -454.74737548828125, + "logps/rejected": -483.5639343261719, + "loss": 0.6133, + "nll_loss": 0.4567978084087372, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.490037739276886, + "rewards/margins": 0.5519703030586243, + "rewards/rejected": -0.06193256005644798, + "step": 308 + }, + { + "epoch": 0.269389528568325, + "grad_norm": 177.99780939922397, + "learning_rate": 1.7367411378764046e-07, + "logits/chosen": 0.6208648085594177, + "logits/rejected": 0.6263742446899414, + "logps/chosen": -455.6944580078125, + "logps/rejected": -451.45758056640625, + "loss": 0.5971, + "nll_loss": 0.4412953853607178, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3768067955970764, + "rewards/margins": 0.09082222729921341, + "rewards/rejected": 0.285984605550766, + "step": 310 + }, + { + "epoch": 0.27112752552683034, + "grad_norm": 212.07380967509832, + "learning_rate": 1.7328668381631318e-07, + "logits/chosen": 0.330152302980423, + "logits/rejected": 0.21931253373622894, + "logps/chosen": -490.38299560546875, + "logps/rejected": -454.47613525390625, + "loss": 0.6657, + "nll_loss": 0.5036713480949402, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.1547100991010666, + "rewards/margins": -0.04359874129295349, + "rewards/rejected": 0.19830884039402008, + "step": 312 + }, + { + "epoch": 0.27286552248533563, + "grad_norm": 138.10076491846291, + "learning_rate": 1.7289686274214114e-07, + "logits/chosen": 0.18456527590751648, + "logits/rejected": 0.25792205333709717, + "logps/chosen": -454.43865966796875, + "logps/rejected": -411.9728088378906, + "loss": 0.6191, + "nll_loss": 0.42922234535217285, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.4555009603500366, + "rewards/margins": 0.13302592933177948, + "rewards/rejected": 0.32247504591941833, + "step": 314 + }, + { + "epoch": 0.274603519443841, + "grad_norm": 141.35852524297098, + "learning_rate": 1.7250466328370068e-07, + "logits/chosen": 0.5290156602859497, + "logits/rejected": 0.3320920467376709, + "logps/chosen": -477.7109375, + "logps/rejected": -503.2304382324219, + "loss": 0.6023, + "nll_loss": 0.4656658172607422, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.772936999797821, + "rewards/margins": 0.5684661865234375, + "rewards/rejected": 0.20447082817554474, + "step": 316 + }, + { + "epoch": 0.2763415164023463, + "grad_norm": 316.8106303332851, + "learning_rate": 1.7211009823716693e-07, + "logits/chosen": 0.1151304543018341, + "logits/rejected": 0.12682472169399261, + "logps/chosen": -487.7864990234375, + "logps/rejected": -439.44549560546875, + "loss": 0.6177, + "nll_loss": 0.47066113352775574, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.49904000759124756, + "rewards/margins": 0.1463506519794464, + "rewards/rejected": 0.35268935561180115, + "step": 318 + }, + { + "epoch": 0.2780795133608516, + "grad_norm": 235.51591779073905, + "learning_rate": 1.7171318047589637e-07, + "logits/chosen": 0.4797361493110657, + "logits/rejected": 0.5132091045379639, + "logps/chosen": -405.1888427734375, + "logps/rejected": -472.8603820800781, + "loss": 0.6178, + "nll_loss": 0.3952030539512634, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5869803428649902, + "rewards/margins": 0.20901460945606232, + "rewards/rejected": 0.3779657483100891, + "step": 320 + }, + { + "epoch": 0.2798175103193569, + "grad_norm": 289.2881197970101, + "learning_rate": 1.7131392295000672e-07, + "logits/chosen": 0.5913805961608887, + "logits/rejected": 0.5887272357940674, + "logps/chosen": -423.531005859375, + "logps/rejected": -460.1776428222656, + "loss": 0.6779, + "nll_loss": 0.4493826925754547, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.44546252489089966, + "rewards/margins": 0.2519988715648651, + "rewards/rejected": 0.19346359372138977, + "step": 322 + }, + { + "epoch": 0.28155550727786227, + "grad_norm": 190.5144705146162, + "learning_rate": 1.7091233868595465e-07, + "logits/chosen": 0.5008837580680847, + "logits/rejected": 0.4548027217388153, + "logps/chosen": -460.7749328613281, + "logps/rejected": -466.6297912597656, + "loss": 0.611, + "nll_loss": 0.43400996923446655, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.39121007919311523, + "rewards/margins": 0.06272707879543304, + "rewards/rejected": 0.3284830152988434, + "step": 324 + }, + { + "epoch": 0.28329350423636757, + "grad_norm": 207.28223953117717, + "learning_rate": 1.7050844078611054e-07, + "logits/chosen": 0.31017956137657166, + "logits/rejected": 0.324398934841156, + "logps/chosen": -488.0496826171875, + "logps/rejected": -508.5542297363281, + "loss": 0.6418, + "nll_loss": 0.4621453285217285, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.328339546918869, + "rewards/margins": 0.13088390231132507, + "rewards/rejected": 0.19745570421218872, + "step": 326 + }, + { + "epoch": 0.2850315011948729, + "grad_norm": 195.5379731252921, + "learning_rate": 1.7010224242833106e-07, + "logits/chosen": 0.7597770094871521, + "logits/rejected": 0.6047714352607727, + "logps/chosen": -529.1282958984375, + "logps/rejected": -497.84375, + "loss": 0.6742, + "nll_loss": 0.4810156226158142, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.06996631622314453, + "rewards/margins": -0.1163499504327774, + "rewards/rejected": 0.18631629645824432, + "step": 328 + }, + { + "epoch": 0.2867694981533782, + "grad_norm": 158.75613524542254, + "learning_rate": 1.6969375686552937e-07, + "logits/chosen": 0.5037192702293396, + "logits/rejected": 0.49104276299476624, + "logps/chosen": -542.0267333984375, + "logps/rejected": -481.6624755859375, + "loss": 0.6433, + "nll_loss": 0.49876895546913147, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.22195303440093994, + "rewards/margins": 0.008428195491433144, + "rewards/rejected": 0.21352483332157135, + "step": 330 + }, + { + "epoch": 0.28850749511188356, + "grad_norm": 179.9369850212301, + "learning_rate": 1.6928299742524231e-07, + "logits/chosen": 0.4871769845485687, + "logits/rejected": 0.5428123474121094, + "logps/chosen": -483.11773681640625, + "logps/rejected": -457.1769104003906, + "loss": 0.6557, + "nll_loss": 0.4703104794025421, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.37591439485549927, + "rewards/margins": 0.0965644121170044, + "rewards/rejected": 0.2793499529361725, + "step": 332 + }, + { + "epoch": 0.29024549207038886, + "grad_norm": 169.22992683207792, + "learning_rate": 1.6886997750919616e-07, + "logits/chosen": 0.41568654775619507, + "logits/rejected": 0.640016496181488, + "logps/chosen": -419.57464599609375, + "logps/rejected": -440.6019287109375, + "loss": 0.6416, + "nll_loss": 0.4042522609233856, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.21573497354984283, + "rewards/margins": -0.052317921072244644, + "rewards/rejected": 0.268052875995636, + "step": 334 + }, + { + "epoch": 0.2919834890288942, + "grad_norm": 537.549948640332, + "learning_rate": 1.6845471059286887e-07, + "logits/chosen": 0.2585112452507019, + "logits/rejected": 0.40234676003456116, + "logps/chosen": -469.12744140625, + "logps/rejected": -407.4085388183594, + "loss": 0.6697, + "nll_loss": 0.43277740478515625, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.4075109362602234, + "rewards/margins": -0.004766283556818962, + "rewards/rejected": 0.4122772216796875, + "step": 336 + }, + { + "epoch": 0.2937214859873995, + "grad_norm": 143.88283356900885, + "learning_rate": 1.6803721022505065e-07, + "logits/chosen": 0.2862776815891266, + "logits/rejected": 0.5013993382453918, + "logps/chosen": -419.91168212890625, + "logps/rejected": -475.68621826171875, + "loss": 0.6574, + "nll_loss": 0.4124099910259247, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5104472637176514, + "rewards/margins": 0.1771218478679657, + "rewards/rejected": 0.3333253860473633, + "step": 338 + }, + { + "epoch": 0.29545948294590485, + "grad_norm": 277.79529042691826, + "learning_rate": 1.6761749002740193e-07, + "logits/chosen": 0.46205711364746094, + "logits/rejected": 0.46211087703704834, + "logps/chosen": -460.75201416015625, + "logps/rejected": -521.1112060546875, + "loss": 0.5561, + "nll_loss": 0.4728389382362366, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.6726080775260925, + "rewards/margins": 0.5335967540740967, + "rewards/rejected": 0.1390111893415451, + "step": 340 + }, + { + "epoch": 0.29719747990441014, + "grad_norm": 211.81872281009484, + "learning_rate": 1.6719556369400878e-07, + "logits/chosen": 0.31371983885765076, + "logits/rejected": 0.3987221121788025, + "logps/chosen": -508.58856201171875, + "logps/rejected": -506.62841796875, + "loss": 0.6018, + "nll_loss": 0.5143205523490906, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5597350001335144, + "rewards/margins": 0.17063526809215546, + "rewards/rejected": 0.38909977674484253, + "step": 342 + }, + { + "epoch": 0.2989354768629155, + "grad_norm": 202.01733690053692, + "learning_rate": 1.6677144499093625e-07, + "logits/chosen": 0.3940061628818512, + "logits/rejected": 0.5614907145500183, + "logps/chosen": -493.75067138671875, + "logps/rejected": -516.7576293945312, + "loss": 0.6558, + "nll_loss": 0.47246384620666504, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.5523098111152649, + "rewards/margins": 0.15993386507034302, + "rewards/rejected": 0.39237597584724426, + "step": 344 + }, + { + "epoch": 0.3006734738214208, + "grad_norm": 233.30784637715007, + "learning_rate": 1.6634514775577918e-07, + "logits/chosen": 0.22863087058067322, + "logits/rejected": 0.4731798768043518, + "logps/chosen": -481.202392578125, + "logps/rejected": -556.5811157226562, + "loss": 0.6281, + "nll_loss": 0.46664902567863464, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.8687335252761841, + "rewards/margins": 0.3619176149368286, + "rewards/rejected": 0.5068159103393555, + "step": 346 + }, + { + "epoch": 0.30241147077992614, + "grad_norm": 157.2977888244691, + "learning_rate": 1.659166858972107e-07, + "logits/chosen": 0.5143783092498779, + "logits/rejected": 0.5790827870368958, + "logps/chosen": -468.9595947265625, + "logps/rejected": -502.225341796875, + "loss": 0.6525, + "nll_loss": 0.46870219707489014, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.5990110039710999, + "rewards/margins": 0.23531866073608398, + "rewards/rejected": 0.3636922836303711, + "step": 348 + }, + { + "epoch": 0.30414946773843143, + "grad_norm": 163.6646025660051, + "learning_rate": 1.6548607339452852e-07, + "logits/chosen": 0.3285616338253021, + "logits/rejected": 0.4201197028160095, + "logps/chosen": -502.6363525390625, + "logps/rejected": -548.9605712890625, + "loss": 0.6349, + "nll_loss": 0.4952363669872284, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.9033224582672119, + "rewards/margins": 0.7526147961616516, + "rewards/rejected": 0.15070763230323792, + "step": 350 + }, + { + "epoch": 0.3058874646969368, + "grad_norm": 130.2464033824859, + "learning_rate": 1.650533242971987e-07, + "logits/chosen": 0.11030253022909164, + "logits/rejected": 0.20594020187854767, + "logps/chosen": -430.72265625, + "logps/rejected": -448.17144775390625, + "loss": 0.564, + "nll_loss": 0.4082512855529785, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.643363356590271, + "rewards/margins": 0.14355525374412537, + "rewards/rejected": 0.499808132648468, + "step": 352 + }, + { + "epoch": 0.3076254616554421, + "grad_norm": 185.75442858935745, + "learning_rate": 1.646184527243974e-07, + "logits/chosen": 0.3853408694267273, + "logits/rejected": 0.325025737285614, + "logps/chosen": -499.373291015625, + "logps/rejected": -458.1291198730469, + "loss": 0.732, + "nll_loss": 0.4488787353038788, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.49317529797554016, + "rewards/margins": 0.06869325041770935, + "rewards/rejected": 0.4244820773601532, + "step": 354 + }, + { + "epoch": 0.30936345861394743, + "grad_norm": 145.25722335543503, + "learning_rate": 1.6418147286455017e-07, + "logits/chosen": 0.6253547668457031, + "logits/rejected": 0.570826530456543, + "logps/chosen": -466.4505920410156, + "logps/rejected": -448.43341064453125, + "loss": 0.5772, + "nll_loss": 0.4469682276248932, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.5305427312850952, + "rewards/margins": 0.28476476669311523, + "rewards/rejected": 0.24577800929546356, + "step": 356 + }, + { + "epoch": 0.3111014555724527, + "grad_norm": 273.4568869901171, + "learning_rate": 1.6374239897486896e-07, + "logits/chosen": 0.419254869222641, + "logits/rejected": 0.4319905638694763, + "logps/chosen": -460.3009033203125, + "logps/rejected": -478.2269592285156, + "loss": 0.6748, + "nll_loss": 0.4449344575405121, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4517764449119568, + "rewards/margins": 0.13741624355316162, + "rewards/rejected": 0.3143601417541504, + "step": 358 + }, + { + "epoch": 0.3128394525309581, + "grad_norm": 138.77165017718346, + "learning_rate": 1.6330124538088703e-07, + "logits/chosen": 0.5652934908866882, + "logits/rejected": 0.639899730682373, + "logps/chosen": -475.1581726074219, + "logps/rejected": -452.1554870605469, + "loss": 0.5829, + "nll_loss": 0.46334606409072876, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5346073508262634, + "rewards/margins": 0.22912371158599854, + "rewards/rejected": 0.3054836094379425, + "step": 360 + }, + { + "epoch": 0.31457744948946337, + "grad_norm": 207.1164704179335, + "learning_rate": 1.6285802647599154e-07, + "logits/chosen": 0.30453720688819885, + "logits/rejected": 0.2256930023431778, + "logps/chosen": -553.0308227539062, + "logps/rejected": -515.8295288085938, + "loss": 0.6774, + "nll_loss": 0.49242475628852844, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3522112965583801, + "rewards/margins": 0.2572094798088074, + "rewards/rejected": 0.09500180184841156, + "step": 362 + }, + { + "epoch": 0.3163154464479687, + "grad_norm": 167.9520774877879, + "learning_rate": 1.6241275672095395e-07, + "logits/chosen": 0.3028857111930847, + "logits/rejected": 0.3282950818538666, + "logps/chosen": -465.1825256347656, + "logps/rejected": -501.9438171386719, + "loss": 0.6284, + "nll_loss": 0.4515576958656311, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3405414819717407, + "rewards/margins": 0.2630569636821747, + "rewards/rejected": 0.07748451828956604, + "step": 364 + }, + { + "epoch": 0.318053443406474, + "grad_norm": 127.49701641606168, + "learning_rate": 1.619654506434581e-07, + "logits/chosen": 0.524957537651062, + "logits/rejected": 0.43854832649230957, + "logps/chosen": -540.3823852539062, + "logps/rejected": -550.18701171875, + "loss": 0.5756, + "nll_loss": 0.5099707245826721, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5246202349662781, + "rewards/margins": 0.42196738719940186, + "rewards/rejected": 0.10265286266803741, + "step": 366 + }, + { + "epoch": 0.31979144036497936, + "grad_norm": 189.5220473685797, + "learning_rate": 1.615161228376265e-07, + "logits/chosen": 0.383512943983078, + "logits/rejected": 0.44961869716644287, + "logps/chosen": -460.8105163574219, + "logps/rejected": -469.990478515625, + "loss": 0.6288, + "nll_loss": 0.4280567765235901, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5916939377784729, + "rewards/margins": 0.3190990686416626, + "rewards/rejected": 0.2725948393344879, + "step": 368 + }, + { + "epoch": 0.32152943732348466, + "grad_norm": 145.17291751298188, + "learning_rate": 1.6106478796354383e-07, + "logits/chosen": 0.42558369040489197, + "logits/rejected": 0.6559165716171265, + "logps/chosen": -410.0570983886719, + "logps/rejected": -533.5880126953125, + "loss": 0.5969, + "nll_loss": 0.43435239791870117, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.5680219531059265, + "rewards/margins": 0.5121182799339294, + "rewards/rejected": 0.05590362474322319, + "step": 370 + }, + { + "epoch": 0.32326743428199, + "grad_norm": 166.70070172002627, + "learning_rate": 1.6061146074677882e-07, + "logits/chosen": 0.3405163586139679, + "logits/rejected": 0.3031487762928009, + "logps/chosen": -470.0008544921875, + "logps/rejected": -466.5913391113281, + "loss": 0.582, + "nll_loss": 0.4428296387195587, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.6726614236831665, + "rewards/margins": 0.19486792385578156, + "rewards/rejected": 0.47779348492622375, + "step": 372 + }, + { + "epoch": 0.3250054312404953, + "grad_norm": 167.16311879860288, + "learning_rate": 1.6015615597790385e-07, + "logits/chosen": 0.5631195902824402, + "logits/rejected": 0.46143069863319397, + "logps/chosen": -536.484130859375, + "logps/rejected": -510.6805419921875, + "loss": 0.6317, + "nll_loss": 0.507157564163208, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4220741391181946, + "rewards/margins": 0.30352985858917236, + "rewards/rejected": 0.11854429543018341, + "step": 374 + }, + { + "epoch": 0.32674342819900065, + "grad_norm": 161.6759360651135, + "learning_rate": 1.5969888851201225e-07, + "logits/chosen": 0.6711764931678772, + "logits/rejected": 0.597443699836731, + "logps/chosen": -476.9120178222656, + "logps/rejected": -482.572998046875, + "loss": 0.6185, + "nll_loss": 0.47335314750671387, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.5396869778633118, + "rewards/margins": 0.584469199180603, + "rewards/rejected": -0.044782258570194244, + "step": 376 + }, + { + "epoch": 0.328481425157506, + "grad_norm": 417.54699929930405, + "learning_rate": 1.5923967326823368e-07, + "logits/chosen": 0.33243659138679504, + "logits/rejected": 0.5092381238937378, + "logps/chosen": -475.91656494140625, + "logps/rejected": -518.292236328125, + "loss": 0.6348, + "nll_loss": 0.46231335401535034, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.49808621406555176, + "rewards/margins": 0.240754634141922, + "rewards/rejected": 0.25733163952827454, + "step": 378 + }, + { + "epoch": 0.3302194221160113, + "grad_norm": 175.66639079342582, + "learning_rate": 1.5877852522924732e-07, + "logits/chosen": 0.6205189228057861, + "logits/rejected": 0.48265188932418823, + "logps/chosen": -543.364013671875, + "logps/rejected": -542.3245239257812, + "loss": 0.5893, + "nll_loss": 0.5013877749443054, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.7062149047851562, + "rewards/margins": 0.5146379470825195, + "rewards/rejected": 0.19157695770263672, + "step": 380 + }, + { + "epoch": 0.33195741907451665, + "grad_norm": 138.56625803103043, + "learning_rate": 1.583154594407932e-07, + "logits/chosen": 0.4925762414932251, + "logits/rejected": 0.2916370630264282, + "logps/chosen": -441.38153076171875, + "logps/rejected": -441.0870666503906, + "loss": 0.5991, + "nll_loss": 0.40977394580841064, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4514932632446289, + "rewards/margins": 0.4026234745979309, + "rewards/rejected": 0.048869796097278595, + "step": 382 + }, + { + "epoch": 0.33369541603302194, + "grad_norm": 148.55643375008012, + "learning_rate": 1.5785049101118108e-07, + "logits/chosen": 0.11955571174621582, + "logits/rejected": 0.03210335969924927, + "logps/chosen": -517.887451171875, + "logps/rejected": -560.5753173828125, + "loss": 0.6206, + "nll_loss": 0.4803450107574463, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4510236978530884, + "rewards/margins": 0.289794921875, + "rewards/rejected": 0.161228746175766, + "step": 384 + }, + { + "epoch": 0.3354334129915273, + "grad_norm": 216.1495030153299, + "learning_rate": 1.5738363511079773e-07, + "logits/chosen": 0.356658935546875, + "logits/rejected": 0.5617293119430542, + "logps/chosen": -469.86114501953125, + "logps/rejected": -553.1449584960938, + "loss": 0.6355, + "nll_loss": 0.45090147852897644, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6025825142860413, + "rewards/margins": 0.5343748331069946, + "rewards/rejected": 0.068207748234272, + "step": 386 + }, + { + "epoch": 0.3371714099500326, + "grad_norm": 116.90273062908595, + "learning_rate": 1.569149069716118e-07, + "logits/chosen": 0.36243703961372375, + "logits/rejected": 0.3176945745944977, + "logps/chosen": -473.4244079589844, + "logps/rejected": -524.5529174804688, + "loss": 0.6079, + "nll_loss": 0.48573052883148193, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.34765779972076416, + "rewards/margins": 0.26835328340530396, + "rewards/rejected": 0.07930450141429901, + "step": 388 + }, + { + "epoch": 0.33890940690853794, + "grad_norm": 166.76293201719108, + "learning_rate": 1.5644432188667694e-07, + "logits/chosen": 0.44438979029655457, + "logits/rejected": 0.5965635776519775, + "logps/chosen": -505.97113037109375, + "logps/rejected": -534.6060180664062, + "loss": 0.6398, + "nll_loss": 0.4930253028869629, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.3729788064956665, + "rewards/margins": 0.09345950931310654, + "rewards/rejected": 0.27951928973197937, + "step": 390 + }, + { + "epoch": 0.34064740386704323, + "grad_norm": 110.02770232107177, + "learning_rate": 1.5597189520963274e-07, + "logits/chosen": 0.6275568008422852, + "logits/rejected": 0.6534876823425293, + "logps/chosen": -489.2644348144531, + "logps/rejected": -464.12115478515625, + "loss": 0.6371, + "nll_loss": 0.4506080746650696, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.267756849527359, + "rewards/margins": 0.11544962972402573, + "rewards/rejected": 0.15230722725391388, + "step": 392 + }, + { + "epoch": 0.3423854008255486, + "grad_norm": 125.7814206886685, + "learning_rate": 1.5549764235420404e-07, + "logits/chosen": 0.36072108149528503, + "logits/rejected": 0.42248329520225525, + "logps/chosen": -470.70941162109375, + "logps/rejected": -500.63494873046875, + "loss": 0.5711, + "nll_loss": 0.4840225279331207, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4070228338241577, + "rewards/margins": 0.5793428421020508, + "rewards/rejected": -0.17231999337673187, + "step": 394 + }, + { + "epoch": 0.3441233977840539, + "grad_norm": 131.40847630338484, + "learning_rate": 1.550215787936977e-07, + "logits/chosen": 0.6912932395935059, + "logits/rejected": 0.5712380409240723, + "logps/chosen": -495.5016174316406, + "logps/rejected": -454.8415832519531, + "loss": 0.6201, + "nll_loss": 0.4484928250312805, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5907407999038696, + "rewards/margins": 0.10202912986278534, + "rewards/rejected": 0.4887116253376007, + "step": 396 + }, + { + "epoch": 0.3458613947425592, + "grad_norm": 178.1175808193889, + "learning_rate": 1.54543720060498e-07, + "logits/chosen": 0.3643982708454132, + "logits/rejected": 0.37503209710121155, + "logps/chosen": -422.71673583984375, + "logps/rejected": -456.2169189453125, + "loss": 0.5849, + "nll_loss": 0.4056944251060486, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5937540531158447, + "rewards/margins": 0.3985627293586731, + "rewards/rejected": 0.19519129395484924, + "step": 398 + }, + { + "epoch": 0.3475993917010645, + "grad_norm": 171.13756750638748, + "learning_rate": 1.5406408174555975e-07, + "logits/chosen": 0.27237066626548767, + "logits/rejected": 0.1704144924879074, + "logps/chosen": -402.96527099609375, + "logps/rejected": -468.3968505859375, + "loss": 0.5572, + "nll_loss": 0.39846640825271606, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.546933650970459, + "rewards/margins": 0.40997716784477234, + "rewards/rejected": 0.13695651292800903, + "step": 400 + }, + { + "epoch": 0.34933738865956987, + "grad_norm": 184.01278443708705, + "learning_rate": 1.5358267949789966e-07, + "logits/chosen": 0.35213690996170044, + "logits/rejected": 0.380864679813385, + "logps/chosen": -490.4032287597656, + "logps/rejected": -485.1588134765625, + "loss": 0.6594, + "nll_loss": 0.47046926617622375, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.8484295010566711, + "rewards/margins": 0.26358741521835327, + "rewards/rejected": 0.5848420858383179, + "step": 402 + }, + { + "epoch": 0.35107538561807516, + "grad_norm": 219.69038077247276, + "learning_rate": 1.5309952902408573e-07, + "logits/chosen": 0.06591632217168808, + "logits/rejected": 0.10977748036384583, + "logps/chosen": -491.1231689453125, + "logps/rejected": -453.5677490234375, + "loss": 0.6137, + "nll_loss": 0.4714672267436981, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.7929648160934448, + "rewards/margins": 0.27987584471702576, + "rewards/rejected": 0.5130888819694519, + "step": 404 + }, + { + "epoch": 0.3528133825765805, + "grad_norm": 192.96528830282867, + "learning_rate": 1.5261464608772485e-07, + "logits/chosen": 0.31062737107276917, + "logits/rejected": 0.18132522702217102, + "logps/chosen": -458.4087829589844, + "logps/rejected": -395.68212890625, + "loss": 0.605, + "nll_loss": 0.46351706981658936, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.7857881188392639, + "rewards/margins": 0.40948447585105896, + "rewards/rejected": 0.3763035833835602, + "step": 406 + }, + { + "epoch": 0.3545513795350858, + "grad_norm": 138.57973896271315, + "learning_rate": 1.5212804650894838e-07, + "logits/chosen": 0.5299434661865234, + "logits/rejected": 0.5789748430252075, + "logps/chosen": -506.7216796875, + "logps/rejected": -518.6016845703125, + "loss": 0.6216, + "nll_loss": 0.4909980893135071, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5659937858581543, + "rewards/margins": 0.21054793894290924, + "rewards/rejected": 0.35544586181640625, + "step": 408 + }, + { + "epoch": 0.35628937649359116, + "grad_norm": 135.43117518574306, + "learning_rate": 1.516397461638962e-07, + "logits/chosen": 0.16620784997940063, + "logits/rejected": 0.02259686216711998, + "logps/chosen": -496.93560791015625, + "logps/rejected": -462.2376708984375, + "loss": 0.5299, + "nll_loss": 0.4484490752220154, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.9257678389549255, + "rewards/margins": 0.42007893323898315, + "rewards/rejected": 0.5056889057159424, + "step": 410 + }, + { + "epoch": 0.35802737345209645, + "grad_norm": 146.69116581523053, + "learning_rate": 1.511497609841984e-07, + "logits/chosen": 0.4435105621814728, + "logits/rejected": 0.28421181440353394, + "logps/chosen": -503.27984619140625, + "logps/rejected": -554.2760620117188, + "loss": 0.5986, + "nll_loss": 0.4445800185203552, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.7452791929244995, + "rewards/margins": 0.675082802772522, + "rewards/rejected": 0.07019642740488052, + "step": 412 + }, + { + "epoch": 0.3597653704106018, + "grad_norm": 179.0729044371433, + "learning_rate": 1.5065810695645583e-07, + "logits/chosen": 0.16551180183887482, + "logits/rejected": 0.1484622061252594, + "logps/chosen": -449.44976806640625, + "logps/rejected": -505.42425537109375, + "loss": 0.579, + "nll_loss": 0.42006924748420715, + "rewards/accuracies": 0.6875, + "rewards/chosen": 1.0582197904586792, + "rewards/margins": 0.6258028149604797, + "rewards/rejected": 0.4324168264865875, + "step": 414 + }, + { + "epoch": 0.3615033673691071, + "grad_norm": 216.75622432526953, + "learning_rate": 1.5016480012171825e-07, + "logits/chosen": 0.6741234660148621, + "logits/rejected": 0.7510842084884644, + "logps/chosen": -485.66668701171875, + "logps/rejected": -454.1160888671875, + "loss": 0.6953, + "nll_loss": 0.46377602219581604, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.6574243307113647, + "rewards/margins": 0.11631478369235992, + "rewards/rejected": 0.541109561920166, + "step": 416 + }, + { + "epoch": 0.36324136432761245, + "grad_norm": 202.74502581658425, + "learning_rate": 1.4966985657496112e-07, + "logits/chosen": 0.5062256455421448, + "logits/rejected": 0.5332990884780884, + "logps/chosen": -482.54144287109375, + "logps/rejected": -508.75897216796875, + "loss": 0.73, + "nll_loss": 0.46353021264076233, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.7008476853370667, + "rewards/margins": -0.03729201853275299, + "rewards/rejected": 0.7381397485733032, + "step": 418 + }, + { + "epoch": 0.36497936128611774, + "grad_norm": 145.0881914686186, + "learning_rate": 1.491732924645604e-07, + "logits/chosen": 0.2643652558326721, + "logits/rejected": 0.5007381439208984, + "logps/chosen": -470.0370178222656, + "logps/rejected": -493.4355163574219, + "loss": 0.6496, + "nll_loss": 0.44312015175819397, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.7855318784713745, + "rewards/margins": -0.0823485404253006, + "rewards/rejected": 0.8678804636001587, + "step": 420 + }, + { + "epoch": 0.3667173582446231, + "grad_norm": 185.84119843255152, + "learning_rate": 1.4867512399176562e-07, + "logits/chosen": 0.23039156198501587, + "logits/rejected": 0.35220447182655334, + "logps/chosen": -505.6122741699219, + "logps/rejected": -561.8280029296875, + "loss": 0.688, + "nll_loss": 0.47781142592430115, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.8302431106567383, + "rewards/margins": 0.5848037600517273, + "rewards/rejected": 0.24543944001197815, + "step": 422 + }, + { + "epoch": 0.3684553552031284, + "grad_norm": 115.85134944328905, + "learning_rate": 1.4817536741017152e-07, + "logits/chosen": 0.5234343409538269, + "logits/rejected": 0.46782320737838745, + "logps/chosen": -483.80517578125, + "logps/rejected": -459.695556640625, + "loss": 0.632, + "nll_loss": 0.4559612572193146, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.7901848554611206, + "rewards/margins": 0.22588977217674255, + "rewards/rejected": 0.5642951130867004, + "step": 424 + }, + { + "epoch": 0.37019335216163374, + "grad_norm": 186.22651924677427, + "learning_rate": 1.476740390251875e-07, + "logits/chosen": 0.6512681245803833, + "logits/rejected": 0.6760206818580627, + "logps/chosen": -533.1652221679688, + "logps/rejected": -559.06103515625, + "loss": 0.6956, + "nll_loss": 0.505631148815155, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.9320828318595886, + "rewards/margins": 0.308445006608963, + "rewards/rejected": 0.6236377954483032, + "step": 426 + }, + { + "epoch": 0.37193134912013903, + "grad_norm": 141.6956282085058, + "learning_rate": 1.4717115519350568e-07, + "logits/chosen": 0.31024739146232605, + "logits/rejected": 0.40230733156204224, + "logps/chosen": -506.8525390625, + "logps/rejected": -467.3164978027344, + "loss": 0.575, + "nll_loss": 0.4690600633621216, + "rewards/accuracies": 0.6875, + "rewards/chosen": 1.2685997486114502, + "rewards/margins": 0.598934531211853, + "rewards/rejected": 0.6696651577949524, + "step": 428 + }, + { + "epoch": 0.3736693460786444, + "grad_norm": 259.2302220113569, + "learning_rate": 1.4666673232256736e-07, + "logits/chosen": 0.3985351324081421, + "logits/rejected": 0.45808982849121094, + "logps/chosen": -430.1258850097656, + "logps/rejected": -482.2582092285156, + "loss": 0.614, + "nll_loss": 0.3921167552471161, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.8013219237327576, + "rewards/margins": -0.042902953922748566, + "rewards/rejected": 0.8442248106002808, + "step": 430 + }, + { + "epoch": 0.3754073430371497, + "grad_norm": 155.46043813901522, + "learning_rate": 1.461607868700276e-07, + "logits/chosen": 0.425076961517334, + "logits/rejected": 0.3554433286190033, + "logps/chosen": -502.87225341796875, + "logps/rejected": -452.2707824707031, + "loss": 0.6385, + "nll_loss": 0.45759153366088867, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.7368870973587036, + "rewards/margins": 0.3387683928012848, + "rewards/rejected": 0.3981187343597412, + "step": 432 + }, + { + "epoch": 0.377145339995655, + "grad_norm": 379.83578925167035, + "learning_rate": 1.4565333534321824e-07, + "logits/chosen": 0.2826644778251648, + "logits/rejected": 0.5096839070320129, + "logps/chosen": -476.4322509765625, + "logps/rejected": -443.9337463378906, + "loss": 0.5913, + "nll_loss": 0.4821905195713043, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.7236399054527283, + "rewards/margins": 0.4021589457988739, + "rewards/rejected": 0.32148098945617676, + "step": 434 + }, + { + "epoch": 0.3788833369541603, + "grad_norm": 173.5884492355553, + "learning_rate": 1.4514439429860941e-07, + "logits/chosen": 0.07746727764606476, + "logits/rejected": 0.1786593645811081, + "logps/chosen": -447.15380859375, + "logps/rejected": -424.5791931152344, + "loss": 0.575, + "nll_loss": 0.43714332580566406, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.8142626881599426, + "rewards/margins": 0.2937620282173157, + "rewards/rejected": 0.520500659942627, + "step": 436 + }, + { + "epoch": 0.38062133391266567, + "grad_norm": 189.44734705913655, + "learning_rate": 1.4463398034126918e-07, + "logits/chosen": 0.21691399812698364, + "logits/rejected": 0.1062842532992363, + "logps/chosen": -470.154296875, + "logps/rejected": -515.6326293945312, + "loss": 0.5866, + "nll_loss": 0.4814315438270569, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.932270884513855, + "rewards/margins": 0.4025800824165344, + "rewards/rejected": 0.5296907424926758, + "step": 438 + }, + { + "epoch": 0.38235933087117097, + "grad_norm": 180.5203885167642, + "learning_rate": 1.4412211012432212e-07, + "logits/chosen": 0.6636056303977966, + "logits/rejected": 0.7748678922653198, + "logps/chosen": -545.3409423828125, + "logps/rejected": -541.329833984375, + "loss": 0.6366, + "nll_loss": 0.5039398074150085, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.4772527813911438, + "rewards/margins": 0.14066831767559052, + "rewards/rejected": 0.3365844786167145, + "step": 440 + }, + { + "epoch": 0.3840973278296763, + "grad_norm": 145.78613263537494, + "learning_rate": 1.4360880034840552e-07, + "logits/chosen": 0.698187530040741, + "logits/rejected": 0.5469500422477722, + "logps/chosen": -516.9849853515625, + "logps/rejected": -449.78717041015625, + "loss": 0.6094, + "nll_loss": 0.4981119930744171, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.33729931712150574, + "rewards/margins": 0.2149997502565384, + "rewards/rejected": 0.12229958176612854, + "step": 442 + }, + { + "epoch": 0.3858353247881816, + "grad_norm": 226.70536629048328, + "learning_rate": 1.4309406776112488e-07, + "logits/chosen": 0.2957366108894348, + "logits/rejected": 0.28157639503479004, + "logps/chosen": -534.7179565429688, + "logps/rejected": -508.509521484375, + "loss": 0.7085, + "nll_loss": 0.4948302209377289, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.23636344075202942, + "rewards/margins": -0.018209829926490784, + "rewards/rejected": 0.254573255777359, + "step": 444 + }, + { + "epoch": 0.38757332174668696, + "grad_norm": 141.01878995081393, + "learning_rate": 1.4257792915650726e-07, + "logits/chosen": 0.5431017875671387, + "logits/rejected": 0.4987832009792328, + "logps/chosen": -474.10003662109375, + "logps/rejected": -469.18994140625, + "loss": 0.6611, + "nll_loss": 0.44907432794570923, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5094869136810303, + "rewards/margins": 0.28341084718704224, + "rewards/rejected": 0.22607611119747162, + "step": 446 + }, + { + "epoch": 0.38931131870519226, + "grad_norm": 143.73168116686412, + "learning_rate": 1.4206040137445348e-07, + "logits/chosen": 0.6262254118919373, + "logits/rejected": 0.6136873364448547, + "logps/chosen": -458.2574768066406, + "logps/rejected": -475.05126953125, + "loss": 0.5773, + "nll_loss": 0.4416995048522949, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4745006561279297, + "rewards/margins": 0.11526194214820862, + "rewards/rejected": 0.35923871397972107, + "step": 448 + }, + { + "epoch": 0.3910493156636976, + "grad_norm": 204.39195717062336, + "learning_rate": 1.4154150130018864e-07, + "logits/chosen": 0.17188166081905365, + "logits/rejected": 0.12855087220668793, + "logps/chosen": -493.6163635253906, + "logps/rejected": -493.07989501953125, + "loss": 0.6267, + "nll_loss": 0.4598374366760254, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.4715019762516022, + "rewards/margins": 0.11144667863845825, + "rewards/rejected": 0.36005526781082153, + "step": 450 + }, + { + "epoch": 0.3927873126222029, + "grad_norm": 136.99517525674915, + "learning_rate": 1.4102124586371118e-07, + "logits/chosen": 0.3407544791698456, + "logits/rejected": 0.2230420559644699, + "logps/chosen": -551.1911010742188, + "logps/rejected": -530.1219482421875, + "loss": 0.6281, + "nll_loss": 0.5207258462905884, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.6534353494644165, + "rewards/margins": 0.42499563097953796, + "rewards/rejected": 0.22843971848487854, + "step": 452 + }, + { + "epoch": 0.39452530958070825, + "grad_norm": 171.20329280679755, + "learning_rate": 1.4049965203924052e-07, + "logits/chosen": 0.40859052538871765, + "logits/rejected": 0.2568287253379822, + "logps/chosen": -500.0255126953125, + "logps/rejected": -481.6701354980469, + "loss": 0.5857, + "nll_loss": 0.49866920709609985, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.8634217977523804, + "rewards/margins": 0.5302352905273438, + "rewards/rejected": 0.3331865072250366, + "step": 454 + }, + { + "epoch": 0.39626330653921354, + "grad_norm": 138.55579601924867, + "learning_rate": 1.3997673684466338e-07, + "logits/chosen": 0.5448041558265686, + "logits/rejected": 0.5181457996368408, + "logps/chosen": -452.5978698730469, + "logps/rejected": -492.1902770996094, + "loss": 0.5857, + "nll_loss": 0.44351333379745483, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.6340304017066956, + "rewards/margins": 0.3788342773914337, + "rewards/rejected": 0.25519609451293945, + "step": 456 + }, + { + "epoch": 0.3980013034977189, + "grad_norm": 121.89047583858445, + "learning_rate": 1.3945251734097827e-07, + "logits/chosen": 0.2940325438976288, + "logits/rejected": 0.15135368704795837, + "logps/chosen": -520.0460205078125, + "logps/rejected": -528.7132568359375, + "loss": 0.6594, + "nll_loss": 0.49102744460105896, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4439469575881958, + "rewards/margins": 0.5450565814971924, + "rewards/rejected": -0.10110970586538315, + "step": 458 + }, + { + "epoch": 0.3997393004562242, + "grad_norm": 128.66033221606006, + "learning_rate": 1.3892701063173916e-07, + "logits/chosen": 0.556334376335144, + "logits/rejected": 0.5508618354797363, + "logps/chosen": -466.281005859375, + "logps/rejected": -486.53106689453125, + "loss": 0.5992, + "nll_loss": 0.4581470787525177, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.49654558300971985, + "rewards/margins": 0.3212546706199646, + "rewards/rejected": 0.17529097199440002, + "step": 460 + }, + { + "epoch": 0.40147729741472954, + "grad_norm": 131.50974327446443, + "learning_rate": 1.3840023386249714e-07, + "logits/chosen": 0.4506450891494751, + "logits/rejected": 0.47021448612213135, + "logps/chosen": -501.6130676269531, + "logps/rejected": -513.97021484375, + "loss": 0.6474, + "nll_loss": 0.49557405710220337, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.7399130463600159, + "rewards/margins": 0.15451927483081818, + "rewards/rejected": 0.5853937864303589, + "step": 462 + }, + { + "epoch": 0.40321529437323483, + "grad_norm": 196.4458970816872, + "learning_rate": 1.3787220422024133e-07, + "logits/chosen": 0.2543451488018036, + "logits/rejected": 0.23989835381507874, + "logps/chosen": -510.9331970214844, + "logps/rejected": -540.4846801757812, + "loss": 0.6912, + "nll_loss": 0.4922424554824829, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.43558579683303833, + "rewards/margins": 0.13976748287677765, + "rewards/rejected": 0.2958183288574219, + "step": 464 + }, + { + "epoch": 0.4049532913317402, + "grad_norm": 146.01531397672778, + "learning_rate": 1.373429389328378e-07, + "logits/chosen": 0.5709559917449951, + "logits/rejected": 0.7091385722160339, + "logps/chosen": -508.1383361816406, + "logps/rejected": -539.6366577148438, + "loss": 0.6281, + "nll_loss": 0.4904443919658661, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6073122024536133, + "rewards/margins": 0.2943267822265625, + "rewards/rejected": 0.3129854202270508, + "step": 466 + }, + { + "epoch": 0.4066912882902455, + "grad_norm": 192.90454062378998, + "learning_rate": 1.368124552684678e-07, + "logits/chosen": 0.5518777370452881, + "logits/rejected": 0.49578994512557983, + "logps/chosen": -467.0284423828125, + "logps/rejected": -500.19354248046875, + "loss": 0.5714, + "nll_loss": 0.4271387755870819, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.7035149931907654, + "rewards/margins": 0.17197123169898987, + "rewards/rejected": 0.5315437316894531, + "step": 468 + }, + { + "epoch": 0.40842928524875083, + "grad_norm": 229.61216495684394, + "learning_rate": 1.3628077053506408e-07, + "logits/chosen": 0.4732482433319092, + "logits/rejected": 0.6402056217193604, + "logps/chosen": -454.22943115234375, + "logps/rejected": -464.133544921875, + "loss": 0.6222, + "nll_loss": 0.4393461346626282, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.4335966408252716, + "rewards/margins": 0.0941920354962349, + "rewards/rejected": 0.3394045829772949, + "step": 470 + }, + { + "epoch": 0.4101672822072561, + "grad_norm": 166.9812610820794, + "learning_rate": 1.3574790207974645e-07, + "logits/chosen": 0.40805256366729736, + "logits/rejected": 0.3065450191497803, + "logps/chosen": -404.9720458984375, + "logps/rejected": -471.55340576171875, + "loss": 0.5696, + "nll_loss": 0.3998640775680542, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.6120911836624146, + "rewards/margins": 0.5119709372520447, + "rewards/rejected": 0.10012024641036987, + "step": 472 + }, + { + "epoch": 0.4119052791657615, + "grad_norm": 321.0367764864034, + "learning_rate": 1.352138672882555e-07, + "logits/chosen": 0.2997012734413147, + "logits/rejected": 0.31295493245124817, + "logps/chosen": -485.71783447265625, + "logps/rejected": -512.8876342773438, + "loss": 0.6176, + "nll_loss": 0.46717286109924316, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.7201789617538452, + "rewards/margins": 0.24554875493049622, + "rewards/rejected": 0.47463029623031616, + "step": 474 + }, + { + "epoch": 0.41364327612426677, + "grad_norm": 146.69964221813686, + "learning_rate": 1.346786835843856e-07, + "logits/chosen": 0.4254930317401886, + "logits/rejected": 0.5058972239494324, + "logps/chosen": -480.9587707519531, + "logps/rejected": -475.7575988769531, + "loss": 0.6482, + "nll_loss": 0.46980273723602295, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.37536314129829407, + "rewards/margins": 0.38812559843063354, + "rewards/rejected": -0.012762445025146008, + "step": 476 + }, + { + "epoch": 0.4153812730827721, + "grad_norm": 228.93125805195882, + "learning_rate": 1.3414236842941642e-07, + "logits/chosen": 0.5630742311477661, + "logits/rejected": 0.667353093624115, + "logps/chosen": -516.9129028320312, + "logps/rejected": -573.7291259765625, + "loss": 0.6823, + "nll_loss": 0.49215757846832275, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4989810883998871, + "rewards/margins": 0.3567728102207184, + "rewards/rejected": 0.1422082781791687, + "step": 478 + }, + { + "epoch": 0.4171192700412774, + "grad_norm": 194.58681668955893, + "learning_rate": 1.33604939321543e-07, + "logits/chosen": 0.6577449440956116, + "logits/rejected": 0.5744633674621582, + "logps/chosen": -491.93792724609375, + "logps/rejected": -494.926513671875, + "loss": 0.6405, + "nll_loss": 0.4891810417175293, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.275167852640152, + "rewards/margins": 0.14811164140701294, + "rewards/rejected": 0.12705622613430023, + "step": 480 + }, + { + "epoch": 0.41885726699978276, + "grad_norm": 186.33475326894825, + "learning_rate": 1.3306641379530512e-07, + "logits/chosen": 0.2660292983055115, + "logits/rejected": 0.3376016914844513, + "logps/chosen": -442.4875793457031, + "logps/rejected": -430.53155517578125, + "loss": 0.5902, + "nll_loss": 0.43695008754730225, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.5159265398979187, + "rewards/margins": 0.4518337547779083, + "rewards/rejected": 0.06409282982349396, + "step": 482 + }, + { + "epoch": 0.42059526395828806, + "grad_norm": 161.1545478220916, + "learning_rate": 1.3252680942101498e-07, + "logits/chosen": 0.468216210603714, + "logits/rejected": 0.6578527092933655, + "logps/chosen": -402.4339599609375, + "logps/rejected": -512.015380859375, + "loss": 0.5966, + "nll_loss": 0.40780964493751526, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.41949987411499023, + "rewards/margins": 0.2544829249382019, + "rewards/rejected": 0.16501693427562714, + "step": 484 + }, + { + "epoch": 0.4223332609167934, + "grad_norm": 268.6378897364134, + "learning_rate": 1.3198614380418408e-07, + "logits/chosen": 0.29518118500709534, + "logits/rejected": 0.3277493417263031, + "logps/chosen": -544.3590087890625, + "logps/rejected": -565.6707153320312, + "loss": 0.5767, + "nll_loss": 0.5259649157524109, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.5081907510757446, + "rewards/margins": 0.5027738213539124, + "rewards/rejected": 0.005416871979832649, + "step": 486 + }, + { + "epoch": 0.4240712578752987, + "grad_norm": 222.5034932088444, + "learning_rate": 1.314444345849488e-07, + "logits/chosen": 0.2683011293411255, + "logits/rejected": 0.40357792377471924, + "logps/chosen": -436.8942565917969, + "logps/rejected": -519.6742553710938, + "loss": 0.6378, + "nll_loss": 0.4412961006164551, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.44710198044776917, + "rewards/margins": 0.21643924713134766, + "rewards/rejected": 0.2306627333164215, + "step": 488 + }, + { + "epoch": 0.42580925483380405, + "grad_norm": 121.21919863292835, + "learning_rate": 1.3090169943749475e-07, + "logits/chosen": 0.395599365234375, + "logits/rejected": 0.5225737690925598, + "logps/chosen": -448.34661865234375, + "logps/rejected": -471.4576416015625, + "loss": 0.5857, + "nll_loss": 0.4481515884399414, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.4304274022579193, + "rewards/margins": 0.4703535735607147, + "rewards/rejected": -0.039926156401634216, + "step": 490 + }, + { + "epoch": 0.42754725179230935, + "grad_norm": 151.93906740828345, + "learning_rate": 1.3035795606948021e-07, + "logits/chosen": 0.4577603042125702, + "logits/rejected": 0.3892001807689667, + "logps/chosen": -455.62261962890625, + "logps/rejected": -465.81658935546875, + "loss": 0.5716, + "nll_loss": 0.4209946393966675, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.4607146382331848, + "rewards/margins": 0.20319023728370667, + "rewards/rejected": 0.25752440094947815, + "step": 492 + }, + { + "epoch": 0.4292852487508147, + "grad_norm": 213.10071130507995, + "learning_rate": 1.2981322222145844e-07, + "logits/chosen": 0.4598763585090637, + "logits/rejected": 0.32266587018966675, + "logps/chosen": -484.8750915527344, + "logps/rejected": -435.47308349609375, + "loss": 0.6497, + "nll_loss": 0.4658326804637909, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3073493242263794, + "rewards/margins": 0.3369705080986023, + "rewards/rejected": -0.029621221125125885, + "step": 494 + }, + { + "epoch": 0.43102324570932, + "grad_norm": 158.91277416486358, + "learning_rate": 1.2926751566629875e-07, + "logits/chosen": 0.23220184445381165, + "logits/rejected": 0.3059692084789276, + "logps/chosen": -456.1237487792969, + "logps/rejected": -553.1903686523438, + "loss": 0.6046, + "nll_loss": 0.447183221578598, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.6733037829399109, + "rewards/margins": 0.8882058262825012, + "rewards/rejected": -0.2149021029472351, + "step": 496 + }, + { + "epoch": 0.43276124266782534, + "grad_norm": 125.90740115164517, + "learning_rate": 1.2872085420860664e-07, + "logits/chosen": 0.23051905632019043, + "logits/rejected": 0.19383393228054047, + "logps/chosen": -480.57135009765625, + "logps/rejected": -472.86993408203125, + "loss": 0.577, + "nll_loss": 0.46721819043159485, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.0732322931289673, + "rewards/margins": 0.78470778465271, + "rewards/rejected": 0.28852444887161255, + "step": 498 + }, + { + "epoch": 0.43449923962633064, + "grad_norm": 211.1551689127061, + "learning_rate": 1.2817325568414297e-07, + "logits/chosen": 0.05189153552055359, + "logits/rejected": 0.02408377081155777, + "logps/chosen": -425.1849365234375, + "logps/rejected": -487.15814208984375, + "loss": 0.6655, + "nll_loss": 0.4015406668186188, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.4995962083339691, + "rewards/margins": 0.27894628047943115, + "rewards/rejected": 0.22064992785453796, + "step": 500 + }, + { + "epoch": 0.436237236584836, + "grad_norm": 186.1345026565842, + "learning_rate": 1.2762473795924203e-07, + "logits/chosen": 0.6582891941070557, + "logits/rejected": 0.6144505739212036, + "logps/chosen": -484.273193359375, + "logps/rejected": -491.8813171386719, + "loss": 0.6492, + "nll_loss": 0.4544438123703003, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.533101499080658, + "rewards/margins": 0.2257290929555893, + "rewards/rejected": 0.3073723614215851, + "step": 502 + }, + { + "epoch": 0.4379752335433413, + "grad_norm": 160.01923268108317, + "learning_rate": 1.2707531893022853e-07, + "logits/chosen": 0.2920701205730438, + "logits/rejected": 0.3716092109680176, + "logps/chosen": -456.92645263671875, + "logps/rejected": -524.2435302734375, + "loss": 0.5366, + "nll_loss": 0.4609117805957794, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.8777877688407898, + "rewards/margins": 0.4967181384563446, + "rewards/rejected": 0.3810696303844452, + "step": 504 + }, + { + "epoch": 0.43971323050184663, + "grad_norm": 300.5169860605167, + "learning_rate": 1.2652501652283377e-07, + "logits/chosen": 0.2937391996383667, + "logits/rejected": 0.3704921007156372, + "logps/chosen": -476.27325439453125, + "logps/rejected": -471.0837707519531, + "loss": 0.6354, + "nll_loss": 0.4966070353984833, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.5227044224739075, + "rewards/margins": 0.35230910778045654, + "rewards/rejected": 0.17039528489112854, + "step": 506 + }, + { + "epoch": 0.4414512274603519, + "grad_norm": 164.91548254513887, + "learning_rate": 1.2597384869161084e-07, + "logits/chosen": 0.5253309011459351, + "logits/rejected": 0.4753131568431854, + "logps/chosen": -454.3218994140625, + "logps/rejected": -472.0747985839844, + "loss": 0.6169, + "nll_loss": 0.43823111057281494, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.2607538402080536, + "rewards/margins": 0.1995864063501358, + "rewards/rejected": 0.06116743013262749, + "step": 508 + }, + { + "epoch": 0.4431892244188573, + "grad_norm": 179.30441921156591, + "learning_rate": 1.254218334193487e-07, + "logits/chosen": 0.2752171754837036, + "logits/rejected": 0.3763906955718994, + "logps/chosen": -520.6530151367188, + "logps/rejected": -562.1898193359375, + "loss": 0.6609, + "nll_loss": 0.525636613368988, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4572681486606598, + "rewards/margins": 0.32413142919540405, + "rewards/rejected": 0.13313673436641693, + "step": 510 + }, + { + "epoch": 0.44492722137736257, + "grad_norm": 243.18039368915072, + "learning_rate": 1.248689887164855e-07, + "logits/chosen": 0.35667744278907776, + "logits/rejected": 0.43777772784233093, + "logps/chosen": -563.6502075195312, + "logps/rejected": -510.6645812988281, + "loss": 0.7224, + "nll_loss": 0.5323516130447388, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.286058247089386, + "rewards/margins": -0.11358994990587234, + "rewards/rejected": 0.3996482193470001, + "step": 512 + }, + { + "epoch": 0.4466652183358679, + "grad_norm": 181.5612438218865, + "learning_rate": 1.2431533262052096e-07, + "logits/chosen": 0.3660877048969269, + "logits/rejected": 0.43879637122154236, + "logps/chosen": -479.750244140625, + "logps/rejected": -486.38720703125, + "loss": 0.5727, + "nll_loss": 0.4494829773902893, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.5008564591407776, + "rewards/margins": 0.5270312428474426, + "rewards/rejected": -0.02617482841014862, + "step": 514 + }, + { + "epoch": 0.4484032152943732, + "grad_norm": 145.89465511544117, + "learning_rate": 1.23760883195428e-07, + "logits/chosen": 0.4242178797721863, + "logits/rejected": 0.321554571390152, + "logps/chosen": -525.2330932617188, + "logps/rejected": -488.3087158203125, + "loss": 0.5768, + "nll_loss": 0.5024228096008301, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.41718295216560364, + "rewards/margins": 0.37589991092681885, + "rewards/rejected": 0.04128303378820419, + "step": 516 + }, + { + "epoch": 0.45014121225287856, + "grad_norm": 143.8495286961736, + "learning_rate": 1.2320565853106316e-07, + "logits/chosen": 0.22734031081199646, + "logits/rejected": 0.05379747971892357, + "logps/chosen": -566.68603515625, + "logps/rejected": -463.33013916015625, + "loss": 0.5937, + "nll_loss": 0.5050527453422546, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.1884550154209137, + "rewards/margins": 0.32553404569625854, + "rewards/rejected": -0.13707906007766724, + "step": 518 + }, + { + "epoch": 0.45187920921138386, + "grad_norm": 146.48961796321032, + "learning_rate": 1.2264967674257644e-07, + "logits/chosen": 0.302290141582489, + "logits/rejected": 0.1225215271115303, + "logps/chosen": -571.464111328125, + "logps/rejected": -472.727783203125, + "loss": 0.6082, + "nll_loss": 0.506357729434967, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.434457391500473, + "rewards/margins": 0.5556737780570984, + "rewards/rejected": -0.12121641635894775, + "step": 520 + }, + { + "epoch": 0.4536172061698892, + "grad_norm": 196.9459868837508, + "learning_rate": 1.220929559698204e-07, + "logits/chosen": 0.3795379102230072, + "logits/rejected": 0.3557262718677521, + "logps/chosen": -468.3785400390625, + "logps/rejected": -464.5234375, + "loss": 0.6234, + "nll_loss": 0.4429006576538086, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.4287263751029968, + "rewards/margins": 0.4739219844341278, + "rewards/rejected": -0.045195575803518295, + "step": 522 + }, + { + "epoch": 0.4553552031283945, + "grad_norm": 150.5136352508016, + "learning_rate": 1.2153551437675818e-07, + "logits/chosen": 0.6236774325370789, + "logits/rejected": 0.625653862953186, + "logps/chosen": -456.63250732421875, + "logps/rejected": -428.4668884277344, + "loss": 0.6435, + "nll_loss": 0.4418932795524597, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.4233695864677429, + "rewards/margins": 0.24205005168914795, + "rewards/rejected": 0.18131951987743378, + "step": 524 + }, + { + "epoch": 0.45709320008689985, + "grad_norm": 129.14095160067973, + "learning_rate": 1.2097737015087092e-07, + "logits/chosen": 0.45213156938552856, + "logits/rejected": 0.4116738438606262, + "logps/chosen": -487.0008239746094, + "logps/rejected": -510.26116943359375, + "loss": 0.5883, + "nll_loss": 0.4597417414188385, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3893871307373047, + "rewards/margins": 0.24991761147975922, + "rewards/rejected": 0.13946953415870667, + "step": 526 + }, + { + "epoch": 0.45883119704540515, + "grad_norm": 159.8229027766876, + "learning_rate": 1.2041854150256433e-07, + "logits/chosen": 0.11591125279664993, + "logits/rejected": 0.2681584060192108, + "logps/chosen": -506.5346984863281, + "logps/rejected": -575.7164916992188, + "loss": 0.6024, + "nll_loss": 0.474510133266449, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.5386300086975098, + "rewards/margins": 0.23797369003295898, + "rewards/rejected": 0.3006563186645508, + "step": 528 + }, + { + "epoch": 0.4605691940039105, + "grad_norm": 129.70492097075226, + "learning_rate": 1.1985904666457453e-07, + "logits/chosen": 0.30116310715675354, + "logits/rejected": 0.11452615261077881, + "logps/chosen": -484.81524658203125, + "logps/rejected": -478.3340148925781, + "loss": 0.5955, + "nll_loss": 0.480922132730484, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.7380178570747375, + "rewards/margins": 0.6249958872795105, + "rewards/rejected": 0.11302205920219421, + "step": 530 + }, + { + "epoch": 0.4623071909624158, + "grad_norm": 151.28619497251262, + "learning_rate": 1.1929890389137336e-07, + "logits/chosen": 0.8342644572257996, + "logits/rejected": 0.655091404914856, + "logps/chosen": -472.6812438964844, + "logps/rejected": -445.85791015625, + "loss": 0.5429, + "nll_loss": 0.45103174448013306, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.5036472678184509, + "rewards/margins": 0.3290826082229614, + "rewards/rejected": 0.1745646446943283, + "step": 532 + }, + { + "epoch": 0.46404518792092114, + "grad_norm": 170.35845348035963, + "learning_rate": 1.1873813145857248e-07, + "logits/chosen": 0.28095149993896484, + "logits/rejected": 0.2095193713903427, + "logps/chosen": -461.6504821777344, + "logps/rejected": -511.35821533203125, + "loss": 0.5845, + "nll_loss": 0.4316372573375702, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6642789244651794, + "rewards/margins": 0.5230391621589661, + "rewards/rejected": 0.14123974740505219, + "step": 534 + }, + { + "epoch": 0.46578318487942644, + "grad_norm": 208.15877393203985, + "learning_rate": 1.1817674766232732e-07, + "logits/chosen": 0.2754078507423401, + "logits/rejected": 0.2587903141975403, + "logps/chosen": -457.4371337890625, + "logps/rejected": -485.6224365234375, + "loss": 0.6047, + "nll_loss": 0.44060018658638, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.30926474928855896, + "rewards/margins": 0.10683069378137589, + "rewards/rejected": 0.20243406295776367, + "step": 536 + }, + { + "epoch": 0.4675211818379318, + "grad_norm": 157.39469864638943, + "learning_rate": 1.1761477081874014e-07, + "logits/chosen": 0.30831941962242126, + "logits/rejected": 0.2548404335975647, + "logps/chosen": -510.2038269042969, + "logps/rejected": -530.1707153320312, + "loss": 0.6053, + "nll_loss": 0.495877206325531, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.9113235473632812, + "rewards/margins": 0.38738253712654114, + "rewards/rejected": 0.5239410400390625, + "step": 538 + }, + { + "epoch": 0.4692591787964371, + "grad_norm": 174.35032866933201, + "learning_rate": 1.1705221926326239e-07, + "logits/chosen": 0.8459619283676147, + "logits/rejected": 0.6783990859985352, + "logps/chosen": -508.0318908691406, + "logps/rejected": -472.6733703613281, + "loss": 0.6481, + "nll_loss": 0.4781099557876587, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5047274231910706, + "rewards/margins": 0.45192253589630127, + "rewards/rejected": 0.0528048537671566, + "step": 540 + }, + { + "epoch": 0.47099717575494243, + "grad_norm": 134.3749169975391, + "learning_rate": 1.1648911135009633e-07, + "logits/chosen": 0.2638280391693115, + "logits/rejected": 0.32197749614715576, + "logps/chosen": -502.83843994140625, + "logps/rejected": -538.831787109375, + "loss": 0.5802, + "nll_loss": 0.4893040359020233, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.065590262413025, + "rewards/margins": 0.8750788569450378, + "rewards/rejected": 0.19051150977611542, + "step": 542 + }, + { + "epoch": 0.4727351727134477, + "grad_norm": 145.88486424397541, + "learning_rate": 1.1592546545159644e-07, + "logits/chosen": 0.22533680498600006, + "logits/rejected": 0.2013210654258728, + "logps/chosen": -454.45269775390625, + "logps/rejected": -471.0827941894531, + "loss": 0.5753, + "nll_loss": 0.4502761960029602, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.9982298016548157, + "rewards/margins": 0.5919008851051331, + "rewards/rejected": 0.40632885694503784, + "step": 544 + }, + { + "epoch": 0.4744731696719531, + "grad_norm": 137.1178337205799, + "learning_rate": 1.1536129995766994e-07, + "logits/chosen": 0.35270434617996216, + "logits/rejected": 0.29984721541404724, + "logps/chosen": -538.6884765625, + "logps/rejected": -470.1349792480469, + "loss": 0.6014, + "nll_loss": 0.4900573790073395, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.3891260325908661, + "rewards/margins": -0.021922580897808075, + "rewards/rejected": 0.41104862093925476, + "step": 546 + }, + { + "epoch": 0.47621116663045837, + "grad_norm": 230.3870356848052, + "learning_rate": 1.1479663327517666e-07, + "logits/chosen": 0.44097521901130676, + "logits/rejected": 0.223893404006958, + "logps/chosen": -431.5259704589844, + "logps/rejected": -425.232177734375, + "loss": 0.6691, + "nll_loss": 0.41176140308380127, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.251619815826416, + "rewards/margins": 0.28884509205818176, + "rewards/rejected": -0.03722524642944336, + "step": 548 + }, + { + "epoch": 0.4779491635889637, + "grad_norm": 155.67191376098575, + "learning_rate": 1.1423148382732852e-07, + "logits/chosen": 0.4796018600463867, + "logits/rejected": 0.38772526383399963, + "logps/chosen": -464.15447998046875, + "logps/rejected": -450.4916687011719, + "loss": 0.5792, + "nll_loss": 0.44901058077812195, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.4397455155849457, + "rewards/margins": 0.4938681125640869, + "rewards/rejected": -0.05412254482507706, + "step": 550 + }, + { + "epoch": 0.479687160547469, + "grad_norm": 228.9713106548911, + "learning_rate": 1.1366587005308857e-07, + "logits/chosen": 0.339149534702301, + "logits/rejected": 0.4987982511520386, + "logps/chosen": -453.7382507324219, + "logps/rejected": -449.76123046875, + "loss": 0.5801, + "nll_loss": 0.44411030411720276, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.3493182063102722, + "rewards/margins": 0.255526602268219, + "rewards/rejected": 0.09379158169031143, + "step": 552 + }, + { + "epoch": 0.48142515750597437, + "grad_norm": 115.06327317172317, + "learning_rate": 1.1309981040656929e-07, + "logits/chosen": 0.34733298420906067, + "logits/rejected": 0.48480936884880066, + "logps/chosen": -473.6231994628906, + "logps/rejected": -590.5986938476562, + "loss": 0.6056, + "nll_loss": 0.4536553919315338, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.29592639207839966, + "rewards/margins": 0.6725630760192871, + "rewards/rejected": -0.37663671374320984, + "step": 554 + }, + { + "epoch": 0.48316315446447966, + "grad_norm": 169.34703454184748, + "learning_rate": 1.1253332335643042e-07, + "logits/chosen": 0.37676888704299927, + "logits/rejected": 0.3015158772468567, + "logps/chosen": -512.7796630859375, + "logps/rejected": -499.06634521484375, + "loss": 0.5603, + "nll_loss": 0.4944426715373993, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.7520829439163208, + "rewards/margins": 0.9439869523048401, + "rewards/rejected": -0.1919039934873581, + "step": 556 + }, + { + "epoch": 0.484901151422985, + "grad_norm": 161.22134816206415, + "learning_rate": 1.1196642738527657e-07, + "logits/chosen": 0.23896373808383942, + "logits/rejected": 0.19408001005649567, + "logps/chosen": -426.60546875, + "logps/rejected": -434.4737854003906, + "loss": 0.5932, + "nll_loss": 0.39752912521362305, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.6780611872673035, + "rewards/margins": 0.5683779120445251, + "rewards/rejected": 0.1096833273768425, + "step": 558 + }, + { + "epoch": 0.4866391483814903, + "grad_norm": 127.96582961604234, + "learning_rate": 1.1139914098905406e-07, + "logits/chosen": 0.5050298571586609, + "logits/rejected": 0.3574158251285553, + "logps/chosen": -476.66937255859375, + "logps/rejected": -472.6229553222656, + "loss": 0.4979, + "nll_loss": 0.46670016646385193, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.2314346432685852, + "rewards/margins": 0.6994683742523193, + "rewards/rejected": -0.46803373098373413, + "step": 560 + }, + { + "epoch": 0.48837714533999566, + "grad_norm": 223.9780474943852, + "learning_rate": 1.1083148267644747e-07, + "logits/chosen": 0.39628350734710693, + "logits/rejected": 0.6703532338142395, + "logps/chosen": -488.671142578125, + "logps/rejected": -539.9094848632812, + "loss": 0.6061, + "nll_loss": 0.46648550033569336, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5457121133804321, + "rewards/margins": 0.30177631974220276, + "rewards/rejected": 0.24393577873706818, + "step": 562 + }, + { + "epoch": 0.490115142298501, + "grad_norm": 158.92810889604107, + "learning_rate": 1.1026347096827577e-07, + "logits/chosen": 0.21636277437210083, + "logits/rejected": 0.13212069869041443, + "logps/chosen": -502.3656921386719, + "logps/rejected": -433.3890686035156, + "loss": 0.5821, + "nll_loss": 0.47475725412368774, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.3836136758327484, + "rewards/margins": 0.23296299576759338, + "rewards/rejected": 0.15065070986747742, + "step": 564 + }, + { + "epoch": 0.4918531392570063, + "grad_norm": 143.877679933503, + "learning_rate": 1.0969512439688814e-07, + "logits/chosen": 0.4711693227291107, + "logits/rejected": 0.4417217969894409, + "logps/chosen": -496.0475769042969, + "logps/rejected": -466.2301940917969, + "loss": 0.633, + "nll_loss": 0.44707292318344116, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.36830979585647583, + "rewards/margins": 0.029814153909683228, + "rewards/rejected": 0.3384956419467926, + "step": 566 + }, + { + "epoch": 0.49359113621551165, + "grad_norm": 142.79523446099884, + "learning_rate": 1.0912646150555917e-07, + "logits/chosen": 0.47549864649772644, + "logits/rejected": 0.5703907608985901, + "logps/chosen": -439.390625, + "logps/rejected": -507.3600158691406, + "loss": 0.6273, + "nll_loss": 0.4341786503791809, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.22542604804039001, + "rewards/margins": 0.32407084107398987, + "rewards/rejected": -0.09864482283592224, + "step": 568 + }, + { + "epoch": 0.49532913317401694, + "grad_norm": 168.53592808344612, + "learning_rate": 1.0855750084788397e-07, + "logits/chosen": 0.13578006625175476, + "logits/rejected": 0.22005786001682281, + "logps/chosen": -495.08270263671875, + "logps/rejected": -540.0595092773438, + "loss": 0.6059, + "nll_loss": 0.49262234568595886, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.7732293605804443, + "rewards/margins": 0.5452542304992676, + "rewards/rejected": 0.22797507047653198, + "step": 570 + }, + { + "epoch": 0.4970671301325223, + "grad_norm": 158.41658922638484, + "learning_rate": 1.0798826098717275e-07, + "logits/chosen": 0.18378670513629913, + "logits/rejected": 0.14254061877727509, + "logps/chosen": -484.80621337890625, + "logps/rejected": -501.53265380859375, + "loss": 0.6449, + "nll_loss": 0.4623822271823883, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.1718544214963913, + "rewards/margins": 0.07776500284671783, + "rewards/rejected": 0.09408941119909286, + "step": 572 + }, + { + "epoch": 0.4988051270910276, + "grad_norm": 118.95912857829893, + "learning_rate": 1.0741876049584522e-07, + "logits/chosen": 0.46760082244873047, + "logits/rejected": 0.6172095537185669, + "logps/chosen": -450.90899658203125, + "logps/rejected": -485.8258056640625, + "loss": 0.6145, + "nll_loss": 0.4425750970840454, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.4866788983345032, + "rewards/margins": 0.5924919247627258, + "rewards/rejected": -0.10581303387880325, + "step": 574 + }, + { + "epoch": 0.5005431240495329, + "grad_norm": 149.05944822937784, + "learning_rate": 1.0684901795482455e-07, + "logits/chosen": 0.20234175026416779, + "logits/rejected": 0.1904958337545395, + "logps/chosen": -478.34521484375, + "logps/rejected": -462.22735595703125, + "loss": 0.6229, + "nll_loss": 0.4496627151966095, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.3698406219482422, + "rewards/margins": 0.26504114270210266, + "rewards/rejected": 0.10479945689439774, + "step": 576 + }, + { + "epoch": 0.5022811210080382, + "grad_norm": 191.23054812610573, + "learning_rate": 1.0627905195293134e-07, + "logits/chosen": 0.5189734697341919, + "logits/rejected": 0.40094074606895447, + "logps/chosen": -569.7152709960938, + "logps/rejected": -532.6102905273438, + "loss": 0.6364, + "nll_loss": 0.5307385921478271, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.7689107656478882, + "rewards/margins": 0.6044713854789734, + "rewards/rejected": 0.16443948447704315, + "step": 578 + }, + { + "epoch": 0.5040191179665435, + "grad_norm": 138.79983096839496, + "learning_rate": 1.057088810862768e-07, + "logits/chosen": 0.4737722873687744, + "logits/rejected": 0.5332717895507812, + "logps/chosen": -517.5496215820312, + "logps/rejected": -510.6952819824219, + "loss": 0.6599, + "nll_loss": 0.45416009426116943, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.5199866890907288, + "rewards/margins": 0.19118711352348328, + "rewards/rejected": 0.32879963517189026, + "step": 580 + }, + { + "epoch": 0.5057571149250489, + "grad_norm": 164.12647001236164, + "learning_rate": 1.051385239576563e-07, + "logits/chosen": 0.7203247547149658, + "logits/rejected": 0.5283986330032349, + "logps/chosen": -516.7804565429688, + "logps/rejected": -512.832275390625, + "loss": 0.6016, + "nll_loss": 0.4893644452095032, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4361743927001953, + "rewards/margins": 0.40919187664985657, + "rewards/rejected": 0.02698250487446785, + "step": 582 + }, + { + "epoch": 0.5074951118835542, + "grad_norm": 183.72101725659599, + "learning_rate": 1.0456799917594232e-07, + "logits/chosen": 0.40470677614212036, + "logits/rejected": 0.35310330986976624, + "logps/chosen": -465.427978515625, + "logps/rejected": -491.4359130859375, + "loss": 0.624, + "nll_loss": 0.44728025794029236, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.47867053747177124, + "rewards/margins": 0.2610442340373993, + "rewards/rejected": 0.21762628853321075, + "step": 584 + }, + { + "epoch": 0.5092331088420595, + "grad_norm": 155.45934240025235, + "learning_rate": 1.0399732535547734e-07, + "logits/chosen": 0.057142481207847595, + "logits/rejected": 0.040528830140829086, + "logps/chosen": -399.9873046875, + "logps/rejected": -442.05029296875, + "loss": 0.6404, + "nll_loss": 0.42519205808639526, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.6752683520317078, + "rewards/margins": 0.7227709889411926, + "rewards/rejected": -0.04750242829322815, + "step": 586 + }, + { + "epoch": 0.5109711058005648, + "grad_norm": 166.46533111008497, + "learning_rate": 1.0342652111546635e-07, + "logits/chosen": 0.2565877139568329, + "logits/rejected": 0.3365154266357422, + "logps/chosen": -478.84820556640625, + "logps/rejected": -534.953857421875, + "loss": 0.6102, + "nll_loss": 0.4510883092880249, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.8399749398231506, + "rewards/margins": 0.3826572000980377, + "rewards/rejected": 0.4573177099227905, + "step": 588 + }, + { + "epoch": 0.5127091027590702, + "grad_norm": 122.85256715814496, + "learning_rate": 1.0285560507936961e-07, + "logits/chosen": 0.4950277805328369, + "logits/rejected": 0.5933247804641724, + "logps/chosen": -484.6195068359375, + "logps/rejected": -523.1106567382812, + "loss": 0.6011, + "nll_loss": 0.4561443626880646, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.41678088903427124, + "rewards/margins": 0.32378941774368286, + "rewards/rejected": 0.09299144148826599, + "step": 590 + }, + { + "epoch": 0.5144470997175755, + "grad_norm": 190.64666153846562, + "learning_rate": 1.0228459587429496e-07, + "logits/chosen": 0.30040597915649414, + "logits/rejected": 0.4183442294597626, + "logps/chosen": -459.64776611328125, + "logps/rejected": -459.33148193359375, + "loss": 0.6286, + "nll_loss": 0.4211972951889038, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7403571009635925, + "rewards/margins": 0.3292871117591858, + "rewards/rejected": 0.41106992959976196, + "step": 592 + }, + { + "epoch": 0.5161850966760808, + "grad_norm": 128.71076382897667, + "learning_rate": 1.0171351213038992e-07, + "logits/chosen": 0.5232183337211609, + "logits/rejected": 0.4865014851093292, + "logps/chosen": -468.09246826171875, + "logps/rejected": -496.3455505371094, + "loss": 0.6034, + "nll_loss": 0.45981934666633606, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.43860453367233276, + "rewards/margins": 0.31035470962524414, + "rewards/rejected": 0.128249853849411, + "step": 594 + }, + { + "epoch": 0.5179230936345861, + "grad_norm": 213.50640814106367, + "learning_rate": 1.0114237248023403e-07, + "logits/chosen": 0.5549350380897522, + "logits/rejected": 0.6062289476394653, + "logps/chosen": -448.6082458496094, + "logps/rejected": -485.8319396972656, + "loss": 0.6498, + "nll_loss": 0.4462430477142334, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.49667686223983765, + "rewards/margins": 0.1833028942346573, + "rewards/rejected": 0.31337395310401917, + "step": 596 + }, + { + "epoch": 0.5196610905930915, + "grad_norm": 185.87793569991007, + "learning_rate": 1.0057119555823083e-07, + "logits/chosen": 0.2625095248222351, + "logits/rejected": 0.3052242398262024, + "logps/chosen": -490.8760986328125, + "logps/rejected": -475.52947998046875, + "loss": 0.5803, + "nll_loss": 0.4620268940925598, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6914440989494324, + "rewards/margins": 0.6549934148788452, + "rewards/rejected": 0.03645067289471626, + "step": 598 + }, + { + "epoch": 0.5213990875515968, + "grad_norm": 263.05843652050453, + "learning_rate": 1e-07, + "logits/chosen": 0.5347599983215332, + "logits/rejected": 0.6004009246826172, + "logps/chosen": -468.2330017089844, + "logps/rejected": -488.34033203125, + "loss": 0.5876, + "nll_loss": 0.44898730516433716, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.29759863018989563, + "rewards/margins": 0.2130270004272461, + "rewards/rejected": 0.08457164466381073, + "step": 600 + }, + { + "epoch": 0.5231370845101021, + "grad_norm": 334.5831268901687, + "learning_rate": 9.942880444176916e-08, + "logits/chosen": 0.5655696988105774, + "logits/rejected": 0.4727866053581238, + "logps/chosen": -519.6710205078125, + "logps/rejected": -482.19970703125, + "loss": 0.6295, + "nll_loss": 0.48799240589141846, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.7050300240516663, + "rewards/margins": 0.6107040643692017, + "rewards/rejected": 0.09432603418827057, + "step": 602 + }, + { + "epoch": 0.5248750814686074, + "grad_norm": 109.64482292970844, + "learning_rate": 9.885762751976599e-08, + "logits/chosen": 0.32183387875556946, + "logits/rejected": 0.334905743598938, + "logps/chosen": -468.80841064453125, + "logps/rejected": -527.1265258789062, + "loss": 0.5201, + "nll_loss": 0.442211776971817, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.38873061537742615, + "rewards/margins": 0.5435060858726501, + "rewards/rejected": -0.154775470495224, + "step": 604 + }, + { + "epoch": 0.5266130784271128, + "grad_norm": 186.41721141455656, + "learning_rate": 9.828648786961007e-08, + "logits/chosen": 0.3031446933746338, + "logits/rejected": 0.3322385549545288, + "logps/chosen": -510.8680114746094, + "logps/rejected": -531.59521484375, + "loss": 0.5985, + "nll_loss": 0.5014962553977966, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.2036452293395996, + "rewards/margins": 0.3037644624710083, + "rewards/rejected": -0.1001192182302475, + "step": 606 + }, + { + "epoch": 0.5283510753856181, + "grad_norm": 274.3454262806401, + "learning_rate": 9.771540412570503e-08, + "logits/chosen": 0.5674399733543396, + "logits/rejected": 0.37741467356681824, + "logps/chosen": -507.96551513671875, + "logps/rejected": -557.3289184570312, + "loss": 0.6339, + "nll_loss": 0.4965820908546448, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.540863037109375, + "rewards/margins": 0.5740821957588196, + "rewards/rejected": -0.033219143748283386, + "step": 608 + }, + { + "epoch": 0.5300890723441234, + "grad_norm": 163.62183547762456, + "learning_rate": 9.714439492063038e-08, + "logits/chosen": 0.2575898766517639, + "logits/rejected": 0.1513080596923828, + "logps/chosen": -489.66778564453125, + "logps/rejected": -457.0095520019531, + "loss": 0.5504, + "nll_loss": 0.44628310203552246, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3893031179904938, + "rewards/margins": 0.4137801229953766, + "rewards/rejected": -0.02447700686752796, + "step": 610 + }, + { + "epoch": 0.5318270693026287, + "grad_norm": 125.74716419736642, + "learning_rate": 9.657347888453366e-08, + "logits/chosen": 0.6081162095069885, + "logits/rejected": 0.40688732266426086, + "logps/chosen": -536.954833984375, + "logps/rejected": -493.84674072265625, + "loss": 0.5403, + "nll_loss": 0.5140741467475891, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.22217179834842682, + "rewards/margins": 0.21463699638843536, + "rewards/rejected": 0.007534794509410858, + "step": 612 + }, + { + "epoch": 0.5335650662611341, + "grad_norm": 151.84445184165773, + "learning_rate": 9.600267464452268e-08, + "logits/chosen": 0.4383659362792969, + "logits/rejected": 0.3102107644081116, + "logps/chosen": -534.6839599609375, + "logps/rejected": -535.601318359375, + "loss": 0.6385, + "nll_loss": 0.4828362762928009, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.4167993366718292, + "rewards/margins": 0.35233569145202637, + "rewards/rejected": 0.06446360796689987, + "step": 614 + }, + { + "epoch": 0.5353030632196394, + "grad_norm": 165.9235457287381, + "learning_rate": 9.543200082405767e-08, + "logits/chosen": 0.6091023683547974, + "logits/rejected": 0.5879953503608704, + "logps/chosen": -553.71142578125, + "logps/rejected": -550.5123291015625, + "loss": 0.5878, + "nll_loss": 0.5072000622749329, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5500698089599609, + "rewards/margins": 0.4482544958591461, + "rewards/rejected": 0.10181531310081482, + "step": 616 + }, + { + "epoch": 0.5370410601781447, + "grad_norm": 126.54308707325764, + "learning_rate": 9.48614760423437e-08, + "logits/chosen": 0.18392491340637207, + "logits/rejected": 0.3804677128791809, + "logps/chosen": -381.7693786621094, + "logps/rejected": -473.5242919921875, + "loss": 0.5744, + "nll_loss": 0.39874348044395447, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4918847680091858, + "rewards/margins": 0.5275743007659912, + "rewards/rejected": -0.03568943962454796, + "step": 618 + }, + { + "epoch": 0.53877905713665, + "grad_norm": 171.08443280226848, + "learning_rate": 9.429111891372318e-08, + "logits/chosen": 0.2717643678188324, + "logits/rejected": 0.18191061913967133, + "logps/chosen": -497.6204833984375, + "logps/rejected": -487.7861022949219, + "loss": 0.6364, + "nll_loss": 0.45864543318748474, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.40872687101364136, + "rewards/margins": 0.272097647190094, + "rewards/rejected": 0.13662928342819214, + "step": 620 + }, + { + "epoch": 0.5405170540951554, + "grad_norm": 353.14665428046374, + "learning_rate": 9.372094804706866e-08, + "logits/chosen": 0.34724730253219604, + "logits/rejected": 0.542032778263092, + "logps/chosen": -481.6340026855469, + "logps/rejected": -492.6419982910156, + "loss": 0.5841, + "nll_loss": 0.4751821458339691, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.437094122171402, + "rewards/margins": 0.47972115874290466, + "rewards/rejected": -0.042627010494470596, + "step": 622 + }, + { + "epoch": 0.5422550510536607, + "grad_norm": 181.9942371418194, + "learning_rate": 9.315098204517542e-08, + "logits/chosen": 0.5911327600479126, + "logits/rejected": 0.686689019203186, + "logps/chosen": -493.7080383300781, + "logps/rejected": -526.79443359375, + "loss": 0.6313, + "nll_loss": 0.46785080432891846, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.5453550815582275, + "rewards/margins": 0.2460251748561859, + "rewards/rejected": 0.29932984709739685, + "step": 624 + }, + { + "epoch": 0.543993048012166, + "grad_norm": 132.5432904481579, + "learning_rate": 9.258123950415478e-08, + "logits/chosen": 0.2856918275356293, + "logits/rejected": 0.2954900860786438, + "logps/chosen": -457.42034912109375, + "logps/rejected": -460.74700927734375, + "loss": 0.5104, + "nll_loss": 0.4443623423576355, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6252665519714355, + "rewards/margins": 0.5157196521759033, + "rewards/rejected": 0.10954684019088745, + "step": 626 + }, + { + "epoch": 0.5457310449706713, + "grad_norm": 162.89205752421685, + "learning_rate": 9.201173901282723e-08, + "logits/chosen": 0.27452006936073303, + "logits/rejected": 0.3245624899864197, + "logps/chosen": -469.73541259765625, + "logps/rejected": -475.74957275390625, + "loss": 0.6022, + "nll_loss": 0.45277708768844604, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.48880985379219055, + "rewards/margins": 0.12186084687709808, + "rewards/rejected": 0.3669489920139313, + "step": 628 + }, + { + "epoch": 0.5474690419291767, + "grad_norm": 253.42365459394915, + "learning_rate": 9.144249915211604e-08, + "logits/chosen": 0.2531193494796753, + "logits/rejected": 0.2789984941482544, + "logps/chosen": -463.0240478515625, + "logps/rejected": -504.46575927734375, + "loss": 0.6221, + "nll_loss": 0.49072834849357605, + "rewards/accuracies": 0.6875, + "rewards/chosen": 1.0013586282730103, + "rewards/margins": 0.46089407801628113, + "rewards/rejected": 0.540464460849762, + "step": 630 + }, + { + "epoch": 0.549207038887682, + "grad_norm": 125.5789962477866, + "learning_rate": 9.087353849444083e-08, + "logits/chosen": 0.4191408157348633, + "logits/rejected": 0.30525293946266174, + "logps/chosen": -530.7289428710938, + "logps/rejected": -490.19366455078125, + "loss": 0.5603, + "nll_loss": 0.5064207315444946, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2410431057214737, + "rewards/margins": 0.24200935661792755, + "rewards/rejected": -0.0009662678348831832, + "step": 632 + }, + { + "epoch": 0.5509450358461873, + "grad_norm": 175.99284235626237, + "learning_rate": 9.030487560311185e-08, + "logits/chosen": 0.6235304474830627, + "logits/rejected": 0.5203417539596558, + "logps/chosen": -483.08404541015625, + "logps/rejected": -489.843994140625, + "loss": 0.5813, + "nll_loss": 0.47011032700538635, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5235908627510071, + "rewards/margins": 0.4106482267379761, + "rewards/rejected": 0.11294259130954742, + "step": 634 + }, + { + "epoch": 0.5526830328046926, + "grad_norm": 198.90559585429537, + "learning_rate": 8.973652903172422e-08, + "logits/chosen": 0.47140440344810486, + "logits/rejected": 0.5593248605728149, + "logps/chosen": -518.4012451171875, + "logps/rejected": -533.6356201171875, + "loss": 0.5951, + "nll_loss": 0.4733196496963501, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.6831684112548828, + "rewards/margins": 0.2774263620376587, + "rewards/rejected": 0.4057420790195465, + "step": 636 + }, + { + "epoch": 0.554421029763198, + "grad_norm": 141.98294190025055, + "learning_rate": 8.916851732355253e-08, + "logits/chosen": 0.415526419878006, + "logits/rejected": 0.14838898181915283, + "logps/chosen": -462.7410583496094, + "logps/rejected": -434.5695495605469, + "loss": 0.5801, + "nll_loss": 0.43723177909851074, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.6801949143409729, + "rewards/margins": 0.39234715700149536, + "rewards/rejected": 0.28784769773483276, + "step": 638 + }, + { + "epoch": 0.5561590267217033, + "grad_norm": 183.53135339200935, + "learning_rate": 8.860085901094593e-08, + "logits/chosen": 0.4905831813812256, + "logits/rejected": 0.45162349939346313, + "logps/chosen": -513.9711303710938, + "logps/rejected": -480.98150634765625, + "loss": 0.5924, + "nll_loss": 0.4619283080101013, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.5028316378593445, + "rewards/margins": 0.22538775205612183, + "rewards/rejected": 0.27744388580322266, + "step": 640 + }, + { + "epoch": 0.5578970236802085, + "grad_norm": 165.1047438993026, + "learning_rate": 8.803357261472343e-08, + "logits/chosen": 0.4632202386856079, + "logits/rejected": 0.40927284955978394, + "logps/chosen": -527.6860961914062, + "logps/rejected": -524.5245361328125, + "loss": 0.587, + "nll_loss": 0.48514413833618164, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.688771665096283, + "rewards/margins": 0.652374267578125, + "rewards/rejected": 0.03639736771583557, + "step": 642 + }, + { + "epoch": 0.5596350206387138, + "grad_norm": 144.25736006644212, + "learning_rate": 8.746667664356956e-08, + "logits/chosen": -0.019076313823461533, + "logits/rejected": -0.005755473859608173, + "logps/chosen": -463.5662536621094, + "logps/rejected": -538.6168212890625, + "loss": 0.5763, + "nll_loss": 0.45522579550743103, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6912392377853394, + "rewards/margins": 0.3665103018283844, + "rewards/rejected": 0.32472896575927734, + "step": 644 + }, + { + "epoch": 0.5613730175972192, + "grad_norm": 124.41344478131262, + "learning_rate": 8.69001895934307e-08, + "logits/chosen": 0.4203868806362152, + "logits/rejected": 0.4716566205024719, + "logps/chosen": -449.85791015625, + "logps/rejected": -419.0743408203125, + "loss": 0.5856, + "nll_loss": 0.4255959987640381, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.426087349653244, + "rewards/margins": 0.16484859585762024, + "rewards/rejected": 0.2612387537956238, + "step": 646 + }, + { + "epoch": 0.5631110145557245, + "grad_norm": 133.22640371919314, + "learning_rate": 8.633412994691143e-08, + "logits/chosen": 0.7127283215522766, + "logits/rejected": 0.6688105463981628, + "logps/chosen": -516.0383911132812, + "logps/rejected": -493.16217041015625, + "loss": 0.6169, + "nll_loss": 0.489109069108963, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.5604671239852905, + "rewards/margins": 0.549665629863739, + "rewards/rejected": 0.010801505297422409, + "step": 648 + }, + { + "epoch": 0.5648490115142298, + "grad_norm": 254.8501288063932, + "learning_rate": 8.576851617267149e-08, + "logits/chosen": 0.6733088493347168, + "logits/rejected": 0.7629974484443665, + "logps/chosen": -494.3887939453125, + "logps/rejected": -501.49029541015625, + "loss": 0.5545, + "nll_loss": 0.47174495458602905, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.3747645318508148, + "rewards/margins": 0.24593639373779297, + "rewards/rejected": 0.12882815301418304, + "step": 650 + }, + { + "epoch": 0.5665870084727351, + "grad_norm": 254.4364667759312, + "learning_rate": 8.520336672482337e-08, + "logits/chosen": 0.3818691074848175, + "logits/rejected": 0.34968656301498413, + "logps/chosen": -503.2013854980469, + "logps/rejected": -476.4749755859375, + "loss": 0.6643, + "nll_loss": 0.44002267718315125, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4178128242492676, + "rewards/margins": 0.0481499545276165, + "rewards/rejected": 0.369662880897522, + "step": 652 + }, + { + "epoch": 0.5683250054312405, + "grad_norm": 145.54967811889972, + "learning_rate": 8.463870004233007e-08, + "logits/chosen": 0.2866371273994446, + "logits/rejected": 0.3002834618091583, + "logps/chosen": -455.74786376953125, + "logps/rejected": -458.2134704589844, + "loss": 0.5675, + "nll_loss": 0.46567031741142273, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.6578887701034546, + "rewards/margins": 0.5248188376426697, + "rewards/rejected": 0.1330699920654297, + "step": 654 + }, + { + "epoch": 0.5700630023897458, + "grad_norm": 187.06363223184863, + "learning_rate": 8.407453454840356e-08, + "logits/chosen": 0.5434020161628723, + "logits/rejected": 0.4297879934310913, + "logps/chosen": -485.2026062011719, + "logps/rejected": -492.1148681640625, + "loss": 0.5795, + "nll_loss": 0.44286367297172546, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.6396047472953796, + "rewards/margins": 0.44584396481513977, + "rewards/rejected": 0.19376078248023987, + "step": 656 + }, + { + "epoch": 0.5718009993482511, + "grad_norm": 150.6278217076987, + "learning_rate": 8.351088864990367e-08, + "logits/chosen": 0.23380723595619202, + "logits/rejected": 0.4935187101364136, + "logps/chosen": -443.7547607421875, + "logps/rejected": -480.4429626464844, + "loss": 0.5463, + "nll_loss": 0.4307003319263458, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.7835605144500732, + "rewards/margins": 0.7314401268959045, + "rewards/rejected": 0.05212049186229706, + "step": 658 + }, + { + "epoch": 0.5735389963067564, + "grad_norm": 152.2047609267045, + "learning_rate": 8.294778073673762e-08, + "logits/chosen": 0.33786603808403015, + "logits/rejected": 0.34570395946502686, + "logps/chosen": -413.5665588378906, + "logps/rejected": -484.5573425292969, + "loss": 0.5945, + "nll_loss": 0.41921356320381165, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.41442328691482544, + "rewards/margins": 0.3553313612937927, + "rewards/rejected": 0.0590919554233551, + "step": 660 + }, + { + "epoch": 0.5752769932652618, + "grad_norm": 98.79951835870942, + "learning_rate": 8.238522918125983e-08, + "logits/chosen": 0.5606307983398438, + "logits/rejected": 0.5547394156455994, + "logps/chosen": -492.9490966796875, + "logps/rejected": -486.5016174316406, + "loss": 0.5391, + "nll_loss": 0.47447332739830017, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.7773496508598328, + "rewards/margins": 0.3164869546890259, + "rewards/rejected": 0.46086278557777405, + "step": 662 + }, + { + "epoch": 0.5770149902237671, + "grad_norm": 139.0633092591687, + "learning_rate": 8.182325233767267e-08, + "logits/chosen": 0.4253976047039032, + "logits/rejected": 0.46992579102516174, + "logps/chosen": -450.5101623535156, + "logps/rejected": -494.90667724609375, + "loss": 0.571, + "nll_loss": 0.45393383502960205, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4154530167579651, + "rewards/margins": 0.47700321674346924, + "rewards/rejected": -0.061550240963697433, + "step": 664 + }, + { + "epoch": 0.5787529871822724, + "grad_norm": 173.14859519488306, + "learning_rate": 8.126186854142751e-08, + "logits/chosen": 0.8982985615730286, + "logits/rejected": 0.8473838567733765, + "logps/chosen": -465.6043701171875, + "logps/rejected": -483.4315185546875, + "loss": 0.6207, + "nll_loss": 0.4442789554595947, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3262258768081665, + "rewards/margins": 0.155735582113266, + "rewards/rejected": 0.17049026489257812, + "step": 666 + }, + { + "epoch": 0.5804909841407777, + "grad_norm": 128.41420443111352, + "learning_rate": 8.070109610862667e-08, + "logits/chosen": 0.16398797929286957, + "logits/rejected": 0.08619170635938644, + "logps/chosen": -456.3326721191406, + "logps/rejected": -419.8201904296875, + "loss": 0.5671, + "nll_loss": 0.4239170253276825, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6617186665534973, + "rewards/margins": 0.5650426745414734, + "rewards/rejected": 0.09667597711086273, + "step": 668 + }, + { + "epoch": 0.5822289810992831, + "grad_norm": 198.70109611531015, + "learning_rate": 8.014095333542547e-08, + "logits/chosen": 0.005634918808937073, + "logits/rejected": -0.006585095077753067, + "logps/chosen": -424.5329284667969, + "logps/rejected": -469.5064392089844, + "loss": 0.5497, + "nll_loss": 0.43806880712509155, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.148248314857483, + "rewards/margins": 0.7581798434257507, + "rewards/rejected": 0.3900684714317322, + "step": 670 + }, + { + "epoch": 0.5839669780577884, + "grad_norm": 200.8765720947795, + "learning_rate": 7.958145849743569e-08, + "logits/chosen": 0.38725346326828003, + "logits/rejected": 0.328239381313324, + "logps/chosen": -499.67547607421875, + "logps/rejected": -477.58050537109375, + "loss": 0.5503, + "nll_loss": 0.44405287504196167, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4755365550518036, + "rewards/margins": 0.31502342224121094, + "rewards/rejected": 0.16051313281059265, + "step": 672 + }, + { + "epoch": 0.5857049750162937, + "grad_norm": 138.10866585007568, + "learning_rate": 7.902262984912909e-08, + "logits/chosen": 0.5126460790634155, + "logits/rejected": 0.457733154296875, + "logps/chosen": -431.8909606933594, + "logps/rejected": -441.8907775878906, + "loss": 0.6765, + "nll_loss": 0.42958134412765503, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5870010852813721, + "rewards/margins": -0.019699443131685257, + "rewards/rejected": 0.6067005395889282, + "step": 674 + }, + { + "epoch": 0.587442971974799, + "grad_norm": 153.65852416714486, + "learning_rate": 7.846448562324182e-08, + "logits/chosen": 0.4234592318534851, + "logits/rejected": 0.4593704640865326, + "logps/chosen": -462.70745849609375, + "logps/rejected": -477.0972900390625, + "loss": 0.5945, + "nll_loss": 0.45969316363334656, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.7346555590629578, + "rewards/margins": 0.3743089735507965, + "rewards/rejected": 0.36034661531448364, + "step": 676 + }, + { + "epoch": 0.5891809689333044, + "grad_norm": 220.09425556349453, + "learning_rate": 7.79070440301796e-08, + "logits/chosen": 0.5279329419136047, + "logits/rejected": 0.5622657537460327, + "logps/chosen": -479.4163513183594, + "logps/rejected": -521.3204956054688, + "loss": 0.5814, + "nll_loss": 0.43853023648262024, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.529964804649353, + "rewards/margins": 0.25607913732528687, + "rewards/rejected": 0.27388572692871094, + "step": 678 + }, + { + "epoch": 0.5909189658918097, + "grad_norm": 114.49030691842641, + "learning_rate": 7.735032325742354e-08, + "logits/chosen": 0.414661169052124, + "logits/rejected": 0.4720441997051239, + "logps/chosen": -470.0740661621094, + "logps/rejected": -428.37396240234375, + "loss": 0.5653, + "nll_loss": 0.44829848408699036, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6440432071685791, + "rewards/margins": 0.4521419107913971, + "rewards/rejected": 0.19190125167369843, + "step": 680 + }, + { + "epoch": 0.592656962850315, + "grad_norm": 157.55052864927137, + "learning_rate": 7.679434146893684e-08, + "logits/chosen": 0.4875819683074951, + "logits/rejected": 0.5448699593544006, + "logps/chosen": -486.230712890625, + "logps/rejected": -494.6131591796875, + "loss": 0.657, + "nll_loss": 0.4658043384552002, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.47913867235183716, + "rewards/margins": 0.0011176057159900665, + "rewards/rejected": 0.4780210852622986, + "step": 682 + }, + { + "epoch": 0.5943949598088203, + "grad_norm": 106.99123250882965, + "learning_rate": 7.623911680457198e-08, + "logits/chosen": 0.538242518901825, + "logits/rejected": 0.4498624801635742, + "logps/chosen": -469.2276916503906, + "logps/rejected": -468.6518859863281, + "loss": 0.5517, + "nll_loss": 0.42697572708129883, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4259525537490845, + "rewards/margins": 0.28813114762306213, + "rewards/rejected": 0.13782137632369995, + "step": 684 + }, + { + "epoch": 0.5961329567673257, + "grad_norm": 136.23994501024254, + "learning_rate": 7.568466737947903e-08, + "logits/chosen": 0.48528510332107544, + "logits/rejected": 0.5946138501167297, + "logps/chosen": -511.0415954589844, + "logps/rejected": -517.5130004882812, + "loss": 0.5923, + "nll_loss": 0.47626355290412903, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.5737752914428711, + "rewards/margins": 0.3134174048900604, + "rewards/rejected": 0.2603578567504883, + "step": 686 + }, + { + "epoch": 0.597870953725831, + "grad_norm": 310.86088435708143, + "learning_rate": 7.513101128351454e-08, + "logits/chosen": 0.5474853515625, + "logits/rejected": 0.589713454246521, + "logps/chosen": -534.6466674804688, + "logps/rejected": -520.8831787109375, + "loss": 0.5814, + "nll_loss": 0.5208612084388733, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.5015422701835632, + "rewards/margins": 0.14514771103858948, + "rewards/rejected": 0.35639458894729614, + "step": 688 + }, + { + "epoch": 0.5996089506843363, + "grad_norm": 161.18501660847068, + "learning_rate": 7.457816658065133e-08, + "logits/chosen": 0.31886252760887146, + "logits/rejected": 0.07821323722600937, + "logps/chosen": -555.0598754882812, + "logps/rejected": -465.5038146972656, + "loss": 0.5469, + "nll_loss": 0.4690946340560913, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.1816483438014984, + "rewards/margins": 0.35304969549179077, + "rewards/rejected": -0.17140133678913116, + "step": 690 + }, + { + "epoch": 0.6013469476428416, + "grad_norm": 150.85271265312244, + "learning_rate": 7.402615130838917e-08, + "logits/chosen": 0.4454071521759033, + "logits/rejected": 0.36207619309425354, + "logps/chosen": -468.035888671875, + "logps/rejected": -451.74267578125, + "loss": 0.5798, + "nll_loss": 0.4426875710487366, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.5169916152954102, + "rewards/margins": 0.34321069717407227, + "rewards/rejected": 0.1737809032201767, + "step": 692 + }, + { + "epoch": 0.603084944601347, + "grad_norm": 182.8319718934074, + "learning_rate": 7.347498347716624e-08, + "logits/chosen": 0.4354170858860016, + "logits/rejected": 0.30550816655158997, + "logps/chosen": -432.18499755859375, + "logps/rejected": -492.991455078125, + "loss": 0.6565, + "nll_loss": 0.4345720410346985, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3578968048095703, + "rewards/margins": 0.11001966893672943, + "rewards/rejected": 0.2478771209716797, + "step": 694 + }, + { + "epoch": 0.6048229415598523, + "grad_norm": 343.8851119586624, + "learning_rate": 7.292468106977147e-08, + "logits/chosen": 0.5165270566940308, + "logits/rejected": 0.6715250611305237, + "logps/chosen": -415.6692810058594, + "logps/rejected": -441.96710205078125, + "loss": 0.5699, + "nll_loss": 0.3966309428215027, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.22180873155593872, + "rewards/margins": 0.2036515325307846, + "rewards/rejected": 0.018157199025154114, + "step": 696 + }, + { + "epoch": 0.6065609385183576, + "grad_norm": 125.7461795731971, + "learning_rate": 7.237526204075796e-08, + "logits/chosen": 0.20688635110855103, + "logits/rejected": 0.2710256576538086, + "logps/chosen": -510.6375427246094, + "logps/rejected": -526.5493774414062, + "loss": 0.6146, + "nll_loss": 0.4774847626686096, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.3681733012199402, + "rewards/margins": 0.3653341829776764, + "rewards/rejected": 0.002839110791683197, + "step": 698 + }, + { + "epoch": 0.6082989354768629, + "grad_norm": 121.43572798576211, + "learning_rate": 7.182674431585702e-08, + "logits/chosen": 0.46929383277893066, + "logits/rejected": 0.57563716173172, + "logps/chosen": -447.2082824707031, + "logps/rejected": -463.76959228515625, + "loss": 0.571, + "nll_loss": 0.4302436113357544, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.6159959435462952, + "rewards/margins": 0.4504932761192322, + "rewards/rejected": 0.16550272703170776, + "step": 700 + }, + { + "epoch": 0.6100369324353683, + "grad_norm": 131.79731355818942, + "learning_rate": 7.127914579139337e-08, + "logits/chosen": 0.40889453887939453, + "logits/rejected": 0.3319603204727173, + "logps/chosen": -495.09344482421875, + "logps/rejected": -504.8100891113281, + "loss": 0.5684, + "nll_loss": 0.47828787565231323, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6092842817306519, + "rewards/margins": 0.6394984722137451, + "rewards/rejected": -0.030214110389351845, + "step": 702 + }, + { + "epoch": 0.6117749293938736, + "grad_norm": 155.45328672697394, + "learning_rate": 7.073248433370124e-08, + "logits/chosen": 0.49587368965148926, + "logits/rejected": 0.5953259468078613, + "logps/chosen": -512.6905517578125, + "logps/rejected": -542.0206298828125, + "loss": 0.6345, + "nll_loss": 0.5072119832038879, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.3826178312301636, + "rewards/margins": 0.1317063719034195, + "rewards/rejected": 0.25091153383255005, + "step": 704 + }, + { + "epoch": 0.6135129263523789, + "grad_norm": 205.7696417880005, + "learning_rate": 7.018677777854158e-08, + "logits/chosen": 0.6558414101600647, + "logits/rejected": 0.5301958322525024, + "logps/chosen": -487.62066650390625, + "logps/rejected": -480.21832275390625, + "loss": 0.5238, + "nll_loss": 0.46764057874679565, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.11372433602809906, + "rewards/margins": 0.3404094874858856, + "rewards/rejected": -0.22668513655662537, + "step": 706 + }, + { + "epoch": 0.6152509233108842, + "grad_norm": 226.38982186124755, + "learning_rate": 6.96420439305198e-08, + "logits/chosen": 0.2674558460712433, + "logits/rejected": 0.37489739060401917, + "logps/chosen": -514.8885498046875, + "logps/rejected": -553.451416015625, + "loss": 0.6391, + "nll_loss": 0.5216849446296692, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.7858709096908569, + "rewards/margins": 0.7879164814949036, + "rewards/rejected": -0.00204562209546566, + "step": 708 + }, + { + "epoch": 0.6169889202693896, + "grad_norm": 147.1625577529555, + "learning_rate": 6.909830056250527e-08, + "logits/chosen": 0.29546013474464417, + "logits/rejected": 0.37205877900123596, + "logps/chosen": -499.74700927734375, + "logps/rejected": -556.3563232421875, + "loss": 0.604, + "nll_loss": 0.5126159191131592, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3689521551132202, + "rewards/margins": 0.6727516055107117, + "rewards/rejected": -0.30379945039749146, + "step": 710 + }, + { + "epoch": 0.6187269172278949, + "grad_norm": 159.3173874474768, + "learning_rate": 6.85555654150512e-08, + "logits/chosen": 0.5298061370849609, + "logits/rejected": 0.4939400553703308, + "logps/chosen": -493.6993713378906, + "logps/rejected": -460.180908203125, + "loss": 0.6112, + "nll_loss": 0.4606616795063019, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.23729343712329865, + "rewards/margins": 0.29073992371559143, + "rewards/rejected": -0.053446486592292786, + "step": 712 + }, + { + "epoch": 0.6204649141864002, + "grad_norm": 139.47914568218025, + "learning_rate": 6.801385619581591e-08, + "logits/chosen": 0.3952076733112335, + "logits/rejected": 0.41778475046157837, + "logps/chosen": -437.45758056640625, + "logps/rejected": -474.8389587402344, + "loss": 0.5748, + "nll_loss": 0.4331091046333313, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5372213125228882, + "rewards/margins": 0.5590240359306335, + "rewards/rejected": -0.021802805364131927, + "step": 714 + }, + { + "epoch": 0.6222029111449054, + "grad_norm": 148.32834601314627, + "learning_rate": 6.747319057898502e-08, + "logits/chosen": 0.3858318626880646, + "logits/rejected": 0.33923962712287903, + "logps/chosen": -475.5868225097656, + "logps/rejected": -507.0733947753906, + "loss": 0.6073, + "nll_loss": 0.48082295060157776, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.5214846134185791, + "rewards/margins": 0.41246843338012695, + "rewards/rejected": 0.10901622474193573, + "step": 716 + }, + { + "epoch": 0.6239409081034109, + "grad_norm": 120.86725662420797, + "learning_rate": 6.693358620469487e-08, + "logits/chosen": 0.6276645660400391, + "logits/rejected": 0.5396835803985596, + "logps/chosen": -492.626708984375, + "logps/rejected": -497.15673828125, + "loss": 0.5857, + "nll_loss": 0.45979419350624084, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.10214034467935562, + "rewards/margins": 0.233620747923851, + "rewards/rejected": -0.1314803957939148, + "step": 718 + }, + { + "epoch": 0.6256789050619161, + "grad_norm": 153.69685391073264, + "learning_rate": 6.639506067845697e-08, + "logits/chosen": 0.4492928683757782, + "logits/rejected": 0.47506198287010193, + "logps/chosen": -513.918701171875, + "logps/rejected": -455.6263122558594, + "loss": 0.5852, + "nll_loss": 0.4903804361820221, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.22701072692871094, + "rewards/margins": 0.4086184501647949, + "rewards/rejected": -0.18160772323608398, + "step": 720 + }, + { + "epoch": 0.6274169020204214, + "grad_norm": 144.74291732818253, + "learning_rate": 6.585763157058357e-08, + "logits/chosen": 0.24743977189064026, + "logits/rejected": 0.24426200985908508, + "logps/chosen": -513.9275512695312, + "logps/rejected": -548.3668823242188, + "loss": 0.6136, + "nll_loss": 0.46442505717277527, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.46324920654296875, + "rewards/margins": 0.36793631315231323, + "rewards/rejected": 0.09531288594007492, + "step": 722 + }, + { + "epoch": 0.6291548989789267, + "grad_norm": 168.84797722833102, + "learning_rate": 6.53213164156144e-08, + "logits/chosen": 0.24420535564422607, + "logits/rejected": 0.34088706970214844, + "logps/chosen": -465.83038330078125, + "logps/rejected": -463.24041748046875, + "loss": 0.6246, + "nll_loss": 0.4556388854980469, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6123493909835815, + "rewards/margins": 0.12421969324350357, + "rewards/rejected": 0.4881296157836914, + "step": 724 + }, + { + "epoch": 0.6308928959374321, + "grad_norm": 145.95898719291733, + "learning_rate": 6.478613271174452e-08, + "logits/chosen": 0.46754446625709534, + "logits/rejected": 0.21353721618652344, + "logps/chosen": -462.1207275390625, + "logps/rejected": -503.33172607421875, + "loss": 0.5881, + "nll_loss": 0.4296815097332001, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.552665650844574, + "rewards/margins": 0.21230097115039825, + "rewards/rejected": 0.34036463499069214, + "step": 726 + }, + { + "epoch": 0.6326308928959374, + "grad_norm": 156.916802373185, + "learning_rate": 6.425209792025357e-08, + "logits/chosen": 0.34340569376945496, + "logits/rejected": 0.42395177483558655, + "logps/chosen": -422.3741455078125, + "logps/rejected": -461.89788818359375, + "loss": 0.5971, + "nll_loss": 0.4330331087112427, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.564942479133606, + "rewards/margins": 0.31727665662765503, + "rewards/rejected": 0.24766579270362854, + "step": 728 + }, + { + "epoch": 0.6343688898544427, + "grad_norm": 163.70515499675665, + "learning_rate": 6.371922946493591e-08, + "logits/chosen": 0.5420715808868408, + "logits/rejected": 0.6046915054321289, + "logps/chosen": -519.7154541015625, + "logps/rejected": -503.3428955078125, + "loss": 0.6052, + "nll_loss": 0.4930592477321625, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.45887356996536255, + "rewards/margins": 0.41354089975357056, + "rewards/rejected": 0.04533272236585617, + "step": 730 + }, + { + "epoch": 0.636106886812948, + "grad_norm": 173.00562897007376, + "learning_rate": 6.31875447315322e-08, + "logits/chosen": 0.5808181762695312, + "logits/rejected": 0.5133619904518127, + "logps/chosen": -490.7048645019531, + "logps/rejected": -418.19482421875, + "loss": 0.6095, + "nll_loss": 0.45788317918777466, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.42479199171066284, + "rewards/margins": 0.07730991393327713, + "rewards/rejected": 0.3474821150302887, + "step": 732 + }, + { + "epoch": 0.6378448837714534, + "grad_norm": 160.63823512356848, + "learning_rate": 6.26570610671622e-08, + "logits/chosen": 0.14066378772258759, + "logits/rejected": 0.2091757357120514, + "logps/chosen": -434.40643310546875, + "logps/rejected": -478.67877197265625, + "loss": 0.5721, + "nll_loss": 0.454807311296463, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.7642690539360046, + "rewards/margins": 0.6635603308677673, + "rewards/rejected": 0.10070877522230148, + "step": 734 + }, + { + "epoch": 0.6395828807299587, + "grad_norm": 264.0674555161591, + "learning_rate": 6.212779577975869e-08, + "logits/chosen": 0.3542450964450836, + "logits/rejected": 0.41409796476364136, + "logps/chosen": -478.570068359375, + "logps/rejected": -481.2341613769531, + "loss": 0.5887, + "nll_loss": 0.4426587224006653, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.5770342350006104, + "rewards/margins": 0.4736230969429016, + "rewards/rejected": 0.10341110080480576, + "step": 736 + }, + { + "epoch": 0.641320877688464, + "grad_norm": 180.89113715298822, + "learning_rate": 6.159976613750286e-08, + "logits/chosen": 0.10223013162612915, + "logits/rejected": 0.2627410292625427, + "logps/chosen": -476.6172180175781, + "logps/rejected": -510.81207275390625, + "loss": 0.5827, + "nll_loss": 0.4346134662628174, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.6227002739906311, + "rewards/margins": 0.48594045639038086, + "rewards/rejected": 0.13675986230373383, + "step": 738 + }, + { + "epoch": 0.6430588746469693, + "grad_norm": 293.2245154471447, + "learning_rate": 6.107298936826086e-08, + "logits/chosen": 0.35400518774986267, + "logits/rejected": 0.4170874357223511, + "logps/chosen": -445.3726806640625, + "logps/rejected": -474.6842346191406, + "loss": 0.6212, + "nll_loss": 0.43134331703186035, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.33985626697540283, + "rewards/margins": 0.22458821535110474, + "rewards/rejected": 0.1152680367231369, + "step": 740 + }, + { + "epoch": 0.6447968716054747, + "grad_norm": 131.0312166378181, + "learning_rate": 6.05474826590217e-08, + "logits/chosen": 0.29119187593460083, + "logits/rejected": 0.3434115946292877, + "logps/chosen": -446.6615295410156, + "logps/rejected": -455.331298828125, + "loss": 0.6125, + "nll_loss": 0.4392196238040924, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.4968715310096741, + "rewards/margins": 0.3695334494113922, + "rewards/rejected": 0.12733803689479828, + "step": 742 + }, + { + "epoch": 0.64653486856398, + "grad_norm": 162.16655463240252, + "learning_rate": 6.002326315533664e-08, + "logits/chosen": 0.3683479130268097, + "logits/rejected": 0.37617531418800354, + "logps/chosen": -577.240234375, + "logps/rejected": -548.419189453125, + "loss": 0.6629, + "nll_loss": 0.49988922476768494, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.9272751212120056, + "rewards/margins": 0.266749769449234, + "rewards/rejected": 0.6605253219604492, + "step": 744 + }, + { + "epoch": 0.6482728655224853, + "grad_norm": 195.13382275325625, + "learning_rate": 5.950034796075947e-08, + "logits/chosen": 0.45707762241363525, + "logits/rejected": 0.3717419505119324, + "logps/chosen": -519.5899047851562, + "logps/rejected": -537.0008544921875, + "loss": 0.5515, + "nll_loss": 0.4879206418991089, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.7020326256752014, + "rewards/margins": 0.2489035725593567, + "rewards/rejected": 0.45312899351119995, + "step": 746 + }, + { + "epoch": 0.6500108624809906, + "grad_norm": 130.69654593671132, + "learning_rate": 5.8978754136288835e-08, + "logits/chosen": 0.6327035427093506, + "logits/rejected": 0.49619409441947937, + "logps/chosen": -479.45111083984375, + "logps/rejected": -498.4273376464844, + "loss": 0.5984, + "nll_loss": 0.48115187883377075, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.6721794605255127, + "rewards/margins": 0.6595268249511719, + "rewards/rejected": 0.012652596458792686, + "step": 748 + }, + { + "epoch": 0.651748859439496, + "grad_norm": 207.68261302866335, + "learning_rate": 5.845849869981137e-08, + "logits/chosen": 0.3563077449798584, + "logits/rejected": 0.4740470051765442, + "logps/chosen": -421.8052978515625, + "logps/rejected": -470.5953674316406, + "loss": 0.6263, + "nll_loss": 0.41041111946105957, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.5872929096221924, + "rewards/margins": 0.30345726013183594, + "rewards/rejected": 0.28383558988571167, + "step": 750 + }, + { + "epoch": 0.6534868563980013, + "grad_norm": 124.08937279199534, + "learning_rate": 5.7939598625546516e-08, + "logits/chosen": 0.4799199104309082, + "logits/rejected": 0.43483075499534607, + "logps/chosen": -505.39410400390625, + "logps/rejected": -542.4945068359375, + "loss": 0.5873, + "nll_loss": 0.4946918785572052, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.719025731086731, + "rewards/margins": 0.553241491317749, + "rewards/rejected": 0.16578426957130432, + "step": 752 + }, + { + "epoch": 0.6552248533565066, + "grad_norm": 177.62311751052303, + "learning_rate": 5.742207084349273e-08, + "logits/chosen": 0.4672154486179352, + "logits/rejected": 0.3542248606681824, + "logps/chosen": -440.687744140625, + "logps/rejected": -423.16729736328125, + "loss": 0.6154, + "nll_loss": 0.4290880858898163, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.3816303014755249, + "rewards/margins": 0.4123232066631317, + "rewards/rejected": -0.03069286048412323, + "step": 754 + }, + { + "epoch": 0.656962850315012, + "grad_norm": 127.05979762459806, + "learning_rate": 5.690593223887512e-08, + "logits/chosen": 0.4858136773109436, + "logits/rejected": 0.4847185015678406, + "logps/chosen": -431.63433837890625, + "logps/rejected": -431.2800598144531, + "loss": 0.5734, + "nll_loss": 0.4098186492919922, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.26508915424346924, + "rewards/margins": 0.1980241984128952, + "rewards/rejected": 0.06706495583057404, + "step": 756 + }, + { + "epoch": 0.6587008472735173, + "grad_norm": 279.8735412918544, + "learning_rate": 5.6391199651594454e-08, + "logits/chosen": 0.5232665538787842, + "logits/rejected": 0.6117486357688904, + "logps/chosen": -460.03594970703125, + "logps/rejected": -461.8345947265625, + "loss": 0.7045, + "nll_loss": 0.4505896270275116, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.1962445229291916, + "rewards/margins": -0.15768270194530487, + "rewards/rejected": 0.35392722487449646, + "step": 758 + }, + { + "epoch": 0.6604388442320226, + "grad_norm": 300.2555521627166, + "learning_rate": 5.587788987567784e-08, + "logits/chosen": 0.3533879518508911, + "logits/rejected": 0.43673956394195557, + "logps/chosen": -450.2116394042969, + "logps/rejected": -511.2435302734375, + "loss": 0.6252, + "nll_loss": 0.44547367095947266, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.4659990072250366, + "rewards/margins": 0.5369496941566467, + "rewards/rejected": -0.07095059752464294, + "step": 760 + }, + { + "epoch": 0.6621768411905279, + "grad_norm": 188.81668805180584, + "learning_rate": 5.536601965873082e-08, + "logits/chosen": 0.3259649872779846, + "logits/rejected": 0.4583315849304199, + "logps/chosen": -417.64495849609375, + "logps/rejected": -486.90435791015625, + "loss": 0.6134, + "nll_loss": 0.421085387468338, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5350974798202515, + "rewards/margins": 0.56300950050354, + "rewards/rejected": -0.027912046760320663, + "step": 762 + }, + { + "epoch": 0.6639148381490333, + "grad_norm": 122.29581285365549, + "learning_rate": 5.48556057013906e-08, + "logits/chosen": 0.38365134596824646, + "logits/rejected": 0.33106014132499695, + "logps/chosen": -470.62261962890625, + "logps/rejected": -456.76239013671875, + "loss": 0.5941, + "nll_loss": 0.4603612720966339, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.9042216539382935, + "rewards/margins": 0.49135351181030273, + "rewards/rejected": 0.4128681421279907, + "step": 764 + }, + { + "epoch": 0.6656528351075386, + "grad_norm": 127.22170192937084, + "learning_rate": 5.4346664656781746e-08, + "logits/chosen": 0.1824665665626526, + "logits/rejected": 0.241348996758461, + "logps/chosen": -418.4898681640625, + "logps/rejected": -481.45306396484375, + "loss": 0.6468, + "nll_loss": 0.4060194492340088, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.5272243618965149, + "rewards/margins": 0.5564711689949036, + "rewards/rejected": -0.029246818274259567, + "step": 766 + }, + { + "epoch": 0.6673908320660439, + "grad_norm": 158.84153776593297, + "learning_rate": 5.3839213129972416e-08, + "logits/chosen": 0.6094731092453003, + "logits/rejected": 0.5983626842498779, + "logps/chosen": -471.9709777832031, + "logps/rejected": -527.959228515625, + "loss": 0.5736, + "nll_loss": 0.45386606454849243, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6506131887435913, + "rewards/margins": 0.5279781222343445, + "rewards/rejected": 0.12263509631156921, + "step": 768 + }, + { + "epoch": 0.6691288290245492, + "grad_norm": 211.77180871420688, + "learning_rate": 5.3333267677432626e-08, + "logits/chosen": 0.09929066896438599, + "logits/rejected": 0.1294572353363037, + "logps/chosen": -417.3568115234375, + "logps/rejected": -521.1983032226562, + "loss": 0.6173, + "nll_loss": 0.38731706142425537, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.46828895807266235, + "rewards/margins": 0.2709873616695404, + "rewards/rejected": 0.19730158150196075, + "step": 770 + }, + { + "epoch": 0.6708668259830546, + "grad_norm": 142.68132898222117, + "learning_rate": 5.282884480649435e-08, + "logits/chosen": 0.40837544202804565, + "logits/rejected": 0.5099937915802002, + "logps/chosen": -450.0708923339844, + "logps/rejected": -520.635986328125, + "loss": 0.5429, + "nll_loss": 0.4542206823825836, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.79877108335495, + "rewards/margins": 1.0893309116363525, + "rewards/rejected": -0.290559858083725, + "step": 772 + }, + { + "epoch": 0.6726048229415599, + "grad_norm": 177.9137771236567, + "learning_rate": 5.232596097481251e-08, + "logits/chosen": 0.7409548163414001, + "logits/rejected": 0.6623491644859314, + "logps/chosen": -542.9630126953125, + "logps/rejected": -529.6650390625, + "loss": 0.6094, + "nll_loss": 0.4900684952735901, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.3313789367675781, + "rewards/margins": 0.26346588134765625, + "rewards/rejected": 0.06791305541992188, + "step": 774 + }, + { + "epoch": 0.6743428199000652, + "grad_norm": 277.7385675447199, + "learning_rate": 5.182463258982845e-08, + "logits/chosen": 0.6070559620857239, + "logits/rejected": 0.6075460910797119, + "logps/chosen": -458.2026672363281, + "logps/rejected": -470.3449401855469, + "loss": 0.6355, + "nll_loss": 0.45458412170410156, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.22362647950649261, + "rewards/margins": 0.20613884925842285, + "rewards/rejected": 0.01748763769865036, + "step": 776 + }, + { + "epoch": 0.6760808168585705, + "grad_norm": 155.68239224526974, + "learning_rate": 5.1324876008234376e-08, + "logits/chosen": 0.4934028387069702, + "logits/rejected": 0.5032480955123901, + "logps/chosen": -477.3890380859375, + "logps/rejected": -495.6091613769531, + "loss": 0.6027, + "nll_loss": 0.4526577293872833, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3062657117843628, + "rewards/margins": 0.4403870701789856, + "rewards/rejected": -0.13412132859230042, + "step": 778 + }, + { + "epoch": 0.6778188138170759, + "grad_norm": 235.05765374401005, + "learning_rate": 5.082670753543961e-08, + "logits/chosen": 0.29999101161956787, + "logits/rejected": 0.34913432598114014, + "logps/chosen": -572.618896484375, + "logps/rejected": -585.3377075195312, + "loss": 0.5941, + "nll_loss": 0.5501593351364136, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.35720503330230713, + "rewards/margins": 0.20051269233226776, + "rewards/rejected": 0.15669232606887817, + "step": 780 + }, + { + "epoch": 0.6795568107755812, + "grad_norm": 141.4371871807222, + "learning_rate": 5.033014342503889e-08, + "logits/chosen": 0.44081294536590576, + "logits/rejected": 0.2912307679653168, + "logps/chosen": -505.85687255859375, + "logps/rejected": -521.6779174804688, + "loss": 0.5362, + "nll_loss": 0.5228245854377747, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3681758940219879, + "rewards/margins": 0.6614881157875061, + "rewards/rejected": -0.2933122515678406, + "step": 782 + }, + { + "epoch": 0.6812948077340865, + "grad_norm": 188.6420459555021, + "learning_rate": 4.983519987828176e-08, + "logits/chosen": 0.4042893350124359, + "logits/rejected": 0.2071767896413803, + "logps/chosen": -557.4622802734375, + "logps/rejected": -541.4765625, + "loss": 0.579, + "nll_loss": 0.5044111609458923, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.31110334396362305, + "rewards/margins": 0.44057944416999817, + "rewards/rejected": -0.12947607040405273, + "step": 784 + }, + { + "epoch": 0.6830328046925918, + "grad_norm": 471.248521332878, + "learning_rate": 4.934189304354418e-08, + "logits/chosen": 0.014058850705623627, + "logits/rejected": 0.23235078155994415, + "logps/chosen": -570.7021484375, + "logps/rejected": -485.6100158691406, + "loss": 0.5428, + "nll_loss": 0.5105115175247192, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4729238748550415, + "rewards/margins": 0.4060804843902588, + "rewards/rejected": 0.0668434202671051, + "step": 786 + }, + { + "epoch": 0.6847708016510972, + "grad_norm": 300.8320888757264, + "learning_rate": 4.885023901580162e-08, + "logits/chosen": 0.49230802059173584, + "logits/rejected": 0.3687041401863098, + "logps/chosen": -457.49017333984375, + "logps/rejected": -393.0660705566406, + "loss": 0.5854, + "nll_loss": 0.4349403381347656, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.3268558382987976, + "rewards/margins": 0.344466894865036, + "rewards/rejected": -0.017611034214496613, + "step": 788 + }, + { + "epoch": 0.6865087986096025, + "grad_norm": 134.60815880505996, + "learning_rate": 4.8360253836103816e-08, + "logits/chosen": 0.7422909736633301, + "logits/rejected": 0.8082624673843384, + "logps/chosen": -480.2214660644531, + "logps/rejected": -494.79583740234375, + "loss": 0.6049, + "nll_loss": 0.46801847219467163, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.17905193567276, + "rewards/margins": 0.21437975764274597, + "rewards/rejected": -0.035327814519405365, + "step": 790 + }, + { + "epoch": 0.6882467955681077, + "grad_norm": 236.02343374540018, + "learning_rate": 4.7871953491051583e-08, + "logits/chosen": 0.23191796243190765, + "logits/rejected": 0.28292810916900635, + "logps/chosen": -495.27935791015625, + "logps/rejected": -456.4753112792969, + "loss": 0.5821, + "nll_loss": 0.4527099132537842, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.267295241355896, + "rewards/margins": 0.5129619240760803, + "rewards/rejected": -0.24566669762134552, + "step": 792 + }, + { + "epoch": 0.689984792526613, + "grad_norm": 158.10216644449264, + "learning_rate": 4.7385353912275164e-08, + "logits/chosen": 0.43008583784103394, + "logits/rejected": 0.5059869289398193, + "logps/chosen": -479.2068176269531, + "logps/rejected": -489.1475830078125, + "loss": 0.6079, + "nll_loss": 0.4439893960952759, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.11055446416139603, + "rewards/margins": 0.3065342903137207, + "rewards/rejected": -0.1959797739982605, + "step": 794 + }, + { + "epoch": 0.6917227894851184, + "grad_norm": 340.4845104687888, + "learning_rate": 4.6900470975914265e-08, + "logits/chosen": 0.21972504258155823, + "logits/rejected": 0.27979522943496704, + "logps/chosen": -546.1092529296875, + "logps/rejected": -536.8622436523438, + "loss": 0.627, + "nll_loss": 0.5137719511985779, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.04987996444106102, + "rewards/margins": 0.212679922580719, + "rewards/rejected": -0.16280002892017365, + "step": 796 + }, + { + "epoch": 0.6934607864436237, + "grad_norm": 243.2800060075489, + "learning_rate": 4.641732050210031e-08, + "logits/chosen": 0.42562809586524963, + "logits/rejected": 0.3147584795951843, + "logps/chosen": -463.6742858886719, + "logps/rejected": -476.229248046875, + "loss": 0.659, + "nll_loss": 0.4361433982849121, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.34175100922584534, + "rewards/margins": 0.2627665400505066, + "rewards/rejected": 0.07898445427417755, + "step": 798 + }, + { + "epoch": 0.695198783402129, + "grad_norm": 114.92187652057805, + "learning_rate": 4.5935918254440276e-08, + "logits/chosen": 0.4270586669445038, + "logits/rejected": 0.3975781798362732, + "logps/chosen": -464.14251708984375, + "logps/rejected": -462.9993591308594, + "loss": 0.6274, + "nll_loss": 0.4429488182067871, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.21685649454593658, + "rewards/margins": 0.5437389612197876, + "rewards/rejected": -0.3268824815750122, + "step": 800 + }, + { + "epoch": 0.6969367803606343, + "grad_norm": 217.41859804183937, + "learning_rate": 4.5456279939502005e-08, + "logits/chosen": 0.19082283973693848, + "logits/rejected": 0.20748263597488403, + "logps/chosen": -423.4697265625, + "logps/rejected": -454.7181701660156, + "loss": 0.564, + "nll_loss": 0.42925646901130676, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.28616124391555786, + "rewards/margins": 0.2787553668022156, + "rewards/rejected": 0.007405860349535942, + "step": 802 + }, + { + "epoch": 0.6986747773191397, + "grad_norm": 236.65494312703478, + "learning_rate": 4.4978421206302285e-08, + "logits/chosen": 0.40193361043930054, + "logits/rejected": 0.536785364151001, + "logps/chosen": -477.677734375, + "logps/rejected": -549.7373657226562, + "loss": 0.6586, + "nll_loss": 0.4303903579711914, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.38318753242492676, + "rewards/margins": 0.38266754150390625, + "rewards/rejected": 0.0005199350416660309, + "step": 804 + }, + { + "epoch": 0.700412774277645, + "grad_norm": 161.9548130973149, + "learning_rate": 4.450235764579597e-08, + "logits/chosen": 0.22347937524318695, + "logits/rejected": 0.19105751812458038, + "logps/chosen": -544.6937866210938, + "logps/rejected": -513.24951171875, + "loss": 0.5947, + "nll_loss": 0.5026200413703918, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.39928218722343445, + "rewards/margins": 0.5983918309211731, + "rewards/rejected": -0.19910964369773865, + "step": 806 + }, + { + "epoch": 0.7021507712361503, + "grad_norm": 132.58473669179077, + "learning_rate": 4.4028104790367246e-08, + "logits/chosen": 0.4145227372646332, + "logits/rejected": 0.3501809537410736, + "logps/chosen": -469.0726318359375, + "logps/rejected": -397.6184997558594, + "loss": 0.553, + "nll_loss": 0.43358322978019714, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.21624755859375, + "rewards/margins": 0.23603084683418274, + "rewards/rejected": -0.019783303141593933, + "step": 808 + }, + { + "epoch": 0.7038887681946556, + "grad_norm": 144.276213446629, + "learning_rate": 4.35556781133231e-08, + "logits/chosen": 0.31530264019966125, + "logits/rejected": 0.5416185855865479, + "logps/chosen": -465.4701843261719, + "logps/rejected": -487.7655334472656, + "loss": 0.576, + "nll_loss": 0.4773712754249573, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.39206960797309875, + "rewards/margins": 0.3798760175704956, + "rewards/rejected": 0.012193584814667702, + "step": 810 + }, + { + "epoch": 0.705626765153161, + "grad_norm": 201.39592042775223, + "learning_rate": 4.3085093028388195e-08, + "logits/chosen": 0.6023300886154175, + "logits/rejected": 0.7180359363555908, + "logps/chosen": -445.44024658203125, + "logps/rejected": -480.2512512207031, + "loss": 0.6235, + "nll_loss": 0.44196388125419617, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.44572457671165466, + "rewards/margins": 0.4922065734863281, + "rewards/rejected": -0.04648199677467346, + "step": 812 + }, + { + "epoch": 0.7073647621116663, + "grad_norm": 130.59523911447587, + "learning_rate": 4.261636488920225e-08, + "logits/chosen": 0.4116157293319702, + "logits/rejected": 0.44143983721733093, + "logps/chosen": -465.00439453125, + "logps/rejected": -561.2508544921875, + "loss": 0.5782, + "nll_loss": 0.4379756450653076, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4334089457988739, + "rewards/margins": 0.5009133815765381, + "rewards/rejected": -0.06750450283288956, + "step": 814 + }, + { + "epoch": 0.7091027590701716, + "grad_norm": 143.79978339467263, + "learning_rate": 4.2149508988818916e-08, + "logits/chosen": 0.130857452750206, + "logits/rejected": 0.16600145399570465, + "logps/chosen": -519.40234375, + "logps/rejected": -530.9063720703125, + "loss": 0.5812, + "nll_loss": 0.49456048011779785, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.5553606748580933, + "rewards/margins": 0.29760900139808655, + "rewards/rejected": 0.2577517032623291, + "step": 816 + }, + { + "epoch": 0.7108407560286769, + "grad_norm": 121.04385603346864, + "learning_rate": 4.16845405592068e-08, + "logits/chosen": 0.2572818100452423, + "logits/rejected": 0.22149065136909485, + "logps/chosen": -507.42364501953125, + "logps/rejected": -510.28228759765625, + "loss": 0.6038, + "nll_loss": 0.4678362011909485, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.5002222061157227, + "rewards/margins": 0.47260046005249023, + "rewards/rejected": 0.027621760964393616, + "step": 818 + }, + { + "epoch": 0.7125787529871823, + "grad_norm": 172.5337885064375, + "learning_rate": 4.1221474770752695e-08, + "logits/chosen": 0.44652074575424194, + "logits/rejected": 0.506121814250946, + "logps/chosen": -452.7835998535156, + "logps/rejected": -468.171630859375, + "loss": 0.5583, + "nll_loss": 0.4320453405380249, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.23992222547531128, + "rewards/margins": 0.2754116952419281, + "rewards/rejected": -0.03548946604132652, + "step": 820 + }, + { + "epoch": 0.7143167499456876, + "grad_norm": 149.04453299710758, + "learning_rate": 4.076032673176637e-08, + "logits/chosen": 0.12217839062213898, + "logits/rejected": 0.15243466198444366, + "logps/chosen": -455.9956359863281, + "logps/rejected": -495.3397216796875, + "loss": 0.5185, + "nll_loss": 0.4414156675338745, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.5716139078140259, + "rewards/margins": 0.510877788066864, + "rewards/rejected": 0.060736075043678284, + "step": 822 + }, + { + "epoch": 0.7160547469041929, + "grad_norm": 176.89851298361629, + "learning_rate": 4.030111148798775e-08, + "logits/chosen": 0.48356354236602783, + "logits/rejected": 0.7054731249809265, + "logps/chosen": -448.03448486328125, + "logps/rejected": -508.13665771484375, + "loss": 0.5923, + "nll_loss": 0.44542497396469116, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.30406779050827026, + "rewards/margins": 0.30130958557128906, + "rewards/rejected": 0.002758212387561798, + "step": 824 + }, + { + "epoch": 0.7177927438626982, + "grad_norm": 146.8343812764771, + "learning_rate": 3.984384402209613e-08, + "logits/chosen": 0.4624355435371399, + "logits/rejected": 0.5660164952278137, + "logps/chosen": -412.0571594238281, + "logps/rejected": -487.960693359375, + "loss": 0.6451, + "nll_loss": 0.4038446247577667, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3133462071418762, + "rewards/margins": 0.08846140652894974, + "rewards/rejected": 0.22488482296466827, + "step": 826 + }, + { + "epoch": 0.7195307408212036, + "grad_norm": 111.15354720327095, + "learning_rate": 3.938853925322117e-08, + "logits/chosen": 0.6324234008789062, + "logits/rejected": 0.6697845458984375, + "logps/chosen": -496.340576171875, + "logps/rejected": -503.02191162109375, + "loss": 0.5639, + "nll_loss": 0.48350197076797485, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.5550703406333923, + "rewards/margins": 0.5358677506446838, + "rewards/rejected": 0.019202616065740585, + "step": 828 + }, + { + "epoch": 0.7212687377797089, + "grad_norm": 163.63438307124966, + "learning_rate": 3.893521203645618e-08, + "logits/chosen": 0.42804092168807983, + "logits/rejected": 0.4747631549835205, + "logps/chosen": -456.4293518066406, + "logps/rejected": -470.9934997558594, + "loss": 0.6167, + "nll_loss": 0.4335935711860657, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.34340953826904297, + "rewards/margins": 0.13007965683937073, + "rewards/rejected": 0.21332989633083344, + "step": 830 + }, + { + "epoch": 0.7230067347382142, + "grad_norm": 131.0966963549401, + "learning_rate": 3.848387716237352e-08, + "logits/chosen": 0.271542489528656, + "logits/rejected": 0.29016950726509094, + "logps/chosen": -370.72216796875, + "logps/rejected": -441.3465881347656, + "loss": 0.5434, + "nll_loss": 0.3498722314834595, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.21746429800987244, + "rewards/margins": 0.160241037607193, + "rewards/rejected": 0.057223230600357056, + "step": 832 + }, + { + "epoch": 0.7247447316967195, + "grad_norm": 142.61239618535805, + "learning_rate": 3.803454935654189e-08, + "logits/chosen": 0.4528628885746002, + "logits/rejected": 0.4226664900779724, + "logps/chosen": -575.0054321289062, + "logps/rejected": -570.5166625976562, + "loss": 0.6139, + "nll_loss": 0.5149663090705872, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.36920759081840515, + "rewards/margins": 0.13114717602729797, + "rewards/rejected": 0.2380603700876236, + "step": 834 + }, + { + "epoch": 0.7264827286552249, + "grad_norm": 147.70343775287222, + "learning_rate": 3.758724327904606e-08, + "logits/chosen": 0.3783852756023407, + "logits/rejected": 0.4000176191329956, + "logps/chosen": -484.54962158203125, + "logps/rejected": -538.7274169921875, + "loss": 0.5264, + "nll_loss": 0.47790783643722534, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.26448196172714233, + "rewards/margins": 0.39010438323020935, + "rewards/rejected": -0.1256224662065506, + "step": 836 + }, + { + "epoch": 0.7282207256137302, + "grad_norm": 159.27478507683435, + "learning_rate": 3.7141973524008486e-08, + "logits/chosen": 0.5152249932289124, + "logits/rejected": 0.44237661361694336, + "logps/chosen": -493.9826354980469, + "logps/rejected": -461.91424560546875, + "loss": 0.6011, + "nll_loss": 0.4731515944004059, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.03008108027279377, + "rewards/margins": 0.2286914736032486, + "rewards/rejected": -0.19861041009426117, + "step": 838 + }, + { + "epoch": 0.7299587225722355, + "grad_norm": 160.37372417445502, + "learning_rate": 3.669875461911297e-08, + "logits/chosen": 0.38272225856781006, + "logits/rejected": 0.32334208488464355, + "logps/chosen": -453.45343017578125, + "logps/rejected": -481.0965576171875, + "loss": 0.591, + "nll_loss": 0.46896225214004517, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.47221946716308594, + "rewards/margins": 0.4862200915813446, + "rewards/rejected": -0.014000609517097473, + "step": 840 + }, + { + "epoch": 0.7316967195307408, + "grad_norm": 106.48540922978027, + "learning_rate": 3.6257601025131026e-08, + "logits/chosen": 0.36537909507751465, + "logits/rejected": 0.4610544741153717, + "logps/chosen": -495.3701171875, + "logps/rejected": -536.6589965820312, + "loss": 0.5434, + "nll_loss": 0.49130648374557495, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.46169281005859375, + "rewards/margins": 0.5839080214500427, + "rewards/rejected": -0.12221517413854599, + "step": 842 + }, + { + "epoch": 0.7334347164892462, + "grad_norm": 184.64521993366662, + "learning_rate": 3.581852713544983e-08, + "logits/chosen": 0.6350924372673035, + "logits/rejected": 0.6400173902511597, + "logps/chosen": -572.7459716796875, + "logps/rejected": -505.0904235839844, + "loss": 0.6132, + "nll_loss": 0.5641618967056274, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4518885910511017, + "rewards/margins": 0.26884880661964417, + "rewards/rejected": 0.18303976953029633, + "step": 844 + }, + { + "epoch": 0.7351727134477515, + "grad_norm": 174.76319754971257, + "learning_rate": 3.538154727560259e-08, + "logits/chosen": 0.2232353538274765, + "logits/rejected": 0.25405293703079224, + "logps/chosen": -457.76251220703125, + "logps/rejected": -487.740234375, + "loss": 0.5885, + "nll_loss": 0.44223538041114807, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3765263557434082, + "rewards/margins": 0.2363949865102768, + "rewards/rejected": 0.1401313841342926, + "step": 846 + }, + { + "epoch": 0.7369107104062568, + "grad_norm": 157.0768940202763, + "learning_rate": 3.494667570280132e-08, + "logits/chosen": -0.08307046443223953, + "logits/rejected": 0.09646856039762497, + "logps/chosen": -433.7715759277344, + "logps/rejected": -426.2318115234375, + "loss": 0.5631, + "nll_loss": 0.42717698216438293, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3394044041633606, + "rewards/margins": 0.4069460332393646, + "rewards/rejected": -0.06754161417484283, + "step": 848 + }, + { + "epoch": 0.7386487073647621, + "grad_norm": 137.81378973300215, + "learning_rate": 3.45139266054715e-08, + "logits/chosen": 0.35806044936180115, + "logits/rejected": 0.3277757167816162, + "logps/chosen": -560.1826171875, + "logps/rejected": -511.9388427734375, + "loss": 0.5852, + "nll_loss": 0.5097828507423401, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.41566944122314453, + "rewards/margins": 0.37488117814064026, + "rewards/rejected": 0.04078827053308487, + "step": 850 + }, + { + "epoch": 0.7403867043232675, + "grad_norm": 119.84840717683835, + "learning_rate": 3.4083314102789284e-08, + "logits/chosen": 0.35003048181533813, + "logits/rejected": 0.371743768453598, + "logps/chosen": -479.0580749511719, + "logps/rejected": -506.8682861328125, + "loss": 0.5487, + "nll_loss": 0.4575774371623993, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3738093376159668, + "rewards/margins": 0.49029579758644104, + "rewards/rejected": -0.11648646742105484, + "step": 852 + }, + { + "epoch": 0.7421247012817728, + "grad_norm": 182.48903338659457, + "learning_rate": 3.365485224422082e-08, + "logits/chosen": 0.18598990142345428, + "logits/rejected": 0.40627986192703247, + "logps/chosen": -486.7777099609375, + "logps/rejected": -535.242919921875, + "loss": 0.6485, + "nll_loss": 0.48537081480026245, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.236476331949234, + "rewards/margins": 0.49137353897094727, + "rewards/rejected": -0.25489723682403564, + "step": 854 + }, + { + "epoch": 0.7438626982402781, + "grad_norm": 148.23074057937546, + "learning_rate": 3.322855500906373e-08, + "logits/chosen": 0.30834490060806274, + "logits/rejected": 0.46750015020370483, + "logps/chosen": -521.8851928710938, + "logps/rejected": -524.574462890625, + "loss": 0.5825, + "nll_loss": 0.4600994884967804, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.4887966215610504, + "rewards/margins": 0.13246317207813263, + "rewards/rejected": 0.3563334345817566, + "step": 856 + }, + { + "epoch": 0.7456006951987834, + "grad_norm": 121.1714678839193, + "learning_rate": 3.2804436305991215e-08, + "logits/chosen": 0.4985821843147278, + "logits/rejected": 0.4725096821784973, + "logps/chosen": -417.9498291015625, + "logps/rejected": -445.34869384765625, + "loss": 0.5765, + "nll_loss": 0.427943617105484, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.40432649850845337, + "rewards/margins": 0.29832470417022705, + "rewards/rejected": 0.10600186139345169, + "step": 858 + }, + { + "epoch": 0.7473386921572888, + "grad_norm": 146.2456732980509, + "learning_rate": 3.238250997259808e-08, + "logits/chosen": 0.5070160627365112, + "logits/rejected": 0.5410177707672119, + "logps/chosen": -468.70452880859375, + "logps/rejected": -484.58331298828125, + "loss": 0.5232, + "nll_loss": 0.4641054570674896, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.548501193523407, + "rewards/margins": 0.87884122133255, + "rewards/rejected": -0.33034002780914307, + "step": 860 + }, + { + "epoch": 0.7490766891157941, + "grad_norm": 243.85595162080867, + "learning_rate": 3.196278977494934e-08, + "logits/chosen": 0.47585529088974, + "logits/rejected": 0.4665442407131195, + "logps/chosen": -477.505126953125, + "logps/rejected": -466.41522216796875, + "loss": 0.5864, + "nll_loss": 0.4726523756980896, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.31335777044296265, + "rewards/margins": 0.23028318583965302, + "rewards/rejected": 0.08307457715272903, + "step": 862 + }, + { + "epoch": 0.7508146860742994, + "grad_norm": 136.52590152347898, + "learning_rate": 3.154528940713113e-08, + "logits/chosen": 0.38702118396759033, + "logits/rejected": 0.31475353240966797, + "logps/chosen": -579.3184814453125, + "logps/rejected": -510.0700378417969, + "loss": 0.543, + "nll_loss": 0.5211628079414368, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.3748409152030945, + "rewards/margins": 0.3414745032787323, + "rewards/rejected": 0.033366404473781586, + "step": 864 + }, + { + "epoch": 0.7525526830328046, + "grad_norm": 215.66091059214304, + "learning_rate": 3.113002249080385e-08, + "logits/chosen": 0.5306642055511475, + "logits/rejected": 0.6336250901222229, + "logps/chosen": -506.5212707519531, + "logps/rejected": -468.5569152832031, + "loss": 0.5612, + "nll_loss": 0.49830642342567444, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5931942462921143, + "rewards/margins": 0.5846770405769348, + "rewards/rejected": 0.008517175912857056, + "step": 866 + }, + { + "epoch": 0.75429067999131, + "grad_norm": 192.10728454750495, + "learning_rate": 3.071700257475768e-08, + "logits/chosen": 0.28933534026145935, + "logits/rejected": 0.4013511538505554, + "logps/chosen": -544.5047607421875, + "logps/rejected": -606.4002685546875, + "loss": 0.589, + "nll_loss": 0.5018710494041443, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.44215142726898193, + "rewards/margins": 0.5153141617774963, + "rewards/rejected": -0.07316265255212784, + "step": 868 + }, + { + "epoch": 0.7560286769498153, + "grad_norm": 124.3982879280996, + "learning_rate": 3.0306243134470664e-08, + "logits/chosen": 0.40043580532073975, + "logits/rejected": 0.3937510848045349, + "logps/chosen": -486.595947265625, + "logps/rejected": -502.1639099121094, + "loss": 0.5233, + "nll_loss": 0.4753631353378296, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.5435855388641357, + "rewards/margins": 0.7956272959709167, + "rewards/rejected": -0.25204169750213623, + "step": 870 + }, + { + "epoch": 0.7577666739083206, + "grad_norm": 128.19135238725113, + "learning_rate": 2.98977575716689e-08, + "logits/chosen": 0.48589569330215454, + "logits/rejected": 0.5582183003425598, + "logps/chosen": -542.2434692382812, + "logps/rejected": -537.6492919921875, + "loss": 0.5619, + "nll_loss": 0.536037027835846, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.1533937007188797, + "rewards/margins": 0.3188681900501251, + "rewards/rejected": -0.16547450423240662, + "step": 872 + }, + { + "epoch": 0.7595046708668259, + "grad_norm": 151.61893281292282, + "learning_rate": 2.9491559213889427e-08, + "logits/chosen": 0.36808544397354126, + "logits/rejected": 0.4860031306743622, + "logps/chosen": -485.0714111328125, + "logps/rejected": -491.4736328125, + "loss": 0.5109, + "nll_loss": 0.4829983711242676, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5121357440948486, + "rewards/margins": 0.62371826171875, + "rewards/rejected": -0.11158255487680435, + "step": 874 + }, + { + "epoch": 0.7612426678253313, + "grad_norm": 127.68226210602406, + "learning_rate": 2.9087661314045363e-08, + "logits/chosen": 0.6009732484817505, + "logits/rejected": 0.6443039774894714, + "logps/chosen": -533.9920654296875, + "logps/rejected": -519.9286499023438, + "loss": 0.5595, + "nll_loss": 0.5222257971763611, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.4094102680683136, + "rewards/margins": 0.5059654712677002, + "rewards/rejected": -0.09655513614416122, + "step": 876 + }, + { + "epoch": 0.7629806647838366, + "grad_norm": 110.78779283676273, + "learning_rate": 2.8686077049993285e-08, + "logits/chosen": 0.33771321177482605, + "logits/rejected": 0.21091528236865997, + "logps/chosen": -484.62060546875, + "logps/rejected": -501.81146240234375, + "loss": 0.5437, + "nll_loss": 0.4520571529865265, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4446027874946594, + "rewards/margins": 0.4708292484283447, + "rewards/rejected": -0.02622641623020172, + "step": 878 + }, + { + "epoch": 0.7647186617423419, + "grad_norm": 211.63665958435504, + "learning_rate": 2.8286819524103657e-08, + "logits/chosen": -0.004198629409074783, + "logits/rejected": 0.06533518433570862, + "logps/chosen": -491.1811218261719, + "logps/rejected": -475.9295349121094, + "loss": 0.544, + "nll_loss": 0.4399957060813904, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.7543421983718872, + "rewards/margins": 0.9796266555786133, + "rewards/rejected": -0.22528459131717682, + "step": 880 + }, + { + "epoch": 0.7664566587008472, + "grad_norm": 126.89748945729015, + "learning_rate": 2.788990176283308e-08, + "logits/chosen": 0.22177600860595703, + "logits/rejected": 0.21223066747188568, + "logps/chosen": -427.07330322265625, + "logps/rejected": -480.98004150390625, + "loss": 0.5663, + "nll_loss": 0.41825437545776367, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.5697661638259888, + "rewards/margins": 0.44856926798820496, + "rewards/rejected": 0.12119686603546143, + "step": 882 + }, + { + "epoch": 0.7681946556593526, + "grad_norm": 135.4485548010094, + "learning_rate": 2.749533671629931e-08, + "logits/chosen": 0.26252371072769165, + "logits/rejected": 0.3525621294975281, + "logps/chosen": -411.34228515625, + "logps/rejected": -412.50360107421875, + "loss": 0.5851, + "nll_loss": 0.4299415349960327, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5434117317199707, + "rewards/margins": 0.4031442403793335, + "rewards/rejected": 0.1402675211429596, + "step": 884 + }, + { + "epoch": 0.7699326526178579, + "grad_norm": 143.65403593274095, + "learning_rate": 2.7103137257858867e-08, + "logits/chosen": 0.29264941811561584, + "logits/rejected": 0.38839760422706604, + "logps/chosen": -497.6834411621094, + "logps/rejected": -466.5719909667969, + "loss": 0.5404, + "nll_loss": 0.470731258392334, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.546400785446167, + "rewards/margins": 0.5197809338569641, + "rewards/rejected": 0.026619907468557358, + "step": 886 + }, + { + "epoch": 0.7716706495763632, + "grad_norm": 241.2602615833436, + "learning_rate": 2.6713316183686818e-08, + "logits/chosen": 0.25704315304756165, + "logits/rejected": 0.3444461524486542, + "logps/chosen": -430.9483642578125, + "logps/rejected": -454.0907287597656, + "loss": 0.618, + "nll_loss": 0.40751364827156067, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.5406338572502136, + "rewards/margins": 0.3775385022163391, + "rewards/rejected": 0.1630953848361969, + "step": 888 + }, + { + "epoch": 0.7734086465348685, + "grad_norm": 136.35912359364139, + "learning_rate": 2.6325886212359495e-08, + "logits/chosen": 0.20821627974510193, + "logits/rejected": 0.455917090177536, + "logps/chosen": -482.72119140625, + "logps/rejected": -532.3282470703125, + "loss": 0.6048, + "nll_loss": 0.47263607382774353, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4809028208255768, + "rewards/margins": 0.5942395329475403, + "rewards/rejected": -0.11333665251731873, + "step": 890 + }, + { + "epoch": 0.7751466434933739, + "grad_norm": 204.0859099203338, + "learning_rate": 2.594085998443942e-08, + "logits/chosen": 0.19795550405979156, + "logits/rejected": 0.26651012897491455, + "logps/chosen": -402.7255554199219, + "logps/rejected": -369.5584411621094, + "loss": 0.5931, + "nll_loss": 0.3987264037132263, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.39684203267097473, + "rewards/margins": 0.32805705070495605, + "rewards/rejected": 0.06878500431776047, + "step": 892 + }, + { + "epoch": 0.7768846404518792, + "grad_norm": 298.79923705100447, + "learning_rate": 2.5558250062062825e-08, + "logits/chosen": 0.37592050433158875, + "logits/rejected": 0.4358353912830353, + "logps/chosen": -431.3149108886719, + "logps/rejected": -449.6175537109375, + "loss": 0.5963, + "nll_loss": 0.4203760623931885, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.22639721632003784, + "rewards/margins": 0.13411150872707367, + "rewards/rejected": 0.09228573739528656, + "step": 894 + }, + { + "epoch": 0.7786226374103845, + "grad_norm": 141.8941048183387, + "learning_rate": 2.5178068928529862e-08, + "logits/chosen": 0.4105270206928253, + "logits/rejected": 0.3826986253261566, + "logps/chosen": -477.2858581542969, + "logps/rejected": -447.1794128417969, + "loss": 0.5449, + "nll_loss": 0.4311331510543823, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.37648773193359375, + "rewards/margins": 0.42232033610343933, + "rewards/rejected": -0.04583262652158737, + "step": 896 + }, + { + "epoch": 0.7803606343688898, + "grad_norm": 112.16343660566953, + "learning_rate": 2.4800328987897424e-08, + "logits/chosen": 0.30149951577186584, + "logits/rejected": 0.30680081248283386, + "logps/chosen": -499.3365478515625, + "logps/rejected": -523.8584594726562, + "loss": 0.5376, + "nll_loss": 0.4783164858818054, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.4257345199584961, + "rewards/margins": 0.6091545820236206, + "rewards/rejected": -0.1834200918674469, + "step": 898 + }, + { + "epoch": 0.7820986313273952, + "grad_norm": 159.67498326737996, + "learning_rate": 2.4425042564574183e-08, + "logits/chosen": 0.5600635409355164, + "logits/rejected": 0.5486436486244202, + "logps/chosen": -550.2770385742188, + "logps/rejected": -540.26708984375, + "loss": 0.5726, + "nll_loss": 0.5308613777160645, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.30320072174072266, + "rewards/margins": 0.46356379985809326, + "rewards/rejected": -0.1603630930185318, + "step": 900 + }, + { + "epoch": 0.7838366282859005, + "grad_norm": 145.85928653023169, + "learning_rate": 2.4052221902918722e-08, + "logits/chosen": 0.2439267635345459, + "logits/rejected": 0.27589860558509827, + "logps/chosen": -450.7904968261719, + "logps/rejected": -479.41851806640625, + "loss": 0.5508, + "nll_loss": 0.4363137185573578, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.478568971157074, + "rewards/margins": 0.4524759352207184, + "rewards/rejected": 0.026093004271388054, + "step": 902 + }, + { + "epoch": 0.7855746252444058, + "grad_norm": 142.8997584918822, + "learning_rate": 2.3681879166839968e-08, + "logits/chosen": 0.29706910252571106, + "logits/rejected": 0.4273567497730255, + "logps/chosen": -420.8601379394531, + "logps/rejected": -473.61578369140625, + "loss": 0.5634, + "nll_loss": 0.4160003364086151, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.29417192935943604, + "rewards/margins": 0.25704747438430786, + "rewards/rejected": 0.03712444752454758, + "step": 904 + }, + { + "epoch": 0.7873126222029111, + "grad_norm": 135.7032714707856, + "learning_rate": 2.3314026439400215e-08, + "logits/chosen": 0.42345553636550903, + "logits/rejected": 0.11951038241386414, + "logps/chosen": -567.097412109375, + "logps/rejected": -528.78857421875, + "loss": 0.6175, + "nll_loss": 0.5222797393798828, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2827926278114319, + "rewards/margins": 0.14894521236419678, + "rewards/rejected": 0.1338474154472351, + "step": 906 + }, + { + "epoch": 0.7890506191614165, + "grad_norm": 191.37433067018546, + "learning_rate": 2.2948675722421085e-08, + "logits/chosen": 0.38801103830337524, + "logits/rejected": 0.19866381585597992, + "logps/chosen": -453.0460205078125, + "logps/rejected": -493.8358459472656, + "loss": 0.539, + "nll_loss": 0.45135483145713806, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.5596899390220642, + "rewards/margins": 0.6123340725898743, + "rewards/rejected": -0.05264415964484215, + "step": 908 + }, + { + "epoch": 0.7907886161199218, + "grad_norm": 152.5312135549706, + "learning_rate": 2.258583893609175e-08, + "logits/chosen": 0.3808496594429016, + "logits/rejected": 0.5132696032524109, + "logps/chosen": -510.3198547363281, + "logps/rejected": -533.71923828125, + "loss": 0.561, + "nll_loss": 0.48955419659614563, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.43191957473754883, + "rewards/margins": 0.5486711263656616, + "rewards/rejected": -0.11675149202346802, + "step": 910 + }, + { + "epoch": 0.7925266130784271, + "grad_norm": 286.1777750377432, + "learning_rate": 2.22255279185802e-08, + "logits/chosen": 0.38537973165512085, + "logits/rejected": 0.5917662382125854, + "logps/chosen": -465.35479736328125, + "logps/rejected": -465.9227294921875, + "loss": 0.607, + "nll_loss": 0.44577518105506897, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3478468060493469, + "rewards/margins": 0.02361936867237091, + "rewards/rejected": 0.3242274224758148, + "step": 912 + }, + { + "epoch": 0.7942646100369324, + "grad_norm": 197.55059622588414, + "learning_rate": 2.1867754425646922e-08, + "logits/chosen": 0.34537068009376526, + "logits/rejected": 0.22164693474769592, + "logps/chosen": -520.771728515625, + "logps/rejected": -554.5394897460938, + "loss": 0.5538, + "nll_loss": 0.5084543228149414, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.501037061214447, + "rewards/margins": 0.5809021592140198, + "rewards/rejected": -0.07986507564783096, + "step": 914 + }, + { + "epoch": 0.7960026069954378, + "grad_norm": 132.4136998604499, + "learning_rate": 2.1512530130261208e-08, + "logits/chosen": 0.2082512080669403, + "logits/rejected": 0.25798577070236206, + "logps/chosen": -421.710693359375, + "logps/rejected": -446.8322448730469, + "loss": 0.5382, + "nll_loss": 0.41912373900413513, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.8833374381065369, + "rewards/margins": 0.6493025422096252, + "rewards/rejected": 0.23403486609458923, + "step": 916 + }, + { + "epoch": 0.7977406039539431, + "grad_norm": 163.69318648505143, + "learning_rate": 2.115986662222058e-08, + "logits/chosen": 0.6591068506240845, + "logits/rejected": 0.4656745195388794, + "logps/chosen": -488.3570251464844, + "logps/rejected": -459.34912109375, + "loss": 0.5925, + "nll_loss": 0.46167752146720886, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.23973506689071655, + "rewards/margins": 0.22639313340187073, + "rewards/rejected": 0.013341886922717094, + "step": 918 + }, + { + "epoch": 0.7994786009124484, + "grad_norm": 125.35380400867658, + "learning_rate": 2.08097754077725e-08, + "logits/chosen": 0.26784810423851013, + "logits/rejected": 0.31632742285728455, + "logps/chosen": -488.3423156738281, + "logps/rejected": -538.6160888671875, + "loss": 0.5601, + "nll_loss": 0.4677424132823944, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5197944641113281, + "rewards/margins": 0.5179188251495361, + "rewards/rejected": 0.0018756985664367676, + "step": 920 + }, + { + "epoch": 0.8012165978709537, + "grad_norm": 122.65155967179511, + "learning_rate": 2.0462267909238895e-08, + "logits/chosen": 0.4603986144065857, + "logits/rejected": 0.4427664279937744, + "logps/chosen": -501.7173767089844, + "logps/rejected": -440.8707580566406, + "loss": 0.5563, + "nll_loss": 0.4742586314678192, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3994339108467102, + "rewards/margins": 0.303816020488739, + "rewards/rejected": 0.0956178605556488, + "step": 922 + }, + { + "epoch": 0.8029545948294591, + "grad_norm": 181.17211501585948, + "learning_rate": 2.0117355464643647e-08, + "logits/chosen": 0.2570352554321289, + "logits/rejected": 0.364309161901474, + "logps/chosen": -520.6018676757812, + "logps/rejected": -540.8889770507812, + "loss": 0.5815, + "nll_loss": 0.473418653011322, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.8413310647010803, + "rewards/margins": 0.6333141326904297, + "rewards/rejected": 0.20801697671413422, + "step": 924 + }, + { + "epoch": 0.8046925917879644, + "grad_norm": 148.36579795161384, + "learning_rate": 1.9775049327342486e-08, + "logits/chosen": 0.3666941225528717, + "logits/rejected": 0.26878297328948975, + "logps/chosen": -418.0342102050781, + "logps/rejected": -489.3192443847656, + "loss": 0.6069, + "nll_loss": 0.40862399339675903, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.539859414100647, + "rewards/margins": 0.32572096586227417, + "rewards/rejected": 0.2141384333372116, + "step": 926 + }, + { + "epoch": 0.8064305887464697, + "grad_norm": 233.14469000909418, + "learning_rate": 1.9435360665656033e-08, + "logits/chosen": 0.26835137605667114, + "logits/rejected": 0.22353360056877136, + "logps/chosen": -529.213134765625, + "logps/rejected": -506.2857360839844, + "loss": 0.5749, + "nll_loss": 0.5105596780776978, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.7156704664230347, + "rewards/margins": 0.6623274087905884, + "rewards/rejected": 0.053343020379543304, + "step": 928 + }, + { + "epoch": 0.808168585704975, + "grad_norm": 177.62086603981126, + "learning_rate": 1.9098300562505266e-08, + "logits/chosen": 0.5697555541992188, + "logits/rejected": 0.5495641827583313, + "logps/chosen": -498.977783203125, + "logps/rejected": -495.72662353515625, + "loss": 0.6068, + "nll_loss": 0.5025020241737366, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.5968002080917358, + "rewards/margins": 0.5186977982521057, + "rewards/rejected": 0.07810239493846893, + "step": 930 + }, + { + "epoch": 0.8099065826634804, + "grad_norm": 147.90397895312702, + "learning_rate": 1.876388001504995e-08, + "logits/chosen": 0.6278223395347595, + "logits/rejected": 0.6403146386146545, + "logps/chosen": -466.76763916015625, + "logps/rejected": -463.7005310058594, + "loss": 0.5613, + "nll_loss": 0.4527406096458435, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.2836538255214691, + "rewards/margins": 0.3506113886833191, + "rewards/rejected": -0.06695757806301117, + "step": 932 + }, + { + "epoch": 0.8116445796219857, + "grad_norm": 154.72657331184226, + "learning_rate": 1.843210993432983e-08, + "logits/chosen": 0.4768508970737457, + "logits/rejected": 0.4194488823413849, + "logps/chosen": -484.4299621582031, + "logps/rejected": -480.65234375, + "loss": 0.5582, + "nll_loss": 0.49351003766059875, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.5547365546226501, + "rewards/margins": 0.6049469113349915, + "rewards/rejected": -0.0502103790640831, + "step": 934 + }, + { + "epoch": 0.813382576580491, + "grad_norm": 187.5308682243756, + "learning_rate": 1.8103001144908746e-08, + "logits/chosen": 0.406520813703537, + "logits/rejected": 0.56972736120224, + "logps/chosen": -411.3998107910156, + "logps/rejected": -491.790771484375, + "loss": 0.6176, + "nll_loss": 0.4119413495063782, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6432211399078369, + "rewards/margins": 0.5553202629089355, + "rewards/rejected": 0.08790083229541779, + "step": 936 + }, + { + "epoch": 0.8151205735389963, + "grad_norm": 904.3954964527115, + "learning_rate": 1.7776564384521288e-08, + "logits/chosen": 0.6471278667449951, + "logits/rejected": 0.6938129663467407, + "logps/chosen": -505.6937255859375, + "logps/rejected": -459.9547119140625, + "loss": 0.5964, + "nll_loss": 0.508669912815094, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.6563972234725952, + "rewards/margins": 0.5751644968986511, + "rewards/rejected": 0.08123274147510529, + "step": 938 + }, + { + "epoch": 0.8168585704975017, + "grad_norm": 153.79334054038893, + "learning_rate": 1.74528103037226e-08, + "logits/chosen": 0.3130883276462555, + "logits/rejected": 0.46651285886764526, + "logps/chosen": -411.9516296386719, + "logps/rejected": -480.91595458984375, + "loss": 0.5442, + "nll_loss": 0.40169090032577515, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.8183580636978149, + "rewards/margins": 0.6252126097679138, + "rewards/rejected": 0.19314545392990112, + "step": 940 + }, + { + "epoch": 0.818596567456007, + "grad_norm": 242.12651784061373, + "learning_rate": 1.7131749465540855e-08, + "logits/chosen": 0.7213436365127563, + "logits/rejected": 0.5559303760528564, + "logps/chosen": -505.56903076171875, + "logps/rejected": -454.2501220703125, + "loss": 0.6202, + "nll_loss": 0.49371591210365295, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6225160360336304, + "rewards/margins": 0.43361109495162964, + "rewards/rejected": 0.18890495598316193, + "step": 942 + }, + { + "epoch": 0.8203345644145122, + "grad_norm": 128.95760203041075, + "learning_rate": 1.6813392345132517e-08, + "logits/chosen": 0.507114589214325, + "logits/rejected": 0.4971063435077667, + "logps/chosen": -468.1333923339844, + "logps/rejected": -486.2199401855469, + "loss": 0.5707, + "nll_loss": 0.44967854022979736, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4457395672798157, + "rewards/margins": 0.386365681886673, + "rewards/rejected": 0.05937386304140091, + "step": 944 + }, + { + "epoch": 0.8220725613730177, + "grad_norm": 147.48992803946854, + "learning_rate": 1.6497749329440745e-08, + "logits/chosen": 0.3423297703266144, + "logits/rejected": 0.4964262545108795, + "logps/chosen": -421.44964599609375, + "logps/rejected": -438.1751403808594, + "loss": 0.614, + "nll_loss": 0.4068484604358673, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.5428711175918579, + "rewards/margins": 0.02361917495727539, + "rewards/rejected": 0.5192519426345825, + "step": 946 + }, + { + "epoch": 0.823810558331523, + "grad_norm": 158.19990079831206, + "learning_rate": 1.6184830716856346e-08, + "logits/chosen": 0.24282848834991455, + "logits/rejected": 0.3862345218658447, + "logps/chosen": -417.93035888671875, + "logps/rejected": -480.74530029296875, + "loss": 0.5395, + "nll_loss": 0.4029342532157898, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.6654102206230164, + "rewards/margins": 0.5100387334823608, + "rewards/rejected": 0.15537136793136597, + "step": 948 + }, + { + "epoch": 0.8255485552900282, + "grad_norm": 205.78174442372523, + "learning_rate": 1.5874646716881866e-08, + "logits/chosen": 0.36575302481651306, + "logits/rejected": 0.3649991750717163, + "logps/chosen": -526.206298828125, + "logps/rejected": -542.56494140625, + "loss": 0.6185, + "nll_loss": 0.48127463459968567, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5322883129119873, + "rewards/margins": 0.5123573541641235, + "rewards/rejected": 0.019931048154830933, + "step": 950 + }, + { + "epoch": 0.8272865522485335, + "grad_norm": 171.145227632994, + "learning_rate": 1.5567207449798513e-08, + "logits/chosen": 0.5767173767089844, + "logits/rejected": 0.4821607172489166, + "logps/chosen": -530.916259765625, + "logps/rejected": -499.0142822265625, + "loss": 0.658, + "nll_loss": 0.512601912021637, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3559614419937134, + "rewards/margins": 0.09974518418312073, + "rewards/rejected": 0.25621622800827026, + "step": 952 + }, + { + "epoch": 0.8290245492070389, + "grad_norm": 176.07720507279737, + "learning_rate": 1.5262522946335754e-08, + "logits/chosen": 0.46107804775238037, + "logits/rejected": 0.4554142951965332, + "logps/chosen": -436.38909912109375, + "logps/rejected": -498.63470458984375, + "loss": 0.51, + "nll_loss": 0.4370454251766205, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6604627370834351, + "rewards/margins": 0.7786142230033875, + "rewards/rejected": -0.11815138161182404, + "step": 954 + }, + { + "epoch": 0.8307625461655442, + "grad_norm": 309.2601198340963, + "learning_rate": 1.4960603147344342e-08, + "logits/chosen": 0.3789171278476715, + "logits/rejected": 0.4542412757873535, + "logps/chosen": -534.3204345703125, + "logps/rejected": -537.3842163085938, + "loss": 0.571, + "nll_loss": 0.4980693459510803, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.5330474972724915, + "rewards/margins": 0.32546788454055786, + "rewards/rejected": 0.2075796127319336, + "step": 956 + }, + { + "epoch": 0.8325005431240495, + "grad_norm": 132.0934271644858, + "learning_rate": 1.466145790347183e-08, + "logits/chosen": 0.3661070764064789, + "logits/rejected": 0.23429855704307556, + "logps/chosen": -479.94927978515625, + "logps/rejected": -361.9775085449219, + "loss": 0.6086, + "nll_loss": 0.43883123993873596, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.5983450412750244, + "rewards/margins": 0.5256334543228149, + "rewards/rejected": 0.0727115273475647, + "step": 958 + }, + { + "epoch": 0.8342385400825548, + "grad_norm": 141.19361438240026, + "learning_rate": 1.4365096974841106e-08, + "logits/chosen": 0.4359699785709381, + "logits/rejected": 0.5905839204788208, + "logps/chosen": -572.3399047851562, + "logps/rejected": -490.7935791015625, + "loss": 0.5727, + "nll_loss": 0.5214511156082153, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.6746166348457336, + "rewards/margins": 0.6807842254638672, + "rewards/rejected": -0.006167605519294739, + "step": 960 + }, + { + "epoch": 0.8359765370410602, + "grad_norm": 134.2041368142193, + "learning_rate": 1.4071530030732093e-08, + "logits/chosen": 0.36992889642715454, + "logits/rejected": 0.4493410587310791, + "logps/chosen": -398.4943542480469, + "logps/rejected": -490.66265869140625, + "loss": 0.596, + "nll_loss": 0.41504624485969543, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.5691247582435608, + "rewards/margins": 0.5313467383384705, + "rewards/rejected": 0.03777790069580078, + "step": 962 + }, + { + "epoch": 0.8377145339995655, + "grad_norm": 111.93946541938594, + "learning_rate": 1.378076664926624e-08, + "logits/chosen": 0.44248145818710327, + "logits/rejected": 0.3701130747795105, + "logps/chosen": -467.9688720703125, + "logps/rejected": -451.6510314941406, + "loss": 0.544, + "nll_loss": 0.4465351700782776, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.504805862903595, + "rewards/margins": 0.6661850214004517, + "rewards/rejected": -0.1613791435956955, + "step": 964 + }, + { + "epoch": 0.8394525309580708, + "grad_norm": 143.9430178806754, + "learning_rate": 1.349281631709389e-08, + "logits/chosen": 0.23045340180397034, + "logits/rejected": 0.16618621349334717, + "logps/chosen": -428.14093017578125, + "logps/rejected": -468.91717529296875, + "loss": 0.5326, + "nll_loss": 0.4211942255496979, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.6767243146896362, + "rewards/margins": 0.11299353092908859, + "rewards/rejected": 0.5637306571006775, + "step": 966 + }, + { + "epoch": 0.8411905279165761, + "grad_norm": 103.62673108372952, + "learning_rate": 1.3207688429084974e-08, + "logits/chosen": 0.46591299772262573, + "logits/rejected": 0.3822897672653198, + "logps/chosen": -483.8509521484375, + "logps/rejected": -498.87005615234375, + "loss": 0.537, + "nll_loss": 0.46457740664482117, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6259498596191406, + "rewards/margins": 0.6032807230949402, + "rewards/rejected": 0.022669125348329544, + "step": 968 + }, + { + "epoch": 0.8429285248750815, + "grad_norm": 234.96317478690582, + "learning_rate": 1.2925392288022297e-08, + "logits/chosen": 0.15632614493370056, + "logits/rejected": 0.20441731810569763, + "logps/chosen": -476.6795349121094, + "logps/rejected": -508.18792724609375, + "loss": 0.5673, + "nll_loss": 0.46259137988090515, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.8507680296897888, + "rewards/margins": 0.6741021275520325, + "rewards/rejected": 0.17666588723659515, + "step": 970 + }, + { + "epoch": 0.8446665218335868, + "grad_norm": 148.50514932068978, + "learning_rate": 1.264593710429811e-08, + "logits/chosen": 0.3417684733867645, + "logits/rejected": 0.35431018471717834, + "logps/chosen": -480.681396484375, + "logps/rejected": -478.01416015625, + "loss": 0.5814, + "nll_loss": 0.48108553886413574, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5715519189834595, + "rewards/margins": 0.683962881565094, + "rewards/rejected": -0.11241091787815094, + "step": 972 + }, + { + "epoch": 0.8464045187920921, + "grad_norm": 129.62124450547134, + "learning_rate": 1.2369331995613663e-08, + "logits/chosen": 0.040659140795469284, + "logits/rejected": 0.054739244282245636, + "logps/chosen": -445.49310302734375, + "logps/rejected": -479.6976318359375, + "loss": 0.5619, + "nll_loss": 0.4347701668739319, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.047504186630249, + "rewards/margins": 0.8733047842979431, + "rewards/rejected": 0.17419950664043427, + "step": 974 + }, + { + "epoch": 0.8481425157505974, + "grad_norm": 153.51628913591082, + "learning_rate": 1.2095585986681533e-08, + "logits/chosen": 0.7045519948005676, + "logits/rejected": 0.67140132188797, + "logps/chosen": -594.0953369140625, + "logps/rejected": -604.9121704101562, + "loss": 0.5705, + "nll_loss": 0.5499407052993774, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4179590344429016, + "rewards/margins": 0.40337541699409485, + "rewards/rejected": 0.014583582058548927, + "step": 976 + }, + { + "epoch": 0.8498805127091028, + "grad_norm": 123.40507981871161, + "learning_rate": 1.1824708008931416e-08, + "logits/chosen": 0.718404233455658, + "logits/rejected": 0.6372251510620117, + "logps/chosen": -495.7720642089844, + "logps/rejected": -462.13214111328125, + "loss": 0.5864, + "nll_loss": 0.4798401892185211, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.31270793080329895, + "rewards/margins": 0.2564069926738739, + "rewards/rejected": 0.05630092695355415, + "step": 978 + }, + { + "epoch": 0.8516185096676081, + "grad_norm": 149.48370997361744, + "learning_rate": 1.155670690021857e-08, + "logits/chosen": 0.845966637134552, + "logits/rejected": 0.7787639498710632, + "logps/chosen": -498.9972229003906, + "logps/rejected": -514.4071655273438, + "loss": 0.5707, + "nll_loss": 0.4815356135368347, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4916130304336548, + "rewards/margins": 0.3846202790737152, + "rewards/rejected": 0.10699271410703659, + "step": 980 + }, + { + "epoch": 0.8533565066261134, + "grad_norm": 283.6581431830632, + "learning_rate": 1.1291591404535461e-08, + "logits/chosen": 0.691591739654541, + "logits/rejected": 0.38823819160461426, + "logps/chosen": -521.052001953125, + "logps/rejected": -443.2837829589844, + "loss": 0.5708, + "nll_loss": 0.4971870183944702, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3442355990409851, + "rewards/margins": 0.4594818651676178, + "rewards/rejected": -0.11524628847837448, + "step": 982 + }, + { + "epoch": 0.8550945035846187, + "grad_norm": 142.05231100131795, + "learning_rate": 1.1029370171726571e-08, + "logits/chosen": 0.20724141597747803, + "logits/rejected": 0.45057153701782227, + "logps/chosen": -526.8379516601562, + "logps/rejected": -588.6925659179688, + "loss": 0.5284, + "nll_loss": 0.5008586645126343, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.8295645713806152, + "rewards/margins": 0.8359251022338867, + "rewards/rejected": -0.0063606202602386475, + "step": 984 + }, + { + "epoch": 0.8568325005431241, + "grad_norm": 240.52585530952192, + "learning_rate": 1.0770051757206077e-08, + "logits/chosen": 0.20337149500846863, + "logits/rejected": 0.4355597198009491, + "logps/chosen": -495.91021728515625, + "logps/rejected": -542.2305297851562, + "loss": 0.5244, + "nll_loss": 0.4666140377521515, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.742421567440033, + "rewards/margins": 0.6228967308998108, + "rewards/rejected": 0.11952477693557739, + "step": 986 + }, + { + "epoch": 0.8585704975016294, + "grad_norm": 161.81705766680054, + "learning_rate": 1.0513644621678807e-08, + "logits/chosen": 0.6418431997299194, + "logits/rejected": 0.8085691928863525, + "logps/chosen": -457.9586486816406, + "logps/rejected": -536.1529541015625, + "loss": 0.6134, + "nll_loss": 0.4475609064102173, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.40928417444229126, + "rewards/margins": 0.32167717814445496, + "rewards/rejected": 0.0876070037484169, + "step": 988 + }, + { + "epoch": 0.8603084944601347, + "grad_norm": 209.83037790632198, + "learning_rate": 1.0260157130864178e-08, + "logits/chosen": 0.3358425796031952, + "logits/rejected": 0.14016635715961456, + "logps/chosen": -464.33392333984375, + "logps/rejected": -476.4942626953125, + "loss": 0.5479, + "nll_loss": 0.4349672794342041, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4732265770435333, + "rewards/margins": 0.3230219781398773, + "rewards/rejected": 0.15020456910133362, + "step": 990 + }, + { + "epoch": 0.86204649141864, + "grad_norm": 131.501360542472, + "learning_rate": 1.0009597555223126e-08, + "logits/chosen": 0.23200947046279907, + "logits/rejected": 0.32851237058639526, + "logps/chosen": -446.45947265625, + "logps/rejected": -463.9974060058594, + "loss": 0.513, + "nll_loss": 0.4259781539440155, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.6944409608840942, + "rewards/margins": 0.42177268862724304, + "rewards/rejected": 0.2726683020591736, + "step": 992 + }, + { + "epoch": 0.8637844883771454, + "grad_norm": 127.65009818745983, + "learning_rate": 9.761974069688461e-09, + "logits/chosen": 0.5669763088226318, + "logits/rejected": 0.5903551578521729, + "logps/chosen": -515.50390625, + "logps/rejected": -482.6619873046875, + "loss": 0.6124, + "nll_loss": 0.5234907269477844, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.4056873321533203, + "rewards/margins": 0.38572657108306885, + "rewards/rejected": 0.019960783421993256, + "step": 994 + }, + { + "epoch": 0.8655224853356507, + "grad_norm": 139.3600385512985, + "learning_rate": 9.517294753398064e-09, + "logits/chosen": 0.23763130605220795, + "logits/rejected": 0.1550990492105484, + "logps/chosen": -402.9676513671875, + "logps/rejected": -437.53692626953125, + "loss": 0.5845, + "nll_loss": 0.398506760597229, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.7281893491744995, + "rewards/margins": 0.3145132064819336, + "rewards/rejected": 0.4136761724948883, + "step": 996 + }, + { + "epoch": 0.867260482294156, + "grad_norm": 151.6763492595599, + "learning_rate": 9.275567589431177e-09, + "logits/chosen": 0.32628533244132996, + "logits/rejected": 0.3804038166999817, + "logps/chosen": -496.0785827636719, + "logps/rejected": -505.3673095703125, + "loss": 0.6, + "nll_loss": 0.47465142607688904, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.9524919986724854, + "rewards/margins": 0.6615516543388367, + "rewards/rejected": 0.2909402847290039, + "step": 998 + }, + { + "epoch": 0.8689984792526613, + "grad_norm": 149.6705914399637, + "learning_rate": 9.036800464548155e-09, + "logits/chosen": 0.3488670587539673, + "logits/rejected": 0.26245802640914917, + "logps/chosen": -450.0097961425781, + "logps/rejected": -507.2135314941406, + "loss": 0.6198, + "nll_loss": 0.45576491951942444, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.5999775528907776, + "rewards/margins": 0.3929904103279114, + "rewards/rejected": 0.20698711276054382, + "step": 1000 + }, + { + "epoch": 0.8707364762111667, + "grad_norm": 214.22049086541324, + "learning_rate": 8.80100116893301e-09, + "logits/chosen": -0.08340902626514435, + "logits/rejected": 0.002378493547439575, + "logps/chosen": -538.5593872070312, + "logps/rejected": -618.6040649414062, + "loss": 0.584, + "nll_loss": 0.4630267322063446, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.7095227837562561, + "rewards/margins": 0.5523332357406616, + "rewards/rejected": 0.15718956291675568, + "step": 1002 + }, + { + "epoch": 0.872474473169672, + "grad_norm": 135.51290099719628, + "learning_rate": 8.568177395939213e-09, + "logits/chosen": 0.5445169806480408, + "logits/rejected": 0.5261526703834534, + "logps/chosen": -519.8292846679688, + "logps/rejected": -467.5902099609375, + "loss": 0.6387, + "nll_loss": 0.48102161288261414, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.5225428342819214, + "rewards/margins": 0.13977020978927612, + "rewards/rejected": 0.3827725350856781, + "step": 1004 + }, + { + "epoch": 0.8742124701281773, + "grad_norm": 149.90853918610964, + "learning_rate": 8.338336741838836e-09, + "logits/chosen": 0.2194560170173645, + "logits/rejected": 0.1848369538784027, + "logps/chosen": -410.1855773925781, + "logps/rejected": -393.67950439453125, + "loss": 0.548, + "nll_loss": 0.3803246319293976, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.32082509994506836, + "rewards/margins": 0.14545764029026031, + "rewards/rejected": 0.17536745965480804, + "step": 1006 + }, + { + "epoch": 0.8759504670866826, + "grad_norm": 137.29548301434852, + "learning_rate": 8.111486705574533e-09, + "logits/chosen": 0.4672020673751831, + "logits/rejected": 0.48324400186538696, + "logps/chosen": -502.46441650390625, + "logps/rejected": -493.8004455566406, + "loss": 0.5571, + "nll_loss": 0.49147966504096985, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6543054580688477, + "rewards/margins": 0.6119914054870605, + "rewards/rejected": 0.042314041405916214, + "step": 1008 + }, + { + "epoch": 0.877688464045188, + "grad_norm": 107.18011497907976, + "learning_rate": 7.887634688515e-09, + "logits/chosen": 0.49401983618736267, + "logits/rejected": 0.562633216381073, + "logps/chosen": -450.69854736328125, + "logps/rejected": -484.1754150390625, + "loss": 0.5953, + "nll_loss": 0.4330693185329437, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5111261606216431, + "rewards/margins": 0.2732935845851898, + "rewards/rejected": 0.23783257603645325, + "step": 1010 + }, + { + "epoch": 0.8794264610036933, + "grad_norm": 223.40815488567432, + "learning_rate": 7.666787994213453e-09, + "logits/chosen": 0.3347271978855133, + "logits/rejected": 0.34065520763397217, + "logps/chosen": -508.6999206542969, + "logps/rejected": -517.7572631835938, + "loss": 0.5512, + "nll_loss": 0.49927687644958496, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.6774803400039673, + "rewards/margins": 0.42118340730667114, + "rewards/rejected": 0.25629690289497375, + "step": 1012 + }, + { + "epoch": 0.8811644579621986, + "grad_norm": 108.51949876750757, + "learning_rate": 7.4489538281693136e-09, + "logits/chosen": 0.6547769904136658, + "logits/rejected": 0.6195938587188721, + "logps/chosen": -480.0023498535156, + "logps/rejected": -542.1754760742188, + "loss": 0.5535, + "nll_loss": 0.4586215913295746, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.3463646173477173, + "rewards/margins": 0.5637027025222778, + "rewards/rejected": -0.21733808517456055, + "step": 1014 + }, + { + "epoch": 0.8829024549207038, + "grad_norm": 171.38546762683748, + "learning_rate": 7.234139297593178e-09, + "logits/chosen": 0.36383742094039917, + "logits/rejected": 0.30936628580093384, + "logps/chosen": -482.3176574707031, + "logps/rejected": -511.5982360839844, + "loss": 0.5968, + "nll_loss": 0.4618416726589203, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3494030237197876, + "rewards/margins": 0.3414415717124939, + "rewards/rejected": 0.007961463183164597, + "step": 1016 + }, + { + "epoch": 0.8846404518792093, + "grad_norm": 130.31979725651428, + "learning_rate": 7.022351411174865e-09, + "logits/chosen": 0.6526001691818237, + "logits/rejected": 0.650229811668396, + "logps/chosen": -440.3011779785156, + "logps/rejected": -459.21234130859375, + "loss": 0.5771, + "nll_loss": 0.43394389748573303, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.28458622097969055, + "rewards/margins": 0.1846170425415039, + "rewards/rejected": 0.09996921569108963, + "step": 1018 + }, + { + "epoch": 0.8863784488377145, + "grad_norm": 146.85783391797733, + "learning_rate": 6.813597078854772e-09, + "logits/chosen": 0.3538188636302948, + "logits/rejected": 0.5154883861541748, + "logps/chosen": -405.7217102050781, + "logps/rejected": -519.734130859375, + "loss": 0.617, + "nll_loss": 0.41503557562828064, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.507857084274292, + "rewards/margins": 0.3474844992160797, + "rewards/rejected": 0.16037264466285706, + "step": 1020 + }, + { + "epoch": 0.8881164457962198, + "grad_norm": 178.76099417376233, + "learning_rate": 6.607883111598445e-09, + "logits/chosen": 0.6064735054969788, + "logits/rejected": 0.6442840099334717, + "logps/chosen": -539.7501831054688, + "logps/rejected": -526.06884765625, + "loss": 0.5727, + "nll_loss": 0.5150287747383118, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.11767234653234482, + "rewards/margins": 0.13286437094211578, + "rewards/rejected": -0.015192030929028988, + "step": 1022 + }, + { + "epoch": 0.8898544427547251, + "grad_norm": 143.87254174590024, + "learning_rate": 6.405216221174325e-09, + "logits/chosen": 0.19760847091674805, + "logits/rejected": 0.17889335751533508, + "logps/chosen": -476.1910705566406, + "logps/rejected": -545.6195068359375, + "loss": 0.5429, + "nll_loss": 0.4657999575138092, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.7912313342094421, + "rewards/margins": 0.7614498138427734, + "rewards/rejected": 0.029781535267829895, + "step": 1024 + }, + { + "epoch": 0.8915924397132305, + "grad_norm": 142.72070270620054, + "learning_rate": 6.205603019934791e-09, + "logits/chosen": 0.44221293926239014, + "logits/rejected": 0.4288594424724579, + "logps/chosen": -456.22088623046875, + "logps/rejected": -469.8916320800781, + "loss": 0.5815, + "nll_loss": 0.4313237965106964, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.26495635509490967, + "rewards/margins": 0.35365360975265503, + "rewards/rejected": -0.08869723975658417, + "step": 1026 + }, + { + "epoch": 0.8933304366717358, + "grad_norm": 143.40703061220435, + "learning_rate": 6.009050020600459e-09, + "logits/chosen": 0.2621181011199951, + "logits/rejected": 0.3539845645427704, + "logps/chosen": -484.30810546875, + "logps/rejected": -515.847900390625, + "loss": 0.5573, + "nll_loss": 0.45568376779556274, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.8613646030426025, + "rewards/margins": 0.5282778143882751, + "rewards/rejected": 0.3330867886543274, + "step": 1028 + }, + { + "epoch": 0.8950684336302411, + "grad_norm": 228.1216526051892, + "learning_rate": 5.815563636047538e-09, + "logits/chosen": 0.2763429284095764, + "logits/rejected": 0.30502286553382874, + "logps/chosen": -490.53564453125, + "logps/rejected": -529.5304565429688, + "loss": 0.5394, + "nll_loss": 0.4875333905220032, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.8852298259735107, + "rewards/margins": 0.7269324660301208, + "rewards/rejected": 0.1582973450422287, + "step": 1030 + }, + { + "epoch": 0.8968064305887464, + "grad_norm": 119.41955809926453, + "learning_rate": 5.625150179098803e-09, + "logits/chosen": 0.7187064290046692, + "logits/rejected": 0.6024725437164307, + "logps/chosen": -527.1694946289062, + "logps/rejected": -551.8734130859375, + "loss": 0.5891, + "nll_loss": 0.5098748207092285, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.296517550945282, + "rewards/margins": 0.18804368376731873, + "rewards/rejected": 0.10847387462854385, + "step": 1032 + }, + { + "epoch": 0.8985444275472518, + "grad_norm": 218.4987606965274, + "learning_rate": 5.437815862317519e-09, + "logits/chosen": 0.2036760449409485, + "logits/rejected": 0.27650704979896545, + "logps/chosen": -431.7545166015625, + "logps/rejected": -486.6964111328125, + "loss": 0.5661, + "nll_loss": 0.4158436059951782, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6781007051467896, + "rewards/margins": 0.7105196714401245, + "rewards/rejected": -0.03241892158985138, + "step": 1034 + }, + { + "epoch": 0.9002824245057571, + "grad_norm": 150.28933801097338, + "learning_rate": 5.253566797804709e-09, + "logits/chosen": 0.24965474009513855, + "logits/rejected": 0.28565654158592224, + "logps/chosen": -492.99859619140625, + "logps/rejected": -555.1309814453125, + "loss": 0.581, + "nll_loss": 0.47916826605796814, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.8634170889854431, + "rewards/margins": 0.5948573350906372, + "rewards/rejected": 0.2685597240924835, + "step": 1036 + }, + { + "epoch": 0.9020204214642624, + "grad_norm": 136.55772838494232, + "learning_rate": 5.072408996999844e-09, + "logits/chosen": 0.43632686138153076, + "logits/rejected": 0.39454224705696106, + "logps/chosen": -494.273193359375, + "logps/rejected": -499.689697265625, + "loss": 0.5438, + "nll_loss": 0.4602780044078827, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5984913110733032, + "rewards/margins": 0.25624755024909973, + "rewards/rejected": 0.3422437310218811, + "step": 1038 + }, + { + "epoch": 0.9037584184227677, + "grad_norm": 178.70590958523087, + "learning_rate": 4.8943483704846465e-09, + "logits/chosen": 0.22367754578590393, + "logits/rejected": 0.3362473249435425, + "logps/chosen": -454.59967041015625, + "logps/rejected": -488.4815368652344, + "loss": 0.5952, + "nll_loss": 0.44791561365127563, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5820288062095642, + "rewards/margins": 0.25380000472068787, + "rewards/rejected": 0.32822877168655396, + "step": 1040 + }, + { + "epoch": 0.9054964153812731, + "grad_norm": 185.45213748058183, + "learning_rate": 4.7193907277902175e-09, + "logits/chosen": 0.5410184860229492, + "logits/rejected": 0.5517024397850037, + "logps/chosen": -508.5509338378906, + "logps/rejected": -491.8945007324219, + "loss": 0.5455, + "nll_loss": 0.518214225769043, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.39927348494529724, + "rewards/margins": 0.3506126403808594, + "rewards/rejected": 0.048660848289728165, + "step": 1042 + }, + { + "epoch": 0.9072344123397784, + "grad_norm": 139.23192133772244, + "learning_rate": 4.547541777207564e-09, + "logits/chosen": 0.17857961356639862, + "logits/rejected": 0.33340996503829956, + "logps/chosen": -414.42791748046875, + "logps/rejected": -408.1433410644531, + "loss": 0.7357, + "nll_loss": 0.3875848054885864, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.5422300100326538, + "rewards/margins": -0.007215976715087891, + "rewards/rejected": 0.5494458675384521, + "step": 1044 + }, + { + "epoch": 0.9089724092982837, + "grad_norm": 147.38015327862638, + "learning_rate": 4.3788071256013024e-09, + "logits/chosen": 0.6263728737831116, + "logits/rejected": 0.581895649433136, + "logps/chosen": -490.2805480957031, + "logps/rejected": -471.88385009765625, + "loss": 0.5973, + "nll_loss": 0.47902175784111023, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.5375036597251892, + "rewards/margins": 0.5982069969177246, + "rewards/rejected": -0.06070336699485779, + "step": 1046 + }, + { + "epoch": 0.910710406256789, + "grad_norm": 106.89389378580692, + "learning_rate": 4.2131922782267405e-09, + "logits/chosen": 0.11579008400440216, + "logits/rejected": 0.08525022119283676, + "logps/chosen": -438.53094482421875, + "logps/rejected": -441.49969482421875, + "loss": 0.5276, + "nll_loss": 0.4182903468608856, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.5620919466018677, + "rewards/margins": 0.9903548955917358, + "rewards/rejected": -0.4282629191875458, + "step": 1048 + }, + { + "epoch": 0.9124484032152944, + "grad_norm": 183.72771679991544, + "learning_rate": 4.050702638550274e-09, + "logits/chosen": 0.535997748374939, + "logits/rejected": 0.4335247278213501, + "logps/chosen": -538.29296875, + "logps/rejected": -476.898193359375, + "loss": 0.6073, + "nll_loss": 0.5014994740486145, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.19083280861377716, + "rewards/margins": -0.13114146888256073, + "rewards/rejected": 0.3219743072986603, + "step": 1050 + }, + { + "epoch": 0.9141864001737997, + "grad_norm": 158.5516983187171, + "learning_rate": 3.891343508073053e-09, + "logits/chosen": 0.32029953598976135, + "logits/rejected": 0.3168344795703888, + "logps/chosen": -449.4055480957031, + "logps/rejected": -465.4240417480469, + "loss": 0.5593, + "nll_loss": 0.43291229009628296, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5993573665618896, + "rewards/margins": 0.6672921776771545, + "rewards/rejected": -0.0679347962141037, + "step": 1052 + }, + { + "epoch": 0.915924397132305, + "grad_norm": 206.91428322699906, + "learning_rate": 3.735120086158061e-09, + "logits/chosen": 0.3795467019081116, + "logits/rejected": 0.4101225733757019, + "logps/chosen": -493.1979675292969, + "logps/rejected": -489.7401123046875, + "loss": 0.5397, + "nll_loss": 0.46552711725234985, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.0511305332183838, + "rewards/margins": 0.9409102201461792, + "rewards/rejected": 0.11022023856639862, + "step": 1054 + }, + { + "epoch": 0.9176623940908103, + "grad_norm": 131.1638807054813, + "learning_rate": 3.582037469860455e-09, + "logits/chosen": 0.3941403031349182, + "logits/rejected": 0.2784467041492462, + "logps/chosen": -564.7503051757812, + "logps/rejected": -472.0733642578125, + "loss": 0.5457, + "nll_loss": 0.5184577703475952, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.8316090106964111, + "rewards/margins": 0.6089596748352051, + "rewards/rejected": 0.22264929115772247, + "step": 1056 + }, + { + "epoch": 0.9194003910493157, + "grad_norm": 194.9239008622977, + "learning_rate": 3.4321006537612163e-09, + "logits/chosen": 0.13808123767375946, + "logits/rejected": 0.21899640560150146, + "logps/chosen": -441.20404052734375, + "logps/rejected": -475.18524169921875, + "loss": 0.5258, + "nll_loss": 0.4570828080177307, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6368840336799622, + "rewards/margins": 0.8989419937133789, + "rewards/rejected": -0.26205796003341675, + "step": 1058 + }, + { + "epoch": 0.921138388007821, + "grad_norm": 115.43391845382065, + "learning_rate": 3.285314529804295e-09, + "logits/chosen": 0.4206714630126953, + "logits/rejected": 0.40312278270721436, + "logps/chosen": -460.8715515136719, + "logps/rejected": -518.5413208007812, + "loss": 0.5333, + "nll_loss": 0.4465717077255249, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6075332760810852, + "rewards/margins": 0.5724822282791138, + "rewards/rejected": 0.03505106270313263, + "step": 1060 + }, + { + "epoch": 0.9228763849663263, + "grad_norm": 202.93417822517483, + "learning_rate": 3.141683887136892e-09, + "logits/chosen": 0.486500084400177, + "logits/rejected": 0.4569099545478821, + "logps/chosen": -442.55816650390625, + "logps/rejected": -431.1639709472656, + "loss": 0.5329, + "nll_loss": 0.4247971773147583, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.43177396059036255, + "rewards/margins": 0.2917194366455078, + "rewards/rejected": 0.14005452394485474, + "step": 1062 + }, + { + "epoch": 0.9246143819248316, + "grad_norm": 119.40749558394226, + "learning_rate": 3.001213411953296e-09, + "logits/chosen": 0.4011079668998718, + "logits/rejected": 0.46269315481185913, + "logps/chosen": -487.8550720214844, + "logps/rejected": -531.1329956054688, + "loss": 0.5007, + "nll_loss": 0.4475144147872925, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.5412551760673523, + "rewards/margins": 0.5304111242294312, + "rewards/rejected": 0.01084403321146965, + "step": 1064 + }, + { + "epoch": 0.926352378883337, + "grad_norm": 130.26377320406488, + "learning_rate": 2.8639076873419487e-09, + "logits/chosen": 0.14351217448711395, + "logits/rejected": 0.25187253952026367, + "logps/chosen": -538.365234375, + "logps/rejected": -633.5084838867188, + "loss": 0.6178, + "nll_loss": 0.4882996380329132, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.6392820477485657, + "rewards/margins": 0.613286554813385, + "rewards/rejected": 0.025995440781116486, + "step": 1066 + }, + { + "epoch": 0.9280903758418423, + "grad_norm": 184.52491422760545, + "learning_rate": 2.729771193135899e-09, + "logits/chosen": 0.5797444581985474, + "logits/rejected": 0.2687540650367737, + "logps/chosen": -522.3880615234375, + "logps/rejected": -471.7548828125, + "loss": 0.6481, + "nll_loss": 0.48741546273231506, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.23231476545333862, + "rewards/margins": 0.5791104435920715, + "rewards/rejected": -0.34679561853408813, + "step": 1068 + }, + { + "epoch": 0.9298283728003476, + "grad_norm": 177.24645530115882, + "learning_rate": 2.598808305766653e-09, + "logits/chosen": 0.43344372510910034, + "logits/rejected": 0.4523126482963562, + "logps/chosen": -467.32916259765625, + "logps/rejected": -519.6317749023438, + "loss": 0.5094, + "nll_loss": 0.4872402548789978, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4050002992153168, + "rewards/margins": 0.6784630417823792, + "rewards/rejected": -0.27346280217170715, + "step": 1070 + }, + { + "epoch": 0.9315663697588529, + "grad_norm": 197.39750151518055, + "learning_rate": 2.4710232981214218e-09, + "logits/chosen": 0.3327729403972626, + "logits/rejected": 0.14955325424671173, + "logps/chosen": -550.6797485351562, + "logps/rejected": -475.5610046386719, + "loss": 0.6603, + "nll_loss": 0.46252796053886414, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.3012423515319824, + "rewards/margins": -0.07680274546146393, + "rewards/rejected": 0.37804505228996277, + "step": 1072 + }, + { + "epoch": 0.9333043667173583, + "grad_norm": 174.39443484205302, + "learning_rate": 2.346420339403632e-09, + "logits/chosen": 0.4824119508266449, + "logits/rejected": 0.3831688165664673, + "logps/chosen": -473.789794921875, + "logps/rejected": -436.1537780761719, + "loss": 0.6016, + "nll_loss": 0.43823355436325073, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.41553688049316406, + "rewards/margins": 0.1545083224773407, + "rewards/rejected": 0.26102858781814575, + "step": 1074 + }, + { + "epoch": 0.9350423636758636, + "grad_norm": 124.99709040890798, + "learning_rate": 2.2250034949969907e-09, + "logits/chosen": 0.5162658095359802, + "logits/rejected": 0.3049306273460388, + "logps/chosen": -509.45013427734375, + "logps/rejected": -532.9105224609375, + "loss": 0.5665, + "nll_loss": 0.5037564039230347, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3767814636230469, + "rewards/margins": 0.46063491702079773, + "rewards/rejected": -0.08385343104600906, + "step": 1076 + }, + { + "epoch": 0.9367803606343689, + "grad_norm": 350.65235066069096, + "learning_rate": 2.106776726332793e-09, + "logits/chosen": 0.2661568820476532, + "logits/rejected": 0.25111711025238037, + "logps/chosen": -503.4539794921875, + "logps/rejected": -484.7874450683594, + "loss": 0.599, + "nll_loss": 0.491374671459198, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3265320062637329, + "rewards/margins": 0.33517876267433167, + "rewards/rejected": -0.008646775037050247, + "step": 1078 + }, + { + "epoch": 0.9385183575928742, + "grad_norm": 144.21117524237232, + "learning_rate": 1.9917438907606556e-09, + "logits/chosen": 0.5326768159866333, + "logits/rejected": 0.4657231867313385, + "logps/chosen": -451.84820556640625, + "logps/rejected": -391.3930969238281, + "loss": 0.5676, + "nll_loss": 0.4326724410057068, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.33496278524398804, + "rewards/margins": 0.5200725793838501, + "rewards/rejected": -0.18510979413986206, + "step": 1080 + }, + { + "epoch": 0.9402563545513796, + "grad_norm": 131.1131442940977, + "learning_rate": 1.8799087414227198e-09, + "logits/chosen": 0.34258347749710083, + "logits/rejected": 0.34085893630981445, + "logps/chosen": -539.6632080078125, + "logps/rejected": -504.9268798828125, + "loss": 0.5598, + "nll_loss": 0.5036740303039551, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.6515111923217773, + "rewards/margins": 0.5885885953903198, + "rewards/rejected": 0.06292267143726349, + "step": 1082 + }, + { + "epoch": 0.9419943515098849, + "grad_norm": 143.6646151289232, + "learning_rate": 1.771274927131139e-09, + "logits/chosen": 0.2275387942790985, + "logits/rejected": 0.22034448385238647, + "logps/chosen": -422.7888488769531, + "logps/rejected": -435.2667236328125, + "loss": 0.5547, + "nll_loss": 0.3884378969669342, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.7469679117202759, + "rewards/margins": 0.4667326509952545, + "rewards/rejected": 0.280235230922699, + "step": 1084 + }, + { + "epoch": 0.9437323484683902, + "grad_norm": 163.27269333598966, + "learning_rate": 1.665845992249071e-09, + "logits/chosen": 0.4115934371948242, + "logits/rejected": 0.44389423727989197, + "logps/chosen": -516.2249145507812, + "logps/rejected": -510.037109375, + "loss": 0.5604, + "nll_loss": 0.4815514087677002, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.4112775921821594, + "rewards/margins": 0.4651253819465637, + "rewards/rejected": -0.05384781211614609, + "step": 1086 + }, + { + "epoch": 0.9454703454268955, + "grad_norm": 129.88097141896537, + "learning_rate": 1.5636253765750506e-09, + "logits/chosen": 0.6956306099891663, + "logits/rejected": 0.47814223170280457, + "logps/chosen": -505.50189208984375, + "logps/rejected": -472.5733642578125, + "loss": 0.5726, + "nll_loss": 0.46664267778396606, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.39985811710357666, + "rewards/margins": 0.3242230713367462, + "rewards/rejected": 0.07563506066799164, + "step": 1088 + }, + { + "epoch": 0.9472083423854009, + "grad_norm": 221.92257915634556, + "learning_rate": 1.4646164152307016e-09, + "logits/chosen": 0.41604483127593994, + "logits/rejected": 0.49654048681259155, + "logps/chosen": -442.07861328125, + "logps/rejected": -480.72576904296875, + "loss": 0.5722, + "nll_loss": 0.41331538558006287, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.35311394929885864, + "rewards/margins": 0.3139072358608246, + "rewards/rejected": 0.03920670226216316, + "step": 1090 + }, + { + "epoch": 0.9489463393439062, + "grad_norm": 152.98735410952526, + "learning_rate": 1.3688223385519671e-09, + "logits/chosen": 0.25502362847328186, + "logits/rejected": 0.18617622554302216, + "logps/chosen": -481.209228515625, + "logps/rejected": -505.02813720703125, + "loss": 0.5983, + "nll_loss": 0.4872613847255707, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.016000747680664, + "rewards/margins": 0.4449235200881958, + "rewards/rejected": 0.5710772275924683, + "step": 1092 + }, + { + "epoch": 0.9506843363024114, + "grad_norm": 233.20407395608177, + "learning_rate": 1.2762462719837275e-09, + "logits/chosen": 0.3264160752296448, + "logits/rejected": 0.3496069312095642, + "logps/chosen": -506.10308837890625, + "logps/rejected": -505.58001708984375, + "loss": 0.5874, + "nll_loss": 0.4658485949039459, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.473180890083313, + "rewards/margins": 0.39049264788627625, + "rewards/rejected": 0.08268821239471436, + "step": 1094 + }, + { + "epoch": 0.9524223332609167, + "grad_norm": 198.02450972875945, + "learning_rate": 1.1868912359777606e-09, + "logits/chosen": 0.44856521487236023, + "logits/rejected": 0.27867019176483154, + "logps/chosen": -482.5740661621094, + "logps/rejected": -509.0614318847656, + "loss": 0.5837, + "nll_loss": 0.5008357763290405, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.526940643787384, + "rewards/margins": 0.39634713530540466, + "rewards/rejected": 0.13059350848197937, + "step": 1096 + }, + { + "epoch": 0.9541603302194221, + "grad_norm": 177.08877803633223, + "learning_rate": 1.100760145894275e-09, + "logits/chosen": 0.4941718578338623, + "logits/rejected": 0.5006747245788574, + "logps/chosen": -522.516357421875, + "logps/rejected": -600.25341796875, + "loss": 0.5911, + "nll_loss": 0.46976977586746216, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.6517099142074585, + "rewards/margins": 0.5950245261192322, + "rewards/rejected": 0.056685447692871094, + "step": 1098 + }, + { + "epoch": 0.9558983271779274, + "grad_norm": 134.79426613808798, + "learning_rate": 1.0178558119067315e-09, + "logits/chosen": 0.6318961381912231, + "logits/rejected": 0.4834882616996765, + "logps/chosen": -530.4765625, + "logps/rejected": -498.7202453613281, + "loss": 0.535, + "nll_loss": 0.5174931287765503, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.7026293277740479, + "rewards/margins": 0.4758650064468384, + "rewards/rejected": 0.22676429152488708, + "step": 1100 + }, + { + "epoch": 0.9576363241364327, + "grad_norm": 185.20080681256496, + "learning_rate": 9.381809389101825e-10, + "logits/chosen": 0.1996326446533203, + "logits/rejected": 0.31951069831848145, + "logps/chosen": -524.9970092773438, + "logps/rejected": -477.78997802734375, + "loss": 0.5776, + "nll_loss": 0.49288156628608704, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.8350135087966919, + "rewards/margins": 0.5012381672859192, + "rewards/rejected": 0.3337753415107727, + "step": 1102 + }, + { + "epoch": 0.959374321094938, + "grad_norm": 194.81847727305524, + "learning_rate": 8.617381264330425e-10, + "logits/chosen": 0.3599563539028168, + "logits/rejected": 0.5130894184112549, + "logps/chosen": -449.61578369140625, + "logps/rejected": -472.8209228515625, + "loss": 0.634, + "nll_loss": 0.44445574283599854, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5263224840164185, + "rewards/margins": 0.36976680159568787, + "rewards/rejected": 0.1565556526184082, + "step": 1104 + }, + { + "epoch": 0.9611123180534434, + "grad_norm": 163.61216324705128, + "learning_rate": 7.885298685522235e-10, + "logits/chosen": 0.6613024473190308, + "logits/rejected": 0.5108257532119751, + "logps/chosen": -490.9361877441406, + "logps/rejected": -469.5235595703125, + "loss": 0.5668, + "nll_loss": 0.4810979664325714, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.41238242387771606, + "rewards/margins": 0.48797228932380676, + "rewards/rejected": -0.0755898505449295, + "step": 1106 + }, + { + "epoch": 0.9628503150119487, + "grad_norm": 225.76619571629445, + "learning_rate": 7.185585538117655e-10, + "logits/chosen": 0.19790863990783691, + "logits/rejected": 0.16061121225357056, + "logps/chosen": -456.2121276855469, + "logps/rejected": -423.03363037109375, + "loss": 0.6116, + "nll_loss": 0.4229794442653656, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.2296188473701477, + "rewards/margins": 0.2883566915988922, + "rewards/rejected": -0.0587378591299057, + "step": 1108 + }, + { + "epoch": 0.964588311970454, + "grad_norm": 126.30922488191435, + "learning_rate": 6.518264651449779e-10, + "logits/chosen": 0.3472726047039032, + "logits/rejected": 0.2529224753379822, + "logps/chosen": -464.9158630371094, + "logps/rejected": -484.1787109375, + "loss": 0.5457, + "nll_loss": 0.4409720003604889, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5092476606369019, + "rewards/margins": 0.6528787016868591, + "rewards/rejected": -0.14363110065460205, + "step": 1110 + }, + { + "epoch": 0.9663263089289593, + "grad_norm": 233.22023305294198, + "learning_rate": 5.883357797998756e-10, + "logits/chosen": 0.3325151205062866, + "logits/rejected": 0.36989954113960266, + "logps/chosen": -468.9339294433594, + "logps/rejected": -413.1026611328125, + "loss": 0.5261, + "nll_loss": 0.4565078616142273, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.5924686789512634, + "rewards/margins": 0.5389618873596191, + "rewards/rejected": 0.05350681021809578, + "step": 1112 + }, + { + "epoch": 0.9680643058874647, + "grad_norm": 129.53738085733303, + "learning_rate": 5.280885692681591e-10, + "logits/chosen": 0.4145568907260895, + "logits/rejected": 0.5612493753433228, + "logps/chosen": -485.72955322265625, + "logps/rejected": -496.85687255859375, + "loss": 0.5857, + "nll_loss": 0.4751755893230438, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6921634674072266, + "rewards/margins": 0.440424382686615, + "rewards/rejected": 0.25173911452293396, + "step": 1114 + }, + { + "epoch": 0.96980230284597, + "grad_norm": 181.23747613565504, + "learning_rate": 4.710867992176682e-10, + "logits/chosen": 0.3197920620441437, + "logits/rejected": 0.3367622494697571, + "logps/chosen": -459.18145751953125, + "logps/rejected": -447.7521057128906, + "loss": 0.557, + "nll_loss": 0.4460861384868622, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6104297041893005, + "rewards/margins": 0.2446718066930771, + "rewards/rejected": 0.36575785279273987, + "step": 1116 + }, + { + "epoch": 0.9715402998044753, + "grad_norm": 156.1161949619465, + "learning_rate": 4.173323294281994e-10, + "logits/chosen": 0.35465008020401, + "logits/rejected": 0.2980070412158966, + "logps/chosen": -550.6026611328125, + "logps/rejected": -532.0971069335938, + "loss": 0.6142, + "nll_loss": 0.5108888745307922, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1261121928691864, + "rewards/margins": 0.1764545440673828, + "rewards/rejected": -0.0503423698246479, + "step": 1118 + }, + { + "epoch": 0.9732782967629806, + "grad_norm": 162.83002983060376, + "learning_rate": 3.668269137308666e-10, + "logits/chosen": 0.5446602702140808, + "logits/rejected": 0.38520336151123047, + "logps/chosen": -495.03192138671875, + "logps/rejected": -444.5371398925781, + "loss": 0.5832, + "nll_loss": 0.4564196467399597, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3598203659057617, + "rewards/margins": 0.3889120817184448, + "rewards/rejected": -0.029091738164424896, + "step": 1120 + }, + { + "epoch": 0.975016293721486, + "grad_norm": 179.94563561598702, + "learning_rate": 3.195721999508461e-10, + "logits/chosen": 0.6219773292541504, + "logits/rejected": 0.6260953545570374, + "logps/chosen": -502.2096252441406, + "logps/rejected": -499.4460754394531, + "loss": 0.5885, + "nll_loss": 0.4690307080745697, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.24832192063331604, + "rewards/margins": 0.1444612741470337, + "rewards/rejected": 0.10386066138744354, + "step": 1122 + }, + { + "epoch": 0.9767542906799913, + "grad_norm": 126.248993284488, + "learning_rate": 2.755697298536308e-10, + "logits/chosen": 0.2676442563533783, + "logits/rejected": 0.2535380721092224, + "logps/chosen": -447.32073974609375, + "logps/rejected": -412.70037841796875, + "loss": 0.6036, + "nll_loss": 0.43544793128967285, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5221117734909058, + "rewards/margins": 0.3314821422100067, + "rewards/rejected": 0.19062969088554382, + "step": 1124 + }, + { + "epoch": 0.9784922876384966, + "grad_norm": 185.6732907788729, + "learning_rate": 2.3482093909473754e-10, + "logits/chosen": 0.2254175990819931, + "logits/rejected": 0.30273178219795227, + "logps/chosen": -470.2763671875, + "logps/rejected": -531.9412841796875, + "loss": 0.5924, + "nll_loss": 0.46973299980163574, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.7814911007881165, + "rewards/margins": 0.4317186176776886, + "rewards/rejected": 0.34977248311042786, + "step": 1126 + }, + { + "epoch": 0.980230284597002, + "grad_norm": 211.43430486296467, + "learning_rate": 1.973271571728441e-10, + "logits/chosen": 0.33309152722358704, + "logits/rejected": 0.3509593605995178, + "logps/chosen": -422.4731750488281, + "logps/rejected": -448.80963134765625, + "loss": 0.6191, + "nll_loss": 0.39795997738838196, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.48975372314453125, + "rewards/margins": 0.18807843327522278, + "rewards/rejected": 0.30167531967163086, + "step": 1128 + }, + { + "epoch": 0.9819682815555073, + "grad_norm": 158.7187074514785, + "learning_rate": 1.6308960738643517e-10, + "logits/chosen": 0.3980328142642975, + "logits/rejected": 0.22989094257354736, + "logps/chosen": -450.3606262207031, + "logps/rejected": -461.4617919921875, + "loss": 0.5816, + "nll_loss": 0.4547494351863861, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.3753512501716614, + "rewards/margins": 0.4223685562610626, + "rewards/rejected": -0.047017283737659454, + "step": 1130 + }, + { + "epoch": 0.9837062785140126, + "grad_norm": 124.14642305553735, + "learning_rate": 1.3210940679385664e-10, + "logits/chosen": 0.384761780500412, + "logits/rejected": 0.43383389711380005, + "logps/chosen": -482.262451171875, + "logps/rejected": -534.4568481445312, + "loss": 0.5631, + "nll_loss": 0.4623926281929016, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4161040484905243, + "rewards/margins": 0.5258000493049622, + "rewards/rejected": -0.10969601571559906, + "step": 1132 + }, + { + "epoch": 0.9854442754725179, + "grad_norm": 147.1639416823873, + "learning_rate": 1.0438756617691114e-10, + "logits/chosen": 0.23561853170394897, + "logits/rejected": 0.38304686546325684, + "logps/chosen": -498.98089599609375, + "logps/rejected": -500.87750244140625, + "loss": 0.5604, + "nll_loss": 0.47494742274284363, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6560043096542358, + "rewards/margins": 0.3133293092250824, + "rewards/rejected": 0.34267503023147583, + "step": 1134 + }, + { + "epoch": 0.9871822724310233, + "grad_norm": 121.0117229059167, + "learning_rate": 7.992499000785136e-11, + "logits/chosen": 0.2498825490474701, + "logits/rejected": 0.36798134446144104, + "logps/chosen": -382.92962646484375, + "logps/rejected": -492.08172607421875, + "loss": 0.5732, + "nll_loss": 0.3937234580516815, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.8302962779998779, + "rewards/margins": 0.47974082827568054, + "rewards/rejected": 0.350555419921875, + "step": 1136 + }, + { + "epoch": 0.9889202693895286, + "grad_norm": 140.92643204865465, + "learning_rate": 5.872247641987016e-11, + "logits/chosen": 0.2667164206504822, + "logits/rejected": 0.3866446614265442, + "logps/chosen": -523.2333374023438, + "logps/rejected": -505.80340576171875, + "loss": 0.5751, + "nll_loss": 0.4769275188446045, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6794913411140442, + "rewards/margins": 0.47867727279663086, + "rewards/rejected": 0.20081406831741333, + "step": 1138 + }, + { + "epoch": 0.9906582663480339, + "grad_norm": 181.8205709739755, + "learning_rate": 4.078071718107701e-11, + "logits/chosen": 0.21165776252746582, + "logits/rejected": 0.08558028936386108, + "logps/chosen": -440.87237548828125, + "logps/rejected": -425.84716796875, + "loss": 0.6279, + "nll_loss": 0.412685751914978, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.667019248008728, + "rewards/margins": 0.12272016704082489, + "rewards/rejected": 0.5442991256713867, + "step": 1140 + }, + { + "epoch": 0.9923962633065392, + "grad_norm": 106.67423477208695, + "learning_rate": 2.6100297671916016e-11, + "logits/chosen": 0.2591753900051117, + "logits/rejected": 0.35262227058410645, + "logps/chosen": -502.3717956542969, + "logps/rejected": -539.2123413085938, + "loss": 0.5405, + "nll_loss": 0.4731524884700775, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6413518190383911, + "rewards/margins": 0.619159460067749, + "rewards/rejected": 0.02219228446483612, + "step": 1142 + }, + { + "epoch": 0.9941342602650446, + "grad_norm": 123.59587852601393, + "learning_rate": 1.4681696866081228e-11, + "logits/chosen": 0.4699622094631195, + "logits/rejected": 0.380214661359787, + "logps/chosen": -515.212646484375, + "logps/rejected": -449.09814453125, + "loss": 0.6179, + "nll_loss": 0.4764086902141571, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.3445565402507782, + "rewards/margins": 0.3155711889266968, + "rewards/rejected": 0.02898530103266239, + "step": 1144 + }, + { + "epoch": 0.9958722572235499, + "grad_norm": 156.96054694461495, + "learning_rate": 6.5252873148513574e-12, + "logits/chosen": 0.21996259689331055, + "logits/rejected": 0.23748603463172913, + "logps/chosen": -400.7960510253906, + "logps/rejected": -439.2467956542969, + "loss": 0.5645, + "nll_loss": 0.40228787064552307, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.2994455397129059, + "rewards/margins": 0.2858263850212097, + "rewards/rejected": 0.013619126752018929, + "step": 1146 + }, + { + "epoch": 0.9976102541820552, + "grad_norm": 171.74798960847323, + "learning_rate": 1.6313351349883652e-12, + "logits/chosen": -0.033592335879802704, + "logits/rejected": 0.09655453264713287, + "logps/chosen": -386.03948974609375, + "logps/rejected": -446.1105041503906, + "loss": 0.531, + "nll_loss": 0.41557371616363525, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.8122502565383911, + "rewards/margins": 0.526740550994873, + "rewards/rejected": 0.2855096757411957, + "step": 1148 + }, + { + "epoch": 0.9993482511405605, + "grad_norm": 160.448811199801, + "learning_rate": 0.0, + "logits/chosen": 0.5092004537582397, + "logits/rejected": 0.5407735109329224, + "logps/chosen": -538.6259155273438, + "logps/rejected": -522.8944091796875, + "loss": 0.5529, + "nll_loss": 0.5061779618263245, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5825742483139038, + "rewards/margins": 0.24803534150123596, + "rewards/rejected": 0.33453893661499023, + "step": 1150 + } + ], + "logging_steps": 2, + "max_steps": 1150, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}