{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 594, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0016842105263157896, "grad_norm": 756.2248967148687, "kl": 0.0, "learning_rate": 0.0, "logits/chosen": -1430707968.0, "logits/rejected": -627221650.2857143, "logps/chosen": -274.7185974121094, "logps/rejected": -244.01890345982142, "loss": 0.5, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.003368421052631579, "grad_norm": 127.71874425517983, "kl": 0.0, "learning_rate": 8.333333333333334e-09, "logits/chosen": -625243200.0, "logits/rejected": -765075712.0, "logps/chosen": -214.222900390625, "logps/rejected": -240.65220642089844, "loss": 0.5, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "epoch": 0.0050526315789473685, "grad_norm": 181.90697566158104, "kl": 0.4360809326171875, "learning_rate": 1.6666666666666667e-08, "logits/chosen": -1081217792.0, "logits/rejected": -543856768.0, "logps/chosen": -203.947265625, "logps/rejected": -286.96112060546875, "loss": 0.4983, "rewards/chosen": -0.00811920315027237, "rewards/margins": -0.013331988826394081, "rewards/rejected": 0.005212785676121712, "step": 3 }, { "epoch": 0.006736842105263158, "grad_norm": 157.50669772082293, "kl": 0.939788818359375, "learning_rate": 2.5e-08, "logits/chosen": -665212416.0, "logits/rejected": -989478912.0, "logps/chosen": -212.04036458333334, "logps/rejected": -268.3646484375, "loss": 0.4997, "rewards/chosen": -0.022487640380859375, "rewards/margins": -0.013136444240808487, "rewards/rejected": -0.009351196140050888, "step": 4 }, { "epoch": 0.008421052631578947, "grad_norm": 323.0915477325882, "kl": 0.5852317810058594, "learning_rate": 3.3333333333333334e-08, "logits/chosen": -1499776170.6666667, "logits/rejected": -823883468.8, "logps/chosen": -206.73421223958334, "logps/rejected": -249.1157470703125, "loss": 0.4999, "rewards/chosen": 0.03809356689453125, "rewards/margins": 0.03527191206812859, "rewards/rejected": 0.002821654826402664, "step": 5 }, { "epoch": 0.010105263157894737, "grad_norm": 985.1109364386158, "kl": 1.2429885864257812, "learning_rate": 4.166666666666666e-08, "logits/chosen": -830737578.6666666, "logits/rejected": -1135501516.8, "logps/chosen": -179.890869140625, "logps/rejected": -192.8841796875, "loss": 0.499, "rewards/chosen": 0.012117513765891394, "rewards/margins": -0.012128396580616633, "rewards/rejected": 0.024245910346508026, "step": 6 }, { "epoch": 0.011789473684210527, "grad_norm": 122.52722890356792, "kl": 2.1596755981445312, "learning_rate": 5e-08, "logits/chosen": -917885610.6666666, "logits/rejected": -1120814080.0, "logps/chosen": -223.6461181640625, "logps/rejected": -218.6181396484375, "loss": 0.5027, "rewards/chosen": -0.0010091145522892475, "rewards/margins": -0.017569171357899906, "rewards/rejected": 0.016560056805610658, "step": 7 }, { "epoch": 0.013473684210526317, "grad_norm": 95.64200187206991, "kl": 1.8461112976074219, "learning_rate": 5.833333333333333e-08, "logits/chosen": -963475648.0, "logits/rejected": -777407829.3333334, "logps/chosen": -194.63409423828125, "logps/rejected": -209.066162109375, "loss": 0.4996, "rewards/chosen": 0.036714937537908554, "rewards/margins": 0.05407130097349484, "rewards/rejected": -0.017356363435586292, "step": 8 }, { "epoch": 0.015157894736842105, "grad_norm": 230.4658286109559, "kl": 0.5257720947265625, "learning_rate": 6.666666666666667e-08, "logits/chosen": -474723840.0, "logits/rejected": -627376426.6666666, "logps/chosen": -223.980908203125, "logps/rejected": -260.8198649088542, "loss": 0.4984, "rewards/chosen": 0.005419158190488815, "rewards/margins": 0.021833036839962006, "rewards/rejected": -0.01641387864947319, "step": 9 }, { "epoch": 0.016842105263157894, "grad_norm": 144.31561283203507, "kl": 2.7238082885742188, "learning_rate": 7.5e-08, "logits/chosen": -834623385.6, "logits/rejected": -1033011200.0, "logps/chosen": -257.28134765625, "logps/rejected": -233.40643310546875, "loss": 0.502, "rewards/chosen": 0.04759369194507599, "rewards/margins": 0.024074305097262067, "rewards/rejected": 0.023519386847813923, "step": 10 }, { "epoch": 0.018526315789473686, "grad_norm": 246.23166775055776, "kl": 0.989532470703125, "learning_rate": 8.333333333333333e-08, "logits/chosen": -933497548.8, "logits/rejected": -1285298858.6666667, "logps/chosen": -221.352392578125, "logps/rejected": -204.99224853515625, "loss": 0.4965, "rewards/chosen": 0.0369128406047821, "rewards/margins": 0.07419260541598002, "rewards/rejected": -0.037279764811197914, "step": 11 }, { "epoch": 0.020210526315789474, "grad_norm": 173.9408541060697, "kl": 0.8486251831054688, "learning_rate": 9.166666666666665e-08, "logits/chosen": -960234432.0, "logits/rejected": -981980800.0, "logps/chosen": -243.4825439453125, "logps/rejected": -170.26206970214844, "loss": 0.5027, "rewards/chosen": -0.004507828503847122, "rewards/margins": -0.03447256237268448, "rewards/rejected": 0.029964733868837357, "step": 12 }, { "epoch": 0.021894736842105262, "grad_norm": 175.56996418245757, "kl": 0.223480224609375, "learning_rate": 1e-07, "logits/chosen": -1123834752.0, "logits/rejected": -974051264.0, "logps/chosen": -217.30453491210938, "logps/rejected": -285.2402038574219, "loss": 0.498, "rewards/chosen": 0.011188508942723274, "rewards/margins": 0.04091682657599449, "rewards/rejected": -0.029728317633271217, "step": 13 }, { "epoch": 0.023578947368421053, "grad_norm": 109.85035410014211, "kl": 1.2569313049316406, "learning_rate": 1.0833333333333334e-07, "logits/chosen": -752794521.6, "logits/rejected": -981168981.3333334, "logps/chosen": -259.7815673828125, "logps/rejected": -145.32327270507812, "loss": 0.4979, "rewards/chosen": 0.027963563799858093, "rewards/margins": 0.05449656397104263, "rewards/rejected": -0.02653300017118454, "step": 14 }, { "epoch": 0.02526315789473684, "grad_norm": 112.13711806138755, "kl": 0.5673065185546875, "learning_rate": 1.1666666666666667e-07, "logits/chosen": -726509824.0, "logits/rejected": -840169088.0, "logps/chosen": -206.02989196777344, "logps/rejected": -283.2633972167969, "loss": 0.4991, "rewards/chosen": 0.005207442678511143, "rewards/margins": 0.007555770222097635, "rewards/rejected": -0.0023483275435864925, "step": 15 }, { "epoch": 0.026947368421052633, "grad_norm": 103.03187365170976, "kl": 0.21651458740234375, "learning_rate": 1.25e-07, "logits/chosen": -1134580838.4, "logits/rejected": -816441002.6666666, "logps/chosen": -255.2339111328125, "logps/rejected": -226.8826904296875, "loss": 0.5015, "rewards/chosen": -0.0194052129983902, "rewards/margins": -0.014941000193357468, "rewards/rejected": -0.00446421280503273, "step": 16 }, { "epoch": 0.02863157894736842, "grad_norm": 775.9066138659358, "kl": 1.3112754821777344, "learning_rate": 1.3333333333333334e-07, "logits/chosen": -1642847744.0, "logits/rejected": -1082994346.6666667, "logps/chosen": -223.9878692626953, "logps/rejected": -277.746337890625, "loss": 0.4999, "rewards/chosen": 0.01275329664349556, "rewards/margins": 0.02697169159849485, "rewards/rejected": -0.014218394954999289, "step": 17 }, { "epoch": 0.03031578947368421, "grad_norm": 168.9285804872629, "kl": 0.6275863647460938, "learning_rate": 1.4166666666666665e-07, "logits/chosen": -667281792.0, "logits/rejected": -773589248.0, "logps/chosen": -258.7420654296875, "logps/rejected": -229.06549072265625, "loss": 0.5024, "rewards/chosen": 8.049095049500465e-05, "rewards/margins": -0.011579894926398993, "rewards/rejected": 0.011660385876893997, "step": 18 }, { "epoch": 0.032, "grad_norm": 96.35367098860345, "kl": 0.8299026489257812, "learning_rate": 1.5e-07, "logits/chosen": -1054523520.0, "logits/rejected": -1119684096.0, "logps/chosen": -227.5456085205078, "logps/rejected": -175.5878448486328, "loss": 0.5028, "rewards/chosen": -0.006691360846161842, "rewards/margins": -0.007221984677016735, "rewards/rejected": 0.0005306238308548927, "step": 19 }, { "epoch": 0.03368421052631579, "grad_norm": 212.06858439091923, "kl": 0.49672698974609375, "learning_rate": 1.583333333333333e-07, "logits/chosen": -1220449484.8, "logits/rejected": -1335795029.3333333, "logps/chosen": -192.7220947265625, "logps/rejected": -213.80257161458334, "loss": 0.4997, "rewards/chosen": 0.028179934620857237, "rewards/margins": 0.10689647098382313, "rewards/rejected": -0.0787165363629659, "step": 20 }, { "epoch": 0.03536842105263158, "grad_norm": 142.05363612622702, "kl": 0.5618019104003906, "learning_rate": 1.6666666666666665e-07, "logits/chosen": -865751616.0, "logits/rejected": -749211562.6666666, "logps/chosen": -274.3310852050781, "logps/rejected": -161.32567342122397, "loss": 0.5039, "rewards/chosen": 0.006252288818359375, "rewards/margins": -0.04492963353792826, "rewards/rejected": 0.051181922356287636, "step": 21 }, { "epoch": 0.03705263157894737, "grad_norm": 202.39733674967437, "kl": 2.1550140380859375, "learning_rate": 1.75e-07, "logits/chosen": -852115285.3333334, "logits/rejected": -1103316787.2, "logps/chosen": -221.8100382486979, "logps/rejected": -217.199462890625, "loss": 0.4971, "rewards/chosen": 0.03007812798023224, "rewards/margins": 0.013606876134872437, "rewards/rejected": 0.016471251845359802, "step": 22 }, { "epoch": 0.03873684210526316, "grad_norm": 205.96536510497384, "kl": 0.9358940124511719, "learning_rate": 1.833333333333333e-07, "logits/chosen": -1277413580.8, "logits/rejected": -971898112.0, "logps/chosen": -217.6686767578125, "logps/rejected": -276.12188720703125, "loss": 0.5003, "rewards/chosen": 0.022262269258499147, "rewards/margins": 0.0022859687606493657, "rewards/rejected": 0.01997630049784978, "step": 23 }, { "epoch": 0.04042105263157895, "grad_norm": 97.86200798922602, "kl": 1.6557350158691406, "learning_rate": 1.9166666666666668e-07, "logits/chosen": -837048012.8, "logits/rejected": -1441815552.0, "logps/chosen": -212.37763671875, "logps/rejected": -136.08146158854166, "loss": 0.5045, "rewards/chosen": -0.0468679815530777, "rewards/margins": -0.06955506652593613, "rewards/rejected": 0.02268708497285843, "step": 24 }, { "epoch": 0.042105263157894736, "grad_norm": 146.9171587034632, "kl": 1.9702682495117188, "learning_rate": 2e-07, "logits/chosen": -692624256.0, "logits/rejected": -764641536.0, "logps/chosen": -240.31827799479166, "logps/rejected": -236.6541015625, "loss": 0.5038, "rewards/chosen": -0.0033594767252604165, "rewards/margins": 0.008408406128485998, "rewards/rejected": -0.011767882853746414, "step": 25 }, { "epoch": 0.043789473684210524, "grad_norm": 372.015222658044, "kl": 2.9479598999023438, "learning_rate": 2.0833333333333333e-07, "logits/chosen": -836751744.0, "logits/rejected": -1142913280.0, "logps/chosen": -212.76544189453125, "logps/rejected": -166.8655548095703, "loss": 0.4997, "rewards/chosen": -0.01788177713751793, "rewards/margins": -0.06701736897230148, "rewards/rejected": 0.049135591834783554, "step": 26 }, { "epoch": 0.04547368421052632, "grad_norm": 173.67968690260923, "kl": 0.8008079528808594, "learning_rate": 2.1666666666666667e-07, "logits/chosen": -796426240.0, "logits/rejected": -697571200.0, "logps/chosen": -200.24410400390624, "logps/rejected": -241.0162556966146, "loss": 0.4962, "rewards/chosen": 0.07569732666015624, "rewards/margins": 0.07425994885464508, "rewards/rejected": 0.0014373778055111568, "step": 27 }, { "epoch": 0.04715789473684211, "grad_norm": 179.48473623203154, "kl": 1.4168357849121094, "learning_rate": 2.25e-07, "logits/chosen": -744388736.0, "logits/rejected": -863063040.0, "logps/chosen": -290.7626953125, "logps/rejected": -231.60797119140625, "loss": 0.504, "rewards/chosen": -0.019368745386600494, "rewards/margins": -0.058126453310251236, "rewards/rejected": 0.03875770792365074, "step": 28 }, { "epoch": 0.048842105263157895, "grad_norm": 130.22057137941954, "kl": 1.3458938598632812, "learning_rate": 2.3333333333333333e-07, "logits/chosen": -691689472.0, "logits/rejected": -918764885.3333334, "logps/chosen": -209.3045654296875, "logps/rejected": -241.4656982421875, "loss": 0.4978, "rewards/chosen": 0.07106705009937286, "rewards/margins": 0.06067454318205515, "rewards/rejected": 0.010392506917317709, "step": 29 }, { "epoch": 0.05052631578947368, "grad_norm": 130.9807773433376, "kl": 0.4033660888671875, "learning_rate": 2.4166666666666665e-07, "logits/chosen": -920996224.0, "logits/rejected": -841631424.0, "logps/chosen": -190.30531311035156, "logps/rejected": -162.928466796875, "loss": 0.4982, "rewards/chosen": 0.035247232764959335, "rewards/margins": 0.06811809912323952, "rewards/rejected": -0.03287086635828018, "step": 30 }, { "epoch": 0.05221052631578947, "grad_norm": 76.30157221981797, "kl": 0.47165679931640625, "learning_rate": 2.5e-07, "logits/chosen": -1194017920.0, "logits/rejected": -877919317.3333334, "logps/chosen": -306.70257568359375, "logps/rejected": -252.0715535481771, "loss": 0.4996, "rewards/chosen": 0.04151611402630806, "rewards/margins": 0.03829981635014216, "rewards/rejected": 0.003216297676165899, "step": 31 }, { "epoch": 0.053894736842105266, "grad_norm": 102.32572244854322, "kl": 2.0101585388183594, "learning_rate": 2.5833333333333333e-07, "logits/chosen": -944446336.0, "logits/rejected": -1106841088.0, "logps/chosen": -199.4850311279297, "logps/rejected": -183.14755249023438, "loss": 0.5005, "rewards/chosen": 0.046735379844903946, "rewards/margins": -0.004496384412050247, "rewards/rejected": 0.05123176425695419, "step": 32 }, { "epoch": 0.055578947368421054, "grad_norm": 99.04470417971368, "kl": 2.4130401611328125, "learning_rate": 2.6666666666666667e-07, "logits/chosen": -825663078.4, "logits/rejected": -991676245.3333334, "logps/chosen": -220.846435546875, "logps/rejected": -356.6270751953125, "loss": 0.4947, "rewards/chosen": 0.052228087186813356, "rewards/margins": 0.03795552278558413, "rewards/rejected": 0.014272564401229223, "step": 33 }, { "epoch": 0.05726315789473684, "grad_norm": 246.96656404040365, "kl": 1.233936071395874, "learning_rate": 2.75e-07, "logits/chosen": -1095287296.0, "logits/rejected": -1060861504.0, "logps/chosen": -182.49984741210938, "logps/rejected": -234.69015502929688, "loss": 0.4936, "rewards/chosen": 0.06470909714698792, "rewards/margins": 0.04259873181581497, "rewards/rejected": 0.022110365331172943, "step": 34 }, { "epoch": 0.05894736842105263, "grad_norm": 152.73517140204396, "kl": 2.7289276123046875, "learning_rate": 2.833333333333333e-07, "logits/chosen": -1097634133.3333333, "logits/rejected": -1259796582.4, "logps/chosen": -202.9609375, "logps/rejected": -189.42052001953124, "loss": 0.4999, "rewards/chosen": 0.01765136917432149, "rewards/margins": -0.034761963287989306, "rewards/rejected": 0.052413332462310794, "step": 35 }, { "epoch": 0.06063157894736842, "grad_norm": 215.293974300091, "kl": 2.45306396484375, "learning_rate": 2.916666666666667e-07, "logits/chosen": -631062869.3333334, "logits/rejected": -987482112.0, "logps/chosen": -213.41678873697916, "logps/rejected": -221.72783203125, "loss": 0.5006, "rewards/chosen": 0.04089965671300888, "rewards/margins": -0.024423225224018102, "rewards/rejected": 0.06532288193702698, "step": 36 }, { "epoch": 0.06231578947368421, "grad_norm": 195.90220199948243, "kl": 2.9997177124023438, "learning_rate": 3e-07, "logits/chosen": -848359219.2, "logits/rejected": -1426793557.3333333, "logps/chosen": -195.2117431640625, "logps/rejected": -180.9798787434896, "loss": 0.4993, "rewards/chosen": 0.08168579339981079, "rewards/margins": 0.04379872928063074, "rewards/rejected": 0.037887064119180046, "step": 37 }, { "epoch": 0.064, "grad_norm": 121.58903894060508, "kl": 2.0522327423095703, "learning_rate": 3.0833333333333333e-07, "logits/chosen": -1282800435.2, "logits/rejected": -435072682.6666667, "logps/chosen": -226.767138671875, "logps/rejected": -249.10555013020834, "loss": 0.4959, "rewards/chosen": 0.11044464111328126, "rewards/margins": 0.0951441449423631, "rewards/rejected": 0.015300496170918146, "step": 38 }, { "epoch": 0.06568421052631579, "grad_norm": 116.69257759601491, "kl": 2.5439231395721436, "learning_rate": 3.166666666666666e-07, "logits/chosen": -557800192.0, "logits/rejected": -930213376.0, "logps/chosen": -211.4619140625, "logps/rejected": -234.09031677246094, "loss": 0.4983, "rewards/chosen": 0.09951868653297424, "rewards/margins": 0.06241312623023987, "rewards/rejected": 0.037105560302734375, "step": 39 }, { "epoch": 0.06736842105263158, "grad_norm": 365.02933086523586, "kl": 2.2659378051757812, "learning_rate": 3.25e-07, "logits/chosen": -965182668.8, "logits/rejected": -642834560.0, "logps/chosen": -220.280224609375, "logps/rejected": -222.31856282552084, "loss": 0.4953, "rewards/chosen": 0.09896668195724487, "rewards/margins": 0.024154870708783457, "rewards/rejected": 0.07481181124846141, "step": 40 }, { "epoch": 0.06905263157894737, "grad_norm": 582.132852175163, "kl": 2.304011344909668, "learning_rate": 3.333333333333333e-07, "logits/chosen": -885514444.8, "logits/rejected": -748998314.6666666, "logps/chosen": -227.1810302734375, "logps/rejected": -196.22355143229166, "loss": 0.4766, "rewards/chosen": 0.18464019298553466, "rewards/margins": 0.5814312537511189, "rewards/rejected": -0.3967910607655843, "step": 41 }, { "epoch": 0.07073684210526315, "grad_norm": 168.55393612476047, "kl": 2.7851181030273438, "learning_rate": 3.4166666666666664e-07, "logits/chosen": -1012587946.6666666, "logits/rejected": -900292300.8, "logps/chosen": -227.2638956705729, "logps/rejected": -187.71868896484375, "loss": 0.4928, "rewards/chosen": 0.1678065061569214, "rewards/margins": 0.08132548332214355, "rewards/rejected": 0.08648102283477783, "step": 42 }, { "epoch": 0.07242105263157894, "grad_norm": 236.97744686773765, "kl": 3.527801513671875, "learning_rate": 3.5e-07, "logits/chosen": -970476885.3333334, "logits/rejected": -398392224.0, "logps/chosen": -196.3362019856771, "logps/rejected": -353.429443359375, "loss": 0.4969, "rewards/chosen": 0.08341166377067566, "rewards/margins": 0.04898173362016678, "rewards/rejected": 0.03442993015050888, "step": 43 }, { "epoch": 0.07410526315789474, "grad_norm": 251.79089726814883, "kl": 4.6118011474609375, "learning_rate": 3.583333333333333e-07, "logits/chosen": -944620885.3333334, "logits/rejected": -546614579.2, "logps/chosen": -229.73470052083334, "logps/rejected": -211.629931640625, "loss": 0.4902, "rewards/chosen": 0.1472813884417216, "rewards/margins": 0.1069905529419581, "rewards/rejected": 0.04029083549976349, "step": 44 }, { "epoch": 0.07578947368421053, "grad_norm": 115.12623939626701, "kl": 4.802654266357422, "learning_rate": 3.666666666666666e-07, "logits/chosen": -1199768166.4, "logits/rejected": -1026606506.6666666, "logps/chosen": -250.584716796875, "logps/rejected": -256.4547932942708, "loss": 0.4888, "rewards/chosen": 0.15660583972930908, "rewards/margins": 0.056487321853637695, "rewards/rejected": 0.10011851787567139, "step": 45 }, { "epoch": 0.07747368421052632, "grad_norm": 147.48000207541645, "kl": 2.1971359252929688, "learning_rate": 3.75e-07, "logits/chosen": -779579904.0, "logits/rejected": -777894058.6666666, "logps/chosen": -234.33892822265625, "logps/rejected": -279.74733479817706, "loss": 0.4941, "rewards/chosen": 0.2183128446340561, "rewards/margins": 0.09147873024145761, "rewards/rejected": 0.12683411439259848, "step": 46 }, { "epoch": 0.07915789473684211, "grad_norm": 172.83190668649902, "kl": 5.049167633056641, "learning_rate": 3.8333333333333335e-07, "logits/chosen": -911320192.0, "logits/rejected": -1069644800.0, "logps/chosen": -215.0435791015625, "logps/rejected": -261.28668212890625, "loss": 0.4947, "rewards/chosen": 0.17158202826976776, "rewards/margins": 0.06581267714500427, "rewards/rejected": 0.10576935112476349, "step": 47 }, { "epoch": 0.0808421052631579, "grad_norm": 135.23703398521903, "kl": 3.5211753845214844, "learning_rate": 3.9166666666666664e-07, "logits/chosen": -1736729472.0, "logits/rejected": -939244617.1428572, "logps/chosen": -226.9305877685547, "logps/rejected": -232.20169503348214, "loss": 0.4972, "rewards/chosen": 0.16886138916015625, "rewards/margins": 0.05592117990766253, "rewards/rejected": 0.11294020925249372, "step": 48 }, { "epoch": 0.08252631578947368, "grad_norm": 87.76901964257267, "kl": 5.832710266113281, "learning_rate": 4e-07, "logits/chosen": -794667622.4, "logits/rejected": -622882474.6666666, "logps/chosen": -230.16806640625, "logps/rejected": -224.6505330403646, "loss": 0.4913, "rewards/chosen": 0.2949032783508301, "rewards/margins": 0.20258251825968426, "rewards/rejected": 0.09232076009114583, "step": 49 }, { "epoch": 0.08421052631578947, "grad_norm": 131.7748262627241, "kl": 4.04681396484375, "learning_rate": 4.083333333333333e-07, "logits/chosen": -1042156714.6666666, "logits/rejected": -956070912.0, "logps/chosen": -220.65376790364584, "logps/rejected": -351.86298828125, "loss": 0.4921, "rewards/chosen": 0.1149485210577647, "rewards/margins": -0.044381918509801235, "rewards/rejected": 0.15933043956756593, "step": 50 }, { "epoch": 0.08589473684210526, "grad_norm": 93.7193003978435, "kl": 5.549095153808594, "learning_rate": 4.1666666666666667e-07, "logits/chosen": -913337088.0, "logits/rejected": -1463801514.6666667, "logps/chosen": -234.42970275878906, "logps/rejected": -256.32749430338544, "loss": 0.5037, "rewards/chosen": 0.12055359780788422, "rewards/margins": 0.05089595417181651, "rewards/rejected": 0.0696576436360677, "step": 51 }, { "epoch": 0.08757894736842105, "grad_norm": 124.37458019612187, "kl": 5.329154968261719, "learning_rate": 4.2499999999999995e-07, "logits/chosen": -769787840.0, "logits/rejected": -873242752.0, "logps/chosen": -228.21946716308594, "logps/rejected": -178.02896118164062, "loss": 0.5018, "rewards/chosen": 0.1875530183315277, "rewards/margins": -0.029434025287628174, "rewards/rejected": 0.21698704361915588, "step": 52 }, { "epoch": 0.08926315789473684, "grad_norm": 151.7193063462552, "kl": 2.5696582794189453, "learning_rate": 4.3333333333333335e-07, "logits/chosen": -966253760.0, "logits/rejected": -818051840.0, "logps/chosen": -210.7382354736328, "logps/rejected": -210.70034790039062, "loss": 0.4948, "rewards/chosen": 0.09387054294347763, "rewards/margins": -0.11254673451185226, "rewards/rejected": 0.2064172774553299, "step": 53 }, { "epoch": 0.09094736842105264, "grad_norm": 113.57346599809466, "kl": 2.8759994506835938, "learning_rate": 4.4166666666666664e-07, "logits/chosen": -936665600.0, "logits/rejected": -744050602.6666666, "logps/chosen": -210.92718505859375, "logps/rejected": -213.8590087890625, "loss": 0.4893, "rewards/chosen": 0.17248383164405823, "rewards/margins": -0.09370574355125427, "rewards/rejected": 0.2661895751953125, "step": 54 }, { "epoch": 0.09263157894736843, "grad_norm": 174.61803322959847, "kl": 4.2318267822265625, "learning_rate": 4.5e-07, "logits/chosen": -677702400.0, "logits/rejected": -692814116.5714285, "logps/chosen": -218.57711791992188, "logps/rejected": -213.56180245535714, "loss": 0.4879, "rewards/chosen": 0.4497177302837372, "rewards/margins": 0.41578697732516695, "rewards/rejected": 0.03393075295857021, "step": 55 }, { "epoch": 0.09431578947368421, "grad_norm": 147.27747224286998, "kl": 3.260711669921875, "learning_rate": 4.5833333333333327e-07, "logits/chosen": -1289244544.0, "logits/rejected": -916406464.0, "logps/chosen": -246.1941375732422, "logps/rejected": -253.98768615722656, "loss": 0.4921, "rewards/chosen": 0.16882018744945526, "rewards/margins": -0.14416618645191193, "rewards/rejected": 0.3129863739013672, "step": 56 }, { "epoch": 0.096, "grad_norm": 95.08985748052072, "kl": 1.6493759155273438, "learning_rate": 4.6666666666666666e-07, "logits/chosen": -733326400.0, "logits/rejected": -922427392.0, "logps/chosen": -178.46246337890625, "logps/rejected": -258.2501627604167, "loss": 0.4867, "rewards/chosen": 0.3112235963344574, "rewards/margins": 0.17294743657112122, "rewards/rejected": 0.13827615976333618, "step": 57 }, { "epoch": 0.09768421052631579, "grad_norm": 133.57090255911547, "kl": 3.168426513671875, "learning_rate": 4.7499999999999995e-07, "logits/chosen": -767859712.0, "logits/rejected": -977725248.0, "logps/chosen": -228.76634216308594, "logps/rejected": -284.31427001953125, "loss": 0.4985, "rewards/chosen": 0.1971721649169922, "rewards/margins": 0.03166121244430542, "rewards/rejected": 0.16551095247268677, "step": 58 }, { "epoch": 0.09936842105263158, "grad_norm": 108.0916864306588, "kl": 0.6577796936035156, "learning_rate": 4.833333333333333e-07, "logits/chosen": -1173459148.8, "logits/rejected": -655583232.0, "logps/chosen": -196.4993408203125, "logps/rejected": -219.53653971354166, "loss": 0.4757, "rewards/chosen": 0.23870911598205566, "rewards/margins": 0.28514466087023416, "rewards/rejected": -0.046435544888178505, "step": 59 }, { "epoch": 0.10105263157894737, "grad_norm": 87.62449313729059, "kl": 2.194793701171875, "learning_rate": 4.916666666666666e-07, "logits/chosen": -1028928256.0, "logits/rejected": -540835200.0, "logps/chosen": -227.88580322265625, "logps/rejected": -234.1096649169922, "loss": 0.45, "rewards/chosen": 0.5302422046661377, "rewards/margins": 1.1665672659873962, "rewards/rejected": -0.6363250613212585, "step": 60 }, { "epoch": 0.10273684210526315, "grad_norm": 119.47945666482947, "kl": 1.5802383422851562, "learning_rate": 5e-07, "logits/chosen": -1018257536.0, "logits/rejected": -1083207040.0, "logps/chosen": -222.7873992919922, "logps/rejected": -275.23138427734375, "loss": 0.5044, "rewards/chosen": 0.32063254714012146, "rewards/margins": 0.10322417318820953, "rewards/rejected": 0.21740837395191193, "step": 61 }, { "epoch": 0.10442105263157894, "grad_norm": 114.74202024192041, "kl": 1.9775314331054688, "learning_rate": 4.999956736067562e-07, "logits/chosen": -720981299.2, "logits/rejected": -543601152.0, "logps/chosen": -240.6268798828125, "logps/rejected": -207.02860514322916, "loss": 0.4983, "rewards/chosen": 0.25333800315856936, "rewards/margins": -0.036342994372049964, "rewards/rejected": 0.2896809975306193, "step": 62 }, { "epoch": 0.10610526315789473, "grad_norm": 77.65993832754032, "kl": 0.7235641479492188, "learning_rate": 4.999826945767664e-07, "logits/chosen": -692212443.4285715, "logits/rejected": -1071096576.0, "logps/chosen": -255.70343889508928, "logps/rejected": -259.9676208496094, "loss": 0.5084, "rewards/chosen": 0.2564025947025844, "rewards/margins": -0.06277771506990704, "rewards/rejected": 0.31918030977249146, "step": 63 }, { "epoch": 0.10778947368421053, "grad_norm": 101.9412698769531, "kl": 2.6254653930664062, "learning_rate": 4.999610633592496e-07, "logits/chosen": -1115974656.0, "logits/rejected": -880023961.6, "logps/chosen": -227.8680419921875, "logps/rejected": -210.0842041015625, "loss": 0.4639, "rewards/chosen": 0.1862970987955729, "rewards/margins": 0.7373786608378092, "rewards/rejected": -0.5510815620422364, "step": 64 }, { "epoch": 0.10947368421052632, "grad_norm": 83.26215203358481, "kl": 0.0, "learning_rate": 4.99930780702887e-07, "logits/chosen": -2037875712.0, "logits/rejected": -891881472.0, "logps/chosen": -298.9163818359375, "logps/rejected": -212.7870076497396, "loss": 0.5016, "rewards/chosen": 0.0009033232927322388, "rewards/margins": -0.28533248603343964, "rewards/rejected": 0.2862358093261719, "step": 65 }, { "epoch": 0.11115789473684211, "grad_norm": 65.37492119822153, "kl": 4.6044464111328125, "learning_rate": 4.998918476557962e-07, "logits/chosen": -1163713365.3333333, "logits/rejected": -717799936.0, "logps/chosen": -291.79477945963544, "logps/rejected": -137.73323974609374, "loss": 0.4896, "rewards/chosen": 0.2572901447614034, "rewards/margins": 0.1486562927563985, "rewards/rejected": 0.10863385200500489, "step": 66 }, { "epoch": 0.1128421052631579, "grad_norm": 108.123906294101, "kl": 0.8921241760253906, "learning_rate": 4.998442655654946e-07, "logits/chosen": -1018941312.0, "logits/rejected": -723264512.0, "logps/chosen": -227.9298553466797, "logps/rejected": -196.53343200683594, "loss": 0.4936, "rewards/chosen": 0.1972118318080902, "rewards/margins": -0.028878778219223022, "rewards/rejected": 0.22609061002731323, "step": 67 }, { "epoch": 0.11452631578947368, "grad_norm": 154.7633558965442, "kl": 3.6258544921875, "learning_rate": 4.997880360788526e-07, "logits/chosen": -1207795302.4, "logits/rejected": -640148437.3333334, "logps/chosen": -207.35986328125, "logps/rejected": -239.59635416666666, "loss": 0.4908, "rewards/chosen": 0.24745573997497558, "rewards/margins": 0.09161568085352578, "rewards/rejected": 0.1558400591214498, "step": 68 }, { "epoch": 0.11621052631578947, "grad_norm": 189.59978273083877, "kl": 1.4420166015625, "learning_rate": 4.997231611420373e-07, "logits/chosen": -722635929.6, "logits/rejected": -1086283520.0, "logps/chosen": -205.415087890625, "logps/rejected": -223.599853515625, "loss": 0.4755, "rewards/chosen": 0.2955571174621582, "rewards/margins": 0.23686978320280708, "rewards/rejected": 0.0586873342593511, "step": 69 }, { "epoch": 0.11789473684210526, "grad_norm": 82.88808037398137, "kl": 0.0, "learning_rate": 4.996496430004445e-07, "logits/chosen": -894456704.0, "logits/rejected": -1008478336.0, "logps/chosen": -197.12306213378906, "logps/rejected": -189.72410583496094, "loss": 0.4798, "rewards/chosen": 0.5647777318954468, "rewards/margins": 0.4188384860754013, "rewards/rejected": 0.14593924582004547, "step": 70 }, { "epoch": 0.11957894736842105, "grad_norm": 116.22452277931659, "kl": 0.0, "learning_rate": 4.995674841986217e-07, "logits/chosen": -1016096426.6666666, "logits/rejected": -340471296.0, "logps/chosen": -235.82279459635416, "logps/rejected": -181.5946807861328, "loss": 0.4523, "rewards/chosen": 0.3477465311686198, "rewards/margins": 0.14004083971182507, "rewards/rejected": 0.20770569145679474, "step": 71 }, { "epoch": 0.12126315789473684, "grad_norm": 169.23180648930278, "kl": 2.728069305419922, "learning_rate": 4.994766875801788e-07, "logits/chosen": -802654592.0, "logits/rejected": -754744960.0, "logps/chosen": -195.60926818847656, "logps/rejected": -193.22048950195312, "loss": 0.5093, "rewards/chosen": 0.20827637612819672, "rewards/margins": 0.17099057137966156, "rewards/rejected": 0.037285804748535156, "step": 72 }, { "epoch": 0.12294736842105262, "grad_norm": 97.86580017412098, "kl": 1.1262283325195312, "learning_rate": 4.993772562876909e-07, "logits/chosen": -791456000.0, "logits/rejected": -912904704.0, "logps/chosen": -172.02268981933594, "logps/rejected": -196.89213053385416, "loss": 0.4554, "rewards/chosen": 0.7986927032470703, "rewards/margins": 1.128045916557312, "rewards/rejected": -0.3293532133102417, "step": 73 }, { "epoch": 0.12463157894736843, "grad_norm": 87.12336042753508, "kl": 0.322601318359375, "learning_rate": 4.992691937625891e-07, "logits/chosen": -579524224.0, "logits/rejected": -731365120.0, "logps/chosen": -232.52737426757812, "logps/rejected": -234.31417846679688, "loss": 0.4493, "rewards/chosen": 0.24241486191749573, "rewards/margins": 1.9988974630832672, "rewards/rejected": -1.7564826011657715, "step": 74 }, { "epoch": 0.12631578947368421, "grad_norm": 72.27499509744034, "kl": 0.0, "learning_rate": 4.991525037450412e-07, "logits/chosen": -883592192.0, "logits/rejected": -728281088.0, "logps/chosen": -225.2747599283854, "logps/rejected": -199.8900146484375, "loss": 0.4878, "rewards/chosen": 0.1509552001953125, "rewards/margins": 0.05537720024585724, "rewards/rejected": 0.09557799994945526, "step": 75 }, { "epoch": 0.128, "grad_norm": 107.49035218888741, "kl": 0.0, "learning_rate": 4.990271902738222e-07, "logits/chosen": -625081190.4, "logits/rejected": -646702250.6666666, "logps/chosen": -232.6982421875, "logps/rejected": -326.85638427734375, "loss": 0.45, "rewards/chosen": 0.23982207775115966, "rewards/margins": 0.3932990511258443, "rewards/rejected": -0.15347697337468466, "step": 76 }, { "epoch": 0.1296842105263158, "grad_norm": 101.17758060728043, "kl": 0.0, "learning_rate": 4.988932576861752e-07, "logits/chosen": -1040184640.0, "logits/rejected": -690037376.0, "logps/chosen": -184.14572143554688, "logps/rejected": -228.58084106445312, "loss": 0.5148, "rewards/chosen": 0.24517688155174255, "rewards/margins": 0.009785279631614685, "rewards/rejected": 0.23539160192012787, "step": 77 }, { "epoch": 0.13136842105263158, "grad_norm": 84.42483041363506, "kl": 0.3793144226074219, "learning_rate": 4.987507106176606e-07, "logits/chosen": -611603046.4, "logits/rejected": -684819114.6666666, "logps/chosen": -241.2910400390625, "logps/rejected": -251.3832804361979, "loss": 0.4885, "rewards/chosen": 0.5874441623687744, "rewards/margins": -0.2702706495920817, "rewards/rejected": 0.8577148119608561, "step": 78 }, { "epoch": 0.13305263157894737, "grad_norm": 123.42412793246983, "kl": 0.0, "learning_rate": 4.985995540019955e-07, "logits/chosen": -902578752.0, "logits/rejected": -1213336917.3333333, "logps/chosen": -271.75140380859375, "logps/rejected": -266.7210286458333, "loss": 0.4488, "rewards/chosen": 0.22760315239429474, "rewards/margins": -0.06182275712490082, "rewards/rejected": 0.28942590951919556, "step": 79 }, { "epoch": 0.13473684210526315, "grad_norm": 70.07470086670996, "kl": 0.5335845947265625, "learning_rate": 4.984397930708837e-07, "logits/chosen": -737288960.0, "logits/rejected": -944431616.0, "logps/chosen": -198.37362670898438, "logps/rejected": -224.55250549316406, "loss": 0.4547, "rewards/chosen": 0.5928070545196533, "rewards/margins": 0.4272579848766327, "rewards/rejected": 0.16554906964302063, "step": 80 }, { "epoch": 0.13642105263157894, "grad_norm": 66.8382583080533, "kl": 0.0, "learning_rate": 4.982714333538342e-07, "logits/chosen": -604147353.6, "logits/rejected": -792211114.6666666, "logps/chosen": -264.7359375, "logps/rejected": -207.922119140625, "loss": 0.502, "rewards/chosen": 0.11271027326583863, "rewards/margins": -0.04983551700909933, "rewards/rejected": 0.16254579027493796, "step": 81 }, { "epoch": 0.13810526315789473, "grad_norm": 90.56573316497496, "kl": 1.1843795776367188, "learning_rate": 4.980944806779698e-07, "logits/chosen": -994012032.0, "logits/rejected": -756074154.6666666, "logps/chosen": -211.22325134277344, "logps/rejected": -273.8466389973958, "loss": 0.503, "rewards/chosen": 0.21327057480812073, "rewards/margins": 0.3816507955392202, "rewards/rejected": -0.16838022073109946, "step": 82 }, { "epoch": 0.13978947368421052, "grad_norm": 95.89588780633147, "kl": 0.0, "learning_rate": 4.979089411678251e-07, "logits/chosen": -1088499200.0, "logits/rejected": -580556672.0, "logps/chosen": -201.80905151367188, "logps/rejected": -149.05142211914062, "loss": 0.4614, "rewards/chosen": 0.5321415066719055, "rewards/margins": 0.3093837797641754, "rewards/rejected": 0.2227577269077301, "step": 83 }, { "epoch": 0.1414736842105263, "grad_norm": 84.70589504268058, "kl": 0.0, "learning_rate": 4.977148212451354e-07, "logits/chosen": -1006119296.0, "logits/rejected": -702025600.0, "logps/chosen": -220.37423706054688, "logps/rejected": -213.92950439453125, "loss": 0.4784, "rewards/chosen": 0.37330934405326843, "rewards/margins": -0.14478299021720886, "rewards/rejected": 0.5180923342704773, "step": 84 }, { "epoch": 0.1431578947368421, "grad_norm": 80.24131984783912, "kl": 0.18048095703125, "learning_rate": 4.975121276286136e-07, "logits/chosen": -574469888.0, "logits/rejected": -1207006720.0, "logps/chosen": -224.60697428385416, "logps/rejected": -230.05050659179688, "loss": 0.4587, "rewards/chosen": 0.3602706988652547, "rewards/margins": 0.06148912509282428, "rewards/rejected": 0.2987815737724304, "step": 85 }, { "epoch": 0.14484210526315788, "grad_norm": 119.4817380814399, "kl": 0.0, "learning_rate": 4.97300867333718e-07, "logits/chosen": -559665344.0, "logits/rejected": -1335321600.0, "logps/chosen": -246.3226318359375, "logps/rejected": -240.66729736328125, "loss": 0.4545, "rewards/chosen": 0.27413901686668396, "rewards/margins": 0.11862906813621521, "rewards/rejected": 0.15550994873046875, "step": 86 }, { "epoch": 0.14652631578947367, "grad_norm": 76.72262570711749, "kl": 0.0, "learning_rate": 4.970810476724096e-07, "logits/chosen": -794261632.0, "logits/rejected": -888321194.6666666, "logps/chosen": -223.9755096435547, "logps/rejected": -230.53841145833334, "loss": 0.4915, "rewards/chosen": 0.6259682178497314, "rewards/margins": 0.16484379768371582, "rewards/rejected": 0.4611244201660156, "step": 87 }, { "epoch": 0.1482105263157895, "grad_norm": 103.45011997542589, "kl": 0.6499404907226562, "learning_rate": 4.968526762528988e-07, "logits/chosen": -780058624.0, "logits/rejected": -916248371.2, "logps/chosen": -256.363525390625, "logps/rejected": -207.6240234375, "loss": 0.4831, "rewards/chosen": 0.28617451588312787, "rewards/margins": 0.07391071716944378, "rewards/rejected": 0.2122637987136841, "step": 88 }, { "epoch": 0.14989473684210528, "grad_norm": 215.23670048541007, "kl": 0.0, "learning_rate": 4.96615760979382e-07, "logits/chosen": -637054976.0, "logits/rejected": -924139712.0, "logps/chosen": -238.05352783203125, "logps/rejected": -168.34219360351562, "loss": 0.4829, "rewards/chosen": 0.5608859062194824, "rewards/margins": -0.13643085956573486, "rewards/rejected": 0.6973167657852173, "step": 89 }, { "epoch": 0.15157894736842106, "grad_norm": 76.57882355799092, "kl": 0.0, "learning_rate": 4.963703100517683e-07, "logits/chosen": -1086607488.0, "logits/rejected": -987515264.0, "logps/chosen": -204.48553466796875, "logps/rejected": -251.81475830078125, "loss": 0.4542, "rewards/chosen": 0.7157703638076782, "rewards/margins": 1.3066982626914978, "rewards/rejected": -0.5909278988838196, "step": 90 }, { "epoch": 0.15326315789473685, "grad_norm": 85.3247850434762, "kl": 0.0, "learning_rate": 4.961163319653958e-07, "logits/chosen": -1008856704.0, "logits/rejected": -618826112.0, "logps/chosen": -187.7606201171875, "logps/rejected": -191.36654663085938, "loss": 0.4556, "rewards/chosen": 0.2826835811138153, "rewards/margins": -0.048234134912490845, "rewards/rejected": 0.33091771602630615, "step": 91 }, { "epoch": 0.15494736842105264, "grad_norm": 172.74293893285517, "kl": 0.0, "learning_rate": 4.958538355107369e-07, "logits/chosen": -771198272.0, "logits/rejected": -911799936.0, "logps/chosen": -198.0313262939453, "logps/rejected": -243.49655151367188, "loss": 0.4021, "rewards/chosen": 0.537731945514679, "rewards/margins": 1.0454849600791931, "rewards/rejected": -0.5077530145645142, "step": 92 }, { "epoch": 0.15663157894736843, "grad_norm": 115.09597490195979, "kl": 0.0, "learning_rate": 4.955828297730948e-07, "logits/chosen": -578800576.0, "logits/rejected": -940042752.0, "logps/chosen": -166.36688232421875, "logps/rejected": -236.39175415039062, "loss": 0.4575, "rewards/chosen": 0.2515438199043274, "rewards/margins": 0.63800048828125, "rewards/rejected": -0.3864566683769226, "step": 93 }, { "epoch": 0.15831578947368422, "grad_norm": 189.22015271094992, "kl": 0.0, "learning_rate": 4.953033241322886e-07, "logits/chosen": -971949875.2, "logits/rejected": -642591701.3333334, "logps/chosen": -208.137646484375, "logps/rejected": -267.102294921875, "loss": 0.4444, "rewards/chosen": 0.30209686756134035, "rewards/margins": 0.15472238659858706, "rewards/rejected": 0.1473744809627533, "step": 94 }, { "epoch": 0.16, "grad_norm": 151.69924056862672, "kl": 0.8690567016601562, "learning_rate": 4.950153282623288e-07, "logits/chosen": -929138029.7142857, "logits/rejected": -305715200.0, "logps/chosen": -243.89243861607142, "logps/rejected": -234.21505737304688, "loss": 0.4272, "rewards/chosen": 1.003729956490653, "rewards/margins": 1.042243140084403, "rewards/rejected": -0.03851318359375, "step": 95 }, { "epoch": 0.1616842105263158, "grad_norm": 117.75406936780551, "kl": 0.0, "learning_rate": 4.947188521310827e-07, "logits/chosen": -1078521088.0, "logits/rejected": -754925696.0, "logps/chosen": -241.37603759765625, "logps/rejected": -246.62840270996094, "loss": 0.468, "rewards/chosen": 0.06325645744800568, "rewards/margins": 2.5708944350481033, "rewards/rejected": -2.5076379776000977, "step": 96 }, { "epoch": 0.16336842105263158, "grad_norm": 92.90737005905727, "kl": 0.0, "learning_rate": 4.944139059999286e-07, "logits/chosen": -939986624.0, "logits/rejected": -757044608.0, "logps/chosen": -255.14584350585938, "logps/rejected": -217.84385681152344, "loss": 0.4852, "rewards/chosen": 0.14831504225730896, "rewards/margins": -0.12935563921928406, "rewards/rejected": 0.277670681476593, "step": 97 }, { "epoch": 0.16505263157894737, "grad_norm": 75.87889683822247, "kl": 2.0634384155273438, "learning_rate": 4.941005004234018e-07, "logits/chosen": -674378444.8, "logits/rejected": -790328490.6666666, "logps/chosen": -222.7276123046875, "logps/rejected": -319.3123372395833, "loss": 0.4712, "rewards/chosen": 0.9145062446594239, "rewards/margins": 1.130675482749939, "rewards/rejected": -0.21616923809051514, "step": 98 }, { "epoch": 0.16673684210526316, "grad_norm": 83.71279388164065, "kl": 0.06458663940429688, "learning_rate": 4.937786462488283e-07, "logits/chosen": -715301376.0, "logits/rejected": -695437458.2857143, "logps/chosen": -187.44473266601562, "logps/rejected": -221.67951311383928, "loss": 0.4397, "rewards/chosen": 0.4498535096645355, "rewards/margins": 0.21583446860313416, "rewards/rejected": 0.23401904106140137, "step": 99 }, { "epoch": 0.16842105263157894, "grad_norm": 105.87757666440052, "kl": 4.265495300292969, "learning_rate": 4.9344835461595e-07, "logits/chosen": -985528320.0, "logits/rejected": -841993301.3333334, "logps/chosen": -187.4853759765625, "logps/rejected": -244.225830078125, "loss": 0.4832, "rewards/chosen": 0.04033417701721191, "rewards/margins": -0.09548329114913941, "rewards/rejected": 0.13581746816635132, "step": 100 }, { "epoch": 0.17010526315789473, "grad_norm": 99.19589292443813, "kl": 0.0, "learning_rate": 4.93109636956539e-07, "logits/chosen": -1094334464.0, "logits/rejected": -712445226.6666666, "logps/chosen": -216.4138671875, "logps/rejected": -227.96439615885416, "loss": 0.4423, "rewards/chosen": 0.8524996757507324, "rewards/margins": 1.196986190478007, "rewards/rejected": -0.3444865147272746, "step": 101 }, { "epoch": 0.17178947368421052, "grad_norm": 74.15099841568721, "kl": 2.151416778564453, "learning_rate": 4.927625049940012e-07, "logits/chosen": -766361152.0, "logits/rejected": -333142048.0, "logps/chosen": -199.77578735351562, "logps/rejected": -168.93804931640625, "loss": 0.4554, "rewards/chosen": 0.7172653079032898, "rewards/margins": 0.35114437341690063, "rewards/rejected": 0.36612093448638916, "step": 102 }, { "epoch": 0.1734736842105263, "grad_norm": 107.51116166109057, "kl": 1.1529388427734375, "learning_rate": 4.92406970742972e-07, "logits/chosen": -971440256.0, "logits/rejected": -828623232.0, "logps/chosen": -195.1447296142578, "logps/rejected": -280.0885009765625, "loss": 0.4425, "rewards/chosen": 0.5913848876953125, "rewards/margins": 1.5447425842285156, "rewards/rejected": -0.9533576965332031, "step": 103 }, { "epoch": 0.1751578947368421, "grad_norm": 74.47709378212676, "kl": 4.383308410644531, "learning_rate": 4.920430465088991e-07, "logits/chosen": -670352640.0, "logits/rejected": -671316608.0, "logps/chosen": -255.62147521972656, "logps/rejected": -200.2668914794922, "loss": 0.4575, "rewards/chosen": 0.5768207311630249, "rewards/margins": 0.4924035966396332, "rewards/rejected": 0.08441713452339172, "step": 104 }, { "epoch": 0.17684210526315788, "grad_norm": 132.0792640659443, "kl": 1.2592926025390625, "learning_rate": 4.916707448876173e-07, "logits/chosen": -832779328.0, "logits/rejected": -1296543232.0, "logps/chosen": -227.14276123046875, "logps/rejected": -218.0616912841797, "loss": 0.4254, "rewards/chosen": 1.1795654296875, "rewards/margins": 1.3467857092618942, "rewards/rejected": -0.16722027957439423, "step": 105 }, { "epoch": 0.17852631578947367, "grad_norm": 145.72514277447476, "kl": 0.0, "learning_rate": 4.912900787649123e-07, "logits/chosen": -898515675.4285715, "logits/rejected": -1168541696.0, "logps/chosen": -201.853515625, "logps/rejected": -228.6392822265625, "loss": 0.4635, "rewards/chosen": 0.14519500732421875, "rewards/margins": -0.7220779657363892, "rewards/rejected": 0.8672729730606079, "step": 106 }, { "epoch": 0.18021052631578946, "grad_norm": 96.96868580065858, "kl": 0.47565460205078125, "learning_rate": 4.90901061316075e-07, "logits/chosen": -979152704.0, "logits/rejected": -788988416.0, "logps/chosen": -262.84649658203125, "logps/rejected": -219.4362589518229, "loss": 0.4916, "rewards/chosen": -0.06726683676242828, "rewards/margins": 1.062198743224144, "rewards/rejected": -1.1294655799865723, "step": 107 }, { "epoch": 0.18189473684210528, "grad_norm": 131.83686757875427, "kl": 0.0, "learning_rate": 4.905037060054449e-07, "logits/chosen": -1066169344.0, "logits/rejected": -1416755712.0, "logps/chosen": -177.3980255126953, "logps/rejected": -229.30007934570312, "loss": 0.4426, "rewards/chosen": 1.1091067790985107, "rewards/margins": 0.9519591778516769, "rewards/rejected": 0.1571476012468338, "step": 108 }, { "epoch": 0.18357894736842106, "grad_norm": 127.9527788904727, "kl": 0.0, "learning_rate": 4.900980265859448e-07, "logits/chosen": -559809066.6666666, "logits/rejected": -611907481.6, "logps/chosen": -164.3425089518229, "logps/rejected": -240.176318359375, "loss": 0.424, "rewards/chosen": 1.06194003423055, "rewards/margins": 0.5945489724477131, "rewards/rejected": 0.4673910617828369, "step": 109 }, { "epoch": 0.18526315789473685, "grad_norm": 84.90732586257637, "kl": 0.0, "learning_rate": 4.896840370986042e-07, "logits/chosen": -976780032.0, "logits/rejected": -716965632.0, "logps/chosen": -253.70814514160156, "logps/rejected": -167.56753540039062, "loss": 0.4595, "rewards/chosen": 0.47311899065971375, "rewards/margins": 0.09406355023384094, "rewards/rejected": 0.3790554404258728, "step": 110 }, { "epoch": 0.18694736842105264, "grad_norm": 89.53866936328932, "kl": 0.0, "learning_rate": 4.892617518720737e-07, "logits/chosen": -829139285.3333334, "logits/rejected": -455405536.0, "logps/chosen": -206.80936686197916, "logps/rejected": -257.3726806640625, "loss": 0.4362, "rewards/chosen": 0.4422444502512614, "rewards/margins": 2.1721797386805215, "rewards/rejected": -1.7299352884292603, "step": 111 }, { "epoch": 0.18863157894736843, "grad_norm": 192.37189364365034, "kl": 0.0, "learning_rate": 4.888311855221289e-07, "logits/chosen": -770445312.0, "logits/rejected": -951467392.0, "logps/chosen": -199.4364471435547, "logps/rejected": -215.43002319335938, "loss": 0.5149, "rewards/chosen": -1.172553300857544, "rewards/margins": -1.1502388417720795, "rewards/rejected": -0.022314459085464478, "step": 112 }, { "epoch": 0.19031578947368422, "grad_norm": 95.28956568265271, "kl": 0.0, "learning_rate": 4.883923529511646e-07, "logits/chosen": -535924416.0, "logits/rejected": -786336960.0, "logps/chosen": -247.45355224609375, "logps/rejected": -203.68768310546875, "loss": 0.4584, "rewards/chosen": 0.5863098502159119, "rewards/margins": -0.6281996369361877, "rewards/rejected": 1.2145094871520996, "step": 113 }, { "epoch": 0.192, "grad_norm": 141.25750368118935, "kl": 0.0, "learning_rate": 4.879452693476789e-07, "logits/chosen": -1001671372.8, "logits/rejected": -836295936.0, "logps/chosen": -201.454052734375, "logps/rejected": -205.16349283854166, "loss": 0.4619, "rewards/chosen": 0.1915292501449585, "rewards/margins": 1.2447933912277223, "rewards/rejected": -1.0532641410827637, "step": 114 }, { "epoch": 0.1936842105263158, "grad_norm": 90.01647424789367, "kl": 1.8182296752929688, "learning_rate": 4.874899501857477e-07, "logits/chosen": -642349952.0, "logits/rejected": -997530176.0, "logps/chosen": -269.48760986328125, "logps/rejected": -281.49591064453125, "loss": 0.4548, "rewards/chosen": -0.8407798409461975, "rewards/margins": -0.7594596445560455, "rewards/rejected": -0.08132019639015198, "step": 115 }, { "epoch": 0.19536842105263158, "grad_norm": 142.4095270366161, "kl": 0.0, "learning_rate": 4.87026411224489e-07, "logits/chosen": -909213798.4, "logits/rejected": -513591637.3333333, "logps/chosen": -240.9342041015625, "logps/rejected": -284.7578531901042, "loss": 0.405, "rewards/chosen": 0.716831636428833, "rewards/margins": 1.0737988074620564, "rewards/rejected": -0.35696717103322345, "step": 116 }, { "epoch": 0.19705263157894737, "grad_norm": 313.6380304727835, "kl": 0.0, "learning_rate": 4.865546685075174e-07, "logits/chosen": -904379648.0, "logits/rejected": -820042547.2, "logps/chosen": -220.26141357421875, "logps/rejected": -245.7005859375, "loss": 0.431, "rewards/chosen": 1.131263256072998, "rewards/margins": 1.3263026356697083, "rewards/rejected": -0.1950393795967102, "step": 117 }, { "epoch": 0.19873684210526316, "grad_norm": 108.28624508912246, "kl": 0.0, "learning_rate": 4.860747383623889e-07, "logits/chosen": -723659136.0, "logits/rejected": -772578752.0, "logps/chosen": -184.33941650390625, "logps/rejected": -185.81124877929688, "loss": 0.3927, "rewards/chosen": 1.3619561195373535, "rewards/margins": 1.463188648223877, "rewards/rejected": -0.10123252868652344, "step": 118 }, { "epoch": 0.20042105263157894, "grad_norm": 314.88581870057607, "kl": 0.0, "learning_rate": 4.85586637400036e-07, "logits/chosen": -458558624.0, "logits/rejected": -640709376.0, "logps/chosen": -257.4935302734375, "logps/rejected": -263.326416015625, "loss": 0.4136, "rewards/chosen": 0.4196751117706299, "rewards/margins": 1.0923622846603394, "rewards/rejected": -0.6726871728897095, "step": 119 }, { "epoch": 0.20210526315789473, "grad_norm": 138.4365785651591, "kl": 0.0, "learning_rate": 4.85090382514192e-07, "logits/chosen": -712995328.0, "logits/rejected": -820921941.3333334, "logps/chosen": -222.21136474609375, "logps/rejected": -301.34344482421875, "loss": 0.4317, "rewards/chosen": 0.45389634370803833, "rewards/margins": 0.7454813917477925, "rewards/rejected": -0.2915850480397542, "step": 120 }, { "epoch": 0.20378947368421052, "grad_norm": 92.36506768933428, "kl": 0.0, "learning_rate": 4.845859908808073e-07, "logits/chosen": -695455232.0, "logits/rejected": -842530406.4, "logps/chosen": -220.80411783854166, "logps/rejected": -278.7416015625, "loss": 0.458, "rewards/chosen": 0.664484699567159, "rewards/margins": 1.1589130957921348, "rewards/rejected": -0.4944283962249756, "step": 121 }, { "epoch": 0.2054736842105263, "grad_norm": 73.63166789116582, "kl": 0.4639434814453125, "learning_rate": 4.840734799574546e-07, "logits/chosen": -1259681877.3333333, "logits/rejected": -1012162560.0, "logps/chosen": -189.5477091471354, "logps/rejected": -160.91326904296875, "loss": 0.4281, "rewards/chosen": 0.8242993354797363, "rewards/margins": 0.5254170298576355, "rewards/rejected": 0.29888230562210083, "step": 122 }, { "epoch": 0.2071578947368421, "grad_norm": 97.28869763760606, "kl": 0.318084716796875, "learning_rate": 4.835528674827239e-07, "logits/chosen": -732118954.6666666, "logits/rejected": -770999680.0, "logps/chosen": -228.71061197916666, "logps/rejected": -203.45211791992188, "loss": 0.4352, "rewards/chosen": 0.7461237907409668, "rewards/margins": 0.08015847206115723, "rewards/rejected": 0.6659653186798096, "step": 123 }, { "epoch": 0.20884210526315788, "grad_norm": 103.28487237641944, "kl": 0.0, "learning_rate": 4.830241714756098e-07, "logits/chosen": -580801901.7142857, "logits/rejected": -687661312.0, "logps/chosen": -200.93122209821428, "logps/rejected": -259.4430847167969, "loss": 0.3573, "rewards/chosen": 1.0478062629699707, "rewards/margins": -0.5003474950790405, "rewards/rejected": 1.5481537580490112, "step": 124 }, { "epoch": 0.21052631578947367, "grad_norm": 80.62146513928802, "kl": 1.8628311157226562, "learning_rate": 4.82487410234887e-07, "logits/chosen": -867455707.4285715, "logits/rejected": -523073568.0, "logps/chosen": -173.80622209821428, "logps/rejected": -181.6681671142578, "loss": 0.5246, "rewards/chosen": -0.30137692178998676, "rewards/margins": -1.384874565260751, "rewards/rejected": 1.0834976434707642, "step": 125 }, { "epoch": 0.21221052631578946, "grad_norm": 82.55380217967634, "kl": 2.27947998046875, "learning_rate": 4.819426023384769e-07, "logits/chosen": -958819942.4, "logits/rejected": -718372181.3333334, "logps/chosen": -212.732763671875, "logps/rejected": -163.0384724934896, "loss": 0.5059, "rewards/chosen": 0.950899314880371, "rewards/margins": 0.2773471991221109, "rewards/rejected": 0.6735521157582601, "step": 126 }, { "epoch": 0.21389473684210528, "grad_norm": 134.57523570837543, "kl": 0.0, "learning_rate": 4.813897666428053e-07, "logits/chosen": -582362880.0, "logits/rejected": -902837043.2, "logps/chosen": -258.3628743489583, "logps/rejected": -191.00611572265626, "loss": 0.4573, "rewards/chosen": 0.8414382934570312, "rewards/margins": 0.5884557962417603, "rewards/rejected": 0.252982497215271, "step": 127 }, { "epoch": 0.21557894736842106, "grad_norm": 77.91215003119481, "kl": 0.0, "learning_rate": 4.80828922282149e-07, "logits/chosen": -715421593.6, "logits/rejected": -996736597.3333334, "logps/chosen": -221.0246337890625, "logps/rejected": -193.3650105794271, "loss": 0.4897, "rewards/chosen": 0.5778164863586426, "rewards/margins": -0.47630953788757324, "rewards/rejected": 1.0541260242462158, "step": 128 }, { "epoch": 0.21726315789473685, "grad_norm": 84.25313070423411, "kl": 2.1194534301757812, "learning_rate": 4.802600886679741e-07, "logits/chosen": -852968704.0, "logits/rejected": -692032000.0, "logps/chosen": -224.84365844726562, "logps/rejected": -224.15106201171875, "loss": 0.436, "rewards/chosen": 0.7637878656387329, "rewards/margins": 0.6827945858240128, "rewards/rejected": 0.08099327981472015, "step": 129 }, { "epoch": 0.21894736842105264, "grad_norm": 76.6604526230067, "kl": 0.0, "learning_rate": 4.79683285488264e-07, "logits/chosen": -768312064.0, "logits/rejected": -553834240.0, "logps/chosen": -209.3781982421875, "logps/rejected": -275.8822428385417, "loss": 0.3727, "rewards/chosen": 1.0357406616210938, "rewards/margins": 10.20118408203125, "rewards/rejected": -9.165443420410156, "step": 130 }, { "epoch": 0.22063157894736843, "grad_norm": 58.96966604279425, "kl": 0.0, "learning_rate": 4.790985327068375e-07, "logits/chosen": -1307087360.0, "logits/rejected": -688753612.8, "logps/chosen": -197.68208821614584, "logps/rejected": -239.55947265625, "loss": 0.4189, "rewards/chosen": 1.5207815170288086, "rewards/margins": 1.0784048080444335, "rewards/rejected": 0.442376708984375, "step": 131 }, { "epoch": 0.22231578947368422, "grad_norm": 50.487962918238146, "kl": 0.0, "learning_rate": 4.785058505626587e-07, "logits/chosen": -903821824.0, "logits/rejected": -479294592.0, "logps/chosen": -188.7058563232422, "logps/rejected": -270.5711364746094, "loss": 0.4645, "rewards/chosen": 0.9261478781700134, "rewards/margins": 0.3811134696006775, "rewards/rejected": 0.5450344085693359, "step": 132 }, { "epoch": 0.224, "grad_norm": 73.6874876394, "kl": 0.84033203125, "learning_rate": 4.779052595691354e-07, "logits/chosen": -728201152.0, "logits/rejected": -1323703552.0, "logps/chosen": -219.38204956054688, "logps/rejected": -226.48712158203125, "loss": 0.4986, "rewards/chosen": 0.8902133703231812, "rewards/margins": -0.04759258031845093, "rewards/rejected": 0.9378059506416321, "step": 133 }, { "epoch": 0.2256842105263158, "grad_norm": 95.59678991510799, "kl": 0.3926353454589844, "learning_rate": 4.772967805134105e-07, "logits/chosen": -1039820992.0, "logits/rejected": -1321091328.0, "logps/chosen": -226.0445556640625, "logps/rejected": -204.591064453125, "loss": 0.4261, "rewards/chosen": 1.1810604333877563, "rewards/margins": 0.4814262390136719, "rewards/rejected": 0.6996341943740845, "step": 134 }, { "epoch": 0.22736842105263158, "grad_norm": 188.3750547944905, "kl": 0.0, "learning_rate": 4.766804344556413e-07, "logits/chosen": -612920320.0, "logits/rejected": -678868736.0, "logps/chosen": -246.20358276367188, "logps/rejected": -229.70596313476562, "loss": 0.4773, "rewards/chosen": 0.6662670373916626, "rewards/margins": 1.006878674030304, "rewards/rejected": -0.34061163663864136, "step": 135 }, { "epoch": 0.22905263157894737, "grad_norm": 68.4177176261016, "kl": 0.0, "learning_rate": 4.760562427282712e-07, "logits/chosen": -829823573.3333334, "logits/rejected": -959791168.0, "logps/chosen": -276.5669352213542, "logps/rejected": -458.56427001953125, "loss": 0.486, "rewards/chosen": -0.15483548243840536, "rewards/margins": 0.1716126004854838, "rewards/rejected": -0.32644808292388916, "step": 136 }, { "epoch": 0.23073684210526316, "grad_norm": 170.91909658805818, "kl": 0.0, "learning_rate": 4.754242269352911e-07, "logits/chosen": -975248237.7142857, "logits/rejected": -1017260032.0, "logps/chosen": -223.60107421875, "logps/rejected": -248.3441162109375, "loss": 0.3863, "rewards/chosen": 1.2192728860037667, "rewards/margins": 1.134688836123262, "rewards/rejected": 0.08458404988050461, "step": 137 }, { "epoch": 0.23242105263157894, "grad_norm": 49.863175356012206, "kl": 0.0, "learning_rate": 4.747844089514919e-07, "logits/chosen": -1015484825.6, "logits/rejected": -859580074.6666666, "logps/chosen": -238.688232421875, "logps/rejected": -222.31477864583334, "loss": 0.4322, "rewards/chosen": 1.2349286079406738, "rewards/margins": 1.1146054069201152, "rewards/rejected": 0.12032320102055867, "step": 138 }, { "epoch": 0.23410526315789473, "grad_norm": 253.28180299915874, "kl": 2.1038894653320312, "learning_rate": 4.741368109217071e-07, "logits/chosen": -1175646293.3333333, "logits/rejected": -688029491.2, "logps/chosen": -291.4467366536458, "logps/rejected": -250.909619140625, "loss": 0.4026, "rewards/chosen": 0.8213460445404053, "rewards/margins": 1.0762538671493531, "rewards/rejected": -0.25490782260894773, "step": 139 }, { "epoch": 0.23578947368421052, "grad_norm": 80.42815443807443, "kl": 0.0, "learning_rate": 4.734814552600468e-07, "logits/chosen": -934606677.3333334, "logits/rejected": -1007555904.0, "logps/chosen": -220.41634114583334, "logps/rejected": -240.6993865966797, "loss": 0.4317, "rewards/chosen": 0.8413275877634684, "rewards/margins": 0.3180929819742838, "rewards/rejected": 0.5232346057891846, "step": 140 }, { "epoch": 0.2374736842105263, "grad_norm": 466.33070749527764, "kl": 0.0, "learning_rate": 4.728183646491214e-07, "logits/chosen": -753246873.6, "logits/rejected": -974831786.6666666, "logps/chosen": -238.600927734375, "logps/rejected": -243.89959716796875, "loss": 0.3808, "rewards/chosen": -0.32029788494110106, "rewards/margins": -0.08840243816375731, "rewards/rejected": -0.23189544677734375, "step": 141 }, { "epoch": 0.2391578947368421, "grad_norm": 176.73411471688468, "kl": 0.0, "learning_rate": 4.721475620392567e-07, "logits/chosen": -908911445.3333334, "logits/rejected": -713944217.6, "logps/chosen": -227.62554931640625, "logps/rejected": -191.32474365234376, "loss": 0.4569, "rewards/chosen": 1.7199311256408691, "rewards/margins": 2.082160711288452, "rewards/rejected": -0.362229585647583, "step": 142 }, { "epoch": 0.24084210526315789, "grad_norm": 566.7557950897859, "kl": 0.0, "learning_rate": 4.7146907064769993e-07, "logits/chosen": -1072672170.6666666, "logits/rejected": -1189377228.8, "logps/chosen": -252.31046549479166, "logps/rejected": -199.03504638671876, "loss": 0.4128, "rewards/chosen": 0.5313369830449423, "rewards/margins": 1.2499615271886189, "rewards/rejected": -0.7186245441436767, "step": 143 }, { "epoch": 0.24252631578947367, "grad_norm": 4243.997152918305, "kl": 0.0, "learning_rate": 4.707829139578155e-07, "logits/chosen": -873702195.2, "logits/rejected": -1221907285.3333333, "logps/chosen": -200.1794921875, "logps/rejected": -223.1510009765625, "loss": 0.4001, "rewards/chosen": 0.7977983474731445, "rewards/margins": 2.0752524852752687, "rewards/rejected": -1.277454137802124, "step": 144 }, { "epoch": 0.24421052631578946, "grad_norm": 314.5163932036576, "kl": 4.798194885253906, "learning_rate": 4.7008911571827284e-07, "logits/chosen": -978259712.0, "logits/rejected": -640961024.0, "logps/chosen": -256.1488342285156, "logps/rejected": -242.27796936035156, "loss": 0.4857, "rewards/chosen": 0.40538519620895386, "rewards/margins": 0.4533770680427551, "rewards/rejected": -0.04799187183380127, "step": 145 }, { "epoch": 0.24589473684210525, "grad_norm": 360.7422844202394, "kl": 0.0, "learning_rate": 4.6938769994222406e-07, "logits/chosen": -825879756.8, "logits/rejected": -888731989.3333334, "logps/chosen": -198.52425537109374, "logps/rejected": -280.38706461588544, "loss": 0.358, "rewards/chosen": 0.7168326377868652, "rewards/margins": 1.5658982594807944, "rewards/rejected": -0.8490656216939291, "step": 146 }, { "epoch": 0.24757894736842107, "grad_norm": 2156.8379086317163, "kl": 0.0, "learning_rate": 4.686786909064729e-07, "logits/chosen": -410741248.0, "logits/rejected": -832765440.0, "logps/chosen": -249.67108154296875, "logps/rejected": -203.24400329589844, "loss": 0.421, "rewards/chosen": -0.19027066230773926, "rewards/margins": -0.8781961798667908, "rewards/rejected": 0.6879255175590515, "step": 147 }, { "epoch": 0.24926315789473685, "grad_norm": 286.56368982318264, "kl": 0.0, "learning_rate": 4.679621131506346e-07, "logits/chosen": -1148589421.7142856, "logits/rejected": -688305152.0, "logps/chosen": -267.63023158482144, "logps/rejected": -254.83946228027344, "loss": 0.4676, "rewards/chosen": 0.4722416400909424, "rewards/margins": 0.7534733414649963, "rewards/rejected": -0.28123170137405396, "step": 148 }, { "epoch": 0.25094736842105264, "grad_norm": 222.97278644560413, "kl": 0.0, "learning_rate": 4.6723799147628664e-07, "logits/chosen": -779184896.0, "logits/rejected": -826172416.0, "logps/chosen": -190.68980407714844, "logps/rejected": -227.09054565429688, "loss": 0.4862, "rewards/chosen": 1.420555830001831, "rewards/margins": 1.2711764872074127, "rewards/rejected": 0.14937934279441833, "step": 149 }, { "epoch": 0.25263157894736843, "grad_norm": 145.46733652305187, "kl": 0.0, "learning_rate": 4.6650635094610966e-07, "logits/chosen": -831112106.6666666, "logits/rejected": -727667648.0, "logps/chosen": -205.18501790364584, "logps/rejected": -203.62704467773438, "loss": 0.4654, "rewards/chosen": 0.4877975384394328, "rewards/margins": -0.5469383796056111, "rewards/rejected": 1.034735918045044, "step": 150 }, { "epoch": 0.2543157894736842, "grad_norm": 148.86465225969857, "kl": 0.0, "learning_rate": 4.6576721688302103e-07, "logits/chosen": -616928128.0, "logits/rejected": -506641365.3333333, "logps/chosen": -269.32086181640625, "logps/rejected": -279.0386962890625, "loss": 0.4548, "rewards/chosen": -0.4652397334575653, "rewards/margins": -0.7438713212807972, "rewards/rejected": 0.278631587823232, "step": 151 }, { "epoch": 0.256, "grad_norm": 80.8642282340819, "kl": 0.0, "learning_rate": 4.6502061486929765e-07, "logits/chosen": -1172402585.6, "logits/rejected": -611508778.6666666, "logps/chosen": -199.3252197265625, "logps/rejected": -200.5145263671875, "loss": 0.4583, "rewards/chosen": 0.7260040283203125, "rewards/margins": 0.8308873613675436, "rewards/rejected": -0.10488333304723103, "step": 152 }, { "epoch": 0.2576842105263158, "grad_norm": 59.01380093137268, "kl": 0.0, "learning_rate": 4.6426657074569076e-07, "logits/chosen": -679793049.6, "logits/rejected": -954248618.6666666, "logps/chosen": -219.611181640625, "logps/rejected": -154.1990966796875, "loss": 0.4257, "rewards/chosen": 0.9013955116271972, "rewards/margins": 1.4685372829437255, "rewards/rejected": -0.5671417713165283, "step": 153 }, { "epoch": 0.2593684210526316, "grad_norm": 165.5135211510967, "kl": 0.0, "learning_rate": 4.6350511061053155e-07, "logits/chosen": -996774144.0, "logits/rejected": -484326496.0, "logps/chosen": -222.6114501953125, "logps/rejected": -187.74560546875, "loss": 0.4011, "rewards/chosen": 0.8318793773651123, "rewards/margins": 1.6771708726882935, "rewards/rejected": -0.8452914953231812, "step": 154 }, { "epoch": 0.26105263157894737, "grad_norm": 62.878815512005815, "kl": 0.0, "learning_rate": 4.62736260818828e-07, "logits/chosen": -1305949610.6666667, "logits/rejected": -473263104.0, "logps/chosen": -258.23980712890625, "logps/rejected": -231.45439453125, "loss": 0.4867, "rewards/chosen": -0.5735707680384318, "rewards/margins": -0.266325561205546, "rewards/rejected": -0.30724520683288575, "step": 155 }, { "epoch": 0.26273684210526316, "grad_norm": 68.5671614498506, "kl": 0.0, "learning_rate": 4.6196004798135236e-07, "logits/chosen": -818773568.0, "logits/rejected": -855193408.0, "logps/chosen": -297.919677734375, "logps/rejected": -230.89285278320312, "loss": 0.4601, "rewards/chosen": -0.12369692325592041, "rewards/margins": -0.5421707332134247, "rewards/rejected": 0.4184738099575043, "step": 156 }, { "epoch": 0.26442105263157895, "grad_norm": 49.45697259448593, "kl": 0.0, "learning_rate": 4.611764989637205e-07, "logits/chosen": -671601715.2, "logits/rejected": -1087509162.6666667, "logps/chosen": -242.408056640625, "logps/rejected": -313.653564453125, "loss": 0.5013, "rewards/chosen": 1.0554190635681153, "rewards/margins": 3.382936731974284, "rewards/rejected": -2.3275176684061685, "step": 157 }, { "epoch": 0.26610526315789473, "grad_norm": 106.47797133432495, "kl": 0.0, "learning_rate": 4.603856408854618e-07, "logits/chosen": -757610240.0, "logits/rejected": -791967232.0, "logps/chosen": -249.16958618164062, "logps/rejected": -192.21998596191406, "loss": 0.4991, "rewards/chosen": 1.1682857275009155, "rewards/margins": 0.8241084814071655, "rewards/rejected": 0.34417724609375, "step": 158 }, { "epoch": 0.2677894736842105, "grad_norm": 49.18374091004554, "kl": 0.0, "learning_rate": 4.5958750111908065e-07, "logits/chosen": -1117810432.0, "logits/rejected": -668232345.6, "logps/chosen": -165.53057861328125, "logps/rejected": -253.265234375, "loss": 0.502, "rewards/chosen": 0.5383493900299072, "rewards/margins": 0.5533985137939453, "rewards/rejected": -0.015049123764038086, "step": 159 }, { "epoch": 0.2694736842105263, "grad_norm": 44.5074703062048, "kl": 0.0, "learning_rate": 4.5878210728910886e-07, "logits/chosen": -953598976.0, "logits/rejected": -1030626918.4, "logps/chosen": -242.2763875325521, "logps/rejected": -223.04658203125, "loss": 0.4662, "rewards/chosen": 0.856818675994873, "rewards/margins": 0.2501978874206543, "rewards/rejected": 0.6066207885742188, "step": 160 }, { "epoch": 0.2711578947368421, "grad_norm": 75.43811234031128, "kl": 0.0, "learning_rate": 4.5796948727115e-07, "logits/chosen": -793493299.2, "logits/rejected": -714943658.6666666, "logps/chosen": -278.5044189453125, "logps/rejected": -209.54473876953125, "loss": 0.3873, "rewards/chosen": 0.2761260747909546, "rewards/margins": -0.06068220933278401, "rewards/rejected": 0.3368082841237386, "step": 161 }, { "epoch": 0.2728421052631579, "grad_norm": 63.90407966356232, "kl": 0.0, "learning_rate": 4.5714966919091414e-07, "logits/chosen": -482634784.0, "logits/rejected": -1263959466.6666667, "logps/chosen": -282.9561767578125, "logps/rejected": -233.3080037434896, "loss": 0.398, "rewards/chosen": 0.31816405057907104, "rewards/margins": 0.5229944586753845, "rewards/rejected": -0.20483040809631348, "step": 162 }, { "epoch": 0.2745263157894737, "grad_norm": 121.01434181632379, "kl": 0.0, "learning_rate": 4.5632268142324435e-07, "logits/chosen": -697656746.6666666, "logits/rejected": -517255552.0, "logps/chosen": -242.203369140625, "logps/rejected": -196.41952514648438, "loss": 0.3492, "rewards/chosen": 1.0457820097605388, "rewards/margins": 1.9965357581774394, "rewards/rejected": -0.9507537484169006, "step": 163 }, { "epoch": 0.27621052631578946, "grad_norm": 59.7684852435887, "kl": 0.0, "learning_rate": 4.554885525911351e-07, "logits/chosen": -932756352.0, "logits/rejected": -709267626.6666666, "logps/chosen": -220.3507537841797, "logps/rejected": -211.04886881510416, "loss": 0.4335, "rewards/chosen": 0.9487717151641846, "rewards/margins": 2.4863777955373125, "rewards/rejected": -1.5376060803731282, "step": 164 }, { "epoch": 0.27789473684210525, "grad_norm": 84.16432681430886, "kl": 0.0, "learning_rate": 4.546473115647409e-07, "logits/chosen": -594779008.0, "logits/rejected": -749184102.4, "logps/chosen": -238.7244873046875, "logps/rejected": -268.174755859375, "loss": 0.42, "rewards/chosen": -1.2170298099517822, "rewards/margins": 0.033264207839965865, "rewards/rejected": -1.250294017791748, "step": 165 }, { "epoch": 0.27957894736842104, "grad_norm": 72.08410795046584, "kl": 0.0, "learning_rate": 4.53798987460378e-07, "logits/chosen": -850566144.0, "logits/rejected": -682344243.2, "logps/chosen": -284.1813557942708, "logps/rejected": -257.943896484375, "loss": 0.3765, "rewards/chosen": -0.22490237156550089, "rewards/margins": 2.2622447689374288, "rewards/rejected": -2.4871471405029295, "step": 166 }, { "epoch": 0.2812631578947368, "grad_norm": 226.29378772227912, "kl": 0.0, "learning_rate": 4.529436096395156e-07, "logits/chosen": -800250816.0, "logits/rejected": -1130786816.0, "logps/chosen": -183.79263305664062, "logps/rejected": -233.85260009765625, "loss": 0.4077, "rewards/chosen": 0.2685844302177429, "rewards/margins": 0.8706353306770325, "rewards/rejected": -0.6020509004592896, "step": 167 }, { "epoch": 0.2829473684210526, "grad_norm": 69.313319440612, "kl": 0.0, "learning_rate": 4.520812077077604e-07, "logits/chosen": -840975616.0, "logits/rejected": -762263040.0, "logps/chosen": -194.9156005859375, "logps/rejected": -208.27591959635416, "loss": 0.5229, "rewards/chosen": -0.07560654282569886, "rewards/margins": 0.054520648717880246, "rewards/rejected": -0.1301271915435791, "step": 168 }, { "epoch": 0.2846315789473684, "grad_norm": 93.08308180288073, "kl": 0.0, "learning_rate": 4.512118115138314e-07, "logits/chosen": -760346965.3333334, "logits/rejected": -865922432.0, "logps/chosen": -213.150634765625, "logps/rejected": -177.36080932617188, "loss": 0.4687, "rewards/chosen": 0.10082270701726277, "rewards/margins": 0.7567354639371237, "rewards/rejected": -0.6559127569198608, "step": 169 }, { "epoch": 0.2863157894736842, "grad_norm": 91.57290262396198, "kl": 0.0, "learning_rate": 4.503354511485273e-07, "logits/chosen": -858243276.8, "logits/rejected": -1284581632.0, "logps/chosen": -279.31884765625, "logps/rejected": -272.38559977213544, "loss": 0.4222, "rewards/chosen": -0.1363077163696289, "rewards/margins": 0.4786560535430908, "rewards/rejected": -0.6149637699127197, "step": 170 }, { "epoch": 0.288, "grad_norm": 99.04964646632929, "kl": 0.0, "learning_rate": 4.4945215694368447e-07, "logits/chosen": -328858944.0, "logits/rejected": -713192234.6666666, "logps/chosen": -172.78839111328125, "logps/rejected": -212.23014322916666, "loss": 0.4366, "rewards/chosen": 0.2652008533477783, "rewards/margins": 1.0063419342041016, "rewards/rejected": -0.7411410808563232, "step": 171 }, { "epoch": 0.28968421052631577, "grad_norm": 89.56438737059334, "kl": 0.0, "learning_rate": 4.485619594711277e-07, "logits/chosen": -598744512.0, "logits/rejected": -598062378.6666666, "logps/chosen": -262.1839599609375, "logps/rejected": -212.77608235677084, "loss": 0.4281, "rewards/chosen": 0.616127073764801, "rewards/margins": 0.38657637437184655, "rewards/rejected": 0.2295506993929545, "step": 172 }, { "epoch": 0.29136842105263155, "grad_norm": 55.09758282232377, "kl": 0.0, "learning_rate": 4.4766488954161153e-07, "logits/chosen": -804758186.6666666, "logits/rejected": -721973760.0, "logps/chosen": -261.65061442057294, "logps/rejected": -265.42978515625, "loss": 0.4359, "rewards/chosen": -0.7668237686157227, "rewards/margins": 0.3059118270874024, "rewards/rejected": -1.072735595703125, "step": 173 }, { "epoch": 0.29305263157894734, "grad_norm": 69.82107320148485, "kl": 0.0, "learning_rate": 4.4676097820375426e-07, "logits/chosen": -515698304.0, "logits/rejected": -1235355648.0, "logps/chosen": -205.48640950520834, "logps/rejected": -245.5269287109375, "loss": 0.4254, "rewards/chosen": 0.9509898026784261, "rewards/margins": 0.5845218737920126, "rewards/rejected": 0.36646792888641355, "step": 174 }, { "epoch": 0.29473684210526313, "grad_norm": 78.35333198019593, "kl": 0.0, "learning_rate": 4.458502567429631e-07, "logits/chosen": -498773248.0, "logits/rejected": -519963648.0, "logps/chosen": -194.12052408854166, "logps/rejected": -172.4505615234375, "loss": 0.4344, "rewards/chosen": 0.4248570998509725, "rewards/margins": 1.5527800162633258, "rewards/rejected": -1.1279229164123534, "step": 175 }, { "epoch": 0.296421052631579, "grad_norm": 169.14882087581336, "kl": 0.0, "learning_rate": 4.4493275668035147e-07, "logits/chosen": -721955242.6666666, "logits/rejected": -710825932.8, "logps/chosen": -167.6915079752604, "logps/rejected": -147.37952880859376, "loss": 0.4063, "rewards/chosen": 0.4422839085261027, "rewards/margins": 2.1021666447321574, "rewards/rejected": -1.6598827362060546, "step": 176 }, { "epoch": 0.29810526315789476, "grad_norm": 67.3786479497002, "kl": 0.0, "learning_rate": 4.440085097716479e-07, "logits/chosen": -663990016.0, "logits/rejected": -555922304.0, "logps/chosen": -259.0157470703125, "logps/rejected": -192.321533203125, "loss": 0.4259, "rewards/chosen": 0.24189604818820953, "rewards/margins": 0.6399521082639694, "rewards/rejected": -0.3980560600757599, "step": 177 }, { "epoch": 0.29978947368421055, "grad_norm": 80.2865086271179, "kl": 0.0, "learning_rate": 4.430775480060972e-07, "logits/chosen": -851462912.0, "logits/rejected": -1203325952.0, "logps/chosen": -203.14907836914062, "logps/rejected": -263.4740905761719, "loss": 0.3982, "rewards/chosen": -1.356869101524353, "rewards/margins": 0.5466030836105347, "rewards/rejected": -1.9034721851348877, "step": 178 }, { "epoch": 0.30147368421052634, "grad_norm": 84.09627558392572, "kl": 0.0, "learning_rate": 4.4213990360535267e-07, "logits/chosen": -656672042.6666666, "logits/rejected": -944876748.8, "logps/chosen": -294.62876383463544, "logps/rejected": -214.0318115234375, "loss": 0.4823, "rewards/chosen": -1.5061303774515789, "rewards/margins": -1.1466392676035564, "rewards/rejected": -0.35949110984802246, "step": 179 }, { "epoch": 0.3031578947368421, "grad_norm": 50.8032139364716, "kl": 0.0, "learning_rate": 4.4119560902236174e-07, "logits/chosen": -764519808.0, "logits/rejected": -615565184.0, "logps/chosen": -261.0996398925781, "logps/rejected": -222.42861938476562, "loss": 0.4579, "rewards/chosen": -0.5169571042060852, "rewards/margins": 1.855327069759369, "rewards/rejected": -2.372284173965454, "step": 180 }, { "epoch": 0.3048421052631579, "grad_norm": 60.14921204443068, "kl": 0.0, "learning_rate": 4.402446969402419e-07, "logits/chosen": -1501441194.6666667, "logits/rejected": -1463862681.6, "logps/chosen": -328.10398356119794, "logps/rejected": -221.4558349609375, "loss": 0.4195, "rewards/chosen": -3.202397664388021, "rewards/margins": -1.721674950917562, "rewards/rejected": -1.480722713470459, "step": 181 }, { "epoch": 0.3065263157894737, "grad_norm": 57.771134136758484, "kl": 0.0, "learning_rate": 4.392872002711501e-07, "logits/chosen": -459086745.6, "logits/rejected": -1257842090.6666667, "logps/chosen": -290.8982421875, "logps/rejected": -208.20979817708334, "loss": 0.4709, "rewards/chosen": -1.0606164932250977, "rewards/margins": 0.5609116554260254, "rewards/rejected": -1.621528148651123, "step": 182 }, { "epoch": 0.3082105263157895, "grad_norm": 237.395695237356, "kl": 0.0, "learning_rate": 4.3832315215514316e-07, "logits/chosen": -752865587.2, "logits/rejected": -800709973.3333334, "logps/chosen": -222.4587646484375, "logps/rejected": -164.1585693359375, "loss": 0.3825, "rewards/chosen": 0.5814694404602051, "rewards/margins": 0.6762328982353211, "rewards/rejected": -0.09476345777511597, "step": 183 }, { "epoch": 0.3098947368421053, "grad_norm": 68.49160205355011, "kl": 0.0, "learning_rate": 4.3735258595903123e-07, "logits/chosen": -1131358412.8, "logits/rejected": -615695616.0, "logps/chosen": -263.4253173828125, "logps/rejected": -251.53666178385416, "loss": 0.4091, "rewards/chosen": -0.1255630373954773, "rewards/margins": 2.537329304218292, "rewards/rejected": -2.6628923416137695, "step": 184 }, { "epoch": 0.31157894736842107, "grad_norm": 56.555913296788326, "kl": 0.0, "learning_rate": 4.3637553527522265e-07, "logits/chosen": -537920128.0, "logits/rejected": -931403434.6666666, "logps/chosen": -261.276611328125, "logps/rejected": -258.6404622395833, "loss": 0.4377, "rewards/chosen": -0.8533248901367188, "rewards/margins": -0.1802154382069906, "rewards/rejected": -0.6731094519297282, "step": 185 }, { "epoch": 0.31326315789473685, "grad_norm": 62.41772660361089, "kl": 0.0, "learning_rate": 4.353920339205611e-07, "logits/chosen": -722902016.0, "logits/rejected": -948598144.0, "logps/chosen": -224.0010223388672, "logps/rejected": -272.22760009765625, "loss": 0.4404, "rewards/chosen": 0.367422878742218, "rewards/margins": 0.9069034457206726, "rewards/rejected": -0.5394805669784546, "step": 186 }, { "epoch": 0.31494736842105264, "grad_norm": 64.75287382395764, "kl": 0.0, "learning_rate": 4.344021159351555e-07, "logits/chosen": -1008326553.6, "logits/rejected": -771843413.3333334, "logps/chosen": -227.4381591796875, "logps/rejected": -186.0328369140625, "loss": 0.4424, "rewards/chosen": -0.36620848178863524, "rewards/margins": -0.2139639139175415, "rewards/rejected": -0.15224456787109375, "step": 187 }, { "epoch": 0.31663157894736843, "grad_norm": 70.9290525757904, "kl": 0.0, "learning_rate": 4.3340581558120195e-07, "logits/chosen": -618546688.0, "logits/rejected": -912823125.3333334, "logps/chosen": -246.94599609375, "logps/rejected": -296.3729654947917, "loss": 0.3832, "rewards/chosen": -0.18909727334976195, "rewards/margins": 3.131844015916189, "rewards/rejected": -3.3209412892659507, "step": 188 }, { "epoch": 0.3183157894736842, "grad_norm": 70.41433062876469, "kl": 0.0, "learning_rate": 4.3240316734179705e-07, "logits/chosen": -761420928.0, "logits/rejected": -672102144.0, "logps/chosen": -227.72821044921875, "logps/rejected": -258.29815673828125, "loss": 0.4255, "rewards/chosen": 0.08762133121490479, "rewards/margins": 3.5601266622543335, "rewards/rejected": -3.4725053310394287, "step": 189 }, { "epoch": 0.32, "grad_norm": 66.55630414028238, "kl": 0.0, "learning_rate": 4.313942059197456e-07, "logits/chosen": -618079808.0, "logits/rejected": -658955968.0, "logps/chosen": -252.71575927734375, "logps/rejected": -140.86489868164062, "loss": 0.4927, "rewards/chosen": -0.4330390989780426, "rewards/margins": -0.2638881802558899, "rewards/rejected": -0.1691509187221527, "step": 190 }, { "epoch": 0.3216842105263158, "grad_norm": 94.67677667797632, "kl": 0.0, "learning_rate": 4.3037896623635864e-07, "logits/chosen": -923324928.0, "logits/rejected": -594331520.0, "logps/chosen": -247.37379455566406, "logps/rejected": -226.16342163085938, "loss": 0.367, "rewards/chosen": 0.9253654479980469, "rewards/margins": 2.8820842504501343, "rewards/rejected": -1.9567188024520874, "step": 191 }, { "epoch": 0.3233684210526316, "grad_norm": 54.17392420976664, "kl": 0.0, "learning_rate": 4.29357483430245e-07, "logits/chosen": -664615808.0, "logits/rejected": -569389952.0, "logps/chosen": -216.7957763671875, "logps/rejected": -197.62344360351562, "loss": 0.4826, "rewards/chosen": -0.4946579138437907, "rewards/margins": 0.1053409179051717, "rewards/rejected": -0.5999988317489624, "step": 192 }, { "epoch": 0.32505263157894737, "grad_norm": 75.00984735078153, "kl": 1.3663177490234375, "learning_rate": 4.2832979285609505e-07, "logits/chosen": -614758997.3333334, "logits/rejected": -654426368.0, "logps/chosen": -235.02339680989584, "logps/rejected": -149.56541442871094, "loss": 0.3863, "rewards/chosen": 0.690934975941976, "rewards/margins": 0.510087807973226, "rewards/rejected": 0.18084716796875, "step": 193 }, { "epoch": 0.32673684210526316, "grad_norm": 60.51774121025042, "kl": 0.0, "learning_rate": 4.2729593008345734e-07, "logits/chosen": -1201705642.6666667, "logits/rejected": -741241139.2, "logps/chosen": -172.75325520833334, "logps/rejected": -184.46663818359374, "loss": 0.3963, "rewards/chosen": 1.0127630233764648, "rewards/margins": 2.2689538955688477, "rewards/rejected": -1.256190872192383, "step": 194 }, { "epoch": 0.32842105263157895, "grad_norm": 90.59367998785133, "kl": 0.0, "learning_rate": 4.2625593089550717e-07, "logits/chosen": -541644697.6, "logits/rejected": -1247126954.6666667, "logps/chosen": -258.279150390625, "logps/rejected": -216.1505126953125, "loss": 0.458, "rewards/chosen": 0.21086244583129882, "rewards/margins": 0.3597394009431203, "rewards/rejected": -0.1488769551118215, "step": 195 }, { "epoch": 0.33010526315789473, "grad_norm": 95.59189487331257, "kl": 0.0, "learning_rate": 4.2520983128780825e-07, "logits/chosen": -635985920.0, "logits/rejected": -922811136.0, "logps/chosen": -279.5119323730469, "logps/rejected": -208.4548543294271, "loss": 0.3814, "rewards/chosen": -0.1562713384628296, "rewards/margins": 1.136791984240214, "rewards/rejected": -1.2930633227030437, "step": 196 }, { "epoch": 0.3317894736842105, "grad_norm": 54.38017510634436, "kl": 0.0, "learning_rate": 4.2415766746706674e-07, "logits/chosen": -605963136.0, "logits/rejected": -808883507.2, "logps/chosen": -224.5291544596354, "logps/rejected": -254.458984375, "loss": 0.3932, "rewards/chosen": -1.42087189356486, "rewards/margins": 1.0473615964253742, "rewards/rejected": -2.4682334899902343, "step": 197 }, { "epoch": 0.3334736842105263, "grad_norm": 77.91234986865499, "kl": 0.0, "learning_rate": 4.230994758498782e-07, "logits/chosen": -977752576.0, "logits/rejected": -573459456.0, "logps/chosen": -287.57672119140625, "logps/rejected": -200.0487823486328, "loss": 0.4666, "rewards/chosen": -0.592963457107544, "rewards/margins": -0.5604517459869385, "rewards/rejected": -0.03251171112060547, "step": 198 }, { "epoch": 0.3351578947368421, "grad_norm": 217.16301375592573, "kl": 0.0, "learning_rate": 4.220352930614672e-07, "logits/chosen": -1347623082.6666667, "logits/rejected": -1223722905.6, "logps/chosen": -250.11665852864584, "logps/rejected": -210.2994140625, "loss": 0.3773, "rewards/chosen": 0.5189147790273031, "rewards/margins": 0.7931217749913534, "rewards/rejected": -0.2742069959640503, "step": 199 }, { "epoch": 0.3368421052631579, "grad_norm": 58.97358660347629, "kl": 0.0, "learning_rate": 4.209651559344194e-07, "logits/chosen": -1134724096.0, "logits/rejected": -713243093.3333334, "logps/chosen": -202.700537109375, "logps/rejected": -273.8854573567708, "loss": 0.4269, "rewards/chosen": 0.44961090087890626, "rewards/margins": 0.7851913293202718, "rewards/rejected": -0.33558042844136554, "step": 200 }, { "epoch": 0.3385263157894737, "grad_norm": 72.17373452754133, "kl": 0.0, "learning_rate": 4.198891015074073e-07, "logits/chosen": -646841984.0, "logits/rejected": -952984896.0, "logps/chosen": -190.56002807617188, "logps/rejected": -252.1724090576172, "loss": 0.4577, "rewards/chosen": -0.3583839535713196, "rewards/margins": -1.000853717327118, "rewards/rejected": 0.6424697637557983, "step": 201 }, { "epoch": 0.34021052631578946, "grad_norm": 80.45111424098052, "kl": 0.0, "learning_rate": 4.1880716702390763e-07, "logits/chosen": -1190700544.0, "logits/rejected": -969268480.0, "logps/chosen": -192.81265258789062, "logps/rejected": -200.2042236328125, "loss": 0.414, "rewards/chosen": 1.0531678199768066, "rewards/margins": 3.0669215520222983, "rewards/rejected": -2.0137537320454917, "step": 202 }, { "epoch": 0.34189473684210525, "grad_norm": 94.02730380206307, "kl": 0.0, "learning_rate": 4.177193899309126e-07, "logits/chosen": -503011993.6, "logits/rejected": -704219050.6666666, "logps/chosen": -218.402734375, "logps/rejected": -179.79007975260416, "loss": 0.4332, "rewards/chosen": 0.8202567100524902, "rewards/margins": 1.5902131398518882, "rewards/rejected": -0.7699564297993978, "step": 203 }, { "epoch": 0.34357894736842104, "grad_norm": 54.77503186544864, "kl": 0.0, "learning_rate": 4.166258078776341e-07, "logits/chosen": -832879786.6666666, "logits/rejected": -515199590.4, "logps/chosen": -202.35528564453125, "logps/rejected": -200.24393310546876, "loss": 0.4434, "rewards/chosen": 0.5717076857884725, "rewards/margins": 2.901677473386129, "rewards/rejected": -2.329969787597656, "step": 204 }, { "epoch": 0.3452631578947368, "grad_norm": 148.39721947330267, "kl": 0.0, "learning_rate": 4.155264587142001e-07, "logits/chosen": -1255011840.0, "logits/rejected": -1037022464.0, "logps/chosen": -242.1149139404297, "logps/rejected": -220.04427083333334, "loss": 0.5528, "rewards/chosen": -0.22197723388671875, "rewards/margins": -0.06859992941220602, "rewards/rejected": -0.15337730447451273, "step": 205 }, { "epoch": 0.3469473684210526, "grad_norm": 49.784955690964, "kl": 0.0, "learning_rate": 4.1442138049034494e-07, "logits/chosen": -775684266.6666666, "logits/rejected": -880259379.2, "logps/chosen": -191.68634033203125, "logps/rejected": -225.9909912109375, "loss": 0.3432, "rewards/chosen": 1.5534348487854004, "rewards/margins": 3.249916934967041, "rewards/rejected": -1.6964820861816405, "step": 206 }, { "epoch": 0.3486315789473684, "grad_norm": 91.02392105011648, "kl": 0.0, "learning_rate": 4.133106114540923e-07, "logits/chosen": -971414336.0, "logits/rejected": -1156522368.0, "logps/chosen": -213.8065185546875, "logps/rejected": -225.66744995117188, "loss": 0.3929, "rewards/chosen": -0.12490347027778625, "rewards/margins": 0.5315063297748566, "rewards/rejected": -0.6564098000526428, "step": 207 }, { "epoch": 0.3503157894736842, "grad_norm": 68.74268293327242, "kl": 0.0, "learning_rate": 4.1219419005043154e-07, "logits/chosen": -916664064.0, "logits/rejected": -622102208.0, "logps/chosen": -237.611083984375, "logps/rejected": -284.42645263671875, "loss": 0.5005, "rewards/chosen": 0.0357242226600647, "rewards/margins": 0.9489158391952515, "rewards/rejected": -0.9131916165351868, "step": 208 }, { "epoch": 0.352, "grad_norm": 76.77975476469975, "kl": 0.0, "learning_rate": 4.110721549199866e-07, "logits/chosen": -1072110400.0, "logits/rejected": -753468608.0, "logps/chosen": -184.95042419433594, "logps/rejected": -249.1209716796875, "loss": 0.3055, "rewards/chosen": 0.7373764514923096, "rewards/margins": 2.8142330646514893, "rewards/rejected": -2.0768566131591797, "step": 209 }, { "epoch": 0.35368421052631577, "grad_norm": 80.64382792874225, "kl": 0.0, "learning_rate": 4.0994454489767927e-07, "logits/chosen": -689984256.0, "logits/rejected": -1297480448.0, "logps/chosen": -210.7186279296875, "logps/rejected": -249.07969665527344, "loss": 0.3988, "rewards/chosen": 0.8691381613413492, "rewards/margins": 4.103166739145915, "rewards/rejected": -3.2340285778045654, "step": 210 }, { "epoch": 0.35536842105263156, "grad_norm": 149.68736456789574, "kl": 0.0, "learning_rate": 4.088113990113846e-07, "logits/chosen": -731458048.0, "logits/rejected": -345419264.0, "logps/chosen": -280.026025390625, "logps/rejected": -250.71551513671875, "loss": 0.3742, "rewards/chosen": 0.011942720413208008, "rewards/margins": 0.32606066862742106, "rewards/rejected": -0.31411794821421307, "step": 211 }, { "epoch": 0.35705263157894734, "grad_norm": 88.38448491891266, "kl": 0.0, "learning_rate": 4.076727564805802e-07, "logits/chosen": -1213323571.2, "logits/rejected": -650571477.3333334, "logps/chosen": -225.9, "logps/rejected": -215.1610107421875, "loss": 0.3696, "rewards/chosen": 1.0948437690734862, "rewards/margins": 1.9371202627817787, "rewards/rejected": -0.8422764937082926, "step": 212 }, { "epoch": 0.35873684210526313, "grad_norm": 65.08222255296863, "kl": 0.0, "learning_rate": 4.0652865671498906e-07, "logits/chosen": -1109334869.3333333, "logits/rejected": -673303040.0, "logps/chosen": -209.34431966145834, "logps/rejected": -264.689892578125, "loss": 0.4692, "rewards/chosen": 0.025906383991241455, "rewards/margins": 0.10383939743041992, "rewards/rejected": -0.07793301343917847, "step": 213 }, { "epoch": 0.3604210526315789, "grad_norm": 95.42507509421407, "kl": 0.0, "learning_rate": 4.053791393132149e-07, "logits/chosen": -2192135936.0, "logits/rejected": -679301717.3333334, "logps/chosen": -280.882080078125, "logps/rejected": -235.8901570638021, "loss": 0.3862, "rewards/chosen": 1.1142441034317017, "rewards/margins": 1.286988377571106, "rewards/rejected": -0.1727442741394043, "step": 214 }, { "epoch": 0.36210526315789476, "grad_norm": 57.980133275698655, "kl": 0.0, "learning_rate": 4.0422424406137235e-07, "logits/chosen": -533541728.0, "logits/rejected": -664400128.0, "logps/chosen": -241.3803253173828, "logps/rejected": -255.393310546875, "loss": 0.378, "rewards/chosen": 0.4608551263809204, "rewards/margins": 2.0191084146499634, "rewards/rejected": -1.558253288269043, "step": 215 }, { "epoch": 0.36378947368421055, "grad_norm": 62.774785038898976, "kl": 0.0, "learning_rate": 4.0306401093170956e-07, "logits/chosen": -863296819.2, "logits/rejected": -931824469.3333334, "logps/chosen": -209.0201171875, "logps/rejected": -258.80812581380206, "loss": 0.4346, "rewards/chosen": -0.07239959239959717, "rewards/margins": -0.04090743859608969, "rewards/rejected": -0.031492153803507485, "step": 216 }, { "epoch": 0.36547368421052634, "grad_norm": 43.09358868475917, "kl": 0.0, "learning_rate": 4.0189848008122475e-07, "logits/chosen": -832388096.0, "logits/rejected": -767827328.0, "logps/chosen": -231.43902587890625, "logps/rejected": -311.52874755859375, "loss": 0.3487, "rewards/chosen": 0.1406116485595703, "rewards/margins": 1.9062778949737549, "rewards/rejected": -1.7656662464141846, "step": 217 }, { "epoch": 0.3671578947368421, "grad_norm": 65.51992296550779, "kl": 0.0, "learning_rate": 4.007276918502763e-07, "logits/chosen": -1086964224.0, "logits/rejected": -893988352.0, "logps/chosen": -246.42593383789062, "logps/rejected": -244.95123291015625, "loss": 0.3927, "rewards/chosen": 0.23493842780590057, "rewards/margins": 1.4225323647260666, "rewards/rejected": -1.187593936920166, "step": 218 }, { "epoch": 0.3688421052631579, "grad_norm": 88.60132651868206, "kl": 0.0, "learning_rate": 3.9955168676118645e-07, "logits/chosen": -698185813.3333334, "logits/rejected": -1118987673.6, "logps/chosen": -195.71712239583334, "logps/rejected": -205.612646484375, "loss": 0.5847, "rewards/chosen": 0.2993367513020833, "rewards/margins": -0.6073127428690592, "rewards/rejected": 0.9066494941711426, "step": 219 }, { "epoch": 0.3705263157894737, "grad_norm": 49.2393342246424, "kl": 0.0, "learning_rate": 3.9837050551683904e-07, "logits/chosen": -615184170.6666666, "logits/rejected": -701940121.6, "logps/chosen": -227.86104329427084, "logps/rejected": -263.94287109375, "loss": 0.3788, "rewards/chosen": 0.2989359696706136, "rewards/margins": 2.076623805363973, "rewards/rejected": -1.7776878356933594, "step": 220 }, { "epoch": 0.3722105263157895, "grad_norm": 186.40613554967234, "kl": 0.0, "learning_rate": 3.9718418899927056e-07, "logits/chosen": -326911872.0, "logits/rejected": -801610093.7142857, "logps/chosen": -450.5743408203125, "logps/rejected": -212.93069893973214, "loss": 0.485, "rewards/chosen": -2.387774705886841, "rewards/margins": -2.170388392039708, "rewards/rejected": -0.21738631384713308, "step": 221 }, { "epoch": 0.3738947368421053, "grad_norm": 64.26673569575549, "kl": 0.0, "learning_rate": 3.959927782682551e-07, "logits/chosen": -726571008.0, "logits/rejected": -706536896.0, "logps/chosen": -230.51875813802084, "logps/rejected": -126.30707550048828, "loss": 0.4306, "rewards/chosen": -0.7994547684987386, "rewards/margins": -1.1165289779504142, "rewards/rejected": 0.3170742094516754, "step": 222 }, { "epoch": 0.37557894736842107, "grad_norm": 73.20938961913613, "kl": 0.0, "learning_rate": 3.947963145598833e-07, "logits/chosen": -1290946048.0, "logits/rejected": -1101487957.3333333, "logps/chosen": -192.78887939453125, "logps/rejected": -246.4966023763021, "loss": 0.4078, "rewards/chosen": 1.7199859619140625, "rewards/margins": 2.088035305341085, "rewards/rejected": -0.3680493434270223, "step": 223 }, { "epoch": 0.37726315789473686, "grad_norm": 47.763563197737376, "kl": 0.0, "learning_rate": 3.935948392851353e-07, "logits/chosen": -2707193088.0, "logits/rejected": -1093253997.7142856, "logps/chosen": -202.72845458984375, "logps/rejected": -189.14756556919642, "loss": 0.3882, "rewards/chosen": -2.7178618907928467, "rewards/margins": -2.8144114358084544, "rewards/rejected": 0.09654954501560756, "step": 224 }, { "epoch": 0.37894736842105264, "grad_norm": 67.91419872933243, "kl": 0.0, "learning_rate": 3.923883940284472e-07, "logits/chosen": -493479210.6666667, "logits/rejected": -663955968.0, "logps/chosen": -208.96756998697916, "logps/rejected": -219.612744140625, "loss": 0.4068, "rewards/chosen": 0.5567906697591146, "rewards/margins": 2.475810750325521, "rewards/rejected": -1.9190200805664062, "step": 225 }, { "epoch": 0.38063157894736843, "grad_norm": 45.896671613894135, "kl": 0.0, "learning_rate": 3.9117702054627164e-07, "logits/chosen": -843394901.3333334, "logits/rejected": -696314688.0, "logps/chosen": -184.59212239583334, "logps/rejected": -289.22589111328125, "loss": 0.3531, "rewards/chosen": 1.452480634053548, "rewards/margins": 4.889726003011067, "rewards/rejected": -3.4372453689575195, "step": 226 }, { "epoch": 0.3823157894736842, "grad_norm": 117.9741866381254, "kl": 0.0, "learning_rate": 3.8996076076563333e-07, "logits/chosen": -633168768.0, "logits/rejected": -1228658005.3333333, "logps/chosen": -255.6255645751953, "logps/rejected": -198.0909423828125, "loss": 0.5188, "rewards/chosen": -0.49002379179000854, "rewards/margins": -1.5519655744234722, "rewards/rejected": 1.0619417826334636, "step": 227 }, { "epoch": 0.384, "grad_norm": 59.87930468202007, "kl": 0.0, "learning_rate": 3.8873965678267686e-07, "logits/chosen": -748600405.3333334, "logits/rejected": -564317888.0, "logps/chosen": -205.5136515299479, "logps/rejected": -190.98033142089844, "loss": 0.4398, "rewards/chosen": 0.013506333033243815, "rewards/margins": 0.8160205880800883, "rewards/rejected": -0.8025142550468445, "step": 228 }, { "epoch": 0.3856842105263158, "grad_norm": 55.75368595740401, "kl": 0.0, "learning_rate": 3.8751375086121027e-07, "logits/chosen": -689110741.3333334, "logits/rejected": -534848768.0, "logps/chosen": -284.23940022786456, "logps/rejected": -163.17276000976562, "loss": 0.4007, "rewards/chosen": 0.30055542786916095, "rewards/margins": 0.026084144910176577, "rewards/rejected": 0.2744712829589844, "step": 229 }, { "epoch": 0.3873684210526316, "grad_norm": 85.00209812207287, "kl": 0.0, "learning_rate": 3.8628308543124264e-07, "logits/chosen": -733420748.8, "logits/rejected": -613972394.6666666, "logps/chosen": -240.112109375, "logps/rejected": -277.9342854817708, "loss": 0.4103, "rewards/chosen": 0.4192187309265137, "rewards/margins": 0.19571206172307334, "rewards/rejected": 0.22350666920344034, "step": 230 }, { "epoch": 0.38905263157894737, "grad_norm": 67.82002108621575, "kl": 0.0, "learning_rate": 3.8504770308751465e-07, "logits/chosen": -815699712.0, "logits/rejected": -538265600.0, "logps/chosen": -195.72711181640625, "logps/rejected": -168.96177673339844, "loss": 0.3659, "rewards/chosen": 0.9801532030105591, "rewards/margins": 0.6151527762413025, "rewards/rejected": 0.3650004267692566, "step": 231 }, { "epoch": 0.39073684210526316, "grad_norm": 77.76306759423939, "kl": 0.0, "learning_rate": 3.8380764658802476e-07, "logits/chosen": -863703961.6, "logits/rejected": -500137898.6666667, "logps/chosen": -190.1537353515625, "logps/rejected": -248.5384318033854, "loss": 0.3315, "rewards/chosen": 0.9144930839538574, "rewards/margins": 1.550446311632792, "rewards/rejected": -0.6359532276789347, "step": 232 }, { "epoch": 0.39242105263157895, "grad_norm": 66.37493353593294, "kl": 0.0, "learning_rate": 3.8256295885254977e-07, "logits/chosen": -929981747.2, "logits/rejected": -953465258.6666666, "logps/chosen": -216.4085205078125, "logps/rejected": -182.04376220703125, "loss": 0.3813, "rewards/chosen": 1.277543067932129, "rewards/margins": 1.0003930727640789, "rewards/rejected": 0.2771499951680501, "step": 233 }, { "epoch": 0.39410526315789474, "grad_norm": 75.1964810655982, "kl": 0.0, "learning_rate": 3.8131368296115823e-07, "logits/chosen": -896362752.0, "logits/rejected": -561620096.0, "logps/chosen": -249.6129353841146, "logps/rejected": -264.5303955078125, "loss": 0.2827, "rewards/chosen": 1.139868974685669, "rewards/margins": 2.838852047920227, "rewards/rejected": -1.698983073234558, "step": 234 }, { "epoch": 0.3957894736842105, "grad_norm": 122.44554306763347, "kl": 0.0, "learning_rate": 3.800598621527205e-07, "logits/chosen": -969984146.2857143, "logits/rejected": -854562432.0, "logps/chosen": -182.46547154017858, "logps/rejected": -194.51100158691406, "loss": 0.3864, "rewards/chosen": 1.404153551374163, "rewards/margins": 1.3077897163374084, "rewards/rejected": 0.09636383503675461, "step": 235 }, { "epoch": 0.3974736842105263, "grad_norm": 60.34196841703653, "kl": 0.0, "learning_rate": 3.7880153982341163e-07, "logits/chosen": -823134464.0, "logits/rejected": -873622101.3333334, "logps/chosen": -235.61932373046875, "logps/rejected": -271.49196370442706, "loss": 0.3463, "rewards/chosen": 2.0248725414276123, "rewards/margins": 3.3654048442840576, "rewards/rejected": -1.3405323028564453, "step": 236 }, { "epoch": 0.3991578947368421, "grad_norm": 73.61126053660914, "kl": 0.0, "learning_rate": 3.7753875952520937e-07, "logits/chosen": -791607424.0, "logits/rejected": -736724992.0, "logps/chosen": -260.60198974609375, "logps/rejected": -270.4560852050781, "loss": 0.3967, "rewards/chosen": 0.33294641971588135, "rewards/margins": 2.238220691680908, "rewards/rejected": -1.9052742719650269, "step": 237 }, { "epoch": 0.4008421052631579, "grad_norm": 44.26653587553288, "kl": 0.0, "learning_rate": 3.7627156496438685e-07, "logits/chosen": -824610218.6666666, "logits/rejected": -856471756.8, "logps/chosen": -174.15934244791666, "logps/rejected": -221.6721435546875, "loss": 0.4355, "rewards/chosen": 1.1449518998463948, "rewards/margins": 0.593308385213216, "rewards/rejected": 0.5516435146331787, "step": 238 }, { "epoch": 0.4025263157894737, "grad_norm": 47.69085670063618, "kl": 0.0, "learning_rate": 3.75e-07, "logits/chosen": -820081216.0, "logits/rejected": -955681408.0, "logps/chosen": -224.08152770996094, "logps/rejected": -240.1038818359375, "loss": 0.3694, "rewards/chosen": 1.8589047193527222, "rewards/margins": 3.0458600521087646, "rewards/rejected": -1.1869553327560425, "step": 239 }, { "epoch": 0.40421052631578946, "grad_norm": 48.281804889717954, "kl": 0.0, "learning_rate": 3.7372410864236947e-07, "logits/chosen": -717564245.3333334, "logits/rejected": -679164364.8, "logps/chosen": -152.29527791341147, "logps/rejected": -278.3427490234375, "loss": 0.3254, "rewards/chosen": 1.1597716808319092, "rewards/margins": 3.844050931930542, "rewards/rejected": -2.6842792510986326, "step": 240 }, { "epoch": 0.40589473684210525, "grad_norm": 75.76751371273427, "kl": 0.0, "learning_rate": 3.724439350515571e-07, "logits/chosen": -629566144.0, "logits/rejected": -622408640.0, "logps/chosen": -201.63623046875, "logps/rejected": -248.00782775878906, "loss": 0.3847, "rewards/chosen": 0.5420559048652649, "rewards/margins": 1.8232646584510803, "rewards/rejected": -1.2812087535858154, "step": 241 }, { "epoch": 0.40757894736842104, "grad_norm": 95.46822610468628, "kl": 0.0, "learning_rate": 3.7115952353583803e-07, "logits/chosen": -702194048.0, "logps/chosen": -212.0802459716797, "loss": 0.3904, "rewards/chosen": 1.2338132858276367, "step": 242 }, { "epoch": 0.40926315789473683, "grad_norm": 86.1205385319129, "kl": 0.0, "learning_rate": 3.698709185501666e-07, "logits/chosen": -567989094.4, "logits/rejected": -984176554.6666666, "logps/chosen": -259.487060546875, "logps/rejected": -214.1506144205729, "loss": 0.2951, "rewards/chosen": 0.584656047821045, "rewards/margins": 1.1261462450027466, "rewards/rejected": -0.5414901971817017, "step": 243 }, { "epoch": 0.4109473684210526, "grad_norm": 94.11033116313197, "kl": 0.0, "learning_rate": 3.6857816469463805e-07, "logits/chosen": -414785728.0, "logits/rejected": -834902308.5714285, "logps/chosen": -117.85265350341797, "logps/rejected": -237.00685337611608, "loss": 0.4324, "rewards/chosen": 1.1912513971328735, "rewards/margins": 1.5301621811730521, "rewards/rejected": -0.33891078404017855, "step": 244 }, { "epoch": 0.4126315789473684, "grad_norm": 73.01801229445103, "kl": 0.0, "learning_rate": 3.6728130671294484e-07, "logits/chosen": -480209600.0, "logits/rejected": -1261644629.3333333, "logps/chosen": -242.870361328125, "logps/rejected": -297.1340738932292, "loss": 0.3948, "rewards/chosen": 1.906396508216858, "rewards/margins": 3.281412720680237, "rewards/rejected": -1.375016212463379, "step": 245 }, { "epoch": 0.4143157894736842, "grad_norm": 50.21406501502345, "kl": 0.0, "learning_rate": 3.6598038949082773e-07, "logits/chosen": -1082578688.0, "logits/rejected": -1054340505.6, "logps/chosen": -202.77239990234375, "logps/rejected": -241.3439697265625, "loss": 0.4015, "rewards/chosen": 0.08610687653223674, "rewards/margins": -0.06312040885289509, "rewards/rejected": 0.14922728538513183, "step": 246 }, { "epoch": 0.416, "grad_norm": 69.90136809562398, "kl": 0.0, "learning_rate": 3.646754580545226e-07, "logits/chosen": -551080704.0, "logits/rejected": -721525162.6666666, "logps/chosen": -253.119091796875, "logps/rejected": -209.84647623697916, "loss": 0.4882, "rewards/chosen": -0.7751388549804688, "rewards/margins": -1.7328248023986816, "rewards/rejected": 0.9576859474182129, "step": 247 }, { "epoch": 0.41768421052631577, "grad_norm": 66.55136067755517, "kl": 0.0, "learning_rate": 3.633665575692019e-07, "logits/chosen": -469412010.6666667, "logits/rejected": -688726988.8, "logps/chosen": -186.66743977864584, "logps/rejected": -300.4759765625, "loss": 0.3747, "rewards/chosen": 0.8189966678619385, "rewards/margins": 2.057415819168091, "rewards/rejected": -1.2384191513061524, "step": 248 }, { "epoch": 0.41936842105263156, "grad_norm": 91.34875948495586, "kl": 0.0, "learning_rate": 3.6205373333741137e-07, "logits/chosen": -695206528.0, "logits/rejected": -909713856.0, "logps/chosen": -220.77886962890625, "logps/rejected": -275.035400390625, "loss": 0.4002, "rewards/chosen": 0.9514415860176086, "rewards/margins": 2.3362786173820496, "rewards/rejected": -1.384837031364441, "step": 249 }, { "epoch": 0.42105263157894735, "grad_norm": 41.18603961196617, "kl": 0.0, "learning_rate": 3.60737030797502e-07, "logits/chosen": -578035840.0, "logits/rejected": -939070378.6666666, "logps/chosen": -261.39300537109375, "logps/rejected": -181.3475341796875, "loss": 0.4095, "rewards/chosen": -1.6891616582870483, "rewards/margins": -1.0875247716903687, "rewards/rejected": -0.6016368865966797, "step": 250 }, { "epoch": 0.42273684210526313, "grad_norm": 116.0424680075244, "kl": 0.0, "learning_rate": 3.5941649552205766e-07, "logits/chosen": -1035228672.0, "logits/rejected": -634597068.8, "logps/chosen": -246.8405558268229, "logps/rejected": -214.5775390625, "loss": 0.4228, "rewards/chosen": -0.7042236328125, "rewards/margins": -1.805875873565674, "rewards/rejected": 1.101652240753174, "step": 251 }, { "epoch": 0.4244210526315789, "grad_norm": 57.738431491671584, "kl": 0.0, "learning_rate": 3.580921732163174e-07, "logits/chosen": -844649472.0, "logits/rejected": -548579840.0, "logps/chosen": -214.6180419921875, "logps/rejected": -231.5613250732422, "loss": 0.3853, "rewards/chosen": 0.8030085563659668, "rewards/margins": 1.5388163328170776, "rewards/rejected": -0.7358077764511108, "step": 252 }, { "epoch": 0.4261052631578947, "grad_norm": 346.5318458002952, "kl": 0.0, "learning_rate": 3.56764109716594e-07, "logits/chosen": -551624384.0, "logits/rejected": -1136507611.4285715, "logps/chosen": -178.53115844726562, "logps/rejected": -271.80191476004467, "loss": 0.3623, "rewards/chosen": 3.3293793201446533, "rewards/margins": 8.204052346093313, "rewards/rejected": -4.874673025948661, "step": 253 }, { "epoch": 0.42778947368421055, "grad_norm": 115.23971460689606, "kl": 0.0, "learning_rate": 3.55432350988687e-07, "logits/chosen": -830976576.0, "logits/rejected": -1067142144.0, "logps/chosen": -153.7244415283203, "logps/rejected": -251.48733520507812, "loss": 0.4097, "rewards/chosen": 0.4606727957725525, "rewards/margins": 1.2488128542900085, "rewards/rejected": -0.788140058517456, "step": 254 }, { "epoch": 0.42947368421052634, "grad_norm": 54.92704059463097, "kl": 0.0, "learning_rate": 3.5409694312629193e-07, "logits/chosen": -630669141.3333334, "logits/rejected": -987313868.8, "logps/chosen": -209.82574462890625, "logps/rejected": -287.5651611328125, "loss": 0.3034, "rewards/chosen": 1.022467056910197, "rewards/margins": 2.030047623316447, "rewards/rejected": -1.00758056640625, "step": 255 }, { "epoch": 0.43115789473684213, "grad_norm": 310.7398715383573, "kl": 0.0, "learning_rate": 3.5275793234940544e-07, "logits/chosen": -917445760.0, "logits/rejected": -839287808.0, "logps/chosen": -376.3251953125, "logps/rejected": -219.46243722098214, "loss": 0.4546, "rewards/chosen": -1.2787322998046875, "rewards/margins": -1.270533732005528, "rewards/rejected": -0.00819856779915946, "step": 256 }, { "epoch": 0.4328421052631579, "grad_norm": 92.05663310439607, "kl": 1.0852890014648438, "learning_rate": 3.514153650027249e-07, "logits/chosen": -917584298.6666666, "logits/rejected": -720754227.2, "logps/chosen": -277.08099365234375, "logps/rejected": -215.7774169921875, "loss": 0.4514, "rewards/chosen": -1.6589380900065105, "rewards/margins": -1.4071196715037029, "rewards/rejected": -0.2518184185028076, "step": 257 }, { "epoch": 0.4345263157894737, "grad_norm": 46.29681896211451, "kl": 0.0, "learning_rate": 3.5006928755404464e-07, "logits/chosen": -601805260.8, "logits/rejected": -705630848.0, "logps/chosen": -302.690771484375, "logps/rejected": -220.0159912109375, "loss": 0.3499, "rewards/chosen": 0.9885660171508789, "rewards/margins": -0.45607048670450856, "rewards/rejected": 1.4446365038553874, "step": 258 }, { "epoch": 0.4362105263157895, "grad_norm": 60.3839696290393, "kl": 0.0, "learning_rate": 3.487197465926478e-07, "logits/chosen": -877942016.0, "logits/rejected": -762154956.8, "logps/chosen": -241.57857259114584, "logps/rejected": -244.444482421875, "loss": 0.4307, "rewards/chosen": 1.5284678141276042, "rewards/margins": 1.983873716990153, "rewards/rejected": -0.45540590286254884, "step": 259 }, { "epoch": 0.4378947368421053, "grad_norm": 47.33258383634144, "kl": 0.0, "learning_rate": 3.473667888276935e-07, "logits/chosen": -555102617.6, "logits/rejected": -590850773.3333334, "logps/chosen": -220.5026123046875, "logps/rejected": -312.41845703125, "loss": 0.3409, "rewards/chosen": 1.8922588348388671, "rewards/margins": 2.5513052860895793, "rewards/rejected": -0.659046451250712, "step": 260 }, { "epoch": 0.43957894736842107, "grad_norm": 108.21153319687332, "kl": 0.0, "learning_rate": 3.460104610866003e-07, "logits/chosen": -1036087466.6666666, "logits/rejected": -1177354444.8, "logps/chosen": -258.5961100260417, "logps/rejected": -187.52177734375, "loss": 0.4231, "rewards/chosen": 1.5323643684387207, "rewards/margins": 2.1087326049804687, "rewards/rejected": -0.5763682365417481, "step": 261 }, { "epoch": 0.44126315789473686, "grad_norm": 119.88936087714019, "kl": 0.0, "learning_rate": 3.4465081031342583e-07, "logits/chosen": -810871347.2, "logits/rejected": -1137264128.0, "logps/chosen": -274.263916015625, "logps/rejected": -234.40445963541666, "loss": 0.4701, "rewards/chosen": 0.387792706489563, "rewards/margins": -0.040941635767618834, "rewards/rejected": 0.4287343422571818, "step": 262 }, { "epoch": 0.44294736842105265, "grad_norm": 61.18523248696411, "kl": 0.0, "learning_rate": 3.432878835672413e-07, "logits/chosen": -875720789.3333334, "logits/rejected": -787683264.0, "logps/chosen": -223.71048990885416, "logps/rejected": -237.80686950683594, "loss": 0.4187, "rewards/chosen": 1.0855443477630615, "rewards/margins": 0.4147740602493286, "rewards/rejected": 0.6707702875137329, "step": 263 }, { "epoch": 0.44463157894736843, "grad_norm": 85.74861272190243, "kl": 0.0, "learning_rate": 3.419217280205032e-07, "logits/chosen": -493305770.6666667, "logits/rejected": -738251980.8, "logps/chosen": -250.9864501953125, "logps/rejected": -200.50716552734374, "loss": 0.4608, "rewards/chosen": 0.6460779507954916, "rewards/margins": 0.17245400746663414, "rewards/rejected": 0.47362394332885743, "step": 264 }, { "epoch": 0.4463157894736842, "grad_norm": 54.77540128655694, "kl": 0.0, "learning_rate": 3.405523909574206e-07, "logits/chosen": -651585450.6666666, "logits/rejected": -683791001.6, "logps/chosen": -242.39778645833334, "logps/rejected": -298.1034912109375, "loss": 0.3501, "rewards/chosen": 0.5337386926015218, "rewards/margins": 0.8820349057515462, "rewards/rejected": -0.3482962131500244, "step": 265 }, { "epoch": 0.448, "grad_norm": 89.19503556391335, "kl": 0.0, "learning_rate": 3.391799197723185e-07, "logits/chosen": -1162056499.2, "logits/rejected": -1215143509.3333333, "logps/chosen": -259.603076171875, "logps/rejected": -309.9653727213542, "loss": 0.4462, "rewards/chosen": 0.42551660537719727, "rewards/margins": 2.0183765093485517, "rewards/rejected": -1.5928599039713542, "step": 266 }, { "epoch": 0.4496842105263158, "grad_norm": 50.9461938511508, "kl": 0.0, "learning_rate": 3.378043619679973e-07, "logits/chosen": -1112641536.0, "logits/rejected": -554899541.3333334, "logps/chosen": -208.4407470703125, "logps/rejected": -223.9110107421875, "loss": 0.37, "rewards/chosen": 1.009368324279785, "rewards/margins": 1.0311517755190531, "rewards/rejected": -0.021783451239267986, "step": 267 }, { "epoch": 0.4513684210526316, "grad_norm": 127.83348671346022, "kl": 0.0, "learning_rate": 3.3642576515408905e-07, "logits/chosen": -642838357.3333334, "logits/rejected": -1174076211.2, "logps/chosen": -164.12750244140625, "logps/rejected": -177.136376953125, "loss": 0.4873, "rewards/chosen": 0.642901619275411, "rewards/margins": 1.1527833541234336, "rewards/rejected": -0.5098817348480225, "step": 268 }, { "epoch": 0.4530526315789474, "grad_norm": 106.6924383104548, "kl": 0.0, "learning_rate": 3.350441770454092e-07, "logits/chosen": -881408614.4, "logits/rejected": -748522496.0, "logps/chosen": -271.259326171875, "logps/rejected": -234.607421875, "loss": 0.3324, "rewards/chosen": 0.8681741714477539, "rewards/margins": 1.9296697139739991, "rewards/rejected": -1.0614955425262451, "step": 269 }, { "epoch": 0.45473684210526316, "grad_norm": 51.51923388471137, "kl": 0.0, "learning_rate": 3.336596454603054e-07, "logits/chosen": -1130812074.6666667, "logits/rejected": -598989926.4, "logps/chosen": -204.42683919270834, "logps/rejected": -234.242333984375, "loss": 0.3524, "rewards/chosen": 0.7063151200612386, "rewards/margins": 1.668326743443807, "rewards/rejected": -0.9620116233825684, "step": 270 }, { "epoch": 0.45642105263157895, "grad_norm": 56.40900184314831, "kl": 0.0, "learning_rate": 3.322722183190025e-07, "logits/chosen": -734314154.6666666, "logits/rejected": -522018406.4, "logps/chosen": -165.00169881184897, "logps/rejected": -248.694873046875, "loss": 0.445, "rewards/chosen": -1.2975707054138184, "rewards/margins": -1.0448772430419921, "rewards/rejected": -0.25269346237182616, "step": 271 }, { "epoch": 0.45810526315789474, "grad_norm": 68.53879212164166, "kl": 0.0, "learning_rate": 3.308819436419437e-07, "logits/chosen": -1095972181.3333333, "logits/rejected": -1827424384.0, "logps/chosen": -254.98893229166666, "logps/rejected": -206.98135375976562, "loss": 0.3692, "rewards/chosen": 0.8086059888203939, "rewards/margins": 4.705807526906331, "rewards/rejected": -3.8972015380859375, "step": 272 }, { "epoch": 0.4597894736842105, "grad_norm": 62.413627457878825, "kl": 0.0, "learning_rate": 3.294888695481287e-07, "logits/chosen": -1661173418.6666667, "logits/rejected": -1076040294.4, "logps/chosen": -195.62664794921875, "logps/rejected": -220.1724609375, "loss": 0.4067, "rewards/chosen": 1.7726329167683919, "rewards/margins": 2.3173933347066242, "rewards/rejected": -0.5447604179382324, "step": 273 }, { "epoch": 0.4614736842105263, "grad_norm": 41.769964940057946, "kl": 0.0, "learning_rate": 3.2809304425344856e-07, "logits/chosen": -1145147264.0, "logits/rejected": -744639780.5714285, "logps/chosen": -178.32012939453125, "logps/rejected": -222.49560546875, "loss": 0.4646, "rewards/chosen": 1.3849213123321533, "rewards/margins": 1.6690550361360823, "rewards/rejected": -0.28413372380392893, "step": 274 }, { "epoch": 0.4631578947368421, "grad_norm": 51.7666775396847, "kl": 0.0, "learning_rate": 3.266945160690159e-07, "logits/chosen": -718938112.0, "logits/rejected": -701548672.0, "logps/chosen": -168.946533203125, "logps/rejected": -342.5791015625, "loss": 0.3497, "rewards/chosen": 1.088832139968872, "rewards/margins": 3.3126580715179443, "rewards/rejected": -2.2238259315490723, "step": 275 }, { "epoch": 0.4648421052631579, "grad_norm": 40.27881909450745, "kl": 0.0, "learning_rate": 3.252933333994942e-07, "logits/chosen": -618843904.0, "logits/rejected": -992992768.0, "logps/chosen": -176.893798828125, "logps/rejected": -240.34130859375, "loss": 0.3206, "rewards/chosen": 1.2537689208984375, "rewards/margins": 2.309921582539876, "rewards/rejected": -1.0561526616414387, "step": 276 }, { "epoch": 0.4665263157894737, "grad_norm": 37.41871849188705, "kl": 0.0, "learning_rate": 3.23889544741421e-07, "logits/chosen": -787857536.0, "logits/rejected": -838022016.0, "logps/chosen": -214.87179565429688, "logps/rejected": -229.31048583984375, "loss": 0.2983, "rewards/chosen": 2.574376106262207, "rewards/margins": 4.175034284591675, "rewards/rejected": -1.6006581783294678, "step": 277 }, { "epoch": 0.46821052631578947, "grad_norm": 46.77822204181921, "kl": 0.0, "learning_rate": 3.224831986815306e-07, "logits/chosen": -1206424576.0, "logits/rejected": -531875840.0, "logps/chosen": -218.09373474121094, "logps/rejected": -287.2392578125, "loss": 0.3122, "rewards/chosen": 1.3858611583709717, "rewards/margins": 7.854579210281372, "rewards/rejected": -6.4687180519104, "step": 278 }, { "epoch": 0.46989473684210525, "grad_norm": 48.645481075648526, "kl": 0.0, "learning_rate": 3.2107434389507177e-07, "logits/chosen": -829400217.6, "logits/rejected": -402655232.0, "logps/chosen": -180.144287109375, "logps/rejected": -130.1499226888021, "loss": 0.329, "rewards/chosen": 1.9622688293457031, "rewards/margins": 1.4281412760416665, "rewards/rejected": 0.5341275533040365, "step": 279 }, { "epoch": 0.47157894736842104, "grad_norm": 41.28195566807422, "kl": 0.0, "learning_rate": 3.19663029144123e-07, "logits/chosen": -841884723.2, "logits/rejected": -862919680.0, "logps/chosen": -200.8014892578125, "logps/rejected": -234.42822265625, "loss": 0.3977, "rewards/chosen": 0.43205790519714354, "rewards/margins": 0.7687125682830811, "rewards/rejected": -0.3366546630859375, "step": 280 }, { "epoch": 0.47326315789473683, "grad_norm": 84.07688846755318, "kl": 0.0, "learning_rate": 3.1824930327590525e-07, "logits/chosen": -710278314.6666666, "logits/rejected": -995849472.0, "logps/chosen": -224.30419921875, "logps/rejected": -217.58340454101562, "loss": 0.3496, "rewards/chosen": 0.7394833564758301, "rewards/margins": 0.8354802131652832, "rewards/rejected": -0.09599685668945312, "step": 281 }, { "epoch": 0.4749473684210526, "grad_norm": 112.7314053842234, "kl": 0.0, "learning_rate": 3.168332152210909e-07, "logits/chosen": -497686954.6666667, "logits/rejected": -851313049.6, "logps/chosen": -303.6035970052083, "logps/rejected": -268.5775390625, "loss": 0.4291, "rewards/chosen": -0.6700571378072103, "rewards/margins": -1.136230500539144, "rewards/rejected": 0.4661733627319336, "step": 282 }, { "epoch": 0.4766315789473684, "grad_norm": 47.718340024126896, "kl": 0.0, "learning_rate": 3.154148139921102e-07, "logits/chosen": -1520765056.0, "logits/rejected": -917776018.2857143, "logps/chosen": -182.46583557128906, "logps/rejected": -224.23322405133928, "loss": 0.3508, "rewards/chosen": -1.4963241815567017, "rewards/margins": -0.8272254637309483, "rewards/rejected": -0.6690987178257534, "step": 283 }, { "epoch": 0.4783157894736842, "grad_norm": 51.50174863063673, "kl": 0.0, "learning_rate": 3.1399414868145503e-07, "logits/chosen": -879817557.3333334, "logits/rejected": -509886771.2, "logps/chosen": -196.01708984375, "logps/rejected": -247.4649169921875, "loss": 0.4224, "rewards/chosen": -1.2518666585286458, "rewards/margins": 2.6389341990152992, "rewards/rejected": -3.8908008575439452, "step": 284 }, { "epoch": 0.48, "grad_norm": 46.979767312290676, "kl": 0.0, "learning_rate": 3.1257126845997996e-07, "logits/chosen": -605518720.0, "logits/rejected": -689517056.0, "logps/chosen": -217.33444213867188, "logps/rejected": -329.7290954589844, "loss": 0.2682, "rewards/chosen": 0.8716354370117188, "rewards/margins": 11.700860023498535, "rewards/rejected": -10.829224586486816, "step": 285 }, { "epoch": 0.48168421052631577, "grad_norm": 53.078466818339, "kl": 0.0, "learning_rate": 3.111462225752e-07, "logits/chosen": -1177064704.0, "logits/rejected": -1062161664.0, "logps/chosen": -172.35658264160156, "logps/rejected": -229.10531616210938, "loss": 0.2759, "rewards/chosen": 1.4827051162719727, "rewards/margins": 3.3628339767456055, "rewards/rejected": -1.8801288604736328, "step": 286 }, { "epoch": 0.48336842105263156, "grad_norm": 73.05034651400905, "kl": 0.0, "learning_rate": 3.097190603495861e-07, "logits/chosen": -1032015040.0, "logits/rejected": -431953184.0, "logps/chosen": -204.94126892089844, "logps/rejected": -273.42724609375, "loss": 0.4725, "rewards/chosen": 0.42430874705314636, "rewards/margins": 0.12238076329231262, "rewards/rejected": 0.30192798376083374, "step": 287 }, { "epoch": 0.48505263157894735, "grad_norm": 58.199485015649564, "kl": 0.0, "learning_rate": 3.0828983117885856e-07, "logits/chosen": -947543360.0, "logits/rejected": -1011533738.6666666, "logps/chosen": -225.79100036621094, "logps/rejected": -238.0352579752604, "loss": 0.3671, "rewards/chosen": 0.18504944443702698, "rewards/margins": 2.1747958560784655, "rewards/rejected": -1.9897464116414387, "step": 288 }, { "epoch": 0.48673684210526313, "grad_norm": 38.613615018070185, "kl": 0.0, "learning_rate": 3.0685858453027665e-07, "logits/chosen": -1476833689.6, "logits/rejected": -866345984.0, "logps/chosen": -210.87587890625, "logps/rejected": -175.0899861653646, "loss": 0.4495, "rewards/chosen": 1.3699981689453125, "rewards/margins": 0.4098469734191894, "rewards/rejected": 0.960151195526123, "step": 289 }, { "epoch": 0.4884210526315789, "grad_norm": 62.47965185776049, "kl": 0.0, "learning_rate": 3.0542536994092696e-07, "logits/chosen": -1110704128.0, "logits/rejected": -717989222.4, "logps/chosen": -271.7138671875, "logps/rejected": -226.588330078125, "loss": 0.3672, "rewards/chosen": 0.2757614056269328, "rewards/margins": 2.1927677075068157, "rewards/rejected": -1.9170063018798829, "step": 290 }, { "epoch": 0.4901052631578947, "grad_norm": 44.32623223762354, "kl": 0.0, "learning_rate": 3.03990237016009e-07, "logits/chosen": -798940876.8, "logits/rejected": -814150997.3333334, "logps/chosen": -253.6781005859375, "logps/rejected": -307.87721761067706, "loss": 0.405, "rewards/chosen": 0.5707196235656739, "rewards/margins": 3.1684674263000487, "rewards/rejected": -2.597747802734375, "step": 291 }, { "epoch": 0.4917894736842105, "grad_norm": 55.91294023967487, "kl": 0.0, "learning_rate": 3.025532354271178e-07, "logits/chosen": -961314816.0, "logits/rejected": -835593676.8, "logps/chosen": -208.382568359375, "logps/rejected": -190.52183837890624, "loss": 0.4201, "rewards/chosen": 0.48334558804829914, "rewards/margins": 0.11008816560109452, "rewards/rejected": 0.3732574224472046, "step": 292 }, { "epoch": 0.49347368421052634, "grad_norm": 39.083753233538296, "kl": 0.0, "learning_rate": 3.0111441491052505e-07, "logits/chosen": -898919116.8, "logits/rejected": -784928000.0, "logps/chosen": -233.350927734375, "logps/rejected": -234.80265299479166, "loss": 0.4615, "rewards/chosen": 0.10661284923553467, "rewards/margins": 1.890099851290385, "rewards/rejected": -1.7834870020548503, "step": 293 }, { "epoch": 0.49515789473684213, "grad_norm": 51.14880984410606, "kl": 0.0, "learning_rate": 2.996738252654577e-07, "logits/chosen": -821468160.0, "logits/rejected": -1598704128.0, "logps/chosen": -196.221533203125, "logps/rejected": -359.334228515625, "loss": 0.4244, "rewards/chosen": -0.5685586929321289, "rewards/margins": 3.059499422709147, "rewards/rejected": -3.628058115641276, "step": 294 }, { "epoch": 0.4968421052631579, "grad_norm": 106.86865404878358, "kl": 0.0, "learning_rate": 2.982315163523742e-07, "logits/chosen": -661637952.0, "logits/rejected": -624379264.0, "logps/chosen": -208.84210205078125, "logps/rejected": -236.47945149739584, "loss": 0.4826, "rewards/chosen": 0.8578643798828125, "rewards/margins": 1.4199414253234863, "rewards/rejected": -0.5620770454406738, "step": 295 }, { "epoch": 0.4985263157894737, "grad_norm": 60.74058573931991, "kl": 0.0, "learning_rate": 2.967875380912388e-07, "logits/chosen": -1204205824.0, "logits/rejected": -1308154026.6666667, "logps/chosen": -226.80226135253906, "logps/rejected": -224.9236857096354, "loss": 0.506, "rewards/chosen": 0.23896867036819458, "rewards/margins": 0.5748037298520405, "rewards/rejected": -0.335835059483846, "step": 296 }, { "epoch": 0.5002105263157894, "grad_norm": 42.435672467475534, "kl": 0.0, "learning_rate": 2.9534194045979393e-07, "logits/chosen": -706206506.6666666, "logits/rejected": -569180224.0, "logps/chosen": -252.4887898763021, "logps/rejected": -193.00033569335938, "loss": 0.372, "rewards/chosen": 0.9058823585510254, "rewards/margins": 3.202226400375366, "rewards/rejected": -2.296344041824341, "step": 297 }, { "epoch": 0.5018947368421053, "grad_norm": 56.50443377225829, "kl": 0.0, "learning_rate": 2.9389477349183015e-07, "logits/chosen": -490414496.0, "logits/rejected": -853274048.0, "logps/chosen": -217.90597534179688, "logps/rejected": -235.65103149414062, "loss": 0.3792, "rewards/chosen": 1.547549843788147, "rewards/margins": 3.4790706634521484, "rewards/rejected": -1.9315208196640015, "step": 298 }, { "epoch": 0.503578947368421, "grad_norm": 61.19734777336964, "kl": 0.0, "learning_rate": 2.924460872754547e-07, "logits/chosen": -481068714.6666667, "logits/rejected": -665770188.8, "logps/chosen": -227.10294596354166, "logps/rejected": -185.52493896484376, "loss": 0.3672, "rewards/chosen": 1.1002014478047688, "rewards/margins": 2.9621237119038897, "rewards/rejected": -1.861922264099121, "step": 299 }, { "epoch": 0.5052631578947369, "grad_norm": 104.48557270061485, "kl": 0.0, "learning_rate": 2.909959319513574e-07, "logits/chosen": -709741760.0, "logits/rejected": -1217407146.6666667, "logps/chosen": -191.64903259277344, "logps/rejected": -252.3373819986979, "loss": 0.3704, "rewards/chosen": -0.1919853240251541, "rewards/margins": 0.7327715208133062, "rewards/rejected": -0.9247568448384603, "step": 300 }, { "epoch": 0.5069473684210526, "grad_norm": 43.12147455849188, "kl": 0.0, "learning_rate": 2.8954435771107597e-07, "logits/chosen": -765106368.0, "logits/rejected": -901597440.0, "logps/chosen": -226.85153198242188, "logps/rejected": -243.94345092773438, "loss": 0.361, "rewards/chosen": 1.015770435333252, "rewards/margins": 2.2279579639434814, "rewards/rejected": -1.2121875286102295, "step": 301 }, { "epoch": 0.5086315789473684, "grad_norm": 68.53182250423035, "kl": 0.0, "learning_rate": 2.880914147952584e-07, "logits/chosen": -892777728.0, "logits/rejected": -893605683.2, "logps/chosen": -231.4443359375, "logps/rejected": -212.4324462890625, "loss": 0.4275, "rewards/chosen": 1.4770619074503581, "rewards/margins": 0.6548136393229167, "rewards/rejected": 0.8222482681274415, "step": 302 }, { "epoch": 0.5103157894736842, "grad_norm": 57.399630263093485, "kl": 0.0, "learning_rate": 2.8663715349192383e-07, "logits/chosen": -980971605.3333334, "logits/rejected": -1283480320.0, "logps/chosen": -271.35976155598956, "logps/rejected": -338.93670654296875, "loss": 0.3848, "rewards/chosen": 1.7701203028361003, "rewards/margins": 6.533517519632976, "rewards/rejected": -4.763397216796875, "step": 303 }, { "epoch": 0.512, "grad_norm": 50.30087172775045, "kl": 0.0, "learning_rate": 2.8518162413472263e-07, "logits/chosen": -876101017.6, "logits/rejected": -833215488.0, "logps/chosen": -223.075244140625, "logps/rejected": -205.84307861328125, "loss": 0.4463, "rewards/chosen": 1.0503494262695312, "rewards/margins": 2.2924479166666667, "rewards/rejected": -1.2420984903971355, "step": 304 }, { "epoch": 0.5136842105263157, "grad_norm": 56.83106728944569, "kl": 0.0, "learning_rate": 2.837248771011937e-07, "logits/chosen": -778440089.6, "logits/rejected": -998400341.3333334, "logps/chosen": -258.7751953125, "logps/rejected": -218.8525594075521, "loss": 0.4367, "rewards/chosen": -0.8302111625671387, "rewards/margins": -1.7733329931894937, "rewards/rejected": 0.9431218306223551, "step": 305 }, { "epoch": 0.5153684210526316, "grad_norm": 37.14827382589415, "kl": 0.0, "learning_rate": 2.822669628110213e-07, "logits/chosen": -949675349.3333334, "logits/rejected": -1104420966.4, "logps/chosen": -197.35005696614584, "logps/rejected": -236.111083984375, "loss": 0.3883, "rewards/chosen": 1.4387264251708984, "rewards/margins": 1.295378041267395, "rewards/rejected": 0.1433483839035034, "step": 306 }, { "epoch": 0.5170526315789473, "grad_norm": 42.48400320185914, "kl": 0.0, "learning_rate": 2.808079317242896e-07, "logits/chosen": -733285376.0, "logits/rejected": -290018304.0, "logps/chosen": -230.39531598772322, "logps/rejected": -196.5876922607422, "loss": 0.3848, "rewards/chosen": -0.14083341189793178, "rewards/margins": -1.530245338167463, "rewards/rejected": 1.3894119262695312, "step": 307 }, { "epoch": 0.5187368421052632, "grad_norm": 49.30251176881769, "kl": 0.0, "learning_rate": 2.793478343397367e-07, "logits/chosen": -637947801.6, "logits/rejected": -1100627200.0, "logps/chosen": -236.856103515625, "logps/rejected": -254.74955240885416, "loss": 0.4514, "rewards/chosen": 0.8377789497375489, "rewards/margins": 0.31805851459503176, "rewards/rejected": 0.5197204351425171, "step": 308 }, { "epoch": 0.5204210526315789, "grad_norm": 38.15594290546514, "kl": 0.0, "learning_rate": 2.778867211930061e-07, "logits/chosen": -946033956.5714285, "logits/rejected": -322719744.0, "logps/chosen": -188.54940359933036, "logps/rejected": -337.9103088378906, "loss": 0.3864, "rewards/chosen": 0.2591233934674944, "rewards/margins": 4.263719626835415, "rewards/rejected": -4.00459623336792, "step": 309 }, { "epoch": 0.5221052631578947, "grad_norm": 36.73206711146749, "kl": 0.0, "learning_rate": 2.7642464285489827e-07, "logits/chosen": -1031071232.0, "logits/rejected": -701534515.2, "logps/chosen": -285.6693115234375, "logps/rejected": -270.429541015625, "loss": 0.3706, "rewards/chosen": -0.7057943344116211, "rewards/margins": 0.5104178428649901, "rewards/rejected": -1.2162121772766112, "step": 310 }, { "epoch": 0.5237894736842105, "grad_norm": 35.15174071748067, "kl": 0.0, "learning_rate": 2.749616499296199e-07, "logits/chosen": -999771648.0, "logits/rejected": -866819712.0, "logps/chosen": -261.1120910644531, "logps/rejected": -233.74642944335938, "loss": 0.4233, "rewards/chosen": -0.03837205469608307, "rewards/margins": 2.5518483966588974, "rewards/rejected": -2.5902204513549805, "step": 311 }, { "epoch": 0.5254736842105263, "grad_norm": 40.26805134000718, "kl": 0.0, "learning_rate": 2.734977930530326e-07, "logits/chosen": -521609728.0, "logits/rejected": -1214733482.6666667, "logps/chosen": -222.58474731445312, "logps/rejected": -238.75984700520834, "loss": 0.4109, "rewards/chosen": 0.5281776785850525, "rewards/margins": -0.3983253439267477, "rewards/rejected": 0.9265030225118002, "step": 312 }, { "epoch": 0.5271578947368422, "grad_norm": 41.427977504816596, "kl": 0.0, "learning_rate": 2.7203312289090044e-07, "logits/chosen": -484565952.0, "logits/rejected": -1246305024.0, "logps/chosen": -273.0993347167969, "logps/rejected": -276.9410400390625, "loss": 0.4445, "rewards/chosen": -0.8267297148704529, "rewards/margins": 2.6424904465675354, "rewards/rejected": -3.4692201614379883, "step": 313 }, { "epoch": 0.5288421052631579, "grad_norm": 68.994308446122, "kl": 0.0, "learning_rate": 2.7056769013713623e-07, "logits/chosen": -822423360.0, "logits/rejected": -650067328.0, "logps/chosen": -216.25306701660156, "logps/rejected": -217.991943359375, "loss": 0.3846, "rewards/chosen": 1.5515110492706299, "rewards/margins": 1.647828370332718, "rewards/rejected": -0.09631732106208801, "step": 314 }, { "epoch": 0.5305263157894737, "grad_norm": 57.35653485170902, "kl": 0.0, "learning_rate": 2.6910154551204676e-07, "logits/chosen": -745939541.3333334, "logits/rejected": -736545408.0, "logps/chosen": -232.83365885416666, "logps/rejected": -210.52879333496094, "loss": 0.4926, "rewards/chosen": 0.7454870541890463, "rewards/margins": 0.6709303458531698, "rewards/rejected": 0.07455670833587646, "step": 315 }, { "epoch": 0.5322105263157895, "grad_norm": 81.62832673291715, "kl": 0.0, "learning_rate": 2.6763473976057773e-07, "logits/chosen": -1231821397.3333333, "logits/rejected": -768418918.4, "logps/chosen": -211.95243326822916, "logps/rejected": -236.2597900390625, "loss": 0.351, "rewards/chosen": 0.9875667095184326, "rewards/margins": 3.251889371871948, "rewards/rejected": -2.2643226623535155, "step": 316 }, { "epoch": 0.5338947368421053, "grad_norm": 40.40366631438029, "kl": 0.0, "learning_rate": 2.6616732365055713e-07, "logits/chosen": -878860083.2, "logits/rejected": -1151675733.3333333, "logps/chosen": -263.190625, "logps/rejected": -241.0025634765625, "loss": 0.3784, "rewards/chosen": 1.9023317337036132, "rewards/margins": 2.3599199295043944, "rewards/rejected": -0.45758819580078125, "step": 317 }, { "epoch": 0.535578947368421, "grad_norm": 37.906423958216756, "kl": 0.0, "learning_rate": 2.64699347970938e-07, "logits/chosen": -643882432.0, "logits/rejected": -725854293.3333334, "logps/chosen": -210.35226440429688, "logps/rejected": -264.7013346354167, "loss": 0.39, "rewards/chosen": 0.8370101451873779, "rewards/margins": 0.3739887475967407, "rewards/rejected": 0.4630213975906372, "step": 318 }, { "epoch": 0.5372631578947369, "grad_norm": 41.62282940753045, "kl": 0.0, "learning_rate": 2.632308635300408e-07, "logits/chosen": -1076104704.0, "logits/rejected": -821822592.0, "logps/chosen": -270.3343505859375, "logps/rejected": -172.8352508544922, "loss": 0.3992, "rewards/chosen": -0.8739106059074402, "rewards/margins": -0.6177101731300354, "rewards/rejected": -0.2562004327774048, "step": 319 }, { "epoch": 0.5389473684210526, "grad_norm": 32.33388694240573, "kl": 0.0, "learning_rate": 2.6176192115379494e-07, "logits/chosen": -1163147776.0, "logits/rejected": -671515520.0, "logps/chosen": -232.58616638183594, "logps/rejected": -390.2035319010417, "loss": 0.4368, "rewards/chosen": 3.005178928375244, "rewards/margins": 4.992446422576904, "rewards/rejected": -1.9872674942016602, "step": 320 }, { "epoch": 0.5406315789473685, "grad_norm": 99.7767501397208, "kl": 0.0, "learning_rate": 2.6029257168397944e-07, "logits/chosen": -1067986124.8, "logits/rejected": -520208341.3333333, "logps/chosen": -205.670068359375, "logps/rejected": -209.88446044921875, "loss": 0.365, "rewards/chosen": 0.7348861694335938, "rewards/margins": 0.3829381465911865, "rewards/rejected": 0.3519480228424072, "step": 321 }, { "epoch": 0.5423157894736842, "grad_norm": 43.79004322458768, "kl": 0.0, "learning_rate": 2.5882286597646313e-07, "logits/chosen": -837222016.0, "logits/rejected": -825369429.3333334, "logps/chosen": -224.58160400390625, "logps/rejected": -298.0353597005208, "loss": 0.5333, "rewards/chosen": 1.3384246826171875, "rewards/margins": 2.8695327440897627, "rewards/rejected": -1.531108061472575, "step": 322 }, { "epoch": 0.544, "grad_norm": 45.949254583233454, "kl": 0.0, "learning_rate": 2.5735285489944485e-07, "logits/chosen": -728969216.0, "logits/rejected": -885923225.6, "logps/chosen": -220.39337158203125, "logps/rejected": -217.1091552734375, "loss": 0.4497, "rewards/chosen": 3.17012882232666, "rewards/margins": 2.6761786460876467, "rewards/rejected": 0.49395017623901366, "step": 323 }, { "epoch": 0.5456842105263158, "grad_norm": 51.89181096647278, "kl": 0.0, "learning_rate": 2.5588258933169244e-07, "logits/chosen": -918494272.0, "logits/rejected": -728522112.0, "logps/chosen": -144.35769653320312, "logps/rejected": -252.4864501953125, "loss": 0.4313, "rewards/chosen": 0.6073247194290161, "rewards/margins": 1.5144293308258057, "rewards/rejected": -0.9071046113967896, "step": 324 }, { "epoch": 0.5473684210526316, "grad_norm": 42.4363414069206, "kl": 0.0, "learning_rate": 2.544121201607822e-07, "logits/chosen": -741229977.6, "logits/rejected": -902070272.0, "logps/chosen": -216.93974609375, "logps/rejected": -230.6597900390625, "loss": 0.4731, "rewards/chosen": -0.7157364368438721, "rewards/margins": -0.09154821236928312, "rewards/rejected": -0.624188224474589, "step": 325 }, { "epoch": 0.5490526315789473, "grad_norm": 49.977660612765106, "kl": 0.0, "learning_rate": 2.5294149828133704e-07, "logits/chosen": -785820672.0, "logits/rejected": -410074240.0, "logps/chosen": -192.42119489397322, "logps/rejected": -291.78887939453125, "loss": 0.2975, "rewards/chosen": 1.6415718623570033, "rewards/margins": 2.1497139760426114, "rewards/rejected": -0.5081421136856079, "step": 326 }, { "epoch": 0.5507368421052632, "grad_norm": 40.20254402613094, "kl": 0.0, "learning_rate": 2.5147077459326555e-07, "logits/chosen": -540746086.4, "logits/rejected": -600018858.6666666, "logps/chosen": -271.22294921875, "logps/rejected": -190.82244873046875, "loss": 0.3018, "rewards/chosen": 1.12197265625, "rewards/margins": 4.917408243815104, "rewards/rejected": -3.795435587565104, "step": 327 }, { "epoch": 0.5524210526315789, "grad_norm": 36.746566177202766, "kl": 0.0, "learning_rate": 2.5e-07, "logits/chosen": -960942811.4285715, "logits/rejected": -569875712.0, "logps/chosen": -220.05186244419642, "logps/rejected": -294.9991149902344, "loss": 0.3669, "rewards/chosen": 1.3416931969778878, "rewards/margins": 3.6566162449972968, "rewards/rejected": -2.314923048019409, "step": 328 }, { "epoch": 0.5541052631578948, "grad_norm": 92.10516677186112, "kl": 0.0, "learning_rate": 2.485292254067345e-07, "logits/chosen": -911412736.0, "logits/rejected": -994976153.6, "logps/chosen": -234.53729248046875, "logps/rejected": -238.84951171875, "loss": 0.3849, "rewards/chosen": 0.3486948808034261, "rewards/margins": 1.069074519475301, "rewards/rejected": -0.720379638671875, "step": 329 }, { "epoch": 0.5557894736842105, "grad_norm": 40.68592836090874, "kl": 0.0, "learning_rate": 2.4705850171866294e-07, "logits/chosen": -916755840.0, "logits/rejected": -721056704.0, "logps/chosen": -226.9412841796875, "logps/rejected": -197.04571533203125, "loss": 0.3178, "rewards/chosen": 1.0302788019180298, "rewards/margins": 2.7032233476638794, "rewards/rejected": -1.6729445457458496, "step": 330 }, { "epoch": 0.5574736842105263, "grad_norm": 64.00175361910134, "kl": 0.0, "learning_rate": 2.4558787983921785e-07, "logits/chosen": -1136037478.4, "logits/rejected": -776420096.0, "logps/chosen": -184.01240234375, "logps/rejected": -270.2752278645833, "loss": 0.354, "rewards/chosen": 1.9217365264892579, "rewards/margins": 2.225330440203349, "rewards/rejected": -0.303593913714091, "step": 331 }, { "epoch": 0.5591578947368421, "grad_norm": 53.20843592872294, "kl": 0.0, "learning_rate": 2.441174106683076e-07, "logits/chosen": -987133542.4, "logits/rejected": -757408768.0, "logps/chosen": -270.960498046875, "logps/rejected": -269.0520426432292, "loss": 0.4314, "rewards/chosen": 0.6490658760070801, "rewards/margins": 1.0015769640604655, "rewards/rejected": -0.35251108805338544, "step": 332 }, { "epoch": 0.5608421052631579, "grad_norm": 39.62692923007265, "kl": 0.0, "learning_rate": 2.4264714510055513e-07, "logits/chosen": -779204096.0, "logits/rejected": -775985749.3333334, "logps/chosen": -172.64938354492188, "logps/rejected": -189.1490275065104, "loss": 0.379, "rewards/chosen": 0.8577301502227783, "rewards/margins": 0.2588157653808594, "rewards/rejected": 0.598914384841919, "step": 333 }, { "epoch": 0.5625263157894737, "grad_norm": 189.00573139104904, "kl": 0.0, "learning_rate": 2.4117713402353685e-07, "logits/chosen": -717026304.0, "logits/rejected": -589375616.0, "logps/chosen": -255.56656901041666, "logps/rejected": -222.212158203125, "loss": 0.4805, "rewards/chosen": -0.3913642962773641, "rewards/margins": -0.5911636402209599, "rewards/rejected": 0.1997993439435959, "step": 334 }, { "epoch": 0.5642105263157895, "grad_norm": 45.7662847906049, "kl": 0.0, "learning_rate": 2.397074283160206e-07, "logits/chosen": -753268096.0, "logits/rejected": -917319168.0, "logps/chosen": -198.811279296875, "logps/rejected": -218.18515014648438, "loss": 0.3514, "rewards/chosen": 1.425872802734375, "rewards/margins": 2.8109121322631836, "rewards/rejected": -1.3850393295288086, "step": 335 }, { "epoch": 0.5658947368421052, "grad_norm": 39.80557628734281, "kl": 0.0, "learning_rate": 2.38238078846205e-07, "logits/chosen": -806648490.6666666, "logits/rejected": -1043137280.0, "logps/chosen": -197.84334309895834, "logps/rejected": -166.3507080078125, "loss": 0.4203, "rewards/chosen": 2.0990654627482095, "rewards/margins": 0.3083709875742593, "rewards/rejected": 1.7906944751739502, "step": 336 }, { "epoch": 0.5675789473684211, "grad_norm": 55.26598793514629, "kl": 0.0, "learning_rate": 2.367691364699592e-07, "logits/chosen": -990476117.3333334, "logits/rejected": -843749478.4, "logps/chosen": -211.03731282552084, "logps/rejected": -200.39736328125, "loss": 0.4159, "rewards/chosen": 1.5658411979675293, "rewards/margins": 2.0694480895996095, "rewards/rejected": -0.5036068916320801, "step": 337 }, { "epoch": 0.5692631578947368, "grad_norm": 43.55127827223215, "kl": 0.0, "learning_rate": 2.3530065202906208e-07, "logits/chosen": -677904725.3333334, "logits/rejected": -739208768.0, "logps/chosen": -215.2685546875, "logps/rejected": -168.70806884765625, "loss": 0.365, "rewards/chosen": -0.36894532044728595, "rewards/margins": 3.7609441677729287, "rewards/rejected": -4.129889488220215, "step": 338 }, { "epoch": 0.5709473684210526, "grad_norm": 44.273467218054, "kl": 0.0, "learning_rate": 2.3383267634944288e-07, "logits/chosen": -708441152.0, "logits/rejected": -909734720.0, "logps/chosen": -236.1248016357422, "logps/rejected": -329.7475280761719, "loss": 0.3328, "rewards/chosen": 0.44537657499313354, "rewards/margins": 2.3071479201316833, "rewards/rejected": -1.8617713451385498, "step": 339 }, { "epoch": 0.5726315789473684, "grad_norm": 44.104623030836734, "kl": 0.0, "learning_rate": 2.3236526023942222e-07, "logits/chosen": -1229630873.6, "logits/rejected": -615721045.3333334, "logps/chosen": -201.258935546875, "logps/rejected": -160.00018310546875, "loss": 0.3583, "rewards/chosen": 0.4522857666015625, "rewards/margins": 3.613059679667155, "rewards/rejected": -3.1607739130655923, "step": 340 }, { "epoch": 0.5743157894736842, "grad_norm": 37.46747269379094, "kl": 0.0, "learning_rate": 2.3089845448795325e-07, "logits/chosen": -972004778.6666666, "logits/rejected": -684686336.0, "logps/chosen": -186.35986328125, "logps/rejected": -252.18388671875, "loss": 0.3367, "rewards/chosen": 2.4348222414652505, "rewards/margins": 3.513483206431071, "rewards/rejected": -1.0786609649658203, "step": 341 }, { "epoch": 0.576, "grad_norm": 41.066699309390806, "kl": 0.0, "learning_rate": 2.2943230986286386e-07, "logits/chosen": -776381866.6666666, "logits/rejected": -411293952.0, "logps/chosen": -237.33272298177084, "logps/rejected": -230.616357421875, "loss": 0.5505, "rewards/chosen": -1.2874414920806885, "rewards/margins": -2.0171528339385985, "rewards/rejected": 0.7297113418579102, "step": 342 }, { "epoch": 0.5776842105263158, "grad_norm": 34.78601277894388, "kl": 0.0, "learning_rate": 2.2796687710909961e-07, "logits/chosen": -894285824.0, "logits/rejected": -1057857365.3333334, "logps/chosen": -193.6359619140625, "logps/rejected": -198.5640665690104, "loss": 0.4058, "rewards/chosen": 0.9409942626953125, "rewards/margins": -0.3375056584676106, "rewards/rejected": 1.278499921162923, "step": 343 }, { "epoch": 0.5793684210526315, "grad_norm": 43.656984836332974, "kl": 0.0, "learning_rate": 2.2650220694696746e-07, "logits/chosen": -936853913.6, "logits/rejected": -637026218.6666666, "logps/chosen": -258.152392578125, "logps/rejected": -172.32596842447916, "loss": 0.3634, "rewards/chosen": 0.7583493709564209, "rewards/margins": 0.6318836331367492, "rewards/rejected": 0.12646573781967163, "step": 344 }, { "epoch": 0.5810526315789474, "grad_norm": 43.09840300795585, "kl": 0.0, "learning_rate": 2.2503835007038018e-07, "logits/chosen": -1292558336.0, "logits/rejected": -511415200.0, "logps/chosen": -219.50491333007812, "logps/rejected": -236.9748077392578, "loss": 0.3775, "rewards/chosen": 1.3266575336456299, "rewards/margins": 2.1448956727981567, "rewards/rejected": -0.8182381391525269, "step": 345 }, { "epoch": 0.5827368421052631, "grad_norm": 38.706929628825115, "kl": 0.0, "learning_rate": 2.2357535714510179e-07, "logits/chosen": -1250603776.0, "logits/rejected": -919112908.8, "logps/chosen": -281.291748046875, "logps/rejected": -183.513916015625, "loss": 0.3373, "rewards/chosen": -0.5119430224100748, "rewards/margins": 1.2942089398701988, "rewards/rejected": -1.8061519622802735, "step": 346 }, { "epoch": 0.584421052631579, "grad_norm": 54.06441877859246, "kl": 0.0, "learning_rate": 2.2211327880699389e-07, "logits/chosen": -983545036.8, "logits/rejected": -1222936832.0, "logps/chosen": -196.76480712890626, "logps/rejected": -226.22184244791666, "loss": 0.4641, "rewards/chosen": 0.9738927841186523, "rewards/margins": -0.2919163544972737, "rewards/rejected": 1.265809138615926, "step": 347 }, { "epoch": 0.5861052631578947, "grad_norm": 31.758781014479883, "kl": 0.0, "learning_rate": 2.206521656602633e-07, "logits/chosen": -929091264.0, "logits/rejected": -505945888.0, "logps/chosen": -176.41741943359375, "logps/rejected": -188.78915405273438, "loss": 0.3671, "rewards/chosen": 1.3973584175109863, "rewards/margins": 1.813145935535431, "rewards/rejected": -0.4157875180244446, "step": 348 }, { "epoch": 0.5877894736842105, "grad_norm": 37.86314064294317, "kl": 0.0, "learning_rate": 2.1919206827571034e-07, "logits/chosen": -801264640.0, "logits/rejected": -464600064.0, "logps/chosen": -198.34344482421875, "logps/rejected": -244.7729034423828, "loss": 0.4117, "rewards/chosen": 1.5484859943389893, "rewards/margins": 3.387136936187744, "rewards/rejected": -1.8386509418487549, "step": 349 }, { "epoch": 0.5894736842105263, "grad_norm": 50.13743066285001, "kl": 0.0, "learning_rate": 2.177330371889787e-07, "logits/chosen": -899666124.8, "logits/rejected": -916757248.0, "logps/chosen": -219.9362060546875, "logps/rejected": -315.0555419921875, "loss": 0.4165, "rewards/chosen": 0.3476431369781494, "rewards/margins": 0.9587682882944742, "rewards/rejected": -0.6111251513163248, "step": 350 }, { "epoch": 0.5911578947368421, "grad_norm": 51.02552672848151, "kl": 0.0, "learning_rate": 2.162751228988063e-07, "logits/chosen": -853029760.0, "logits/rejected": -762731136.0, "logps/chosen": -260.21746826171875, "logps/rejected": -254.34788513183594, "loss": 0.3839, "rewards/chosen": 0.05103302001953125, "rewards/margins": 1.4993259906768799, "rewards/rejected": -1.4482929706573486, "step": 351 }, { "epoch": 0.592842105263158, "grad_norm": 34.82384239421157, "kl": 0.0, "learning_rate": 2.148183758652774e-07, "logits/chosen": -740140032.0, "logits/rejected": -741718784.0, "logps/chosen": -147.6769561767578, "logps/rejected": -276.81591796875, "loss": 0.4139, "rewards/chosen": 2.366483211517334, "rewards/margins": 3.0736358165740967, "rewards/rejected": -0.7071526050567627, "step": 352 }, { "epoch": 0.5945263157894737, "grad_norm": 34.883343445970695, "kl": 0.0, "learning_rate": 2.1336284650807612e-07, "logits/chosen": -1254429781.3333333, "logits/rejected": -611748761.6, "logps/chosen": -215.1737060546875, "logps/rejected": -214.90478515625, "loss": 0.3704, "rewards/chosen": 2.0823793411254883, "rewards/margins": 2.4012239456176756, "rewards/rejected": -0.3188446044921875, "step": 353 }, { "epoch": 0.5962105263157895, "grad_norm": 55.9131613356441, "kl": 0.0, "learning_rate": 2.1190858520474163e-07, "logits/chosen": -1588147200.0, "logits/rejected": -903177318.4, "logps/chosen": -219.27388509114584, "logps/rejected": -283.88779296875, "loss": 0.3452, "rewards/chosen": 2.384967645009359, "rewards/margins": 2.509509845574697, "rewards/rejected": -0.12454220056533813, "step": 354 }, { "epoch": 0.5978947368421053, "grad_norm": 43.395183494537164, "kl": 0.0, "learning_rate": 2.10455642288924e-07, "logits/chosen": -1037725632.0, "logits/rejected": -652462464.0, "logps/chosen": -192.9014892578125, "logps/rejected": -218.373779296875, "loss": 0.338, "rewards/chosen": 1.9063019752502441, "rewards/margins": 2.8689812421798706, "rewards/rejected": -0.9626792669296265, "step": 355 }, { "epoch": 0.5995789473684211, "grad_norm": 72.0621578330362, "kl": 0.0, "learning_rate": 2.090040680486426e-07, "logits/chosen": -1054155008.0, "logits/rejected": -724854912.0, "logps/chosen": -225.2272491455078, "logps/rejected": -268.515869140625, "loss": 0.4356, "rewards/chosen": -0.31277698278427124, "rewards/margins": 1.6200557351112366, "rewards/rejected": -1.9328327178955078, "step": 356 }, { "epoch": 0.6012631578947368, "grad_norm": 46.80458240411995, "kl": 0.0, "learning_rate": 2.0755391272454537e-07, "logits/chosen": -1309161600.0, "logits/rejected": -773884096.0, "logps/chosen": -221.47581481933594, "logps/rejected": -243.53134155273438, "loss": 0.3444, "rewards/chosen": 0.6755388379096985, "rewards/margins": 0.5105028748512268, "rewards/rejected": 0.16503596305847168, "step": 357 }, { "epoch": 0.6029473684210527, "grad_norm": 44.25731693592669, "kl": 0.0, "learning_rate": 2.0610522650816982e-07, "logits/chosen": -781600384.0, "logits/rejected": -991481728.0, "logps/chosen": -269.00433349609375, "logps/rejected": -233.53921508789062, "loss": 0.3766, "rewards/chosen": -0.19508057832717896, "rewards/margins": 1.9026996493339539, "rewards/rejected": -2.097780227661133, "step": 358 }, { "epoch": 0.6046315789473684, "grad_norm": 51.072468854632184, "kl": 0.0, "learning_rate": 2.04658059540206e-07, "logits/chosen": -955063210.6666666, "logits/rejected": -842874060.8, "logps/chosen": -206.79142252604166, "logps/rejected": -263.4759521484375, "loss": 0.3669, "rewards/chosen": 1.126490831375122, "rewards/margins": 1.8041922569274902, "rewards/rejected": -0.6777014255523681, "step": 359 }, { "epoch": 0.6063157894736843, "grad_norm": 43.797014090675, "kl": 0.0, "learning_rate": 2.0321246190876112e-07, "logits/chosen": -813810773.3333334, "logits/rejected": -735887360.0, "logps/chosen": -223.61555989583334, "logps/rejected": -310.8141357421875, "loss": 0.2421, "rewards/chosen": 2.7599121729532876, "rewards/margins": 4.69793898264567, "rewards/rejected": -1.9380268096923827, "step": 360 }, { "epoch": 0.608, "grad_norm": 44.19945884028008, "kl": 0.0, "learning_rate": 2.0176848364762576e-07, "logits/chosen": -1044594995.2, "logits/rejected": -838090410.6666666, "logps/chosen": -241.623193359375, "logps/rejected": -180.17657470703125, "loss": 0.4451, "rewards/chosen": 0.6463582992553711, "rewards/margins": 3.4257356007893884, "rewards/rejected": -2.779377301534017, "step": 361 }, { "epoch": 0.6096842105263158, "grad_norm": 84.65829093491772, "kl": 0.0, "learning_rate": 2.0032617473454225e-07, "logits/chosen": -993763620.5714285, "logits/rejected": -1168001152.0, "logps/chosen": -222.610107421875, "logps/rejected": -359.9516296386719, "loss": 0.373, "rewards/chosen": 0.029357058661324636, "rewards/margins": 5.195747954504831, "rewards/rejected": -5.166390895843506, "step": 362 }, { "epoch": 0.6113684210526316, "grad_norm": 37.454198883632955, "kl": 0.0, "learning_rate": 1.9888558508947492e-07, "logits/chosen": -656499840.0, "logits/rejected": -800222016.0, "logps/chosen": -248.09259033203125, "logps/rejected": -252.30076599121094, "loss": 0.3177, "rewards/chosen": 1.0152060985565186, "rewards/margins": 3.4270780086517334, "rewards/rejected": -2.411871910095215, "step": 363 }, { "epoch": 0.6130526315789474, "grad_norm": 44.05381109880352, "kl": 0.0, "learning_rate": 1.974467645728822e-07, "logits/chosen": -734617600.0, "logits/rejected": -1037793280.0, "logps/chosen": -210.473876953125, "logps/rejected": -227.2939453125, "loss": 0.3765, "rewards/chosen": 0.8343788385391235, "rewards/margins": 2.4099492629369097, "rewards/rejected": -1.5755704243977864, "step": 364 }, { "epoch": 0.6147368421052631, "grad_norm": 48.57136273490337, "kl": 0.0, "learning_rate": 1.9600976298399108e-07, "logits/chosen": -942329036.8, "logits/rejected": -660537429.3333334, "logps/chosen": -285.062890625, "logps/rejected": -235.09102376302084, "loss": 0.3531, "rewards/chosen": -0.9990038871765137, "rewards/margins": 0.40311670303344727, "rewards/rejected": -1.402120590209961, "step": 365 }, { "epoch": 0.616421052631579, "grad_norm": 37.488732058461395, "kl": 0.0, "learning_rate": 1.945746300590731e-07, "logits/chosen": -483911520.0, "logits/rejected": -1166225408.0, "logps/chosen": -247.5726776123047, "logps/rejected": -214.5377197265625, "loss": 0.3618, "rewards/chosen": 0.26900553703308105, "rewards/margins": 0.6737111012140911, "rewards/rejected": -0.40470556418100995, "step": 366 }, { "epoch": 0.6181052631578947, "grad_norm": 44.38980472905698, "kl": 0.0, "learning_rate": 1.9314141546972344e-07, "logits/chosen": -783111424.0, "logits/rejected": -941412928.0, "logps/chosen": -177.5583953857422, "logps/rejected": -260.1361083984375, "loss": 0.3547, "rewards/chosen": -0.2703380584716797, "rewards/margins": -0.19543498754501343, "rewards/rejected": -0.07490307092666626, "step": 367 }, { "epoch": 0.6197894736842106, "grad_norm": 37.93187426865055, "kl": 0.0, "learning_rate": 1.9171016882114152e-07, "logits/chosen": -849338432.0, "logits/rejected": -782459200.0, "logps/chosen": -195.59857177734375, "logps/rejected": -232.708984375, "loss": 0.38, "rewards/chosen": 1.189550757408142, "rewards/margins": 3.122536540031433, "rewards/rejected": -1.932985782623291, "step": 368 }, { "epoch": 0.6214736842105263, "grad_norm": 38.196373434045015, "kl": 0.0, "learning_rate": 1.9028093965041391e-07, "logits/chosen": -1023574976.0, "logits/rejected": -983692288.0, "logps/chosen": -205.63710021972656, "logps/rejected": -268.6028747558594, "loss": 0.3751, "rewards/chosen": 0.3406059741973877, "rewards/margins": 1.6527996063232422, "rewards/rejected": -1.3121936321258545, "step": 369 }, { "epoch": 0.6231578947368421, "grad_norm": 55.12173721267581, "kl": 0.0, "learning_rate": 1.888537774248e-07, "logits/chosen": -741024597.3333334, "logits/rejected": -600171980.8, "logps/chosen": -179.9103800455729, "logps/rejected": -237.39990234375, "loss": 0.418, "rewards/chosen": 1.0137186845143635, "rewards/margins": 2.055362590154012, "rewards/rejected": -1.0416439056396485, "step": 370 }, { "epoch": 0.6248421052631579, "grad_norm": 34.16987497139576, "kl": 0.0, "learning_rate": 1.8742873154002004e-07, "logits/chosen": -837830826.6666666, "logits/rejected": -1010286720.0, "logps/chosen": -276.2460530598958, "logps/rejected": -183.48431396484375, "loss": 0.4701, "rewards/chosen": 0.2395154039065043, "rewards/margins": -0.8625567158063253, "rewards/rejected": 1.1020721197128296, "step": 371 }, { "epoch": 0.6265263157894737, "grad_norm": 46.66359237058551, "kl": 0.0, "learning_rate": 1.86005851318545e-07, "logits/chosen": -669656012.8, "logits/rejected": -1303058517.3333333, "logps/chosen": -244.3946044921875, "logps/rejected": -313.6065266927083, "loss": 0.3828, "rewards/chosen": -0.9342806816101075, "rewards/margins": 2.6151256879170734, "rewards/rejected": -3.549406369527181, "step": 372 }, { "epoch": 0.6282105263157894, "grad_norm": 35.82420878860377, "kl": 0.0, "learning_rate": 1.8458518600788987e-07, "logits/chosen": -1138871722.6666667, "logits/rejected": -761401753.6, "logps/chosen": -230.2353312174479, "logps/rejected": -246.2571044921875, "loss": 0.3776, "rewards/chosen": 0.059048473834991455, "rewards/margins": 2.80713711977005, "rewards/rejected": -2.7480886459350584, "step": 373 }, { "epoch": 0.6298947368421053, "grad_norm": 47.50923482875504, "kl": 0.0, "learning_rate": 1.8316678477890913e-07, "logits/chosen": -1124751530.6666667, "logits/rejected": -855729868.8, "logps/chosen": -216.44978841145834, "logps/rejected": -233.7421875, "loss": 0.422, "rewards/chosen": 0.27029621601104736, "rewards/margins": 3.8977864503860475, "rewards/rejected": -3.627490234375, "step": 374 }, { "epoch": 0.631578947368421, "grad_norm": 59.37971841261376, "kl": 0.0, "learning_rate": 1.8175069672409475e-07, "logits/chosen": -564849280.0, "logits/rejected": -890142080.0, "logps/chosen": -229.4367218017578, "logps/rejected": -251.96417236328125, "loss": 0.4461, "rewards/chosen": -2.096214532852173, "rewards/margins": -0.3813058137893677, "rewards/rejected": -1.7149087190628052, "step": 375 }, { "epoch": 0.6332631578947369, "grad_norm": 42.11784105230497, "kl": 0.0, "learning_rate": 1.8033697085587696e-07, "logits/chosen": -910276096.0, "logits/rejected": -714325845.3333334, "logps/chosen": -231.393359375, "logps/rejected": -271.1864420572917, "loss": 0.4749, "rewards/chosen": -0.2165008544921875, "rewards/margins": 1.6030135790506999, "rewards/rejected": -1.8195144335428874, "step": 376 }, { "epoch": 0.6349473684210526, "grad_norm": 38.23749676627054, "kl": 0.0, "learning_rate": 1.7892565610492826e-07, "logits/chosen": -592622421.3333334, "logits/rejected": -968836416.0, "logps/chosen": -251.1493937174479, "logps/rejected": -316.63690185546875, "loss": 0.4272, "rewards/chosen": 0.3538096348444621, "rewards/margins": 1.5449076096216838, "rewards/rejected": -1.1910979747772217, "step": 377 }, { "epoch": 0.6366315789473684, "grad_norm": 33.65033547018444, "kl": 0.0, "learning_rate": 1.775168013184694e-07, "logits/chosen": -1084484352.0, "logits/rejected": -1348099840.0, "logps/chosen": -294.9368896484375, "logps/rejected": -199.22079467773438, "loss": 0.3069, "rewards/chosen": -0.5554192066192627, "rewards/margins": 2.1953020095825195, "rewards/rejected": -2.7507212162017822, "step": 378 }, { "epoch": 0.6383157894736842, "grad_norm": 57.13484688248307, "kl": 0.0, "learning_rate": 1.7611045525857898e-07, "logits/chosen": -878904466.2857143, "logits/rejected": -939131328.0, "logps/chosen": -230.76688058035714, "logps/rejected": -233.318603515625, "loss": 0.3319, "rewards/chosen": 0.6585009438650948, "rewards/margins": 0.9025790137904031, "rewards/rejected": -0.24407806992530823, "step": 379 }, { "epoch": 0.64, "grad_norm": 75.99248678640355, "kl": 0.0, "learning_rate": 1.7470666660050584e-07, "logits/chosen": -838683050.6666666, "logits/rejected": -667947072.0, "logps/chosen": -205.16400146484375, "logps/rejected": -321.7478332519531, "loss": 0.3082, "rewards/chosen": 1.483425776163737, "rewards/margins": 1.0787352124849956, "rewards/rejected": 0.40469056367874146, "step": 380 }, { "epoch": 0.6416842105263157, "grad_norm": 41.23482970655455, "kl": 0.0, "learning_rate": 1.7330548393098403e-07, "logits/chosen": -790441728.0, "logits/rejected": -1023563200.0, "logps/chosen": -277.3321533203125, "logps/rejected": -204.98135375976562, "loss": 0.3465, "rewards/chosen": 0.8123489618301392, "rewards/margins": 4.149082779884338, "rewards/rejected": -3.336733818054199, "step": 381 }, { "epoch": 0.6433684210526316, "grad_norm": 61.65012784187018, "kl": 0.0, "learning_rate": 1.7190695574655144e-07, "logits/chosen": -850949056.0, "logits/rejected": -890390656.0, "logps/chosen": -191.81503295898438, "logps/rejected": -262.1980285644531, "loss": 0.4055, "rewards/chosen": 2.1015682220458984, "rewards/margins": 2.900312066078186, "rewards/rejected": -0.7987438440322876, "step": 382 }, { "epoch": 0.6450526315789473, "grad_norm": 40.46551574043127, "kl": 0.0, "learning_rate": 1.705111304518712e-07, "logits/chosen": -800098986.6666666, "logits/rejected": -1169969920.0, "logps/chosen": -194.7923787434896, "logps/rejected": -145.28848266601562, "loss": 0.2469, "rewards/chosen": 2.726773262023926, "rewards/margins": 4.934856414794922, "rewards/rejected": -2.208083152770996, "step": 383 }, { "epoch": 0.6467368421052632, "grad_norm": 43.106097272592606, "kl": 0.0, "learning_rate": 1.6911805635805632e-07, "logits/chosen": -827328576.0, "logits/rejected": -977397696.0, "logps/chosen": -231.96633911132812, "logps/rejected": -252.32371520996094, "loss": 0.3401, "rewards/chosen": 0.41304701566696167, "rewards/margins": 1.2065090537071228, "rewards/rejected": -0.7934620380401611, "step": 384 }, { "epoch": 0.6484210526315789, "grad_norm": 42.75933123532293, "kl": 0.0, "learning_rate": 1.677277816809975e-07, "logits/chosen": -1216948864.0, "logits/rejected": -623093888.0, "logps/chosen": -240.3336181640625, "logps/rejected": -257.3775939941406, "loss": 0.3767, "rewards/chosen": 1.1069988012313843, "rewards/margins": 4.4674311876297, "rewards/rejected": -3.3604323863983154, "step": 385 }, { "epoch": 0.6501052631578947, "grad_norm": 41.09667706031157, "kl": 0.0, "learning_rate": 1.6634035453969457e-07, "logits/chosen": -689569740.8, "logits/rejected": -782888533.3333334, "logps/chosen": -218.35732421875, "logps/rejected": -258.15380859375, "loss": 0.3759, "rewards/chosen": 1.433096218109131, "rewards/margins": 1.4153085629145306, "rewards/rejected": 0.017787655194600422, "step": 386 }, { "epoch": 0.6517894736842105, "grad_norm": 54.55329906409335, "kl": 0.0, "learning_rate": 1.6495582295459078e-07, "logits/chosen": -605408320.0, "logits/rejected": -774927232.0, "logps/chosen": -204.93194580078125, "logps/rejected": -248.58714294433594, "loss": 0.3717, "rewards/chosen": 0.8562226891517639, "rewards/margins": 2.2260130047798157, "rewards/rejected": -1.3697903156280518, "step": 387 }, { "epoch": 0.6534736842105263, "grad_norm": 29.797849697928793, "kl": 0.0, "learning_rate": 1.6357423484591087e-07, "logits/chosen": -766515712.0, "logits/rejected": -736909653.3333334, "logps/chosen": -210.6973388671875, "logps/rejected": -224.88907877604166, "loss": 0.312, "rewards/chosen": 0.03613958358764648, "rewards/margins": 0.5231950441996257, "rewards/rejected": -0.4870554606119792, "step": 388 }, { "epoch": 0.655157894736842, "grad_norm": 64.33167682170762, "kl": 0.0, "learning_rate": 1.621956380320027e-07, "logits/rejected": -840579712.0, "logps/rejected": -260.83721923828125, "loss": 0.4427, "rewards/rejected": -0.597730278968811, "step": 389 }, { "epoch": 0.6568421052631579, "grad_norm": 42.22779651027316, "kl": 0.0, "learning_rate": 1.608200802276815e-07, "logits/chosen": -596097536.0, "logits/rejected": -820432298.6666666, "logps/chosen": -243.89404296875, "logps/rejected": -213.07832845052084, "loss": 0.3828, "rewards/chosen": 1.3395811080932618, "rewards/margins": 0.6359276135762534, "rewards/rejected": 0.7036534945170084, "step": 390 }, { "epoch": 0.6585263157894737, "grad_norm": 38.06012726917251, "kl": 0.0, "learning_rate": 1.5944760904257942e-07, "logits/chosen": -941375040.0, "logits/rejected": -882885802.6666666, "logps/chosen": -157.46560668945312, "logps/rejected": -215.0748291015625, "loss": 0.3784, "rewards/chosen": 0.9262268543243408, "rewards/margins": 0.7893735567728678, "rewards/rejected": 0.136853297551473, "step": 391 }, { "epoch": 0.6602105263157895, "grad_norm": 40.676015795529814, "kl": 0.0, "learning_rate": 1.5807827197949685e-07, "logits/chosen": -708315968.0, "logits/rejected": -1063172181.3333334, "logps/chosen": -245.2415771484375, "logps/rejected": -261.03733317057294, "loss": 0.4446, "rewards/chosen": 1.1207358837127686, "rewards/margins": 0.9106475313504537, "rewards/rejected": 0.21008835236231485, "step": 392 }, { "epoch": 0.6618947368421053, "grad_norm": 120.64719239721539, "kl": 0.0, "learning_rate": 1.5671211643275875e-07, "logits/chosen": -633163648.0, "logits/rejected": -805495872.0, "logps/chosen": -236.74386596679688, "logps/rejected": -252.68606567382812, "loss": 0.4374, "rewards/chosen": 0.5052295923233032, "rewards/margins": -0.2789260149002075, "rewards/rejected": 0.7841556072235107, "step": 393 }, { "epoch": 0.663578947368421, "grad_norm": 34.633986243971194, "kl": 0.0, "learning_rate": 1.553491896865742e-07, "logits/chosen": -1026384000.0, "logits/rejected": -597304448.0, "logps/chosen": -198.23861694335938, "logps/rejected": -220.9433135986328, "loss": 0.3553, "rewards/chosen": 2.9143166542053223, "rewards/margins": 1.0535552501678467, "rewards/rejected": 1.8607614040374756, "step": 394 }, { "epoch": 0.6652631578947369, "grad_norm": 45.21200734298712, "kl": 0.0, "learning_rate": 1.5398953891339968e-07, "logits/chosen": -1087951360.0, "logits/rejected": -763728640.0, "logps/chosen": -215.31341552734375, "logps/rejected": -249.522705078125, "loss": 0.3877, "rewards/chosen": 0.6071091294288635, "rewards/margins": 0.3096790115038554, "rewards/rejected": 0.2974301179250081, "step": 395 }, { "epoch": 0.6669473684210526, "grad_norm": 53.142787819676414, "kl": 0.0, "learning_rate": 1.5263321117230655e-07, "logits/chosen": -674901376.0, "logits/rejected": -1003873728.0, "logps/chosen": -160.89675903320312, "logps/rejected": -298.0811767578125, "loss": 0.2986, "rewards/chosen": 0.6914758682250977, "rewards/margins": 3.5794568061828613, "rewards/rejected": -2.8879809379577637, "step": 396 }, { "epoch": 0.6686315789473685, "grad_norm": 48.05043349132663, "kl": 2.46405029296875, "learning_rate": 1.512802534073522e-07, "logits/chosen": -578873856.0, "logits/rejected": -697750656.0, "logps/chosen": -232.189501953125, "logps/rejected": -211.70320638020834, "loss": 0.367, "rewards/chosen": 1.3567285537719727, "rewards/margins": 1.517380674680074, "rewards/rejected": -0.1606521209081014, "step": 397 }, { "epoch": 0.6703157894736842, "grad_norm": 62.66713109876527, "kl": 0.0, "learning_rate": 1.4993071244595534e-07, "logits/chosen": -1050684544.0, "logits/rejected": -831061440.0, "logps/chosen": -178.69546508789062, "logps/rejected": -267.62298583984375, "loss": 0.4321, "rewards/chosen": -0.0750776082277298, "rewards/margins": 1.4986597746610641, "rewards/rejected": -1.573737382888794, "step": 398 }, { "epoch": 0.672, "grad_norm": 31.500731382305798, "kl": 0.0, "learning_rate": 1.4858463499727507e-07, "logits/chosen": -985857280.0, "logits/rejected": -838302720.0, "logps/chosen": -201.73216247558594, "logps/rejected": -263.0357666015625, "loss": 0.3035, "rewards/chosen": 1.0928525924682617, "rewards/margins": 3.4981284141540527, "rewards/rejected": -2.405275821685791, "step": 399 }, { "epoch": 0.6736842105263158, "grad_norm": 56.975473997069834, "kl": 0.0, "learning_rate": 1.4724206765059454e-07, "logits/chosen": -706666624.0, "logits/rejected": -940830976.0, "logps/chosen": -226.00015258789062, "logps/rejected": -203.8423868815104, "loss": 0.4627, "rewards/chosen": 0.5999778509140015, "rewards/margins": -0.3421023289362589, "rewards/rejected": 0.9420801798502604, "step": 400 }, { "epoch": 0.6753684210526316, "grad_norm": 38.51367993065121, "kl": 0.0, "learning_rate": 1.459030568737081e-07, "logits/chosen": -688410496.0, "logits/rejected": -619393088.0, "logps/chosen": -228.07568359375, "logps/rejected": -262.47265625, "loss": 0.3821, "rewards/chosen": -0.5633602738380432, "rewards/margins": -0.32700619101524353, "rewards/rejected": -0.23635408282279968, "step": 401 }, { "epoch": 0.6770526315789474, "grad_norm": 48.63664615883306, "kl": 0.0, "learning_rate": 1.4456764901131308e-07, "logits/chosen": -785745664.0, "logits/rejected": -527159765.3333333, "logps/chosen": -361.95355224609375, "logps/rejected": -176.71563720703125, "loss": 0.4017, "rewards/chosen": 1.7527008056640625, "rewards/margins": 5.218010902404785, "rewards/rejected": -3.4653100967407227, "step": 402 }, { "epoch": 0.6787368421052632, "grad_norm": 42.63233832130583, "kl": 0.0, "learning_rate": 1.4323589028340597e-07, "logits/chosen": -542379605.3333334, "logits/rejected": -1274020224.0, "logps/chosen": -244.51863606770834, "logps/rejected": -148.19590759277344, "loss": 0.3506, "rewards/chosen": 1.1265826225280762, "rewards/margins": 0.32805508375167847, "rewards/rejected": 0.7985275387763977, "step": 403 }, { "epoch": 0.6804210526315789, "grad_norm": 61.18182583064187, "kl": 0.0, "learning_rate": 1.4190782678368257e-07, "logits/chosen": -793320960.0, "logits/rejected": -901524684.8, "logps/chosen": -240.1772664388021, "logps/rejected": -349.7687255859375, "loss": 0.3981, "rewards/chosen": 0.828582763671875, "rewards/margins": 1.3413378715515136, "rewards/rejected": -0.5127551078796386, "step": 404 }, { "epoch": 0.6821052631578948, "grad_norm": 44.427082645231216, "kl": 0.0, "learning_rate": 1.4058350447794235e-07, "logits/chosen": -734803200.0, "logits/rejected": -947461184.0, "logps/chosen": -209.5147705078125, "logps/rejected": -400.060302734375, "loss": 0.4264, "rewards/chosen": 0.879941463470459, "rewards/margins": 15.826271533966064, "rewards/rejected": -14.946330070495605, "step": 405 }, { "epoch": 0.6837894736842105, "grad_norm": 33.804563029238984, "kl": 0.0, "learning_rate": 1.3926296920249793e-07, "logits/chosen": -536416288.0, "logits/rejected": -671906624.0, "logps/chosen": -211.46499633789062, "logps/rejected": -246.7613067626953, "loss": 0.3206, "rewards/chosen": 1.6434662342071533, "rewards/margins": 5.484874725341797, "rewards/rejected": -3.8414084911346436, "step": 406 }, { "epoch": 0.6854736842105263, "grad_norm": 41.68975540862706, "kl": 0.0, "learning_rate": 1.3794626666258866e-07, "logits/chosen": -946841600.0, "logits/rejected": -811282005.3333334, "logps/chosen": -278.705859375, "logps/rejected": -203.1152547200521, "loss": 0.4434, "rewards/chosen": -1.5871347427368163, "rewards/margins": -2.073893650372823, "rewards/rejected": 0.48675890763600665, "step": 407 }, { "epoch": 0.6871578947368421, "grad_norm": 37.54233790104561, "kl": 0.0, "learning_rate": 1.3663344243079806e-07, "logits/chosen": -1188021120.0, "logits/rejected": -819954395.4285715, "logps/chosen": -194.07516479492188, "logps/rejected": -175.0321044921875, "loss": 0.3924, "rewards/chosen": 2.3552887439727783, "rewards/margins": 1.9132578032357352, "rewards/rejected": 0.44203094073704313, "step": 408 }, { "epoch": 0.6888421052631579, "grad_norm": 49.55293923346674, "kl": 0.0, "learning_rate": 1.3532454194547732e-07, "logits/chosen": -869699072.0, "logits/rejected": -755703552.0, "logps/chosen": -211.62156677246094, "logps/rejected": -255.9964141845703, "loss": 0.2897, "rewards/chosen": 2.3169875144958496, "rewards/margins": 3.9210219383239746, "rewards/rejected": -1.604034423828125, "step": 409 }, { "epoch": 0.6905263157894737, "grad_norm": 45.07517151760579, "kl": 0.0, "learning_rate": 1.3401961050917228e-07, "logits/chosen": -1723337728.0, "logits/rejected": -646925004.8, "logps/chosen": -234.47102864583334, "logps/rejected": -205.66240234375, "loss": 0.3623, "rewards/chosen": 0.6976583003997803, "rewards/margins": 0.9717099666595459, "rewards/rejected": -0.2740516662597656, "step": 410 }, { "epoch": 0.6922105263157895, "grad_norm": 41.62482913947959, "kl": 0.0, "learning_rate": 1.3271869328705516e-07, "logits/chosen": -539952554.6666666, "logits/rejected": -892416409.6, "logps/chosen": -198.6785888671875, "logps/rejected": -201.31024169921875, "loss": 0.3754, "rewards/chosen": 1.5827067693074544, "rewards/margins": 3.3067630132039385, "rewards/rejected": -1.7240562438964844, "step": 411 }, { "epoch": 0.6938947368421052, "grad_norm": 42.26855513911886, "kl": 0.0, "learning_rate": 1.314218353053619e-07, "logits/chosen": -660784768.0, "logits/rejected": -657162342.4, "logps/chosen": -219.67671712239584, "logps/rejected": -248.3271484375, "loss": 0.3954, "rewards/chosen": 0.8856130441029867, "rewards/margins": 2.3070303757985435, "rewards/rejected": -1.4214173316955567, "step": 412 }, { "epoch": 0.6955789473684211, "grad_norm": 63.372635671964915, "kl": 0.0, "learning_rate": 1.301290814498335e-07, "logits/chosen": -682650026.6666666, "logits/rejected": -750546227.2, "logps/chosen": -216.4493408203125, "logps/rejected": -230.68525390625, "loss": 0.3873, "rewards/chosen": 2.892109235127767, "rewards/margins": 3.49957750638326, "rewards/rejected": -0.6074682712554932, "step": 413 }, { "epoch": 0.6972631578947368, "grad_norm": 31.557969726141994, "kl": 0.0, "learning_rate": 1.2884047646416205e-07, "logits/chosen": -455988778.6666667, "logits/rejected": -1020206796.8, "logps/chosen": -256.14182535807294, "logps/rejected": -181.9263916015625, "loss": 0.2895, "rewards/chosen": 0.852165699005127, "rewards/margins": 0.872660094499588, "rewards/rejected": -0.02049439549446106, "step": 414 }, { "epoch": 0.6989473684210527, "grad_norm": 44.99525550241327, "kl": 0.0, "learning_rate": 1.2755606494844294e-07, "logits/chosen": -619836032.0, "logits/rejected": -689233024.0, "logps/chosen": -243.01300048828125, "logps/rejected": -235.70297241210938, "loss": 0.3066, "rewards/chosen": 1.5389785766601562, "rewards/margins": 2.8424285650253296, "rewards/rejected": -1.3034499883651733, "step": 415 }, { "epoch": 0.7006315789473684, "grad_norm": 46.608977034452934, "kl": 0.0, "learning_rate": 1.2627589135763066e-07, "logits/chosen": -622393664.0, "logits/rejected": -588082816.0, "logps/chosen": -263.4278869628906, "logps/rejected": -237.45460510253906, "loss": 0.3805, "rewards/chosen": 0.9929741621017456, "rewards/margins": 2.5076760053634644, "rewards/rejected": -1.5147018432617188, "step": 416 }, { "epoch": 0.7023157894736842, "grad_norm": 45.15173225020037, "kl": 0.0, "learning_rate": 1.2500000000000005e-07, "logits/chosen": -290351680.0, "logits/rejected": -570943590.4, "logps/chosen": -210.52325439453125, "logps/rejected": -230.669921875, "loss": 0.3545, "rewards/chosen": 1.303624947865804, "rewards/margins": 1.4630508740743, "rewards/rejected": -0.1594259262084961, "step": 417 }, { "epoch": 0.704, "grad_norm": 49.827214101384186, "kl": 0.0, "learning_rate": 1.2372843503561318e-07, "logits/chosen": -860238028.8, "logits/rejected": -1133508778.6666667, "logps/chosen": -212.53994140625, "logps/rejected": -191.24481201171875, "loss": 0.3653, "rewards/chosen": 1.4028396606445312, "rewards/margins": 1.0293253660202026, "rewards/rejected": 0.3735142946243286, "step": 418 }, { "epoch": 0.7056842105263158, "grad_norm": 41.39161745285879, "kl": 0.0, "learning_rate": 1.2246124047479074e-07, "logits/chosen": -1557299200.0, "logits/rejected": -1170218752.0, "logps/chosen": -264.32098388671875, "logps/rejected": -288.19140625, "loss": 0.2715, "rewards/chosen": -0.8450756072998047, "rewards/margins": 5.187082767486572, "rewards/rejected": -6.032158374786377, "step": 419 }, { "epoch": 0.7073684210526315, "grad_norm": 55.33615495444503, "kl": 0.0, "learning_rate": 1.211984601765884e-07, "logits/chosen": -1000780970.6666666, "logits/rejected": -1060321177.6, "logps/chosen": -244.44415283203125, "logps/rejected": -210.060546875, "loss": 0.4162, "rewards/chosen": 0.8905111153920492, "rewards/margins": 1.1921974500020345, "rewards/rejected": -0.30168633460998534, "step": 420 }, { "epoch": 0.7090526315789474, "grad_norm": 48.50467903033437, "kl": 0.0, "learning_rate": 1.1994013784727946e-07, "logits/chosen": -798935961.6, "logits/rejected": -556915968.0, "logps/chosen": -193.3767333984375, "logps/rejected": -204.73103841145834, "loss": 0.3481, "rewards/chosen": 1.7989086151123046, "rewards/margins": 0.6901429335276286, "rewards/rejected": 1.108765681584676, "step": 421 }, { "epoch": 0.7107368421052631, "grad_norm": 43.771933039570335, "kl": 0.0, "learning_rate": 1.1868631703884183e-07, "logits/chosen": -770181836.8, "logits/rejected": -566833578.6666666, "logps/chosen": -239.477392578125, "logps/rejected": -194.196533203125, "loss": 0.3706, "rewards/chosen": 0.2928351402282715, "rewards/margins": -1.4519967397054034, "rewards/rejected": 1.744831879933675, "step": 422 }, { "epoch": 0.712421052631579, "grad_norm": 52.317703020939184, "kl": 0.0, "learning_rate": 1.1743704114745029e-07, "logits/chosen": -1021569408.0, "logits/rejected": -1079399082.6666667, "logps/chosen": -271.7890625, "logps/rejected": -211.10260009765625, "loss": 0.3508, "rewards/chosen": 1.666670322418213, "rewards/margins": 3.195549964904785, "rewards/rejected": -1.5288796424865723, "step": 423 }, { "epoch": 0.7141052631578947, "grad_norm": 44.73164989384191, "kl": 0.0, "learning_rate": 1.1619235341197517e-07, "logits/chosen": -833912896.0, "logits/rejected": -611463552.0, "logps/chosen": -261.9710693359375, "logps/rejected": -261.09979248046875, "loss": 0.4409, "rewards/chosen": -0.13229060173034668, "rewards/margins": 1.3757647673288982, "rewards/rejected": -1.5080553690592449, "step": 424 }, { "epoch": 0.7157894736842105, "grad_norm": 43.20707344363169, "kl": 0.0, "learning_rate": 1.1495229691248543e-07, "logits/chosen": -771930453.3333334, "logits/rejected": -296132800.0, "logps/chosen": -194.66646321614584, "logps/rejected": -196.60357666015625, "loss": 0.3797, "rewards/chosen": 0.4760231177012126, "rewards/margins": -0.01816942294438678, "rewards/rejected": 0.49419254064559937, "step": 425 }, { "epoch": 0.7174736842105263, "grad_norm": 37.98854463213142, "kl": 0.0, "learning_rate": 1.1371691456875735e-07, "logits/chosen": -1023635285.3333334, "logits/rejected": -639436902.4, "logps/chosen": -228.6927693684896, "logps/rejected": -203.55322265625, "loss": 0.4071, "rewards/chosen": 3.219998041788737, "rewards/margins": 3.7488079706827797, "rewards/rejected": -0.528809928894043, "step": 426 }, { "epoch": 0.7191578947368421, "grad_norm": 38.6260673498228, "kl": 0.0, "learning_rate": 1.1248624913878965e-07, "logits/chosen": -868456618.6666666, "logits/rejected": -591238860.8, "logps/chosen": -246.4551798502604, "logps/rejected": -202.9473876953125, "loss": 0.4088, "rewards/chosen": 2.625763416290283, "rewards/margins": 2.438036632537842, "rewards/rejected": 0.1877267837524414, "step": 427 }, { "epoch": 0.7208421052631578, "grad_norm": 42.32410569595673, "kl": 0.0, "learning_rate": 1.1126034321732325e-07, "logits/chosen": -789433344.0, "logits/rejected": -534869043.2, "logps/chosen": -241.09903971354166, "logps/rejected": -267.5244384765625, "loss": 0.3987, "rewards/chosen": -0.5567977428436279, "rewards/margins": -1.7331472873687743, "rewards/rejected": 1.1763495445251464, "step": 428 }, { "epoch": 0.7225263157894737, "grad_norm": 33.14828016562309, "kl": 0.0, "learning_rate": 1.100392392343667e-07, "logits/chosen": -811491712.0, "logits/rejected": -881575552.0, "logps/chosen": -205.48550415039062, "logps/rejected": -208.66879272460938, "loss": 0.3453, "rewards/chosen": 1.3217136859893799, "rewards/margins": 3.224560260772705, "rewards/rejected": -1.9028465747833252, "step": 429 }, { "epoch": 0.7242105263157895, "grad_norm": 45.216833187005214, "kl": 0.0, "learning_rate": 1.0882297945372828e-07, "logits/chosen": -879807385.6, "logits/rejected": -1048175104.0, "logps/chosen": -217.96474609375, "logps/rejected": -228.4263916015625, "loss": 0.2549, "rewards/chosen": 2.451664924621582, "rewards/margins": 4.334318161010742, "rewards/rejected": -1.8826532363891602, "step": 430 }, { "epoch": 0.7258947368421053, "grad_norm": 34.528325072499406, "kl": 0.0, "learning_rate": 1.0761160597155286e-07, "logits/chosen": -767435264.0, "logits/rejected": -604252245.3333334, "logps/chosen": -238.663525390625, "logps/rejected": -211.9125773111979, "loss": 0.4655, "rewards/chosen": 0.8303826332092286, "rewards/margins": 3.884237257639567, "rewards/rejected": -3.0538546244303384, "step": 431 }, { "epoch": 0.7275789473684211, "grad_norm": 71.46182969123905, "kl": 0.0, "learning_rate": 1.0640516071486466e-07, "logits/chosen": -645191936.0, "logits/rejected": -718282432.0, "logps/chosen": -243.2042236328125, "logps/rejected": -196.67498779296875, "loss": 0.484, "rewards/chosen": 0.9913086295127869, "rewards/margins": 0.5928112268447876, "rewards/rejected": 0.39849740266799927, "step": 432 }, { "epoch": 0.7292631578947368, "grad_norm": 37.01038450064629, "kl": 0.0, "learning_rate": 1.052036854401166e-07, "logits/chosen": -916742485.3333334, "logits/rejected": -1014475161.6, "logps/chosen": -251.3818359375, "logps/rejected": -294.1171630859375, "loss": 0.3733, "rewards/chosen": 2.161475658416748, "rewards/margins": 5.233707332611084, "rewards/rejected": -3.072231674194336, "step": 433 }, { "epoch": 0.7309473684210527, "grad_norm": 47.994445942830765, "kl": 0.0, "learning_rate": 1.0400722173174489e-07, "logits/chosen": -653794218.6666666, "logits/rejected": -922273920.0, "logps/chosen": -217.94051106770834, "logps/rejected": -335.4775390625, "loss": 0.3971, "rewards/chosen": 1.1465115547180176, "rewards/margins": 9.679410457611084, "rewards/rejected": -8.532898902893066, "step": 434 }, { "epoch": 0.7326315789473684, "grad_norm": 31.242483556300705, "kl": 0.0, "learning_rate": 1.0281581100072939e-07, "logits/chosen": -1153430144.0, "logits/rejected": -658751168.0, "logps/chosen": -223.9492950439453, "logps/rejected": -250.08029174804688, "loss": 0.3316, "rewards/chosen": -0.05509984493255615, "rewards/margins": 2.957166314125061, "rewards/rejected": -3.012266159057617, "step": 435 }, { "epoch": 0.7343157894736843, "grad_norm": 44.882205337938544, "kl": 0.0, "learning_rate": 1.0162949448316088e-07, "logits/chosen": -1179825561.6, "logits/rejected": -795787349.3333334, "logps/chosen": -184.4357421875, "logps/rejected": -319.9234619140625, "loss": 0.4252, "rewards/chosen": 1.6069551467895509, "rewards/margins": 3.3702004114786783, "rewards/rejected": -1.7632452646891277, "step": 436 }, { "epoch": 0.736, "grad_norm": 53.02185490371656, "kl": 0.0, "learning_rate": 1.0044831323881357e-07, "logits/chosen": -786681753.6, "logits/rejected": -730507520.0, "logps/chosen": -197.95892333984375, "logps/rejected": -195.9046630859375, "loss": 0.3626, "rewards/chosen": 0.8055712699890136, "rewards/margins": 2.56037015914917, "rewards/rejected": -1.7547988891601562, "step": 437 }, { "epoch": 0.7376842105263158, "grad_norm": 41.26140691549541, "kl": 0.0, "learning_rate": 9.927230814972382e-08, "logits/chosen": -1242336402.2857144, "logits/rejected": -715042368.0, "logps/chosen": -253.29446847098214, "logps/rejected": -256.8112487792969, "loss": 0.4456, "rewards/chosen": 1.5902636391775948, "rewards/margins": 5.37732127734593, "rewards/rejected": -3.787057638168335, "step": 438 }, { "epoch": 0.7393684210526316, "grad_norm": 39.11479149303325, "kl": 0.0, "learning_rate": 9.81015199187753e-08, "logits/chosen": -1125344256.0, "logits/rejected": -827617536.0, "logps/chosen": -162.90849812825522, "logps/rejected": -385.02578125, "loss": 0.2965, "rewards/chosen": 2.1140575408935547, "rewards/margins": 4.867137718200683, "rewards/rejected": -2.7530801773071287, "step": 439 }, { "epoch": 0.7410526315789474, "grad_norm": 42.334848723588486, "kl": 0.0, "learning_rate": 9.693598906829046e-08, "logits/chosen": -1135330304.0, "logits/rejected": -764287616.0, "logps/chosen": -208.2585652669271, "logps/rejected": -316.421875, "loss": 0.4066, "rewards/chosen": 1.152496576309204, "rewards/margins": 4.668377876281738, "rewards/rejected": -3.515881299972534, "step": 440 }, { "epoch": 0.7427368421052631, "grad_norm": 41.258031701194476, "kl": 0.0, "learning_rate": 9.577575593862775e-08, "logits/chosen": -715377024.0, "logits/rejected": -1129872384.0, "logps/chosen": -234.7706298828125, "logps/rejected": -306.43499755859375, "loss": 0.3972, "rewards/chosen": 1.0637226104736328, "rewards/margins": 2.9347736835479736, "rewards/rejected": -1.8710510730743408, "step": 441 }, { "epoch": 0.744421052631579, "grad_norm": 54.07697190099627, "kl": 0.0, "learning_rate": 9.462086068678519e-08, "logits/chosen": -766640000.0, "logits/rejected": -959083136.0, "logps/chosen": -211.8131103515625, "logps/rejected": -283.1666259765625, "loss": 0.401, "rewards/chosen": -0.5408859252929688, "rewards/margins": 1.790869951248169, "rewards/rejected": -2.3317558765411377, "step": 442 }, { "epoch": 0.7461052631578947, "grad_norm": 66.5169402578717, "kl": 0.0, "learning_rate": 9.347134328501097e-08, "logits/chosen": -653176448.0, "logits/rejected": -1222430976.0, "logps/chosen": -176.54693603515625, "logps/rejected": -252.2942657470703, "loss": 0.3954, "rewards/chosen": -0.43219953775405884, "rewards/margins": 0.7258930802345276, "rewards/rejected": -1.1580926179885864, "step": 443 }, { "epoch": 0.7477894736842106, "grad_norm": 59.0005118657837, "kl": 0.0, "learning_rate": 9.232724351941978e-08, "logits/chosen": -756462080.0, "logits/rejected": -722251946.6666666, "logps/chosen": -305.7142333984375, "logps/rejected": -234.7782999674479, "loss": 0.3794, "rewards/chosen": 0.13024139404296875, "rewards/margins": 0.5876843929290771, "rewards/rejected": -0.4574429988861084, "step": 444 }, { "epoch": 0.7494736842105263, "grad_norm": 55.62210680654247, "kl": 0.0, "learning_rate": 9.118860098861537e-08, "logits/chosen": -813152153.6, "logits/rejected": -290959360.0, "logps/chosen": -197.15120849609374, "logps/rejected": -282.9163004557292, "loss": 0.4162, "rewards/chosen": 0.6745959281921386, "rewards/margins": 1.0792610804239908, "rewards/rejected": -0.40466515223185223, "step": 445 }, { "epoch": 0.7511578947368421, "grad_norm": 358.03078972226217, "kl": 0.0, "learning_rate": 9.005545510232068e-08, "logits/chosen": -854770614.8571428, "logits/rejected": -522669440.0, "logps/chosen": -227.82121930803572, "logps/rejected": -329.3843994140625, "loss": 0.2992, "rewards/chosen": 1.0909557342529297, "rewards/margins": 7.157499313354492, "rewards/rejected": -6.0665435791015625, "step": 446 }, { "epoch": 0.7528421052631579, "grad_norm": 40.23280955297014, "kl": 0.0, "learning_rate": 8.892784508001341e-08, "logits/chosen": -665233920.0, "logits/rejected": -786437529.6, "logps/chosen": -205.84651692708334, "logps/rejected": -208.0626220703125, "loss": 0.3587, "rewards/chosen": 3.913280804951986, "rewards/margins": 4.716941197713217, "rewards/rejected": -0.8036603927612305, "step": 447 }, { "epoch": 0.7545263157894737, "grad_norm": 331.3992420977293, "kl": 0.0, "learning_rate": 8.780580994956849e-08, "logits/chosen": -991909824.0, "logits/rejected": -1036429738.6666666, "logps/chosen": -241.2684326171875, "logps/rejected": -222.25858561197916, "loss": 0.3923, "rewards/chosen": 0.7948417663574219, "rewards/margins": 0.6548532644907633, "rewards/rejected": 0.13998850186665854, "step": 448 }, { "epoch": 0.7562105263157894, "grad_norm": 33.19053977720104, "kl": 0.0, "learning_rate": 8.668938854590763e-08, "logits/chosen": -644909875.2, "logits/rejected": -695736832.0, "logps/chosen": -230.962060546875, "logps/rejected": -194.96405029296875, "loss": 0.4064, "rewards/chosen": 2.0331790924072264, "rewards/margins": -0.12323621114095085, "rewards/rejected": 2.1564153035481772, "step": 449 }, { "epoch": 0.7578947368421053, "grad_norm": 45.92452873664749, "kl": 0.0, "learning_rate": 8.557861950965509e-08, "logits/chosen": -1397461930.6666667, "logits/rejected": -905682124.8, "logps/chosen": -271.2666422526042, "logps/rejected": -288.8114013671875, "loss": 0.3724, "rewards/chosen": -0.3505045572916667, "rewards/margins": 2.9490411122639975, "rewards/rejected": -3.299545669555664, "step": 450 }, { "epoch": 0.759578947368421, "grad_norm": 40.248447099852605, "kl": 0.0, "learning_rate": 8.44735412857999e-08, "logits/chosen": -1039405504.0, "logits/rejected": -620940032.0, "logps/chosen": -259.6157531738281, "logps/rejected": -244.8777618408203, "loss": 0.3763, "rewards/chosen": 0.6175476312637329, "rewards/margins": 3.04443895816803, "rewards/rejected": -2.426891326904297, "step": 451 }, { "epoch": 0.7612631578947369, "grad_norm": 37.694918257120385, "kl": 0.0, "learning_rate": 8.337419212236584e-08, "logits/chosen": -523443744.0, "logits/rejected": -701745856.0, "logps/chosen": -225.3424072265625, "logps/rejected": -293.00054931640625, "loss": 0.3628, "rewards/chosen": 2.430769443511963, "rewards/margins": 3.699450373649597, "rewards/rejected": -1.2686809301376343, "step": 452 }, { "epoch": 0.7629473684210526, "grad_norm": 35.15022250675745, "kl": 0.0, "learning_rate": 8.228061006908738e-08, "logits/chosen": -789152448.0, "logits/rejected": -639658240.0, "logps/chosen": -204.5469207763672, "logps/rejected": -296.8457336425781, "loss": 0.3558, "rewards/chosen": 0.2700985074043274, "rewards/margins": 10.158620655536652, "rewards/rejected": -9.888522148132324, "step": 453 }, { "epoch": 0.7646315789473684, "grad_norm": 47.5088310425883, "kl": 0.0, "learning_rate": 8.119283297609236e-08, "logits/chosen": -1028607744.0, "logits/rejected": -645622656.0, "logps/chosen": -289.964111328125, "logps/rejected": -224.54290771484375, "loss": 0.3183, "rewards/chosen": 0.8806011080741882, "rewards/margins": 1.526337444782257, "rewards/rejected": -0.6457363367080688, "step": 454 }, { "epoch": 0.7663157894736842, "grad_norm": 34.441433884082336, "kl": 0.0, "learning_rate": 8.011089849259262e-08, "logits/chosen": -749322752.0, "logps/chosen": -202.4013671875, "loss": 0.3446, "rewards/chosen": 0.888677179813385, "step": 455 }, { "epoch": 0.768, "grad_norm": 43.86036582680801, "kl": 0.0, "learning_rate": 7.903484406558053e-08, "logits/chosen": -536868650.6666667, "logits/rejected": -616575808.0, "logps/chosen": -276.9130045572917, "logps/rejected": -232.73355102539062, "loss": 0.43, "rewards/chosen": -0.4769521156946818, "rewards/margins": 0.35738088687260944, "rewards/rejected": -0.8343330025672913, "step": 456 }, { "epoch": 0.7696842105263157, "grad_norm": 42.12234531239963, "kl": 0.0, "learning_rate": 7.79647069385328e-08, "logits/chosen": -752578133.3333334, "logits/rejected": -934050816.0, "logps/chosen": -224.02742513020834, "logps/rejected": -277.469970703125, "loss": 0.4204, "rewards/chosen": -0.5293665726979574, "rewards/margins": -1.098210604985555, "rewards/rejected": 0.5688440322875976, "step": 457 }, { "epoch": 0.7713684210526316, "grad_norm": 42.59646325267393, "kl": 0.0, "learning_rate": 7.690052415012174e-08, "logits/chosen": -938434252.8, "logits/rejected": -1136305237.3333333, "logps/chosen": -216.8803466796875, "logps/rejected": -190.39237467447916, "loss": 0.2499, "rewards/chosen": 1.1629043579101563, "rewards/margins": 4.905188814798991, "rewards/rejected": -3.7422844568888345, "step": 458 }, { "epoch": 0.7730526315789473, "grad_norm": 52.399177497226496, "kl": 0.0, "learning_rate": 7.584233253293326e-08, "logits/chosen": -716863027.2, "logits/rejected": -816465237.3333334, "logps/chosen": -243.8283203125, "logps/rejected": -196.03934733072916, "loss": 0.2746, "rewards/chosen": 1.0614694595336913, "rewards/margins": 2.684300168355306, "rewards/rejected": -1.6228307088216145, "step": 459 }, { "epoch": 0.7747368421052632, "grad_norm": 52.891481922508966, "kl": 0.0, "learning_rate": 7.479016871219173e-08, "logits/chosen": -917309440.0, "logits/rejected": -898528870.4, "logps/chosen": -282.1151936848958, "logps/rejected": -213.7822998046875, "loss": 0.4074, "rewards/chosen": -0.2861933708190918, "rewards/margins": 1.0193795204162597, "rewards/rejected": -1.3055728912353515, "step": 460 }, { "epoch": 0.7764210526315789, "grad_norm": 35.541679693506616, "kl": 0.0, "learning_rate": 7.374406910449276e-08, "logits/chosen": -625601749.3333334, "logits/rejected": -777565235.2, "logps/chosen": -214.69976806640625, "logps/rejected": -227.8533447265625, "loss": 0.3537, "rewards/chosen": 0.7599695523579916, "rewards/margins": 2.2433162053426106, "rewards/rejected": -1.483346652984619, "step": 461 }, { "epoch": 0.7781052631578947, "grad_norm": 63.418725969793606, "kl": 0.0, "learning_rate": 7.270406991654273e-08, "logits/chosen": -963097258.6666666, "logits/rejected": -1547470848.0, "logps/chosen": -263.2251383463542, "logps/rejected": -274.23115234375, "loss": 0.425, "rewards/chosen": 0.2878738244374593, "rewards/margins": 1.6172294457753498, "rewards/rejected": -1.3293556213378905, "step": 462 }, { "epoch": 0.7797894736842105, "grad_norm": 38.377982473660865, "kl": 0.0, "learning_rate": 7.167020714390501e-08, "logits/chosen": -783421030.4, "logits/rejected": -488972885.3333333, "logps/chosen": -215.7775390625, "logps/rejected": -179.737548828125, "loss": 0.436, "rewards/chosen": 0.9828885078430176, "rewards/margins": 0.3364712874094645, "rewards/rejected": 0.6464172204335531, "step": 463 }, { "epoch": 0.7814736842105263, "grad_norm": 45.40428262520681, "kl": 0.0, "learning_rate": 7.064251656975503e-08, "logits/chosen": -917309888.0, "logits/rejected": -730922069.3333334, "logps/chosen": -207.1556396484375, "logps/rejected": -247.15775553385416, "loss": 0.361, "rewards/chosen": 1.675153374671936, "rewards/margins": 3.9481043418248496, "rewards/rejected": -2.2729509671529136, "step": 464 }, { "epoch": 0.783157894736842, "grad_norm": 55.99199517489128, "kl": 0.0, "learning_rate": 6.96210337636414e-08, "logits/chosen": -1108944512.0, "logits/rejected": -576058944.0, "logps/chosen": -289.2898864746094, "logps/rejected": -202.31178283691406, "loss": 0.3961, "rewards/chosen": -0.21090269088745117, "rewards/margins": -0.2718646228313446, "rewards/rejected": 0.06096193194389343, "step": 465 }, { "epoch": 0.7848421052631579, "grad_norm": 49.25923339661157, "kl": 0.0, "learning_rate": 6.860579408025435e-08, "logits/chosen": -814812672.0, "logits/rejected": -933722880.0, "logps/chosen": -191.00594075520834, "logps/rejected": -235.2879638671875, "loss": 0.3267, "rewards/chosen": 1.0561386744181316, "rewards/margins": 2.257707277933757, "rewards/rejected": -1.201568603515625, "step": 466 }, { "epoch": 0.7865263157894736, "grad_norm": 52.73809432153172, "kl": 0.0, "learning_rate": 6.759683265820293e-08, "logits/chosen": -749964800.0, "logits/rejected": -1379509504.0, "logps/chosen": -182.05026245117188, "logps/rejected": -213.2086639404297, "loss": 0.4184, "rewards/chosen": 1.8726093769073486, "rewards/margins": 3.3285834789276123, "rewards/rejected": -1.4559741020202637, "step": 467 }, { "epoch": 0.7882105263157895, "grad_norm": 55.06684904272287, "kl": 0.0, "learning_rate": 6.659418441879816e-08, "logits/chosen": -886685760.0, "logits/rejected": -867823936.0, "logps/chosen": -242.12347412109375, "logps/rejected": -234.9542236328125, "loss": 0.4054, "rewards/chosen": 0.7248154282569885, "rewards/margins": 0.6286347508430481, "rewards/rejected": 0.09618067741394043, "step": 468 }, { "epoch": 0.7898947368421053, "grad_norm": 41.061950624386746, "kl": 0.0, "learning_rate": 6.559788406484445e-08, "logits/chosen": -1067656874.6666666, "logits/rejected": -635095756.8, "logps/chosen": -228.18094889322916, "logps/rejected": -251.1126708984375, "loss": 0.3713, "rewards/chosen": 2.0004892349243164, "rewards/margins": 4.07933406829834, "rewards/rejected": -2.0788448333740233, "step": 469 }, { "epoch": 0.791578947368421, "grad_norm": 55.92018991052814, "kl": 0.0, "learning_rate": 6.460796607943888e-08, "logits/chosen": -1008979114.6666666, "logits/rejected": -580155904.0, "logps/chosen": -220.10860188802084, "logps/rejected": -200.3806640625, "loss": 0.3572, "rewards/chosen": 0.6659164428710938, "rewards/margins": 2.2499670028686523, "rewards/rejected": -1.5840505599975585, "step": 470 }, { "epoch": 0.7932631578947369, "grad_norm": 44.05178707657523, "kl": 0.0, "learning_rate": 6.362446472477739e-08, "logits/chosen": -886915754.6666666, "logits/rejected": -697056153.6, "logps/chosen": -161.08222452799478, "logps/rejected": -195.1838623046875, "loss": 0.3431, "rewards/chosen": 2.043999671936035, "rewards/margins": 1.8765502452850342, "rewards/rejected": 0.167449426651001, "step": 471 }, { "epoch": 0.7949473684210526, "grad_norm": 76.34832400410619, "kl": 0.0, "learning_rate": 6.264741404096873e-08, "logits/chosen": -970343594.6666666, "logits/rejected": -1028333772.8, "logps/chosen": -240.02298990885416, "logps/rejected": -181.0073486328125, "loss": 0.4382, "rewards/chosen": 1.713447093963623, "rewards/margins": 0.9383099555969239, "rewards/rejected": 0.7751371383666992, "step": 472 }, { "epoch": 0.7966315789473685, "grad_norm": 53.36902208721296, "kl": 0.0, "learning_rate": 6.16768478448568e-08, "logits/chosen": -770189056.0, "logits/rejected": -784716697.6, "logps/chosen": -198.6039021809896, "logps/rejected": -230.421240234375, "loss": 0.4316, "rewards/chosen": 1.7407466570536296, "rewards/margins": 0.8216514269510906, "rewards/rejected": 0.919095230102539, "step": 473 }, { "epoch": 0.7983157894736842, "grad_norm": 76.44940846297158, "kl": 0.0, "learning_rate": 6.071279972884996e-08, "logits/chosen": -650762581.3333334, "logits/rejected": -824322304.0, "logps/chosen": -271.9395345052083, "logps/rejected": -240.26007080078125, "loss": 0.374, "rewards/chosen": 0.2647084792455037, "rewards/margins": 1.6960653861363728, "rewards/rejected": -1.4313569068908691, "step": 474 }, { "epoch": 0.8, "grad_norm": 38.84706884248548, "kl": 0.0, "learning_rate": 5.975530305975806e-08, "logits/chosen": -953603481.6, "logits/rejected": -874571178.6666666, "logps/chosen": -224.2393310546875, "logps/rejected": -248.4403076171875, "loss": 0.3032, "rewards/chosen": 0.2576324462890625, "rewards/margins": 2.0944193522135417, "rewards/rejected": -1.8367869059244792, "step": 475 }, { "epoch": 0.8016842105263158, "grad_norm": 80.57439772482964, "kl": 0.0, "learning_rate": 5.88043909776382e-08, "logits/chosen": -720325705.1428572, "logits/rejected": -1631839360.0, "logps/chosen": -212.89100864955358, "logps/rejected": -245.09353637695312, "loss": 0.3109, "rewards/chosen": 0.8418189457484654, "rewards/margins": 1.0114402196237018, "rewards/rejected": -0.1696212738752365, "step": 476 }, { "epoch": 0.8033684210526316, "grad_norm": 60.404391161081, "kl": 0.0, "learning_rate": 5.786009639464728e-08, "logits/chosen": -754591168.0, "logits/rejected": -984781056.0, "logps/chosen": -213.29608154296875, "logps/rejected": -191.29742431640625, "loss": 0.3819, "rewards/chosen": 1.0502116680145264, "rewards/margins": 8.452488660812378, "rewards/rejected": -7.402276992797852, "step": 477 }, { "epoch": 0.8050526315789474, "grad_norm": 39.88427529417202, "kl": 0.0, "learning_rate": 5.69224519939028e-08, "logits/chosen": -1256239872.0, "logits/rejected": -680008704.0, "logps/chosen": -207.2843017578125, "logps/rejected": -281.371337890625, "loss": 0.4186, "rewards/chosen": 2.776592969894409, "rewards/margins": 2.579076017652239, "rewards/rejected": 0.19751695224217006, "step": 478 }, { "epoch": 0.8067368421052632, "grad_norm": 46.39968561566463, "kl": 0.0, "learning_rate": 5.599149022835201e-08, "logits/chosen": -757022515.2, "logits/rejected": -568918784.0, "logps/chosen": -190.7361572265625, "logps/rejected": -234.748291015625, "loss": 0.3371, "rewards/chosen": 1.5194448471069335, "rewards/margins": 3.035427951812744, "rewards/rejected": -1.5159831047058105, "step": 479 }, { "epoch": 0.8084210526315789, "grad_norm": 40.26692401415863, "kl": 0.0, "learning_rate": 5.5067243319648517e-08, "logits/chosen": -628281651.2, "logits/rejected": -631906560.0, "logps/chosen": -231.4928466796875, "logps/rejected": -164.00850423177084, "loss": 0.3951, "rewards/chosen": 0.1779022455215454, "rewards/margins": 1.319037381807963, "rewards/rejected": -1.1411351362864177, "step": 480 }, { "epoch": 0.8101052631578948, "grad_norm": 58.215282154133604, "kl": 0.0, "learning_rate": 5.414974325703686e-08, "logits/chosen": -691516928.0, "logits/rejected": -898130624.0, "logps/chosen": -199.2517852783203, "logps/rejected": -193.02496337890625, "loss": 0.4195, "rewards/chosen": 0.9336433410644531, "rewards/margins": 0.8423346877098083, "rewards/rejected": 0.09130865335464478, "step": 481 }, { "epoch": 0.8117894736842105, "grad_norm": 50.746031932726225, "kl": 0.0, "learning_rate": 5.3239021796245707e-08, "logits/chosen": -643468480.0, "logits/rejected": -744267861.3333334, "logps/chosen": -202.1339111328125, "logps/rejected": -236.04170735677084, "loss": 0.3827, "rewards/chosen": 0.5528717041015625, "rewards/margins": 0.6056533952554067, "rewards/rejected": -0.0527816911538442, "step": 482 }, { "epoch": 0.8134736842105263, "grad_norm": 90.38953108333145, "kl": 0.0, "learning_rate": 5.2335110458388457e-08, "logits/chosen": -861357653.3333334, "logits/rejected": -1220613529.6, "logps/chosen": -228.4935302734375, "logps/rejected": -244.1078125, "loss": 0.3977, "rewards/chosen": 0.24794367949167886, "rewards/margins": 1.8121562878290813, "rewards/rejected": -1.5642126083374024, "step": 483 }, { "epoch": 0.8151578947368421, "grad_norm": 40.4236540400168, "kl": 0.0, "learning_rate": 5.1438040528872265e-08, "logits/chosen": -782614656.0, "logits/rejected": -757872469.3333334, "logps/chosen": -136.77349853515625, "logps/rejected": -224.17281087239584, "loss": 0.5002, "rewards/chosen": 1.6036109924316406, "rewards/margins": 0.9859415292739868, "rewards/rejected": 0.6176694631576538, "step": 484 }, { "epoch": 0.8168421052631579, "grad_norm": 121.37099361156412, "kl": 0.0, "learning_rate": 5.054784305631546e-08, "logits/chosen": -910532864.0, "logits/rejected": -786700117.3333334, "logps/chosen": -177.5673065185547, "logps/rejected": -266.01979573567706, "loss": 0.2893, "rewards/chosen": -0.057147204875946045, "rewards/margins": 1.8595226407051086, "rewards/rejected": -1.9166698455810547, "step": 485 }, { "epoch": 0.8185263157894737, "grad_norm": 50.530115528782915, "kl": 0.0, "learning_rate": 4.96645488514727e-08, "logits/chosen": -788890709.3333334, "logits/rejected": -455711808.0, "logps/chosen": -256.8352457682292, "logps/rejected": -267.2051086425781, "loss": 0.4482, "rewards/chosen": -0.5332825183868408, "rewards/margins": 0.43467551469802856, "rewards/rejected": -0.9679580330848694, "step": 486 }, { "epoch": 0.8202105263157895, "grad_norm": 70.07611011272452, "kl": 0.0, "learning_rate": 4.8788188486168616e-08, "logits/chosen": -872919142.4, "logits/rejected": -701736832.0, "logps/chosen": -193.6914794921875, "logps/rejected": -235.78462727864584, "loss": 0.3179, "rewards/chosen": 2.0400518417358398, "rewards/margins": 2.919521633783976, "rewards/rejected": -0.8794697920481364, "step": 487 }, { "epoch": 0.8218947368421052, "grad_norm": 52.808019352043246, "kl": 0.0, "learning_rate": 4.7918792292239635e-08, "logits/chosen": -670976153.6, "logits/rejected": -896994133.3333334, "logps/chosen": -245.185595703125, "logps/rejected": -399.8438313802083, "loss": 0.4311, "rewards/chosen": 0.799212646484375, "rewards/margins": 14.883302561442056, "rewards/rejected": -14.084089914957682, "step": 488 }, { "epoch": 0.8235789473684211, "grad_norm": 54.91548661991697, "kl": 0.0, "learning_rate": 4.705639036048439e-08, "logits/chosen": -952622208.0, "logits/rejected": -660675968.0, "logps/chosen": -200.71133422851562, "logps/rejected": -260.607666015625, "loss": 0.3739, "rewards/chosen": 1.0811817646026611, "rewards/margins": 0.40599095821380615, "rewards/rejected": 0.675190806388855, "step": 489 }, { "epoch": 0.8252631578947368, "grad_norm": 62.12949703126072, "kl": 0.0, "learning_rate": 4.6201012539622054e-08, "logits/chosen": -576909888.0, "logits/rejected": -580735232.0, "logps/chosen": -185.22006225585938, "logps/rejected": -247.2104695638021, "loss": 0.3748, "rewards/chosen": 2.8498971462249756, "rewards/margins": 3.258001764615377, "rewards/rejected": -0.4081046183904012, "step": 490 }, { "epoch": 0.8269473684210527, "grad_norm": 44.89977606216685, "kl": 0.0, "learning_rate": 4.535268843525908e-08, "logits/chosen": -561914880.0, "logits/rejected": -1542454400.0, "logps/chosen": -279.1746520996094, "logps/rejected": -252.60659790039062, "loss": 0.4187, "rewards/chosen": -0.5238960385322571, "rewards/margins": -1.0888721942901611, "rewards/rejected": 0.564976155757904, "step": 491 }, { "epoch": 0.8286315789473684, "grad_norm": 33.62401765550933, "kl": 0.0, "learning_rate": 4.4511447408864975e-08, "logits/chosen": -705677184.0, "logits/rejected": -642696576.0, "logps/chosen": -219.29757690429688, "logps/rejected": -180.13229370117188, "loss": 0.3157, "rewards/chosen": 0.919715940952301, "rewards/margins": 3.393587648868561, "rewards/rejected": -2.4738717079162598, "step": 492 }, { "epoch": 0.8303157894736842, "grad_norm": 44.80536332639876, "kl": 0.0, "learning_rate": 4.367731857675569e-08, "logits/chosen": -1012552362.6666666, "logits/rejected": -1070745088.0, "logps/chosen": -210.34134928385416, "logps/rejected": -197.56212158203124, "loss": 0.4666, "rewards/chosen": 2.4052974383036294, "rewards/margins": 1.760869280497233, "rewards/rejected": 0.6444281578063965, "step": 493 }, { "epoch": 0.832, "grad_norm": 69.77716049036187, "kl": 0.0, "learning_rate": 4.2850330809085874e-08, "logits/chosen": -630044373.3333334, "logits/rejected": -1326352512.0, "logps/chosen": -251.13602701822916, "logps/rejected": -142.5716094970703, "loss": 0.3156, "rewards/chosen": 0.20519916216532388, "rewards/margins": -1.417292316754659, "rewards/rejected": 1.622491478919983, "step": 494 }, { "epoch": 0.8336842105263158, "grad_norm": 36.11198104501435, "kl": 0.0, "learning_rate": 4.2030512728849945e-08, "logits/chosen": -921905493.3333334, "logits/rejected": -791465369.6, "logps/chosen": -237.568115234375, "logps/rejected": -210.0384521484375, "loss": 0.4533, "rewards/chosen": 0.15185137589772543, "rewards/margins": 1.321685512860616, "rewards/rejected": -1.1698341369628906, "step": 495 }, { "epoch": 0.8353684210526315, "grad_norm": 47.31830867232736, "kl": 0.0, "learning_rate": 4.1217892710891126e-08, "logits/chosen": -688228992.0, "logits/rejected": -928642474.6666666, "logps/chosen": -180.07191467285156, "logps/rejected": -247.9576416015625, "loss": 0.3971, "rewards/chosen": 2.725332736968994, "rewards/margins": 4.285257339477539, "rewards/rejected": -1.559924602508545, "step": 496 }, { "epoch": 0.8370526315789474, "grad_norm": 58.64030492711, "kl": 0.0, "learning_rate": 4.041249888091941e-08, "logits/chosen": -930103466.6666666, "logits/rejected": -520454144.0, "logps/chosen": -223.98262532552084, "logps/rejected": -291.0794982910156, "loss": 0.3902, "rewards/chosen": 0.41166090965270996, "rewards/margins": 2.5037872791290283, "rewards/rejected": -2.0921263694763184, "step": 497 }, { "epoch": 0.8387368421052631, "grad_norm": 42.51804510909749, "kl": 0.0, "learning_rate": 3.9614359114538196e-08, "logits/chosen": -594655701.3333334, "logits/rejected": -1102049280.0, "logps/chosen": -239.44209798177084, "logps/rejected": -235.8392578125, "loss": 0.4429, "rewards/chosen": -0.807283083597819, "rewards/margins": -1.4231882254282633, "rewards/rejected": 0.6159051418304443, "step": 498 }, { "epoch": 0.840421052631579, "grad_norm": 60.0326086702577, "kl": 0.0, "learning_rate": 3.8823501036279514e-08, "logits/chosen": -403930848.0, "logits/rejected": -417444640.0, "logps/chosen": -302.0504150390625, "logps/rejected": -168.2909698486328, "loss": 0.3504, "rewards/chosen": 0.14431113004684448, "rewards/margins": 1.6669214367866516, "rewards/rejected": -1.5226103067398071, "step": 499 }, { "epoch": 0.8421052631578947, "grad_norm": 44.106324622609606, "kl": 0.0, "learning_rate": 3.803995201864762e-08, "logits/chosen": -775272089.6, "logits/rejected": -703777536.0, "logps/chosen": -191.228076171875, "logps/rejected": -175.0321248372396, "loss": 0.3079, "rewards/chosen": 0.6324722290039062, "rewards/margins": 2.6547363917032873, "rewards/rejected": -2.0222641626993814, "step": 500 }, { "epoch": 0.8421052631578947, "eval_logits/chosen": -922359676.4770643, "eval_logits/rejected": -826945158.35461, "eval_logps/chosen": -217.7674706135321, "eval_logps/rejected": -228.38810394503545, "eval_loss": 0.378118097782135, "eval_rewards/chosen": 0.8047234771448538, "eval_rewards/margins": 1.8294198813516913, "eval_rewards/rejected": -1.0246964042068374, "eval_runtime": 1555.2949, "eval_samples_per_second": 0.643, "eval_steps_per_second": 0.161, "kl": 0.0, "step": 500 }, { "epoch": 0.8437894736842105, "grad_norm": 55.94728204942989, "kl": 0.0, "learning_rate": 3.726373918117196e-08, "logits/chosen": -971137472.0, "logits/rejected": -1374881280.0, "logps/chosen": -255.43304443359375, "logps/rejected": -218.9608612060547, "loss": 0.4355, "rewards/chosen": -0.6300251483917236, "rewards/margins": -1.8687539100646973, "rewards/rejected": 1.2387287616729736, "step": 501 }, { "epoch": 0.8454736842105263, "grad_norm": 47.235552657321186, "kl": 0.0, "learning_rate": 3.6494889389468434e-08, "logits/chosen": -602287744.0, "logits/rejected": -920995072.0, "logps/chosen": -210.18441772460938, "logps/rejected": -227.16696166992188, "loss": 0.3343, "rewards/chosen": 0.2986224889755249, "rewards/margins": 1.3155208826065063, "rewards/rejected": -1.0168983936309814, "step": 502 }, { "epoch": 0.8471578947368421, "grad_norm": 49.23690109646782, "kl": 0.0, "learning_rate": 3.573342925430925e-08, "logits/chosen": -778606080.0, "logits/rejected": -654501376.0, "logps/chosen": -210.22725423177084, "logps/rejected": -293.99200439453125, "loss": 0.373, "rewards/chosen": 0.8925824165344238, "rewards/margins": 2.6825954914093018, "rewards/rejected": -1.790013074874878, "step": 503 }, { "epoch": 0.8488421052631578, "grad_norm": 47.196350435325876, "kl": 0.0, "learning_rate": 3.497938513070234e-08, "logits/chosen": -747158118.4, "logits/rejected": -715651498.6666666, "logps/chosen": -196.84251708984374, "logps/rejected": -247.5506388346354, "loss": 0.3714, "rewards/chosen": 1.6987966537475585, "rewards/margins": 2.460467561086019, "rewards/rejected": -0.7616709073384603, "step": 504 }, { "epoch": 0.8505263157894737, "grad_norm": 67.84038276109705, "kl": 0.0, "learning_rate": 3.423278311697897e-08, "logits/chosen": -1138272768.0, "logits/rejected": -622846156.8, "logps/chosen": -215.9085896809896, "logps/rejected": -196.0285888671875, "loss": 0.3555, "rewards/chosen": 2.7768580118815103, "rewards/margins": 3.196271006266276, "rewards/rejected": -0.41941299438476565, "step": 505 }, { "epoch": 0.8522105263157894, "grad_norm": 42.41057229807221, "kl": 0.0, "learning_rate": 3.349364905389032e-08, "logits/chosen": -1031722581.3333334, "logits/rejected": -827191808.0, "logps/chosen": -204.5508015950521, "logps/rejected": -485.5096130371094, "loss": 0.3319, "rewards/chosen": 1.5830011367797852, "rewards/margins": 23.542262077331543, "rewards/rejected": -21.959260940551758, "step": 506 }, { "epoch": 0.8538947368421053, "grad_norm": 40.22957411853309, "kl": 0.0, "learning_rate": 3.2762008523713386e-08, "logits/chosen": -694314709.3333334, "logits/rejected": -912536064.0, "logps/chosen": -231.19136555989584, "logps/rejected": -210.5927978515625, "loss": 0.4044, "rewards/chosen": -1.8982666333516438, "rewards/margins": -1.2363632520039876, "rewards/rejected": -0.6619033813476562, "step": 507 }, { "epoch": 0.8555789473684211, "grad_norm": 56.68241962573519, "kl": 0.0, "learning_rate": 3.203788684936534e-08, "logits/chosen": -1072353133.7142857, "logits/rejected": -857103872.0, "logps/chosen": -251.75599888392858, "logps/rejected": -181.1580810546875, "loss": 0.4473, "rewards/chosen": -0.4974602290562221, "rewards/margins": -0.4106224605015346, "rewards/rejected": -0.0868377685546875, "step": 508 }, { "epoch": 0.8572631578947368, "grad_norm": 47.126301959598706, "kl": 0.0, "learning_rate": 3.132130909352709e-08, "logits/chosen": -802446774.8571428, "logits/rejected": -754396800.0, "logps/chosen": -243.97293526785714, "logps/rejected": -171.71823120117188, "loss": 0.4578, "rewards/chosen": 0.9422939164297921, "rewards/margins": 0.3230052930968148, "rewards/rejected": 0.6192886233329773, "step": 509 }, { "epoch": 0.8589473684210527, "grad_norm": 67.1017756333287, "kl": 0.0, "learning_rate": 3.061230005777593e-08, "logits/chosen": -716743552.0, "logits/rejected": -928516315.4285715, "logps/chosen": -392.4697265625, "logps/rejected": -267.4026576450893, "loss": 0.3928, "rewards/chosen": -4.59370756149292, "rewards/margins": -1.8247188159397671, "rewards/rejected": -2.7689887455531528, "step": 510 }, { "epoch": 0.8606315789473684, "grad_norm": 37.48246855118462, "kl": 0.0, "learning_rate": 2.991088428172722e-08, "logits/chosen": -882642329.6, "logits/rejected": -579270997.3333334, "logps/chosen": -216.027587890625, "logps/rejected": -224.06245930989584, "loss": 0.3588, "rewards/chosen": 1.8644773483276367, "rewards/margins": 4.355569076538086, "rewards/rejected": -2.491091728210449, "step": 511 }, { "epoch": 0.8623157894736843, "grad_norm": 46.701572144028056, "kl": 0.0, "learning_rate": 2.9217086042184534e-08, "logits/chosen": -951701888.0, "logits/rejected": -1123305728.0, "logps/chosen": -232.40447998046875, "logps/rejected": -214.9777374267578, "loss": 0.3516, "rewards/chosen": 0.27192726731300354, "rewards/margins": 5.410427957773209, "rewards/rejected": -5.138500690460205, "step": 512 }, { "epoch": 0.864, "grad_norm": 47.82460894266106, "kl": 0.0, "learning_rate": 2.8530929352300087e-08, "logits/chosen": -747589632.0, "logits/rejected": -987404083.2, "logps/chosen": -221.045166015625, "logps/rejected": -198.009423828125, "loss": 0.3888, "rewards/chosen": 0.2495401899019877, "rewards/margins": 2.997531179587046, "rewards/rejected": -2.7479909896850585, "step": 513 }, { "epoch": 0.8656842105263158, "grad_norm": 50.95999181346899, "kl": 0.0, "learning_rate": 2.7852437960743326e-08, "logits/chosen": -816897996.8, "logits/rejected": -827434154.6666666, "logps/chosen": -209.32421875, "logps/rejected": -241.70332845052084, "loss": 0.3541, "rewards/chosen": 0.15461671352386475, "rewards/margins": 2.3556827306747437, "rewards/rejected": -2.201066017150879, "step": 514 }, { "epoch": 0.8673684210526316, "grad_norm": 70.2346543078714, "kl": 0.0, "learning_rate": 2.718163535087864e-08, "logits/chosen": -805316096.0, "logits/rejected": -1582949376.0, "logps/chosen": -179.23245239257812, "logps/rejected": -174.32058715820312, "loss": 0.3397, "rewards/chosen": -0.28184014558792114, "rewards/margins": 1.7464155554771423, "rewards/rejected": -2.0282557010650635, "step": 515 }, { "epoch": 0.8690526315789474, "grad_norm": 44.88916114173372, "kl": 0.0, "learning_rate": 2.6518544739953187e-08, "logits/chosen": -884145472.0, "logits/rejected": -1249547136.0, "logps/chosen": -265.9125671386719, "logps/rejected": -233.0666961669922, "loss": 0.3362, "rewards/chosen": 0.07709883153438568, "rewards/margins": 0.21887779235839844, "rewards/rejected": -0.14177896082401276, "step": 516 }, { "epoch": 0.8707368421052631, "grad_norm": 46.28385480900765, "kl": 0.0, "learning_rate": 2.586318907829291e-08, "logits/chosen": -802108842.6666666, "logits/rejected": -925224960.0, "logps/chosen": -203.9495646158854, "logps/rejected": -222.9334716796875, "loss": 0.3661, "rewards/chosen": 1.3522602717081706, "rewards/margins": 0.6993664304415386, "rewards/rejected": 0.6528938412666321, "step": 517 }, { "epoch": 0.872421052631579, "grad_norm": 46.93723588820055, "kl": 0.0, "learning_rate": 2.5215591048508152e-08, "logits/chosen": -1172611498.6666667, "logits/rejected": -601648947.2, "logps/chosen": -245.01177978515625, "logps/rejected": -234.228076171875, "loss": 0.4201, "rewards/chosen": -0.7683883508046468, "rewards/margins": 0.7730680624643961, "rewards/rejected": -1.541456413269043, "step": 518 }, { "epoch": 0.8741052631578947, "grad_norm": 37.74401010007106, "kl": 0.0, "learning_rate": 2.4575773064708898e-08, "logits/chosen": -844627520.0, "logits/rejected": -680875776.0, "logps/chosen": -191.596923828125, "logps/rejected": -210.30389404296875, "loss": 0.3464, "rewards/chosen": 1.64203679561615, "rewards/margins": 2.2509559392929077, "rewards/rejected": -0.6089191436767578, "step": 519 }, { "epoch": 0.8757894736842106, "grad_norm": 38.54143890619264, "kl": 0.0, "learning_rate": 2.3943757271728816e-08, "logits/chosen": -821064064.0, "logits/rejected": -720391232.0, "logps/chosen": -230.83758544921875, "logps/rejected": -239.45230102539062, "loss": 0.2565, "rewards/chosen": 2.128859043121338, "rewards/margins": 4.223225116729736, "rewards/rejected": -2.0943660736083984, "step": 520 }, { "epoch": 0.8774736842105263, "grad_norm": 46.42071867786955, "kl": 0.0, "learning_rate": 2.3319565544358628e-08, "logits/chosen": -550017280.0, "logits/rejected": -889491840.0, "logps/chosen": -228.99757385253906, "logps/rejected": -294.79571533203125, "loss": 0.4034, "rewards/chosen": 1.803693413734436, "rewards/margins": 10.621366143226624, "rewards/rejected": -8.817672729492188, "step": 521 }, { "epoch": 0.8791578947368421, "grad_norm": 165.5741913948296, "kl": 0.0, "learning_rate": 2.270321948658943e-08, "logits/chosen": -1067505152.0, "logits/rejected": -1063348160.0, "logps/chosen": -216.38136291503906, "logps/rejected": -210.46290588378906, "loss": 0.4473, "rewards/chosen": 0.3500576317310333, "rewards/margins": 0.7664283812046051, "rewards/rejected": -0.4163707494735718, "step": 522 }, { "epoch": 0.8808421052631579, "grad_norm": 53.27002172942906, "kl": 0.0, "learning_rate": 2.2094740430864567e-08, "logits/chosen": -694395392.0, "logits/rejected": -859106099.2, "logps/chosen": -148.91910807291666, "logps/rejected": -242.549169921875, "loss": 0.4046, "rewards/chosen": 1.4957799911499023, "rewards/margins": 2.2706830501556396, "rewards/rejected": -0.7749030590057373, "step": 523 }, { "epoch": 0.8825263157894737, "grad_norm": 62.65204336260102, "kl": 0.0, "learning_rate": 2.1494149437341373e-08, "logits/chosen": -1015188992.0, "logits/rejected": -330501728.0, "logps/chosen": -239.76932779947916, "logps/rejected": -281.90582275390625, "loss": 0.3585, "rewards/chosen": 1.6041777928670247, "rewards/margins": 3.236354072888692, "rewards/rejected": -1.6321762800216675, "step": 524 }, { "epoch": 0.8842105263157894, "grad_norm": 59.41452814443212, "kl": 0.0, "learning_rate": 2.0901467293162444e-08, "logits/chosen": -898306944.0, "logits/rejected": -934284970.6666666, "logps/chosen": -239.46893310546875, "logps/rejected": -347.1363525390625, "loss": 0.4087, "rewards/chosen": -0.3174598515033722, "rewards/margins": 1.7363191147645316, "rewards/rejected": -2.053778966267904, "step": 525 }, { "epoch": 0.8858947368421053, "grad_norm": 28.835874353397827, "kl": 0.0, "learning_rate": 2.0316714511735998e-08, "logits/chosen": -985848448.0, "logits/rejected": -997492416.0, "logps/chosen": -201.36636352539062, "logps/rejected": -174.35333251953125, "loss": 0.259, "rewards/chosen": 2.0680689811706543, "rewards/margins": 4.799753904342651, "rewards/rejected": -2.731684923171997, "step": 526 }, { "epoch": 0.887578947368421, "grad_norm": 54.565049728962386, "kl": 0.0, "learning_rate": 1.9739911332025793e-08, "logits/chosen": -839355136.0, "logits/rejected": -553939626.6666666, "logps/chosen": -199.08831787109375, "logps/rejected": -176.72452799479166, "loss": 0.34, "rewards/chosen": 2.425278663635254, "rewards/margins": 2.334609886010488, "rewards/rejected": 0.09066877762476604, "step": 527 }, { "epoch": 0.8892631578947369, "grad_norm": 46.20449911567436, "kl": 0.0, "learning_rate": 1.9171077717850955e-08, "logits/chosen": -786516309.3333334, "logits/rejected": -727864640.0, "logps/chosen": -230.94038899739584, "logps/rejected": -252.71160888671875, "loss": 0.3229, "rewards/chosen": 0.6411038637161255, "rewards/margins": 6.392639756202698, "rewards/rejected": -5.751535892486572, "step": 528 }, { "epoch": 0.8909473684210526, "grad_norm": 57.38593942004128, "kl": 0.0, "learning_rate": 1.8610233357194747e-08, "logits/chosen": -670264618.6666666, "logits/rejected": -815440179.2, "logps/chosen": -183.37162272135416, "logps/rejected": -351.1501708984375, "loss": 0.3711, "rewards/chosen": 1.5523789723714192, "rewards/margins": 9.300235112508139, "rewards/rejected": -7.747856140136719, "step": 529 }, { "epoch": 0.8926315789473684, "grad_norm": 68.33693509306372, "kl": 0.0, "learning_rate": 1.805739766152309e-08, "logits/chosen": -777834240.0, "logits/rejected": -670208640.0, "logps/chosen": -191.6290283203125, "logps/rejected": -211.78524780273438, "loss": 0.3876, "rewards/chosen": 1.8237395286560059, "rewards/margins": 1.157799780368805, "rewards/rejected": 0.6659397482872009, "step": 530 }, { "epoch": 0.8943157894736842, "grad_norm": 34.749494357287055, "kl": 0.0, "learning_rate": 1.7512589765112994e-08, "logits/chosen": -896000804.5714285, "logits/rejected": -150322144.0, "logps/chosen": -230.301513671875, "logps/rejected": -289.5316467285156, "loss": 0.3033, "rewards/chosen": 1.0773948260716029, "rewards/margins": 1.1100638934544154, "rewards/rejected": -0.0326690673828125, "step": 531 }, { "epoch": 0.896, "grad_norm": 52.937522598027066, "kl": 0.0, "learning_rate": 1.697582852439011e-08, "logits/chosen": -1075272448.0, "logits/rejected": -1150560512.0, "logps/chosen": -186.83663940429688, "logps/rejected": -228.51455688476562, "loss": 0.3423, "rewards/chosen": 1.7373024225234985, "rewards/margins": 1.5837997794151306, "rewards/rejected": 0.15350264310836792, "step": 532 }, { "epoch": 0.8976842105263158, "grad_norm": 30.475440103504955, "kl": 0.0, "learning_rate": 1.6447132517276004e-08, "logits/chosen": -637437952.0, "logits/rejected": -962437324.8, "logps/chosen": -198.0897216796875, "logps/rejected": -254.03193359375, "loss": 0.3967, "rewards/chosen": 1.2216943899790447, "rewards/margins": 0.7942300478617351, "rewards/rejected": 0.42746434211730955, "step": 533 }, { "epoch": 0.8993684210526316, "grad_norm": 43.2574141571032, "kl": 0.0, "learning_rate": 1.5926520042545383e-08, "logits/chosen": -1247182848.0, "logits/rejected": -590226432.0, "logps/chosen": -222.28861490885416, "logps/rejected": -237.892724609375, "loss": 0.3006, "rewards/chosen": 0.06360169251759847, "rewards/margins": 1.3943130572636921, "rewards/rejected": -1.3307113647460938, "step": 534 }, { "epoch": 0.9010526315789473, "grad_norm": 53.71591400639584, "kl": 0.0, "learning_rate": 1.5414009119192635e-08, "logits/chosen": -915837120.0, "logits/rejected": -808716458.6666666, "logps/chosen": -179.90530395507812, "logps/rejected": -195.09493001302084, "loss": 0.4779, "rewards/chosen": 0.5700050592422485, "rewards/margins": -0.024598677953084347, "rewards/rejected": 0.5946037371953329, "step": 535 }, { "epoch": 0.9027368421052632, "grad_norm": 61.45180482744315, "kl": 0.0, "learning_rate": 1.4909617485808073e-08, "logits/chosen": -1060230348.8, "logits/rejected": -902803541.3333334, "logps/chosen": -229.759912109375, "logps/rejected": -223.30476888020834, "loss": 0.3913, "rewards/chosen": 1.0647006988525392, "rewards/margins": 0.6491708119710287, "rewards/rejected": 0.41552988688151044, "step": 536 }, { "epoch": 0.9044210526315789, "grad_norm": 101.47573520441375, "kl": 0.0, "learning_rate": 1.4413362599964118e-08, "logits/chosen": -850445994.6666666, "logits/rejected": -538516864.0, "logps/chosen": -285.9638264973958, "logps/rejected": -175.34397888183594, "loss": 0.4247, "rewards/chosen": -0.0778544048468272, "rewards/margins": -3.1665999988714852, "rewards/rejected": 3.088745594024658, "step": 537 }, { "epoch": 0.9061052631578947, "grad_norm": 45.25790194746145, "kl": 0.0, "learning_rate": 1.3925261637611068e-08, "logits/chosen": -965904588.8, "logits/rejected": -1320554325.3333333, "logps/chosen": -204.27237548828126, "logps/rejected": -217.86629231770834, "loss": 0.4245, "rewards/chosen": 1.1585922241210938, "rewards/margins": 4.713977813720703, "rewards/rejected": -3.5553855895996094, "step": 538 }, { "epoch": 0.9077894736842105, "grad_norm": 31.80801282899897, "kl": 0.0, "learning_rate": 1.3445331492482614e-08, "logits/chosen": -1130592682.6666667, "logits/rejected": -715334297.6, "logps/chosen": -185.36031087239584, "logps/rejected": -224.770458984375, "loss": 0.3553, "rewards/chosen": 0.9161597887674967, "rewards/margins": 2.255999151865641, "rewards/rejected": -1.3398393630981444, "step": 539 }, { "epoch": 0.9094736842105263, "grad_norm": 39.75979427316365, "kl": 0.0, "learning_rate": 1.2973588775511024e-08, "logits/chosen": -829291840.0, "logits/rejected": -1113173418.6666667, "logps/chosen": -194.16685485839844, "logps/rejected": -230.64664713541666, "loss": 0.325, "rewards/chosen": 2.4532127380371094, "rewards/margins": 2.8305301666259766, "rewards/rejected": -0.3773174285888672, "step": 540 }, { "epoch": 0.9111578947368421, "grad_norm": 49.13868786217956, "kl": 0.0, "learning_rate": 1.2510049814252299e-08, "logits/chosen": -1220529152.0, "logits/rejected": -959601868.8, "logps/chosen": -205.97456868489584, "logps/rejected": -292.898876953125, "loss": 0.4062, "rewards/chosen": 0.8909465471903483, "rewards/margins": 2.5595290819803873, "rewards/rejected": -1.6685825347900392, "step": 541 }, { "epoch": 0.9128421052631579, "grad_norm": 46.27018727053655, "kl": 0.0, "learning_rate": 1.2054730652321127e-08, "logits/chosen": -801094336.0, "logits/rejected": -564379648.0, "logps/chosen": -316.7994384765625, "logps/rejected": -242.9945831298828, "loss": 0.4345, "rewards/chosen": -1.3522491455078125, "rewards/margins": -0.6712818145751953, "rewards/rejected": -0.6809673309326172, "step": 542 }, { "epoch": 0.9145263157894736, "grad_norm": 40.13535818390141, "kl": 0.0, "learning_rate": 1.1607647048835462e-08, "logits/chosen": -819266389.3333334, "logits/rejected": -731518822.4, "logps/chosen": -188.42167154947916, "logps/rejected": -174.18707275390625, "loss": 0.3224, "rewards/chosen": 1.533565839131673, "rewards/margins": 3.497674496968587, "rewards/rejected": -1.964108657836914, "step": 543 }, { "epoch": 0.9162105263157895, "grad_norm": 58.704500927311344, "kl": 0.0, "learning_rate": 1.116881447787113e-08, "logits/chosen": -644635840.0, "logits/rejected": -1043029376.0, "logps/chosen": -210.66639709472656, "logps/rejected": -211.0460968017578, "loss": 0.357, "rewards/chosen": 0.7050827145576477, "rewards/margins": 0.4989025592803955, "rewards/rejected": 0.2061801552772522, "step": 544 }, { "epoch": 0.9178947368421052, "grad_norm": 38.106880910390885, "kl": 0.0, "learning_rate": 1.0738248127926342e-08, "logits/chosen": -678777770.6666666, "logits/rejected": -1563541504.0, "logps/chosen": -219.55989583333334, "logps/rejected": -179.98016357421875, "loss": 0.3392, "rewards/chosen": 0.5457992553710938, "rewards/margins": -1.0953590869903564, "rewards/rejected": 1.6411583423614502, "step": 545 }, { "epoch": 0.919578947368421, "grad_norm": 58.09818425552494, "kl": 0.0, "learning_rate": 1.0315962901395802e-08, "logits/chosen": -517518624.0, "logits/rejected": -936108714.6666666, "logps/chosen": -197.85256958007812, "logps/rejected": -324.84869384765625, "loss": 0.3334, "rewards/chosen": 4.236547470092773, "rewards/margins": 13.935715993245443, "rewards/rejected": -9.69916852315267, "step": 546 }, { "epoch": 0.9212631578947369, "grad_norm": 87.5472135904328, "kl": 0.0, "learning_rate": 9.901973414055186e-09, "logits/chosen": -972204544.0, "logits/rejected": -999611456.0, "logps/chosen": -240.75531005859375, "logps/rejected": -262.8578186035156, "loss": 0.4516, "rewards/chosen": 0.7045586109161377, "rewards/margins": 3.072298526763916, "rewards/rejected": -2.3677399158477783, "step": 547 }, { "epoch": 0.9229473684210526, "grad_norm": 61.8636347167815, "kl": 0.0, "learning_rate": 9.496293994555066e-09, "logits/chosen": -967555788.8, "logits/rejected": -894452138.6666666, "logps/chosen": -237.103955078125, "logps/rejected": -238.02288818359375, "loss": 0.4624, "rewards/chosen": -0.08766875267028809, "rewards/margins": 3.0026972929636635, "rewards/rejected": -3.0903660456339517, "step": 548 }, { "epoch": 0.9246315789473685, "grad_norm": 76.28246005925376, "kl": 0.0, "learning_rate": 9.098938683924972e-09, "logits/chosen": -610943360.0, "logits/rejected": -872966400.0, "logps/chosen": -243.48297119140625, "logps/rejected": -183.1815185546875, "loss": 0.4655, "rewards/chosen": 0.6167190869649252, "rewards/margins": 0.13095130523045861, "rewards/rejected": 0.48576778173446655, "step": 549 }, { "epoch": 0.9263157894736842, "grad_norm": 39.99028773675479, "kl": 0.0, "learning_rate": 8.709921235087597e-09, "logits/chosen": -773891264.0, "logits/rejected": -839812096.0, "logps/chosen": -239.84304809570312, "logps/rejected": -216.54505920410156, "loss": 0.3937, "rewards/chosen": -0.7027072906494141, "rewards/margins": 0.8752220869064331, "rewards/rejected": -1.5779293775558472, "step": 550 }, { "epoch": 0.928, "grad_norm": 176.8083268495628, "kl": 0.0, "learning_rate": 8.329255112382665e-09, "logits/chosen": -674264934.4, "logits/rejected": -985622101.3333334, "logps/chosen": -153.71854248046876, "logps/rejected": -222.71295166015625, "loss": 0.3363, "rewards/chosen": 1.8852203369140625, "rewards/margins": 4.823611068725586, "rewards/rejected": -2.9383907318115234, "step": 551 }, { "epoch": 0.9296842105263158, "grad_norm": 47.516751657127465, "kl": 0.0, "learning_rate": 7.956953491100871e-09, "logits/chosen": -841082624.0, "logits/rejected": -614038528.0, "logps/chosen": -277.8362731933594, "logps/rejected": -308.65185546875, "loss": 0.3715, "rewards/chosen": 0.449728399515152, "rewards/margins": 3.049138320343835, "rewards/rejected": -2.599409920828683, "step": 552 }, { "epoch": 0.9313684210526316, "grad_norm": 92.74872502810405, "kl": 0.0, "learning_rate": 7.593029257027956e-09, "logits/chosen": -639304106.6666666, "logits/rejected": -1038373478.4, "logps/chosen": -212.13045247395834, "logps/rejected": -236.4179931640625, "loss": 0.3602, "rewards/chosen": -0.6566390991210938, "rewards/margins": 1.910294532775879, "rewards/rejected": -2.5669336318969727, "step": 553 }, { "epoch": 0.9330526315789474, "grad_norm": 49.323568144147856, "kl": 0.0, "learning_rate": 7.23749500599874e-09, "logits/chosen": -1499314858.6666667, "logits/rejected": -812018636.8, "logps/chosen": -252.00286865234375, "logps/rejected": -275.40224609375, "loss": 0.3429, "rewards/chosen": -0.8385945955912272, "rewards/margins": 0.727512296040853, "rewards/rejected": -1.5661068916320802, "step": 554 }, { "epoch": 0.9347368421052632, "grad_norm": 48.76961282183268, "kl": 0.0, "learning_rate": 6.89036304346105e-09, "logits/chosen": -709829034.6666666, "logits/rejected": -862226176.0, "logps/chosen": -221.70589192708334, "logps/rejected": -222.41966247558594, "loss": 0.3214, "rewards/chosen": 0.810302734375, "rewards/margins": 1.5894142389297485, "rewards/rejected": -0.7791115045547485, "step": 555 }, { "epoch": 0.9364210526315789, "grad_norm": 51.49949200706721, "kl": 0.0, "learning_rate": 6.551645384049897e-09, "logits/chosen": -880212906.6666666, "logits/rejected": -670162816.0, "logps/chosen": -202.01947021484375, "logps/rejected": -240.55711364746094, "loss": 0.3524, "rewards/chosen": 0.031015316645304363, "rewards/margins": 1.1750317017237346, "rewards/rejected": -1.1440163850784302, "step": 556 }, { "epoch": 0.9381052631578948, "grad_norm": 48.535906716358035, "kl": 0.0, "learning_rate": 6.221353751171665e-09, "logits/chosen": -726684928.0, "logits/rejected": -747469531.4285715, "logps/chosen": -186.82913208007812, "logps/rejected": -262.0355922154018, "loss": 0.3274, "rewards/chosen": 4.104483127593994, "rewards/margins": 5.761045115334647, "rewards/rejected": -1.656561987740653, "step": 557 }, { "epoch": 0.9397894736842105, "grad_norm": 44.18140203861248, "kl": 0.0, "learning_rate": 5.899499576598216e-09, "logits/chosen": -919181721.6, "logits/rejected": -773407402.6666666, "logps/chosen": -205.78271484375, "logps/rejected": -287.0757242838542, "loss": 0.36, "rewards/chosen": 1.863102340698242, "rewards/margins": 3.7114522298177084, "rewards/rejected": -1.848349889119466, "step": 558 }, { "epoch": 0.9414736842105264, "grad_norm": 34.08124127057782, "kl": 0.0, "learning_rate": 5.586094000071401e-09, "logits/chosen": -597630336.0, "logits/rejected": -568648192.0, "logps/chosen": -231.41092936197916, "logps/rejected": -189.46743774414062, "loss": 0.4097, "rewards/chosen": 1.2908663749694824, "rewards/margins": 0.8174479305744171, "rewards/rejected": 0.4734184443950653, "step": 559 }, { "epoch": 0.9431578947368421, "grad_norm": 36.1106808708771, "kl": 0.0, "learning_rate": 5.2811478689173686e-09, "logits/chosen": -405102976.0, "logits/rejected": -868101461.3333334, "logps/chosen": -254.44976806640625, "logps/rejected": -242.90071614583334, "loss": 0.3742, "rewards/chosen": 1.0309677124023438, "rewards/margins": 2.66564400990804, "rewards/rejected": -1.6346762975056965, "step": 560 }, { "epoch": 0.9448421052631579, "grad_norm": 45.82329638700707, "kl": 0.0, "learning_rate": 4.984671737671142e-09, "logits/chosen": -742133077.3333334, "logits/rejected": -740478361.6, "logps/chosen": -258.3249918619792, "logps/rejected": -183.15859375, "loss": 0.4038, "rewards/chosen": -0.7480514844258627, "rewards/margins": -0.37228356202443447, "rewards/rejected": -0.3757679224014282, "step": 561 }, { "epoch": 0.9465263157894737, "grad_norm": 39.6743299184901, "kl": 0.0, "learning_rate": 4.696675867711386e-09, "logits/chosen": -535260979.2, "logits/rejected": -851761066.6666666, "logps/chosen": -218.016943359375, "logps/rejected": -338.2209065755208, "loss": 0.4127, "rewards/chosen": 1.0443305015563964, "rewards/margins": 3.631992403666178, "rewards/rejected": -2.5876619021097818, "step": 562 }, { "epoch": 0.9482105263157895, "grad_norm": 25.945047187533934, "kl": 0.0, "learning_rate": 4.417170226905187e-09, "logits/chosen": -638596096.0, "logits/rejected": -722204416.0, "logps/chosen": -176.8044230143229, "logps/rejected": -205.5111572265625, "loss": 0.3022, "rewards/chosen": 1.1705973148345947, "rewards/margins": 1.9772413730621339, "rewards/rejected": -0.806644058227539, "step": 563 }, { "epoch": 0.9498947368421052, "grad_norm": 56.04530688467595, "kl": 0.0, "learning_rate": 4.146164489263054e-09, "logits/chosen": -803748693.3333334, "logits/rejected": -645856576.0, "logps/chosen": -237.97408040364584, "logps/rejected": -261.6069030761719, "loss": 0.4415, "rewards/chosen": 0.1846969723701477, "rewards/margins": 2.00448077917099, "rewards/rejected": -1.8197838068008423, "step": 564 }, { "epoch": 0.9515789473684211, "grad_norm": 45.941589484462845, "kl": 0.0, "learning_rate": 3.88366803460416e-09, "logits/chosen": -1046677888.0, "logits/rejected": -937844800.0, "logps/chosen": -216.30455017089844, "logps/rejected": -152.97848510742188, "loss": 0.3607, "rewards/chosen": 1.318871259689331, "rewards/margins": 1.2597434520721436, "rewards/rejected": 0.0591278076171875, "step": 565 }, { "epoch": 0.9532631578947368, "grad_norm": 61.47288256142775, "kl": 0.0, "learning_rate": 3.6296899482316236e-09, "logits/chosen": -661587904.0, "logits/rejected": -835168896.0, "logps/chosen": -245.52484130859375, "logps/rejected": -229.88687133789062, "loss": 0.4462, "rewards/chosen": 1.543544054031372, "rewards/margins": 1.7612977027893066, "rewards/rejected": -0.21775364875793457, "step": 566 }, { "epoch": 0.9549473684210527, "grad_norm": 47.39961532223011, "kl": 0.0, "learning_rate": 3.384239020618018e-09, "logits/chosen": -1413642368.0, "logits/rejected": -993228288.0, "logps/chosen": -251.99951171875, "logps/rejected": -258.76263427734375, "loss": 0.3247, "rewards/chosen": 0.9964095950126648, "rewards/margins": 2.568497121334076, "rewards/rejected": -1.5720875263214111, "step": 567 }, { "epoch": 0.9566315789473684, "grad_norm": 42.291585331791644, "kl": 0.0, "learning_rate": 3.1473237471012214e-09, "logits/chosen": -733431910.4, "logits/rejected": -799960064.0, "logps/chosen": -177.69178466796876, "logps/rejected": -213.2462158203125, "loss": 0.3644, "rewards/chosen": 0.27896909713745116, "rewards/margins": 2.7935458501180013, "rewards/rejected": -2.5145767529805503, "step": 568 }, { "epoch": 0.9583157894736842, "grad_norm": 40.929057929848376, "kl": 0.0, "learning_rate": 2.9189523275903736e-09, "logits/chosen": -978359091.2, "logits/rejected": -652150016.0, "logps/chosen": -271.872216796875, "logps/rejected": -174.5239461263021, "loss": 0.3284, "rewards/chosen": -0.7042415618896485, "rewards/margins": 0.11138847668965657, "rewards/rejected": -0.815630038579305, "step": 569 }, { "epoch": 0.96, "grad_norm": 40.061462493884235, "kl": 0.0, "learning_rate": 2.6991326662819667e-09, "logits/chosen": -729751552.0, "logits/rejected": -752365772.8, "logps/chosen": -195.5203857421875, "logps/rejected": -209.675634765625, "loss": 0.3478, "rewards/chosen": 1.747754414876302, "rewards/margins": 3.8098199208577475, "rewards/rejected": -2.0620655059814452, "step": 570 }, { "epoch": 0.9616842105263158, "grad_norm": 37.651701881002374, "kl": 0.0, "learning_rate": 2.4878723713864234e-09, "logits/chosen": -627430656.0, "logits/rejected": -945733734.4, "logps/chosen": -186.00541178385416, "logps/rejected": -253.2623046875, "loss": 0.3783, "rewards/chosen": 1.6760249137878418, "rewards/margins": 4.277830410003662, "rewards/rejected": -2.60180549621582, "step": 571 }, { "epoch": 0.9633684210526315, "grad_norm": 70.57315668742685, "kl": 0.0, "learning_rate": 2.285178754864614e-09, "logits/chosen": -987434325.3333334, "logits/rejected": -616783462.4, "logps/chosen": -218.68375651041666, "logps/rejected": -293.998388671875, "loss": 0.4376, "rewards/chosen": 1.6107417742411296, "rewards/margins": 7.283069960276286, "rewards/rejected": -5.672328186035156, "step": 572 }, { "epoch": 0.9650526315789474, "grad_norm": 69.90429499041211, "kl": 0.0, "learning_rate": 2.091058832174891e-09, "logits/chosen": -730186432.0, "logits/rejected": -509659392.0, "logps/chosen": -196.79669189453125, "logps/rejected": -303.237060546875, "loss": 0.317, "rewards/chosen": 0.6219147443771362, "rewards/margins": 11.697727084159851, "rewards/rejected": -11.075812339782715, "step": 573 }, { "epoch": 0.9667368421052631, "grad_norm": 38.30542476552404, "kl": 0.0, "learning_rate": 1.905519322030258e-09, "logits/chosen": -938985881.6, "logits/rejected": -598871637.3333334, "logps/chosen": -183.621826171875, "logps/rejected": -228.38956705729166, "loss": 0.428, "rewards/chosen": 0.9899630546569824, "rewards/margins": 1.8534589608510337, "rewards/rejected": -0.8634959061940511, "step": 574 }, { "epoch": 0.968421052631579, "grad_norm": 80.5219219878979, "kl": 0.0, "learning_rate": 1.7285666461657467e-09, "logits/chosen": -844385426.2857143, "logits/rejected": -1569694336.0, "logps/chosen": -221.71473911830358, "logps/rejected": -172.7010498046875, "loss": 0.4116, "rewards/chosen": -0.09411777768816267, "rewards/margins": -0.8716171554156712, "rewards/rejected": 0.7774993777275085, "step": 575 }, { "epoch": 0.9701052631578947, "grad_norm": 89.03407725109749, "kl": 0.0, "learning_rate": 1.5602069291162368e-09, "logits/chosen": -925635904.0, "logits/rejected": -844432000.0, "logps/chosen": -256.534912109375, "logps/rejected": -220.78697204589844, "loss": 0.2972, "rewards/chosen": -0.9312546849250793, "rewards/margins": 2.055956780910492, "rewards/rejected": -2.9872114658355713, "step": 576 }, { "epoch": 0.9717894736842105, "grad_norm": 33.63530023405151, "kl": 0.0, "learning_rate": 1.4004459980045124e-09, "logits/chosen": -958183424.0, "logits/rejected": -700009045.3333334, "logps/chosen": -239.659912109375, "logps/rejected": -270.008544921875, "loss": 0.4455, "rewards/chosen": -0.18169254064559937, "rewards/margins": 2.001987040042877, "rewards/rejected": -2.1836795806884766, "step": 577 }, { "epoch": 0.9734736842105263, "grad_norm": 34.10535983128858, "kl": 0.0, "learning_rate": 1.2492893823394246e-09, "logits/chosen": -1030501120.0, "logps/chosen": -254.66343688964844, "loss": 0.3929, "rewards/chosen": -0.19029521942138672, "step": 578 }, { "epoch": 0.9751578947368421, "grad_norm": 130.35662841898622, "kl": 0.0, "learning_rate": 1.10674231382471e-09, "logits/chosen": -418937888.0, "logits/rejected": -518623584.0, "logps/chosen": -232.24008178710938, "logps/rejected": -243.5550994873047, "loss": 0.3157, "rewards/chosen": 0.7040752172470093, "rewards/margins": 1.6154407858848572, "rewards/rejected": -0.9113655686378479, "step": 579 }, { "epoch": 0.9768421052631578, "grad_norm": 42.15293348378943, "kl": 0.0, "learning_rate": 9.7280972617772e-10, "logits/chosen": -1371472384.0, "logits/rejected": -1195231402.6666667, "logps/chosen": -254.03028869628906, "logps/rejected": -272.2608642578125, "loss": 0.3864, "rewards/chosen": -2.0985825061798096, "rewards/margins": -0.8987653255462646, "rewards/rejected": -1.199817180633545, "step": 580 }, { "epoch": 0.9785263157894737, "grad_norm": 82.82639202731109, "kl": 0.0, "learning_rate": 8.474962549588349e-10, "logits/chosen": -553952512.0, "logits/rejected": -1753602389.3333333, "logps/chosen": -212.57861328125, "logps/rejected": -235.5814208984375, "loss": 0.3613, "rewards/chosen": 1.6823585510253907, "rewards/margins": 2.548850742975871, "rewards/rejected": -0.8664921919504801, "step": 581 }, { "epoch": 0.9802105263157894, "grad_norm": 39.562779110049625, "kl": 0.0, "learning_rate": 7.308062374108692e-10, "logits/chosen": -836093888.0, "logits/rejected": -541282218.6666666, "logps/chosen": -245.46795654296875, "logps/rejected": -265.79986572265625, "loss": 0.3856, "rewards/chosen": 1.5786278247833252, "rewards/margins": 3.83101216952006, "rewards/rejected": -2.252384344736735, "step": 582 }, { "epoch": 0.9818947368421053, "grad_norm": 49.619503065339295, "kl": 0.0, "learning_rate": 6.227437123090539e-10, "logits/chosen": -1412194432.0, "logits/rejected": -683454902.8571428, "logps/chosen": -247.62109375, "logps/rejected": -278.0474853515625, "loss": 0.2775, "rewards/chosen": -2.1934449672698975, "rewards/margins": 0.6917969499315535, "rewards/rejected": -2.885241917201451, "step": 583 }, { "epoch": 0.983578947368421, "grad_norm": 41.65613682455663, "kl": 0.0, "learning_rate": 5.233124198212035e-10, "logits/chosen": -1068381312.0, "logits/rejected": -620764608.0, "logps/chosen": -203.81106567382812, "logps/rejected": -199.25198364257812, "loss": 0.3426, "rewards/chosen": 0.44310152530670166, "rewards/margins": 1.8038746118545532, "rewards/rejected": -1.3607730865478516, "step": 584 }, { "epoch": 0.9852631578947368, "grad_norm": 74.23272136164195, "kl": 0.0, "learning_rate": 4.3251580137831924e-10, "logits/chosen": -815114496.0, "logits/rejected": -602160384.0, "logps/chosen": -206.22401428222656, "logps/rejected": -258.4097900390625, "loss": 0.2408, "rewards/chosen": 1.1723206043243408, "rewards/margins": 2.8920695781707764, "rewards/rejected": -1.7197489738464355, "step": 585 }, { "epoch": 0.9869473684210527, "grad_norm": 37.528975070907734, "kl": 0.0, "learning_rate": 3.5035699955540675e-10, "logits/chosen": -1021857088.0, "logits/rejected": -1152371072.0, "logps/chosen": -198.44970703125, "logps/rejected": -276.451904296875, "loss": 0.2932, "rewards/chosen": 1.2920650243759155, "rewards/margins": 4.100528836250305, "rewards/rejected": -2.8084638118743896, "step": 586 }, { "epoch": 0.9886315789473684, "grad_norm": 66.83382682650414, "kl": 0.0, "learning_rate": 2.768388579627301e-10, "logits/chosen": -696945877.3333334, "logits/rejected": -635326361.6, "logps/chosen": -188.5633748372396, "logps/rejected": -227.542333984375, "loss": 0.3505, "rewards/chosen": -1.1209592819213867, "rewards/margins": 2.0206165313720703, "rewards/rejected": -3.141575813293457, "step": 587 }, { "epoch": 0.9903157894736843, "grad_norm": 38.439783990376164, "kl": 0.0, "learning_rate": 2.1196392114744556e-10, "logits/chosen": -732253184.0, "logits/rejected": -485836117.3333333, "logps/chosen": -229.332763671875, "logps/rejected": -178.27132161458334, "loss": 0.3191, "rewards/chosen": 2.1135427474975588, "rewards/margins": 2.325268598397573, "rewards/rejected": -0.21172585090001425, "step": 588 }, { "epoch": 0.992, "grad_norm": 84.00543667151456, "kl": 0.0, "learning_rate": 1.5573443450545009e-10, "logits/chosen": -681184563.2, "logits/rejected": -883190613.3333334, "logps/chosen": -231.067626953125, "logps/rejected": -222.4321492513021, "loss": 0.3878, "rewards/chosen": 0.3701077699661255, "rewards/margins": 1.0806505282719931, "rewards/rejected": -0.7105427583058676, "step": 589 }, { "epoch": 0.9936842105263158, "grad_norm": 65.06311520099162, "kl": 0.0, "learning_rate": 1.0815234420369357e-10, "logits/chosen": -837959372.8, "logits/rejected": -1049096704.0, "logps/chosen": -237.297705078125, "logps/rejected": -172.59944661458334, "loss": 0.3653, "rewards/chosen": 1.0379376411437988, "rewards/margins": -0.5275972684224446, "rewards/rejected": 1.5655349095662434, "step": 590 }, { "epoch": 0.9953684210526316, "grad_norm": 131.03458960479955, "kl": 0.0, "learning_rate": 6.921929711287134e-11, "logits/chosen": -793252928.0, "logits/rejected": -1009639296.0, "logps/chosen": -249.7359619140625, "logps/rejected": -276.18560791015625, "loss": 0.4221, "rewards/chosen": 1.2013773918151855, "rewards/margins": 2.3829771280288696, "rewards/rejected": -1.181599736213684, "step": 591 }, { "epoch": 0.9970526315789474, "grad_norm": 59.274731916237144, "kl": 0.0, "learning_rate": 3.893664075035885e-11, "logits/chosen": -958084010.6666666, "logits/rejected": -776463616.0, "logps/chosen": -219.0360107421875, "logps/rejected": -177.7460174560547, "loss": 0.4209, "rewards/chosen": 0.09142877658208211, "rewards/margins": -0.03964764873186748, "rewards/rejected": 0.13107642531394958, "step": 592 }, { "epoch": 0.9987368421052631, "grad_norm": 35.198610073696365, "kl": 0.0, "learning_rate": 1.730542323355455e-11, "logits/chosen": -1364308699.4285715, "logits/rejected": -682940672.0, "logps/chosen": -254.698486328125, "logps/rejected": -193.01634216308594, "loss": 0.3481, "rewards/chosen": -0.044670513698032925, "rewards/margins": -1.6197544847215926, "rewards/rejected": 1.5750839710235596, "step": 593 }, { "epoch": 1.0, "grad_norm": 35.198610073696365, "kl": 0.0, "learning_rate": 4.326393243742066e-12, "logits/chosen": -436645888.0, "logits/rejected": -1217262336.0, "logps/chosen": -184.77606201171875, "logps/rejected": -178.923583984375, "loss": 0.342, "rewards/chosen": -0.0059494078159332275, "rewards/margins": 1.3890884816646576, "rewards/rejected": -1.3950378894805908, "step": 594 }, { "epoch": 1.0, "step": 594, "total_flos": 2.3597531637009613e+17, "train_loss": 0.4100885201333348, "train_runtime": 53730.3673, "train_samples_per_second": 0.354, "train_steps_per_second": 0.011 } ], "logging_steps": 1, "max_steps": 594, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.3597531637009613e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }