{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 3649, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0002740852405097985, "grad_norm": 50.75, "kl": 0.0, "learning_rate": 1.4285714285714287e-07, "logits/chosen": 13927143.111111112, "logits/rejected": 11332061.866666667, "logps/chosen": -520.2071940104166, "logps/rejected": -525.8274739583334, "loss": 0.5, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.000548170481019597, "grad_norm": 50.25, "kl": 0.0, "learning_rate": 2.8571428571428575e-07, "logits/chosen": 6138932.0, "logits/rejected": 53417216.0, "logps/chosen": -419.6460774739583, "logps/rejected": -486.9514567057292, "loss": 0.5, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "epoch": 0.0008222557215293956, "grad_norm": 58.5, "kl": 0.21227264404296875, "learning_rate": 4.285714285714286e-07, "logits/chosen": 17829473.14285714, "logits/rejected": 8480004.8, "logps/chosen": -581.1654575892857, "logps/rejected": -494.3796875, "loss": 0.4925, "rewards/chosen": 0.03145424808774676, "rewards/margins": 0.05075905855212893, "rewards/rejected": -0.01930481046438217, "step": 3 }, { "epoch": 0.001096340962039194, "grad_norm": 43.5, "kl": 0.08537165820598602, "learning_rate": 5.714285714285715e-07, "logits/chosen": 8679396.363636363, "logits/rejected": 19455780.923076924, "logps/chosen": -401.5114080255682, "logps/rejected": -392.17041015625, "loss": 0.5027, "rewards/chosen": -0.048328193751248444, "rewards/margins": -0.02755581686546752, "rewards/rejected": -0.020772376885780923, "step": 4 }, { "epoch": 0.0013704262025489927, "grad_norm": 69.0, "kl": 0.11183484643697739, "learning_rate": 7.142857142857143e-07, "logits/chosen": 26516902.4, "logits/rejected": 49527992.88888889, "logps/chosen": -645.0729817708333, "logps/rejected": -414.0642361111111, "loss": 0.4977, "rewards/chosen": 0.004266563057899475, "rewards/margins": 0.028189377321137323, "rewards/rejected": -0.02392281426323785, "step": 5 }, { "epoch": 0.0016445114430587912, "grad_norm": 50.25, "kl": 0.07754262536764145, "learning_rate": 8.571428571428572e-07, "logits/chosen": 11069390.857142856, "logits/rejected": 21200387.2, "logps/chosen": -470.66636439732144, "logps/rejected": -320.935498046875, "loss": 0.5071, "rewards/chosen": -0.030853109700339183, "rewards/margins": -0.055996160847800125, "rewards/rejected": 0.02514305114746094, "step": 6 }, { "epoch": 0.0019185966835685898, "grad_norm": 52.0, "kl": 0.34815216064453125, "learning_rate": 1.0000000000000002e-06, "logits/chosen": 23814938.666666668, "logits/rejected": 16669717.333333334, "logps/chosen": -595.7197265625, "logps/rejected": -338.6750081380208, "loss": 0.4944, "rewards/chosen": 0.019522984822591145, "rewards/margins": 0.06013285617033641, "rewards/rejected": -0.04060987134774526, "step": 7 }, { "epoch": 0.002192681924078388, "grad_norm": 50.25, "kl": 0.14911271631717682, "learning_rate": 1.142857142857143e-06, "logits/chosen": 3765517.4545454546, "logits/rejected": 2731864.6153846155, "logps/chosen": -544.12646484375, "logps/rejected": -429.9457256610577, "loss": 0.4961, "rewards/chosen": 0.00898340479894118, "rewards/margins": 0.021482700547138293, "rewards/rejected": -0.012499295748197116, "step": 8 }, { "epoch": 0.0024667671645881867, "grad_norm": 59.75, "kl": 0.2560485303401947, "learning_rate": 1.2857142857142856e-06, "logits/chosen": -6629439.05882353, "logits/rejected": 37816203.428571425, "logps/chosen": -439.47443704044116, "logps/rejected": -329.57212611607144, "loss": 0.4984, "rewards/chosen": 0.03969919681549072, "rewards/margins": 0.011308148503303528, "rewards/rejected": 0.028391048312187195, "step": 9 }, { "epoch": 0.0027408524050979853, "grad_norm": 47.75, "kl": 0.06622314453125, "learning_rate": 1.4285714285714286e-06, "logits/chosen": 4689703.384615385, "logits/rejected": 39738984.72727273, "logps/chosen": -441.67427884615387, "logps/rejected": -464.00297407670456, "loss": 0.4949, "rewards/chosen": 0.030432481032151442, "rewards/margins": 0.03887017618317704, "rewards/rejected": -0.008437695151025599, "step": 10 }, { "epoch": 0.003014937645607784, "grad_norm": 46.75, "kl": 0.3595088720321655, "learning_rate": 1.5714285714285714e-06, "logits/chosen": 23492142.769230768, "logits/rejected": 15509666.909090908, "logps/chosen": -491.8811598557692, "logps/rejected": -397.7107599431818, "loss": 0.5029, "rewards/chosen": 0.014393966931563158, "rewards/margins": -0.0066788027753363136, "rewards/rejected": 0.02107276970689947, "step": 11 }, { "epoch": 0.0032890228861175825, "grad_norm": 53.25, "kl": 0.5253448486328125, "learning_rate": 1.7142857142857145e-06, "logits/chosen": -22031419.2, "logits/rejected": 21621410.285714287, "logps/chosen": -622.251708984375, "logps/rejected": -458.8676060267857, "loss": 0.4851, "rewards/chosen": 0.1518975853919983, "rewards/margins": 0.14506044728415354, "rewards/rejected": 0.006837138107844761, "step": 12 }, { "epoch": 0.003563108126627381, "grad_norm": 43.25, "kl": 0.3934866786003113, "learning_rate": 1.8571428571428573e-06, "logits/chosen": -13052924.444444444, "logits/rejected": 17158837.333333332, "logps/chosen": -511.2985026041667, "logps/rejected": -329.3484375, "loss": 0.4885, "rewards/chosen": 0.0893764959441291, "rewards/margins": 0.09199073356058862, "rewards/rejected": -0.0026142376164595285, "step": 13 }, { "epoch": 0.0038371933671371796, "grad_norm": 43.5, "kl": 0.6282181739807129, "learning_rate": 2.0000000000000003e-06, "logits/chosen": 9690637.333333334, "logits/rejected": 43223938.666666664, "logps/chosen": -372.0698649088542, "logps/rejected": -470.623291015625, "loss": 0.4993, "rewards/chosen": 0.09184247255325317, "rewards/margins": 0.05240772167841593, "rewards/rejected": 0.03943475087483724, "step": 14 }, { "epoch": 0.004111278607646978, "grad_norm": 46.0, "kl": 0.44204461574554443, "learning_rate": 2.1428571428571427e-06, "logits/chosen": 27328170.666666668, "logits/rejected": 20297512.0, "logps/chosen": -480.5823567708333, "logps/rejected": -317.0185953776042, "loss": 0.4835, "rewards/chosen": 0.16300062338511148, "rewards/margins": 0.16654206129411855, "rewards/rejected": -0.0035414379090070724, "step": 15 }, { "epoch": 0.004385363848156776, "grad_norm": 49.0, "kl": 0.5807406306266785, "learning_rate": 2.285714285714286e-06, "logits/chosen": 25201772.307692308, "logits/rejected": 38787066.18181818, "logps/chosen": -424.2336989182692, "logps/rejected": -554.6207386363636, "loss": 0.4759, "rewards/chosen": 0.17877949201143706, "rewards/margins": 0.2151955620392219, "rewards/rejected": -0.036416070027784866, "step": 16 }, { "epoch": 0.004659449088666575, "grad_norm": 56.5, "kl": 1.4637140035629272, "learning_rate": 2.428571428571429e-06, "logits/chosen": 27894973.866666667, "logits/rejected": 17511440.0, "logps/chosen": -555.9555338541667, "logps/rejected": -407.6745334201389, "loss": 0.4712, "rewards/chosen": 0.24269622166951496, "rewards/margins": 0.24497919090920023, "rewards/rejected": -0.0022829692396852705, "step": 17 }, { "epoch": 0.0049335343291763735, "grad_norm": 47.75, "kl": 1.1878068447113037, "learning_rate": 2.571428571428571e-06, "logits/chosen": 17917548.307692308, "logits/rejected": 39262219.63636363, "logps/chosen": -504.28665865384613, "logps/rejected": -306.9205433238636, "loss": 0.4678, "rewards/chosen": 0.29997653227586013, "rewards/margins": 0.2904134330215988, "rewards/rejected": 0.009563099254261364, "step": 18 }, { "epoch": 0.0052076195696861725, "grad_norm": 48.0, "kl": 1.7262630462646484, "learning_rate": 2.7142857142857144e-06, "logits/chosen": 27008704.0, "logits/rejected": 41710848.0, "logps/chosen": -450.87025669642856, "logps/rejected": -471.009033203125, "loss": 0.4588, "rewards/chosen": 0.33586086545671734, "rewards/margins": 0.3269642270037106, "rewards/rejected": 0.008896638453006745, "step": 19 }, { "epoch": 0.005481704810195971, "grad_norm": 51.5, "kl": 2.1238677501678467, "learning_rate": 2.8571428571428573e-06, "logits/chosen": 9376416.0, "logits/rejected": 1561116.0, "logps/chosen": -444.3380916819853, "logps/rejected": -678.6237444196429, "loss": 0.4709, "rewards/chosen": 0.33550604651955995, "rewards/margins": 0.3662089099403189, "rewards/rejected": -0.030702863420758928, "step": 20 }, { "epoch": 0.00575579005070577, "grad_norm": 42.0, "kl": 1.308091163635254, "learning_rate": 3e-06, "logits/chosen": 59523955.2, "logits/rejected": 14392685.714285715, "logps/chosen": -408.484619140625, "logps/rejected": -304.14937918526783, "loss": 0.4715, "rewards/chosen": 0.27727417945861815, "rewards/margins": 0.28442657345107625, "rewards/rejected": -0.007152393992458071, "step": 21 }, { "epoch": 0.006029875291215568, "grad_norm": 46.75, "kl": 1.8810844421386719, "learning_rate": 3.142857142857143e-06, "logits/chosen": 28565369.14285714, "logits/rejected": 44350809.6, "logps/chosen": -481.31815011160717, "logps/rejected": -563.53828125, "loss": 0.4499, "rewards/chosen": 0.4027050222669329, "rewards/margins": 0.45145227568490165, "rewards/rejected": -0.04874725341796875, "step": 22 }, { "epoch": 0.006303960531725367, "grad_norm": 54.0, "kl": 3.506432056427002, "learning_rate": 3.285714285714286e-06, "logits/chosen": 15614805.333333334, "logits/rejected": 26853038.222222224, "logps/chosen": -555.7845052083334, "logps/rejected": -534.6737196180555, "loss": 0.4343, "rewards/chosen": 0.5615922292073567, "rewards/margins": 0.5846288051870134, "rewards/rejected": -0.023036575979656644, "step": 23 }, { "epoch": 0.006578045772235165, "grad_norm": 47.5, "kl": 2.1787030696868896, "learning_rate": 3.428571428571429e-06, "logits/chosen": 13475936.888888888, "logits/rejected": 8984152.533333333, "logps/chosen": -514.1055230034722, "logps/rejected": -519.4493815104166, "loss": 0.4602, "rewards/chosen": 0.596612188551161, "rewards/margins": 0.5181697977913751, "rewards/rejected": 0.07844239075978597, "step": 24 }, { "epoch": 0.006852131012744964, "grad_norm": 47.25, "kl": 2.2701964378356934, "learning_rate": 3.5714285714285718e-06, "logits/chosen": 25019580.0, "logits/rejected": 49364536.0, "logps/chosen": -464.3586730957031, "logps/rejected": -473.77532958984375, "loss": 0.4445, "rewards/chosen": 0.5936341285705566, "rewards/margins": 0.5567014440894127, "rewards/rejected": 0.03693268448114395, "step": 25 }, { "epoch": 0.007126216253254762, "grad_norm": 64.5, "kl": 5.17418909072876, "learning_rate": 3.7142857142857146e-06, "logits/chosen": 8441686.588235294, "logits/rejected": 62357321.14285714, "logps/chosen": -448.70237821691177, "logps/rejected": -516.1609235491071, "loss": 0.4399, "rewards/chosen": 0.7655358034021714, "rewards/margins": 0.5591918460461272, "rewards/rejected": 0.20634395735604422, "step": 26 }, { "epoch": 0.007400301493764561, "grad_norm": 40.5, "kl": 3.3029277324676514, "learning_rate": 3.857142857142858e-06, "logits/chosen": 24676644.923076924, "logits/rejected": 12172162.909090908, "logps/chosen": -456.43197866586536, "logps/rejected": -429.5431019176136, "loss": 0.4261, "rewards/chosen": 0.6237812042236328, "rewards/margins": 0.6781343980268999, "rewards/rejected": -0.054353193803267044, "step": 27 }, { "epoch": 0.007674386734274359, "grad_norm": 39.75, "kl": 4.1008524894714355, "learning_rate": 4.000000000000001e-06, "logits/chosen": 29518685.714285713, "logits/rejected": 41061369.6, "logps/chosen": -404.863037109375, "logps/rejected": -320.807373046875, "loss": 0.395, "rewards/chosen": 0.9848604202270508, "rewards/margins": 1.0156987398862838, "rewards/rejected": -0.030838319659233095, "step": 28 }, { "epoch": 0.007948471974784158, "grad_norm": 41.0, "kl": 2.776031970977783, "learning_rate": 4.1428571428571435e-06, "logits/chosen": 27716812.8, "logits/rejected": 30704429.714285713, "logps/chosen": -508.3095703125, "logps/rejected": -437.04983956473217, "loss": 0.3691, "rewards/chosen": 1.162557888031006, "rewards/margins": 1.3755822999136789, "rewards/rejected": -0.213024411882673, "step": 29 }, { "epoch": 0.008222557215293956, "grad_norm": 44.75, "kl": 5.293633460998535, "learning_rate": 4.2857142857142855e-06, "logits/chosen": 8289705.846153846, "logits/rejected": 41540800.0, "logps/chosen": -486.68160306490387, "logps/rejected": -373.1516779119318, "loss": 0.4456, "rewards/chosen": 1.0477979366595929, "rewards/margins": 1.051271803729184, "rewards/rejected": -0.0034738670695911755, "step": 30 }, { "epoch": 0.008496642455803755, "grad_norm": 42.0, "kl": 5.815784931182861, "learning_rate": 4.428571428571429e-06, "logits/chosen": -3688871.714285714, "logits/rejected": 18809180.8, "logps/chosen": -506.93868582589283, "logps/rejected": -483.13701171875, "loss": 0.3683, "rewards/chosen": 1.415254865373884, "rewards/margins": 1.3565889579909187, "rewards/rejected": 0.058665907382965087, "step": 31 }, { "epoch": 0.008770727696313553, "grad_norm": 40.5, "kl": 8.473623275756836, "learning_rate": 4.571428571428572e-06, "logits/chosen": 35405789.71428572, "logits/rejected": 30077491.2, "logps/chosen": -609.8044084821429, "logps/rejected": -374.19736328125, "loss": 0.3696, "rewards/chosen": 1.5361454827444894, "rewards/margins": 1.4733915669577462, "rewards/rejected": 0.06275391578674316, "step": 32 }, { "epoch": 0.009044812936823353, "grad_norm": 73.5, "kl": 5.787624835968018, "learning_rate": 4.714285714285715e-06, "logits/chosen": 17223326.666666668, "logits/rejected": 131733930.66666667, "logps/chosen": -449.9830729166667, "logps/rejected": -795.5939127604166, "loss": 0.4131, "rewards/chosen": 1.0646476745605469, "rewards/margins": 0.8413174947102865, "rewards/rejected": 0.2233301798502604, "step": 33 }, { "epoch": 0.00931889817733315, "grad_norm": 35.75, "kl": 4.198353290557861, "learning_rate": 4.857142857142858e-06, "logits/chosen": -3141058.6666666665, "logits/rejected": 10970303.333333334, "logps/chosen": -414.4723714192708, "logps/rejected": -447.91064453125, "loss": 0.3887, "rewards/chosen": 1.1848435401916504, "rewards/margins": 1.2730753223101299, "rewards/rejected": -0.08823178211847942, "step": 34 }, { "epoch": 0.009592983417842949, "grad_norm": 34.75, "kl": 6.246673583984375, "learning_rate": 5e-06, "logits/chosen": 19459925.333333332, "logits/rejected": 29363196.444444444, "logps/chosen": -479.74423828125, "logps/rejected": -503.4288736979167, "loss": 0.2916, "rewards/chosen": 1.6024121602376302, "rewards/margins": 2.0010582235124375, "rewards/rejected": -0.3986460632748074, "step": 35 }, { "epoch": 0.009867068658352747, "grad_norm": 32.25, "kl": 3.9539794921875, "learning_rate": 5e-06, "logits/chosen": 12401387.636363637, "logits/rejected": 16549063.384615384, "logps/chosen": -373.5501154119318, "logps/rejected": -312.5066669170673, "loss": 0.3782, "rewards/chosen": 1.0413850437511096, "rewards/margins": 1.3865859491841777, "rewards/rejected": -0.3452009054330679, "step": 36 }, { "epoch": 0.010141153898862547, "grad_norm": 33.0, "kl": 6.961370944976807, "learning_rate": 5e-06, "logits/chosen": 15602493.538461538, "logits/rejected": 44651985.45454545, "logps/chosen": -520.2418870192307, "logps/rejected": -448.80184659090907, "loss": 0.3531, "rewards/chosen": 1.7802779857928936, "rewards/margins": 1.7309738472625091, "rewards/rejected": 0.04930413853038441, "step": 37 }, { "epoch": 0.010415239139372345, "grad_norm": 36.25, "kl": 7.040739059448242, "learning_rate": 5e-06, "logits/chosen": 3953404.5714285714, "logits/rejected": 16432121.6, "logps/chosen": -512.9888044084821, "logps/rejected": -370.351123046875, "loss": 0.3276, "rewards/chosen": 1.463531221662249, "rewards/margins": 1.762472094808306, "rewards/rejected": -0.2989408731460571, "step": 38 }, { "epoch": 0.010689324379882143, "grad_norm": 35.25, "kl": 13.392007827758789, "learning_rate": 5e-06, "logits/chosen": 14701056.0, "logits/rejected": 41624876.0, "logps/chosen": -601.6832885742188, "logps/rejected": -369.640625, "loss": 0.2529, "rewards/chosen": 2.621581554412842, "rewards/margins": 2.705070421099663, "rewards/rejected": -0.08348886668682098, "step": 39 }, { "epoch": 0.010963409620391941, "grad_norm": 33.25, "kl": 3.6535260677337646, "learning_rate": 5e-06, "logits/chosen": 6697520.0, "logits/rejected": 70534698.66666667, "logps/chosen": -342.048828125, "logps/rejected": -490.277099609375, "loss": 0.3973, "rewards/chosen": 1.1207311948140461, "rewards/margins": 1.2539427081743875, "rewards/rejected": -0.1332115133603414, "step": 40 }, { "epoch": 0.011237494860901741, "grad_norm": 30.875, "kl": 5.0016679763793945, "learning_rate": 5e-06, "logits/chosen": 9828660.0, "logits/rejected": 43756438.85714286, "logps/chosen": -450.37763671875, "logps/rejected": -337.4054478236607, "loss": 0.3206, "rewards/chosen": 1.7253347396850587, "rewards/margins": 1.8594807999474663, "rewards/rejected": -0.13414606026240758, "step": 41 }, { "epoch": 0.01151158010141154, "grad_norm": 29.5, "kl": 8.423419952392578, "learning_rate": 5e-06, "logits/chosen": 10457610.666666666, "logits/rejected": 83750584.8888889, "logps/chosen": -456.89440104166664, "logps/rejected": -345.96196831597223, "loss": 0.2814, "rewards/chosen": 1.840472412109375, "rewards/margins": 2.3330771976047093, "rewards/rejected": -0.4926047854953342, "step": 42 }, { "epoch": 0.011785665341921337, "grad_norm": 27.875, "kl": 9.02241039276123, "learning_rate": 5e-06, "logits/chosen": 5947637.142857143, "logits/rejected": 26396668.8, "logps/chosen": -516.7006487165179, "logps/rejected": -685.04111328125, "loss": 0.2351, "rewards/chosen": 2.5028574807303294, "rewards/margins": 2.9223902566092357, "rewards/rejected": -0.41953277587890625, "step": 43 }, { "epoch": 0.012059750582431136, "grad_norm": 29.75, "kl": 5.592858791351318, "learning_rate": 5e-06, "logits/chosen": 33032398.222222224, "logits/rejected": 42351202.13333333, "logps/chosen": -465.45941840277777, "logps/rejected": -427.7013671875, "loss": 0.3202, "rewards/chosen": 2.4192110697428384, "rewards/margins": 2.686837514241536, "rewards/rejected": -0.26762644449869794, "step": 44 }, { "epoch": 0.012333835822940935, "grad_norm": 29.0, "kl": 3.624695301055908, "learning_rate": 5e-06, "logits/chosen": 39387026.28571428, "logits/rejected": 35902930.823529415, "logps/chosen": -423.22586495535717, "logps/rejected": -409.5027286305147, "loss": 0.3003, "rewards/chosen": 1.6984740665980749, "rewards/margins": 2.3937878568633266, "rewards/rejected": -0.6953137902652516, "step": 45 }, { "epoch": 0.012607921063450734, "grad_norm": 38.5, "kl": 8.498068809509277, "learning_rate": 5e-06, "logits/chosen": 23162082.133333333, "logits/rejected": 69726542.22222222, "logps/chosen": -444.71106770833336, "logps/rejected": -387.68402777777777, "loss": 0.3749, "rewards/chosen": 1.8610551198323568, "rewards/margins": 1.9935715039571127, "rewards/rejected": -0.13251638412475586, "step": 46 }, { "epoch": 0.012882006303960532, "grad_norm": 31.875, "kl": 6.290726661682129, "learning_rate": 5e-06, "logits/chosen": 23095166.4, "logits/rejected": 166788845.7142857, "logps/chosen": -474.6017578125, "logps/rejected": -419.70626395089283, "loss": 0.315, "rewards/chosen": 1.9880184173583983, "rewards/margins": 2.792800671713693, "rewards/rejected": -0.8047822543552944, "step": 47 }, { "epoch": 0.01315609154447033, "grad_norm": 25.125, "kl": 8.459863662719727, "learning_rate": 5e-06, "logits/chosen": 6010321.714285715, "logits/rejected": 93140755.2, "logps/chosen": -395.0535365513393, "logps/rejected": -387.6829345703125, "loss": 0.2301, "rewards/chosen": 2.4377634865897044, "rewards/margins": 3.126647077287947, "rewards/rejected": -0.6888835906982422, "step": 48 }, { "epoch": 0.013430176784980128, "grad_norm": 26.75, "kl": 6.801774501800537, "learning_rate": 5e-06, "logits/chosen": 22894212.923076924, "logits/rejected": 42641245.09090909, "logps/chosen": -386.40433443509613, "logps/rejected": -431.5662286931818, "loss": 0.3159, "rewards/chosen": 2.003287388728215, "rewards/margins": 2.004647496280137, "rewards/rejected": -0.0013601075519214976, "step": 49 }, { "epoch": 0.013704262025489928, "grad_norm": 26.375, "kl": 5.436151027679443, "learning_rate": 5e-06, "logits/chosen": 24727900.8, "logits/rejected": 41986276.571428575, "logps/chosen": -416.88037109375, "logps/rejected": -388.4741908482143, "loss": 0.208, "rewards/chosen": 3.1476879119873047, "rewards/margins": 3.8775106157575334, "rewards/rejected": -0.7298227037702288, "step": 50 }, { "epoch": 0.013978347265999726, "grad_norm": 24.25, "kl": 3.0052928924560547, "learning_rate": 5e-06, "logits/chosen": 14159145.333333334, "logits/rejected": 11019588.666666666, "logps/chosen": -359.2147623697917, "logps/rejected": -403.9764811197917, "loss": 0.2847, "rewards/chosen": 1.325678030649821, "rewards/margins": 2.3204700152079267, "rewards/rejected": -0.9947919845581055, "step": 51 }, { "epoch": 0.014252432506509524, "grad_norm": 23.125, "kl": 5.6787190437316895, "learning_rate": 5e-06, "logits/chosen": 42522938.18181818, "logits/rejected": 16937075.692307692, "logps/chosen": -460.03981711647725, "logps/rejected": -322.4841496394231, "loss": 0.2644, "rewards/chosen": 2.404735044999556, "rewards/margins": 3.349144928938859, "rewards/rejected": -0.9444098839393029, "step": 52 }, { "epoch": 0.014526517747019322, "grad_norm": 28.875, "kl": 7.9893012046813965, "learning_rate": 5e-06, "logits/chosen": 32912562.0, "logits/rejected": 49076544.0, "logps/chosen": -431.33258056640625, "logps/rejected": -424.5028381347656, "loss": 0.2757, "rewards/chosen": 1.9015986919403076, "rewards/margins": 2.6104074716567993, "rewards/rejected": -0.7088087797164917, "step": 53 }, { "epoch": 0.014800602987529122, "grad_norm": 28.625, "kl": 4.304208755493164, "learning_rate": 5e-06, "logits/chosen": 23664178.90909091, "logits/rejected": 12210955.076923076, "logps/chosen": -536.5038174715909, "logps/rejected": -494.7594651442308, "loss": 0.1902, "rewards/chosen": 2.6065868030894888, "rewards/margins": 3.5872313226019585, "rewards/rejected": -0.98064451951247, "step": 54 }, { "epoch": 0.01507468822803892, "grad_norm": 27.75, "kl": 9.380560874938965, "learning_rate": 5e-06, "logits/chosen": 23252522.666666668, "logits/rejected": 79440986.66666667, "logps/chosen": -431.4990234375, "logps/rejected": -531.6238199869791, "loss": 0.2944, "rewards/chosen": 2.4666922887166343, "rewards/margins": 3.43416968981425, "rewards/rejected": -0.9674774010976156, "step": 55 }, { "epoch": 0.015348773468548719, "grad_norm": 23.875, "kl": 4.746701717376709, "learning_rate": 5e-06, "logits/chosen": 18870550.153846152, "logits/rejected": 55028247.27272727, "logps/chosen": -440.3030348557692, "logps/rejected": -479.9771839488636, "loss": 0.2993, "rewards/chosen": 2.2841456486628604, "rewards/margins": 3.50526873715274, "rewards/rejected": -1.2211230884898792, "step": 56 }, { "epoch": 0.015622858709058517, "grad_norm": 26.875, "kl": 1.756322979927063, "learning_rate": 5e-06, "logits/chosen": 11870756.0, "logits/rejected": 48996692.0, "logps/chosen": -377.71124267578125, "logps/rejected": -340.19915771484375, "loss": 0.2706, "rewards/chosen": 1.803788423538208, "rewards/margins": 2.7144264578819275, "rewards/rejected": -0.9106380343437195, "step": 57 }, { "epoch": 0.015896943949568317, "grad_norm": 27.125, "kl": 6.497262477874756, "learning_rate": 5e-06, "logits/chosen": 32482503.384615384, "logits/rejected": 17007783.272727273, "logps/chosen": -554.7453425480769, "logps/rejected": -383.93337180397725, "loss": 0.2213, "rewards/chosen": 2.4197717813345103, "rewards/margins": 4.001330875850224, "rewards/rejected": -1.5815590945157139, "step": 58 }, { "epoch": 0.016171029190078113, "grad_norm": 20.875, "kl": 3.7782459259033203, "learning_rate": 5e-06, "logits/chosen": 6875651.636363637, "logits/rejected": 23413124.923076924, "logps/chosen": -452.8283025568182, "logps/rejected": -434.6975285456731, "loss": 0.1746, "rewards/chosen": 2.5919872630726206, "rewards/margins": 4.208964954723012, "rewards/rejected": -1.6169776916503906, "step": 59 }, { "epoch": 0.016445114430587913, "grad_norm": 32.75, "kl": 2.3381259441375732, "learning_rate": 5e-06, "logits/chosen": 25134605.714285713, "logits/rejected": 19099017.6, "logps/chosen": -366.24727957589283, "logps/rejected": -576.251611328125, "loss": 0.2659, "rewards/chosen": 1.4540916170392717, "rewards/margins": 3.568445941380092, "rewards/rejected": -2.1143543243408205, "step": 60 }, { "epoch": 0.016719199671097713, "grad_norm": 23.375, "kl": 5.8981709480285645, "learning_rate": 5e-06, "logits/chosen": 35762795.428571425, "logits/rejected": 30180304.0, "logps/chosen": -550.7538713727679, "logps/rejected": -345.968115234375, "loss": 0.1862, "rewards/chosen": 2.724156515938895, "rewards/margins": 4.091200583321708, "rewards/rejected": -1.3670440673828126, "step": 61 }, { "epoch": 0.01699328491160751, "grad_norm": 23.25, "kl": 1.3037316799163818, "learning_rate": 5e-06, "logits/chosen": 10830165.714285715, "logits/rejected": 21696726.588235293, "logps/chosen": -589.4746791294643, "logps/rejected": -318.08088235294116, "loss": 0.2419, "rewards/chosen": 2.2175776617867604, "rewards/margins": 3.621944435504304, "rewards/rejected": -1.4043667737175436, "step": 62 }, { "epoch": 0.01726737015211731, "grad_norm": 23.25, "kl": 7.365187168121338, "learning_rate": 5e-06, "logits/chosen": 11966174.857142856, "logits/rejected": 38944505.6, "logps/chosen": -534.6565987723214, "logps/rejected": -355.68125, "loss": 0.2422, "rewards/chosen": 3.1841708592006137, "rewards/margins": 4.426242773873465, "rewards/rejected": -1.2420719146728516, "step": 63 }, { "epoch": 0.017541455392627105, "grad_norm": 20.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -1307868.7272727273, "logits/rejected": 23363773.53846154, "logps/chosen": -502.49027876420456, "logps/rejected": -584.9458383413462, "loss": 0.1503, "rewards/chosen": 2.612940874966708, "rewards/margins": 4.932725772991047, "rewards/rejected": -2.3197848980243387, "step": 64 }, { "epoch": 0.017815540633136905, "grad_norm": 26.25, "kl": 5.363725662231445, "learning_rate": 5e-06, "logits/chosen": 18424075.076923076, "logits/rejected": 36056811.63636363, "logps/chosen": -518.3754507211538, "logps/rejected": -289.0869806463068, "loss": 0.224, "rewards/chosen": 2.8196372985839844, "rewards/margins": 3.857002691789107, "rewards/rejected": -1.0373653932051226, "step": 65 }, { "epoch": 0.018089625873646705, "grad_norm": 21.875, "kl": 3.846811294555664, "learning_rate": 5e-06, "logits/chosen": -306209.53846153844, "logits/rejected": 35297736.72727273, "logps/chosen": -457.67074819711536, "logps/rejected": -548.7908824573864, "loss": 0.1629, "rewards/chosen": 2.3135832273043118, "rewards/margins": 5.336668734783893, "rewards/rejected": -3.023085507479581, "step": 66 }, { "epoch": 0.0183637111141565, "grad_norm": 20.875, "kl": 3.4063804149627686, "learning_rate": 5e-06, "logits/chosen": 27097432.888888888, "logits/rejected": 18730331.733333334, "logps/chosen": -509.33452690972223, "logps/rejected": -357.07366536458335, "loss": 0.1752, "rewards/chosen": 3.244341108534071, "rewards/margins": 4.878824827406142, "rewards/rejected": -1.6344837188720702, "step": 67 }, { "epoch": 0.0186377963546663, "grad_norm": 19.0, "kl": 4.6208343505859375, "learning_rate": 5e-06, "logits/chosen": 29549668.923076924, "logits/rejected": 10042684.363636363, "logps/chosen": -562.0656926081731, "logps/rejected": -430.71604225852275, "loss": 0.1512, "rewards/chosen": 3.3743045513446512, "rewards/margins": 5.721220403284459, "rewards/rejected": -2.346915851939808, "step": 68 }, { "epoch": 0.0189118815951761, "grad_norm": 18.25, "kl": 1.4971065521240234, "learning_rate": 5e-06, "logits/chosen": 39023526.4, "logits/rejected": 18489563.42857143, "logps/chosen": -499.4744140625, "logps/rejected": -460.47719029017856, "loss": 0.1333, "rewards/chosen": 2.648311996459961, "rewards/margins": 5.496648897443499, "rewards/rejected": -2.8483369009835378, "step": 69 }, { "epoch": 0.019185966835685898, "grad_norm": 19.625, "kl": 3.564007520675659, "learning_rate": 5e-06, "logits/chosen": 63498048.0, "logits/rejected": 19260384.0, "logps/chosen": -493.9431966145833, "logps/rejected": -405.7475179036458, "loss": 0.1998, "rewards/chosen": 2.537106513977051, "rewards/margins": 5.008331616719564, "rewards/rejected": -2.471225102742513, "step": 70 }, { "epoch": 0.019460052076195698, "grad_norm": 23.625, "kl": 1.2855949401855469, "learning_rate": 5e-06, "logits/chosen": 2550151.777777778, "logits/rejected": 34357211.733333334, "logps/chosen": -482.25238715277777, "logps/rejected": -471.84742838541666, "loss": 0.1745, "rewards/chosen": 2.126002417670356, "rewards/margins": 4.086680518256293, "rewards/rejected": -1.9606781005859375, "step": 71 }, { "epoch": 0.019734137316705494, "grad_norm": 20.875, "kl": 6.143612384796143, "learning_rate": 5e-06, "logits/chosen": 34628038.4, "logits/rejected": 59811556.571428575, "logps/chosen": -456.66435546875, "logps/rejected": -355.07338169642856, "loss": 0.1924, "rewards/chosen": 3.100025939941406, "rewards/margins": 4.7760104588099885, "rewards/rejected": -1.6759845188685827, "step": 72 }, { "epoch": 0.020008222557215294, "grad_norm": 17.5, "kl": 2.6417174339294434, "learning_rate": 5e-06, "logits/chosen": 6316999.428571428, "logits/rejected": 22775555.2, "logps/chosen": -441.87486049107144, "logps/rejected": -544.7041015625, "loss": 0.141, "rewards/chosen": 2.3974432264055525, "rewards/margins": 5.558333614894321, "rewards/rejected": -3.1608903884887694, "step": 73 }, { "epoch": 0.020282307797725094, "grad_norm": 19.0, "kl": 4.240756034851074, "learning_rate": 5e-06, "logits/chosen": -6296676.4, "logits/rejected": 12588715.42857143, "logps/chosen": -471.886865234375, "logps/rejected": -306.11488560267856, "loss": 0.245, "rewards/chosen": 2.7338836669921873, "rewards/margins": 4.121359089442661, "rewards/rejected": -1.3874754224504744, "step": 74 }, { "epoch": 0.02055639303823489, "grad_norm": 29.0, "kl": 1.6686592102050781, "learning_rate": 5e-06, "logits/chosen": 29766542.545454547, "logits/rejected": 48010092.307692304, "logps/chosen": -468.88485440340907, "logps/rejected": -411.26318359375, "loss": 0.2153, "rewards/chosen": 2.3088212446732954, "rewards/margins": 4.004663107278464, "rewards/rejected": -1.6958418626051683, "step": 75 }, { "epoch": 0.02083047827874469, "grad_norm": 20.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 23169942.4, "logits/rejected": 36346192.0, "logps/chosen": -409.9318359375, "logps/rejected": -504.4545200892857, "loss": 0.1852, "rewards/chosen": 2.074100685119629, "rewards/margins": 4.416526167733329, "rewards/rejected": -2.3424254826136996, "step": 76 }, { "epoch": 0.021104563519254486, "grad_norm": 18.75, "kl": 0.6364943385124207, "learning_rate": 5e-06, "logits/chosen": 4080342.222222222, "logits/rejected": 32010973.866666667, "logps/chosen": -589.8191189236111, "logps/rejected": -349.69635416666665, "loss": 0.1676, "rewards/chosen": 2.507464302910699, "rewards/margins": 4.6828839196099175, "rewards/rejected": -2.1754196166992186, "step": 77 }, { "epoch": 0.021378648759764286, "grad_norm": 18.375, "kl": 6.328530311584473, "learning_rate": 5e-06, "logits/chosen": 26068838.4, "logits/rejected": 35921095.11111111, "logps/chosen": -548.1175130208334, "logps/rejected": -397.6599934895833, "loss": 0.1485, "rewards/chosen": 3.4052574157714846, "rewards/margins": 5.856091690063477, "rewards/rejected": -2.450834274291992, "step": 78 }, { "epoch": 0.021652734000274086, "grad_norm": 25.0, "kl": 2.5518507957458496, "learning_rate": 5e-06, "logits/chosen": 15718385.6, "logits/rejected": 34627062.85714286, "logps/chosen": -325.064306640625, "logps/rejected": -577.972412109375, "loss": 0.2106, "rewards/chosen": 1.8185646057128906, "rewards/margins": 3.918954631260463, "rewards/rejected": -2.1003900255475725, "step": 79 }, { "epoch": 0.021926819240783883, "grad_norm": 35.0, "kl": 11.106085777282715, "learning_rate": 5e-06, "logits/chosen": 53865472.0, "logits/rejected": 18882939.2, "logps/chosen": -450.5677939967105, "logps/rejected": -463.009033203125, "loss": 0.307, "rewards/chosen": 2.154160348992599, "rewards/margins": 4.554188196282638, "rewards/rejected": -2.400027847290039, "step": 80 }, { "epoch": 0.022200904481293682, "grad_norm": 24.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -14250960.0, "logits/rejected": 26650525.09090909, "logps/chosen": -570.2341871995193, "logps/rejected": -534.8829900568181, "loss": 0.1614, "rewards/chosen": 2.611414982722356, "rewards/margins": 6.015355703714011, "rewards/rejected": -3.403940720991655, "step": 81 }, { "epoch": 0.022474989721803482, "grad_norm": 13.375, "kl": 0.5885226130485535, "learning_rate": 5e-06, "logits/chosen": 129458488.8888889, "logits/rejected": 13414014.933333334, "logps/chosen": -588.1131184895834, "logps/rejected": -536.0904947916666, "loss": 0.1192, "rewards/chosen": 2.83927239312066, "rewards/margins": 7.3524798923068575, "rewards/rejected": -4.513207499186198, "step": 82 }, { "epoch": 0.02274907496231328, "grad_norm": 24.75, "kl": 1.3537757396697998, "learning_rate": 5e-06, "logits/chosen": 10850793.846153846, "logits/rejected": 26208386.90909091, "logps/chosen": -416.8234675480769, "logps/rejected": -359.08121004971593, "loss": 0.2038, "rewards/chosen": 2.3124424861027646, "rewards/margins": 4.027981031191099, "rewards/rejected": -1.7155385450883345, "step": 83 }, { "epoch": 0.02302316020282308, "grad_norm": 19.75, "kl": 6.049086570739746, "learning_rate": 5e-06, "logits/chosen": -19335779.2, "logits/rejected": 18661692.57142857, "logps/chosen": -463.2177734375, "logps/rejected": -422.97119140625, "loss": 0.153, "rewards/chosen": 3.801902008056641, "rewards/margins": 6.535798263549805, "rewards/rejected": -2.733896255493164, "step": 84 }, { "epoch": 0.023297245443332875, "grad_norm": 20.875, "kl": 3.8662221431732178, "learning_rate": 5e-06, "logits/chosen": 1038996.0, "logits/rejected": 47304912.0, "logps/chosen": -522.1352132161459, "logps/rejected": -414.3451741536458, "loss": 0.1852, "rewards/chosen": 2.422243277231852, "rewards/margins": 5.102021058400472, "rewards/rejected": -2.6797777811686196, "step": 85 }, { "epoch": 0.023571330683842675, "grad_norm": 17.5, "kl": 4.065312385559082, "learning_rate": 5e-06, "logits/chosen": 18186856.727272727, "logits/rejected": 36126552.615384616, "logps/chosen": -410.13503196022725, "logps/rejected": -509.38179837740387, "loss": 0.1571, "rewards/chosen": 2.93945971402255, "rewards/margins": 6.152276392583246, "rewards/rejected": -3.212816678560697, "step": 86 }, { "epoch": 0.023845415924352475, "grad_norm": 21.625, "kl": 9.968841552734375, "learning_rate": 5e-06, "logits/chosen": 50234496.0, "logits/rejected": 33597867.63636363, "logps/chosen": -588.9787409855769, "logps/rejected": -430.63649680397725, "loss": 0.1905, "rewards/chosen": 3.467991755558894, "rewards/margins": 5.458204629537942, "rewards/rejected": -1.9902128739790483, "step": 87 }, { "epoch": 0.02411950116486227, "grad_norm": 21.75, "kl": 3.625864028930664, "learning_rate": 5e-06, "logits/chosen": 32464818.666666668, "logits/rejected": 21559377.333333332, "logps/chosen": -589.9312337239584, "logps/rejected": -402.20849609375, "loss": 0.1362, "rewards/chosen": 2.6200315157572427, "rewards/margins": 5.054628690083821, "rewards/rejected": -2.4345971743265786, "step": 88 }, { "epoch": 0.02439358640537207, "grad_norm": 17.875, "kl": 6.760239601135254, "learning_rate": 5e-06, "logits/chosen": 8786205.090909092, "logits/rejected": 9406712.615384616, "logps/chosen": -498.74369673295456, "logps/rejected": -358.5177659254808, "loss": 0.1458, "rewards/chosen": 3.0020314996892754, "rewards/margins": 5.687044903948591, "rewards/rejected": -2.685013404259315, "step": 89 }, { "epoch": 0.02466767164588187, "grad_norm": 22.75, "kl": 7.913074016571045, "learning_rate": 5e-06, "logits/chosen": 18659120.0, "logits/rejected": 12018938.0, "logps/chosen": -582.2000122070312, "logps/rejected": -444.6913757324219, "loss": 0.1693, "rewards/chosen": 3.051455020904541, "rewards/margins": 5.743775129318237, "rewards/rejected": -2.6923201084136963, "step": 90 }, { "epoch": 0.024941756886391667, "grad_norm": 21.5, "kl": 3.418027877807617, "learning_rate": 5e-06, "logits/chosen": 17872997.818181816, "logits/rejected": 16715724.307692308, "logps/chosen": -365.46872780539775, "logps/rejected": -434.9141376201923, "loss": 0.1837, "rewards/chosen": 2.5711728876287285, "rewards/margins": 4.873141828950468, "rewards/rejected": -2.3019689413217397, "step": 91 }, { "epoch": 0.025215842126901467, "grad_norm": 23.0, "kl": 2.7574374675750732, "learning_rate": 5e-06, "logits/chosen": 17689121.14285714, "logits/rejected": 13743030.4, "logps/chosen": -454.55569893973217, "logps/rejected": -564.399365234375, "loss": 0.1495, "rewards/chosen": 2.0093439647129605, "rewards/margins": 5.262865311758858, "rewards/rejected": -3.2535213470458983, "step": 92 }, { "epoch": 0.025489927367411264, "grad_norm": 16.875, "kl": 4.220672607421875, "learning_rate": 5e-06, "logits/chosen": 9335516.57142857, "logits/rejected": 14761451.2, "logps/chosen": -433.75697544642856, "logps/rejected": -447.04755859375, "loss": 0.1231, "rewards/chosen": 2.8768509456089566, "rewards/margins": 5.450325448172434, "rewards/rejected": -2.5734745025634767, "step": 93 }, { "epoch": 0.025764012607921064, "grad_norm": 17.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 702501.8181818182, "logits/rejected": 25841501.53846154, "logps/chosen": -543.1882102272727, "logps/rejected": -379.1970027043269, "loss": 0.1393, "rewards/chosen": 2.313337499445135, "rewards/margins": 5.381938080687623, "rewards/rejected": -3.068600581242488, "step": 94 }, { "epoch": 0.026038097848430863, "grad_norm": 16.25, "kl": 5.628309726715088, "learning_rate": 5e-06, "logits/chosen": 24624981.333333332, "logits/rejected": 22273497.333333332, "logps/chosen": -557.425048828125, "logps/rejected": -445.7806803385417, "loss": 0.1375, "rewards/chosen": 3.868370691935221, "rewards/margins": 6.180222670237223, "rewards/rejected": -2.311851978302002, "step": 95 }, { "epoch": 0.02631218308894066, "grad_norm": 20.125, "kl": 2.1607184410095215, "learning_rate": 5e-06, "logits/chosen": 12384234.666666666, "logits/rejected": 29940743.111111112, "logps/chosen": -479.4551106770833, "logps/rejected": -506.44135199652777, "loss": 0.1065, "rewards/chosen": 3.095289103190104, "rewards/margins": 7.713512929280599, "rewards/rejected": -4.618223826090495, "step": 96 }, { "epoch": 0.02658626832945046, "grad_norm": 20.75, "kl": 1.8822358846664429, "learning_rate": 5e-06, "logits/chosen": 28964878.769230768, "logits/rejected": 19372717.09090909, "logps/chosen": -560.0998722956731, "logps/rejected": -548.4446466619319, "loss": 0.1226, "rewards/chosen": 2.9954311664287863, "rewards/margins": 7.5494524815699435, "rewards/rejected": -4.554021315141157, "step": 97 }, { "epoch": 0.026860353569960256, "grad_norm": 14.875, "kl": 6.619040012359619, "learning_rate": 5e-06, "logits/chosen": 16958804.57142857, "logits/rejected": 18586472.0, "logps/chosen": -469.22607421875, "logps/rejected": -429.299853515625, "loss": 0.1171, "rewards/chosen": 3.992417199271066, "rewards/margins": 6.345064789908273, "rewards/rejected": -2.352647590637207, "step": 98 }, { "epoch": 0.027134438810470056, "grad_norm": 16.5, "kl": 4.733872890472412, "learning_rate": 5e-06, "logits/chosen": 15031603.555555556, "logits/rejected": 16353557.333333334, "logps/chosen": -440.88495551215277, "logps/rejected": -495.40244140625, "loss": 0.1167, "rewards/chosen": 3.6917800903320312, "rewards/margins": 6.683789316813151, "rewards/rejected": -2.99200922648112, "step": 99 }, { "epoch": 0.027408524050979856, "grad_norm": 14.5, "kl": 5.418939590454102, "learning_rate": 5e-06, "logits/chosen": 33002867.692307692, "logits/rejected": 6804550.545454546, "logps/chosen": -463.97152944711536, "logps/rejected": -346.8890935724432, "loss": 0.1094, "rewards/chosen": 3.111212216890775, "rewards/margins": 5.815786455061052, "rewards/rejected": -2.704574238170277, "step": 100 }, { "epoch": 0.027682609291489652, "grad_norm": 23.0, "kl": 7.328195095062256, "learning_rate": 5e-06, "logits/chosen": 25669506.285714287, "logits/rejected": 14558977.6, "logps/chosen": -482.9292689732143, "logps/rejected": -398.1857421875, "loss": 0.1322, "rewards/chosen": 2.8205160413469588, "rewards/margins": 6.174603870936803, "rewards/rejected": -3.3540878295898438, "step": 101 }, { "epoch": 0.027956694531999452, "grad_norm": 12.125, "kl": 1.2081178426742554, "learning_rate": 5e-06, "logits/chosen": 8745844.8, "logits/rejected": 11674523.42857143, "logps/chosen": -548.87998046875, "logps/rejected": -578.4268275669643, "loss": 0.0915, "rewards/chosen": 3.4765869140625, "rewards/margins": 7.697880227225168, "rewards/rejected": -4.2212933131626675, "step": 102 }, { "epoch": 0.028230779772509252, "grad_norm": 21.5, "kl": 9.375618934631348, "learning_rate": 5e-06, "logits/chosen": 18404303.05882353, "logits/rejected": 16273563.42857143, "logps/chosen": -496.83582261029414, "logps/rejected": -503.66043526785717, "loss": 0.1859, "rewards/chosen": 3.0863616045783546, "rewards/margins": 6.005081849939683, "rewards/rejected": -2.918720245361328, "step": 103 }, { "epoch": 0.02850486501301905, "grad_norm": 12.625, "kl": 0.12923431396484375, "learning_rate": 5e-06, "logits/chosen": 30547623.111111112, "logits/rejected": 17241509.333333332, "logps/chosen": -517.9462890625, "logps/rejected": -531.9221354166667, "loss": 0.0915, "rewards/chosen": 3.109615961710612, "rewards/margins": 6.860518264770508, "rewards/rejected": -3.750902303059896, "step": 104 }, { "epoch": 0.02877895025352885, "grad_norm": 19.75, "kl": 3.369232416152954, "learning_rate": 5e-06, "logits/chosen": 2220591.2, "logits/rejected": 17177297.777777776, "logps/chosen": -372.38893229166666, "logps/rejected": -648.5984157986111, "loss": 0.1588, "rewards/chosen": 2.7208231608072917, "rewards/margins": 6.64911872016059, "rewards/rejected": -3.9282955593532987, "step": 105 }, { "epoch": 0.029053035494038645, "grad_norm": 18.25, "kl": 4.21042013168335, "learning_rate": 5e-06, "logits/chosen": 7505625.454545454, "logits/rejected": 30354678.153846152, "logps/chosen": -345.70751953125, "logps/rejected": -517.5760216346154, "loss": 0.178, "rewards/chosen": 3.0852737426757812, "rewards/margins": 5.952566293569712, "rewards/rejected": -2.8672925508939304, "step": 106 }, { "epoch": 0.029327120734548445, "grad_norm": 17.5, "kl": 2.262537717819214, "learning_rate": 5e-06, "logits/chosen": 5833050.181818182, "logits/rejected": 6948367.384615385, "logps/chosen": -341.8981267755682, "logps/rejected": -381.22641225961536, "loss": 0.1836, "rewards/chosen": 2.06175405328924, "rewards/margins": 4.966436799589571, "rewards/rejected": -2.9046827463003306, "step": 107 }, { "epoch": 0.029601205975058244, "grad_norm": 20.375, "kl": 4.341683864593506, "learning_rate": 5e-06, "logits/chosen": 17089812.266666666, "logits/rejected": 52581479.11111111, "logps/chosen": -442.5478515625, "logps/rejected": -360.9601779513889, "loss": 0.1707, "rewards/chosen": 2.5851648966471354, "rewards/margins": 4.845933787027995, "rewards/rejected": -2.2607688903808594, "step": 108 }, { "epoch": 0.02987529121556804, "grad_norm": 18.625, "kl": 1.1561189889907837, "learning_rate": 5e-06, "logits/chosen": 10022530.666666666, "logits/rejected": 13987117.866666667, "logps/chosen": -378.71031358506946, "logps/rejected": -383.22239583333334, "loss": 0.1515, "rewards/chosen": 3.521940231323242, "rewards/margins": 5.98422482808431, "rewards/rejected": -2.4622845967610676, "step": 109 }, { "epoch": 0.03014937645607784, "grad_norm": 27.125, "kl": 4.705845355987549, "learning_rate": 5e-06, "logits/chosen": 57224857.6, "logits/rejected": 25194999.111111112, "logps/chosen": -399.91888020833335, "logps/rejected": -493.31005859375, "loss": 0.208, "rewards/chosen": 2.0417269388834636, "rewards/margins": 5.394904327392578, "rewards/rejected": -3.3531773885091147, "step": 110 }, { "epoch": 0.030423461696587637, "grad_norm": 13.5, "kl": 1.9837684631347656, "learning_rate": 5e-06, "logits/chosen": 1804728.7272727273, "logits/rejected": 29268716.307692308, "logps/chosen": -444.46102627840907, "logps/rejected": -500.3703425480769, "loss": 0.0879, "rewards/chosen": 4.136935147372159, "rewards/margins": 7.404892981469215, "rewards/rejected": -3.2679578340970554, "step": 111 }, { "epoch": 0.030697546937097437, "grad_norm": 15.75, "kl": 5.39693021774292, "learning_rate": 5e-06, "logits/chosen": 7782838.222222222, "logits/rejected": 14582045.866666667, "logps/chosen": -467.0935329861111, "logps/rejected": -428.77233072916664, "loss": 0.151, "rewards/chosen": 3.5055544111463757, "rewards/margins": 6.503260252210829, "rewards/rejected": -2.997705841064453, "step": 112 }, { "epoch": 0.030971632177607237, "grad_norm": 17.875, "kl": 1.4776777029037476, "learning_rate": 5e-06, "logits/chosen": 3173813.777777778, "logits/rejected": 21916565.333333332, "logps/chosen": -455.9449869791667, "logps/rejected": -511.13046875, "loss": 0.0918, "rewards/chosen": 3.0507030487060547, "rewards/margins": 6.5917705535888675, "rewards/rejected": -3.5410675048828124, "step": 113 }, { "epoch": 0.031245717418117033, "grad_norm": 19.875, "kl": 7.317265510559082, "learning_rate": 5e-06, "logits/chosen": 23529253.333333332, "logits/rejected": 12044717.333333334, "logps/chosen": -563.69921875, "logps/rejected": -316.1934000651042, "loss": 0.0988, "rewards/chosen": 4.081796010335286, "rewards/margins": 6.754910469055176, "rewards/rejected": -2.673114458719889, "step": 114 }, { "epoch": 0.03151980265862683, "grad_norm": 18.25, "kl": 10.545022964477539, "learning_rate": 5e-06, "logits/chosen": 21121748.0, "logits/rejected": 1008005.5, "logps/chosen": -436.3394775390625, "logps/rejected": -424.2666931152344, "loss": 0.1042, "rewards/chosen": 3.8913726806640625, "rewards/margins": 7.59371280670166, "rewards/rejected": -3.7023401260375977, "step": 115 }, { "epoch": 0.03179388789913663, "grad_norm": 18.5, "kl": 1.9868406057357788, "learning_rate": 5e-06, "logits/chosen": -1585722.6666666667, "logits/rejected": 32733413.333333332, "logps/chosen": -437.7894287109375, "logps/rejected": -594.1539306640625, "loss": 0.1356, "rewards/chosen": 3.285210609436035, "rewards/margins": 6.36833381652832, "rewards/rejected": -3.083123207092285, "step": 116 }, { "epoch": 0.03206797313964643, "grad_norm": 22.125, "kl": 7.9134345054626465, "learning_rate": 5e-06, "logits/chosen": -2366885.714285714, "logits/rejected": 21332201.6, "logps/chosen": -428.76171875, "logps/rejected": -434.93095703125, "loss": 0.1699, "rewards/chosen": 3.383405957903181, "rewards/margins": 5.333840833391462, "rewards/rejected": -1.9504348754882812, "step": 117 }, { "epoch": 0.032342058380156226, "grad_norm": 14.125, "kl": 0.39247769117355347, "learning_rate": 5e-06, "logits/chosen": 9108326.0, "logits/rejected": 7130486.0, "logps/chosen": -414.0350748697917, "logps/rejected": -483.5046793619792, "loss": 0.1336, "rewards/chosen": 2.9867026011149087, "rewards/margins": 6.826655387878418, "rewards/rejected": -3.8399527867635093, "step": 118 }, { "epoch": 0.032616143620666026, "grad_norm": 11.125, "kl": 5.072084903717041, "learning_rate": 5e-06, "logits/chosen": 2911688.0, "logits/rejected": 5918937.6, "logps/chosen": -601.43359375, "logps/rejected": -442.8593424479167, "loss": 0.0527, "rewards/chosen": 4.709318372938368, "rewards/margins": 7.925303480360243, "rewards/rejected": -3.215985107421875, "step": 119 }, { "epoch": 0.032890228861175826, "grad_norm": 16.75, "kl": 2.4859352111816406, "learning_rate": 5e-06, "logits/chosen": 7471055.0, "logits/rejected": 91416704.0, "logps/chosen": -404.43719482421875, "logps/rejected": -426.8681945800781, "loss": 0.153, "rewards/chosen": 3.0623435974121094, "rewards/margins": 5.152045249938965, "rewards/rejected": -2.0897016525268555, "step": 120 }, { "epoch": 0.033164314101685625, "grad_norm": 14.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -588565.0, "logits/rejected": 104797656.0, "logps/chosen": -430.85693359375, "logps/rejected": -511.8392333984375, "loss": 0.0865, "rewards/chosen": 3.0163121223449707, "rewards/margins": 7.520627498626709, "rewards/rejected": -4.504315376281738, "step": 121 }, { "epoch": 0.033438399342195425, "grad_norm": 17.625, "kl": 4.316936492919922, "learning_rate": 5e-06, "logits/chosen": 1090698.1818181819, "logits/rejected": 17009662.769230768, "logps/chosen": -461.2551935369318, "logps/rejected": -535.8484074519231, "loss": 0.1504, "rewards/chosen": 3.0159277482466265, "rewards/margins": 7.051921764453809, "rewards/rejected": -4.035994016207182, "step": 122 }, { "epoch": 0.03371248458270522, "grad_norm": 11.75, "kl": 2.7255916595458984, "learning_rate": 5e-06, "logits/chosen": 20351586.666666668, "logits/rejected": 31297052.444444444, "logps/chosen": -388.5260416666667, "logps/rejected": -467.99083116319446, "loss": 0.0845, "rewards/chosen": 3.066335995992025, "rewards/margins": 7.254668023851183, "rewards/rejected": -4.188332027859158, "step": 123 }, { "epoch": 0.03398656982321502, "grad_norm": 13.0625, "kl": 4.989921569824219, "learning_rate": 5e-06, "logits/chosen": 39528119.27272727, "logits/rejected": 7553741.538461538, "logps/chosen": -429.36328125, "logps/rejected": -439.1651141826923, "loss": 0.1116, "rewards/chosen": 3.1653626181862573, "rewards/margins": 7.152938149192117, "rewards/rejected": -3.9875755310058594, "step": 124 }, { "epoch": 0.03426065506372482, "grad_norm": 13.6875, "kl": 6.987547874450684, "learning_rate": 5e-06, "logits/chosen": 26046222.769230768, "logits/rejected": 15754682.181818182, "logps/chosen": -382.44095552884613, "logps/rejected": -461.89506392045456, "loss": 0.1158, "rewards/chosen": 3.8392243018517127, "rewards/margins": 7.147886022821173, "rewards/rejected": -3.3086617209694604, "step": 125 }, { "epoch": 0.03453474030423462, "grad_norm": 22.75, "kl": 10.74423885345459, "learning_rate": 5e-06, "logits/chosen": 6657704.470588235, "logits/rejected": 55314368.0, "logps/chosen": -454.2903837316176, "logps/rejected": -528.1566336495536, "loss": 0.17, "rewards/chosen": 3.3222797618192783, "rewards/margins": 7.636172126321231, "rewards/rejected": -4.313892364501953, "step": 126 }, { "epoch": 0.03480882554474442, "grad_norm": 13.875, "kl": 3.957669734954834, "learning_rate": 5e-06, "logits/chosen": 44045595.428571425, "logits/rejected": 4808115.2, "logps/chosen": -515.1978934151786, "logps/rejected": -412.98388671875, "loss": 0.1088, "rewards/chosen": 3.3976309640066966, "rewards/margins": 6.785617337908064, "rewards/rejected": -3.3879863739013674, "step": 127 }, { "epoch": 0.03508291078525421, "grad_norm": 16.75, "kl": 2.2041454315185547, "learning_rate": 5e-06, "logits/chosen": -5353247.333333333, "logits/rejected": -7262472.0, "logps/chosen": -509.7245279947917, "logps/rejected": -368.511474609375, "loss": 0.1225, "rewards/chosen": 3.8793627421061196, "rewards/margins": 5.869749228159586, "rewards/rejected": -1.9903864860534668, "step": 128 }, { "epoch": 0.03535699602576401, "grad_norm": 15.75, "kl": 10.216436386108398, "learning_rate": 5e-06, "logits/chosen": 4459213.866666666, "logits/rejected": 50977038.222222224, "logps/chosen": -469.7241536458333, "logps/rejected": -501.19097222222223, "loss": 0.1251, "rewards/chosen": 3.3956616719563804, "rewards/margins": 6.54650158352322, "rewards/rejected": -3.15083991156684, "step": 129 }, { "epoch": 0.03563108126627381, "grad_norm": 11.5625, "kl": 1.150543212890625, "learning_rate": 5e-06, "logits/chosen": 1555752.3076923077, "logits/rejected": 101963810.9090909, "logps/chosen": -356.1366436298077, "logps/rejected": -525.0433682528409, "loss": 0.0814, "rewards/chosen": 2.7951507568359375, "rewards/margins": 7.363092595880682, "rewards/rejected": -4.567941839044744, "step": 130 }, { "epoch": 0.03590516650678361, "grad_norm": 18.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 46374665.84615385, "logits/rejected": 6027564.363636363, "logps/chosen": -459.18118990384613, "logps/rejected": -469.4314630681818, "loss": 0.0739, "rewards/chosen": 2.945754124568059, "rewards/margins": 7.130077522117775, "rewards/rejected": -4.184323397549716, "step": 131 }, { "epoch": 0.03617925174729341, "grad_norm": 17.125, "kl": 7.387209892272949, "learning_rate": 5e-06, "logits/chosen": -3445882.8, "logits/rejected": 30769424.0, "logps/chosen": -465.807177734375, "logps/rejected": -549.3658272879464, "loss": 0.1, "rewards/chosen": 3.2449859619140624, "rewards/margins": 7.239930125645229, "rewards/rejected": -3.9949441637311662, "step": 132 }, { "epoch": 0.03645333698780321, "grad_norm": 18.75, "kl": 5.410910606384277, "learning_rate": 5e-06, "logits/chosen": -4222018.909090909, "logits/rejected": 8481665.23076923, "logps/chosen": -450.92085404829544, "logps/rejected": -425.76607572115387, "loss": 0.171, "rewards/chosen": 3.3744205128062856, "rewards/margins": 7.00080583479021, "rewards/rejected": -3.6263853219839244, "step": 133 }, { "epoch": 0.036727422228313, "grad_norm": 25.25, "kl": 1.5340436697006226, "learning_rate": 5e-06, "logits/chosen": 30840853.333333332, "logits/rejected": 18268949.333333332, "logps/chosen": -515.9419352213541, "logps/rejected": -336.60813395182294, "loss": 0.1652, "rewards/chosen": 2.9756199518839517, "rewards/margins": 5.059343655904134, "rewards/rejected": -2.083723704020182, "step": 134 }, { "epoch": 0.0370015074688228, "grad_norm": 22.875, "kl": 10.762727737426758, "learning_rate": 5e-06, "logits/chosen": 20816425.14285714, "logits/rejected": 45016812.8, "logps/chosen": -493.72279575892856, "logps/rejected": -390.991943359375, "loss": 0.1198, "rewards/chosen": 3.8694651467459544, "rewards/margins": 6.836864934648787, "rewards/rejected": -2.967399787902832, "step": 135 }, { "epoch": 0.0372755927093326, "grad_norm": 19.25, "kl": 2.5975253582000732, "learning_rate": 5e-06, "logits/chosen": 5768961.454545454, "logits/rejected": 17050875.076923076, "logps/chosen": -515.7679332386364, "logps/rejected": -595.7152944711538, "loss": 0.1205, "rewards/chosen": 3.4394669966264204, "rewards/margins": 6.635594081211757, "rewards/rejected": -3.1961270845853367, "step": 136 }, { "epoch": 0.0375496779498424, "grad_norm": 21.5, "kl": 2.327805280685425, "learning_rate": 5e-06, "logits/chosen": -525593.4285714285, "logits/rejected": 12550768.0, "logps/chosen": -362.6689453125, "logps/rejected": -468.272998046875, "loss": 0.2047, "rewards/chosen": 2.330976758684431, "rewards/margins": 4.195863996233259, "rewards/rejected": -1.8648872375488281, "step": 137 }, { "epoch": 0.0378237631903522, "grad_norm": 9.4375, "kl": 0.8102906942367554, "learning_rate": 5e-06, "logits/chosen": 17442417.333333332, "logits/rejected": 6629061.333333333, "logps/chosen": -463.6735026041667, "logps/rejected": -414.1232096354167, "loss": 0.093, "rewards/chosen": 3.689485232035319, "rewards/margins": 7.367673238118489, "rewards/rejected": -3.6781880060831704, "step": 138 }, { "epoch": 0.038097848430861996, "grad_norm": 18.5, "kl": 5.231475830078125, "learning_rate": 5e-06, "logits/chosen": 6162806.545454546, "logits/rejected": 13712226.461538462, "logps/chosen": -382.6570933948864, "logps/rejected": -365.7350886418269, "loss": 0.1691, "rewards/chosen": 2.1841780922629614, "rewards/margins": 4.628090011489975, "rewards/rejected": -2.443911919227013, "step": 139 }, { "epoch": 0.038371933671371795, "grad_norm": 18.5, "kl": 11.546306610107422, "learning_rate": 5e-06, "logits/chosen": -3024618.117647059, "logits/rejected": 5520382.857142857, "logps/chosen": -445.6412568933824, "logps/rejected": -402.4121791294643, "loss": 0.1465, "rewards/chosen": 3.5650401395909928, "rewards/margins": 7.207088021671071, "rewards/rejected": -3.642047882080078, "step": 140 }, { "epoch": 0.038646018911881595, "grad_norm": 13.125, "kl": 8.965350151062012, "learning_rate": 5e-06, "logits/chosen": -4687534.857142857, "logits/rejected": 26242864.0, "logps/chosen": -520.7423270089286, "logps/rejected": -476.65322265625, "loss": 0.0709, "rewards/chosen": 4.832400730678013, "rewards/margins": 9.550467899867467, "rewards/rejected": -4.718067169189453, "step": 141 }, { "epoch": 0.038920104152391395, "grad_norm": 12.9375, "kl": 6.09146785736084, "learning_rate": 5e-06, "logits/chosen": 8739299.0, "logits/rejected": 32563932.0, "logps/chosen": -442.37908935546875, "logps/rejected": -514.930419921875, "loss": 0.1386, "rewards/chosen": 3.767910957336426, "rewards/margins": 7.011478424072266, "rewards/rejected": -3.24356746673584, "step": 142 }, { "epoch": 0.039194189392901195, "grad_norm": 14.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -6336357.714285715, "logits/rejected": -4828286.588235294, "logps/chosen": -435.0844029017857, "logps/rejected": -479.82907284007354, "loss": 0.1259, "rewards/chosen": 3.0613125392368863, "rewards/margins": 6.45475682491014, "rewards/rejected": -3.3934442856732536, "step": 143 }, { "epoch": 0.03946827463341099, "grad_norm": 14.3125, "kl": 5.0732598304748535, "learning_rate": 5e-06, "logits/chosen": 7137986.5, "logits/rejected": -8001068.0, "logps/chosen": -454.30816650390625, "logps/rejected": -474.5111389160156, "loss": 0.0835, "rewards/chosen": 3.823915481567383, "rewards/margins": 8.35860824584961, "rewards/rejected": -4.534692764282227, "step": 144 }, { "epoch": 0.03974235987392079, "grad_norm": 13.5, "kl": 4.37014102935791, "learning_rate": 5e-06, "logits/chosen": -1062166.1666666667, "logits/rejected": 9943148.0, "logps/chosen": -474.2447102864583, "logps/rejected": -377.0118001302083, "loss": 0.0843, "rewards/chosen": 4.5043792724609375, "rewards/margins": 7.524034182230631, "rewards/rejected": -3.019654909769694, "step": 145 }, { "epoch": 0.04001644511443059, "grad_norm": 10.25, "kl": 4.937111854553223, "learning_rate": 5e-06, "logits/chosen": -1964717.3333333333, "logits/rejected": -3300825.3333333335, "logps/chosen": -566.4967447916666, "logps/rejected": -512.397705078125, "loss": 0.0873, "rewards/chosen": 4.372155825297038, "rewards/margins": 7.626852671305339, "rewards/rejected": -3.254696846008301, "step": 146 }, { "epoch": 0.04029053035494039, "grad_norm": 13.75, "kl": 0.3398030698299408, "learning_rate": 5e-06, "logits/chosen": 16566529.454545455, "logits/rejected": 64355707.07692308, "logps/chosen": -405.75217507102275, "logps/rejected": -638.7633713942307, "loss": 0.0605, "rewards/chosen": 3.2560195922851562, "rewards/margins": 9.292753366323618, "rewards/rejected": -6.036733774038462, "step": 147 }, { "epoch": 0.04056461559545019, "grad_norm": 20.375, "kl": 6.975541591644287, "learning_rate": 5e-06, "logits/chosen": 5985534.857142857, "logits/rejected": 12324473.6, "logps/chosen": -489.385986328125, "logps/rejected": -599.351171875, "loss": 0.0958, "rewards/chosen": 3.935640335083008, "rewards/margins": 7.458247375488281, "rewards/rejected": -3.5226070404052736, "step": 148 }, { "epoch": 0.04083870083595998, "grad_norm": 22.625, "kl": 5.720283508300781, "learning_rate": 5e-06, "logits/chosen": 93705773.1764706, "logits/rejected": 46988013.71428572, "logps/chosen": -524.2184627757352, "logps/rejected": -651.1162806919643, "loss": 0.18, "rewards/chosen": 3.1041470695944393, "rewards/margins": 5.357109590738761, "rewards/rejected": -2.252962521144322, "step": 149 }, { "epoch": 0.04111278607646978, "grad_norm": 12.125, "kl": 0.1414540708065033, "learning_rate": 5e-06, "logits/chosen": 33680704.0, "logits/rejected": 38841610.666666664, "logps/chosen": -443.945556640625, "logps/rejected": -551.6576741536459, "loss": 0.086, "rewards/chosen": 3.5813897450764975, "rewards/margins": 8.469660123189291, "rewards/rejected": -4.888270378112793, "step": 150 }, { "epoch": 0.04138687131697958, "grad_norm": 14.0, "kl": 6.840159893035889, "learning_rate": 5e-06, "logits/chosen": -5499635.692307692, "logits/rejected": 3975175.6363636362, "logps/chosen": -463.1681941105769, "logps/rejected": -458.4490411931818, "loss": 0.0966, "rewards/chosen": 4.063002072847807, "rewards/margins": 8.965942996365207, "rewards/rejected": -4.9029409235174, "step": 151 }, { "epoch": 0.04166095655748938, "grad_norm": 14.5625, "kl": 1.4093616008758545, "learning_rate": 5e-06, "logits/chosen": -9136766.222222222, "logits/rejected": -3159806.933333333, "logps/chosen": -424.96693250868054, "logps/rejected": -451.37083333333334, "loss": 0.1482, "rewards/chosen": 3.644777086046007, "rewards/margins": 6.425176408555773, "rewards/rejected": -2.7803993225097656, "step": 152 }, { "epoch": 0.04193504179799918, "grad_norm": 15.8125, "kl": 0.11477534472942352, "learning_rate": 5e-06, "logits/chosen": 2089156.8, "logits/rejected": 1807227.4285714286, "logps/chosen": -466.020458984375, "logps/rejected": -391.89903041294644, "loss": 0.0939, "rewards/chosen": 3.2995338439941406, "rewards/margins": 6.805568695068359, "rewards/rejected": -3.5060348510742188, "step": 153 }, { "epoch": 0.04220912703850897, "grad_norm": 12.0625, "kl": 6.059615135192871, "learning_rate": 5e-06, "logits/chosen": -3982537.230769231, "logits/rejected": 15260858.181818182, "logps/chosen": -512.8073542668269, "logps/rejected": -418.42587002840907, "loss": 0.1003, "rewards/chosen": 4.285635434664213, "rewards/margins": 7.461113589626926, "rewards/rejected": -3.175478154962713, "step": 154 }, { "epoch": 0.04248321227901877, "grad_norm": 16.625, "kl": 4.7000813484191895, "learning_rate": 5e-06, "logits/chosen": 18414144.0, "logits/rejected": 34331652.571428575, "logps/chosen": -419.1076171875, "logps/rejected": -452.16598074776783, "loss": 0.1456, "rewards/chosen": 3.455859375, "rewards/margins": 6.370354570661272, "rewards/rejected": -2.914495195661272, "step": 155 }, { "epoch": 0.04275729751952857, "grad_norm": 9.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -1865476.3636363635, "logits/rejected": 4743569.846153846, "logps/chosen": -558.0752840909091, "logps/rejected": -460.3121995192308, "loss": 0.0676, "rewards/chosen": 3.764279452237216, "rewards/margins": 8.034574815443346, "rewards/rejected": -4.27029536320613, "step": 156 }, { "epoch": 0.04303138276003837, "grad_norm": 15.0625, "kl": 11.875640869140625, "learning_rate": 5e-06, "logits/chosen": 5159369.818181818, "logits/rejected": 4972001.230769231, "logps/chosen": -474.6345880681818, "logps/rejected": -361.26600060096155, "loss": 0.1156, "rewards/chosen": 4.618873942982066, "rewards/margins": 7.128033617993335, "rewards/rejected": -2.509159675011268, "step": 157 }, { "epoch": 0.04330546800054817, "grad_norm": 13.375, "kl": 1.4234187602996826, "learning_rate": 5e-06, "logits/chosen": 1026157.2307692308, "logits/rejected": 6590210.909090909, "logps/chosen": -500.7785832331731, "logps/rejected": -422.14626242897725, "loss": 0.1131, "rewards/chosen": 3.89547846867488, "rewards/margins": 7.2030873331990275, "rewards/rejected": -3.307608864524148, "step": 158 }, { "epoch": 0.04357955324105797, "grad_norm": 15.6875, "kl": 1.4406242370605469, "learning_rate": 5e-06, "logits/chosen": 5291171.2, "logits/rejected": 28458715.42857143, "logps/chosen": -381.8375, "logps/rejected": -494.67745535714283, "loss": 0.1326, "rewards/chosen": 2.7426521301269533, "rewards/margins": 6.884755325317383, "rewards/rejected": -4.14210319519043, "step": 159 }, { "epoch": 0.043853638481567765, "grad_norm": 15.0625, "kl": 10.282400131225586, "learning_rate": 5e-06, "logits/chosen": 8967352.0, "logits/rejected": 12385390.545454545, "logps/chosen": -467.5222731370192, "logps/rejected": -399.13583096590907, "loss": 0.0803, "rewards/chosen": 3.573980771578275, "rewards/margins": 7.412597923012047, "rewards/rejected": -3.8386171514337715, "step": 160 }, { "epoch": 0.044127723722077565, "grad_norm": 17.125, "kl": 1.6266670227050781, "learning_rate": 5e-06, "logits/chosen": 4219183.428571428, "logits/rejected": 1000269.0, "logps/chosen": -425.2318638392857, "logps/rejected": -521.913330078125, "loss": 0.1021, "rewards/chosen": 3.2845420837402344, "rewards/margins": 8.480320358276368, "rewards/rejected": -5.195778274536133, "step": 161 }, { "epoch": 0.044401808962587365, "grad_norm": 21.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 26945221.333333332, "logits/rejected": 50798208.0, "logps/chosen": -535.8470052083334, "logps/rejected": -526.8080078125, "loss": 0.0893, "rewards/chosen": 3.835415734185113, "rewards/margins": 7.45284890068902, "rewards/rejected": -3.6174331665039063, "step": 162 }, { "epoch": 0.044675894203097165, "grad_norm": 11.875, "kl": 1.4016908407211304, "learning_rate": 5e-06, "logits/chosen": 4323036.0, "logits/rejected": -4481660.8, "logps/chosen": -443.5078822544643, "logps/rejected": -486.075390625, "loss": 0.0423, "rewards/chosen": 3.7646004813058034, "rewards/margins": 10.4869752066476, "rewards/rejected": -6.722374725341797, "step": 163 }, { "epoch": 0.044949979443606965, "grad_norm": 17.25, "kl": 7.307744979858398, "learning_rate": 5e-06, "logits/chosen": 4501340.0, "logits/rejected": 38610982.4, "logps/chosen": -439.13330078125, "logps/rejected": -764.902197265625, "loss": 0.1625, "rewards/chosen": 3.6295950753348216, "rewards/margins": 8.521568952287947, "rewards/rejected": -4.891973876953125, "step": 164 }, { "epoch": 0.04522406468411676, "grad_norm": 8.25, "kl": 1.4399440288543701, "learning_rate": 5e-06, "logits/chosen": -8199441.333333333, "logits/rejected": 73125056.0, "logps/chosen": -427.3248697916667, "logps/rejected": -584.5726725260416, "loss": 0.038, "rewards/chosen": 4.277071634928386, "rewards/margins": 9.650992075602215, "rewards/rejected": -5.373920440673828, "step": 165 }, { "epoch": 0.04549814992462656, "grad_norm": 16.25, "kl": 5.778878688812256, "learning_rate": 5e-06, "logits/chosen": -3432344.888888889, "logits/rejected": -10030792.533333333, "logps/chosen": -455.89073350694446, "logps/rejected": -439.53020833333335, "loss": 0.1203, "rewards/chosen": 4.417838626437717, "rewards/margins": 8.125884077284072, "rewards/rejected": -3.7080454508463543, "step": 166 }, { "epoch": 0.04577223516513636, "grad_norm": 14.75, "kl": 0.9812116622924805, "learning_rate": 5e-06, "logits/chosen": 11196534.4, "logits/rejected": -2763015.4285714286, "logps/chosen": -414.8537109375, "logps/rejected": -342.06602260044644, "loss": 0.1234, "rewards/chosen": 3.51639404296875, "rewards/margins": 6.551245389665876, "rewards/rejected": -3.0348513466971263, "step": 167 }, { "epoch": 0.04604632040564616, "grad_norm": 18.5, "kl": 12.343981742858887, "learning_rate": 5e-06, "logits/chosen": -11683740.0, "logits/rejected": 2785088.6666666665, "logps/chosen": -494.2982991536458, "logps/rejected": -260.26031494140625, "loss": 0.1155, "rewards/chosen": 4.537640889485677, "rewards/margins": 6.5785706837972, "rewards/rejected": -2.0409297943115234, "step": 168 }, { "epoch": 0.04632040564615596, "grad_norm": 10.875, "kl": 7.613009452819824, "learning_rate": 5e-06, "logits/chosen": 3751092.3636363638, "logits/rejected": 22831448.615384616, "logps/chosen": -350.1577814275568, "logps/rejected": -469.2224684495192, "loss": 0.1755, "rewards/chosen": 3.6616862903941763, "rewards/margins": 6.950874381965691, "rewards/rejected": -3.2891880915715146, "step": 169 }, { "epoch": 0.04659449088666575, "grad_norm": 15.0, "kl": 4.901795864105225, "learning_rate": 5e-06, "logits/chosen": 499790.1538461539, "logits/rejected": 36225835.63636363, "logps/chosen": -465.99988731971155, "logps/rejected": -540.3160955255681, "loss": 0.0988, "rewards/chosen": 3.8305176955003004, "rewards/margins": 8.146344111515926, "rewards/rejected": -4.315826416015625, "step": 170 }, { "epoch": 0.04686857612717555, "grad_norm": 15.75, "kl": 5.247824668884277, "learning_rate": 5e-06, "logits/chosen": 9125872.0, "logits/rejected": 15740993.777777778, "logps/chosen": -552.4172526041667, "logps/rejected": -497.6188151041667, "loss": 0.0765, "rewards/chosen": 4.492021179199218, "rewards/margins": 7.996091885036892, "rewards/rejected": -3.5040707058376737, "step": 171 }, { "epoch": 0.04714266136768535, "grad_norm": 15.3125, "kl": 4.287100791931152, "learning_rate": 5e-06, "logits/chosen": 18854958.769230768, "logits/rejected": 28869172.363636363, "logps/chosen": -434.86527193509613, "logps/rejected": -588.8723810369319, "loss": 0.1165, "rewards/chosen": 3.2667098412146935, "rewards/margins": 6.739490242271156, "rewards/rejected": -3.472780401056463, "step": 172 }, { "epoch": 0.04741674660819515, "grad_norm": 15.6875, "kl": 7.779524803161621, "learning_rate": 5e-06, "logits/chosen": 42919104.0, "logits/rejected": 71971170.9090909, "logps/chosen": -531.3508112980769, "logps/rejected": -592.1924272017045, "loss": 0.108, "rewards/chosen": 4.538337120643029, "rewards/margins": 10.240909336330173, "rewards/rejected": -5.702572215687145, "step": 173 }, { "epoch": 0.04769083184870495, "grad_norm": 14.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -5054573.454545454, "logits/rejected": -2682424.0, "logps/chosen": -409.9319957386364, "logps/rejected": -542.6337139423077, "loss": 0.0802, "rewards/chosen": 3.301650654185902, "rewards/margins": 7.8283893345119235, "rewards/rejected": -4.526738680326021, "step": 174 }, { "epoch": 0.04796491708921474, "grad_norm": 15.9375, "kl": 2.1455702781677246, "learning_rate": 5e-06, "logits/chosen": -5704532.307692308, "logits/rejected": 21278651.636363637, "logps/chosen": -495.7316331129808, "logps/rejected": -483.4337269176136, "loss": 0.094, "rewards/chosen": 4.712146759033203, "rewards/margins": 8.850966020063922, "rewards/rejected": -4.138819261030718, "step": 175 }, { "epoch": 0.04823900232972454, "grad_norm": 19.75, "kl": 15.89577865600586, "learning_rate": 5e-06, "logits/chosen": 17876523.2, "logits/rejected": 16360310.857142856, "logps/chosen": -464.021923828125, "logps/rejected": -342.09354073660717, "loss": 0.1398, "rewards/chosen": 4.993391036987305, "rewards/margins": 7.877691268920898, "rewards/rejected": -2.8843002319335938, "step": 176 }, { "epoch": 0.04851308757023434, "grad_norm": 16.0, "kl": 6.69710636138916, "learning_rate": 5e-06, "logits/chosen": 29377424.0, "logits/rejected": 37160886.4, "logps/chosen": -660.6689453125, "logps/rejected": -613.596533203125, "loss": 0.0894, "rewards/chosen": 4.287473678588867, "rewards/margins": 10.671078872680663, "rewards/rejected": -6.383605194091797, "step": 177 }, { "epoch": 0.04878717281074414, "grad_norm": 18.125, "kl": 3.478078842163086, "learning_rate": 5e-06, "logits/chosen": -10876278.153846154, "logits/rejected": 482590.9090909091, "logps/chosen": -419.2790715144231, "logps/rejected": -482.77951882102275, "loss": 0.1105, "rewards/chosen": 2.966394277719351, "rewards/margins": 7.834832651631816, "rewards/rejected": -4.868438373912465, "step": 178 }, { "epoch": 0.04906125805125394, "grad_norm": 11.875, "kl": 4.207137107849121, "learning_rate": 5e-06, "logits/chosen": 1222231.8181818181, "logits/rejected": 981525.0769230769, "logps/chosen": -505.85227272727275, "logps/rejected": -470.0186298076923, "loss": 0.0624, "rewards/chosen": 4.240551688454368, "rewards/margins": 8.59122330992372, "rewards/rejected": -4.350671621469351, "step": 179 }, { "epoch": 0.04933534329176374, "grad_norm": 11.75, "kl": 3.9081592559814453, "learning_rate": 5e-06, "logits/chosen": -14504654.666666666, "logits/rejected": 9512708.666666666, "logps/chosen": -405.6181640625, "logps/rejected": -404.8481852213542, "loss": 0.0897, "rewards/chosen": 3.81339963277181, "rewards/margins": 7.379643758138021, "rewards/rejected": -3.566244125366211, "step": 180 }, { "epoch": 0.049609428532273535, "grad_norm": 18.125, "kl": 5.893155097961426, "learning_rate": 5e-06, "logits/chosen": 1433725.3846153845, "logits/rejected": -3703657.8181818184, "logps/chosen": -394.33458533653845, "logps/rejected": -405.2277166193182, "loss": 0.2003, "rewards/chosen": 2.606241666353666, "rewards/margins": 5.70912925346748, "rewards/rejected": -3.102887587113814, "step": 181 }, { "epoch": 0.049883513772783335, "grad_norm": 14.0, "kl": 3.477199077606201, "learning_rate": 5e-06, "logits/chosen": -2805859.6923076925, "logits/rejected": 19693463.272727273, "logps/chosen": -405.1745042067308, "logps/rejected": -519.7160866477273, "loss": 0.1088, "rewards/chosen": 3.0947042611929088, "rewards/margins": 8.17467453429749, "rewards/rejected": -5.079970273104581, "step": 182 }, { "epoch": 0.050157599013293135, "grad_norm": 13.0625, "kl": 0.9740562438964844, "learning_rate": 5e-06, "logits/chosen": 2677949.714285714, "logits/rejected": -2326293.6, "logps/chosen": -396.19813755580356, "logps/rejected": -510.873828125, "loss": 0.0752, "rewards/chosen": 3.954784393310547, "rewards/margins": 8.478173065185548, "rewards/rejected": -4.523388671875, "step": 183 }, { "epoch": 0.050431684253802934, "grad_norm": 21.75, "kl": 0.07017135620117188, "learning_rate": 5e-06, "logits/chosen": 7269542.222222222, "logits/rejected": 27111517.866666667, "logps/chosen": -403.87909613715277, "logps/rejected": -565.3015625, "loss": 0.0992, "rewards/chosen": 3.6746283637152777, "rewards/margins": 7.23223639594184, "rewards/rejected": -3.5576080322265624, "step": 184 }, { "epoch": 0.050705769494312734, "grad_norm": 16.75, "kl": 0.7380339503288269, "learning_rate": 5e-06, "logits/chosen": 14453528.888888888, "logits/rejected": 24097557.333333332, "logps/chosen": -535.5159505208334, "logps/rejected": -407.748046875, "loss": 0.1229, "rewards/chosen": 4.7562815348307295, "rewards/margins": 8.691663869222005, "rewards/rejected": -3.935382334391276, "step": 185 }, { "epoch": 0.05097985473482253, "grad_norm": 7.1875, "kl": 1.6303462982177734, "learning_rate": 5e-06, "logits/chosen": 3836750.222222222, "logits/rejected": 67559334.4, "logps/chosen": -436.47108289930554, "logps/rejected": -569.364453125, "loss": 0.0625, "rewards/chosen": 3.1747926076253257, "rewards/margins": 7.324109268188476, "rewards/rejected": -4.149316660563151, "step": 186 }, { "epoch": 0.05125393997533233, "grad_norm": 9.6875, "kl": 3.386350154876709, "learning_rate": 5e-06, "logits/chosen": 8371529.846153846, "logits/rejected": 1126565.8181818181, "logps/chosen": -530.5818058894231, "logps/rejected": -424.65065696022725, "loss": 0.0514, "rewards/chosen": 4.412977658785307, "rewards/margins": 8.751892009815137, "rewards/rejected": -4.338914351029829, "step": 187 }, { "epoch": 0.05152802521584213, "grad_norm": 13.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 10512130.285714285, "logits/rejected": 3045296.9411764704, "logps/chosen": -384.2388392857143, "logps/rejected": -404.8528837316176, "loss": 0.0855, "rewards/chosen": 2.908428192138672, "rewards/margins": 6.666636074290556, "rewards/rejected": -3.758207882151884, "step": 188 }, { "epoch": 0.05180211045635193, "grad_norm": 10.3125, "kl": 0.9558185338973999, "learning_rate": 5e-06, "logits/chosen": -11954053.0, "logits/rejected": 3895838.0, "logps/chosen": -421.6596984863281, "logps/rejected": -441.7607727050781, "loss": 0.0746, "rewards/chosen": 4.198707580566406, "rewards/margins": 8.476572513580322, "rewards/rejected": -4.277864933013916, "step": 189 }, { "epoch": 0.05207619569686173, "grad_norm": 18.25, "kl": 4.466094970703125, "learning_rate": 5e-06, "logits/chosen": -13750637.538461538, "logits/rejected": 37969762.90909091, "logps/chosen": -481.6154597355769, "logps/rejected": -480.09011008522725, "loss": 0.1324, "rewards/chosen": 4.394994882436899, "rewards/margins": 8.01372213296957, "rewards/rejected": -3.6187272505326704, "step": 190 }, { "epoch": 0.05235028093737152, "grad_norm": 20.375, "kl": 2.2208545207977295, "learning_rate": 5e-06, "logits/chosen": 46390380.8, "logits/rejected": 2279539.8571428573, "logps/chosen": -548.43134765625, "logps/rejected": -307.97935267857144, "loss": 0.1212, "rewards/chosen": 5.315782165527343, "rewards/margins": 7.84322509765625, "rewards/rejected": -2.5274429321289062, "step": 191 }, { "epoch": 0.05262436617788132, "grad_norm": 15.3125, "kl": 6.22431755065918, "learning_rate": 5e-06, "logits/chosen": 21383638.153846152, "logits/rejected": 14241125.818181818, "logps/chosen": -429.71987680288464, "logps/rejected": -358.8785289417614, "loss": 0.0875, "rewards/chosen": 3.9996155958909254, "rewards/margins": 7.152943537785457, "rewards/rejected": -3.1533279418945312, "step": 192 }, { "epoch": 0.05289845141839112, "grad_norm": 10.1875, "kl": 1.4627799987792969, "learning_rate": 5e-06, "logits/chosen": 16685993.333333334, "logits/rejected": 3016435.3333333335, "logps/chosen": -562.1373291015625, "logps/rejected": -434.8553059895833, "loss": 0.0585, "rewards/chosen": 5.035298983256022, "rewards/margins": 8.927838643391928, "rewards/rejected": -3.892539660135905, "step": 193 }, { "epoch": 0.05317253665890092, "grad_norm": 11.25, "kl": 4.287712097167969, "learning_rate": 5e-06, "logits/chosen": -4872255.2727272725, "logits/rejected": -5848512.0, "logps/chosen": -428.62539950284093, "logps/rejected": -464.92052283653845, "loss": 0.0703, "rewards/chosen": 4.30358193137429, "rewards/margins": 8.26336915342958, "rewards/rejected": -3.9597872220552883, "step": 194 }, { "epoch": 0.05344662189941072, "grad_norm": 10.4375, "kl": 1.0176925659179688, "learning_rate": 5e-06, "logits/chosen": 35775028.36363637, "logits/rejected": 28224755.692307692, "logps/chosen": -384.02885298295456, "logps/rejected": -464.6925706129808, "loss": 0.0938, "rewards/chosen": 4.1449456648393115, "rewards/margins": 9.020546626377772, "rewards/rejected": -4.875600961538462, "step": 195 }, { "epoch": 0.05372070713992051, "grad_norm": 13.6875, "kl": 2.9492831230163574, "learning_rate": 5e-06, "logits/chosen": 9383102.666666666, "logits/rejected": 9798896.666666666, "logps/chosen": -469.1483968098958, "logps/rejected": -519.20703125, "loss": 0.112, "rewards/chosen": 3.574925104777018, "rewards/margins": 8.684710184733072, "rewards/rejected": -5.109785079956055, "step": 196 }, { "epoch": 0.05399479238043031, "grad_norm": 15.9375, "kl": 3.680001974105835, "learning_rate": 5e-06, "logits/chosen": 13442461.090909092, "logits/rejected": 3291914.4615384615, "logps/chosen": -466.88321200284093, "logps/rejected": -358.75229116586536, "loss": 0.1069, "rewards/chosen": 4.072281924161044, "rewards/margins": 7.470335020051969, "rewards/rejected": -3.3980530958909254, "step": 197 }, { "epoch": 0.05426887762094011, "grad_norm": 8.125, "kl": 6.769434928894043, "learning_rate": 5e-06, "logits/chosen": 19864017.230769232, "logits/rejected": 3010255.272727273, "logps/chosen": -428.7634089543269, "logps/rejected": -491.7413884943182, "loss": 0.0648, "rewards/chosen": 4.95170651949369, "rewards/margins": 9.77585660494291, "rewards/rejected": -4.824150085449219, "step": 198 }, { "epoch": 0.05454296286144991, "grad_norm": 16.75, "kl": 3.6588282585144043, "learning_rate": 5e-06, "logits/chosen": 1113135.3846153845, "logits/rejected": 14299457.454545455, "logps/chosen": -370.3683894230769, "logps/rejected": -181.45439009232953, "loss": 0.1795, "rewards/chosen": 3.3832743718073917, "rewards/margins": 5.518249978552332, "rewards/rejected": -2.1349756067449395, "step": 199 }, { "epoch": 0.05481704810195971, "grad_norm": 14.25, "kl": 7.383843421936035, "learning_rate": 5e-06, "logits/chosen": 25432183.272727273, "logits/rejected": 15468454.153846154, "logps/chosen": -483.94340376420456, "logps/rejected": -529.8083308293269, "loss": 0.1097, "rewards/chosen": 5.084277413108132, "rewards/margins": 9.61124497527009, "rewards/rejected": -4.526967562161959, "step": 200 }, { "epoch": 0.055091133342469505, "grad_norm": 14.3125, "kl": 11.056272506713867, "learning_rate": 5e-06, "logits/chosen": 6108566.4, "logits/rejected": 4541319.111111111, "logps/chosen": -500.247265625, "logps/rejected": -353.01361762152777, "loss": 0.0676, "rewards/chosen": 4.155651092529297, "rewards/margins": 7.368665059407553, "rewards/rejected": -3.2130139668782554, "step": 201 }, { "epoch": 0.055365218582979304, "grad_norm": 19.75, "kl": 15.486307144165039, "learning_rate": 5e-06, "logits/chosen": 10029453.47368421, "logits/rejected": -2559727.6, "logps/chosen": -468.8062808388158, "logps/rejected": -364.5609130859375, "loss": 0.1715, "rewards/chosen": 4.261139719109786, "rewards/margins": 8.295858804803146, "rewards/rejected": -4.034719085693359, "step": 202 }, { "epoch": 0.055639303823489104, "grad_norm": 14.9375, "kl": 9.248078346252441, "learning_rate": 5e-06, "logits/chosen": 19844104.0, "logits/rejected": 37087440.0, "logps/chosen": -389.9686279296875, "logps/rejected": -463.65704345703125, "loss": 0.157, "rewards/chosen": 3.495807647705078, "rewards/margins": 7.266016006469727, "rewards/rejected": -3.7702083587646484, "step": 203 }, { "epoch": 0.055913389063998904, "grad_norm": 12.8125, "kl": 6.673659324645996, "learning_rate": 5e-06, "logits/chosen": -4767434.5, "logits/rejected": -1982735.5, "logps/chosen": -513.214111328125, "logps/rejected": -428.96954345703125, "loss": 0.0524, "rewards/chosen": 4.518033981323242, "rewards/margins": 8.874419689178467, "rewards/rejected": -4.356385707855225, "step": 204 }, { "epoch": 0.056187474304508704, "grad_norm": 14.0625, "kl": 1.0780258178710938, "learning_rate": 5e-06, "logits/chosen": -3009850.153846154, "logits/rejected": 6350238.545454546, "logps/chosen": -424.18810096153845, "logps/rejected": -443.51908735795456, "loss": 0.0787, "rewards/chosen": 3.8292001577524037, "rewards/margins": 7.236341756540579, "rewards/rejected": -3.407141598788175, "step": 205 }, { "epoch": 0.056461559545018504, "grad_norm": 16.25, "kl": 10.485641479492188, "learning_rate": 5e-06, "logits/chosen": -12033363.692307692, "logits/rejected": 24036234.181818184, "logps/chosen": -513.3760141225962, "logps/rejected": -419.58114346590907, "loss": 0.1068, "rewards/chosen": 4.299801166240986, "rewards/margins": 7.897839579548869, "rewards/rejected": -3.5980384133078833, "step": 206 }, { "epoch": 0.0567356447855283, "grad_norm": 12.3125, "kl": 1.9726613759994507, "learning_rate": 5e-06, "logits/chosen": -1918259.5555555555, "logits/rejected": 18183485.866666667, "logps/chosen": -354.55604383680554, "logps/rejected": -559.9032552083333, "loss": 0.1006, "rewards/chosen": 3.3451639811197915, "rewards/margins": 7.8223108927408855, "rewards/rejected": -4.477146911621094, "step": 207 }, { "epoch": 0.0570097300260381, "grad_norm": 11.0, "kl": 7.964696884155273, "learning_rate": 5e-06, "logits/chosen": -440720.9090909091, "logits/rejected": 116747864.61538461, "logps/chosen": -494.40229936079544, "logps/rejected": -559.3937800480769, "loss": 0.0515, "rewards/chosen": 4.483458085493608, "rewards/margins": 9.575550159374316, "rewards/rejected": -5.092092073880709, "step": 208 }, { "epoch": 0.0572838152665479, "grad_norm": 11.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 5282568.0, "logits/rejected": 22053842.82352941, "logps/chosen": -431.14718191964283, "logps/rejected": -525.8369140625, "loss": 0.0645, "rewards/chosen": 3.2288616725376675, "rewards/margins": 8.168556245435186, "rewards/rejected": -4.939694572897518, "step": 209 }, { "epoch": 0.0575579005070577, "grad_norm": 20.75, "kl": 13.416534423828125, "learning_rate": 5e-06, "logits/chosen": -12611493.0, "logits/rejected": -2432364.5, "logps/chosen": -458.89898681640625, "logps/rejected": -626.782470703125, "loss": 0.1587, "rewards/chosen": 4.080418586730957, "rewards/margins": 9.986147403717041, "rewards/rejected": -5.905728816986084, "step": 210 }, { "epoch": 0.057831985747567496, "grad_norm": 14.375, "kl": 4.132503509521484, "learning_rate": 5e-06, "logits/chosen": -1156940.3636363635, "logits/rejected": 9815869.538461538, "logps/chosen": -454.30131392045456, "logps/rejected": -447.4434344951923, "loss": 0.1008, "rewards/chosen": 3.9283533963290127, "rewards/margins": 6.948578334354854, "rewards/rejected": -3.0202249380258412, "step": 211 }, { "epoch": 0.05810607098807729, "grad_norm": 16.0, "kl": 7.021111488342285, "learning_rate": 5e-06, "logits/chosen": -14080098.666666666, "logits/rejected": 11815545.333333334, "logps/chosen": -476.7827962239583, "logps/rejected": -521.1528727213541, "loss": 0.1336, "rewards/chosen": 3.47343381245931, "rewards/margins": 8.34574826558431, "rewards/rejected": -4.872314453125, "step": 212 }, { "epoch": 0.05838015622858709, "grad_norm": 14.8125, "kl": 5.720648765563965, "learning_rate": 5e-06, "logits/chosen": 3370417.8666666667, "logits/rejected": 10946906.666666666, "logps/chosen": -457.3771158854167, "logps/rejected": -610.3717990451389, "loss": 0.0825, "rewards/chosen": 4.106975301106771, "rewards/margins": 8.303077019585505, "rewards/rejected": -4.196101718478733, "step": 213 }, { "epoch": 0.05865424146909689, "grad_norm": 11.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -7416556.8, "logits/rejected": 24578971.42857143, "logps/chosen": -447.045751953125, "logps/rejected": -438.4349888392857, "loss": 0.0772, "rewards/chosen": 4.363005828857422, "rewards/margins": 9.002563803536551, "rewards/rejected": -4.639557974679129, "step": 214 }, { "epoch": 0.05892832670960669, "grad_norm": 10.5, "kl": 0.26170605421066284, "learning_rate": 5e-06, "logits/chosen": 6511733.0, "logits/rejected": 6521426.5, "logps/chosen": -615.953857421875, "logps/rejected": -408.9903564453125, "loss": 0.0695, "rewards/chosen": 4.427460670471191, "rewards/margins": 7.9983837604522705, "rewards/rejected": -3.570923089981079, "step": 215 }, { "epoch": 0.05920241195011649, "grad_norm": 10.25, "kl": 6.5508832931518555, "learning_rate": 5e-06, "logits/chosen": 5330468.363636363, "logits/rejected": 100999020.3076923, "logps/chosen": -393.74702592329544, "logps/rejected": -562.4472280649038, "loss": 0.0515, "rewards/chosen": 5.456273165616122, "rewards/margins": 11.124527724472792, "rewards/rejected": -5.66825455885667, "step": 216 }, { "epoch": 0.05947649719062628, "grad_norm": 18.5, "kl": 5.353166103363037, "learning_rate": 5e-06, "logits/chosen": -1463375.5384615385, "logits/rejected": 15532372.363636363, "logps/chosen": -295.460693359375, "logps/rejected": -303.0595703125, "loss": 0.1375, "rewards/chosen": 3.10614747267503, "rewards/margins": 6.287823470322403, "rewards/rejected": -3.181675997647372, "step": 217 }, { "epoch": 0.05975058243113608, "grad_norm": 10.1875, "kl": 1.7608833312988281, "learning_rate": 5e-06, "logits/chosen": -10305269.0, "logits/rejected": -3087028.5, "logps/chosen": -346.1761474609375, "logps/rejected": -469.1958923339844, "loss": 0.0592, "rewards/chosen": 4.161206245422363, "rewards/margins": 9.302098274230957, "rewards/rejected": -5.140892028808594, "step": 218 }, { "epoch": 0.06002466767164588, "grad_norm": 13.0625, "kl": 4.5908308029174805, "learning_rate": 5e-06, "logits/chosen": -6138497.6, "logits/rejected": 11777070.222222222, "logps/chosen": -443.49033203125, "logps/rejected": -327.10302734375, "loss": 0.0964, "rewards/chosen": 3.811534881591797, "rewards/margins": 7.0538722144232855, "rewards/rejected": -3.2423373328314886, "step": 219 }, { "epoch": 0.06029875291215568, "grad_norm": 13.6875, "kl": 2.3556036949157715, "learning_rate": 5e-06, "logits/chosen": -22363188.363636363, "logits/rejected": 7053648.0, "logps/chosen": -489.70649857954544, "logps/rejected": -429.1982421875, "loss": 0.0831, "rewards/chosen": 4.779176538640803, "rewards/margins": 8.27339697884513, "rewards/rejected": -3.494220440204327, "step": 220 }, { "epoch": 0.06057283815266548, "grad_norm": 18.5, "kl": 14.855815887451172, "learning_rate": 5e-06, "logits/chosen": -15365006.0, "logits/rejected": 17526724.0, "logps/chosen": -456.07525634765625, "logps/rejected": -255.51934814453125, "loss": 0.1168, "rewards/chosen": 4.42777681350708, "rewards/margins": 7.579113006591797, "rewards/rejected": -3.151336193084717, "step": 221 }, { "epoch": 0.060846923393175274, "grad_norm": 11.125, "kl": 2.5943312644958496, "learning_rate": 5e-06, "logits/chosen": -4181854.5454545454, "logits/rejected": 8472212.923076924, "logps/chosen": -530.3947975852273, "logps/rejected": -394.90091646634613, "loss": 0.0777, "rewards/chosen": 4.756909457120028, "rewards/margins": 8.87311628648451, "rewards/rejected": -4.116206829364483, "step": 222 }, { "epoch": 0.061121008633685074, "grad_norm": 8.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -14311652.444444444, "logits/rejected": 4806328.533333333, "logps/chosen": -533.1510416666666, "logps/rejected": -478.25397135416665, "loss": 0.0365, "rewards/chosen": 4.741060045030382, "rewards/margins": 10.047053697374132, "rewards/rejected": -5.30599365234375, "step": 223 }, { "epoch": 0.061395093874194874, "grad_norm": 14.625, "kl": 9.388216018676758, "learning_rate": 5e-06, "logits/chosen": -1732472.0, "logits/rejected": 8802879.384615384, "logps/chosen": -527.9182794744319, "logps/rejected": -603.7789588341346, "loss": 0.0972, "rewards/chosen": 5.2377797907049, "rewards/margins": 11.379497554752376, "rewards/rejected": -6.141717764047476, "step": 224 }, { "epoch": 0.061669179114704674, "grad_norm": 15.5, "kl": 5.757241249084473, "learning_rate": 5e-06, "logits/chosen": 14995893.333333334, "logits/rejected": 8014917.333333333, "logps/chosen": -517.5732421875, "logps/rejected": -479.071044921875, "loss": 0.0693, "rewards/chosen": 4.14844290415446, "rewards/margins": 7.683147748311361, "rewards/rejected": -3.534704844156901, "step": 225 }, { "epoch": 0.061943264355214474, "grad_norm": 9.75, "kl": 2.330820083618164, "learning_rate": 5e-06, "logits/chosen": 4367716.363636363, "logits/rejected": 2672217.5384615385, "logps/chosen": -409.56906960227275, "logps/rejected": -391.29161658653845, "loss": 0.0567, "rewards/chosen": 4.331513144753196, "rewards/margins": 9.46991975157411, "rewards/rejected": -5.138406606820913, "step": 226 }, { "epoch": 0.062217349595724274, "grad_norm": 11.0625, "kl": 7.8385820388793945, "learning_rate": 5e-06, "logits/chosen": 19304300.307692308, "logits/rejected": 14306337.454545455, "logps/chosen": -499.7370417668269, "logps/rejected": -408.14448686079544, "loss": 0.1313, "rewards/chosen": 4.942823556753305, "rewards/margins": 10.094192131415948, "rewards/rejected": -5.151368574662642, "step": 227 }, { "epoch": 0.06249143483623407, "grad_norm": 10.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 24242362.181818184, "logits/rejected": 16606225.23076923, "logps/chosen": -604.2864879261364, "logps/rejected": -554.0869140625, "loss": 0.0465, "rewards/chosen": 4.118512933904475, "rewards/margins": 10.749394023335064, "rewards/rejected": -6.630881089430589, "step": 228 }, { "epoch": 0.06276552007674387, "grad_norm": 12.4375, "kl": 3.264291763305664, "learning_rate": 5e-06, "logits/chosen": 17365170.46153846, "logits/rejected": 18591547.636363637, "logps/chosen": -382.7811748798077, "logps/rejected": -439.1297496448864, "loss": 0.1191, "rewards/chosen": 3.6738451444185696, "rewards/margins": 7.307094947441474, "rewards/rejected": -3.633249803022905, "step": 229 }, { "epoch": 0.06303960531725367, "grad_norm": 13.25, "kl": 9.579853057861328, "learning_rate": 5e-06, "logits/chosen": -11651626.666666666, "logits/rejected": 6581990.666666667, "logps/chosen": -679.1416422526041, "logps/rejected": -410.2731119791667, "loss": 0.1684, "rewards/chosen": 5.971960703531901, "rewards/margins": 9.261691729227703, "rewards/rejected": -3.289731025695801, "step": 230 }, { "epoch": 0.06331369055776347, "grad_norm": 13.375, "kl": 3.077338218688965, "learning_rate": 5e-06, "logits/chosen": 23111858.90909091, "logits/rejected": 19078796.307692308, "logps/chosen": -444.17231889204544, "logps/rejected": -436.1751051682692, "loss": 0.1062, "rewards/chosen": 4.351072484796697, "rewards/margins": 9.162907847157726, "rewards/rejected": -4.811835362361028, "step": 231 }, { "epoch": 0.06358777579827327, "grad_norm": 15.9375, "kl": 4.432603359222412, "learning_rate": 5e-06, "logits/chosen": 10592062.0, "logits/rejected": 7002415.0, "logps/chosen": -493.58642578125, "logps/rejected": -486.51654052734375, "loss": 0.0989, "rewards/chosen": 3.6595046520233154, "rewards/margins": 7.671621561050415, "rewards/rejected": -4.0121169090271, "step": 232 }, { "epoch": 0.06386186103878307, "grad_norm": 7.78125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 22913944.888888888, "logits/rejected": 78571050.66666667, "logps/chosen": -396.70545789930554, "logps/rejected": -588.0014973958333, "loss": 0.0338, "rewards/chosen": 3.5782987806532116, "rewards/margins": 11.110277133517794, "rewards/rejected": -7.531978352864583, "step": 233 }, { "epoch": 0.06413594627929287, "grad_norm": 54.25, "kl": 5.083621501922607, "learning_rate": 5e-06, "logits/chosen": 20707717.818181816, "logits/rejected": 14602550.153846154, "logps/chosen": -413.29514382102275, "logps/rejected": -663.9215745192307, "loss": 0.0993, "rewards/chosen": 4.1621104153719815, "rewards/margins": 9.996567225956417, "rewards/rejected": -5.834456810584435, "step": 234 }, { "epoch": 0.06441003151980265, "grad_norm": 11.0625, "kl": 10.2030029296875, "learning_rate": 5e-06, "logits/chosen": -8643770.133333333, "logits/rejected": 55987768.88888889, "logps/chosen": -546.3072265625, "logps/rejected": -597.9254557291666, "loss": 0.1066, "rewards/chosen": 4.503391520182292, "rewards/margins": 10.501863098144531, "rewards/rejected": -5.998471577962239, "step": 235 }, { "epoch": 0.06468411676031245, "grad_norm": 13.1875, "kl": 5.806245803833008, "learning_rate": 5e-06, "logits/chosen": -5549272.571428572, "logits/rejected": 12086524.8, "logps/chosen": -520.2118791852679, "logps/rejected": -519.3787109375, "loss": 0.0944, "rewards/chosen": 4.678683144705636, "rewards/margins": 9.020408303397042, "rewards/rejected": -4.341725158691406, "step": 236 }, { "epoch": 0.06495820200082225, "grad_norm": 14.875, "kl": 2.021557331085205, "learning_rate": 5e-06, "logits/chosen": 32835833.6, "logits/rejected": 54510445.71428572, "logps/chosen": -355.077783203125, "logps/rejected": -642.2839704241071, "loss": 0.1259, "rewards/chosen": 3.2599563598632812, "rewards/margins": 8.924779074532644, "rewards/rejected": -5.664822714669364, "step": 237 }, { "epoch": 0.06523228724133205, "grad_norm": 10.25, "kl": 1.1376266479492188, "learning_rate": 5e-06, "logits/chosen": -5679687.111111111, "logits/rejected": 16897065.6, "logps/chosen": -342.44788953993054, "logps/rejected": -425.70924479166666, "loss": 0.1022, "rewards/chosen": 3.713292015923394, "rewards/margins": 8.432776472303603, "rewards/rejected": -4.719484456380209, "step": 238 }, { "epoch": 0.06550637248184185, "grad_norm": 10.875, "kl": 2.6556992530822754, "learning_rate": 5e-06, "logits/chosen": 9555662.857142856, "logits/rejected": 16332102.4, "logps/chosen": -460.96292550223217, "logps/rejected": -475.517431640625, "loss": 0.0799, "rewards/chosen": 4.528543199811663, "rewards/margins": 8.05802775791713, "rewards/rejected": -3.5294845581054686, "step": 239 }, { "epoch": 0.06578045772235165, "grad_norm": 9.9375, "kl": 1.4102262258529663, "learning_rate": 5e-06, "logits/chosen": 6603336.0, "logits/rejected": 28273417.14285714, "logps/chosen": -545.98671875, "logps/rejected": -626.6135602678571, "loss": 0.0451, "rewards/chosen": 4.669897842407226, "rewards/margins": 10.963448061261857, "rewards/rejected": -6.293550218854632, "step": 240 }, { "epoch": 0.06605454296286145, "grad_norm": 20.5, "kl": 3.0436925888061523, "learning_rate": 5e-06, "logits/chosen": 38258397.71428572, "logits/rejected": 19989163.2, "logps/chosen": -461.94737025669644, "logps/rejected": -450.12109375, "loss": 0.0984, "rewards/chosen": 3.777672358921596, "rewards/margins": 9.070582362583705, "rewards/rejected": -5.29291000366211, "step": 241 }, { "epoch": 0.06632862820337125, "grad_norm": 12.0, "kl": 1.8440793752670288, "learning_rate": 5e-06, "logits/chosen": 9315063.272727273, "logits/rejected": 49010131.692307696, "logps/chosen": -433.865234375, "logps/rejected": -489.8444260817308, "loss": 0.0685, "rewards/chosen": 4.11791298606179, "rewards/margins": 8.522804953835227, "rewards/rejected": -4.4048919677734375, "step": 242 }, { "epoch": 0.06660271344388105, "grad_norm": 11.4375, "kl": 6.11643123626709, "learning_rate": 5e-06, "logits/chosen": 7375888.0, "logits/rejected": 48403721.6, "logps/chosen": -322.82090541294644, "logps/rejected": -534.0943359375, "loss": 0.1581, "rewards/chosen": 3.8321969168526784, "rewards/margins": 8.205166353498186, "rewards/rejected": -4.372969436645508, "step": 243 }, { "epoch": 0.06687679868439085, "grad_norm": 19.75, "kl": 14.693050384521484, "learning_rate": 5e-06, "logits/chosen": -18356544.0, "logits/rejected": 36735396.571428575, "logps/chosen": -461.48816636029414, "logps/rejected": -792.9794921875, "loss": 0.1235, "rewards/chosen": 4.183081682990579, "rewards/margins": 13.760397967170267, "rewards/rejected": -9.577316284179688, "step": 244 }, { "epoch": 0.06715088392490065, "grad_norm": 17.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -8314821.333333333, "logits/rejected": -2836730.6666666665, "logps/chosen": -416.8761393229167, "logps/rejected": -699.8424479166666, "loss": 0.1116, "rewards/chosen": 3.572037696838379, "rewards/margins": 10.237701733907063, "rewards/rejected": -6.665664037068685, "step": 245 }, { "epoch": 0.06742496916541044, "grad_norm": 11.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 264759.2, "logits/rejected": 7193585.714285715, "logps/chosen": -387.6857177734375, "logps/rejected": -426.6007603236607, "loss": 0.0969, "rewards/chosen": 3.474840545654297, "rewards/margins": 8.703953988211495, "rewards/rejected": -5.229113442557199, "step": 246 }, { "epoch": 0.06769905440592024, "grad_norm": 13.125, "kl": 10.30768871307373, "learning_rate": 5e-06, "logits/chosen": -14994899.42857143, "logits/rejected": 5681408.8, "logps/chosen": -444.95982142857144, "logps/rejected": -494.2611328125, "loss": 0.0731, "rewards/chosen": 4.896151951381138, "rewards/margins": 9.476132801600865, "rewards/rejected": -4.579980850219727, "step": 247 }, { "epoch": 0.06797313964643004, "grad_norm": 9.5625, "kl": 3.554717540740967, "learning_rate": 5e-06, "logits/chosen": 27912142.769230768, "logits/rejected": 14734832.0, "logps/chosen": -372.9149639423077, "logps/rejected": -640.0835404829545, "loss": 0.0439, "rewards/chosen": 4.469609187199519, "rewards/margins": 11.187395109163297, "rewards/rejected": -6.717785921963778, "step": 248 }, { "epoch": 0.06824722488693984, "grad_norm": 8.3125, "kl": 0.12672869861125946, "learning_rate": 5e-06, "logits/chosen": -16766844.8, "logits/rejected": 6625658.947368421, "logps/chosen": -457.173291015625, "logps/rejected": -439.6635485197368, "loss": 0.0452, "rewards/chosen": 5.323111724853516, "rewards/margins": 9.577124384829872, "rewards/rejected": -4.254012659976357, "step": 249 }, { "epoch": 0.06852131012744964, "grad_norm": 15.3125, "kl": 1.0597375631332397, "learning_rate": 5e-06, "logits/chosen": 9964952.615384616, "logits/rejected": 40427202.90909091, "logps/chosen": -394.0656550480769, "logps/rejected": -606.6855912642045, "loss": 0.0907, "rewards/chosen": 2.9479874830979567, "rewards/margins": 8.501980708195614, "rewards/rejected": -5.553993225097656, "step": 250 }, { "epoch": 0.06879539536795944, "grad_norm": 14.125, "kl": 6.820850372314453, "learning_rate": 5e-06, "logits/chosen": 14734212.266666668, "logits/rejected": -5203939.555555556, "logps/chosen": -462.8798828125, "logps/rejected": -474.47059461805554, "loss": 0.1135, "rewards/chosen": 4.326311238606771, "rewards/margins": 7.977598317464193, "rewards/rejected": -3.651287078857422, "step": 251 }, { "epoch": 0.06906948060846924, "grad_norm": 8.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -11703633.777777778, "logits/rejected": 3396974.933333333, "logps/chosen": -461.3015950520833, "logps/rejected": -385.6204427083333, "loss": 0.0441, "rewards/chosen": 4.588537851969401, "rewards/margins": 9.106887563069662, "rewards/rejected": -4.51834971110026, "step": 252 }, { "epoch": 0.06934356584897904, "grad_norm": 10.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 7985262.4, "logits/rejected": -798647.7142857143, "logps/chosen": -449.96728515625, "logps/rejected": -479.57571847098217, "loss": 0.101, "rewards/chosen": 3.827876663208008, "rewards/margins": 7.801386315482004, "rewards/rejected": -3.9735096522739957, "step": 253 }, { "epoch": 0.06961765108948884, "grad_norm": 10.0625, "kl": 0.9049758911132812, "learning_rate": 5e-06, "logits/chosen": -7319471.2727272725, "logits/rejected": -3721517.230769231, "logps/chosen": -533.5440784801136, "logps/rejected": -414.28455528846155, "loss": 0.0614, "rewards/chosen": 4.016970547762784, "rewards/margins": 8.756836657757525, "rewards/rejected": -4.7398661099947414, "step": 254 }, { "epoch": 0.06989173632999864, "grad_norm": 15.25, "kl": 7.700930595397949, "learning_rate": 5e-06, "logits/chosen": -3431287.2, "logits/rejected": 2536856.0, "logps/chosen": -537.388818359375, "logps/rejected": -405.92818777901783, "loss": 0.1716, "rewards/chosen": 5.448218536376953, "rewards/margins": 9.428494916643416, "rewards/rejected": -3.9802763802664622, "step": 255 }, { "epoch": 0.07016582157050842, "grad_norm": 8.9375, "kl": 0.2056560516357422, "learning_rate": 5e-06, "logits/chosen": 27405304.0, "logits/rejected": -11109617.333333334, "logps/chosen": -415.3972981770833, "logps/rejected": -408.6536458333333, "loss": 0.0673, "rewards/chosen": 4.3152875900268555, "rewards/margins": 9.112414677937824, "rewards/rejected": -4.79712708791097, "step": 256 }, { "epoch": 0.07043990681101822, "grad_norm": 14.125, "kl": 4.355138778686523, "learning_rate": 5e-06, "logits/chosen": -15176532.57142857, "logits/rejected": 66385024.0, "logps/chosen": -445.87193080357144, "logps/rejected": -568.08994140625, "loss": 0.0579, "rewards/chosen": 4.3558197021484375, "rewards/margins": 9.572275161743164, "rewards/rejected": -5.216455459594727, "step": 257 }, { "epoch": 0.07071399205152802, "grad_norm": 13.0, "kl": 2.433619260787964, "learning_rate": 5e-06, "logits/chosen": 14969946.181818182, "logits/rejected": 48164169.84615385, "logps/chosen": -427.78426846590907, "logps/rejected": -479.892578125, "loss": 0.0777, "rewards/chosen": 3.599002491344105, "rewards/margins": 9.995502125133168, "rewards/rejected": -6.3964996337890625, "step": 258 }, { "epoch": 0.07098807729203782, "grad_norm": 9.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 8117113.333333333, "logits/rejected": 31431960.0, "logps/chosen": -541.1790364583334, "logps/rejected": -497.0306803385417, "loss": 0.0697, "rewards/chosen": 4.528116226196289, "rewards/margins": 9.4809144337972, "rewards/rejected": -4.952798207600911, "step": 259 }, { "epoch": 0.07126216253254762, "grad_norm": 17.75, "kl": 4.295126438140869, "learning_rate": 5e-06, "logits/chosen": 40228067.2, "logits/rejected": 30496992.0, "logps/chosen": -608.2, "logps/rejected": -335.3004673549107, "loss": 0.0771, "rewards/chosen": 4.677749252319336, "rewards/margins": 8.161793463570731, "rewards/rejected": -3.484044211251395, "step": 260 }, { "epoch": 0.07153624777305742, "grad_norm": 12.0, "kl": 0.4326254725456238, "learning_rate": 5e-06, "logits/chosen": -1208177.5833333333, "logits/rejected": 28106890.666666668, "logps/chosen": -445.582275390625, "logps/rejected": -575.3586832682291, "loss": 0.0713, "rewards/chosen": 4.889801661173503, "rewards/margins": 11.608488082885742, "rewards/rejected": -6.718686421712239, "step": 261 }, { "epoch": 0.07181033301356722, "grad_norm": 14.5625, "kl": 1.7306512594223022, "learning_rate": 5e-06, "logits/chosen": 23954191.05882353, "logits/rejected": 20334240.0, "logps/chosen": -432.1903722426471, "logps/rejected": -447.7947474888393, "loss": 0.0858, "rewards/chosen": 3.393847072825712, "rewards/margins": 8.54405895201098, "rewards/rejected": -5.150211879185268, "step": 262 }, { "epoch": 0.07208441825407702, "grad_norm": 17.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 4660982.222222222, "logits/rejected": 16374439.466666667, "logps/chosen": -541.5124782986111, "logps/rejected": -461.14391276041664, "loss": 0.079, "rewards/chosen": 4.340162489149305, "rewards/margins": 8.462654452853734, "rewards/rejected": -4.1224919637044275, "step": 263 }, { "epoch": 0.07235850349458682, "grad_norm": 11.5, "kl": 7.6823811531066895, "learning_rate": 5e-06, "logits/chosen": -19549808.0, "logits/rejected": 18184868.363636363, "logps/chosen": -436.8807842548077, "logps/rejected": -430.6495472301136, "loss": 0.0531, "rewards/chosen": 6.502193744365986, "rewards/margins": 12.324811015095744, "rewards/rejected": -5.822617270729759, "step": 264 }, { "epoch": 0.07263258873509662, "grad_norm": 13.1875, "kl": 1.491803526878357, "learning_rate": 5e-06, "logits/chosen": -14613978.181818182, "logits/rejected": 30286294.153846152, "logps/chosen": -492.35129616477275, "logps/rejected": -460.20079627403845, "loss": 0.0714, "rewards/chosen": 4.017611763694069, "rewards/margins": 8.38382798308259, "rewards/rejected": -4.366216219388521, "step": 265 }, { "epoch": 0.07290667397560642, "grad_norm": 15.0, "kl": 1.1497396230697632, "learning_rate": 5e-06, "logits/chosen": 2628845.1428571427, "logits/rejected": 21416083.2, "logps/chosen": -417.68603515625, "logps/rejected": -503.79892578125, "loss": 0.0904, "rewards/chosen": 3.371175765991211, "rewards/margins": 8.741563034057616, "rewards/rejected": -5.370387268066406, "step": 266 }, { "epoch": 0.0731807592161162, "grad_norm": 19.625, "kl": 6.655999660491943, "learning_rate": 5e-06, "logits/chosen": -13081777.142857144, "logits/rejected": 33984934.4, "logps/chosen": -528.3924734933036, "logps/rejected": -472.630029296875, "loss": 0.1129, "rewards/chosen": 4.5996246337890625, "rewards/margins": 7.437141990661621, "rewards/rejected": -2.8375173568725587, "step": 267 }, { "epoch": 0.073454844456626, "grad_norm": 14.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 4866083.0, "logits/rejected": 10778604.0, "logps/chosen": -527.2923583984375, "logps/rejected": -518.109130859375, "loss": 0.0734, "rewards/chosen": 4.740788459777832, "rewards/margins": 10.537458419799805, "rewards/rejected": -5.796669960021973, "step": 268 }, { "epoch": 0.0737289296971358, "grad_norm": 14.4375, "kl": 5.852717876434326, "learning_rate": 5e-06, "logits/chosen": -4066713.3333333335, "logits/rejected": 55962876.44444445, "logps/chosen": -566.7399088541666, "logps/rejected": -428.1861979166667, "loss": 0.0647, "rewards/chosen": 4.9399668375651045, "rewards/margins": 8.698386722140842, "rewards/rejected": -3.758419884575738, "step": 269 }, { "epoch": 0.0740030149376456, "grad_norm": 19.125, "kl": 9.196998596191406, "learning_rate": 5e-06, "logits/chosen": 23252451.76470588, "logits/rejected": 48162537.14285714, "logps/chosen": -469.28624770220586, "logps/rejected": -401.90586635044644, "loss": 0.1701, "rewards/chosen": 3.296945908490349, "rewards/margins": 7.379231108336889, "rewards/rejected": -4.08228519984654, "step": 270 }, { "epoch": 0.0742771001781554, "grad_norm": 15.6875, "kl": 4.808794021606445, "learning_rate": 5e-06, "logits/chosen": 3625887.272727273, "logits/rejected": -100800.30769230769, "logps/chosen": -478.80264559659093, "logps/rejected": -477.2715594951923, "loss": 0.0999, "rewards/chosen": 3.762373143976385, "rewards/margins": 9.350861796132335, "rewards/rejected": -5.58848865215595, "step": 271 }, { "epoch": 0.0745511854186652, "grad_norm": 17.0, "kl": 8.700464248657227, "learning_rate": 5e-06, "logits/chosen": -4990892.307692308, "logits/rejected": -5112166.545454546, "logps/chosen": -533.8067157451923, "logps/rejected": -640.0094992897727, "loss": 0.0835, "rewards/chosen": 5.018348106971154, "rewards/margins": 12.808261631252048, "rewards/rejected": -7.789913524280895, "step": 272 }, { "epoch": 0.074825270659175, "grad_norm": 12.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -2765644.2, "logits/rejected": 21635337.14285714, "logps/chosen": -553.750390625, "logps/rejected": -528.7015904017857, "loss": 0.0732, "rewards/chosen": 4.473266983032227, "rewards/margins": 10.06370964050293, "rewards/rejected": -5.590442657470703, "step": 273 }, { "epoch": 0.0750993558996848, "grad_norm": 11.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 10500308.0, "logits/rejected": 17166392.0, "logps/chosen": -517.083251953125, "logps/rejected": -511.386962890625, "loss": 0.089, "rewards/chosen": 4.425843715667725, "rewards/margins": 10.993432998657227, "rewards/rejected": -6.567589282989502, "step": 274 }, { "epoch": 0.0753734411401946, "grad_norm": 14.0625, "kl": 2.405853271484375, "learning_rate": 5e-06, "logits/chosen": 14553065.846153846, "logits/rejected": 22566573.09090909, "logps/chosen": -432.07425631009613, "logps/rejected": -423.16477272727275, "loss": 0.0961, "rewards/chosen": 3.9345098642202525, "rewards/margins": 9.061872202199655, "rewards/rejected": -5.127362337979403, "step": 275 }, { "epoch": 0.0756475263807044, "grad_norm": 8.5, "kl": 3.8984687328338623, "learning_rate": 5e-06, "logits/chosen": -26578803.2, "logits/rejected": -11575993.142857144, "logps/chosen": -551.15302734375, "logps/rejected": -363.7992466517857, "loss": 0.0639, "rewards/chosen": 6.405482482910156, "rewards/margins": 11.70845227922712, "rewards/rejected": -5.302969796316964, "step": 276 }, { "epoch": 0.07592161162121419, "grad_norm": 13.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -11808154.181818182, "logits/rejected": -2788896.6153846155, "logps/chosen": -455.95632102272725, "logps/rejected": -412.1980543870192, "loss": 0.0818, "rewards/chosen": 3.849466497247869, "rewards/margins": 8.932009503557964, "rewards/rejected": -5.082543006310096, "step": 277 }, { "epoch": 0.07619569686172399, "grad_norm": 10.875, "kl": 3.6099560260772705, "learning_rate": 5e-06, "logits/chosen": -7605600.533333333, "logits/rejected": 35155687.11111111, "logps/chosen": -479.7279296875, "logps/rejected": -448.9381510416667, "loss": 0.1003, "rewards/chosen": 4.321465555826823, "rewards/margins": 8.85787387424045, "rewards/rejected": -4.536408318413629, "step": 278 }, { "epoch": 0.07646978210223379, "grad_norm": 8.375, "kl": 0.5924161672592163, "learning_rate": 5e-06, "logits/chosen": 31867325.714285713, "logits/rejected": 51111110.4, "logps/chosen": -574.6766880580357, "logps/rejected": -674.838916015625, "loss": 0.0476, "rewards/chosen": 3.7448556082589284, "rewards/margins": 11.341829027448382, "rewards/rejected": -7.596973419189453, "step": 279 }, { "epoch": 0.07674386734274359, "grad_norm": 8.125, "kl": 1.069183349609375, "learning_rate": 5e-06, "logits/chosen": -3330772.0, "logits/rejected": -12238551.384615384, "logps/chosen": -435.54860617897725, "logps/rejected": -408.17728365384613, "loss": 0.0625, "rewards/chosen": 4.37043588811701, "rewards/margins": 9.958705635337562, "rewards/rejected": -5.5882697472205525, "step": 280 }, { "epoch": 0.07701795258325339, "grad_norm": 18.625, "kl": 1.81894052028656, "learning_rate": 5e-06, "logits/chosen": 2607350.0, "logits/rejected": 55268697.6, "logps/chosen": -408.18526785714283, "logps/rejected": -407.909375, "loss": 0.1233, "rewards/chosen": 3.0901903424944197, "rewards/margins": 7.35550787789481, "rewards/rejected": -4.26531753540039, "step": 281 }, { "epoch": 0.07729203782376319, "grad_norm": 15.75, "kl": 3.394258499145508, "learning_rate": 5e-06, "logits/chosen": -13108462.545454545, "logits/rejected": 5535024.0, "logps/chosen": -357.32865767045456, "logps/rejected": -484.9636042668269, "loss": 0.1277, "rewards/chosen": 4.178504250266335, "rewards/margins": 9.44068780645624, "rewards/rejected": -5.262183556189904, "step": 282 }, { "epoch": 0.07756612306427299, "grad_norm": 10.125, "kl": 1.1827621459960938, "learning_rate": 5e-06, "logits/chosen": 16676073.6, "logits/rejected": -2959666.8571428573, "logps/chosen": -420.728076171875, "logps/rejected": -385.94747488839283, "loss": 0.0721, "rewards/chosen": 3.9352622985839845, "rewards/margins": 8.179723358154297, "rewards/rejected": -4.2444610595703125, "step": 283 }, { "epoch": 0.07784020830478279, "grad_norm": 14.6875, "kl": 4.102264404296875, "learning_rate": 5e-06, "logits/chosen": -13287650.461538462, "logits/rejected": 6164178.909090909, "logps/chosen": -486.5637770432692, "logps/rejected": -580.0447887073864, "loss": 0.0624, "rewards/chosen": 3.797785832331731, "rewards/margins": 11.2286732280171, "rewards/rejected": -7.430887395685369, "step": 284 }, { "epoch": 0.07811429354529259, "grad_norm": 17.25, "kl": 4.552162170410156, "learning_rate": 5e-06, "logits/chosen": 398062.85714285716, "logits/rejected": -8424584.0, "logps/chosen": -435.98221261160717, "logps/rejected": -315.180029296875, "loss": 0.1867, "rewards/chosen": 3.202957970755441, "rewards/margins": 6.262224633353098, "rewards/rejected": -3.0592666625976563, "step": 285 }, { "epoch": 0.07838837878580239, "grad_norm": 7.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -7470672.8, "logits/rejected": -3814649.714285714, "logps/chosen": -490.853857421875, "logps/rejected": -394.91371372767856, "loss": 0.0538, "rewards/chosen": 6.014366912841797, "rewards/margins": 11.41098153250558, "rewards/rejected": -5.396614619663784, "step": 286 }, { "epoch": 0.07866246402631219, "grad_norm": 10.5, "kl": 3.856412410736084, "learning_rate": 5e-06, "logits/chosen": 2724012.5714285714, "logits/rejected": 41308249.6, "logps/chosen": -393.5224609375, "logps/rejected": -533.0912109375, "loss": 0.0781, "rewards/chosen": 4.1764213017054965, "rewards/margins": 11.120481055123467, "rewards/rejected": -6.944059753417969, "step": 287 }, { "epoch": 0.07893654926682198, "grad_norm": 15.5, "kl": 5.688349723815918, "learning_rate": 5e-06, "logits/chosen": -13112982.666666666, "logits/rejected": 24606485.333333332, "logps/chosen": -546.221923828125, "logps/rejected": -347.6874186197917, "loss": 0.1332, "rewards/chosen": 4.418943405151367, "rewards/margins": 8.380524635314941, "rewards/rejected": -3.961581230163574, "step": 288 }, { "epoch": 0.07921063450733178, "grad_norm": 8.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 3981869.4545454546, "logits/rejected": -2361308.923076923, "logps/chosen": -492.59965376420456, "logps/rejected": -485.8505108173077, "loss": 0.0659, "rewards/chosen": 4.404278148304332, "rewards/margins": 9.444561351429332, "rewards/rejected": -5.040283203125, "step": 289 }, { "epoch": 0.07948471974784158, "grad_norm": 15.875, "kl": 6.24346923828125, "learning_rate": 5e-06, "logits/chosen": 3680886.8571428573, "logits/rejected": -4384022.4, "logps/chosen": -497.16660853794644, "logps/rejected": -324.111474609375, "loss": 0.1254, "rewards/chosen": 3.8708904811314175, "rewards/margins": 6.9683283397129605, "rewards/rejected": -3.097437858581543, "step": 290 }, { "epoch": 0.07975880498835138, "grad_norm": 23.875, "kl": 10.540879249572754, "learning_rate": 5e-06, "logits/chosen": 21684532.70588235, "logits/rejected": 927929.1428571428, "logps/chosen": -461.34329044117646, "logps/rejected": -458.38218470982144, "loss": 0.1696, "rewards/chosen": 3.10023655610926, "rewards/margins": 7.663300041391068, "rewards/rejected": -4.563063485281808, "step": 291 }, { "epoch": 0.08003289022886118, "grad_norm": 12.75, "kl": 0.9361509084701538, "learning_rate": 5e-06, "logits/chosen": 11743655.272727273, "logits/rejected": 9239201.846153846, "logps/chosen": -503.54736328125, "logps/rejected": -563.1848707932693, "loss": 0.0791, "rewards/chosen": 4.443031311035156, "rewards/margins": 10.551200279822716, "rewards/rejected": -6.10816896878756, "step": 292 }, { "epoch": 0.08030697546937098, "grad_norm": 7.1875, "kl": 4.891231060028076, "learning_rate": 5e-06, "logits/chosen": -13001907.692307692, "logits/rejected": 30297605.818181816, "logps/chosen": -523.3993013822115, "logps/rejected": -390.8577769886364, "loss": 0.0288, "rewards/chosen": 5.34951899601863, "rewards/margins": 10.810799125191215, "rewards/rejected": -5.461280129172585, "step": 293 }, { "epoch": 0.08058106070988078, "grad_norm": 12.8125, "kl": 2.1094970703125, "learning_rate": 5e-06, "logits/chosen": -19594950.4, "logits/rejected": 6164467.428571428, "logps/chosen": -564.647998046875, "logps/rejected": -372.67236328125, "loss": 0.1051, "rewards/chosen": 5.093234252929688, "rewards/margins": 8.542799595424107, "rewards/rejected": -3.4495653424944197, "step": 294 }, { "epoch": 0.08085514595039058, "grad_norm": 19.25, "kl": 4.680455684661865, "learning_rate": 5e-06, "logits/chosen": 10780152.533333333, "logits/rejected": 4866618.666666667, "logps/chosen": -432.8248046875, "logps/rejected": -526.0397677951389, "loss": 0.1247, "rewards/chosen": 3.477860514322917, "rewards/margins": 9.723358747694228, "rewards/rejected": -6.245498233371311, "step": 295 }, { "epoch": 0.08112923119090037, "grad_norm": 10.8125, "kl": 1.2363357543945312, "learning_rate": 5e-06, "logits/chosen": 6187233.454545454, "logits/rejected": -10271917.538461538, "logps/chosen": -515.5923295454545, "logps/rejected": -489.28380408653845, "loss": 0.0458, "rewards/chosen": 4.562067898837003, "rewards/margins": 10.348860360525705, "rewards/rejected": -5.786792461688702, "step": 296 }, { "epoch": 0.08140331643141017, "grad_norm": 14.0625, "kl": 4.663355827331543, "learning_rate": 5e-06, "logits/chosen": 7682080.615384615, "logits/rejected": 45846725.81818182, "logps/chosen": -469.4047100360577, "logps/rejected": -339.8621271306818, "loss": 0.1291, "rewards/chosen": 4.086775266207182, "rewards/margins": 7.86245650178069, "rewards/rejected": -3.7756812355735083, "step": 297 }, { "epoch": 0.08167740167191996, "grad_norm": 11.6875, "kl": 2.9907760620117188, "learning_rate": 5e-06, "logits/chosen": 3645667.6363636362, "logits/rejected": 26743896.615384616, "logps/chosen": -379.9996448863636, "logps/rejected": -473.4130108173077, "loss": 0.0483, "rewards/chosen": 4.322912042791193, "rewards/margins": 9.285969340717877, "rewards/rejected": -4.9630572979266825, "step": 298 }, { "epoch": 0.08195148691242976, "grad_norm": 14.125, "kl": 4.142131805419922, "learning_rate": 5e-06, "logits/chosen": 10398168.0, "logits/rejected": 6104552.615384615, "logps/chosen": -444.04643110795456, "logps/rejected": -393.59728064903845, "loss": 0.1502, "rewards/chosen": 3.6946293223987925, "rewards/margins": 8.183096612250054, "rewards/rejected": -4.488467289851262, "step": 299 }, { "epoch": 0.08222557215293956, "grad_norm": 14.8125, "kl": 8.605305671691895, "learning_rate": 5e-06, "logits/chosen": 3454330.933333333, "logits/rejected": 12275706.666666666, "logps/chosen": -371.35358072916665, "logps/rejected": -420.60565863715277, "loss": 0.1218, "rewards/chosen": 4.571321105957031, "rewards/margins": 8.392479621039495, "rewards/rejected": -3.821158515082465, "step": 300 }, { "epoch": 0.08249965739344936, "grad_norm": 7.28125, "kl": 0.11552556604146957, "learning_rate": 5e-06, "logits/chosen": -9031827.636363637, "logits/rejected": 23478803.692307692, "logps/chosen": -556.2598987926136, "logps/rejected": -580.3497596153846, "loss": 0.0404, "rewards/chosen": 4.506899053400213, "rewards/margins": 10.817116690682365, "rewards/rejected": -6.310217637282151, "step": 301 }, { "epoch": 0.08277374263395916, "grad_norm": 10.5625, "kl": 2.96460223197937, "learning_rate": 5e-06, "logits/chosen": 8337247.333333333, "logits/rejected": 71219349.33333333, "logps/chosen": -466.9274088541667, "logps/rejected": -522.2462158203125, "loss": 0.0647, "rewards/chosen": 4.609638849894206, "rewards/margins": 9.838012377421062, "rewards/rejected": -5.2283735275268555, "step": 302 }, { "epoch": 0.08304782787446896, "grad_norm": 9.75, "kl": 1.0347316265106201, "learning_rate": 5e-06, "logits/chosen": -5780146.666666667, "logits/rejected": 16446557.333333334, "logps/chosen": -605.89404296875, "logps/rejected": -491.3582356770833, "loss": 0.0419, "rewards/chosen": 4.969841321309407, "rewards/margins": 10.360783576965332, "rewards/rejected": -5.390942255655925, "step": 303 }, { "epoch": 0.08332191311497876, "grad_norm": 10.875, "kl": 4.32183837890625, "learning_rate": 5e-06, "logits/chosen": 5808237.333333333, "logits/rejected": 32808330.666666668, "logps/chosen": -439.6461181640625, "logps/rejected": -499.350830078125, "loss": 0.0772, "rewards/chosen": 3.7525622049967446, "rewards/margins": 10.664719899495443, "rewards/rejected": -6.912157694498698, "step": 304 }, { "epoch": 0.08359599835548856, "grad_norm": 18.75, "kl": 10.855632781982422, "learning_rate": 5e-06, "logits/chosen": -12223272.615384616, "logits/rejected": 36265780.36363637, "logps/chosen": -455.9514723557692, "logps/rejected": -550.2575461647727, "loss": 0.1009, "rewards/chosen": 4.792245718149038, "rewards/margins": 11.478937002328726, "rewards/rejected": -6.6866912841796875, "step": 305 }, { "epoch": 0.08387008359599836, "grad_norm": 17.75, "kl": 1.2188060283660889, "learning_rate": 5e-06, "logits/chosen": 13440210.0, "logits/rejected": -12556063.0, "logps/chosen": -331.47259521484375, "logps/rejected": -444.20416259765625, "loss": 0.0939, "rewards/chosen": 3.523893356323242, "rewards/margins": 8.299412250518799, "rewards/rejected": -4.775518894195557, "step": 306 }, { "epoch": 0.08414416883650816, "grad_norm": 20.625, "kl": 4.410481929779053, "learning_rate": 5e-06, "logits/chosen": 48077344.0, "logits/rejected": -985793.2307692308, "logps/chosen": -461.78573330965907, "logps/rejected": -475.99815955528845, "loss": 0.0937, "rewards/chosen": 3.696801619096236, "rewards/margins": 8.42352903139341, "rewards/rejected": -4.726727412297175, "step": 307 }, { "epoch": 0.08441825407701795, "grad_norm": 8.0625, "kl": 3.7214226722717285, "learning_rate": 5e-06, "logits/chosen": 8445786.666666666, "logits/rejected": 6363951.111111111, "logps/chosen": -419.8125, "logps/rejected": -481.6486002604167, "loss": 0.0567, "rewards/chosen": 5.018906656901041, "rewards/margins": 10.423315429687499, "rewards/rejected": -5.404408772786458, "step": 308 }, { "epoch": 0.08469233931752775, "grad_norm": 12.9375, "kl": 1.5139936208724976, "learning_rate": 5e-06, "logits/chosen": -8219018.909090909, "logits/rejected": -16288500.923076924, "logps/chosen": -399.3187144886364, "logps/rejected": -517.2471454326923, "loss": 0.0903, "rewards/chosen": 3.660085504705256, "rewards/margins": 8.0196664983576, "rewards/rejected": -4.359580993652344, "step": 309 }, { "epoch": 0.08496642455803755, "grad_norm": 14.0625, "kl": 9.586676597595215, "learning_rate": 5e-06, "logits/chosen": 21250973.714285713, "logits/rejected": -9885582.4, "logps/chosen": -479.18899972098217, "logps/rejected": -363.3520751953125, "loss": 0.0909, "rewards/chosen": 4.958606719970703, "rewards/margins": 9.076781463623046, "rewards/rejected": -4.118174743652344, "step": 310 }, { "epoch": 0.08524050979854735, "grad_norm": 8.6875, "kl": 9.95068073272705, "learning_rate": 5e-06, "logits/chosen": 3080690.933333333, "logits/rejected": -3955510.222222222, "logps/chosen": -412.59401041666666, "logps/rejected": -466.76085069444446, "loss": 0.0401, "rewards/chosen": 4.594488525390625, "rewards/margins": 10.933694797092015, "rewards/rejected": -6.339206271701389, "step": 311 }, { "epoch": 0.08551459503905715, "grad_norm": 7.125, "kl": 5.563377380371094, "learning_rate": 5e-06, "logits/chosen": 1431192.3333333333, "logits/rejected": -27067.333333333332, "logps/chosen": -430.8623860677083, "logps/rejected": -374.1400553385417, "loss": 0.0432, "rewards/chosen": 4.691751480102539, "rewards/margins": 8.68898073832194, "rewards/rejected": -3.997229258219401, "step": 312 }, { "epoch": 0.08578868027956695, "grad_norm": 7.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 12155381.818181818, "logits/rejected": 4607276.307692308, "logps/chosen": -389.10653409090907, "logps/rejected": -387.4092548076923, "loss": 0.062, "rewards/chosen": 4.03867582841353, "rewards/margins": 9.115452773087508, "rewards/rejected": -5.076776944673979, "step": 313 }, { "epoch": 0.08606276552007674, "grad_norm": 8.75, "kl": 3.5429065227508545, "learning_rate": 5e-06, "logits/chosen": -2829339.111111111, "logits/rejected": 21292620.8, "logps/chosen": -475.9368489583333, "logps/rejected": -629.4335286458333, "loss": 0.0393, "rewards/chosen": 5.210927327473958, "rewards/margins": 12.535593159993489, "rewards/rejected": -7.324665832519531, "step": 314 }, { "epoch": 0.08633685076058654, "grad_norm": 12.375, "kl": 1.2242025136947632, "learning_rate": 5e-06, "logits/chosen": 37832936.0, "logits/rejected": -28541692.0, "logps/chosen": -438.2005615234375, "logps/rejected": -364.8160095214844, "loss": 0.0806, "rewards/chosen": 3.253568410873413, "rewards/margins": 7.023784875869751, "rewards/rejected": -3.770216464996338, "step": 315 }, { "epoch": 0.08661093600109634, "grad_norm": 12.0625, "kl": 5.899319171905518, "learning_rate": 5e-06, "logits/chosen": -7354921.333333333, "logits/rejected": -6741395.333333333, "logps/chosen": -420.066650390625, "logps/rejected": -360.73193359375, "loss": 0.1111, "rewards/chosen": 4.450411478678386, "rewards/margins": 7.956522623697917, "rewards/rejected": -3.5061111450195312, "step": 316 }, { "epoch": 0.08688502124160614, "grad_norm": 14.25, "kl": 7.771266460418701, "learning_rate": 5e-06, "logits/chosen": 7232413.333333333, "logits/rejected": 88771674.66666667, "logps/chosen": -513.9732259114584, "logps/rejected": -639.1307779947916, "loss": 0.1032, "rewards/chosen": 4.653367360432942, "rewards/margins": 13.400952657063801, "rewards/rejected": -8.74758529663086, "step": 317 }, { "epoch": 0.08715910648211594, "grad_norm": 9.125, "kl": 1.4852209091186523, "learning_rate": 5e-06, "logits/chosen": 19518432.0, "logits/rejected": 18974692.363636363, "logps/chosen": -499.4041090745192, "logps/rejected": -503.1810191761364, "loss": 0.0451, "rewards/chosen": 5.18667719914363, "rewards/margins": 9.373006433873744, "rewards/rejected": -4.186329234730113, "step": 318 }, { "epoch": 0.08743319172262573, "grad_norm": 11.5625, "kl": 9.634284973144531, "learning_rate": 5e-06, "logits/chosen": 6825672.615384615, "logits/rejected": 41824683.63636363, "logps/chosen": -494.70248647836536, "logps/rejected": -422.60640092329544, "loss": 0.0653, "rewards/chosen": 5.764184805063101, "rewards/margins": 9.98528777969467, "rewards/rejected": -4.221102974631569, "step": 319 }, { "epoch": 0.08770727696313553, "grad_norm": 9.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 17818560.0, "logits/rejected": -8392576.0, "logps/chosen": -410.58417426215277, "logps/rejected": -601.182421875, "loss": 0.0456, "rewards/chosen": 3.4444783528645835, "rewards/margins": 8.29544423421224, "rewards/rejected": -4.850965881347657, "step": 320 }, { "epoch": 0.08798136220364533, "grad_norm": 8.1875, "kl": 7.622042655944824, "learning_rate": 5e-06, "logits/chosen": -6843279.0, "logits/rejected": -7313893.0, "logps/chosen": -478.805908203125, "logps/rejected": -499.60430908203125, "loss": 0.0362, "rewards/chosen": 5.783178329467773, "rewards/margins": 11.783013343811035, "rewards/rejected": -5.999835014343262, "step": 321 }, { "epoch": 0.08825544744415513, "grad_norm": 11.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -14580689.6, "logits/rejected": -800077.0, "logps/chosen": -418.745361328125, "logps/rejected": -363.83182198660717, "loss": 0.0662, "rewards/chosen": 5.140164947509765, "rewards/margins": 9.302838897705078, "rewards/rejected": -4.1626739501953125, "step": 322 }, { "epoch": 0.08852953268466493, "grad_norm": 9.875, "kl": 2.5579657554626465, "learning_rate": 5e-06, "logits/chosen": -15571645.090909092, "logits/rejected": -3645008.6153846155, "logps/chosen": -529.8484552556819, "logps/rejected": -430.7838792067308, "loss": 0.0697, "rewards/chosen": 4.406594709916548, "rewards/margins": 9.81196146078043, "rewards/rejected": -5.405366750863882, "step": 323 }, { "epoch": 0.08880361792517473, "grad_norm": 14.125, "kl": 2.7109484672546387, "learning_rate": 5e-06, "logits/chosen": 20186052.8, "logits/rejected": 1621664.857142857, "logps/chosen": -400.7422607421875, "logps/rejected": -555.0231584821429, "loss": 0.091, "rewards/chosen": 4.526076126098633, "rewards/margins": 9.316374588012696, "rewards/rejected": -4.7902984619140625, "step": 324 }, { "epoch": 0.08907770316568453, "grad_norm": 14.75, "kl": 7.780281066894531, "learning_rate": 5e-06, "logits/chosen": 2269228.3076923075, "logits/rejected": 49807354.18181818, "logps/chosen": -450.9328049879808, "logps/rejected": -643.5640536221591, "loss": 0.0682, "rewards/chosen": 5.23037602351262, "rewards/margins": 12.027452855677037, "rewards/rejected": -6.797076832164418, "step": 325 }, { "epoch": 0.08935178840619433, "grad_norm": 10.5, "kl": 6.505181789398193, "learning_rate": 5e-06, "logits/chosen": 7229774.545454546, "logits/rejected": -1148411.3846153845, "logps/chosen": -512.1546963778409, "logps/rejected": -439.0304612379808, "loss": 0.0775, "rewards/chosen": 5.463993766091087, "rewards/margins": 9.893863197806832, "rewards/rejected": -4.429869431715745, "step": 326 }, { "epoch": 0.08962587364670413, "grad_norm": 10.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 5305491.2, "logits/rejected": -7767329.142857143, "logps/chosen": -349.3232421875, "logps/rejected": -531.0228097098214, "loss": 0.0787, "rewards/chosen": 3.352826690673828, "rewards/margins": 8.51670172555106, "rewards/rejected": -5.163875034877232, "step": 327 }, { "epoch": 0.08989995888721393, "grad_norm": 5.875, "kl": 4.851354122161865, "learning_rate": 5e-06, "logits/chosen": -15420618.666666666, "logits/rejected": -12900296.888888888, "logps/chosen": -430.53583984375, "logps/rejected": -498.8820529513889, "loss": 0.0258, "rewards/chosen": 5.986392211914063, "rewards/margins": 12.168555874294704, "rewards/rejected": -6.182163662380642, "step": 328 }, { "epoch": 0.09017404412772372, "grad_norm": 7.28125, "kl": 3.322453260421753, "learning_rate": 5e-06, "logits/chosen": -10699344.0, "logits/rejected": -3330855.6363636362, "logps/chosen": -520.7551081730769, "logps/rejected": -404.9727894176136, "loss": 0.0491, "rewards/chosen": 5.226038419283354, "rewards/margins": 10.896529377757254, "rewards/rejected": -5.670490958473899, "step": 329 }, { "epoch": 0.09044812936823352, "grad_norm": 8.0625, "kl": 3.718661308288574, "learning_rate": 5e-06, "logits/chosen": -22441638.85714286, "logits/rejected": 26954848.0, "logps/chosen": -463.52022879464283, "logps/rejected": -610.670849609375, "loss": 0.0496, "rewards/chosen": 5.779016767229352, "rewards/margins": 11.163347898210798, "rewards/rejected": -5.384331130981446, "step": 330 }, { "epoch": 0.09072221460874332, "grad_norm": 13.75, "kl": 4.474693298339844, "learning_rate": 5e-06, "logits/chosen": -11822338.181818182, "logits/rejected": 44201353.84615385, "logps/chosen": -488.7800958806818, "logps/rejected": -452.4543269230769, "loss": 0.0783, "rewards/chosen": 4.970636541193182, "rewards/margins": 9.45377352521136, "rewards/rejected": -4.483136984018179, "step": 331 }, { "epoch": 0.09099629984925311, "grad_norm": 12.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 18749540.8, "logits/rejected": 4186261.714285714, "logps/chosen": -411.421533203125, "logps/rejected": -601.0364118303571, "loss": 0.0488, "rewards/chosen": 4.048043060302734, "rewards/margins": 10.00490973336356, "rewards/rejected": -5.9568666730608255, "step": 332 }, { "epoch": 0.09127038508976291, "grad_norm": 8.25, "kl": 0.7423636317253113, "learning_rate": 5e-06, "logits/chosen": 3511867.5, "logits/rejected": -10965819.0, "logps/chosen": -476.4599914550781, "logps/rejected": -467.89630126953125, "loss": 0.0396, "rewards/chosen": 5.1556243896484375, "rewards/margins": 10.822744846343994, "rewards/rejected": -5.667120456695557, "step": 333 }, { "epoch": 0.09154447033027271, "grad_norm": 8.75, "kl": 2.613624095916748, "learning_rate": 5e-06, "logits/chosen": -7745174.153846154, "logits/rejected": -2972929.8181818184, "logps/chosen": -485.1369816706731, "logps/rejected": -397.18186257102275, "loss": 0.0422, "rewards/chosen": 5.342171302208533, "rewards/margins": 9.805132672503277, "rewards/rejected": -4.462961370294744, "step": 334 }, { "epoch": 0.09181855557078251, "grad_norm": 8.875, "kl": 4.095698356628418, "learning_rate": 5e-06, "logits/chosen": -3232483.6923076925, "logits/rejected": 37259424.0, "logps/chosen": -470.3552433894231, "logps/rejected": -269.6803089488636, "loss": 0.0476, "rewards/chosen": 5.65692138671875, "rewards/margins": 9.9495981389826, "rewards/rejected": -4.29267675226385, "step": 335 }, { "epoch": 0.09209264081129231, "grad_norm": 15.5, "kl": 12.096663475036621, "learning_rate": 5e-06, "logits/chosen": 4483824.0, "logits/rejected": 95542129.77777778, "logps/chosen": -397.64716796875, "logps/rejected": -354.6735568576389, "loss": 0.096, "rewards/chosen": 4.3307342529296875, "rewards/margins": 6.975193023681641, "rewards/rejected": -2.644458770751953, "step": 336 }, { "epoch": 0.09236672605180211, "grad_norm": 2.578125, "kl": 0.5759099721908569, "learning_rate": 5e-06, "logits/chosen": 25946592.0, "logits/rejected": -1670124.2857142857, "logps/chosen": -451.4900390625, "logps/rejected": -658.5890066964286, "loss": 0.0069, "rewards/chosen": 5.735967636108398, "rewards/margins": 13.225248881748744, "rewards/rejected": -7.489281245640346, "step": 337 }, { "epoch": 0.09264081129231191, "grad_norm": 15.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -5532118.222222222, "logits/rejected": 16164577.066666666, "logps/chosen": -517.6430121527778, "logps/rejected": -457.69951171875, "loss": 0.073, "rewards/chosen": 4.865934583875868, "rewards/margins": 9.911490207248264, "rewards/rejected": -5.045555623372396, "step": 338 }, { "epoch": 0.09291489653282171, "grad_norm": 13.625, "kl": 8.053850173950195, "learning_rate": 5e-06, "logits/chosen": -7704076.307692308, "logits/rejected": 7311906.181818182, "logps/chosen": -399.5871769831731, "logps/rejected": -365.02530184659093, "loss": 0.1125, "rewards/chosen": 3.833532480093149, "rewards/margins": 7.884671618054797, "rewards/rejected": -4.0511391379616475, "step": 339 }, { "epoch": 0.0931889817733315, "grad_norm": 9.4375, "kl": 2.893493175506592, "learning_rate": 5e-06, "logits/chosen": -3969793.8181818184, "logits/rejected": 72880226.46153846, "logps/chosen": -455.58589311079544, "logps/rejected": -526.0353816105769, "loss": 0.0361, "rewards/chosen": 5.276725422252309, "rewards/margins": 12.483305951098462, "rewards/rejected": -7.206580528846154, "step": 340 }, { "epoch": 0.0934630670138413, "grad_norm": 11.125, "kl": 1.9984945058822632, "learning_rate": 5e-06, "logits/chosen": 16299812.266666668, "logits/rejected": 10699376.0, "logps/chosen": -448.3462239583333, "logps/rejected": -519.3725043402778, "loss": 0.0918, "rewards/chosen": 4.531538899739584, "rewards/margins": 10.709219868977865, "rewards/rejected": -6.177680969238281, "step": 341 }, { "epoch": 0.0937371522543511, "grad_norm": 12.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -7061378.4, "logits/rejected": 8157065.142857143, "logps/chosen": -421.717626953125, "logps/rejected": -378.17264229910717, "loss": 0.0716, "rewards/chosen": 4.5264839172363285, "rewards/margins": 10.464900534493584, "rewards/rejected": -5.938416617257254, "step": 342 }, { "epoch": 0.0940112374948609, "grad_norm": 4.9375, "kl": 5.768405914306641, "learning_rate": 5e-06, "logits/chosen": -3019908.8571428573, "logits/rejected": 5286672.0, "logps/chosen": -401.21058872767856, "logps/rejected": -541.14384765625, "loss": 0.0768, "rewards/chosen": 4.963448115757534, "rewards/margins": 12.115760585239956, "rewards/rejected": -7.152312469482422, "step": 343 }, { "epoch": 0.0942853227353707, "grad_norm": 7.71875, "kl": 4.360863208770752, "learning_rate": 5e-06, "logits/chosen": -7032073.230769231, "logits/rejected": 12070064.0, "logps/chosen": -555.7020733173077, "logps/rejected": -363.00026633522725, "loss": 0.0805, "rewards/chosen": 5.065423231858474, "rewards/margins": 8.882967648806272, "rewards/rejected": -3.8175444169477983, "step": 344 }, { "epoch": 0.0945594079758805, "grad_norm": 9.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -9335779.636363637, "logits/rejected": -2463865.5384615385, "logps/chosen": -472.43288352272725, "logps/rejected": -448.21544471153845, "loss": 0.0653, "rewards/chosen": 3.725847764448686, "rewards/margins": 9.116328392829095, "rewards/rejected": -5.390480628380408, "step": 345 }, { "epoch": 0.0948334932163903, "grad_norm": 13.0, "kl": 4.8338623046875, "learning_rate": 5e-06, "logits/chosen": 14069372.307692308, "logits/rejected": -19770890.181818184, "logps/chosen": -455.5188176081731, "logps/rejected": -386.56156782670456, "loss": 0.1189, "rewards/chosen": 3.914758535531851, "rewards/margins": 9.642044094059017, "rewards/rejected": -5.727285558527166, "step": 346 }, { "epoch": 0.0951075784569001, "grad_norm": 5.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -4011559.0, "logits/rejected": -13196682.666666666, "logps/chosen": -434.1505533854167, "logps/rejected": -383.3353678385417, "loss": 0.0303, "rewards/chosen": 5.217780431111653, "rewards/margins": 10.343499501546223, "rewards/rejected": -5.12571907043457, "step": 347 }, { "epoch": 0.0953816636974099, "grad_norm": 12.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -4806686.0, "logits/rejected": 2501639.4285714286, "logps/chosen": -505.066943359375, "logps/rejected": -622.4914899553571, "loss": 0.0684, "rewards/chosen": 3.9256961822509764, "rewards/margins": 10.904704448154995, "rewards/rejected": -6.979008265904018, "step": 348 }, { "epoch": 0.0956557489379197, "grad_norm": 14.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 2373786.8571428573, "logits/rejected": 3552262.8, "logps/chosen": -459.16489955357144, "logps/rejected": -446.603515625, "loss": 0.0602, "rewards/chosen": 5.185779571533203, "rewards/margins": 11.201059341430664, "rewards/rejected": -6.015279769897461, "step": 349 }, { "epoch": 0.09592983417842949, "grad_norm": 14.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -18023790.666666668, "logits/rejected": -8810012.0, "logps/chosen": -412.6937255859375, "logps/rejected": -526.1974690755209, "loss": 0.1161, "rewards/chosen": 4.321699778238933, "rewards/margins": 9.615612665812176, "rewards/rejected": -5.293912887573242, "step": 350 }, { "epoch": 0.09620391941893928, "grad_norm": 16.875, "kl": 6.75664758682251, "learning_rate": 5e-06, "logits/chosen": -13552211.555555556, "logits/rejected": -9349713.066666666, "logps/chosen": -493.78955078125, "logps/rejected": -560.4846354166667, "loss": 0.0916, "rewards/chosen": 5.470484839545356, "rewards/margins": 10.824471961127387, "rewards/rejected": -5.353987121582032, "step": 351 }, { "epoch": 0.09647800465944908, "grad_norm": 17.125, "kl": 6.74098014831543, "learning_rate": 5e-06, "logits/chosen": 10342390.857142856, "logits/rejected": 188605.7, "logps/chosen": -450.35951450892856, "logps/rejected": -388.8828369140625, "loss": 0.0994, "rewards/chosen": 4.575883047921317, "rewards/margins": 8.207985632760185, "rewards/rejected": -3.6321025848388673, "step": 352 }, { "epoch": 0.09675208989995888, "grad_norm": 6.4375, "kl": 0.013274192810058594, "learning_rate": 5e-06, "logits/chosen": 11546056.615384616, "logits/rejected": 12070858.181818182, "logps/chosen": -401.8313176081731, "logps/rejected": -558.6486150568181, "loss": 0.0315, "rewards/chosen": 4.768405914306641, "rewards/margins": 12.035384785045277, "rewards/rejected": -7.266978870738637, "step": 353 }, { "epoch": 0.09702617514046868, "grad_norm": 6.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -9673048.0, "logits/rejected": 17923089.230769232, "logps/chosen": -573.50244140625, "logps/rejected": -538.8931790865385, "loss": 0.0218, "rewards/chosen": 5.700440146706321, "rewards/margins": 13.019058334243882, "rewards/rejected": -7.31861818753756, "step": 354 }, { "epoch": 0.09730026038097848, "grad_norm": 8.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 4220443.0, "logits/rejected": -12693865.333333334, "logps/chosen": -594.9004720052084, "logps/rejected": -612.3279622395834, "loss": 0.0241, "rewards/chosen": 6.699169794718425, "rewards/margins": 14.716889063517254, "rewards/rejected": -8.017719268798828, "step": 355 }, { "epoch": 0.09757434562148828, "grad_norm": 16.0, "kl": 4.908015251159668, "learning_rate": 5e-06, "logits/chosen": -3031238.933333333, "logits/rejected": 20635235.555555556, "logps/chosen": -375.221484375, "logps/rejected": -369.5971408420139, "loss": 0.0809, "rewards/chosen": 3.823686218261719, "rewards/margins": 9.226395331488716, "rewards/rejected": -5.402709113226996, "step": 356 }, { "epoch": 0.09784843086199808, "grad_norm": 13.5625, "kl": 11.671039581298828, "learning_rate": 5e-06, "logits/chosen": 8864902.0, "logits/rejected": -8920805.0, "logps/chosen": -485.20489501953125, "logps/rejected": -727.4148559570312, "loss": 0.1141, "rewards/chosen": 4.861318588256836, "rewards/margins": 10.816500663757324, "rewards/rejected": -5.955182075500488, "step": 357 }, { "epoch": 0.09812251610250788, "grad_norm": 20.0, "kl": 15.707134246826172, "learning_rate": 5e-06, "logits/chosen": -2437111.0588235296, "logits/rejected": 3111564.8571428573, "logps/chosen": -506.63005514705884, "logps/rejected": -311.2676478794643, "loss": 0.1165, "rewards/chosen": 4.43244395536535, "rewards/margins": 8.544677798487559, "rewards/rejected": -4.11223384312221, "step": 358 }, { "epoch": 0.09839660134301768, "grad_norm": 9.125, "kl": 3.163815975189209, "learning_rate": 5e-06, "logits/chosen": -736939.2, "logits/rejected": -4855220.444444444, "logps/chosen": -442.7157877604167, "logps/rejected": -452.0954318576389, "loss": 0.0473, "rewards/chosen": 4.398774210611979, "rewards/margins": 8.719865332709418, "rewards/rejected": -4.321091122097439, "step": 359 }, { "epoch": 0.09867068658352748, "grad_norm": 5.3125, "kl": 4.461540222167969, "learning_rate": 5e-06, "logits/chosen": 5391864.7272727275, "logits/rejected": 54569398.15384615, "logps/chosen": -441.72367720170456, "logps/rejected": -1029.5927734375, "loss": 0.0507, "rewards/chosen": 5.646115736527876, "rewards/margins": 14.262811434018861, "rewards/rejected": -8.616695697490986, "step": 360 }, { "epoch": 0.09894477182403727, "grad_norm": 9.9375, "kl": 2.9127261638641357, "learning_rate": 5e-06, "logits/chosen": -1583598.7692307692, "logits/rejected": 11708122.181818182, "logps/chosen": -408.72352013221155, "logps/rejected": -484.66202059659093, "loss": 0.0456, "rewards/chosen": 4.164166083702674, "rewards/margins": 9.989671427053171, "rewards/rejected": -5.825505343350497, "step": 361 }, { "epoch": 0.09921885706454707, "grad_norm": 18.25, "kl": 8.243597984313965, "learning_rate": 5e-06, "logits/chosen": 3505608.470588235, "logits/rejected": 4076946.8571428573, "logps/chosen": -407.02355238970586, "logps/rejected": -664.6607840401786, "loss": 0.1496, "rewards/chosen": 3.9354207655962776, "rewards/margins": 10.993010705258666, "rewards/rejected": -7.057589939662388, "step": 362 }, { "epoch": 0.09949294230505687, "grad_norm": 10.0625, "kl": 8.164571762084961, "learning_rate": 5e-06, "logits/chosen": -17709230.545454547, "logits/rejected": 1551192.6153846155, "logps/chosen": -503.4070933948864, "logps/rejected": -544.8983999399038, "loss": 0.0341, "rewards/chosen": 5.116266424005682, "rewards/margins": 11.08907819627882, "rewards/rejected": -5.972811772273137, "step": 363 }, { "epoch": 0.09976702754556667, "grad_norm": 14.0625, "kl": 3.1012039184570312, "learning_rate": 5e-06, "logits/chosen": -14781931.076923076, "logits/rejected": -14546539.636363637, "logps/chosen": -476.02591646634613, "logps/rejected": -451.92684659090907, "loss": 0.0632, "rewards/chosen": 5.36757073035607, "rewards/margins": 10.126421868384302, "rewards/rejected": -4.7588511380282315, "step": 364 }, { "epoch": 0.10004111278607647, "grad_norm": 15.875, "kl": 4.846736431121826, "learning_rate": 5e-06, "logits/chosen": 9237574.285714285, "logits/rejected": 3976045.6, "logps/chosen": -458.769287109375, "logps/rejected": -459.04560546875, "loss": 0.0838, "rewards/chosen": 4.842789786202567, "rewards/margins": 9.43100313459124, "rewards/rejected": -4.588213348388672, "step": 365 }, { "epoch": 0.10031519802658627, "grad_norm": 7.78125, "kl": 3.1894302368164062, "learning_rate": 5e-06, "logits/chosen": -22309379.555555556, "logits/rejected": -5540848.0, "logps/chosen": -458.9069552951389, "logps/rejected": -466.1834309895833, "loss": 0.0363, "rewards/chosen": 4.928116268581814, "rewards/margins": 9.858646053738063, "rewards/rejected": -4.93052978515625, "step": 366 }, { "epoch": 0.10058928326709607, "grad_norm": 21.5, "kl": 9.371637344360352, "learning_rate": 5e-06, "logits/chosen": 12825353.846153846, "logits/rejected": 12731824.0, "logps/chosen": -314.9335186298077, "logps/rejected": -421.6270862926136, "loss": 0.1408, "rewards/chosen": 4.004116351787861, "rewards/margins": 7.789084294459203, "rewards/rejected": -3.7849679426713423, "step": 367 }, { "epoch": 0.10086336850760587, "grad_norm": 19.75, "kl": 5.2935709953308105, "learning_rate": 5e-06, "logits/chosen": 6325059.636363637, "logits/rejected": 11292257.23076923, "logps/chosen": -461.5269886363636, "logps/rejected": -477.5490910456731, "loss": 0.1315, "rewards/chosen": 4.666685624556108, "rewards/margins": 8.124520308487899, "rewards/rejected": -3.457834683931791, "step": 368 }, { "epoch": 0.10113745374811567, "grad_norm": 7.96875, "kl": 3.227548599243164, "learning_rate": 5e-06, "logits/chosen": -4774811.2, "logits/rejected": 321147.1111111111, "logps/chosen": -373.49095052083334, "logps/rejected": -469.25889756944446, "loss": 0.053, "rewards/chosen": 4.257208251953125, "rewards/margins": 10.302008480495877, "rewards/rejected": -6.044800228542751, "step": 369 }, { "epoch": 0.10141153898862547, "grad_norm": 10.9375, "kl": 3.425731658935547, "learning_rate": 5e-06, "logits/chosen": 4618500.0, "logits/rejected": -10885377.23076923, "logps/chosen": -399.13924893465907, "logps/rejected": -634.8315805288462, "loss": 0.035, "rewards/chosen": 5.7440032958984375, "rewards/margins": 11.910483140211838, "rewards/rejected": -6.166479844313401, "step": 370 }, { "epoch": 0.10168562422913525, "grad_norm": 7.53125, "kl": 2.958395004272461, "learning_rate": 5e-06, "logits/chosen": 2868309.8181818184, "logits/rejected": 7781420.923076923, "logps/chosen": -556.6782670454545, "logps/rejected": -494.76419771634613, "loss": 0.0354, "rewards/chosen": 7.192312067205256, "rewards/margins": 12.871020657199246, "rewards/rejected": -5.67870858999399, "step": 371 }, { "epoch": 0.10195970946964505, "grad_norm": 9.625, "kl": 4.565187454223633, "learning_rate": 5e-06, "logits/chosen": 2550867.2, "logits/rejected": -9135171.42857143, "logps/chosen": -458.90771484375, "logps/rejected": -357.43980189732144, "loss": 0.0967, "rewards/chosen": 5.349785995483399, "rewards/margins": 9.459080614362446, "rewards/rejected": -4.109294618879046, "step": 372 }, { "epoch": 0.10223379471015485, "grad_norm": 19.375, "kl": 13.114043235778809, "learning_rate": 5e-06, "logits/chosen": 16627749.333333334, "logits/rejected": 4657855.111111111, "logps/chosen": -481.7962239583333, "logps/rejected": -451.67377387152777, "loss": 0.1969, "rewards/chosen": 5.295735168457031, "rewards/margins": 8.98633100721571, "rewards/rejected": -3.6905958387586804, "step": 373 }, { "epoch": 0.10250787995066465, "grad_norm": 9.625, "kl": 13.949646949768066, "learning_rate": 5e-06, "logits/chosen": -11078638.76923077, "logits/rejected": 20588440.727272727, "logps/chosen": -468.18348106971155, "logps/rejected": -531.9199662642045, "loss": 0.0586, "rewards/chosen": 6.2476348876953125, "rewards/margins": 11.495819785378195, "rewards/rejected": -5.248184897682884, "step": 374 }, { "epoch": 0.10278196519117445, "grad_norm": 8.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -216040.0, "logits/rejected": -15845426.285714285, "logps/chosen": -536.52978515625, "logps/rejected": -549.4783761160714, "loss": 0.0278, "rewards/chosen": 5.194533920288086, "rewards/margins": 11.767646953037808, "rewards/rejected": -6.573113032749721, "step": 375 }, { "epoch": 0.10305605043168425, "grad_norm": 8.4375, "kl": 5.845787048339844, "learning_rate": 5e-06, "logits/chosen": 3430472.3333333335, "logits/rejected": -37479450.666666664, "logps/chosen": -467.2042236328125, "logps/rejected": -578.019775390625, "loss": 0.0392, "rewards/chosen": 4.964664459228516, "rewards/margins": 11.174613952636719, "rewards/rejected": -6.209949493408203, "step": 376 }, { "epoch": 0.10333013567219405, "grad_norm": 10.3125, "kl": 2.199061155319214, "learning_rate": 5e-06, "logits/chosen": 4752562.133333334, "logits/rejected": -7086947.555555556, "logps/chosen": -411.28430989583336, "logps/rejected": -539.9830729166666, "loss": 0.0641, "rewards/chosen": 5.188321431477864, "rewards/margins": 10.399413638644749, "rewards/rejected": -5.2110922071668835, "step": 377 }, { "epoch": 0.10360422091270385, "grad_norm": 9.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -17745344.0, "logits/rejected": 13748669.866666667, "logps/chosen": -308.31456163194446, "logps/rejected": -532.8786458333333, "loss": 0.0593, "rewards/chosen": 3.535329394870334, "rewards/margins": 9.471394051445856, "rewards/rejected": -5.936064656575521, "step": 378 }, { "epoch": 0.10387830615321365, "grad_norm": 11.25, "kl": 6.540373802185059, "learning_rate": 5e-06, "logits/chosen": -3510293.066666667, "logits/rejected": -641320.0, "logps/chosen": -440.75677083333335, "logps/rejected": -364.64374457465277, "loss": 0.0737, "rewards/chosen": 4.4596099853515625, "rewards/margins": 8.87327660454644, "rewards/rejected": -4.413666619194879, "step": 379 }, { "epoch": 0.10415239139372345, "grad_norm": 13.0625, "kl": 6.113406181335449, "learning_rate": 5e-06, "logits/chosen": 20384891.636363637, "logits/rejected": 10872384.0, "logps/chosen": -454.14346590909093, "logps/rejected": -497.1989933894231, "loss": 0.0712, "rewards/chosen": 5.2230307839133525, "rewards/margins": 11.822215847202115, "rewards/rejected": -6.599185063288762, "step": 380 }, { "epoch": 0.10442647663423325, "grad_norm": 12.5, "kl": 12.40969467163086, "learning_rate": 5e-06, "logits/chosen": 3223406.933333333, "logits/rejected": 38076576.0, "logps/chosen": -620.93515625, "logps/rejected": -421.50355360243054, "loss": 0.0962, "rewards/chosen": 5.6459503173828125, "rewards/margins": 10.100438435872395, "rewards/rejected": -4.454488118489583, "step": 381 }, { "epoch": 0.10470056187474304, "grad_norm": 14.0625, "kl": 9.922893524169922, "learning_rate": 5e-06, "logits/chosen": 1282458.857142857, "logits/rejected": -3058581.8, "logps/chosen": -430.85693359375, "logps/rejected": -533.255615234375, "loss": 0.0808, "rewards/chosen": 5.612513950892857, "rewards/margins": 12.721558598109652, "rewards/rejected": -7.1090446472167965, "step": 382 }, { "epoch": 0.10497464711525284, "grad_norm": 9.3125, "kl": 4.028563499450684, "learning_rate": 5e-06, "logits/chosen": -12041236.57142857, "logits/rejected": 12947410.4, "logps/chosen": -368.94984654017856, "logps/rejected": -544.012451171875, "loss": 0.0633, "rewards/chosen": 4.221489225115095, "rewards/margins": 10.779545865740094, "rewards/rejected": -6.558056640625, "step": 383 }, { "epoch": 0.10524873235576264, "grad_norm": 3.0, "kl": 0.9948133230209351, "learning_rate": 5e-06, "logits/chosen": -7333005.090909091, "logits/rejected": -5737601.230769231, "logps/chosen": -497.0949041193182, "logps/rejected": -588.9089167668269, "loss": 0.0118, "rewards/chosen": 5.599761616099965, "rewards/margins": 12.479461910007718, "rewards/rejected": -6.879700293907752, "step": 384 }, { "epoch": 0.10552281759627244, "grad_norm": 11.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 6530182.0, "logits/rejected": 3827299.4285714286, "logps/chosen": -413.419287109375, "logps/rejected": -413.5795200892857, "loss": 0.0945, "rewards/chosen": 5.414896774291992, "rewards/margins": 9.934168352399553, "rewards/rejected": -4.519271578107562, "step": 385 }, { "epoch": 0.10579690283678224, "grad_norm": 16.625, "kl": 7.2806854248046875, "learning_rate": 5e-06, "logits/chosen": 10021854.545454545, "logits/rejected": 5942677.538461538, "logps/chosen": -476.0545099431818, "logps/rejected": -370.1890399639423, "loss": 0.17, "rewards/chosen": 4.42681260542436, "rewards/margins": 8.896024770670003, "rewards/rejected": -4.469212165245643, "step": 386 }, { "epoch": 0.10607098807729204, "grad_norm": 7.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 1108325.142857143, "logits/rejected": -17699843.76470588, "logps/chosen": -317.35226004464283, "logps/rejected": -507.4276769301471, "loss": 0.0458, "rewards/chosen": 3.9186297825404575, "rewards/margins": 8.654878023291836, "rewards/rejected": -4.736248240751379, "step": 387 }, { "epoch": 0.10634507331780184, "grad_norm": 12.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 9803357.818181818, "logits/rejected": -19441282.46153846, "logps/chosen": -496.7515980113636, "logps/rejected": -562.0519080528846, "loss": 0.0696, "rewards/chosen": 5.582048242742365, "rewards/margins": 12.599089962619168, "rewards/rejected": -7.0170417198768025, "step": 388 }, { "epoch": 0.10661915855831164, "grad_norm": 8.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -3630399.5, "logits/rejected": 18051842.0, "logps/chosen": -442.54681396484375, "logps/rejected": -481.6558532714844, "loss": 0.0465, "rewards/chosen": 4.568587303161621, "rewards/margins": 10.411843299865723, "rewards/rejected": -5.843255996704102, "step": 389 }, { "epoch": 0.10689324379882144, "grad_norm": 9.25, "kl": 1.1936544179916382, "learning_rate": 5e-06, "logits/chosen": 4993397.333333333, "logits/rejected": 3522146.1333333333, "logps/chosen": -459.7312282986111, "logps/rejected": -415.7174479166667, "loss": 0.067, "rewards/chosen": 4.6838573879665795, "rewards/margins": 9.4553712632921, "rewards/rejected": -4.771513875325521, "step": 390 }, { "epoch": 0.10716732903933124, "grad_norm": 9.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 10646230.4, "logits/rejected": 7371471.428571428, "logps/chosen": -334.169580078125, "logps/rejected": -562.5219377790179, "loss": 0.0868, "rewards/chosen": 3.3019550323486326, "rewards/margins": 10.208338001796177, "rewards/rejected": -6.906382969447544, "step": 391 }, { "epoch": 0.10744141427984102, "grad_norm": 7.75, "kl": 3.874619960784912, "learning_rate": 5e-06, "logits/chosen": -14227780.363636363, "logits/rejected": -524540.0, "logps/chosen": -513.6878107244319, "logps/rejected": -361.2426006610577, "loss": 0.0451, "rewards/chosen": 5.7338738874955615, "rewards/margins": 10.015842224334504, "rewards/rejected": -4.2819683368389425, "step": 392 }, { "epoch": 0.10771549952035082, "grad_norm": 9.5, "kl": 9.538880348205566, "learning_rate": 5e-06, "logits/chosen": -1072111.142857143, "logits/rejected": 60120844.8, "logps/chosen": -527.1409737723214, "logps/rejected": -619.152880859375, "loss": 0.1049, "rewards/chosen": 5.260035378592355, "rewards/margins": 12.980886513846261, "rewards/rejected": -7.720851135253906, "step": 393 }, { "epoch": 0.10798958476086062, "grad_norm": 21.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 16458166.4, "logits/rejected": 3664633.714285714, "logps/chosen": -384.166552734375, "logps/rejected": -562.4048549107143, "loss": 0.0635, "rewards/chosen": 3.8510818481445312, "rewards/margins": 10.844894409179688, "rewards/rejected": -6.993812561035156, "step": 394 }, { "epoch": 0.10826367000137042, "grad_norm": 12.625, "kl": 3.019012451171875, "learning_rate": 5e-06, "logits/chosen": -10347966.666666666, "logits/rejected": 25305970.666666668, "logps/chosen": -439.0441080729167, "logps/rejected": -666.35107421875, "loss": 0.0689, "rewards/chosen": 4.786215464274089, "rewards/margins": 11.87702751159668, "rewards/rejected": -7.090812047322591, "step": 395 }, { "epoch": 0.10853775524188022, "grad_norm": 10.75, "kl": 0.815506637096405, "learning_rate": 5e-06, "logits/chosen": 9216539.636363637, "logits/rejected": 6513662.153846154, "logps/chosen": -547.1751154119319, "logps/rejected": -521.7282527043269, "loss": 0.0557, "rewards/chosen": 3.9793451482599433, "rewards/margins": 9.452274295833561, "rewards/rejected": -5.472929147573618, "step": 396 }, { "epoch": 0.10881184048239002, "grad_norm": 11.125, "kl": 2.2020812034606934, "learning_rate": 5e-06, "logits/chosen": -27553773.333333332, "logits/rejected": 15273142.666666666, "logps/chosen": -424.37255859375, "logps/rejected": -491.4037272135417, "loss": 0.0638, "rewards/chosen": 4.9065901438395185, "rewards/margins": 10.74013392130534, "rewards/rejected": -5.83354377746582, "step": 397 }, { "epoch": 0.10908592572289982, "grad_norm": 13.9375, "kl": 10.4578857421875, "learning_rate": 5e-06, "logits/chosen": 1187757.8181818181, "logits/rejected": -5323793.846153846, "logps/chosen": -409.53879616477275, "logps/rejected": -444.86733774038464, "loss": 0.116, "rewards/chosen": 4.526841597123579, "rewards/margins": 11.752539041159036, "rewards/rejected": -7.225697444035457, "step": 398 }, { "epoch": 0.10936001096340962, "grad_norm": 13.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -666236.6666666666, "logits/rejected": -1785195.6666666667, "logps/chosen": -483.5561930338542, "logps/rejected": -387.2552490234375, "loss": 0.08, "rewards/chosen": 4.0673214594523115, "rewards/margins": 7.719374656677246, "rewards/rejected": -3.652053197224935, "step": 399 }, { "epoch": 0.10963409620391942, "grad_norm": 10.375, "kl": 2.1788525581359863, "learning_rate": 5e-06, "logits/chosen": -2113.6666666666665, "logits/rejected": -3318004.0, "logps/chosen": -404.5034993489583, "logps/rejected": -498.3372395833333, "loss": 0.0786, "rewards/chosen": 4.300940831502278, "rewards/margins": 9.744958877563477, "rewards/rejected": -5.444018046061198, "step": 400 }, { "epoch": 0.10990818144442922, "grad_norm": 17.75, "kl": 2.4937453269958496, "learning_rate": 5e-06, "logits/chosen": 10246278.0, "logits/rejected": 1525123.6666666667, "logps/chosen": -432.0513102213542, "logps/rejected": -441.5913899739583, "loss": 0.1199, "rewards/chosen": 3.398935317993164, "rewards/margins": 8.591543515523274, "rewards/rejected": -5.19260819753011, "step": 401 }, { "epoch": 0.11018226668493901, "grad_norm": 4.09375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -3207872.3636363638, "logits/rejected": 25495515.076923076, "logps/chosen": -539.0189098011364, "logps/rejected": -475.51600060096155, "loss": 0.0448, "rewards/chosen": 5.247100830078125, "rewards/margins": 12.623350290151743, "rewards/rejected": -7.376249460073618, "step": 402 }, { "epoch": 0.11045635192544881, "grad_norm": 8.5, "kl": 3.560702085494995, "learning_rate": 5e-06, "logits/chosen": -17612080.0, "logits/rejected": 8707564.307692308, "logps/chosen": -476.72998046875, "logps/rejected": -318.03673377403845, "loss": 0.0616, "rewards/chosen": 4.386832150545987, "rewards/margins": 8.622750342309057, "rewards/rejected": -4.235918191763071, "step": 403 }, { "epoch": 0.11073043716595861, "grad_norm": 19.75, "kl": 14.514243125915527, "learning_rate": 5e-06, "logits/chosen": -7514420.0, "logits/rejected": 17417088.0, "logps/chosen": -467.595703125, "logps/rejected": -312.0213317871094, "loss": 0.1554, "rewards/chosen": 5.004546165466309, "rewards/margins": 7.014521598815918, "rewards/rejected": -2.0099754333496094, "step": 404 }, { "epoch": 0.11100452240646841, "grad_norm": 14.3125, "kl": 5.245902061462402, "learning_rate": 5e-06, "logits/chosen": -5744848.0, "logits/rejected": 5922108.4, "logps/chosen": -408.44594029017856, "logps/rejected": -467.51474609375, "loss": 0.1229, "rewards/chosen": 4.717265537806919, "rewards/margins": 10.405254200526645, "rewards/rejected": -5.687988662719727, "step": 405 }, { "epoch": 0.11127860764697821, "grad_norm": 5.25, "kl": 1.5991802215576172, "learning_rate": 5e-06, "logits/chosen": 3106993.4, "logits/rejected": 42130907.428571425, "logps/chosen": -406.222802734375, "logps/rejected": -589.76806640625, "loss": 0.0203, "rewards/chosen": 5.550785827636719, "rewards/margins": 12.39765134538923, "rewards/rejected": -6.846865517752511, "step": 406 }, { "epoch": 0.11155269288748801, "grad_norm": 8.9375, "kl": 6.116461753845215, "learning_rate": 5e-06, "logits/chosen": -430017.14285714284, "logits/rejected": 3349842.2, "logps/chosen": -333.46407645089283, "logps/rejected": -369.2785888671875, "loss": 0.0973, "rewards/chosen": 4.202733993530273, "rewards/margins": 8.159285354614259, "rewards/rejected": -3.956551361083984, "step": 407 }, { "epoch": 0.11182677812799781, "grad_norm": 10.5, "kl": 8.523906707763672, "learning_rate": 5e-06, "logits/chosen": -15628277.0, "logits/rejected": -6887793.0, "logps/chosen": -471.76092529296875, "logps/rejected": -359.3701477050781, "loss": 0.1024, "rewards/chosen": 5.244580268859863, "rewards/margins": 10.839628219604492, "rewards/rejected": -5.595047950744629, "step": 408 }, { "epoch": 0.11210086336850761, "grad_norm": 13.4375, "kl": 2.967818021774292, "learning_rate": 5e-06, "logits/chosen": -15955392.0, "logits/rejected": -7449380.923076923, "logps/chosen": -482.66428444602275, "logps/rejected": -599.7571364182693, "loss": 0.0654, "rewards/chosen": 4.474745663729581, "rewards/margins": 11.295598090111792, "rewards/rejected": -6.820852426382212, "step": 409 }, { "epoch": 0.11237494860901741, "grad_norm": 4.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -4290071.555555556, "logits/rejected": -6525326.933333334, "logps/chosen": -411.1657986111111, "logps/rejected": -472.92975260416665, "loss": 0.0119, "rewards/chosen": 7.72343275282118, "rewards/margins": 14.659793429904514, "rewards/rejected": -6.936360677083333, "step": 410 }, { "epoch": 0.11264903384952721, "grad_norm": 13.3125, "kl": 4.662321090698242, "learning_rate": 5e-06, "logits/chosen": 4655643.384615385, "logits/rejected": -6582004.363636363, "logps/chosen": -489.75826322115387, "logps/rejected": -490.99995561079544, "loss": 0.0662, "rewards/chosen": 4.497153062086839, "rewards/margins": 10.713255662184494, "rewards/rejected": -6.216102600097656, "step": 411 }, { "epoch": 0.11292311909003701, "grad_norm": 11.5, "kl": 4.333498001098633, "learning_rate": 5e-06, "logits/chosen": -13396376.615384616, "logits/rejected": -19145629.09090909, "logps/chosen": -375.6730769230769, "logps/rejected": -495.4718572443182, "loss": 0.065, "rewards/chosen": 4.358804556039663, "rewards/margins": 10.14560632772379, "rewards/rejected": -5.786801771684126, "step": 412 }, { "epoch": 0.1131972043305468, "grad_norm": 12.9375, "kl": 1.5558459758758545, "learning_rate": 5e-06, "logits/chosen": -1488176.6153846155, "logits/rejected": -13841104.0, "logps/chosen": -495.03579477163464, "logps/rejected": -599.1341441761364, "loss": 0.0543, "rewards/chosen": 5.21119866004357, "rewards/margins": 11.875739731155075, "rewards/rejected": -6.664541071111506, "step": 413 }, { "epoch": 0.1134712895710566, "grad_norm": 10.0625, "kl": 0.3039652705192566, "learning_rate": 5e-06, "logits/chosen": -7698110.0, "logits/rejected": 1821611.5, "logps/chosen": -397.0900573730469, "logps/rejected": -531.03076171875, "loss": 0.0456, "rewards/chosen": 4.839269161224365, "rewards/margins": 11.225603580474854, "rewards/rejected": -6.386334419250488, "step": 414 }, { "epoch": 0.1137453748115664, "grad_norm": 7.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -19270824.888888888, "logits/rejected": 18100196.266666666, "logps/chosen": -528.3715277777778, "logps/rejected": -592.5204427083333, "loss": 0.0282, "rewards/chosen": 4.957862430148655, "rewards/margins": 12.618770429823133, "rewards/rejected": -7.660907999674479, "step": 415 }, { "epoch": 0.1140194600520762, "grad_norm": 12.4375, "kl": 1.437565565109253, "learning_rate": 5e-06, "logits/chosen": 3215716.4444444445, "logits/rejected": 9945175.466666667, "logps/chosen": -336.6449381510417, "logps/rejected": -516.6180989583333, "loss": 0.1174, "rewards/chosen": 2.7909374237060547, "rewards/margins": 7.929811223347982, "rewards/rejected": -5.138873799641927, "step": 416 }, { "epoch": 0.114293545292586, "grad_norm": 8.0, "kl": 3.5195860862731934, "learning_rate": 5e-06, "logits/chosen": 18414855.272727273, "logits/rejected": 32377708.307692308, "logps/chosen": -623.5820756392045, "logps/rejected": -731.8547175480769, "loss": 0.0258, "rewards/chosen": 6.177843960848722, "rewards/margins": 17.116611614093912, "rewards/rejected": -10.938767653245192, "step": 417 }, { "epoch": 0.1145676305330958, "grad_norm": 14.9375, "kl": 11.458067893981934, "learning_rate": 5e-06, "logits/chosen": -12164310.588235294, "logits/rejected": -17830915.42857143, "logps/chosen": -388.7544806985294, "logps/rejected": -462.2735072544643, "loss": 0.0856, "rewards/chosen": 4.858300601734834, "rewards/margins": 11.368716905097, "rewards/rejected": -6.510416303362165, "step": 418 }, { "epoch": 0.1148417157736056, "grad_norm": 6.25, "kl": 3.2085494995117188, "learning_rate": 5e-06, "logits/chosen": 2921664.1818181816, "logits/rejected": -1772660.923076923, "logps/chosen": -391.63578657670456, "logps/rejected": -496.2317082331731, "loss": 0.024, "rewards/chosen": 4.800546125932173, "rewards/margins": 12.214680224865466, "rewards/rejected": -7.414134098933293, "step": 419 }, { "epoch": 0.1151158010141154, "grad_norm": 13.875, "kl": 8.706876754760742, "learning_rate": 5e-06, "logits/chosen": 12038871.111111112, "logits/rejected": 12916477.333333334, "logps/chosen": -341.86943901909723, "logps/rejected": -424.3650716145833, "loss": 0.1673, "rewards/chosen": 4.215212927924262, "rewards/margins": 8.325309541490343, "rewards/rejected": -4.110096613566081, "step": 420 }, { "epoch": 0.11538988625462519, "grad_norm": 8.9375, "kl": 0.7990188598632812, "learning_rate": 5e-06, "logits/chosen": -16539252.0, "logits/rejected": -7891428.0, "logps/chosen": -493.7884521484375, "logps/rejected": -472.4351399739583, "loss": 0.0267, "rewards/chosen": 4.8810930252075195, "rewards/margins": 11.223545392354328, "rewards/rejected": -6.34245236714681, "step": 421 }, { "epoch": 0.11566397149513499, "grad_norm": 9.375, "kl": 2.6363677978515625, "learning_rate": 5e-06, "logits/chosen": -14195959.272727273, "logits/rejected": -8672962.461538462, "logps/chosen": -445.5892223011364, "logps/rejected": -438.87680288461536, "loss": 0.038, "rewards/chosen": 5.242577639493075, "rewards/margins": 10.599408529855154, "rewards/rejected": -5.35683089036208, "step": 422 }, { "epoch": 0.11593805673564478, "grad_norm": 8.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -13663899.076923076, "logits/rejected": -3723773.090909091, "logps/chosen": -356.7087590144231, "logps/rejected": -391.03608842329544, "loss": 0.0435, "rewards/chosen": 4.336830139160156, "rewards/margins": 10.388533852317117, "rewards/rejected": -6.05170371315696, "step": 423 }, { "epoch": 0.11621214197615458, "grad_norm": 9.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 4023783.3333333335, "logits/rejected": 29415626.666666668, "logps/chosen": -433.2442626953125, "logps/rejected": -422.5835774739583, "loss": 0.0531, "rewards/chosen": 5.0741316477457685, "rewards/margins": 10.777281443277996, "rewards/rejected": -5.703149795532227, "step": 424 }, { "epoch": 0.11648622721666438, "grad_norm": 8.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -3326568.6666666665, "logits/rejected": 29719957.333333332, "logps/chosen": -463.706787109375, "logps/rejected": -384.6886800130208, "loss": 0.0408, "rewards/chosen": 4.230129559834798, "rewards/margins": 9.778522809346516, "rewards/rejected": -5.548393249511719, "step": 425 }, { "epoch": 0.11676031245717418, "grad_norm": 12.5625, "kl": 11.489250183105469, "learning_rate": 5e-06, "logits/chosen": 3004155.111111111, "logits/rejected": 3887718.0, "logps/chosen": -470.66069878472223, "logps/rejected": -317.6863199869792, "loss": 0.0941, "rewards/chosen": 5.331384870741102, "rewards/margins": 9.732485983106825, "rewards/rejected": -4.401101112365723, "step": 426 }, { "epoch": 0.11703439769768398, "grad_norm": 17.375, "kl": 6.727948188781738, "learning_rate": 5e-06, "logits/chosen": -24456662.85714286, "logits/rejected": 15297899.2, "logps/chosen": -519.6393694196429, "logps/rejected": -438.657177734375, "loss": 0.0851, "rewards/chosen": 4.337975365774972, "rewards/margins": 12.346113640921455, "rewards/rejected": -8.008138275146484, "step": 427 }, { "epoch": 0.11730848293819378, "grad_norm": 6.9375, "kl": 7.775592803955078, "learning_rate": 5e-06, "logits/chosen": -18425237.714285713, "logits/rejected": 3766884.0, "logps/chosen": -497.04931640625, "logps/rejected": -384.038671875, "loss": 0.0328, "rewards/chosen": 6.203835623604911, "rewards/margins": 11.072706168038504, "rewards/rejected": -4.868870544433594, "step": 428 }, { "epoch": 0.11758256817870358, "grad_norm": 6.78125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -7550387.0, "logits/rejected": -5048006.0, "logps/chosen": -437.6584167480469, "logps/rejected": -530.2982177734375, "loss": 0.0709, "rewards/chosen": 4.545597076416016, "rewards/margins": 10.871126174926758, "rewards/rejected": -6.325529098510742, "step": 429 }, { "epoch": 0.11785665341921338, "grad_norm": 11.6875, "kl": 0.5750173330307007, "learning_rate": 5e-06, "logits/chosen": -233086.15384615384, "logits/rejected": -2284518.0, "logps/chosen": -434.5003004807692, "logps/rejected": -447.03884055397725, "loss": 0.0471, "rewards/chosen": 4.457581153282752, "rewards/margins": 10.256158895425862, "rewards/rejected": -5.79857774214311, "step": 430 }, { "epoch": 0.11813073865972318, "grad_norm": 8.75, "kl": 1.9956697225570679, "learning_rate": 5e-06, "logits/chosen": 11909627.2, "logits/rejected": -223967.42857142858, "logps/chosen": -391.791259765625, "logps/rejected": -432.71700613839283, "loss": 0.0612, "rewards/chosen": 5.969361877441406, "rewards/margins": 11.088246917724609, "rewards/rejected": -5.118885040283203, "step": 431 }, { "epoch": 0.11840482390023298, "grad_norm": 12.4375, "kl": 11.777565956115723, "learning_rate": 5e-06, "logits/chosen": 9536930.133333333, "logits/rejected": -3777880.0, "logps/chosen": -378.4199544270833, "logps/rejected": -461.8505859375, "loss": 0.1284, "rewards/chosen": 4.021975199381511, "rewards/margins": 8.819593641493057, "rewards/rejected": -4.7976184421115455, "step": 432 }, { "epoch": 0.11867890914074278, "grad_norm": 14.25, "kl": 7.559208393096924, "learning_rate": 5e-06, "logits/chosen": -3685064.0, "logits/rejected": 7974252.0, "logps/chosen": -453.21072823660717, "logps/rejected": -452.85166015625, "loss": 0.1142, "rewards/chosen": 4.575423104422433, "rewards/margins": 9.679639489310128, "rewards/rejected": -5.104216384887695, "step": 433 }, { "epoch": 0.11895299438125256, "grad_norm": 10.625, "kl": 3.732858419418335, "learning_rate": 5e-06, "logits/chosen": -10687925.6, "logits/rejected": -12137588.57142857, "logps/chosen": -595.43896484375, "logps/rejected": -470.03578404017856, "loss": 0.0667, "rewards/chosen": 5.797731018066406, "rewards/margins": 11.311407252720425, "rewards/rejected": -5.513676234654018, "step": 434 }, { "epoch": 0.11922707962176236, "grad_norm": 10.3125, "kl": 6.808411598205566, "learning_rate": 5e-06, "logits/chosen": -14575180.57142857, "logits/rejected": -14702224.0, "logps/chosen": -468.2163783482143, "logps/rejected": -401.5806396484375, "loss": 0.0524, "rewards/chosen": 4.191730771745954, "rewards/margins": 9.41584881373814, "rewards/rejected": -5.224118041992187, "step": 435 }, { "epoch": 0.11950116486227216, "grad_norm": 8.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -5027998.933333334, "logits/rejected": -1982009.7777777778, "logps/chosen": -296.90032552083335, "logps/rejected": -681.9200846354166, "loss": 0.0715, "rewards/chosen": 3.5995712280273438, "rewards/margins": 11.70694308810764, "rewards/rejected": -8.107371860080296, "step": 436 }, { "epoch": 0.11977525010278196, "grad_norm": 6.21875, "kl": 1.868575096130371, "learning_rate": 5e-06, "logits/chosen": -22727918.222222224, "logits/rejected": -11467808.0, "logps/chosen": -513.1781141493055, "logps/rejected": -379.51793619791664, "loss": 0.0404, "rewards/chosen": 5.166791280110677, "rewards/margins": 10.032808430989583, "rewards/rejected": -4.866017150878906, "step": 437 }, { "epoch": 0.12004933534329176, "grad_norm": 13.125, "kl": 1.3064229488372803, "learning_rate": 5e-06, "logits/chosen": -1528856.6666666667, "logits/rejected": 5685832.666666667, "logps/chosen": -410.4261881510417, "logps/rejected": -364.9016927083333, "loss": 0.1029, "rewards/chosen": 3.6042772928873696, "rewards/margins": 8.596930185953775, "rewards/rejected": -4.992652893066406, "step": 438 }, { "epoch": 0.12032342058380156, "grad_norm": 13.8125, "kl": 1.6372134685516357, "learning_rate": 5e-06, "logits/chosen": 22419186.0, "logits/rejected": 23937180.0, "logps/chosen": -477.5968322753906, "logps/rejected": -492.0136413574219, "loss": 0.0803, "rewards/chosen": 4.7312541007995605, "rewards/margins": 11.434528827667236, "rewards/rejected": -6.703274726867676, "step": 439 }, { "epoch": 0.12059750582431136, "grad_norm": 7.09375, "kl": 5.7635297775268555, "learning_rate": 5e-06, "logits/chosen": -7858690.0, "logits/rejected": -3217337.6666666665, "logps/chosen": -420.45361328125, "logps/rejected": -355.1445719401042, "loss": 0.0712, "rewards/chosen": 5.364744186401367, "rewards/margins": 11.879725138346355, "rewards/rejected": -6.514980951944987, "step": 440 }, { "epoch": 0.12087159106482116, "grad_norm": 3.390625, "kl": 5.21973180770874, "learning_rate": 5e-06, "logits/chosen": -11270036.923076924, "logits/rejected": 21840215.272727273, "logps/chosen": -441.7110126201923, "logps/rejected": -573.6541637073864, "loss": 0.0175, "rewards/chosen": 5.8538947472205525, "rewards/margins": 14.265495727112242, "rewards/rejected": -8.41160097989169, "step": 441 }, { "epoch": 0.12114567630533096, "grad_norm": 7.8125, "kl": 1.7652950286865234, "learning_rate": 5e-06, "logits/chosen": -14295597.090909092, "logits/rejected": -8470441.846153846, "logps/chosen": -396.2638494318182, "logps/rejected": -565.4736328125, "loss": 0.0382, "rewards/chosen": 5.062072060324929, "rewards/margins": 11.54180838844993, "rewards/rejected": -6.479736328125, "step": 442 }, { "epoch": 0.12141976154584076, "grad_norm": 15.4375, "kl": 3.535595655441284, "learning_rate": 5e-06, "logits/chosen": -5138544.7272727275, "logits/rejected": 2369894.769230769, "logps/chosen": -357.2017711292614, "logps/rejected": -447.4820087139423, "loss": 0.069, "rewards/chosen": 4.752412275834517, "rewards/margins": 9.821964183887403, "rewards/rejected": -5.069551908052885, "step": 443 }, { "epoch": 0.12169384678635055, "grad_norm": 7.65625, "kl": 5.171342849731445, "learning_rate": 5e-06, "logits/chosen": 3158433.1428571427, "logits/rejected": 91625.2, "logps/chosen": -525.6520647321429, "logps/rejected": -410.87998046875, "loss": 0.0633, "rewards/chosen": 4.711139678955078, "rewards/margins": 11.090646362304687, "rewards/rejected": -6.379506683349609, "step": 444 }, { "epoch": 0.12196793202686035, "grad_norm": 13.875, "kl": 7.881667613983154, "learning_rate": 5e-06, "logits/chosen": -16214974.857142856, "logits/rejected": -15484476.8, "logps/chosen": -424.52933175223217, "logps/rejected": -486.9, "loss": 0.0533, "rewards/chosen": 4.297619683401925, "rewards/margins": 10.739328057425363, "rewards/rejected": -6.441708374023437, "step": 445 }, { "epoch": 0.12224201726737015, "grad_norm": 64.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 12903280.0, "logits/rejected": -11975392.0, "logps/chosen": -497.42569247159093, "logps/rejected": -374.8212139423077, "loss": 0.0921, "rewards/chosen": 4.8758392333984375, "rewards/margins": 10.177911611703726, "rewards/rejected": -5.302072378305288, "step": 446 }, { "epoch": 0.12251610250787995, "grad_norm": 16.5, "kl": 7.73777961730957, "learning_rate": 5e-06, "logits/chosen": 6214854.4, "logits/rejected": 22107936.0, "logps/chosen": -625.9276692708333, "logps/rejected": -743.7556966145834, "loss": 0.048, "rewards/chosen": 6.159839884440104, "rewards/margins": 16.415760294596353, "rewards/rejected": -10.25592041015625, "step": 447 }, { "epoch": 0.12279018774838975, "grad_norm": 11.625, "kl": 7.255550384521484, "learning_rate": 5e-06, "logits/chosen": 18127763.692307692, "logits/rejected": 2452176.909090909, "logps/chosen": -442.31640625, "logps/rejected": -522.4675071022727, "loss": 0.0855, "rewards/chosen": 4.932028550368089, "rewards/margins": 10.998615131511555, "rewards/rejected": -6.066586581143466, "step": 448 }, { "epoch": 0.12306427298889955, "grad_norm": 11.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -13392304.888888888, "logits/rejected": -4581746.666666667, "logps/chosen": -379.14238823784723, "logps/rejected": -399.48248697916665, "loss": 0.0787, "rewards/chosen": 4.210876888699001, "rewards/margins": 9.513733842637803, "rewards/rejected": -5.302856953938802, "step": 449 }, { "epoch": 0.12333835822940935, "grad_norm": 5.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 10442922.666666666, "logits/rejected": 5991036.8, "logps/chosen": -539.3796115451389, "logps/rejected": -695.9688802083333, "loss": 0.0123, "rewards/chosen": 5.890575408935547, "rewards/margins": 14.102466583251953, "rewards/rejected": -8.211891174316406, "step": 450 }, { "epoch": 0.12361244346991915, "grad_norm": 9.125, "kl": 3.3635027408599854, "learning_rate": 5e-06, "logits/chosen": -11976048.888888888, "logits/rejected": 9342389.333333334, "logps/chosen": -406.0400390625, "logps/rejected": -367.65423177083335, "loss": 0.091, "rewards/chosen": 4.3674875895182295, "rewards/margins": 8.354638163248698, "rewards/rejected": -3.9871505737304687, "step": 451 }, { "epoch": 0.12388652871042895, "grad_norm": 10.4375, "kl": 0.6373850703239441, "learning_rate": 5e-06, "logits/chosen": -12601808.0, "logits/rejected": 1428642.0, "logps/chosen": -440.121142578125, "logps/rejected": -348.94022042410717, "loss": 0.0659, "rewards/chosen": 5.3453319549560545, "rewards/margins": 10.180178451538087, "rewards/rejected": -4.834846496582031, "step": 452 }, { "epoch": 0.12416061395093875, "grad_norm": 12.0625, "kl": 4.79276180267334, "learning_rate": 5e-06, "logits/chosen": -2940508.0, "logits/rejected": -7162262.4, "logps/chosen": -396.97262137276783, "logps/rejected": -626.80634765625, "loss": 0.083, "rewards/chosen": 5.480047498430524, "rewards/margins": 11.15739162990025, "rewards/rejected": -5.677344131469726, "step": 453 }, { "epoch": 0.12443469919144855, "grad_norm": 8.9375, "kl": 5.5915374755859375, "learning_rate": 5e-06, "logits/chosen": -6060873.846153846, "logits/rejected": 1029700.0, "logps/chosen": -400.9353215144231, "logps/rejected": -529.9533913352273, "loss": 0.0375, "rewards/chosen": 5.423340430626502, "rewards/margins": 12.378292323826077, "rewards/rejected": -6.954951893199574, "step": 454 }, { "epoch": 0.12470878443195833, "grad_norm": 10.1875, "kl": 2.5470480918884277, "learning_rate": 5e-06, "logits/chosen": -21044281.6, "logits/rejected": 5689600.0, "logps/chosen": -381.8020833333333, "logps/rejected": -298.42328559027777, "loss": 0.0919, "rewards/chosen": 3.9520118713378904, "rewards/margins": 7.202756585015191, "rewards/rejected": -3.2507447136773004, "step": 455 }, { "epoch": 0.12498286967246813, "grad_norm": 10.25, "kl": 4.531886100769043, "learning_rate": 5e-06, "logits/chosen": 10876402.909090908, "logits/rejected": -687826.9230769231, "logps/chosen": -488.38454367897725, "logps/rejected": -360.48978365384613, "loss": 0.0368, "rewards/chosen": 4.953082691539418, "rewards/margins": 10.720058681247952, "rewards/rejected": -5.766975989708533, "step": 456 }, { "epoch": 0.12525695491297795, "grad_norm": 10.625, "kl": 0.6735445857048035, "learning_rate": 5e-06, "logits/chosen": -1898091.6363636365, "logits/rejected": 8320043.076923077, "logps/chosen": -342.6056019176136, "logps/rejected": -436.19576322115387, "loss": 0.0798, "rewards/chosen": 4.387000344016335, "rewards/margins": 9.348640335189712, "rewards/rejected": -4.961639991173377, "step": 457 }, { "epoch": 0.12553104015348773, "grad_norm": 10.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -10754361.333333334, "logits/rejected": -14138842.666666666, "logps/chosen": -401.8219401041667, "logps/rejected": -519.2270100911459, "loss": 0.0355, "rewards/chosen": 4.588844935099284, "rewards/margins": 13.193602244059246, "rewards/rejected": -8.604757308959961, "step": 458 }, { "epoch": 0.12580512539399755, "grad_norm": 7.59375, "kl": 0.47039923071861267, "learning_rate": 5e-06, "logits/chosen": -12107523.0, "logits/rejected": -14077453.0, "logps/chosen": -409.70672607421875, "logps/rejected": -369.8258056640625, "loss": 0.0556, "rewards/chosen": 4.790868759155273, "rewards/margins": 9.652441024780273, "rewards/rejected": -4.861572265625, "step": 459 }, { "epoch": 0.12607921063450733, "grad_norm": 11.4375, "kl": 4.065341472625732, "learning_rate": 5e-06, "logits/chosen": 8706708.57142857, "logits/rejected": 19295160.0, "logps/chosen": -458.15248325892856, "logps/rejected": -517.084765625, "loss": 0.0669, "rewards/chosen": 5.410501752580915, "rewards/margins": 14.067499433244977, "rewards/rejected": -8.656997680664062, "step": 460 }, { "epoch": 0.12635329587501712, "grad_norm": 11.3125, "kl": 13.221921920776367, "learning_rate": 5e-06, "logits/chosen": 1971047.4666666666, "logits/rejected": -7713275.555555556, "logps/chosen": -405.24931640625, "logps/rejected": -386.20024956597223, "loss": 0.1258, "rewards/chosen": 5.4412180582682295, "rewards/margins": 10.04423607720269, "rewards/rejected": -4.603018018934462, "step": 461 }, { "epoch": 0.12662738111552693, "grad_norm": 10.1875, "kl": 1.1256942749023438, "learning_rate": 5e-06, "logits/chosen": -34256608.0, "logits/rejected": -14039293.0, "logps/chosen": -417.0491027832031, "logps/rejected": -493.0030517578125, "loss": 0.0564, "rewards/chosen": 4.840474605560303, "rewards/margins": 11.41145372390747, "rewards/rejected": -6.570979118347168, "step": 462 }, { "epoch": 0.12690146635603672, "grad_norm": 12.5625, "kl": 7.961170673370361, "learning_rate": 5e-06, "logits/chosen": -10097382.4, "logits/rejected": -15102848.0, "logps/chosen": -414.6386393229167, "logps/rejected": -535.7789713541666, "loss": 0.1701, "rewards/chosen": 4.0880381266276045, "rewards/margins": 10.137106153700088, "rewards/rejected": -6.049068027072483, "step": 463 }, { "epoch": 0.12717555159654653, "grad_norm": 6.6875, "kl": 6.578490257263184, "learning_rate": 5e-06, "logits/chosen": -15815629.333333334, "logits/rejected": 26144360.0, "logps/chosen": -531.19140625, "logps/rejected": -502.2237955729167, "loss": 0.042, "rewards/chosen": 5.8003692626953125, "rewards/margins": 11.146399180094402, "rewards/rejected": -5.346029917399089, "step": 464 }, { "epoch": 0.12744963683705632, "grad_norm": 8.4375, "kl": 0.7406253814697266, "learning_rate": 5e-06, "logits/chosen": 3217378.6666666665, "logits/rejected": -286025.3333333333, "logps/chosen": -431.3926595052083, "logps/rejected": -376.3029378255208, "loss": 0.0869, "rewards/chosen": 4.371631622314453, "rewards/margins": 9.80678876241048, "rewards/rejected": -5.435157140096028, "step": 465 }, { "epoch": 0.12772372207756613, "grad_norm": 8.5625, "kl": 1.3172938823699951, "learning_rate": 5e-06, "logits/chosen": -1668962.1666666667, "logits/rejected": 6739132.666666667, "logps/chosen": -365.090576171875, "logps/rejected": -469.5023600260417, "loss": 0.0646, "rewards/chosen": 3.446626663208008, "rewards/margins": 8.896043141682942, "rewards/rejected": -5.449416478474935, "step": 466 }, { "epoch": 0.12799780731807592, "grad_norm": 5.75, "kl": 0.1311105191707611, "learning_rate": 5e-06, "logits/chosen": -4915605.2, "logits/rejected": 382791.14285714284, "logps/chosen": -525.1234375, "logps/rejected": -441.40708705357144, "loss": 0.021, "rewards/chosen": 6.357054138183594, "rewards/margins": 11.639038739885603, "rewards/rejected": -5.281984601702009, "step": 467 }, { "epoch": 0.12827189255858573, "grad_norm": 9.9375, "kl": 4.100715637207031, "learning_rate": 5e-06, "logits/chosen": -18164891.2, "logits/rejected": 1369805.4285714286, "logps/chosen": -442.25068359375, "logps/rejected": -457.75254603794644, "loss": 0.0397, "rewards/chosen": 4.2740215301513675, "rewards/margins": 11.37408539908273, "rewards/rejected": -7.100063868931362, "step": 468 }, { "epoch": 0.12854597779909552, "grad_norm": 14.5, "kl": 6.389166831970215, "learning_rate": 5e-06, "logits/chosen": -5293637.333333333, "logits/rejected": 733730.5555555555, "logps/chosen": -369.07939453125, "logps/rejected": -600.1232096354166, "loss": 0.1099, "rewards/chosen": 4.435970052083333, "rewards/margins": 11.972415669759116, "rewards/rejected": -7.536445617675781, "step": 469 }, { "epoch": 0.1288200630396053, "grad_norm": 5.1875, "kl": 1.9866111278533936, "learning_rate": 5e-06, "logits/chosen": -23277173.333333332, "logits/rejected": 10052617.333333334, "logps/chosen": -460.9782307942708, "logps/rejected": -508.758544921875, "loss": 0.0217, "rewards/chosen": 5.395913441975911, "rewards/margins": 10.774573644002277, "rewards/rejected": -5.378660202026367, "step": 470 }, { "epoch": 0.12909414828011512, "grad_norm": 7.4375, "kl": 8.938782691955566, "learning_rate": 5e-06, "logits/chosen": -7565367.466666667, "logits/rejected": 393982.22222222225, "logps/chosen": -427.22350260416664, "logps/rejected": -504.4758029513889, "loss": 0.0985, "rewards/chosen": 5.106573994954427, "rewards/margins": 11.090380181206598, "rewards/rejected": -5.9838061862521705, "step": 471 }, { "epoch": 0.1293682335206249, "grad_norm": 9.25, "kl": 12.492782592773438, "learning_rate": 5e-06, "logits/chosen": 223685.23076923078, "logits/rejected": -8374301.090909091, "logps/chosen": -391.36170372596155, "logps/rejected": -463.23251065340907, "loss": 0.0386, "rewards/chosen": 5.499791071965144, "rewards/margins": 10.569936418866778, "rewards/rejected": -5.070145346901634, "step": 472 }, { "epoch": 0.12964231876113472, "grad_norm": 8.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -20186501.333333332, "logits/rejected": 41963460.266666666, "logps/chosen": -465.732421875, "logps/rejected": -423.13678385416665, "loss": 0.0454, "rewards/chosen": 5.138023376464844, "rewards/margins": 11.074158223470052, "rewards/rejected": -5.936134847005208, "step": 473 }, { "epoch": 0.1299164040016445, "grad_norm": 4.875, "kl": 6.0998005867004395, "learning_rate": 5e-06, "logits/chosen": -2289758.769230769, "logits/rejected": -16319968.0, "logps/chosen": -615.0534104567307, "logps/rejected": -539.7853338068181, "loss": 0.0274, "rewards/chosen": 5.290985107421875, "rewards/margins": 10.992495450106535, "rewards/rejected": -5.701510342684659, "step": 474 }, { "epoch": 0.13019048924215432, "grad_norm": 8.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 7618407.2727272725, "logits/rejected": -10060240.0, "logps/chosen": -384.5530894886364, "logps/rejected": -638.1752554086538, "loss": 0.0519, "rewards/chosen": 5.451144131747159, "rewards/margins": 11.677428452285021, "rewards/rejected": -6.226284320537861, "step": 475 }, { "epoch": 0.1304645744826641, "grad_norm": 10.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -4455447.076923077, "logits/rejected": -5531964.363636363, "logps/chosen": -400.7497370793269, "logps/rejected": -576.9176136363636, "loss": 0.0485, "rewards/chosen": 4.37075923039363, "rewards/margins": 10.726849429257268, "rewards/rejected": -6.356090198863637, "step": 476 }, { "epoch": 0.13073865972317392, "grad_norm": 7.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -18831891.555555556, "logits/rejected": 58664712.53333333, "logps/chosen": -359.9853244357639, "logps/rejected": -645.40859375, "loss": 0.0275, "rewards/chosen": 4.00537829928928, "rewards/margins": 17.86859342787001, "rewards/rejected": -13.863215128580729, "step": 477 }, { "epoch": 0.1310127449636837, "grad_norm": 11.375, "kl": 0.9932206869125366, "learning_rate": 5e-06, "logits/chosen": 7383578.4, "logits/rejected": 17136635.42857143, "logps/chosen": -472.840869140625, "logps/rejected": -581.6494140625, "loss": 0.0558, "rewards/chosen": 4.694891738891601, "rewards/margins": 11.847480719430106, "rewards/rejected": -7.152588980538504, "step": 478 }, { "epoch": 0.13128683020419352, "grad_norm": 10.125, "kl": 2.859952926635742, "learning_rate": 5e-06, "logits/chosen": -4974104.615384615, "logits/rejected": 19360704.0, "logps/chosen": -443.69106820913464, "logps/rejected": -543.1739169034091, "loss": 0.0503, "rewards/chosen": 4.681705181415264, "rewards/margins": 9.720261367050917, "rewards/rejected": -5.038556185635653, "step": 479 }, { "epoch": 0.1315609154447033, "grad_norm": 8.5625, "kl": 6.104207515716553, "learning_rate": 5e-06, "logits/chosen": -3431297.846153846, "logits/rejected": -2234193.090909091, "logps/chosen": -582.3958834134615, "logps/rejected": -612.8113458806819, "loss": 0.0282, "rewards/chosen": 6.446668771597055, "rewards/margins": 12.748430745584981, "rewards/rejected": -6.301761973987926, "step": 480 }, { "epoch": 0.1318350006852131, "grad_norm": 11.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -1215490.8, "logits/rejected": 2287215.4285714286, "logps/chosen": -452.913525390625, "logps/rejected": -526.5659877232143, "loss": 0.0664, "rewards/chosen": 5.2718345642089846, "rewards/margins": 10.041837201799666, "rewards/rejected": -4.7700026375906805, "step": 481 }, { "epoch": 0.1321090859257229, "grad_norm": 10.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -15478252.444444444, "logits/rejected": -21899891.2, "logps/chosen": -373.80186631944446, "logps/rejected": -599.8356770833333, "loss": 0.0511, "rewards/chosen": 3.8004031711154513, "rewards/margins": 10.850962151421442, "rewards/rejected": -7.05055898030599, "step": 482 }, { "epoch": 0.1323831711662327, "grad_norm": 4.53125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 24149251.2, "logits/rejected": -16857041.14285714, "logps/chosen": -475.391015625, "logps/rejected": -528.0207170758929, "loss": 0.018, "rewards/chosen": 6.760969543457032, "rewards/margins": 14.080160522460938, "rewards/rejected": -7.319190979003906, "step": 483 }, { "epoch": 0.1326572564067425, "grad_norm": 8.5, "kl": 5.930367946624756, "learning_rate": 5e-06, "logits/chosen": 9483406.666666666, "logits/rejected": -663090.8333333334, "logps/chosen": -359.7976888020833, "logps/rejected": -390.3515625, "loss": 0.069, "rewards/chosen": 4.736769040425618, "rewards/margins": 11.31773026784261, "rewards/rejected": -6.580961227416992, "step": 484 }, { "epoch": 0.1329313416472523, "grad_norm": 15.125, "kl": 4.285697937011719, "learning_rate": 5e-06, "logits/chosen": -13197861.714285715, "logits/rejected": 29382864.0, "logps/chosen": -476.50474330357144, "logps/rejected": -551.522314453125, "loss": 0.0954, "rewards/chosen": 4.913634163992746, "rewards/margins": 10.423383004324776, "rewards/rejected": -5.509748840332032, "step": 485 }, { "epoch": 0.1332054268877621, "grad_norm": 8.875, "kl": 1.3951575756072998, "learning_rate": 5e-06, "logits/chosen": 46836416.0, "logits/rejected": -14920528.0, "logps/chosen": -431.3029119318182, "logps/rejected": -398.5420673076923, "loss": 0.0464, "rewards/chosen": 4.294571616432884, "rewards/margins": 11.135222908500191, "rewards/rejected": -6.8406512920673075, "step": 486 }, { "epoch": 0.1334795121282719, "grad_norm": 11.4375, "kl": 5.975696563720703, "learning_rate": 5e-06, "logits/chosen": -9240651.692307692, "logits/rejected": -7182513.454545454, "logps/chosen": -406.12642728365387, "logps/rejected": -567.9386541193181, "loss": 0.0452, "rewards/chosen": 5.058206411508413, "rewards/margins": 10.76846350823249, "rewards/rejected": -5.710257096724077, "step": 487 }, { "epoch": 0.1337535973687817, "grad_norm": 8.6875, "kl": 4.092708110809326, "learning_rate": 5e-06, "logits/chosen": -9927616.0, "logits/rejected": 12996797.866666667, "logps/chosen": -350.3146158854167, "logps/rejected": -485.196484375, "loss": 0.0701, "rewards/chosen": 4.901311662462023, "rewards/margins": 12.246675533718534, "rewards/rejected": -7.34536387125651, "step": 488 }, { "epoch": 0.1340276826092915, "grad_norm": 12.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 6894488.0, "logits/rejected": 79246788.92307693, "logps/chosen": -538.5767489346591, "logps/rejected": -462.7756159855769, "loss": 0.0924, "rewards/chosen": 5.1970759305087, "rewards/margins": 12.182582428405336, "rewards/rejected": -6.985506497896635, "step": 489 }, { "epoch": 0.1343017678498013, "grad_norm": 11.5625, "kl": 6.217202186584473, "learning_rate": 5e-06, "logits/chosen": -7668829.333333333, "logits/rejected": 17056071.111111112, "logps/chosen": -499.2506510416667, "logps/rejected": -357.55864800347223, "loss": 0.0622, "rewards/chosen": 4.66490478515625, "rewards/margins": 9.117198181152343, "rewards/rejected": -4.452293395996094, "step": 490 }, { "epoch": 0.1345758530903111, "grad_norm": 6.0, "kl": 0.38801002502441406, "learning_rate": 5e-06, "logits/chosen": -4106256.6666666665, "logits/rejected": 176450922.66666666, "logps/chosen": -466.158935546875, "logps/rejected": -434.757568359375, "loss": 0.0183, "rewards/chosen": 5.683380126953125, "rewards/margins": 12.672976811726887, "rewards/rejected": -6.989596684773763, "step": 491 }, { "epoch": 0.13484993833082087, "grad_norm": 8.9375, "kl": 1.5582587718963623, "learning_rate": 5e-06, "logits/chosen": -11657159.272727273, "logits/rejected": -12495309.538461538, "logps/chosen": -453.2750355113636, "logps/rejected": -508.4688251201923, "loss": 0.0547, "rewards/chosen": 3.751702742143111, "rewards/margins": 9.556195399144313, "rewards/rejected": -5.804492657001202, "step": 492 }, { "epoch": 0.1351240235713307, "grad_norm": 7.03125, "kl": 4.551628112792969, "learning_rate": 5e-06, "logits/chosen": -13225198.222222222, "logits/rejected": 25445015.466666665, "logps/chosen": -423.81651475694446, "logps/rejected": -568.7185546875, "loss": 0.0364, "rewards/chosen": 4.700219472249349, "rewards/margins": 11.236623382568359, "rewards/rejected": -6.53640391031901, "step": 493 }, { "epoch": 0.13539810881184047, "grad_norm": 8.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 14337210.181818182, "logits/rejected": 9033452.307692308, "logps/chosen": -435.11083984375, "logps/rejected": -442.16856971153845, "loss": 0.0527, "rewards/chosen": 4.264907143332741, "rewards/margins": 9.80527309604458, "rewards/rejected": -5.540365952711839, "step": 494 }, { "epoch": 0.1356721940523503, "grad_norm": 13.3125, "kl": 3.156829833984375, "learning_rate": 5e-06, "logits/chosen": 3337881.5384615385, "logits/rejected": -10116785.454545455, "logps/chosen": -462.41654146634613, "logps/rejected": -371.4866832386364, "loss": 0.1006, "rewards/chosen": 5.038512009840745, "rewards/margins": 9.333317709969474, "rewards/rejected": -4.2948057001287285, "step": 495 }, { "epoch": 0.13594627929286007, "grad_norm": 11.6875, "kl": 9.76203441619873, "learning_rate": 5e-06, "logits/chosen": -159690.96666666667, "logits/rejected": -1545647.5555555555, "logps/chosen": -476.39967447916666, "logps/rejected": -463.54112413194446, "loss": 0.1324, "rewards/chosen": 4.569273376464844, "rewards/margins": 8.653359900580512, "rewards/rejected": -4.084086524115668, "step": 496 }, { "epoch": 0.1362203645333699, "grad_norm": 15.625, "kl": 7.030264854431152, "learning_rate": 5e-06, "logits/chosen": 10732167.272727273, "logits/rejected": 6265891.076923077, "logps/chosen": -383.61470170454544, "logps/rejected": -541.9927133413462, "loss": 0.1498, "rewards/chosen": 4.108525362881747, "rewards/margins": 11.088208285245027, "rewards/rejected": -6.979682922363281, "step": 497 }, { "epoch": 0.13649444977387967, "grad_norm": 8.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 4460029.333333333, "logits/rejected": 12726685.333333334, "logps/chosen": -419.0828450520833, "logps/rejected": -560.8222249348959, "loss": 0.0542, "rewards/chosen": 4.744330724080403, "rewards/margins": 10.038303057352701, "rewards/rejected": -5.293972333272298, "step": 498 }, { "epoch": 0.1367685350143895, "grad_norm": 10.375, "kl": 19.49878692626953, "learning_rate": 5e-06, "logits/chosen": -32373589.333333332, "logits/rejected": 9015348.666666666, "logps/chosen": -586.0295817057291, "logps/rejected": -454.6122639973958, "loss": 0.1071, "rewards/chosen": 6.96650505065918, "rewards/margins": 12.71131706237793, "rewards/rejected": -5.74481201171875, "step": 499 }, { "epoch": 0.13704262025489927, "grad_norm": 8.875, "kl": 4.691189765930176, "learning_rate": 5e-06, "logits/chosen": -18497729.777777776, "logits/rejected": -1015334.6666666666, "logps/chosen": -579.0014105902778, "logps/rejected": -339.2461263020833, "loss": 0.0369, "rewards/chosen": 5.098388671875, "rewards/margins": 10.15869852701823, "rewards/rejected": -5.060309855143229, "step": 500 }, { "epoch": 0.13731670549540909, "grad_norm": 8.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 14993276.0, "logits/rejected": 9570859.0, "logps/chosen": -497.8064270019531, "logps/rejected": -534.9885864257812, "loss": 0.0376, "rewards/chosen": 4.507524013519287, "rewards/margins": 10.037208557128906, "rewards/rejected": -5.529684543609619, "step": 501 }, { "epoch": 0.13759079073591887, "grad_norm": 13.1875, "kl": 0.035073600709438324, "learning_rate": 5e-06, "logits/chosen": -12058590.545454545, "logits/rejected": 20815643.076923076, "logps/chosen": -369.27756569602275, "logps/rejected": -635.1790865384615, "loss": 0.063, "rewards/chosen": 4.785824862393466, "rewards/margins": 10.578338196227602, "rewards/rejected": -5.792513333834135, "step": 502 }, { "epoch": 0.13786487597642866, "grad_norm": 13.5625, "kl": 12.73906421661377, "learning_rate": 5e-06, "logits/chosen": -5984139.692307692, "logits/rejected": 25921070.545454547, "logps/chosen": -481.4001652644231, "logps/rejected": -411.71133700284093, "loss": 0.1087, "rewards/chosen": 5.5100578894981975, "rewards/margins": 10.571089924632254, "rewards/rejected": -5.061032035134056, "step": 503 }, { "epoch": 0.13813896121693847, "grad_norm": 8.6875, "kl": 1.8372840881347656, "learning_rate": 5e-06, "logits/chosen": -8043960.470588235, "logits/rejected": -11024427.42857143, "logps/chosen": -445.95582490808823, "logps/rejected": -531.8978097098214, "loss": 0.0329, "rewards/chosen": 4.800678926355698, "rewards/margins": 12.881813177541524, "rewards/rejected": -8.081134251185826, "step": 504 }, { "epoch": 0.13841304645744826, "grad_norm": 14.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -4572273.714285715, "logits/rejected": -2528263.0588235296, "logps/chosen": -375.6006556919643, "logps/rejected": -524.9367532169117, "loss": 0.0509, "rewards/chosen": 3.92398316519601, "rewards/margins": 9.666525207647757, "rewards/rejected": -5.742542042451746, "step": 505 }, { "epoch": 0.13868713169795807, "grad_norm": 10.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 12713812.0, "logits/rejected": 10194046.666666666, "logps/chosen": -376.2766927083333, "logps/rejected": -689.296630859375, "loss": 0.0712, "rewards/chosen": 4.247028350830078, "rewards/margins": 13.635808944702148, "rewards/rejected": -9.38878059387207, "step": 506 }, { "epoch": 0.13896121693846786, "grad_norm": 10.25, "kl": 9.869163513183594, "learning_rate": 5e-06, "logits/chosen": -3032360.5714285714, "logits/rejected": -10302848.0, "logps/chosen": -461.15586635044644, "logps/rejected": -393.7937255859375, "loss": 0.0877, "rewards/chosen": 5.231511797223773, "rewards/margins": 11.147066606794084, "rewards/rejected": -5.915554809570312, "step": 507 }, { "epoch": 0.13923530217897767, "grad_norm": 8.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -28517088.0, "logits/rejected": 5048304.0, "logps/chosen": -431.69912109375, "logps/rejected": -373.35567801339283, "loss": 0.0583, "rewards/chosen": 5.117526245117188, "rewards/margins": 10.944035993303572, "rewards/rejected": -5.826509748186384, "step": 508 }, { "epoch": 0.13950938741948746, "grad_norm": 10.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -15208199.384615384, "logits/rejected": 11198805.818181818, "logps/chosen": -328.70736929086536, "logps/rejected": -632.3313210227273, "loss": 0.0796, "rewards/chosen": 2.957134246826172, "rewards/margins": 9.151478507302024, "rewards/rejected": -6.1943442604758525, "step": 509 }, { "epoch": 0.13978347265999727, "grad_norm": 8.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 5627269.454545454, "logits/rejected": -2761043.076923077, "logps/chosen": -446.892578125, "logps/rejected": -432.2028245192308, "loss": 0.0432, "rewards/chosen": 5.199734774502841, "rewards/margins": 11.27003436321979, "rewards/rejected": -6.0702995887169475, "step": 510 }, { "epoch": 0.14005755790050706, "grad_norm": 8.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -3827399.0, "logits/rejected": -2583587.6666666665, "logps/chosen": -448.89599609375, "logps/rejected": -323.4304606119792, "loss": 0.0571, "rewards/chosen": 5.343514124552409, "rewards/margins": 9.734806060791016, "rewards/rejected": -4.3912919362386065, "step": 511 }, { "epoch": 0.14033164314101684, "grad_norm": 6.46875, "kl": 0.9369189143180847, "learning_rate": 5e-06, "logits/chosen": -12849473.6, "logits/rejected": -2078933.142857143, "logps/chosen": -490.030908203125, "logps/rejected": -572.3549107142857, "loss": 0.0661, "rewards/chosen": 4.191656112670898, "rewards/margins": 12.488398143223353, "rewards/rejected": -8.296742030552455, "step": 512 }, { "epoch": 0.14060572838152666, "grad_norm": 6.0, "kl": 1.24981689453125, "learning_rate": 5e-06, "logits/chosen": 9376638.4, "logits/rejected": -2642023.714285714, "logps/chosen": -557.68798828125, "logps/rejected": -404.22059849330356, "loss": 0.0307, "rewards/chosen": 4.943109130859375, "rewards/margins": 11.137554604666573, "rewards/rejected": -6.194445473807199, "step": 513 }, { "epoch": 0.14087981362203644, "grad_norm": 10.9375, "kl": 4.446951866149902, "learning_rate": 5e-06, "logits/chosen": 1985182.2222222222, "logits/rejected": -79848.53333333334, "logps/chosen": -567.3676215277778, "logps/rejected": -534.7527994791667, "loss": 0.0335, "rewards/chosen": 7.1575876871744795, "rewards/margins": 14.304622395833334, "rewards/rejected": -7.147034708658854, "step": 514 }, { "epoch": 0.14115389886254626, "grad_norm": 12.5625, "kl": 5.346248626708984, "learning_rate": 5e-06, "logits/chosen": -19559369.14285714, "logits/rejected": 1530993.2, "logps/chosen": -466.83555385044644, "logps/rejected": -421.85, "loss": 0.0616, "rewards/chosen": 6.285591670445034, "rewards/margins": 11.835429545811245, "rewards/rejected": -5.549837875366211, "step": 515 }, { "epoch": 0.14142798410305604, "grad_norm": 5.71875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -9020060.363636363, "logits/rejected": 4662037.538461538, "logps/chosen": -422.4716796875, "logps/rejected": -520.0541240985577, "loss": 0.0427, "rewards/chosen": 6.069047407670454, "rewards/margins": 12.706781454019612, "rewards/rejected": -6.637734046349158, "step": 516 }, { "epoch": 0.14170206934356586, "grad_norm": 11.5625, "kl": 3.5565237998962402, "learning_rate": 5e-06, "logits/chosen": -8134530.666666667, "logits/rejected": -11858000.0, "logps/chosen": -322.5374755859375, "logps/rejected": -310.7281901041667, "loss": 0.116, "rewards/chosen": 3.8397432963053384, "rewards/margins": 7.572998046875, "rewards/rejected": -3.7332547505696616, "step": 517 }, { "epoch": 0.14197615458407564, "grad_norm": 8.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -11797736.0, "logits/rejected": 8503396.923076924, "logps/chosen": -578.5807883522727, "logps/rejected": -777.0647536057693, "loss": 0.0245, "rewards/chosen": 5.814041831276634, "rewards/margins": 16.607157300402235, "rewards/rejected": -10.7931154691256, "step": 518 }, { "epoch": 0.14225023982458546, "grad_norm": 6.40625, "kl": 2.913555145263672, "learning_rate": 5e-06, "logits/chosen": -4638422.0, "logits/rejected": -7682525.333333333, "logps/chosen": -433.8923746744792, "logps/rejected": -489.7144368489583, "loss": 0.0233, "rewards/chosen": 4.760882059733073, "rewards/margins": 12.429649353027344, "rewards/rejected": -7.6687672932942705, "step": 519 }, { "epoch": 0.14252432506509524, "grad_norm": 10.1875, "kl": 2.512099027633667, "learning_rate": 5e-06, "logits/chosen": 25726848.0, "logits/rejected": -8342382.285714285, "logps/chosen": -407.4517578125, "logps/rejected": -561.9676688058036, "loss": 0.0663, "rewards/chosen": 4.558953094482422, "rewards/margins": 10.855453164236886, "rewards/rejected": -6.296500069754464, "step": 520 }, { "epoch": 0.14279841030560506, "grad_norm": 5.71875, "kl": 0.8742803335189819, "learning_rate": 5e-06, "logits/chosen": 701095.2, "logits/rejected": 34651491.55555555, "logps/chosen": -436.42200520833336, "logps/rejected": -685.9503038194445, "loss": 0.0192, "rewards/chosen": 4.2970942179361975, "rewards/margins": 12.908265516493056, "rewards/rejected": -8.611171298556858, "step": 521 }, { "epoch": 0.14307249554611484, "grad_norm": 10.3125, "kl": 0.6433258056640625, "learning_rate": 5e-06, "logits/chosen": -20622350.222222224, "logits/rejected": -208717.86666666667, "logps/chosen": -326.90557183159723, "logps/rejected": -419.7560221354167, "loss": 0.0581, "rewards/chosen": 3.402361339992947, "rewards/margins": 11.209460745917426, "rewards/rejected": -7.807099405924479, "step": 522 }, { "epoch": 0.14334658078662463, "grad_norm": 11.9375, "kl": 5.593787670135498, "learning_rate": 5e-06, "logits/chosen": 9403181.714285715, "logits/rejected": 6990640.0, "logps/chosen": -509.20113699776783, "logps/rejected": -418.469873046875, "loss": 0.0732, "rewards/chosen": 4.748334067208426, "rewards/margins": 12.227718680245536, "rewards/rejected": -7.4793846130371096, "step": 523 }, { "epoch": 0.14362066602713444, "grad_norm": 12.25, "kl": 3.1030471324920654, "learning_rate": 5e-06, "logits/chosen": -26226726.0, "logits/rejected": -2436877.25, "logps/chosen": -325.7071533203125, "logps/rejected": -530.9447021484375, "loss": 0.0888, "rewards/chosen": 4.121739387512207, "rewards/margins": 10.195124626159668, "rewards/rejected": -6.073385238647461, "step": 524 }, { "epoch": 0.14389475126764423, "grad_norm": 6.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 2405146.4615384615, "logits/rejected": -18556184.727272727, "logps/chosen": -365.5768855168269, "logps/rejected": -579.4757634943181, "loss": 0.0503, "rewards/chosen": 4.119638296274038, "rewards/margins": 13.541882281536822, "rewards/rejected": -9.422243985262783, "step": 525 }, { "epoch": 0.14416883650815404, "grad_norm": 14.5, "kl": 1.7000045776367188, "learning_rate": 5e-06, "logits/chosen": -16251211.636363637, "logits/rejected": -6131549.538461538, "logps/chosen": -337.4658203125, "logps/rejected": -391.82388070913464, "loss": 0.0924, "rewards/chosen": 4.4920432350852275, "rewards/margins": 9.473159309867379, "rewards/rejected": -4.981116074782151, "step": 526 }, { "epoch": 0.14444292174866383, "grad_norm": 5.46875, "kl": 1.7460473775863647, "learning_rate": 5e-06, "logits/chosen": 3414867.3333333335, "logits/rejected": -3564094.6666666665, "logps/chosen": -376.7683919270833, "logps/rejected": -394.3453369140625, "loss": 0.0347, "rewards/chosen": 5.104538281758626, "rewards/margins": 11.024453163146973, "rewards/rejected": -5.919914881388347, "step": 527 }, { "epoch": 0.14471700698917364, "grad_norm": 6.15625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -15891333.818181818, "logits/rejected": -6002129.230769231, "logps/chosen": -448.9606267755682, "logps/rejected": -384.9141376201923, "loss": 0.0251, "rewards/chosen": 5.497893940318715, "rewards/margins": 10.961095609864989, "rewards/rejected": -5.463201669546274, "step": 528 }, { "epoch": 0.14499109222968343, "grad_norm": 7.34375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 3447380.923076923, "logits/rejected": -13267098.181818182, "logps/chosen": -487.810546875, "logps/rejected": -654.3671431107955, "loss": 0.0204, "rewards/chosen": 5.145776601938101, "rewards/margins": 13.031073510230005, "rewards/rejected": -7.885296908291903, "step": 529 }, { "epoch": 0.14526517747019324, "grad_norm": 6.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -16926658.90909091, "logits/rejected": 6817387.076923077, "logps/chosen": -476.09694602272725, "logps/rejected": -512.0441706730769, "loss": 0.032, "rewards/chosen": 4.4676031632856885, "rewards/margins": 10.782134929737012, "rewards/rejected": -6.3145317664513225, "step": 530 }, { "epoch": 0.14553926271070303, "grad_norm": 7.5625, "kl": 9.086786270141602, "learning_rate": 5e-06, "logits/chosen": -15603841.454545455, "logits/rejected": 4450311.384615385, "logps/chosen": -454.63565340909093, "logps/rejected": -512.4550030048077, "loss": 0.0612, "rewards/chosen": 5.494621276855469, "rewards/margins": 11.459212669959435, "rewards/rejected": -5.964591393103967, "step": 531 }, { "epoch": 0.14581334795121284, "grad_norm": 6.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -23094937.6, "logits/rejected": -14437781.714285715, "logps/chosen": -498.45009765625, "logps/rejected": -455.9053431919643, "loss": 0.0336, "rewards/chosen": 5.446422576904297, "rewards/margins": 12.016939980643137, "rewards/rejected": -6.570517403738839, "step": 532 }, { "epoch": 0.14608743319172263, "grad_norm": 5.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 11883890.461538462, "logits/rejected": -6878886.545454546, "logps/chosen": -414.7500751201923, "logps/rejected": -475.79691938920456, "loss": 0.0348, "rewards/chosen": 4.9576263427734375, "rewards/margins": 11.900476629083807, "rewards/rejected": -6.942850286310369, "step": 533 }, { "epoch": 0.1463615184322324, "grad_norm": 8.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -35482716.44444445, "logits/rejected": 96064.53333333334, "logps/chosen": -480.9538845486111, "logps/rejected": -512.857421875, "loss": 0.0833, "rewards/chosen": 6.2110544840494795, "rewards/margins": 12.135884602864584, "rewards/rejected": -5.9248301188151045, "step": 534 }, { "epoch": 0.14663560367274223, "grad_norm": 11.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -20944864.0, "logits/rejected": -7660868.363636363, "logps/chosen": -486.3671875, "logps/rejected": -622.7566583806819, "loss": 0.0391, "rewards/chosen": 4.723124284010667, "rewards/margins": 13.733304670640639, "rewards/rejected": -9.01018038662997, "step": 535 }, { "epoch": 0.146909688913252, "grad_norm": 21.5, "kl": 16.1439266204834, "learning_rate": 5e-06, "logits/chosen": 21130235.733333334, "logits/rejected": -4159602.6666666665, "logps/chosen": -455.7719401041667, "logps/rejected": -483.6125217013889, "loss": 0.1556, "rewards/chosen": 4.39088134765625, "rewards/margins": 7.254003228081597, "rewards/rejected": -2.8631218804253473, "step": 536 }, { "epoch": 0.14718377415376183, "grad_norm": 11.5, "kl": 4.660311222076416, "learning_rate": 5e-06, "logits/chosen": -2897785.25, "logits/rejected": -6043845.5, "logps/chosen": -436.21087646484375, "logps/rejected": -548.902099609375, "loss": 0.0439, "rewards/chosen": 5.428614616394043, "rewards/margins": 13.121464729309082, "rewards/rejected": -7.692850112915039, "step": 537 }, { "epoch": 0.1474578593942716, "grad_norm": 10.8125, "kl": 7.010030746459961, "learning_rate": 5e-06, "logits/chosen": -7538892.8, "logits/rejected": -1829990.0, "logps/chosen": -404.555908203125, "logps/rejected": -446.54833984375, "loss": 0.1056, "rewards/chosen": 5.533795166015625, "rewards/margins": 10.582305581229074, "rewards/rejected": -5.048510415213449, "step": 538 }, { "epoch": 0.14773194463478143, "grad_norm": 5.375, "kl": 0.1082509383559227, "learning_rate": 5e-06, "logits/chosen": 5883347.692307692, "logits/rejected": -1411653.4545454546, "logps/chosen": -383.7883112980769, "logps/rejected": -518.1652610085227, "loss": 0.0367, "rewards/chosen": 4.386815584622896, "rewards/margins": 10.808527593012457, "rewards/rejected": -6.42171200838956, "step": 539 }, { "epoch": 0.1480060298752912, "grad_norm": 5.65625, "kl": 1.6048038005828857, "learning_rate": 5e-06, "logits/chosen": -1097087.5, "logits/rejected": -5067624.5, "logps/chosen": -430.6459655761719, "logps/rejected": -339.87213134765625, "loss": 0.0569, "rewards/chosen": 5.0838212966918945, "rewards/margins": 10.008060455322266, "rewards/rejected": -4.924239158630371, "step": 540 }, { "epoch": 0.14828011511580103, "grad_norm": 7.75, "kl": 1.2441266775131226, "learning_rate": 5e-06, "logits/chosen": -19783916.8, "logits/rejected": 10914403.42857143, "logps/chosen": -439.07099609375, "logps/rejected": -508.6421595982143, "loss": 0.0423, "rewards/chosen": 5.303313827514648, "rewards/margins": 10.71962045942034, "rewards/rejected": -5.416306631905692, "step": 541 }, { "epoch": 0.1485542003563108, "grad_norm": 9.5625, "kl": 0.13557052612304688, "learning_rate": 5e-06, "logits/chosen": 4850716.666666667, "logits/rejected": 23761082.666666668, "logps/chosen": -522.2681884765625, "logps/rejected": -312.97015380859375, "loss": 0.0546, "rewards/chosen": 5.885537465413411, "rewards/margins": 10.701275825500488, "rewards/rejected": -4.815738360087077, "step": 542 }, { "epoch": 0.1488282855968206, "grad_norm": 12.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -13754506.0, "logits/rejected": -26563936.0, "logps/chosen": -466.5826721191406, "logps/rejected": -561.2271728515625, "loss": 0.053, "rewards/chosen": 3.7308006286621094, "rewards/margins": 11.10707139968872, "rewards/rejected": -7.376270771026611, "step": 543 }, { "epoch": 0.1491023708373304, "grad_norm": 12.75, "kl": 8.343570709228516, "learning_rate": 5e-06, "logits/chosen": 26233704.0, "logits/rejected": -11720745.333333334, "logps/chosen": -432.2632649739583, "logps/rejected": -479.3277994791667, "loss": 0.0609, "rewards/chosen": 6.085273742675781, "rewards/margins": 12.366721471150715, "rewards/rejected": -6.281447728474935, "step": 544 }, { "epoch": 0.1493764560778402, "grad_norm": 11.8125, "kl": 4.027656555175781, "learning_rate": 5e-06, "logits/chosen": -10705901.333333334, "logits/rejected": 7647886.666666667, "logps/chosen": -478.7068684895833, "logps/rejected": -400.4776204427083, "loss": 0.0507, "rewards/chosen": 5.224957784016927, "rewards/margins": 10.648794174194336, "rewards/rejected": -5.423836390177409, "step": 545 }, { "epoch": 0.14965054131835, "grad_norm": 9.0, "kl": 1.295804738998413, "learning_rate": 5e-06, "logits/chosen": -12440628.923076924, "logits/rejected": -17389416.727272727, "logps/chosen": -465.13979867788464, "logps/rejected": -471.15891335227275, "loss": 0.0479, "rewards/chosen": 5.119925865760217, "rewards/margins": 12.503448593032944, "rewards/rejected": -7.3835227272727275, "step": 546 }, { "epoch": 0.1499246265588598, "grad_norm": 12.8125, "kl": 7.0821685791015625, "learning_rate": 5e-06, "logits/chosen": 11182172.57142857, "logits/rejected": 3200452.8, "logps/chosen": -423.23866489955356, "logps/rejected": -362.2056884765625, "loss": 0.1065, "rewards/chosen": 4.275881631033761, "rewards/margins": 11.302140481131417, "rewards/rejected": -7.026258850097657, "step": 547 }, { "epoch": 0.1501987117993696, "grad_norm": 6.28125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -7906512.7272727275, "logits/rejected": -3982436.0, "logps/chosen": -517.5380415482955, "logps/rejected": -377.9611628605769, "loss": 0.0265, "rewards/chosen": 5.592125632546165, "rewards/margins": 12.90800075931149, "rewards/rejected": -7.315875126765325, "step": 548 }, { "epoch": 0.1504727970398794, "grad_norm": 6.5625, "kl": 1.6002118587493896, "learning_rate": 5e-06, "logits/chosen": -18369440.0, "logits/rejected": 15183968.0, "logps/chosen": -409.7824041193182, "logps/rejected": -530.9921123798077, "loss": 0.036, "rewards/chosen": 5.696862654252485, "rewards/margins": 13.087767594344133, "rewards/rejected": -7.390904940091646, "step": 549 }, { "epoch": 0.1507468822803892, "grad_norm": 6.0625, "kl": 1.3930957317352295, "learning_rate": 5e-06, "logits/chosen": -7324716.571428572, "logits/rejected": -9934646.4, "logps/chosen": -413.44349888392856, "logps/rejected": -565.11875, "loss": 0.0627, "rewards/chosen": 5.489056723458426, "rewards/margins": 13.724171774727957, "rewards/rejected": -8.235115051269531, "step": 550 }, { "epoch": 0.151020967520899, "grad_norm": 12.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 8004458.181818182, "logits/rejected": -16233137.23076923, "logps/chosen": -530.6799094460227, "logps/rejected": -475.1277043269231, "loss": 0.0394, "rewards/chosen": 5.086612354625355, "rewards/margins": 11.139715288068865, "rewards/rejected": -6.05310293344351, "step": 551 }, { "epoch": 0.1512950527614088, "grad_norm": 16.125, "kl": 12.964876174926758, "learning_rate": 5e-06, "logits/chosen": -4152775.0, "logits/rejected": -383553.5, "logps/chosen": -457.89410400390625, "logps/rejected": -419.9881896972656, "loss": 0.1442, "rewards/chosen": 5.157751560211182, "rewards/margins": 9.325258731842041, "rewards/rejected": -4.167507171630859, "step": 552 }, { "epoch": 0.1515691380019186, "grad_norm": 4.53125, "kl": 2.0241293907165527, "learning_rate": 5e-06, "logits/chosen": -19208674.666666668, "logits/rejected": 897165.3333333334, "logps/chosen": -493.575439453125, "logps/rejected": -593.1337076822916, "loss": 0.0395, "rewards/chosen": 5.848843892415364, "rewards/margins": 12.711115519205729, "rewards/rejected": -6.862271626790364, "step": 553 }, { "epoch": 0.15184322324242838, "grad_norm": 10.1875, "kl": 0.7016042470932007, "learning_rate": 5e-06, "logits/chosen": -11979268.923076924, "logits/rejected": -2416802.1818181816, "logps/chosen": -449.6105769230769, "logps/rejected": -458.43190696022725, "loss": 0.0892, "rewards/chosen": 6.096557030310998, "rewards/margins": 12.756615485344733, "rewards/rejected": -6.660058455033735, "step": 554 }, { "epoch": 0.1521173084829382, "grad_norm": 11.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 41501.4, "logits/rejected": -25648.0, "logps/chosen": -470.590234375, "logps/rejected": -546.7232142857143, "loss": 0.0375, "rewards/chosen": 5.366466522216797, "rewards/margins": 11.998538970947266, "rewards/rejected": -6.632072448730469, "step": 555 }, { "epoch": 0.15239139372344798, "grad_norm": 5.84375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -6421476.444444444, "logits/rejected": -22378060.8, "logps/chosen": -370.78724500868054, "logps/rejected": -500.29749348958336, "loss": 0.0484, "rewards/chosen": 4.790027194552952, "rewards/margins": 11.658258904351129, "rewards/rejected": -6.868231709798177, "step": 556 }, { "epoch": 0.1526654789639578, "grad_norm": 8.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -7189026.0, "logits/rejected": 14818656.0, "logps/chosen": -505.59527587890625, "logps/rejected": -503.985595703125, "loss": 0.0499, "rewards/chosen": 5.115341663360596, "rewards/margins": 11.082106113433838, "rewards/rejected": -5.966764450073242, "step": 557 }, { "epoch": 0.15293956420446758, "grad_norm": 9.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -2828444.6666666665, "logits/rejected": 1565516.6666666667, "logps/chosen": -418.5796305338542, "logps/rejected": -471.9374186197917, "loss": 0.0407, "rewards/chosen": 4.862676620483398, "rewards/margins": 11.50564193725586, "rewards/rejected": -6.642965316772461, "step": 558 }, { "epoch": 0.1532136494449774, "grad_norm": 13.875, "kl": 4.758914947509766, "learning_rate": 5e-06, "logits/chosen": -17998400.0, "logits/rejected": 12307283.0, "logps/chosen": -534.3897094726562, "logps/rejected": -460.31878662109375, "loss": 0.0731, "rewards/chosen": 4.416209697723389, "rewards/margins": 11.044950008392334, "rewards/rejected": -6.628740310668945, "step": 559 }, { "epoch": 0.15348773468548718, "grad_norm": 5.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 1172859.2, "logits/rejected": -10252682.666666666, "logps/chosen": -498.25074869791666, "logps/rejected": -531.81201171875, "loss": 0.0514, "rewards/chosen": 4.654095967610677, "rewards/margins": 13.155108133951824, "rewards/rejected": -8.501012166341146, "step": 560 }, { "epoch": 0.153761819925997, "grad_norm": 7.5, "kl": 1.7432136535644531, "learning_rate": 5e-06, "logits/chosen": -8073301.6, "logits/rejected": -14402788.57142857, "logps/chosen": -481.19384765625, "logps/rejected": -451.77099609375, "loss": 0.0895, "rewards/chosen": 5.13271484375, "rewards/margins": 11.427492850167411, "rewards/rejected": -6.294778006417411, "step": 561 }, { "epoch": 0.15403590516650678, "grad_norm": 10.6875, "kl": 4.275628089904785, "learning_rate": 5e-06, "logits/chosen": -12825746.133333333, "logits/rejected": 1192836.888888889, "logps/chosen": -363.2466145833333, "logps/rejected": -512.9204644097222, "loss": 0.0796, "rewards/chosen": 4.072491200764974, "rewards/margins": 13.442607540554471, "rewards/rejected": -9.370116339789497, "step": 562 }, { "epoch": 0.1543099904070166, "grad_norm": 6.96875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -30121160.727272727, "logits/rejected": 14967369.846153846, "logps/chosen": -509.3460138494318, "logps/rejected": -593.9396033653846, "loss": 0.0197, "rewards/chosen": 5.469707489013672, "rewards/margins": 12.382302797757662, "rewards/rejected": -6.91259530874399, "step": 563 }, { "epoch": 0.15458407564752638, "grad_norm": 12.4375, "kl": 5.761371612548828, "learning_rate": 5e-06, "logits/chosen": 12711840.0, "logits/rejected": -16059068.8, "logps/chosen": -413.27200753348217, "logps/rejected": -502.84697265625, "loss": 0.0553, "rewards/chosen": 4.379270281110491, "rewards/margins": 11.852776445661274, "rewards/rejected": -7.473506164550781, "step": 564 }, { "epoch": 0.15485816088803617, "grad_norm": 13.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -11215968.0, "logits/rejected": -16592122.666666666, "logps/chosen": -387.7265625, "logps/rejected": -510.3687337239583, "loss": 0.0961, "rewards/chosen": 4.460521380106608, "rewards/margins": 10.662376085917156, "rewards/rejected": -6.201854705810547, "step": 565 }, { "epoch": 0.15513224612854598, "grad_norm": 7.375, "kl": 1.7528165578842163, "learning_rate": 5e-06, "logits/chosen": 16104426.181818182, "logits/rejected": -10833811.692307692, "logps/chosen": -325.55104758522725, "logps/rejected": -366.97213040865387, "loss": 0.0749, "rewards/chosen": 4.878678408536044, "rewards/margins": 9.99410469215233, "rewards/rejected": -5.115426283616286, "step": 566 }, { "epoch": 0.15540633136905577, "grad_norm": 17.125, "kl": 3.4098422527313232, "learning_rate": 5e-06, "logits/chosen": 8593019.42857143, "logits/rejected": -4272636.4, "logps/chosen": -520.9862583705357, "logps/rejected": -508.7240234375, "loss": 0.0704, "rewards/chosen": 5.530862535749163, "rewards/margins": 12.385050310407365, "rewards/rejected": -6.854187774658203, "step": 567 }, { "epoch": 0.15568041660956558, "grad_norm": 12.9375, "kl": 0.9093475341796875, "learning_rate": 5e-06, "logits/chosen": 26081213.333333332, "logits/rejected": 2539883.3333333335, "logps/chosen": -580.0585123697916, "logps/rejected": -456.4210205078125, "loss": 0.0811, "rewards/chosen": 4.030539512634277, "rewards/margins": 9.362327257792156, "rewards/rejected": -5.331787745157878, "step": 568 }, { "epoch": 0.15595450185007537, "grad_norm": 16.875, "kl": 5.2902021408081055, "learning_rate": 5e-06, "logits/chosen": 12886873.6, "logits/rejected": -5512829.714285715, "logps/chosen": -495.08017578125, "logps/rejected": -489.44217354910717, "loss": 0.0659, "rewards/chosen": 5.811245727539062, "rewards/margins": 10.864979553222657, "rewards/rejected": -5.053733825683594, "step": 569 }, { "epoch": 0.15622858709058518, "grad_norm": 10.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -22796827.42857143, "logits/rejected": 3473922.3529411764, "logps/chosen": -405.1205357142857, "logps/rejected": -668.552734375, "loss": 0.0399, "rewards/chosen": 5.352302006312779, "rewards/margins": 14.205779035552208, "rewards/rejected": -8.85347702923943, "step": 570 }, { "epoch": 0.15650267233109497, "grad_norm": 15.6875, "kl": 4.311643600463867, "learning_rate": 5e-06, "logits/chosen": -5352140.0, "logits/rejected": 1022287.8333333334, "logps/chosen": -416.6410319010417, "logps/rejected": -515.9449055989584, "loss": 0.0731, "rewards/chosen": 5.077417055765788, "rewards/margins": 9.832870165507, "rewards/rejected": -4.755453109741211, "step": 571 }, { "epoch": 0.15677675757160478, "grad_norm": 11.75, "kl": 8.629748344421387, "learning_rate": 5e-06, "logits/chosen": -17553662.222222224, "logits/rejected": -18324980.0, "logps/chosen": -497.08653428819446, "logps/rejected": -598.8126627604166, "loss": 0.0595, "rewards/chosen": 4.78342522515191, "rewards/margins": 12.485023074679905, "rewards/rejected": -7.701597849527995, "step": 572 }, { "epoch": 0.15705084281211457, "grad_norm": 8.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 17352387.555555556, "logits/rejected": -11370734.933333334, "logps/chosen": -442.3085666232639, "logps/rejected": -461.78427734375, "loss": 0.0346, "rewards/chosen": 6.045078277587891, "rewards/margins": 12.830903371175129, "rewards/rejected": -6.785825093587239, "step": 573 }, { "epoch": 0.15732492805262438, "grad_norm": 20.5, "kl": 17.206520080566406, "learning_rate": 5e-06, "logits/chosen": -12345366.588235294, "logits/rejected": -4391753.714285715, "logps/chosen": -511.6578584558824, "logps/rejected": -577.2845284598214, "loss": 0.1183, "rewards/chosen": 4.916720222024357, "rewards/margins": 13.111989125484179, "rewards/rejected": -8.195268903459821, "step": 574 }, { "epoch": 0.15759901329313417, "grad_norm": 5.46875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -11385242.666666666, "logits/rejected": 872564.8, "logps/chosen": -399.36102973090277, "logps/rejected": -399.1138020833333, "loss": 0.0168, "rewards/chosen": 5.810558742947048, "rewards/margins": 11.79163835313585, "rewards/rejected": -5.981079610188802, "step": 575 }, { "epoch": 0.15787309853364395, "grad_norm": 8.25, "kl": 1.7618205547332764, "learning_rate": 5e-06, "logits/chosen": -12381991.2, "logits/rejected": -6964927.428571428, "logps/chosen": -483.90439453125, "logps/rejected": -318.56630161830356, "loss": 0.0578, "rewards/chosen": 5.67884407043457, "rewards/margins": 10.486828558785575, "rewards/rejected": -4.807984488351004, "step": 576 }, { "epoch": 0.15814718377415377, "grad_norm": 14.1875, "kl": 1.9587421417236328, "learning_rate": 5e-06, "logits/chosen": -5327485.142857143, "logits/rejected": -1310499.2, "logps/chosen": -468.7845982142857, "logps/rejected": -519.53447265625, "loss": 0.0602, "rewards/chosen": 4.542179652622768, "rewards/margins": 10.737486812046598, "rewards/rejected": -6.1953071594238285, "step": 577 }, { "epoch": 0.15842126901466355, "grad_norm": 7.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -6339002.0, "logits/rejected": 581469.1666666666, "logps/chosen": -521.9292805989584, "logps/rejected": -486.0059407552083, "loss": 0.0387, "rewards/chosen": 4.194308598836263, "rewards/margins": 11.413398106892902, "rewards/rejected": -7.219089508056641, "step": 578 }, { "epoch": 0.15869535425517337, "grad_norm": 12.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 34170744.0, "logits/rejected": -6131188.444444444, "logps/chosen": -385.1976318359375, "logps/rejected": -544.8289930555555, "loss": 0.0946, "rewards/chosen": 5.003133138020833, "rewards/margins": 11.47082773844401, "rewards/rejected": -6.467694600423177, "step": 579 }, { "epoch": 0.15896943949568315, "grad_norm": 11.8125, "kl": 6.543907165527344, "learning_rate": 5e-06, "logits/chosen": -10657586.0, "logits/rejected": 10758698.0, "logps/chosen": -369.29888916015625, "logps/rejected": -596.492431640625, "loss": 0.0901, "rewards/chosen": 4.716132640838623, "rewards/margins": 12.851670742034912, "rewards/rejected": -8.135538101196289, "step": 580 }, { "epoch": 0.15924352473619297, "grad_norm": 11.5, "kl": 0.21943537890911102, "learning_rate": 5e-06, "logits/chosen": -16306348.57142857, "logits/rejected": -4303826.4, "logps/chosen": -434.2088099888393, "logps/rejected": -778.4916015625, "loss": 0.0651, "rewards/chosen": 3.8467314583914622, "rewards/margins": 12.133083234514508, "rewards/rejected": -8.286351776123047, "step": 581 }, { "epoch": 0.15951760997670275, "grad_norm": 6.53125, "kl": 7.1478071212768555, "learning_rate": 5e-06, "logits/chosen": 6938957.0, "logits/rejected": -10143872.0, "logps/chosen": -512.41943359375, "logps/rejected": -520.489990234375, "loss": 0.0445, "rewards/chosen": 6.454849720001221, "rewards/margins": 13.70491647720337, "rewards/rejected": -7.250066757202148, "step": 582 }, { "epoch": 0.15979169521721256, "grad_norm": 12.25, "kl": 10.34714412689209, "learning_rate": 5e-06, "logits/chosen": -11323276.235294119, "logits/rejected": -4436993.142857143, "logps/chosen": -449.60842715992646, "logps/rejected": -572.3104073660714, "loss": 0.0514, "rewards/chosen": 5.8663428811466, "rewards/margins": 13.543643470571823, "rewards/rejected": -7.677300589425223, "step": 583 }, { "epoch": 0.16006578045772235, "grad_norm": 9.625, "kl": 1.4708786010742188, "learning_rate": 5e-06, "logits/chosen": -12581109.6, "logits/rejected": -9056502.285714285, "logps/chosen": -478.440380859375, "logps/rejected": -578.5616978236607, "loss": 0.0499, "rewards/chosen": 5.401086044311524, "rewards/margins": 16.342020361764092, "rewards/rejected": -10.940934317452568, "step": 584 }, { "epoch": 0.16033986569823214, "grad_norm": 3.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -9052035.636363637, "logits/rejected": -3314979.6923076925, "logps/chosen": -429.2013494318182, "logps/rejected": -792.3548677884615, "loss": 0.0109, "rewards/chosen": 4.987159382213246, "rewards/margins": 16.688621174205434, "rewards/rejected": -11.701461791992188, "step": 585 }, { "epoch": 0.16061395093874195, "grad_norm": 10.6875, "kl": 2.2626407146453857, "learning_rate": 5e-06, "logits/chosen": -2724571.076923077, "logits/rejected": -611249.0909090909, "logps/chosen": -501.31494140625, "logps/rejected": -517.1008522727273, "loss": 0.0495, "rewards/chosen": 5.267755361703726, "rewards/margins": 11.36803196193455, "rewards/rejected": -6.100276600230824, "step": 586 }, { "epoch": 0.16088803617925174, "grad_norm": 8.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -15509230.0, "logits/rejected": 7952665.0, "logps/chosen": -591.14208984375, "logps/rejected": -548.32080078125, "loss": 0.0452, "rewards/chosen": 5.308307647705078, "rewards/margins": 13.20925235748291, "rewards/rejected": -7.900944709777832, "step": 587 }, { "epoch": 0.16116212141976155, "grad_norm": 7.3125, "kl": 0.8684476613998413, "learning_rate": 5e-06, "logits/chosen": -21464094.545454547, "logits/rejected": -20607389.53846154, "logps/chosen": -420.55366654829544, "logps/rejected": -470.59427584134613, "loss": 0.0384, "rewards/chosen": 4.096528833562678, "rewards/margins": 9.385719859516705, "rewards/rejected": -5.289191025954026, "step": 588 }, { "epoch": 0.16143620666027134, "grad_norm": 9.9375, "kl": 6.693058013916016, "learning_rate": 5e-06, "logits/chosen": 7993347.428571428, "logits/rejected": -5490477.2, "logps/chosen": -392.38180106026783, "logps/rejected": -450.580908203125, "loss": 0.0692, "rewards/chosen": 5.129941667829241, "rewards/margins": 10.285830797467913, "rewards/rejected": -5.1558891296386715, "step": 589 }, { "epoch": 0.16171029190078115, "grad_norm": 4.6875, "kl": 6.798520088195801, "learning_rate": 5e-06, "logits/chosen": 1199547.6923076923, "logits/rejected": -6975269.818181818, "logps/chosen": -443.7477463942308, "logps/rejected": -534.5969016335227, "loss": 0.0478, "rewards/chosen": 5.66654557448167, "rewards/margins": 12.426325071108092, "rewards/rejected": -6.759779496626421, "step": 590 }, { "epoch": 0.16198437714129094, "grad_norm": 6.34375, "kl": 3.3228163719177246, "learning_rate": 5e-06, "logits/chosen": -4859122.333333333, "logits/rejected": -21014374.666666668, "logps/chosen": -493.0299479166667, "logps/rejected": -586.0147298177084, "loss": 0.0195, "rewards/chosen": 5.148767789204915, "rewards/margins": 12.859822273254395, "rewards/rejected": -7.7110544840494795, "step": 591 }, { "epoch": 0.16225846238180075, "grad_norm": 12.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -21052065.6, "logits/rejected": 12456104.0, "logps/chosen": -477.59609375, "logps/rejected": -456.83140345982144, "loss": 0.0545, "rewards/chosen": 5.6855918884277346, "rewards/margins": 12.622719682965961, "rewards/rejected": -6.937127794538226, "step": 592 }, { "epoch": 0.16253254762231054, "grad_norm": 10.75, "kl": 2.7291362285614014, "learning_rate": 5e-06, "logits/chosen": -11918392.8, "logits/rejected": 10465554.285714285, "logps/chosen": -381.458203125, "logps/rejected": -606.2400948660714, "loss": 0.0423, "rewards/chosen": 5.796901702880859, "rewards/margins": 11.56010273524693, "rewards/rejected": -5.763201032366071, "step": 593 }, { "epoch": 0.16280663286282035, "grad_norm": 12.0625, "kl": 7.487538814544678, "learning_rate": 5e-06, "logits/chosen": -8001026.909090909, "logits/rejected": 7078738.461538462, "logps/chosen": -397.0167791193182, "logps/rejected": -644.3128004807693, "loss": 0.085, "rewards/chosen": 3.7601852416992188, "rewards/margins": 10.96469937838041, "rewards/rejected": -7.20451413668119, "step": 594 }, { "epoch": 0.16308071810333014, "grad_norm": 1.71875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -3745489.3333333335, "logits/rejected": -10639032.666666666, "logps/chosen": -530.18212890625, "logps/rejected": -616.0508626302084, "loss": 0.0058, "rewards/chosen": 6.1151885986328125, "rewards/margins": 14.470842361450195, "rewards/rejected": -8.355653762817383, "step": 595 }, { "epoch": 0.16335480334383992, "grad_norm": 7.9375, "kl": 2.688185453414917, "learning_rate": 5e-06, "logits/chosen": -17976625.454545453, "logits/rejected": -6369225.846153846, "logps/chosen": -417.20432350852275, "logps/rejected": -382.56629356971155, "loss": 0.0695, "rewards/chosen": 5.68178315596147, "rewards/margins": 11.078763441606, "rewards/rejected": -5.396980285644531, "step": 596 }, { "epoch": 0.16362888858434974, "grad_norm": 8.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -991958.3076923077, "logits/rejected": -14542749.090909092, "logps/chosen": -480.1695087139423, "logps/rejected": -747.0297407670455, "loss": 0.027, "rewards/chosen": 4.728157336895283, "rewards/margins": 14.16811746984095, "rewards/rejected": -9.439960132945668, "step": 597 }, { "epoch": 0.16390297382485952, "grad_norm": 12.5, "kl": 5.9982171058654785, "learning_rate": 5e-06, "logits/chosen": 1032654.9333333333, "logits/rejected": 1405691.111111111, "logps/chosen": -506.73681640625, "logps/rejected": -436.9919704861111, "loss": 0.0302, "rewards/chosen": 5.927615356445313, "rewards/margins": 14.07066141764323, "rewards/rejected": -8.143046061197916, "step": 598 }, { "epoch": 0.16417705906536934, "grad_norm": 4.65625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -975705.25, "logits/rejected": -7442788.0, "logps/chosen": -428.7931823730469, "logps/rejected": -457.26446533203125, "loss": 0.0212, "rewards/chosen": 4.656890869140625, "rewards/margins": 11.898754119873047, "rewards/rejected": -7.241863250732422, "step": 599 }, { "epoch": 0.16445114430587912, "grad_norm": 15.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 31468435.2, "logits/rejected": 71734345.14285715, "logps/chosen": -466.204443359375, "logps/rejected": -441.74574497767856, "loss": 0.0429, "rewards/chosen": 5.582163619995117, "rewards/margins": 13.26701077052525, "rewards/rejected": -7.684847150530134, "step": 600 }, { "epoch": 0.16472522954638893, "grad_norm": 13.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -21082824.0, "logits/rejected": 3564345.0, "logps/chosen": -353.9439290364583, "logps/rejected": -366.9765218098958, "loss": 0.0888, "rewards/chosen": 4.342309951782227, "rewards/margins": 7.8715051015218105, "rewards/rejected": -3.5291951497395835, "step": 601 }, { "epoch": 0.16499931478689872, "grad_norm": 9.5, "kl": 3.7175166606903076, "learning_rate": 5e-06, "logits/chosen": -9241935.272727273, "logits/rejected": 1783704.6153846155, "logps/chosen": -372.6930486505682, "logps/rejected": -425.8650090144231, "loss": 0.0955, "rewards/chosen": 3.685084256258878, "rewards/margins": 10.668220946838805, "rewards/rejected": -6.9831366905799275, "step": 602 }, { "epoch": 0.16527340002740853, "grad_norm": 12.25, "kl": 0.018527984619140625, "learning_rate": 5e-06, "logits/chosen": -7693715.428571428, "logits/rejected": -4969118.4, "logps/chosen": -354.350830078125, "logps/rejected": -588.23154296875, "loss": 0.0544, "rewards/chosen": 4.955654689243862, "rewards/margins": 12.107929011753628, "rewards/rejected": -7.152274322509766, "step": 603 }, { "epoch": 0.16554748526791832, "grad_norm": 7.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 6869188.0, "logits/rejected": -2536917.714285714, "logps/chosen": -309.8878173828125, "logps/rejected": -489.63246372767856, "loss": 0.0406, "rewards/chosen": 5.1540180206298825, "rewards/margins": 11.837336022513252, "rewards/rejected": -6.683318001883371, "step": 604 }, { "epoch": 0.16582157050842813, "grad_norm": 12.9375, "kl": 16.354469299316406, "learning_rate": 5e-06, "logits/chosen": -19429112.470588237, "logits/rejected": -1491909.142857143, "logps/chosen": -405.96961167279414, "logps/rejected": -448.18540736607144, "loss": 0.082, "rewards/chosen": 5.253938562729779, "rewards/margins": 12.099199727803718, "rewards/rejected": -6.84526116507394, "step": 605 }, { "epoch": 0.16609565574893792, "grad_norm": 8.625, "kl": 0.2581399381160736, "learning_rate": 5e-06, "logits/chosen": -4891841.142857143, "logits/rejected": -12961888.8, "logps/chosen": -342.68355887276783, "logps/rejected": -444.5822265625, "loss": 0.0365, "rewards/chosen": 4.993088858468192, "rewards/margins": 11.781675284249442, "rewards/rejected": -6.78858642578125, "step": 606 }, { "epoch": 0.1663697409894477, "grad_norm": 13.125, "kl": 0.00039227804518304765, "learning_rate": 5e-06, "logits/chosen": -4891206.857142857, "logits/rejected": 4478147.2, "logps/chosen": -400.7456752232143, "logps/rejected": -675.07861328125, "loss": 0.092, "rewards/chosen": 4.722159794398716, "rewards/margins": 14.735124424525669, "rewards/rejected": -10.012964630126953, "step": 607 }, { "epoch": 0.16664382622995752, "grad_norm": 3.671875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -34091712.0, "logits/rejected": -25159462.0, "logps/chosen": -361.41656494140625, "logps/rejected": -461.3290100097656, "loss": 0.0206, "rewards/chosen": 5.284795761108398, "rewards/margins": 11.552489757537842, "rewards/rejected": -6.267693996429443, "step": 608 }, { "epoch": 0.1669179114704673, "grad_norm": 10.5625, "kl": 3.3623461723327637, "learning_rate": 5e-06, "logits/chosen": 6837292.923076923, "logits/rejected": -1486046.5454545454, "logps/chosen": -403.54781400240387, "logps/rejected": -588.4688387784091, "loss": 0.0462, "rewards/chosen": 5.061622619628906, "rewards/margins": 13.50637470592152, "rewards/rejected": -8.444752086292613, "step": 609 }, { "epoch": 0.16719199671097712, "grad_norm": 14.5, "kl": 0.8822581171989441, "learning_rate": 5e-06, "logits/chosen": 13707438.4, "logits/rejected": 4474153.714285715, "logps/chosen": -369.165185546875, "logps/rejected": -585.3683733258929, "loss": 0.1197, "rewards/chosen": 4.216021347045898, "rewards/margins": 9.927723966326031, "rewards/rejected": -5.711702619280134, "step": 610 }, { "epoch": 0.1674660819514869, "grad_norm": 8.8125, "kl": 1.596358060836792, "learning_rate": 5e-06, "logits/chosen": 26258974.0, "logits/rejected": -7824429.0, "logps/chosen": -474.74664306640625, "logps/rejected": -491.9432373046875, "loss": 0.0323, "rewards/chosen": 5.283700466156006, "rewards/margins": 11.705013275146484, "rewards/rejected": -6.4213128089904785, "step": 611 }, { "epoch": 0.16774016719199672, "grad_norm": 12.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -26799819.42857143, "logits/rejected": -1542465.4117647058, "logps/chosen": -592.5936104910714, "logps/rejected": -425.9193474264706, "loss": 0.0523, "rewards/chosen": 6.027210235595703, "rewards/margins": 12.46871073105756, "rewards/rejected": -6.441500495461857, "step": 612 }, { "epoch": 0.1680142524325065, "grad_norm": 7.84375, "kl": 2.365126371383667, "learning_rate": 5e-06, "logits/chosen": -25530170.181818184, "logits/rejected": 68952546.46153846, "logps/chosen": -442.3885387073864, "logps/rejected": -628.3334585336538, "loss": 0.046, "rewards/chosen": 5.079356800426137, "rewards/margins": 12.586581250170727, "rewards/rejected": -7.507224449744592, "step": 613 }, { "epoch": 0.16828833767301632, "grad_norm": 10.75, "kl": 0.2827568054199219, "learning_rate": 5e-06, "logits/chosen": -4128424.0, "logits/rejected": -54651.71428571428, "logps/chosen": -397.627392578125, "logps/rejected": -557.4456263950893, "loss": 0.0308, "rewards/chosen": 6.433086395263672, "rewards/margins": 13.218214852469309, "rewards/rejected": -6.785128457205636, "step": 614 }, { "epoch": 0.1685624229135261, "grad_norm": 11.8125, "kl": 7.693988800048828, "learning_rate": 5e-06, "logits/chosen": -23898385.230769232, "logits/rejected": 32550548.363636363, "logps/chosen": -430.98550180288464, "logps/rejected": -507.16415127840907, "loss": 0.0643, "rewards/chosen": 5.607218815730168, "rewards/margins": 10.975924912032546, "rewards/rejected": -5.368706096302379, "step": 615 }, { "epoch": 0.1688365081540359, "grad_norm": 11.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -19332352.0, "logits/rejected": 7757929.0, "logps/chosen": -456.856689453125, "logps/rejected": -488.0924987792969, "loss": 0.0649, "rewards/chosen": 4.695016860961914, "rewards/margins": 10.000749588012695, "rewards/rejected": -5.305732727050781, "step": 616 }, { "epoch": 0.1691105933945457, "grad_norm": 13.1875, "kl": 8.47950553894043, "learning_rate": 5e-06, "logits/chosen": -18805700.266666666, "logits/rejected": -1108292.2222222222, "logps/chosen": -447.11774088541665, "logps/rejected": -417.92914496527777, "loss": 0.0408, "rewards/chosen": 5.531303914388021, "rewards/margins": 12.833858574761285, "rewards/rejected": -7.302554660373264, "step": 617 }, { "epoch": 0.1693846786350555, "grad_norm": 9.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -24637088.0, "logits/rejected": 25770372.0, "logps/chosen": -467.15399169921875, "logps/rejected": -585.050537109375, "loss": 0.0609, "rewards/chosen": 4.638179302215576, "rewards/margins": 14.519019603729248, "rewards/rejected": -9.880840301513672, "step": 618 }, { "epoch": 0.1696587638755653, "grad_norm": 6.46875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -16919421.714285713, "logits/rejected": -9626586.352941176, "logps/chosen": -408.08370535714283, "logps/rejected": -449.3669002757353, "loss": 0.03, "rewards/chosen": 5.362816946847098, "rewards/margins": 12.131371634347097, "rewards/rejected": -6.7685546875, "step": 619 }, { "epoch": 0.1699328491160751, "grad_norm": 18.375, "kl": 6.561077117919922, "learning_rate": 5e-06, "logits/chosen": -21778816.0, "logits/rejected": -12777814.0, "logps/chosen": -464.49786376953125, "logps/rejected": -495.2750244140625, "loss": 0.0716, "rewards/chosen": 4.5224714279174805, "rewards/margins": 11.292692184448242, "rewards/rejected": -6.770220756530762, "step": 620 }, { "epoch": 0.1702069343565849, "grad_norm": 9.375, "kl": 1.5907809734344482, "learning_rate": 5e-06, "logits/chosen": -12429041.23076923, "logits/rejected": -16140980.363636363, "logps/chosen": -376.4668719951923, "logps/rejected": -428.6189630681818, "loss": 0.044, "rewards/chosen": 4.267967224121094, "rewards/margins": 13.596860712224787, "rewards/rejected": -9.328893488103693, "step": 621 }, { "epoch": 0.1704810195970947, "grad_norm": 5.28125, "kl": 0.8825353384017944, "learning_rate": 5e-06, "logits/chosen": -34351638.4, "logits/rejected": -13113014.857142856, "logps/chosen": -520.41376953125, "logps/rejected": -529.1662946428571, "loss": 0.0181, "rewards/chosen": 6.936149597167969, "rewards/margins": 14.476635524204799, "rewards/rejected": -7.540485927036831, "step": 622 }, { "epoch": 0.1707551048376045, "grad_norm": 9.6875, "kl": 1.3214330673217773, "learning_rate": 5e-06, "logits/chosen": 3666558.6666666665, "logits/rejected": -17173086.222222224, "logps/chosen": -317.87526448567706, "logps/rejected": -444.1211208767361, "loss": 0.0465, "rewards/chosen": 4.317401885986328, "rewards/margins": 11.369893815782335, "rewards/rejected": -7.052491929796007, "step": 623 }, { "epoch": 0.1710291900781143, "grad_norm": 9.5625, "kl": 4.629144668579102, "learning_rate": 5e-06, "logits/chosen": 2427826.285714286, "logits/rejected": 14464854.4, "logps/chosen": -393.33095005580356, "logps/rejected": -442.07490234375, "loss": 0.0846, "rewards/chosen": 4.473875318254743, "rewards/margins": 9.392814527239118, "rewards/rejected": -4.918939208984375, "step": 624 }, { "epoch": 0.1713032753186241, "grad_norm": 12.0, "kl": 0.21543249487876892, "learning_rate": 5e-06, "logits/chosen": -8418392.727272727, "logits/rejected": 13623239.384615384, "logps/chosen": -489.15207741477275, "logps/rejected": -499.1727764423077, "loss": 0.0736, "rewards/chosen": 6.399031205610796, "rewards/margins": 12.858609499631228, "rewards/rejected": -6.4595782940204325, "step": 625 }, { "epoch": 0.1715773605591339, "grad_norm": 8.125, "kl": 0.034165702760219574, "learning_rate": 5e-06, "logits/chosen": 20084078.545454547, "logits/rejected": -15428061.538461538, "logps/chosen": -373.99746981534093, "logps/rejected": -547.2461313100962, "loss": 0.0924, "rewards/chosen": 5.0749827298251065, "rewards/margins": 12.989877420705515, "rewards/rejected": -7.914894690880408, "step": 626 }, { "epoch": 0.17185144579964368, "grad_norm": 9.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -11055938.181818182, "logits/rejected": -12696649.846153846, "logps/chosen": -425.85768821022725, "logps/rejected": -339.84945913461536, "loss": 0.043, "rewards/chosen": 4.764149752530185, "rewards/margins": 11.076504207157589, "rewards/rejected": -6.312354454627404, "step": 627 }, { "epoch": 0.1721255310401535, "grad_norm": 10.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -8353955.0, "logits/rejected": -12929469.0, "logps/chosen": -470.1595764160156, "logps/rejected": -592.481201171875, "loss": 0.0618, "rewards/chosen": 5.096713542938232, "rewards/margins": 13.576488971710205, "rewards/rejected": -8.479775428771973, "step": 628 }, { "epoch": 0.17239961628066328, "grad_norm": 6.40625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 968077.4, "logits/rejected": -16293500.631578946, "logps/chosen": -472.464990234375, "logps/rejected": -564.8239103618421, "loss": 0.0257, "rewards/chosen": 8.043956756591797, "rewards/margins": 16.87747533697831, "rewards/rejected": -8.833518580386514, "step": 629 }, { "epoch": 0.1726737015211731, "grad_norm": 4.5625, "kl": 4.010562419891357, "learning_rate": 5e-06, "logits/chosen": -3148150.6666666665, "logits/rejected": -15031994.666666666, "logps/chosen": -401.37158203125, "logps/rejected": -481.8024088541667, "loss": 0.0188, "rewards/chosen": 5.410300572713216, "rewards/margins": 11.441255569458008, "rewards/rejected": -6.030954996744792, "step": 630 }, { "epoch": 0.17294778676168288, "grad_norm": 8.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -19013426.90909091, "logits/rejected": -16223379.692307692, "logps/chosen": -396.63174715909093, "logps/rejected": -641.5542367788462, "loss": 0.0696, "rewards/chosen": 4.992831143465909, "rewards/margins": 14.756763004756476, "rewards/rejected": -9.763931861290565, "step": 631 }, { "epoch": 0.1732218720021927, "grad_norm": 4.1875, "kl": 3.0601553916931152, "learning_rate": 5e-06, "logits/chosen": 13681845.818181818, "logits/rejected": -31343463.384615384, "logps/chosen": -523.9605823863636, "logps/rejected": -419.8161808894231, "loss": 0.02, "rewards/chosen": 5.150731520219282, "rewards/margins": 12.677018892514955, "rewards/rejected": -7.526287372295673, "step": 632 }, { "epoch": 0.17349595724270248, "grad_norm": 9.75, "kl": 0.2849667966365814, "learning_rate": 5e-06, "logits/chosen": -12175045.818181818, "logits/rejected": -2442920.153846154, "logps/chosen": -407.50736860795456, "logps/rejected": -473.5400390625, "loss": 0.052, "rewards/chosen": 5.11764873157848, "rewards/margins": 12.648482636138276, "rewards/rejected": -7.530833904559795, "step": 633 }, { "epoch": 0.1737700424832123, "grad_norm": 9.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -9995310.0, "logits/rejected": -8318003.5, "logps/chosen": -379.6719665527344, "logps/rejected": -532.6536865234375, "loss": 0.06, "rewards/chosen": 4.979827404022217, "rewards/margins": 13.115407466888428, "rewards/rejected": -8.135580062866211, "step": 634 }, { "epoch": 0.17404412772372208, "grad_norm": 13.625, "kl": 0.9264259338378906, "learning_rate": 5e-06, "logits/chosen": 9588302.0, "logits/rejected": 50501514.666666664, "logps/chosen": -468.8567301432292, "logps/rejected": -655.6817626953125, "loss": 0.0898, "rewards/chosen": 5.253516515096028, "rewards/margins": 15.904151916503906, "rewards/rejected": -10.650635401407877, "step": 635 }, { "epoch": 0.1743182129642319, "grad_norm": 11.75, "kl": 10.274627685546875, "learning_rate": 5e-06, "logits/chosen": -10760670.11764706, "logits/rejected": -24146541.714285713, "logps/chosen": -454.4193474264706, "logps/rejected": -283.66990443638394, "loss": 0.0883, "rewards/chosen": 5.011727276970358, "rewards/margins": 10.053315651516954, "rewards/rejected": -5.041588374546596, "step": 636 }, { "epoch": 0.17459229820474167, "grad_norm": 8.5, "kl": 0.7969335317611694, "learning_rate": 5e-06, "logits/chosen": -1974076.6666666667, "logits/rejected": -37983381.333333336, "logps/chosen": -371.3429361979167, "logps/rejected": -539.6302083333334, "loss": 0.0628, "rewards/chosen": 4.60376951429579, "rewards/margins": 12.085649702284071, "rewards/rejected": -7.481880187988281, "step": 637 }, { "epoch": 0.17486638344525146, "grad_norm": 14.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 4553660.0, "logits/rejected": -12080604.444444444, "logps/chosen": -492.7112630208333, "logps/rejected": -470.0026584201389, "loss": 0.0563, "rewards/chosen": 6.685591379801433, "rewards/margins": 13.051228841145834, "rewards/rejected": -6.365637461344401, "step": 638 }, { "epoch": 0.17514046868576127, "grad_norm": 17.125, "kl": 1.79473876953125, "learning_rate": 5e-06, "logits/chosen": -24468142.545454547, "logits/rejected": -610688.6153846154, "logps/chosen": -523.5799893465909, "logps/rejected": -557.6064077524038, "loss": 0.0757, "rewards/chosen": 5.368803544477983, "rewards/margins": 12.44222531618772, "rewards/rejected": -7.073421771709736, "step": 639 }, { "epoch": 0.17541455392627106, "grad_norm": 9.125, "kl": 4.310084342956543, "learning_rate": 5e-06, "logits/chosen": -15822255.05882353, "logits/rejected": -22508685.714285713, "logps/chosen": -472.41676240808823, "logps/rejected": -481.84444754464283, "loss": 0.0601, "rewards/chosen": 4.725383534150965, "rewards/margins": 11.680610432344324, "rewards/rejected": -6.955226898193359, "step": 640 }, { "epoch": 0.17568863916678087, "grad_norm": 11.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -25526708.0, "logits/rejected": -4714817.5, "logps/chosen": -592.0956420898438, "logps/rejected": -493.6246032714844, "loss": 0.0248, "rewards/chosen": 5.870402812957764, "rewards/margins": 14.203567028045654, "rewards/rejected": -8.33316421508789, "step": 641 }, { "epoch": 0.17596272440729066, "grad_norm": 10.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -12354376.0, "logits/rejected": -19746139.42857143, "logps/chosen": -629.892138671875, "logps/rejected": -516.0794503348214, "loss": 0.0267, "rewards/chosen": 5.061809539794922, "rewards/margins": 13.428099496023995, "rewards/rejected": -8.366289956229073, "step": 642 }, { "epoch": 0.17623680964780047, "grad_norm": 6.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -17058336.0, "logits/rejected": -4285621.333333333, "logps/chosen": -470.1681315104167, "logps/rejected": -617.9827473958334, "loss": 0.0382, "rewards/chosen": 4.7091671625773115, "rewards/margins": 12.296644528706869, "rewards/rejected": -7.587477366129558, "step": 643 }, { "epoch": 0.17651089488831026, "grad_norm": 11.4375, "kl": 4.509880065917969, "learning_rate": 5e-06, "logits/chosen": -17476712.0, "logits/rejected": -7050060.0, "logps/chosen": -374.22332763671875, "logps/rejected": -497.633544921875, "loss": 0.1316, "rewards/chosen": 3.506448268890381, "rewards/margins": 11.660022258758545, "rewards/rejected": -8.153573989868164, "step": 644 }, { "epoch": 0.17678498012882007, "grad_norm": 3.046875, "kl": 1.3474719524383545, "learning_rate": 5e-06, "logits/chosen": -4813805.0, "logits/rejected": -6792294.666666667, "logps/chosen": -452.0450846354167, "logps/rejected": -574.9203287760416, "loss": 0.0087, "rewards/chosen": 8.35629145304362, "rewards/margins": 15.954441706339518, "rewards/rejected": -7.598150253295898, "step": 645 }, { "epoch": 0.17705906536932986, "grad_norm": 10.6875, "kl": 1.1362636089324951, "learning_rate": 5e-06, "logits/chosen": -16212571.636363637, "logits/rejected": 3122461.5384615385, "logps/chosen": -414.36430220170456, "logps/rejected": -474.57019981971155, "loss": 0.0521, "rewards/chosen": 4.538210435347124, "rewards/margins": 10.655191354818278, "rewards/rejected": -6.116980919471154, "step": 646 }, { "epoch": 0.17733315060983967, "grad_norm": 8.375, "kl": 4.020722389221191, "learning_rate": 5e-06, "logits/chosen": -26785319.384615384, "logits/rejected": -4535187.636363637, "logps/chosen": -356.5911207932692, "logps/rejected": -470.1048473011364, "loss": 0.0511, "rewards/chosen": 5.003856952373798, "rewards/margins": 11.34448861075448, "rewards/rejected": -6.340631658380682, "step": 647 }, { "epoch": 0.17760723585034946, "grad_norm": 14.4375, "kl": 7.32028341293335, "learning_rate": 5e-06, "logits/chosen": -5145808.0, "logits/rejected": 6526503.111111111, "logps/chosen": -411.61546223958334, "logps/rejected": -469.951171875, "loss": 0.1057, "rewards/chosen": 5.406846618652343, "rewards/margins": 12.76443125406901, "rewards/rejected": -7.357584635416667, "step": 648 }, { "epoch": 0.17788132109085925, "grad_norm": 16.625, "kl": 3.731398344039917, "learning_rate": 5e-06, "logits/chosen": -14962496.0, "logits/rejected": -36835926.4, "logps/chosen": -463.05262974330356, "logps/rejected": -545.154296875, "loss": 0.0686, "rewards/chosen": 4.5617634909493585, "rewards/margins": 13.439546694074359, "rewards/rejected": -8.877783203125, "step": 649 }, { "epoch": 0.17815540633136906, "grad_norm": 10.875, "kl": 5.982331275939941, "learning_rate": 5e-06, "logits/chosen": 16831413.818181816, "logits/rejected": -4087279.3846153845, "logps/chosen": -515.5729314630681, "logps/rejected": -472.5354191706731, "loss": 0.0662, "rewards/chosen": 6.266550931063565, "rewards/margins": 14.312198185420538, "rewards/rejected": -8.045647254356972, "step": 650 }, { "epoch": 0.17842949157187885, "grad_norm": 10.8125, "kl": 2.4124507904052734, "learning_rate": 5e-06, "logits/chosen": -11410084.923076924, "logits/rejected": -14445280.0, "logps/chosen": -454.6585036057692, "logps/rejected": -368.64945845170456, "loss": 0.0602, "rewards/chosen": 4.526606926551232, "rewards/margins": 9.853844836041645, "rewards/rejected": -5.327237909490412, "step": 651 }, { "epoch": 0.17870357681238866, "grad_norm": 12.125, "kl": 4.547926902770996, "learning_rate": 5e-06, "logits/chosen": -15997245.090909092, "logits/rejected": 2310278.153846154, "logps/chosen": -363.4921875, "logps/rejected": -651.4320913461538, "loss": 0.0759, "rewards/chosen": 4.602999600497159, "rewards/margins": 14.188006314364348, "rewards/rejected": -9.585006713867188, "step": 652 }, { "epoch": 0.17897766205289845, "grad_norm": 6.125, "kl": 1.457867980003357, "learning_rate": 5e-06, "logits/chosen": -16924388.923076924, "logits/rejected": 4215090.181818182, "logps/chosen": -386.61083984375, "logps/rejected": -601.1967329545455, "loss": 0.0326, "rewards/chosen": 5.171678396371695, "rewards/margins": 13.74350706513945, "rewards/rejected": -8.571828668767756, "step": 653 }, { "epoch": 0.17925174729340826, "grad_norm": 11.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -23886685.333333332, "logits/rejected": 15876461.333333334, "logps/chosen": -452.3121337890625, "logps/rejected": -528.6349690755209, "loss": 0.0331, "rewards/chosen": 5.482163747151692, "rewards/margins": 12.260026931762695, "rewards/rejected": -6.777863184611003, "step": 654 }, { "epoch": 0.17952583253391804, "grad_norm": 16.625, "kl": 3.020371913909912, "learning_rate": 5e-06, "logits/chosen": 6216066.5, "logits/rejected": 36494708.0, "logps/chosen": -473.69439697265625, "logps/rejected": -546.4367065429688, "loss": 0.1322, "rewards/chosen": 4.707186222076416, "rewards/margins": 11.705403804779053, "rewards/rejected": -6.998217582702637, "step": 655 }, { "epoch": 0.17979991777442786, "grad_norm": 15.25, "kl": 3.056267499923706, "learning_rate": 5e-06, "logits/chosen": -3894609.1428571427, "logits/rejected": -21451481.6, "logps/chosen": -356.53885323660717, "logps/rejected": -570.129443359375, "loss": 0.1057, "rewards/chosen": 4.014279229300363, "rewards/margins": 11.567349297659739, "rewards/rejected": -7.553070068359375, "step": 656 }, { "epoch": 0.18007400301493764, "grad_norm": 11.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -30478187.42857143, "logits/rejected": 1221798.1176470588, "logps/chosen": -437.268310546875, "logps/rejected": -584.4802389705883, "loss": 0.0423, "rewards/chosen": 5.009434836251395, "rewards/margins": 10.995393160010586, "rewards/rejected": -5.985958323759191, "step": 657 }, { "epoch": 0.18034808825544743, "grad_norm": 4.4375, "kl": 2.5984294414520264, "learning_rate": 5e-06, "logits/chosen": -25621408.0, "logits/rejected": -14317317.818181818, "logps/chosen": -504.54736328125, "logps/rejected": -442.17520419034093, "loss": 0.0141, "rewards/chosen": 5.8022930438701925, "rewards/margins": 12.47317067393056, "rewards/rejected": -6.670877630060369, "step": 658 }, { "epoch": 0.18062217349595724, "grad_norm": 12.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -4832799.111111111, "logits/rejected": 13279668.266666668, "logps/chosen": -340.9364963107639, "logps/rejected": -534.7956705729167, "loss": 0.0628, "rewards/chosen": 5.769847869873047, "rewards/margins": 11.672466786702474, "rewards/rejected": -5.902618916829427, "step": 659 }, { "epoch": 0.18089625873646703, "grad_norm": 7.40625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 7954560.0, "logits/rejected": 33777670.4, "logps/chosen": -442.03763253348217, "logps/rejected": -826.02236328125, "loss": 0.0262, "rewards/chosen": 4.904408046177456, "rewards/margins": 16.82922417776925, "rewards/rejected": -11.924816131591797, "step": 660 }, { "epoch": 0.18117034397697684, "grad_norm": 11.3125, "kl": 2.773186445236206, "learning_rate": 5e-06, "logits/chosen": 19473920.0, "logits/rejected": -6037664.0, "logps/chosen": -381.4449462890625, "logps/rejected": -668.2956194196429, "loss": 0.0518, "rewards/chosen": 5.156189727783203, "rewards/margins": 12.537233843122209, "rewards/rejected": -7.381044115339007, "step": 661 }, { "epoch": 0.18144442921748663, "grad_norm": 18.75, "kl": 0.21270053088665009, "learning_rate": 5e-06, "logits/chosen": -18838948.57142857, "logits/rejected": 9693463.2, "logps/chosen": -350.2017299107143, "logps/rejected": -526.1, "loss": 0.1215, "rewards/chosen": 3.9127535138811385, "rewards/margins": 8.854418781825474, "rewards/rejected": -4.941665267944336, "step": 662 }, { "epoch": 0.18171851445799644, "grad_norm": 5.65625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -35685810.666666664, "logits/rejected": 6807774.222222222, "logps/chosen": -490.149658203125, "logps/rejected": -479.57275390625, "loss": 0.0203, "rewards/chosen": 6.119256973266602, "rewards/margins": 12.930627822875977, "rewards/rejected": -6.811370849609375, "step": 663 }, { "epoch": 0.18199259969850623, "grad_norm": 11.625, "kl": 9.57091236114502, "learning_rate": 5e-06, "logits/chosen": -12188646.0, "logits/rejected": -10997197.0, "logps/chosen": -408.8459167480469, "logps/rejected": -491.891357421875, "loss": 0.0635, "rewards/chosen": 5.3844380378723145, "rewards/margins": 13.442701816558838, "rewards/rejected": -8.058263778686523, "step": 664 }, { "epoch": 0.18226668493901604, "grad_norm": 9.25, "kl": 0.8608468770980835, "learning_rate": 5e-06, "logits/chosen": 118912277.33333333, "logits/rejected": -13355916.0, "logps/chosen": -469.6709391276042, "logps/rejected": -407.3434244791667, "loss": 0.066, "rewards/chosen": 5.318471272786458, "rewards/margins": 10.814470926920572, "rewards/rejected": -5.495999654134114, "step": 665 }, { "epoch": 0.18254077017952583, "grad_norm": 13.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 38526944.0, "logits/rejected": -32757060.57142857, "logps/chosen": -412.59296875, "logps/rejected": -650.3498883928571, "loss": 0.0737, "rewards/chosen": 3.2014957427978517, "rewards/margins": 12.405536270141601, "rewards/rejected": -9.20404052734375, "step": 666 }, { "epoch": 0.18281485542003564, "grad_norm": 7.09375, "kl": 8.214457511901855, "learning_rate": 5e-06, "logits/chosen": -3771388.923076923, "logits/rejected": 3673949.8181818184, "logps/chosen": -415.4045597956731, "logps/rejected": -531.1661044034091, "loss": 0.0858, "rewards/chosen": 5.785321162297175, "rewards/margins": 14.031579291070258, "rewards/rejected": -8.246258128773082, "step": 667 }, { "epoch": 0.18308894066054543, "grad_norm": 15.25, "kl": 11.542875289916992, "learning_rate": 5e-06, "logits/chosen": 5794169.6, "logits/rejected": -9594542.222222222, "logps/chosen": -464.01360677083335, "logps/rejected": -657.1080186631945, "loss": 0.055, "rewards/chosen": 5.3656260172526045, "rewards/margins": 13.288129170735678, "rewards/rejected": -7.922503153483073, "step": 668 }, { "epoch": 0.18336302590105522, "grad_norm": 15.125, "kl": 13.120969772338867, "learning_rate": 5e-06, "logits/chosen": -13788983.272727273, "logits/rejected": 28052258.46153846, "logps/chosen": -534.2568359375, "logps/rejected": -481.0540114182692, "loss": 0.0665, "rewards/chosen": 6.149587457830256, "rewards/margins": 12.476706551505135, "rewards/rejected": -6.32711909367488, "step": 669 }, { "epoch": 0.18363711114156503, "grad_norm": 9.0, "kl": 0.6274642944335938, "learning_rate": 5e-06, "logits/chosen": -9907148.0, "logits/rejected": -1662651.3333333333, "logps/chosen": -516.4973551432291, "logps/rejected": -617.02197265625, "loss": 0.0368, "rewards/chosen": 6.550614674886067, "rewards/margins": 14.693857192993164, "rewards/rejected": -8.143242518107096, "step": 670 }, { "epoch": 0.18391119638207482, "grad_norm": 8.0625, "kl": 2.0946121215820312, "learning_rate": 5e-06, "logits/chosen": 6462261.818181818, "logits/rejected": 10161086.153846154, "logps/chosen": -370.83327414772725, "logps/rejected": -646.6476862980769, "loss": 0.039, "rewards/chosen": 4.5743935324928975, "rewards/margins": 12.211834140590854, "rewards/rejected": -7.637440608097957, "step": 671 }, { "epoch": 0.18418528162258463, "grad_norm": 11.75, "kl": 9.384299278259277, "learning_rate": 5e-06, "logits/chosen": -15353504.0, "logits/rejected": -12988388.0, "logps/chosen": -507.3971354166667, "logps/rejected": -484.0600992838542, "loss": 0.0498, "rewards/chosen": 6.222944895426433, "rewards/margins": 12.37914784749349, "rewards/rejected": -6.156202952067058, "step": 672 }, { "epoch": 0.18445936686309441, "grad_norm": 16.0, "kl": 1.6066831350326538, "learning_rate": 5e-06, "logits/chosen": -10940402.666666666, "logits/rejected": -2078624.3333333333, "logps/chosen": -344.9315185546875, "logps/rejected": -482.9273274739583, "loss": 0.0606, "rewards/chosen": 4.9975935618082685, "rewards/margins": 11.776281356811523, "rewards/rejected": -6.778687795003255, "step": 673 }, { "epoch": 0.18473345210360423, "grad_norm": 13.3125, "kl": 6.260837554931641, "learning_rate": 5e-06, "logits/chosen": 5439182.769230769, "logits/rejected": 2330095.272727273, "logps/chosen": -428.56201171875, "logps/rejected": -487.4564098011364, "loss": 0.0964, "rewards/chosen": 4.348545954777644, "rewards/margins": 11.938914052256337, "rewards/rejected": -7.590368097478693, "step": 674 }, { "epoch": 0.18500753734411401, "grad_norm": 12.125, "kl": 3.449218273162842, "learning_rate": 5e-06, "logits/chosen": -15805232.0, "logits/rejected": 3055830.6666666665, "logps/chosen": -451.4548746744792, "logps/rejected": -495.1999918619792, "loss": 0.0572, "rewards/chosen": 4.746454238891602, "rewards/margins": 9.908045768737793, "rewards/rejected": -5.161591529846191, "step": 675 }, { "epoch": 0.18528162258462383, "grad_norm": 8.4375, "kl": 8.478172302246094, "learning_rate": 5e-06, "logits/chosen": -33232389.333333332, "logits/rejected": 2388525.3333333335, "logps/chosen": -455.060302734375, "logps/rejected": -559.5874837239584, "loss": 0.0375, "rewards/chosen": 5.600802103678386, "rewards/margins": 13.319561004638672, "rewards/rejected": -7.718758900960286, "step": 676 }, { "epoch": 0.18555570782513361, "grad_norm": 7.09375, "kl": 11.30219841003418, "learning_rate": 5e-06, "logits/chosen": 4542419.692307692, "logits/rejected": 6914904.7272727275, "logps/chosen": -421.0422175480769, "logps/rejected": -446.58203125, "loss": 0.0465, "rewards/chosen": 7.001137366661658, "rewards/margins": 12.586546357695038, "rewards/rejected": -5.585408991033381, "step": 677 }, { "epoch": 0.18582979306564343, "grad_norm": 27.625, "kl": 3.571605682373047, "learning_rate": 5e-06, "logits/chosen": -34220149.333333336, "logits/rejected": 12536318.666666666, "logps/chosen": -459.1047770182292, "logps/rejected": -315.3266194661458, "loss": 0.0813, "rewards/chosen": 5.416045506795247, "rewards/margins": 10.990102132161457, "rewards/rejected": -5.574056625366211, "step": 678 }, { "epoch": 0.18610387830615321, "grad_norm": 13.25, "kl": 6.238208770751953, "learning_rate": 5e-06, "logits/chosen": 5504319.428571428, "logits/rejected": -2798040.6, "logps/chosen": -492.7675083705357, "logps/rejected": -283.1787841796875, "loss": 0.0526, "rewards/chosen": 6.383028302873884, "rewards/margins": 10.856230436052595, "rewards/rejected": -4.473202133178711, "step": 679 }, { "epoch": 0.186377963546663, "grad_norm": 12.4375, "kl": 24.762876510620117, "learning_rate": 5e-06, "logits/chosen": -8330652.235294118, "logits/rejected": 36335888.0, "logps/chosen": -365.7442267922794, "logps/rejected": -597.2920619419643, "loss": 0.1758, "rewards/chosen": 5.247539295869715, "rewards/margins": 13.82821905312418, "rewards/rejected": -8.580679757254464, "step": 680 }, { "epoch": 0.18665204878717281, "grad_norm": 12.9375, "kl": 0.7350603938102722, "learning_rate": 5e-06, "logits/chosen": -14394880.0, "logits/rejected": -7032176.0, "logps/chosen": -381.30984933035717, "logps/rejected": -557.5697265625, "loss": 0.0766, "rewards/chosen": 4.136623382568359, "rewards/margins": 10.568343353271484, "rewards/rejected": -6.431719970703125, "step": 681 }, { "epoch": 0.1869261340276826, "grad_norm": 14.375, "kl": 9.99704647064209, "learning_rate": 5e-06, "logits/chosen": -21880710.85714286, "logits/rejected": -20929283.2, "logps/chosen": -329.3447265625, "logps/rejected": -394.1942138671875, "loss": 0.1477, "rewards/chosen": 4.049205780029297, "rewards/margins": 9.306630706787109, "rewards/rejected": -5.257424926757812, "step": 682 }, { "epoch": 0.1872002192681924, "grad_norm": 15.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 9460394.666666666, "logits/rejected": 20073996.8, "logps/chosen": -472.9391818576389, "logps/rejected": -387.2791015625, "loss": 0.0519, "rewards/chosen": 4.979915195041233, "rewards/margins": 10.718702019585503, "rewards/rejected": -5.738786824544271, "step": 683 }, { "epoch": 0.1874743045087022, "grad_norm": 16.375, "kl": 5.977967262268066, "learning_rate": 5e-06, "logits/chosen": 25933114.181818184, "logits/rejected": 1280499.3846153845, "logps/chosen": -466.85249467329544, "logps/rejected": -536.5161884014423, "loss": 0.1256, "rewards/chosen": 5.021444840864702, "rewards/margins": 12.220500439197034, "rewards/rejected": -7.199055598332332, "step": 684 }, { "epoch": 0.187748389749212, "grad_norm": 15.3125, "kl": 14.002288818359375, "learning_rate": 5e-06, "logits/chosen": -5936139.764705882, "logits/rejected": -9890893.714285715, "logps/chosen": -422.42790670955884, "logps/rejected": -493.88633510044644, "loss": 0.0885, "rewards/chosen": 5.431978113511029, "rewards/margins": 12.530173870695739, "rewards/rejected": -7.09819575718471, "step": 685 }, { "epoch": 0.1880224749897218, "grad_norm": 14.0625, "kl": 8.123441696166992, "learning_rate": 5e-06, "logits/chosen": 9782142.0, "logits/rejected": -11026819.555555556, "logps/chosen": -540.3661702473959, "logps/rejected": -452.0787760416667, "loss": 0.0493, "rewards/chosen": 7.512808481852214, "rewards/margins": 13.000105539957683, "rewards/rejected": -5.487297058105469, "step": 686 }, { "epoch": 0.1882965602302316, "grad_norm": 7.15625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 21941006.222222224, "logits/rejected": -18894329.6, "logps/chosen": -452.3079427083333, "logps/rejected": -579.7229817708334, "loss": 0.0386, "rewards/chosen": 5.02400885687934, "rewards/margins": 12.823171657986112, "rewards/rejected": -7.799162801106771, "step": 687 }, { "epoch": 0.1885706454707414, "grad_norm": 11.0, "kl": 12.096885681152344, "learning_rate": 5e-06, "logits/chosen": -6566683.0, "logits/rejected": -5916447.5, "logps/chosen": -410.5346984863281, "logps/rejected": -535.7684326171875, "loss": 0.0522, "rewards/chosen": 6.431801795959473, "rewards/margins": 13.071205139160156, "rewards/rejected": -6.639403343200684, "step": 688 }, { "epoch": 0.18884473071125119, "grad_norm": 4.3125, "kl": 3.1340396404266357, "learning_rate": 5e-06, "logits/chosen": -18880801.6, "logits/rejected": -7770425.142857143, "logps/chosen": -464.6578125, "logps/rejected": -484.50390625, "loss": 0.0377, "rewards/chosen": 6.190269851684571, "rewards/margins": 12.427403422764371, "rewards/rejected": -6.2371335710797995, "step": 689 }, { "epoch": 0.189118815951761, "grad_norm": 12.1875, "kl": 11.363642692565918, "learning_rate": 5e-06, "logits/chosen": -25526884.0, "logits/rejected": 30449792.0, "logps/chosen": -431.1676940917969, "logps/rejected": -415.81304931640625, "loss": 0.1039, "rewards/chosen": 5.80319881439209, "rewards/margins": 10.151955604553223, "rewards/rejected": -4.348756790161133, "step": 690 }, { "epoch": 0.18939290119227079, "grad_norm": 12.8125, "kl": 8.589733123779297, "learning_rate": 5e-06, "logits/chosen": -14847632.0, "logits/rejected": 4024552.0, "logps/chosen": -414.2594517299107, "logps/rejected": -321.86640625, "loss": 0.0927, "rewards/chosen": 5.063065120152065, "rewards/margins": 9.441832515171598, "rewards/rejected": -4.378767395019532, "step": 691 }, { "epoch": 0.1896669864327806, "grad_norm": 3.71875, "kl": 9.113994598388672, "learning_rate": 5e-06, "logits/chosen": -6806056.615384615, "logits/rejected": 14492004.363636363, "logps/chosen": -431.60884915865387, "logps/rejected": -539.5665838068181, "loss": 0.0317, "rewards/chosen": 6.498329162597656, "rewards/margins": 13.615422335537996, "rewards/rejected": -7.117093172940341, "step": 692 }, { "epoch": 0.18994107167329038, "grad_norm": 12.25, "kl": 3.9564952850341797, "learning_rate": 5e-06, "logits/chosen": -21797067.636363637, "logits/rejected": 16676704.0, "logps/chosen": -399.5560191761364, "logps/rejected": -431.26938100961536, "loss": 0.0697, "rewards/chosen": 5.3419199856844815, "rewards/margins": 9.569448484407438, "rewards/rejected": -4.227528498722957, "step": 693 }, { "epoch": 0.1902151569138002, "grad_norm": 10.6875, "kl": 5.5463786125183105, "learning_rate": 5e-06, "logits/chosen": -27680708.923076924, "logits/rejected": -142730.54545454544, "logps/chosen": -468.69775390625, "logps/rejected": -463.18093039772725, "loss": 0.028, "rewards/chosen": 6.627674396221455, "rewards/margins": 12.447045012787505, "rewards/rejected": -5.819370616566051, "step": 694 }, { "epoch": 0.19048924215430998, "grad_norm": 9.6875, "kl": 7.48018741607666, "learning_rate": 5e-06, "logits/chosen": -20320516.57142857, "logits/rejected": 8842942.4, "logps/chosen": -431.44395228794644, "logps/rejected": -451.68212890625, "loss": 0.062, "rewards/chosen": 5.639814104352679, "rewards/margins": 11.818534197126116, "rewards/rejected": -6.178720092773437, "step": 695 }, { "epoch": 0.1907633273948198, "grad_norm": 8.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -4025269.1428571427, "logits/rejected": 8232991.05882353, "logps/chosen": -451.0279017857143, "logps/rejected": -607.4477251838235, "loss": 0.038, "rewards/chosen": 5.103515080043247, "rewards/margins": 13.78954574841411, "rewards/rejected": -8.686030668370863, "step": 696 }, { "epoch": 0.19103741263532958, "grad_norm": 14.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -36791669.333333336, "logits/rejected": -10359975.333333334, "logps/chosen": -379.1914469401042, "logps/rejected": -469.575927734375, "loss": 0.1036, "rewards/chosen": 4.2799727121988935, "rewards/margins": 11.186428705851238, "rewards/rejected": -6.906455993652344, "step": 697 }, { "epoch": 0.1913114978758394, "grad_norm": 6.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 21168508.0, "logits/rejected": -7287431.0, "logps/chosen": -383.39117431640625, "logps/rejected": -932.850341796875, "loss": 0.0292, "rewards/chosen": 6.15700101852417, "rewards/margins": 21.55512762069702, "rewards/rejected": -15.398126602172852, "step": 698 }, { "epoch": 0.19158558311634918, "grad_norm": 11.5, "kl": 0.5297800898551941, "learning_rate": 5e-06, "logits/chosen": -25880888.0, "logits/rejected": -1804576.3333333333, "logps/chosen": -485.6241048177083, "logps/rejected": -706.72705078125, "loss": 0.0561, "rewards/chosen": 6.000101725260417, "rewards/margins": 13.67116673787435, "rewards/rejected": -7.671065012613933, "step": 699 }, { "epoch": 0.19185966835685897, "grad_norm": 7.84375, "kl": 6.911484718322754, "learning_rate": 5e-06, "logits/chosen": -14384587.0, "logits/rejected": -7100392.0, "logps/chosen": -476.5574035644531, "logps/rejected": -406.8119812011719, "loss": 0.0709, "rewards/chosen": 5.299774169921875, "rewards/margins": 10.900629043579102, "rewards/rejected": -5.600854873657227, "step": 700 }, { "epoch": 0.19213375359736878, "grad_norm": 10.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 4824914.666666667, "logits/rejected": 2091335.4666666666, "logps/chosen": -421.04497612847223, "logps/rejected": -477.21318359375, "loss": 0.0329, "rewards/chosen": 5.529703776041667, "rewards/margins": 13.58799031575521, "rewards/rejected": -8.058286539713542, "step": 701 }, { "epoch": 0.19240783883787857, "grad_norm": 12.375, "kl": 2.9964823722839355, "learning_rate": 5e-06, "logits/chosen": 1509169.4285714286, "logits/rejected": -17006102.4, "logps/chosen": -434.60658482142856, "logps/rejected": -422.16484375, "loss": 0.0492, "rewards/chosen": 5.763410295758929, "rewards/margins": 12.628524889264789, "rewards/rejected": -6.86511459350586, "step": 702 }, { "epoch": 0.19268192407838838, "grad_norm": 16.5, "kl": 16.3209228515625, "learning_rate": 5e-06, "logits/chosen": 4892112.0, "logits/rejected": 13432342.0, "logps/chosen": -396.4866027832031, "logps/rejected": -393.23394775390625, "loss": 0.1409, "rewards/chosen": 4.887618064880371, "rewards/margins": 11.367178916931152, "rewards/rejected": -6.479560852050781, "step": 703 }, { "epoch": 0.19295600931889817, "grad_norm": 6.09375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 7859377.230769231, "logits/rejected": 869598.1818181818, "logps/chosen": -445.0465745192308, "logps/rejected": -554.4255149147727, "loss": 0.0282, "rewards/chosen": 5.356487567608173, "rewards/margins": 12.82353343830242, "rewards/rejected": -7.467045870694247, "step": 704 }, { "epoch": 0.19323009455940798, "grad_norm": 6.84375, "kl": 16.127561569213867, "learning_rate": 5e-06, "logits/chosen": -14949243.076923076, "logits/rejected": -19264093.09090909, "logps/chosen": -492.65914212740387, "logps/rejected": -525.2933238636364, "loss": 0.0279, "rewards/chosen": 7.3312835693359375, "rewards/margins": 15.643851540305398, "rewards/rejected": -8.31256797096946, "step": 705 }, { "epoch": 0.19350417979991777, "grad_norm": 10.625, "kl": 4.646829128265381, "learning_rate": 5e-06, "logits/chosen": 21552827.076923076, "logits/rejected": -3521306.5454545454, "logps/chosen": -501.8141526442308, "logps/rejected": -419.15700461647725, "loss": 0.0611, "rewards/chosen": 5.34136962890625, "rewards/margins": 12.064824884588068, "rewards/rejected": -6.723455255681818, "step": 706 }, { "epoch": 0.19377826504042758, "grad_norm": 14.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -24425287.111111112, "logits/rejected": -458613.3333333333, "logps/chosen": -475.10584852430554, "logps/rejected": -442.6375, "loss": 0.0863, "rewards/chosen": 5.385944366455078, "rewards/margins": 12.11588617960612, "rewards/rejected": -6.729941813151042, "step": 707 }, { "epoch": 0.19405235028093737, "grad_norm": 11.0625, "kl": 1.2148100137710571, "learning_rate": 5e-06, "logits/chosen": -29926359.272727273, "logits/rejected": 7947227.692307692, "logps/chosen": -359.3543146306818, "logps/rejected": -533.3845402644231, "loss": 0.0702, "rewards/chosen": 4.510412736372515, "rewards/margins": 10.80978900402576, "rewards/rejected": -6.299376267653245, "step": 708 }, { "epoch": 0.19432643552144718, "grad_norm": 10.0625, "kl": 4.838833332061768, "learning_rate": 5e-06, "logits/chosen": 6565408.571428572, "logits/rejected": -8158547.2, "logps/chosen": -303.1696079799107, "logps/rejected": -618.14345703125, "loss": 0.0799, "rewards/chosen": 4.552840369088309, "rewards/margins": 14.604834692818777, "rewards/rejected": -10.051994323730469, "step": 709 }, { "epoch": 0.19460052076195697, "grad_norm": 6.34375, "kl": 1.5257911682128906, "learning_rate": 5e-06, "logits/chosen": -18494774.85714286, "logits/rejected": -20748417.6, "logps/chosen": -363.166748046875, "logps/rejected": -447.96875, "loss": 0.0267, "rewards/chosen": 5.571037292480469, "rewards/margins": 12.719467163085938, "rewards/rejected": -7.148429870605469, "step": 710 }, { "epoch": 0.19487460600246675, "grad_norm": 9.25, "kl": 1.2903913259506226, "learning_rate": 5e-06, "logits/chosen": -1493215.6666666667, "logits/rejected": 34964330.666666664, "logps/chosen": -438.3917643229167, "logps/rejected": -480.4703776041667, "loss": 0.0517, "rewards/chosen": 7.058909734090169, "rewards/margins": 16.08990881178114, "rewards/rejected": -9.030999077690971, "step": 711 }, { "epoch": 0.19514869124297657, "grad_norm": 11.0, "kl": 10.855573654174805, "learning_rate": 5e-06, "logits/chosen": -21955253.333333332, "logits/rejected": -2516169.0, "logps/chosen": -510.06201171875, "logps/rejected": -445.5848795572917, "loss": 0.0601, "rewards/chosen": 6.576147079467773, "rewards/margins": 13.267876942952473, "rewards/rejected": -6.6917298634847, "step": 712 }, { "epoch": 0.19542277648348635, "grad_norm": 11.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -11322525.538461538, "logits/rejected": 18219720.727272727, "logps/chosen": -287.1435546875, "logps/rejected": -495.52854225852275, "loss": 0.1024, "rewards/chosen": 4.2139153113731975, "rewards/margins": 10.29326378882348, "rewards/rejected": -6.079348477450284, "step": 713 }, { "epoch": 0.19569686172399617, "grad_norm": 9.625, "kl": 10.15253734588623, "learning_rate": 5e-06, "logits/chosen": -10300219.692307692, "logits/rejected": -7304046.545454546, "logps/chosen": -482.4400165264423, "logps/rejected": -683.3777521306819, "loss": 0.0615, "rewards/chosen": 6.556089547964243, "rewards/margins": 15.314453445114456, "rewards/rejected": -8.758363897150213, "step": 714 }, { "epoch": 0.19597094696450595, "grad_norm": 13.9375, "kl": 6.302914619445801, "learning_rate": 5e-06, "logits/chosen": -17704455.529411763, "logits/rejected": -12773211.42857143, "logps/chosen": -392.19381893382354, "logps/rejected": -636.4645647321429, "loss": 0.0461, "rewards/chosen": 5.798630209530101, "rewards/margins": 14.405655356014476, "rewards/rejected": -8.607025146484375, "step": 715 }, { "epoch": 0.19624503220501577, "grad_norm": 12.75, "kl": 2.7957754135131836, "learning_rate": 5e-06, "logits/chosen": -17383355.076923076, "logits/rejected": 1262679.4545454546, "logps/chosen": -565.4750600961538, "logps/rejected": -554.9922318892045, "loss": 0.0484, "rewards/chosen": 7.117357107309195, "rewards/margins": 14.707727979113173, "rewards/rejected": -7.5903708718039775, "step": 716 }, { "epoch": 0.19651911744552555, "grad_norm": 6.75, "kl": 7.880585193634033, "learning_rate": 5e-06, "logits/chosen": -11645642.666666666, "logits/rejected": -17952689.777777776, "logps/chosen": -552.3585286458333, "logps/rejected": -425.1178385416667, "loss": 0.0384, "rewards/chosen": 6.785609944661458, "rewards/margins": 12.551067521837023, "rewards/rejected": -5.765457577175564, "step": 717 }, { "epoch": 0.19679320268603537, "grad_norm": 8.5, "kl": 5.3349409103393555, "learning_rate": 5e-06, "logits/chosen": -2866881.8181818184, "logits/rejected": -25408327.384615384, "logps/chosen": -350.8917347301136, "logps/rejected": -575.5890925480769, "loss": 0.0538, "rewards/chosen": 6.735143488103693, "rewards/margins": 12.620541765973286, "rewards/rejected": -5.885398277869592, "step": 718 }, { "epoch": 0.19706728792654515, "grad_norm": 5.5, "kl": 3.4196105003356934, "learning_rate": 5e-06, "logits/chosen": -18115138.0, "logits/rejected": 31079164.0, "logps/chosen": -433.5357666015625, "logps/rejected": -636.82177734375, "loss": 0.0221, "rewards/chosen": 5.885141372680664, "rewards/margins": 14.104856491088867, "rewards/rejected": -8.219715118408203, "step": 719 }, { "epoch": 0.19734137316705497, "grad_norm": 7.21875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -20628540.0, "logits/rejected": 16775558.666666666, "logps/chosen": -495.9139811197917, "logps/rejected": -602.0727132161459, "loss": 0.0293, "rewards/chosen": 6.132659276326497, "rewards/margins": 13.494274139404297, "rewards/rejected": -7.3616148630778, "step": 720 }, { "epoch": 0.19761545840756475, "grad_norm": 6.46875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -176410.625, "logits/rejected": -27929874.0, "logps/chosen": -437.7750244140625, "logps/rejected": -567.614013671875, "loss": 0.0487, "rewards/chosen": 6.053165912628174, "rewards/margins": 12.646437168121338, "rewards/rejected": -6.593271255493164, "step": 721 }, { "epoch": 0.19788954364807454, "grad_norm": 11.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -1163646.3333333333, "logits/rejected": -13939045.333333334, "logps/chosen": -461.3586018880208, "logps/rejected": -436.9265950520833, "loss": 0.0465, "rewards/chosen": 5.901974995930989, "rewards/margins": 11.133646647135416, "rewards/rejected": -5.231671651204427, "step": 722 }, { "epoch": 0.19816362888858435, "grad_norm": 10.1875, "kl": 0.39740753173828125, "learning_rate": 5e-06, "logits/chosen": -4788304.0, "logits/rejected": -9888308.57142857, "logps/chosen": -500.192529296875, "logps/rejected": -492.1509486607143, "loss": 0.0498, "rewards/chosen": 5.4329784393310545, "rewards/margins": 12.11512096949986, "rewards/rejected": -6.6821425301688055, "step": 723 }, { "epoch": 0.19843771412909414, "grad_norm": 5.15625, "kl": 6.531761646270752, "learning_rate": 5e-06, "logits/chosen": -9526487.384615384, "logits/rejected": -35594504.72727273, "logps/chosen": -461.8374774639423, "logps/rejected": -483.3505859375, "loss": 0.0384, "rewards/chosen": 6.30718994140625, "rewards/margins": 13.611536199396307, "rewards/rejected": -7.304346257990057, "step": 724 }, { "epoch": 0.19871179936960395, "grad_norm": 6.53125, "kl": 0.8778683543205261, "learning_rate": 5e-06, "logits/chosen": -24103076.923076924, "logits/rejected": -6501032.0, "logps/chosen": -454.58657602163464, "logps/rejected": -559.9384765625, "loss": 0.0613, "rewards/chosen": 4.76316422682542, "rewards/margins": 10.678753992894311, "rewards/rejected": -5.915589766068892, "step": 725 }, { "epoch": 0.19898588461011374, "grad_norm": 6.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -41056296.0, "logits/rejected": 9889978.0, "logps/chosen": -453.27252197265625, "logps/rejected": -431.15777587890625, "loss": 0.0379, "rewards/chosen": 6.120019912719727, "rewards/margins": 12.189565181732178, "rewards/rejected": -6.069545269012451, "step": 726 }, { "epoch": 0.19925996985062355, "grad_norm": 9.9375, "kl": 12.888936042785645, "learning_rate": 5e-06, "logits/chosen": -16324638.11764706, "logits/rejected": 2564734.285714286, "logps/chosen": -431.16819852941177, "logps/rejected": -426.96707589285717, "loss": 0.0804, "rewards/chosen": 6.175759708180147, "rewards/margins": 11.482111506101464, "rewards/rejected": -5.306351797921317, "step": 727 }, { "epoch": 0.19953405509113334, "grad_norm": 11.125, "kl": 7.9905524253845215, "learning_rate": 5e-06, "logits/chosen": -33257634.666666668, "logits/rejected": -9452610.0, "logps/chosen": -475.915771484375, "logps/rejected": -554.5096028645834, "loss": 0.0588, "rewards/chosen": 6.611539204915364, "rewards/margins": 14.280527750651041, "rewards/rejected": -7.668988545735677, "step": 728 }, { "epoch": 0.19980814033164315, "grad_norm": 9.5625, "kl": 0.2564353942871094, "learning_rate": 5e-06, "logits/chosen": -34153292.8, "logits/rejected": -13536148.57142857, "logps/chosen": -464.816943359375, "logps/rejected": -519.8183244977679, "loss": 0.0385, "rewards/chosen": 5.735879516601562, "rewards/margins": 10.560125514439175, "rewards/rejected": -4.824245997837612, "step": 729 }, { "epoch": 0.20008222557215294, "grad_norm": 14.0625, "kl": 5.816828727722168, "learning_rate": 5e-06, "logits/chosen": -12909109.333333334, "logits/rejected": 7265863.333333333, "logps/chosen": -379.2515869140625, "logps/rejected": -500.5321044921875, "loss": 0.1002, "rewards/chosen": 4.2757673263549805, "rewards/margins": 10.059279441833496, "rewards/rejected": -5.783512115478516, "step": 730 }, { "epoch": 0.20035631081266272, "grad_norm": 16.875, "kl": 14.327875137329102, "learning_rate": 5e-06, "logits/chosen": -17113934.222222224, "logits/rejected": 47585536.0, "logps/chosen": -505.7734375, "logps/rejected": -658.74609375, "loss": 0.1018, "rewards/chosen": 6.186133490668403, "rewards/margins": 11.777341630723742, "rewards/rejected": -5.591208140055339, "step": 731 }, { "epoch": 0.20063039605317254, "grad_norm": 7.75, "kl": 5.974937438964844, "learning_rate": 5e-06, "logits/chosen": -35327635.692307696, "logits/rejected": 640896.5454545454, "logps/chosen": -417.02786959134613, "logps/rejected": -556.1243341619319, "loss": 0.0296, "rewards/chosen": 6.063298738919771, "rewards/margins": 14.588093577565013, "rewards/rejected": -8.524794838645242, "step": 732 }, { "epoch": 0.20090448129368232, "grad_norm": 13.125, "kl": 8.090620040893555, "learning_rate": 5e-06, "logits/chosen": -8485298.0, "logits/rejected": -5227593.333333333, "logps/chosen": -374.4207356770833, "logps/rejected": -398.7611490885417, "loss": 0.0895, "rewards/chosen": 5.50567626953125, "rewards/margins": 11.67841402689616, "rewards/rejected": -6.172737757364909, "step": 733 }, { "epoch": 0.20117856653419214, "grad_norm": 8.125, "kl": 3.775745391845703, "learning_rate": 5e-06, "logits/chosen": 19197398.153846152, "logits/rejected": -4065925.8181818184, "logps/chosen": -561.0159254807693, "logps/rejected": -403.16317471590907, "loss": 0.0278, "rewards/chosen": 6.714637169471154, "rewards/margins": 13.911894471495302, "rewards/rejected": -7.1972573020241475, "step": 734 }, { "epoch": 0.20145265177470192, "grad_norm": 9.25, "kl": 2.6316299438476562, "learning_rate": 5e-06, "logits/chosen": 3244047.272727273, "logits/rejected": 59480846.76923077, "logps/chosen": -463.25142045454544, "logps/rejected": -381.13022085336536, "loss": 0.0418, "rewards/chosen": 6.304079922762784, "rewards/margins": 14.874507450557257, "rewards/rejected": -8.570427527794472, "step": 735 }, { "epoch": 0.20172673701521174, "grad_norm": 10.0, "kl": 1.4329612255096436, "learning_rate": 5e-06, "logits/chosen": 14079408.0, "logits/rejected": 18993603.2, "logps/chosen": -473.90614536830356, "logps/rejected": -662.75947265625, "loss": 0.0518, "rewards/chosen": 4.42357417515346, "rewards/margins": 15.178289903913226, "rewards/rejected": -10.754715728759766, "step": 736 }, { "epoch": 0.20200082225572152, "grad_norm": 7.15625, "kl": 4.102336883544922, "learning_rate": 5e-06, "logits/chosen": -16593812.923076924, "logits/rejected": 917704.0909090909, "logps/chosen": -417.0173903245192, "logps/rejected": -478.88618607954544, "loss": 0.0328, "rewards/chosen": 4.877701392540565, "rewards/margins": 11.527625237311517, "rewards/rejected": -6.649923844770952, "step": 737 }, { "epoch": 0.20227490749623134, "grad_norm": 11.125, "kl": 1.0472171306610107, "learning_rate": 5e-06, "logits/chosen": 17280560.0, "logits/rejected": 3819339.6923076925, "logps/chosen": -343.26606889204544, "logps/rejected": -499.2447040264423, "loss": 0.0569, "rewards/chosen": 4.4872072393243965, "rewards/margins": 11.837170500855347, "rewards/rejected": -7.34996326153095, "step": 738 }, { "epoch": 0.20254899273674112, "grad_norm": 6.375, "kl": 1.8321311473846436, "learning_rate": 5e-06, "logits/chosen": -30059128.0, "logits/rejected": -7755562.0, "logps/chosen": -509.92059326171875, "logps/rejected": -594.15771484375, "loss": 0.0429, "rewards/chosen": 7.462072372436523, "rewards/margins": 16.72804069519043, "rewards/rejected": -9.265968322753906, "step": 739 }, { "epoch": 0.20282307797725094, "grad_norm": 5.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -3071576.0, "logits/rejected": -6016042.0, "logps/chosen": -448.9353942871094, "logps/rejected": -435.5020446777344, "loss": 0.0167, "rewards/chosen": 6.385833740234375, "rewards/margins": 12.736654281616211, "rewards/rejected": -6.350820541381836, "step": 740 }, { "epoch": 0.20309716321776072, "grad_norm": 4.9375, "kl": 4.263530731201172, "learning_rate": 5e-06, "logits/chosen": -17656371.42857143, "logits/rejected": 4092460.0, "logps/chosen": -408.49539620535717, "logps/rejected": -529.051318359375, "loss": 0.0204, "rewards/chosen": 6.291803632463727, "rewards/margins": 14.703752027239117, "rewards/rejected": -8.41194839477539, "step": 741 }, { "epoch": 0.2033712484582705, "grad_norm": 15.0625, "kl": 12.441694259643555, "learning_rate": 5e-06, "logits/chosen": 27545934.769230768, "logits/rejected": 6671768.0, "logps/chosen": -597.1629356971154, "logps/rejected": -550.0503373579545, "loss": 0.0633, "rewards/chosen": 6.650838998647837, "rewards/margins": 14.940860721614811, "rewards/rejected": -8.290021722966975, "step": 742 }, { "epoch": 0.20364533369878032, "grad_norm": 13.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -14838953.333333334, "logits/rejected": -4383512.333333333, "logps/chosen": -368.6467692057292, "logps/rejected": -533.6416829427084, "loss": 0.0755, "rewards/chosen": 3.978175163269043, "rewards/margins": 11.553542137145996, "rewards/rejected": -7.575366973876953, "step": 743 }, { "epoch": 0.2039194189392901, "grad_norm": 6.15625, "kl": 1.489454984664917, "learning_rate": 5e-06, "logits/chosen": -9509911.111111112, "logits/rejected": -5558281.6, "logps/chosen": -452.2099338107639, "logps/rejected": -538.05439453125, "loss": 0.043, "rewards/chosen": 5.201030731201172, "rewards/margins": 13.582696787516277, "rewards/rejected": -8.381666056315105, "step": 744 }, { "epoch": 0.20419350417979992, "grad_norm": 7.71875, "kl": 7.205536842346191, "learning_rate": 5e-06, "logits/chosen": -31159853.17647059, "logits/rejected": -17670499.42857143, "logps/chosen": -561.5029871323529, "logps/rejected": -303.63392857142856, "loss": 0.0317, "rewards/chosen": 5.73604538861443, "rewards/margins": 12.770618534889543, "rewards/rejected": -7.034573146275112, "step": 745 }, { "epoch": 0.2044675894203097, "grad_norm": 6.53125, "kl": 3.575793981552124, "learning_rate": 5e-06, "logits/chosen": -7594717.333333333, "logits/rejected": -40512792.0, "logps/chosen": -454.879150390625, "logps/rejected": -477.807861328125, "loss": 0.029, "rewards/chosen": 6.412909825642903, "rewards/margins": 13.247353235880533, "rewards/rejected": -6.83444341023763, "step": 746 }, { "epoch": 0.20474167466081952, "grad_norm": 15.8125, "kl": 7.863116264343262, "learning_rate": 5e-06, "logits/chosen": -4591806.571428572, "logits/rejected": -13552561.6, "logps/chosen": -424.799560546875, "logps/rejected": -523.66962890625, "loss": 0.1562, "rewards/chosen": 3.535649980817522, "rewards/margins": 13.113875688825335, "rewards/rejected": -9.578225708007812, "step": 747 }, { "epoch": 0.2050157599013293, "grad_norm": 6.59375, "kl": 2.9247474670410156, "learning_rate": 5e-06, "logits/chosen": -10858391.333333334, "logits/rejected": 8013246.666666667, "logps/chosen": -461.3739420572917, "logps/rejected": -549.2900797526041, "loss": 0.0241, "rewards/chosen": 7.246022542317708, "rewards/margins": 15.394222895304363, "rewards/rejected": -8.148200352986654, "step": 748 }, { "epoch": 0.20528984514183912, "grad_norm": 5.15625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -11773870.4, "logits/rejected": -10903076.57142857, "logps/chosen": -560.899658203125, "logps/rejected": -398.9017857142857, "loss": 0.0204, "rewards/chosen": 7.184989166259766, "rewards/margins": 13.252052634102958, "rewards/rejected": -6.067063467843192, "step": 749 }, { "epoch": 0.2055639303823489, "grad_norm": 11.625, "kl": 3.0178604125976562, "learning_rate": 5e-06, "logits/chosen": -6680370.461538462, "logits/rejected": -16510570.181818182, "logps/chosen": -457.4727313701923, "logps/rejected": -434.6300159801136, "loss": 0.0479, "rewards/chosen": 5.043279794546274, "rewards/margins": 12.502676156850962, "rewards/rejected": -7.4593963623046875, "step": 750 }, { "epoch": 0.20583801562285872, "grad_norm": 9.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -8074276.266666667, "logits/rejected": -3445545.777777778, "logps/chosen": -384.05205078125, "logps/rejected": -752.4237196180555, "loss": 0.0422, "rewards/chosen": 5.084151204427084, "rewards/margins": 16.653291320800783, "rewards/rejected": -11.569140116373697, "step": 751 }, { "epoch": 0.2061121008633685, "grad_norm": 8.8125, "kl": 3.126148223876953, "learning_rate": 5e-06, "logits/chosen": 8194274.666666667, "logits/rejected": -14347042.666666666, "logps/chosen": -616.1604410807291, "logps/rejected": -397.2712809244792, "loss": 0.0236, "rewards/chosen": 5.871391296386719, "rewards/margins": 13.262235005696613, "rewards/rejected": -7.3908437093098955, "step": 752 }, { "epoch": 0.2063861861038783, "grad_norm": 12.0, "kl": 1.052136778831482, "learning_rate": 5e-06, "logits/chosen": -4264055.692307692, "logits/rejected": 8075488.0, "logps/chosen": -426.8695537860577, "logps/rejected": -492.4988458806818, "loss": 0.0405, "rewards/chosen": 5.57391357421875, "rewards/margins": 12.221940474076703, "rewards/rejected": -6.648026899857954, "step": 753 }, { "epoch": 0.2066602713443881, "grad_norm": 8.75, "kl": 2.39020037651062, "learning_rate": 5e-06, "logits/chosen": -37935581.86666667, "logits/rejected": -6925031.555555556, "logps/chosen": -529.8494140625, "logps/rejected": -600.7239583333334, "loss": 0.026, "rewards/chosen": 5.928585815429687, "rewards/margins": 14.853634643554688, "rewards/rejected": -8.925048828125, "step": 754 }, { "epoch": 0.2069343565848979, "grad_norm": 15.25, "kl": 4.490790843963623, "learning_rate": 5e-06, "logits/chosen": 6774462.666666667, "logits/rejected": 29389141.333333332, "logps/chosen": -355.2872721354167, "logps/rejected": -537.0149739583334, "loss": 0.1602, "rewards/chosen": 3.940258344014486, "rewards/margins": 11.683241844177246, "rewards/rejected": -7.742983500162761, "step": 755 }, { "epoch": 0.2072084418254077, "grad_norm": 5.65625, "kl": 2.6066641807556152, "learning_rate": 5e-06, "logits/chosen": 110708.0, "logits/rejected": -12107400.727272727, "logps/chosen": -426.6787109375, "logps/rejected": -537.2445845170455, "loss": 0.044, "rewards/chosen": 6.642066368689904, "rewards/margins": 14.429367172134507, "rewards/rejected": -7.7873008034446025, "step": 756 }, { "epoch": 0.2074825270659175, "grad_norm": 6.625, "kl": 3.8408076763153076, "learning_rate": 5e-06, "logits/chosen": -26145194.666666668, "logits/rejected": -25599072.0, "logps/chosen": -461.6640625, "logps/rejected": -675.7373046875, "loss": 0.0434, "rewards/chosen": 6.228377024332683, "rewards/margins": 15.096630096435547, "rewards/rejected": -8.868253072102865, "step": 757 }, { "epoch": 0.2077566123064273, "grad_norm": 8.5625, "kl": 8.143548965454102, "learning_rate": 5e-06, "logits/chosen": -18938601.14285714, "logits/rejected": 1240973.8, "logps/chosen": -437.90757533482144, "logps/rejected": -520.993896484375, "loss": 0.0374, "rewards/chosen": 6.317909785679409, "rewards/margins": 13.628887721470424, "rewards/rejected": -7.310977935791016, "step": 758 }, { "epoch": 0.2080306975469371, "grad_norm": 13.6875, "kl": 11.421082496643066, "learning_rate": 5e-06, "logits/chosen": -19957178.666666668, "logits/rejected": -2915839.6666666665, "logps/chosen": -469.1311848958333, "logps/rejected": -485.0441080729167, "loss": 0.1394, "rewards/chosen": 5.400608062744141, "rewards/margins": 11.084373474121094, "rewards/rejected": -5.683765411376953, "step": 759 }, { "epoch": 0.2083047827874469, "grad_norm": 10.1875, "kl": 10.603328704833984, "learning_rate": 5e-06, "logits/chosen": 5204781.866666666, "logits/rejected": 21218846.222222224, "logps/chosen": -436.65709635416664, "logps/rejected": -390.8458658854167, "loss": 0.0398, "rewards/chosen": 5.838493855794271, "rewards/margins": 13.430958726671006, "rewards/rejected": -7.592464870876736, "step": 760 }, { "epoch": 0.2085788680279567, "grad_norm": 6.9375, "kl": 3.8361754417419434, "learning_rate": 5e-06, "logits/chosen": -18129842.666666668, "logits/rejected": 808959.6666666666, "logps/chosen": -531.4022623697916, "logps/rejected": -408.5040690104167, "loss": 0.013, "rewards/chosen": 6.396932601928711, "rewards/margins": 13.449330012003582, "rewards/rejected": -7.05239741007487, "step": 761 }, { "epoch": 0.2088529532684665, "grad_norm": 9.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -27015506.0, "logits/rejected": -1481068.625, "logps/chosen": -450.9007568359375, "logps/rejected": -384.69482421875, "loss": 0.0566, "rewards/chosen": 5.734736919403076, "rewards/margins": 11.846766948699951, "rewards/rejected": -6.112030029296875, "step": 762 }, { "epoch": 0.2091270385089763, "grad_norm": 9.875, "kl": 3.9314002990722656, "learning_rate": 5e-06, "logits/chosen": 683555.7142857143, "logits/rejected": -14031371.2, "logps/chosen": -460.1309291294643, "logps/rejected": -501.671337890625, "loss": 0.079, "rewards/chosen": 4.521148136683872, "rewards/margins": 11.39699957711356, "rewards/rejected": -6.875851440429687, "step": 763 }, { "epoch": 0.20940112374948608, "grad_norm": 8.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -3862527.75, "logits/rejected": -11927978.0, "logps/chosen": -433.6981506347656, "logps/rejected": -569.9501953125, "loss": 0.0357, "rewards/chosen": 4.852145671844482, "rewards/margins": 12.170495986938477, "rewards/rejected": -7.318350315093994, "step": 764 }, { "epoch": 0.2096752089899959, "grad_norm": 7.0625, "kl": 1.7126191854476929, "learning_rate": 5e-06, "logits/chosen": -16204475.636363637, "logits/rejected": -9703872.0, "logps/chosen": -385.1491033380682, "logps/rejected": -430.9963191105769, "loss": 0.051, "rewards/chosen": 5.436502283269709, "rewards/margins": 12.030707699435574, "rewards/rejected": -6.594205416165865, "step": 765 }, { "epoch": 0.20994929423050568, "grad_norm": 6.53125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -36339733.333333336, "logits/rejected": -20073753.6, "logps/chosen": -394.6331380208333, "logps/rejected": -645.8133463541667, "loss": 0.0454, "rewards/chosen": 5.843667348225911, "rewards/margins": 13.59158198038737, "rewards/rejected": -7.747914632161458, "step": 766 }, { "epoch": 0.2102233794710155, "grad_norm": 14.0625, "kl": 1.0650825500488281, "learning_rate": 5e-06, "logits/chosen": -3090517.3333333335, "logits/rejected": 15116774.666666666, "logps/chosen": -489.5052083333333, "logps/rejected": -596.0078938802084, "loss": 0.0533, "rewards/chosen": 5.581108093261719, "rewards/margins": 14.634136199951172, "rewards/rejected": -9.053028106689453, "step": 767 }, { "epoch": 0.21049746471152528, "grad_norm": 5.15625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -19592528.0, "logits/rejected": 161873.7142857143, "logps/chosen": -454.52294921875, "logps/rejected": -495.260986328125, "loss": 0.0169, "rewards/chosen": 6.7058837890625, "rewards/margins": 12.815356336321148, "rewards/rejected": -6.109472547258649, "step": 768 }, { "epoch": 0.2107715499520351, "grad_norm": 13.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -12524450.909090908, "logits/rejected": 58064300.307692304, "logps/chosen": -386.1804865056818, "logps/rejected": -664.8931790865385, "loss": 0.0722, "rewards/chosen": 5.26461445201527, "rewards/margins": 16.913541407018272, "rewards/rejected": -11.648926955003004, "step": 769 }, { "epoch": 0.21104563519254488, "grad_norm": 11.1875, "kl": 7.252317428588867, "learning_rate": 5e-06, "logits/chosen": -28301553.454545453, "logits/rejected": 3253078.769230769, "logps/chosen": -427.97958096590907, "logps/rejected": -459.4924128605769, "loss": 0.0982, "rewards/chosen": 4.799940629438921, "rewards/margins": 11.803881505152564, "rewards/rejected": -7.003940875713642, "step": 770 }, { "epoch": 0.2113197204330547, "grad_norm": 8.75, "kl": 1.2965075969696045, "learning_rate": 5e-06, "logits/chosen": 6539432.888888889, "logits/rejected": -10789243.733333332, "logps/chosen": -473.87049696180554, "logps/rejected": -528.2665364583333, "loss": 0.0305, "rewards/chosen": 5.418751610649957, "rewards/margins": 12.655171797010635, "rewards/rejected": -7.236420186360677, "step": 771 }, { "epoch": 0.21159380567356448, "grad_norm": 15.5, "kl": 6.0154829025268555, "learning_rate": 5e-06, "logits/chosen": -5039543.466666667, "logits/rejected": -13939399.111111112, "logps/chosen": -405.1870442708333, "logps/rejected": -411.8894314236111, "loss": 0.0901, "rewards/chosen": 4.617605590820313, "rewards/margins": 8.562693447536892, "rewards/rejected": -3.94508785671658, "step": 772 }, { "epoch": 0.21186789091407426, "grad_norm": 16.375, "kl": 16.90564727783203, "learning_rate": 5e-06, "logits/chosen": -3277237.0, "logits/rejected": -8283089.0, "logps/chosen": -481.71783447265625, "logps/rejected": -411.66265869140625, "loss": 0.0977, "rewards/chosen": 6.275279998779297, "rewards/margins": 11.795593738555908, "rewards/rejected": -5.520313739776611, "step": 773 }, { "epoch": 0.21214197615458408, "grad_norm": 12.1875, "kl": 14.139385223388672, "learning_rate": 5e-06, "logits/chosen": -10913260.235294119, "logits/rejected": 141905225.14285713, "logps/chosen": -448.9217313878676, "logps/rejected": -559.6861397879464, "loss": 0.067, "rewards/chosen": 5.830348744111903, "rewards/margins": 17.722461476045495, "rewards/rejected": -11.892112731933594, "step": 774 }, { "epoch": 0.21241606139509386, "grad_norm": 6.65625, "kl": 2.413379669189453, "learning_rate": 5e-06, "logits/chosen": -5452568.666666667, "logits/rejected": -22210302.666666668, "logps/chosen": -447.5837809244792, "logps/rejected": -522.9932047526041, "loss": 0.021, "rewards/chosen": 5.910860697428386, "rewards/margins": 12.90816370646159, "rewards/rejected": -6.997303009033203, "step": 775 }, { "epoch": 0.21269014663560368, "grad_norm": 9.1875, "kl": 7.352179527282715, "learning_rate": 5e-06, "logits/chosen": -5022522.461538462, "logits/rejected": -14616078.545454545, "logps/chosen": -340.37886868990387, "logps/rejected": -513.9074928977273, "loss": 0.0847, "rewards/chosen": 5.00821040226863, "rewards/margins": 12.542116125146826, "rewards/rejected": -7.533905722878196, "step": 776 }, { "epoch": 0.21296423187611346, "grad_norm": 15.375, "kl": 15.624171257019043, "learning_rate": 5e-06, "logits/chosen": -11162440.0, "logits/rejected": -2609219.8, "logps/chosen": -486.7987583705357, "logps/rejected": -372.30908203125, "loss": 0.1484, "rewards/chosen": 5.644995553152902, "rewards/margins": 12.762373788016184, "rewards/rejected": -7.117378234863281, "step": 777 }, { "epoch": 0.21323831711662328, "grad_norm": 5.28125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 2274862.8571428573, "logits/rejected": -22301392.0, "logps/chosen": -376.0298549107143, "logps/rejected": -580.19453125, "loss": 0.0208, "rewards/chosen": 5.653802054268973, "rewards/margins": 15.724290030343191, "rewards/rejected": -10.070487976074219, "step": 778 }, { "epoch": 0.21351240235713306, "grad_norm": 6.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -43235232.0, "logits/rejected": -1844086.0, "logps/chosen": -658.7725219726562, "logps/rejected": -447.2777099609375, "loss": 0.0378, "rewards/chosen": 7.31680965423584, "rewards/margins": 12.496614933013916, "rewards/rejected": -5.179805278778076, "step": 779 }, { "epoch": 0.21378648759764288, "grad_norm": 6.75, "kl": 4.125914573669434, "learning_rate": 5e-06, "logits/chosen": 49798352.0, "logits/rejected": -879772.25, "logps/chosen": -522.0180053710938, "logps/rejected": -448.9217529296875, "loss": 0.031, "rewards/chosen": 6.138671398162842, "rewards/margins": 12.214663028717041, "rewards/rejected": -6.075991630554199, "step": 780 }, { "epoch": 0.21406057283815266, "grad_norm": 6.21875, "kl": 1.8067386150360107, "learning_rate": 5e-06, "logits/chosen": -1925317.4545454546, "logits/rejected": -7771922.461538462, "logps/chosen": -466.9610706676136, "logps/rejected": -364.44632662259613, "loss": 0.0339, "rewards/chosen": 6.350019281560725, "rewards/margins": 12.885754191792095, "rewards/rejected": -6.53573491023137, "step": 781 }, { "epoch": 0.21433465807866248, "grad_norm": 8.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -12427677.538461538, "logits/rejected": -2806401.4545454546, "logps/chosen": -456.7385817307692, "logps/rejected": -658.8488103693181, "loss": 0.0403, "rewards/chosen": 6.015933697040264, "rewards/margins": 14.734149639423077, "rewards/rejected": -8.718215942382812, "step": 782 }, { "epoch": 0.21460874331917226, "grad_norm": 11.875, "kl": 2.139094114303589, "learning_rate": 5e-06, "logits/chosen": -7487238.666666667, "logits/rejected": -10436422.0, "logps/chosen": -354.6562093098958, "logps/rejected": -390.2307942708333, "loss": 0.0901, "rewards/chosen": 3.337937672932943, "rewards/margins": 8.396622975667318, "rewards/rejected": -5.058685302734375, "step": 783 }, { "epoch": 0.21488282855968205, "grad_norm": 4.84375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -15334228.8, "logits/rejected": -14738584.0, "logps/chosen": -513.75966796875, "logps/rejected": -531.1060267857143, "loss": 0.0135, "rewards/chosen": 7.004496765136719, "rewards/margins": 14.720783015659878, "rewards/rejected": -7.716286250523159, "step": 784 }, { "epoch": 0.21515691380019186, "grad_norm": 10.875, "kl": 1.4460923671722412, "learning_rate": 5e-06, "logits/chosen": 13255237.333333334, "logits/rejected": -14462961.333333334, "logps/chosen": -407.7301432291667, "logps/rejected": -540.2618001302084, "loss": 0.0613, "rewards/chosen": 5.259487787882487, "rewards/margins": 11.784413655598959, "rewards/rejected": -6.524925867716472, "step": 785 }, { "epoch": 0.21543099904070165, "grad_norm": 13.1875, "kl": 8.36020278930664, "learning_rate": 5e-06, "logits/chosen": -9509896.533333333, "logits/rejected": -5673616.888888889, "logps/chosen": -466.8964518229167, "logps/rejected": -412.00816514756946, "loss": 0.0686, "rewards/chosen": 5.936824544270833, "rewards/margins": 10.944032626681857, "rewards/rejected": -5.007208082411024, "step": 786 }, { "epoch": 0.21570508428121146, "grad_norm": 14.5625, "kl": 15.448874473571777, "learning_rate": 5e-06, "logits/chosen": -26908597.333333332, "logits/rejected": -979102.6666666666, "logps/chosen": -456.72389322916666, "logps/rejected": -502.7439236111111, "loss": 0.0592, "rewards/chosen": 6.203707377115886, "rewards/margins": 14.351029290093315, "rewards/rejected": -8.14732191297743, "step": 787 }, { "epoch": 0.21597916952172125, "grad_norm": 11.4375, "kl": 0.4601237177848816, "learning_rate": 5e-06, "logits/chosen": -14746101.818181818, "logits/rejected": 12521933.538461538, "logps/chosen": -486.6290838068182, "logps/rejected": -530.7475210336538, "loss": 0.0442, "rewards/chosen": 5.7650136080655185, "rewards/margins": 12.20903967477225, "rewards/rejected": -6.444026066706731, "step": 788 }, { "epoch": 0.21625325476223106, "grad_norm": 7.90625, "kl": 5.412566184997559, "learning_rate": 5e-06, "logits/chosen": -8369248.7272727275, "logits/rejected": 16714823.384615384, "logps/chosen": -376.16872336647725, "logps/rejected": -580.6371694711538, "loss": 0.029, "rewards/chosen": 6.25532323663885, "rewards/margins": 14.336659037983502, "rewards/rejected": -8.081335801344652, "step": 789 }, { "epoch": 0.21652734000274085, "grad_norm": 13.375, "kl": 3.6281979084014893, "learning_rate": 5e-06, "logits/chosen": -15883492.57142857, "logits/rejected": -22757379.2, "logps/chosen": -394.3607700892857, "logps/rejected": -470.15390625, "loss": 0.0944, "rewards/chosen": 5.054285321916852, "rewards/margins": 10.450627408708844, "rewards/rejected": -5.396342086791992, "step": 790 }, { "epoch": 0.21680142524325066, "grad_norm": 6.28125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 22199918.0, "logits/rejected": 347031.5, "logps/chosen": -478.6980895996094, "logps/rejected": -371.31634521484375, "loss": 0.0324, "rewards/chosen": 5.348327159881592, "rewards/margins": 11.169523239135742, "rewards/rejected": -5.82119607925415, "step": 791 }, { "epoch": 0.21707551048376045, "grad_norm": 9.5, "kl": 3.9613208770751953, "learning_rate": 5e-06, "logits/chosen": 3375050.769230769, "logits/rejected": -16863291.636363637, "logps/chosen": -536.2646484375, "logps/rejected": -516.8904474431819, "loss": 0.0549, "rewards/chosen": 5.846859271709736, "rewards/margins": 12.592699170946242, "rewards/rejected": -6.745839899236506, "step": 792 }, { "epoch": 0.21734959572427026, "grad_norm": 9.1875, "kl": 1.864248275756836, "learning_rate": 5e-06, "logits/chosen": -1880824.6666666667, "logits/rejected": -5009074.0, "logps/chosen": -421.4808756510417, "logps/rejected": -610.680419921875, "loss": 0.0505, "rewards/chosen": 4.074622472127278, "rewards/margins": 13.100934982299805, "rewards/rejected": -9.026312510172525, "step": 793 }, { "epoch": 0.21762368096478005, "grad_norm": 11.75, "kl": 1.3024375438690186, "learning_rate": 5e-06, "logits/chosen": -17476727.466666665, "logits/rejected": -19119276.444444444, "logps/chosen": -340.6991861979167, "logps/rejected": -455.9899088541667, "loss": 0.0714, "rewards/chosen": 4.486894226074218, "rewards/margins": 9.774479166666666, "rewards/rejected": -5.287584940592448, "step": 794 }, { "epoch": 0.21789776620528983, "grad_norm": 3.359375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -13838947.555555556, "logits/rejected": -13920452.266666668, "logps/chosen": -480.13536241319446, "logps/rejected": -475.4839192708333, "loss": 0.0157, "rewards/chosen": 5.9218860202365455, "rewards/margins": 13.273551771375868, "rewards/rejected": -7.351665751139323, "step": 795 }, { "epoch": 0.21817185144579965, "grad_norm": 13.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -22990919.272727273, "logits/rejected": -16966534.153846152, "logps/chosen": -517.8690518465909, "logps/rejected": -515.9424579326923, "loss": 0.0422, "rewards/chosen": 5.47568789395419, "rewards/margins": 12.092770929936762, "rewards/rejected": -6.6170830359825725, "step": 796 }, { "epoch": 0.21844593668630943, "grad_norm": 7.3125, "kl": 4.938290596008301, "learning_rate": 5e-06, "logits/chosen": -10226675.42857143, "logits/rejected": -931422.4, "logps/chosen": -503.98653738839283, "logps/rejected": -431.5666015625, "loss": 0.0621, "rewards/chosen": 6.058909824916294, "rewards/margins": 11.766854694911412, "rewards/rejected": -5.707944869995117, "step": 797 }, { "epoch": 0.21872002192681925, "grad_norm": 4.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -20675685.818181816, "logits/rejected": -5755533.538461538, "logps/chosen": -541.4126420454545, "logps/rejected": -490.55453725961536, "loss": 0.0225, "rewards/chosen": 5.373708204789595, "rewards/margins": 13.086327105968982, "rewards/rejected": -7.712618901179387, "step": 798 }, { "epoch": 0.21899410716732903, "grad_norm": 11.6875, "kl": 9.616449356079102, "learning_rate": 5e-06, "logits/chosen": -5517784.533333333, "logits/rejected": -14818889.777777778, "logps/chosen": -419.859375, "logps/rejected": -370.29918077256946, "loss": 0.09, "rewards/chosen": 5.198553466796875, "rewards/margins": 9.908153279622397, "rewards/rejected": -4.7095998128255205, "step": 799 }, { "epoch": 0.21926819240783885, "grad_norm": 10.4375, "kl": 13.885015487670898, "learning_rate": 5e-06, "logits/chosen": -19560950.0, "logits/rejected": 2832783.0, "logps/chosen": -525.465087890625, "logps/rejected": -727.39111328125, "loss": 0.0519, "rewards/chosen": 6.72096061706543, "rewards/margins": 16.010310173034668, "rewards/rejected": -9.289349555969238, "step": 800 }, { "epoch": 0.21954227764834863, "grad_norm": 4.5, "kl": 0.8050836324691772, "learning_rate": 5e-06, "logits/chosen": -228709.5, "logits/rejected": 598701.6, "logps/chosen": -398.02755301339283, "logps/rejected": -538.45947265625, "loss": 0.0564, "rewards/chosen": 4.699580601283482, "rewards/margins": 12.03304007393973, "rewards/rejected": -7.33345947265625, "step": 801 }, { "epoch": 0.21981636288885845, "grad_norm": 7.53125, "kl": 16.348541259765625, "learning_rate": 5e-06, "logits/chosen": -14802203.2, "logits/rejected": -4229220.285714285, "logps/chosen": -438.68115234375, "logps/rejected": -509.82376534598217, "loss": 0.0285, "rewards/chosen": 7.43970947265625, "rewards/margins": 15.192590659005301, "rewards/rejected": -7.752881186349051, "step": 802 }, { "epoch": 0.22009044812936823, "grad_norm": 4.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -15771357.090909092, "logits/rejected": -6195572.307692308, "logps/chosen": -470.28067294034093, "logps/rejected": -445.89107572115387, "loss": 0.018, "rewards/chosen": 5.302019292658025, "rewards/margins": 12.859784959913133, "rewards/rejected": -7.557765667255108, "step": 803 }, { "epoch": 0.22036453336987802, "grad_norm": 10.875, "kl": 5.66575813293457, "learning_rate": 5e-06, "logits/chosen": -18620745.6, "logits/rejected": 6343512.0, "logps/chosen": -449.19765625, "logps/rejected": -398.8115234375, "loss": 0.0503, "rewards/chosen": 6.566942596435547, "rewards/margins": 11.810203661237445, "rewards/rejected": -5.243261064801898, "step": 804 }, { "epoch": 0.22063861861038783, "grad_norm": 11.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -2722838.0, "logits/rejected": 4748530.133333334, "logps/chosen": -315.577392578125, "logps/rejected": -532.30966796875, "loss": 0.0702, "rewards/chosen": 4.971497429741754, "rewards/margins": 13.77307824028863, "rewards/rejected": -8.801580810546875, "step": 805 }, { "epoch": 0.22091270385089762, "grad_norm": 7.34375, "kl": 5.366238594055176, "learning_rate": 5e-06, "logits/chosen": -11429568.888888888, "logits/rejected": 3661066.6666666665, "logps/chosen": -341.68519422743054, "logps/rejected": -619.441162109375, "loss": 0.0587, "rewards/chosen": 4.518801795111762, "rewards/margins": 13.106998231675888, "rewards/rejected": -8.588196436564127, "step": 806 }, { "epoch": 0.22118678909140743, "grad_norm": 8.9375, "kl": 5.439787864685059, "learning_rate": 5e-06, "logits/chosen": -31843468.8, "logits/rejected": 27567611.42857143, "logps/chosen": -455.722705078125, "logps/rejected": -484.18966238839283, "loss": 0.0259, "rewards/chosen": 6.9479927062988285, "rewards/margins": 13.248453521728516, "rewards/rejected": -6.3004608154296875, "step": 807 }, { "epoch": 0.22146087433191722, "grad_norm": 4.9375, "kl": 1.182965636253357, "learning_rate": 5e-06, "logits/chosen": -17608854.4, "logits/rejected": 25340061.714285713, "logps/chosen": -518.547607421875, "logps/rejected": -488.6659458705357, "loss": 0.0328, "rewards/chosen": 5.450493621826172, "rewards/margins": 14.149022456577846, "rewards/rejected": -8.698528834751674, "step": 808 }, { "epoch": 0.22173495957242703, "grad_norm": 4.375, "kl": 4.537447929382324, "learning_rate": 5e-06, "logits/chosen": -2281173.714285714, "logits/rejected": -6736168.0, "logps/chosen": -416.07400948660717, "logps/rejected": -534.679443359375, "loss": 0.0219, "rewards/chosen": 5.8769345964704245, "rewards/margins": 15.361314937046597, "rewards/rejected": -9.484380340576172, "step": 809 }, { "epoch": 0.22200904481293682, "grad_norm": 11.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -19360893.714285713, "logits/rejected": -6470221.647058823, "logps/chosen": -549.5157645089286, "logps/rejected": -457.0021541819853, "loss": 0.0537, "rewards/chosen": 7.590850285121372, "rewards/margins": 15.087336275757862, "rewards/rejected": -7.496485990636489, "step": 810 }, { "epoch": 0.22228313005344663, "grad_norm": 10.25, "kl": 6.14879846572876, "learning_rate": 5e-06, "logits/chosen": -13819918.933333334, "logits/rejected": 31246193.777777776, "logps/chosen": -475.95009765625, "logps/rejected": -384.916015625, "loss": 0.0522, "rewards/chosen": 6.043702189127604, "rewards/margins": 12.703101433648003, "rewards/rejected": -6.659399244520399, "step": 811 }, { "epoch": 0.22255721529395642, "grad_norm": 12.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -1886207.7142857143, "logits/rejected": -18539627.2, "logps/chosen": -430.6611328125, "logps/rejected": -633.483154296875, "loss": 0.0549, "rewards/chosen": 5.298552376883371, "rewards/margins": 16.49320798601423, "rewards/rejected": -11.194655609130859, "step": 812 }, { "epoch": 0.22283130053446623, "grad_norm": 11.3125, "kl": 4.3205976486206055, "learning_rate": 5e-06, "logits/chosen": 3930466.4615384615, "logits/rejected": -7295194.909090909, "logps/chosen": -411.21615835336536, "logps/rejected": -448.92764559659093, "loss": 0.0572, "rewards/chosen": 5.479726938100962, "rewards/margins": 12.788760418658491, "rewards/rejected": -7.309033480557528, "step": 813 }, { "epoch": 0.22310538577497602, "grad_norm": 10.8125, "kl": 2.710541009902954, "learning_rate": 5e-06, "logits/chosen": -1771884.0, "logits/rejected": -17250840.0, "logps/chosen": -405.97119140625, "logps/rejected": -572.5330200195312, "loss": 0.0485, "rewards/chosen": 5.29817533493042, "rewards/margins": 14.265697002410889, "rewards/rejected": -8.967521667480469, "step": 814 }, { "epoch": 0.2233794710154858, "grad_norm": 11.375, "kl": 2.5503196716308594, "learning_rate": 5e-06, "logits/chosen": -6752412.363636363, "logits/rejected": -7237039.384615385, "logps/chosen": -399.80961470170456, "logps/rejected": -469.5404522235577, "loss": 0.0631, "rewards/chosen": 4.89246957952326, "rewards/margins": 12.958711504102586, "rewards/rejected": -8.066241924579327, "step": 815 }, { "epoch": 0.22365355625599562, "grad_norm": 5.84375, "kl": 7.282975673675537, "learning_rate": 5e-06, "logits/chosen": -19381006.933333334, "logits/rejected": -25690888.888888888, "logps/chosen": -386.33704427083336, "logps/rejected": -423.8784993489583, "loss": 0.0613, "rewards/chosen": 5.49405771891276, "rewards/margins": 11.416936832004122, "rewards/rejected": -5.922879113091363, "step": 816 }, { "epoch": 0.2239276414965054, "grad_norm": 7.90625, "kl": 1.9397945404052734, "learning_rate": 5e-06, "logits/chosen": -26491810.285714287, "logits/rejected": -3768008.705882353, "logps/chosen": -389.41573660714283, "logps/rejected": -551.8060661764706, "loss": 0.0249, "rewards/chosen": 5.859434945242746, "rewards/margins": 12.902331440388656, "rewards/rejected": -7.04289649514591, "step": 817 }, { "epoch": 0.22420172673701522, "grad_norm": 7.0625, "kl": 6.762911796569824, "learning_rate": 5e-06, "logits/chosen": -22964233.6, "logits/rejected": 3309326.8571428573, "logps/chosen": -401.66513671875, "logps/rejected": -606.759765625, "loss": 0.0543, "rewards/chosen": 5.7550914764404295, "rewards/margins": 14.93085310799735, "rewards/rejected": -9.17576163155692, "step": 818 }, { "epoch": 0.224475811977525, "grad_norm": 8.625, "kl": 8.852701187133789, "learning_rate": 5e-06, "logits/chosen": -6928333.866666666, "logits/rejected": -25770796.444444444, "logps/chosen": -460.76402994791664, "logps/rejected": -557.2111545138889, "loss": 0.0303, "rewards/chosen": 7.055112711588541, "rewards/margins": 14.767147148980033, "rewards/rejected": -7.712034437391493, "step": 819 }, { "epoch": 0.22474989721803482, "grad_norm": 9.625, "kl": 4.297152996063232, "learning_rate": 5e-06, "logits/chosen": -9633650.4, "logits/rejected": -11117885.714285715, "logps/chosen": -378.98720703125, "logps/rejected": -460.8983677455357, "loss": 0.041, "rewards/chosen": 4.318521118164062, "rewards/margins": 12.48156018938337, "rewards/rejected": -8.163039071219307, "step": 820 }, { "epoch": 0.2250239824585446, "grad_norm": 8.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 3644907.7333333334, "logits/rejected": 1838000.4444444445, "logps/chosen": -389.21803385416666, "logps/rejected": -411.3120388454861, "loss": 0.0597, "rewards/chosen": 5.54043935139974, "rewards/margins": 12.052359856499567, "rewards/rejected": -6.511920505099827, "step": 821 }, { "epoch": 0.22529806769905442, "grad_norm": 7.03125, "kl": 4.115140914916992, "learning_rate": 5e-06, "logits/chosen": -3832325.5384615385, "logits/rejected": 45581716.36363637, "logps/chosen": -423.9743464543269, "logps/rejected": -549.5117631392045, "loss": 0.0269, "rewards/chosen": 5.354648883526142, "rewards/margins": 12.963849101033244, "rewards/rejected": -7.6092002175071025, "step": 822 }, { "epoch": 0.2255721529395642, "grad_norm": 6.75, "kl": 1.2415618896484375, "learning_rate": 5e-06, "logits/chosen": 6387709.6, "logits/rejected": -16107942.857142856, "logps/chosen": -486.3078125, "logps/rejected": -321.22670200892856, "loss": 0.0321, "rewards/chosen": 5.934611129760742, "rewards/margins": 11.814936447143555, "rewards/rejected": -5.8803253173828125, "step": 823 }, { "epoch": 0.22584623818007402, "grad_norm": 7.78125, "kl": 3.3480162620544434, "learning_rate": 5e-06, "logits/chosen": -9403510.4, "logits/rejected": -10638764.57142857, "logps/chosen": -466.937451171875, "logps/rejected": -557.6642717633929, "loss": 0.0388, "rewards/chosen": 5.875347137451172, "rewards/margins": 14.652554648263115, "rewards/rejected": -8.777207510811943, "step": 824 }, { "epoch": 0.2261203234205838, "grad_norm": 6.3125, "kl": 7.0272440910339355, "learning_rate": 5e-06, "logits/chosen": 3942095.272727273, "logits/rejected": -18973264.0, "logps/chosen": -406.7698863636364, "logps/rejected": -535.8597506009615, "loss": 0.0362, "rewards/chosen": 5.810187946666371, "rewards/margins": 12.375477370682297, "rewards/rejected": -6.565289424015925, "step": 825 }, { "epoch": 0.2263944086610936, "grad_norm": 7.15625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -9365670.4, "logits/rejected": -13169491.42857143, "logps/chosen": -557.17099609375, "logps/rejected": -528.6283133370536, "loss": 0.0474, "rewards/chosen": 6.54110107421875, "rewards/margins": 13.583388083321708, "rewards/rejected": -7.0422870091029575, "step": 826 }, { "epoch": 0.2266684939016034, "grad_norm": 9.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 2509744.3636363638, "logits/rejected": -2630145.230769231, "logps/chosen": -384.45565518465907, "logps/rejected": -475.34555288461536, "loss": 0.0823, "rewards/chosen": 4.410513444380327, "rewards/margins": 10.859631838498416, "rewards/rejected": -6.449118394118089, "step": 827 }, { "epoch": 0.2269425791421132, "grad_norm": 7.8125, "kl": 1.7063522338867188, "learning_rate": 5e-06, "logits/chosen": -13149235.692307692, "logits/rejected": 23127741.09090909, "logps/chosen": -406.3348858173077, "logps/rejected": -502.53151633522725, "loss": 0.0424, "rewards/chosen": 5.391617408165565, "rewards/margins": 13.13696902615207, "rewards/rejected": -7.745351617986506, "step": 828 }, { "epoch": 0.227216664382623, "grad_norm": 12.375, "kl": 0.6445509791374207, "learning_rate": 5e-06, "logits/chosen": -8874888.615384616, "logits/rejected": -1484684.3636363635, "logps/chosen": -322.11268028846155, "logps/rejected": -424.9056285511364, "loss": 0.0947, "rewards/chosen": 4.345704885629507, "rewards/margins": 10.466566979468286, "rewards/rejected": -6.120862093838778, "step": 829 }, { "epoch": 0.2274907496231328, "grad_norm": 6.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -21789399.111111112, "logits/rejected": 11621902.933333334, "logps/chosen": -376.5714518229167, "logps/rejected": -587.4615234375, "loss": 0.0312, "rewards/chosen": 4.899071163601345, "rewards/margins": 15.090054236518013, "rewards/rejected": -10.190983072916667, "step": 830 }, { "epoch": 0.2277648348636426, "grad_norm": 5.1875, "kl": 6.841798782348633, "learning_rate": 5e-06, "logits/chosen": -741924.6666666666, "logits/rejected": -22391357.333333332, "logps/chosen": -455.0276692708333, "logps/rejected": -507.3132731119792, "loss": 0.0249, "rewards/chosen": 5.8507639567057295, "rewards/margins": 12.74376932779948, "rewards/rejected": -6.89300537109375, "step": 831 }, { "epoch": 0.2280389201041524, "grad_norm": 7.625, "kl": 5.603376388549805, "learning_rate": 5e-06, "logits/chosen": -14874090.666666666, "logits/rejected": 1814346.6666666667, "logps/chosen": -427.7555338541667, "logps/rejected": -572.5993245442709, "loss": 0.0734, "rewards/chosen": 4.628363291422526, "rewards/margins": 14.726048787434895, "rewards/rejected": -10.09768549601237, "step": 832 }, { "epoch": 0.2283130053446622, "grad_norm": 12.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -9078902.4, "logits/rejected": -8200260.0, "logps/chosen": -504.258935546875, "logps/rejected": -487.64341517857144, "loss": 0.0438, "rewards/chosen": 5.383628082275391, "rewards/margins": 12.725254930768696, "rewards/rejected": -7.341626848493304, "step": 833 }, { "epoch": 0.228587090585172, "grad_norm": 7.25, "kl": 12.277402877807617, "learning_rate": 5e-06, "logits/chosen": -29135682.0, "logits/rejected": -25316868.0, "logps/chosen": -446.307861328125, "logps/rejected": -466.001708984375, "loss": 0.0463, "rewards/chosen": 6.411397457122803, "rewards/margins": 14.00470495223999, "rewards/rejected": -7.5933074951171875, "step": 834 }, { "epoch": 0.2288611758256818, "grad_norm": 3.59375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -14304940.444444444, "logits/rejected": -14484059.733333332, "logps/chosen": -501.45220269097223, "logps/rejected": -457.49811197916665, "loss": 0.0266, "rewards/chosen": 5.408864339192708, "rewards/margins": 12.365062459309897, "rewards/rejected": -6.956198120117188, "step": 835 }, { "epoch": 0.2291352610661916, "grad_norm": 6.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 1367470.0, "logits/rejected": -7555722.285714285, "logps/chosen": -434.8716796875, "logps/rejected": -539.647705078125, "loss": 0.0369, "rewards/chosen": 5.503591156005859, "rewards/margins": 15.223096466064453, "rewards/rejected": -9.719505310058594, "step": 836 }, { "epoch": 0.22940934630670137, "grad_norm": 9.5, "kl": 1.6526902914047241, "learning_rate": 5e-06, "logits/chosen": -8014273.714285715, "logits/rejected": 27290144.0, "logps/chosen": -339.30283900669644, "logps/rejected": -452.34443359375, "loss": 0.0713, "rewards/chosen": 4.129632677350726, "rewards/margins": 11.169831957135882, "rewards/rejected": -7.040199279785156, "step": 837 }, { "epoch": 0.2296834315472112, "grad_norm": 6.25, "kl": 2.0782599449157715, "learning_rate": 5e-06, "logits/chosen": -4750218.133333334, "logits/rejected": 10421428.444444444, "logps/chosen": -455.10768229166666, "logps/rejected": -545.9123263888889, "loss": 0.0458, "rewards/chosen": 6.895383707682291, "rewards/margins": 13.2638181898329, "rewards/rejected": -6.368434482150608, "step": 838 }, { "epoch": 0.22995751678772097, "grad_norm": 8.4375, "kl": 9.331491470336914, "learning_rate": 5e-06, "logits/chosen": -13010771.764705881, "logits/rejected": -16067992.0, "logps/chosen": -380.3355066636029, "logps/rejected": -625.9729352678571, "loss": 0.0916, "rewards/chosen": 4.750011668485754, "rewards/margins": 15.706105592871914, "rewards/rejected": -10.956093924386161, "step": 839 }, { "epoch": 0.2302316020282308, "grad_norm": 3.90625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -15415754.181818182, "logits/rejected": -7267485.538461538, "logps/chosen": -418.92391690340907, "logps/rejected": -434.18310546875, "loss": 0.0304, "rewards/chosen": 6.664093711159446, "rewards/margins": 14.567186315576514, "rewards/rejected": -7.9030926044170675, "step": 840 }, { "epoch": 0.23050568726874057, "grad_norm": 9.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -7049078.285714285, "logits/rejected": -9818782.11764706, "logps/chosen": -399.5365513392857, "logps/rejected": -525.4549632352941, "loss": 0.0318, "rewards/chosen": 4.99981198992048, "rewards/margins": 12.433035041103844, "rewards/rejected": -7.433223051183364, "step": 841 }, { "epoch": 0.23077977250925039, "grad_norm": 13.4375, "kl": 2.628507137298584, "learning_rate": 5e-06, "logits/chosen": -3693788.3636363638, "logits/rejected": 6916694.769230769, "logps/chosen": -397.1531427556818, "logps/rejected": -567.6492638221154, "loss": 0.0837, "rewards/chosen": 4.833805431019176, "rewards/margins": 11.013066271801929, "rewards/rejected": -6.179260840782752, "step": 842 }, { "epoch": 0.23105385774976017, "grad_norm": 7.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -8401435.333333334, "logits/rejected": 30409528.0, "logps/chosen": -346.2425130208333, "logps/rejected": -715.8655598958334, "loss": 0.0505, "rewards/chosen": 3.380221684773763, "rewards/margins": 14.07386334737142, "rewards/rejected": -10.693641662597656, "step": 843 }, { "epoch": 0.23132794299026999, "grad_norm": 4.9375, "kl": 2.6917293071746826, "learning_rate": 5e-06, "logits/chosen": -26597280.0, "logits/rejected": -23587973.333333332, "logps/chosen": -420.4447428385417, "logps/rejected": -455.9367268880208, "loss": 0.0454, "rewards/chosen": 6.278371175130208, "rewards/margins": 14.02878189086914, "rewards/rejected": -7.750410715738933, "step": 844 }, { "epoch": 0.23160202823077977, "grad_norm": 5.90625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -3715337.1428571427, "logits/rejected": 44092883.2, "logps/chosen": -414.736572265625, "logps/rejected": -546.480322265625, "loss": 0.0261, "rewards/chosen": 5.7176938738141745, "rewards/margins": 16.136469813755582, "rewards/rejected": -10.418775939941407, "step": 845 }, { "epoch": 0.23187611347128956, "grad_norm": 10.6875, "kl": 14.612869262695312, "learning_rate": 5e-06, "logits/chosen": -10613205.333333334, "logits/rejected": -36024458.666666664, "logps/chosen": -573.4852213541667, "logps/rejected": -521.3101671006945, "loss": 0.0426, "rewards/chosen": 7.659720357259115, "rewards/margins": 13.315475718180338, "rewards/rejected": -5.655755360921224, "step": 846 }, { "epoch": 0.23215019871179937, "grad_norm": 6.09375, "kl": 0.1102396696805954, "learning_rate": 5e-06, "logits/chosen": 12241957.333333334, "logits/rejected": -14462712.0, "logps/chosen": -453.8828125, "logps/rejected": -367.6239420572917, "loss": 0.0214, "rewards/chosen": 5.992277145385742, "rewards/margins": 12.533732096354168, "rewards/rejected": -6.541454950968425, "step": 847 }, { "epoch": 0.23242428395230916, "grad_norm": 7.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -2463405.090909091, "logits/rejected": 9108174.76923077, "logps/chosen": -485.3443714488636, "logps/rejected": -463.6696965144231, "loss": 0.0162, "rewards/chosen": 7.034216447310015, "rewards/margins": 15.52078284416999, "rewards/rejected": -8.486566396859976, "step": 848 }, { "epoch": 0.23269836919281897, "grad_norm": 3.859375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 5763527.2727272725, "logits/rejected": -9133715.076923076, "logps/chosen": -478.24240944602275, "logps/rejected": -411.2191631610577, "loss": 0.0127, "rewards/chosen": 5.459336020729759, "rewards/margins": 11.193102429796767, "rewards/rejected": -5.733766409067007, "step": 849 }, { "epoch": 0.23297245443332876, "grad_norm": 7.46875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -10685323.0, "logits/rejected": 19510504.0, "logps/chosen": -472.3848571777344, "logps/rejected": -462.69207763671875, "loss": 0.0483, "rewards/chosen": 5.107512474060059, "rewards/margins": 11.861164569854736, "rewards/rejected": -6.753652095794678, "step": 850 }, { "epoch": 0.23324653967383857, "grad_norm": 6.90625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -6298588.666666667, "logits/rejected": 5180108.666666667, "logps/chosen": -313.38828531901044, "logps/rejected": -432.7322998046875, "loss": 0.0358, "rewards/chosen": 6.31338373819987, "rewards/margins": 13.359962463378906, "rewards/rejected": -7.046578725179036, "step": 851 }, { "epoch": 0.23352062491434836, "grad_norm": 9.6875, "kl": 0.9980697631835938, "learning_rate": 5e-06, "logits/chosen": -11001760.0, "logits/rejected": -5872745.090909091, "logps/chosen": -425.8888972355769, "logps/rejected": -456.88503196022725, "loss": 0.0499, "rewards/chosen": 5.94000244140625, "rewards/margins": 9.860009626908736, "rewards/rejected": -3.920007185502486, "step": 852 }, { "epoch": 0.23379471015485817, "grad_norm": 11.8125, "kl": 1.5001157522201538, "learning_rate": 5e-06, "logits/chosen": -5993779.2, "logits/rejected": -17484995.555555556, "logps/chosen": -438.0805989583333, "logps/rejected": -553.3753255208334, "loss": 0.0492, "rewards/chosen": 4.777060953776042, "rewards/margins": 14.218423122829861, "rewards/rejected": -9.44136216905382, "step": 853 }, { "epoch": 0.23406879539536796, "grad_norm": 11.6875, "kl": 6.219813346862793, "learning_rate": 5e-06, "logits/chosen": -10982214.0, "logits/rejected": -11653192.0, "logps/chosen": -406.85626220703125, "logps/rejected": -539.385986328125, "loss": 0.0572, "rewards/chosen": 6.004615783691406, "rewards/margins": 14.016836166381836, "rewards/rejected": -8.01222038269043, "step": 854 }, { "epoch": 0.23434288063587777, "grad_norm": 3.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -21399753.6, "logits/rejected": -5781651.428571428, "logps/chosen": -486.142529296875, "logps/rejected": -569.9812709263393, "loss": 0.0168, "rewards/chosen": 6.310839462280273, "rewards/margins": 15.23527248927525, "rewards/rejected": -8.924433026994977, "step": 855 }, { "epoch": 0.23461696587638756, "grad_norm": 3.015625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -13221537.0, "logits/rejected": 80187616.0, "logps/chosen": -374.51287841796875, "logps/rejected": -535.7410888671875, "loss": 0.0295, "rewards/chosen": 4.993886947631836, "rewards/margins": 13.549175262451172, "rewards/rejected": -8.555288314819336, "step": 856 }, { "epoch": 0.23489105111689734, "grad_norm": 8.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -3183168.533333333, "logits/rejected": -507579.55555555556, "logps/chosen": -351.6521484375, "logps/rejected": -429.3649088541667, "loss": 0.039, "rewards/chosen": 5.284422810872396, "rewards/margins": 12.303526814778646, "rewards/rejected": -7.01910400390625, "step": 857 }, { "epoch": 0.23516513635740716, "grad_norm": 10.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -11551660.0, "logits/rejected": -4113224.3333333335, "logps/chosen": -363.8453776041667, "logps/rejected": -483.569580078125, "loss": 0.098, "rewards/chosen": 4.33900260925293, "rewards/margins": 10.507422129313152, "rewards/rejected": -6.168419520060222, "step": 858 }, { "epoch": 0.23543922159791694, "grad_norm": 5.46875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 11004660.8, "logits/rejected": -11528645.714285715, "logps/chosen": -397.067578125, "logps/rejected": -531.1529715401786, "loss": 0.0287, "rewards/chosen": 6.7886909484863285, "rewards/margins": 14.73840833391462, "rewards/rejected": -7.9497173854282925, "step": 859 }, { "epoch": 0.23571330683842676, "grad_norm": 12.6875, "kl": 6.069465160369873, "learning_rate": 5e-06, "logits/chosen": -23867997.53846154, "logits/rejected": -20229099.636363637, "logps/chosen": -394.72445913461536, "logps/rejected": -498.8902698863636, "loss": 0.0677, "rewards/chosen": 4.605005997877854, "rewards/margins": 11.642970558646676, "rewards/rejected": -7.037964560768821, "step": 860 }, { "epoch": 0.23598739207893654, "grad_norm": 3.09375, "kl": 2.1913630962371826, "learning_rate": 5e-06, "logits/chosen": -7051408.533333333, "logits/rejected": -9108555.555555556, "logps/chosen": -420.20283203125, "logps/rejected": -683.4813368055555, "loss": 0.0118, "rewards/chosen": 6.3790135701497395, "rewards/margins": 15.89761488172743, "rewards/rejected": -9.51860131157769, "step": 861 }, { "epoch": 0.23626147731944636, "grad_norm": 9.375, "kl": 1.1645368337631226, "learning_rate": 5e-06, "logits/chosen": -11806162.181818182, "logits/rejected": 12422446.76923077, "logps/chosen": -436.21835049715907, "logps/rejected": -687.3269981971154, "loss": 0.0456, "rewards/chosen": 5.256955927068537, "rewards/margins": 16.163410160091374, "rewards/rejected": -10.906454233022837, "step": 862 }, { "epoch": 0.23653556255995614, "grad_norm": 7.21875, "kl": 4.468235969543457, "learning_rate": 5e-06, "logits/chosen": -3313508.923076923, "logits/rejected": -10538740.363636363, "logps/chosen": -425.85415414663464, "logps/rejected": -599.3912020596591, "loss": 0.0158, "rewards/chosen": 6.50668217585637, "rewards/margins": 12.600884390877678, "rewards/rejected": -6.094202215021307, "step": 863 }, { "epoch": 0.23680964780046596, "grad_norm": 4.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -19293977.333333332, "logits/rejected": 9448739.333333334, "logps/chosen": -411.5407307942708, "logps/rejected": -601.5840657552084, "loss": 0.0109, "rewards/chosen": 6.8816986083984375, "rewards/margins": 14.870519002278645, "rewards/rejected": -7.988820393880208, "step": 864 }, { "epoch": 0.23708373304097574, "grad_norm": 14.875, "kl": 3.966329574584961, "learning_rate": 5e-06, "logits/chosen": -10584011.733333332, "logits/rejected": -23760718.222222224, "logps/chosen": -472.82434895833336, "logps/rejected": -454.72056749131946, "loss": 0.0795, "rewards/chosen": 4.874595133463542, "rewards/margins": 10.270219082302518, "rewards/rejected": -5.395623948838976, "step": 865 }, { "epoch": 0.23735781828148556, "grad_norm": 4.46875, "kl": 5.4783525466918945, "learning_rate": 5e-06, "logits/chosen": -13866308.266666668, "logits/rejected": 9327029.333333334, "logps/chosen": -394.29518229166666, "logps/rejected": -559.1533203125, "loss": 0.0194, "rewards/chosen": 5.881537882486979, "rewards/margins": 13.649051581488715, "rewards/rejected": -7.767513699001736, "step": 866 }, { "epoch": 0.23763190352199534, "grad_norm": 4.3125, "kl": 2.953699827194214, "learning_rate": 5e-06, "logits/chosen": -1245680.2307692308, "logits/rejected": -2079248.7272727273, "logps/chosen": -422.2362530048077, "logps/rejected": -442.12362393465907, "loss": 0.0495, "rewards/chosen": 6.007197453425481, "rewards/margins": 11.758277359542312, "rewards/rejected": -5.751079906116832, "step": 867 }, { "epoch": 0.23790598876250513, "grad_norm": 11.8125, "kl": 11.470787048339844, "learning_rate": 5e-06, "logits/chosen": -3365308.0, "logits/rejected": -5892390.5, "logps/chosen": -348.96038818359375, "logps/rejected": -326.9132080078125, "loss": 0.0731, "rewards/chosen": 5.0208635330200195, "rewards/margins": 11.66939640045166, "rewards/rejected": -6.648532867431641, "step": 868 }, { "epoch": 0.23818007400301494, "grad_norm": 11.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 7220483.636363637, "logits/rejected": -8120369.230769231, "logps/chosen": -439.23508522727275, "logps/rejected": -361.92300180288464, "loss": 0.0365, "rewards/chosen": 7.108888799493963, "rewards/margins": 14.812320149028217, "rewards/rejected": -7.703431349534255, "step": 869 }, { "epoch": 0.23845415924352473, "grad_norm": 9.875, "kl": 9.25204849243164, "learning_rate": 5e-06, "logits/chosen": -25950004.57142857, "logits/rejected": 2384699.6, "logps/chosen": -496.029052734375, "logps/rejected": -510.03291015625, "loss": 0.0454, "rewards/chosen": 6.020174843924386, "rewards/margins": 13.560754067557198, "rewards/rejected": -7.540579223632813, "step": 870 }, { "epoch": 0.23872824448403454, "grad_norm": 5.90625, "kl": 2.4701271057128906, "learning_rate": 5e-06, "logits/chosen": 4990902.285714285, "logits/rejected": -6435178.0, "logps/chosen": -470.89132254464283, "logps/rejected": -433.12958984375, "loss": 0.0227, "rewards/chosen": 6.56896482195173, "rewards/margins": 12.813875634329658, "rewards/rejected": -6.244910812377929, "step": 871 }, { "epoch": 0.23900232972454433, "grad_norm": 5.84375, "kl": 1.4306056499481201, "learning_rate": 5e-06, "logits/chosen": 14552681.142857144, "logits/rejected": -20051905.6, "logps/chosen": -370.39090401785717, "logps/rejected": -457.485595703125, "loss": 0.0472, "rewards/chosen": 4.845833914620536, "rewards/margins": 13.901859610421315, "rewards/rejected": -9.05602569580078, "step": 872 }, { "epoch": 0.23927641496505414, "grad_norm": 8.875, "kl": 4.464672088623047, "learning_rate": 5e-06, "logits/chosen": 4895942.285714285, "logits/rejected": -6036876.4, "logps/chosen": -371.23447963169644, "logps/rejected": -452.418408203125, "loss": 0.0543, "rewards/chosen": 4.55328859601702, "rewards/margins": 12.490703146798271, "rewards/rejected": -7.93741455078125, "step": 873 }, { "epoch": 0.23955050020556393, "grad_norm": 16.375, "kl": 3.9444408416748047, "learning_rate": 5e-06, "logits/chosen": -18040964.0, "logits/rejected": -11666229.333333334, "logps/chosen": -394.443115234375, "logps/rejected": -465.3247884114583, "loss": 0.0292, "rewards/chosen": 5.607100168863933, "rewards/margins": 13.94128926595052, "rewards/rejected": -8.334189097086588, "step": 874 }, { "epoch": 0.23982458544607374, "grad_norm": 8.3125, "kl": 4.6769561767578125, "learning_rate": 5e-06, "logits/chosen": -10324859.333333334, "logits/rejected": 3537171.3333333335, "logps/chosen": -456.6743977864583, "logps/rejected": -569.1338297526041, "loss": 0.0274, "rewards/chosen": 5.548547108968099, "rewards/margins": 14.42900276184082, "rewards/rejected": -8.88045565287272, "step": 875 }, { "epoch": 0.24009867068658353, "grad_norm": 13.25, "kl": 3.0734939575195312, "learning_rate": 5e-06, "logits/chosen": -19067107.42857143, "logits/rejected": 2106588.4, "logps/chosen": -424.40157645089283, "logps/rejected": -543.92490234375, "loss": 0.0454, "rewards/chosen": 5.685338156563895, "rewards/margins": 11.476146262032646, "rewards/rejected": -5.79080810546875, "step": 876 }, { "epoch": 0.2403727559270933, "grad_norm": 10.1875, "kl": 6.518584728240967, "learning_rate": 5e-06, "logits/chosen": -2253815.3846153845, "logits/rejected": -2996663.8181818184, "logps/chosen": -390.07117638221155, "logps/rejected": -475.89706143465907, "loss": 0.044, "rewards/chosen": 5.592864403357873, "rewards/margins": 13.244854426884151, "rewards/rejected": -7.651990023526278, "step": 877 }, { "epoch": 0.24064684116760313, "grad_norm": 9.3125, "kl": 4.6004133224487305, "learning_rate": 5e-06, "logits/chosen": -5341806.117647059, "logits/rejected": 8821469.714285715, "logps/chosen": -352.7598230698529, "logps/rejected": -570.0659877232143, "loss": 0.0528, "rewards/chosen": 4.8933868408203125, "rewards/margins": 16.146034240722656, "rewards/rejected": -11.252647399902344, "step": 878 }, { "epoch": 0.2409209264081129, "grad_norm": 11.5625, "kl": 3.9827208518981934, "learning_rate": 5e-06, "logits/chosen": -42888944.0, "logits/rejected": -5705435.333333333, "logps/chosen": -421.3941243489583, "logps/rejected": -390.9328206380208, "loss": 0.0641, "rewards/chosen": 5.503225326538086, "rewards/margins": 13.037641525268555, "rewards/rejected": -7.534416198730469, "step": 879 }, { "epoch": 0.24119501164862273, "grad_norm": 5.875, "kl": 1.0286357402801514, "learning_rate": 5e-06, "logits/chosen": -4672495.333333333, "logits/rejected": -20750872.0, "logps/chosen": -466.2459309895833, "logps/rejected": -491.5945638020833, "loss": 0.021, "rewards/chosen": 5.598375956217448, "rewards/margins": 11.390850067138672, "rewards/rejected": -5.792474110921224, "step": 880 }, { "epoch": 0.2414690968891325, "grad_norm": 7.5625, "kl": 2.0841147899627686, "learning_rate": 5e-06, "logits/chosen": 17015794.666666668, "logits/rejected": -19451601.333333332, "logps/chosen": -476.5810953776042, "logps/rejected": -448.53076171875, "loss": 0.0287, "rewards/chosen": 5.2958634694417315, "rewards/margins": 12.929672876993815, "rewards/rejected": -7.633809407552083, "step": 881 }, { "epoch": 0.24174318212964233, "grad_norm": 8.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -11155120.888888888, "logits/rejected": -9350574.933333334, "logps/chosen": -374.6775716145833, "logps/rejected": -501.93951822916665, "loss": 0.0327, "rewards/chosen": 6.834637112087673, "rewards/margins": 12.752752346462673, "rewards/rejected": -5.918115234375, "step": 882 }, { "epoch": 0.2420172673701521, "grad_norm": 9.75, "kl": 2.4677138328552246, "learning_rate": 5e-06, "logits/chosen": 963511.7142857143, "logits/rejected": -10347238.4, "logps/chosen": -487.35707310267856, "logps/rejected": -471.41796875, "loss": 0.0659, "rewards/chosen": 5.651353018624442, "rewards/margins": 14.863739340645925, "rewards/rejected": -9.212386322021484, "step": 883 }, { "epoch": 0.24229135261066193, "grad_norm": 7.71875, "kl": 2.6126277446746826, "learning_rate": 5e-06, "logits/chosen": -17941994.666666668, "logits/rejected": 12105354.666666666, "logps/chosen": -570.7707112630209, "logps/rejected": -636.6070556640625, "loss": 0.055, "rewards/chosen": 6.329636891682942, "rewards/margins": 15.800102233886719, "rewards/rejected": -9.470465342203775, "step": 884 }, { "epoch": 0.2425654378511717, "grad_norm": 5.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -21772342.153846152, "logits/rejected": -18218276.363636363, "logps/chosen": -412.3405198317308, "logps/rejected": -598.7698863636364, "loss": 0.0186, "rewards/chosen": 5.983622624323918, "rewards/margins": 16.012707263439687, "rewards/rejected": -10.029084639115768, "step": 885 }, { "epoch": 0.24283952309168153, "grad_norm": 7.34375, "kl": 0.8917821645736694, "learning_rate": 5e-06, "logits/chosen": -5120956.333333333, "logits/rejected": -13650992.0, "logps/chosen": -438.1419677734375, "logps/rejected": -446.6163736979167, "loss": 0.0488, "rewards/chosen": 5.668552398681641, "rewards/margins": 11.684336980183918, "rewards/rejected": -6.015784581502278, "step": 886 }, { "epoch": 0.2431136083321913, "grad_norm": 6.34375, "kl": 1.4322662353515625, "learning_rate": 5e-06, "logits/chosen": -5776847.05882353, "logits/rejected": -4654774.285714285, "logps/chosen": -478.38792509191177, "logps/rejected": -465.9013671875, "loss": 0.0343, "rewards/chosen": 5.947717105641084, "rewards/margins": 14.71735734699153, "rewards/rejected": -8.769640241350446, "step": 887 }, { "epoch": 0.2433876935727011, "grad_norm": 13.1875, "kl": 3.878490447998047, "learning_rate": 5e-06, "logits/chosen": 1161720.6153846155, "logits/rejected": -10451024.727272727, "logps/chosen": -428.4821213942308, "logps/rejected": -398.30934836647725, "loss": 0.1065, "rewards/chosen": 5.836492685171274, "rewards/margins": 12.316798443560835, "rewards/rejected": -6.48030575838956, "step": 888 }, { "epoch": 0.2436617788132109, "grad_norm": 6.96875, "kl": 0.18441645801067352, "learning_rate": 5e-06, "logits/chosen": 500479.1111111111, "logits/rejected": 8820059.733333332, "logps/chosen": -455.00005425347223, "logps/rejected": -467.6315104166667, "loss": 0.0403, "rewards/chosen": 6.93484624226888, "rewards/margins": 13.043108876546224, "rewards/rejected": -6.108262634277343, "step": 889 }, { "epoch": 0.2439358640537207, "grad_norm": 11.625, "kl": 13.631585121154785, "learning_rate": 5e-06, "logits/chosen": -10566150.0, "logits/rejected": 2417871.0, "logps/chosen": -451.55584716796875, "logps/rejected": -634.0518188476562, "loss": 0.0753, "rewards/chosen": 6.989587306976318, "rewards/margins": 14.261556625366211, "rewards/rejected": -7.271969318389893, "step": 890 }, { "epoch": 0.2442099492942305, "grad_norm": 12.4375, "kl": 0.5342572927474976, "learning_rate": 5e-06, "logits/chosen": 3818223.4285714286, "logits/rejected": -65124.9, "logps/chosen": -385.6034458705357, "logps/rejected": -377.2816650390625, "loss": 0.0669, "rewards/chosen": 6.423535483224051, "rewards/margins": 12.09806627546038, "rewards/rejected": -5.674530792236328, "step": 891 }, { "epoch": 0.2444840345347403, "grad_norm": 5.25, "kl": 0.5142968893051147, "learning_rate": 5e-06, "logits/chosen": -5920954.0, "logits/rejected": -3485066.3333333335, "logps/chosen": -471.6937255859375, "logps/rejected": -383.9448649088542, "loss": 0.0195, "rewards/chosen": 6.093478520711263, "rewards/margins": 12.809310277303059, "rewards/rejected": -6.715831756591797, "step": 892 }, { "epoch": 0.2447581197752501, "grad_norm": 10.3125, "kl": 11.916531562805176, "learning_rate": 5e-06, "logits/chosen": 1576177.6, "logits/rejected": 7609912.888888889, "logps/chosen": -514.5260416666666, "logps/rejected": -483.81488715277777, "loss": 0.0949, "rewards/chosen": 6.379988098144532, "rewards/margins": 14.30866427951389, "rewards/rejected": -7.928676181369358, "step": 893 }, { "epoch": 0.2450322050157599, "grad_norm": 3.984375, "kl": 4.396402359008789, "learning_rate": 5e-06, "logits/chosen": -7265622.4, "logits/rejected": -7037759.428571428, "logps/chosen": -438.014599609375, "logps/rejected": -528.0247628348214, "loss": 0.015, "rewards/chosen": 7.045130920410156, "rewards/margins": 15.603974805559432, "rewards/rejected": -8.558843885149274, "step": 894 }, { "epoch": 0.2453062902562697, "grad_norm": 12.0, "kl": 7.803592205047607, "learning_rate": 5e-06, "logits/chosen": 4820914.8, "logits/rejected": 331545.25, "logps/chosen": -404.36357421875, "logps/rejected": -395.761474609375, "loss": 0.056, "rewards/chosen": 5.558138656616211, "rewards/margins": 11.229707935878208, "rewards/rejected": -5.671569279261997, "step": 895 }, { "epoch": 0.2455803754967795, "grad_norm": 10.75, "kl": 8.456311225891113, "learning_rate": 5e-06, "logits/chosen": 3609885.1764705884, "logits/rejected": -25494605.714285713, "logps/chosen": -368.40349264705884, "logps/rejected": -655.7732979910714, "loss": 0.1357, "rewards/chosen": 4.552709691664752, "rewards/margins": 14.29209393413127, "rewards/rejected": -9.739384242466517, "step": 896 }, { "epoch": 0.2458544607372893, "grad_norm": 11.875, "kl": 0.22890345752239227, "learning_rate": 5e-06, "logits/chosen": 2480576.2, "logits/rejected": -4218635.428571428, "logps/chosen": -321.97138671875, "logps/rejected": -498.01210239955356, "loss": 0.0737, "rewards/chosen": 3.866904067993164, "rewards/margins": 10.917189952305385, "rewards/rejected": -7.050285884312221, "step": 897 }, { "epoch": 0.2461285459777991, "grad_norm": 8.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 822912.6, "logits/rejected": -201781.85714285713, "logps/chosen": -300.694384765625, "logps/rejected": -484.88295200892856, "loss": 0.1059, "rewards/chosen": 3.2070392608642577, "rewards/margins": 9.053189250401088, "rewards/rejected": -5.846149989536831, "step": 898 }, { "epoch": 0.24640263121830888, "grad_norm": 8.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 2828576.3333333335, "logits/rejected": -7794964.0, "logps/chosen": -343.1846110026042, "logps/rejected": -340.7328287760417, "loss": 0.0848, "rewards/chosen": 5.4699045817057295, "rewards/margins": 9.893821398417156, "rewards/rejected": -4.423916816711426, "step": 899 }, { "epoch": 0.2466767164588187, "grad_norm": 6.625, "kl": 3.162677764892578, "learning_rate": 5e-06, "logits/chosen": -9911075.2, "logits/rejected": -17805722.666666668, "logps/chosen": -394.2755859375, "logps/rejected": -454.52826605902777, "loss": 0.0339, "rewards/chosen": 6.296118672688802, "rewards/margins": 12.12198452419705, "rewards/rejected": -5.825865851508246, "step": 900 }, { "epoch": 0.24695080169932848, "grad_norm": 5.875, "kl": 3.4440486431121826, "learning_rate": 5e-06, "logits/chosen": -11467348.57142857, "logits/rejected": -14295361.6, "logps/chosen": -395.06637137276783, "logps/rejected": -474.60205078125, "loss": 0.045, "rewards/chosen": 5.409457615443638, "rewards/margins": 10.031068638392856, "rewards/rejected": -4.621611022949219, "step": 901 }, { "epoch": 0.2472248869398383, "grad_norm": 3.0, "kl": 1.7582563161849976, "learning_rate": 5e-06, "logits/chosen": -18748304.0, "logits/rejected": -18660110.85714286, "logps/chosen": -325.14931640625, "logps/rejected": -573.3818708147321, "loss": 0.0216, "rewards/chosen": 5.821567535400391, "rewards/margins": 13.930174691336495, "rewards/rejected": -8.108607155936104, "step": 902 }, { "epoch": 0.24749897218034808, "grad_norm": 6.34375, "kl": 0.4022468030452728, "learning_rate": 5e-06, "logits/chosen": -9868430.545454545, "logits/rejected": 3265392.0, "logps/chosen": -461.02059659090907, "logps/rejected": -606.2365534855769, "loss": 0.0333, "rewards/chosen": 6.155001553622159, "rewards/margins": 14.899684452510382, "rewards/rejected": -8.744682898888222, "step": 903 }, { "epoch": 0.2477730574208579, "grad_norm": 4.34375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 18596872.0, "logits/rejected": -24515113.6, "logps/chosen": -373.15157645089283, "logps/rejected": -443.896533203125, "loss": 0.0426, "rewards/chosen": 4.868895394461496, "rewards/margins": 11.96216321672712, "rewards/rejected": -7.093267822265625, "step": 904 }, { "epoch": 0.24804714266136768, "grad_norm": 9.875, "kl": 1.2324079275131226, "learning_rate": 5e-06, "logits/chosen": -5615712.0, "logits/rejected": 28971086.769230768, "logps/chosen": -473.41335227272725, "logps/rejected": -590.9322415865385, "loss": 0.028, "rewards/chosen": 5.248553882945668, "rewards/margins": 13.586131115893384, "rewards/rejected": -8.337577232947716, "step": 905 }, { "epoch": 0.2483212279018775, "grad_norm": 7.53125, "kl": 6.313266754150391, "learning_rate": 5e-06, "logits/chosen": -4732.846153846154, "logits/rejected": 20555780.363636363, "logps/chosen": -383.08882962740387, "logps/rejected": -378.05282315340907, "loss": 0.0587, "rewards/chosen": 5.67813227726863, "rewards/margins": 10.993088942307693, "rewards/rejected": -5.3149566650390625, "step": 906 }, { "epoch": 0.24859531314238728, "grad_norm": 5.71875, "kl": 2.7646586894989014, "learning_rate": 5e-06, "logits/chosen": 2788652.0, "logits/rejected": -8502876.666666666, "logps/chosen": -484.665771484375, "logps/rejected": -618.5754801432291, "loss": 0.0373, "rewards/chosen": 5.906757990519206, "rewards/margins": 14.281230926513672, "rewards/rejected": -8.374472935994467, "step": 907 }, { "epoch": 0.2488693983828971, "grad_norm": 7.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 6585628.0, "logits/rejected": 109823945.14285715, "logps/chosen": -357.176318359375, "logps/rejected": -491.6630859375, "loss": 0.0507, "rewards/chosen": 4.7701984405517575, "rewards/margins": 12.312666920253207, "rewards/rejected": -7.5424684797014505, "step": 908 }, { "epoch": 0.24914348362340688, "grad_norm": 14.0625, "kl": 5.3483734130859375, "learning_rate": 5e-06, "logits/chosen": -8292202.461538462, "logits/rejected": -9132279.272727273, "logps/chosen": -432.00304236778845, "logps/rejected": -372.68319424715907, "loss": 0.0859, "rewards/chosen": 5.593698354867788, "rewards/margins": 11.776576328944493, "rewards/rejected": -6.182877974076704, "step": 909 }, { "epoch": 0.24941756886391667, "grad_norm": 9.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -17409973.333333332, "logits/rejected": -12914493.866666667, "logps/chosen": -527.0060763888889, "logps/rejected": -535.7731770833333, "loss": 0.0313, "rewards/chosen": 7.964708116319445, "rewards/margins": 14.871660529242622, "rewards/rejected": -6.906952412923177, "step": 910 }, { "epoch": 0.24969165410442648, "grad_norm": 13.8125, "kl": 4.882532119750977, "learning_rate": 5e-06, "logits/chosen": -18888985.14285714, "logits/rejected": 30271404.8, "logps/chosen": -495.67372349330356, "logps/rejected": -543.234814453125, "loss": 0.0408, "rewards/chosen": 6.301877702985491, "rewards/margins": 13.817504991803851, "rewards/rejected": -7.51562728881836, "step": 911 }, { "epoch": 0.24996573934493627, "grad_norm": 8.3125, "kl": 6.9503045082092285, "learning_rate": 5e-06, "logits/chosen": -10076728.0, "logits/rejected": -13727842.0, "logps/chosen": -463.40985107421875, "logps/rejected": -469.47760009765625, "loss": 0.0726, "rewards/chosen": 5.730602264404297, "rewards/margins": 12.40587854385376, "rewards/rejected": -6.675276279449463, "step": 912 }, { "epoch": 0.2502398245854461, "grad_norm": 1.4609375, "kl": 0.29437255859375, "learning_rate": 5e-06, "logits/chosen": 6196024.0, "logits/rejected": -9337963.333333334, "logps/chosen": -436.1181640625, "logps/rejected": -603.20166015625, "loss": 0.0051, "rewards/chosen": 5.8333485921223955, "rewards/margins": 13.59798494974772, "rewards/rejected": -7.764636357625325, "step": 913 }, { "epoch": 0.2505139098259559, "grad_norm": 7.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 53867667.2, "logits/rejected": -8786059.42857143, "logps/chosen": -465.022607421875, "logps/rejected": -433.970458984375, "loss": 0.0464, "rewards/chosen": 6.6525115966796875, "rewards/margins": 14.307448250906809, "rewards/rejected": -7.654936654227121, "step": 914 }, { "epoch": 0.25078799506646565, "grad_norm": 6.84375, "kl": 3.7618823051452637, "learning_rate": 5e-06, "logits/chosen": -13159069.333333334, "logits/rejected": -22417904.0, "logps/chosen": -524.9901936848959, "logps/rejected": -395.2134195963542, "loss": 0.018, "rewards/chosen": 7.9787336985270185, "rewards/margins": 14.535828272501629, "rewards/rejected": -6.557094573974609, "step": 915 }, { "epoch": 0.25106208030697547, "grad_norm": 8.4375, "kl": 7.156263828277588, "learning_rate": 5e-06, "logits/chosen": -17660606.0, "logits/rejected": -11516666.0, "logps/chosen": -346.79144287109375, "logps/rejected": -533.2935791015625, "loss": 0.0556, "rewards/chosen": 5.673281192779541, "rewards/margins": 11.974663734436035, "rewards/rejected": -6.301382541656494, "step": 916 }, { "epoch": 0.2513361655474853, "grad_norm": 8.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -4414292.8, "logits/rejected": -22099988.57142857, "logps/chosen": -433.259765625, "logps/rejected": -427.9249790736607, "loss": 0.0548, "rewards/chosen": 4.791815185546875, "rewards/margins": 12.418247331891742, "rewards/rejected": -7.626432146344866, "step": 917 }, { "epoch": 0.2516102507879951, "grad_norm": 11.75, "kl": 0.24113211035728455, "learning_rate": 5e-06, "logits/chosen": -37867.73333333333, "logits/rejected": 96504632.8888889, "logps/chosen": -385.068359375, "logps/rejected": -564.9070638020834, "loss": 0.0602, "rewards/chosen": 6.2832275390625, "rewards/margins": 15.637029690212673, "rewards/rejected": -9.353802151150173, "step": 918 }, { "epoch": 0.25188433602850485, "grad_norm": 8.875, "kl": 1.7308566570281982, "learning_rate": 5e-06, "logits/chosen": -7336364.307692308, "logits/rejected": 86564986.18181819, "logps/chosen": -475.1014873798077, "logps/rejected": -531.0253018465909, "loss": 0.0681, "rewards/chosen": 4.829886803260217, "rewards/margins": 14.628054638842602, "rewards/rejected": -9.798167835582387, "step": 919 }, { "epoch": 0.25215842126901467, "grad_norm": 9.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 7530956.666666667, "logits/rejected": -12078447.111111112, "logps/chosen": -327.9090576171875, "logps/rejected": -370.92637803819446, "loss": 0.03, "rewards/chosen": 6.435161590576172, "rewards/margins": 12.497562408447266, "rewards/rejected": -6.062400817871094, "step": 920 }, { "epoch": 0.2524325065095245, "grad_norm": 11.4375, "kl": 4.517287254333496, "learning_rate": 5e-06, "logits/chosen": -9347672.666666666, "logits/rejected": -2737953.3333333335, "logps/chosen": -427.499267578125, "logps/rejected": -537.3381754557291, "loss": 0.0547, "rewards/chosen": 5.07589594523112, "rewards/margins": 11.91788164774577, "rewards/rejected": -6.841985702514648, "step": 921 }, { "epoch": 0.25270659175003424, "grad_norm": 12.9375, "kl": 8.73958683013916, "learning_rate": 5e-06, "logits/chosen": -6200136.0, "logits/rejected": -10963647.2, "logps/chosen": -486.68648856026783, "logps/rejected": -620.94697265625, "loss": 0.061, "rewards/chosen": 6.070579528808594, "rewards/margins": 14.079050445556641, "rewards/rejected": -8.008470916748047, "step": 922 }, { "epoch": 0.25298067699054405, "grad_norm": 9.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -14990013.090909092, "logits/rejected": -11253555.692307692, "logps/chosen": -421.90580610795456, "logps/rejected": -401.7461688701923, "loss": 0.0561, "rewards/chosen": 6.623251481489702, "rewards/margins": 12.239715362762238, "rewards/rejected": -5.616463881272536, "step": 923 }, { "epoch": 0.25325476223105386, "grad_norm": 2.1875, "kl": 2.2463645935058594, "learning_rate": 5e-06, "logits/chosen": -13219335.466666667, "logits/rejected": -21139651.555555556, "logps/chosen": -407.77161458333336, "logps/rejected": -457.65668402777777, "loss": 0.011, "rewards/chosen": 5.629483032226562, "rewards/margins": 12.574112277560765, "rewards/rejected": -6.944629245334202, "step": 924 }, { "epoch": 0.2535288474715637, "grad_norm": 6.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -13847524.0, "logits/rejected": -6613991.0, "logps/chosen": -492.1877136230469, "logps/rejected": -429.9212646484375, "loss": 0.0627, "rewards/chosen": 4.666096210479736, "rewards/margins": 10.960054874420166, "rewards/rejected": -6.29395866394043, "step": 925 }, { "epoch": 0.25380293271207344, "grad_norm": 4.5, "kl": 0.01488494873046875, "learning_rate": 5e-06, "logits/chosen": -10508772.0, "logits/rejected": 22788652.0, "logps/chosen": -442.5178527832031, "logps/rejected": -452.7877197265625, "loss": 0.0151, "rewards/chosen": 6.099307060241699, "rewards/margins": 14.387831687927246, "rewards/rejected": -8.288524627685547, "step": 926 }, { "epoch": 0.25407701795258325, "grad_norm": 7.0, "kl": 0.38301214575767517, "learning_rate": 5e-06, "logits/chosen": 76005767.1111111, "logits/rejected": -2757578.6666666665, "logps/chosen": -529.3077256944445, "logps/rejected": -470.68916015625, "loss": 0.0298, "rewards/chosen": 6.033032735188802, "rewards/margins": 13.695370992024738, "rewards/rejected": -7.6623382568359375, "step": 927 }, { "epoch": 0.25435110319309306, "grad_norm": 7.65625, "kl": 12.514925956726074, "learning_rate": 5e-06, "logits/chosen": -13094809.6, "logits/rejected": 21575957.333333332, "logps/chosen": -452.8899739583333, "logps/rejected": -416.86382378472223, "loss": 0.0373, "rewards/chosen": 7.530692036946615, "rewards/margins": 12.830259365505643, "rewards/rejected": -5.299567328559028, "step": 928 }, { "epoch": 0.2546251884336029, "grad_norm": 8.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -17092971.636363637, "logits/rejected": -6762188.923076923, "logps/chosen": -498.63045987215907, "logps/rejected": -508.0510817307692, "loss": 0.0421, "rewards/chosen": 6.120948097922585, "rewards/margins": 13.715166425371503, "rewards/rejected": -7.594218327448918, "step": 929 }, { "epoch": 0.25489927367411264, "grad_norm": 14.8125, "kl": 13.12739086151123, "learning_rate": 5e-06, "logits/chosen": -7100736.470588235, "logits/rejected": -19221458.285714287, "logps/chosen": -402.6879308363971, "logps/rejected": -338.19754464285717, "loss": 0.0843, "rewards/chosen": 5.8213635612936585, "rewards/margins": 13.522131078383502, "rewards/rejected": -7.700767517089844, "step": 930 }, { "epoch": 0.25517335891462245, "grad_norm": 8.0625, "kl": 1.4539146423339844, "learning_rate": 5e-06, "logits/chosen": -13405831.384615384, "logits/rejected": -1545909.8181818181, "logps/chosen": -439.3039738581731, "logps/rejected": -433.34419389204544, "loss": 0.0452, "rewards/chosen": 5.321765606219952, "rewards/margins": 11.661070443533518, "rewards/rejected": -6.339304837313565, "step": 931 }, { "epoch": 0.25544744415513226, "grad_norm": 10.0625, "kl": 3.399259090423584, "learning_rate": 5e-06, "logits/chosen": -8413127.272727273, "logits/rejected": -27279332.923076924, "logps/chosen": -391.43581321022725, "logps/rejected": -502.1301457331731, "loss": 0.0445, "rewards/chosen": 6.071232188831676, "rewards/margins": 12.855521035361123, "rewards/rejected": -6.7842888465294475, "step": 932 }, { "epoch": 0.255721529395642, "grad_norm": 2.46875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -2944584.3333333335, "logits/rejected": 7941820.0, "logps/chosen": -428.3954264322917, "logps/rejected": -517.614501953125, "loss": 0.0097, "rewards/chosen": 7.481566111246745, "rewards/margins": 15.449105580647787, "rewards/rejected": -7.967539469401042, "step": 933 }, { "epoch": 0.25599561463615184, "grad_norm": 10.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -19035586.666666668, "logits/rejected": -23632624.0, "logps/chosen": -423.1187337239583, "logps/rejected": -536.0687255859375, "loss": 0.0633, "rewards/chosen": 4.428627014160156, "rewards/margins": 13.217323303222656, "rewards/rejected": -8.7886962890625, "step": 934 }, { "epoch": 0.25626969987666165, "grad_norm": 9.0, "kl": 4.850799560546875, "learning_rate": 5e-06, "logits/chosen": -26773099.2, "logits/rejected": 957733.5714285715, "logps/chosen": -374.823876953125, "logps/rejected": -399.3833705357143, "loss": 0.0815, "rewards/chosen": 4.907474517822266, "rewards/margins": 9.542278289794922, "rewards/rejected": -4.634803771972656, "step": 935 }, { "epoch": 0.25654378511717146, "grad_norm": 10.125, "kl": 1.4296506643295288, "learning_rate": 5e-06, "logits/chosen": 7132537.333333333, "logits/rejected": -16720717.333333334, "logps/chosen": -461.3999430338542, "logps/rejected": -410.0968017578125, "loss": 0.0239, "rewards/chosen": 7.1636098225911455, "rewards/margins": 13.945627212524414, "rewards/rejected": -6.7820173899332685, "step": 936 }, { "epoch": 0.2568178703576812, "grad_norm": 5.78125, "kl": 0.2263285368680954, "learning_rate": 5e-06, "logits/chosen": -26801489.454545453, "logits/rejected": -10626057.846153846, "logps/chosen": -473.2156427556818, "logps/rejected": -485.4734074519231, "loss": 0.0334, "rewards/chosen": 6.293356461958452, "rewards/margins": 12.83051689521416, "rewards/rejected": -6.537160433255709, "step": 937 }, { "epoch": 0.25709195559819104, "grad_norm": 13.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -25814518.85714286, "logits/rejected": 16868707.76470588, "logps/chosen": -533.4038434709821, "logps/rejected": -456.38634535845586, "loss": 0.0855, "rewards/chosen": 7.523799896240234, "rewards/margins": 12.844219207763672, "rewards/rejected": -5.3204193115234375, "step": 938 }, { "epoch": 0.25736604083870085, "grad_norm": 7.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 21786222.545454547, "logits/rejected": 8213337.230769231, "logps/chosen": -404.57803622159093, "logps/rejected": -502.85013521634613, "loss": 0.0426, "rewards/chosen": 6.61287411776456, "rewards/margins": 15.423713844139257, "rewards/rejected": -8.810839726374699, "step": 939 }, { "epoch": 0.2576401260792106, "grad_norm": 11.5625, "kl": 1.5645307302474976, "learning_rate": 5e-06, "logits/chosen": 2765695.3846153845, "logits/rejected": 14542126.545454545, "logps/chosen": -408.25863882211536, "logps/rejected": -648.1149680397727, "loss": 0.0527, "rewards/chosen": 4.792485750638521, "rewards/margins": 16.15737402855933, "rewards/rejected": -11.36488827792081, "step": 940 }, { "epoch": 0.2579142113197204, "grad_norm": 5.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 9941264.0, "logits/rejected": -10818266.0, "logps/chosen": -357.4606119791667, "logps/rejected": -486.0686442057292, "loss": 0.036, "rewards/chosen": 5.202293395996094, "rewards/margins": 13.249788284301758, "rewards/rejected": -8.047494888305664, "step": 941 }, { "epoch": 0.25818829656023023, "grad_norm": 8.1875, "kl": 2.035142421722412, "learning_rate": 5e-06, "logits/chosen": -10975740.0, "logits/rejected": -948173.4285714285, "logps/chosen": -412.54267578125, "logps/rejected": -449.9885951450893, "loss": 0.0368, "rewards/chosen": 6.060214996337891, "rewards/margins": 12.400022779192243, "rewards/rejected": -6.339807782854352, "step": 942 }, { "epoch": 0.25846238180074005, "grad_norm": 4.6875, "kl": 5.574146270751953, "learning_rate": 5e-06, "logits/chosen": -5063740.0, "logits/rejected": 17181104.0, "logps/chosen": -451.06549072265625, "logps/rejected": -635.756591796875, "loss": 0.0191, "rewards/chosen": 5.601768493652344, "rewards/margins": 16.67848300933838, "rewards/rejected": -11.076714515686035, "step": 943 }, { "epoch": 0.2587364670412498, "grad_norm": 10.5, "kl": 0.9868850708007812, "learning_rate": 5e-06, "logits/chosen": 3458957.5384615385, "logits/rejected": 23426443.636363637, "logps/chosen": -466.24181189903845, "logps/rejected": -457.88960404829544, "loss": 0.0445, "rewards/chosen": 5.7142486572265625, "rewards/margins": 11.384352597323332, "rewards/rejected": -5.6701039400967685, "step": 944 }, { "epoch": 0.2590105522817596, "grad_norm": 3.578125, "kl": 1.4286088943481445, "learning_rate": 5e-06, "logits/chosen": -15297911.272727273, "logits/rejected": -21364731.076923076, "logps/chosen": -427.86075106534093, "logps/rejected": -445.92931189903845, "loss": 0.0139, "rewards/chosen": 6.472465515136719, "rewards/margins": 14.033012390136719, "rewards/rejected": -7.560546875, "step": 945 }, { "epoch": 0.25928463752226943, "grad_norm": 14.8125, "kl": 9.275156021118164, "learning_rate": 5e-06, "logits/chosen": -17854344.0, "logits/rejected": -8699460.8, "logps/chosen": -442.778564453125, "logps/rejected": -437.7201171875, "loss": 0.1009, "rewards/chosen": 6.077787126813616, "rewards/margins": 12.359316362653459, "rewards/rejected": -6.281529235839844, "step": 946 }, { "epoch": 0.25955872276277925, "grad_norm": 8.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -3239731.5555555555, "logits/rejected": -11263851.733333332, "logps/chosen": -307.06377495659723, "logps/rejected": -653.6268880208333, "loss": 0.0812, "rewards/chosen": 5.745368957519531, "rewards/margins": 18.10077667236328, "rewards/rejected": -12.35540771484375, "step": 947 }, { "epoch": 0.259832808003289, "grad_norm": 10.1875, "kl": 4.156101226806641, "learning_rate": 5e-06, "logits/chosen": -5595590.666666667, "logits/rejected": -6885563.333333333, "logps/chosen": -435.846435546875, "logps/rejected": -295.63063557942706, "loss": 0.0498, "rewards/chosen": 5.35194714864095, "rewards/margins": 10.460397402445475, "rewards/rejected": -5.108450253804524, "step": 948 }, { "epoch": 0.2601068932437988, "grad_norm": 13.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -18743166.545454547, "logits/rejected": 18522653.53846154, "logps/chosen": -437.55317826704544, "logps/rejected": -432.8920147235577, "loss": 0.0446, "rewards/chosen": 5.558706456964666, "rewards/margins": 11.530129679433117, "rewards/rejected": -5.97142322246845, "step": 949 }, { "epoch": 0.26038097848430863, "grad_norm": 7.46875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -17414065.6, "logits/rejected": 41745490.28571428, "logps/chosen": -442.70380859375, "logps/rejected": -640.3610491071429, "loss": 0.0177, "rewards/chosen": 6.492620849609375, "rewards/margins": 14.203569248744419, "rewards/rejected": -7.710948399135044, "step": 950 }, { "epoch": 0.2606550637248184, "grad_norm": 8.0, "kl": 9.705123901367188, "learning_rate": 5e-06, "logits/chosen": -21389548.307692308, "logits/rejected": -4786125.090909091, "logps/chosen": -479.0183293269231, "logps/rejected": -502.0965021306818, "loss": 0.0744, "rewards/chosen": 5.6826031024639425, "rewards/margins": 12.391062436403928, "rewards/rejected": -6.708459333939985, "step": 951 }, { "epoch": 0.2609291489653282, "grad_norm": 6.5, "kl": 1.7347755432128906, "learning_rate": 5e-06, "logits/chosen": -24822039.272727273, "logits/rejected": -21956561.230769232, "logps/chosen": -503.6298828125, "logps/rejected": -387.9125225360577, "loss": 0.0408, "rewards/chosen": 6.578715931285512, "rewards/margins": 12.98420886059741, "rewards/rejected": -6.405492929311899, "step": 952 }, { "epoch": 0.261203234205838, "grad_norm": 4.0625, "kl": 5.236913204193115, "learning_rate": 5e-06, "logits/chosen": -23705184.0, "logits/rejected": -2028384.888888889, "logps/chosen": -360.6419921875, "logps/rejected": -516.6667751736111, "loss": 0.0158, "rewards/chosen": 5.900875854492187, "rewards/margins": 12.827637905544705, "rewards/rejected": -6.926762051052517, "step": 953 }, { "epoch": 0.26147731944634783, "grad_norm": 8.6875, "kl": 0.9404615163803101, "learning_rate": 5e-06, "logits/chosen": -18541736.615384616, "logits/rejected": -10328807.272727273, "logps/chosen": -398.70229867788464, "logps/rejected": -507.57759232954544, "loss": 0.0481, "rewards/chosen": 6.301127507136418, "rewards/margins": 14.371227771252187, "rewards/rejected": -8.070100264115768, "step": 954 }, { "epoch": 0.2617514046868576, "grad_norm": 7.875, "kl": 0.39804649353027344, "learning_rate": 5e-06, "logits/chosen": 13279889.6, "logits/rejected": -24020688.0, "logps/chosen": -401.1297607421875, "logps/rejected": -577.0579659598214, "loss": 0.0441, "rewards/chosen": 4.8051410675048825, "rewards/margins": 13.83436655317034, "rewards/rejected": -9.029225485665458, "step": 955 }, { "epoch": 0.2620254899273674, "grad_norm": 7.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -3189024.0, "logits/rejected": -25619140.57142857, "logps/chosen": -437.06552734375, "logps/rejected": -421.9520786830357, "loss": 0.0253, "rewards/chosen": 6.554267883300781, "rewards/margins": 15.914881025041852, "rewards/rejected": -9.360613141741071, "step": 956 }, { "epoch": 0.2622995751678772, "grad_norm": 7.28125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -3373718.6666666665, "logits/rejected": -26068816.0, "logps/chosen": -416.0738932291667, "logps/rejected": -521.2947591145834, "loss": 0.0347, "rewards/chosen": 5.346377690633138, "rewards/margins": 14.520790735880535, "rewards/rejected": -9.174413045247396, "step": 957 }, { "epoch": 0.26257366040838703, "grad_norm": 9.375, "kl": 2.0879924297332764, "learning_rate": 5e-06, "logits/chosen": -12908710.153846154, "logits/rejected": -8553201.454545455, "logps/chosen": -473.35494290865387, "logps/rejected": -459.91020063920456, "loss": 0.0334, "rewards/chosen": 6.431424654447115, "rewards/margins": 13.793816733193564, "rewards/rejected": -7.362392078746449, "step": 958 }, { "epoch": 0.2628477456488968, "grad_norm": 2.296875, "kl": 2.938140869140625, "learning_rate": 5e-06, "logits/chosen": -10537907.2, "logits/rejected": -11319142.222222222, "logps/chosen": -489.4039713541667, "logps/rejected": -541.1126844618055, "loss": 0.0102, "rewards/chosen": 6.293031311035156, "rewards/margins": 15.245240783691406, "rewards/rejected": -8.95220947265625, "step": 959 }, { "epoch": 0.2631218308894066, "grad_norm": 9.5, "kl": 5.80673885345459, "learning_rate": 5e-06, "logits/chosen": -5391376.94117647, "logits/rejected": 1361037.857142857, "logps/chosen": -493.67176011029414, "logps/rejected": -582.36767578125, "loss": 0.0504, "rewards/chosen": 5.294521107393153, "rewards/margins": 14.139217697271778, "rewards/rejected": -8.844696589878627, "step": 960 }, { "epoch": 0.2633959161299164, "grad_norm": 4.21875, "kl": 5.678752899169922, "learning_rate": 5e-06, "logits/chosen": -4585215.2, "logits/rejected": -18316629.714285713, "logps/chosen": -419.330859375, "logps/rejected": -368.00830078125, "loss": 0.0212, "rewards/chosen": 5.6047416687011715, "rewards/margins": 11.84775913783482, "rewards/rejected": -6.243017469133649, "step": 961 }, { "epoch": 0.2636700013704262, "grad_norm": 11.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -11604454.666666666, "logits/rejected": -426368.0, "logps/chosen": -365.679443359375, "logps/rejected": -419.4940185546875, "loss": 0.0922, "rewards/chosen": 3.6728665033976235, "rewards/margins": 11.11265786488851, "rewards/rejected": -7.439791361490886, "step": 962 }, { "epoch": 0.263944086610936, "grad_norm": 3.4375, "kl": 1.3374608755111694, "learning_rate": 5e-06, "logits/chosen": -17885184.0, "logits/rejected": 32626876.0, "logps/chosen": -398.31536865234375, "logps/rejected": -672.5433349609375, "loss": 0.0106, "rewards/chosen": 5.379430294036865, "rewards/margins": 15.035218715667725, "rewards/rejected": -9.65578842163086, "step": 963 }, { "epoch": 0.2642181718514458, "grad_norm": 10.5625, "kl": 0.6282132863998413, "learning_rate": 5e-06, "logits/chosen": -7376353.230769231, "logits/rejected": 20584650.181818184, "logps/chosen": -288.2746769831731, "logps/rejected": -564.7292258522727, "loss": 0.0494, "rewards/chosen": 5.300265972430889, "rewards/margins": 11.28464929540674, "rewards/rejected": -5.9843833229758525, "step": 964 }, { "epoch": 0.2644922570919556, "grad_norm": 7.71875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -37104252.0, "logits/rejected": -28856902.0, "logps/chosen": -463.9609680175781, "logps/rejected": -454.84942626953125, "loss": 0.0458, "rewards/chosen": 6.546067714691162, "rewards/margins": 13.237388134002686, "rewards/rejected": -6.691320419311523, "step": 965 }, { "epoch": 0.2647663423324654, "grad_norm": 9.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 20605065.333333332, "logits/rejected": -4952044.666666667, "logps/chosen": -372.472900390625, "logps/rejected": -479.3736979166667, "loss": 0.0289, "rewards/chosen": 4.921517690022786, "rewards/margins": 14.148375193277996, "rewards/rejected": -9.226857503255209, "step": 966 }, { "epoch": 0.2650404275729752, "grad_norm": 11.75, "kl": 2.2142951488494873, "learning_rate": 5e-06, "logits/chosen": 8088939.692307692, "logits/rejected": -7275341.090909091, "logps/chosen": -488.35096153846155, "logps/rejected": -409.36328125, "loss": 0.0469, "rewards/chosen": 5.299001253568209, "rewards/margins": 12.64659022618007, "rewards/rejected": -7.34758897261186, "step": 967 }, { "epoch": 0.265314512813485, "grad_norm": 9.25, "kl": 2.3059210777282715, "learning_rate": 5e-06, "logits/chosen": -33436126.0, "logits/rejected": -13831083.0, "logps/chosen": -371.44189453125, "logps/rejected": -502.6777038574219, "loss": 0.0398, "rewards/chosen": 5.133700370788574, "rewards/margins": 12.68954610824585, "rewards/rejected": -7.555845737457275, "step": 968 }, { "epoch": 0.2655885980539948, "grad_norm": 8.5, "kl": 1.1195144653320312, "learning_rate": 5e-06, "logits/chosen": -45799740.44444445, "logits/rejected": -22784825.6, "logps/chosen": -475.7770182291667, "logps/rejected": -551.4458333333333, "loss": 0.023, "rewards/chosen": 6.021810743543837, "rewards/margins": 14.768728468153212, "rewards/rejected": -8.746917724609375, "step": 969 }, { "epoch": 0.2658626832945046, "grad_norm": 3.484375, "kl": 1.4408175945281982, "learning_rate": 5e-06, "logits/chosen": -13356653.090909092, "logits/rejected": -16705595.076923076, "logps/chosen": -428.93887606534093, "logps/rejected": -451.0604717548077, "loss": 0.0155, "rewards/chosen": 6.848553744229403, "rewards/margins": 14.518089454490822, "rewards/rejected": -7.669535710261418, "step": 970 }, { "epoch": 0.2661367685350144, "grad_norm": 9.375, "kl": 1.857261061668396, "learning_rate": 5e-06, "logits/chosen": -21292941.714285713, "logits/rejected": -29703740.8, "logps/chosen": -485.36495535714283, "logps/rejected": -519.17587890625, "loss": 0.0344, "rewards/chosen": 6.820086342947824, "rewards/margins": 15.830750710623605, "rewards/rejected": -9.010664367675782, "step": 971 }, { "epoch": 0.2664108537755242, "grad_norm": 9.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -8251118.4, "logits/rejected": -28313005.714285713, "logps/chosen": -460.565966796875, "logps/rejected": -516.0386788504464, "loss": 0.0352, "rewards/chosen": 5.210797882080078, "rewards/margins": 13.488185228620257, "rewards/rejected": -8.277387346540179, "step": 972 }, { "epoch": 0.26668493901603396, "grad_norm": 10.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -3071187.6666666665, "logits/rejected": 1678408.6666666667, "logps/chosen": -376.8160400390625, "logps/rejected": -677.2494303385416, "loss": 0.0959, "rewards/chosen": 4.310688018798828, "rewards/margins": 14.445522944132486, "rewards/rejected": -10.134834925333658, "step": 973 }, { "epoch": 0.2669590242565438, "grad_norm": 5.71875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -10560091.636363637, "logits/rejected": -887594.1538461539, "logps/chosen": -397.77450284090907, "logps/rejected": -511.9929387019231, "loss": 0.0818, "rewards/chosen": 5.70974384654652, "rewards/margins": 12.299604029088588, "rewards/rejected": -6.5898601825420675, "step": 974 }, { "epoch": 0.2672331094970536, "grad_norm": 4.53125, "kl": 0.11975988000631332, "learning_rate": 5e-06, "logits/chosen": -24671945.14285714, "logits/rejected": -14682316.8, "logps/chosen": -442.70082310267856, "logps/rejected": -454.61201171875, "loss": 0.014, "rewards/chosen": 5.8400726318359375, "rewards/margins": 13.801502990722657, "rewards/rejected": -7.961430358886719, "step": 975 }, { "epoch": 0.2675071947375634, "grad_norm": 5.65625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -2447245.3333333335, "logits/rejected": -18352693.333333332, "logps/chosen": -418.7146809895833, "logps/rejected": -451.53033854166665, "loss": 0.019, "rewards/chosen": 5.473955790201823, "rewards/margins": 12.943558247884114, "rewards/rejected": -7.469602457682291, "step": 976 }, { "epoch": 0.26778127997807316, "grad_norm": 5.46875, "kl": 1.3988406658172607, "learning_rate": 5e-06, "logits/chosen": -33867656.72727273, "logits/rejected": -7088601.846153846, "logps/chosen": -451.82492897727275, "logps/rejected": -462.0129957932692, "loss": 0.0326, "rewards/chosen": 6.2246315696022725, "rewards/margins": 12.480298449109483, "rewards/rejected": -6.255666879507212, "step": 977 }, { "epoch": 0.268055365218583, "grad_norm": 5.71875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -27969584.0, "logits/rejected": 1814046.3333333333, "logps/chosen": -499.2157389322917, "logps/rejected": -465.3012288411458, "loss": 0.0118, "rewards/chosen": 7.1764787038167315, "rewards/margins": 16.1704896291097, "rewards/rejected": -8.994010925292969, "step": 978 }, { "epoch": 0.2683294504590928, "grad_norm": 10.8125, "kl": 11.131546974182129, "learning_rate": 5e-06, "logits/chosen": -13004112.0, "logits/rejected": -2973310.8, "logps/chosen": -424.23158482142856, "logps/rejected": -426.372119140625, "loss": 0.0584, "rewards/chosen": 5.962975093296596, "rewards/margins": 11.274077769688198, "rewards/rejected": -5.311102676391601, "step": 979 }, { "epoch": 0.2686035356996026, "grad_norm": 5.25, "kl": 3.9341049194335938, "learning_rate": 5e-06, "logits/chosen": -19889006.933333334, "logits/rejected": -13125095.111111112, "logps/chosen": -362.65358072916666, "logps/rejected": -715.7421875, "loss": 0.0841, "rewards/chosen": 4.897745259602865, "rewards/margins": 15.166280449761285, "rewards/rejected": -10.26853519015842, "step": 980 }, { "epoch": 0.26887762094011236, "grad_norm": 1.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -3219412.8571428573, "logits/rejected": -11993333.6, "logps/chosen": -423.2242954799107, "logps/rejected": -354.755419921875, "loss": 0.0063, "rewards/chosen": 5.759278433663504, "rewards/margins": 12.85308336530413, "rewards/rejected": -7.093804931640625, "step": 981 }, { "epoch": 0.2691517061806222, "grad_norm": 11.0625, "kl": 2.3579013347625732, "learning_rate": 5e-06, "logits/chosen": -26604315.42857143, "logits/rejected": 19238147.2, "logps/chosen": -442.20486886160717, "logps/rejected": -653.826904296875, "loss": 0.0341, "rewards/chosen": 6.585381099155971, "rewards/margins": 20.38740441458566, "rewards/rejected": -13.802023315429688, "step": 982 }, { "epoch": 0.269425791421132, "grad_norm": 6.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -30009024.0, "logits/rejected": 25519003.2, "logps/chosen": -419.47781808035717, "logps/rejected": -632.65966796875, "loss": 0.0277, "rewards/chosen": 5.76971435546875, "rewards/margins": 15.481197357177734, "rewards/rejected": -9.711483001708984, "step": 983 }, { "epoch": 0.26969987666164175, "grad_norm": 7.28125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -29348556.8, "logits/rejected": -10097856.0, "logps/chosen": -474.005908203125, "logps/rejected": -583.9505440848214, "loss": 0.0264, "rewards/chosen": 5.45623550415039, "rewards/margins": 14.396884264264788, "rewards/rejected": -8.940648760114398, "step": 984 }, { "epoch": 0.26997396190215156, "grad_norm": 10.8125, "kl": 4.431816101074219, "learning_rate": 5e-06, "logits/chosen": -12552480.0, "logits/rejected": -10850514.285714285, "logps/chosen": -354.415380859375, "logps/rejected": -665.3643275669643, "loss": 0.0628, "rewards/chosen": 4.3610382080078125, "rewards/margins": 13.503692626953125, "rewards/rejected": -9.142654418945312, "step": 985 }, { "epoch": 0.2702480471426614, "grad_norm": 10.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -19739834.0, "logits/rejected": -22817160.0, "logps/chosen": -425.3714599609375, "logps/rejected": -595.1431884765625, "loss": 0.0471, "rewards/chosen": 4.480156421661377, "rewards/margins": 16.289021968841553, "rewards/rejected": -11.808865547180176, "step": 986 }, { "epoch": 0.2705221323831712, "grad_norm": 8.3125, "kl": 1.653464674949646, "learning_rate": 5e-06, "logits/chosen": -33412304.0, "logits/rejected": 6628528.0, "logps/chosen": -579.78076171875, "logps/rejected": -354.072021484375, "loss": 0.0376, "rewards/chosen": 6.098360061645508, "rewards/margins": 12.937888463338215, "rewards/rejected": -6.839528401692708, "step": 987 }, { "epoch": 0.27079621762368095, "grad_norm": 12.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -14282215.384615384, "logits/rejected": -11826376.727272727, "logps/chosen": -380.7237079326923, "logps/rejected": -523.0771484375, "loss": 0.0748, "rewards/chosen": 4.104017404409555, "rewards/margins": 14.942578028965663, "rewards/rejected": -10.838560624556107, "step": 988 }, { "epoch": 0.27107030286419076, "grad_norm": 7.09375, "kl": 5.267779350280762, "learning_rate": 5e-06, "logits/chosen": -21226935.466666665, "logits/rejected": -20124282.666666668, "logps/chosen": -548.986328125, "logps/rejected": -367.38921440972223, "loss": 0.0536, "rewards/chosen": 7.164665730794271, "rewards/margins": 14.685030280219184, "rewards/rejected": -7.520364549424913, "step": 989 }, { "epoch": 0.2713443881047006, "grad_norm": 7.40625, "kl": 0.8214542269706726, "learning_rate": 5e-06, "logits/chosen": -22585513.846153848, "logits/rejected": -28622507.636363637, "logps/chosen": -510.9357346754808, "logps/rejected": -514.99609375, "loss": 0.036, "rewards/chosen": 6.538578913762019, "rewards/margins": 18.513403298971536, "rewards/rejected": -11.974824385209518, "step": 990 }, { "epoch": 0.2716184733452104, "grad_norm": 6.59375, "kl": 1.976820945739746, "learning_rate": 5e-06, "logits/chosen": -23198592.0, "logits/rejected": -19692679.384615384, "logps/chosen": -360.877197265625, "logps/rejected": -383.13724459134613, "loss": 0.0562, "rewards/chosen": 5.607992345636541, "rewards/margins": 11.706487829034979, "rewards/rejected": -6.0984954833984375, "step": 991 }, { "epoch": 0.27189255858572015, "grad_norm": 11.3125, "kl": 5.621987819671631, "learning_rate": 5e-06, "logits/chosen": -22548907.42857143, "logits/rejected": 11118212.0, "logps/chosen": -425.5569545200893, "logps/rejected": -503.700439453125, "loss": 0.0512, "rewards/chosen": 5.738159724644253, "rewards/margins": 14.939941951206752, "rewards/rejected": -9.2017822265625, "step": 992 }, { "epoch": 0.27216664382622996, "grad_norm": 7.03125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -8767764.0, "logits/rejected": -4957152.666666667, "logps/chosen": -414.1932779947917, "logps/rejected": -672.245849609375, "loss": 0.0243, "rewards/chosen": 4.566397984822591, "rewards/margins": 13.29069709777832, "rewards/rejected": -8.724299112955729, "step": 993 }, { "epoch": 0.2724407290667398, "grad_norm": 4.8125, "kl": 0.08488655090332031, "learning_rate": 5e-06, "logits/chosen": -17605513.333333332, "logits/rejected": 16053541.333333334, "logps/chosen": -470.0772298177083, "logps/rejected": -475.3201497395833, "loss": 0.0228, "rewards/chosen": 6.6449400583903, "rewards/margins": 14.797927220662434, "rewards/rejected": -8.152987162272135, "step": 994 }, { "epoch": 0.27271481430724953, "grad_norm": 4.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -7045323.333333333, "logits/rejected": -18671653.333333332, "logps/chosen": -386.0382486979167, "logps/rejected": -517.8528238932291, "loss": 0.0163, "rewards/chosen": 5.070969263712565, "rewards/margins": 14.558516820271809, "rewards/rejected": -9.487547556559244, "step": 995 }, { "epoch": 0.27298889954775934, "grad_norm": 11.0, "kl": 1.0233535766601562, "learning_rate": 5e-06, "logits/chosen": -5502548.923076923, "logits/rejected": -3396532.3636363638, "logps/chosen": -381.20030799278845, "logps/rejected": -386.7265625, "loss": 0.0878, "rewards/chosen": 5.228311978853666, "rewards/margins": 10.523257435618582, "rewards/rejected": -5.294945456764915, "step": 996 }, { "epoch": 0.27326298478826916, "grad_norm": 4.5, "kl": 4.741988658905029, "learning_rate": 5e-06, "logits/chosen": -20020589.333333332, "logits/rejected": -14770521.333333334, "logps/chosen": -451.7237548828125, "logps/rejected": -437.9883626302083, "loss": 0.0381, "rewards/chosen": 5.643041610717773, "rewards/margins": 12.191490809122723, "rewards/rejected": -6.548449198404948, "step": 997 }, { "epoch": 0.273537070028779, "grad_norm": 7.09375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -19967657.14285714, "logits/rejected": -21280025.6, "logps/chosen": -479.8775111607143, "logps/rejected": -523.888818359375, "loss": 0.0458, "rewards/chosen": 6.200954437255859, "rewards/margins": 16.394202423095702, "rewards/rejected": -10.193247985839843, "step": 998 }, { "epoch": 0.27381115526928873, "grad_norm": 2.796875, "kl": 1.0272510051727295, "learning_rate": 5e-06, "logits/chosen": -34588681.14285714, "logits/rejected": -33578627.2, "logps/chosen": -558.9356166294643, "logps/rejected": -542.04912109375, "loss": 0.0083, "rewards/chosen": 6.846635001046317, "rewards/margins": 16.55622591291155, "rewards/rejected": -9.709590911865234, "step": 999 }, { "epoch": 0.27408524050979854, "grad_norm": 5.40625, "kl": 3.860377788543701, "learning_rate": 5e-06, "logits/chosen": -15805382.4, "logits/rejected": -26299078.85714286, "logps/chosen": -512.5771484375, "logps/rejected": -485.08377511160717, "loss": 0.0107, "rewards/chosen": 6.135759735107422, "rewards/margins": 15.47008525303432, "rewards/rejected": -9.334325517926898, "step": 1000 }, { "epoch": 0.27435932575030836, "grad_norm": 7.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -7708006.4, "logits/rejected": 5997503.111111111, "logps/chosen": -400.18079427083336, "logps/rejected": -456.10053168402777, "loss": 0.0335, "rewards/chosen": 7.4956720987955725, "rewards/margins": 13.610594516330295, "rewards/rejected": -6.114922417534722, "step": 1001 }, { "epoch": 0.27463341099081817, "grad_norm": 8.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -11117831.272727273, "logits/rejected": 4834847.076923077, "logps/chosen": -299.36330344460225, "logps/rejected": -569.62890625, "loss": 0.0305, "rewards/chosen": 5.209623856977983, "rewards/margins": 13.365825146228286, "rewards/rejected": -8.156201289250301, "step": 1002 }, { "epoch": 0.27490749623132793, "grad_norm": 6.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -14826829.714285715, "logits/rejected": -32552480.0, "logps/chosen": -465.4224330357143, "logps/rejected": -558.6013671875, "loss": 0.0165, "rewards/chosen": 6.533670697893415, "rewards/margins": 16.270319257463726, "rewards/rejected": -9.736648559570312, "step": 1003 }, { "epoch": 0.27518158147183774, "grad_norm": 11.375, "kl": 0.4152107238769531, "learning_rate": 5e-06, "logits/chosen": -18087392.0, "logits/rejected": -20402593.454545453, "logps/chosen": -419.5105543870192, "logps/rejected": -584.8737127130681, "loss": 0.0736, "rewards/chosen": 5.411284813514123, "rewards/margins": 11.807409913389833, "rewards/rejected": -6.39612509987571, "step": 1004 }, { "epoch": 0.27545566671234756, "grad_norm": 7.09375, "kl": 3.9716577529907227, "learning_rate": 5e-06, "logits/chosen": -25971106.90909091, "logits/rejected": 27839881.846153848, "logps/chosen": -466.18026455965907, "logps/rejected": -565.5613356370193, "loss": 0.0334, "rewards/chosen": 5.311735326593572, "rewards/margins": 16.446032624144657, "rewards/rejected": -11.134297297551083, "step": 1005 }, { "epoch": 0.2757297519528573, "grad_norm": 10.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -30462441.846153848, "logits/rejected": -6716441.454545454, "logps/chosen": -393.65902944711536, "logps/rejected": -616.2245649857955, "loss": 0.0745, "rewards/chosen": 4.7719257061298075, "rewards/margins": 13.450955691037478, "rewards/rejected": -8.67902998490767, "step": 1006 }, { "epoch": 0.27600383719336713, "grad_norm": 9.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -6681082.461538462, "logits/rejected": -1856813.8181818181, "logps/chosen": -398.69076772836536, "logps/rejected": -484.8663884943182, "loss": 0.0444, "rewards/chosen": 5.324650691105769, "rewards/margins": 10.93513125973148, "rewards/rejected": -5.61048056862571, "step": 1007 }, { "epoch": 0.27627792243387694, "grad_norm": 5.625, "kl": 4.992013454437256, "learning_rate": 5e-06, "logits/chosen": -16124926.76923077, "logits/rejected": -637090.9090909091, "logps/chosen": -490.6535832331731, "logps/rejected": -751.0204190340909, "loss": 0.0159, "rewards/chosen": 6.189952556903545, "rewards/margins": 18.99440562641704, "rewards/rejected": -12.804453069513494, "step": 1008 }, { "epoch": 0.27655200767438676, "grad_norm": 4.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -25684523.636363637, "logits/rejected": 2932078.153846154, "logps/chosen": -370.8082830255682, "logps/rejected": -481.65767728365387, "loss": 0.0364, "rewards/chosen": 5.543337041681463, "rewards/margins": 11.617421983838915, "rewards/rejected": -6.074084942157452, "step": 1009 }, { "epoch": 0.2768260929148965, "grad_norm": 5.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -8211702.769230769, "logits/rejected": -3464994.909090909, "logps/chosen": -455.84525240384613, "logps/rejected": -652.8102805397727, "loss": 0.051, "rewards/chosen": 6.031146709735577, "rewards/margins": 16.07746268319083, "rewards/rejected": -10.046315973455256, "step": 1010 }, { "epoch": 0.27710017815540633, "grad_norm": 6.96875, "kl": 9.899712562561035, "learning_rate": 5e-06, "logits/chosen": -45657072.0, "logits/rejected": -14067370.666666666, "logps/chosen": -506.8863932291667, "logps/rejected": -589.3668619791666, "loss": 0.0248, "rewards/chosen": 7.091495513916016, "rewards/margins": 16.63838768005371, "rewards/rejected": -9.546892166137695, "step": 1011 }, { "epoch": 0.27737426339591614, "grad_norm": 10.3125, "kl": 11.389968872070312, "learning_rate": 5e-06, "logits/chosen": -8008195.764705882, "logits/rejected": -21932246.85714286, "logps/chosen": -490.2385684742647, "logps/rejected": -456.27218191964283, "loss": 0.0904, "rewards/chosen": 6.639415067784927, "rewards/margins": 15.04119805728688, "rewards/rejected": -8.401782989501953, "step": 1012 }, { "epoch": 0.2776483486364259, "grad_norm": 6.5625, "kl": 3.612496852874756, "learning_rate": 5e-06, "logits/chosen": -15081585.777777778, "logits/rejected": 2115933.6, "logps/chosen": -503.76768663194446, "logps/rejected": -399.16285807291666, "loss": 0.0432, "rewards/chosen": 6.2352489895290795, "rewards/margins": 13.983489142523872, "rewards/rejected": -7.748240152994792, "step": 1013 }, { "epoch": 0.2779224338769357, "grad_norm": 7.03125, "kl": 0.9009103775024414, "learning_rate": 5e-06, "logits/chosen": 2301244.3333333335, "logits/rejected": -9967433.333333334, "logps/chosen": -362.6772054036458, "logps/rejected": -366.228515625, "loss": 0.0696, "rewards/chosen": 4.864762941996257, "rewards/margins": 10.801440874735516, "rewards/rejected": -5.936677932739258, "step": 1014 }, { "epoch": 0.27819651911744553, "grad_norm": 4.71875, "kl": 1.807373046875, "learning_rate": 5e-06, "logits/chosen": -8368467.333333333, "logits/rejected": -25375160.0, "logps/chosen": -488.4856770833333, "logps/rejected": -398.3640950520833, "loss": 0.02, "rewards/chosen": 6.684074401855469, "rewards/margins": 14.289117177327473, "rewards/rejected": -7.605042775472005, "step": 1015 }, { "epoch": 0.27847060435795534, "grad_norm": 6.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -18586404.923076924, "logits/rejected": -7847058.181818182, "logps/chosen": -408.17330228365387, "logps/rejected": -474.54927201704544, "loss": 0.0341, "rewards/chosen": 5.805907029371995, "rewards/margins": 13.35485445035921, "rewards/rejected": -7.548947420987216, "step": 1016 }, { "epoch": 0.2787446895984651, "grad_norm": 9.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -1851314.5, "logits/rejected": -22344561.777777776, "logps/chosen": -571.4340006510416, "logps/rejected": -509.1574435763889, "loss": 0.0837, "rewards/chosen": 5.746174494425456, "rewards/margins": 12.603613747490776, "rewards/rejected": -6.857439253065321, "step": 1017 }, { "epoch": 0.2790187748389749, "grad_norm": 8.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -12357066.666666666, "logits/rejected": -32571981.333333332, "logps/chosen": -391.3145345052083, "logps/rejected": -525.4727783203125, "loss": 0.0277, "rewards/chosen": 4.827047665913899, "rewards/margins": 13.94763406117757, "rewards/rejected": -9.120586395263672, "step": 1018 }, { "epoch": 0.27929286007948473, "grad_norm": 6.15625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -33898921.84615385, "logits/rejected": -19487517.09090909, "logps/chosen": -418.3359375, "logps/rejected": -442.34561434659093, "loss": 0.0197, "rewards/chosen": 6.123816856971154, "rewards/margins": 14.477632109101837, "rewards/rejected": -8.353815252130682, "step": 1019 }, { "epoch": 0.27956694531999454, "grad_norm": 5.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -19898533.333333332, "logits/rejected": -8300217.333333333, "logps/chosen": -432.1309407552083, "logps/rejected": -509.2061767578125, "loss": 0.0372, "rewards/chosen": 6.292348225911458, "rewards/margins": 13.916521708170572, "rewards/rejected": -7.624173482259114, "step": 1020 }, { "epoch": 0.2798410305605043, "grad_norm": 8.5625, "kl": 3.6256484985351562, "learning_rate": 5e-06, "logits/chosen": -20317110.85714286, "logits/rejected": 12318951.2, "logps/chosen": -450.3526088169643, "logps/rejected": -439.718701171875, "loss": 0.0626, "rewards/chosen": 4.772733960832868, "rewards/margins": 11.297489820207868, "rewards/rejected": -6.524755859375, "step": 1021 }, { "epoch": 0.2801151158010141, "grad_norm": 10.875, "kl": 5.631248474121094, "learning_rate": 5e-06, "logits/chosen": -14596987.294117646, "logits/rejected": -16595910.857142856, "logps/chosen": -464.2237764246324, "logps/rejected": -456.35609654017856, "loss": 0.0601, "rewards/chosen": 6.117100883932674, "rewards/margins": 15.68095455650522, "rewards/rejected": -9.563853672572545, "step": 1022 }, { "epoch": 0.28038920104152393, "grad_norm": 13.25, "kl": 8.821293830871582, "learning_rate": 5e-06, "logits/chosen": -13515932.235294119, "logits/rejected": -16695419.42857143, "logps/chosen": -402.56603285845586, "logps/rejected": -407.7859584263393, "loss": 0.0639, "rewards/chosen": 5.663322897518382, "rewards/margins": 14.805657426850134, "rewards/rejected": -9.142334529331752, "step": 1023 }, { "epoch": 0.2806632862820337, "grad_norm": 8.0625, "kl": 5.594987392425537, "learning_rate": 5e-06, "logits/chosen": -34416795.428571425, "logits/rejected": -25783809.6, "logps/chosen": -460.832275390625, "logps/rejected": -460.39375, "loss": 0.0303, "rewards/chosen": 5.920329502650669, "rewards/margins": 12.507434300013951, "rewards/rejected": -6.587104797363281, "step": 1024 }, { "epoch": 0.2809373715225435, "grad_norm": 3.34375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -23935588.57142857, "logits/rejected": -6905974.117647059, "logps/chosen": -466.9895717075893, "logps/rejected": -621.2115119485294, "loss": 0.0094, "rewards/chosen": 7.17201178414481, "rewards/margins": 17.105433199585985, "rewards/rejected": -9.933421415441176, "step": 1025 }, { "epoch": 0.2812114567630533, "grad_norm": 9.9375, "kl": 4.986053466796875, "learning_rate": 5e-06, "logits/chosen": -15462363.636363637, "logits/rejected": -1763191.3846153845, "logps/chosen": -457.94380326704544, "logps/rejected": -499.35460486778845, "loss": 0.0594, "rewards/chosen": 6.302353598854759, "rewards/margins": 15.246554554759207, "rewards/rejected": -8.944200955904448, "step": 1026 }, { "epoch": 0.2814855420035631, "grad_norm": 12.875, "kl": 2.2345938682556152, "learning_rate": 5e-06, "logits/chosen": -27547015.529411763, "logits/rejected": -55258.0, "logps/chosen": -429.12327665441177, "logps/rejected": -591.3708844866071, "loss": 0.0748, "rewards/chosen": 6.848017973058364, "rewards/margins": 15.531502186751165, "rewards/rejected": -8.683484213692802, "step": 1027 }, { "epoch": 0.2817596272440729, "grad_norm": 4.28125, "kl": 0.0613301619887352, "learning_rate": 5e-06, "logits/chosen": -18968784.0, "logits/rejected": -21105027.2, "logps/chosen": -425.473876953125, "logps/rejected": -616.11572265625, "loss": 0.0176, "rewards/chosen": 5.183281489780971, "rewards/margins": 16.001516505650113, "rewards/rejected": -10.81823501586914, "step": 1028 }, { "epoch": 0.2820337124845827, "grad_norm": 7.78125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -32651392.0, "logits/rejected": -16293186.133333333, "logps/chosen": -649.8935004340278, "logps/rejected": -508.3138020833333, "loss": 0.0195, "rewards/chosen": 8.658390469021267, "rewards/margins": 17.575484381781685, "rewards/rejected": -8.917093912760416, "step": 1029 }, { "epoch": 0.2823077977250925, "grad_norm": 8.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -16276788.0, "logits/rejected": 26987510.0, "logps/chosen": -340.35546875, "logps/rejected": -681.496337890625, "loss": 0.0527, "rewards/chosen": 4.903312683105469, "rewards/margins": 15.753532409667969, "rewards/rejected": -10.8502197265625, "step": 1030 }, { "epoch": 0.2825818829656023, "grad_norm": 6.1875, "kl": 4.352703094482422, "learning_rate": 5e-06, "logits/chosen": -10524502.857142856, "logits/rejected": 5332264.8, "logps/chosen": -448.61526925223217, "logps/rejected": -635.91923828125, "loss": 0.0241, "rewards/chosen": 4.971036093575614, "rewards/margins": 13.388267844063893, "rewards/rejected": -8.41723175048828, "step": 1031 }, { "epoch": 0.2828559682061121, "grad_norm": 7.5625, "kl": 0.734167754650116, "learning_rate": 5e-06, "logits/chosen": -15503662.4, "logits/rejected": 34483782.85714286, "logps/chosen": -531.09775390625, "logps/rejected": -451.7713099888393, "loss": 0.0231, "rewards/chosen": 6.637080383300781, "rewards/margins": 14.053517259870258, "rewards/rejected": -7.416436876569476, "step": 1032 }, { "epoch": 0.2831300534466219, "grad_norm": 5.46875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -22897517.714285713, "logits/rejected": 12459880.8, "logps/chosen": -377.11732700892856, "logps/rejected": -440.646337890625, "loss": 0.0465, "rewards/chosen": 6.1419492449079245, "rewards/margins": 12.590690394810268, "rewards/rejected": -6.448741149902344, "step": 1033 }, { "epoch": 0.2834041386871317, "grad_norm": 10.6875, "kl": 4.83935546875, "learning_rate": 5e-06, "logits/chosen": 2727267.0, "logits/rejected": -21644440.0, "logps/chosen": -410.34466552734375, "logps/rejected": -508.0398254394531, "loss": 0.0434, "rewards/chosen": 4.421614170074463, "rewards/margins": 12.467484951019287, "rewards/rejected": -8.045870780944824, "step": 1034 }, { "epoch": 0.28367822392764147, "grad_norm": 8.0, "kl": 7.326183795928955, "learning_rate": 5e-06, "logits/chosen": -27136601.6, "logits/rejected": -6415240.0, "logps/chosen": -499.423828125, "logps/rejected": -416.54052734375, "loss": 0.0298, "rewards/chosen": 6.5901845296223955, "rewards/margins": 12.002118428548176, "rewards/rejected": -5.411933898925781, "step": 1035 }, { "epoch": 0.2839523091681513, "grad_norm": 3.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -20273792.0, "logits/rejected": 16015890.666666666, "logps/chosen": -344.8353678385417, "logps/rejected": -738.7303059895834, "loss": 0.0237, "rewards/chosen": 5.180613199869792, "rewards/margins": 15.617181142171223, "rewards/rejected": -10.436567942301432, "step": 1036 }, { "epoch": 0.2842263944086611, "grad_norm": 3.828125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -21548668.444444444, "logits/rejected": -14511101.866666667, "logps/chosen": -425.8210177951389, "logps/rejected": -581.9740234375, "loss": 0.0151, "rewards/chosen": 5.401647355821398, "rewards/margins": 14.373034074571398, "rewards/rejected": -8.97138671875, "step": 1037 }, { "epoch": 0.2845004796491709, "grad_norm": 9.125, "kl": 1.6272634267807007, "learning_rate": 5e-06, "logits/chosen": -4011469.090909091, "logits/rejected": -13776806.153846154, "logps/chosen": -305.3956853693182, "logps/rejected": -323.38852163461536, "loss": 0.0512, "rewards/chosen": 6.201800953258168, "rewards/margins": 11.167413511476317, "rewards/rejected": -4.965612558218149, "step": 1038 }, { "epoch": 0.28477456488968067, "grad_norm": 6.375, "kl": 7.809026718139648, "learning_rate": 5e-06, "logits/chosen": -12320982.4, "logits/rejected": -11223996.444444444, "logps/chosen": -357.33131510416666, "logps/rejected": -393.53266059027777, "loss": 0.0292, "rewards/chosen": 6.3090662638346355, "rewards/margins": 14.172460259331597, "rewards/rejected": -7.863393995496962, "step": 1039 }, { "epoch": 0.2850486501301905, "grad_norm": 9.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -21275539.692307692, "logits/rejected": -13634042.181818182, "logps/chosen": -347.28958834134613, "logps/rejected": -398.72953657670456, "loss": 0.0894, "rewards/chosen": 3.9158935546875, "rewards/margins": 11.239140597256746, "rewards/rejected": -7.323247042569247, "step": 1040 }, { "epoch": 0.2853227353707003, "grad_norm": 8.875, "kl": 5.999659538269043, "learning_rate": 5e-06, "logits/chosen": -23463956.363636363, "logits/rejected": -13993112.615384616, "logps/chosen": -531.0636541193181, "logps/rejected": -499.4268329326923, "loss": 0.0383, "rewards/chosen": 6.697977239435369, "rewards/margins": 14.290571679602136, "rewards/rejected": -7.592594440166767, "step": 1041 }, { "epoch": 0.2855968206112101, "grad_norm": 9.375, "kl": 2.4499335289001465, "learning_rate": 5e-06, "logits/chosen": -14163729.23076923, "logits/rejected": -15152349.090909092, "logps/chosen": -491.7963115985577, "logps/rejected": -424.6080433238636, "loss": 0.07, "rewards/chosen": 6.226189246544471, "rewards/margins": 12.470556672636445, "rewards/rejected": -6.244367426091975, "step": 1042 }, { "epoch": 0.28587090585171987, "grad_norm": 11.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -30352160.0, "logits/rejected": -11951236.0, "logps/chosen": -481.19476318359375, "logps/rejected": -431.94024658203125, "loss": 0.0471, "rewards/chosen": 6.449286460876465, "rewards/margins": 13.613616466522217, "rewards/rejected": -7.164330005645752, "step": 1043 }, { "epoch": 0.2861449910922297, "grad_norm": 10.5625, "kl": 8.123210906982422, "learning_rate": 5e-06, "logits/chosen": -26301561.14285714, "logits/rejected": -18455566.4, "logps/chosen": -448.93101283482144, "logps/rejected": -472.445849609375, "loss": 0.0659, "rewards/chosen": 4.612923758370536, "rewards/margins": 13.62155565534319, "rewards/rejected": -9.008631896972656, "step": 1044 }, { "epoch": 0.2864190763327395, "grad_norm": 10.5625, "kl": 1.443238615989685, "learning_rate": 5e-06, "logits/chosen": -14198080.0, "logits/rejected": -16394100.923076924, "logps/chosen": -384.86123934659093, "logps/rejected": -604.0172776442307, "loss": 0.0422, "rewards/chosen": 4.822819796475497, "rewards/margins": 12.231283494642565, "rewards/rejected": -7.4084636981670675, "step": 1045 }, { "epoch": 0.28669316157324926, "grad_norm": 5.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -11034453.0, "logits/rejected": -1201113.5, "logps/chosen": -517.672607421875, "logps/rejected": -479.0350341796875, "loss": 0.0134, "rewards/chosen": 6.930972099304199, "rewards/margins": 14.210758209228516, "rewards/rejected": -7.279786109924316, "step": 1046 }, { "epoch": 0.28696724681375907, "grad_norm": 11.625, "kl": 3.6272189617156982, "learning_rate": 5e-06, "logits/chosen": -27503059.692307692, "logits/rejected": -17905413.818181816, "logps/chosen": -505.3182842548077, "logps/rejected": -451.25656960227275, "loss": 0.0387, "rewards/chosen": 6.806201641376202, "rewards/margins": 13.643287712043815, "rewards/rejected": -6.837086070667613, "step": 1047 }, { "epoch": 0.2872413320542689, "grad_norm": 5.6875, "kl": 1.2566936016082764, "learning_rate": 5e-06, "logits/chosen": -15148939.636363637, "logits/rejected": -1871178.7692307692, "logps/chosen": -375.53151633522725, "logps/rejected": -432.6023137019231, "loss": 0.0296, "rewards/chosen": 5.013955549760298, "rewards/margins": 12.099335517083015, "rewards/rejected": -7.085379967322717, "step": 1048 }, { "epoch": 0.2875154172947787, "grad_norm": 3.46875, "kl": 1.2203433513641357, "learning_rate": 5e-06, "logits/chosen": -2114926.8, "logits/rejected": -25927108.57142857, "logps/chosen": -504.67314453125, "logps/rejected": -426.61387416294644, "loss": 0.0478, "rewards/chosen": 7.112785339355469, "rewards/margins": 13.9005001613072, "rewards/rejected": -6.78771482195173, "step": 1049 }, { "epoch": 0.28778950253528846, "grad_norm": 5.53125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -14335989.333333334, "logits/rejected": -17752946.666666668, "logps/chosen": -361.5531819661458, "logps/rejected": -410.8125813802083, "loss": 0.0319, "rewards/chosen": 5.563000996907552, "rewards/margins": 13.638468424479168, "rewards/rejected": -8.075467427571615, "step": 1050 }, { "epoch": 0.28806358777579827, "grad_norm": 4.5, "kl": 0.7651647329330444, "learning_rate": 5e-06, "logits/chosen": -24913780.0, "logits/rejected": -11836820.0, "logps/chosen": -385.7103576660156, "logps/rejected": -479.47235107421875, "loss": 0.0231, "rewards/chosen": 4.7784576416015625, "rewards/margins": 11.89432954788208, "rewards/rejected": -7.115871906280518, "step": 1051 }, { "epoch": 0.2883376730163081, "grad_norm": 8.6875, "kl": 4.783511161804199, "learning_rate": 5e-06, "logits/chosen": -8366705.230769231, "logits/rejected": -21262398.545454547, "logps/chosen": -404.19471153846155, "logps/rejected": -653.4149502840909, "loss": 0.0451, "rewards/chosen": 6.038523160494291, "rewards/margins": 15.793534979120002, "rewards/rejected": -9.75501181862571, "step": 1052 }, { "epoch": 0.2886117582568179, "grad_norm": 6.40625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -17017168.0, "logits/rejected": -17867234.666666668, "logps/chosen": -374.813720703125, "logps/rejected": -623.4566243489584, "loss": 0.0511, "rewards/chosen": 5.069204330444336, "rewards/margins": 16.847750981648765, "rewards/rejected": -11.778546651204428, "step": 1053 }, { "epoch": 0.28888584349732765, "grad_norm": 6.53125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -1209590.4, "logits/rejected": -692758.2857142857, "logps/chosen": -409.03916015625, "logps/rejected": -522.5687779017857, "loss": 0.0349, "rewards/chosen": 5.010184478759766, "rewards/margins": 13.48134536743164, "rewards/rejected": -8.471160888671875, "step": 1054 }, { "epoch": 0.28915992873783747, "grad_norm": 9.5, "kl": 1.2458966970443726, "learning_rate": 5e-06, "logits/chosen": -31012428.8, "logits/rejected": -4998890.857142857, "logps/chosen": -410.692236328125, "logps/rejected": -417.02915736607144, "loss": 0.0408, "rewards/chosen": 6.262042617797851, "rewards/margins": 13.982600239345004, "rewards/rejected": -7.720557621547154, "step": 1055 }, { "epoch": 0.2894340139783473, "grad_norm": 11.25, "kl": 8.454475402832031, "learning_rate": 5e-06, "logits/chosen": -17003448.0, "logits/rejected": -17515996.8, "logps/chosen": -387.80064174107144, "logps/rejected": -495.625830078125, "loss": 0.0956, "rewards/chosen": 5.443414960588727, "rewards/margins": 13.534266553606306, "rewards/rejected": -8.090851593017579, "step": 1056 }, { "epoch": 0.28970809921885704, "grad_norm": 3.109375, "kl": 2.396523952484131, "learning_rate": 5e-06, "logits/chosen": -16329322.666666666, "logits/rejected": 5603040.0, "logps/chosen": -502.1051432291667, "logps/rejected": -515.252197265625, "loss": 0.0099, "rewards/chosen": 5.879538218180339, "rewards/margins": 15.340593973795574, "rewards/rejected": -9.461055755615234, "step": 1057 }, { "epoch": 0.28998218445936685, "grad_norm": 5.09375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -2380156.6666666665, "logits/rejected": -22600373.333333332, "logps/chosen": -553.546630859375, "logps/rejected": -432.5323486328125, "loss": 0.0134, "rewards/chosen": 5.283844947814941, "rewards/margins": 14.104009310404459, "rewards/rejected": -8.820164362589518, "step": 1058 }, { "epoch": 0.29025626969987667, "grad_norm": 1.6953125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -14871890.285714285, "logits/rejected": -19869477.647058822, "logps/chosen": -424.50537109375, "logps/rejected": -638.6605009191177, "loss": 0.0045, "rewards/chosen": 6.789881569998605, "rewards/margins": 16.971602079247226, "rewards/rejected": -10.18172050924862, "step": 1059 }, { "epoch": 0.2905303549403865, "grad_norm": 7.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -32406213.818181816, "logits/rejected": -10939591.384615384, "logps/chosen": -451.45432350852275, "logps/rejected": -473.54574819711536, "loss": 0.0354, "rewards/chosen": 5.203213778409091, "rewards/margins": 13.881738089181326, "rewards/rejected": -8.678524310772236, "step": 1060 }, { "epoch": 0.29080444018089624, "grad_norm": 4.21875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -5505919.0, "logits/rejected": -30723940.0, "logps/chosen": -471.20684814453125, "logps/rejected": -508.80206298828125, "loss": 0.0211, "rewards/chosen": 6.311337471008301, "rewards/margins": 13.786884307861328, "rewards/rejected": -7.475546836853027, "step": 1061 }, { "epoch": 0.29107852542140605, "grad_norm": 4.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -32339125.333333332, "logits/rejected": -30703978.666666668, "logps/chosen": -458.0292561848958, "logps/rejected": -563.4398193359375, "loss": 0.0131, "rewards/chosen": 7.351245880126953, "rewards/margins": 15.702267328898111, "rewards/rejected": -8.351021448771158, "step": 1062 }, { "epoch": 0.29135261066191587, "grad_norm": 3.515625, "kl": 2.6981561183929443, "learning_rate": 5e-06, "logits/chosen": -18537581.333333332, "logits/rejected": -12362966.666666666, "logps/chosen": -452.6629638671875, "logps/rejected": -396.4302571614583, "loss": 0.0164, "rewards/chosen": 5.954792022705078, "rewards/margins": 11.829280853271484, "rewards/rejected": -5.874488830566406, "step": 1063 }, { "epoch": 0.2916266959024257, "grad_norm": 2.828125, "kl": 3.318840742111206, "learning_rate": 5e-06, "logits/chosen": -9820822.857142856, "logits/rejected": -20950278.4, "logps/chosen": -472.4717494419643, "logps/rejected": -518.106640625, "loss": 0.0125, "rewards/chosen": 6.551551273890904, "rewards/margins": 15.169544437953405, "rewards/rejected": -8.6179931640625, "step": 1064 }, { "epoch": 0.29190078114293544, "grad_norm": 13.3125, "kl": 1.8733642101287842, "learning_rate": 5e-06, "logits/chosen": -24955288.470588237, "logits/rejected": -9010952.57142857, "logps/chosen": -396.58645450367646, "logps/rejected": -541.9744001116071, "loss": 0.0932, "rewards/chosen": 5.1887525670668655, "rewards/margins": 13.528749994870996, "rewards/rejected": -8.33999742780413, "step": 1065 }, { "epoch": 0.29217486638344525, "grad_norm": 9.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -15002837.818181818, "logits/rejected": 627769.2307692308, "logps/chosen": -409.5984552556818, "logps/rejected": -557.6712364783654, "loss": 0.0621, "rewards/chosen": 5.061722495339134, "rewards/margins": 15.744879609221346, "rewards/rejected": -10.683157113882212, "step": 1066 }, { "epoch": 0.29244895162395507, "grad_norm": 7.5, "kl": 1.7091584205627441, "learning_rate": 5e-06, "logits/chosen": -24869195.42857143, "logits/rejected": -26840172.8, "logps/chosen": -358.55252511160717, "logps/rejected": -390.28671875, "loss": 0.0487, "rewards/chosen": 5.7441302708217075, "rewards/margins": 11.884954016549248, "rewards/rejected": -6.140823745727539, "step": 1067 }, { "epoch": 0.2927230368644648, "grad_norm": 2.421875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -9634794.666666666, "logits/rejected": 1009096.8888888889, "logps/chosen": -405.4883626302083, "logps/rejected": -508.52533637152777, "loss": 0.0272, "rewards/chosen": 7.490355809529622, "rewards/margins": 14.689340591430664, "rewards/rejected": -7.198984781901042, "step": 1068 }, { "epoch": 0.29299712210497464, "grad_norm": 8.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -17215108.0, "logits/rejected": -15416437.333333334, "logps/chosen": -451.6673177083333, "logps/rejected": -400.9019368489583, "loss": 0.0459, "rewards/chosen": 5.178770701090495, "rewards/margins": 13.202402114868164, "rewards/rejected": -8.02363141377767, "step": 1069 }, { "epoch": 0.29327120734548445, "grad_norm": 14.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 5949926.181818182, "logits/rejected": -24896526.769230768, "logps/chosen": -435.1437322443182, "logps/rejected": -560.3046875, "loss": 0.0384, "rewards/chosen": 6.8018341064453125, "rewards/margins": 16.085703923152042, "rewards/rejected": -9.28386981670673, "step": 1070 }, { "epoch": 0.29354529258599427, "grad_norm": 7.21875, "kl": 0.16980235278606415, "learning_rate": 5e-06, "logits/chosen": -20083329.454545453, "logits/rejected": -12497774.76923077, "logps/chosen": -502.06338778409093, "logps/rejected": -635.7085336538462, "loss": 0.0234, "rewards/chosen": 5.924740184437145, "rewards/margins": 19.461408228307334, "rewards/rejected": -13.536668043870192, "step": 1071 }, { "epoch": 0.293819377826504, "grad_norm": 13.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -11031816.888888888, "logits/rejected": -32099972.266666666, "logps/chosen": -488.98828125, "logps/rejected": -458.9402669270833, "loss": 0.0382, "rewards/chosen": 6.118995666503906, "rewards/margins": 13.058966573079427, "rewards/rejected": -6.939970906575521, "step": 1072 }, { "epoch": 0.29409346306701384, "grad_norm": 8.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -6884333.818181818, "logits/rejected": -35201634.461538464, "logps/chosen": -393.03329190340907, "logps/rejected": -581.3615534855769, "loss": 0.0144, "rewards/chosen": 6.209661310369318, "rewards/margins": 14.164065701144558, "rewards/rejected": -7.95440439077524, "step": 1073 }, { "epoch": 0.29436754830752365, "grad_norm": 6.21875, "kl": 6.744792938232422, "learning_rate": 5e-06, "logits/chosen": -15124625.6, "logits/rejected": -3091367.1428571427, "logps/chosen": -509.141162109375, "logps/rejected": -534.9545549665179, "loss": 0.0604, "rewards/chosen": 6.743038940429687, "rewards/margins": 15.220724051339285, "rewards/rejected": -8.477685110909599, "step": 1074 }, { "epoch": 0.29464163354803347, "grad_norm": 13.6875, "kl": 5.341987609863281, "learning_rate": 5e-06, "logits/chosen": -13127037.538461538, "logits/rejected": -19059591.272727273, "logps/chosen": -563.2160832331731, "logps/rejected": -408.50297407670456, "loss": 0.0749, "rewards/chosen": 6.3085773174579325, "rewards/margins": 12.374953743461129, "rewards/rejected": -6.066376426003196, "step": 1075 }, { "epoch": 0.2949157187885432, "grad_norm": 7.21875, "kl": 8.58033275604248, "learning_rate": 5e-06, "logits/chosen": -2816062.153846154, "logits/rejected": -19647082.181818184, "logps/chosen": -530.2874849759615, "logps/rejected": -585.5297407670455, "loss": 0.0456, "rewards/chosen": 6.20426999605619, "rewards/margins": 15.714654989175862, "rewards/rejected": -9.510384993119674, "step": 1076 }, { "epoch": 0.29518980402905304, "grad_norm": 6.375, "kl": 4.17081356048584, "learning_rate": 5e-06, "logits/chosen": -25350200.0, "logits/rejected": -20847030.666666668, "logps/chosen": -534.4053548177084, "logps/rejected": -494.622314453125, "loss": 0.047, "rewards/chosen": 5.750114440917969, "rewards/margins": 15.110954284667969, "rewards/rejected": -9.36083984375, "step": 1077 }, { "epoch": 0.29546388926956285, "grad_norm": 10.5625, "kl": 8.079852104187012, "learning_rate": 5e-06, "logits/chosen": -14046033.23076923, "logits/rejected": -25527479.272727273, "logps/chosen": -434.4167668269231, "logps/rejected": -393.33327414772725, "loss": 0.0623, "rewards/chosen": 4.9113910381610575, "rewards/margins": 12.985945481520432, "rewards/rejected": -8.074554443359375, "step": 1078 }, { "epoch": 0.2957379745100726, "grad_norm": 11.0625, "kl": 10.608439445495605, "learning_rate": 5e-06, "logits/chosen": -25649683.2, "logits/rejected": 37360160.0, "logps/chosen": -482.2635091145833, "logps/rejected": -630.8063151041666, "loss": 0.0371, "rewards/chosen": 7.016084798177084, "rewards/margins": 20.572950914171006, "rewards/rejected": -13.556866115993923, "step": 1079 }, { "epoch": 0.2960120597505824, "grad_norm": 5.90625, "kl": 0.19478607177734375, "learning_rate": 5e-06, "logits/chosen": -16388784.0, "logits/rejected": -5076516.0, "logps/chosen": -422.02469308035717, "logps/rejected": -591.952880859375, "loss": 0.022, "rewards/chosen": 5.273594992501395, "rewards/margins": 14.180884116036552, "rewards/rejected": -8.907289123535156, "step": 1080 }, { "epoch": 0.29628614499109224, "grad_norm": 12.4375, "kl": 1.0446605682373047, "learning_rate": 5e-06, "logits/chosen": 6267887.2727272725, "logits/rejected": -14359136.0, "logps/chosen": -479.1066228693182, "logps/rejected": -519.7198768028846, "loss": 0.0382, "rewards/chosen": 4.598660208962181, "rewards/margins": 12.640118338844992, "rewards/rejected": -8.041458129882812, "step": 1081 }, { "epoch": 0.29656023023160205, "grad_norm": 11.875, "kl": 5.391844749450684, "learning_rate": 5e-06, "logits/chosen": -14120670.857142856, "logits/rejected": -18279454.4, "logps/chosen": -445.18603515625, "logps/rejected": -414.27236328125, "loss": 0.0771, "rewards/chosen": 4.615233830043247, "rewards/margins": 10.571694782802037, "rewards/rejected": -5.956460952758789, "step": 1082 }, { "epoch": 0.2968343154721118, "grad_norm": 5.15625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -27751571.692307692, "logits/rejected": -7902020.363636363, "logps/chosen": -440.64197716346155, "logps/rejected": -591.2212801846591, "loss": 0.022, "rewards/chosen": 5.421122037447416, "rewards/margins": 13.685796137456293, "rewards/rejected": -8.264674100008877, "step": 1083 }, { "epoch": 0.2971084007126216, "grad_norm": 5.15625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -38197222.4, "logits/rejected": -22924128.0, "logps/chosen": -440.285986328125, "logps/rejected": -445.1131068638393, "loss": 0.0194, "rewards/chosen": 7.108845520019531, "rewards/margins": 14.604392133440289, "rewards/rejected": -7.495546613420759, "step": 1084 }, { "epoch": 0.29738248595313144, "grad_norm": 14.625, "kl": 1.947662353515625, "learning_rate": 5e-06, "logits/chosen": -27492865.777777776, "logits/rejected": 852710.4, "logps/chosen": -419.2498372395833, "logps/rejected": -514.1194986979167, "loss": 0.1182, "rewards/chosen": 4.868280198838976, "rewards/margins": 11.866567145453558, "rewards/rejected": -6.998286946614583, "step": 1085 }, { "epoch": 0.2976565711936412, "grad_norm": 4.90625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -18991745.454545453, "logits/rejected": -15528653.538461538, "logps/chosen": -537.1700994318181, "logps/rejected": -519.7867337740385, "loss": 0.0121, "rewards/chosen": 6.882707075639204, "rewards/margins": 16.300579577892808, "rewards/rejected": -9.417872502253605, "step": 1086 }, { "epoch": 0.297930656434151, "grad_norm": 6.0, "kl": 2.4870707988739014, "learning_rate": 5e-06, "logits/chosen": -28695914.666666668, "logits/rejected": -11593565.333333334, "logps/chosen": -388.7858072916667, "logps/rejected": -691.775146484375, "loss": 0.0218, "rewards/chosen": 5.29501469930013, "rewards/margins": 16.264420827229817, "rewards/rejected": -10.969406127929688, "step": 1087 }, { "epoch": 0.2982047416746608, "grad_norm": 3.46875, "kl": 5.706443786621094, "learning_rate": 5e-06, "logits/chosen": -15538813.866666667, "logits/rejected": -19515484.444444444, "logps/chosen": -428.5836588541667, "logps/rejected": -437.85438368055554, "loss": 0.1, "rewards/chosen": 5.017101033528646, "rewards/margins": 14.73963097466363, "rewards/rejected": -9.722529941134983, "step": 1088 }, { "epoch": 0.29847882691517064, "grad_norm": 8.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -9325378.285714285, "logits/rejected": -31410041.6, "logps/chosen": -370.256103515625, "logps/rejected": -580.913427734375, "loss": 0.0355, "rewards/chosen": 5.489479064941406, "rewards/margins": 13.690165710449218, "rewards/rejected": -8.200686645507812, "step": 1089 }, { "epoch": 0.2987529121556804, "grad_norm": 2.9375, "kl": 6.512880802154541, "learning_rate": 5e-06, "logits/chosen": -22667570.666666668, "logits/rejected": -17383560.0, "logps/chosen": -443.2462972005208, "logps/rejected": -479.2784830729167, "loss": 0.0112, "rewards/chosen": 7.684643427530925, "rewards/margins": 14.769641240437826, "rewards/rejected": -7.084997812906901, "step": 1090 }, { "epoch": 0.2990269973961902, "grad_norm": 20.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -19108027.636363637, "logits/rejected": -13640694.153846154, "logps/chosen": -490.7666015625, "logps/rejected": -496.0592698317308, "loss": 0.0617, "rewards/chosen": 5.911700855601918, "rewards/margins": 14.251769966178841, "rewards/rejected": -8.340069110576923, "step": 1091 }, { "epoch": 0.2993010826367, "grad_norm": 5.90625, "kl": 5.34882926940918, "learning_rate": 5e-06, "logits/chosen": -10878946.0, "logits/rejected": -28544346.0, "logps/chosen": -464.678955078125, "logps/rejected": -375.37664794921875, "loss": 0.0313, "rewards/chosen": 6.705815315246582, "rewards/margins": 14.166656494140625, "rewards/rejected": -7.460841178894043, "step": 1092 }, { "epoch": 0.29957516787720984, "grad_norm": 12.5, "kl": 18.68014144897461, "learning_rate": 5e-06, "logits/chosen": -9616103.111111112, "logits/rejected": 10151607.333333334, "logps/chosen": -468.7623697916667, "logps/rejected": -459.9700520833333, "loss": 0.0842, "rewards/chosen": 6.068935818142361, "rewards/margins": 11.580194897121853, "rewards/rejected": -5.511259078979492, "step": 1093 }, { "epoch": 0.2998492531177196, "grad_norm": 8.0625, "kl": 0.23200353980064392, "learning_rate": 5e-06, "logits/chosen": -8954899.076923076, "logits/rejected": -15458192.0, "logps/chosen": -438.33188100961536, "logps/rejected": -544.5822088068181, "loss": 0.0396, "rewards/chosen": 5.914050762469952, "rewards/margins": 14.054758992228475, "rewards/rejected": -8.140708229758523, "step": 1094 }, { "epoch": 0.3001233383582294, "grad_norm": 4.0, "kl": 1.9282376766204834, "learning_rate": 5e-06, "logits/chosen": -27572057.6, "logits/rejected": -19553472.0, "logps/chosen": -507.26526692708336, "logps/rejected": -418.2261013454861, "loss": 0.0314, "rewards/chosen": 6.777867126464844, "rewards/margins": 13.642784457736546, "rewards/rejected": -6.864917331271702, "step": 1095 }, { "epoch": 0.3003974235987392, "grad_norm": 3.671875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -34849160.53333333, "logits/rejected": -35274492.44444445, "logps/chosen": -473.1590169270833, "logps/rejected": -361.50086805555554, "loss": 0.0167, "rewards/chosen": 6.4980412801106775, "rewards/margins": 13.608519575330947, "rewards/rejected": -7.110478295220269, "step": 1096 }, { "epoch": 0.300671508839249, "grad_norm": 4.875, "kl": 4.424947738647461, "learning_rate": 5e-06, "logits/chosen": -21203251.692307692, "logits/rejected": -14793579.636363637, "logps/chosen": -425.73745492788464, "logps/rejected": -429.84224076704544, "loss": 0.0212, "rewards/chosen": 6.34053215613732, "rewards/margins": 13.352931309413243, "rewards/rejected": -7.012399153275923, "step": 1097 }, { "epoch": 0.3009455940797588, "grad_norm": 9.8125, "kl": 3.948117733001709, "learning_rate": 5e-06, "logits/chosen": -16994389.714285713, "logits/rejected": 20260006.4, "logps/chosen": -335.0615931919643, "logps/rejected": -589.384521484375, "loss": 0.0825, "rewards/chosen": 4.18879154750279, "rewards/margins": 12.486162458147321, "rewards/rejected": -8.297370910644531, "step": 1098 }, { "epoch": 0.3012196793202686, "grad_norm": 5.9375, "kl": 1.8686473369598389, "learning_rate": 5e-06, "logits/chosen": -35991478.15384615, "logits/rejected": -22958000.0, "logps/chosen": -542.4319411057693, "logps/rejected": -473.32954545454544, "loss": 0.0201, "rewards/chosen": 7.028864933894231, "rewards/margins": 13.908925303212413, "rewards/rejected": -6.880060369318182, "step": 1099 }, { "epoch": 0.3014937645607784, "grad_norm": 8.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -6031541.090909091, "logits/rejected": 2938423.6923076925, "logps/chosen": -457.18954190340907, "logps/rejected": -717.6520432692307, "loss": 0.0269, "rewards/chosen": 7.409467523748225, "rewards/margins": 19.386979589929112, "rewards/rejected": -11.97751206618089, "step": 1100 }, { "epoch": 0.3017678498012882, "grad_norm": 7.125, "kl": 0.8637079000473022, "learning_rate": 5e-06, "logits/chosen": -28092068.923076924, "logits/rejected": -3085385.090909091, "logps/chosen": -377.3088191105769, "logps/rejected": -295.3263494318182, "loss": 0.0616, "rewards/chosen": 5.493105961726262, "rewards/margins": 10.273266345470935, "rewards/rejected": -4.780160383744673, "step": 1101 }, { "epoch": 0.302041935041798, "grad_norm": 4.65625, "kl": 5.014363765716553, "learning_rate": 5e-06, "logits/chosen": -16456837.333333334, "logits/rejected": -19389546.666666668, "logps/chosen": -430.40797526041666, "logps/rejected": -496.99175347222223, "loss": 0.0444, "rewards/chosen": 5.986407470703125, "rewards/margins": 13.23175523546007, "rewards/rejected": -7.245347764756945, "step": 1102 }, { "epoch": 0.3023160202823078, "grad_norm": 10.1875, "kl": 1.3516795635223389, "learning_rate": 5e-06, "logits/chosen": -7081457.230769231, "logits/rejected": -11371383.272727273, "logps/chosen": -474.31531700721155, "logps/rejected": -430.0437677556818, "loss": 0.0701, "rewards/chosen": 5.885730449969952, "rewards/margins": 12.511324155580748, "rewards/rejected": -6.625593705610796, "step": 1103 }, { "epoch": 0.3025901055228176, "grad_norm": 10.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -3352704.0, "logits/rejected": 1318257.4285714286, "logps/chosen": -531.1283203125, "logps/rejected": -460.05057198660717, "loss": 0.0713, "rewards/chosen": 4.3938850402832035, "rewards/margins": 11.883385358537947, "rewards/rejected": -7.489500318254743, "step": 1104 }, { "epoch": 0.3028641907633274, "grad_norm": 6.84375, "kl": 5.0400238037109375, "learning_rate": 5e-06, "logits/chosen": -26644864.0, "logits/rejected": -13971832.0, "logps/chosen": -438.41376953125, "logps/rejected": -469.74720982142856, "loss": 0.0279, "rewards/chosen": 5.338924407958984, "rewards/margins": 14.50471660069057, "rewards/rejected": -9.165792192731585, "step": 1105 }, { "epoch": 0.3031382760038372, "grad_norm": 4.40625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -15594854.4, "logits/rejected": -24808946.285714287, "logps/chosen": -401.133251953125, "logps/rejected": -584.4102260044643, "loss": 0.0137, "rewards/chosen": 5.9389698028564455, "rewards/margins": 14.065623201642719, "rewards/rejected": -8.126653398786273, "step": 1106 }, { "epoch": 0.303412361244347, "grad_norm": 9.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -41553069.333333336, "logits/rejected": -16113961.333333334, "logps/chosen": -327.7891438802083, "logps/rejected": -740.14599609375, "loss": 0.0502, "rewards/chosen": 5.245719909667969, "rewards/margins": 16.48113250732422, "rewards/rejected": -11.23541259765625, "step": 1107 }, { "epoch": 0.30368644648485676, "grad_norm": 8.75, "kl": 5.278210639953613, "learning_rate": 5e-06, "logits/chosen": -27744164.57142857, "logits/rejected": -11980136.0, "logps/chosen": -328.1917201450893, "logps/rejected": -556.1958984375, "loss": 0.0802, "rewards/chosen": 5.312091827392578, "rewards/margins": 10.576219177246093, "rewards/rejected": -5.264127349853515, "step": 1108 }, { "epoch": 0.3039605317253666, "grad_norm": 4.84375, "kl": 4.635922908782959, "learning_rate": 5e-06, "logits/chosen": -16893806.933333334, "logits/rejected": -5052448.888888889, "logps/chosen": -452.74261067708335, "logps/rejected": -548.5677083333334, "loss": 0.0153, "rewards/chosen": 7.021608988444011, "rewards/margins": 14.54437255859375, "rewards/rejected": -7.522763570149739, "step": 1109 }, { "epoch": 0.3042346169658764, "grad_norm": 7.53125, "kl": 3.9188945293426514, "learning_rate": 5e-06, "logits/chosen": -37166240.0, "logits/rejected": -31446748.8, "logps/chosen": -446.92738560267856, "logps/rejected": -562.90361328125, "loss": 0.0821, "rewards/chosen": 5.226675306047712, "rewards/margins": 13.841503034319196, "rewards/rejected": -8.614827728271484, "step": 1110 }, { "epoch": 0.3045087022063862, "grad_norm": 14.8125, "kl": 10.944302558898926, "learning_rate": 5e-06, "logits/chosen": -6210255.5, "logits/rejected": -35833408.0, "logps/chosen": -472.58660888671875, "logps/rejected": -375.86749267578125, "loss": 0.053, "rewards/chosen": 6.941847801208496, "rewards/margins": 13.216641426086426, "rewards/rejected": -6.27479362487793, "step": 1111 }, { "epoch": 0.30478278744689596, "grad_norm": 0.96484375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 5444774.8, "logits/rejected": -7245877.142857143, "logps/chosen": -540.574365234375, "logps/rejected": -568.0019182477679, "loss": 0.0025, "rewards/chosen": 7.858541870117188, "rewards/margins": 18.300874546595985, "rewards/rejected": -10.442332676478795, "step": 1112 }, { "epoch": 0.3050568726874058, "grad_norm": 6.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -24649878.4, "logits/rejected": -20868141.714285713, "logps/chosen": -480.0603515625, "logps/rejected": -444.13260323660717, "loss": 0.019, "rewards/chosen": 6.729631042480468, "rewards/margins": 14.462582070486885, "rewards/rejected": -7.7329510280064175, "step": 1113 }, { "epoch": 0.3053309579279156, "grad_norm": 12.625, "kl": 9.827690124511719, "learning_rate": 5e-06, "logits/chosen": -29150296.615384616, "logits/rejected": -10360420.363636363, "logps/chosen": -473.0603215144231, "logps/rejected": -569.5564630681819, "loss": 0.0615, "rewards/chosen": 6.2190716083233175, "rewards/margins": 15.068694161368416, "rewards/rejected": -8.8496225530451, "step": 1114 }, { "epoch": 0.3056050431684254, "grad_norm": 4.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -4732742.769230769, "logits/rejected": 11744700.363636363, "logps/chosen": -424.76370943509613, "logps/rejected": -590.1255326704545, "loss": 0.0197, "rewards/chosen": 5.981447073129507, "rewards/margins": 14.154662072241724, "rewards/rejected": -8.173214999112217, "step": 1115 }, { "epoch": 0.30587912840893516, "grad_norm": 9.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -26557486.545454547, "logits/rejected": -37149907.692307696, "logps/chosen": -454.08225319602275, "logps/rejected": -527.5900691105769, "loss": 0.0375, "rewards/chosen": 5.985993818803267, "rewards/margins": 13.461940098475743, "rewards/rejected": -7.475946279672476, "step": 1116 }, { "epoch": 0.306153213649445, "grad_norm": 6.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -4081049.6666666665, "logits/rejected": 4081008.0, "logps/chosen": -458.5350748697917, "logps/rejected": -520.7919514973959, "loss": 0.0214, "rewards/chosen": 6.024700164794922, "rewards/margins": 15.293402353922525, "rewards/rejected": -9.268702189127604, "step": 1117 }, { "epoch": 0.3064272988899548, "grad_norm": 8.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -15540549.333333334, "logits/rejected": 5576996.666666667, "logps/chosen": -469.4650065104167, "logps/rejected": -810.830322265625, "loss": 0.0513, "rewards/chosen": 5.644620259602864, "rewards/margins": 16.225018819173176, "rewards/rejected": -10.580398559570312, "step": 1118 }, { "epoch": 0.30670138413046455, "grad_norm": 5.71875, "kl": 0.6377710103988647, "learning_rate": 5e-06, "logits/chosen": -12035082.4, "logits/rejected": -36782317.71428572, "logps/chosen": -386.558837890625, "logps/rejected": -618.1937779017857, "loss": 0.029, "rewards/chosen": 4.778298187255859, "rewards/margins": 13.055484444754462, "rewards/rejected": -8.277186257498604, "step": 1119 }, { "epoch": 0.30697546937097436, "grad_norm": 5.71875, "kl": 0.059848152101039886, "learning_rate": 5e-06, "logits/chosen": -4329021.142857143, "logits/rejected": 1868146.9411764706, "logps/chosen": -363.5145786830357, "logps/rejected": -422.4412626378676, "loss": 0.0225, "rewards/chosen": 6.3840130397251675, "rewards/margins": 13.100170840736197, "rewards/rejected": -6.716157801011029, "step": 1120 }, { "epoch": 0.3072495546114842, "grad_norm": 3.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -14929307.2, "logits/rejected": -39248109.71428572, "logps/chosen": -520.995947265625, "logps/rejected": -515.9207938058036, "loss": 0.0188, "rewards/chosen": 6.51365966796875, "rewards/margins": 15.609920283726282, "rewards/rejected": -9.096260615757533, "step": 1121 }, { "epoch": 0.307523639851994, "grad_norm": 10.6875, "kl": 4.210796356201172, "learning_rate": 5e-06, "logits/chosen": -6896560.0, "logits/rejected": -30107827.2, "logps/chosen": -446.34444754464283, "logps/rejected": -471.1564453125, "loss": 0.0522, "rewards/chosen": 6.8186830793108255, "rewards/margins": 12.978336552211216, "rewards/rejected": -6.1596534729003904, "step": 1122 }, { "epoch": 0.30779772509250375, "grad_norm": 3.78125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -14698108.57142857, "logits/rejected": -1829652.0, "logps/chosen": -430.99490792410717, "logps/rejected": -486.8235294117647, "loss": 0.0132, "rewards/chosen": 6.43873051234654, "rewards/margins": 15.55770418824268, "rewards/rejected": -9.11897367589614, "step": 1123 }, { "epoch": 0.30807181033301356, "grad_norm": 12.6875, "kl": 6.803936004638672, "learning_rate": 5e-06, "logits/chosen": -24908819.2, "logits/rejected": -6831124.444444444, "logps/chosen": -424.61640625, "logps/rejected": -677.0373806423611, "loss": 0.0579, "rewards/chosen": 5.829174296061198, "rewards/margins": 16.531361728244356, "rewards/rejected": -10.702187432183159, "step": 1124 }, { "epoch": 0.3083458955735234, "grad_norm": 7.03125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -28876547.2, "logits/rejected": -11089851.42857143, "logps/chosen": -374.8903564453125, "logps/rejected": -718.9300362723214, "loss": 0.0434, "rewards/chosen": 5.724866485595703, "rewards/margins": 19.45114778791155, "rewards/rejected": -13.726281302315849, "step": 1125 }, { "epoch": 0.3086199808140332, "grad_norm": 9.25, "kl": 6.458516597747803, "learning_rate": 5e-06, "logits/chosen": -27177826.666666668, "logits/rejected": -16957969.333333332, "logps/chosen": -433.68896484375, "logps/rejected": -445.2791341145833, "loss": 0.0604, "rewards/chosen": 5.69816525777181, "rewards/margins": 13.222309112548828, "rewards/rejected": -7.5241438547770185, "step": 1126 }, { "epoch": 0.30889406605454295, "grad_norm": 4.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -12285541.6, "logits/rejected": -25067698.285714287, "logps/chosen": -522.24833984375, "logps/rejected": -579.6522391183036, "loss": 0.0093, "rewards/chosen": 7.550592041015625, "rewards/margins": 18.986725289481026, "rewards/rejected": -11.436133248465401, "step": 1127 }, { "epoch": 0.30916815129505276, "grad_norm": 10.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -26853388.8, "logits/rejected": -22417142.85714286, "logps/chosen": -500.7642578125, "logps/rejected": -527.8025251116071, "loss": 0.0554, "rewards/chosen": 4.376620864868164, "rewards/margins": 14.514084025791712, "rewards/rejected": -10.137463160923549, "step": 1128 }, { "epoch": 0.3094422365355626, "grad_norm": 13.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -19761684.363636363, "logits/rejected": -20397795.692307692, "logps/chosen": -382.1985973011364, "logps/rejected": -504.7409480168269, "loss": 0.1015, "rewards/chosen": 4.027758858420632, "rewards/margins": 14.000074720049238, "rewards/rejected": -9.972315861628605, "step": 1129 }, { "epoch": 0.30971632177607233, "grad_norm": 4.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -17062910.545454547, "logits/rejected": -18836011.076923076, "logps/chosen": -515.5007990056819, "logps/rejected": -462.1457331730769, "loss": 0.0311, "rewards/chosen": 7.242712541060015, "rewards/margins": 16.808415713010135, "rewards/rejected": -9.56570317195012, "step": 1130 }, { "epoch": 0.30999040701658215, "grad_norm": 13.625, "kl": 7.98950719833374, "learning_rate": 5e-06, "logits/chosen": -26201685.333333332, "logits/rejected": -38847125.333333336, "logps/chosen": -478.8164876302083, "logps/rejected": -548.8184814453125, "loss": 0.0561, "rewards/chosen": 4.826607386271159, "rewards/margins": 15.14499346415202, "rewards/rejected": -10.31838607788086, "step": 1131 }, { "epoch": 0.31026449225709196, "grad_norm": 4.65625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -37136998.4, "logits/rejected": -32405074.285714287, "logps/chosen": -559.0873046875, "logps/rejected": -398.1171177455357, "loss": 0.0194, "rewards/chosen": 7.2521202087402346, "rewards/margins": 16.099533952985492, "rewards/rejected": -8.847413744245257, "step": 1132 }, { "epoch": 0.3105385774976018, "grad_norm": 15.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 36402940.44444445, "logits/rejected": -4598411.2, "logps/chosen": -526.6043294270834, "logps/rejected": -527.4513020833333, "loss": 0.0528, "rewards/chosen": 5.774631924099392, "rewards/margins": 12.982745530870226, "rewards/rejected": -7.208113606770834, "step": 1133 }, { "epoch": 0.31081266273811153, "grad_norm": 2.59375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -22183154.666666668, "logits/rejected": -27090005.333333332, "logps/chosen": -404.3594156901042, "logps/rejected": -669.7356770833334, "loss": 0.0085, "rewards/chosen": 6.2249806722005205, "rewards/margins": 18.799392700195312, "rewards/rejected": -12.574412027994791, "step": 1134 }, { "epoch": 0.31108674797862135, "grad_norm": 15.1875, "kl": 11.777234077453613, "learning_rate": 5e-06, "logits/chosen": -15224299.789473685, "logits/rejected": -4054100.8, "logps/chosen": -482.1708470394737, "logps/rejected": -533.596142578125, "loss": 0.0649, "rewards/chosen": 6.104137219880757, "rewards/margins": 13.780396069978412, "rewards/rejected": -7.676258850097656, "step": 1135 }, { "epoch": 0.31136083321913116, "grad_norm": 13.0625, "kl": 5.362753868103027, "learning_rate": 5e-06, "logits/chosen": -10899944.0, "logits/rejected": -4430133.142857143, "logps/chosen": -492.382421875, "logps/rejected": -431.46010044642856, "loss": 0.0561, "rewards/chosen": 6.355989837646485, "rewards/margins": 14.646747153145927, "rewards/rejected": -8.290757315499443, "step": 1136 }, { "epoch": 0.311634918459641, "grad_norm": 15.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 8372220.0, "logits/rejected": -15667513.333333334, "logps/chosen": -355.4342447916667, "logps/rejected": -419.4850667317708, "loss": 0.0927, "rewards/chosen": 4.725744247436523, "rewards/margins": 13.69633928934733, "rewards/rejected": -8.970595041910807, "step": 1137 }, { "epoch": 0.31190900370015073, "grad_norm": 3.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -24273705.14285714, "logits/rejected": -30262032.0, "logps/chosen": -356.79990931919644, "logps/rejected": -622.3302734375, "loss": 0.0333, "rewards/chosen": 5.3051959446498325, "rewards/margins": 15.648598044259206, "rewards/rejected": -10.343402099609374, "step": 1138 }, { "epoch": 0.31218308894066055, "grad_norm": 5.9375, "kl": 7.575168609619141, "learning_rate": 5e-06, "logits/chosen": -17733692.307692308, "logits/rejected": -11937655.272727273, "logps/chosen": -396.5891676682692, "logps/rejected": -685.6845259232955, "loss": 0.0216, "rewards/chosen": 6.830788832444411, "rewards/margins": 20.215283960729213, "rewards/rejected": -13.3844951282848, "step": 1139 }, { "epoch": 0.31245717418117036, "grad_norm": 8.125, "kl": 0.36707115173339844, "learning_rate": 5e-06, "logits/chosen": -22980775.111111112, "logits/rejected": -7483971.2, "logps/chosen": -414.5800509982639, "logps/rejected": -522.08681640625, "loss": 0.0339, "rewards/chosen": 5.329615698920356, "rewards/margins": 13.497943539089626, "rewards/rejected": -8.16832784016927, "step": 1140 }, { "epoch": 0.3127312594216801, "grad_norm": 10.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -5628656.0, "logits/rejected": -28622709.333333332, "logps/chosen": -390.4486490885417, "logps/rejected": -561.354248046875, "loss": 0.0398, "rewards/chosen": 5.95725949605306, "rewards/margins": 13.520312627156574, "rewards/rejected": -7.563053131103516, "step": 1141 }, { "epoch": 0.31300534466218993, "grad_norm": 6.15625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 1750119.3846153845, "logits/rejected": -13354810.181818182, "logps/chosen": -475.03087439903845, "logps/rejected": -551.2029030539773, "loss": 0.0448, "rewards/chosen": 6.735497107872596, "rewards/margins": 16.007895623053702, "rewards/rejected": -9.272398515181107, "step": 1142 }, { "epoch": 0.31327942990269975, "grad_norm": 7.53125, "kl": 13.705860137939453, "learning_rate": 5e-06, "logits/chosen": -42123177.14285714, "logits/rejected": -28303692.8, "logps/chosen": -439.58297293526783, "logps/rejected": -451.337451171875, "loss": 0.0211, "rewards/chosen": 6.2049745832170755, "rewards/margins": 16.425059727260045, "rewards/rejected": -10.220085144042969, "step": 1143 }, { "epoch": 0.31355351514320956, "grad_norm": 10.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -12708219.42857143, "logits/rejected": -6455980.8, "logps/chosen": -434.91796875, "logps/rejected": -442.08388671875, "loss": 0.0827, "rewards/chosen": 5.00634275163923, "rewards/margins": 15.360912758963448, "rewards/rejected": -10.354570007324218, "step": 1144 }, { "epoch": 0.3138276003837193, "grad_norm": 8.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -26256867.555555556, "logits/rejected": -11869030.4, "logps/chosen": -538.5508897569445, "logps/rejected": -492.8033203125, "loss": 0.0353, "rewards/chosen": 5.790754106309679, "rewards/margins": 15.462355465359158, "rewards/rejected": -9.671601359049479, "step": 1145 }, { "epoch": 0.31410168562422913, "grad_norm": 5.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -15936115.692307692, "logits/rejected": -13732283.636363637, "logps/chosen": -431.0832331730769, "logps/rejected": -420.4944513494318, "loss": 0.0364, "rewards/chosen": 5.07518298809345, "rewards/margins": 14.975565983698917, "rewards/rejected": -9.900382995605469, "step": 1146 }, { "epoch": 0.31437577086473895, "grad_norm": 9.125, "kl": 8.274698257446289, "learning_rate": 5e-06, "logits/chosen": -8364762.352941177, "logits/rejected": -4672750.285714285, "logps/chosen": -361.94450827205884, "logps/rejected": -514.7030552455357, "loss": 0.0875, "rewards/chosen": 5.5512856876148895, "rewards/margins": 12.671623037642792, "rewards/rejected": -7.120337350027902, "step": 1147 }, { "epoch": 0.31464985610524876, "grad_norm": 10.5, "kl": 0.43775051832199097, "learning_rate": 5e-06, "logits/chosen": -14476633.846153846, "logits/rejected": -6956013.090909091, "logps/chosen": -452.74158653846155, "logps/rejected": -522.5855823863636, "loss": 0.0334, "rewards/chosen": 4.967149587777945, "rewards/margins": 17.044529681439165, "rewards/rejected": -12.07738009366122, "step": 1148 }, { "epoch": 0.3149239413457585, "grad_norm": 9.5, "kl": 6.221829414367676, "learning_rate": 5e-06, "logits/chosen": -9924899.0, "logits/rejected": -26917100.0, "logps/chosen": -313.40533447265625, "logps/rejected": -396.9111022949219, "loss": 0.0729, "rewards/chosen": 5.033756732940674, "rewards/margins": 13.36721658706665, "rewards/rejected": -8.333459854125977, "step": 1149 }, { "epoch": 0.31519802658626833, "grad_norm": 11.6875, "kl": 9.071938514709473, "learning_rate": 5e-06, "logits/chosen": 65293013.333333336, "logits/rejected": -17332666.666666668, "logps/chosen": -479.1280924479167, "logps/rejected": -462.8512912326389, "loss": 0.0416, "rewards/chosen": 7.838744099934896, "rewards/margins": 15.974749077690973, "rewards/rejected": -8.136004977756077, "step": 1150 }, { "epoch": 0.31547211182677815, "grad_norm": 10.5625, "kl": 7.703329086303711, "learning_rate": 5e-06, "logits/chosen": -22556315.42857143, "logits/rejected": -11899918.4, "logps/chosen": -493.83583286830356, "logps/rejected": -467.24814453125, "loss": 0.0424, "rewards/chosen": 6.358802250453404, "rewards/margins": 12.117681721278599, "rewards/rejected": -5.758879470825195, "step": 1151 }, { "epoch": 0.3157461970672879, "grad_norm": 4.9375, "kl": 4.577509880065918, "learning_rate": 5e-06, "logits/chosen": -2467461.3333333335, "logits/rejected": 2349169.0, "logps/chosen": -372.2451985677083, "logps/rejected": -506.5409342447917, "loss": 0.0515, "rewards/chosen": 6.364952087402344, "rewards/margins": 13.255651473999023, "rewards/rejected": -6.89069938659668, "step": 1152 }, { "epoch": 0.3160202823077977, "grad_norm": 7.21875, "kl": 1.3168342113494873, "learning_rate": 5e-06, "logits/chosen": -6725494.0, "logits/rejected": -32180148.0, "logps/chosen": -502.929931640625, "logps/rejected": -595.3538818359375, "loss": 0.0424, "rewards/chosen": 6.250344753265381, "rewards/margins": 16.57170534133911, "rewards/rejected": -10.32136058807373, "step": 1153 }, { "epoch": 0.31629436754830753, "grad_norm": 6.90625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -17354492.8, "logits/rejected": 2194358.8571428573, "logps/chosen": -429.759326171875, "logps/rejected": -518.90673828125, "loss": 0.0538, "rewards/chosen": 5.85051383972168, "rewards/margins": 12.264706584385465, "rewards/rejected": -6.414192744663784, "step": 1154 }, { "epoch": 0.31656845278881734, "grad_norm": 7.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 9298368.0, "logits/rejected": -31593867.636363637, "logps/chosen": -440.1586162860577, "logps/rejected": -511.07297585227275, "loss": 0.0478, "rewards/chosen": 5.76588146503155, "rewards/margins": 14.465150846467985, "rewards/rejected": -8.699269381436435, "step": 1155 }, { "epoch": 0.3168425380293271, "grad_norm": 2.84375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -23880467.692307692, "logits/rejected": -3423785.4545454546, "logps/chosen": -447.4307391826923, "logps/rejected": -541.3955965909091, "loss": 0.0092, "rewards/chosen": 6.466619638296274, "rewards/margins": 15.361125919368718, "rewards/rejected": -8.894506281072443, "step": 1156 }, { "epoch": 0.3171166232698369, "grad_norm": 5.25, "kl": 2.1809730529785156, "learning_rate": 5e-06, "logits/chosen": -21716980.0, "logits/rejected": 8338357.333333333, "logps/chosen": -392.3838704427083, "logps/rejected": -682.95654296875, "loss": 0.0212, "rewards/chosen": 5.402378082275391, "rewards/margins": 16.67902628580729, "rewards/rejected": -11.2766482035319, "step": 1157 }, { "epoch": 0.31739070851034673, "grad_norm": 11.8125, "kl": 11.737926483154297, "learning_rate": 5e-06, "logits/chosen": -7441884.0, "logits/rejected": -2892612.6666666665, "logps/chosen": -422.9606119791667, "logps/rejected": -500.9922688802083, "loss": 0.1305, "rewards/chosen": 5.026755650838216, "rewards/margins": 12.21358553568522, "rewards/rejected": -7.186829884847005, "step": 1158 }, { "epoch": 0.3176647937508565, "grad_norm": 8.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -29603122.666666668, "logits/rejected": -19160085.333333332, "logps/chosen": -407.663818359375, "logps/rejected": -512.3623860677084, "loss": 0.0538, "rewards/chosen": 6.235420227050781, "rewards/margins": 14.011001586914062, "rewards/rejected": -7.775581359863281, "step": 1159 }, { "epoch": 0.3179388789913663, "grad_norm": 12.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 15095870.666666666, "logits/rejected": -22269488.0, "logps/chosen": -364.275634765625, "logps/rejected": -513.6405436197916, "loss": 0.0559, "rewards/chosen": 5.396472930908203, "rewards/margins": 12.659716288248699, "rewards/rejected": -7.263243357340495, "step": 1160 }, { "epoch": 0.3182129642318761, "grad_norm": 5.40625, "kl": 9.279672622680664, "learning_rate": 5e-06, "logits/chosen": -11623190.857142856, "logits/rejected": -8413583.2, "logps/chosen": -507.645751953125, "logps/rejected": -586.5779296875, "loss": 0.0132, "rewards/chosen": 7.289750235421317, "rewards/margins": 14.292203085763113, "rewards/rejected": -7.002452850341797, "step": 1161 }, { "epoch": 0.31848704947238593, "grad_norm": 13.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -12235054.857142856, "logits/rejected": -20937072.0, "logps/chosen": -433.22140066964283, "logps/rejected": -589.396826171875, "loss": 0.0528, "rewards/chosen": 5.0459153311593195, "rewards/margins": 13.420089830671039, "rewards/rejected": -8.374174499511719, "step": 1162 }, { "epoch": 0.3187611347128957, "grad_norm": 5.03125, "kl": 0.33131250739097595, "learning_rate": 5e-06, "logits/chosen": -9182752.0, "logits/rejected": -11226645.090909092, "logps/chosen": -305.6901292067308, "logps/rejected": -544.9593394886364, "loss": 0.0524, "rewards/chosen": 4.891591585599459, "rewards/margins": 12.744571072238308, "rewards/rejected": -7.85297948663885, "step": 1163 }, { "epoch": 0.3190352199534055, "grad_norm": 7.90625, "kl": 14.131340980529785, "learning_rate": 5e-06, "logits/chosen": -31129692.444444444, "logits/rejected": -23249680.0, "logps/chosen": -507.21185980902777, "logps/rejected": -538.7060139973959, "loss": 0.0778, "rewards/chosen": 7.067195468478733, "rewards/margins": 12.793799930148655, "rewards/rejected": -5.726604461669922, "step": 1164 }, { "epoch": 0.3193093051939153, "grad_norm": 8.25, "kl": 6.756797790527344, "learning_rate": 5e-06, "logits/chosen": 829686.5882352941, "logits/rejected": -7055081.142857143, "logps/chosen": -447.29210707720586, "logps/rejected": -720.0415736607143, "loss": 0.0619, "rewards/chosen": 6.437074549057904, "rewards/margins": 15.577076663490102, "rewards/rejected": -9.140002114432198, "step": 1165 }, { "epoch": 0.31958339043442513, "grad_norm": 5.5, "kl": 1.960250973701477, "learning_rate": 5e-06, "logits/chosen": -16460730.285714285, "logits/rejected": -3975685.2, "logps/chosen": -432.08067103794644, "logps/rejected": -594.32265625, "loss": 0.0198, "rewards/chosen": 6.866404942103794, "rewards/margins": 14.411066654750279, "rewards/rejected": -7.5446617126464846, "step": 1166 }, { "epoch": 0.3198574756749349, "grad_norm": 11.4375, "kl": 1.6100267171859741, "learning_rate": 5e-06, "logits/chosen": -8692463.272727273, "logits/rejected": 11477192.615384616, "logps/chosen": -458.4069158380682, "logps/rejected": -479.3219651442308, "loss": 0.062, "rewards/chosen": 5.5694580078125, "rewards/margins": 11.199720529409555, "rewards/rejected": -5.630262521597055, "step": 1167 }, { "epoch": 0.3201315609154447, "grad_norm": 5.46875, "kl": 1.4560165405273438, "learning_rate": 5e-06, "logits/chosen": 2303709.8, "logits/rejected": 8425676.57142857, "logps/chosen": -491.8908203125, "logps/rejected": -465.68624441964283, "loss": 0.0171, "rewards/chosen": 7.9257354736328125, "rewards/margins": 14.806507110595703, "rewards/rejected": -6.880771636962891, "step": 1168 }, { "epoch": 0.3204056461559545, "grad_norm": 5.5625, "kl": 6.092613220214844, "learning_rate": 5e-06, "logits/chosen": -8744929.714285715, "logits/rejected": -13399074.4, "logps/chosen": -329.21397181919644, "logps/rejected": -598.3533203125, "loss": 0.0875, "rewards/chosen": 5.373543875558036, "rewards/margins": 13.651844351632253, "rewards/rejected": -8.278300476074218, "step": 1169 }, { "epoch": 0.3206797313964643, "grad_norm": 5.46875, "kl": 3.053645133972168, "learning_rate": 5e-06, "logits/chosen": -21026009.6, "logits/rejected": -6414764.444444444, "logps/chosen": -456.8955403645833, "logps/rejected": -422.5910915798611, "loss": 0.0312, "rewards/chosen": 6.026104736328125, "rewards/margins": 14.366967434353299, "rewards/rejected": -8.340862698025173, "step": 1170 }, { "epoch": 0.3209538166369741, "grad_norm": 8.75, "kl": 1.2905133962631226, "learning_rate": 5e-06, "logits/chosen": -11321156.363636363, "logits/rejected": -1434900.6153846155, "logps/chosen": -471.4459339488636, "logps/rejected": -591.0265925480769, "loss": 0.0243, "rewards/chosen": 5.98569765957919, "rewards/margins": 13.28016059715431, "rewards/rejected": -7.29446293757512, "step": 1171 }, { "epoch": 0.3212279018774839, "grad_norm": 1.53125, "kl": 0.08454259485006332, "learning_rate": 5e-06, "logits/chosen": 15376942.4, "logits/rejected": -14523124.57142857, "logps/chosen": -505.79921875, "logps/rejected": -701.3679547991071, "loss": 0.0061, "rewards/chosen": 6.9858558654785154, "rewards/margins": 17.133606719970704, "rewards/rejected": -10.147750854492188, "step": 1172 }, { "epoch": 0.3215019871179937, "grad_norm": 6.75, "kl": 0.3732573390007019, "learning_rate": 5e-06, "logits/chosen": -5336387.733333333, "logits/rejected": -13616911.111111112, "logps/chosen": -357.4832356770833, "logps/rejected": -375.6852213541667, "loss": 0.0377, "rewards/chosen": 6.381930541992188, "rewards/margins": 13.58549296061198, "rewards/rejected": -7.203562418619792, "step": 1173 }, { "epoch": 0.3217760723585035, "grad_norm": 6.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -15811042.461538462, "logits/rejected": -1010605.4545454546, "logps/chosen": -417.6706355168269, "logps/rejected": -329.10074129971593, "loss": 0.0473, "rewards/chosen": 5.559964106633113, "rewards/margins": 11.75258897901415, "rewards/rejected": -6.192624872381037, "step": 1174 }, { "epoch": 0.3220501575990133, "grad_norm": 8.125, "kl": 0.08994357287883759, "learning_rate": 5e-06, "logits/chosen": 2618691.2, "logits/rejected": -22097147.42857143, "logps/chosen": -444.18076171875, "logps/rejected": -534.7527204241071, "loss": 0.03, "rewards/chosen": 5.637625122070313, "rewards/margins": 13.540726906912667, "rewards/rejected": -7.903101784842355, "step": 1175 }, { "epoch": 0.3223242428395231, "grad_norm": 9.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 5552379.692307692, "logits/rejected": -17981589.818181816, "logps/chosen": -456.2801983173077, "logps/rejected": -680.3558238636364, "loss": 0.0138, "rewards/chosen": 6.37734867976262, "rewards/margins": 16.22558753807228, "rewards/rejected": -9.848238858309658, "step": 1176 }, { "epoch": 0.3225983280800329, "grad_norm": 8.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -16245242.666666666, "logits/rejected": -25798634.666666668, "logps/chosen": -399.1853841145833, "logps/rejected": -558.6941731770834, "loss": 0.0754, "rewards/chosen": 4.876808802286784, "rewards/margins": 14.10021146138509, "rewards/rejected": -9.223402659098307, "step": 1177 }, { "epoch": 0.3228724133205427, "grad_norm": 5.84375, "kl": 0.3276354670524597, "learning_rate": 5e-06, "logits/chosen": -9778686.666666666, "logits/rejected": 1758652.6666666667, "logps/chosen": -435.9910888671875, "logps/rejected": -428.4689534505208, "loss": 0.0444, "rewards/chosen": 6.060552597045898, "rewards/margins": 12.655286153157551, "rewards/rejected": -6.594733556111653, "step": 1178 }, { "epoch": 0.3231464985610525, "grad_norm": 8.375, "kl": 13.017390251159668, "learning_rate": 5e-06, "logits/chosen": -8752453.866666667, "logits/rejected": -8682142.222222222, "logps/chosen": -460.2688802083333, "logps/rejected": -410.3181966145833, "loss": 0.0309, "rewards/chosen": 6.629913330078125, "rewards/margins": 14.555469936794704, "rewards/rejected": -7.9255566067165795, "step": 1179 }, { "epoch": 0.3234205838015623, "grad_norm": 4.5625, "kl": 8.250862121582031, "learning_rate": 5e-06, "logits/chosen": -37483648.0, "logits/rejected": -9787162.666666666, "logps/chosen": -402.93662109375, "logps/rejected": -531.7704535590278, "loss": 0.0498, "rewards/chosen": 6.700354512532552, "rewards/margins": 14.521513875325521, "rewards/rejected": -7.821159362792969, "step": 1180 }, { "epoch": 0.32369466904207206, "grad_norm": 13.4375, "kl": 10.156219482421875, "learning_rate": 5e-06, "logits/chosen": -2263015.3333333335, "logits/rejected": 5843202.666666667, "logps/chosen": -464.8185628255208, "logps/rejected": -539.9193522135416, "loss": 0.1001, "rewards/chosen": 6.347422917683919, "rewards/margins": 12.04693857828776, "rewards/rejected": -5.699515660603841, "step": 1181 }, { "epoch": 0.32396875428258187, "grad_norm": 7.34375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -3428658.888888889, "logits/rejected": -16983432.533333335, "logps/chosen": -374.43500434027777, "logps/rejected": -651.7907552083333, "loss": 0.0451, "rewards/chosen": 5.398522694905599, "rewards/margins": 13.393142445882162, "rewards/rejected": -7.994619750976563, "step": 1182 }, { "epoch": 0.3242428395230917, "grad_norm": 7.96875, "kl": 1.2424037456512451, "learning_rate": 5e-06, "logits/chosen": 2346592.8, "logits/rejected": 1676519.857142857, "logps/chosen": -347.45908203125, "logps/rejected": -598.1847098214286, "loss": 0.0437, "rewards/chosen": 5.641571807861328, "rewards/margins": 14.52960662841797, "rewards/rejected": -8.88803482055664, "step": 1183 }, { "epoch": 0.3245169247636015, "grad_norm": 9.125, "kl": 3.5108134746551514, "learning_rate": 5e-06, "logits/chosen": -23310902.153846152, "logits/rejected": -21246859.636363637, "logps/chosen": -506.07992788461536, "logps/rejected": -458.53835227272725, "loss": 0.0346, "rewards/chosen": 5.9981830303485575, "rewards/margins": 15.133690787362053, "rewards/rejected": -9.135507757013494, "step": 1184 }, { "epoch": 0.32479101000411126, "grad_norm": 12.0, "kl": 0.776556670665741, "learning_rate": 5e-06, "logits/chosen": -20993942.666666668, "logits/rejected": -12629253.333333334, "logps/chosen": -398.3246663411458, "logps/rejected": -405.6863606770833, "loss": 0.0685, "rewards/chosen": 6.175860087076823, "rewards/margins": 12.362746556599935, "rewards/rejected": -6.186886469523112, "step": 1185 }, { "epoch": 0.32506509524462107, "grad_norm": 2.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -20629932.8, "logits/rejected": -8316620.0, "logps/chosen": -531.394482421875, "logps/rejected": -593.3517020089286, "loss": 0.009, "rewards/chosen": 6.257732772827149, "rewards/margins": 14.014148548671177, "rewards/rejected": -7.756415775844029, "step": 1186 }, { "epoch": 0.3253391804851309, "grad_norm": 10.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -16314566.153846154, "logits/rejected": -10519380.363636363, "logps/chosen": -419.4812199519231, "logps/rejected": -789.44921875, "loss": 0.0356, "rewards/chosen": 6.7898418719951925, "rewards/margins": 18.01457193014505, "rewards/rejected": -11.224730058149857, "step": 1187 }, { "epoch": 0.3256132657256407, "grad_norm": 8.5, "kl": 4.6957292556762695, "learning_rate": 5e-06, "logits/chosen": -32607236.0, "logits/rejected": -21783618.0, "logps/chosen": -527.1728515625, "logps/rejected": -517.7452392578125, "loss": 0.0185, "rewards/chosen": 7.889807224273682, "rewards/margins": 15.424062252044678, "rewards/rejected": -7.534255027770996, "step": 1188 }, { "epoch": 0.32588735096615046, "grad_norm": 5.71875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -10582266.181818182, "logits/rejected": 55864644.92307692, "logps/chosen": -313.92001065340907, "logps/rejected": -639.5745192307693, "loss": 0.0204, "rewards/chosen": 6.040659817782315, "rewards/margins": 16.59304441438688, "rewards/rejected": -10.552384596604567, "step": 1189 }, { "epoch": 0.32616143620666027, "grad_norm": 15.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -8842232.727272727, "logits/rejected": 51223168.0, "logps/chosen": -419.49032315340907, "logps/rejected": -630.9265324519231, "loss": 0.0636, "rewards/chosen": 4.6804986433549365, "rewards/margins": 15.662627080103736, "rewards/rejected": -10.982128436748798, "step": 1190 }, { "epoch": 0.3264355214471701, "grad_norm": 7.71875, "kl": 1.9881821870803833, "learning_rate": 5e-06, "logits/chosen": -41177614.76923077, "logits/rejected": -14505559.272727273, "logps/chosen": -373.75792518028845, "logps/rejected": -545.7056107954545, "loss": 0.0549, "rewards/chosen": 4.701269589937651, "rewards/margins": 14.54410336901258, "rewards/rejected": -9.84283377907493, "step": 1191 }, { "epoch": 0.32670960668767984, "grad_norm": 25.875, "kl": 3.7162349224090576, "learning_rate": 5e-06, "logits/chosen": 8560597.333333334, "logits/rejected": -9689158.222222222, "logps/chosen": -477.4779296875, "logps/rejected": -512.9380967881945, "loss": 0.0757, "rewards/chosen": 5.362704467773438, "rewards/margins": 13.413135613335502, "rewards/rejected": -8.050431145562065, "step": 1192 }, { "epoch": 0.32698369192818966, "grad_norm": 11.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 350639.8076923077, "logits/rejected": 24119421.09090909, "logps/chosen": -462.45590444711536, "logps/rejected": -665.71484375, "loss": 0.0451, "rewards/chosen": 5.872995229867788, "rewards/margins": 17.070287210957986, "rewards/rejected": -11.1972919810902, "step": 1193 }, { "epoch": 0.32725777716869947, "grad_norm": 6.90625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -13231088.0, "logits/rejected": -21208189.53846154, "logps/chosen": -386.9197443181818, "logps/rejected": -531.6293194110577, "loss": 0.0255, "rewards/chosen": 6.386638294566762, "rewards/margins": 15.49598101635913, "rewards/rejected": -9.109342721792368, "step": 1194 }, { "epoch": 0.3275318624092093, "grad_norm": 9.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 6177538.857142857, "logits/rejected": -14275531.2, "logps/chosen": -400.182861328125, "logps/rejected": -489.24365234375, "loss": 0.0502, "rewards/chosen": 3.91058840070452, "rewards/margins": 11.756761496407645, "rewards/rejected": -7.846173095703125, "step": 1195 }, { "epoch": 0.32780594764971904, "grad_norm": 9.125, "kl": 1.8942363262176514, "learning_rate": 5e-06, "logits/chosen": -3066000.6153846155, "logits/rejected": 52575749.81818182, "logps/chosen": -438.84217247596155, "logps/rejected": -509.70467862215907, "loss": 0.0362, "rewards/chosen": 6.329108018141526, "rewards/margins": 13.518912095289963, "rewards/rejected": -7.1898040771484375, "step": 1196 }, { "epoch": 0.32808003289022886, "grad_norm": 4.71875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -6300993.230769231, "logits/rejected": -12380727.272727273, "logps/chosen": -358.85659555288464, "logps/rejected": -496.0945933948864, "loss": 0.0156, "rewards/chosen": 6.222024184006911, "rewards/margins": 13.228989127632621, "rewards/rejected": -7.00696494362571, "step": 1197 }, { "epoch": 0.32835411813073867, "grad_norm": 8.75, "kl": 2.3236682415008545, "learning_rate": 5e-06, "logits/chosen": -30072752.0, "logits/rejected": -27078258.666666668, "logps/chosen": -385.8374837239583, "logps/rejected": -400.2322184244792, "loss": 0.0432, "rewards/chosen": 5.3966725667317705, "rewards/margins": 13.283398310343424, "rewards/rejected": -7.886725743611653, "step": 1198 }, { "epoch": 0.3286282033712485, "grad_norm": 16.625, "kl": 9.980278015136719, "learning_rate": 5e-06, "logits/chosen": -20035949.17647059, "logits/rejected": -71834.28571428571, "logps/chosen": -390.58185891544116, "logps/rejected": -609.9865373883929, "loss": 0.228, "rewards/chosen": 4.0956932516659, "rewards/margins": 12.359976568141906, "rewards/rejected": -8.264283316476005, "step": 1199 }, { "epoch": 0.32890228861175824, "grad_norm": 26.25, "kl": 2.7658839225769043, "learning_rate": 5e-06, "logits/chosen": -2437580.3333333335, "logits/rejected": -38730010.666666664, "logps/chosen": -350.1627604166667, "logps/rejected": -462.8528238932292, "loss": 0.0689, "rewards/chosen": 5.1633256276448565, "rewards/margins": 12.883314768473307, "rewards/rejected": -7.71998914082845, "step": 1200 }, { "epoch": 0.32917637385226806, "grad_norm": 10.5625, "kl": 4.23895263671875, "learning_rate": 5e-06, "logits/chosen": -27449652.363636363, "logits/rejected": -16687508.923076924, "logps/chosen": -494.7429865056818, "logps/rejected": -474.8124248798077, "loss": 0.0327, "rewards/chosen": 6.80999755859375, "rewards/margins": 13.831446721003605, "rewards/rejected": -7.021449162409856, "step": 1201 }, { "epoch": 0.32945045909277787, "grad_norm": 7.34375, "kl": 2.2956137657165527, "learning_rate": 5e-06, "logits/chosen": -14003965.333333334, "logits/rejected": 6040162.666666667, "logps/chosen": -401.5078938802083, "logps/rejected": -543.28759765625, "loss": 0.0245, "rewards/chosen": 5.706839243570964, "rewards/margins": 13.346028010050457, "rewards/rejected": -7.639188766479492, "step": 1202 }, { "epoch": 0.32972454433328763, "grad_norm": 2.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -13439644.8, "logits/rejected": -7219532.571428572, "logps/chosen": -478.514501953125, "logps/rejected": -491.261474609375, "loss": 0.0059, "rewards/chosen": 7.041134643554687, "rewards/margins": 15.3087890625, "rewards/rejected": -8.267654418945312, "step": 1203 }, { "epoch": 0.32999862957379744, "grad_norm": 9.25, "kl": 8.120466232299805, "learning_rate": 5e-06, "logits/chosen": -17117996.307692308, "logits/rejected": -27815709.09090909, "logps/chosen": -490.23189603365387, "logps/rejected": -375.9583629261364, "loss": 0.0489, "rewards/chosen": 7.1848930945763225, "rewards/margins": 13.07268204055466, "rewards/rejected": -5.887788945978338, "step": 1204 }, { "epoch": 0.33027271481430726, "grad_norm": 9.4375, "kl": 5.146360874176025, "learning_rate": 5e-06, "logits/chosen": -26410899.2, "logits/rejected": -26770181.333333332, "logps/chosen": -411.0533203125, "logps/rejected": -532.7998589409722, "loss": 0.0291, "rewards/chosen": 6.613288879394531, "rewards/margins": 14.24958970811632, "rewards/rejected": -7.636300828721788, "step": 1205 }, { "epoch": 0.33054680005481707, "grad_norm": 8.875, "kl": 0.6698926091194153, "learning_rate": 5e-06, "logits/chosen": -24407264.0, "logits/rejected": -4042576.0, "logps/chosen": -412.0711263020833, "logps/rejected": -420.8854573567708, "loss": 0.0475, "rewards/chosen": 7.169291814168294, "rewards/margins": 13.774815241495768, "rewards/rejected": -6.605523427327474, "step": 1206 }, { "epoch": 0.3308208852953268, "grad_norm": 8.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -3838175.6363636362, "logits/rejected": -28434030.769230768, "logps/chosen": -455.58735795454544, "logps/rejected": -501.15268179086536, "loss": 0.0358, "rewards/chosen": 5.351837851784446, "rewards/margins": 13.471432719197306, "rewards/rejected": -8.11959486741286, "step": 1207 }, { "epoch": 0.33109497053583664, "grad_norm": 5.9375, "kl": 3.0373053550720215, "learning_rate": 5e-06, "logits/chosen": -16680700.307692308, "logits/rejected": 11844421.090909092, "logps/chosen": -474.7907527043269, "logps/rejected": -501.40145596590907, "loss": 0.0279, "rewards/chosen": 7.270541851337139, "rewards/margins": 17.022012243737706, "rewards/rejected": -9.751470392400568, "step": 1208 }, { "epoch": 0.33136905577634646, "grad_norm": 5.5, "kl": 2.2150230407714844, "learning_rate": 5e-06, "logits/chosen": -17223036.0, "logits/rejected": 15182428.0, "logps/chosen": -358.6787109375, "logps/rejected": -491.5517578125, "loss": 0.0475, "rewards/chosen": 5.241259574890137, "rewards/margins": 13.420369148254395, "rewards/rejected": -8.179109573364258, "step": 1209 }, { "epoch": 0.33164314101685627, "grad_norm": 5.25, "kl": 4.423098564147949, "learning_rate": 5e-06, "logits/chosen": -7126712.0, "logits/rejected": -15287925.333333334, "logps/chosen": -512.468359375, "logps/rejected": -401.09228515625, "loss": 0.0201, "rewards/chosen": 7.392661539713542, "rewards/margins": 15.382902696397569, "rewards/rejected": -7.990241156684028, "step": 1210 }, { "epoch": 0.331917226257366, "grad_norm": 8.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -21417255.384615384, "logits/rejected": -23973511.272727273, "logps/chosen": -362.3595628004808, "logps/rejected": -353.5812322443182, "loss": 0.0371, "rewards/chosen": 5.959391080416166, "rewards/margins": 11.834482633150541, "rewards/rejected": -5.875091552734375, "step": 1211 }, { "epoch": 0.33219131149787584, "grad_norm": 7.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -39593990.4, "logits/rejected": -17771506.285714287, "logps/chosen": -327.827978515625, "logps/rejected": -549.6358119419643, "loss": 0.0623, "rewards/chosen": 4.5672649383544925, "rewards/margins": 12.332192938668388, "rewards/rejected": -7.764928000313895, "step": 1212 }, { "epoch": 0.33246539673838565, "grad_norm": 6.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -6216574.857142857, "logits/rejected": -26888011.29411765, "logps/chosen": -418.67236328125, "logps/rejected": -561.7405215992648, "loss": 0.0354, "rewards/chosen": 6.582426343645368, "rewards/margins": 14.850221681995551, "rewards/rejected": -8.267795338350183, "step": 1213 }, { "epoch": 0.3327394819788954, "grad_norm": 7.28125, "kl": 2.5900371074676514, "learning_rate": 5e-06, "logits/chosen": -16680496.94117647, "logits/rejected": -18034596.57142857, "logps/chosen": -393.7159639246324, "logps/rejected": -426.09901646205356, "loss": 0.0413, "rewards/chosen": 5.865534165326287, "rewards/margins": 13.36280774669487, "rewards/rejected": -7.4972735813685825, "step": 1214 }, { "epoch": 0.3330135672194052, "grad_norm": 6.0, "kl": 1.3270480632781982, "learning_rate": 5e-06, "logits/chosen": 8571613.818181818, "logits/rejected": -27333137.230769232, "logps/chosen": -424.51962002840907, "logps/rejected": -621.3281625600962, "loss": 0.0222, "rewards/chosen": 6.209842335094105, "rewards/margins": 15.447550206751258, "rewards/rejected": -9.237707871657152, "step": 1215 }, { "epoch": 0.33328765245991504, "grad_norm": 7.34375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -17425413.333333332, "logits/rejected": -7751737.777777778, "logps/chosen": -342.398828125, "logps/rejected": -456.8992513020833, "loss": 0.0369, "rewards/chosen": 5.859694417317709, "rewards/margins": 15.67277340359158, "rewards/rejected": -9.813078986273872, "step": 1216 }, { "epoch": 0.33356173770042485, "grad_norm": 7.875, "kl": 1.8747692108154297, "learning_rate": 5e-06, "logits/chosen": -25668549.818181816, "logits/rejected": -2268948.923076923, "logps/chosen": -550.1069779829545, "logps/rejected": -496.68712439903845, "loss": 0.0367, "rewards/chosen": 6.660763827237216, "rewards/margins": 13.437271865097792, "rewards/rejected": -6.776508037860577, "step": 1217 }, { "epoch": 0.3338358229409346, "grad_norm": 2.15625, "kl": 0.18373362720012665, "learning_rate": 5e-06, "logits/chosen": 2253104.0, "logits/rejected": -10821824.0, "logps/chosen": -524.5254720052084, "logps/rejected": -283.0256754557292, "loss": 0.0274, "rewards/chosen": 7.29104741414388, "rewards/margins": 13.33014170328776, "rewards/rejected": -6.03909428914388, "step": 1218 }, { "epoch": 0.3341099081814444, "grad_norm": 2.109375, "kl": 3.139235258102417, "learning_rate": 5e-06, "logits/chosen": -33475532.8, "logits/rejected": 16758605.714285715, "logps/chosen": -484.395361328125, "logps/rejected": -448.20772879464283, "loss": 0.0108, "rewards/chosen": 6.757251739501953, "rewards/margins": 14.131597791399274, "rewards/rejected": -7.374346051897321, "step": 1219 }, { "epoch": 0.33438399342195424, "grad_norm": 5.0625, "kl": 6.057814598083496, "learning_rate": 5e-06, "logits/chosen": 1713587.0, "logits/rejected": -29002624.0, "logps/chosen": -564.0206705729166, "logps/rejected": -411.810546875, "loss": 0.017, "rewards/chosen": 7.454323450724284, "rewards/margins": 16.042112350463867, "rewards/rejected": -8.587788899739584, "step": 1220 }, { "epoch": 0.33465807866246405, "grad_norm": 9.875, "kl": 11.155864715576172, "learning_rate": 5e-06, "logits/chosen": -8895784.470588235, "logits/rejected": -18061864.0, "logps/chosen": -401.45392922794116, "logps/rejected": -323.546875, "loss": 0.0742, "rewards/chosen": 5.523937449735754, "rewards/margins": 13.568368863658744, "rewards/rejected": -8.044431413922991, "step": 1221 }, { "epoch": 0.3349321639029738, "grad_norm": 3.84375, "kl": 0.014163970947265625, "learning_rate": 5e-06, "logits/chosen": -2635105.0, "logits/rejected": -16375635.0, "logps/chosen": -478.462890625, "logps/rejected": -502.5587158203125, "loss": 0.0254, "rewards/chosen": 6.547120094299316, "rewards/margins": 12.959866523742676, "rewards/rejected": -6.412746429443359, "step": 1222 }, { "epoch": 0.3352062491434836, "grad_norm": 4.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -34490160.0, "logits/rejected": -29317440.0, "logps/chosen": -543.1142578125, "logps/rejected": -554.0764508928571, "loss": 0.0187, "rewards/chosen": 6.397125244140625, "rewards/margins": 15.962221418108259, "rewards/rejected": -9.565096173967634, "step": 1223 }, { "epoch": 0.33548033438399344, "grad_norm": 14.8125, "kl": 2.9220595359802246, "learning_rate": 5e-06, "logits/chosen": -30054137.6, "logits/rejected": -12470257.142857144, "logps/chosen": -481.53681640625, "logps/rejected": -468.2173549107143, "loss": 0.0551, "rewards/chosen": 7.220896911621094, "rewards/margins": 13.366917528424946, "rewards/rejected": -6.146020616803851, "step": 1224 }, { "epoch": 0.3357544196245032, "grad_norm": 8.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -24458290.285714287, "logits/rejected": -1968883.5294117648, "logps/chosen": -434.9697963169643, "logps/rejected": -462.8405330882353, "loss": 0.0552, "rewards/chosen": 6.641635894775391, "rewards/margins": 13.420854680678424, "rewards/rejected": -6.7792187859030335, "step": 1225 }, { "epoch": 0.336028504865013, "grad_norm": 6.3125, "kl": 1.4849942922592163, "learning_rate": 5e-06, "logits/chosen": -38027434.666666664, "logits/rejected": -16773892.0, "logps/chosen": -400.3790690104167, "logps/rejected": -641.7018636067709, "loss": 0.0488, "rewards/chosen": 5.829662958780925, "rewards/margins": 16.001946767171223, "rewards/rejected": -10.172283808390299, "step": 1226 }, { "epoch": 0.3363025901055228, "grad_norm": 6.625, "kl": 1.1540145874023438, "learning_rate": 5e-06, "logits/chosen": -15745933.714285715, "logits/rejected": -11831040.0, "logps/chosen": -398.12953404017856, "logps/rejected": -447.395947265625, "loss": 0.0451, "rewards/chosen": 5.938357761928013, "rewards/margins": 13.683405521937779, "rewards/rejected": -7.745047760009766, "step": 1227 }, { "epoch": 0.33657667534603264, "grad_norm": 6.84375, "kl": 7.233365058898926, "learning_rate": 5e-06, "logits/chosen": -30817.6, "logits/rejected": -37493504.0, "logps/chosen": -480.29925130208335, "logps/rejected": -521.1045464409722, "loss": 0.0251, "rewards/chosen": 6.4085240681966145, "rewards/margins": 16.554313320583766, "rewards/rejected": -10.145789252387154, "step": 1228 }, { "epoch": 0.3368507605865424, "grad_norm": 8.125, "kl": 0.4291045069694519, "learning_rate": 5e-06, "logits/chosen": -7190679.384615385, "logits/rejected": 1353330.1818181819, "logps/chosen": -379.36177884615387, "logps/rejected": -438.89084694602275, "loss": 0.0341, "rewards/chosen": 7.482193580040565, "rewards/margins": 16.629796701711374, "rewards/rejected": -9.14760312167081, "step": 1229 }, { "epoch": 0.3371248458270522, "grad_norm": 3.84375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -4388480.0, "logits/rejected": -27243822.933333334, "logps/chosen": -343.29009331597223, "logps/rejected": -452.8907877604167, "loss": 0.0193, "rewards/chosen": 5.874403211805555, "rewards/margins": 14.983387586805556, "rewards/rejected": -9.108984375, "step": 1230 }, { "epoch": 0.337398931067562, "grad_norm": 6.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 1485042.0, "logits/rejected": -43511381.333333336, "logps/chosen": -437.70703125, "logps/rejected": -556.7176106770834, "loss": 0.0266, "rewards/chosen": 6.309226989746094, "rewards/margins": 14.758533477783203, "rewards/rejected": -8.44930648803711, "step": 1231 }, { "epoch": 0.3376730163080718, "grad_norm": 3.234375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -26410978.285714287, "logits/rejected": 66259494.4, "logps/chosen": -368.96041434151783, "logps/rejected": -607.14462890625, "loss": 0.0188, "rewards/chosen": 5.5965745108468195, "rewards/margins": 17.617347063337053, "rewards/rejected": -12.020772552490234, "step": 1232 }, { "epoch": 0.3379471015485816, "grad_norm": 3.828125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -33956986.18181818, "logits/rejected": 41840059.07692308, "logps/chosen": -363.83993252840907, "logps/rejected": -622.8477313701923, "loss": 0.0293, "rewards/chosen": 4.715938221324574, "rewards/margins": 15.020665628926738, "rewards/rejected": -10.304727407602163, "step": 1233 }, { "epoch": 0.3382211867890914, "grad_norm": 5.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -39248147.692307696, "logits/rejected": 5536930.909090909, "logps/chosen": -413.15978064903845, "logps/rejected": -870.0567294034091, "loss": 0.0161, "rewards/chosen": 6.6099067101111775, "rewards/margins": 24.920883605530214, "rewards/rejected": -18.310976895419035, "step": 1234 }, { "epoch": 0.3384952720296012, "grad_norm": 4.25, "kl": 4.4235758781433105, "learning_rate": 5e-06, "logits/chosen": -22588784.0, "logits/rejected": -13207412.8, "logps/chosen": -423.654541015625, "logps/rejected": -378.888720703125, "loss": 0.0166, "rewards/chosen": 5.8980222429547995, "rewards/margins": 12.720809718540737, "rewards/rejected": -6.822787475585938, "step": 1235 }, { "epoch": 0.338769357270111, "grad_norm": 5.09375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -17226638.545454547, "logits/rejected": -22389902.769230768, "logps/chosen": -429.4274236505682, "logps/rejected": -605.8745868389423, "loss": 0.0153, "rewards/chosen": 5.83513086492365, "rewards/margins": 16.601361521474132, "rewards/rejected": -10.76623065655048, "step": 1236 }, { "epoch": 0.3390434425106208, "grad_norm": 7.96875, "kl": 5.500591278076172, "learning_rate": 5e-06, "logits/chosen": -25090208.0, "logits/rejected": -4657427.2, "logps/chosen": -381.8555385044643, "logps/rejected": -604.9166015625, "loss": 0.0579, "rewards/chosen": 6.466400146484375, "rewards/margins": 18.680760192871094, "rewards/rejected": -12.21436004638672, "step": 1237 }, { "epoch": 0.3393175277511306, "grad_norm": 5.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -27828952.615384616, "logits/rejected": -13589579.636363637, "logps/chosen": -431.16736778846155, "logps/rejected": -570.2627840909091, "loss": 0.0185, "rewards/chosen": 5.611135042630709, "rewards/margins": 13.319583279269558, "rewards/rejected": -7.70844823663885, "step": 1238 }, { "epoch": 0.3395916129916404, "grad_norm": 5.9375, "kl": 3.9764468669891357, "learning_rate": 5e-06, "logits/chosen": 5328744.0, "logits/rejected": -3010482.285714286, "logps/chosen": -434.7072265625, "logps/rejected": -555.9549386160714, "loss": 0.0101, "rewards/chosen": 6.644402313232422, "rewards/margins": 16.021378326416016, "rewards/rejected": -9.376976013183594, "step": 1239 }, { "epoch": 0.3398656982321502, "grad_norm": 5.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -39494516.36363637, "logits/rejected": -18571436.307692308, "logps/chosen": -407.84237393465907, "logps/rejected": -459.5261042668269, "loss": 0.023, "rewards/chosen": 4.947659232399681, "rewards/margins": 12.903738568712782, "rewards/rejected": -7.956079336313101, "step": 1240 }, { "epoch": 0.34013978347266, "grad_norm": 7.96875, "kl": 1.0451158285140991, "learning_rate": 5e-06, "logits/chosen": 5439735.0, "logits/rejected": -23171412.0, "logps/chosen": -410.4218444824219, "logps/rejected": -400.0116882324219, "loss": 0.0449, "rewards/chosen": 5.493742942810059, "rewards/margins": 14.057731628417969, "rewards/rejected": -8.56398868560791, "step": 1241 }, { "epoch": 0.3404138687131698, "grad_norm": 4.59375, "kl": 2.079237699508667, "learning_rate": 5e-06, "logits/chosen": 684347.4285714285, "logits/rejected": -30386025.6, "logps/chosen": -392.0712890625, "logps/rejected": -449.209814453125, "loss": 0.0211, "rewards/chosen": 5.472547258649554, "rewards/margins": 13.294048418317523, "rewards/rejected": -7.821501159667969, "step": 1242 }, { "epoch": 0.34068795395367957, "grad_norm": 10.9375, "kl": 1.346996784210205, "learning_rate": 5e-06, "logits/chosen": 70852.45454545454, "logits/rejected": -18389676.307692308, "logps/chosen": -430.74391867897725, "logps/rejected": -300.50184044471155, "loss": 0.0444, "rewards/chosen": 5.883422157981179, "rewards/margins": 12.592322022764833, "rewards/rejected": -6.708899864783654, "step": 1243 }, { "epoch": 0.3409620391941894, "grad_norm": 8.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -20621638.4, "logits/rejected": -35717390.222222224, "logps/chosen": -289.30638020833334, "logps/rejected": -397.9353841145833, "loss": 0.0706, "rewards/chosen": 4.409767659505208, "rewards/margins": 11.35365007188585, "rewards/rejected": -6.943882412380642, "step": 1244 }, { "epoch": 0.3412361244346992, "grad_norm": 4.375, "kl": 0.45884960889816284, "learning_rate": 5e-06, "logits/chosen": -597441.1666666666, "logits/rejected": 20748196.0, "logps/chosen": -387.3460286458333, "logps/rejected": -851.9964192708334, "loss": 0.0153, "rewards/chosen": 5.4947160085042315, "rewards/margins": 19.704863866170246, "rewards/rejected": -14.210147857666016, "step": 1245 }, { "epoch": 0.341510209675209, "grad_norm": 8.25, "kl": 1.7123897075653076, "learning_rate": 5e-06, "logits/chosen": -31803944.0, "logits/rejected": -16072602.0, "logps/chosen": -382.8211975097656, "logps/rejected": -392.42889404296875, "loss": 0.0654, "rewards/chosen": 4.705574035644531, "rewards/margins": 9.837162017822266, "rewards/rejected": -5.131587982177734, "step": 1246 }, { "epoch": 0.34178429491571877, "grad_norm": 8.75, "kl": 3.7896108627319336, "learning_rate": 5e-06, "logits/chosen": -12929213.090909092, "logits/rejected": -15735536.0, "logps/chosen": -421.2516424005682, "logps/rejected": -603.4812199519231, "loss": 0.0361, "rewards/chosen": 4.884756608442827, "rewards/margins": 11.926088559877623, "rewards/rejected": -7.041331951434795, "step": 1247 }, { "epoch": 0.3420583801562286, "grad_norm": 10.5625, "kl": 12.278522491455078, "learning_rate": 5e-06, "logits/chosen": -22058148.57142857, "logits/rejected": 6152788.4, "logps/chosen": -372.1862095424107, "logps/rejected": -391.2426513671875, "loss": 0.0596, "rewards/chosen": 5.827069418770926, "rewards/margins": 11.17784914289202, "rewards/rejected": -5.350779724121094, "step": 1248 }, { "epoch": 0.3423324653967384, "grad_norm": 7.53125, "kl": 6.018805027008057, "learning_rate": 5e-06, "logits/chosen": -22310756.0, "logits/rejected": -22058478.0, "logps/chosen": -483.52874755859375, "logps/rejected": -574.6767578125, "loss": 0.046, "rewards/chosen": 6.355218887329102, "rewards/margins": 14.106407165527344, "rewards/rejected": -7.751188278198242, "step": 1249 }, { "epoch": 0.3426065506372482, "grad_norm": 6.25, "kl": 5.14984130859375, "learning_rate": 5e-06, "logits/chosen": 12068906.666666666, "logits/rejected": 43393853.333333336, "logps/chosen": -443.9790445963542, "logps/rejected": -779.3678385416666, "loss": 0.0166, "rewards/chosen": 6.326105117797852, "rewards/margins": 16.827573776245117, "rewards/rejected": -10.501468658447266, "step": 1250 }, { "epoch": 0.34288063587775797, "grad_norm": 7.1875, "kl": 0.8134326934814453, "learning_rate": 5e-06, "logits/chosen": 121450333.0909091, "logits/rejected": -19215328.0, "logps/chosen": -441.66677024147725, "logps/rejected": -484.9346454326923, "loss": 0.0535, "rewards/chosen": 4.9761834578080615, "rewards/margins": 12.474657578901812, "rewards/rejected": -7.49847412109375, "step": 1251 }, { "epoch": 0.3431547211182678, "grad_norm": 12.3125, "kl": 3.5763092041015625, "learning_rate": 5e-06, "logits/chosen": -29359342.222222224, "logits/rejected": 78147106.13333334, "logps/chosen": -463.03960503472223, "logps/rejected": -431.93092447916666, "loss": 0.0657, "rewards/chosen": 5.798042721218533, "rewards/margins": 10.873333655463323, "rewards/rejected": -5.075290934244792, "step": 1252 }, { "epoch": 0.3434288063587776, "grad_norm": 13.6875, "kl": 10.048030853271484, "learning_rate": 5e-06, "logits/chosen": 25359542.85714286, "logits/rejected": -28719052.8, "logps/chosen": -545.0076729910714, "logps/rejected": -491.243359375, "loss": 0.0437, "rewards/chosen": 7.314601898193359, "rewards/margins": 12.439236450195313, "rewards/rejected": -5.124634552001953, "step": 1253 }, { "epoch": 0.34370289159928735, "grad_norm": 7.96875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -21128974.222222224, "logits/rejected": -12144360.533333333, "logps/chosen": -439.1780056423611, "logps/rejected": -528.5660481770833, "loss": 0.0394, "rewards/chosen": 5.507199181450738, "rewards/margins": 13.276402367485893, "rewards/rejected": -7.769203186035156, "step": 1254 }, { "epoch": 0.34397697683979717, "grad_norm": 10.0625, "kl": 1.182486891746521, "learning_rate": 5e-06, "logits/chosen": -18095720.0, "logits/rejected": -22321104.0, "logps/chosen": -472.0280064174107, "logps/rejected": -531.26640625, "loss": 0.0584, "rewards/chosen": 7.239185333251953, "rewards/margins": 16.559611511230468, "rewards/rejected": -9.320426177978515, "step": 1255 }, { "epoch": 0.344251062080307, "grad_norm": 2.34375, "kl": 5.579519271850586, "learning_rate": 5e-06, "logits/chosen": 2878060.6666666665, "logits/rejected": 5018867.333333333, "logps/chosen": -326.13970947265625, "logps/rejected": -501.0694173177083, "loss": 0.0227, "rewards/chosen": 5.993811289469401, "rewards/margins": 13.87100601196289, "rewards/rejected": -7.877194722493489, "step": 1256 }, { "epoch": 0.3445251473208168, "grad_norm": 11.625, "kl": 4.639813423156738, "learning_rate": 5e-06, "logits/chosen": -37861749.333333336, "logits/rejected": -14926389.333333334, "logps/chosen": -470.19384765625, "logps/rejected": -442.448974609375, "loss": 0.047, "rewards/chosen": 5.994414647420247, "rewards/margins": 11.961570103963215, "rewards/rejected": -5.967155456542969, "step": 1257 }, { "epoch": 0.34479923256132655, "grad_norm": 9.5625, "kl": 8.764667510986328, "learning_rate": 5e-06, "logits/chosen": -19795396.0, "logits/rejected": 3200943.5, "logps/chosen": -494.7484130859375, "logps/rejected": -535.1976928710938, "loss": 0.0351, "rewards/chosen": 6.143914222717285, "rewards/margins": 15.743852615356445, "rewards/rejected": -9.59993839263916, "step": 1258 }, { "epoch": 0.34507331780183637, "grad_norm": 7.21875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -55769051.428571425, "logits/rejected": -22123802.352941178, "logps/chosen": -543.223876953125, "logps/rejected": -352.78831571691177, "loss": 0.0921, "rewards/chosen": 5.605388096400669, "rewards/margins": 11.228049013794971, "rewards/rejected": -5.622660917394302, "step": 1259 }, { "epoch": 0.3453474030423462, "grad_norm": 3.484375, "kl": 1.7140382528305054, "learning_rate": 5e-06, "logits/chosen": 9496301.714285715, "logits/rejected": -11398445.176470589, "logps/chosen": -444.5281459263393, "logps/rejected": -528.5434857536765, "loss": 0.0252, "rewards/chosen": 5.082436152866909, "rewards/margins": 12.584315676649078, "rewards/rejected": -7.501879523782169, "step": 1260 }, { "epoch": 0.345621488282856, "grad_norm": 3.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 21938100.0, "logits/rejected": -30708272.0, "logps/chosen": -317.86236572265625, "logps/rejected": -564.5863647460938, "loss": 0.0099, "rewards/chosen": 6.581609725952148, "rewards/margins": 15.669561386108398, "rewards/rejected": -9.08795166015625, "step": 1261 }, { "epoch": 0.34589557352336575, "grad_norm": 5.0625, "kl": 2.511996030807495, "learning_rate": 5e-06, "logits/chosen": -21495342.933333334, "logits/rejected": 95678158.22222222, "logps/chosen": -514.0337890625, "logps/rejected": -590.9471028645834, "loss": 0.017, "rewards/chosen": 6.664604695638021, "rewards/margins": 15.025859069824218, "rewards/rejected": -8.361254374186197, "step": 1262 }, { "epoch": 0.34616965876387557, "grad_norm": 10.8125, "kl": 0.3677825927734375, "learning_rate": 5e-06, "logits/chosen": 64952265.14285714, "logits/rejected": -8453396.8, "logps/chosen": -498.4266880580357, "logps/rejected": -428.1337890625, "loss": 0.0563, "rewards/chosen": 6.522565024239676, "rewards/margins": 14.290187399727959, "rewards/rejected": -7.767622375488282, "step": 1263 }, { "epoch": 0.3464437440043854, "grad_norm": 11.8125, "kl": 1.3236020803451538, "learning_rate": 5e-06, "logits/chosen": -11866116.0, "logits/rejected": -6028621.5, "logps/chosen": -437.9803466796875, "logps/rejected": -489.28802490234375, "loss": 0.0568, "rewards/chosen": 6.732024192810059, "rewards/margins": 14.043596744537354, "rewards/rejected": -7.311572551727295, "step": 1264 }, { "epoch": 0.34671782924489514, "grad_norm": 5.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -28284038.0, "logits/rejected": -9753722.0, "logps/chosen": -385.1573791503906, "logps/rejected": -503.81207275390625, "loss": 0.0262, "rewards/chosen": 5.118495941162109, "rewards/margins": 13.432450294494629, "rewards/rejected": -8.31395435333252, "step": 1265 }, { "epoch": 0.34699191448540495, "grad_norm": 6.78125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -247224.66666666666, "logits/rejected": -24684392.0, "logps/chosen": -458.1810709635417, "logps/rejected": -536.4454752604166, "loss": 0.0739, "rewards/chosen": 5.153723398844401, "rewards/margins": 12.875935872395834, "rewards/rejected": -7.722212473551433, "step": 1266 }, { "epoch": 0.34726599972591476, "grad_norm": 3.46875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -25667357.09090909, "logits/rejected": -23449873.230769232, "logps/chosen": -401.7415216619318, "logps/rejected": -613.2364032451923, "loss": 0.0093, "rewards/chosen": 6.078629927201704, "rewards/margins": 17.136386791309278, "rewards/rejected": -11.057756864107573, "step": 1267 }, { "epoch": 0.3475400849664246, "grad_norm": 15.0625, "kl": 23.92203712463379, "learning_rate": 5e-06, "logits/chosen": -25815585.684210528, "logits/rejected": 37112809.6, "logps/chosen": -499.5717516447368, "logps/rejected": -475.18759765625, "loss": 0.0852, "rewards/chosen": 7.059334202816612, "rewards/margins": 13.712972661068566, "rewards/rejected": -6.653638458251953, "step": 1268 }, { "epoch": 0.34781417020693434, "grad_norm": 4.6875, "kl": 1.0004127025604248, "learning_rate": 5e-06, "logits/chosen": -32182565.818181816, "logits/rejected": -295054.76923076925, "logps/chosen": -480.2322887073864, "logps/rejected": -501.82534555288464, "loss": 0.0131, "rewards/chosen": 6.082162336869673, "rewards/margins": 14.608731996763002, "rewards/rejected": -8.52656965989333, "step": 1269 }, { "epoch": 0.34808825544744415, "grad_norm": 12.5625, "kl": 8.940969467163086, "learning_rate": 5e-06, "logits/chosen": -4587033.454545454, "logits/rejected": -13618036.923076924, "logps/chosen": -468.63045987215907, "logps/rejected": -391.04285606971155, "loss": 0.064, "rewards/chosen": 6.498510187322443, "rewards/margins": 12.940226308115712, "rewards/rejected": -6.441716120793269, "step": 1270 }, { "epoch": 0.34836234068795396, "grad_norm": 9.8125, "kl": 3.536268472671509, "learning_rate": 5e-06, "logits/chosen": -49845521.45454545, "logits/rejected": -28157287.384615384, "logps/chosen": -529.9746537642045, "logps/rejected": -507.81820913461536, "loss": 0.0252, "rewards/chosen": 6.956392461603338, "rewards/margins": 14.594820782854839, "rewards/rejected": -7.638428321251502, "step": 1271 }, { "epoch": 0.3486364259284638, "grad_norm": 5.4375, "kl": 6.030410289764404, "learning_rate": 5e-06, "logits/chosen": -1422952.3636363635, "logits/rejected": -6381428.307692308, "logps/chosen": -420.2405894886364, "logps/rejected": -366.4655949519231, "loss": 0.0255, "rewards/chosen": 5.621126695112749, "rewards/margins": 12.958489131260585, "rewards/rejected": -7.337362436147837, "step": 1272 }, { "epoch": 0.34891051116897354, "grad_norm": 6.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -20067429.333333332, "logits/rejected": -10536909.866666667, "logps/chosen": -459.3997395833333, "logps/rejected": -525.700390625, "loss": 0.0285, "rewards/chosen": 6.251608106825087, "rewards/margins": 15.306894768608942, "rewards/rejected": -9.055286661783855, "step": 1273 }, { "epoch": 0.34918459640948335, "grad_norm": 9.125, "kl": 1.0714213848114014, "learning_rate": 5e-06, "logits/chosen": -30317140.363636363, "logits/rejected": -29982867.692307692, "logps/chosen": -474.99564985795456, "logps/rejected": -581.5750826322115, "loss": 0.0384, "rewards/chosen": 6.4201507568359375, "rewards/margins": 15.769912719726562, "rewards/rejected": -9.349761962890625, "step": 1274 }, { "epoch": 0.34945868164999316, "grad_norm": 11.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -27983162.181818184, "logits/rejected": -7270101.538461538, "logps/chosen": -529.8025568181819, "logps/rejected": -575.8052884615385, "loss": 0.0465, "rewards/chosen": 6.788722645152699, "rewards/margins": 16.14351531342193, "rewards/rejected": -9.35479266826923, "step": 1275 }, { "epoch": 0.3497327668905029, "grad_norm": 11.5625, "kl": 3.8622679710388184, "learning_rate": 5e-06, "logits/chosen": -12276425.333333334, "logits/rejected": -18188286.666666668, "logps/chosen": -576.5846354166666, "logps/rejected": -309.48944091796875, "loss": 0.046, "rewards/chosen": 7.082334518432617, "rewards/margins": 13.89383379618327, "rewards/rejected": -6.811499277750651, "step": 1276 }, { "epoch": 0.35000685213101274, "grad_norm": 7.90625, "kl": 2.726546049118042, "learning_rate": 5e-06, "logits/chosen": -21331565.333333332, "logits/rejected": -4252472.0, "logps/chosen": -467.8134358723958, "logps/rejected": -484.5453287760417, "loss": 0.0314, "rewards/chosen": 6.42411994934082, "rewards/margins": 13.401754379272461, "rewards/rejected": -6.977634429931641, "step": 1277 }, { "epoch": 0.35028093737152255, "grad_norm": 8.6875, "kl": 3.4084017276763916, "learning_rate": 5e-06, "logits/chosen": 13934672.0, "logits/rejected": 6640946.0, "logps/chosen": -443.3578796386719, "logps/rejected": -603.2515258789062, "loss": 0.0589, "rewards/chosen": 5.740791320800781, "rewards/margins": 11.554264068603516, "rewards/rejected": -5.813472747802734, "step": 1278 }, { "epoch": 0.35055502261203236, "grad_norm": 7.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 52591699.2, "logits/rejected": -25850893.714285713, "logps/chosen": -449.125634765625, "logps/rejected": -508.21023995535717, "loss": 0.0219, "rewards/chosen": 6.053909683227539, "rewards/margins": 15.584746606009347, "rewards/rejected": -9.530836922781807, "step": 1279 }, { "epoch": 0.3508291078525421, "grad_norm": 3.390625, "kl": 3.964183807373047, "learning_rate": 5e-06, "logits/chosen": -22173121.6, "logits/rejected": -8593845.714285715, "logps/chosen": -470.144921875, "logps/rejected": -400.59915597098217, "loss": 0.0114, "rewards/chosen": 7.118508148193359, "rewards/margins": 15.22127500261579, "rewards/rejected": -8.102766854422432, "step": 1280 }, { "epoch": 0.35110319309305194, "grad_norm": 7.75, "kl": 2.018568754196167, "learning_rate": 5e-06, "logits/chosen": -20129236.363636363, "logits/rejected": -36807724.307692304, "logps/chosen": -424.83988813920456, "logps/rejected": -522.8200871394231, "loss": 0.0225, "rewards/chosen": 5.767672798850319, "rewards/margins": 16.836012220049238, "rewards/rejected": -11.068339421198917, "step": 1281 }, { "epoch": 0.35137727833356175, "grad_norm": 4.78125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -3082195.777777778, "logits/rejected": -23800405.333333332, "logps/chosen": -386.9546169704861, "logps/rejected": -496.0756510416667, "loss": 0.0144, "rewards/chosen": 5.993258582221137, "rewards/margins": 14.455346086290149, "rewards/rejected": -8.462087504069011, "step": 1282 }, { "epoch": 0.35165136357407156, "grad_norm": 11.75, "kl": 1.0808709859848022, "learning_rate": 5e-06, "logits/chosen": -24212796.0, "logits/rejected": -29993568.0, "logps/chosen": -426.8018798828125, "logps/rejected": -517.6547241210938, "loss": 0.0832, "rewards/chosen": 5.666676044464111, "rewards/margins": 13.630431652069092, "rewards/rejected": -7.9637556076049805, "step": 1283 }, { "epoch": 0.3519254488145813, "grad_norm": 8.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -26887806.0, "logits/rejected": -16336717.0, "logps/chosen": -346.2093811035156, "logps/rejected": -572.4217529296875, "loss": 0.076, "rewards/chosen": 5.168123722076416, "rewards/margins": 12.926959991455078, "rewards/rejected": -7.758836269378662, "step": 1284 }, { "epoch": 0.35219953405509113, "grad_norm": 9.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -23217163.636363637, "logits/rejected": -19921065.846153848, "logps/chosen": -453.4841974431818, "logps/rejected": -404.0202448918269, "loss": 0.0308, "rewards/chosen": 6.903073397549716, "rewards/margins": 14.95590402029611, "rewards/rejected": -8.052830622746395, "step": 1285 }, { "epoch": 0.35247361929560095, "grad_norm": 16.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -5985430.0, "logits/rejected": 22515308.0, "logps/chosen": -439.6016540527344, "logps/rejected": -660.1099853515625, "loss": 0.0387, "rewards/chosen": 5.528641223907471, "rewards/margins": 15.703888416290283, "rewards/rejected": -10.175247192382812, "step": 1286 }, { "epoch": 0.3527477045361107, "grad_norm": 12.75, "kl": 9.942333221435547, "learning_rate": 5e-06, "logits/chosen": -32867526.85714286, "logits/rejected": -25858120.0, "logps/chosen": -435.10899135044644, "logps/rejected": -446.90546875, "loss": 0.0682, "rewards/chosen": 5.556978498186384, "rewards/margins": 14.139021955217634, "rewards/rejected": -8.58204345703125, "step": 1287 }, { "epoch": 0.3530217897766205, "grad_norm": 3.53125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 18492801.333333332, "logits/rejected": -19816968.0, "logps/chosen": -392.7293701171875, "logps/rejected": -435.1166585286458, "loss": 0.0172, "rewards/chosen": 6.396155039469401, "rewards/margins": 15.138716379801433, "rewards/rejected": -8.742561340332031, "step": 1288 }, { "epoch": 0.35329587501713033, "grad_norm": 5.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 762718.0, "logits/rejected": -16257786.666666666, "logps/chosen": -532.9734700520834, "logps/rejected": -545.8428955078125, "loss": 0.0386, "rewards/chosen": 6.855381011962891, "rewards/margins": 17.061293284098305, "rewards/rejected": -10.205912272135416, "step": 1289 }, { "epoch": 0.35356996025764015, "grad_norm": 3.96875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -4128529.4545454546, "logits/rejected": -33583480.615384616, "logps/chosen": -458.08371803977275, "logps/rejected": -362.1698467548077, "loss": 0.0157, "rewards/chosen": 6.409679066051137, "rewards/margins": 13.845279933689358, "rewards/rejected": -7.435600867638221, "step": 1290 }, { "epoch": 0.3538440454981499, "grad_norm": 6.46875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -35717127.11111111, "logits/rejected": -4986755.2, "logps/chosen": -506.0832248263889, "logps/rejected": -453.01539713541666, "loss": 0.0168, "rewards/chosen": 6.179176330566406, "rewards/margins": 14.51216074625651, "rewards/rejected": -8.332984415690104, "step": 1291 }, { "epoch": 0.3541181307386597, "grad_norm": 4.21875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -10608055.333333334, "logits/rejected": -21984636.0, "logps/chosen": -412.2329915364583, "logps/rejected": -511.6921793619792, "loss": 0.0099, "rewards/chosen": 7.159761428833008, "rewards/margins": 14.249600728352863, "rewards/rejected": -7.0898392995198565, "step": 1292 }, { "epoch": 0.35439221597916953, "grad_norm": 13.125, "kl": 6.420409679412842, "learning_rate": 5e-06, "logits/chosen": -16684484.0, "logits/rejected": -11186486.666666666, "logps/chosen": -406.9567057291667, "logps/rejected": -520.1183675130209, "loss": 0.126, "rewards/chosen": 6.5984446207682295, "rewards/margins": 16.006303787231445, "rewards/rejected": -9.407859166463217, "step": 1293 }, { "epoch": 0.35466630121967935, "grad_norm": 7.46875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -10702108.8, "logits/rejected": -37735364.571428575, "logps/chosen": -440.56904296875, "logps/rejected": -430.7725306919643, "loss": 0.0322, "rewards/chosen": 6.184767150878907, "rewards/margins": 13.684653799874443, "rewards/rejected": -7.499886648995536, "step": 1294 }, { "epoch": 0.3549403864601891, "grad_norm": 3.96875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 27865605.333333332, "logits/rejected": 3253661.3333333335, "logps/chosen": -432.7342936197917, "logps/rejected": -450.2485677083333, "loss": 0.0301, "rewards/chosen": 5.478259616427952, "rewards/margins": 13.355972629123265, "rewards/rejected": -7.877713012695312, "step": 1295 }, { "epoch": 0.3552144717006989, "grad_norm": 1.453125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -9333519.272727273, "logits/rejected": -35939062.15384615, "logps/chosen": -484.45028409090907, "logps/rejected": -580.9374248798077, "loss": 0.0037, "rewards/chosen": 8.44615450772372, "rewards/margins": 19.578347266137182, "rewards/rejected": -11.132192758413462, "step": 1296 }, { "epoch": 0.35548855694120873, "grad_norm": 8.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -17375426.90909091, "logits/rejected": 12664857.846153846, "logps/chosen": -573.8337180397727, "logps/rejected": -604.5164513221154, "loss": 0.0295, "rewards/chosen": 6.536183443936435, "rewards/margins": 17.810158522812635, "rewards/rejected": -11.273975078876202, "step": 1297 }, { "epoch": 0.3557626421817185, "grad_norm": 5.09375, "kl": 4.143932342529297, "learning_rate": 5e-06, "logits/chosen": -31900462.222222224, "logits/rejected": -27771767.466666665, "logps/chosen": -415.7805989583333, "logps/rejected": -482.64283854166666, "loss": 0.0182, "rewards/chosen": 5.759411282009548, "rewards/margins": 14.93718024359809, "rewards/rejected": -9.177768961588542, "step": 1298 }, { "epoch": 0.3560367274222283, "grad_norm": 11.8125, "kl": 4.610648155212402, "learning_rate": 5e-06, "logits/chosen": -43050269.538461536, "logits/rejected": -11778701.090909092, "logps/chosen": -465.49008413461536, "logps/rejected": -408.39106889204544, "loss": 0.0272, "rewards/chosen": 7.130151015061599, "rewards/margins": 14.056853341055916, "rewards/rejected": -6.926702325994318, "step": 1299 }, { "epoch": 0.3563108126627381, "grad_norm": 6.03125, "kl": 1.3630321025848389, "learning_rate": 5e-06, "logits/chosen": -37713810.28571428, "logits/rejected": -21411318.4, "logps/chosen": -517.4641810825893, "logps/rejected": -494.392236328125, "loss": 0.0558, "rewards/chosen": 6.015328543526786, "rewards/margins": 13.658880179268973, "rewards/rejected": -7.643551635742187, "step": 1300 }, { "epoch": 0.35658489790324793, "grad_norm": 6.15625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -23903618.285714287, "logits/rejected": -28081756.8, "logps/chosen": -445.42867606026783, "logps/rejected": -688.71748046875, "loss": 0.0341, "rewards/chosen": 5.393154689243862, "rewards/margins": 19.133147975376673, "rewards/rejected": -13.739993286132812, "step": 1301 }, { "epoch": 0.3568589831437577, "grad_norm": 6.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -27504354.90909091, "logits/rejected": -14870432.0, "logps/chosen": -403.61501242897725, "logps/rejected": -487.60738431490387, "loss": 0.025, "rewards/chosen": 6.670066833496094, "rewards/margins": 16.93485142634465, "rewards/rejected": -10.264784592848558, "step": 1302 }, { "epoch": 0.3571330683842675, "grad_norm": 11.25, "kl": 0.9948209524154663, "learning_rate": 5e-06, "logits/chosen": -32690249.6, "logits/rejected": 6021069.714285715, "logps/chosen": -496.0615234375, "logps/rejected": -436.96219308035717, "loss": 0.0531, "rewards/chosen": 6.707853698730469, "rewards/margins": 14.730636160714287, "rewards/rejected": -8.022782461983818, "step": 1303 }, { "epoch": 0.3574071536247773, "grad_norm": 7.8125, "kl": 5.054374694824219, "learning_rate": 5e-06, "logits/chosen": -3172121.6, "logits/rejected": -29711868.444444444, "logps/chosen": -465.14541015625, "logps/rejected": -616.58154296875, "loss": 0.0504, "rewards/chosen": 6.325357055664062, "rewards/margins": 15.467130703396267, "rewards/rejected": -9.141773647732204, "step": 1304 }, { "epoch": 0.3576812388652871, "grad_norm": 12.125, "kl": 1.0861448049545288, "learning_rate": 5e-06, "logits/chosen": -18089294.4, "logits/rejected": -6496945.142857143, "logps/chosen": -330.7667236328125, "logps/rejected": -438.0565708705357, "loss": 0.0809, "rewards/chosen": 4.689807891845703, "rewards/margins": 10.761029379708425, "rewards/rejected": -6.071221487862723, "step": 1305 }, { "epoch": 0.3579553241057969, "grad_norm": 10.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -41831384.0, "logits/rejected": -27157701.333333332, "logps/chosen": -381.8842366536458, "logps/rejected": -466.1717936197917, "loss": 0.0577, "rewards/chosen": 4.797765731811523, "rewards/margins": 14.289328893025717, "rewards/rejected": -9.491563161214193, "step": 1306 }, { "epoch": 0.3582294093463067, "grad_norm": 10.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -12849508.0, "logits/rejected": -7583174.666666667, "logps/chosen": -423.2957356770833, "logps/rejected": -578.3371175130209, "loss": 0.0306, "rewards/chosen": 4.575792948404948, "rewards/margins": 16.19078318277995, "rewards/rejected": -11.614990234375, "step": 1307 }, { "epoch": 0.3585034945868165, "grad_norm": 11.375, "kl": 11.734859466552734, "learning_rate": 5e-06, "logits/chosen": 9186558.153846154, "logits/rejected": -21167319.272727273, "logps/chosen": -283.85509314903845, "logps/rejected": -653.0593927556819, "loss": 0.0598, "rewards/chosen": 5.761540339543269, "rewards/margins": 16.54813507720307, "rewards/rejected": -10.7865947376598, "step": 1308 }, { "epoch": 0.3587775798273263, "grad_norm": 5.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -22383260.8, "logits/rejected": -32022848.0, "logps/chosen": -260.2913818359375, "logps/rejected": -528.0099748883929, "loss": 0.0233, "rewards/chosen": 5.100243759155274, "rewards/margins": 14.766826139177596, "rewards/rejected": -9.666582380022321, "step": 1309 }, { "epoch": 0.3590516650678361, "grad_norm": 7.84375, "kl": 6.815630912780762, "learning_rate": 5e-06, "logits/chosen": -4721747.076923077, "logits/rejected": -20130903.272727273, "logps/chosen": -437.1066706730769, "logps/rejected": -303.9165704900568, "loss": 0.0299, "rewards/chosen": 6.16632080078125, "rewards/margins": 12.707740090110086, "rewards/rejected": -6.541419289328835, "step": 1310 }, { "epoch": 0.3593257503083459, "grad_norm": 9.25, "kl": 5.709621429443359, "learning_rate": 5e-06, "logits/chosen": -30522522.181818184, "logits/rejected": -19481489.230769232, "logps/chosen": -500.94180575284093, "logps/rejected": -698.640625, "loss": 0.0336, "rewards/chosen": 7.874920931729403, "rewards/margins": 21.157253131999838, "rewards/rejected": -13.282332200270433, "step": 1311 }, { "epoch": 0.3595998355488557, "grad_norm": 9.25, "kl": 0.2712481915950775, "learning_rate": 5e-06, "logits/chosen": -40382870.85714286, "logits/rejected": -19543784.0, "logps/chosen": -441.33775111607144, "logps/rejected": -469.541357421875, "loss": 0.0426, "rewards/chosen": 5.598931993756976, "rewards/margins": 12.396519579206196, "rewards/rejected": -6.797587585449219, "step": 1312 }, { "epoch": 0.3598739207893655, "grad_norm": 4.875, "kl": 9.364509582519531, "learning_rate": 5e-06, "logits/chosen": -13006277.333333334, "logits/rejected": -22428958.222222224, "logps/chosen": -452.94423828125, "logps/rejected": -677.0086805555555, "loss": 0.0144, "rewards/chosen": 7.7840627034505205, "rewards/margins": 18.464282565646702, "rewards/rejected": -10.68021986219618, "step": 1313 }, { "epoch": 0.3601480060298753, "grad_norm": 12.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 31358734.769230768, "logits/rejected": -33038609.454545453, "logps/chosen": -445.72280649038464, "logps/rejected": -636.6590465198864, "loss": 0.0468, "rewards/chosen": 6.730492811936599, "rewards/margins": 15.706851399028217, "rewards/rejected": -8.97635858709162, "step": 1314 }, { "epoch": 0.3604220912703851, "grad_norm": 10.0625, "kl": 5.853211879730225, "learning_rate": 5e-06, "logits/chosen": -36339108.571428575, "logits/rejected": -21485129.6, "logps/chosen": -348.01171875, "logps/rejected": -370.750830078125, "loss": 0.0793, "rewards/chosen": 5.1577301025390625, "rewards/margins": 12.436305236816406, "rewards/rejected": -7.278575134277344, "step": 1315 }, { "epoch": 0.36069617651089486, "grad_norm": 10.375, "kl": 2.0093703269958496, "learning_rate": 5e-06, "logits/chosen": -41900278.85714286, "logits/rejected": -18907080.0, "logps/chosen": -540.6643415178571, "logps/rejected": -319.0109619140625, "loss": 0.0465, "rewards/chosen": 6.819254193987165, "rewards/margins": 12.67639912196568, "rewards/rejected": -5.857144927978515, "step": 1316 }, { "epoch": 0.3609702617514047, "grad_norm": 9.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -14321528.0, "logits/rejected": -36782116.571428575, "logps/chosen": -495.96142578125, "logps/rejected": -556.3396344866071, "loss": 0.0622, "rewards/chosen": 5.918019866943359, "rewards/margins": 15.344394029889788, "rewards/rejected": -9.426374162946429, "step": 1317 }, { "epoch": 0.3612443469919145, "grad_norm": 6.90625, "kl": 0.8402189016342163, "learning_rate": 5e-06, "logits/chosen": -54924160.0, "logits/rejected": -31828740.923076924, "logps/chosen": -435.63680752840907, "logps/rejected": -521.4894831730769, "loss": 0.0263, "rewards/chosen": 6.22345664284446, "rewards/margins": 14.001089136083642, "rewards/rejected": -7.7776324932391825, "step": 1318 }, { "epoch": 0.3615184322324243, "grad_norm": 7.96875, "kl": 2.3883540630340576, "learning_rate": 5e-06, "logits/chosen": -43834096.0, "logits/rejected": 64513930.666666664, "logps/chosen": -557.210693359375, "logps/rejected": -550.3265787760416, "loss": 0.0147, "rewards/chosen": 7.8451188405354815, "rewards/margins": 18.076984405517578, "rewards/rejected": -10.231865564982096, "step": 1319 }, { "epoch": 0.36179251747293406, "grad_norm": 8.0625, "kl": 6.100249290466309, "learning_rate": 5e-06, "logits/chosen": -17344240.0, "logits/rejected": -22285942.4, "logps/chosen": -492.3059779575893, "logps/rejected": -450.86787109375, "loss": 0.0558, "rewards/chosen": 6.75000980922154, "rewards/margins": 16.376309095110212, "rewards/rejected": -9.626299285888672, "step": 1320 }, { "epoch": 0.3620666027134439, "grad_norm": 6.9375, "kl": 9.57080364227295, "learning_rate": 5e-06, "logits/chosen": -36577878.15384615, "logits/rejected": -16014388.363636363, "logps/chosen": -405.5471754807692, "logps/rejected": -669.9740767045455, "loss": 0.0266, "rewards/chosen": 5.745517437274639, "rewards/margins": 15.05219274134069, "rewards/rejected": -9.30667530406605, "step": 1321 }, { "epoch": 0.3623406879539537, "grad_norm": 8.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -25470229.333333332, "logits/rejected": -6783238.666666667, "logps/chosen": -381.9912923177083, "logps/rejected": -561.4071451822916, "loss": 0.0513, "rewards/chosen": 5.411099116007487, "rewards/margins": 16.811756769816082, "rewards/rejected": -11.400657653808594, "step": 1322 }, { "epoch": 0.3626147731944635, "grad_norm": 5.65625, "kl": 3.8147010803222656, "learning_rate": 5e-06, "logits/chosen": -6007678.857142857, "logits/rejected": -13911926.4, "logps/chosen": -489.59933035714283, "logps/rejected": -622.409375, "loss": 0.0221, "rewards/chosen": 7.675437927246094, "rewards/margins": 17.146231842041015, "rewards/rejected": -9.470793914794921, "step": 1323 }, { "epoch": 0.36288885843497326, "grad_norm": 7.28125, "kl": 2.002584457397461, "learning_rate": 5e-06, "logits/chosen": -10316900.0, "logits/rejected": -24719522.285714287, "logps/chosen": -668.27333984375, "logps/rejected": -569.5201241629464, "loss": 0.0243, "rewards/chosen": 7.305685424804688, "rewards/margins": 15.383961922781808, "rewards/rejected": -8.07827649797712, "step": 1324 }, { "epoch": 0.3631629436754831, "grad_norm": 11.875, "kl": 5.34709358215332, "learning_rate": 5e-06, "logits/chosen": -38493464.0, "logits/rejected": -28808568.0, "logps/chosen": -478.2115885416667, "logps/rejected": -458.6781412760417, "loss": 0.0506, "rewards/chosen": 6.829026540120442, "rewards/margins": 12.93684196472168, "rewards/rejected": -6.107815424601237, "step": 1325 }, { "epoch": 0.3634370289159929, "grad_norm": 9.8125, "kl": 1.26107919216156, "learning_rate": 5e-06, "logits/chosen": -30977610.666666668, "logits/rejected": -17471784.0, "logps/chosen": -384.3649495442708, "logps/rejected": -447.6797281901042, "loss": 0.0741, "rewards/chosen": 5.723009745279948, "rewards/margins": 12.88739840189616, "rewards/rejected": -7.164388656616211, "step": 1326 }, { "epoch": 0.36371111415650265, "grad_norm": 3.546875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -29047138.0, "logits/rejected": -15421160.0, "logps/chosen": -418.77301025390625, "logps/rejected": -597.17626953125, "loss": 0.0147, "rewards/chosen": 5.837831497192383, "rewards/margins": 15.897794723510742, "rewards/rejected": -10.05996322631836, "step": 1327 }, { "epoch": 0.36398519939701246, "grad_norm": 9.6875, "kl": 1.5181134939193726, "learning_rate": 5e-06, "logits/chosen": -33980556.307692304, "logits/rejected": -8399060.363636363, "logps/chosen": -388.17964993990387, "logps/rejected": -587.7167080965909, "loss": 0.0439, "rewards/chosen": 6.914930490347055, "rewards/margins": 14.170903612683702, "rewards/rejected": -7.2559731223366475, "step": 1328 }, { "epoch": 0.3642592846375223, "grad_norm": 13.5, "kl": 5.242144584655762, "learning_rate": 5e-06, "logits/chosen": -17161778.0, "logits/rejected": -29613894.0, "logps/chosen": -348.9447937011719, "logps/rejected": -320.2668762207031, "loss": 0.0915, "rewards/chosen": 4.60166072845459, "rewards/margins": 11.523388862609863, "rewards/rejected": -6.921728134155273, "step": 1329 }, { "epoch": 0.3645333698780321, "grad_norm": 6.21875, "kl": 8.600996017456055, "learning_rate": 5e-06, "logits/chosen": -26692068.57142857, "logits/rejected": 9186982.4, "logps/chosen": -610.27783203125, "logps/rejected": -335.0466796875, "loss": 0.023, "rewards/chosen": 6.715667724609375, "rewards/margins": 14.025341796875, "rewards/rejected": -7.309674072265625, "step": 1330 }, { "epoch": 0.36480745511854185, "grad_norm": 8.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -34753081.6, "logits/rejected": -31398886.85714286, "logps/chosen": -419.83388671875, "logps/rejected": -500.0185546875, "loss": 0.0605, "rewards/chosen": 6.813080596923828, "rewards/margins": 15.64086445399693, "rewards/rejected": -8.827783857073102, "step": 1331 }, { "epoch": 0.36508154035905166, "grad_norm": 13.9375, "kl": 2.5543341636657715, "learning_rate": 5e-06, "logits/chosen": -38640743.384615384, "logits/rejected": -15202846.545454545, "logps/chosen": -464.4075270432692, "logps/rejected": -397.57839133522725, "loss": 0.0646, "rewards/chosen": 5.463929396409255, "rewards/margins": 12.175578510844623, "rewards/rejected": -6.711649114435369, "step": 1332 }, { "epoch": 0.3653556255995615, "grad_norm": 9.25, "kl": 4.818833827972412, "learning_rate": 5e-06, "logits/chosen": -21652912.94117647, "logits/rejected": -23707344.0, "logps/chosen": -355.64430147058823, "logps/rejected": -560.6383928571429, "loss": 0.0596, "rewards/chosen": 5.611994126263787, "rewards/margins": 17.947263509285552, "rewards/rejected": -12.335269383021764, "step": 1333 }, { "epoch": 0.3656297108400713, "grad_norm": 2.484375, "kl": 0.7742919921875, "learning_rate": 5e-06, "logits/chosen": -12666893.0, "logits/rejected": -2695983.5, "logps/chosen": -546.8629760742188, "logps/rejected": -516.3068237304688, "loss": 0.006, "rewards/chosen": 7.597806453704834, "rewards/margins": 16.69279432296753, "rewards/rejected": -9.094987869262695, "step": 1334 }, { "epoch": 0.36590379608058105, "grad_norm": 6.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -22029728.0, "logits/rejected": -4588336.0, "logps/chosen": -420.8055889423077, "logps/rejected": -433.0086115056818, "loss": 0.0716, "rewards/chosen": 5.376212486853967, "rewards/margins": 11.60070811451732, "rewards/rejected": -6.2244956276633525, "step": 1335 }, { "epoch": 0.36617788132109086, "grad_norm": 13.4375, "kl": 14.16617202758789, "learning_rate": 5e-06, "logits/chosen": -13624747.555555556, "logits/rejected": -19007910.666666668, "logps/chosen": -524.9396158854166, "logps/rejected": -547.08984375, "loss": 0.0813, "rewards/chosen": 6.521681891547309, "rewards/margins": 15.992014990912544, "rewards/rejected": -9.470333099365234, "step": 1336 }, { "epoch": 0.3664519665616007, "grad_norm": 13.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -14086905.6, "logits/rejected": -32529961.14285714, "logps/chosen": -311.9728515625, "logps/rejected": -543.7139718191964, "loss": 0.0287, "rewards/chosen": 6.002128982543946, "rewards/margins": 13.106008638654437, "rewards/rejected": -7.103879656110491, "step": 1337 }, { "epoch": 0.36672605180211043, "grad_norm": 10.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -14628756.363636363, "logits/rejected": -14302305.23076923, "logps/chosen": -409.42524857954544, "logps/rejected": -482.17394080528845, "loss": 0.0572, "rewards/chosen": 6.726418928666548, "rewards/margins": 14.125341188657533, "rewards/rejected": -7.398922259990986, "step": 1338 }, { "epoch": 0.36700013704262024, "grad_norm": 9.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -5255850.333333333, "logits/rejected": -12894789.333333334, "logps/chosen": -472.4346516927083, "logps/rejected": -596.9496256510416, "loss": 0.0394, "rewards/chosen": 5.601472854614258, "rewards/margins": 14.68197504679362, "rewards/rejected": -9.080502192179361, "step": 1339 }, { "epoch": 0.36727422228313006, "grad_norm": 10.625, "kl": 5.201087951660156, "learning_rate": 5e-06, "logits/chosen": -12326772.0, "logits/rejected": -41313914.666666664, "logps/chosen": -404.2187093098958, "logps/rejected": -576.4457194010416, "loss": 0.0458, "rewards/chosen": 5.89248784383138, "rewards/margins": 15.154366811116535, "rewards/rejected": -9.261878967285156, "step": 1340 }, { "epoch": 0.36754830752363987, "grad_norm": 13.3125, "kl": 4.881062984466553, "learning_rate": 5e-06, "logits/chosen": -21207163.076923076, "logits/rejected": -14283694.545454545, "logps/chosen": -517.8909630408654, "logps/rejected": -464.77885298295456, "loss": 0.053, "rewards/chosen": 7.4721538837139425, "rewards/margins": 14.944844119198674, "rewards/rejected": -7.47269023548473, "step": 1341 }, { "epoch": 0.36782239276414963, "grad_norm": 6.21875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 21533912.888888888, "logits/rejected": -36796381.86666667, "logps/chosen": -508.58550347222223, "logps/rejected": -518.9287434895833, "loss": 0.018, "rewards/chosen": 6.709846496582031, "rewards/margins": 16.024578348795572, "rewards/rejected": -9.314731852213542, "step": 1342 }, { "epoch": 0.36809647800465944, "grad_norm": 3.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -28966682.181818184, "logits/rejected": -18942816.0, "logps/chosen": -461.1251775568182, "logps/rejected": -488.08800330528845, "loss": 0.0216, "rewards/chosen": 7.002656416459517, "rewards/margins": 14.10326524214311, "rewards/rejected": -7.100608825683594, "step": 1343 }, { "epoch": 0.36837056324516926, "grad_norm": 12.8125, "kl": 8.113641738891602, "learning_rate": 5e-06, "logits/chosen": -24313412.266666666, "logits/rejected": -16114282.666666666, "logps/chosen": -390.80071614583335, "logps/rejected": -446.17450629340277, "loss": 0.0769, "rewards/chosen": 5.332761637369791, "rewards/margins": 13.47234819200304, "rewards/rejected": -8.139586554633247, "step": 1344 }, { "epoch": 0.36864464848567907, "grad_norm": 3.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -40756936.0, "logits/rejected": -6486666.0, "logps/chosen": -353.58599853515625, "logps/rejected": -572.01025390625, "loss": 0.0104, "rewards/chosen": 6.601680278778076, "rewards/margins": 17.377156734466553, "rewards/rejected": -10.775476455688477, "step": 1345 }, { "epoch": 0.36891873372618883, "grad_norm": 6.40625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -16314056.0, "logits/rejected": -1064486.0, "logps/chosen": -392.78729248046875, "logps/rejected": -533.464599609375, "loss": 0.0341, "rewards/chosen": 5.21478271484375, "rewards/margins": 16.008170127868652, "rewards/rejected": -10.793387413024902, "step": 1346 }, { "epoch": 0.36919281896669864, "grad_norm": 13.375, "kl": 8.712431907653809, "learning_rate": 5e-06, "logits/chosen": -32512925.333333332, "logits/rejected": -29165002.666666668, "logps/chosen": -373.238525390625, "logps/rejected": -544.400146484375, "loss": 0.0409, "rewards/chosen": 6.612119038899739, "rewards/margins": 13.796839396158854, "rewards/rejected": -7.184720357259114, "step": 1347 }, { "epoch": 0.36946690420720846, "grad_norm": 1.953125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -46394944.0, "logits/rejected": -19369697.454545453, "logps/chosen": -421.2126652644231, "logps/rejected": -451.87473366477275, "loss": 0.0283, "rewards/chosen": 6.013519287109375, "rewards/margins": 15.543168501420455, "rewards/rejected": -9.52964921431108, "step": 1348 }, { "epoch": 0.3697409894477182, "grad_norm": 1.9921875, "kl": 6.517764091491699, "learning_rate": 5e-06, "logits/chosen": -28197072.0, "logits/rejected": -18243488.0, "logps/chosen": -483.9344482421875, "logps/rejected": -399.2773132324219, "loss": 0.0086, "rewards/chosen": 6.47272253036499, "rewards/margins": 13.5272798538208, "rewards/rejected": -7.0545573234558105, "step": 1349 }, { "epoch": 0.37001507468822803, "grad_norm": 8.0625, "kl": 1.350947380065918, "learning_rate": 5e-06, "logits/chosen": -23262821.333333332, "logits/rejected": -9536202.666666666, "logps/chosen": -435.9119059244792, "logps/rejected": -446.5491943359375, "loss": 0.0557, "rewards/chosen": 5.531155904134114, "rewards/margins": 10.774600346883137, "rewards/rejected": -5.243444442749023, "step": 1350 }, { "epoch": 0.37028915992873784, "grad_norm": 5.03125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -20476981.333333332, "logits/rejected": -20783402.666666668, "logps/chosen": -479.5096028645833, "logps/rejected": -453.2215983072917, "loss": 0.0203, "rewards/chosen": 6.991508483886719, "rewards/margins": 18.298458099365234, "rewards/rejected": -11.306949615478516, "step": 1351 }, { "epoch": 0.37056324516924766, "grad_norm": 13.125, "kl": 11.53343391418457, "learning_rate": 5e-06, "logits/chosen": -31289747.692307692, "logits/rejected": -23894474.181818184, "logps/chosen": -379.7421875, "logps/rejected": -619.6512784090909, "loss": 0.0585, "rewards/chosen": 5.971141521747295, "rewards/margins": 16.978200792432666, "rewards/rejected": -11.00705927068537, "step": 1352 }, { "epoch": 0.3708373304097574, "grad_norm": 10.375, "kl": 5.800472259521484, "learning_rate": 5e-06, "logits/chosen": -7409966.545454546, "logits/rejected": -18418816.0, "logps/chosen": -375.73215553977275, "logps/rejected": -552.8592623197115, "loss": 0.0257, "rewards/chosen": 6.080924987792969, "rewards/margins": 14.06476064828726, "rewards/rejected": -7.983835660494291, "step": 1353 }, { "epoch": 0.37111141565026723, "grad_norm": 8.8125, "kl": 0.18428167700767517, "learning_rate": 5e-06, "logits/chosen": -21571378.285714287, "logits/rejected": -23533732.8, "logps/chosen": -328.05008370535717, "logps/rejected": -570.90224609375, "loss": 0.0837, "rewards/chosen": 4.795946938650949, "rewards/margins": 13.722739846365794, "rewards/rejected": -8.926792907714844, "step": 1354 }, { "epoch": 0.37138550089077704, "grad_norm": 11.3125, "kl": 8.792890548706055, "learning_rate": 5e-06, "logits/chosen": -17316034.46153846, "logits/rejected": -20969166.545454547, "logps/chosen": -412.92127403846155, "logps/rejected": -474.11234907670456, "loss": 0.1087, "rewards/chosen": 5.153356698843149, "rewards/margins": 15.654392909336757, "rewards/rejected": -10.501036210493607, "step": 1355 }, { "epoch": 0.37165958613128686, "grad_norm": 5.46875, "kl": 2.325054168701172, "learning_rate": 5e-06, "logits/chosen": -8730145.6, "logits/rejected": -9192875.42857143, "logps/chosen": -421.088671875, "logps/rejected": -557.3135463169643, "loss": 0.0252, "rewards/chosen": 6.443943023681641, "rewards/margins": 14.80918938773019, "rewards/rejected": -8.365246364048549, "step": 1356 }, { "epoch": 0.3719336713717966, "grad_norm": 11.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -24187081.846153848, "logits/rejected": -29516834.90909091, "logps/chosen": -456.4699894831731, "logps/rejected": -656.2371715198864, "loss": 0.0314, "rewards/chosen": 6.957057659442608, "rewards/margins": 16.79329548015461, "rewards/rejected": -9.836237820712002, "step": 1357 }, { "epoch": 0.37220775661230643, "grad_norm": 20.25, "kl": 12.240156173706055, "learning_rate": 5e-06, "logits/chosen": -4085222.588235294, "logits/rejected": -17976052.57142857, "logps/chosen": -388.50109145220586, "logps/rejected": -469.46732003348217, "loss": 0.1402, "rewards/chosen": 4.42716261919807, "rewards/margins": 12.227441964029264, "rewards/rejected": -7.8002793448311945, "step": 1358 }, { "epoch": 0.37248184185281624, "grad_norm": 4.96875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 4691790.545454546, "logits/rejected": -22464556.307692308, "logps/chosen": -449.96262428977275, "logps/rejected": -661.587890625, "loss": 0.0086, "rewards/chosen": 6.193917707963423, "rewards/margins": 17.772883115114865, "rewards/rejected": -11.578965407151442, "step": 1359 }, { "epoch": 0.372755927093326, "grad_norm": 13.5, "kl": 7.822842597961426, "learning_rate": 5e-06, "logits/chosen": -25519687.111111112, "logits/rejected": 2859292.0, "logps/chosen": -429.31255425347223, "logps/rejected": -471.4455973307292, "loss": 0.1126, "rewards/chosen": 5.375203874376085, "rewards/margins": 11.127295600043404, "rewards/rejected": -5.752091725667317, "step": 1360 }, { "epoch": 0.3730300123338358, "grad_norm": 10.9375, "kl": 7.173015594482422, "learning_rate": 5e-06, "logits/chosen": -17772228.923076924, "logits/rejected": -45256526.54545455, "logps/chosen": -395.1916691706731, "logps/rejected": -442.22305575284093, "loss": 0.0502, "rewards/chosen": 7.150512108435998, "rewards/margins": 16.54223157976057, "rewards/rejected": -9.391719471324574, "step": 1361 }, { "epoch": 0.37330409757434563, "grad_norm": 7.03125, "kl": 4.963908672332764, "learning_rate": 5e-06, "logits/chosen": -20737308.8, "logits/rejected": -32698262.85714286, "logps/chosen": -427.07255859375, "logps/rejected": -612.3273577008929, "loss": 0.0339, "rewards/chosen": 6.004665374755859, "rewards/margins": 16.832601819719585, "rewards/rejected": -10.827936444963727, "step": 1362 }, { "epoch": 0.37357818281485544, "grad_norm": 9.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -40703926.15384615, "logits/rejected": -19191863.272727273, "logps/chosen": -465.59731820913464, "logps/rejected": -480.9312855113636, "loss": 0.0643, "rewards/chosen": 6.849513714130108, "rewards/margins": 13.332285847697225, "rewards/rejected": -6.482772133567116, "step": 1363 }, { "epoch": 0.3738522680553652, "grad_norm": 8.0625, "kl": 6.128045558929443, "learning_rate": 5e-06, "logits/chosen": -15175236.363636363, "logits/rejected": -15011224.615384616, "logps/chosen": -372.8523615056818, "logps/rejected": -582.0616736778846, "loss": 0.058, "rewards/chosen": 5.613186922940341, "rewards/margins": 14.920761268455664, "rewards/rejected": -9.307574345515324, "step": 1364 }, { "epoch": 0.374126353295875, "grad_norm": 3.875, "kl": 1.0866343975067139, "learning_rate": 5e-06, "logits/chosen": -19945410.46153846, "logits/rejected": -31464459.636363637, "logps/chosen": -426.2310321514423, "logps/rejected": -649.7448952414773, "loss": 0.0067, "rewards/chosen": 7.876290541428786, "rewards/margins": 19.568312691641854, "rewards/rejected": -11.692022150213068, "step": 1365 }, { "epoch": 0.3744004385363848, "grad_norm": 9.5, "kl": 3.170278549194336, "learning_rate": 5e-06, "logits/chosen": -14971720.0, "logits/rejected": -23046512.0, "logps/chosen": -390.8773193359375, "logps/rejected": -539.6959228515625, "loss": 0.0389, "rewards/chosen": 5.581034342447917, "rewards/margins": 15.993284225463867, "rewards/rejected": -10.412249883015951, "step": 1366 }, { "epoch": 0.37467452377689464, "grad_norm": 3.78125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -26175864.0, "logits/rejected": -12746666.666666666, "logps/chosen": -484.115478515625, "logps/rejected": -506.0573323567708, "loss": 0.012, "rewards/chosen": 5.342023213704427, "rewards/margins": 14.353556315104168, "rewards/rejected": -9.01153310139974, "step": 1367 }, { "epoch": 0.3749486090174044, "grad_norm": 8.9375, "kl": 5.3463454246521, "learning_rate": 5e-06, "logits/chosen": -27990606.933333334, "logits/rejected": 7252065.333333333, "logps/chosen": -391.03502604166664, "logps/rejected": -540.2826605902778, "loss": 0.0387, "rewards/chosen": 5.848204549153646, "rewards/margins": 16.38685760498047, "rewards/rejected": -10.538653055826822, "step": 1368 }, { "epoch": 0.3752226942579142, "grad_norm": 3.515625, "kl": 2.1995444297790527, "learning_rate": 5e-06, "logits/chosen": -22740668.444444444, "logits/rejected": -15242888.533333333, "logps/chosen": -356.75726996527777, "logps/rejected": -487.87421875, "loss": 0.0368, "rewards/chosen": 6.148828294542101, "rewards/margins": 15.101002163357204, "rewards/rejected": -8.952173868815104, "step": 1369 }, { "epoch": 0.375496779498424, "grad_norm": 5.34375, "kl": 3.834341526031494, "learning_rate": 5e-06, "logits/chosen": -17311448.0, "logits/rejected": -23720056.0, "logps/chosen": -387.4996337890625, "logps/rejected": -329.1541442871094, "loss": 0.0556, "rewards/chosen": 6.472764492034912, "rewards/margins": 11.150480270385742, "rewards/rejected": -4.67771577835083, "step": 1370 }, { "epoch": 0.3757708647389338, "grad_norm": 10.1875, "kl": 2.4680233001708984, "learning_rate": 5e-06, "logits/chosen": -23193893.333333332, "logits/rejected": -22049678.666666668, "logps/chosen": -473.3784993489583, "logps/rejected": -551.5137939453125, "loss": 0.0527, "rewards/chosen": 5.980538050333659, "rewards/margins": 15.94972038269043, "rewards/rejected": -9.969182332356771, "step": 1371 }, { "epoch": 0.3760449499794436, "grad_norm": 7.75, "kl": 5.320826530456543, "learning_rate": 5e-06, "logits/chosen": -24369036.307692308, "logits/rejected": -29353600.0, "logps/chosen": -529.43212890625, "logps/rejected": -578.52734375, "loss": 0.0341, "rewards/chosen": 6.14867929311899, "rewards/margins": 16.09266011698263, "rewards/rejected": -9.943980823863637, "step": 1372 }, { "epoch": 0.3763190352199534, "grad_norm": 8.1875, "kl": 2.913560390472412, "learning_rate": 5e-06, "logits/chosen": 42676160.0, "logits/rejected": -26703561.14285714, "logps/chosen": -426.602099609375, "logps/rejected": -672.1701311383929, "loss": 0.0574, "rewards/chosen": 5.629318618774414, "rewards/margins": 17.218637030465263, "rewards/rejected": -11.589318411690849, "step": 1373 }, { "epoch": 0.3765931204604632, "grad_norm": 14.5, "kl": 3.1393322944641113, "learning_rate": 5e-06, "logits/chosen": -24517693.714285713, "logits/rejected": -22183401.6, "logps/chosen": -399.96358816964283, "logps/rejected": -513.32119140625, "loss": 0.0916, "rewards/chosen": 5.446321759905134, "rewards/margins": 13.67736576625279, "rewards/rejected": -8.231044006347656, "step": 1374 }, { "epoch": 0.376867205700973, "grad_norm": 6.65625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -19839670.85714286, "logits/rejected": -21188291.76470588, "logps/chosen": -518.0590122767857, "logps/rejected": -570.8956801470588, "loss": 0.0801, "rewards/chosen": 6.295253753662109, "rewards/margins": 14.410270017736098, "rewards/rejected": -8.115016264073988, "step": 1375 }, { "epoch": 0.3771412909414828, "grad_norm": 6.40625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -20146128.0, "logits/rejected": -23185107.692307692, "logps/chosen": -456.4454900568182, "logps/rejected": -450.89148888221155, "loss": 0.0242, "rewards/chosen": 6.681622591885653, "rewards/margins": 13.615015043245329, "rewards/rejected": -6.933392451359675, "step": 1376 }, { "epoch": 0.3774153761819926, "grad_norm": 4.65625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -28711020.8, "logits/rejected": -447470.85714285716, "logps/chosen": -377.9548828125, "logps/rejected": -476.52968052455356, "loss": 0.0596, "rewards/chosen": 5.9559326171875, "rewards/margins": 12.511324746268137, "rewards/rejected": -6.555392129080636, "step": 1377 }, { "epoch": 0.37768946142250237, "grad_norm": 9.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -17720616.0, "logits/rejected": -11930717.714285715, "logps/chosen": -322.306591796875, "logps/rejected": -379.26806640625, "loss": 0.0442, "rewards/chosen": 4.535403442382813, "rewards/margins": 12.624801417759485, "rewards/rejected": -8.089397975376674, "step": 1378 }, { "epoch": 0.3779635466630122, "grad_norm": 6.59375, "kl": 4.842496395111084, "learning_rate": 5e-06, "logits/chosen": -17207272.533333335, "logits/rejected": -29896711.111111112, "logps/chosen": -357.76178385416665, "logps/rejected": -597.9447699652778, "loss": 0.0663, "rewards/chosen": 4.686810811360677, "rewards/margins": 15.42571512858073, "rewards/rejected": -10.738904317220053, "step": 1379 }, { "epoch": 0.378237631903522, "grad_norm": 10.5, "kl": 8.89041805267334, "learning_rate": 5e-06, "logits/chosen": -22908275.2, "logits/rejected": -15090704.0, "logps/chosen": -397.74895833333335, "logps/rejected": -474.81846788194446, "loss": 0.0681, "rewards/chosen": 5.940164693196615, "rewards/margins": 11.84436535305447, "rewards/rejected": -5.904200659857856, "step": 1380 }, { "epoch": 0.3785117171440318, "grad_norm": 12.375, "kl": 5.787055969238281, "learning_rate": 5e-06, "logits/chosen": -25085703.529411763, "logits/rejected": -30077053.714285713, "logps/chosen": -430.9404296875, "logps/rejected": -524.4429757254464, "loss": 0.0526, "rewards/chosen": 6.537899690515855, "rewards/margins": 14.344460751830029, "rewards/rejected": -7.8065610613141745, "step": 1381 }, { "epoch": 0.37878580238454157, "grad_norm": 4.5625, "kl": 0.07213084399700165, "learning_rate": 5e-06, "logits/chosen": -36316472.615384616, "logits/rejected": -22782487.272727273, "logps/chosen": -447.64896334134613, "logps/rejected": -874.220703125, "loss": 0.0107, "rewards/chosen": 6.842833298903245, "rewards/margins": 20.307765827312338, "rewards/rejected": -13.464932528409092, "step": 1382 }, { "epoch": 0.3790598876250514, "grad_norm": 7.46875, "kl": 1.2391414642333984, "learning_rate": 5e-06, "logits/chosen": -10205222.857142856, "logits/rejected": -9450368.0, "logps/chosen": -385.673828125, "logps/rejected": -441.63857421875, "loss": 0.0427, "rewards/chosen": 6.154412405831473, "rewards/margins": 13.412180655343192, "rewards/rejected": -7.257768249511718, "step": 1383 }, { "epoch": 0.3793339728655612, "grad_norm": 4.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -33911207.11111111, "logits/rejected": -13058011.733333332, "logps/chosen": -400.8806966145833, "logps/rejected": -427.82255859375, "loss": 0.0143, "rewards/chosen": 7.18104722764757, "rewards/margins": 17.738263617621527, "rewards/rejected": -10.557216389973958, "step": 1384 }, { "epoch": 0.379608058106071, "grad_norm": 9.4375, "kl": 0.0013427734375, "learning_rate": 5e-06, "logits/chosen": -15569785.6, "logits/rejected": -11483112.0, "logps/chosen": -404.3706787109375, "logps/rejected": -402.08642578125, "loss": 0.0501, "rewards/chosen": 6.231559371948242, "rewards/margins": 13.765058408464704, "rewards/rejected": -7.533499036516462, "step": 1385 }, { "epoch": 0.37988214334658077, "grad_norm": 4.46875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -2147298.4615384615, "logits/rejected": -16964609.454545453, "logps/chosen": -378.64633413461536, "logps/rejected": -603.8248401988636, "loss": 0.0267, "rewards/chosen": 5.670162494365986, "rewards/margins": 15.652236191542832, "rewards/rejected": -9.982073697176846, "step": 1386 }, { "epoch": 0.3801562285870906, "grad_norm": 18.125, "kl": 8.916831016540527, "learning_rate": 5e-06, "logits/chosen": -24932721.454545453, "logits/rejected": 55292219.07692308, "logps/chosen": -388.6292613636364, "logps/rejected": -586.9148137019231, "loss": 0.1303, "rewards/chosen": 5.100325150923296, "rewards/margins": 15.713935051764643, "rewards/rejected": -10.613609900841347, "step": 1387 }, { "epoch": 0.3804303138276004, "grad_norm": 8.125, "kl": 4.023049831390381, "learning_rate": 5e-06, "logits/chosen": -33439646.11764706, "logits/rejected": -12284657.142857144, "logps/chosen": -527.9079733455883, "logps/rejected": -607.30224609375, "loss": 0.051, "rewards/chosen": 7.876656924977022, "rewards/margins": 16.607018863453582, "rewards/rejected": -8.730361938476562, "step": 1388 }, { "epoch": 0.38070439906811016, "grad_norm": 6.625, "kl": 2.3660855293273926, "learning_rate": 5e-06, "logits/chosen": -22813740.307692308, "logits/rejected": -13124648.727272727, "logps/chosen": -387.0304987980769, "logps/rejected": -509.43412642045456, "loss": 0.0861, "rewards/chosen": 6.212790269118089, "rewards/margins": 16.17036459329245, "rewards/rejected": -9.957574324174361, "step": 1389 }, { "epoch": 0.38097848430861997, "grad_norm": 3.28125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -11514995.692307692, "logits/rejected": -8128618.181818182, "logps/chosen": -364.83529897836536, "logps/rejected": -401.13725142045456, "loss": 0.0185, "rewards/chosen": 6.449990492600661, "rewards/margins": 16.177859006228147, "rewards/rejected": -9.727868513627486, "step": 1390 }, { "epoch": 0.3812525695491298, "grad_norm": 12.125, "kl": 4.009106636047363, "learning_rate": 5e-06, "logits/chosen": 48565058.90909091, "logits/rejected": -45311271.384615384, "logps/chosen": -494.5110973011364, "logps/rejected": -435.9802809495192, "loss": 0.0496, "rewards/chosen": 6.272986672141335, "rewards/margins": 13.433048435024448, "rewards/rejected": -7.160061762883113, "step": 1391 }, { "epoch": 0.3815266547896396, "grad_norm": 8.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -25986397.333333332, "logits/rejected": -11705772.0, "logps/chosen": -319.29233805338544, "logps/rejected": -414.7865397135417, "loss": 0.0442, "rewards/chosen": 6.384930928548177, "rewards/margins": 13.787354787190754, "rewards/rejected": -7.402423858642578, "step": 1392 }, { "epoch": 0.38180074003014935, "grad_norm": 5.46875, "kl": 9.212395668029785, "learning_rate": 5e-06, "logits/chosen": -17544302.933333334, "logits/rejected": -38258378.666666664, "logps/chosen": -512.2736002604166, "logps/rejected": -572.9869791666666, "loss": 0.0939, "rewards/chosen": 5.774121602376302, "rewards/margins": 15.125226338704426, "rewards/rejected": -9.351104736328125, "step": 1393 }, { "epoch": 0.38207482527065917, "grad_norm": 5.34375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -28904883.2, "logits/rejected": -36532700.44444445, "logps/chosen": -485.22447916666664, "logps/rejected": -563.7375759548611, "loss": 0.0156, "rewards/chosen": 6.482096354166667, "rewards/margins": 18.07865227593316, "rewards/rejected": -11.596555921766493, "step": 1394 }, { "epoch": 0.382348910511169, "grad_norm": 5.53125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -10705168.666666666, "logits/rejected": -10249010.0, "logps/chosen": -441.7935384114583, "logps/rejected": -628.5597330729166, "loss": 0.016, "rewards/chosen": 7.558779398600261, "rewards/margins": 15.805034637451172, "rewards/rejected": -8.246255238850912, "step": 1395 }, { "epoch": 0.3826229957516788, "grad_norm": 8.75, "kl": 7.114432334899902, "learning_rate": 5e-06, "logits/chosen": -42048029.538461536, "logits/rejected": -18811610.181818184, "logps/chosen": -326.1633112980769, "logps/rejected": -493.24507279829544, "loss": 0.0377, "rewards/chosen": 7.006379934457632, "rewards/margins": 15.437968540858556, "rewards/rejected": -8.431588606400924, "step": 1396 }, { "epoch": 0.38289708099218855, "grad_norm": 1.6796875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -21719678.222222224, "logits/rejected": -23319820.8, "logps/chosen": -419.68861219618054, "logps/rejected": -646.9729817708334, "loss": 0.0037, "rewards/chosen": 7.737933688693577, "rewards/margins": 19.3145511203342, "rewards/rejected": -11.576617431640624, "step": 1397 }, { "epoch": 0.38317116623269837, "grad_norm": 8.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -11433850.0, "logits/rejected": -1957782.125, "logps/chosen": -350.4543762207031, "logps/rejected": -609.0047607421875, "loss": 0.0213, "rewards/chosen": 5.920838832855225, "rewards/margins": 14.65678358078003, "rewards/rejected": -8.735944747924805, "step": 1398 }, { "epoch": 0.3834452514732082, "grad_norm": 4.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 3641027.5555555555, "logits/rejected": -7396843.733333333, "logps/chosen": -303.7442220052083, "logps/rejected": -724.3945963541667, "loss": 0.0228, "rewards/chosen": 5.072980244954427, "rewards/margins": 17.202144877115884, "rewards/rejected": -12.129164632161459, "step": 1399 }, { "epoch": 0.38371933671371794, "grad_norm": 9.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -20140606.222222224, "logits/rejected": -15172292.266666668, "logps/chosen": -562.4630533854166, "logps/rejected": -479.6453450520833, "loss": 0.0699, "rewards/chosen": 6.297349294026692, "rewards/margins": 13.762322235107423, "rewards/rejected": -7.464972941080729, "step": 1400 }, { "epoch": 0.38399342195422775, "grad_norm": 14.25, "kl": 1.3475902080535889, "learning_rate": 5e-06, "logits/chosen": 16433543.272727273, "logits/rejected": -608617.8461538461, "logps/chosen": -352.2791193181818, "logps/rejected": -440.7605543870192, "loss": 0.0852, "rewards/chosen": 3.7790308865633877, "rewards/margins": 11.704532783348244, "rewards/rejected": -7.925501896784856, "step": 1401 }, { "epoch": 0.38426750719473757, "grad_norm": 8.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -9733805.714285715, "logits/rejected": -24229008.0, "logps/chosen": -422.49776785714283, "logps/rejected": -435.386669921875, "loss": 0.0317, "rewards/chosen": 4.980261121477399, "rewards/margins": 12.659009443010603, "rewards/rejected": -7.678748321533203, "step": 1402 }, { "epoch": 0.3845415924352474, "grad_norm": 5.84375, "kl": 1.2645753622055054, "learning_rate": 5e-06, "logits/chosen": -17009901.09090909, "logits/rejected": -22763913.846153848, "logps/chosen": -341.24174360795456, "logps/rejected": -588.1577899639423, "loss": 0.0403, "rewards/chosen": 5.793030478737571, "rewards/margins": 15.367644783500193, "rewards/rejected": -9.57461430476262, "step": 1403 }, { "epoch": 0.38481567767575714, "grad_norm": 5.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -31081606.4, "logits/rejected": -14136518.857142856, "logps/chosen": -438.66708984375, "logps/rejected": -461.45567103794644, "loss": 0.0284, "rewards/chosen": 5.904724884033203, "rewards/margins": 13.803697531563895, "rewards/rejected": -7.898972647530692, "step": 1404 }, { "epoch": 0.38508976291626695, "grad_norm": 13.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -31141169.454545453, "logits/rejected": -31546112.0, "logps/chosen": -470.2242542613636, "logps/rejected": -516.6022385817307, "loss": 0.0318, "rewards/chosen": 6.536177201704546, "rewards/margins": 18.61775068803267, "rewards/rejected": -12.081573486328125, "step": 1405 }, { "epoch": 0.38536384815677677, "grad_norm": 6.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -14639452.8, "logits/rejected": 26190873.14285714, "logps/chosen": -381.5597412109375, "logps/rejected": -513.1459612165179, "loss": 0.0498, "rewards/chosen": 5.216278457641602, "rewards/margins": 15.561525998796736, "rewards/rejected": -10.345247541155134, "step": 1406 }, { "epoch": 0.3856379333972866, "grad_norm": 4.78125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -20138573.714285713, "logits/rejected": -25199166.11764706, "logps/chosen": -419.7700892857143, "logps/rejected": -490.9120519301471, "loss": 0.0412, "rewards/chosen": 5.785412924630301, "rewards/margins": 14.967668389071939, "rewards/rejected": -9.182255464441637, "step": 1407 }, { "epoch": 0.38591201863779634, "grad_norm": 7.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -28348374.4, "logits/rejected": -19506324.57142857, "logps/chosen": -591.6673828125, "logps/rejected": -440.2857142857143, "loss": 0.0296, "rewards/chosen": 7.85906982421875, "rewards/margins": 16.827218191964285, "rewards/rejected": -8.968148367745536, "step": 1408 }, { "epoch": 0.38618610387830615, "grad_norm": 11.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 7159299.428571428, "logits/rejected": -19971766.588235293, "logps/chosen": -489.5111607142857, "logps/rejected": -512.6785960477941, "loss": 0.039, "rewards/chosen": 5.497883387974331, "rewards/margins": 13.902515956333705, "rewards/rejected": -8.404632568359375, "step": 1409 }, { "epoch": 0.38646018911881597, "grad_norm": 11.0625, "kl": 8.206087112426758, "learning_rate": 5e-06, "logits/chosen": -33406006.0, "logits/rejected": -7040979.5, "logps/chosen": -398.5121154785156, "logps/rejected": -559.1600952148438, "loss": 0.0661, "rewards/chosen": 6.292716979980469, "rewards/margins": 13.778829574584961, "rewards/rejected": -7.486112594604492, "step": 1410 }, { "epoch": 0.3867342743593257, "grad_norm": 9.3125, "kl": 6.8136115074157715, "learning_rate": 5e-06, "logits/chosen": -19231750.4, "logits/rejected": -16003850.666666666, "logps/chosen": -343.5164388020833, "logps/rejected": -518.9685872395834, "loss": 0.0617, "rewards/chosen": 5.851529947916666, "rewards/margins": 13.387879774305556, "rewards/rejected": -7.536349826388889, "step": 1411 }, { "epoch": 0.38700835959983554, "grad_norm": 12.1875, "kl": 12.79141902923584, "learning_rate": 5e-06, "logits/chosen": -12059554.133333333, "logits/rejected": -29063504.0, "logps/chosen": -377.70625, "logps/rejected": -592.2458767361111, "loss": 0.0584, "rewards/chosen": 6.876853434244792, "rewards/margins": 17.14130859375, "rewards/rejected": -10.264455159505209, "step": 1412 }, { "epoch": 0.38728244484034535, "grad_norm": 3.203125, "kl": 1.6815261840820312, "learning_rate": 5e-06, "logits/chosen": -58631099.428571425, "logits/rejected": -36261612.8, "logps/chosen": -450.61586216517856, "logps/rejected": -618.961767578125, "loss": 0.0119, "rewards/chosen": 6.43743896484375, "rewards/margins": 16.019895172119142, "rewards/rejected": -9.58245620727539, "step": 1413 }, { "epoch": 0.38755653008085517, "grad_norm": 4.65625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -15847315.2, "logits/rejected": -27325570.285714287, "logps/chosen": -342.9879150390625, "logps/rejected": -616.8536551339286, "loss": 0.0274, "rewards/chosen": 6.1737007141113285, "rewards/margins": 15.54806420462472, "rewards/rejected": -9.374363490513392, "step": 1414 }, { "epoch": 0.3878306153213649, "grad_norm": 8.6875, "kl": 8.707789421081543, "learning_rate": 5e-06, "logits/chosen": -19554310.85714286, "logits/rejected": 3619711.2, "logps/chosen": -442.80873325892856, "logps/rejected": -423.546630859375, "loss": 0.064, "rewards/chosen": 6.673303876604352, "rewards/margins": 13.897506604875836, "rewards/rejected": -7.224202728271484, "step": 1415 }, { "epoch": 0.38810470056187474, "grad_norm": 6.09375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -7865761.142857143, "logits/rejected": -12082646.4, "logps/chosen": -541.0916922433036, "logps/rejected": -528.327001953125, "loss": 0.019, "rewards/chosen": 7.124297550746372, "rewards/margins": 15.25155508858817, "rewards/rejected": -8.127257537841796, "step": 1416 }, { "epoch": 0.38837878580238455, "grad_norm": 9.75, "kl": 5.503966808319092, "learning_rate": 5e-06, "logits/chosen": -47028122.18181818, "logits/rejected": -22637587.692307692, "logps/chosen": -474.98393110795456, "logps/rejected": -493.7075946514423, "loss": 0.0895, "rewards/chosen": 6.810101595791903, "rewards/margins": 12.153839271385353, "rewards/rejected": -5.34373767559345, "step": 1417 }, { "epoch": 0.38865287104289437, "grad_norm": 4.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -22519178.666666668, "logits/rejected": -14820861.333333334, "logps/chosen": -531.2034098307291, "logps/rejected": -636.7543131510416, "loss": 0.0101, "rewards/chosen": 6.227511723836263, "rewards/margins": 16.882418314615887, "rewards/rejected": -10.654906590779623, "step": 1418 }, { "epoch": 0.3889269562834041, "grad_norm": 5.46875, "kl": 1.9965922832489014, "learning_rate": 5e-06, "logits/chosen": -9062376.0, "logits/rejected": -6835682.0, "logps/chosen": -395.8440348307292, "logps/rejected": -521.3203125, "loss": 0.0191, "rewards/chosen": 6.174262364705403, "rewards/margins": 13.873225529988606, "rewards/rejected": -7.698963165283203, "step": 1419 }, { "epoch": 0.38920104152391394, "grad_norm": 6.90625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -22347016.533333335, "logits/rejected": -31988721.777777776, "logps/chosen": -360.98984375, "logps/rejected": -406.25474717881946, "loss": 0.044, "rewards/chosen": 5.7815800984700525, "rewards/margins": 15.438178507486981, "rewards/rejected": -9.656598409016928, "step": 1420 }, { "epoch": 0.38947512676442375, "grad_norm": 7.90625, "kl": 4.616863250732422, "learning_rate": 5e-06, "logits/chosen": -30900057.14285714, "logits/rejected": 20677763.2, "logps/chosen": -436.92794363839283, "logps/rejected": -547.849951171875, "loss": 0.0322, "rewards/chosen": 6.030820574079241, "rewards/margins": 14.604752458844866, "rewards/rejected": -8.573931884765624, "step": 1421 }, { "epoch": 0.3897492120049335, "grad_norm": 2.359375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -16453453.866666667, "logits/rejected": -26936094.222222224, "logps/chosen": -479.99290364583334, "logps/rejected": -573.5221354166666, "loss": 0.0033, "rewards/chosen": 7.086372884114583, "rewards/margins": 19.061302693684894, "rewards/rejected": -11.974929809570312, "step": 1422 }, { "epoch": 0.3900232972454433, "grad_norm": 11.25, "kl": 2.436372756958008, "learning_rate": 5e-06, "logits/chosen": -6977084.0, "logits/rejected": -34996189.333333336, "logps/chosen": -447.3065592447917, "logps/rejected": -422.8838704427083, "loss": 0.0303, "rewards/chosen": 6.295797983805339, "rewards/margins": 14.933187484741211, "rewards/rejected": -8.637389500935873, "step": 1423 }, { "epoch": 0.39029738248595314, "grad_norm": 8.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -32476570.181818184, "logits/rejected": -3153703.3846153845, "logps/chosen": -433.28231534090907, "logps/rejected": -675.8703425480769, "loss": 0.0293, "rewards/chosen": 6.067646373401988, "rewards/margins": 15.876363954343995, "rewards/rejected": -9.808717580942007, "step": 1424 }, { "epoch": 0.39057146772646295, "grad_norm": 17.75, "kl": 1.9658699035644531, "learning_rate": 5e-06, "logits/chosen": -12233398.153846154, "logits/rejected": 32754670.545454547, "logps/chosen": -582.0058218149038, "logps/rejected": -558.1749378551136, "loss": 0.0454, "rewards/chosen": 5.968829815204327, "rewards/margins": 16.03259213320859, "rewards/rejected": -10.063762318004262, "step": 1425 }, { "epoch": 0.3908455529669727, "grad_norm": 8.0, "kl": 9.317862510681152, "learning_rate": 5e-06, "logits/chosen": -19722677.333333332, "logits/rejected": -32237528.888888888, "logps/chosen": -525.9533203125, "logps/rejected": -459.78868272569446, "loss": 0.0503, "rewards/chosen": 7.321242268880209, "rewards/margins": 15.89762437608507, "rewards/rejected": -8.57638210720486, "step": 1426 }, { "epoch": 0.3911196382074825, "grad_norm": 8.6875, "kl": 8.646275520324707, "learning_rate": 5e-06, "logits/chosen": -26821293.17647059, "logits/rejected": -13473360.0, "logps/chosen": -454.1703239889706, "logps/rejected": -393.92947823660717, "loss": 0.0316, "rewards/chosen": 6.1007223690257355, "rewards/margins": 13.657848550491973, "rewards/rejected": -7.557126181466239, "step": 1427 }, { "epoch": 0.39139372344799234, "grad_norm": 10.4375, "kl": 2.6792781352996826, "learning_rate": 5e-06, "logits/chosen": -28234188.307692308, "logits/rejected": -37368037.81818182, "logps/chosen": -423.0764723557692, "logps/rejected": -706.1218039772727, "loss": 0.0417, "rewards/chosen": 5.288563654972957, "rewards/margins": 15.350997071166137, "rewards/rejected": -10.062433416193182, "step": 1428 }, { "epoch": 0.39166780868850215, "grad_norm": 7.625, "kl": 2.2425589561462402, "learning_rate": 5e-06, "logits/chosen": -1235513.142857143, "logits/rejected": -13479128.470588235, "logps/chosen": -473.5034877232143, "logps/rejected": -491.32077205882354, "loss": 0.023, "rewards/chosen": 7.6078289576939175, "rewards/margins": 16.325391464874524, "rewards/rejected": -8.717562507180606, "step": 1429 }, { "epoch": 0.3919418939290119, "grad_norm": 9.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -29087056.0, "logits/rejected": -6828335.0, "logps/chosen": -385.7794189453125, "logps/rejected": -546.1878051757812, "loss": 0.0306, "rewards/chosen": 5.247454643249512, "rewards/margins": 14.41151237487793, "rewards/rejected": -9.164057731628418, "step": 1430 }, { "epoch": 0.3922159791695217, "grad_norm": 6.40625, "kl": 0.22409455478191376, "learning_rate": 5e-06, "logits/chosen": -20071770.181818184, "logits/rejected": 25014614.153846152, "logps/chosen": -416.4949396306818, "logps/rejected": -558.9255934495193, "loss": 0.0337, "rewards/chosen": 5.883283441716975, "rewards/margins": 16.937212697275868, "rewards/rejected": -11.053929255558895, "step": 1431 }, { "epoch": 0.39249006441003154, "grad_norm": 9.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -6168787.076923077, "logits/rejected": -24468468.363636363, "logps/chosen": -432.57132662259613, "logps/rejected": -499.22554154829544, "loss": 0.0362, "rewards/chosen": 6.501321645883413, "rewards/margins": 16.599509179175318, "rewards/rejected": -10.098187533291904, "step": 1432 }, { "epoch": 0.3927641496505413, "grad_norm": 3.734375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -16884940.0, "logits/rejected": -14892088.0, "logps/chosen": -368.162353515625, "logps/rejected": -387.3690999348958, "loss": 0.0209, "rewards/chosen": 6.306844711303711, "rewards/margins": 13.33728535970052, "rewards/rejected": -7.03044064839681, "step": 1433 }, { "epoch": 0.3930382348910511, "grad_norm": 3.96875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -17802244.0, "logits/rejected": -19308810.0, "logps/chosen": -460.7379455566406, "logps/rejected": -510.0834045410156, "loss": 0.0137, "rewards/chosen": 6.377381324768066, "rewards/margins": 15.60499095916748, "rewards/rejected": -9.227609634399414, "step": 1434 }, { "epoch": 0.3933123201315609, "grad_norm": 55.5, "kl": 1.450218915939331, "learning_rate": 5e-06, "logits/chosen": -23854823.111111112, "logits/rejected": -6188038.666666667, "logps/chosen": -410.9397786458333, "logps/rejected": -627.9956868489584, "loss": 0.057, "rewards/chosen": 5.33441162109375, "rewards/margins": 11.596467971801758, "rewards/rejected": -6.262056350708008, "step": 1435 }, { "epoch": 0.39358640537207074, "grad_norm": 10.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -18941240.888888888, "logits/rejected": -6673364.8, "logps/chosen": -433.24004448784723, "logps/rejected": -559.98359375, "loss": 0.0504, "rewards/chosen": 6.035040537516276, "rewards/margins": 16.0218630472819, "rewards/rejected": -9.986822509765625, "step": 1436 }, { "epoch": 0.3938604906125805, "grad_norm": 6.34375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -12548608.0, "logits/rejected": -34081949.09090909, "logps/chosen": -459.91248497596155, "logps/rejected": -491.638671875, "loss": 0.0137, "rewards/chosen": 6.2793438251201925, "rewards/margins": 15.476913238738799, "rewards/rejected": -9.197569413618607, "step": 1437 }, { "epoch": 0.3941345758530903, "grad_norm": 2.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -14534262.0, "logits/rejected": -15378716.0, "logps/chosen": -373.912109375, "logps/rejected": -461.9368591308594, "loss": 0.0106, "rewards/chosen": 6.300682544708252, "rewards/margins": 14.191205501556396, "rewards/rejected": -7.8905229568481445, "step": 1438 }, { "epoch": 0.3944086610936001, "grad_norm": 4.5, "kl": 2.3463454246520996, "learning_rate": 5e-06, "logits/chosen": -15791783.111111112, "logits/rejected": -23825480.533333335, "logps/chosen": -546.93505859375, "logps/rejected": -489.9173177083333, "loss": 0.0109, "rewards/chosen": 6.098035176595052, "rewards/margins": 14.933752950032552, "rewards/rejected": -8.8357177734375, "step": 1439 }, { "epoch": 0.39468274633410994, "grad_norm": 2.59375, "kl": 9.409567832946777, "learning_rate": 5e-06, "logits/chosen": -12557673.846153846, "logits/rejected": 15575792.0, "logps/chosen": -422.6071589543269, "logps/rejected": -383.36452414772725, "loss": 0.0409, "rewards/chosen": 7.500178997333233, "rewards/margins": 15.378815444199354, "rewards/rejected": -7.878636446866122, "step": 1440 }, { "epoch": 0.3949568315746197, "grad_norm": 4.78125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 7068266.0, "logits/rejected": -28445317.333333332, "logps/chosen": -408.2491861979167, "logps/rejected": -403.8436686197917, "loss": 0.0254, "rewards/chosen": 6.308460871378581, "rewards/margins": 13.265715281168621, "rewards/rejected": -6.957254409790039, "step": 1441 }, { "epoch": 0.3952309168151295, "grad_norm": 5.71875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -34314231.27272727, "logits/rejected": -29510390.153846152, "logps/chosen": -412.84956498579544, "logps/rejected": -573.2711087740385, "loss": 0.0336, "rewards/chosen": 6.0770111083984375, "rewards/margins": 15.802318866436298, "rewards/rejected": -9.72530775803786, "step": 1442 }, { "epoch": 0.3955050020556393, "grad_norm": 3.28125, "kl": 5.1361541748046875, "learning_rate": 5e-06, "logits/chosen": -15218531.2, "logits/rejected": -34328086.85714286, "logps/chosen": -382.60498046875, "logps/rejected": -538.9275948660714, "loss": 0.0121, "rewards/chosen": 7.175220489501953, "rewards/margins": 17.16269302368164, "rewards/rejected": -9.987472534179688, "step": 1443 }, { "epoch": 0.3957790872961491, "grad_norm": 1.1640625, "kl": 6.151179313659668, "learning_rate": 5e-06, "logits/chosen": -39765316.92307692, "logits/rejected": -17213381.818181816, "logps/chosen": -460.6415264423077, "logps/rejected": -649.7746803977273, "loss": 0.004, "rewards/chosen": 7.516977750338041, "rewards/margins": 17.8018752945053, "rewards/rejected": -10.284897544167258, "step": 1444 }, { "epoch": 0.3960531725366589, "grad_norm": 7.09375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 21204821.818181816, "logits/rejected": 496978.76923076925, "logps/chosen": -406.0511363636364, "logps/rejected": -513.6083233173077, "loss": 0.0292, "rewards/chosen": 6.10753700949929, "rewards/margins": 13.484957875071707, "rewards/rejected": -7.377420865572416, "step": 1445 }, { "epoch": 0.3963272577771687, "grad_norm": 9.625, "kl": 4.7978034019470215, "learning_rate": 5e-06, "logits/chosen": -23328679.384615384, "logits/rejected": -34616273.45454545, "logps/chosen": -394.95519080528845, "logps/rejected": -510.11039595170456, "loss": 0.0709, "rewards/chosen": 5.499660785381611, "rewards/margins": 13.846459075287505, "rewards/rejected": -8.346798289905895, "step": 1446 }, { "epoch": 0.3966013430176785, "grad_norm": 8.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -1411829.4285714286, "logits/rejected": -2949816.0, "logps/chosen": -447.2909458705357, "logps/rejected": -506.774169921875, "loss": 0.1075, "rewards/chosen": 4.000745500837054, "rewards/margins": 15.395313371930804, "rewards/rejected": -11.39456787109375, "step": 1447 }, { "epoch": 0.3968754282581883, "grad_norm": 2.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 1754096.6666666667, "logits/rejected": -19670812.444444444, "logps/chosen": -556.821044921875, "logps/rejected": -558.7339409722222, "loss": 0.0084, "rewards/chosen": 7.009188334147136, "rewards/margins": 16.81033155653212, "rewards/rejected": -9.801143222384983, "step": 1448 }, { "epoch": 0.3971495134986981, "grad_norm": 5.09375, "kl": 12.061675071716309, "learning_rate": 5e-06, "logits/chosen": -22063474.0, "logits/rejected": -24597092.0, "logps/chosen": -359.2086486816406, "logps/rejected": -385.9973449707031, "loss": 0.0639, "rewards/chosen": 6.684747695922852, "rewards/margins": 16.034339904785156, "rewards/rejected": -9.349592208862305, "step": 1449 }, { "epoch": 0.3974235987392079, "grad_norm": 9.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 7487710.545454546, "logits/rejected": -25872679.384615384, "logps/chosen": -421.15926846590907, "logps/rejected": -502.6572265625, "loss": 0.0208, "rewards/chosen": 6.651611328125, "rewards/margins": 16.94122783954327, "rewards/rejected": -10.28961651141827, "step": 1450 }, { "epoch": 0.3976976839797177, "grad_norm": 2.8125, "kl": 0.4695243835449219, "learning_rate": 5e-06, "logits/chosen": -21610115.2, "logits/rejected": -4513142.285714285, "logps/chosen": -414.314208984375, "logps/rejected": -399.8384486607143, "loss": 0.0095, "rewards/chosen": 6.281748962402344, "rewards/margins": 13.931868198939732, "rewards/rejected": -7.650119236537388, "step": 1451 }, { "epoch": 0.3979717692202275, "grad_norm": 9.0, "kl": 6.9360175132751465, "learning_rate": 5e-06, "logits/chosen": 8568599.384615384, "logits/rejected": -23706859.636363637, "logps/chosen": -424.1638746995192, "logps/rejected": -531.8884055397727, "loss": 0.0647, "rewards/chosen": 5.8361640343299275, "rewards/margins": 12.902629238742215, "rewards/rejected": -7.066465204412287, "step": 1452 }, { "epoch": 0.3982458544607373, "grad_norm": 3.359375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -25886621.09090909, "logits/rejected": -9216925.538461538, "logps/chosen": -396.11430220170456, "logps/rejected": -620.1935096153846, "loss": 0.0084, "rewards/chosen": 6.38185605135831, "rewards/margins": 18.712037466622732, "rewards/rejected": -12.330181415264423, "step": 1453 }, { "epoch": 0.3985199397012471, "grad_norm": 1.9296875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -646973.5384615385, "logits/rejected": -11282706.909090908, "logps/chosen": -413.7139423076923, "logps/rejected": -451.2281605113636, "loss": 0.0087, "rewards/chosen": 5.46990966796875, "rewards/margins": 13.383010864257812, "rewards/rejected": -7.9131011962890625, "step": 1454 }, { "epoch": 0.39879402494175686, "grad_norm": 7.96875, "kl": 8.33979606628418, "learning_rate": 5e-06, "logits/chosen": -22244800.0, "logits/rejected": -24258067.2, "logps/chosen": -515.4043666294643, "logps/rejected": -580.246875, "loss": 0.0503, "rewards/chosen": 8.21474838256836, "rewards/margins": 19.101691436767577, "rewards/rejected": -10.886943054199218, "step": 1455 }, { "epoch": 0.3990681101822667, "grad_norm": 5.96875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -49840153.6, "logits/rejected": 5585209.142857143, "logps/chosen": -458.6103515625, "logps/rejected": -358.1227329799107, "loss": 0.0336, "rewards/chosen": 6.82957763671875, "rewards/margins": 13.573753683907645, "rewards/rejected": -6.744176047188895, "step": 1456 }, { "epoch": 0.3993421954227765, "grad_norm": 8.6875, "kl": 0.7789306640625, "learning_rate": 5e-06, "logits/chosen": -17194382.666666668, "logits/rejected": -22968320.0, "logps/chosen": -497.4375813802083, "logps/rejected": -624.2766520182291, "loss": 0.0178, "rewards/chosen": 6.036853790283203, "rewards/margins": 17.87268956502279, "rewards/rejected": -11.835835774739584, "step": 1457 }, { "epoch": 0.3996162806632863, "grad_norm": 13.0, "kl": 5.994259834289551, "learning_rate": 5e-06, "logits/chosen": 3536754.909090909, "logits/rejected": -18464868.923076924, "logps/chosen": -408.98876953125, "logps/rejected": -404.9997746394231, "loss": 0.0374, "rewards/chosen": 5.698380556973544, "rewards/margins": 16.849491866318495, "rewards/rejected": -11.151111309344952, "step": 1458 }, { "epoch": 0.39989036590379606, "grad_norm": 10.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -32262365.09090909, "logits/rejected": -10437982.76923077, "logps/chosen": -392.85542436079544, "logps/rejected": -472.2626201923077, "loss": 0.0723, "rewards/chosen": 4.708522103049538, "rewards/margins": 13.660202213100622, "rewards/rejected": -8.951680110051083, "step": 1459 }, { "epoch": 0.4001644511443059, "grad_norm": 9.4375, "kl": 0.9056529998779297, "learning_rate": 5e-06, "logits/chosen": -17837296.0, "logits/rejected": -22912341.333333332, "logps/chosen": -299.2662353515625, "logps/rejected": -448.9315592447917, "loss": 0.0694, "rewards/chosen": 4.599858283996582, "rewards/margins": 14.52905241648356, "rewards/rejected": -9.929194132486979, "step": 1460 }, { "epoch": 0.4004385363848157, "grad_norm": 6.96875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -2834306.0, "logits/rejected": -33643650.13333333, "logps/chosen": -443.73499891493054, "logps/rejected": -584.9385416666667, "loss": 0.017, "rewards/chosen": 4.902130550808376, "rewards/margins": 17.25531556871202, "rewards/rejected": -12.353185017903646, "step": 1461 }, { "epoch": 0.40071262162532545, "grad_norm": 4.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -12723380.57142857, "logits/rejected": 3826140.0, "logps/chosen": -311.38441685267856, "logps/rejected": -631.50673828125, "loss": 0.0819, "rewards/chosen": 4.947787693568638, "rewards/margins": 11.385347965785435, "rewards/rejected": -6.437560272216797, "step": 1462 }, { "epoch": 0.40098670686583526, "grad_norm": 7.46875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -9677306.666666666, "logits/rejected": -13461651.2, "logps/chosen": -425.11078559027777, "logps/rejected": -459.22486979166666, "loss": 0.0402, "rewards/chosen": 5.371241675482856, "rewards/margins": 14.55075725979275, "rewards/rejected": -9.179515584309895, "step": 1463 }, { "epoch": 0.4012607921063451, "grad_norm": 9.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -29695066.666666668, "logits/rejected": -6879804.8, "logps/chosen": -440.9660915798611, "logps/rejected": -550.978515625, "loss": 0.0201, "rewards/chosen": 7.373270670572917, "rewards/margins": 17.880476888020834, "rewards/rejected": -10.507206217447917, "step": 1464 }, { "epoch": 0.4015348773468549, "grad_norm": 13.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 21727227.076923076, "logits/rejected": -24089861.818181816, "logps/chosen": -515.1975661057693, "logps/rejected": -403.83598188920456, "loss": 0.0531, "rewards/chosen": 6.092559227576623, "rewards/margins": 14.058143722427475, "rewards/rejected": -7.9655844948508525, "step": 1465 }, { "epoch": 0.40180896258736465, "grad_norm": 3.703125, "kl": 1.713127851486206, "learning_rate": 5e-06, "logits/chosen": -14022167.384615384, "logits/rejected": -16301797.818181818, "logps/chosen": -368.4157151442308, "logps/rejected": -565.6483931107955, "loss": 0.0227, "rewards/chosen": 5.291444631723257, "rewards/margins": 14.111966593282206, "rewards/rejected": -8.82052196155895, "step": 1466 }, { "epoch": 0.40208304782787446, "grad_norm": 14.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -10145942.76923077, "logits/rejected": -14381547.636363637, "logps/chosen": -375.4387770432692, "logps/rejected": -491.78959517045456, "loss": 0.0704, "rewards/chosen": 4.754027733435998, "rewards/margins": 13.056617149939903, "rewards/rejected": -8.302589416503906, "step": 1467 }, { "epoch": 0.4023571330683843, "grad_norm": 7.03125, "kl": 8.001840591430664, "learning_rate": 5e-06, "logits/chosen": -8703178.666666666, "logits/rejected": -1943296.0, "logps/chosen": -452.34534505208336, "logps/rejected": -670.1793619791666, "loss": 0.0296, "rewards/chosen": 7.0906824747721355, "rewards/margins": 17.51414269341363, "rewards/rejected": -10.423460218641493, "step": 1468 }, { "epoch": 0.4026312183088941, "grad_norm": 4.28125, "kl": 3.2544476985931396, "learning_rate": 5e-06, "logits/chosen": -2026193.3333333333, "logits/rejected": -20799697.333333332, "logps/chosen": -441.83203125, "logps/rejected": -389.2533365885417, "loss": 0.0279, "rewards/chosen": 7.457075754801433, "rewards/margins": 15.907121022542317, "rewards/rejected": -8.450045267740885, "step": 1469 }, { "epoch": 0.40290530354940385, "grad_norm": 8.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -24432529.066666666, "logits/rejected": -31654762.666666668, "logps/chosen": -483.0033854166667, "logps/rejected": -527.1343315972222, "loss": 0.0273, "rewards/chosen": 6.5596923828125, "rewards/margins": 15.178263346354166, "rewards/rejected": -8.618570963541666, "step": 1470 }, { "epoch": 0.40317938878991366, "grad_norm": 5.59375, "kl": 2.2591662406921387, "learning_rate": 5e-06, "logits/chosen": -12075089.066666666, "logits/rejected": -15396216.888888888, "logps/chosen": -407.08141276041664, "logps/rejected": -423.15863715277777, "loss": 0.0252, "rewards/chosen": 7.035185241699219, "rewards/margins": 16.038982984754774, "rewards/rejected": -9.003797743055555, "step": 1471 }, { "epoch": 0.4034534740304235, "grad_norm": 3.015625, "kl": 3.760878324508667, "learning_rate": 5e-06, "logits/chosen": -19786685.53846154, "logits/rejected": -7276115.636363637, "logps/chosen": -428.9979717548077, "logps/rejected": -501.9641779119318, "loss": 0.0095, "rewards/chosen": 5.708309467022236, "rewards/margins": 15.015982834609238, "rewards/rejected": -9.307673367587002, "step": 1472 }, { "epoch": 0.40372755927093323, "grad_norm": 6.84375, "kl": 1.3622945547103882, "learning_rate": 5e-06, "logits/chosen": -37337422.76923077, "logits/rejected": -8294416.0, "logps/chosen": -471.9011793870192, "logps/rejected": -687.3347389914773, "loss": 0.0163, "rewards/chosen": 6.989632239708533, "rewards/margins": 18.310638321029558, "rewards/rejected": -11.321006081321023, "step": 1473 }, { "epoch": 0.40400164451144305, "grad_norm": 5.21875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 11554678.666666666, "logits/rejected": -37342432.0, "logps/chosen": -484.5870361328125, "logps/rejected": -583.251953125, "loss": 0.0156, "rewards/chosen": 5.775282541910808, "rewards/margins": 16.623162587483723, "rewards/rejected": -10.847880045572916, "step": 1474 }, { "epoch": 0.40427572975195286, "grad_norm": 4.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -26104448.0, "logits/rejected": 14657998.666666666, "logps/chosen": -421.9873046875, "logps/rejected": -673.696533203125, "loss": 0.0214, "rewards/chosen": 7.211241404215495, "rewards/margins": 20.237263997395832, "rewards/rejected": -13.026022593180338, "step": 1475 }, { "epoch": 0.4045498149924627, "grad_norm": 3.640625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -24809742.545454547, "logits/rejected": -19864475.076923076, "logps/chosen": -431.6033824573864, "logps/rejected": -449.24744591346155, "loss": 0.0198, "rewards/chosen": 6.759264859286222, "rewards/margins": 15.053899164800043, "rewards/rejected": -8.294634305513823, "step": 1476 }, { "epoch": 0.40482390023297243, "grad_norm": 7.78125, "kl": 1.7629013061523438, "learning_rate": 5e-06, "logits/chosen": 15727074.461538462, "logits/rejected": -18217482.181818184, "logps/chosen": -315.05618990384613, "logps/rejected": -458.24360795454544, "loss": 0.0329, "rewards/chosen": 4.903113145094651, "rewards/margins": 13.611186154238826, "rewards/rejected": -8.708073009144176, "step": 1477 }, { "epoch": 0.40509798547348225, "grad_norm": 9.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -13248793.6, "logits/rejected": -16671290.666666666, "logps/chosen": -411.79072265625, "logps/rejected": -399.70684136284723, "loss": 0.0356, "rewards/chosen": 5.835392761230469, "rewards/margins": 12.593606228298611, "rewards/rejected": -6.758213467068142, "step": 1478 }, { "epoch": 0.40537207071399206, "grad_norm": 7.53125, "kl": 3.5350100994110107, "learning_rate": 5e-06, "logits/chosen": 1542597.6363636365, "logits/rejected": -1824450.4615384615, "logps/chosen": -368.49101118607956, "logps/rejected": -541.4662710336538, "loss": 0.0354, "rewards/chosen": 6.519221912730824, "rewards/margins": 14.773482636138276, "rewards/rejected": -8.254260723407452, "step": 1479 }, { "epoch": 0.4056461559545019, "grad_norm": 3.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -4878536.363636363, "logits/rejected": -20892512.0, "logps/chosen": -420.54398970170456, "logps/rejected": -611.9679612379807, "loss": 0.0129, "rewards/chosen": 6.449182683771307, "rewards/margins": 17.816755174756885, "rewards/rejected": -11.367572490985577, "step": 1480 }, { "epoch": 0.40592024119501163, "grad_norm": 6.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -966379.5, "logits/rejected": -6144220.0, "logps/chosen": -266.6871337890625, "logps/rejected": -547.8576049804688, "loss": 0.0296, "rewards/chosen": 4.181737899780273, "rewards/margins": 14.400148391723633, "rewards/rejected": -10.21841049194336, "step": 1481 }, { "epoch": 0.40619432643552145, "grad_norm": 9.375, "kl": 3.554900884628296, "learning_rate": 5e-06, "logits/chosen": -21535918.0, "logits/rejected": -28316708.0, "logps/chosen": -478.109130859375, "logps/rejected": -551.4567260742188, "loss": 0.0211, "rewards/chosen": 6.583198547363281, "rewards/margins": 15.032108306884766, "rewards/rejected": -8.448909759521484, "step": 1482 }, { "epoch": 0.40646841167603126, "grad_norm": 6.53125, "kl": 4.1687445640563965, "learning_rate": 5e-06, "logits/chosen": -29411104.0, "logits/rejected": 13486664.0, "logps/chosen": -496.2179361979167, "logps/rejected": -661.8578694661459, "loss": 0.023, "rewards/chosen": 6.555273691813151, "rewards/margins": 19.48746617635091, "rewards/rejected": -12.93219248453776, "step": 1483 }, { "epoch": 0.406742496916541, "grad_norm": 6.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 9548754.857142856, "logits/rejected": -12293316.705882354, "logps/chosen": -468.99142020089283, "logps/rejected": -707.2580422794117, "loss": 0.0161, "rewards/chosen": 6.234718322753906, "rewards/margins": 16.49249132941751, "rewards/rejected": -10.257773006663603, "step": 1484 }, { "epoch": 0.40701658215705083, "grad_norm": 8.125, "kl": 1.4382604360580444, "learning_rate": 5e-06, "logits/chosen": -26548958.11764706, "logits/rejected": -11544485.714285715, "logps/chosen": -450.20651424632354, "logps/rejected": -513.9938616071429, "loss": 0.0389, "rewards/chosen": 6.181376737706802, "rewards/margins": 13.13003636207901, "rewards/rejected": -6.94865962437221, "step": 1485 }, { "epoch": 0.40729066739756065, "grad_norm": 6.125, "kl": 0.13907623291015625, "learning_rate": 5e-06, "logits/chosen": -18245578.285714287, "logits/rejected": -30096371.2, "logps/chosen": -426.79209681919644, "logps/rejected": -309.5555419921875, "loss": 0.03, "rewards/chosen": 5.954307556152344, "rewards/margins": 13.038497161865234, "rewards/rejected": -7.084189605712891, "step": 1486 }, { "epoch": 0.40756475263807046, "grad_norm": 7.15625, "kl": 6.838308334350586, "learning_rate": 5e-06, "logits/chosen": -22732484.57142857, "logits/rejected": -35827315.2, "logps/chosen": -399.14390345982144, "logps/rejected": -506.925, "loss": 0.0238, "rewards/chosen": 7.1687180655343195, "rewards/margins": 15.067055620465961, "rewards/rejected": -7.898337554931641, "step": 1487 }, { "epoch": 0.4078388378785802, "grad_norm": 3.625, "kl": 3.84302020072937, "learning_rate": 5e-06, "logits/chosen": -27730457.6, "logits/rejected": -28825831.111111112, "logps/chosen": -371.37513020833336, "logps/rejected": -512.0144314236111, "loss": 0.0221, "rewards/chosen": 6.535305786132812, "rewards/margins": 14.960977681477864, "rewards/rejected": -8.425671895345053, "step": 1488 }, { "epoch": 0.40811292311909003, "grad_norm": 6.28125, "kl": 19.619991302490234, "learning_rate": 5e-06, "logits/chosen": -22912812.0, "logits/rejected": -47752260.0, "logps/chosen": -590.8394165039062, "logps/rejected": -510.35308837890625, "loss": 0.0327, "rewards/chosen": 8.189374923706055, "rewards/margins": 16.432146072387695, "rewards/rejected": -8.24277114868164, "step": 1489 }, { "epoch": 0.40838700835959985, "grad_norm": 8.5625, "kl": 1.9583232402801514, "learning_rate": 5e-06, "logits/chosen": -32802346.666666668, "logits/rejected": -25454204.444444444, "logps/chosen": -406.66412760416665, "logps/rejected": -522.8013237847222, "loss": 0.0369, "rewards/chosen": 5.968517049153646, "rewards/margins": 15.763225301106772, "rewards/rejected": -9.794708251953125, "step": 1490 }, { "epoch": 0.40866109360010966, "grad_norm": 15.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -29992251.076923076, "logits/rejected": -11183658.181818182, "logps/chosen": -490.65478515625, "logps/rejected": -564.2613636363636, "loss": 0.0504, "rewards/chosen": 6.267730126014123, "rewards/margins": 14.67549143971263, "rewards/rejected": -8.407761313698508, "step": 1491 }, { "epoch": 0.4089351788406194, "grad_norm": 1.5390625, "kl": 0.7082545161247253, "learning_rate": 5e-06, "logits/chosen": -20883075.555555556, "logits/rejected": -10106859.733333332, "logps/chosen": -437.23963758680554, "logps/rejected": -509.076953125, "loss": 0.0049, "rewards/chosen": 5.979043748643663, "rewards/margins": 14.628331671820746, "rewards/rejected": -8.649287923177083, "step": 1492 }, { "epoch": 0.40920926408112923, "grad_norm": 7.46875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -39260433.777777776, "logits/rejected": -12391978.666666666, "logps/chosen": -475.2283528645833, "logps/rejected": -485.5102864583333, "loss": 0.0186, "rewards/chosen": 8.768072340223524, "rewards/margins": 17.70952741834852, "rewards/rejected": -8.941455078125, "step": 1493 }, { "epoch": 0.40948334932163905, "grad_norm": 2.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 7358335.5, "logits/rejected": -37390452.0, "logps/chosen": -345.9652404785156, "logps/rejected": -407.53759765625, "loss": 0.0051, "rewards/chosen": 7.745540142059326, "rewards/margins": 16.702767848968506, "rewards/rejected": -8.95722770690918, "step": 1494 }, { "epoch": 0.4097574345621488, "grad_norm": 8.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -17997334.153846152, "logits/rejected": 22877950.545454547, "logps/chosen": -411.39107572115387, "logps/rejected": -775.2912819602273, "loss": 0.0589, "rewards/chosen": 5.380047137920673, "rewards/margins": 18.568201878687717, "rewards/rejected": -13.188154740767045, "step": 1495 }, { "epoch": 0.4100315198026586, "grad_norm": 2.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -10091312.0, "logits/rejected": -30931919.05882353, "logps/chosen": -419.42923409598217, "logps/rejected": -580.7510340073529, "loss": 0.0069, "rewards/chosen": 5.6043597630092075, "rewards/margins": 14.940406767260125, "rewards/rejected": -9.336047004250918, "step": 1496 }, { "epoch": 0.41030560504316843, "grad_norm": 10.8125, "kl": 24.467987060546875, "learning_rate": 5e-06, "logits/chosen": -27822548.210526317, "logits/rejected": -57046400.0, "logps/chosen": -404.20703125, "logps/rejected": -479.74892578125, "loss": 0.1638, "rewards/chosen": 6.576140554327714, "rewards/margins": 15.41725307263826, "rewards/rejected": -8.841112518310547, "step": 1497 }, { "epoch": 0.41057969028367824, "grad_norm": 7.0625, "kl": 4.12349271774292, "learning_rate": 5e-06, "logits/chosen": -19401602.46153846, "logits/rejected": 4737989.818181818, "logps/chosen": -362.48959585336536, "logps/rejected": -616.3498757102273, "loss": 0.0383, "rewards/chosen": 6.122836773212139, "rewards/margins": 15.679570338109157, "rewards/rejected": -9.556733564897018, "step": 1498 }, { "epoch": 0.410853775524188, "grad_norm": 5.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -19378572.307692308, "logits/rejected": -1236792.7272727273, "logps/chosen": -412.8322566105769, "logps/rejected": -567.1394708806819, "loss": 0.0329, "rewards/chosen": 5.464794452373798, "rewards/margins": 15.144147646177066, "rewards/rejected": -9.679353193803268, "step": 1499 }, { "epoch": 0.4111278607646978, "grad_norm": 5.53125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 43300372.0, "logits/rejected": -3350165.0, "logps/chosen": -314.4950866699219, "logps/rejected": -479.03204345703125, "loss": 0.0425, "rewards/chosen": 5.32094669342041, "rewards/margins": 14.228443145751953, "rewards/rejected": -8.907496452331543, "step": 1500 }, { "epoch": 0.41140194600520763, "grad_norm": 4.90625, "kl": 4.028920650482178, "learning_rate": 5e-06, "logits/chosen": -19877596.0, "logits/rejected": -22079952.0, "logps/chosen": -386.90496826171875, "logps/rejected": -554.68994140625, "loss": 0.0415, "rewards/chosen": 6.13759183883667, "rewards/margins": 19.0024094581604, "rewards/rejected": -12.86481761932373, "step": 1501 }, { "epoch": 0.41167603124571744, "grad_norm": 7.03125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 14974318.76923077, "logits/rejected": -1621306.5454545454, "logps/chosen": -497.54770132211536, "logps/rejected": -655.5644975142045, "loss": 0.0254, "rewards/chosen": 6.851729759803185, "rewards/margins": 16.111745074078755, "rewards/rejected": -9.260015314275568, "step": 1502 }, { "epoch": 0.4119501164862272, "grad_norm": 5.71875, "kl": 4.905412197113037, "learning_rate": 5e-06, "logits/chosen": 2976966.4, "logits/rejected": -22766444.444444444, "logps/chosen": -342.87298177083335, "logps/rejected": -566.9479166666666, "loss": 0.0485, "rewards/chosen": 5.4793650309244795, "rewards/margins": 14.509288363986546, "rewards/rejected": -9.029923333062065, "step": 1503 }, { "epoch": 0.412224201726737, "grad_norm": 6.5, "kl": 0.9857572317123413, "learning_rate": 5e-06, "logits/chosen": -11935246.76923077, "logits/rejected": -9187056.727272727, "logps/chosen": -405.27249849759613, "logps/rejected": -377.3447265625, "loss": 0.0606, "rewards/chosen": 5.332678574782151, "rewards/margins": 10.871634610049373, "rewards/rejected": -5.538956035267223, "step": 1504 }, { "epoch": 0.41249828696724683, "grad_norm": 7.09375, "kl": 3.498997926712036, "learning_rate": 5e-06, "logits/chosen": -17043323.733333334, "logits/rejected": 19309212.444444444, "logps/chosen": -385.4990234375, "logps/rejected": -610.4763454861111, "loss": 0.042, "rewards/chosen": 6.086223347981771, "rewards/margins": 17.150648498535155, "rewards/rejected": -11.064425150553385, "step": 1505 }, { "epoch": 0.4127723722077566, "grad_norm": 9.3125, "kl": 3.7721385955810547, "learning_rate": 5e-06, "logits/chosen": -16225155.0, "logits/rejected": 2870178.75, "logps/chosen": -301.61871337890625, "logps/rejected": -507.9135437011719, "loss": 0.0898, "rewards/chosen": 5.108923435211182, "rewards/margins": 12.910522937774658, "rewards/rejected": -7.801599502563477, "step": 1506 }, { "epoch": 0.4130464574482664, "grad_norm": 4.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -11265241.846153846, "logits/rejected": -40194164.36363637, "logps/chosen": -399.3532527043269, "logps/rejected": -685.3951526988636, "loss": 0.0233, "rewards/chosen": 6.226615905761719, "rewards/margins": 17.16845633766868, "rewards/rejected": -10.94184043190696, "step": 1507 }, { "epoch": 0.4133205426887762, "grad_norm": 5.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 9563902.153846154, "logits/rejected": -2692076.0, "logps/chosen": -483.16312349759613, "logps/rejected": -568.1131036931819, "loss": 0.0195, "rewards/chosen": 5.880338228665865, "rewards/margins": 15.491435391085965, "rewards/rejected": -9.6110971624201, "step": 1508 }, { "epoch": 0.41359462792928603, "grad_norm": 3.265625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -19665776.0, "logits/rejected": 42146592.0, "logps/chosen": -390.9180908203125, "logps/rejected": -755.7329915364584, "loss": 0.0123, "rewards/chosen": 5.982678731282552, "rewards/margins": 22.25913365681966, "rewards/rejected": -16.27645492553711, "step": 1509 }, { "epoch": 0.4138687131697958, "grad_norm": 7.125, "kl": 0.5511068105697632, "learning_rate": 5e-06, "logits/chosen": 559068.3636363636, "logits/rejected": -25917304.615384616, "logps/chosen": -451.97749467329544, "logps/rejected": -488.25304236778845, "loss": 0.0384, "rewards/chosen": 6.067348133433949, "rewards/margins": 14.870933479362435, "rewards/rejected": -8.803585345928486, "step": 1510 }, { "epoch": 0.4141427984103056, "grad_norm": 10.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 4478400.307692308, "logits/rejected": 29143296.0, "logps/chosen": -477.3567082331731, "logps/rejected": -563.5612571022727, "loss": 0.0824, "rewards/chosen": 6.380731435922476, "rewards/margins": 18.13871039543952, "rewards/rejected": -11.757978959517045, "step": 1511 }, { "epoch": 0.4144168836508154, "grad_norm": 2.390625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -11917360.0, "logits/rejected": -11555668.0, "logps/chosen": -467.65011160714283, "logps/rejected": -548.305078125, "loss": 0.0077, "rewards/chosen": 7.38763918195452, "rewards/margins": 16.255243246895926, "rewards/rejected": -8.867604064941407, "step": 1512 }, { "epoch": 0.41469096889132523, "grad_norm": 8.75, "kl": 2.128582000732422, "learning_rate": 5e-06, "logits/chosen": -7216981.333333333, "logits/rejected": -17470984.0, "logps/chosen": -393.8875325520833, "logps/rejected": -451.8509928385417, "loss": 0.0275, "rewards/chosen": 6.308730443318685, "rewards/margins": 13.404128392537434, "rewards/rejected": -7.09539794921875, "step": 1513 }, { "epoch": 0.414965054131835, "grad_norm": 5.40625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -10821109.714285715, "logits/rejected": -20953388.8, "logps/chosen": -364.69918387276783, "logps/rejected": -487.754638671875, "loss": 0.0173, "rewards/chosen": 6.099061148507254, "rewards/margins": 14.400100272042412, "rewards/rejected": -8.301039123535157, "step": 1514 }, { "epoch": 0.4152391393723448, "grad_norm": 14.1875, "kl": 5.584901809692383, "learning_rate": 5e-06, "logits/chosen": -38928392.0, "logits/rejected": -23322200.0, "logps/chosen": -643.406982421875, "logps/rejected": -411.2641296386719, "loss": 0.0431, "rewards/chosen": 7.77677583694458, "rewards/margins": 17.172772884368896, "rewards/rejected": -9.395997047424316, "step": 1515 }, { "epoch": 0.4155132246128546, "grad_norm": 3.296875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -22475081.14285714, "logits/rejected": -13483364.705882354, "logps/chosen": -435.79603794642856, "logps/rejected": -439.60061465992646, "loss": 0.0099, "rewards/chosen": 6.608120509556362, "rewards/margins": 15.054312633867024, "rewards/rejected": -8.446192124310661, "step": 1516 }, { "epoch": 0.4157873098533644, "grad_norm": 8.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -6479731.692307692, "logits/rejected": -22550773.818181816, "logps/chosen": -343.9499699519231, "logps/rejected": -628.3488991477273, "loss": 0.0511, "rewards/chosen": 5.263839134803185, "rewards/margins": 17.39558906821938, "rewards/rejected": -12.131749933416193, "step": 1517 }, { "epoch": 0.4160613950938742, "grad_norm": 5.84375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -15653361.333333334, "logits/rejected": -2495365.0, "logps/chosen": -383.0970458984375, "logps/rejected": -692.6321614583334, "loss": 0.016, "rewards/chosen": 6.279998143513997, "rewards/margins": 16.005977630615234, "rewards/rejected": -9.725979487101236, "step": 1518 }, { "epoch": 0.416335480334384, "grad_norm": 11.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -5363676.7272727275, "logits/rejected": 9905513.846153846, "logps/chosen": -451.9601384943182, "logps/rejected": -409.0895432692308, "loss": 0.0447, "rewards/chosen": 6.084763960404829, "rewards/margins": 12.914734100128387, "rewards/rejected": -6.8299701397235575, "step": 1519 }, { "epoch": 0.4166095655748938, "grad_norm": 7.375, "kl": 1.7349803447723389, "learning_rate": 5e-06, "logits/chosen": -2002430.0, "logits/rejected": -25306748.8, "logps/chosen": -393.39488002232144, "logps/rejected": -520.854150390625, "loss": 0.0236, "rewards/chosen": 5.899818965366909, "rewards/margins": 15.790530177525113, "rewards/rejected": -9.890711212158203, "step": 1520 }, { "epoch": 0.4168836508154036, "grad_norm": 8.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -7214011.636363637, "logits/rejected": -17185025.230769232, "logps/chosen": -367.2659357244318, "logps/rejected": -476.76412259615387, "loss": 0.0304, "rewards/chosen": 6.266568270596591, "rewards/margins": 14.203864037573755, "rewards/rejected": -7.937295766977163, "step": 1521 }, { "epoch": 0.4171577360559134, "grad_norm": 15.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 18128888.615384616, "logits/rejected": -27855115.636363637, "logps/chosen": -407.41515174278845, "logps/rejected": -576.9181019176136, "loss": 0.0819, "rewards/chosen": 4.783447852501502, "rewards/margins": 13.971576237178347, "rewards/rejected": -9.188128384676846, "step": 1522 }, { "epoch": 0.4174318212964232, "grad_norm": 6.90625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -14389132.444444444, "logits/rejected": 208934.66666666666, "logps/chosen": -358.13970269097223, "logps/rejected": -605.3841145833334, "loss": 0.0504, "rewards/chosen": 5.708697848849827, "rewards/margins": 15.97867668999566, "rewards/rejected": -10.269978841145834, "step": 1523 }, { "epoch": 0.417705906536933, "grad_norm": 12.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -8771157.538461538, "logits/rejected": -23733809.454545453, "logps/chosen": -439.9362980769231, "logps/rejected": -596.68798828125, "loss": 0.041, "rewards/chosen": 7.441099900465745, "rewards/margins": 17.36575200007512, "rewards/rejected": -9.924652099609375, "step": 1524 }, { "epoch": 0.41797999177744277, "grad_norm": 10.0625, "kl": 5.265926361083984, "learning_rate": 5e-06, "logits/chosen": -29577064.727272727, "logits/rejected": -15021830.153846154, "logps/chosen": -470.3328746448864, "logps/rejected": -410.3132136418269, "loss": 0.0346, "rewards/chosen": 8.790005770596592, "rewards/margins": 16.50916364976576, "rewards/rejected": -7.71915787916917, "step": 1525 }, { "epoch": 0.4182540770179526, "grad_norm": 5.53125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 41148643.55555555, "logits/rejected": 4779276.8, "logps/chosen": -552.0703667534722, "logps/rejected": -629.9243489583333, "loss": 0.0095, "rewards/chosen": 6.657067616780599, "rewards/margins": 19.003048451741535, "rewards/rejected": -12.345980834960937, "step": 1526 }, { "epoch": 0.4185281622584624, "grad_norm": 7.875, "kl": 4.155570983886719, "learning_rate": 5e-06, "logits/chosen": -31699589.818181816, "logits/rejected": -10352164.923076924, "logps/chosen": -484.9675958806818, "logps/rejected": -558.7210036057693, "loss": 0.0301, "rewards/chosen": 6.803590947931463, "rewards/margins": 16.236626578377678, "rewards/rejected": -9.433035630446215, "step": 1527 }, { "epoch": 0.41880224749897216, "grad_norm": 13.6875, "kl": 10.14826774597168, "learning_rate": 5e-06, "logits/chosen": -2949269.846153846, "logits/rejected": 3963788.3636363638, "logps/chosen": -355.0744816706731, "logps/rejected": -486.1534534801136, "loss": 0.1219, "rewards/chosen": 5.580959613506611, "rewards/margins": 14.338928862885162, "rewards/rejected": -8.75796924937855, "step": 1528 }, { "epoch": 0.41907633273948197, "grad_norm": 4.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -3103815.3846153845, "logits/rejected": -23953794.90909091, "logps/chosen": -396.3659855769231, "logps/rejected": -652.7680220170455, "loss": 0.0188, "rewards/chosen": 7.042715219350962, "rewards/margins": 16.974785864769995, "rewards/rejected": -9.932070645419033, "step": 1529 }, { "epoch": 0.4193504179799918, "grad_norm": 2.140625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -34180800.0, "logits/rejected": -32232066.666666668, "logps/chosen": -329.18017578125, "logps/rejected": -532.3974609375, "loss": 0.007, "rewards/chosen": 6.306168874104817, "rewards/margins": 15.654330571492512, "rewards/rejected": -9.348161697387695, "step": 1530 }, { "epoch": 0.4196245032205016, "grad_norm": 17.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -129105.77777777778, "logits/rejected": 8079260.8, "logps/chosen": -586.6809353298611, "logps/rejected": -533.5797526041666, "loss": 0.0544, "rewards/chosen": 6.875941806369358, "rewards/margins": 17.07572207980686, "rewards/rejected": -10.1997802734375, "step": 1531 }, { "epoch": 0.41989858846101136, "grad_norm": 3.578125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -26282437.818181816, "logits/rejected": 72607350.15384616, "logps/chosen": -416.50577059659093, "logps/rejected": -671.8360877403846, "loss": 0.0071, "rewards/chosen": 6.39859355579723, "rewards/margins": 19.85296038647632, "rewards/rejected": -13.454366830679087, "step": 1532 }, { "epoch": 0.42017267370152117, "grad_norm": 10.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 36929521.23076923, "logits/rejected": -11025502.545454545, "logps/chosen": -495.0349308894231, "logps/rejected": -596.2455610795455, "loss": 0.063, "rewards/chosen": 5.008951040414663, "rewards/margins": 15.239949686543925, "rewards/rejected": -10.230998646129262, "step": 1533 }, { "epoch": 0.420446758942031, "grad_norm": 7.1875, "kl": 6.9141340255737305, "learning_rate": 5e-06, "logits/chosen": 24656373.333333332, "logits/rejected": -9297787.733333332, "logps/chosen": -581.9512261284722, "logps/rejected": -613.9294921875, "loss": 0.0428, "rewards/chosen": 7.181610955132379, "rewards/margins": 14.871751742892794, "rewards/rejected": -7.690140787760416, "step": 1534 }, { "epoch": 0.42072084418254074, "grad_norm": 5.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -8804599.384615384, "logits/rejected": -27936471.272727273, "logps/chosen": -485.7540940504808, "logps/rejected": -486.20485617897725, "loss": 0.0283, "rewards/chosen": 5.834888164813702, "rewards/margins": 14.079300673691543, "rewards/rejected": -8.244412508877842, "step": 1535 }, { "epoch": 0.42099492942305056, "grad_norm": 11.0, "kl": 5.880061149597168, "learning_rate": 5e-06, "logits/chosen": 25260430.769230768, "logits/rejected": -27633367.272727273, "logps/chosen": -461.5110051081731, "logps/rejected": -425.02974076704544, "loss": 0.0836, "rewards/chosen": 5.5699638953575725, "rewards/margins": 12.50929356288243, "rewards/rejected": -6.939329667524858, "step": 1536 }, { "epoch": 0.42126901466356037, "grad_norm": 7.0625, "kl": 6.828696250915527, "learning_rate": 5e-06, "logits/chosen": -36197970.666666664, "logits/rejected": -22511424.0, "logps/chosen": -477.1971028645833, "logps/rejected": -509.8732503255208, "loss": 0.0365, "rewards/chosen": 6.798379262288411, "rewards/margins": 16.467273076375324, "rewards/rejected": -9.668893814086914, "step": 1537 }, { "epoch": 0.4215430999040702, "grad_norm": 8.0625, "kl": 7.717883110046387, "learning_rate": 5e-06, "logits/chosen": -42604224.0, "logits/rejected": -33150080.0, "logps/chosen": -479.38162667410717, "logps/rejected": -471.97060546875, "loss": 0.0628, "rewards/chosen": 7.218311309814453, "rewards/margins": 16.33539810180664, "rewards/rejected": -9.117086791992188, "step": 1538 }, { "epoch": 0.42181718514457994, "grad_norm": 6.8125, "kl": 6.664118766784668, "learning_rate": 5e-06, "logits/chosen": -13320206.545454545, "logits/rejected": -16031707.076923076, "logps/chosen": -345.1785333806818, "logps/rejected": -425.72547325721155, "loss": 0.0339, "rewards/chosen": 5.876386469060725, "rewards/margins": 11.174193428946541, "rewards/rejected": -5.2978069598858175, "step": 1539 }, { "epoch": 0.42209127038508976, "grad_norm": 10.8125, "kl": 4.454648017883301, "learning_rate": 5e-06, "logits/chosen": -17350840.470588237, "logits/rejected": -10353230.857142856, "logps/chosen": -408.49046415441177, "logps/rejected": -409.13145228794644, "loss": 0.0404, "rewards/chosen": 6.552255069508272, "rewards/margins": 15.165470924698004, "rewards/rejected": -8.613215855189733, "step": 1540 }, { "epoch": 0.42236535562559957, "grad_norm": 4.84375, "kl": 0.8794390559196472, "learning_rate": 5e-06, "logits/chosen": -3727641.8181818184, "logits/rejected": -6911583.384615385, "logps/chosen": -357.27903053977275, "logps/rejected": -440.46837439903845, "loss": 0.0185, "rewards/chosen": 6.032072587446733, "rewards/margins": 13.407716977846373, "rewards/rejected": -7.375644390399639, "step": 1541 }, { "epoch": 0.4226394408661094, "grad_norm": 11.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -17418865.333333332, "logits/rejected": -32748922.666666668, "logps/chosen": -397.7556966145833, "logps/rejected": -578.1730143229166, "loss": 0.0582, "rewards/chosen": 4.7894948323567705, "rewards/margins": 16.398882548014324, "rewards/rejected": -11.609387715657553, "step": 1542 }, { "epoch": 0.42291352610661914, "grad_norm": 15.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -36096128.0, "logits/rejected": -13789058.0, "logps/chosen": -309.9322509765625, "logps/rejected": -575.154052734375, "loss": 0.0497, "rewards/chosen": 4.656304836273193, "rewards/margins": 13.237362384796143, "rewards/rejected": -8.58105754852295, "step": 1543 }, { "epoch": 0.42318761134712896, "grad_norm": 3.859375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -29605749.333333332, "logits/rejected": -21375784.0, "logps/chosen": -495.8406982421875, "logps/rejected": -526.9629720052084, "loss": 0.0129, "rewards/chosen": 6.906485239664714, "rewards/margins": 17.06740125020345, "rewards/rejected": -10.160916010538736, "step": 1544 }, { "epoch": 0.42346169658763877, "grad_norm": 9.4375, "kl": 1.1882305145263672, "learning_rate": 5e-06, "logits/chosen": -15522327.272727273, "logits/rejected": -37734500.92307692, "logps/chosen": -433.25887784090907, "logps/rejected": -568.2223557692307, "loss": 0.0316, "rewards/chosen": 6.3574350530451, "rewards/margins": 18.016937522621422, "rewards/rejected": -11.659502469576323, "step": 1545 }, { "epoch": 0.42373578182814853, "grad_norm": 3.515625, "kl": 7.600671291351318, "learning_rate": 5e-06, "logits/chosen": -23304267.42857143, "logits/rejected": -23849673.6, "logps/chosen": -515.9459751674107, "logps/rejected": -450.842041015625, "loss": 0.0107, "rewards/chosen": 8.595849173409599, "rewards/margins": 16.356603567940848, "rewards/rejected": -7.76075439453125, "step": 1546 }, { "epoch": 0.42400986706865834, "grad_norm": 0.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -20179577.6, "logits/rejected": -12882137.142857144, "logps/chosen": -462.02509765625, "logps/rejected": -656.7594866071429, "loss": 0.0013, "rewards/chosen": 7.603567504882813, "rewards/margins": 20.255046953473773, "rewards/rejected": -12.65147944859096, "step": 1547 }, { "epoch": 0.42428395230916816, "grad_norm": 7.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -7147912.8, "logits/rejected": -13027499.42857143, "logps/chosen": -530.90205078125, "logps/rejected": -538.2541852678571, "loss": 0.0385, "rewards/chosen": 6.426943969726563, "rewards/margins": 17.764019339425225, "rewards/rejected": -11.337075369698661, "step": 1548 }, { "epoch": 0.42455803754967797, "grad_norm": 13.0, "kl": 1.2523658275604248, "learning_rate": 5e-06, "logits/chosen": -902864.4615384615, "logits/rejected": -26773082.181818184, "logps/chosen": -422.50939002403845, "logps/rejected": -414.38449928977275, "loss": 0.0566, "rewards/chosen": 5.604836097130408, "rewards/margins": 16.27329590270569, "rewards/rejected": -10.668459805575283, "step": 1549 }, { "epoch": 0.4248321227901877, "grad_norm": 9.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -14996006.857142856, "logits/rejected": -22476702.4, "logps/chosen": -434.0245884486607, "logps/rejected": -437.3607421875, "loss": 0.0347, "rewards/chosen": 7.004281180245536, "rewards/margins": 15.497458975655693, "rewards/rejected": -8.493177795410157, "step": 1550 }, { "epoch": 0.42510620803069754, "grad_norm": 10.3125, "kl": 8.679424285888672, "learning_rate": 5e-06, "logits/chosen": -13035430.857142856, "logits/rejected": -26929472.0, "logps/chosen": -404.72178431919644, "logps/rejected": -436.378125, "loss": 0.0432, "rewards/chosen": 6.378510611397879, "rewards/margins": 14.26559328351702, "rewards/rejected": -7.88708267211914, "step": 1551 }, { "epoch": 0.42538029327120735, "grad_norm": 10.8125, "kl": 1.219626784324646, "learning_rate": 5e-06, "logits/chosen": -22833469.333333332, "logits/rejected": -25671.333333333332, "logps/chosen": -440.1573486328125, "logps/rejected": -453.225830078125, "loss": 0.0502, "rewards/chosen": 5.702290852864583, "rewards/margins": 14.237310409545898, "rewards/rejected": -8.535019556681315, "step": 1552 }, { "epoch": 0.42565437851171717, "grad_norm": 5.96875, "kl": 0.5507545471191406, "learning_rate": 5e-06, "logits/chosen": -20875306.0, "logits/rejected": -14836620.0, "logps/chosen": -415.559814453125, "logps/rejected": -571.2532958984375, "loss": 0.0169, "rewards/chosen": 6.394080638885498, "rewards/margins": 18.272799015045166, "rewards/rejected": -11.878718376159668, "step": 1553 }, { "epoch": 0.4259284637522269, "grad_norm": 3.796875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -25842684.444444444, "logits/rejected": -24498816.0, "logps/chosen": -430.46630859375, "logps/rejected": -391.51116536458335, "loss": 0.0205, "rewards/chosen": 5.49709235297309, "rewards/margins": 13.227211168077257, "rewards/rejected": -7.730118815104166, "step": 1554 }, { "epoch": 0.42620254899273674, "grad_norm": 9.8125, "kl": 1.4395898580551147, "learning_rate": 5e-06, "logits/chosen": -8721655.384615384, "logits/rejected": 10477371.636363637, "logps/chosen": -482.04447115384613, "logps/rejected": -559.9357688210227, "loss": 0.0275, "rewards/chosen": 5.972091087928185, "rewards/margins": 16.511533670492106, "rewards/rejected": -10.53944258256392, "step": 1555 }, { "epoch": 0.42647663423324655, "grad_norm": 2.703125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -21717734.85714286, "logits/rejected": -33201635.2, "logps/chosen": -462.30747767857144, "logps/rejected": -448.46484375, "loss": 0.0265, "rewards/chosen": 5.7779966081891745, "rewards/margins": 15.035906764439176, "rewards/rejected": -9.25791015625, "step": 1556 }, { "epoch": 0.4267507194737563, "grad_norm": 9.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -3719223.272727273, "logits/rejected": -33130646.153846152, "logps/chosen": -535.8848544034091, "logps/rejected": -486.3874699519231, "loss": 0.0477, "rewards/chosen": 8.232005726207387, "rewards/margins": 16.519433775148194, "rewards/rejected": -8.287428048940805, "step": 1557 }, { "epoch": 0.4270248047142661, "grad_norm": 6.46875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -20912887.111111112, "logits/rejected": -25442094.933333334, "logps/chosen": -289.104736328125, "logps/rejected": -583.4145182291667, "loss": 0.0447, "rewards/chosen": 4.333204905192058, "rewards/margins": 15.025232696533202, "rewards/rejected": -10.692027791341145, "step": 1558 }, { "epoch": 0.42729888995477594, "grad_norm": 3.84375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -29526067.2, "logits/rejected": -23657299.555555556, "logps/chosen": -496.2544270833333, "logps/rejected": -914.4874131944445, "loss": 0.0127, "rewards/chosen": 6.553559366861979, "rewards/margins": 21.393811713324652, "rewards/rejected": -14.840252346462673, "step": 1559 }, { "epoch": 0.42757297519528575, "grad_norm": 6.375, "kl": 1.7694952487945557, "learning_rate": 5e-06, "logits/chosen": -11552968.888888888, "logits/rejected": -24912558.933333334, "logps/chosen": -368.04155815972223, "logps/rejected": -435.56064453125, "loss": 0.0442, "rewards/chosen": 6.150076548258464, "rewards/margins": 14.718037160237632, "rewards/rejected": -8.567960611979167, "step": 1560 }, { "epoch": 0.4278470604357955, "grad_norm": 7.46875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -20692624.0, "logits/rejected": -26528910.4, "logps/chosen": -437.61328125, "logps/rejected": -654.0646484375, "loss": 0.0274, "rewards/chosen": 6.245622907366071, "rewards/margins": 16.753797040666853, "rewards/rejected": -10.50817413330078, "step": 1561 }, { "epoch": 0.4281211456763053, "grad_norm": 8.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -30765444.923076924, "logits/rejected": -1327318.6363636365, "logps/chosen": -423.52110877403845, "logps/rejected": -454.47274502840907, "loss": 0.0447, "rewards/chosen": 5.297383235051082, "rewards/margins": 13.21655881654966, "rewards/rejected": -7.919175581498579, "step": 1562 }, { "epoch": 0.42839523091681514, "grad_norm": 15.25, "kl": 2.7215564250946045, "learning_rate": 5e-06, "logits/chosen": -20699884.8, "logits/rejected": -8199564.444444444, "logps/chosen": -438.71875, "logps/rejected": -667.3834635416666, "loss": 0.047, "rewards/chosen": 5.93561757405599, "rewards/margins": 16.331015184190537, "rewards/rejected": -10.395397610134548, "step": 1563 }, { "epoch": 0.42866931615732495, "grad_norm": 7.5625, "kl": 1.8192590475082397, "learning_rate": 5e-06, "logits/chosen": -20246765.333333332, "logits/rejected": -11181466.0, "logps/chosen": -385.1337076822917, "logps/rejected": -568.6243489583334, "loss": 0.0686, "rewards/chosen": 5.4161726633707685, "rewards/margins": 12.986188888549805, "rewards/rejected": -7.570016225179036, "step": 1564 }, { "epoch": 0.4289434013978347, "grad_norm": 4.46875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 6769580.444444444, "logits/rejected": -11017386.666666666, "logps/chosen": -455.2392578125, "logps/rejected": -462.1536458333333, "loss": 0.0444, "rewards/chosen": 5.913819207085504, "rewards/margins": 13.481346978081596, "rewards/rejected": -7.567527770996094, "step": 1565 }, { "epoch": 0.4292174866383445, "grad_norm": 9.5625, "kl": 4.43835973739624, "learning_rate": 5e-06, "logits/chosen": -14110095.0, "logits/rejected": -8997012.0, "logps/chosen": -532.1517333984375, "logps/rejected": -464.2311706542969, "loss": 0.0586, "rewards/chosen": 6.498110294342041, "rewards/margins": 14.127247333526611, "rewards/rejected": -7.62913703918457, "step": 1566 }, { "epoch": 0.42949157187885434, "grad_norm": 7.1875, "kl": 1.230910062789917, "learning_rate": 5e-06, "logits/chosen": -8651596.57142857, "logits/rejected": -16544830.4, "logps/chosen": -416.76597377232144, "logps/rejected": -434.637451171875, "loss": 0.0281, "rewards/chosen": 5.276154654366629, "rewards/margins": 14.276842825753349, "rewards/rejected": -9.000688171386718, "step": 1567 }, { "epoch": 0.4297656571193641, "grad_norm": 3.140625, "kl": 3.337705135345459, "learning_rate": 5e-06, "logits/chosen": -28913317.333333332, "logits/rejected": -8583816.0, "logps/chosen": -375.8260498046875, "logps/rejected": -551.7837727864584, "loss": 0.0113, "rewards/chosen": 6.486595153808594, "rewards/margins": 15.02840487162272, "rewards/rejected": -8.541809717814127, "step": 1568 }, { "epoch": 0.4300397423598739, "grad_norm": 8.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -30468480.0, "logits/rejected": -14224932.923076924, "logps/chosen": -417.07594992897725, "logps/rejected": -340.54356971153845, "loss": 0.0738, "rewards/chosen": 5.6888427734375, "rewards/margins": 12.522584181565506, "rewards/rejected": -6.833741408128005, "step": 1569 }, { "epoch": 0.4303138276003837, "grad_norm": 3.34375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -10058890.909090908, "logits/rejected": -15214660.923076924, "logps/chosen": -427.70023970170456, "logps/rejected": -611.5141225961538, "loss": 0.0081, "rewards/chosen": 7.318155462091619, "rewards/margins": 16.024436097045047, "rewards/rejected": -8.706280634953426, "step": 1570 }, { "epoch": 0.43058791284089354, "grad_norm": 3.03125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -10097241.846153846, "logits/rejected": 11669538.909090908, "logps/chosen": -385.2795222355769, "logps/rejected": -588.7693093039773, "loss": 0.0319, "rewards/chosen": 4.551521888146033, "rewards/margins": 18.70226901394504, "rewards/rejected": -14.150747125799006, "step": 1571 }, { "epoch": 0.4308619980814033, "grad_norm": 12.4375, "kl": 1.3918012380599976, "learning_rate": 5e-06, "logits/chosen": -4073968.5714285714, "logits/rejected": -9014650.4, "logps/chosen": -353.56187220982144, "logps/rejected": -602.799609375, "loss": 0.134, "rewards/chosen": 3.9151733943394254, "rewards/margins": 11.6842590876988, "rewards/rejected": -7.769085693359375, "step": 1572 }, { "epoch": 0.4311360833219131, "grad_norm": 7.21875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -25475835.42857143, "logits/rejected": -16630923.294117646, "logps/chosen": -323.25571986607144, "logps/rejected": -591.9396829044117, "loss": 0.0184, "rewards/chosen": 6.9311948503766745, "rewards/margins": 16.565570927467668, "rewards/rejected": -9.634376077090993, "step": 1573 }, { "epoch": 0.4314101685624229, "grad_norm": 7.46875, "kl": 0.5056228637695312, "learning_rate": 5e-06, "logits/chosen": -21206793.846153848, "logits/rejected": -1977051.4545454546, "logps/chosen": -363.48497596153845, "logps/rejected": -559.69140625, "loss": 0.0259, "rewards/chosen": 7.0742011437049275, "rewards/margins": 14.355299942976945, "rewards/rejected": -7.281098799272017, "step": 1574 }, { "epoch": 0.43168425380293274, "grad_norm": 17.875, "kl": 5.256450176239014, "learning_rate": 5e-06, "logits/chosen": -17159120.94117647, "logits/rejected": -16076352.0, "logps/chosen": -394.5378848805147, "logps/rejected": -739.6941964285714, "loss": 0.14, "rewards/chosen": 4.6304621976964615, "rewards/margins": 14.0065946178276, "rewards/rejected": -9.376132420131139, "step": 1575 }, { "epoch": 0.4319583390434425, "grad_norm": 9.625, "kl": 4.344319820404053, "learning_rate": 5e-06, "logits/chosen": -26530278.4, "logits/rejected": 28303500.444444444, "logps/chosen": -482.57018229166664, "logps/rejected": -750.7344835069445, "loss": 0.0698, "rewards/chosen": 5.96619618733724, "rewards/margins": 19.952626037597657, "rewards/rejected": -13.986429850260416, "step": 1576 }, { "epoch": 0.4322324242839523, "grad_norm": 7.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -424526.22222222225, "logits/rejected": -12842584.533333333, "logps/chosen": -313.5080837673611, "logps/rejected": -427.5298177083333, "loss": 0.0371, "rewards/chosen": 5.234290652804905, "rewards/margins": 13.049124230278863, "rewards/rejected": -7.814833577473959, "step": 1577 }, { "epoch": 0.4325065095244621, "grad_norm": 7.59375, "kl": 2.7760372161865234, "learning_rate": 5e-06, "logits/chosen": -13555025.142857144, "logits/rejected": -18968347.2, "logps/chosen": -427.65945870535717, "logps/rejected": -552.38154296875, "loss": 0.0255, "rewards/chosen": 6.15326908656529, "rewards/margins": 17.83082798549107, "rewards/rejected": -11.677558898925781, "step": 1578 }, { "epoch": 0.4327805947649719, "grad_norm": 5.15625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -36134136.615384616, "logits/rejected": 2205537.090909091, "logps/chosen": -360.97329477163464, "logps/rejected": -518.2091619318181, "loss": 0.0343, "rewards/chosen": 5.180013803335337, "rewards/margins": 15.521375802847055, "rewards/rejected": -10.341361999511719, "step": 1579 }, { "epoch": 0.4330546800054817, "grad_norm": 6.96875, "kl": 4.137892723083496, "learning_rate": 5e-06, "logits/chosen": -9889382.4, "logits/rejected": -10085556.57142857, "logps/chosen": -515.211279296875, "logps/rejected": -406.03738839285717, "loss": 0.0329, "rewards/chosen": 5.934810638427734, "rewards/margins": 12.077788761683873, "rewards/rejected": -6.142978123256138, "step": 1580 }, { "epoch": 0.4333287652459915, "grad_norm": 8.5, "kl": 6.2965192794799805, "learning_rate": 5e-06, "logits/chosen": -9774653.714285715, "logits/rejected": 7323669.6, "logps/chosen": -446.53724888392856, "logps/rejected": -575.860302734375, "loss": 0.0332, "rewards/chosen": 5.845665522984096, "rewards/margins": 16.591905757359097, "rewards/rejected": -10.746240234375, "step": 1581 }, { "epoch": 0.4336028504865013, "grad_norm": 1.71875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -1105105.0, "logits/rejected": -32466576.0, "logps/chosen": -472.293701171875, "logps/rejected": -580.6322631835938, "loss": 0.0058, "rewards/chosen": 7.21293830871582, "rewards/margins": 16.623648643493652, "rewards/rejected": -9.410710334777832, "step": 1582 }, { "epoch": 0.4338769357270111, "grad_norm": 7.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -23046554.666666668, "logits/rejected": -11823264.0, "logps/chosen": -419.2908528645833, "logps/rejected": -382.1836751302083, "loss": 0.0224, "rewards/chosen": 4.619925498962402, "rewards/margins": 13.129349708557129, "rewards/rejected": -8.509424209594727, "step": 1583 }, { "epoch": 0.4341510209675209, "grad_norm": 15.1875, "kl": 9.030853271484375, "learning_rate": 5e-06, "logits/chosen": -14699395.2, "logits/rejected": -22527586.285714287, "logps/chosen": -411.340234375, "logps/rejected": -500.1363002232143, "loss": 0.0801, "rewards/chosen": 6.032492065429688, "rewards/margins": 13.947268458775113, "rewards/rejected": -7.9147763933454245, "step": 1584 }, { "epoch": 0.4344251062080307, "grad_norm": 3.46875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -10538158.222222222, "logits/rejected": -5921980.8, "logps/chosen": -475.8049045138889, "logps/rejected": -568.1458333333334, "loss": 0.0096, "rewards/chosen": 6.532720777723524, "rewards/margins": 15.846026441786025, "rewards/rejected": -9.3133056640625, "step": 1585 }, { "epoch": 0.4346991914485405, "grad_norm": 7.8125, "kl": 2.0229835510253906, "learning_rate": 5e-06, "logits/chosen": -7294612.0, "logits/rejected": -8344828.0, "logps/chosen": -451.2098911830357, "logps/rejected": -624.97900390625, "loss": 0.033, "rewards/chosen": 5.6775327410016745, "rewards/margins": 15.443360682896206, "rewards/rejected": -9.765827941894532, "step": 1586 }, { "epoch": 0.4349732766890503, "grad_norm": 5.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -13978626.285714285, "logits/rejected": -13349896.0, "logps/chosen": -395.0674525669643, "logps/rejected": -491.29462890625, "loss": 0.0452, "rewards/chosen": 5.499007088797433, "rewards/margins": 16.71218763078962, "rewards/rejected": -11.213180541992188, "step": 1587 }, { "epoch": 0.4352473619295601, "grad_norm": 9.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -2072552.6153846155, "logits/rejected": -20443649.454545453, "logps/chosen": -337.6418269230769, "logps/rejected": -472.9308416193182, "loss": 0.0396, "rewards/chosen": 4.185869363638071, "rewards/margins": 12.784905840466907, "rewards/rejected": -8.599036476828836, "step": 1588 }, { "epoch": 0.4355214471700699, "grad_norm": 4.71875, "kl": 1.0556329488754272, "learning_rate": 5e-06, "logits/chosen": 6471694.285714285, "logits/rejected": 29640513.88235294, "logps/chosen": -350.63204520089283, "logps/rejected": -572.3281824448529, "loss": 0.0566, "rewards/chosen": 8.143637520926339, "rewards/margins": 18.516521934701615, "rewards/rejected": -10.372884413775276, "step": 1589 }, { "epoch": 0.43579553241057967, "grad_norm": 10.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -27497158.4, "logits/rejected": -16338928.0, "logps/chosen": -546.7263671875, "logps/rejected": -474.30465262276783, "loss": 0.0206, "rewards/chosen": 7.434725952148438, "rewards/margins": 16.715448434012277, "rewards/rejected": -9.280722481863839, "step": 1590 }, { "epoch": 0.4360696176510895, "grad_norm": 7.375, "kl": 0.036284130066633224, "learning_rate": 5e-06, "logits/chosen": 9180269.333333334, "logits/rejected": -28020896.0, "logps/chosen": -377.82568359375, "logps/rejected": -372.40930989583336, "loss": 0.0489, "rewards/chosen": 5.5637622409396705, "rewards/margins": 12.080713229709202, "rewards/rejected": -6.516950988769532, "step": 1591 }, { "epoch": 0.4363437028915993, "grad_norm": 8.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -12558744.8, "logits/rejected": -1304006.857142857, "logps/chosen": -364.173486328125, "logps/rejected": -509.8643275669643, "loss": 0.0622, "rewards/chosen": 5.0812427520751955, "rewards/margins": 16.352591977800643, "rewards/rejected": -11.271349225725446, "step": 1592 }, { "epoch": 0.4366177881321091, "grad_norm": 8.5, "kl": 3.9487390518188477, "learning_rate": 5e-06, "logits/chosen": -18162534.85714286, "logits/rejected": -29886540.8, "logps/chosen": -414.6497279575893, "logps/rejected": -505.68251953125, "loss": 0.0273, "rewards/chosen": 6.518254961286273, "rewards/margins": 15.583546556745256, "rewards/rejected": -9.065291595458984, "step": 1593 }, { "epoch": 0.43689187337261887, "grad_norm": 3.953125, "kl": 1.8361270427703857, "learning_rate": 5e-06, "logits/chosen": -20345783.111111112, "logits/rejected": -26725632.0, "logps/chosen": -444.0055338541667, "logps/rejected": -666.4371744791666, "loss": 0.008, "rewards/chosen": 7.1822967529296875, "rewards/margins": 17.07911071777344, "rewards/rejected": -9.89681396484375, "step": 1594 }, { "epoch": 0.4371659586131287, "grad_norm": 5.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -17922396.444444444, "logits/rejected": -18206350.933333334, "logps/chosen": -417.0534396701389, "logps/rejected": -468.61350911458334, "loss": 0.0221, "rewards/chosen": 6.047190348307292, "rewards/margins": 14.934466552734374, "rewards/rejected": -8.887276204427083, "step": 1595 }, { "epoch": 0.4374400438536385, "grad_norm": 7.09375, "kl": 6.582607269287109, "learning_rate": 5e-06, "logits/chosen": -27982576.0, "logits/rejected": 14815844.0, "logps/chosen": -401.638427734375, "logps/rejected": -727.9290364583334, "loss": 0.0169, "rewards/chosen": 6.889856338500977, "rewards/margins": 18.57656796773275, "rewards/rejected": -11.686711629231771, "step": 1596 }, { "epoch": 0.4377141290941483, "grad_norm": 2.90625, "kl": 4.336249351501465, "learning_rate": 5e-06, "logits/chosen": -19859305.846153848, "logits/rejected": -29676445.09090909, "logps/chosen": -428.9484675480769, "logps/rejected": -592.6337002840909, "loss": 0.0073, "rewards/chosen": 6.669932438777043, "rewards/margins": 15.723300747104457, "rewards/rejected": -9.053368308327414, "step": 1597 }, { "epoch": 0.43798821433465807, "grad_norm": 3.703125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -8397837.333333334, "logits/rejected": -17573800.0, "logps/chosen": -336.4859619140625, "logps/rejected": -511.5308430989583, "loss": 0.0272, "rewards/chosen": 5.384997685750325, "rewards/margins": 16.19845136006673, "rewards/rejected": -10.813453674316406, "step": 1598 }, { "epoch": 0.4382622995751679, "grad_norm": 2.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -24738984.727272727, "logits/rejected": -19300173.53846154, "logps/chosen": -369.61689897017044, "logps/rejected": -574.8435246394231, "loss": 0.0457, "rewards/chosen": 6.369357022372159, "rewards/margins": 16.788028850422037, "rewards/rejected": -10.41867182804988, "step": 1599 }, { "epoch": 0.4385363848156777, "grad_norm": 12.625, "kl": 5.976953029632568, "learning_rate": 5e-06, "logits/chosen": -12034653.538461538, "logits/rejected": -41405765.81818182, "logps/chosen": -380.30333533653845, "logps/rejected": -613.7848011363636, "loss": 0.0403, "rewards/chosen": 6.570397597092849, "rewards/margins": 16.242716489138303, "rewards/rejected": -9.672318892045455, "step": 1600 }, { "epoch": 0.43881047005618745, "grad_norm": 11.4375, "kl": 5.164576530456543, "learning_rate": 5e-06, "logits/chosen": -51269477.333333336, "logits/rejected": -36268021.333333336, "logps/chosen": -350.6538899739583, "logps/rejected": -439.0476888020833, "loss": 0.1183, "rewards/chosen": 5.8839467366536455, "rewards/margins": 13.428728739420572, "rewards/rejected": -7.544782002766927, "step": 1601 }, { "epoch": 0.43908455529669727, "grad_norm": 3.453125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -28004548.923076924, "logits/rejected": -10725927.272727273, "logps/chosen": -351.61485877403845, "logps/rejected": -333.95439009232956, "loss": 0.0365, "rewards/chosen": 4.602257361778846, "rewards/margins": 11.469885285917695, "rewards/rejected": -6.86762792413885, "step": 1602 }, { "epoch": 0.4393586405372071, "grad_norm": 7.875, "kl": 1.4268290996551514, "learning_rate": 5e-06, "logits/chosen": -16060489.6, "logits/rejected": -21778964.57142857, "logps/chosen": -431.728076171875, "logps/rejected": -517.9756905691964, "loss": 0.0493, "rewards/chosen": 5.308440399169922, "rewards/margins": 14.477192796979633, "rewards/rejected": -9.16875239780971, "step": 1603 }, { "epoch": 0.4396327257777169, "grad_norm": 13.25, "kl": 0.9417349696159363, "learning_rate": 5e-06, "logits/chosen": -8169249.230769231, "logits/rejected": -23015761.454545453, "logps/chosen": -301.14013671875, "logps/rejected": -594.4637784090909, "loss": 0.0514, "rewards/chosen": 5.947084280160757, "rewards/margins": 15.255504634830501, "rewards/rejected": -9.308420354669744, "step": 1604 }, { "epoch": 0.43990681101822665, "grad_norm": 4.59375, "kl": 0.8764635920524597, "learning_rate": 5e-06, "logits/chosen": -36448085.333333336, "logits/rejected": -12963608.888888888, "logps/chosen": -437.87877604166664, "logps/rejected": -490.3688151041667, "loss": 0.0144, "rewards/chosen": 6.5087336222330725, "rewards/margins": 17.89163089328342, "rewards/rejected": -11.382897271050346, "step": 1605 }, { "epoch": 0.44018089625873646, "grad_norm": 12.3125, "kl": 9.698051452636719, "learning_rate": 5e-06, "logits/chosen": -7615410.133333334, "logits/rejected": -17646840.888888888, "logps/chosen": -540.3231119791667, "logps/rejected": -424.1411404079861, "loss": 0.0621, "rewards/chosen": 8.096905517578126, "rewards/margins": 17.086988152398003, "rewards/rejected": -8.990082634819878, "step": 1606 }, { "epoch": 0.4404549814992463, "grad_norm": 9.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -50491337.84615385, "logits/rejected": 4594315.636363637, "logps/chosen": -330.6887770432692, "logps/rejected": -425.76558061079544, "loss": 0.0698, "rewards/chosen": 5.220531757061298, "rewards/margins": 13.502035607824793, "rewards/rejected": -8.281503850763494, "step": 1607 }, { "epoch": 0.44072906673975604, "grad_norm": 6.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -15257862.857142856, "logits/rejected": 39383228.23529412, "logps/chosen": -346.62088448660717, "logps/rejected": -546.9765050551471, "loss": 0.035, "rewards/chosen": 7.4061110360281805, "rewards/margins": 19.35273130400842, "rewards/rejected": -11.946620267980238, "step": 1608 }, { "epoch": 0.44100315198026585, "grad_norm": 2.625, "kl": 4.7509260177612305, "learning_rate": 5e-06, "logits/chosen": -35992005.81818182, "logits/rejected": -17769737.846153848, "logps/chosen": -413.6678355823864, "logps/rejected": -416.7546198918269, "loss": 0.0073, "rewards/chosen": 6.766878995028409, "rewards/margins": 16.34737732360413, "rewards/rejected": -9.580498328575722, "step": 1609 }, { "epoch": 0.44127723722077566, "grad_norm": 3.515625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -24966777.14285714, "logits/rejected": -8148878.4, "logps/chosen": -475.26021902901783, "logps/rejected": -639.0001953125, "loss": 0.0107, "rewards/chosen": 5.919411250523159, "rewards/margins": 20.986913081577846, "rewards/rejected": -15.067501831054688, "step": 1610 }, { "epoch": 0.4415513224612855, "grad_norm": 7.5625, "kl": 3.6646170616149902, "learning_rate": 5e-06, "logits/chosen": -8263650.461538462, "logits/rejected": -14555076.363636363, "logps/chosen": -545.4847130408654, "logps/rejected": -457.32151100852275, "loss": 0.0527, "rewards/chosen": 5.945017887995793, "rewards/margins": 14.76159892048869, "rewards/rejected": -8.816581032492898, "step": 1611 }, { "epoch": 0.44182540770179524, "grad_norm": 9.5625, "kl": 3.506129741668701, "learning_rate": 5e-06, "logits/chosen": -21597806.545454547, "logits/rejected": -22176270.769230768, "logps/chosen": -437.64657315340907, "logps/rejected": -523.5126577524038, "loss": 0.0434, "rewards/chosen": 6.445435957475142, "rewards/margins": 16.033998556070394, "rewards/rejected": -9.588562598595253, "step": 1612 }, { "epoch": 0.44209949294230505, "grad_norm": 0.6953125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -27436484.923076924, "logits/rejected": 8069934.545454546, "logps/chosen": -499.4791917067308, "logps/rejected": -538.3486772017045, "loss": 0.0023, "rewards/chosen": 7.95752187875601, "rewards/margins": 19.195862776749617, "rewards/rejected": -11.238340897993607, "step": 1613 }, { "epoch": 0.44237357818281486, "grad_norm": 13.375, "kl": 0.4750315546989441, "learning_rate": 5e-06, "logits/chosen": -14526560.0, "logits/rejected": -18913761.6, "logps/chosen": -353.45703125, "logps/rejected": -471.638623046875, "loss": 0.0456, "rewards/chosen": 6.09002685546875, "rewards/margins": 14.259207153320313, "rewards/rejected": -8.169180297851563, "step": 1614 }, { "epoch": 0.4426476634233247, "grad_norm": 2.609375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -11926714.352941176, "logits/rejected": -17116421.714285713, "logps/chosen": -407.3124425551471, "logps/rejected": -643.5661272321429, "loss": 0.0068, "rewards/chosen": 7.40140084659352, "rewards/margins": 17.3354054939847, "rewards/rejected": -9.934004647391182, "step": 1615 }, { "epoch": 0.44292174866383444, "grad_norm": 9.625, "kl": 0.8935505747795105, "learning_rate": 5e-06, "logits/chosen": -11578714.285714285, "logits/rejected": -24899270.4, "logps/chosen": -432.53271484375, "logps/rejected": -547.163916015625, "loss": 0.0483, "rewards/chosen": 5.193505423409598, "rewards/margins": 17.47497591291155, "rewards/rejected": -12.281470489501952, "step": 1616 }, { "epoch": 0.44319583390434425, "grad_norm": 13.125, "kl": 9.52022933959961, "learning_rate": 5e-06, "logits/chosen": -24764077.714285713, "logits/rejected": -20368204.8, "logps/chosen": -426.6773158482143, "logps/rejected": -448.876416015625, "loss": 0.0998, "rewards/chosen": 6.779391697474888, "rewards/margins": 17.181907108851842, "rewards/rejected": -10.402515411376953, "step": 1617 }, { "epoch": 0.44346991914485406, "grad_norm": 3.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -35039989.333333336, "logits/rejected": -32510370.133333333, "logps/chosen": -445.3374837239583, "logps/rejected": -403.90406901041666, "loss": 0.042, "rewards/chosen": 6.415061526828342, "rewards/margins": 15.133513471815322, "rewards/rejected": -8.71845194498698, "step": 1618 }, { "epoch": 0.4437440043853638, "grad_norm": 2.84375, "kl": 7.0674896240234375, "learning_rate": 5e-06, "logits/chosen": -4537326.461538462, "logits/rejected": -33879360.0, "logps/chosen": -458.28061147836536, "logps/rejected": -509.18013139204544, "loss": 0.0106, "rewards/chosen": 8.038973881648136, "rewards/margins": 17.361817953469867, "rewards/rejected": -9.322844071821732, "step": 1619 }, { "epoch": 0.44401808962587364, "grad_norm": 3.359375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -8616354.823529411, "logits/rejected": 66077645.71428572, "logps/chosen": -445.77277688419116, "logps/rejected": -639.4210379464286, "loss": 0.0088, "rewards/chosen": 6.960884543026195, "rewards/margins": 20.351590196625523, "rewards/rejected": -13.39070565359933, "step": 1620 }, { "epoch": 0.44429217486638345, "grad_norm": 11.9375, "kl": 4.877805709838867, "learning_rate": 5e-06, "logits/chosen": -16048120.888888888, "logits/rejected": -5844683.733333333, "logps/chosen": -509.4775390625, "logps/rejected": -449.883984375, "loss": 0.0261, "rewards/chosen": 7.203847249348958, "rewards/margins": 16.277764892578126, "rewards/rejected": -9.073917643229167, "step": 1621 }, { "epoch": 0.44456626010689326, "grad_norm": 4.1875, "kl": 0.9618561863899231, "learning_rate": 5e-06, "logits/chosen": 5030148.307692308, "logits/rejected": -19122234.181818184, "logps/chosen": -470.98937049278845, "logps/rejected": -554.8478781960227, "loss": 0.0184, "rewards/chosen": 6.558130117563101, "rewards/margins": 15.124824897392646, "rewards/rejected": -8.566694779829545, "step": 1622 }, { "epoch": 0.444840345347403, "grad_norm": 1.6015625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -2900054.0, "logits/rejected": -11607685.714285715, "logps/chosen": -366.6426513671875, "logps/rejected": -572.0793805803571, "loss": 0.0037, "rewards/chosen": 7.732093811035156, "rewards/margins": 18.361781529017858, "rewards/rejected": -10.629687717982701, "step": 1623 }, { "epoch": 0.44511443058791283, "grad_norm": 4.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -10483862.0, "logits/rejected": -5503631.0, "logps/chosen": -445.8854166666667, "logps/rejected": -501.3751627604167, "loss": 0.0154, "rewards/chosen": 6.492940266927083, "rewards/margins": 15.429641723632812, "rewards/rejected": -8.936701456705729, "step": 1624 }, { "epoch": 0.44538851582842265, "grad_norm": 9.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 19543508.8, "logits/rejected": -28528944.0, "logps/chosen": -368.5140869140625, "logps/rejected": -530.5874720982143, "loss": 0.0623, "rewards/chosen": 5.4311073303222654, "rewards/margins": 14.702875082833426, "rewards/rejected": -9.271767752511161, "step": 1625 }, { "epoch": 0.44566260106893246, "grad_norm": 6.625, "kl": 0.3709462583065033, "learning_rate": 5e-06, "logits/chosen": 6602111.384615385, "logits/rejected": -23935028.363636363, "logps/chosen": -433.13269981971155, "logps/rejected": -405.8216441761364, "loss": 0.0252, "rewards/chosen": 5.85598872258113, "rewards/margins": 14.173687808163518, "rewards/rejected": -8.317699085582387, "step": 1626 }, { "epoch": 0.4459366863094422, "grad_norm": 10.4375, "kl": 19.157194137573242, "learning_rate": 5e-06, "logits/chosen": -26535949.17647059, "logits/rejected": -34204208.0, "logps/chosen": -491.8421415441176, "logps/rejected": -488.80440848214283, "loss": 0.0653, "rewards/chosen": 7.5327301025390625, "rewards/margins": 16.28550556727818, "rewards/rejected": -8.752775464739118, "step": 1627 }, { "epoch": 0.44621077154995203, "grad_norm": 12.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -13101413.333333334, "logits/rejected": -7527658.666666667, "logps/chosen": -385.9886067708333, "logps/rejected": -491.7929280598958, "loss": 0.0573, "rewards/chosen": 5.628131866455078, "rewards/margins": 15.343802134195963, "rewards/rejected": -9.715670267740885, "step": 1628 }, { "epoch": 0.44648485679046185, "grad_norm": 5.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -11132960.0, "logits/rejected": -22993906.285714287, "logps/chosen": -375.987841796875, "logps/rejected": -521.00341796875, "loss": 0.0345, "rewards/chosen": 6.91351318359375, "rewards/margins": 15.45094462803432, "rewards/rejected": -8.53743144444057, "step": 1629 }, { "epoch": 0.4467589420309716, "grad_norm": 7.75, "kl": 7.083909034729004, "learning_rate": 5e-06, "logits/chosen": -20764034.285714287, "logits/rejected": 33358393.6, "logps/chosen": -370.28299386160717, "logps/rejected": -556.053662109375, "loss": 0.0556, "rewards/chosen": 6.4532961164202005, "rewards/margins": 16.902691868373324, "rewards/rejected": -10.449395751953125, "step": 1630 }, { "epoch": 0.4470330272714814, "grad_norm": 2.421875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -5396382.769230769, "logits/rejected": -19794064.0, "logps/chosen": -397.78722205528845, "logps/rejected": -564.6499467329545, "loss": 0.0067, "rewards/chosen": 6.864127525916467, "rewards/margins": 19.28014453807911, "rewards/rejected": -12.416017012162643, "step": 1631 }, { "epoch": 0.44730711251199123, "grad_norm": 1.984375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 19407182.545454547, "logits/rejected": -17738809.846153848, "logps/chosen": -450.73193359375, "logps/rejected": -702.9397536057693, "loss": 0.0049, "rewards/chosen": 7.135676990855824, "rewards/margins": 17.263367232742844, "rewards/rejected": -10.12769024188702, "step": 1632 }, { "epoch": 0.44758119775250105, "grad_norm": 5.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -17481745.6, "logits/rejected": -11392288.0, "logps/chosen": -334.5301025390625, "logps/rejected": -555.6188616071429, "loss": 0.0367, "rewards/chosen": 4.077671051025391, "rewards/margins": 14.48890849522182, "rewards/rejected": -10.411237444196429, "step": 1633 }, { "epoch": 0.4478552829930108, "grad_norm": 12.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 7127992.0, "logits/rejected": 96839891.2, "logps/chosen": -417.3936244419643, "logps/rejected": -547.777392578125, "loss": 0.0537, "rewards/chosen": 5.947007315499442, "rewards/margins": 14.024064200265066, "rewards/rejected": -8.077056884765625, "step": 1634 }, { "epoch": 0.4481293682335206, "grad_norm": 4.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -29806142.222222224, "logits/rejected": -12480049.066666666, "logps/chosen": -503.5505099826389, "logps/rejected": -471.7436848958333, "loss": 0.022, "rewards/chosen": 7.056818220350477, "rewards/margins": 16.224021996392146, "rewards/rejected": -9.167203776041667, "step": 1635 }, { "epoch": 0.44840345347403043, "grad_norm": 2.0625, "kl": 7.586144924163818, "learning_rate": 5e-06, "logits/chosen": -8740294.857142856, "logits/rejected": -9576947.2, "logps/chosen": -505.3546665736607, "logps/rejected": -496.8560546875, "loss": 0.0416, "rewards/chosen": 7.514672415597098, "rewards/margins": 17.675181143624442, "rewards/rejected": -10.160508728027343, "step": 1636 }, { "epoch": 0.44867753871454025, "grad_norm": 2.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -10192892.8, "logits/rejected": -19619812.57142857, "logps/chosen": -403.9939208984375, "logps/rejected": -609.0163225446429, "loss": 0.0067, "rewards/chosen": 7.485856628417968, "rewards/margins": 18.678268868582588, "rewards/rejected": -11.19241224016462, "step": 1637 }, { "epoch": 0.44895162395505, "grad_norm": 8.1875, "kl": 2.938762664794922, "learning_rate": 5e-06, "logits/chosen": 19759197.866666667, "logits/rejected": -6147216.444444444, "logps/chosen": -474.1456705729167, "logps/rejected": -754.3939887152778, "loss": 0.0529, "rewards/chosen": 6.011942545572917, "rewards/margins": 17.841898600260418, "rewards/rejected": -11.8299560546875, "step": 1638 }, { "epoch": 0.4492257091955598, "grad_norm": 10.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -15194781.333333334, "logits/rejected": -25793173.333333332, "logps/chosen": -326.8212076822917, "logps/rejected": -716.6625162760416, "loss": 0.0467, "rewards/chosen": 5.539063771565755, "rewards/margins": 16.822612762451172, "rewards/rejected": -11.283548990885416, "step": 1639 }, { "epoch": 0.44949979443606963, "grad_norm": 5.8125, "kl": 1.4932245016098022, "learning_rate": 5e-06, "logits/chosen": -13225469.714285715, "logits/rejected": -4212331.6, "logps/chosen": -329.97813197544644, "logps/rejected": -611.74267578125, "loss": 0.0315, "rewards/chosen": 5.574873788016183, "rewards/margins": 15.420550973074779, "rewards/rejected": -9.845677185058594, "step": 1640 }, { "epoch": 0.4497738796765794, "grad_norm": 3.15625, "kl": 8.383417129516602, "learning_rate": 5e-06, "logits/chosen": -23534653.53846154, "logits/rejected": -30002752.0, "logps/chosen": -573.2593900240385, "logps/rejected": -455.76518110795456, "loss": 0.0102, "rewards/chosen": 8.192196185772236, "rewards/margins": 18.63432237318346, "rewards/rejected": -10.44212618741122, "step": 1641 }, { "epoch": 0.4500479649170892, "grad_norm": 13.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -405986.4, "logits/rejected": -13343293.714285715, "logps/chosen": -359.9531494140625, "logps/rejected": -465.36903599330356, "loss": 0.0893, "rewards/chosen": 5.042927169799805, "rewards/margins": 13.030072729928154, "rewards/rejected": -7.987145560128348, "step": 1642 }, { "epoch": 0.450322050157599, "grad_norm": 3.453125, "kl": 3.5321948528289795, "learning_rate": 5e-06, "logits/chosen": -42219625.14285714, "logits/rejected": 73464972.8, "logps/chosen": -499.80186244419644, "logps/rejected": -615.669091796875, "loss": 0.0078, "rewards/chosen": 7.024814060756138, "rewards/margins": 25.635254124232702, "rewards/rejected": -18.610440063476563, "step": 1643 }, { "epoch": 0.45059613539810883, "grad_norm": 5.4375, "kl": 3.9668173789978027, "learning_rate": 5e-06, "logits/chosen": -11940672.0, "logits/rejected": -1345260.1818181819, "logps/chosen": -407.501953125, "logps/rejected": -656.0498934659091, "loss": 0.0184, "rewards/chosen": 7.9962158203125, "rewards/margins": 17.293535405939274, "rewards/rejected": -9.297319585626775, "step": 1644 }, { "epoch": 0.4508702206386186, "grad_norm": 10.0, "kl": 8.886152267456055, "learning_rate": 5e-06, "logits/chosen": -24678589.53846154, "logits/rejected": -25159598.545454547, "logps/chosen": -393.21529447115387, "logps/rejected": -484.6580255681818, "loss": 0.0462, "rewards/chosen": 5.907256493201623, "rewards/margins": 14.68162824724104, "rewards/rejected": -8.774371754039418, "step": 1645 }, { "epoch": 0.4511443058791284, "grad_norm": 7.59375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 453955.5, "logits/rejected": -20130612.8, "logps/chosen": -678.9776611328125, "logps/rejected": -442.9744140625, "loss": 0.0896, "rewards/chosen": 7.095569610595703, "rewards/margins": 13.935923767089843, "rewards/rejected": -6.840354156494141, "step": 1646 }, { "epoch": 0.4514183911196382, "grad_norm": 3.546875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -10254125.6, "logits/rejected": -13035568.0, "logps/chosen": -368.7681884765625, "logps/rejected": -540.6209542410714, "loss": 0.0125, "rewards/chosen": 5.310494613647461, "rewards/margins": 14.431333323887415, "rewards/rejected": -9.120838710239955, "step": 1647 }, { "epoch": 0.45169247636014803, "grad_norm": 3.203125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 41226500.0, "logits/rejected": -7846700.0, "logps/chosen": -363.8887634277344, "logps/rejected": -618.3214111328125, "loss": 0.0215, "rewards/chosen": 4.966091632843018, "rewards/margins": 14.06819772720337, "rewards/rejected": -9.102106094360352, "step": 1648 }, { "epoch": 0.4519665616006578, "grad_norm": 9.3125, "kl": 3.6866555213928223, "learning_rate": 5e-06, "logits/chosen": -7984409.142857143, "logits/rejected": -16671241.6, "logps/chosen": -346.58663504464283, "logps/rejected": -505.9958984375, "loss": 0.0624, "rewards/chosen": 6.141032627650669, "rewards/margins": 13.301349094935826, "rewards/rejected": -7.160316467285156, "step": 1649 }, { "epoch": 0.4522406468411676, "grad_norm": 6.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -24967350.4, "logits/rejected": 5856523.428571428, "logps/chosen": -333.070458984375, "logps/rejected": -448.32352120535717, "loss": 0.0333, "rewards/chosen": 5.883818817138672, "rewards/margins": 14.15088849748884, "rewards/rejected": -8.267069680350167, "step": 1650 }, { "epoch": 0.4525147320816774, "grad_norm": 10.1875, "kl": 1.9765117168426514, "learning_rate": 5e-06, "logits/chosen": -11081370.666666666, "logits/rejected": -27542986.666666668, "logps/chosen": -377.0992838541667, "logps/rejected": -460.6643473307292, "loss": 0.0585, "rewards/chosen": 5.377266565958659, "rewards/margins": 12.936354955037435, "rewards/rejected": -7.559088389078776, "step": 1651 }, { "epoch": 0.4527888173221872, "grad_norm": 7.71875, "kl": 9.872448921203613, "learning_rate": 5e-06, "logits/chosen": -8909801.142857144, "logits/rejected": -4694728.8, "logps/chosen": -323.0625, "logps/rejected": -420.320166015625, "loss": 0.0626, "rewards/chosen": 6.288860321044922, "rewards/margins": 14.23117446899414, "rewards/rejected": -7.942314147949219, "step": 1652 }, { "epoch": 0.453062902562697, "grad_norm": 10.6875, "kl": 7.067468166351318, "learning_rate": 5e-06, "logits/chosen": -10801121.23076923, "logits/rejected": -10053328.0, "logps/chosen": -407.5549504206731, "logps/rejected": -362.89284446022725, "loss": 0.0648, "rewards/chosen": 7.071421109713041, "rewards/margins": 12.913342135769504, "rewards/rejected": -5.841921026056463, "step": 1653 }, { "epoch": 0.4533369878032068, "grad_norm": 11.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -26776538.666666668, "logits/rejected": -10423369.333333334, "logps/chosen": -535.9851888020834, "logps/rejected": -538.9908040364584, "loss": 0.0696, "rewards/chosen": 6.019396464029948, "rewards/margins": 15.443354924519856, "rewards/rejected": -9.423958460489908, "step": 1654 }, { "epoch": 0.4536110730437166, "grad_norm": 11.125, "kl": 3.67706298828125, "learning_rate": 5e-06, "logits/chosen": -4080334.3333333335, "logits/rejected": -4359416.0, "logps/chosen": -308.0704752604167, "logps/rejected": -367.8809000651042, "loss": 0.1081, "rewards/chosen": 4.6833070119222, "rewards/margins": 10.344533920288086, "rewards/rejected": -5.661226908365886, "step": 1655 }, { "epoch": 0.4538851582842264, "grad_norm": 0.8984375, "kl": 6.170504093170166, "learning_rate": 5e-06, "logits/chosen": -8672504.0, "logits/rejected": 17395504.0, "logps/chosen": -429.67578125, "logps/rejected": -685.2325439453125, "loss": 0.003, "rewards/chosen": 8.003698348999023, "rewards/margins": 19.800585746765137, "rewards/rejected": -11.796887397766113, "step": 1656 }, { "epoch": 0.4541592435247362, "grad_norm": 5.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -17426732.0, "logits/rejected": -8971247.0, "logps/chosen": -486.8130798339844, "logps/rejected": -543.311767578125, "loss": 0.0117, "rewards/chosen": 6.186143398284912, "rewards/margins": 15.482910633087158, "rewards/rejected": -9.296767234802246, "step": 1657 }, { "epoch": 0.454433328765246, "grad_norm": 9.75, "kl": 3.4021365642547607, "learning_rate": 5e-06, "logits/chosen": -21958126.933333334, "logits/rejected": -7450010.666666667, "logps/chosen": -342.11858723958335, "logps/rejected": -634.3654513888889, "loss": 0.0818, "rewards/chosen": 6.399081420898438, "rewards/margins": 18.174235534667968, "rewards/rejected": -11.775154113769531, "step": 1658 }, { "epoch": 0.4547074140057558, "grad_norm": 4.5, "kl": 2.1718270778656006, "learning_rate": 5e-06, "logits/chosen": -23876661.333333332, "logits/rejected": 28407739.733333334, "logps/chosen": -342.56792534722223, "logps/rejected": -782.2903645833334, "loss": 0.0385, "rewards/chosen": 5.544088999430339, "rewards/margins": 23.50750249226888, "rewards/rejected": -17.96341349283854, "step": 1659 }, { "epoch": 0.4549814992462656, "grad_norm": 8.1875, "kl": 6.910480976104736, "learning_rate": 5e-06, "logits/chosen": -17576480.0, "logits/rejected": -29124672.0, "logps/chosen": -322.58019080528845, "logps/rejected": -486.3767755681818, "loss": 0.0266, "rewards/chosen": 6.523874136117788, "rewards/margins": 15.915561009120275, "rewards/rejected": -9.391686873002486, "step": 1660 }, { "epoch": 0.4552555844867754, "grad_norm": 11.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -27467160.615384616, "logits/rejected": -34172177.45454545, "logps/chosen": -429.1819411057692, "logps/rejected": -388.6783558238636, "loss": 0.0526, "rewards/chosen": 6.275446965144231, "rewards/margins": 13.631985217541247, "rewards/rejected": -7.356538252397017, "step": 1661 }, { "epoch": 0.4555296697272852, "grad_norm": 8.375, "kl": 11.749387741088867, "learning_rate": 5e-06, "logits/chosen": -24009166.933333334, "logits/rejected": -6718025.777777778, "logps/chosen": -417.01604817708335, "logps/rejected": -417.8064236111111, "loss": 0.0313, "rewards/chosen": 6.679830932617188, "rewards/margins": 14.568787638346354, "rewards/rejected": -7.888956705729167, "step": 1662 }, { "epoch": 0.45580375496779496, "grad_norm": 10.1875, "kl": 0.4525540769100189, "learning_rate": 5e-06, "logits/chosen": -17109940.0, "logits/rejected": 15071477.333333334, "logps/chosen": -319.3381754557292, "logps/rejected": -708.2626953125, "loss": 0.0463, "rewards/chosen": 5.832986195882161, "rewards/margins": 14.342573801676433, "rewards/rejected": -8.509587605794271, "step": 1663 }, { "epoch": 0.4560778402083048, "grad_norm": 1.59375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 998358.9, "logits/rejected": -4212055.142857143, "logps/chosen": -478.3154296875, "logps/rejected": -441.92201450892856, "loss": 0.0028, "rewards/chosen": 7.805268859863281, "rewards/margins": 16.36726531982422, "rewards/rejected": -8.561996459960938, "step": 1664 }, { "epoch": 0.4563519254488146, "grad_norm": 3.109375, "kl": 5.793788909912109, "learning_rate": 5e-06, "logits/chosen": -15877393.333333334, "logits/rejected": -28144000.0, "logps/chosen": -417.4274088541667, "logps/rejected": -503.7835286458333, "loss": 0.0139, "rewards/chosen": 6.937294006347656, "rewards/margins": 19.01272964477539, "rewards/rejected": -12.075435638427734, "step": 1665 }, { "epoch": 0.4566260106893244, "grad_norm": 8.25, "kl": 0.32558950781822205, "learning_rate": 5e-06, "logits/chosen": -31651800.615384616, "logits/rejected": 2107970.727272727, "logps/chosen": -430.36959134615387, "logps/rejected": -557.9992453835227, "loss": 0.0708, "rewards/chosen": 4.91006352351262, "rewards/margins": 14.11682982544799, "rewards/rejected": -9.20676630193537, "step": 1666 }, { "epoch": 0.45690009592983416, "grad_norm": 17.625, "kl": 7.495529651641846, "learning_rate": 5e-06, "logits/chosen": -16611636.266666668, "logits/rejected": -6260107.111111111, "logps/chosen": -425.1860026041667, "logps/rejected": -519.3334418402778, "loss": 0.0682, "rewards/chosen": 6.476250712076823, "rewards/margins": 14.29713372124566, "rewards/rejected": -7.820883009168837, "step": 1667 }, { "epoch": 0.457174181170344, "grad_norm": 8.0625, "kl": 3.6604013442993164, "learning_rate": 5e-06, "logits/chosen": -7275998.588235294, "logits/rejected": -6406930.285714285, "logps/chosen": -620.8793658088235, "logps/rejected": -491.03763253348217, "loss": 0.0446, "rewards/chosen": 6.910154454848346, "rewards/margins": 13.102085498200745, "rewards/rejected": -6.191931043352399, "step": 1668 }, { "epoch": 0.4574482664108538, "grad_norm": 12.5, "kl": 7.367467880249023, "learning_rate": 5e-06, "logits/chosen": -27970533.818181816, "logits/rejected": -9707217.23076923, "logps/chosen": -431.25883345170456, "logps/rejected": -591.4619140625, "loss": 0.0444, "rewards/chosen": 5.689906033602628, "rewards/margins": 16.720557419570177, "rewards/rejected": -11.030651385967548, "step": 1669 }, { "epoch": 0.4577223516513636, "grad_norm": 8.4375, "kl": 3.4151358604431152, "learning_rate": 5e-06, "logits/chosen": -9808922.666666666, "logits/rejected": 1718593.3333333333, "logps/chosen": -374.3586832682292, "logps/rejected": -573.3936767578125, "loss": 0.0331, "rewards/chosen": 6.177549362182617, "rewards/margins": 15.556535720825195, "rewards/rejected": -9.378986358642578, "step": 1670 }, { "epoch": 0.45799643689187336, "grad_norm": 8.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -15295173.333333334, "logits/rejected": -28531904.0, "logps/chosen": -400.40098741319446, "logps/rejected": -526.2947591145834, "loss": 0.0553, "rewards/chosen": 7.846744961208767, "rewards/margins": 15.302373419867621, "rewards/rejected": -7.455628458658854, "step": 1671 }, { "epoch": 0.4582705221323832, "grad_norm": 3.515625, "kl": 3.4124019145965576, "learning_rate": 5e-06, "logits/chosen": -25529612.307692308, "logits/rejected": -3541481.8181818184, "logps/chosen": -487.5212590144231, "logps/rejected": -635.6708984375, "loss": 0.0382, "rewards/chosen": 6.04890617957482, "rewards/margins": 16.29345254964762, "rewards/rejected": -10.244546370072799, "step": 1672 }, { "epoch": 0.458544607372893, "grad_norm": 9.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -17195749.333333332, "logits/rejected": 17806536.533333335, "logps/chosen": -456.90375434027777, "logps/rejected": -524.9536458333333, "loss": 0.0645, "rewards/chosen": 5.99097654554579, "rewards/margins": 16.255053287082248, "rewards/rejected": -10.264076741536458, "step": 1673 }, { "epoch": 0.45881869261340275, "grad_norm": 3.984375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -9824670.222222222, "logits/rejected": -15499771.733333332, "logps/chosen": -419.7868381076389, "logps/rejected": -416.55579427083336, "loss": 0.0324, "rewards/chosen": 5.751608106825087, "rewards/margins": 14.639554680718316, "rewards/rejected": -8.88794657389323, "step": 1674 }, { "epoch": 0.45909277785391256, "grad_norm": 6.21875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 39730606.54545455, "logits/rejected": 10120450.461538462, "logps/chosen": -443.51376065340907, "logps/rejected": -590.6284930889423, "loss": 0.0202, "rewards/chosen": 5.153550581498579, "rewards/margins": 15.22309427328043, "rewards/rejected": -10.06954369178185, "step": 1675 }, { "epoch": 0.4593668630944224, "grad_norm": 14.375, "kl": 4.643971920013428, "learning_rate": 5e-06, "logits/chosen": 3026720.4444444445, "logits/rejected": -6319134.933333334, "logps/chosen": -358.44476996527777, "logps/rejected": -569.3490885416667, "loss": 0.0745, "rewards/chosen": 6.8018616570366754, "rewards/margins": 17.46179207695855, "rewards/rejected": -10.659930419921874, "step": 1676 }, { "epoch": 0.4596409483349322, "grad_norm": 4.34375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 2710994.909090909, "logits/rejected": -2109954.153846154, "logps/chosen": -391.14897017045456, "logps/rejected": -589.61962890625, "loss": 0.014, "rewards/chosen": 6.503354159268466, "rewards/margins": 18.265018916630243, "rewards/rejected": -11.761664757361778, "step": 1677 }, { "epoch": 0.45991503357544194, "grad_norm": 3.578125, "kl": 2.1121902465820312, "learning_rate": 5e-06, "logits/chosen": -31023896.615384616, "logits/rejected": -14545905.454545455, "logps/chosen": -513.7229942908654, "logps/rejected": -472.2059215198864, "loss": 0.0123, "rewards/chosen": 7.115669837364783, "rewards/margins": 17.71613786604021, "rewards/rejected": -10.600468028675426, "step": 1678 }, { "epoch": 0.46018911881595176, "grad_norm": 4.71875, "kl": 2.4123740196228027, "learning_rate": 5e-06, "logits/chosen": -16045000.0, "logits/rejected": -22676274.285714287, "logps/chosen": -449.972607421875, "logps/rejected": -497.6011439732143, "loss": 0.028, "rewards/chosen": 6.625751495361328, "rewards/margins": 15.715674264090401, "rewards/rejected": -9.089922768729073, "step": 1679 }, { "epoch": 0.4604632040564616, "grad_norm": 4.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 4224403.692307692, "logits/rejected": -31058740.363636363, "logps/chosen": -364.01663912259613, "logps/rejected": -540.4870383522727, "loss": 0.0144, "rewards/chosen": 7.314696678748498, "rewards/margins": 16.639172053837278, "rewards/rejected": -9.32447537508878, "step": 1680 }, { "epoch": 0.46073728929697133, "grad_norm": 7.40625, "kl": 5.1912946701049805, "learning_rate": 5e-06, "logits/chosen": -44700824.615384616, "logits/rejected": -27330894.545454547, "logps/chosen": -419.81482872596155, "logps/rejected": -533.6186079545455, "loss": 0.0473, "rewards/chosen": 6.046580387995793, "rewards/margins": 17.83260084032179, "rewards/rejected": -11.786020452325994, "step": 1681 }, { "epoch": 0.46101137453748114, "grad_norm": 3.9375, "kl": 2.178518295288086, "learning_rate": 5e-06, "logits/chosen": -9626916.705882354, "logits/rejected": -17525276.57142857, "logps/chosen": -411.64148667279414, "logps/rejected": -410.66831752232144, "loss": 0.0132, "rewards/chosen": 7.163823296042049, "rewards/margins": 16.341194729845064, "rewards/rejected": -9.177371433803014, "step": 1682 }, { "epoch": 0.46128545977799096, "grad_norm": 3.171875, "kl": 1.6191076040267944, "learning_rate": 5e-06, "logits/chosen": -13751496.727272727, "logits/rejected": -21142217.846153848, "logps/chosen": -354.58200905539775, "logps/rejected": -442.36910306490387, "loss": 0.0315, "rewards/chosen": 4.665382038463246, "rewards/margins": 13.805455374550986, "rewards/rejected": -9.14007333608774, "step": 1683 }, { "epoch": 0.46155954501850077, "grad_norm": 3.953125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -29768448.0, "logits/rejected": -26296806.4, "logps/chosen": -397.05132378472223, "logps/rejected": -504.0786458333333, "loss": 0.0194, "rewards/chosen": 7.10087415907118, "rewards/margins": 15.62848646375868, "rewards/rejected": -8.5276123046875, "step": 1684 }, { "epoch": 0.46183363025901053, "grad_norm": 9.0, "kl": 6.346858978271484, "learning_rate": 5e-06, "logits/chosen": -9726483.333333334, "logits/rejected": -23424117.333333332, "logps/chosen": -384.3035888671875, "logps/rejected": -487.3333333333333, "loss": 0.035, "rewards/chosen": 6.78436279296875, "rewards/margins": 18.847468058268227, "rewards/rejected": -12.063105265299479, "step": 1685 }, { "epoch": 0.46210771549952034, "grad_norm": 9.1875, "kl": 12.066595077514648, "learning_rate": 5e-06, "logits/chosen": -28407683.555555556, "logits/rejected": -92274.0, "logps/chosen": -437.7995876736111, "logps/rejected": -476.4726969401042, "loss": 0.0295, "rewards/chosen": 6.991308000352648, "rewards/margins": 17.289805518256294, "rewards/rejected": -10.298497517903646, "step": 1686 }, { "epoch": 0.46238180074003016, "grad_norm": 3.203125, "kl": 3.8043861389160156, "learning_rate": 5e-06, "logits/chosen": -15123502.857142856, "logits/rejected": -9638312.0, "logps/chosen": -483.7584751674107, "logps/rejected": -549.671533203125, "loss": 0.0268, "rewards/chosen": 6.622264317103794, "rewards/margins": 19.14669974190848, "rewards/rejected": -12.524435424804688, "step": 1687 }, { "epoch": 0.46265588598053997, "grad_norm": 4.71875, "kl": 1.5119298696517944, "learning_rate": 5e-06, "logits/chosen": -16942726.4, "logits/rejected": -17508851.555555556, "logps/chosen": -358.5192057291667, "logps/rejected": -442.4229329427083, "loss": 0.0141, "rewards/chosen": 5.889860534667969, "rewards/margins": 15.631549580891928, "rewards/rejected": -9.741689046223959, "step": 1688 }, { "epoch": 0.46292997122104973, "grad_norm": 6.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -8961638.222222222, "logits/rejected": -11250312.533333333, "logps/chosen": -451.10389539930554, "logps/rejected": -486.91578776041666, "loss": 0.0371, "rewards/chosen": 8.140796237521702, "rewards/margins": 17.39507276746962, "rewards/rejected": -9.254276529947917, "step": 1689 }, { "epoch": 0.46320405646155954, "grad_norm": 6.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -22575352.0, "logits/rejected": -33758248.0, "logps/chosen": -315.0330505371094, "logps/rejected": -480.6097106933594, "loss": 0.0159, "rewards/chosen": 6.096626281738281, "rewards/margins": 15.455322265625, "rewards/rejected": -9.358695983886719, "step": 1690 }, { "epoch": 0.46347814170206936, "grad_norm": 3.53125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -30828918.4, "logits/rejected": -40345165.71428572, "logps/chosen": -491.60400390625, "logps/rejected": -497.85585239955356, "loss": 0.028, "rewards/chosen": 6.542910766601563, "rewards/margins": 15.3159907749721, "rewards/rejected": -8.773080008370536, "step": 1691 }, { "epoch": 0.4637522269425791, "grad_norm": 1.65625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -7056406.4, "logits/rejected": -13818329.777777778, "logps/chosen": -418.9163411458333, "logps/rejected": -548.7373046875, "loss": 0.0069, "rewards/chosen": 6.457537841796875, "rewards/margins": 16.708722093370227, "rewards/rejected": -10.251184251573351, "step": 1692 }, { "epoch": 0.46402631218308893, "grad_norm": 7.34375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -12750016.0, "logits/rejected": -16764871.0, "logps/chosen": -360.90802001953125, "logps/rejected": -447.96612548828125, "loss": 0.0454, "rewards/chosen": 6.860402584075928, "rewards/margins": 16.18801259994507, "rewards/rejected": -9.32761001586914, "step": 1693 }, { "epoch": 0.46430039742359874, "grad_norm": 5.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -12547288.615384616, "logits/rejected": -24998301.09090909, "logps/chosen": -327.8277118389423, "logps/rejected": -546.2495561079545, "loss": 0.0198, "rewards/chosen": 5.97627669114333, "rewards/margins": 16.907631560639068, "rewards/rejected": -10.931354869495738, "step": 1694 }, { "epoch": 0.46457448266410856, "grad_norm": 5.15625, "kl": 10.120944023132324, "learning_rate": 5e-06, "logits/chosen": -28861140.57142857, "logits/rejected": -31471942.4, "logps/chosen": -440.75537109375, "logps/rejected": -512.94384765625, "loss": 0.0187, "rewards/chosen": 6.952501569475446, "rewards/margins": 16.427845655168806, "rewards/rejected": -9.47534408569336, "step": 1695 }, { "epoch": 0.4648485679046183, "grad_norm": 2.734375, "kl": 1.9487762451171875, "learning_rate": 5e-06, "logits/chosen": -27876937.846153848, "logits/rejected": -26681885.09090909, "logps/chosen": -517.7034254807693, "logps/rejected": -583.6222478693181, "loss": 0.0086, "rewards/chosen": 7.234901428222656, "rewards/margins": 16.612262379039418, "rewards/rejected": -9.377360950816762, "step": 1696 }, { "epoch": 0.46512265314512813, "grad_norm": 10.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -17521928.615384616, "logits/rejected": -15317056.0, "logps/chosen": -365.98580228365387, "logps/rejected": -670.5622336647727, "loss": 0.0767, "rewards/chosen": 4.622768108661358, "rewards/margins": 15.317984200857735, "rewards/rejected": -10.695216092196377, "step": 1697 }, { "epoch": 0.46539673838563794, "grad_norm": 8.5625, "kl": 6.266646862030029, "learning_rate": 5e-06, "logits/chosen": -10905769.142857144, "logits/rejected": -20460694.4, "logps/chosen": -475.68038504464283, "logps/rejected": -434.183935546875, "loss": 0.0688, "rewards/chosen": 6.765536717006138, "rewards/margins": 12.914122990199498, "rewards/rejected": -6.148586273193359, "step": 1698 }, { "epoch": 0.46567082362614776, "grad_norm": 7.90625, "kl": 3.460441827774048, "learning_rate": 5e-06, "logits/chosen": -37908524.0, "logits/rejected": -2656849.0, "logps/chosen": -483.4461669921875, "logps/rejected": -510.94000244140625, "loss": 0.0246, "rewards/chosen": 6.889766693115234, "rewards/margins": 14.275043964385986, "rewards/rejected": -7.385277271270752, "step": 1699 }, { "epoch": 0.4659449088666575, "grad_norm": 12.6875, "kl": 7.921131134033203, "learning_rate": 5e-06, "logits/chosen": -8370612.571428572, "logits/rejected": 69966233.6, "logps/chosen": -462.45797293526783, "logps/rejected": -811.92294921875, "loss": 0.0728, "rewards/chosen": 6.194572448730469, "rewards/margins": 23.808604431152343, "rewards/rejected": -17.614031982421874, "step": 1700 }, { "epoch": 0.46621899410716733, "grad_norm": 10.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -8374808.888888889, "logits/rejected": -17345011.2, "logps/chosen": -389.7767740885417, "logps/rejected": -533.6797200520833, "loss": 0.0645, "rewards/chosen": 5.238567776150173, "rewards/margins": 15.07330084906684, "rewards/rejected": -9.834733072916666, "step": 1701 }, { "epoch": 0.46649307934767714, "grad_norm": 8.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -30879545.14285714, "logits/rejected": -15246166.4, "logps/chosen": -430.812744140625, "logps/rejected": -457.98857421875, "loss": 0.032, "rewards/chosen": 5.609842572893415, "rewards/margins": 14.478392682756697, "rewards/rejected": -8.868550109863282, "step": 1702 }, { "epoch": 0.4667671645881869, "grad_norm": 7.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -21201356.8, "logits/rejected": -10524163.42857143, "logps/chosen": -450.152099609375, "logps/rejected": -442.1358119419643, "loss": 0.0249, "rewards/chosen": 8.022964477539062, "rewards/margins": 15.498146057128906, "rewards/rejected": -7.475181579589844, "step": 1703 }, { "epoch": 0.4670412498286967, "grad_norm": 8.625, "kl": 12.218151092529297, "learning_rate": 5e-06, "logits/chosen": -16933184.0, "logits/rejected": -5793397.6, "logps/chosen": -479.412841796875, "logps/rejected": -651.31826171875, "loss": 0.0749, "rewards/chosen": 5.957964760916574, "rewards/margins": 21.624017007010323, "rewards/rejected": -15.66605224609375, "step": 1704 }, { "epoch": 0.46731533506920653, "grad_norm": 4.40625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -28486350.769230768, "logits/rejected": -14592904.727272727, "logps/chosen": -368.64633413461536, "logps/rejected": -517.8259943181819, "loss": 0.0495, "rewards/chosen": 5.982340299166166, "rewards/margins": 14.455531300364674, "rewards/rejected": -8.473191001198508, "step": 1705 }, { "epoch": 0.46758942030971634, "grad_norm": 5.71875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -20912817.230769232, "logits/rejected": -15670466.909090908, "logps/chosen": -284.78091195913464, "logps/rejected": -539.6136807528409, "loss": 0.0606, "rewards/chosen": 5.1031329815204325, "rewards/margins": 16.378137975305943, "rewards/rejected": -11.275004993785512, "step": 1706 }, { "epoch": 0.4678635055502261, "grad_norm": 7.09375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -25888332.444444444, "logits/rejected": -20506301.866666667, "logps/chosen": -365.67206488715277, "logps/rejected": -610.8548828125, "loss": 0.0317, "rewards/chosen": 5.631873236762153, "rewards/margins": 17.454921129014757, "rewards/rejected": -11.823047892252605, "step": 1707 }, { "epoch": 0.4681375907907359, "grad_norm": 2.5, "kl": 0.9259414672851562, "learning_rate": 5e-06, "logits/chosen": -20356174.769230768, "logits/rejected": -11810011.636363637, "logps/chosen": -469.22543569711536, "logps/rejected": -580.1753373579545, "loss": 0.0087, "rewards/chosen": 6.462041414701021, "rewards/margins": 14.239171541654146, "rewards/rejected": -7.777130126953125, "step": 1708 }, { "epoch": 0.4684116760312457, "grad_norm": 7.46875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -24580057.14285714, "logits/rejected": -12401569.88235294, "logps/chosen": -405.86181640625, "logps/rejected": -554.1746323529412, "loss": 0.0274, "rewards/chosen": 6.433249882289341, "rewards/margins": 15.36135405853015, "rewards/rejected": -8.928104176240808, "step": 1709 }, { "epoch": 0.46868576127175554, "grad_norm": 2.359375, "kl": 2.2583796977996826, "learning_rate": 5e-06, "logits/chosen": -14685120.0, "logits/rejected": -7635593.142857143, "logps/chosen": -485.022314453125, "logps/rejected": -525.01806640625, "loss": 0.0054, "rewards/chosen": 8.168260955810547, "rewards/margins": 18.577594321114677, "rewards/rejected": -10.40933336530413, "step": 1710 }, { "epoch": 0.4689598465122653, "grad_norm": 2.359375, "kl": 0.043883007019758224, "learning_rate": 5e-06, "logits/chosen": -15452819.2, "logits/rejected": -24974393.14285714, "logps/chosen": -525.10888671875, "logps/rejected": -607.0219029017857, "loss": 0.0189, "rewards/chosen": 7.411550140380859, "rewards/margins": 19.1441529410226, "rewards/rejected": -11.732602800641741, "step": 1711 }, { "epoch": 0.4692339317527751, "grad_norm": 4.90625, "kl": 2.2185516357421875, "learning_rate": 5e-06, "logits/chosen": -11490661.333333334, "logits/rejected": -25797226.666666668, "logps/chosen": -439.6583658854167, "logps/rejected": -459.13916015625, "loss": 0.0673, "rewards/chosen": 5.716467115614149, "rewards/margins": 13.766057544284397, "rewards/rejected": -8.049590428670248, "step": 1712 }, { "epoch": 0.4695080169932849, "grad_norm": 8.125, "kl": 0.7700144648551941, "learning_rate": 5e-06, "logits/chosen": 3892979.4285714286, "logits/rejected": -30021548.8, "logps/chosen": -473.704833984375, "logps/rejected": -445.43486328125, "loss": 0.0515, "rewards/chosen": 4.963799612862723, "rewards/margins": 14.501913016183035, "rewards/rejected": -9.538113403320313, "step": 1713 }, { "epoch": 0.4697821022337947, "grad_norm": 5.75, "kl": 4.9347381591796875, "learning_rate": 5e-06, "logits/chosen": -10010000.0, "logits/rejected": 50952014.76923077, "logps/chosen": -448.5056818181818, "logps/rejected": -546.8798076923077, "loss": 0.0208, "rewards/chosen": 6.73655215176669, "rewards/margins": 19.233326678509478, "rewards/rejected": -12.496774526742788, "step": 1714 }, { "epoch": 0.4700561874743045, "grad_norm": 5.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -8735749.6, "logits/rejected": -15509334.857142856, "logps/chosen": -338.249853515625, "logps/rejected": -580.1738978794643, "loss": 0.0198, "rewards/chosen": 4.979413223266602, "rewards/margins": 13.568934358869281, "rewards/rejected": -8.589521135602679, "step": 1715 }, { "epoch": 0.4703302727148143, "grad_norm": 13.25, "kl": 4.573616981506348, "learning_rate": 5e-06, "logits/chosen": -19202152.727272727, "logits/rejected": 2290331.076923077, "logps/chosen": -310.43814364346593, "logps/rejected": -408.9820087139423, "loss": 0.1011, "rewards/chosen": 4.24181435324929, "rewards/margins": 13.496539882846644, "rewards/rejected": -9.254725529597355, "step": 1716 }, { "epoch": 0.4706043579553241, "grad_norm": 16.625, "kl": 4.853233337402344, "learning_rate": 5e-06, "logits/chosen": -6814181.866666666, "logits/rejected": -24235681.777777776, "logps/chosen": -474.4140625, "logps/rejected": -473.2750651041667, "loss": 0.1028, "rewards/chosen": 5.99122060139974, "rewards/margins": 12.235826195610894, "rewards/rejected": -6.244605594211155, "step": 1717 }, { "epoch": 0.4708784431958339, "grad_norm": 9.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -27249122.285714287, "logits/rejected": -31007382.588235293, "logps/chosen": -300.43948800223217, "logps/rejected": -581.6171300551471, "loss": 0.0627, "rewards/chosen": 4.458695002964565, "rewards/margins": 11.283843192733638, "rewards/rejected": -6.825148189769072, "step": 1718 }, { "epoch": 0.4711525284363437, "grad_norm": 7.03125, "kl": 4.356175422668457, "learning_rate": 5e-06, "logits/chosen": -29919904.0, "logits/rejected": 12088084.0, "logps/chosen": -376.7576904296875, "logps/rejected": -477.5568033854167, "loss": 0.0239, "rewards/chosen": 5.41218630472819, "rewards/margins": 14.567485173543293, "rewards/rejected": -9.155298868815104, "step": 1719 }, { "epoch": 0.4714266136768535, "grad_norm": 4.03125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -29162051.2, "logits/rejected": -21730441.14285714, "logps/chosen": -494.8162109375, "logps/rejected": -557.0776018415179, "loss": 0.0131, "rewards/chosen": 6.485673522949218, "rewards/margins": 15.36318849836077, "rewards/rejected": -8.877514975411552, "step": 1720 }, { "epoch": 0.4717006989173633, "grad_norm": 6.5, "kl": 5.710989952087402, "learning_rate": 5e-06, "logits/chosen": -6248456.666666667, "logits/rejected": -8080106.0, "logps/chosen": -353.8678385416667, "logps/rejected": -462.4733479817708, "loss": 0.0717, "rewards/chosen": 6.696165720621745, "rewards/margins": 16.093798955281574, "rewards/rejected": -9.39763323465983, "step": 1721 }, { "epoch": 0.4719747841578731, "grad_norm": 4.6875, "kl": 2.9611613750457764, "learning_rate": 5e-06, "logits/chosen": -31069926.4, "logits/rejected": -23353648.0, "logps/chosen": -430.9833658854167, "logps/rejected": -561.7768012152778, "loss": 0.0413, "rewards/chosen": 6.2404052734375, "rewards/margins": 16.722451612684463, "rewards/rejected": -10.482046339246962, "step": 1722 }, { "epoch": 0.4722488693983829, "grad_norm": 13.3125, "kl": 2.153919219970703, "learning_rate": 5e-06, "logits/chosen": -26730660.57142857, "logits/rejected": -19227302.4, "logps/chosen": -438.88462611607144, "logps/rejected": -528.17666015625, "loss": 0.0234, "rewards/chosen": 8.788391658238002, "rewards/margins": 20.604691096714564, "rewards/rejected": -11.816299438476562, "step": 1723 }, { "epoch": 0.4725229546388927, "grad_norm": 5.28125, "kl": 8.103209495544434, "learning_rate": 5e-06, "logits/chosen": -23273424.94117647, "logits/rejected": -6795886.857142857, "logps/chosen": -389.1346220128676, "logps/rejected": -590.9950474330357, "loss": 0.0171, "rewards/chosen": 7.719419591567096, "rewards/margins": 17.121781870096672, "rewards/rejected": -9.402362278529576, "step": 1724 }, { "epoch": 0.47279703987940247, "grad_norm": 4.625, "kl": 3.171070098876953, "learning_rate": 5e-06, "logits/chosen": -35072192.0, "logits/rejected": -3654680.0, "logps/chosen": -446.74776785714283, "logps/rejected": -346.900537109375, "loss": 0.0158, "rewards/chosen": 6.928188868931362, "rewards/margins": 14.853886958530971, "rewards/rejected": -7.925698089599609, "step": 1725 }, { "epoch": 0.4730711251199123, "grad_norm": 8.0625, "kl": 0.2731831967830658, "learning_rate": 5e-06, "logits/chosen": -4667737.6, "logits/rejected": -46049581.71428572, "logps/chosen": -602.6849609375, "logps/rejected": -691.4379185267857, "loss": 0.0177, "rewards/chosen": 7.016824340820312, "rewards/margins": 18.332723781040734, "rewards/rejected": -11.315899440220424, "step": 1726 }, { "epoch": 0.4733452103604221, "grad_norm": 4.5625, "kl": 3.666719436645508, "learning_rate": 5e-06, "logits/chosen": 9549667.636363637, "logits/rejected": -24935884.307692308, "logps/chosen": -497.79647549715907, "logps/rejected": -404.20804537259613, "loss": 0.0208, "rewards/chosen": 8.073626431551846, "rewards/margins": 15.631741290325884, "rewards/rejected": -7.558114858774038, "step": 1727 }, { "epoch": 0.4736192956009319, "grad_norm": 1.1171875, "kl": 3.095902919769287, "learning_rate": 5e-06, "logits/chosen": -9678567.272727273, "logits/rejected": -19049222.153846152, "logps/chosen": -377.5650745738636, "logps/rejected": -294.47171724759613, "loss": 0.0039, "rewards/chosen": 8.276838822798295, "rewards/margins": 14.808806572760734, "rewards/rejected": -6.53196774996244, "step": 1728 }, { "epoch": 0.47389338084144167, "grad_norm": 3.84375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -21679616.0, "logits/rejected": -2558334.6153846155, "logps/chosen": -588.8974165482955, "logps/rejected": -595.4732947716346, "loss": 0.0126, "rewards/chosen": 8.039434259588068, "rewards/margins": 18.798513239080258, "rewards/rejected": -10.759078979492188, "step": 1729 }, { "epoch": 0.4741674660819515, "grad_norm": 6.8125, "kl": 4.439382076263428, "learning_rate": 5e-06, "logits/chosen": -33666214.4, "logits/rejected": 1251648.0, "logps/chosen": -430.1591796875, "logps/rejected": -551.0348074776786, "loss": 0.0286, "rewards/chosen": 5.9394981384277346, "rewards/margins": 15.079901885986327, "rewards/rejected": -9.140403747558594, "step": 1730 }, { "epoch": 0.4744415513224613, "grad_norm": 5.03125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -35950540.8, "logits/rejected": -24483264.0, "logps/chosen": -451.118603515625, "logps/rejected": -535.5440848214286, "loss": 0.0164, "rewards/chosen": 6.484437561035156, "rewards/margins": 18.143871198381696, "rewards/rejected": -11.65943363734654, "step": 1731 }, { "epoch": 0.4747156365629711, "grad_norm": 6.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -4037740.5714285714, "logits/rejected": -13294748.8, "logps/chosen": -386.86655970982144, "logps/rejected": -434.4720703125, "loss": 0.0253, "rewards/chosen": 5.585097176688058, "rewards/margins": 13.711100442068918, "rewards/rejected": -8.12600326538086, "step": 1732 }, { "epoch": 0.47498972180348087, "grad_norm": 6.46875, "kl": 6.7461957931518555, "learning_rate": 5e-06, "logits/chosen": -27108872.0, "logits/rejected": -11637613.0, "logps/chosen": -555.9730224609375, "logps/rejected": -513.642333984375, "loss": 0.05, "rewards/chosen": 8.204333305358887, "rewards/margins": 16.63151264190674, "rewards/rejected": -8.427179336547852, "step": 1733 }, { "epoch": 0.4752638070439907, "grad_norm": 6.71875, "kl": 5.792693138122559, "learning_rate": 5e-06, "logits/chosen": -36192064.0, "logits/rejected": -17755990.4, "logps/chosen": -446.2562779017857, "logps/rejected": -404.3393798828125, "loss": 0.0358, "rewards/chosen": 7.077086857386997, "rewards/margins": 14.093060520717074, "rewards/rejected": -7.015973663330078, "step": 1734 }, { "epoch": 0.4755378922845005, "grad_norm": 12.75, "kl": 3.104572296142578, "learning_rate": 5e-06, "logits/chosen": -17702146.46153846, "logits/rejected": 3938207.6363636362, "logps/chosen": -414.87094350961536, "logps/rejected": -445.5094549005682, "loss": 0.0719, "rewards/chosen": 5.019219031700721, "rewards/margins": 13.122806095576784, "rewards/rejected": -8.103587063876065, "step": 1735 }, { "epoch": 0.47581197752501025, "grad_norm": 4.34375, "kl": 7.778056621551514, "learning_rate": 5e-06, "logits/chosen": -27889886.11764706, "logits/rejected": -37008740.571428575, "logps/chosen": -440.0104549632353, "logps/rejected": -332.8716517857143, "loss": 0.027, "rewards/chosen": 5.8062842873966, "rewards/margins": 13.563672811043363, "rewards/rejected": -7.757388523646763, "step": 1736 }, { "epoch": 0.47608606276552007, "grad_norm": 18.125, "kl": 6.1865034103393555, "learning_rate": 5e-06, "logits/chosen": -14725274.666666666, "logits/rejected": -17034149.333333332, "logps/chosen": -513.524658203125, "logps/rejected": -441.9706217447917, "loss": 0.077, "rewards/chosen": 5.434621810913086, "rewards/margins": 11.689544677734375, "rewards/rejected": -6.254922866821289, "step": 1737 }, { "epoch": 0.4763601480060299, "grad_norm": 5.71875, "kl": 0.34301885962486267, "learning_rate": 5e-06, "logits/chosen": -27259341.333333332, "logits/rejected": -5174554.0, "logps/chosen": -477.2246907552083, "logps/rejected": -479.990478515625, "loss": 0.0211, "rewards/chosen": 6.786771138509114, "rewards/margins": 15.512872695922852, "rewards/rejected": -8.726101557413736, "step": 1738 }, { "epoch": 0.4766342332465397, "grad_norm": 5.625, "kl": 7.602664947509766, "learning_rate": 5e-06, "logits/chosen": -20719444.0, "logits/rejected": -21218222.0, "logps/chosen": -430.06549072265625, "logps/rejected": -493.4507751464844, "loss": 0.0249, "rewards/chosen": 6.737005710601807, "rewards/margins": 14.590097904205322, "rewards/rejected": -7.853092193603516, "step": 1739 }, { "epoch": 0.47690831848704945, "grad_norm": 8.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -6343762.857142857, "logits/rejected": -14142745.6, "logps/chosen": -425.992431640625, "logps/rejected": -544.41064453125, "loss": 0.0467, "rewards/chosen": 6.036842891148159, "rewards/margins": 17.258165522984097, "rewards/rejected": -11.221322631835937, "step": 1740 }, { "epoch": 0.47718240372755927, "grad_norm": 10.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -30742061.333333332, "logits/rejected": 7463903.333333333, "logps/chosen": -431.1814778645833, "logps/rejected": -460.7071533203125, "loss": 0.0377, "rewards/chosen": 7.320431391398112, "rewards/margins": 16.227760950724285, "rewards/rejected": -8.907329559326172, "step": 1741 }, { "epoch": 0.4774564889680691, "grad_norm": 8.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -29444497.454545453, "logits/rejected": -16265868.307692308, "logps/chosen": -418.0491832386364, "logps/rejected": -636.1583533653846, "loss": 0.0376, "rewards/chosen": 6.305139021439985, "rewards/margins": 16.02702059445681, "rewards/rejected": -9.721881573016827, "step": 1742 }, { "epoch": 0.4777305742085789, "grad_norm": 8.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 7733428.666666667, "logits/rejected": -14062988.0, "logps/chosen": -447.41259765625, "logps/rejected": -624.1127522786459, "loss": 0.048, "rewards/chosen": 5.816048940022786, "rewards/margins": 18.700150807698567, "rewards/rejected": -12.884101867675781, "step": 1743 }, { "epoch": 0.47800465944908865, "grad_norm": 7.8125, "kl": 0.2375590056180954, "learning_rate": 5e-06, "logits/chosen": -23543787.2, "logits/rejected": -20054875.42857143, "logps/chosen": -526.951171875, "logps/rejected": -485.21714564732144, "loss": 0.0164, "rewards/chosen": 6.54974136352539, "rewards/margins": 15.982974352155413, "rewards/rejected": -9.433232988630023, "step": 1744 }, { "epoch": 0.47827874468959847, "grad_norm": 4.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -19345233.454545453, "logits/rejected": -4492041.230769231, "logps/chosen": -400.50346235795456, "logps/rejected": -653.8682391826923, "loss": 0.0462, "rewards/chosen": 6.657537286931818, "rewards/margins": 16.357398079825447, "rewards/rejected": -9.69986079289363, "step": 1745 }, { "epoch": 0.4785528299301083, "grad_norm": 9.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -38074478.76923077, "logits/rejected": -26336680.727272727, "logps/chosen": -431.2440655048077, "logps/rejected": -668.8501864346591, "loss": 0.0314, "rewards/chosen": 6.641356248121995, "rewards/margins": 16.61587961903819, "rewards/rejected": -9.974523370916193, "step": 1746 }, { "epoch": 0.47882691517061804, "grad_norm": 7.15625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -18870440.0, "logits/rejected": -28200805.333333332, "logps/chosen": -385.0126139322917, "logps/rejected": -773.9786783854166, "loss": 0.0197, "rewards/chosen": 6.617798487345378, "rewards/margins": 18.33506202697754, "rewards/rejected": -11.717263539632162, "step": 1747 }, { "epoch": 0.47910100041112785, "grad_norm": 4.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 3567980.4444444445, "logits/rejected": -21074660.266666666, "logps/chosen": -455.39659288194446, "logps/rejected": -386.39765625, "loss": 0.0201, "rewards/chosen": 4.895331064860026, "rewards/margins": 14.299363454182942, "rewards/rejected": -9.404032389322916, "step": 1748 }, { "epoch": 0.47937508565163767, "grad_norm": 7.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 7929053.866666666, "logits/rejected": -23250912.0, "logps/chosen": -577.4915364583334, "logps/rejected": -428.67369249131946, "loss": 0.0204, "rewards/chosen": 6.82713623046875, "rewards/margins": 15.856824747721355, "rewards/rejected": -9.029688517252604, "step": 1749 }, { "epoch": 0.4796491708921475, "grad_norm": 6.1875, "kl": 12.175565719604492, "learning_rate": 5e-06, "logits/chosen": 7249922.823529412, "logits/rejected": -17282372.57142857, "logps/chosen": -529.9791475183823, "logps/rejected": -406.8190220424107, "loss": 0.079, "rewards/chosen": 6.499926847570083, "rewards/margins": 15.574907447109702, "rewards/rejected": -9.07498059953962, "step": 1750 }, { "epoch": 0.47992325613265724, "grad_norm": 6.21875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -17163776.0, "logits/rejected": -9804270.76923077, "logps/chosen": -309.0297185724432, "logps/rejected": -609.0321138822115, "loss": 0.0272, "rewards/chosen": 5.039549047296697, "rewards/margins": 16.950825057663284, "rewards/rejected": -11.911276010366587, "step": 1751 }, { "epoch": 0.48019734137316705, "grad_norm": 3.265625, "kl": 0.030724843963980675, "learning_rate": 5e-06, "logits/chosen": -9821353.142857144, "logits/rejected": -45598825.6, "logps/chosen": -444.2739955357143, "logps/rejected": -527.3685546875, "loss": 0.0105, "rewards/chosen": 5.94829341343471, "rewards/margins": 15.90647212437221, "rewards/rejected": -9.9581787109375, "step": 1752 }, { "epoch": 0.48047142661367687, "grad_norm": 4.125, "kl": 2.2532706260681152, "learning_rate": 5e-06, "logits/chosen": -38587063.46666667, "logits/rejected": -11429121.777777778, "logps/chosen": -438.7443033854167, "logps/rejected": -459.2112630208333, "loss": 0.0426, "rewards/chosen": 6.2213389078776045, "rewards/margins": 16.98258548312717, "rewards/rejected": -10.761246575249565, "step": 1753 }, { "epoch": 0.4807455118541866, "grad_norm": 7.84375, "kl": 9.252123832702637, "learning_rate": 5e-06, "logits/chosen": 12737844.0, "logits/rejected": -39653834.666666664, "logps/chosen": -380.3117268880208, "logps/rejected": -506.115234375, "loss": 0.0804, "rewards/chosen": 7.022637685139974, "rewards/margins": 16.828820546468098, "rewards/rejected": -9.806182861328125, "step": 1754 }, { "epoch": 0.48101959709469644, "grad_norm": 7.59375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -12432654.4, "logits/rejected": 66974898.28571428, "logps/chosen": -449.4849609375, "logps/rejected": -510.22813197544644, "loss": 0.0301, "rewards/chosen": 6.58228759765625, "rewards/margins": 17.989154488699775, "rewards/rejected": -11.406866891043526, "step": 1755 }, { "epoch": 0.48129368233520625, "grad_norm": 8.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -49823093.333333336, "logits/rejected": -17027302.666666668, "logps/chosen": -458.7109781901042, "logps/rejected": -493.8128662109375, "loss": 0.0555, "rewards/chosen": 5.439146677652995, "rewards/margins": 13.929757436116535, "rewards/rejected": -8.490610758463541, "step": 1756 }, { "epoch": 0.48156776757571607, "grad_norm": 4.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -19180732.0, "logits/rejected": -19656290.0, "logps/chosen": -355.6199951171875, "logps/rejected": -549.4818115234375, "loss": 0.0319, "rewards/chosen": 5.0023698806762695, "rewards/margins": 15.163084983825684, "rewards/rejected": -10.160715103149414, "step": 1757 }, { "epoch": 0.4818418528162258, "grad_norm": 4.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 11025736.888888888, "logits/rejected": -28984834.133333333, "logps/chosen": -550.59619140625, "logps/rejected": -455.3085611979167, "loss": 0.0133, "rewards/chosen": 9.22160169813368, "rewards/margins": 17.972566053602428, "rewards/rejected": -8.75096435546875, "step": 1758 }, { "epoch": 0.48211593805673564, "grad_norm": 2.78125, "kl": 1.1189759969711304, "learning_rate": 5e-06, "logits/chosen": -8539960.0, "logits/rejected": -20810587.42857143, "logps/chosen": -566.9564453125, "logps/rejected": -624.3837890625, "loss": 0.0088, "rewards/chosen": 6.439105224609375, "rewards/margins": 16.1263185773577, "rewards/rejected": -9.687213352748326, "step": 1759 }, { "epoch": 0.48239002329724545, "grad_norm": 7.34375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -7294793.714285715, "logits/rejected": -12141376.8, "logps/chosen": -405.24874441964283, "logps/rejected": -390.2250732421875, "loss": 0.0319, "rewards/chosen": 6.4775559561593195, "rewards/margins": 14.549434770856585, "rewards/rejected": -8.071878814697266, "step": 1760 }, { "epoch": 0.48266410853775527, "grad_norm": 7.6875, "kl": 10.363985061645508, "learning_rate": 5e-06, "logits/chosen": 1193031.0, "logits/rejected": -20749960.0, "logps/chosen": -396.3492736816406, "logps/rejected": -483.1043701171875, "loss": 0.0356, "rewards/chosen": 7.0453901290893555, "rewards/margins": 18.367045402526855, "rewards/rejected": -11.3216552734375, "step": 1761 }, { "epoch": 0.482938193778265, "grad_norm": 9.5, "kl": 5.896111488342285, "learning_rate": 5e-06, "logits/chosen": -33080827.076923076, "logits/rejected": 1285000.7272727273, "logps/chosen": -448.76600060096155, "logps/rejected": -612.9714133522727, "loss": 0.0235, "rewards/chosen": 6.746042691744291, "rewards/margins": 16.744232444496422, "rewards/rejected": -9.99818975275213, "step": 1762 }, { "epoch": 0.48321227901877484, "grad_norm": 10.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -38528996.92307692, "logits/rejected": -2705815.6363636362, "logps/chosen": -413.0025165264423, "logps/rejected": -679.2234108664773, "loss": 0.0464, "rewards/chosen": 5.77316166804387, "rewards/margins": 20.41125509622214, "rewards/rejected": -14.638093428178268, "step": 1763 }, { "epoch": 0.48348636425928465, "grad_norm": 2.90625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -6499113.333333333, "logits/rejected": -36666650.666666664, "logps/chosen": -427.7484130859375, "logps/rejected": -567.00634765625, "loss": 0.0113, "rewards/chosen": 6.396520614624023, "rewards/margins": 16.856106440226235, "rewards/rejected": -10.459585825602213, "step": 1764 }, { "epoch": 0.4837604494997944, "grad_norm": 6.46875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -13893102.857142856, "logits/rejected": -33835027.2, "logps/chosen": -354.59437779017856, "logps/rejected": -492.558544921875, "loss": 0.032, "rewards/chosen": 6.822902134486607, "rewards/margins": 16.611228397914342, "rewards/rejected": -9.788326263427734, "step": 1765 }, { "epoch": 0.4840345347403042, "grad_norm": 4.125, "kl": 5.847678184509277, "learning_rate": 5e-06, "logits/chosen": -12912740.363636363, "logits/rejected": -4621408.615384615, "logps/chosen": -437.208984375, "logps/rejected": -496.39693509615387, "loss": 0.0145, "rewards/chosen": 7.201310591264204, "rewards/margins": 15.007785717090528, "rewards/rejected": -7.8064751258263225, "step": 1766 }, { "epoch": 0.48430861998081404, "grad_norm": 12.0625, "kl": 8.803018569946289, "learning_rate": 5e-06, "logits/chosen": -13636448.888888888, "logits/rejected": 81331968.0, "logps/chosen": -382.5050998263889, "logps/rejected": -639.1169026692709, "loss": 0.0739, "rewards/chosen": 5.821333991156684, "rewards/margins": 20.38214662339952, "rewards/rejected": -14.560812632242838, "step": 1767 }, { "epoch": 0.48458270522132385, "grad_norm": 4.5625, "kl": 3.1757960319519043, "learning_rate": 5e-06, "logits/chosen": -17400568.533333335, "logits/rejected": -15886288.0, "logps/chosen": -410.6228515625, "logps/rejected": -468.07183159722223, "loss": 0.0281, "rewards/chosen": 6.762692260742187, "rewards/margins": 18.001434665256077, "rewards/rejected": -11.23874240451389, "step": 1768 }, { "epoch": 0.4848567904618336, "grad_norm": 8.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -13196359.272727273, "logits/rejected": -30540105.846153848, "logps/chosen": -594.3966175426136, "logps/rejected": -680.6549729567307, "loss": 0.0277, "rewards/chosen": 5.919022993607954, "rewards/margins": 17.836759340513, "rewards/rejected": -11.917736346905048, "step": 1769 }, { "epoch": 0.4851308757023434, "grad_norm": 9.75, "kl": 1.8847808837890625, "learning_rate": 5e-06, "logits/chosen": -27459899.076923076, "logits/rejected": -9223883.636363637, "logps/chosen": -409.4921875, "logps/rejected": -722.6735174005681, "loss": 0.0309, "rewards/chosen": 6.713321392352764, "rewards/margins": 19.581411935232737, "rewards/rejected": -12.86809054287997, "step": 1770 }, { "epoch": 0.48540496094285324, "grad_norm": 8.8125, "kl": 9.634231567382812, "learning_rate": 5e-06, "logits/chosen": -8032254.769230769, "logits/rejected": -12141349.818181818, "logps/chosen": -411.1780348557692, "logps/rejected": -578.5405717329545, "loss": 0.0451, "rewards/chosen": 7.250727726862981, "rewards/margins": 17.8469330047394, "rewards/rejected": -10.59620527787642, "step": 1771 }, { "epoch": 0.48567904618336305, "grad_norm": 6.09375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -7173357.714285715, "logits/rejected": -46676156.23529412, "logps/chosen": -351.55392020089283, "logps/rejected": -469.9995978860294, "loss": 0.0299, "rewards/chosen": 5.535685947963169, "rewards/margins": 15.6855694426208, "rewards/rejected": -10.14988349465763, "step": 1772 }, { "epoch": 0.4859531314238728, "grad_norm": 17.0, "kl": 19.262712478637695, "learning_rate": 5e-06, "logits/chosen": -13251339.294117646, "logits/rejected": -10916339.42857143, "logps/chosen": -394.5244140625, "logps/rejected": -448.7244349888393, "loss": 0.1239, "rewards/chosen": 6.423341638901654, "rewards/margins": 14.032816910944064, "rewards/rejected": -7.609475272042411, "step": 1773 }, { "epoch": 0.4862272166643826, "grad_norm": 10.25, "kl": 7.473109245300293, "learning_rate": 5e-06, "logits/chosen": -7601178.285714285, "logits/rejected": 1772707.2, "logps/chosen": -535.5997488839286, "logps/rejected": -440.668896484375, "loss": 0.0414, "rewards/chosen": 7.758562360491071, "rewards/margins": 14.14832409449986, "rewards/rejected": -6.389761734008789, "step": 1774 }, { "epoch": 0.48650130190489244, "grad_norm": 7.5625, "kl": 12.39700984954834, "learning_rate": 5e-06, "logits/chosen": -21081348.57142857, "logits/rejected": -32771580.8, "logps/chosen": -434.640869140625, "logps/rejected": -508.609619140625, "loss": 0.0454, "rewards/chosen": 6.209149496895926, "rewards/margins": 15.97211946759905, "rewards/rejected": -9.762969970703125, "step": 1775 }, { "epoch": 0.4867753871454022, "grad_norm": 3.78125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 2236808.0, "logits/rejected": -12231328.0, "logps/chosen": -322.4861572265625, "logps/rejected": -525.6944056919643, "loss": 0.0164, "rewards/chosen": 6.618618774414062, "rewards/margins": 14.929071916852678, "rewards/rejected": -8.310453142438616, "step": 1776 }, { "epoch": 0.487049472385912, "grad_norm": 7.6875, "kl": 1.1159381866455078, "learning_rate": 5e-06, "logits/chosen": -3692841.714285714, "logits/rejected": -20519174.4, "logps/chosen": -451.77197265625, "logps/rejected": -591.741748046875, "loss": 0.0373, "rewards/chosen": 6.078469957624163, "rewards/margins": 16.16288768223354, "rewards/rejected": -10.084417724609375, "step": 1777 }, { "epoch": 0.4873235576264218, "grad_norm": 5.71875, "kl": 0.987372636795044, "learning_rate": 5e-06, "logits/chosen": -21940395.2, "logits/rejected": 14402618.285714285, "logps/chosen": -520.73564453125, "logps/rejected": -431.87437220982144, "loss": 0.0464, "rewards/chosen": 6.190509414672851, "rewards/margins": 12.403749356951032, "rewards/rejected": -6.2132399422781805, "step": 1778 }, { "epoch": 0.48759764286693164, "grad_norm": 5.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 7143600.571428572, "logits/rejected": -24578701.17647059, "logps/chosen": -426.01011439732144, "logps/rejected": -404.3736213235294, "loss": 0.0508, "rewards/chosen": 6.981184823172433, "rewards/margins": 15.379625913475742, "rewards/rejected": -8.398441090303308, "step": 1779 }, { "epoch": 0.4878717281074414, "grad_norm": 6.34375, "kl": 4.548999786376953, "learning_rate": 5e-06, "logits/chosen": -7529917.714285715, "logits/rejected": 2496992.0, "logps/chosen": -464.6592494419643, "logps/rejected": -700.513720703125, "loss": 0.0222, "rewards/chosen": 7.1822509765625, "rewards/margins": 17.972817993164064, "rewards/rejected": -10.790567016601562, "step": 1780 }, { "epoch": 0.4881458133479512, "grad_norm": 15.625, "kl": 8.042892456054688, "learning_rate": 5e-06, "logits/chosen": -18310933.714285713, "logits/rejected": -10444678.4, "logps/chosen": -365.3692103794643, "logps/rejected": -464.9951171875, "loss": 0.0922, "rewards/chosen": 6.2843813214983255, "rewards/margins": 12.628118351527622, "rewards/rejected": -6.343737030029297, "step": 1781 }, { "epoch": 0.488419898588461, "grad_norm": 6.1875, "kl": 10.517126083374023, "learning_rate": 5e-06, "logits/chosen": -28771372.307692308, "logits/rejected": -40340928.0, "logps/chosen": -429.46371694711536, "logps/rejected": -431.26509232954544, "loss": 0.0393, "rewards/chosen": 7.775269728440505, "rewards/margins": 14.741013746995193, "rewards/rejected": -6.9657440185546875, "step": 1782 }, { "epoch": 0.48869398382897083, "grad_norm": 1.53125, "kl": 2.7481181621551514, "learning_rate": 5e-06, "logits/chosen": -2861492.4444444445, "logits/rejected": 30862222.933333334, "logps/chosen": -523.4061957465278, "logps/rejected": -472.4748046875, "loss": 0.0066, "rewards/chosen": 8.23838636610243, "rewards/margins": 17.44798312717014, "rewards/rejected": -9.209596761067708, "step": 1783 }, { "epoch": 0.4889680690694806, "grad_norm": 10.0, "kl": 0.05387115478515625, "learning_rate": 5e-06, "logits/chosen": 21088487.384615384, "logits/rejected": -26625960.727272727, "logps/chosen": -385.3213641826923, "logps/rejected": -413.4191228693182, "loss": 0.0369, "rewards/chosen": 7.828392615685096, "rewards/margins": 14.475603170328206, "rewards/rejected": -6.64721055464311, "step": 1784 }, { "epoch": 0.4892421543099904, "grad_norm": 7.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -19190988.8, "logits/rejected": -18204683.42857143, "logps/chosen": -428.993505859375, "logps/rejected": -457.1895228794643, "loss": 0.0498, "rewards/chosen": 6.439765167236328, "rewards/margins": 17.247156742640904, "rewards/rejected": -10.807391575404576, "step": 1785 }, { "epoch": 0.4895162395505002, "grad_norm": 3.234375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -7540512.0, "logits/rejected": -24668721.454545453, "logps/chosen": -470.34093299278845, "logps/rejected": -514.8166725852273, "loss": 0.0098, "rewards/chosen": 7.193491422213041, "rewards/margins": 18.36974569467398, "rewards/rejected": -11.176254272460938, "step": 1786 }, { "epoch": 0.48979032479101, "grad_norm": 3.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -30974768.0, "logits/rejected": -25107849.14285714, "logps/chosen": -420.82568359375, "logps/rejected": -497.42759486607144, "loss": 0.0102, "rewards/chosen": 6.145232391357422, "rewards/margins": 14.967936161586216, "rewards/rejected": -8.822703770228795, "step": 1787 }, { "epoch": 0.4900644100315198, "grad_norm": 13.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -20968493.333333332, "logits/rejected": -5345209.333333333, "logps/chosen": -519.5088704427084, "logps/rejected": -413.2561848958333, "loss": 0.046, "rewards/chosen": 7.387864430745442, "rewards/margins": 15.872656504313152, "rewards/rejected": -8.484792073567709, "step": 1788 }, { "epoch": 0.4903384952720296, "grad_norm": 7.84375, "kl": 15.06696891784668, "learning_rate": 5e-06, "logits/chosen": -15060384.0, "logits/rejected": -25948043.42857143, "logps/chosen": -516.8956227022059, "logps/rejected": -639.4048549107143, "loss": 0.0459, "rewards/chosen": 7.561774758731618, "rewards/margins": 19.642165688907397, "rewards/rejected": -12.080390930175781, "step": 1789 }, { "epoch": 0.4906125805125394, "grad_norm": 3.328125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -36242612.36363637, "logits/rejected": -21226717.53846154, "logps/chosen": -390.39204545454544, "logps/rejected": -588.6441180889423, "loss": 0.0089, "rewards/chosen": 7.2524330832741475, "rewards/margins": 17.29671643663953, "rewards/rejected": -10.044283353365385, "step": 1790 }, { "epoch": 0.4908866657530492, "grad_norm": 7.125, "kl": 1.9116218090057373, "learning_rate": 5e-06, "logits/chosen": -14989486.666666666, "logits/rejected": 42692237.333333336, "logps/chosen": -478.9889729817708, "logps/rejected": -443.2635904947917, "loss": 0.0359, "rewards/chosen": 5.837038675944011, "rewards/margins": 12.345006942749023, "rewards/rejected": -6.507968266805013, "step": 1791 }, { "epoch": 0.491160750993559, "grad_norm": 6.96875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -53935402.666666664, "logits/rejected": -34930513.06666667, "logps/chosen": -539.0223524305555, "logps/rejected": -579.10078125, "loss": 0.0213, "rewards/chosen": 8.24419911702474, "rewards/margins": 21.686575826009115, "rewards/rejected": -13.442376708984375, "step": 1792 }, { "epoch": 0.4914348362340688, "grad_norm": 9.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -13299793.0, "logits/rejected": 43039984.0, "logps/chosen": -510.55133056640625, "logps/rejected": -688.0726318359375, "loss": 0.0376, "rewards/chosen": 6.17347526550293, "rewards/margins": 17.769323348999023, "rewards/rejected": -11.595848083496094, "step": 1793 }, { "epoch": 0.4917089214745786, "grad_norm": 7.09375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -8955514.181818182, "logits/rejected": -20371619.692307692, "logps/chosen": -387.78346946022725, "logps/rejected": -542.4397911658654, "loss": 0.0375, "rewards/chosen": 6.8514321067116475, "rewards/margins": 15.744498512961648, "rewards/rejected": -8.89306640625, "step": 1794 }, { "epoch": 0.4919830067150884, "grad_norm": 4.9375, "kl": 6.049198150634766, "learning_rate": 5e-06, "logits/chosen": -20064402.666666668, "logits/rejected": -35968629.333333336, "logps/chosen": -456.1905110677083, "logps/rejected": -389.5896402994792, "loss": 0.0209, "rewards/chosen": 7.572961171468099, "rewards/margins": 15.456235249837238, "rewards/rejected": -7.883274078369141, "step": 1795 }, { "epoch": 0.4922570919555982, "grad_norm": 10.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -31097696.0, "logits/rejected": -35302218.666666664, "logps/chosen": -489.4864095052083, "logps/rejected": -548.312744140625, "loss": 0.0284, "rewards/chosen": 7.743176142374675, "rewards/margins": 16.94200897216797, "rewards/rejected": -9.198832829793295, "step": 1796 }, { "epoch": 0.492531177196108, "grad_norm": 7.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -32802934.4, "logits/rejected": -19181040.842105262, "logps/chosen": -374.7264892578125, "logps/rejected": -571.057462993421, "loss": 0.0498, "rewards/chosen": 4.924801635742187, "rewards/margins": 16.165395796926397, "rewards/rejected": -11.24059416118421, "step": 1797 }, { "epoch": 0.49280526243661776, "grad_norm": 3.640625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -7968988.8, "logits/rejected": 7498640.0, "logps/chosen": -387.3123372395833, "logps/rejected": -680.4517144097222, "loss": 0.014, "rewards/chosen": 6.274061075846354, "rewards/margins": 21.725726318359374, "rewards/rejected": -15.451665242513021, "step": 1798 }, { "epoch": 0.4930793476771276, "grad_norm": 2.421875, "kl": 3.057717800140381, "learning_rate": 5e-06, "logits/chosen": -25781888.0, "logits/rejected": -29800940.0, "logps/chosen": -564.26123046875, "logps/rejected": -495.64862060546875, "loss": 0.0059, "rewards/chosen": 7.116621017456055, "rewards/margins": 19.313379287719727, "rewards/rejected": -12.196758270263672, "step": 1799 }, { "epoch": 0.4933534329176374, "grad_norm": 4.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -22772544.0, "logits/rejected": -28937643.42857143, "logps/chosen": -543.90908203125, "logps/rejected": -584.10205078125, "loss": 0.0106, "rewards/chosen": 6.895449829101563, "rewards/margins": 16.569644165039062, "rewards/rejected": -9.6741943359375, "step": 1800 }, { "epoch": 0.4936275181581472, "grad_norm": 14.6875, "kl": 3.2846522331237793, "learning_rate": 5e-06, "logits/chosen": -39731185.777777776, "logits/rejected": -21724569.6, "logps/chosen": -345.7128634982639, "logps/rejected": -624.5638671875, "loss": 0.0218, "rewards/chosen": 5.8255157470703125, "rewards/margins": 17.99734903971354, "rewards/rejected": -12.17183329264323, "step": 1801 }, { "epoch": 0.49390160339865696, "grad_norm": 8.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -11421006.545454545, "logits/rejected": -22220199.384615384, "logps/chosen": -312.6653941761364, "logps/rejected": -539.0269681490385, "loss": 0.0476, "rewards/chosen": 4.285158677534624, "rewards/margins": 17.13784563291323, "rewards/rejected": -12.852686955378605, "step": 1802 }, { "epoch": 0.4941756886391668, "grad_norm": 15.5625, "kl": 5.812921047210693, "learning_rate": 5e-06, "logits/chosen": -19305181.866666667, "logits/rejected": -24361299.555555556, "logps/chosen": -375.70777994791666, "logps/rejected": -548.1682400173611, "loss": 0.0537, "rewards/chosen": 5.77024180094401, "rewards/margins": 17.155239698621962, "rewards/rejected": -11.384997897677952, "step": 1803 }, { "epoch": 0.4944497738796766, "grad_norm": 4.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -11050916.0, "logits/rejected": -16396470.0, "logps/chosen": -387.3955078125, "logps/rejected": -863.58935546875, "loss": 0.0137, "rewards/chosen": 5.3177032470703125, "rewards/margins": 20.287123680114746, "rewards/rejected": -14.969420433044434, "step": 1804 }, { "epoch": 0.4947238591201864, "grad_norm": 8.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -18526462.666666668, "logits/rejected": 16376365.333333334, "logps/chosen": -435.9605712890625, "logps/rejected": -731.5984700520834, "loss": 0.052, "rewards/chosen": 6.001984278361003, "rewards/margins": 20.137678146362305, "rewards/rejected": -14.135693868001303, "step": 1805 }, { "epoch": 0.49499794436069616, "grad_norm": 10.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -8786562.666666666, "logits/rejected": -13024266.666666666, "logps/chosen": -383.3684488932292, "logps/rejected": -669.991943359375, "loss": 0.0649, "rewards/chosen": 4.987113952636719, "rewards/margins": 15.555606842041016, "rewards/rejected": -10.568492889404297, "step": 1806 }, { "epoch": 0.495272029601206, "grad_norm": 13.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -18776821.333333332, "logits/rejected": -3645256.8, "logps/chosen": -324.6200358072917, "logps/rejected": -497.0045572916667, "loss": 0.0622, "rewards/chosen": 5.783656650119358, "rewards/margins": 15.637192620171442, "rewards/rejected": -9.853535970052084, "step": 1807 }, { "epoch": 0.4955461148417158, "grad_norm": 3.734375, "kl": 6.440056800842285, "learning_rate": 5e-06, "logits/chosen": -23790279.384615384, "logits/rejected": -27859159.272727273, "logps/chosen": -439.1162860576923, "logps/rejected": -411.94340376420456, "loss": 0.0125, "rewards/chosen": 6.818398695725661, "rewards/margins": 17.535454970139725, "rewards/rejected": -10.717056274414062, "step": 1808 }, { "epoch": 0.49582020008222555, "grad_norm": 8.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -14330466.909090908, "logits/rejected": -14577104.0, "logps/chosen": -322.2854669744318, "logps/rejected": -451.2936823918269, "loss": 0.0296, "rewards/chosen": 6.235917524857954, "rewards/margins": 14.114596653651525, "rewards/rejected": -7.87867912879357, "step": 1809 }, { "epoch": 0.49609428532273536, "grad_norm": 1.03125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -9922471.111111112, "logits/rejected": -10870819.2, "logps/chosen": -550.3799370659722, "logps/rejected": -557.6625, "loss": 0.0032, "rewards/chosen": 6.913510216606988, "rewards/margins": 16.32842517428928, "rewards/rejected": -9.414914957682292, "step": 1810 }, { "epoch": 0.4963683705632452, "grad_norm": 2.734375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -4732532.4, "logits/rejected": -28183858.285714287, "logps/chosen": -458.14384765625, "logps/rejected": -460.00791713169644, "loss": 0.0268, "rewards/chosen": 6.871623992919922, "rewards/margins": 15.24711172921317, "rewards/rejected": -8.375487736293248, "step": 1811 }, { "epoch": 0.496642455803755, "grad_norm": 7.84375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -20855200.0, "logits/rejected": -24575352.0, "logps/chosen": -459.9456787109375, "logps/rejected": -548.0222574869791, "loss": 0.0477, "rewards/chosen": 5.77895991007487, "rewards/margins": 18.87765884399414, "rewards/rejected": -13.098698933919271, "step": 1812 }, { "epoch": 0.49691654104426475, "grad_norm": 5.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -29765326.769230768, "logits/rejected": -23818696.727272727, "logps/chosen": -489.3439378004808, "logps/rejected": -600.7195933948864, "loss": 0.0174, "rewards/chosen": 7.148534334622896, "rewards/margins": 15.96160584563142, "rewards/rejected": -8.813071511008523, "step": 1813 }, { "epoch": 0.49719062628477456, "grad_norm": 3.515625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -14640888.615384616, "logits/rejected": -29845120.0, "logps/chosen": -329.1374323918269, "logps/rejected": -432.92041015625, "loss": 0.0343, "rewards/chosen": 7.258965125450721, "rewards/margins": 16.513166841093476, "rewards/rejected": -9.254201715642756, "step": 1814 }, { "epoch": 0.4974647115252844, "grad_norm": 1.3671875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -24367556.0, "logits/rejected": -19287996.0, "logps/chosen": -468.44464111328125, "logps/rejected": -712.5546875, "loss": 0.0042, "rewards/chosen": 6.6940789222717285, "rewards/margins": 19.19316816329956, "rewards/rejected": -12.499089241027832, "step": 1815 }, { "epoch": 0.4977387967657942, "grad_norm": 11.625, "kl": 1.2004716396331787, "learning_rate": 5e-06, "logits/chosen": -10769569.142857144, "logits/rejected": -22461260.8, "logps/chosen": -488.51607840401783, "logps/rejected": -621.862890625, "loss": 0.0554, "rewards/chosen": 6.690207890101841, "rewards/margins": 14.356658390590123, "rewards/rejected": -7.666450500488281, "step": 1816 }, { "epoch": 0.49801288200630395, "grad_norm": 8.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -8668510.0, "logits/rejected": -32930832.0, "logps/chosen": -418.108154296875, "logps/rejected": -550.1099853515625, "loss": 0.0533, "rewards/chosen": 5.765256881713867, "rewards/margins": 16.202472686767578, "rewards/rejected": -10.437215805053711, "step": 1817 }, { "epoch": 0.49828696724681376, "grad_norm": 1.578125, "kl": 1.611250638961792, "learning_rate": 5e-06, "logits/chosen": -19865902.545454547, "logits/rejected": -18625083.076923076, "logps/chosen": -488.65482954545456, "logps/rejected": -562.1663912259615, "loss": 0.005, "rewards/chosen": 6.486215764825994, "rewards/margins": 16.060727366200695, "rewards/rejected": -9.574511601374699, "step": 1818 }, { "epoch": 0.4985610524873236, "grad_norm": 1.09375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -20944004.0, "logits/rejected": -3264879.75, "logps/chosen": -443.02392578125, "logps/rejected": -600.4041748046875, "loss": 0.0034, "rewards/chosen": 6.1015753746032715, "rewards/margins": 17.85908079147339, "rewards/rejected": -11.757505416870117, "step": 1819 }, { "epoch": 0.49883513772783333, "grad_norm": 4.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -21839620.363636363, "logits/rejected": -16965442.46153846, "logps/chosen": -495.56547407670456, "logps/rejected": -526.857421875, "loss": 0.0337, "rewards/chosen": 6.907652421431108, "rewards/margins": 15.751348428792888, "rewards/rejected": -8.843696007361778, "step": 1820 }, { "epoch": 0.49910922296834315, "grad_norm": 6.125, "kl": 2.268970012664795, "learning_rate": 5e-06, "logits/chosen": -17550619.42857143, "logits/rejected": -15596236.8, "logps/chosen": -361.1845005580357, "logps/rejected": -401.243994140625, "loss": 0.0539, "rewards/chosen": 5.743246895926339, "rewards/margins": 14.230321175711495, "rewards/rejected": -8.487074279785157, "step": 1821 }, { "epoch": 0.49938330820885296, "grad_norm": 7.03125, "kl": 0.5063247680664062, "learning_rate": 5e-06, "logits/chosen": 6362903.2727272725, "logits/rejected": -6346172.307692308, "logps/chosen": -344.89262251420456, "logps/rejected": -632.1823167067307, "loss": 0.038, "rewards/chosen": 5.135928414084694, "rewards/margins": 16.576504820710294, "rewards/rejected": -11.4405764066256, "step": 1822 }, { "epoch": 0.4996573934493628, "grad_norm": 7.625, "kl": 1.1982269287109375, "learning_rate": 5e-06, "logits/chosen": -28694520.888888888, "logits/rejected": -29844565.333333332, "logps/chosen": -448.0064290364583, "logps/rejected": -449.56569010416666, "loss": 0.0278, "rewards/chosen": 6.236981709798177, "rewards/margins": 14.007654317220052, "rewards/rejected": -7.770672607421875, "step": 1823 }, { "epoch": 0.49993147868987253, "grad_norm": 6.40625, "kl": 0.3127174377441406, "learning_rate": 5e-06, "logits/chosen": -13294824.888888888, "logits/rejected": -15832359.466666667, "logps/chosen": -490.8831380208333, "logps/rejected": -418.34381510416665, "loss": 0.0248, "rewards/chosen": 6.886906517876519, "rewards/margins": 15.282404157850479, "rewards/rejected": -8.395497639973959, "step": 1824 }, { "epoch": 0.5002055639303824, "grad_norm": 12.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -2244525.0, "logits/rejected": 19706854.666666668, "logps/chosen": -415.6193033854167, "logps/rejected": -590.5792236328125, "loss": 0.0834, "rewards/chosen": 5.650423685709636, "rewards/margins": 20.31116485595703, "rewards/rejected": -14.660741170247396, "step": 1825 }, { "epoch": 0.5004796491708922, "grad_norm": 1.15625, "kl": 9.546701431274414, "learning_rate": 5e-06, "logits/chosen": -47386484.36363637, "logits/rejected": -10039006.153846154, "logps/chosen": -535.4002130681819, "logps/rejected": -604.6746544471154, "loss": 0.0031, "rewards/chosen": 7.414317737926137, "rewards/margins": 18.394547042313157, "rewards/rejected": -10.98022930438702, "step": 1826 }, { "epoch": 0.5007537344114019, "grad_norm": 5.90625, "kl": 2.253997802734375, "learning_rate": 5e-06, "logits/chosen": -7143191.2727272725, "logits/rejected": -27050880.0, "logps/chosen": -314.0930841619318, "logps/rejected": -633.3143780048077, "loss": 0.0238, "rewards/chosen": 5.48170956698331, "rewards/margins": 16.622956656075857, "rewards/rejected": -11.141247089092548, "step": 1827 }, { "epoch": 0.5010278196519118, "grad_norm": 5.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -16392288.0, "logits/rejected": -12536241.6, "logps/chosen": -583.5862165178571, "logps/rejected": -558.447802734375, "loss": 0.0171, "rewards/chosen": 7.005336216517857, "rewards/margins": 17.807517460414342, "rewards/rejected": -10.802181243896484, "step": 1828 }, { "epoch": 0.5013019048924215, "grad_norm": 8.0625, "kl": 4.29071569442749, "learning_rate": 5e-06, "logits/chosen": 4890479.333333333, "logits/rejected": -13329170.666666666, "logps/chosen": -426.7911376953125, "logps/rejected": -521.0794270833334, "loss": 0.0719, "rewards/chosen": 6.906386057535808, "rewards/margins": 14.318745930989584, "rewards/rejected": -7.412359873453776, "step": 1829 }, { "epoch": 0.5015759901329313, "grad_norm": 5.8125, "kl": 5.934690475463867, "learning_rate": 5e-06, "logits/chosen": -19689520.94117647, "logits/rejected": 8819801.714285715, "logps/chosen": -429.3717256433824, "logps/rejected": -534.5111607142857, "loss": 0.0254, "rewards/chosen": 7.7367383171530335, "rewards/margins": 16.239664189955768, "rewards/rejected": -8.502925872802734, "step": 1830 }, { "epoch": 0.5018500753734412, "grad_norm": 5.125, "kl": 1.1667487621307373, "learning_rate": 5e-06, "logits/chosen": -35719658.666666664, "logits/rejected": 31622680.0, "logps/chosen": -344.3074544270833, "logps/rejected": -448.3525797526042, "loss": 0.0239, "rewards/chosen": 6.1328074137369795, "rewards/margins": 15.52423604329427, "rewards/rejected": -9.391428629557291, "step": 1831 }, { "epoch": 0.5021241606139509, "grad_norm": 3.421875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 14899998.545454545, "logits/rejected": -15318818.461538462, "logps/chosen": -511.4967151988636, "logps/rejected": -527.5749323918269, "loss": 0.0257, "rewards/chosen": 7.173296841708097, "rewards/margins": 16.46032405399776, "rewards/rejected": -9.287027212289663, "step": 1832 }, { "epoch": 0.5023982458544607, "grad_norm": 8.75, "kl": 1.407135009765625, "learning_rate": 5e-06, "logits/chosen": -24930925.333333332, "logits/rejected": -23417162.666666668, "logps/chosen": -497.2250162760417, "logps/rejected": -557.2020670572916, "loss": 0.0258, "rewards/chosen": 7.69239616394043, "rewards/margins": 17.03074073791504, "rewards/rejected": -9.33834457397461, "step": 1833 }, { "epoch": 0.5026723310949706, "grad_norm": 10.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -6636776.0, "logits/rejected": 19669465.14285714, "logps/chosen": -513.41708984375, "logps/rejected": -584.7953055245536, "loss": 0.0338, "rewards/chosen": 7.859170532226562, "rewards/margins": 17.113591221400668, "rewards/rejected": -9.254420689174108, "step": 1834 }, { "epoch": 0.5029464163354803, "grad_norm": 10.5625, "kl": 7.7207560539245605, "learning_rate": 5e-06, "logits/chosen": -23108189.714285713, "logits/rejected": -25421233.6, "logps/chosen": -413.3676060267857, "logps/rejected": -318.79267578125, "loss": 0.0438, "rewards/chosen": 7.418357304164341, "rewards/margins": 13.943886021205357, "rewards/rejected": -6.5255287170410154, "step": 1835 }, { "epoch": 0.5032205015759902, "grad_norm": 12.875, "kl": 16.2327880859375, "learning_rate": 5e-06, "logits/chosen": -12490603.42857143, "logits/rejected": -5929892.4, "logps/chosen": -421.4756556919643, "logps/rejected": -540.295166015625, "loss": 0.1043, "rewards/chosen": 7.031656537737165, "rewards/margins": 17.331101880754744, "rewards/rejected": -10.299445343017577, "step": 1836 }, { "epoch": 0.5034945868165, "grad_norm": 7.65625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -43066429.09090909, "logits/rejected": -21591158.153846152, "logps/chosen": -327.3441051136364, "logps/rejected": -673.1984675480769, "loss": 0.0207, "rewards/chosen": 5.577524358575994, "rewards/margins": 18.77634168504835, "rewards/rejected": -13.198817326472355, "step": 1837 }, { "epoch": 0.5037686720570097, "grad_norm": 5.53125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -9317134.4, "logits/rejected": -4035559.1428571427, "logps/chosen": -290.25556640625, "logps/rejected": -327.90042550223217, "loss": 0.0303, "rewards/chosen": 5.409018707275391, "rewards/margins": 14.109584045410156, "rewards/rejected": -8.700565338134766, "step": 1838 }, { "epoch": 0.5040427572975196, "grad_norm": 7.46875, "kl": 6.564234733581543, "learning_rate": 5e-06, "logits/chosen": -16975453.866666667, "logits/rejected": -28283537.777777776, "logps/chosen": -426.565234375, "logps/rejected": -578.0475260416666, "loss": 0.0187, "rewards/chosen": 7.375102742513021, "rewards/margins": 16.98038330078125, "rewards/rejected": -9.605280558268229, "step": 1839 }, { "epoch": 0.5043168425380293, "grad_norm": 14.375, "kl": 0.9146105647087097, "learning_rate": 5e-06, "logits/chosen": 5948198.153846154, "logits/rejected": -15579632.0, "logps/chosen": -311.1503342848558, "logps/rejected": -407.27294921875, "loss": 0.0637, "rewards/chosen": 5.3130317101111775, "rewards/margins": 11.816049482438947, "rewards/rejected": -6.50301777232777, "step": 1840 }, { "epoch": 0.5045909277785391, "grad_norm": 12.1875, "kl": 7.996652603149414, "learning_rate": 5e-06, "logits/chosen": -8856080.666666666, "logits/rejected": -10440989.333333334, "logps/chosen": -432.0009765625, "logps/rejected": -641.4774169921875, "loss": 0.0385, "rewards/chosen": 6.493803024291992, "rewards/margins": 16.25209744771322, "rewards/rejected": -9.758294423421225, "step": 1841 }, { "epoch": 0.504865013019049, "grad_norm": 10.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -6377368.7272727275, "logits/rejected": -19187670.153846152, "logps/chosen": -388.5872247869318, "logps/rejected": -531.1043419471154, "loss": 0.0406, "rewards/chosen": 6.680498296564275, "rewards/margins": 15.271374949208505, "rewards/rejected": -8.59087665264423, "step": 1842 }, { "epoch": 0.5051390982595587, "grad_norm": 5.4375, "kl": 11.320798873901367, "learning_rate": 5e-06, "logits/chosen": -21605917.53846154, "logits/rejected": 12298122.181818182, "logps/chosen": -489.14002403846155, "logps/rejected": -511.2191051136364, "loss": 0.0457, "rewards/chosen": 7.774100083571214, "rewards/margins": 16.787499381112053, "rewards/rejected": -9.013399297540838, "step": 1843 }, { "epoch": 0.5054131835000685, "grad_norm": 4.28125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -17401188.923076924, "logits/rejected": 14707752.727272727, "logps/chosen": -474.62015474759613, "logps/rejected": -635.4600053267045, "loss": 0.0125, "rewards/chosen": 7.3022308349609375, "rewards/margins": 18.247997630726207, "rewards/rejected": -10.94576679576527, "step": 1844 }, { "epoch": 0.5056872687405783, "grad_norm": 5.40625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -12476951.272727273, "logits/rejected": -17058363.076923076, "logps/chosen": -442.06893643465907, "logps/rejected": -701.1896033653846, "loss": 0.0136, "rewards/chosen": 5.847994717684659, "rewards/margins": 19.407363144667833, "rewards/rejected": -13.559368426983173, "step": 1845 }, { "epoch": 0.5059613539810881, "grad_norm": 5.71875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -37980704.0, "logits/rejected": -19261977.6, "logps/chosen": -546.6724175347222, "logps/rejected": -561.6716145833333, "loss": 0.0302, "rewards/chosen": 6.490666283501519, "rewards/margins": 15.987691836886935, "rewards/rejected": -9.497025553385416, "step": 1846 }, { "epoch": 0.506235439221598, "grad_norm": 8.6875, "kl": 1.4716072082519531, "learning_rate": 5e-06, "logits/chosen": -13067552.0, "logits/rejected": 18799556.923076924, "logps/chosen": -441.9396306818182, "logps/rejected": -680.6442307692307, "loss": 0.0397, "rewards/chosen": 7.6706626198508525, "rewards/margins": 18.677318973141116, "rewards/rejected": -11.006656353290264, "step": 1847 }, { "epoch": 0.5065095244621077, "grad_norm": 3.109375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 5786814.4, "logits/rejected": -21739744.0, "logps/chosen": -410.9658203125, "logps/rejected": -488.5762416294643, "loss": 0.0084, "rewards/chosen": 5.985995101928711, "rewards/margins": 14.96154943193708, "rewards/rejected": -8.97555433000837, "step": 1848 }, { "epoch": 0.5067836097026175, "grad_norm": 4.625, "kl": 1.3896090984344482, "learning_rate": 5e-06, "logits/chosen": -13775640.0, "logits/rejected": -13976825.142857144, "logps/chosen": -523.57451171875, "logps/rejected": -486.962890625, "loss": 0.0252, "rewards/chosen": 8.149481964111327, "rewards/margins": 17.0084362574986, "rewards/rejected": -8.858954293387276, "step": 1849 }, { "epoch": 0.5070576949431274, "grad_norm": 5.71875, "kl": 3.7046284675598145, "learning_rate": 5e-06, "logits/chosen": -26023432.0, "logits/rejected": -7549308.5, "logps/chosen": -350.8645935058594, "logps/rejected": -305.64801025390625, "loss": 0.0538, "rewards/chosen": 6.324007987976074, "rewards/margins": 12.380768775939941, "rewards/rejected": -6.056760787963867, "step": 1850 }, { "epoch": 0.5073317801836371, "grad_norm": 2.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -31139502.545454547, "logits/rejected": -18866414.769230768, "logps/chosen": -493.5810546875, "logps/rejected": -394.91800631009613, "loss": 0.0071, "rewards/chosen": 8.179509943181818, "rewards/margins": 17.582653018978093, "rewards/rejected": -9.403143075796274, "step": 1851 }, { "epoch": 0.5076058654241469, "grad_norm": 4.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 2426065.8181818184, "logits/rejected": -27305331.692307692, "logps/chosen": -308.933837890625, "logps/rejected": -464.7083082932692, "loss": 0.0212, "rewards/chosen": 5.084782340309837, "rewards/margins": 13.309036068149378, "rewards/rejected": -8.224253727839542, "step": 1852 }, { "epoch": 0.5078799506646567, "grad_norm": 5.65625, "kl": 5.825991630554199, "learning_rate": 5e-06, "logits/chosen": 2317210.4, "logits/rejected": -9120069.714285715, "logps/chosen": -682.869677734375, "logps/rejected": -619.9910714285714, "loss": 0.0101, "rewards/chosen": 9.2813232421875, "rewards/margins": 23.153476388113837, "rewards/rejected": -13.872153145926339, "step": 1853 }, { "epoch": 0.5081540359051665, "grad_norm": 4.40625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -24541768.727272727, "logits/rejected": -29532908.307692308, "logps/chosen": -340.01027610085225, "logps/rejected": -621.6083984375, "loss": 0.0316, "rewards/chosen": 5.113174091685902, "rewards/margins": 17.774576867377007, "rewards/rejected": -12.661402775691105, "step": 1854 }, { "epoch": 0.5084281211456763, "grad_norm": 4.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 1346028.6666666667, "logits/rejected": -18049666.133333333, "logps/chosen": -387.9413248697917, "logps/rejected": -490.11474609375, "loss": 0.0312, "rewards/chosen": 5.639376322428386, "rewards/margins": 16.667349751790365, "rewards/rejected": -11.02797342936198, "step": 1855 }, { "epoch": 0.5087022063861861, "grad_norm": 4.375, "kl": 4.291277885437012, "learning_rate": 5e-06, "logits/chosen": -10434742.545454545, "logits/rejected": -19137699.692307692, "logps/chosen": -386.36399147727275, "logps/rejected": -451.5416917067308, "loss": 0.0231, "rewards/chosen": 6.99334023215554, "rewards/margins": 15.965567475432284, "rewards/rejected": -8.972227243276743, "step": 1856 }, { "epoch": 0.5089762916266959, "grad_norm": 7.5, "kl": 4.054513454437256, "learning_rate": 5e-06, "logits/chosen": -16110304.0, "logits/rejected": -15190083.692307692, "logps/chosen": -493.87801846590907, "logps/rejected": -483.08642578125, "loss": 0.0248, "rewards/chosen": 6.855175365101207, "rewards/margins": 16.926067699085582, "rewards/rejected": -10.070892333984375, "step": 1857 }, { "epoch": 0.5092503768672058, "grad_norm": 11.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -29680240.0, "logits/rejected": -13522218.0, "logps/chosen": -397.03765869140625, "logps/rejected": -555.6334228515625, "loss": 0.0412, "rewards/chosen": 5.07309627532959, "rewards/margins": 13.647006034851074, "rewards/rejected": -8.573909759521484, "step": 1858 }, { "epoch": 0.5095244621077155, "grad_norm": 5.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -22144178.666666668, "logits/rejected": -30529136.0, "logps/chosen": -485.1957194010417, "logps/rejected": -408.407470703125, "loss": 0.0228, "rewards/chosen": 7.819050470987956, "rewards/margins": 18.65289878845215, "rewards/rejected": -10.833848317464193, "step": 1859 }, { "epoch": 0.5097985473482253, "grad_norm": 5.25, "kl": 3.126948118209839, "learning_rate": 5e-06, "logits/chosen": -13707560.0, "logits/rejected": -13848467.2, "logps/chosen": -496.69029017857144, "logps/rejected": -555.46171875, "loss": 0.0185, "rewards/chosen": 7.6481203351702005, "rewards/margins": 18.640830448695592, "rewards/rejected": -10.99271011352539, "step": 1860 }, { "epoch": 0.5100726325887351, "grad_norm": 4.5625, "kl": 2.2093162536621094, "learning_rate": 5e-06, "logits/chosen": -36180757.333333336, "logits/rejected": -7962110.666666667, "logps/chosen": -384.9470621744792, "logps/rejected": -673.7259114583334, "loss": 0.0504, "rewards/chosen": 6.589188893636067, "rewards/margins": 19.64163335164388, "rewards/rejected": -13.052444458007812, "step": 1861 }, { "epoch": 0.5103467178292449, "grad_norm": 10.1875, "kl": 10.615885734558105, "learning_rate": 5e-06, "logits/chosen": -12669504.0, "logits/rejected": -29217835.636363637, "logps/chosen": -433.02982271634613, "logps/rejected": -521.3946200284091, "loss": 0.0404, "rewards/chosen": 7.235099792480469, "rewards/margins": 17.50708215886896, "rewards/rejected": -10.271982366388494, "step": 1862 }, { "epoch": 0.5106208030697547, "grad_norm": 7.65625, "kl": 0.06767463684082031, "learning_rate": 5e-06, "logits/chosen": -196577.6, "logits/rejected": -4355159.714285715, "logps/chosen": -465.3033203125, "logps/rejected": -381.4557407924107, "loss": 0.0396, "rewards/chosen": 7.2331291198730465, "rewards/margins": 14.528933824811663, "rewards/rejected": -7.295804704938616, "step": 1863 }, { "epoch": 0.5108948883102645, "grad_norm": 2.234375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -24114486.0, "logits/rejected": -29719798.0, "logps/chosen": -408.23822021484375, "logps/rejected": -499.03668212890625, "loss": 0.0087, "rewards/chosen": 7.062049865722656, "rewards/margins": 17.779264450073242, "rewards/rejected": -10.717214584350586, "step": 1864 }, { "epoch": 0.5111689735507743, "grad_norm": 1.9609375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -22327145.14285714, "logits/rejected": -4800770.823529412, "logps/chosen": -444.88424246651783, "logps/rejected": -502.23902803308823, "loss": 0.0034, "rewards/chosen": 8.982838221958705, "rewards/margins": 19.19243115336955, "rewards/rejected": -10.209592931410846, "step": 1865 }, { "epoch": 0.511443058791284, "grad_norm": 1.2109375, "kl": 0.3737386167049408, "learning_rate": 5e-06, "logits/chosen": -40184822.4, "logits/rejected": -23760438.85714286, "logps/chosen": -383.3470458984375, "logps/rejected": -593.2956194196429, "loss": 0.0042, "rewards/chosen": 6.650343322753907, "rewards/margins": 17.944960021972655, "rewards/rejected": -11.29461669921875, "step": 1866 }, { "epoch": 0.5117171440317939, "grad_norm": 3.359375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -9492217.454545455, "logits/rejected": -29253252.923076924, "logps/chosen": -335.85391512784093, "logps/rejected": -512.2162710336538, "loss": 0.0317, "rewards/chosen": 6.361948186700994, "rewards/margins": 18.119322876830203, "rewards/rejected": -11.757374690129208, "step": 1867 }, { "epoch": 0.5119912292723037, "grad_norm": 11.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -9996129.6, "logits/rejected": -19254921.14285714, "logps/chosen": -356.5142333984375, "logps/rejected": -480.8333217075893, "loss": 0.0451, "rewards/chosen": 5.988904571533203, "rewards/margins": 17.89064995901925, "rewards/rejected": -11.901745387486049, "step": 1868 }, { "epoch": 0.5122653145128134, "grad_norm": 6.59375, "kl": 1.5992724895477295, "learning_rate": 5e-06, "logits/chosen": -10774525.333333334, "logits/rejected": -19462570.666666668, "logps/chosen": -372.4517415364583, "logps/rejected": -401.4600423177083, "loss": 0.0322, "rewards/chosen": 6.548372268676758, "rewards/margins": 14.996452967325846, "rewards/rejected": -8.448080698649088, "step": 1869 }, { "epoch": 0.5125393997533233, "grad_norm": 8.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -24890174.222222224, "logits/rejected": -9863232.0, "logps/chosen": -316.618896484375, "logps/rejected": -554.4638671875, "loss": 0.0549, "rewards/chosen": 4.672776116265191, "rewards/margins": 14.876859368218316, "rewards/rejected": -10.204083251953126, "step": 1870 }, { "epoch": 0.5128134849938331, "grad_norm": 10.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -6290909.090909091, "logits/rejected": -16708454.153846154, "logps/chosen": -417.93701171875, "logps/rejected": -439.56798377403845, "loss": 0.0292, "rewards/chosen": 4.836382779208097, "rewards/margins": 14.376426536720118, "rewards/rejected": -9.54004375751202, "step": 1871 }, { "epoch": 0.5130875702343429, "grad_norm": 6.71875, "kl": 0.11554718017578125, "learning_rate": 5e-06, "logits/chosen": -23324204.307692308, "logits/rejected": -16367937.454545455, "logps/chosen": -418.36117788461536, "logps/rejected": -413.5341796875, "loss": 0.0515, "rewards/chosen": 6.923530578613281, "rewards/margins": 15.94886502352628, "rewards/rejected": -9.025334444912998, "step": 1872 }, { "epoch": 0.5133616554748527, "grad_norm": 5.09375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -11474756.0, "logits/rejected": 9831116.57142857, "logps/chosen": -361.42431640625, "logps/rejected": -579.5355050223214, "loss": 0.0215, "rewards/chosen": 6.149270629882812, "rewards/margins": 17.621534511021203, "rewards/rejected": -11.472263881138392, "step": 1873 }, { "epoch": 0.5136357407153624, "grad_norm": 13.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -8456848.0, "logits/rejected": 11262454.4, "logps/chosen": -364.95458984375, "logps/rejected": -454.610302734375, "loss": 0.0802, "rewards/chosen": 4.635556357247489, "rewards/margins": 12.359119742257255, "rewards/rejected": -7.723563385009766, "step": 1874 }, { "epoch": 0.5139098259558723, "grad_norm": 9.25, "kl": 1.9262620210647583, "learning_rate": 5e-06, "logits/chosen": -23867776.0, "logits/rejected": -11089408.0, "logps/chosen": -532.9763055098684, "logps/rejected": -351.346044921875, "loss": 0.0434, "rewards/chosen": 6.3782605622944075, "rewards/margins": 10.843373248451634, "rewards/rejected": -4.465112686157227, "step": 1875 }, { "epoch": 0.5141839111963821, "grad_norm": 5.53125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -11570776.615384616, "logits/rejected": -13851825.454545455, "logps/chosen": -451.7795973557692, "logps/rejected": -573.1290838068181, "loss": 0.0286, "rewards/chosen": 5.472148014948918, "rewards/margins": 14.107252107633578, "rewards/rejected": -8.635104092684658, "step": 1876 }, { "epoch": 0.5144579964368918, "grad_norm": 6.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 21883036.444444444, "logits/rejected": 4446074.666666667, "logps/chosen": -549.728515625, "logps/rejected": -756.2244140625, "loss": 0.0106, "rewards/chosen": 8.419158087836372, "rewards/margins": 22.26202918158637, "rewards/rejected": -13.84287109375, "step": 1877 }, { "epoch": 0.5147320816774017, "grad_norm": 5.75, "kl": 5.933967590332031, "learning_rate": 5e-06, "logits/chosen": -30933358.545454547, "logits/rejected": -30025144.615384616, "logps/chosen": -472.5095880681818, "logps/rejected": -627.7722355769231, "loss": 0.0189, "rewards/chosen": 7.3738181374289775, "rewards/margins": 18.589600889832823, "rewards/rejected": -11.215782752403847, "step": 1878 }, { "epoch": 0.5150061669179115, "grad_norm": 7.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -9517795.076923076, "logits/rejected": -20263906.90909091, "logps/chosen": -473.28695913461536, "logps/rejected": -557.4214311079545, "loss": 0.0187, "rewards/chosen": 6.013870826134315, "rewards/margins": 22.04481287602778, "rewards/rejected": -16.030942049893465, "step": 1879 }, { "epoch": 0.5152802521584212, "grad_norm": 7.34375, "kl": 2.782278537750244, "learning_rate": 5e-06, "logits/chosen": -35362485.333333336, "logits/rejected": -27677058.133333333, "logps/chosen": -417.2151692708333, "logps/rejected": -470.187109375, "loss": 0.0129, "rewards/chosen": 7.338962131076389, "rewards/margins": 16.576948377821182, "rewards/rejected": -9.237986246744791, "step": 1880 }, { "epoch": 0.5155543373989311, "grad_norm": 7.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -19535282.90909091, "logits/rejected": -24124553.846153848, "logps/chosen": -393.5009765625, "logps/rejected": -555.7692683293269, "loss": 0.0195, "rewards/chosen": 6.298825350674716, "rewards/margins": 19.943094853754644, "rewards/rejected": -13.644269503079927, "step": 1881 }, { "epoch": 0.5158284226394408, "grad_norm": 6.9375, "kl": 0.29446539282798767, "learning_rate": 5e-06, "logits/chosen": -33452550.4, "logits/rejected": -26904429.714285713, "logps/chosen": -445.69482421875, "logps/rejected": -479.6420200892857, "loss": 0.054, "rewards/chosen": 4.327595138549805, "rewards/margins": 11.77233216421945, "rewards/rejected": -7.444737025669643, "step": 1882 }, { "epoch": 0.5161025078799507, "grad_norm": 1.21875, "kl": 0.8824717402458191, "learning_rate": 5e-06, "logits/chosen": -4706523.692307692, "logits/rejected": -28326045.09090909, "logps/chosen": -485.96292818509613, "logps/rejected": -601.6827503551136, "loss": 0.0041, "rewards/chosen": 6.929058955265925, "rewards/margins": 21.600723106544333, "rewards/rejected": -14.671664151278408, "step": 1883 }, { "epoch": 0.5163765931204605, "grad_norm": 12.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 920874.9090909091, "logits/rejected": -14946050.461538462, "logps/chosen": -275.7028142755682, "logps/rejected": -519.90771484375, "loss": 0.0607, "rewards/chosen": 5.1031060652299365, "rewards/margins": 13.798762474860347, "rewards/rejected": -8.69565640963041, "step": 1884 }, { "epoch": 0.5166506783609702, "grad_norm": 10.125, "kl": 5.090258598327637, "learning_rate": 5e-06, "logits/chosen": -16397608.0, "logits/rejected": 5057922.0, "logps/chosen": -382.042724609375, "logps/rejected": -639.0416259765625, "loss": 0.0678, "rewards/chosen": 6.9650492668151855, "rewards/margins": 15.941833972930908, "rewards/rejected": -8.976784706115723, "step": 1885 }, { "epoch": 0.5169247636014801, "grad_norm": 6.1875, "kl": 1.1638858318328857, "learning_rate": 5e-06, "logits/chosen": -20332306.46153846, "logits/rejected": -20958282.181818184, "logps/chosen": -345.8996394230769, "logps/rejected": -717.759765625, "loss": 0.0599, "rewards/chosen": 4.58966064453125, "rewards/margins": 19.013531771573156, "rewards/rejected": -14.423871127041904, "step": 1886 }, { "epoch": 0.5171988488419899, "grad_norm": 4.0625, "kl": 2.4165968894958496, "learning_rate": 5e-06, "logits/chosen": -5599796.0, "logits/rejected": -21969350.4, "logps/chosen": -490.3152553013393, "logps/rejected": -550.84326171875, "loss": 0.011, "rewards/chosen": 6.665384565080915, "rewards/margins": 17.901951490129743, "rewards/rejected": -11.236566925048828, "step": 1887 }, { "epoch": 0.5174729340824996, "grad_norm": 7.21875, "kl": 2.7020556926727295, "learning_rate": 5e-06, "logits/chosen": -35967717.81818182, "logits/rejected": -6111399.384615385, "logps/chosen": -306.97017045454544, "logps/rejected": -490.77576622596155, "loss": 0.0368, "rewards/chosen": 5.951403531161222, "rewards/margins": 14.242986078862543, "rewards/rejected": -8.291582547701323, "step": 1888 }, { "epoch": 0.5177470193230095, "grad_norm": 4.375, "kl": 3.43537974357605, "learning_rate": 5e-06, "logits/chosen": -22748465.230769232, "logits/rejected": 5871506.909090909, "logps/chosen": -346.5784254807692, "logps/rejected": -565.7806729403409, "loss": 0.0356, "rewards/chosen": 6.065809396597055, "rewards/margins": 18.55947838629876, "rewards/rejected": -12.493668989701705, "step": 1889 }, { "epoch": 0.5180211045635192, "grad_norm": 16.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -26216538.666666668, "logits/rejected": 3147421.3333333335, "logps/chosen": -365.5530598958333, "logps/rejected": -619.2451985677084, "loss": 0.054, "rewards/chosen": 6.075915018717448, "rewards/margins": 17.12962595621745, "rewards/rejected": -11.0537109375, "step": 1890 }, { "epoch": 0.518295189804029, "grad_norm": 7.75, "kl": 3.3984744548797607, "learning_rate": 5e-06, "logits/chosen": -16677794.666666666, "logits/rejected": -25121280.0, "logps/chosen": -381.7273356119792, "logps/rejected": -561.4903971354166, "loss": 0.0361, "rewards/chosen": 5.727361679077148, "rewards/margins": 16.480399449666344, "rewards/rejected": -10.753037770589193, "step": 1891 }, { "epoch": 0.5185692750445389, "grad_norm": 3.46875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -14567098.666666666, "logits/rejected": -41658442.666666664, "logps/chosen": -372.1287841796875, "logps/rejected": -457.3775227864583, "loss": 0.0165, "rewards/chosen": 6.148770650227864, "rewards/margins": 13.212933222452799, "rewards/rejected": -7.064162572224935, "step": 1892 }, { "epoch": 0.5188433602850486, "grad_norm": 5.84375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -23222789.818181816, "logits/rejected": -15932787.692307692, "logps/chosen": -330.8794611150568, "logps/rejected": -511.9949293870192, "loss": 0.0216, "rewards/chosen": 5.588517622514204, "rewards/margins": 14.948146153163243, "rewards/rejected": -9.359628530649038, "step": 1893 }, { "epoch": 0.5191174455255585, "grad_norm": 10.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -23355680.0, "logits/rejected": -12955002.666666666, "logps/chosen": -390.064697265625, "logps/rejected": -699.907470703125, "loss": 0.0486, "rewards/chosen": 5.150793393452962, "rewards/margins": 14.431588172912598, "rewards/rejected": -9.280794779459635, "step": 1894 }, { "epoch": 0.5193915307660683, "grad_norm": 4.125, "kl": 5.222983360290527, "learning_rate": 5e-06, "logits/chosen": -28341582.933333334, "logits/rejected": 30767872.0, "logps/chosen": -465.3265625, "logps/rejected": -619.8627387152778, "loss": 0.0113, "rewards/chosen": 6.150596110026042, "rewards/margins": 24.383075629340276, "rewards/rejected": -18.232479519314236, "step": 1895 }, { "epoch": 0.519665616006578, "grad_norm": 1.3515625, "kl": 2.327498197555542, "learning_rate": 5e-06, "logits/chosen": -15522684.57142857, "logits/rejected": 19005848.0, "logps/chosen": -496.7705078125, "logps/rejected": -562.9626953125, "loss": 0.004, "rewards/chosen": 8.366912841796875, "rewards/margins": 21.338568115234374, "rewards/rejected": -12.9716552734375, "step": 1896 }, { "epoch": 0.5199397012470879, "grad_norm": 2.984375, "kl": 3.4733095169067383, "learning_rate": 5e-06, "logits/chosen": -25662582.85714286, "logits/rejected": -13304514.4, "logps/chosen": -484.7562779017857, "logps/rejected": -407.0048095703125, "loss": 0.0094, "rewards/chosen": 7.347662789481027, "rewards/margins": 15.879676491873607, "rewards/rejected": -8.532013702392579, "step": 1897 }, { "epoch": 0.5202137864875976, "grad_norm": 3.84375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -18621313.230769232, "logits/rejected": -31088221.09090909, "logps/chosen": -414.8876953125, "logps/rejected": -517.861328125, "loss": 0.0404, "rewards/chosen": 7.7758636474609375, "rewards/margins": 17.05133195356889, "rewards/rejected": -9.275468306107955, "step": 1898 }, { "epoch": 0.5204878717281074, "grad_norm": 6.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -5492266.5, "logits/rejected": 15640038.0, "logps/chosen": -306.3226318359375, "logps/rejected": -474.39373779296875, "loss": 0.033, "rewards/chosen": 4.177394866943359, "rewards/margins": 15.207413673400879, "rewards/rejected": -11.03001880645752, "step": 1899 }, { "epoch": 0.5207619569686173, "grad_norm": 5.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 3305295.6666666665, "logits/rejected": 11371114.666666666, "logps/chosen": -381.8365885416667, "logps/rejected": -598.28173828125, "loss": 0.0596, "rewards/chosen": 5.81479008992513, "rewards/margins": 16.378529866536457, "rewards/rejected": -10.563739776611328, "step": 1900 }, { "epoch": 0.521036042209127, "grad_norm": 1.1640625, "kl": 3.0917296409606934, "learning_rate": 5e-06, "logits/chosen": -13124094.4, "logits/rejected": -28532601.14285714, "logps/chosen": -458.81748046875, "logps/rejected": -670.4451032366071, "loss": 0.0035, "rewards/chosen": 8.215762329101562, "rewards/margins": 18.724253845214843, "rewards/rejected": -10.508491516113281, "step": 1901 }, { "epoch": 0.5213101274496368, "grad_norm": 4.625, "kl": 3.601628065109253, "learning_rate": 5e-06, "logits/chosen": -29730502.85714286, "logits/rejected": -54440435.2, "logps/chosen": -499.4915248325893, "logps/rejected": -514.359130859375, "loss": 0.0136, "rewards/chosen": 7.848878043038504, "rewards/margins": 18.167118399483815, "rewards/rejected": -10.318240356445312, "step": 1902 }, { "epoch": 0.5215842126901467, "grad_norm": 1.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 14197552.0, "logits/rejected": -37051234.28571428, "logps/chosen": -466.6361328125, "logps/rejected": -501.86097935267856, "loss": 0.0051, "rewards/chosen": 6.280129241943359, "rewards/margins": 14.516395241873603, "rewards/rejected": -8.236265999930245, "step": 1903 }, { "epoch": 0.5218582979306564, "grad_norm": 3.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -21656476.0, "logits/rejected": -27574797.333333332, "logps/chosen": -472.9267578125, "logps/rejected": -805.6966959635416, "loss": 0.0116, "rewards/chosen": 7.136797587076823, "rewards/margins": 24.64711634318034, "rewards/rejected": -17.510318756103516, "step": 1904 }, { "epoch": 0.5221323831711663, "grad_norm": 4.34375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -19279808.0, "logits/rejected": -15402080.0, "logps/chosen": -488.46591796875, "logps/rejected": -455.00962611607144, "loss": 0.0106, "rewards/chosen": 6.445304107666016, "rewards/margins": 16.390562111990793, "rewards/rejected": -9.945258004324776, "step": 1905 }, { "epoch": 0.522406468411676, "grad_norm": 7.40625, "kl": 5.653787136077881, "learning_rate": 5e-06, "logits/chosen": -17539414.0, "logits/rejected": -53599880.0, "logps/chosen": -320.7252197265625, "logps/rejected": -653.7774047851562, "loss": 0.0393, "rewards/chosen": 5.6380791664123535, "rewards/margins": 19.029886722564697, "rewards/rejected": -13.391807556152344, "step": 1906 }, { "epoch": 0.5226805536521858, "grad_norm": 5.8125, "kl": 1.1112544536590576, "learning_rate": 5e-06, "logits/chosen": -7801216.0, "logits/rejected": -19331365.818181816, "logps/chosen": -487.8351862980769, "logps/rejected": -719.3346946022727, "loss": 0.031, "rewards/chosen": 7.580371563251202, "rewards/margins": 20.28796877560916, "rewards/rejected": -12.707597212357955, "step": 1907 }, { "epoch": 0.5229546388926957, "grad_norm": 4.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 58382460.44444445, "logits/rejected": 12896291.2, "logps/chosen": -592.8708767361111, "logps/rejected": -417.68828125, "loss": 0.0178, "rewards/chosen": 6.623093075222439, "rewards/margins": 15.845030127631293, "rewards/rejected": -9.221937052408855, "step": 1908 }, { "epoch": 0.5232287241332054, "grad_norm": 5.40625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -25105072.0, "logits/rejected": -29630294.85714286, "logps/chosen": -459.02587890625, "logps/rejected": -428.25174386160717, "loss": 0.0285, "rewards/chosen": 4.965755844116211, "rewards/margins": 15.362646756853376, "rewards/rejected": -10.396890912737165, "step": 1909 }, { "epoch": 0.5235028093737152, "grad_norm": 4.03125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -6964464.0, "logits/rejected": -7611987.636363637, "logps/chosen": -335.3508112980769, "logps/rejected": -612.77734375, "loss": 0.0228, "rewards/chosen": 5.282379150390625, "rewards/margins": 16.125749067826703, "rewards/rejected": -10.84336991743608, "step": 1910 }, { "epoch": 0.523776894614225, "grad_norm": 13.125, "kl": 11.4205961227417, "learning_rate": 5e-06, "logits/chosen": -20914045.866666667, "logits/rejected": -25641429.333333332, "logps/chosen": -414.65087890625, "logps/rejected": -450.92803276909723, "loss": 0.0646, "rewards/chosen": 7.178647867838541, "rewards/margins": 18.031497192382812, "rewards/rejected": -10.852849324544271, "step": 1911 }, { "epoch": 0.5240509798547348, "grad_norm": 8.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -11062888.0, "logits/rejected": -8875125.0, "logps/chosen": -503.30072021484375, "logps/rejected": -577.806640625, "loss": 0.0353, "rewards/chosen": 6.1350321769714355, "rewards/margins": 15.135215282440186, "rewards/rejected": -9.00018310546875, "step": 1912 }, { "epoch": 0.5243250650952446, "grad_norm": 7.46875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -73087013.33333333, "logits/rejected": -33477482.666666668, "logps/chosen": -408.7508544921875, "logps/rejected": -621.7999267578125, "loss": 0.0456, "rewards/chosen": 5.904436747233073, "rewards/margins": 20.747337341308594, "rewards/rejected": -14.842900594075521, "step": 1913 }, { "epoch": 0.5245991503357544, "grad_norm": 2.734375, "kl": 0.38163504004478455, "learning_rate": 5e-06, "logits/chosen": 2442800.727272727, "logits/rejected": -31625309.53846154, "logps/chosen": -555.0970348011364, "logps/rejected": -637.1727764423077, "loss": 0.0052, "rewards/chosen": 7.717886491255327, "rewards/margins": 20.172212240579245, "rewards/rejected": -12.454325749323917, "step": 1914 }, { "epoch": 0.5248732355762642, "grad_norm": 6.34375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 1268871.5555555555, "logits/rejected": -6796360.0, "logps/chosen": -393.11962890625, "logps/rejected": -488.2543619791667, "loss": 0.0391, "rewards/chosen": 7.170398288302952, "rewards/margins": 15.955574883355036, "rewards/rejected": -8.785176595052084, "step": 1915 }, { "epoch": 0.5251473208167741, "grad_norm": 5.75, "kl": 1.0884017944335938, "learning_rate": 5e-06, "logits/chosen": -19784699.636363637, "logits/rejected": -23876049.230769232, "logps/chosen": -468.9749644886364, "logps/rejected": -575.9841871995193, "loss": 0.0321, "rewards/chosen": 6.497587724165483, "rewards/margins": 16.8198333953644, "rewards/rejected": -10.322245671198917, "step": 1916 }, { "epoch": 0.5254214060572838, "grad_norm": 5.59375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -11869862.545454545, "logits/rejected": -6168193.230769231, "logps/chosen": -434.3722034801136, "logps/rejected": -497.6319110576923, "loss": 0.0155, "rewards/chosen": 7.442346746271307, "rewards/margins": 17.917126315457004, "rewards/rejected": -10.474779569185698, "step": 1917 }, { "epoch": 0.5256954912977936, "grad_norm": 3.6875, "kl": 2.708667755126953, "learning_rate": 5e-06, "logits/chosen": -20165664.0, "logits/rejected": -49794850.461538464, "logps/chosen": -413.93559126420456, "logps/rejected": -709.1010366586538, "loss": 0.0114, "rewards/chosen": 6.214147394353693, "rewards/margins": 19.989187067205258, "rewards/rejected": -13.775039672851562, "step": 1918 }, { "epoch": 0.5259695765383035, "grad_norm": 11.1875, "kl": 1.8729941844940186, "learning_rate": 5e-06, "logits/chosen": -11441124.57142857, "logits/rejected": 106726496.0, "logps/chosen": -477.22666713169644, "logps/rejected": -681.40947265625, "loss": 0.0323, "rewards/chosen": 6.967890058244977, "rewards/margins": 29.13556583949498, "rewards/rejected": -22.16767578125, "step": 1919 }, { "epoch": 0.5262436617788132, "grad_norm": 7.90625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -8745707.692307692, "logits/rejected": -20061040.0, "logps/chosen": -327.0386493389423, "logps/rejected": -431.38623046875, "loss": 0.0507, "rewards/chosen": 5.541495689978967, "rewards/margins": 14.874469516994235, "rewards/rejected": -9.33297382701527, "step": 1920 }, { "epoch": 0.526517747019323, "grad_norm": 8.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -41799416.88888889, "logits/rejected": -16172872.533333333, "logps/chosen": -529.3423394097222, "logps/rejected": -563.8809895833333, "loss": 0.0356, "rewards/chosen": 7.070701599121094, "rewards/margins": 17.671947733561197, "rewards/rejected": -10.601246134440105, "step": 1921 }, { "epoch": 0.5267918322598328, "grad_norm": 8.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -28969390.545454547, "logits/rejected": 7503457.230769231, "logps/chosen": -517.2676669034091, "logps/rejected": -492.0358698918269, "loss": 0.0281, "rewards/chosen": 6.236834439364347, "rewards/margins": 18.676611520193674, "rewards/rejected": -12.439777080829327, "step": 1922 }, { "epoch": 0.5270659175003426, "grad_norm": 2.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -70145952.0, "logits/rejected": 27549692.444444444, "logps/chosen": -501.6156412760417, "logps/rejected": -504.2727864583333, "loss": 0.0049, "rewards/chosen": 7.216528574625651, "rewards/margins": 17.471658918592667, "rewards/rejected": -10.255130343967014, "step": 1923 }, { "epoch": 0.5273400027408524, "grad_norm": 1.9765625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -38176669.333333336, "logits/rejected": -4263320.888888889, "logps/chosen": -536.963623046875, "logps/rejected": -556.6449110243055, "loss": 0.0051, "rewards/chosen": 8.188591639200846, "rewards/margins": 20.428456412421333, "rewards/rejected": -12.239864773220486, "step": 1924 }, { "epoch": 0.5276140879813622, "grad_norm": 2.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -11025019.076923076, "logits/rejected": -30439653.818181816, "logps/chosen": -375.61177884615387, "logps/rejected": -425.6194513494318, "loss": 0.0098, "rewards/chosen": 6.99001957820012, "rewards/margins": 16.946147091738826, "rewards/rejected": -9.956127513538707, "step": 1925 }, { "epoch": 0.527888173221872, "grad_norm": 4.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -14120872.533333333, "logits/rejected": 26543475.555555556, "logps/chosen": -360.65869140625, "logps/rejected": -561.5819769965278, "loss": 0.0122, "rewards/chosen": 5.283993530273437, "rewards/margins": 16.960387674967446, "rewards/rejected": -11.67639414469401, "step": 1926 }, { "epoch": 0.5281622584623819, "grad_norm": 7.90625, "kl": 4.2393622398376465, "learning_rate": 5e-06, "logits/chosen": -22117465.333333332, "logits/rejected": -10250964.666666666, "logps/chosen": -420.5400797526042, "logps/rejected": -457.1145833333333, "loss": 0.0671, "rewards/chosen": 5.436605453491211, "rewards/margins": 13.630580266316732, "rewards/rejected": -8.193974812825521, "step": 1927 }, { "epoch": 0.5284363437028916, "grad_norm": 11.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -24334280.727272727, "logits/rejected": -14612368.0, "logps/chosen": -483.17928799715907, "logps/rejected": -389.4661207932692, "loss": 0.0542, "rewards/chosen": 5.655465906316584, "rewards/margins": 13.956480733164542, "rewards/rejected": -8.301014826847958, "step": 1928 }, { "epoch": 0.5287104289434014, "grad_norm": 9.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -31178488.0, "logits/rejected": -9482198.0, "logps/chosen": -400.1265869140625, "logps/rejected": -618.1717529296875, "loss": 0.0576, "rewards/chosen": 6.874124526977539, "rewards/margins": 21.47116152445475, "rewards/rejected": -14.597036997477213, "step": 1929 }, { "epoch": 0.5289845141839112, "grad_norm": 6.46875, "kl": 2.061540126800537, "learning_rate": 5e-06, "logits/chosen": -10773184.0, "logits/rejected": -56181456.0, "logps/chosen": -427.9606119791667, "logps/rejected": -528.2788899739584, "loss": 0.0187, "rewards/chosen": 5.620031992594401, "rewards/margins": 20.075814565022785, "rewards/rejected": -14.455782572428385, "step": 1930 }, { "epoch": 0.529258599424421, "grad_norm": 7.40625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -10368522.4, "logits/rejected": -8080509.714285715, "logps/chosen": -411.1603515625, "logps/rejected": -658.4390345982143, "loss": 0.0156, "rewards/chosen": 6.576564788818359, "rewards/margins": 16.68648910522461, "rewards/rejected": -10.10992431640625, "step": 1931 }, { "epoch": 0.5295326846649308, "grad_norm": 6.625, "kl": 1.5108928680419922, "learning_rate": 5e-06, "logits/chosen": -17895856.0, "logits/rejected": -14099390.76923077, "logps/chosen": -493.08860085227275, "logps/rejected": -492.0447340745192, "loss": 0.0227, "rewards/chosen": 7.391075827858665, "rewards/margins": 17.477111923111067, "rewards/rejected": -10.086036095252403, "step": 1932 }, { "epoch": 0.5298067699054406, "grad_norm": 2.609375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -30319116.0, "logits/rejected": -22291616.0, "logps/chosen": -552.658203125, "logps/rejected": -538.4310913085938, "loss": 0.0059, "rewards/chosen": 5.961715221405029, "rewards/margins": 17.51321840286255, "rewards/rejected": -11.55150318145752, "step": 1933 }, { "epoch": 0.5300808551459504, "grad_norm": 3.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -64222900.0, "logits/rejected": -5946120.0, "logps/chosen": -445.2442321777344, "logps/rejected": -400.1105651855469, "loss": 0.0115, "rewards/chosen": 7.6532392501831055, "rewards/margins": 15.924601554870605, "rewards/rejected": -8.2713623046875, "step": 1934 }, { "epoch": 0.5303549403864601, "grad_norm": 7.96875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -97563.57142857143, "logits/rejected": -26494393.6, "logps/chosen": -423.52601841517856, "logps/rejected": -506.59521484375, "loss": 0.0497, "rewards/chosen": 5.208802359444754, "rewards/margins": 15.333851187569753, "rewards/rejected": -10.125048828125, "step": 1935 }, { "epoch": 0.53062902562697, "grad_norm": 5.90625, "kl": 3.0983316898345947, "learning_rate": 5e-06, "logits/chosen": -29262356.363636363, "logits/rejected": 3105154.153846154, "logps/chosen": -446.00319602272725, "logps/rejected": -586.9841496394231, "loss": 0.0185, "rewards/chosen": 7.324712579900568, "rewards/margins": 18.87095492703098, "rewards/rejected": -11.54624234713041, "step": 1936 }, { "epoch": 0.5309031108674798, "grad_norm": 8.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -28840556.8, "logits/rejected": -28762845.714285713, "logps/chosen": -470.2947265625, "logps/rejected": -499.3191615513393, "loss": 0.0173, "rewards/chosen": 8.194895172119141, "rewards/margins": 19.851217106410438, "rewards/rejected": -11.656321934291295, "step": 1937 }, { "epoch": 0.5311771961079896, "grad_norm": 7.21875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -16683504.0, "logits/rejected": -25992532.57142857, "logps/chosen": -472.56455078125, "logps/rejected": -474.30594308035717, "loss": 0.0361, "rewards/chosen": 6.691064453125, "rewards/margins": 17.55208740234375, "rewards/rejected": -10.86102294921875, "step": 1938 }, { "epoch": 0.5314512813484994, "grad_norm": 6.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -20632753.066666666, "logits/rejected": -19657960.888888888, "logps/chosen": -456.39752604166665, "logps/rejected": -756.5608723958334, "loss": 0.0344, "rewards/chosen": 6.146915181477865, "rewards/margins": 23.124688212076823, "rewards/rejected": -16.977773030598957, "step": 1939 }, { "epoch": 0.5317253665890092, "grad_norm": 11.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 2770749.8333333335, "logits/rejected": -12642730.666666666, "logps/chosen": -344.0498046875, "logps/rejected": -422.1036376953125, "loss": 0.0752, "rewards/chosen": 4.468405087788899, "rewards/margins": 14.569931983947754, "rewards/rejected": -10.101526896158854, "step": 1940 }, { "epoch": 0.531999451829519, "grad_norm": 11.6875, "kl": 3.52075457572937, "learning_rate": 5e-06, "logits/chosen": 2609126.0, "logits/rejected": -15036250.666666666, "logps/chosen": -499.1206461588542, "logps/rejected": -804.7942708333334, "loss": 0.0451, "rewards/chosen": 7.43520991007487, "rewards/margins": 20.70528793334961, "rewards/rejected": -13.27007802327474, "step": 1941 }, { "epoch": 0.5322735370700288, "grad_norm": 4.84375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -13955456.0, "logits/rejected": -27073744.0, "logps/chosen": -457.776953125, "logps/rejected": -535.7732282366071, "loss": 0.0302, "rewards/chosen": 6.433529663085937, "rewards/margins": 18.112530517578126, "rewards/rejected": -11.679000854492188, "step": 1942 }, { "epoch": 0.5325476223105385, "grad_norm": 4.59375, "kl": 2.151803493499756, "learning_rate": 5e-06, "logits/chosen": -22342408.533333335, "logits/rejected": -13536631.111111112, "logps/chosen": -347.60406901041665, "logps/rejected": -602.9264865451389, "loss": 0.0187, "rewards/chosen": 5.834704081217448, "rewards/margins": 16.14789326985677, "rewards/rejected": -10.313189188639322, "step": 1943 }, { "epoch": 0.5328217075510484, "grad_norm": 10.375, "kl": 6.313737392425537, "learning_rate": 5e-06, "logits/chosen": -2434903.5, "logits/rejected": -17463016.0, "logps/chosen": -466.65289306640625, "logps/rejected": -357.4850158691406, "loss": 0.0244, "rewards/chosen": 7.675724983215332, "rewards/margins": 18.322507858276367, "rewards/rejected": -10.646782875061035, "step": 1944 }, { "epoch": 0.5330957927915582, "grad_norm": 11.0625, "kl": 11.716662406921387, "learning_rate": 5e-06, "logits/chosen": -10787420.235294119, "logits/rejected": -15083069.714285715, "logps/chosen": -480.22144990808823, "logps/rejected": -665.3152204241071, "loss": 0.0738, "rewards/chosen": 6.8426379035500915, "rewards/margins": 19.173914933405, "rewards/rejected": -12.331277029854911, "step": 1945 }, { "epoch": 0.5333698780320679, "grad_norm": 7.78125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -9322467.692307692, "logits/rejected": -34686609.45454545, "logps/chosen": -488.49962439903845, "logps/rejected": -637.0181551846591, "loss": 0.0327, "rewards/chosen": 6.7118999774639425, "rewards/margins": 19.84977007245684, "rewards/rejected": -13.137870094992898, "step": 1946 }, { "epoch": 0.5336439632725778, "grad_norm": 3.59375, "kl": 0.0745900496840477, "learning_rate": 5e-06, "logits/chosen": -17357123.2, "logits/rejected": -19952384.0, "logps/chosen": -333.0138671875, "logps/rejected": -529.2732979910714, "loss": 0.0195, "rewards/chosen": 5.966352844238282, "rewards/margins": 16.46481606619699, "rewards/rejected": -10.498463221958705, "step": 1947 }, { "epoch": 0.5339180485130876, "grad_norm": 4.6875, "kl": 4.096301078796387, "learning_rate": 5e-06, "logits/chosen": -18110118.666666668, "logits/rejected": -24199002.666666668, "logps/chosen": -491.6464029947917, "logps/rejected": -588.3450520833334, "loss": 0.0288, "rewards/chosen": 7.316490809122722, "rewards/margins": 18.301964441935223, "rewards/rejected": -10.9854736328125, "step": 1948 }, { "epoch": 0.5341921337535974, "grad_norm": 5.71875, "kl": 4.816805839538574, "learning_rate": 5e-06, "logits/chosen": -17770659.555555556, "logits/rejected": -12079312.0, "logps/chosen": -459.4990234375, "logps/rejected": -807.9728190104166, "loss": 0.0212, "rewards/chosen": 6.193811204698351, "rewards/margins": 24.112455156114365, "rewards/rejected": -17.918643951416016, "step": 1949 }, { "epoch": 0.5344662189941072, "grad_norm": 9.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -25580068.923076924, "logits/rejected": -5554976.363636363, "logps/chosen": -368.9768629807692, "logps/rejected": -425.81569602272725, "loss": 0.0545, "rewards/chosen": 4.349890488844651, "rewards/margins": 13.241449049302748, "rewards/rejected": -8.891558560458096, "step": 1950 }, { "epoch": 0.5347403042346169, "grad_norm": 15.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -45346528.0, "logits/rejected": -1767925.2222222222, "logps/chosen": -262.2280680338542, "logps/rejected": -539.8885091145834, "loss": 0.0307, "rewards/chosen": 5.899993896484375, "rewards/margins": 15.31266360812717, "rewards/rejected": -9.412669711642796, "step": 1951 }, { "epoch": 0.5350143894751268, "grad_norm": 12.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -11023790.545454545, "logits/rejected": -14269948.307692308, "logps/chosen": -325.12031693892044, "logps/rejected": -559.7769681490385, "loss": 0.0883, "rewards/chosen": 4.952693939208984, "rewards/margins": 17.034349588247444, "rewards/rejected": -12.081655649038462, "step": 1952 }, { "epoch": 0.5352884747156366, "grad_norm": 6.53125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -17112700.0, "logits/rejected": -14826806.666666666, "logps/chosen": -454.5099690755208, "logps/rejected": -511.2619222005208, "loss": 0.0209, "rewards/chosen": 7.5735626220703125, "rewards/margins": 17.384615580240883, "rewards/rejected": -9.811052958170572, "step": 1953 }, { "epoch": 0.5355625599561463, "grad_norm": 16.625, "kl": 7.780580043792725, "learning_rate": 5e-06, "logits/chosen": -16542333.866666667, "logits/rejected": -28814944.0, "logps/chosen": -360.0992838541667, "logps/rejected": -593.0552300347222, "loss": 0.0927, "rewards/chosen": 5.3006739298502605, "rewards/margins": 19.115872192382813, "rewards/rejected": -13.815198262532553, "step": 1954 }, { "epoch": 0.5358366451966562, "grad_norm": 7.21875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -15391741.714285715, "logits/rejected": -4872432.8, "logps/chosen": -370.77587890625, "logps/rejected": -434.694921875, "loss": 0.0224, "rewards/chosen": 5.378004891531808, "rewards/margins": 12.909436471121651, "rewards/rejected": -7.531431579589844, "step": 1955 }, { "epoch": 0.536110730437166, "grad_norm": 13.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -29384226.666666668, "logits/rejected": 192702.66666666666, "logps/chosen": -373.0771484375, "logps/rejected": -751.8419596354166, "loss": 0.047, "rewards/chosen": 8.282699584960938, "rewards/margins": 21.722654978434242, "rewards/rejected": -13.439955393473307, "step": 1956 }, { "epoch": 0.5363848156776757, "grad_norm": 9.0625, "kl": 0.36511868238449097, "learning_rate": 5e-06, "logits/chosen": -22393760.0, "logits/rejected": 152436.46153846153, "logps/chosen": -510.8670543323864, "logps/rejected": -721.4514723557693, "loss": 0.0253, "rewards/chosen": 7.67048436945135, "rewards/margins": 23.693743458994618, "rewards/rejected": -16.02325908954327, "step": 1957 }, { "epoch": 0.5366589009181856, "grad_norm": 4.375, "kl": 2.56642484664917, "learning_rate": 5e-06, "logits/chosen": -19478350.933333334, "logits/rejected": -36351690.666666664, "logps/chosen": -405.8378580729167, "logps/rejected": -428.1500651041667, "loss": 0.027, "rewards/chosen": 6.3802032470703125, "rewards/margins": 15.22413804796007, "rewards/rejected": -8.843934800889757, "step": 1958 }, { "epoch": 0.5369329861586953, "grad_norm": 3.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -8575765.333333334, "logits/rejected": -25319496.533333335, "logps/chosen": -368.2090115017361, "logps/rejected": -476.01263020833335, "loss": 0.0081, "rewards/chosen": 5.872817569308811, "rewards/margins": 15.252132754855687, "rewards/rejected": -9.379315185546876, "step": 1959 }, { "epoch": 0.5372070713992052, "grad_norm": 6.90625, "kl": 1.8429229259490967, "learning_rate": 5e-06, "logits/chosen": -1284690.1818181819, "logits/rejected": -15751155.692307692, "logps/chosen": -451.19802024147725, "logps/rejected": -373.9114332932692, "loss": 0.0369, "rewards/chosen": 6.247333873401988, "rewards/margins": 14.818925764177228, "rewards/rejected": -8.57159189077524, "step": 1960 }, { "epoch": 0.537481156639715, "grad_norm": 13.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -40356059.07692308, "logits/rejected": -21112907.636363637, "logps/chosen": -524.7218299278846, "logps/rejected": -496.6012073863636, "loss": 0.0348, "rewards/chosen": 7.799296452448918, "rewards/margins": 18.420734218784144, "rewards/rejected": -10.621437766335227, "step": 1961 }, { "epoch": 0.5377552418802247, "grad_norm": 11.0, "kl": 7.162234306335449, "learning_rate": 5e-06, "logits/chosen": -36175218.666666664, "logits/rejected": -24737245.333333332, "logps/chosen": -452.5108642578125, "logps/rejected": -549.0267740885416, "loss": 0.0468, "rewards/chosen": 6.665901819864909, "rewards/margins": 18.578575770060223, "rewards/rejected": -11.912673950195312, "step": 1962 }, { "epoch": 0.5380293271207346, "grad_norm": 2.984375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -21129294.769230768, "logits/rejected": -32269352.727272727, "logps/chosen": -402.28564453125, "logps/rejected": -435.9655095880682, "loss": 0.0091, "rewards/chosen": 6.376314016488882, "rewards/margins": 14.574995934546411, "rewards/rejected": -8.19868191805753, "step": 1963 }, { "epoch": 0.5383034123612443, "grad_norm": 7.03125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -11381751.384615384, "logits/rejected": -7538855.2727272725, "logps/chosen": -333.07970252403845, "logps/rejected": -594.9216086647727, "loss": 0.0314, "rewards/chosen": 5.624699519230769, "rewards/margins": 19.437643411276223, "rewards/rejected": -13.812943892045455, "step": 1964 }, { "epoch": 0.5385774976017541, "grad_norm": 5.0, "kl": 1.2152055501937866, "learning_rate": 5e-06, "logits/chosen": 9866650.666666666, "logits/rejected": -31132634.666666668, "logps/chosen": -477.2919921875, "logps/rejected": -482.5303548177083, "loss": 0.0212, "rewards/chosen": 6.407098134358724, "rewards/margins": 17.715740203857422, "rewards/rejected": -11.308642069498697, "step": 1965 }, { "epoch": 0.538851582842264, "grad_norm": 6.4375, "kl": 0.7388496398925781, "learning_rate": 5e-06, "logits/chosen": -21367293.714285713, "logits/rejected": -12239399.2, "logps/chosen": -320.51621791294644, "logps/rejected": -622.8015625, "loss": 0.0247, "rewards/chosen": 5.26696286882673, "rewards/margins": 19.26608396257673, "rewards/rejected": -13.99912109375, "step": 1966 }, { "epoch": 0.5391256680827737, "grad_norm": 1.7578125, "kl": 2.1680362224578857, "learning_rate": 5e-06, "logits/chosen": -40532710.4, "logits/rejected": 2849061.4285714286, "logps/chosen": -467.892724609375, "logps/rejected": -481.31417410714283, "loss": 0.006, "rewards/chosen": 6.896630096435547, "rewards/margins": 14.979842267717633, "rewards/rejected": -8.083212171282087, "step": 1967 }, { "epoch": 0.5393997533232835, "grad_norm": 5.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -23783382.153846152, "logits/rejected": 18659864.727272727, "logps/chosen": -418.6029522235577, "logps/rejected": -606.7215909090909, "loss": 0.0373, "rewards/chosen": 6.234024634728065, "rewards/margins": 15.815543328131827, "rewards/rejected": -9.581518693403764, "step": 1968 }, { "epoch": 0.5396738385637934, "grad_norm": 4.4375, "kl": 1.3812065124511719, "learning_rate": 5e-06, "logits/chosen": -47369002.666666664, "logits/rejected": -20478418.666666668, "logps/chosen": -379.4788818359375, "logps/rejected": -658.6854654947916, "loss": 0.0333, "rewards/chosen": 5.3607133229573565, "rewards/margins": 19.109718958536785, "rewards/rejected": -13.749005635579428, "step": 1969 }, { "epoch": 0.5399479238043031, "grad_norm": 8.4375, "kl": 6.053230285644531, "learning_rate": 5e-06, "logits/chosen": -27791926.153846152, "logits/rejected": -29703060.363636363, "logps/chosen": -360.7521784855769, "logps/rejected": -623.2814719460227, "loss": 0.056, "rewards/chosen": 6.214083158052885, "rewards/margins": 20.021564403613965, "rewards/rejected": -13.80748124556108, "step": 1970 }, { "epoch": 0.540222009044813, "grad_norm": 5.125, "kl": 0.6773898005485535, "learning_rate": 5e-06, "logits/chosen": 22986106.181818184, "logits/rejected": -23007069.53846154, "logps/chosen": -389.48251065340907, "logps/rejected": -547.5648287259615, "loss": 0.0172, "rewards/chosen": 5.322064139626243, "rewards/margins": 17.380810477516867, "rewards/rejected": -12.058746337890625, "step": 1971 }, { "epoch": 0.5404960942853227, "grad_norm": 2.078125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -9449712.0, "logits/rejected": -15105815.272727273, "logps/chosen": -453.7942457932692, "logps/rejected": -706.8162286931819, "loss": 0.0098, "rewards/chosen": 7.13252199613131, "rewards/margins": 20.995398194639833, "rewards/rejected": -13.862876198508523, "step": 1972 }, { "epoch": 0.5407701795258325, "grad_norm": 9.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -7216758.153846154, "logits/rejected": -21023970.90909091, "logps/chosen": -274.46762319711536, "logps/rejected": -503.69429154829544, "loss": 0.0707, "rewards/chosen": 5.892066368689904, "rewards/margins": 16.26732992959189, "rewards/rejected": -10.375263560901988, "step": 1973 }, { "epoch": 0.5410442647663424, "grad_norm": 10.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -27627069.09090909, "logits/rejected": 5981154.461538462, "logps/chosen": -419.23317649147725, "logps/rejected": -548.1864483173077, "loss": 0.0443, "rewards/chosen": 5.88132754239169, "rewards/margins": 15.616172043593615, "rewards/rejected": -9.734844501201923, "step": 1974 }, { "epoch": 0.5413183500068521, "grad_norm": 6.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -36841524.36363637, "logits/rejected": 17826752.0, "logps/chosen": -500.71342329545456, "logps/rejected": -546.9864783653846, "loss": 0.0147, "rewards/chosen": 7.060117548162287, "rewards/margins": 16.811374637630436, "rewards/rejected": -9.75125708946815, "step": 1975 }, { "epoch": 0.5415924352473619, "grad_norm": 6.03125, "kl": 1.3003578186035156, "learning_rate": 5e-06, "logits/chosen": -12279453.538461538, "logits/rejected": -35798190.54545455, "logps/chosen": -446.8046123798077, "logps/rejected": -557.5252574573864, "loss": 0.023, "rewards/chosen": 6.842075054462139, "rewards/margins": 17.88008677876079, "rewards/rejected": -11.03801172429865, "step": 1976 }, { "epoch": 0.5418665204878718, "grad_norm": 6.09375, "kl": 2.4943742752075195, "learning_rate": 5e-06, "logits/chosen": -12445145.6, "logits/rejected": -22604787.555555556, "logps/chosen": -353.2705403645833, "logps/rejected": -546.7139756944445, "loss": 0.0894, "rewards/chosen": 5.2796071370442705, "rewards/margins": 15.434792412651909, "rewards/rejected": -10.15518527560764, "step": 1977 }, { "epoch": 0.5421406057283815, "grad_norm": 6.90625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -16530350.857142856, "logits/rejected": 4020416.0, "logps/chosen": -453.28421456473217, "logps/rejected": -452.41259765625, "loss": 0.0179, "rewards/chosen": 6.58710697719029, "rewards/margins": 16.03478306361607, "rewards/rejected": -9.447676086425782, "step": 1978 }, { "epoch": 0.5424146909688913, "grad_norm": 3.921875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -22152110.769230768, "logits/rejected": -3269194.909090909, "logps/chosen": -485.97964242788464, "logps/rejected": -476.9859730113636, "loss": 0.0162, "rewards/chosen": 5.829129732572115, "rewards/margins": 16.611796025629644, "rewards/rejected": -10.78266629305753, "step": 1979 }, { "epoch": 0.5426887762094011, "grad_norm": 0.1123046875, "kl": 0.35151419043540955, "learning_rate": 5e-06, "logits/chosen": -18356870.666666668, "logits/rejected": -36956442.666666664, "logps/chosen": -398.2459309895833, "logps/rejected": -637.340087890625, "loss": 0.0003, "rewards/chosen": 8.705663681030273, "rewards/margins": 21.358545303344727, "rewards/rejected": -12.652881622314453, "step": 1980 }, { "epoch": 0.5429628614499109, "grad_norm": 1.265625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -34858764.8, "logits/rejected": -28057394.285714287, "logps/chosen": -360.54296875, "logps/rejected": -565.1248953683036, "loss": 0.0052, "rewards/chosen": 7.013519287109375, "rewards/margins": 17.836090087890625, "rewards/rejected": -10.82257080078125, "step": 1981 }, { "epoch": 0.5432369466904208, "grad_norm": 8.3125, "kl": 0.17630133032798767, "learning_rate": 5e-06, "logits/chosen": -20594290.90909091, "logits/rejected": -18691708.307692308, "logps/chosen": -545.8963955965909, "logps/rejected": -511.82534555288464, "loss": 0.0483, "rewards/chosen": 7.334488608620384, "rewards/margins": 16.91841077471113, "rewards/rejected": -9.583922166090746, "step": 1982 }, { "epoch": 0.5435110319309305, "grad_norm": 6.09375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -7958555.636363637, "logits/rejected": -35194158.76923077, "logps/chosen": -424.89102450284093, "logps/rejected": -511.74658203125, "loss": 0.0138, "rewards/chosen": 6.107039711692116, "rewards/margins": 19.03239958436339, "rewards/rejected": -12.925359872671274, "step": 1983 }, { "epoch": 0.5437851171714403, "grad_norm": 4.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -24852425.846153848, "logits/rejected": -12955815.272727273, "logps/chosen": -378.79916616586536, "logps/rejected": -704.1368075284091, "loss": 0.0153, "rewards/chosen": 6.468044574444111, "rewards/margins": 18.95840795556982, "rewards/rejected": -12.49036338112571, "step": 1984 }, { "epoch": 0.5440592024119502, "grad_norm": 5.9375, "kl": 9.188819885253906, "learning_rate": 5e-06, "logits/chosen": -13647424.0, "logits/rejected": 72262636.8, "logps/chosen": -400.63058035714283, "logps/rejected": -543.56875, "loss": 0.0523, "rewards/chosen": 6.1377378191266745, "rewards/margins": 18.45145775931222, "rewards/rejected": -12.313719940185546, "step": 1985 }, { "epoch": 0.5443332876524599, "grad_norm": 5.96875, "kl": 6.900744438171387, "learning_rate": 5e-06, "logits/chosen": -34747333.333333336, "logits/rejected": -11934705.333333334, "logps/chosen": -473.878173828125, "logps/rejected": -595.72265625, "loss": 0.0217, "rewards/chosen": 7.693979263305664, "rewards/margins": 18.66578229268392, "rewards/rejected": -10.971803029378256, "step": 1986 }, { "epoch": 0.5446073728929697, "grad_norm": 9.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -33546022.4, "logits/rejected": -29638073.14285714, "logps/chosen": -390.997705078125, "logps/rejected": -566.0798688616071, "loss": 0.0331, "rewards/chosen": 5.299746704101563, "rewards/margins": 16.321087210518975, "rewards/rejected": -11.021340506417411, "step": 1987 }, { "epoch": 0.5448814581334795, "grad_norm": 6.5625, "kl": 0.05865923687815666, "learning_rate": 5e-06, "logits/chosen": -9310103.2, "logits/rejected": -24667369.14285714, "logps/chosen": -361.86455078125, "logps/rejected": -651.2667410714286, "loss": 0.0355, "rewards/chosen": 6.918270874023437, "rewards/margins": 19.15074506487165, "rewards/rejected": -12.232474190848214, "step": 1988 }, { "epoch": 0.5451555433739893, "grad_norm": 9.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -20576426.0, "logits/rejected": -31373104.0, "logps/chosen": -421.66552734375, "logps/rejected": -472.98565673828125, "loss": 0.0588, "rewards/chosen": 6.403536319732666, "rewards/margins": 16.577624797821045, "rewards/rejected": -10.174088478088379, "step": 1989 }, { "epoch": 0.5454296286144991, "grad_norm": 6.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -20569653.333333332, "logits/rejected": -8977031.466666667, "logps/chosen": -411.32318793402777, "logps/rejected": -477.9525390625, "loss": 0.0233, "rewards/chosen": 5.814556545681423, "rewards/margins": 13.878995429144965, "rewards/rejected": -8.064438883463541, "step": 1990 }, { "epoch": 0.5457037138550089, "grad_norm": 12.4375, "kl": 3.1332836151123047, "learning_rate": 5e-06, "logits/chosen": -24891572.0, "logits/rejected": -27789712.0, "logps/chosen": -382.2705078125, "logps/rejected": -617.1807861328125, "loss": 0.0455, "rewards/chosen": 5.094465255737305, "rewards/margins": 21.263795852661133, "rewards/rejected": -16.169330596923828, "step": 1991 }, { "epoch": 0.5459777990955187, "grad_norm": 4.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -28403353.14285714, "logits/rejected": -28005522.82352941, "logps/chosen": -304.4720982142857, "logps/rejected": -530.8615004595588, "loss": 0.0065, "rewards/chosen": 7.455472128731864, "rewards/margins": 19.077648355179473, "rewards/rejected": -11.62217622644761, "step": 1992 }, { "epoch": 0.5462518843360286, "grad_norm": 7.34375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -440609.5, "logits/rejected": -20984838.0, "logps/chosen": -441.7107849121094, "logps/rejected": -491.2418212890625, "loss": 0.0166, "rewards/chosen": 6.9824323654174805, "rewards/margins": 18.423850059509277, "rewards/rejected": -11.441417694091797, "step": 1993 }, { "epoch": 0.5465259695765383, "grad_norm": 11.0, "kl": 3.999202251434326, "learning_rate": 5e-06, "logits/chosen": -5941496.0, "logits/rejected": -24071390.4, "logps/chosen": -448.23228236607144, "logps/rejected": -404.8801513671875, "loss": 0.0358, "rewards/chosen": 7.237390790666852, "rewards/margins": 15.328091321672712, "rewards/rejected": -8.09070053100586, "step": 1994 }, { "epoch": 0.5468000548170481, "grad_norm": 6.46875, "kl": 4.966086387634277, "learning_rate": 5e-06, "logits/chosen": -24066023.272727273, "logits/rejected": -24364711.384615384, "logps/chosen": -440.66170987215907, "logps/rejected": -506.0388371394231, "loss": 0.0179, "rewards/chosen": 7.449176441539418, "rewards/margins": 17.440627998405404, "rewards/rejected": -9.991451556865986, "step": 1995 }, { "epoch": 0.547074140057558, "grad_norm": 8.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -11662485.818181818, "logits/rejected": -16985289.846153848, "logps/chosen": -391.37935014204544, "logps/rejected": -596.7431640625, "loss": 0.0665, "rewards/chosen": 5.484852183948863, "rewards/margins": 20.324636072545616, "rewards/rejected": -14.839783888596754, "step": 1996 }, { "epoch": 0.5473482252980677, "grad_norm": 7.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -1351426.6666666667, "logits/rejected": 6659866.133333334, "logps/chosen": -288.8828125, "logps/rejected": -631.4895182291667, "loss": 0.0344, "rewards/chosen": 6.364362080891927, "rewards/margins": 16.854852803548177, "rewards/rejected": -10.49049072265625, "step": 1997 }, { "epoch": 0.5476223105385775, "grad_norm": 3.65625, "kl": 1.9465503692626953, "learning_rate": 5e-06, "logits/chosen": -40519776.0, "logits/rejected": -36212230.85714286, "logps/chosen": -380.105908203125, "logps/rejected": -549.4310477120536, "loss": 0.024, "rewards/chosen": 6.667947387695312, "rewards/margins": 19.87635192871094, "rewards/rejected": -13.208404541015625, "step": 1998 }, { "epoch": 0.5478963957790873, "grad_norm": 8.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 6837592.533333333, "logits/rejected": -33303793.777777776, "logps/chosen": -413.36399739583334, "logps/rejected": -518.0972764756945, "loss": 0.0462, "rewards/chosen": 5.534806315104166, "rewards/margins": 14.876236131456164, "rewards/rejected": -9.341429816351997, "step": 1999 }, { "epoch": 0.5481704810195971, "grad_norm": 12.4375, "kl": 3.919995069503784, "learning_rate": 5e-06, "logits/chosen": 21176918.153846152, "logits/rejected": -24603816.727272727, "logps/chosen": -388.97171724759613, "logps/rejected": -407.07328657670456, "loss": 0.0379, "rewards/chosen": 6.078194251427283, "rewards/margins": 14.486630419751148, "rewards/rejected": -8.408436168323863, "step": 2000 }, { "epoch": 0.5484445662601068, "grad_norm": 8.5, "kl": 2.1002821922302246, "learning_rate": 5e-06, "logits/chosen": -22748801.88235294, "logits/rejected": -8577352.57142857, "logps/chosen": -419.35765165441177, "logps/rejected": -586.9787946428571, "loss": 0.0304, "rewards/chosen": 6.951938853544347, "rewards/margins": 16.81557977500082, "rewards/rejected": -9.863640921456474, "step": 2001 }, { "epoch": 0.5487186515006167, "grad_norm": 3.71875, "kl": 4.389316558837891, "learning_rate": 5e-06, "logits/chosen": -14304073.142857144, "logits/rejected": -33064582.4, "logps/chosen": -508.11251395089283, "logps/rejected": -597.62529296875, "loss": 0.014, "rewards/chosen": 7.789364950997489, "rewards/margins": 21.08175997052874, "rewards/rejected": -13.29239501953125, "step": 2002 }, { "epoch": 0.5489927367411265, "grad_norm": 7.65625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -24377016.727272727, "logits/rejected": -21745846.153846152, "logps/chosen": -594.9388760653409, "logps/rejected": -512.9338566706731, "loss": 0.0375, "rewards/chosen": 6.956851612437855, "rewards/margins": 20.297070963399392, "rewards/rejected": -13.340219350961538, "step": 2003 }, { "epoch": 0.5492668219816363, "grad_norm": 14.3125, "kl": 12.712512969970703, "learning_rate": 5e-06, "logits/chosen": -41771628.307692304, "logits/rejected": -28309992.727272727, "logps/chosen": -453.0762469951923, "logps/rejected": -446.5817205255682, "loss": 0.1082, "rewards/chosen": 7.46902583195613, "rewards/margins": 17.120673919891143, "rewards/rejected": -9.651648087935014, "step": 2004 }, { "epoch": 0.5495409072221461, "grad_norm": 5.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -28413352.0, "logits/rejected": -29819712.0, "logps/chosen": -526.0605061848959, "logps/rejected": -534.8649088541666, "loss": 0.0211, "rewards/chosen": 7.336350758870442, "rewards/margins": 18.21841557820638, "rewards/rejected": -10.882064819335938, "step": 2005 }, { "epoch": 0.5498149924626559, "grad_norm": 5.65625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -29634304.0, "logits/rejected": -17759052.0, "logps/chosen": -474.9342956542969, "logps/rejected": -510.7363586425781, "loss": 0.0172, "rewards/chosen": 7.7972307205200195, "rewards/margins": 18.031460762023926, "rewards/rejected": -10.234230041503906, "step": 2006 }, { "epoch": 0.5500890777031657, "grad_norm": 2.328125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -43483120.0, "logits/rejected": -11455236.0, "logps/chosen": -513.032470703125, "logps/rejected": -562.996337890625, "loss": 0.0064, "rewards/chosen": 6.053489685058594, "rewards/margins": 16.30825901031494, "rewards/rejected": -10.254769325256348, "step": 2007 }, { "epoch": 0.5503631629436755, "grad_norm": 9.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 6623742.545454546, "logits/rejected": -11147616.0, "logps/chosen": -376.6277521306818, "logps/rejected": -580.9522986778846, "loss": 0.0254, "rewards/chosen": 6.469759160822088, "rewards/margins": 18.07069957172954, "rewards/rejected": -11.600940410907452, "step": 2008 }, { "epoch": 0.5506372481841852, "grad_norm": 12.6875, "kl": 17.819456100463867, "learning_rate": 5e-06, "logits/chosen": -24232265.14285714, "logits/rejected": -24391321.6, "logps/chosen": -604.0933314732143, "logps/rejected": -676.730712890625, "loss": 0.036, "rewards/chosen": 7.188533782958984, "rewards/margins": 19.157642364501953, "rewards/rejected": -11.969108581542969, "step": 2009 }, { "epoch": 0.5509113334246951, "grad_norm": 3.375, "kl": 3.971749782562256, "learning_rate": 5e-06, "logits/chosen": -17507332.363636363, "logits/rejected": -15790077.538461538, "logps/chosen": -454.14501953125, "logps/rejected": -440.35648287259613, "loss": 0.0503, "rewards/chosen": 5.758055253462358, "rewards/margins": 14.230697418426299, "rewards/rejected": -8.472642164963942, "step": 2010 }, { "epoch": 0.5511854186652049, "grad_norm": 2.3125, "kl": 1.3431930541992188, "learning_rate": 5e-06, "logits/chosen": -9262873.142857144, "logits/rejected": -17143435.2, "logps/chosen": -444.26988002232144, "logps/rejected": -467.150537109375, "loss": 0.0283, "rewards/chosen": 6.786708286830357, "rewards/margins": 15.7215822492327, "rewards/rejected": -8.934873962402344, "step": 2011 }, { "epoch": 0.5514595039057146, "grad_norm": 6.1875, "kl": 1.2001241445541382, "learning_rate": 5e-06, "logits/chosen": -4275160.533333333, "logits/rejected": -9279510.222222222, "logps/chosen": -471.874609375, "logps/rejected": -373.954833984375, "loss": 0.0595, "rewards/chosen": 5.795854187011718, "rewards/margins": 12.48376702202691, "rewards/rejected": -6.687912835015191, "step": 2012 }, { "epoch": 0.5517335891462245, "grad_norm": 9.9375, "kl": 5.4280548095703125, "learning_rate": 5e-06, "logits/chosen": -18144305.230769232, "logits/rejected": -11201975.272727273, "logps/chosen": -494.38912259615387, "logps/rejected": -517.2211026278409, "loss": 0.0742, "rewards/chosen": 8.248563913198618, "rewards/margins": 15.803231299340307, "rewards/rejected": -7.55466738614169, "step": 2013 }, { "epoch": 0.5520076743867343, "grad_norm": 6.15625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -29858752.0, "logits/rejected": -929911.5294117647, "logps/chosen": -345.7225864955357, "logps/rejected": -532.9274471507352, "loss": 0.0208, "rewards/chosen": 7.510665348597935, "rewards/margins": 18.788842946541408, "rewards/rejected": -11.278177597943474, "step": 2014 }, { "epoch": 0.552281759627244, "grad_norm": 1.046875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -18150323.692307692, "logits/rejected": -4917349.818181818, "logps/chosen": -404.5233623798077, "logps/rejected": -608.3477450284091, "loss": 0.0046, "rewards/chosen": 7.727083646334135, "rewards/margins": 22.257857849547914, "rewards/rejected": -14.53077420321378, "step": 2015 }, { "epoch": 0.5525558448677539, "grad_norm": 5.6875, "kl": 4.643553733825684, "learning_rate": 5e-06, "logits/chosen": -9903636.0, "logits/rejected": 1795622.6666666667, "logps/chosen": -358.3850504557292, "logps/rejected": -513.1094156901041, "loss": 0.0391, "rewards/chosen": 6.256624857584636, "rewards/margins": 18.909868876139324, "rewards/rejected": -12.653244018554688, "step": 2016 }, { "epoch": 0.5528299301082636, "grad_norm": 5.625, "kl": 3.7463455200195312, "learning_rate": 5e-06, "logits/chosen": -17307492.57142857, "logits/rejected": -16076336.0, "logps/chosen": -525.8926478794643, "logps/rejected": -378.65205078125, "loss": 0.022, "rewards/chosen": 7.165495736258371, "rewards/margins": 17.155009896414622, "rewards/rejected": -9.98951416015625, "step": 2017 }, { "epoch": 0.5531040153487735, "grad_norm": 5.40625, "kl": 10.359808921813965, "learning_rate": 5e-06, "logits/chosen": -24406514.82352941, "logits/rejected": -8153515.428571428, "logps/chosen": -492.51953125, "logps/rejected": -519.904296875, "loss": 0.0151, "rewards/chosen": 7.565406350528493, "rewards/margins": 16.806045083438647, "rewards/rejected": -9.240638732910156, "step": 2018 }, { "epoch": 0.5533781005892833, "grad_norm": 7.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -19840677.333333332, "logits/rejected": -22812433.066666666, "logps/chosen": -363.25718858506946, "logps/rejected": -567.8939453125, "loss": 0.0436, "rewards/chosen": 6.060742696126302, "rewards/margins": 18.126434834798175, "rewards/rejected": -12.065692138671874, "step": 2019 }, { "epoch": 0.553652185829793, "grad_norm": 8.5, "kl": 1.388490080833435, "learning_rate": 5e-06, "logits/chosen": 31485188.923076924, "logits/rejected": -19169920.0, "logps/chosen": -415.6407001201923, "logps/rejected": -445.01318359375, "loss": 0.0273, "rewards/chosen": 6.720120943509615, "rewards/margins": 15.20251230093149, "rewards/rejected": -8.482391357421875, "step": 2020 }, { "epoch": 0.5539262710703029, "grad_norm": 9.0625, "kl": 7.324720859527588, "learning_rate": 5e-06, "logits/chosen": -26734882.46153846, "logits/rejected": -10638718.545454545, "logps/chosen": -443.71029897836536, "logps/rejected": -546.6664595170455, "loss": 0.0713, "rewards/chosen": 6.382366473858173, "rewards/margins": 18.598412893868826, "rewards/rejected": -12.216046420010654, "step": 2021 }, { "epoch": 0.5542003563108127, "grad_norm": 6.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -39982300.44444445, "logits/rejected": -11128388.266666668, "logps/chosen": -506.1388346354167, "logps/rejected": -546.3864583333333, "loss": 0.0129, "rewards/chosen": 8.745025634765625, "rewards/margins": 18.563565063476563, "rewards/rejected": -9.818539428710938, "step": 2022 }, { "epoch": 0.5544744415513224, "grad_norm": 7.8125, "kl": 2.730870008468628, "learning_rate": 5e-06, "logits/chosen": -18940093.53846154, "logits/rejected": 8162820.363636363, "logps/chosen": -369.37439903846155, "logps/rejected": -807.0591264204545, "loss": 0.0254, "rewards/chosen": 7.690899188701923, "rewards/margins": 21.1404536320613, "rewards/rejected": -13.449554443359375, "step": 2023 }, { "epoch": 0.5547485267918323, "grad_norm": 4.46875, "kl": 3.093989849090576, "learning_rate": 5e-06, "logits/chosen": 12762149.333333334, "logits/rejected": -33797077.333333336, "logps/chosen": -398.7218831380208, "logps/rejected": -507.0123697916667, "loss": 0.0152, "rewards/chosen": 6.406976064046224, "rewards/margins": 15.745165506998699, "rewards/rejected": -9.338189442952475, "step": 2024 }, { "epoch": 0.555022612032342, "grad_norm": 17.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 2560317.8181818184, "logits/rejected": -18956657.230769232, "logps/chosen": -377.05868252840907, "logps/rejected": -517.3298527644231, "loss": 0.082, "rewards/chosen": 4.940584009343928, "rewards/margins": 15.305813689331908, "rewards/rejected": -10.36522967998798, "step": 2025 }, { "epoch": 0.5552966972728518, "grad_norm": 5.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -9723437.6, "logits/rejected": -33313435.42857143, "logps/chosen": -350.398291015625, "logps/rejected": -626.0630580357143, "loss": 0.0358, "rewards/chosen": 5.316434097290039, "rewards/margins": 21.1143248966762, "rewards/rejected": -15.797890799386161, "step": 2026 }, { "epoch": 0.5555707825133617, "grad_norm": 10.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -6109403.5, "logits/rejected": -20219062.0, "logps/chosen": -344.7581787109375, "logps/rejected": -506.08123779296875, "loss": 0.0213, "rewards/chosen": 7.274290084838867, "rewards/margins": 16.290401458740234, "rewards/rejected": -9.016111373901367, "step": 2027 }, { "epoch": 0.5558448677538714, "grad_norm": 9.5625, "kl": 12.859132766723633, "learning_rate": 5e-06, "logits/chosen": -18913673.846153848, "logits/rejected": -5219924.363636363, "logps/chosen": -413.19302133413464, "logps/rejected": -483.62717507102275, "loss": 0.0525, "rewards/chosen": 6.9937896728515625, "rewards/margins": 17.36113947088068, "rewards/rejected": -10.36734979802912, "step": 2028 }, { "epoch": 0.5561189529943813, "grad_norm": 2.96875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -33190821.818181816, "logits/rejected": -16883672.615384616, "logps/chosen": -414.34761186079544, "logps/rejected": -689.8000300480769, "loss": 0.0071, "rewards/chosen": 6.761186773126775, "rewards/margins": 19.368177880774013, "rewards/rejected": -12.606991107647236, "step": 2029 }, { "epoch": 0.5563930382348911, "grad_norm": 4.03125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -21189642.666666668, "logits/rejected": -21940536.0, "logps/chosen": -425.83642578125, "logps/rejected": -564.0718180338541, "loss": 0.015, "rewards/chosen": 5.437908172607422, "rewards/margins": 16.38923772176107, "rewards/rejected": -10.951329549153646, "step": 2030 }, { "epoch": 0.5566671234754008, "grad_norm": 2.59375, "kl": 3.8025107383728027, "learning_rate": 5e-06, "logits/chosen": -4889770.0, "logits/rejected": -15182988.0, "logps/chosen": -499.7201741536458, "logps/rejected": -558.3907877604166, "loss": 0.0074, "rewards/chosen": 7.622974395751953, "rewards/margins": 18.72271728515625, "rewards/rejected": -11.099742889404297, "step": 2031 }, { "epoch": 0.5569412087159107, "grad_norm": 3.421875, "kl": 1.4474284648895264, "learning_rate": 5e-06, "logits/chosen": -18885019.636363637, "logits/rejected": -18024500.923076924, "logps/chosen": -382.92911044034093, "logps/rejected": -554.4886568509615, "loss": 0.0225, "rewards/chosen": 7.239145452325994, "rewards/margins": 17.393023297503277, "rewards/rejected": -10.153877845177284, "step": 2032 }, { "epoch": 0.5572152939564204, "grad_norm": 9.8125, "kl": 5.306757926940918, "learning_rate": 5e-06, "logits/chosen": 23703492.923076924, "logits/rejected": -10440535.272727273, "logps/chosen": -468.0413161057692, "logps/rejected": -440.60258345170456, "loss": 0.0296, "rewards/chosen": 7.113986088679387, "rewards/margins": 17.28528680334558, "rewards/rejected": -10.171300714666193, "step": 2033 }, { "epoch": 0.5574893791969302, "grad_norm": 4.96875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 19875656.888888888, "logits/rejected": -25590792.533333335, "logps/chosen": -424.5439724392361, "logps/rejected": -533.959375, "loss": 0.0198, "rewards/chosen": 6.3433727688259545, "rewards/margins": 16.014266120062935, "rewards/rejected": -9.670893351236979, "step": 2034 }, { "epoch": 0.5577634644374401, "grad_norm": 7.125, "kl": 7.219174385070801, "learning_rate": 5e-06, "logits/chosen": -11531810.461538462, "logits/rejected": -7196197.090909091, "logps/chosen": -371.02869591346155, "logps/rejected": -471.3123668323864, "loss": 0.0275, "rewards/chosen": 6.1788189227764425, "rewards/margins": 15.091445015860604, "rewards/rejected": -8.912626093084162, "step": 2035 }, { "epoch": 0.5580375496779498, "grad_norm": 7.6875, "kl": 5.4024658203125, "learning_rate": 5e-06, "logits/chosen": -20943251.2, "logits/rejected": -13999336.888888888, "logps/chosen": -378.15514322916664, "logps/rejected": -547.5056423611111, "loss": 0.0279, "rewards/chosen": 6.889598592122396, "rewards/margins": 18.63487854003906, "rewards/rejected": -11.745279947916666, "step": 2036 }, { "epoch": 0.5583116349184596, "grad_norm": 2.234375, "kl": 2.8902359008789062, "learning_rate": 5e-06, "logits/chosen": 19320336.0, "logits/rejected": -30016698.666666668, "logps/chosen": -550.89794921875, "logps/rejected": -527.8994954427084, "loss": 0.0066, "rewards/chosen": 7.920032501220703, "rewards/margins": 18.33830897013346, "rewards/rejected": -10.41827646891276, "step": 2037 }, { "epoch": 0.5585857201589695, "grad_norm": 11.0625, "kl": 10.563787460327148, "learning_rate": 5e-06, "logits/chosen": -23710602.666666668, "logits/rejected": -17736092.444444444, "logps/chosen": -506.75166015625, "logps/rejected": -555.8828667534722, "loss": 0.0566, "rewards/chosen": 7.844805908203125, "rewards/margins": 20.78945041232639, "rewards/rejected": -12.944644504123264, "step": 2038 }, { "epoch": 0.5588598053994792, "grad_norm": 2.96875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -9721255.272727273, "logits/rejected": 12484025.846153846, "logps/chosen": -341.35091885653407, "logps/rejected": -571.6024639423077, "loss": 0.0076, "rewards/chosen": 6.881196455522017, "rewards/margins": 19.979002945906633, "rewards/rejected": -13.097806490384615, "step": 2039 }, { "epoch": 0.5591338906399891, "grad_norm": 3.546875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -5810537.714285715, "logits/rejected": -27473395.2, "logps/chosen": -483.91231863839283, "logps/rejected": -432.962353515625, "loss": 0.0181, "rewards/chosen": 6.506486075265067, "rewards/margins": 16.45742220197405, "rewards/rejected": -9.950936126708985, "step": 2040 }, { "epoch": 0.5594079758804988, "grad_norm": 7.65625, "kl": 2.7337374687194824, "learning_rate": 5e-06, "logits/chosen": 3998499.6363636362, "logits/rejected": -1312556.923076923, "logps/chosen": -471.07266512784093, "logps/rejected": -640.7967247596154, "loss": 0.0238, "rewards/chosen": 6.662114923650568, "rewards/margins": 14.56688599486451, "rewards/rejected": -7.9047710712139425, "step": 2041 }, { "epoch": 0.5596820611210086, "grad_norm": 4.84375, "kl": 0.49485844373703003, "learning_rate": 5e-06, "logits/chosen": -35175982.54545455, "logits/rejected": -14944483.692307692, "logps/chosen": -481.3671875, "logps/rejected": -619.9342698317307, "loss": 0.0215, "rewards/chosen": 8.081215598366477, "rewards/margins": 19.871400206239073, "rewards/rejected": -11.790184607872597, "step": 2042 }, { "epoch": 0.5599561463615185, "grad_norm": 6.0625, "kl": 2.2590346336364746, "learning_rate": 5e-06, "logits/chosen": -9188406.4, "logits/rejected": -21621028.57142857, "logps/chosen": -465.292919921875, "logps/rejected": -490.5645228794643, "loss": 0.0187, "rewards/chosen": 8.309645843505859, "rewards/margins": 17.660303715297154, "rewards/rejected": -9.350657871791295, "step": 2043 }, { "epoch": 0.5602302316020282, "grad_norm": 8.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -18853852.0, "logits/rejected": -33794260.0, "logps/chosen": -331.5677490234375, "logps/rejected": -444.5323791503906, "loss": 0.0401, "rewards/chosen": 6.604351997375488, "rewards/margins": 14.793074607849121, "rewards/rejected": -8.188722610473633, "step": 2044 }, { "epoch": 0.560504316842538, "grad_norm": 13.875, "kl": 5.342080593109131, "learning_rate": 5e-06, "logits/chosen": -21493942.666666668, "logits/rejected": -9117653.333333334, "logps/chosen": -453.4437662760417, "logps/rejected": -510.4639078776042, "loss": 0.0309, "rewards/chosen": 7.207461675008138, "rewards/margins": 16.65392239888509, "rewards/rejected": -9.446460723876953, "step": 2045 }, { "epoch": 0.5607784020830479, "grad_norm": 6.4375, "kl": 2.7111170291900635, "learning_rate": 5e-06, "logits/chosen": -12296162.285714285, "logits/rejected": 9307250.4, "logps/chosen": -415.09287806919644, "logps/rejected": -547.463623046875, "loss": 0.0329, "rewards/chosen": 6.7943235124860495, "rewards/margins": 17.042554255894252, "rewards/rejected": -10.248230743408204, "step": 2046 }, { "epoch": 0.5610524873235576, "grad_norm": 7.84375, "kl": 3.5440711975097656, "learning_rate": 5e-06, "logits/chosen": -12938108.307692308, "logits/rejected": -20840106.181818184, "logps/chosen": -396.99500450721155, "logps/rejected": -411.3558238636364, "loss": 0.0569, "rewards/chosen": 8.58245849609375, "rewards/margins": 15.978175076571379, "rewards/rejected": -7.395716580477628, "step": 2047 }, { "epoch": 0.5613265725640674, "grad_norm": 4.90625, "kl": 0.6454347372055054, "learning_rate": 5e-06, "logits/chosen": -10289237.6, "logits/rejected": -43038907.428571425, "logps/chosen": -418.886083984375, "logps/rejected": -506.46484375, "loss": 0.0112, "rewards/chosen": 6.091970062255859, "rewards/margins": 16.380999864850725, "rewards/rejected": -10.289029802594866, "step": 2048 }, { "epoch": 0.5616006578045772, "grad_norm": 8.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -18708484.8, "logits/rejected": -19295536.0, "logps/chosen": -285.0521240234375, "logps/rejected": -451.0864955357143, "loss": 0.0633, "rewards/chosen": 4.541218566894531, "rewards/margins": 14.29973624093192, "rewards/rejected": -9.758517674037389, "step": 2049 }, { "epoch": 0.561874743045087, "grad_norm": 4.8125, "kl": 7.612316131591797, "learning_rate": 5e-06, "logits/chosen": -22582970.181818184, "logits/rejected": -20109723.076923076, "logps/chosen": -361.4546564275568, "logps/rejected": -602.7472956730769, "loss": 0.0164, "rewards/chosen": 6.975404912775213, "rewards/margins": 17.735629475200092, "rewards/rejected": -10.76022456242488, "step": 2050 }, { "epoch": 0.5621488282855969, "grad_norm": 2.796875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -12632060.307692308, "logits/rejected": -667941.7272727273, "logps/chosen": -388.11583533653845, "logps/rejected": -714.5502041903409, "loss": 0.0235, "rewards/chosen": 6.783875685471755, "rewards/margins": 18.89615124255627, "rewards/rejected": -12.112275557084518, "step": 2051 }, { "epoch": 0.5624229135261066, "grad_norm": 8.8125, "kl": 4.119449615478516, "learning_rate": 5e-06, "logits/chosen": -4432965.818181818, "logits/rejected": 3089712.6153846155, "logps/chosen": -447.39399857954544, "logps/rejected": -507.2043269230769, "loss": 0.0243, "rewards/chosen": 6.984277898615057, "rewards/margins": 16.612635419085308, "rewards/rejected": -9.628357520470253, "step": 2052 }, { "epoch": 0.5626969987666164, "grad_norm": 4.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -9941067.636363637, "logits/rejected": -12247873.23076923, "logps/chosen": -310.43430397727275, "logps/rejected": -617.3185847355769, "loss": 0.0368, "rewards/chosen": 5.308880199085582, "rewards/margins": 16.242042007979812, "rewards/rejected": -10.93316180889423, "step": 2053 }, { "epoch": 0.5629710840071263, "grad_norm": 5.84375, "kl": 4.203070640563965, "learning_rate": 5e-06, "logits/chosen": -19880276.363636363, "logits/rejected": -4916661.846153846, "logps/chosen": -417.80522017045456, "logps/rejected": -391.92397836538464, "loss": 0.0281, "rewards/chosen": 7.09552626176314, "rewards/margins": 16.464615241630927, "rewards/rejected": -9.369088979867788, "step": 2054 }, { "epoch": 0.563245169247636, "grad_norm": 4.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -19920024.727272727, "logits/rejected": -20132883.692307692, "logps/chosen": -466.7145330255682, "logps/rejected": -518.2913161057693, "loss": 0.0155, "rewards/chosen": 6.9775779030539775, "rewards/margins": 19.1888135363172, "rewards/rejected": -12.211235633263222, "step": 2055 }, { "epoch": 0.5635192544881458, "grad_norm": 14.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -9207120.0, "logits/rejected": -22704650.666666668, "logps/chosen": -462.3973795572917, "logps/rejected": -587.6271158854166, "loss": 0.0559, "rewards/chosen": 5.008958498636882, "rewards/margins": 14.491937319437664, "rewards/rejected": -9.482978820800781, "step": 2056 }, { "epoch": 0.5637933397286556, "grad_norm": 2.3125, "kl": 2.9892630577087402, "learning_rate": 5e-06, "logits/chosen": -14475642.285714285, "logits/rejected": -9022905.6, "logps/chosen": -412.09877232142856, "logps/rejected": -472.3427734375, "loss": 0.0339, "rewards/chosen": 6.996598379952567, "rewards/margins": 16.97983202253069, "rewards/rejected": -9.983233642578124, "step": 2057 }, { "epoch": 0.5640674249691654, "grad_norm": 12.3125, "kl": 4.231632232666016, "learning_rate": 5e-06, "logits/chosen": -10706216.615384616, "logits/rejected": -19072450.90909091, "logps/chosen": -445.7146559495192, "logps/rejected": -469.58487215909093, "loss": 0.0562, "rewards/chosen": 5.621434725247896, "rewards/margins": 15.29516436170031, "rewards/rejected": -9.673729636452414, "step": 2058 }, { "epoch": 0.5643415102096752, "grad_norm": 1.4375, "kl": 4.89117431640625, "learning_rate": 5e-06, "logits/chosen": -19071538.0, "logits/rejected": -23976462.0, "logps/chosen": -457.23980712890625, "logps/rejected": -851.4812622070312, "loss": 0.0033, "rewards/chosen": 8.557952880859375, "rewards/margins": 23.18563175201416, "rewards/rejected": -14.627678871154785, "step": 2059 }, { "epoch": 0.564615595450185, "grad_norm": 12.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -31121969.777777776, "logits/rejected": -24147517.866666667, "logps/chosen": -395.0246310763889, "logps/rejected": -558.0017578125, "loss": 0.0618, "rewards/chosen": 6.109155442979601, "rewards/margins": 18.30270199245877, "rewards/rejected": -12.193546549479167, "step": 2060 }, { "epoch": 0.5648896806906948, "grad_norm": 1.7890625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -34704872.0, "logits/rejected": -30612748.0, "logps/chosen": -436.7883605957031, "logps/rejected": -493.3774108886719, "loss": 0.0047, "rewards/chosen": 7.130896091461182, "rewards/margins": 17.539700031280518, "rewards/rejected": -10.408803939819336, "step": 2061 }, { "epoch": 0.5651637659312047, "grad_norm": 7.46875, "kl": 0.19741439819335938, "learning_rate": 5e-06, "logits/chosen": -3984182.0, "logits/rejected": -26669512.0, "logps/chosen": -425.657470703125, "logps/rejected": -492.5303141276042, "loss": 0.0271, "rewards/chosen": 6.086368560791016, "rewards/margins": 15.385078430175781, "rewards/rejected": -9.298709869384766, "step": 2062 }, { "epoch": 0.5654378511717144, "grad_norm": 6.71875, "kl": 1.0464839935302734, "learning_rate": 5e-06, "logits/chosen": -34603608.0, "logits/rejected": -4211123.666666667, "logps/chosen": -509.798583984375, "logps/rejected": -464.4973551432292, "loss": 0.0664, "rewards/chosen": 8.021881103515625, "rewards/margins": 15.55669657389323, "rewards/rejected": -7.5348154703776045, "step": 2063 }, { "epoch": 0.5657119364122242, "grad_norm": 7.15625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -8792413.333333334, "logits/rejected": -35298485.333333336, "logps/chosen": -361.32752821180554, "logps/rejected": -443.17356770833334, "loss": 0.0392, "rewards/chosen": 6.634291330973308, "rewards/margins": 15.399355824788412, "rewards/rejected": -8.765064493815105, "step": 2064 }, { "epoch": 0.565986021652734, "grad_norm": 3.40625, "kl": 1.7908185720443726, "learning_rate": 5e-06, "logits/chosen": 11494455.0, "logits/rejected": -32297800.0, "logps/chosen": -431.0013122558594, "logps/rejected": -739.7415161132812, "loss": 0.0124, "rewards/chosen": 7.094418525695801, "rewards/margins": 22.49496841430664, "rewards/rejected": -15.40054988861084, "step": 2065 }, { "epoch": 0.5662601068932438, "grad_norm": 8.0625, "kl": 4.083518028259277, "learning_rate": 5e-06, "logits/chosen": -25801853.09090909, "logits/rejected": -6457996.307692308, "logps/chosen": -371.51493696732956, "logps/rejected": -474.00473257211536, "loss": 0.0415, "rewards/chosen": 6.3538970947265625, "rewards/margins": 16.457999596228966, "rewards/rejected": -10.104102501502403, "step": 2066 }, { "epoch": 0.5665341921337536, "grad_norm": 8.625, "kl": 2.9704437255859375, "learning_rate": 5e-06, "logits/chosen": -19606553.6, "logits/rejected": -21401757.714285713, "logps/chosen": -405.70302734375, "logps/rejected": -549.5874720982143, "loss": 0.0247, "rewards/chosen": 7.746845245361328, "rewards/margins": 17.044160570417134, "rewards/rejected": -9.297315325055804, "step": 2067 }, { "epoch": 0.5668082773742634, "grad_norm": 6.6875, "kl": 0.6808280944824219, "learning_rate": 5e-06, "logits/chosen": -33052680.0, "logits/rejected": -13268816.0, "logps/chosen": -448.9460856119792, "logps/rejected": -516.218994140625, "loss": 0.0307, "rewards/chosen": 8.415655771891275, "rewards/margins": 20.167142232259113, "rewards/rejected": -11.751486460367838, "step": 2068 }, { "epoch": 0.5670823626147732, "grad_norm": 9.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -21448922.666666668, "logits/rejected": -17023644.444444444, "logps/chosen": -383.3302815755208, "logps/rejected": -648.2427300347222, "loss": 0.0305, "rewards/chosen": 6.589866638183594, "rewards/margins": 19.94148678249783, "rewards/rejected": -13.351620144314236, "step": 2069 }, { "epoch": 0.5673564478552829, "grad_norm": 8.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -30491136.0, "logits/rejected": 11961441.142857144, "logps/chosen": -423.56982421875, "logps/rejected": -480.81337193080356, "loss": 0.0345, "rewards/chosen": 5.8189697265625, "rewards/margins": 14.523801531110491, "rewards/rejected": -8.704831804547991, "step": 2070 }, { "epoch": 0.5676305330957928, "grad_norm": 5.90625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -18974468.923076924, "logits/rejected": -33562176.0, "logps/chosen": -348.23106971153845, "logps/rejected": -618.1946910511364, "loss": 0.0178, "rewards/chosen": 6.789068368765024, "rewards/margins": 22.260890934017155, "rewards/rejected": -15.47182256525213, "step": 2071 }, { "epoch": 0.5679046183363026, "grad_norm": 4.03125, "kl": 2.0583763122558594, "learning_rate": 5e-06, "logits/chosen": 13960800.0, "logits/rejected": -19700612.923076924, "logps/chosen": -386.5319158380682, "logps/rejected": -662.6481370192307, "loss": 0.0144, "rewards/chosen": 7.222702719948509, "rewards/margins": 17.918760553106562, "rewards/rejected": -10.696057833158052, "step": 2072 }, { "epoch": 0.5681787035768124, "grad_norm": 1.296875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -19122838.666666668, "logits/rejected": -33310216.0, "logps/chosen": -517.5096842447916, "logps/rejected": -631.6138509114584, "loss": 0.0046, "rewards/chosen": 6.836645762125651, "rewards/margins": 19.314071655273438, "rewards/rejected": -12.477425893147787, "step": 2073 }, { "epoch": 0.5684527888173222, "grad_norm": 1.7734375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -23811507.2, "logits/rejected": -18274912.0, "logps/chosen": -460.75107421875, "logps/rejected": -591.0448521205357, "loss": 0.0044, "rewards/chosen": 7.216944885253906, "rewards/margins": 20.965812465122767, "rewards/rejected": -13.748867579868861, "step": 2074 }, { "epoch": 0.568726874057832, "grad_norm": 11.625, "kl": 4.345172882080078, "learning_rate": 5e-06, "logits/chosen": -36191268.92307692, "logits/rejected": -5027569.454545454, "logps/chosen": -431.3273737980769, "logps/rejected": -473.71360085227275, "loss": 0.0594, "rewards/chosen": 7.313653212327224, "rewards/margins": 17.927016998504424, "rewards/rejected": -10.613363786177201, "step": 2075 }, { "epoch": 0.5690009592983418, "grad_norm": 9.6875, "kl": 9.799956321716309, "learning_rate": 5e-06, "logits/chosen": -17844091.733333334, "logits/rejected": -13699420.444444444, "logps/chosen": -419.55286458333336, "logps/rejected": -607.8879665798611, "loss": 0.0767, "rewards/chosen": 5.878955078125, "rewards/margins": 14.941011725531684, "rewards/rejected": -9.062056647406685, "step": 2076 }, { "epoch": 0.5692750445388516, "grad_norm": 1.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -39617417.14285714, "logits/rejected": -21855879.529411763, "logps/chosen": -562.5190778459821, "logps/rejected": -596.2018612132352, "loss": 0.004, "rewards/chosen": 6.751523154122489, "rewards/margins": 19.029613687210723, "rewards/rejected": -12.278090533088236, "step": 2077 }, { "epoch": 0.5695491297793613, "grad_norm": 8.5, "kl": 1.5358861684799194, "learning_rate": 5e-06, "logits/chosen": -41172168.53333333, "logits/rejected": -21286344.888888888, "logps/chosen": -329.3490234375, "logps/rejected": -472.326171875, "loss": 0.0754, "rewards/chosen": 6.476792907714843, "rewards/margins": 16.300337727864584, "rewards/rejected": -9.82354482014974, "step": 2078 }, { "epoch": 0.5698232150198712, "grad_norm": 3.96875, "kl": 3.060559034347534, "learning_rate": 5e-06, "logits/chosen": -40130934.15384615, "logits/rejected": -6968980.363636363, "logps/chosen": -373.3424729567308, "logps/rejected": -508.31729403409093, "loss": 0.0134, "rewards/chosen": 6.35586665226863, "rewards/margins": 15.839093081601018, "rewards/rejected": -9.483226429332387, "step": 2079 }, { "epoch": 0.570097300260381, "grad_norm": 3.859375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -8285601.777777778, "logits/rejected": -25940654.933333334, "logps/chosen": -478.4104275173611, "logps/rejected": -489.6453125, "loss": 0.0052, "rewards/chosen": 7.346316867404514, "rewards/margins": 18.148645358615454, "rewards/rejected": -10.802328491210938, "step": 2080 }, { "epoch": 0.5703713855008907, "grad_norm": 6.46875, "kl": 6.029782772064209, "learning_rate": 5e-06, "logits/chosen": -20214668.8, "logits/rejected": -27193237.333333332, "logps/chosen": -373.4889322916667, "logps/rejected": -357.8538411458333, "loss": 0.0383, "rewards/chosen": 6.0312449137369795, "rewards/margins": 14.60493384467231, "rewards/rejected": -8.57368893093533, "step": 2081 }, { "epoch": 0.5706454707414006, "grad_norm": 5.625, "kl": 3.610734462738037, "learning_rate": 5e-06, "logits/chosen": -30202792.0, "logits/rejected": -22343610.0, "logps/chosen": -422.1995849609375, "logps/rejected": -855.8651123046875, "loss": 0.0218, "rewards/chosen": 6.630320072174072, "rewards/margins": 23.159056186676025, "rewards/rejected": -16.528736114501953, "step": 2082 }, { "epoch": 0.5709195559819104, "grad_norm": 9.625, "kl": 1.3122981786727905, "learning_rate": 5e-06, "logits/chosen": -22760862.0, "logits/rejected": -7302757.5, "logps/chosen": -475.8143615722656, "logps/rejected": -486.5482177734375, "loss": 0.0352, "rewards/chosen": 7.386274337768555, "rewards/margins": 19.83949565887451, "rewards/rejected": -12.453221321105957, "step": 2083 }, { "epoch": 0.5711936412224202, "grad_norm": 7.09375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -9503720.0, "logits/rejected": -20302489.6, "logps/chosen": -400.71620396205356, "logps/rejected": -496.470361328125, "loss": 0.0198, "rewards/chosen": 7.212094988141741, "rewards/margins": 16.57865687779018, "rewards/rejected": -9.366561889648438, "step": 2084 }, { "epoch": 0.57146772646293, "grad_norm": 4.71875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 282051.2727272727, "logits/rejected": -12096541.538461538, "logps/chosen": -402.0343572443182, "logps/rejected": -632.6154597355769, "loss": 0.024, "rewards/chosen": 5.618443228981712, "rewards/margins": 17.300356164678828, "rewards/rejected": -11.681912935697115, "step": 2085 }, { "epoch": 0.5717418117034397, "grad_norm": 5.28125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -34925299.692307696, "logits/rejected": -24780741.818181816, "logps/chosen": -377.8694035456731, "logps/rejected": -489.4947620738636, "loss": 0.0164, "rewards/chosen": 5.288227961613582, "rewards/margins": 16.254274381624235, "rewards/rejected": -10.966046420010654, "step": 2086 }, { "epoch": 0.5720158969439496, "grad_norm": 10.25, "kl": 6.493488311767578, "learning_rate": 5e-06, "logits/chosen": -13684820.363636363, "logits/rejected": -18635475.692307692, "logps/chosen": -363.1796875, "logps/rejected": -615.6890775240385, "loss": 0.0576, "rewards/chosen": 6.13356503573331, "rewards/margins": 17.485255474810835, "rewards/rejected": -11.351690439077524, "step": 2087 }, { "epoch": 0.5722899821844594, "grad_norm": 3.09375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -34483491.55555555, "logits/rejected": -14224716.8, "logps/chosen": -470.396484375, "logps/rejected": -439.8736979166667, "loss": 0.0087, "rewards/chosen": 6.637686411539714, "rewards/margins": 16.39895960489909, "rewards/rejected": -9.761273193359376, "step": 2088 }, { "epoch": 0.5725640674249691, "grad_norm": 2.8125, "kl": 3.3315486907958984, "learning_rate": 5e-06, "logits/chosen": -28862205.333333332, "logits/rejected": -21582384.0, "logps/chosen": -408.689697265625, "logps/rejected": -504.9444986979167, "loss": 0.0362, "rewards/chosen": 7.031878153483073, "rewards/margins": 16.449883778889973, "rewards/rejected": -9.4180056254069, "step": 2089 }, { "epoch": 0.572838152665479, "grad_norm": 9.75, "kl": 1.7754848003387451, "learning_rate": 5e-06, "logits/chosen": -10075564.0, "logits/rejected": -10287381.714285715, "logps/chosen": -329.882080078125, "logps/rejected": -383.7634974888393, "loss": 0.0283, "rewards/chosen": 6.477745056152344, "rewards/margins": 14.043464660644531, "rewards/rejected": -7.5657196044921875, "step": 2090 }, { "epoch": 0.5731122379059888, "grad_norm": 5.78125, "kl": 4.885934829711914, "learning_rate": 5e-06, "logits/chosen": -25911572.0, "logits/rejected": -42740664.0, "logps/chosen": -470.95831298828125, "logps/rejected": -531.4826049804688, "loss": 0.0521, "rewards/chosen": 6.692975044250488, "rewards/margins": 19.405585289001465, "rewards/rejected": -12.712610244750977, "step": 2091 }, { "epoch": 0.5733863231464985, "grad_norm": 7.9375, "kl": 11.036136627197266, "learning_rate": 5e-06, "logits/chosen": -25945320.0, "logits/rejected": 1978273.125, "logps/chosen": -392.59796142578125, "logps/rejected": -461.633056640625, "loss": 0.0564, "rewards/chosen": 6.913478374481201, "rewards/margins": 13.973340034484863, "rewards/rejected": -7.059861660003662, "step": 2092 }, { "epoch": 0.5736604083870084, "grad_norm": 4.59375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -32323488.0, "logits/rejected": -23017210.666666668, "logps/chosen": -482.9468994140625, "logps/rejected": -504.5812174479167, "loss": 0.025, "rewards/chosen": 7.964707692464192, "rewards/margins": 18.303264617919922, "rewards/rejected": -10.338556925455729, "step": 2093 }, { "epoch": 0.5739344936275181, "grad_norm": 1.890625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -21903950.222222224, "logits/rejected": -16065969.066666666, "logps/chosen": -369.20421006944446, "logps/rejected": -490.06565755208334, "loss": 0.0065, "rewards/chosen": 7.2705841064453125, "rewards/margins": 17.45656941731771, "rewards/rejected": -10.185985310872395, "step": 2094 }, { "epoch": 0.574208578868028, "grad_norm": 6.5625, "kl": 3.5964412689208984, "learning_rate": 5e-06, "logits/chosen": -21120157.866666667, "logits/rejected": -9543590.222222222, "logps/chosen": -456.9173828125, "logps/rejected": -478.9441189236111, "loss": 0.0206, "rewards/chosen": 8.04248046875, "rewards/margins": 16.760399712456596, "rewards/rejected": -8.717919243706596, "step": 2095 }, { "epoch": 0.5744826641085378, "grad_norm": 8.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 20639243.2, "logits/rejected": -43350070.85714286, "logps/chosen": -322.284716796875, "logps/rejected": -423.91385323660717, "loss": 0.0215, "rewards/chosen": 6.927011871337891, "rewards/margins": 15.674321964808874, "rewards/rejected": -8.747310093470983, "step": 2096 }, { "epoch": 0.5747567493490475, "grad_norm": 15.0625, "kl": 9.382925987243652, "learning_rate": 5e-06, "logits/chosen": -29218978.0, "logits/rejected": -32289220.0, "logps/chosen": -440.97540283203125, "logps/rejected": -452.5029296875, "loss": 0.0838, "rewards/chosen": 5.846131324768066, "rewards/margins": 15.74048137664795, "rewards/rejected": -9.894350051879883, "step": 2097 }, { "epoch": 0.5750308345895574, "grad_norm": 1.9765625, "kl": 5.98236083984375, "learning_rate": 5e-06, "logits/chosen": 176190.93333333332, "logits/rejected": -41584170.666666664, "logps/chosen": -477.080078125, "logps/rejected": -473.56749131944446, "loss": 0.0454, "rewards/chosen": 7.5953725179036455, "rewards/margins": 18.389156765407986, "rewards/rejected": -10.793784247504341, "step": 2098 }, { "epoch": 0.5753049198300672, "grad_norm": 10.375, "kl": 8.216522216796875, "learning_rate": 5e-06, "logits/chosen": -45981928.0, "logits/rejected": -8197049.0, "logps/chosen": -468.54986572265625, "logps/rejected": -440.19366455078125, "loss": 0.0504, "rewards/chosen": 6.64915657043457, "rewards/margins": 17.176493644714355, "rewards/rejected": -10.527337074279785, "step": 2099 }, { "epoch": 0.5755790050705769, "grad_norm": 8.125, "kl": 3.3587217330932617, "learning_rate": 5e-06, "logits/chosen": -4487548.0, "logits/rejected": 2266728.8, "logps/chosen": -311.0000697544643, "logps/rejected": -500.89365234375, "loss": 0.0559, "rewards/chosen": 6.243882315499442, "rewards/margins": 14.572942679268973, "rewards/rejected": -8.329060363769532, "step": 2100 }, { "epoch": 0.5758530903110868, "grad_norm": 2.90625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -20973284.923076924, "logits/rejected": -31175217.454545453, "logps/chosen": -377.38198617788464, "logps/rejected": -736.5512251420455, "loss": 0.0173, "rewards/chosen": 5.325739933894231, "rewards/margins": 21.75927126157534, "rewards/rejected": -16.43353132768111, "step": 2101 }, { "epoch": 0.5761271755515965, "grad_norm": 5.875, "kl": 1.7646329402923584, "learning_rate": 5e-06, "logits/chosen": -31233878.153846152, "logits/rejected": -4909754.909090909, "logps/chosen": -432.52249849759613, "logps/rejected": -491.00852272727275, "loss": 0.0389, "rewards/chosen": 7.995442903958834, "rewards/margins": 18.519511562960965, "rewards/rejected": -10.52406865900213, "step": 2102 }, { "epoch": 0.5764012607921063, "grad_norm": 12.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -27826224.0, "logits/rejected": -25210718.0, "logps/chosen": -444.09649658203125, "logps/rejected": -473.62554931640625, "loss": 0.0355, "rewards/chosen": 6.4911909103393555, "rewards/margins": 15.71739387512207, "rewards/rejected": -9.226202964782715, "step": 2103 }, { "epoch": 0.5766753460326162, "grad_norm": 8.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -27304388.923076924, "logits/rejected": 103435077.81818181, "logps/chosen": -373.89453125, "logps/rejected": -604.2321555397727, "loss": 0.0325, "rewards/chosen": 6.190062889685998, "rewards/margins": 22.136676548244235, "rewards/rejected": -15.946613658558238, "step": 2104 }, { "epoch": 0.5769494312731259, "grad_norm": 13.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -42747159.27272727, "logits/rejected": -18942505.846153848, "logps/chosen": -512.6155450994319, "logps/rejected": -584.7796349158654, "loss": 0.0264, "rewards/chosen": 6.569181268865412, "rewards/margins": 18.834616120878636, "rewards/rejected": -12.265434852013222, "step": 2105 }, { "epoch": 0.5772235165136358, "grad_norm": 7.4375, "kl": 5.894499778747559, "learning_rate": 5e-06, "logits/chosen": -15655228.8, "logits/rejected": -22649750.85714286, "logps/chosen": -556.186279296875, "logps/rejected": -665.0118582589286, "loss": 0.0175, "rewards/chosen": 8.099593353271484, "rewards/margins": 20.019693974086216, "rewards/rejected": -11.920100620814733, "step": 2106 }, { "epoch": 0.5774976017541456, "grad_norm": 13.375, "kl": 1.5390205383300781, "learning_rate": 5e-06, "logits/chosen": -34530001.45454545, "logits/rejected": -24891788.307692308, "logps/chosen": -468.5486505681818, "logps/rejected": -395.0619365985577, "loss": 0.019, "rewards/chosen": 6.842016740278765, "rewards/margins": 15.35609532069493, "rewards/rejected": -8.514078580416166, "step": 2107 }, { "epoch": 0.5777716869946553, "grad_norm": 2.5625, "kl": 6.553595542907715, "learning_rate": 5e-06, "logits/chosen": -52999778.461538464, "logits/rejected": 17780629.818181816, "logps/chosen": -571.5456355168269, "logps/rejected": -501.38210227272725, "loss": 0.0083, "rewards/chosen": 8.351004967322716, "rewards/margins": 17.444307607370654, "rewards/rejected": -9.09330264004794, "step": 2108 }, { "epoch": 0.5780457722351652, "grad_norm": 8.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -20104435.692307692, "logits/rejected": -49949789.09090909, "logps/chosen": -457.77640474759613, "logps/rejected": -535.1843039772727, "loss": 0.0451, "rewards/chosen": 6.852756206805889, "rewards/margins": 20.835590069110577, "rewards/rejected": -13.982833862304688, "step": 2109 }, { "epoch": 0.5783198574756749, "grad_norm": 10.1875, "kl": 0.7286924123764038, "learning_rate": 5e-06, "logits/chosen": -24990557.09090909, "logits/rejected": -19241465.846153848, "logps/chosen": -366.5929066051136, "logps/rejected": -518.2817007211538, "loss": 0.0321, "rewards/chosen": 6.137416492808949, "rewards/margins": 15.97176473124044, "rewards/rejected": -9.83434823843149, "step": 2110 }, { "epoch": 0.5785939427161847, "grad_norm": 5.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 1937323.6, "logits/rejected": -38430454.85714286, "logps/chosen": -330.4925048828125, "logps/rejected": -556.7364327566964, "loss": 0.0525, "rewards/chosen": 6.4247795104980465, "rewards/margins": 19.305113547188895, "rewards/rejected": -12.880334036690849, "step": 2111 }, { "epoch": 0.5788680279566946, "grad_norm": 0.6796875, "kl": 0.20784315466880798, "learning_rate": 5e-06, "logits/chosen": -35404730.18181818, "logits/rejected": -754229.5384615385, "logps/chosen": -416.15571732954544, "logps/rejected": -551.7974008413462, "loss": 0.0014, "rewards/chosen": 8.31671142578125, "rewards/margins": 19.585764958308292, "rewards/rejected": -11.269053532527042, "step": 2112 }, { "epoch": 0.5791421131972043, "grad_norm": 7.0, "kl": 11.207412719726562, "learning_rate": 5e-06, "logits/chosen": -23323861.333333332, "logits/rejected": -32302269.333333332, "logps/chosen": -578.2957356770834, "logps/rejected": -455.6427815755208, "loss": 0.0209, "rewards/chosen": 8.872123718261719, "rewards/margins": 18.69966379801432, "rewards/rejected": -9.827540079752604, "step": 2113 }, { "epoch": 0.5794161984377141, "grad_norm": 9.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -15404108.8, "logits/rejected": -23770276.57142857, "logps/chosen": -360.096826171875, "logps/rejected": -498.4780970982143, "loss": 0.0282, "rewards/chosen": 6.392802810668945, "rewards/margins": 18.987138094220843, "rewards/rejected": -12.594335283551898, "step": 2114 }, { "epoch": 0.579690283678224, "grad_norm": 6.8125, "kl": 3.4968650341033936, "learning_rate": 5e-06, "logits/chosen": -20651965.866666667, "logits/rejected": -18057134.222222224, "logps/chosen": -452.4965494791667, "logps/rejected": -531.30859375, "loss": 0.0355, "rewards/chosen": 8.154464721679688, "rewards/margins": 18.46558295355903, "rewards/rejected": -10.311118231879341, "step": 2115 }, { "epoch": 0.5799643689187337, "grad_norm": 9.5625, "kl": 3.404376983642578, "learning_rate": 5e-06, "logits/chosen": -35656276.36363637, "logits/rejected": -50039950.76923077, "logps/chosen": -425.41841264204544, "logps/rejected": -462.19095552884613, "loss": 0.1039, "rewards/chosen": 6.326819679953835, "rewards/margins": 18.366365766191816, "rewards/rejected": -12.03954608623798, "step": 2116 }, { "epoch": 0.5802384541592436, "grad_norm": 5.78125, "kl": 10.574642181396484, "learning_rate": 5e-06, "logits/chosen": -13398089.6, "logits/rejected": -20997525.333333332, "logps/chosen": -354.26051432291666, "logps/rejected": -651.7620442708334, "loss": 0.0189, "rewards/chosen": 7.993954976399739, "rewards/margins": 21.975521511501736, "rewards/rejected": -13.981566535101997, "step": 2117 }, { "epoch": 0.5805125393997533, "grad_norm": 13.9375, "kl": 7.084535121917725, "learning_rate": 5e-06, "logits/chosen": -20724531.2, "logits/rejected": -26287982.222222224, "logps/chosen": -432.46630859375, "logps/rejected": -389.88783094618054, "loss": 0.0494, "rewards/chosen": 7.517392476399739, "rewards/margins": 16.051957024468315, "rewards/rejected": -8.534564548068577, "step": 2118 }, { "epoch": 0.5807866246402631, "grad_norm": 2.546875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -61059481.6, "logits/rejected": -19761173.89473684, "logps/chosen": -608.561279296875, "logps/rejected": -608.2375616776316, "loss": 0.0062, "rewards/chosen": 8.45541763305664, "rewards/margins": 19.675864571019225, "rewards/rejected": -11.220446937962583, "step": 2119 }, { "epoch": 0.581060709880773, "grad_norm": 1.3515625, "kl": 0.9387969970703125, "learning_rate": 5e-06, "logits/chosen": -23910544.0, "logits/rejected": -21846635.42857143, "logps/chosen": -526.762646484375, "logps/rejected": -644.3900669642857, "loss": 0.0054, "rewards/chosen": 9.650862121582032, "rewards/margins": 22.38133087158203, "rewards/rejected": -12.73046875, "step": 2120 }, { "epoch": 0.5813347951212827, "grad_norm": 7.15625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -40712740.571428575, "logits/rejected": 15073232.0, "logps/chosen": -320.27633231026783, "logps/rejected": -505.7634765625, "loss": 0.0387, "rewards/chosen": 6.698146275111607, "rewards/margins": 14.364833286830358, "rewards/rejected": -7.66668701171875, "step": 2121 }, { "epoch": 0.5816088803617925, "grad_norm": 8.875, "kl": 0.3316332697868347, "learning_rate": 5e-06, "logits/chosen": 1974687.3333333333, "logits/rejected": -13635832.0, "logps/chosen": -317.0728759765625, "logps/rejected": -713.017822265625, "loss": 0.0475, "rewards/chosen": 5.425852457682292, "rewards/margins": 16.941635131835938, "rewards/rejected": -11.515782674153646, "step": 2122 }, { "epoch": 0.5818829656023023, "grad_norm": 8.1875, "kl": 0.49021148681640625, "learning_rate": 5e-06, "logits/chosen": -8407385.6, "logits/rejected": -32626346.666666668, "logps/chosen": -456.38977864583336, "logps/rejected": -486.6891818576389, "loss": 0.0569, "rewards/chosen": 6.524491373697916, "rewards/margins": 17.892460462782118, "rewards/rejected": -11.367969089084202, "step": 2123 }, { "epoch": 0.5821570508428121, "grad_norm": 6.03125, "kl": 0.06039460748434067, "learning_rate": 5e-06, "logits/chosen": -18291204.923076924, "logits/rejected": -22242619.636363637, "logps/chosen": -348.0985576923077, "logps/rejected": -616.7020152698864, "loss": 0.0436, "rewards/chosen": 5.3938129131610575, "rewards/margins": 17.275525393185916, "rewards/rejected": -11.881712480024857, "step": 2124 }, { "epoch": 0.5824311360833219, "grad_norm": 9.0, "kl": 6.6717071533203125, "learning_rate": 5e-06, "logits/chosen": 2827789.846153846, "logits/rejected": -42614016.0, "logps/chosen": -376.6828801081731, "logps/rejected": -490.75106534090907, "loss": 0.0477, "rewards/chosen": 5.8784966102013225, "rewards/margins": 17.292611822381723, "rewards/rejected": -11.414115212180398, "step": 2125 }, { "epoch": 0.5827052213238317, "grad_norm": 4.21875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -28013516.8, "logits/rejected": -20109261.714285713, "logps/chosen": -481.6634765625, "logps/rejected": -453.98263113839283, "loss": 0.0197, "rewards/chosen": 6.073551940917969, "rewards/margins": 16.13782958984375, "rewards/rejected": -10.064277648925781, "step": 2126 }, { "epoch": 0.5829793065643415, "grad_norm": 2.09375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -14363436.307692308, "logits/rejected": -11805894.545454545, "logps/chosen": -357.7663762019231, "logps/rejected": -576.7266068892045, "loss": 0.0059, "rewards/chosen": 7.161321786733774, "rewards/margins": 17.08135810265174, "rewards/rejected": -9.920036315917969, "step": 2127 }, { "epoch": 0.5832533918048514, "grad_norm": 12.4375, "kl": 3.049069881439209, "learning_rate": 5e-06, "logits/chosen": -26736617.14285714, "logits/rejected": -18772052.8, "logps/chosen": -388.40555245535717, "logps/rejected": -412.874853515625, "loss": 0.0717, "rewards/chosen": 6.58173097882952, "rewards/margins": 15.460637228829519, "rewards/rejected": -8.87890625, "step": 2128 }, { "epoch": 0.5835274770453611, "grad_norm": 6.03125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 2852836.0, "logits/rejected": -13626715.0, "logps/chosen": -357.8809509277344, "logps/rejected": -507.8498229980469, "loss": 0.0256, "rewards/chosen": 7.232987403869629, "rewards/margins": 17.812052726745605, "rewards/rejected": -10.579065322875977, "step": 2129 }, { "epoch": 0.5838015622858709, "grad_norm": 6.84375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -13883869.090909092, "logits/rejected": 13281833.846153846, "logps/chosen": -416.65944602272725, "logps/rejected": -661.1028395432693, "loss": 0.0154, "rewards/chosen": 5.860104647549716, "rewards/margins": 19.695092581368826, "rewards/rejected": -13.83498793381911, "step": 2130 }, { "epoch": 0.5840756475263807, "grad_norm": 7.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -13170779.636363637, "logits/rejected": 22437666.46153846, "logps/chosen": -447.8284357244318, "logps/rejected": -630.0425931490385, "loss": 0.0415, "rewards/chosen": 6.424459284002131, "rewards/margins": 23.09402903310069, "rewards/rejected": -16.669569749098557, "step": 2131 }, { "epoch": 0.5843497327668905, "grad_norm": 5.6875, "kl": 2.955056667327881, "learning_rate": 5e-06, "logits/chosen": -12953050.181818182, "logits/rejected": -388118.3076923077, "logps/chosen": -360.4784490411932, "logps/rejected": -534.7435021033654, "loss": 0.0404, "rewards/chosen": 5.165999325838956, "rewards/margins": 15.280766547143042, "rewards/rejected": -10.114767221304087, "step": 2132 }, { "epoch": 0.5846238180074003, "grad_norm": 3.09375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -23227192.888888888, "logits/rejected": -17811018.666666668, "logps/chosen": -436.65142144097223, "logps/rejected": -497.03167317708335, "loss": 0.0089, "rewards/chosen": 7.439449734157986, "rewards/margins": 18.960767279730902, "rewards/rejected": -11.521317545572916, "step": 2133 }, { "epoch": 0.5848979032479101, "grad_norm": 10.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -59352081.45454545, "logits/rejected": -38902749.538461536, "logps/chosen": -474.32137784090907, "logps/rejected": -509.9743088942308, "loss": 0.0383, "rewards/chosen": 5.766646645285866, "rewards/margins": 16.63357826713082, "rewards/rejected": -10.866931621844952, "step": 2134 }, { "epoch": 0.5851719884884199, "grad_norm": 7.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -30578150.4, "logits/rejected": -31764621.714285713, "logps/chosen": -454.131982421875, "logps/rejected": -655.3639090401786, "loss": 0.0261, "rewards/chosen": 7.082856750488281, "rewards/margins": 17.900400870186942, "rewards/rejected": -10.817544119698661, "step": 2135 }, { "epoch": 0.5854460737289297, "grad_norm": 9.9375, "kl": 9.792182922363281, "learning_rate": 5e-06, "logits/chosen": -29449686.4, "logits/rejected": -20426358.85714286, "logps/chosen": -421.122265625, "logps/rejected": -512.9408133370536, "loss": 0.0442, "rewards/chosen": 6.762929534912109, "rewards/margins": 16.719585854666573, "rewards/rejected": -9.956656319754464, "step": 2136 }, { "epoch": 0.5857201589694395, "grad_norm": 10.6875, "kl": 9.099266052246094, "learning_rate": 5e-06, "logits/chosen": -39770240.0, "logits/rejected": -29997649.454545453, "logps/chosen": -460.45935997596155, "logps/rejected": -513.3543146306819, "loss": 0.0567, "rewards/chosen": 7.289678720327524, "rewards/margins": 18.36076178917518, "rewards/rejected": -11.071083068847656, "step": 2137 }, { "epoch": 0.5859942442099493, "grad_norm": 20.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -9557194.666666666, "logits/rejected": -21107687.111111112, "logps/chosen": -432.68483072916666, "logps/rejected": -362.8634440104167, "loss": 0.0993, "rewards/chosen": 6.263851928710937, "rewards/margins": 13.382362704806859, "rewards/rejected": -7.1185107760959205, "step": 2138 }, { "epoch": 0.5862683294504591, "grad_norm": 7.25, "kl": 3.507750988006592, "learning_rate": 5e-06, "logits/chosen": -4016063.5, "logits/rejected": -19777628.0, "logps/chosen": -562.9644775390625, "logps/rejected": -577.1492919921875, "loss": 0.0155, "rewards/chosen": 7.153438568115234, "rewards/margins": 16.567160606384277, "rewards/rejected": -9.413722038269043, "step": 2139 }, { "epoch": 0.5865424146909689, "grad_norm": 5.625, "kl": 0.3746757507324219, "learning_rate": 5e-06, "logits/chosen": -36201016.0, "logits/rejected": -27067386.0, "logps/chosen": -459.77313232421875, "logps/rejected": -447.0, "loss": 0.0324, "rewards/chosen": 7.493675231933594, "rewards/margins": 18.159661293029785, "rewards/rejected": -10.665986061096191, "step": 2140 }, { "epoch": 0.5868164999314787, "grad_norm": 2.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -23160781.333333332, "logits/rejected": -46890698.666666664, "logps/chosen": -341.0271809895833, "logps/rejected": -540.1982828776041, "loss": 0.0086, "rewards/chosen": 5.989185333251953, "rewards/margins": 16.2037296295166, "rewards/rejected": -10.214544296264648, "step": 2141 }, { "epoch": 0.5870905851719885, "grad_norm": 12.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -31258821.333333332, "logits/rejected": -33125826.666666668, "logps/chosen": -496.2452799479167, "logps/rejected": -561.580078125, "loss": 0.0967, "rewards/chosen": 7.1285400390625, "rewards/margins": 16.92383130391439, "rewards/rejected": -9.795291264851889, "step": 2142 }, { "epoch": 0.5873646704124983, "grad_norm": 6.15625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -34571792.0, "logits/rejected": -17220934.0, "logps/chosen": -383.933837890625, "logps/rejected": -520.1976318359375, "loss": 0.0241, "rewards/chosen": 7.86857795715332, "rewards/margins": 17.6112642288208, "rewards/rejected": -9.74268627166748, "step": 2143 }, { "epoch": 0.587638755653008, "grad_norm": 6.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -31935913.6, "logits/rejected": -10201350.857142856, "logps/chosen": -380.1107177734375, "logps/rejected": -541.0489676339286, "loss": 0.0115, "rewards/chosen": 6.821420288085937, "rewards/margins": 17.963831656319755, "rewards/rejected": -11.142411368233818, "step": 2144 }, { "epoch": 0.5879128408935179, "grad_norm": 16.625, "kl": 15.671846389770508, "learning_rate": 5e-06, "logits/chosen": 2235385.0, "logits/rejected": -29750584.0, "logps/chosen": -465.69329833984375, "logps/rejected": -365.5209045410156, "loss": 0.0885, "rewards/chosen": 6.900042533874512, "rewards/margins": 13.135589599609375, "rewards/rejected": -6.235547065734863, "step": 2145 }, { "epoch": 0.5881869261340277, "grad_norm": 4.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -22104452.57142857, "logits/rejected": -29875206.4, "logps/chosen": -523.9329310825893, "logps/rejected": -547.948388671875, "loss": 0.0491, "rewards/chosen": 6.992813655308315, "rewards/margins": 19.851302119663785, "rewards/rejected": -12.858488464355469, "step": 2146 }, { "epoch": 0.5884610113745374, "grad_norm": 6.15625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -13912362.666666666, "logits/rejected": -34413752.88888889, "logps/chosen": -510.5093994140625, "logps/rejected": -649.1004231770834, "loss": 0.0165, "rewards/chosen": 5.4101003011067705, "rewards/margins": 17.144261678059895, "rewards/rejected": -11.734161376953125, "step": 2147 }, { "epoch": 0.5887350966150473, "grad_norm": 4.40625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -32375536.0, "logits/rejected": -24367384.0, "logps/chosen": -321.416015625, "logps/rejected": -587.7035522460938, "loss": 0.0106, "rewards/chosen": 6.052656173706055, "rewards/margins": 16.4710693359375, "rewards/rejected": -10.418413162231445, "step": 2148 }, { "epoch": 0.5890091818555571, "grad_norm": 6.46875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -26518007.111111112, "logits/rejected": -17126122.666666668, "logps/chosen": -333.8068576388889, "logps/rejected": -685.3757161458333, "loss": 0.0325, "rewards/chosen": 5.637394799126519, "rewards/margins": 18.999518415662976, "rewards/rejected": -13.362123616536458, "step": 2149 }, { "epoch": 0.5892832670960669, "grad_norm": 4.78125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -31661034.666666668, "logits/rejected": -45120122.666666664, "logps/chosen": -445.745361328125, "logps/rejected": -453.5135498046875, "loss": 0.013, "rewards/chosen": 7.462151209513347, "rewards/margins": 18.17829958597819, "rewards/rejected": -10.716148376464844, "step": 2150 }, { "epoch": 0.5895573523365767, "grad_norm": 2.15625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -25227408.0, "logits/rejected": -24266072.0, "logps/chosen": -432.31134033203125, "logps/rejected": -560.2775268554688, "loss": 0.0058, "rewards/chosen": 8.427921295166016, "rewards/margins": 20.040393829345703, "rewards/rejected": -11.612472534179688, "step": 2151 }, { "epoch": 0.5898314375770864, "grad_norm": 2.171875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -40920684.8, "logits/rejected": -24149734.85714286, "logps/chosen": -388.62744140625, "logps/rejected": -580.7172502790179, "loss": 0.0069, "rewards/chosen": 6.943627166748047, "rewards/margins": 18.010248129708426, "rewards/rejected": -11.06662096296038, "step": 2152 }, { "epoch": 0.5901055228175963, "grad_norm": 4.65625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -42653984.0, "logits/rejected": -24061462.0, "logps/chosen": -396.92230224609375, "logps/rejected": -448.247314453125, "loss": 0.0251, "rewards/chosen": 5.708291530609131, "rewards/margins": 14.90488576889038, "rewards/rejected": -9.19659423828125, "step": 2153 }, { "epoch": 0.5903796080581061, "grad_norm": 1.8359375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -36611568.0, "logits/rejected": -16978782.666666668, "logps/chosen": -468.1298828125, "logps/rejected": -559.4476318359375, "loss": 0.0067, "rewards/chosen": 6.507033030192058, "rewards/margins": 17.14499855041504, "rewards/rejected": -10.637965520222982, "step": 2154 }, { "epoch": 0.5906536932986158, "grad_norm": 7.84375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -38929178.666666664, "logits/rejected": -34675002.666666664, "logps/chosen": -401.1503092447917, "logps/rejected": -515.5962727864584, "loss": 0.0384, "rewards/chosen": 6.114525477091472, "rewards/margins": 18.458396275838215, "rewards/rejected": -12.343870798746744, "step": 2155 }, { "epoch": 0.5909277785391257, "grad_norm": 3.703125, "kl": 4.912854194641113, "learning_rate": 5e-06, "logits/chosen": -3963438.769230769, "logits/rejected": -15395319.272727273, "logps/chosen": -416.56460336538464, "logps/rejected": -636.7313565340909, "loss": 0.047, "rewards/chosen": 7.316204951359675, "rewards/margins": 22.009377059403, "rewards/rejected": -14.693172108043324, "step": 2156 }, { "epoch": 0.5912018637796355, "grad_norm": 9.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -5877941.333333333, "logits/rejected": -77481500.44444445, "logps/chosen": -371.11891276041666, "logps/rejected": -595.7492404513889, "loss": 0.0507, "rewards/chosen": 4.714406331380208, "rewards/margins": 16.93650224473741, "rewards/rejected": -12.222095913357204, "step": 2157 }, { "epoch": 0.5914759490201452, "grad_norm": 3.421875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -31910208.0, "logits/rejected": -15788998.857142856, "logps/chosen": -399.7058349609375, "logps/rejected": -424.2987583705357, "loss": 0.0098, "rewards/chosen": 7.388449096679688, "rewards/margins": 17.79518519810268, "rewards/rejected": -10.406736101422991, "step": 2158 }, { "epoch": 0.5917500342606551, "grad_norm": 7.65625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 20241450.666666668, "logits/rejected": -21708177.333333332, "logps/chosen": -600.6414388020834, "logps/rejected": -480.9698893229167, "loss": 0.0208, "rewards/chosen": 6.8651173909505205, "rewards/margins": 17.84917704264323, "rewards/rejected": -10.984059651692709, "step": 2159 }, { "epoch": 0.5920241195011648, "grad_norm": 1.0859375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -15893121.6, "logits/rejected": -30321435.42857143, "logps/chosen": -463.964599609375, "logps/rejected": -662.9504743303571, "loss": 0.0026, "rewards/chosen": 7.244794464111328, "rewards/margins": 19.78776866367885, "rewards/rejected": -12.542974199567523, "step": 2160 }, { "epoch": 0.5922982047416746, "grad_norm": 10.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -29574117.818181816, "logits/rejected": -36193341.538461536, "logps/chosen": -390.03391335227275, "logps/rejected": -588.4260441706731, "loss": 0.0266, "rewards/chosen": 5.063650998202237, "rewards/margins": 16.232042299283968, "rewards/rejected": -11.16839130108173, "step": 2161 }, { "epoch": 0.5925722899821845, "grad_norm": 3.015625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -69288535.27272727, "logits/rejected": -31085550.769230768, "logps/chosen": -459.8890269886364, "logps/rejected": -442.49320162259613, "loss": 0.0074, "rewards/chosen": 6.653273148970171, "rewards/margins": 17.302338606827742, "rewards/rejected": -10.649065457857573, "step": 2162 }, { "epoch": 0.5928463752226942, "grad_norm": 8.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -35577841.23076923, "logits/rejected": -46537026.90909091, "logps/chosen": -452.7204026442308, "logps/rejected": -502.05886008522725, "loss": 0.0214, "rewards/chosen": 6.3166339580829325, "rewards/margins": 18.194429090806654, "rewards/rejected": -11.87779513272372, "step": 2163 }, { "epoch": 0.5931204604632041, "grad_norm": 18.5, "kl": 5.8484697341918945, "learning_rate": 5e-06, "logits/chosen": -26128170.666666668, "logits/rejected": -43174304.0, "logps/chosen": -406.64153645833335, "logps/rejected": -555.2554253472222, "loss": 0.0776, "rewards/chosen": 5.993400065104167, "rewards/margins": 18.86180826822917, "rewards/rejected": -12.868408203125, "step": 2164 }, { "epoch": 0.5933945457037139, "grad_norm": 2.78125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -20227963.2, "logits/rejected": -41076982.85714286, "logps/chosen": -482.5244140625, "logps/rejected": -548.1082589285714, "loss": 0.0048, "rewards/chosen": 6.34664306640625, "rewards/margins": 18.30279039655413, "rewards/rejected": -11.95614733014788, "step": 2165 }, { "epoch": 0.5936686309442236, "grad_norm": 6.28125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -11015589.818181818, "logits/rejected": -14553710.76923077, "logps/chosen": -456.61110617897725, "logps/rejected": -456.3073542668269, "loss": 0.0256, "rewards/chosen": 4.694655678488991, "rewards/margins": 17.382998433146444, "rewards/rejected": -12.688342754657452, "step": 2166 }, { "epoch": 0.5939427161847335, "grad_norm": 7.34375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -37773765.333333336, "logits/rejected": -14648200.0, "logps/chosen": -390.2578125, "logps/rejected": -419.5199788411458, "loss": 0.0207, "rewards/chosen": 8.50216801961263, "rewards/margins": 15.816275914510092, "rewards/rejected": -7.314107894897461, "step": 2167 }, { "epoch": 0.5942168014252432, "grad_norm": 2.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -30373757.09090909, "logits/rejected": -32913331.692307692, "logps/chosen": -341.3960626775568, "logps/rejected": -404.111328125, "loss": 0.008, "rewards/chosen": 6.762606534090909, "rewards/margins": 16.919912911795237, "rewards/rejected": -10.157306377704327, "step": 2168 }, { "epoch": 0.594490886665753, "grad_norm": 5.0625, "kl": 5.508502006530762, "learning_rate": 5e-06, "logits/chosen": -26487502.222222224, "logits/rejected": -33091136.0, "logps/chosen": -372.16826714409723, "logps/rejected": -531.287890625, "loss": 0.0164, "rewards/chosen": 6.465708838568793, "rewards/margins": 17.90114991929796, "rewards/rejected": -11.435441080729166, "step": 2169 }, { "epoch": 0.5947649719062629, "grad_norm": 0.37890625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -49769584.0, "logits/rejected": -39453385.14285714, "logps/chosen": -526.72509765625, "logps/rejected": -636.3349609375, "loss": 0.0013, "rewards/chosen": 8.010971832275391, "rewards/margins": 21.133274296351843, "rewards/rejected": -13.122302464076451, "step": 2170 }, { "epoch": 0.5950390571467726, "grad_norm": 8.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -17944529.6, "logits/rejected": -28860379.42857143, "logps/chosen": -371.789208984375, "logps/rejected": -696.0126953125, "loss": 0.0473, "rewards/chosen": 4.213309860229492, "rewards/margins": 17.22279210771833, "rewards/rejected": -13.009482247488839, "step": 2171 }, { "epoch": 0.5953131423872824, "grad_norm": 1.609375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -11995141.6, "logits/rejected": -16251163.42857143, "logps/chosen": -472.012060546875, "logps/rejected": -449.34769112723217, "loss": 0.0054, "rewards/chosen": 7.734413146972656, "rewards/margins": 17.524215698242188, "rewards/rejected": -9.789802551269531, "step": 2172 }, { "epoch": 0.5955872276277923, "grad_norm": 8.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -45140362.666666664, "logits/rejected": -41980154.666666664, "logps/chosen": -315.3769938151042, "logps/rejected": -655.1410725911459, "loss": 0.0233, "rewards/chosen": 5.762478510538737, "rewards/margins": 18.820287704467773, "rewards/rejected": -13.057809193929037, "step": 2173 }, { "epoch": 0.595861312868302, "grad_norm": 9.75, "kl": 6.16293478012085, "learning_rate": 5e-06, "logits/chosen": -19614102.0, "logits/rejected": -61638940.0, "logps/chosen": -410.6842956542969, "logps/rejected": -565.7598266601562, "loss": 0.0785, "rewards/chosen": 6.03289794921875, "rewards/margins": 20.493722915649414, "rewards/rejected": -14.460824966430664, "step": 2174 }, { "epoch": 0.5961353981088119, "grad_norm": 10.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -31477715.2, "logits/rejected": -19473366.85714286, "logps/chosen": -527.793359375, "logps/rejected": -514.9218052455357, "loss": 0.0169, "rewards/chosen": 5.627285003662109, "rewards/margins": 16.793113163539342, "rewards/rejected": -11.165828159877233, "step": 2175 }, { "epoch": 0.5964094833493216, "grad_norm": 11.8125, "kl": 2.049309492111206, "learning_rate": 5e-06, "logits/chosen": -25835832.888888888, "logits/rejected": -28199449.6, "logps/chosen": -471.5774739583333, "logps/rejected": -668.8044270833333, "loss": 0.0285, "rewards/chosen": 6.244370354546441, "rewards/margins": 19.147822909884983, "rewards/rejected": -12.903452555338541, "step": 2176 }, { "epoch": 0.5966835685898314, "grad_norm": 11.5, "kl": 5.008486270904541, "learning_rate": 5e-06, "logits/chosen": -8911688.533333333, "logits/rejected": 50801731.55555555, "logps/chosen": -427.08570963541666, "logps/rejected": -487.0569118923611, "loss": 0.0354, "rewards/chosen": 6.230491638183594, "rewards/margins": 19.46101972791884, "rewards/rejected": -13.230528089735243, "step": 2177 }, { "epoch": 0.5969576538303413, "grad_norm": 5.0, "kl": 8.84598159790039, "learning_rate": 5e-06, "logits/chosen": -26518109.866666667, "logits/rejected": -85343800.8888889, "logps/chosen": -501.1429036458333, "logps/rejected": -598.5286458333334, "loss": 0.056, "rewards/chosen": 6.596148681640625, "rewards/margins": 21.661279975043403, "rewards/rejected": -15.065131293402779, "step": 2178 }, { "epoch": 0.597231739070851, "grad_norm": 10.5625, "kl": 10.65564250946045, "learning_rate": 5e-06, "logits/chosen": -43579933.538461536, "logits/rejected": -21101469.09090909, "logps/chosen": -463.27013221153845, "logps/rejected": -582.2319779829545, "loss": 0.0384, "rewards/chosen": 7.344271733210637, "rewards/margins": 18.386445639016745, "rewards/rejected": -11.042173905806107, "step": 2179 }, { "epoch": 0.5975058243113608, "grad_norm": 7.75, "kl": 9.445077896118164, "learning_rate": 5e-06, "logits/chosen": -26130950.4, "logits/rejected": 22711854.222222224, "logps/chosen": -447.4447916666667, "logps/rejected": -640.5590277777778, "loss": 0.0724, "rewards/chosen": 7.231640116373698, "rewards/margins": 18.150772772894964, "rewards/rejected": -10.919132656521267, "step": 2180 }, { "epoch": 0.5977799095518707, "grad_norm": 2.734375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -25212189.333333332, "logits/rejected": -35348394.666666664, "logps/chosen": -431.1717529296875, "logps/rejected": -600.1024576822916, "loss": 0.0058, "rewards/chosen": 7.027581532796224, "rewards/margins": 20.477624257405598, "rewards/rejected": -13.450042724609375, "step": 2181 }, { "epoch": 0.5980539947923804, "grad_norm": 3.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -25465553.454545453, "logits/rejected": -29627505.230769232, "logps/chosen": -422.99702592329544, "logps/rejected": -721.3228665865385, "loss": 0.017, "rewards/chosen": 7.079569036310369, "rewards/margins": 20.083618484176956, "rewards/rejected": -13.004049447866587, "step": 2182 }, { "epoch": 0.5983280800328902, "grad_norm": 11.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -13331138.666666666, "logits/rejected": -25850978.666666668, "logps/chosen": -325.160888671875, "logps/rejected": -486.3913981119792, "loss": 0.0501, "rewards/chosen": 5.32306448618571, "rewards/margins": 18.60388469696045, "rewards/rejected": -13.28082021077474, "step": 2183 }, { "epoch": 0.5986021652734, "grad_norm": 2.6875, "kl": 0.8591588735580444, "learning_rate": 5e-06, "logits/chosen": -8470990.76923077, "logits/rejected": -30197597.09090909, "logps/chosen": -445.4480168269231, "logps/rejected": -585.4016335227273, "loss": 0.0043, "rewards/chosen": 6.751483623798077, "rewards/margins": 18.63209330952251, "rewards/rejected": -11.880609685724432, "step": 2184 }, { "epoch": 0.5988762505139098, "grad_norm": 9.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -44460435.2, "logits/rejected": -18514937.14285714, "logps/chosen": -436.4451171875, "logps/rejected": -528.2497209821429, "loss": 0.0793, "rewards/chosen": 6.180952835083008, "rewards/margins": 16.994935117449078, "rewards/rejected": -10.813982282366071, "step": 2185 }, { "epoch": 0.5991503357544197, "grad_norm": 9.25, "kl": 4.789362907409668, "learning_rate": 5e-06, "logits/chosen": -25215620.923076924, "logits/rejected": -10762240.0, "logps/chosen": -430.8313551682692, "logps/rejected": -572.1481267755681, "loss": 0.0237, "rewards/chosen": 7.460747352013221, "rewards/margins": 17.490833522556546, "rewards/rejected": -10.030086170543324, "step": 2186 }, { "epoch": 0.5994244209949294, "grad_norm": 2.890625, "kl": 0.9782218933105469, "learning_rate": 5e-06, "logits/chosen": -35283924.571428575, "logits/rejected": -17479784.0, "logps/chosen": -423.9850376674107, "logps/rejected": -543.0978515625, "loss": 0.007, "rewards/chosen": 7.39567620413644, "rewards/margins": 18.90359170096261, "rewards/rejected": -11.507915496826172, "step": 2187 }, { "epoch": 0.5996985062354392, "grad_norm": 5.9375, "kl": 3.2888388633728027, "learning_rate": 5e-06, "logits/chosen": -32931453.09090909, "logits/rejected": -25735313.230769232, "logps/chosen": -430.23495205965907, "logps/rejected": -574.4508713942307, "loss": 0.0109, "rewards/chosen": 7.320572592995384, "rewards/margins": 19.197718213488173, "rewards/rejected": -11.877145620492788, "step": 2188 }, { "epoch": 0.5999725914759491, "grad_norm": 2.734375, "kl": 1.8177287578582764, "learning_rate": 5e-06, "logits/chosen": -28131313.230769232, "logits/rejected": -23194909.09090909, "logps/chosen": -393.1641376201923, "logps/rejected": -594.4780717329545, "loss": 0.0273, "rewards/chosen": 6.558665935809795, "rewards/margins": 19.073691681548432, "rewards/rejected": -12.515025745738637, "step": 2189 }, { "epoch": 0.6002466767164588, "grad_norm": 3.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 10933206.857142856, "logits/rejected": -29962352.0, "logps/chosen": -511.3933803013393, "logps/rejected": -644.8236328125, "loss": 0.0182, "rewards/chosen": 6.798444475446429, "rewards/margins": 21.989621843610493, "rewards/rejected": -15.191177368164062, "step": 2190 }, { "epoch": 0.6005207619569686, "grad_norm": 0.49609375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -30437856.0, "logits/rejected": -18674170.666666668, "logps/chosen": -450.7813313802083, "logps/rejected": -480.69829644097223, "loss": 0.021, "rewards/chosen": 6.3756459554036455, "rewards/margins": 17.673858642578125, "rewards/rejected": -11.298212687174479, "step": 2191 }, { "epoch": 0.6007948471974784, "grad_norm": 9.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -13283822.933333334, "logits/rejected": -29446282.666666668, "logps/chosen": -372.037890625, "logps/rejected": -457.6717122395833, "loss": 0.0638, "rewards/chosen": 5.344114176432291, "rewards/margins": 14.7434087117513, "rewards/rejected": -9.39929453531901, "step": 2192 }, { "epoch": 0.6010689324379882, "grad_norm": 2.875, "kl": 5.162571907043457, "learning_rate": 5e-06, "logits/chosen": -25485414.85714286, "logits/rejected": -20776916.8, "logps/chosen": -425.67494419642856, "logps/rejected": -455.10771484375, "loss": 0.0359, "rewards/chosen": 7.00762939453125, "rewards/margins": 18.27763214111328, "rewards/rejected": -11.270002746582032, "step": 2193 }, { "epoch": 0.601343017678498, "grad_norm": 5.53125, "kl": 5.714724540710449, "learning_rate": 5e-06, "logits/chosen": -19761680.0, "logits/rejected": -36270373.333333336, "logps/chosen": -377.5932888454861, "logps/rejected": -428.7765299479167, "loss": 0.0568, "rewards/chosen": 6.7635345458984375, "rewards/margins": 18.87397003173828, "rewards/rejected": -12.110435485839844, "step": 2194 }, { "epoch": 0.6016171029190078, "grad_norm": 6.65625, "kl": 1.1521995067596436, "learning_rate": 5e-06, "logits/chosen": -27860149.333333332, "logits/rejected": -33711648.0, "logps/chosen": -497.8890380859375, "logps/rejected": -705.2659505208334, "loss": 0.0215, "rewards/chosen": 6.590413411458333, "rewards/margins": 18.143877665201824, "rewards/rejected": -11.55346425374349, "step": 2195 }, { "epoch": 0.6018911881595176, "grad_norm": 8.375, "kl": 5.956332206726074, "learning_rate": 5e-06, "logits/chosen": -28796224.0, "logits/rejected": -12498891.2, "logps/chosen": -402.6757114955357, "logps/rejected": -654.1390625, "loss": 0.0381, "rewards/chosen": 6.262894766671317, "rewards/margins": 19.256048148018973, "rewards/rejected": -12.993153381347657, "step": 2196 }, { "epoch": 0.6021652734000275, "grad_norm": 8.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -24450624.0, "logits/rejected": -4004005.0, "logps/chosen": -505.3919677734375, "logps/rejected": -537.07373046875, "loss": 0.0262, "rewards/chosen": 8.312037467956543, "rewards/margins": 18.254374504089355, "rewards/rejected": -9.942337036132812, "step": 2197 }, { "epoch": 0.6024393586405372, "grad_norm": 5.125, "kl": 1.152191162109375, "learning_rate": 5e-06, "logits/chosen": -17002477.53846154, "logits/rejected": -19150842.181818184, "logps/chosen": -343.7086838942308, "logps/rejected": -416.40633877840907, "loss": 0.0206, "rewards/chosen": 6.872990534855769, "rewards/margins": 15.219000916380981, "rewards/rejected": -8.346010381525213, "step": 2198 }, { "epoch": 0.602713443881047, "grad_norm": 3.703125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 6885761.6, "logits/rejected": -16070052.57142857, "logps/chosen": -433.0412109375, "logps/rejected": -547.5890066964286, "loss": 0.0097, "rewards/chosen": 7.574167633056641, "rewards/margins": 19.15115476335798, "rewards/rejected": -11.576987130301339, "step": 2199 }, { "epoch": 0.6029875291215568, "grad_norm": 6.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -12802402.909090908, "logits/rejected": -41504580.92307692, "logps/chosen": -491.48970170454544, "logps/rejected": -556.9084284855769, "loss": 0.0235, "rewards/chosen": 7.296288230202415, "rewards/margins": 19.537225683252295, "rewards/rejected": -12.24093745304988, "step": 2200 }, { "epoch": 0.6032616143620666, "grad_norm": 5.5625, "kl": 1.661350965499878, "learning_rate": 5e-06, "logits/chosen": -23022016.0, "logits/rejected": -25138682.666666668, "logps/chosen": -324.7130940755208, "logps/rejected": -579.317626953125, "loss": 0.0663, "rewards/chosen": 7.504355112711589, "rewards/margins": 17.708365122477215, "rewards/rejected": -10.204010009765625, "step": 2201 }, { "epoch": 0.6035356996025764, "grad_norm": 7.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -31673572.0, "logits/rejected": -24489804.0, "logps/chosen": -379.821044921875, "logps/rejected": -467.80255126953125, "loss": 0.0245, "rewards/chosen": 5.106947898864746, "rewards/margins": 14.952818870544434, "rewards/rejected": -9.845870971679688, "step": 2202 }, { "epoch": 0.6038097848430862, "grad_norm": 10.0625, "kl": 3.2385292053222656, "learning_rate": 5e-06, "logits/chosen": -31444234.666666668, "logits/rejected": 26736200.0, "logps/chosen": -404.1475423177083, "logps/rejected": -533.6051432291666, "loss": 0.0465, "rewards/chosen": 5.714305877685547, "rewards/margins": 16.151487986246742, "rewards/rejected": -10.437182108561197, "step": 2203 }, { "epoch": 0.604083870083596, "grad_norm": 7.90625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -55170662.4, "logits/rejected": -35821588.571428575, "logps/chosen": -376.352294921875, "logps/rejected": -613.7390485491071, "loss": 0.0282, "rewards/chosen": 6.302594757080078, "rewards/margins": 16.748656027657645, "rewards/rejected": -10.446061270577568, "step": 2204 }, { "epoch": 0.6043579553241057, "grad_norm": 4.4375, "kl": 8.374228477478027, "learning_rate": 5e-06, "logits/chosen": -32320134.4, "logits/rejected": -28131756.0, "logps/chosen": -400.551513671875, "logps/rejected": -568.64111328125, "loss": 0.0162, "rewards/chosen": 6.81468505859375, "rewards/margins": 16.114491653442382, "rewards/rejected": -9.299806594848633, "step": 2205 }, { "epoch": 0.6046320405646156, "grad_norm": 11.3125, "kl": 16.873455047607422, "learning_rate": 5e-06, "logits/chosen": -20078905.6, "logits/rejected": -41725315.55555555, "logps/chosen": -364.32483723958336, "logps/rejected": -527.2006293402778, "loss": 0.167, "rewards/chosen": 6.464699300130208, "rewards/margins": 16.78651394314236, "rewards/rejected": -10.321814643012154, "step": 2206 }, { "epoch": 0.6049061258051254, "grad_norm": 3.609375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -33610870.15384615, "logits/rejected": -19031214.545454547, "logps/chosen": -458.69779146634613, "logps/rejected": -454.1975763494318, "loss": 0.0253, "rewards/chosen": 6.2027411827674275, "rewards/margins": 16.43494666039527, "rewards/rejected": -10.232205477627842, "step": 2207 }, { "epoch": 0.6051802110456352, "grad_norm": 9.125, "kl": 2.977275848388672, "learning_rate": 5e-06, "logits/chosen": -23771072.0, "logits/rejected": -28571829.333333332, "logps/chosen": -479.2791748046875, "logps/rejected": -494.4981689453125, "loss": 0.0361, "rewards/chosen": 7.596284866333008, "rewards/margins": 17.217138290405273, "rewards/rejected": -9.620853424072266, "step": 2208 }, { "epoch": 0.605454296286145, "grad_norm": 9.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -37170116.0, "logits/rejected": 7328840.0, "logps/chosen": -398.0444641113281, "logps/rejected": -639.2219848632812, "loss": 0.0611, "rewards/chosen": 7.310476779937744, "rewards/margins": 19.898430347442627, "rewards/rejected": -12.587953567504883, "step": 2209 }, { "epoch": 0.6057283815266548, "grad_norm": 11.0, "kl": 3.0522563457489014, "learning_rate": 5e-06, "logits/chosen": -12626013.6, "logits/rejected": -6196970.285714285, "logps/chosen": -405.0981689453125, "logps/rejected": -742.1803850446429, "loss": 0.0212, "rewards/chosen": 7.93250732421875, "rewards/margins": 19.78261043003627, "rewards/rejected": -11.850103105817523, "step": 2210 }, { "epoch": 0.6060024667671646, "grad_norm": 15.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -5699824.888888889, "logits/rejected": -19937454.933333334, "logps/chosen": -317.35557725694446, "logps/rejected": -490.7828125, "loss": 0.084, "rewards/chosen": 5.15890375773112, "rewards/margins": 16.366183217366537, "rewards/rejected": -11.207279459635417, "step": 2211 }, { "epoch": 0.6062765520076744, "grad_norm": 7.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -32469704.727272727, "logits/rejected": -36530119.384615384, "logps/chosen": -349.08571555397725, "logps/rejected": -551.3809720552885, "loss": 0.053, "rewards/chosen": 4.857719421386719, "rewards/margins": 17.705928509051983, "rewards/rejected": -12.848209087665264, "step": 2212 }, { "epoch": 0.6065506372481841, "grad_norm": 3.484375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -30222850.90909091, "logits/rejected": -8093676.923076923, "logps/chosen": -328.38345614346593, "logps/rejected": -526.7566481370193, "loss": 0.0173, "rewards/chosen": 6.276912342418324, "rewards/margins": 17.7938988959039, "rewards/rejected": -11.516986553485577, "step": 2213 }, { "epoch": 0.606824722488694, "grad_norm": 2.5, "kl": 1.378225326538086, "learning_rate": 5e-06, "logits/chosen": -21569822.0, "logits/rejected": -40466236.0, "logps/chosen": -447.0928955078125, "logps/rejected": -594.4880981445312, "loss": 0.0068, "rewards/chosen": 7.886535167694092, "rewards/margins": 21.771926403045654, "rewards/rejected": -13.885391235351562, "step": 2214 }, { "epoch": 0.6070988077292038, "grad_norm": 4.40625, "kl": 6.561585903167725, "learning_rate": 5e-06, "logits/chosen": -33439818.666666668, "logits/rejected": -29446546.666666668, "logps/chosen": -527.9283854166666, "logps/rejected": -524.9613850911459, "loss": 0.0126, "rewards/chosen": 8.343863169352213, "rewards/margins": 20.65662892659505, "rewards/rejected": -12.312765757242838, "step": 2215 }, { "epoch": 0.6073728929697135, "grad_norm": 9.375, "kl": 6.46578311920166, "learning_rate": 5e-06, "logits/chosen": -26986016.0, "logits/rejected": -55840976.0, "logps/chosen": -389.3118591308594, "logps/rejected": -443.64239501953125, "loss": 0.037, "rewards/chosen": 7.304097652435303, "rewards/margins": 15.58944845199585, "rewards/rejected": -8.285350799560547, "step": 2216 }, { "epoch": 0.6076469782102234, "grad_norm": 9.9375, "kl": 1.1343930959701538, "learning_rate": 5e-06, "logits/chosen": -3047837.8666666667, "logits/rejected": -32393998.222222224, "logps/chosen": -444.96751302083334, "logps/rejected": -474.8603515625, "loss": 0.0641, "rewards/chosen": 6.009226989746094, "rewards/margins": 15.990991889105903, "rewards/rejected": -9.98176489935981, "step": 2217 }, { "epoch": 0.6079210634507332, "grad_norm": 10.5625, "kl": 10.529559135437012, "learning_rate": 5e-06, "logits/chosen": -36874252.8, "logits/rejected": -7347346.666666667, "logps/chosen": -421.5343424479167, "logps/rejected": -597.5794813368055, "loss": 0.0565, "rewards/chosen": 7.704297383626302, "rewards/margins": 20.909359402126736, "rewards/rejected": -13.205062018500435, "step": 2218 }, { "epoch": 0.608195148691243, "grad_norm": 1.8046875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -22132467.2, "logits/rejected": 28570875.42857143, "logps/chosen": -415.97646484375, "logps/rejected": -556.2882952008929, "loss": 0.0057, "rewards/chosen": 6.372903823852539, "rewards/margins": 15.694548198154994, "rewards/rejected": -9.321644374302455, "step": 2219 }, { "epoch": 0.6084692339317528, "grad_norm": 7.0, "kl": 2.639923095703125, "learning_rate": 5e-06, "logits/chosen": 20789735.384615384, "logits/rejected": 101690833.45454545, "logps/chosen": -408.6981670673077, "logps/rejected": -493.8663884943182, "loss": 0.0396, "rewards/chosen": 5.673254159780649, "rewards/margins": 16.606948479072198, "rewards/rejected": -10.933694319291549, "step": 2220 }, { "epoch": 0.6087433191722625, "grad_norm": 9.0625, "kl": 12.60730266571045, "learning_rate": 5e-06, "logits/chosen": -27130236.444444444, "logits/rejected": -33253416.533333335, "logps/chosen": -457.4415690104167, "logps/rejected": -550.8982421875, "loss": 0.0284, "rewards/chosen": 8.146057976616753, "rewards/margins": 18.544522942437066, "rewards/rejected": -10.398464965820313, "step": 2221 }, { "epoch": 0.6090174044127724, "grad_norm": 4.875, "kl": 0.00672785472124815, "learning_rate": 5e-06, "logits/chosen": -5939803.6, "logits/rejected": -44286157.71428572, "logps/chosen": -415.200390625, "logps/rejected": -508.80726841517856, "loss": 0.0321, "rewards/chosen": 5.707733154296875, "rewards/margins": 15.876656668526786, "rewards/rejected": -10.168923514229911, "step": 2222 }, { "epoch": 0.6092914896532822, "grad_norm": 11.875, "kl": 0.9351577758789062, "learning_rate": 5e-06, "logits/chosen": -22385014.0, "logits/rejected": 9310759.0, "logps/chosen": -460.2674255371094, "logps/rejected": -606.7841186523438, "loss": 0.0682, "rewards/chosen": 6.377255439758301, "rewards/margins": 21.05961322784424, "rewards/rejected": -14.682357788085938, "step": 2223 }, { "epoch": 0.6095655748937919, "grad_norm": 7.34375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -48796712.72727273, "logits/rejected": -28113659.076923076, "logps/chosen": -352.8942205255682, "logps/rejected": -523.6200796274038, "loss": 0.0396, "rewards/chosen": 6.838109796697443, "rewards/margins": 16.714420878803814, "rewards/rejected": -9.87631108210637, "step": 2224 }, { "epoch": 0.6098396601343018, "grad_norm": 9.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -23051000.0, "logits/rejected": -35185312.0, "logps/chosen": -358.2611083984375, "logps/rejected": -525.485107421875, "loss": 0.0476, "rewards/chosen": 5.963507970174153, "rewards/margins": 16.35377566019694, "rewards/rejected": -10.390267690022787, "step": 2225 }, { "epoch": 0.6101137453748116, "grad_norm": 7.71875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -32422044.8, "logits/rejected": -37127314.28571428, "logps/chosen": -450.56953125, "logps/rejected": -493.89306640625, "loss": 0.0954, "rewards/chosen": 7.637179565429688, "rewards/margins": 16.811651829310826, "rewards/rejected": -9.174472263881139, "step": 2226 }, { "epoch": 0.6103878306153213, "grad_norm": 1.734375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -19627138.285714287, "logits/rejected": -26551916.8, "logps/chosen": -494.6511928013393, "logps/rejected": -583.94716796875, "loss": 0.0051, "rewards/chosen": 6.930153982979911, "rewards/margins": 20.396908133370534, "rewards/rejected": -13.466754150390624, "step": 2227 }, { "epoch": 0.6106619158558312, "grad_norm": 2.34375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 1109473.6363636365, "logits/rejected": -2094990.1538461538, "logps/chosen": -425.3722478693182, "logps/rejected": -763.1319861778846, "loss": 0.0056, "rewards/chosen": 7.4795448996803975, "rewards/margins": 25.27896459619482, "rewards/rejected": -17.799419696514423, "step": 2228 }, { "epoch": 0.6109360010963409, "grad_norm": 8.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -31556036.923076924, "logits/rejected": -20873771.636363637, "logps/chosen": -413.36583533653845, "logps/rejected": -456.50772372159093, "loss": 0.032, "rewards/chosen": 6.902976989746094, "rewards/margins": 17.437883550470524, "rewards/rejected": -10.534906560724432, "step": 2229 }, { "epoch": 0.6112100863368508, "grad_norm": 2.796875, "kl": 2.9898574352264404, "learning_rate": 5e-06, "logits/chosen": -37660104.53333333, "logits/rejected": -45785749.333333336, "logps/chosen": -406.4861328125, "logps/rejected": -639.0295138888889, "loss": 0.0082, "rewards/chosen": 7.177137247721354, "rewards/margins": 21.349217393663196, "rewards/rejected": -14.172080145941841, "step": 2230 }, { "epoch": 0.6114841715773606, "grad_norm": 12.5625, "kl": 7.213981628417969, "learning_rate": 5e-06, "logits/chosen": -9138173.0, "logits/rejected": -16767059.0, "logps/chosen": -434.93841552734375, "logps/rejected": -543.642578125, "loss": 0.0535, "rewards/chosen": 7.151169776916504, "rewards/margins": 17.64907741546631, "rewards/rejected": -10.497907638549805, "step": 2231 }, { "epoch": 0.6117582568178703, "grad_norm": 4.34375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -10112381.866666667, "logits/rejected": -35595889.777777776, "logps/chosen": -415.6391276041667, "logps/rejected": -662.8716362847222, "loss": 0.0151, "rewards/chosen": 6.0669504801432295, "rewards/margins": 17.233535766601562, "rewards/rejected": -11.166585286458334, "step": 2232 }, { "epoch": 0.6120323420583802, "grad_norm": 6.875, "kl": 1.7060697078704834, "learning_rate": 5e-06, "logits/chosen": -18183333.333333332, "logits/rejected": 13633440.0, "logps/chosen": -381.26996527777777, "logps/rejected": -484.55771484375, "loss": 0.0436, "rewards/chosen": 6.63770506117079, "rewards/margins": 18.808417341444226, "rewards/rejected": -12.170712280273438, "step": 2233 }, { "epoch": 0.61230642729889, "grad_norm": 0.625, "kl": 0.4904734492301941, "learning_rate": 5e-06, "logits/chosen": -52908445.538461536, "logits/rejected": -30421937.454545453, "logps/chosen": -457.30615234375, "logps/rejected": -414.54811789772725, "loss": 0.0016, "rewards/chosen": 7.998025160569411, "rewards/margins": 20.36848081575407, "rewards/rejected": -12.370455655184658, "step": 2234 }, { "epoch": 0.6125805125393997, "grad_norm": 2.953125, "kl": 2.3960700035095215, "learning_rate": 5e-06, "logits/chosen": 14248686.545454545, "logits/rejected": -25378050.46153846, "logps/chosen": -441.77565696022725, "logps/rejected": -514.7870342548077, "loss": 0.0103, "rewards/chosen": 5.812588778409091, "rewards/margins": 18.3911551095389, "rewards/rejected": -12.578566331129808, "step": 2235 }, { "epoch": 0.6128545977799096, "grad_norm": 16.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -15912428.8, "logits/rejected": -49016845.71428572, "logps/chosen": -389.178466796875, "logps/rejected": -563.1731305803571, "loss": 0.0573, "rewards/chosen": 5.948181533813477, "rewards/margins": 16.135121318272184, "rewards/rejected": -10.186939784458705, "step": 2236 }, { "epoch": 0.6131286830204193, "grad_norm": 6.875, "kl": 0.5161794424057007, "learning_rate": 5e-06, "logits/chosen": -33270985.846153848, "logits/rejected": -27296357.818181816, "logps/chosen": -407.37992037259613, "logps/rejected": -490.0179332386364, "loss": 0.0718, "rewards/chosen": 6.7048163780799275, "rewards/margins": 14.38975567584271, "rewards/rejected": -7.684939297762784, "step": 2237 }, { "epoch": 0.6134027682609291, "grad_norm": 1.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -19058714.285714287, "logits/rejected": -36890764.8, "logps/chosen": -305.9133998325893, "logps/rejected": -631.20546875, "loss": 0.0091, "rewards/chosen": 6.091927664620536, "rewards/margins": 17.792318289620535, "rewards/rejected": -11.700390625, "step": 2238 }, { "epoch": 0.613676853501439, "grad_norm": 8.0, "kl": 2.360995054244995, "learning_rate": 5e-06, "logits/chosen": -30128535.272727273, "logits/rejected": -38909828.92307692, "logps/chosen": -401.47265625, "logps/rejected": -485.85096153846155, "loss": 0.0631, "rewards/chosen": 6.66457089510831, "rewards/margins": 15.39655378648451, "rewards/rejected": -8.731982891376202, "step": 2239 }, { "epoch": 0.6139509387419487, "grad_norm": 6.5, "kl": 10.810160636901855, "learning_rate": 5e-06, "logits/chosen": -31649990.85714286, "logits/rejected": -55283372.8, "logps/chosen": -601.8902064732143, "logps/rejected": -470.69775390625, "loss": 0.0284, "rewards/chosen": 7.123325892857143, "rewards/margins": 16.49766137259347, "rewards/rejected": -9.374335479736327, "step": 2240 }, { "epoch": 0.6142250239824586, "grad_norm": 8.6875, "kl": 6.74765682220459, "learning_rate": 5e-06, "logits/chosen": -49009142.15384615, "logits/rejected": -34234007.27272727, "logps/chosen": -433.27599158653845, "logps/rejected": -519.7243430397727, "loss": 0.0536, "rewards/chosen": 6.441916245680589, "rewards/margins": 16.299938068523275, "rewards/rejected": -9.858021822842685, "step": 2241 }, { "epoch": 0.6144991092229684, "grad_norm": 1.734375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -28022919.384615384, "logits/rejected": -24360661.818181816, "logps/chosen": -408.11031400240387, "logps/rejected": -642.1769353693181, "loss": 0.0193, "rewards/chosen": 6.162169236403245, "rewards/margins": 17.614832311243443, "rewards/rejected": -11.4526630748402, "step": 2242 }, { "epoch": 0.6147731944634781, "grad_norm": 2.71875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -18307364.0, "logits/rejected": -36041344.0, "logps/chosen": -454.84222412109375, "logps/rejected": -645.6376953125, "loss": 0.0084, "rewards/chosen": 6.382572650909424, "rewards/margins": 18.34472131729126, "rewards/rejected": -11.962148666381836, "step": 2243 }, { "epoch": 0.615047279703988, "grad_norm": 2.859375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -23437768.888888888, "logits/rejected": -36871859.2, "logps/chosen": -511.52338324652777, "logps/rejected": -480.7215169270833, "loss": 0.0101, "rewards/chosen": 8.913897196451822, "rewards/margins": 16.54248809814453, "rewards/rejected": -7.628590901692708, "step": 2244 }, { "epoch": 0.6153213649444977, "grad_norm": 4.21875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -20416142.769230768, "logits/rejected": -2545280.0, "logps/chosen": -411.72607421875, "logps/rejected": -608.9821555397727, "loss": 0.0111, "rewards/chosen": 6.372613173264724, "rewards/margins": 18.506321780331486, "rewards/rejected": -12.133708607066762, "step": 2245 }, { "epoch": 0.6155954501850075, "grad_norm": 5.53125, "kl": 4.147731781005859, "learning_rate": 5e-06, "logits/chosen": -51524772.571428575, "logits/rejected": -27026579.2, "logps/chosen": -437.31312779017856, "logps/rejected": -548.12275390625, "loss": 0.0432, "rewards/chosen": 7.031842912946429, "rewards/margins": 17.223289598737445, "rewards/rejected": -10.191446685791016, "step": 2246 }, { "epoch": 0.6158695354255174, "grad_norm": 12.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -19896105.14285714, "logits/rejected": -23128293.647058822, "logps/chosen": -318.18722098214283, "logps/rejected": -503.6773897058824, "loss": 0.0594, "rewards/chosen": 4.902975899832589, "rewards/margins": 14.449802430737922, "rewards/rejected": -9.546826530905332, "step": 2247 }, { "epoch": 0.6161436206660271, "grad_norm": 1.84375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -15960664.727272727, "logits/rejected": -31329907.692307692, "logps/chosen": -560.5245028409091, "logps/rejected": -517.140625, "loss": 0.0055, "rewards/chosen": 7.164037531072443, "rewards/margins": 16.682721918279476, "rewards/rejected": -9.518684387207031, "step": 2248 }, { "epoch": 0.6164177059065369, "grad_norm": 8.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -13313173.333333334, "logits/rejected": -31268636.444444444, "logps/chosen": -416.2941487630208, "logps/rejected": -374.9311794704861, "loss": 0.0347, "rewards/chosen": 9.879608154296875, "rewards/margins": 17.607304043240017, "rewards/rejected": -7.727695888943142, "step": 2249 }, { "epoch": 0.6166917911470468, "grad_norm": 12.25, "kl": 5.08280086517334, "learning_rate": 5e-06, "logits/chosen": -19477312.0, "logits/rejected": -18892861.09090909, "logps/chosen": -391.3108097956731, "logps/rejected": -699.3249733664773, "loss": 0.0505, "rewards/chosen": 6.236360990084135, "rewards/margins": 20.905282720819223, "rewards/rejected": -14.668921730735086, "step": 2250 }, { "epoch": 0.6169658763875565, "grad_norm": 8.0, "kl": 0.6101049184799194, "learning_rate": 5e-06, "logits/chosen": -30530242.666666668, "logits/rejected": -15835861.333333334, "logps/chosen": -413.7733154296875, "logps/rejected": -377.9165852864583, "loss": 0.0352, "rewards/chosen": 6.229886372884114, "rewards/margins": 14.519634246826172, "rewards/rejected": -8.289747873942057, "step": 2251 }, { "epoch": 0.6172399616280664, "grad_norm": 9.75, "kl": 0.8595353960990906, "learning_rate": 5e-06, "logits/chosen": -35914132.571428575, "logits/rejected": -43363289.6, "logps/chosen": -425.61178152901783, "logps/rejected": -641.447900390625, "loss": 0.0439, "rewards/chosen": 7.12231935773577, "rewards/margins": 20.492755453927177, "rewards/rejected": -13.370436096191407, "step": 2252 }, { "epoch": 0.6175140468685761, "grad_norm": 7.34375, "kl": 2.339046001434326, "learning_rate": 5e-06, "logits/chosen": -30420754.666666668, "logits/rejected": -31623290.666666668, "logps/chosen": -444.8187662760417, "logps/rejected": -636.5015869140625, "loss": 0.0336, "rewards/chosen": 7.250525156656901, "rewards/margins": 17.609773635864258, "rewards/rejected": -10.359248479207357, "step": 2253 }, { "epoch": 0.6177881321090859, "grad_norm": 10.0625, "kl": 10.378274917602539, "learning_rate": 5e-06, "logits/chosen": -25889460.57142857, "logits/rejected": -8796839.2, "logps/chosen": -517.6070731026786, "logps/rejected": -658.84580078125, "loss": 0.0351, "rewards/chosen": 7.0812481471470425, "rewards/margins": 17.963485390799388, "rewards/rejected": -10.882237243652344, "step": 2254 }, { "epoch": 0.6180622173495958, "grad_norm": 14.5625, "kl": 3.533064603805542, "learning_rate": 5e-06, "logits/chosen": -20515974.0, "logits/rejected": -30520112.0, "logps/chosen": -347.19403076171875, "logps/rejected": -715.058837890625, "loss": 0.0674, "rewards/chosen": 6.336322784423828, "rewards/margins": 20.053903579711914, "rewards/rejected": -13.717580795288086, "step": 2255 }, { "epoch": 0.6183363025901055, "grad_norm": 4.0, "kl": 3.4687678813934326, "learning_rate": 5e-06, "logits/chosen": -45791689.84615385, "logits/rejected": -19243872.0, "logps/chosen": -432.42296424278845, "logps/rejected": -650.6376953125, "loss": 0.0108, "rewards/chosen": 8.064470731295073, "rewards/margins": 22.8517326941857, "rewards/rejected": -14.787261962890625, "step": 2256 }, { "epoch": 0.6186103878306153, "grad_norm": 6.34375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -28727532.8, "logits/rejected": -23845440.0, "logps/chosen": -399.797607421875, "logps/rejected": -488.7725306919643, "loss": 0.0394, "rewards/chosen": 5.358429718017578, "rewards/margins": 14.265192522321428, "rewards/rejected": -8.90676280430385, "step": 2257 }, { "epoch": 0.6188844730711252, "grad_norm": 6.34375, "kl": 6.4546918869018555, "learning_rate": 5e-06, "logits/chosen": -24765212.8, "logits/rejected": -21167282.285714287, "logps/chosen": -369.341845703125, "logps/rejected": -551.2506277901786, "loss": 0.044, "rewards/chosen": 6.696479034423828, "rewards/margins": 17.978526851109095, "rewards/rejected": -11.282047816685267, "step": 2258 }, { "epoch": 0.6191585583116349, "grad_norm": 4.34375, "kl": 5.331413269042969, "learning_rate": 5e-06, "logits/chosen": -21245874.90909091, "logits/rejected": -21189026.46153846, "logps/chosen": -553.6702325994319, "logps/rejected": -514.5766977163462, "loss": 0.0216, "rewards/chosen": 7.913204539905895, "rewards/margins": 17.510519227781494, "rewards/rejected": -9.5973146878756, "step": 2259 }, { "epoch": 0.6194326435521447, "grad_norm": 8.3125, "kl": 2.3706374168395996, "learning_rate": 5e-06, "logits/chosen": -32771608.0, "logits/rejected": -8251354.666666667, "logps/chosen": -401.4986165364583, "logps/rejected": -555.4042561848959, "loss": 0.0417, "rewards/chosen": 7.855735778808594, "rewards/margins": 16.887304306030273, "rewards/rejected": -9.03156852722168, "step": 2260 }, { "epoch": 0.6197067287926545, "grad_norm": 3.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -50648473.6, "logits/rejected": -36129106.28571428, "logps/chosen": -442.24443359375, "logps/rejected": -524.4289202008929, "loss": 0.0096, "rewards/chosen": 5.793862152099609, "rewards/margins": 15.535272543770926, "rewards/rejected": -9.741410391671318, "step": 2261 }, { "epoch": 0.6199808140331643, "grad_norm": 2.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -13992154.181818182, "logits/rejected": -30809858.46153846, "logps/chosen": -533.6478604403409, "logps/rejected": -465.76900540865387, "loss": 0.0338, "rewards/chosen": 6.862068176269531, "rewards/margins": 16.451420123760517, "rewards/rejected": -9.589351947490986, "step": 2262 }, { "epoch": 0.6202548992736742, "grad_norm": 8.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -24092411.076923076, "logits/rejected": -15696613.818181818, "logps/chosen": -384.38269981971155, "logps/rejected": -408.2080078125, "loss": 0.0519, "rewards/chosen": 6.16513178898738, "rewards/margins": 14.78270492020187, "rewards/rejected": -8.617573131214488, "step": 2263 }, { "epoch": 0.6205289845141839, "grad_norm": 3.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -706937.7777777778, "logits/rejected": -16197093.333333334, "logps/chosen": -411.6455349392361, "logps/rejected": -556.9069010416666, "loss": 0.0133, "rewards/chosen": 5.864325629340278, "rewards/margins": 17.728632269965278, "rewards/rejected": -11.864306640625, "step": 2264 }, { "epoch": 0.6208030697546937, "grad_norm": 8.875, "kl": 0.21106529235839844, "learning_rate": 5e-06, "logits/chosen": -9195838.857142856, "logits/rejected": -33793667.2, "logps/chosen": -377.0289829799107, "logps/rejected": -459.35625, "loss": 0.0377, "rewards/chosen": 7.010033743722098, "rewards/margins": 17.784449332101005, "rewards/rejected": -10.774415588378906, "step": 2265 }, { "epoch": 0.6210771549952036, "grad_norm": 2.515625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -26176996.923076924, "logits/rejected": -37068238.54545455, "logps/chosen": -388.7800480769231, "logps/rejected": -442.1077325994318, "loss": 0.0112, "rewards/chosen": 6.57044924222506, "rewards/margins": 14.694805598759153, "rewards/rejected": -8.124356356534092, "step": 2266 }, { "epoch": 0.6213512402357133, "grad_norm": 9.1875, "kl": 5.470054626464844, "learning_rate": 5e-06, "logits/chosen": -17624567.272727273, "logits/rejected": -32765065.846153848, "logps/chosen": -380.1687677556818, "logps/rejected": -490.2918043870192, "loss": 0.0284, "rewards/chosen": 7.290783968838778, "rewards/margins": 16.378360614909994, "rewards/rejected": -9.087576646071215, "step": 2267 }, { "epoch": 0.6216253254762231, "grad_norm": 1.96875, "kl": 8.556510925292969, "learning_rate": 5e-06, "logits/chosen": -28251669.333333332, "logits/rejected": -14326384.0, "logps/chosen": -490.3694254557292, "logps/rejected": -701.0380859375, "loss": 0.007, "rewards/chosen": 8.061495463053385, "rewards/margins": 18.824952443440754, "rewards/rejected": -10.76345698038737, "step": 2268 }, { "epoch": 0.6218994107167329, "grad_norm": 10.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -10465737.6, "logits/rejected": -26639102.222222224, "logps/chosen": -327.58160807291665, "logps/rejected": -612.82177734375, "loss": 0.0747, "rewards/chosen": 5.00411376953125, "rewards/margins": 16.85748867458767, "rewards/rejected": -11.853374905056423, "step": 2269 }, { "epoch": 0.6221734959572427, "grad_norm": 10.5625, "kl": 9.041720390319824, "learning_rate": 5e-06, "logits/chosen": -32585597.866666667, "logits/rejected": -23220049.777777776, "logps/chosen": -429.07737630208334, "logps/rejected": -485.07530381944446, "loss": 0.077, "rewards/chosen": 6.9366709391276045, "rewards/margins": 15.97585890028212, "rewards/rejected": -9.039187961154514, "step": 2270 }, { "epoch": 0.6224475811977525, "grad_norm": 4.9375, "kl": 3.9046497344970703, "learning_rate": 5e-06, "logits/chosen": -28914906.666666668, "logits/rejected": -20650328.0, "logps/chosen": -381.34912109375, "logps/rejected": -402.1669921875, "loss": 0.0386, "rewards/chosen": 7.138304392496745, "rewards/margins": 15.103984832763672, "rewards/rejected": -7.965680440266927, "step": 2271 }, { "epoch": 0.6227216664382623, "grad_norm": 8.5, "kl": 3.326498031616211, "learning_rate": 5e-06, "logits/chosen": -34911008.0, "logits/rejected": -23523399.272727273, "logps/chosen": -402.85580679086536, "logps/rejected": -500.2424982244318, "loss": 0.0413, "rewards/chosen": 7.170740567720854, "rewards/margins": 16.258745633638824, "rewards/rejected": -9.088005065917969, "step": 2272 }, { "epoch": 0.6229957516787721, "grad_norm": 9.375, "kl": 1.8985061645507812, "learning_rate": 5e-06, "logits/chosen": -18583352.0, "logits/rejected": -7194081.0, "logps/chosen": -464.0875549316406, "logps/rejected": -621.821533203125, "loss": 0.0456, "rewards/chosen": 5.002586364746094, "rewards/margins": 18.935165405273438, "rewards/rejected": -13.932579040527344, "step": 2273 }, { "epoch": 0.623269836919282, "grad_norm": 5.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -37601540.92307692, "logits/rejected": -11611325.090909092, "logps/chosen": -360.8505859375, "logps/rejected": -693.8053977272727, "loss": 0.0458, "rewards/chosen": 6.0396564190204325, "rewards/margins": 18.793036934379096, "rewards/rejected": -12.753380515358664, "step": 2274 }, { "epoch": 0.6235439221597917, "grad_norm": 1.53125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -22005771.2, "logits/rejected": 195073.14285714287, "logps/chosen": -285.017724609375, "logps/rejected": -591.7710658482143, "loss": 0.0041, "rewards/chosen": 6.733045196533203, "rewards/margins": 18.823516736711774, "rewards/rejected": -12.090471540178571, "step": 2275 }, { "epoch": 0.6238180074003015, "grad_norm": 8.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 4046875.2, "logits/rejected": -14534667.42857143, "logps/chosen": -313.912646484375, "logps/rejected": -504.3140345982143, "loss": 0.0466, "rewards/chosen": 5.134371566772461, "rewards/margins": 14.909637941632951, "rewards/rejected": -9.775266374860491, "step": 2276 }, { "epoch": 0.6240920926408113, "grad_norm": 0.7265625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -37502485.333333336, "logits/rejected": -17180034.666666668, "logps/chosen": -506.0890299479167, "logps/rejected": -505.0467122395833, "loss": 0.003, "rewards/chosen": 7.593138376871745, "rewards/margins": 18.119576772054035, "rewards/rejected": -10.526438395182291, "step": 2277 }, { "epoch": 0.6243661778813211, "grad_norm": 5.65625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -18931196.0, "logits/rejected": -33858780.0, "logps/chosen": -343.61328125, "logps/rejected": -412.05316162109375, "loss": 0.0242, "rewards/chosen": 7.2034010887146, "rewards/margins": 16.633633136749268, "rewards/rejected": -9.430232048034668, "step": 2278 }, { "epoch": 0.6246402631218309, "grad_norm": 3.375, "kl": 1.0449600219726562, "learning_rate": 5e-06, "logits/chosen": -17770948.8, "logits/rejected": -23363945.14285714, "logps/chosen": -585.177880859375, "logps/rejected": -548.9298967633929, "loss": 0.0116, "rewards/chosen": 8.766486358642577, "rewards/margins": 18.333898598807195, "rewards/rejected": -9.56741224016462, "step": 2279 }, { "epoch": 0.6249143483623407, "grad_norm": 1.3984375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -9247786.4, "logits/rejected": -30178532.57142857, "logps/chosen": -517.93046875, "logps/rejected": -482.6301967075893, "loss": 0.0039, "rewards/chosen": 7.4687355041503904, "rewards/margins": 18.806722041538784, "rewards/rejected": -11.337986537388392, "step": 2280 }, { "epoch": 0.6251884336028505, "grad_norm": 3.6875, "kl": 0.43939846754074097, "learning_rate": 5e-06, "logits/chosen": -45393979.07692308, "logits/rejected": -13707629.090909092, "logps/chosen": -433.9523737980769, "logps/rejected": -711.4117542613636, "loss": 0.0113, "rewards/chosen": 6.892977201021635, "rewards/margins": 19.459043596174332, "rewards/rejected": -12.5660663951527, "step": 2281 }, { "epoch": 0.6254625188433602, "grad_norm": 6.96875, "kl": 17.30350112915039, "learning_rate": 5e-06, "logits/chosen": -7529601.230769231, "logits/rejected": -19361057.454545453, "logps/chosen": -527.7155573918269, "logps/rejected": -492.30366654829544, "loss": 0.043, "rewards/chosen": 8.7046872652494, "rewards/margins": 18.072014175094925, "rewards/rejected": -9.367326909845525, "step": 2282 }, { "epoch": 0.6257366040838701, "grad_norm": 5.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -43543896.615384616, "logits/rejected": -37276416.0, "logps/chosen": -472.63585486778845, "logps/rejected": -463.02388139204544, "loss": 0.055, "rewards/chosen": 6.948205801156851, "rewards/margins": 14.88785590325202, "rewards/rejected": -7.939650102095171, "step": 2283 }, { "epoch": 0.6260106893243799, "grad_norm": 4.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -3146891.272727273, "logits/rejected": -25475953.230769232, "logps/chosen": -354.2637384588068, "logps/rejected": -637.0505558894231, "loss": 0.0191, "rewards/chosen": 7.122901222922585, "rewards/margins": 19.3488075949929, "rewards/rejected": -12.225906372070312, "step": 2284 }, { "epoch": 0.6262847745648897, "grad_norm": 12.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -19408733.09090909, "logits/rejected": -38241070.76923077, "logps/chosen": -330.7771661931818, "logps/rejected": -365.75048828125, "loss": 0.0679, "rewards/chosen": 5.283225666392934, "rewards/margins": 12.858606431867694, "rewards/rejected": -7.57538076547476, "step": 2285 }, { "epoch": 0.6265588598053995, "grad_norm": 6.28125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -5485379.0, "logits/rejected": -36871957.333333336, "logps/chosen": -492.8407796223958, "logps/rejected": -488.9180908203125, "loss": 0.0499, "rewards/chosen": 6.5744279225667315, "rewards/margins": 16.61695671081543, "rewards/rejected": -10.042528788248697, "step": 2286 }, { "epoch": 0.6268329450459093, "grad_norm": 2.140625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -34342513.23076923, "logits/rejected": -1856580.3636363635, "logps/chosen": -410.1330754206731, "logps/rejected": -674.4056729403409, "loss": 0.0058, "rewards/chosen": 6.63081535926232, "rewards/margins": 20.743795594968994, "rewards/rejected": -14.112980235706676, "step": 2287 }, { "epoch": 0.6271070302864191, "grad_norm": 9.6875, "kl": 6.125638961791992, "learning_rate": 5e-06, "logits/chosen": -32831960.0, "logits/rejected": -46639928.0, "logps/chosen": -465.3251037597656, "logps/rejected": -707.6123046875, "loss": 0.0614, "rewards/chosen": 7.355838298797607, "rewards/margins": 20.11037588119507, "rewards/rejected": -12.754537582397461, "step": 2288 }, { "epoch": 0.6273811155269289, "grad_norm": 13.75, "kl": 1.5324440002441406, "learning_rate": 5e-06, "logits/chosen": -29377655.272727273, "logits/rejected": -56797366.15384615, "logps/chosen": -438.94655539772725, "logps/rejected": -730.4921875, "loss": 0.0251, "rewards/chosen": 7.305580139160156, "rewards/margins": 22.5052003126878, "rewards/rejected": -15.199620173527645, "step": 2289 }, { "epoch": 0.6276552007674386, "grad_norm": 6.1875, "kl": 2.48762583732605, "learning_rate": 5e-06, "logits/chosen": -12623454.933333334, "logits/rejected": -12154736.0, "logps/chosen": -492.37975260416664, "logps/rejected": -427.2916666666667, "loss": 0.0333, "rewards/chosen": 6.629606119791666, "rewards/margins": 15.18689439561632, "rewards/rejected": -8.557288275824654, "step": 2290 }, { "epoch": 0.6279292860079485, "grad_norm": 10.0, "kl": 2.3145334720611572, "learning_rate": 5e-06, "logits/chosen": -23949728.0, "logits/rejected": -25529539.2, "logps/chosen": -411.63643973214283, "logps/rejected": -507.02646484375, "loss": 0.0739, "rewards/chosen": 5.701673235212054, "rewards/margins": 14.91877681187221, "rewards/rejected": -9.217103576660156, "step": 2291 }, { "epoch": 0.6282033712484583, "grad_norm": 2.703125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -12644534.222222222, "logits/rejected": -13222967.466666667, "logps/chosen": -386.4619140625, "logps/rejected": -594.2703125, "loss": 0.0093, "rewards/chosen": 6.029425726996528, "rewards/margins": 16.267874823676216, "rewards/rejected": -10.238449096679688, "step": 2292 }, { "epoch": 0.628477456488968, "grad_norm": 6.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -37427508.0, "logits/rejected": -1673542.0, "logps/chosen": -483.31781005859375, "logps/rejected": -564.7496948242188, "loss": 0.0151, "rewards/chosen": 5.641016960144043, "rewards/margins": 15.399352073669434, "rewards/rejected": -9.75833511352539, "step": 2293 }, { "epoch": 0.6287515417294779, "grad_norm": 9.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -34402316.307692304, "logits/rejected": -43835624.72727273, "logps/chosen": -474.27016977163464, "logps/rejected": -569.9651988636364, "loss": 0.0405, "rewards/chosen": 6.367481525127705, "rewards/margins": 17.13222151536208, "rewards/rejected": -10.764739990234375, "step": 2294 }, { "epoch": 0.6290256269699877, "grad_norm": 2.359375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -13494114.909090908, "logits/rejected": -14723051.076923076, "logps/chosen": -453.0692027698864, "logps/rejected": -712.8766526442307, "loss": 0.0077, "rewards/chosen": 6.962916981090199, "rewards/margins": 21.580088048548134, "rewards/rejected": -14.617171067457933, "step": 2295 }, { "epoch": 0.6292997122104975, "grad_norm": 4.34375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -25244100.57142857, "logits/rejected": -40936000.0, "logps/chosen": -393.15391322544644, "logps/rejected": -584.1951976102941, "loss": 0.015, "rewards/chosen": 5.582432883126395, "rewards/margins": 16.03374118965213, "rewards/rejected": -10.451308306525736, "step": 2296 }, { "epoch": 0.6295737974510073, "grad_norm": 11.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -53077652.0, "logits/rejected": -37517112.0, "logps/chosen": -499.0904541015625, "logps/rejected": -514.0933837890625, "loss": 0.0613, "rewards/chosen": 6.013070106506348, "rewards/margins": 16.237783432006836, "rewards/rejected": -10.224713325500488, "step": 2297 }, { "epoch": 0.629847882691517, "grad_norm": 10.0, "kl": 8.113983154296875, "learning_rate": 5e-06, "logits/chosen": -8341437.818181818, "logits/rejected": -17020419.692307692, "logps/chosen": -349.5145152698864, "logps/rejected": -507.88773287259613, "loss": 0.0759, "rewards/chosen": 6.918300281871449, "rewards/margins": 16.412371068567666, "rewards/rejected": -9.494070786696215, "step": 2298 }, { "epoch": 0.6301219679320269, "grad_norm": 8.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -20421089.777777776, "logits/rejected": -18416699.733333334, "logps/chosen": -357.68462456597223, "logps/rejected": -532.60068359375, "loss": 0.024, "rewards/chosen": 6.925285339355469, "rewards/margins": 16.77421315511068, "rewards/rejected": -9.848927815755209, "step": 2299 }, { "epoch": 0.6303960531725367, "grad_norm": 1.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -9707054.0, "logits/rejected": -14030388.0, "logps/chosen": -566.62255859375, "logps/rejected": -677.9676513671875, "loss": 0.0041, "rewards/chosen": 6.222118377685547, "rewards/margins": 22.764619827270508, "rewards/rejected": -16.54250144958496, "step": 2300 }, { "epoch": 0.6306701384130464, "grad_norm": 4.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -37980002.90909091, "logits/rejected": -22337585.230769232, "logps/chosen": -473.77530184659093, "logps/rejected": -572.0253155048077, "loss": 0.0115, "rewards/chosen": 7.055314497514204, "rewards/margins": 19.938596498716127, "rewards/rejected": -12.883282001201923, "step": 2301 }, { "epoch": 0.6309442236535563, "grad_norm": 17.125, "kl": 5.685277462005615, "learning_rate": 5e-06, "logits/chosen": -22372866.666666668, "logits/rejected": -14088564.0, "logps/chosen": -456.0694580078125, "logps/rejected": -469.5743001302083, "loss": 0.041, "rewards/chosen": 6.926605860392253, "rewards/margins": 18.088432947794598, "rewards/rejected": -11.161827087402344, "step": 2302 }, { "epoch": 0.631218308894066, "grad_norm": 1.84375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -25157341.09090909, "logits/rejected": -37053828.92307692, "logps/chosen": -373.95987215909093, "logps/rejected": -547.251953125, "loss": 0.0075, "rewards/chosen": 5.347729422829368, "rewards/margins": 14.709054026570353, "rewards/rejected": -9.361324603740986, "step": 2303 }, { "epoch": 0.6314923941345758, "grad_norm": 5.375, "kl": 1.20572030544281, "learning_rate": 5e-06, "logits/chosen": -29836357.818181816, "logits/rejected": -26018929.230769232, "logps/chosen": -418.43776633522725, "logps/rejected": -552.5463115985577, "loss": 0.0437, "rewards/chosen": 7.083278309215199, "rewards/margins": 16.88207415600757, "rewards/rejected": -9.798795846792368, "step": 2304 }, { "epoch": 0.6317664793750857, "grad_norm": 1.0, "kl": 1.6449006795883179, "learning_rate": 5e-06, "logits/chosen": -10193989.818181818, "logits/rejected": -25633612.307692308, "logps/chosen": -419.53196022727275, "logps/rejected": -599.2531550480769, "loss": 0.003, "rewards/chosen": 7.188887856223366, "rewards/margins": 19.139768640478174, "rewards/rejected": -11.950880784254808, "step": 2305 }, { "epoch": 0.6320405646155954, "grad_norm": 14.375, "kl": 9.879964828491211, "learning_rate": 5e-06, "logits/chosen": -7618883.733333333, "logits/rejected": 7301377.777777778, "logps/chosen": -356.38701171875, "logps/rejected": -561.7055121527778, "loss": 0.0941, "rewards/chosen": 7.386067199707031, "rewards/margins": 15.116829257541234, "rewards/rejected": -7.730762057834202, "step": 2306 }, { "epoch": 0.6323146498561053, "grad_norm": 8.0625, "kl": 3.6283748149871826, "learning_rate": 5e-06, "logits/chosen": -26239396.57142857, "logits/rejected": -30913555.2, "logps/chosen": -517.2726353236607, "logps/rejected": -508.691259765625, "loss": 0.0179, "rewards/chosen": 7.782998221261161, "rewards/margins": 19.39622606549944, "rewards/rejected": -11.613227844238281, "step": 2307 }, { "epoch": 0.6325887350966151, "grad_norm": 8.625, "kl": 6.9409990310668945, "learning_rate": 5e-06, "logits/chosen": -26666866.666666668, "logits/rejected": -24800722.666666668, "logps/chosen": -475.7073160807292, "logps/rejected": -515.9648844401041, "loss": 0.026, "rewards/chosen": 7.165111541748047, "rewards/margins": 19.997486114501953, "rewards/rejected": -12.832374572753906, "step": 2308 }, { "epoch": 0.6328628203371248, "grad_norm": 2.46875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -32202144.0, "logits/rejected": -12682520.615384616, "logps/chosen": -424.1964666193182, "logps/rejected": -480.58067908653845, "loss": 0.0083, "rewards/chosen": 6.471744884144176, "rewards/margins": 15.837552050610523, "rewards/rejected": -9.365807166466347, "step": 2309 }, { "epoch": 0.6331369055776347, "grad_norm": 9.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -21516749.09090909, "logits/rejected": -33420253.53846154, "logps/chosen": -421.96102627840907, "logps/rejected": -614.6456580528846, "loss": 0.0741, "rewards/chosen": 5.479234175248579, "rewards/margins": 19.430617325789445, "rewards/rejected": -13.951383150540865, "step": 2310 }, { "epoch": 0.6334109908181444, "grad_norm": 9.25, "kl": 5.516953945159912, "learning_rate": 5e-06, "logits/chosen": 29320667.42857143, "logits/rejected": -886903.2, "logps/chosen": -509.23779296875, "logps/rejected": -472.78642578125, "loss": 0.0465, "rewards/chosen": 8.177165985107422, "rewards/margins": 14.009296417236328, "rewards/rejected": -5.832130432128906, "step": 2311 }, { "epoch": 0.6336850760586542, "grad_norm": 15.1875, "kl": 12.079658508300781, "learning_rate": 5e-06, "logits/chosen": -9545305.6, "logits/rejected": -13750808.0, "logps/chosen": -454.905078125, "logps/rejected": -503.0817057291667, "loss": 0.0949, "rewards/chosen": 6.467893473307291, "rewards/margins": 16.031575181749133, "rewards/rejected": -9.563681708441841, "step": 2312 }, { "epoch": 0.6339591612991641, "grad_norm": 0.423828125, "kl": 4.344091415405273, "learning_rate": 5e-06, "logits/chosen": -8998414.857142856, "logits/rejected": -20918832.0, "logps/chosen": -433.5353306361607, "logps/rejected": -486.877294921875, "loss": 0.0011, "rewards/chosen": 8.680709838867188, "rewards/margins": 20.301110076904298, "rewards/rejected": -11.62040023803711, "step": 2313 }, { "epoch": 0.6342332465396738, "grad_norm": 4.90625, "kl": 3.0371971130371094, "learning_rate": 5e-06, "logits/chosen": -37654754.90909091, "logits/rejected": -26303320.615384616, "logps/chosen": -494.57936789772725, "logps/rejected": -546.6742037259615, "loss": 0.0187, "rewards/chosen": 7.742182644930753, "rewards/margins": 18.387422841745657, "rewards/rejected": -10.645240196814903, "step": 2314 }, { "epoch": 0.6345073317801836, "grad_norm": 7.65625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -15077793.0, "logits/rejected": -23345014.0, "logps/chosen": -342.3008728027344, "logps/rejected": -609.821044921875, "loss": 0.0719, "rewards/chosen": 5.483484745025635, "rewards/margins": 17.1940598487854, "rewards/rejected": -11.710575103759766, "step": 2315 }, { "epoch": 0.6347814170206935, "grad_norm": 7.21875, "kl": 6.7192206382751465, "learning_rate": 5e-06, "logits/chosen": -20094996.363636363, "logits/rejected": -3233352.6153846155, "logps/chosen": -370.68379350142044, "logps/rejected": -667.7420372596154, "loss": 0.032, "rewards/chosen": 6.343107050115412, "rewards/margins": 17.87626322499522, "rewards/rejected": -11.533156174879808, "step": 2316 }, { "epoch": 0.6350555022612032, "grad_norm": 2.734375, "kl": 0.5754903554916382, "learning_rate": 5e-06, "logits/chosen": -19414284.0, "logits/rejected": -22842405.333333332, "logps/chosen": -523.7522786458334, "logps/rejected": -642.9419759114584, "loss": 0.0416, "rewards/chosen": 7.413881301879883, "rewards/margins": 18.492673873901367, "rewards/rejected": -11.078792572021484, "step": 2317 }, { "epoch": 0.635329587501713, "grad_norm": 3.90625, "kl": 0.7668317556381226, "learning_rate": 5e-06, "logits/chosen": -9128134.666666666, "logits/rejected": -17408577.333333332, "logps/chosen": -369.7271321614583, "logps/rejected": -542.6063639322916, "loss": 0.0099, "rewards/chosen": 7.391726175944011, "rewards/margins": 18.684546152750652, "rewards/rejected": -11.29281997680664, "step": 2318 }, { "epoch": 0.6356036727422228, "grad_norm": 1.3671875, "kl": 6.559651851654053, "learning_rate": 5e-06, "logits/chosen": 355193.4736842105, "logits/rejected": -21128665.6, "logps/chosen": -512.1345600328947, "logps/rejected": -463.432470703125, "loss": 0.0049, "rewards/chosen": 8.336070010536595, "rewards/margins": 17.838362643593236, "rewards/rejected": -9.50229263305664, "step": 2319 }, { "epoch": 0.6358777579827326, "grad_norm": 0.71484375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 145204.0, "logits/rejected": -31504169.14285714, "logps/chosen": -449.46748046875, "logps/rejected": -507.3549107142857, "loss": 0.002, "rewards/chosen": 7.40081787109375, "rewards/margins": 19.58673095703125, "rewards/rejected": -12.1859130859375, "step": 2320 }, { "epoch": 0.6361518432232425, "grad_norm": 5.09375, "kl": 0.19303512573242188, "learning_rate": 5e-06, "logits/chosen": -16429948.444444444, "logits/rejected": -17306257.066666666, "logps/chosen": -501.5050998263889, "logps/rejected": -517.4429036458333, "loss": 0.013, "rewards/chosen": 9.921808878580729, "rewards/margins": 19.121229044596355, "rewards/rejected": -9.199420166015624, "step": 2321 }, { "epoch": 0.6364259284637522, "grad_norm": 4.96875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -30123704.0, "logits/rejected": -29932821.333333332, "logps/chosen": -346.3811848958333, "logps/rejected": -434.4442545572917, "loss": 0.0375, "rewards/chosen": 6.315926869710286, "rewards/margins": 17.9724858601888, "rewards/rejected": -11.656558990478516, "step": 2322 }, { "epoch": 0.636700013704262, "grad_norm": 7.28125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -22319869.53846154, "logits/rejected": -21658976.0, "logps/chosen": -459.07327974759613, "logps/rejected": -587.9909446022727, "loss": 0.0184, "rewards/chosen": 6.18115234375, "rewards/margins": 17.56654219193892, "rewards/rejected": -11.38538984818892, "step": 2323 }, { "epoch": 0.6369740989447719, "grad_norm": 6.90625, "kl": 4.074743270874023, "learning_rate": 5e-06, "logits/chosen": -6238227.636363637, "logits/rejected": -25485080.615384616, "logps/chosen": -524.9512606534091, "logps/rejected": -371.5505558894231, "loss": 0.022, "rewards/chosen": 8.001977400346236, "rewards/margins": 17.499626373077607, "rewards/rejected": -9.49764897273137, "step": 2324 }, { "epoch": 0.6372481841852816, "grad_norm": 2.984375, "kl": 1.0259336233139038, "learning_rate": 5e-06, "logits/chosen": -20433590.85714286, "logits/rejected": -27995561.411764707, "logps/chosen": -483.96456473214283, "logps/rejected": -561.9739200367648, "loss": 0.008, "rewards/chosen": 7.787757328578404, "rewards/margins": 18.86858223666664, "rewards/rejected": -11.080824908088236, "step": 2325 }, { "epoch": 0.6375222694257914, "grad_norm": 15.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -3121972.727272727, "logits/rejected": 67202402.46153846, "logps/chosen": -390.68661221590907, "logps/rejected": -484.59652944711536, "loss": 0.0801, "rewards/chosen": 5.6017608642578125, "rewards/margins": 15.38384540264423, "rewards/rejected": -9.782084538386417, "step": 2326 }, { "epoch": 0.6377963546663012, "grad_norm": 11.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -29437728.0, "logits/rejected": -16977150.769230768, "logps/chosen": -311.2415660511364, "logps/rejected": -580.9782902644231, "loss": 0.0595, "rewards/chosen": 4.952129017223012, "rewards/margins": 14.542488044792123, "rewards/rejected": -9.59035902756911, "step": 2327 }, { "epoch": 0.638070439906811, "grad_norm": 7.03125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -18652440.0, "logits/rejected": -5001249.142857143, "logps/chosen": -467.490185546875, "logps/rejected": -627.8726981026786, "loss": 0.0137, "rewards/chosen": 6.315210342407227, "rewards/margins": 19.141584941319056, "rewards/rejected": -12.82637459891183, "step": 2328 }, { "epoch": 0.6383445251473208, "grad_norm": 5.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -28123346.285714287, "logits/rejected": -21821596.8, "logps/chosen": -368.73740931919644, "logps/rejected": -530.20439453125, "loss": 0.0166, "rewards/chosen": 7.064862387520926, "rewards/margins": 16.483714621407646, "rewards/rejected": -9.418852233886719, "step": 2329 }, { "epoch": 0.6386186103878306, "grad_norm": 6.1875, "kl": 7.7794342041015625, "learning_rate": 5e-06, "logits/chosen": 61729867.63636363, "logits/rejected": -31922451.692307692, "logps/chosen": -469.67724609375, "logps/rejected": -513.8130258413462, "loss": 0.0501, "rewards/chosen": 7.168978604403409, "rewards/margins": 16.779873614544634, "rewards/rejected": -9.610895010141226, "step": 2330 }, { "epoch": 0.6388926956283404, "grad_norm": 6.53125, "kl": 7.858129024505615, "learning_rate": 5e-06, "logits/chosen": 1369768.0, "logits/rejected": -35586483.2, "logps/chosen": -357.281005859375, "logps/rejected": -428.3232421875, "loss": 0.0542, "rewards/chosen": 5.778694152832031, "rewards/margins": 14.647206878662109, "rewards/rejected": -8.868512725830078, "step": 2331 }, { "epoch": 0.6391667808688503, "grad_norm": 9.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -17287218.285714287, "logits/rejected": -10285784.8, "logps/chosen": -392.54202706473217, "logps/rejected": -336.4615234375, "loss": 0.0393, "rewards/chosen": 6.529656546456473, "rewards/margins": 15.074109976632254, "rewards/rejected": -8.544453430175782, "step": 2332 }, { "epoch": 0.63944086610936, "grad_norm": 10.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -41154666.666666664, "logits/rejected": -19178042.666666668, "logps/chosen": -536.8623046875, "logps/rejected": -552.2638346354166, "loss": 0.0548, "rewards/chosen": 7.7133839925130205, "rewards/margins": 19.895955403645832, "rewards/rejected": -12.182571411132812, "step": 2333 }, { "epoch": 0.6397149513498698, "grad_norm": 6.4375, "kl": 7.528522491455078, "learning_rate": 5e-06, "logits/chosen": -2465454.285714286, "logits/rejected": -45524620.8, "logps/chosen": -361.009765625, "logps/rejected": -650.120263671875, "loss": 0.0248, "rewards/chosen": 6.015021732875279, "rewards/margins": 18.587505558558874, "rewards/rejected": -12.572483825683594, "step": 2334 }, { "epoch": 0.6399890365903796, "grad_norm": 6.09375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -30254729.14285714, "logits/rejected": -13082000.0, "logps/chosen": -356.91238839285717, "logps/rejected": -554.8513327205883, "loss": 0.0357, "rewards/chosen": 6.560591561453683, "rewards/margins": 17.476464471897156, "rewards/rejected": -10.915872910443474, "step": 2335 }, { "epoch": 0.6402631218308894, "grad_norm": 7.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -29771928.888888888, "logits/rejected": -21605442.133333333, "logps/chosen": -345.9914822048611, "logps/rejected": -557.30625, "loss": 0.0716, "rewards/chosen": 5.677116394042969, "rewards/margins": 15.64331512451172, "rewards/rejected": -9.96619873046875, "step": 2336 }, { "epoch": 0.6405372070713992, "grad_norm": 5.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -21565904.0, "logits/rejected": -19146709.333333332, "logps/chosen": -339.96202256944446, "logps/rejected": -467.99853515625, "loss": 0.0318, "rewards/chosen": 6.763929578993055, "rewards/margins": 15.667203097873264, "rewards/rejected": -8.903273518880209, "step": 2337 }, { "epoch": 0.640811292311909, "grad_norm": 12.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -45095341.71428572, "logits/rejected": -29859275.29411765, "logps/chosen": -327.95455496651783, "logps/rejected": -416.1090877757353, "loss": 0.0279, "rewards/chosen": 6.724583217075893, "rewards/margins": 17.959997609883796, "rewards/rejected": -11.235414392807904, "step": 2338 }, { "epoch": 0.6410853775524188, "grad_norm": 4.78125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -17092460.0, "logits/rejected": -24352448.0, "logps/chosen": -421.8753255208333, "logps/rejected": -449.3505045572917, "loss": 0.0287, "rewards/chosen": 7.7597707112630205, "rewards/margins": 17.45901934305827, "rewards/rejected": -9.699248631795248, "step": 2339 }, { "epoch": 0.6413594627929285, "grad_norm": 4.59375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -14446902.222222222, "logits/rejected": -49818965.333333336, "logps/chosen": -284.30419921875, "logps/rejected": -558.7053385416667, "loss": 0.0318, "rewards/chosen": 5.588070339626736, "rewards/margins": 17.380123562282986, "rewards/rejected": -11.79205322265625, "step": 2340 }, { "epoch": 0.6416335480334384, "grad_norm": 5.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -32604261.818181816, "logits/rejected": -7479740.307692308, "logps/chosen": -429.22811612215907, "logps/rejected": -346.5366962139423, "loss": 0.0224, "rewards/chosen": 6.569667469371449, "rewards/margins": 15.146788910552338, "rewards/rejected": -8.57712144118089, "step": 2341 }, { "epoch": 0.6419076332739482, "grad_norm": 2.28125, "kl": 3.3816030025482178, "learning_rate": 5e-06, "logits/chosen": -27410640.0, "logits/rejected": -13437534.666666666, "logps/chosen": -489.5791015625, "logps/rejected": -482.6736653645833, "loss": 0.0073, "rewards/chosen": 7.363979339599609, "rewards/margins": 18.330762227376304, "rewards/rejected": -10.966782887776693, "step": 2342 }, { "epoch": 0.642181718514458, "grad_norm": 13.875, "kl": 10.152392387390137, "learning_rate": 5e-06, "logits/chosen": -8672636.0, "logits/rejected": -37641458.666666664, "logps/chosen": -504.8428548177083, "logps/rejected": -487.8741048177083, "loss": 0.0532, "rewards/chosen": 7.474520365397136, "rewards/margins": 20.21362813313802, "rewards/rejected": -12.739107767740885, "step": 2343 }, { "epoch": 0.6424558037549678, "grad_norm": 5.40625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -40198976.0, "logits/rejected": -22923808.0, "logps/chosen": -414.36807528409093, "logps/rejected": -607.5328275240385, "loss": 0.0296, "rewards/chosen": 7.251310868696733, "rewards/margins": 21.752019815511638, "rewards/rejected": -14.500708946814903, "step": 2344 }, { "epoch": 0.6427298889954776, "grad_norm": 8.25, "kl": 0.8501071929931641, "learning_rate": 5e-06, "logits/chosen": -30434025.14285714, "logits/rejected": -33630716.8, "logps/chosen": -353.69492885044644, "logps/rejected": -477.63974609375, "loss": 0.1252, "rewards/chosen": 3.9225575583321706, "rewards/margins": 16.000522341047013, "rewards/rejected": -12.077964782714844, "step": 2345 }, { "epoch": 0.6430039742359874, "grad_norm": 5.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -44128211.692307696, "logits/rejected": -16780404.363636363, "logps/chosen": -479.3534405048077, "logps/rejected": -483.9073597301136, "loss": 0.0178, "rewards/chosen": 7.617608290452224, "rewards/margins": 17.081545529665647, "rewards/rejected": -9.463937239213424, "step": 2346 }, { "epoch": 0.6432780594764972, "grad_norm": 18.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -3874041.777777778, "logits/rejected": -30948821.333333332, "logps/chosen": -465.44048394097223, "logps/rejected": -476.5234049479167, "loss": 0.0682, "rewards/chosen": 4.74562750922309, "rewards/margins": 15.011470201280382, "rewards/rejected": -10.265842692057292, "step": 2347 }, { "epoch": 0.643552144717007, "grad_norm": 6.8125, "kl": 5.170435905456543, "learning_rate": 5e-06, "logits/chosen": -39074262.85714286, "logits/rejected": -9117756.0, "logps/chosen": -397.8086635044643, "logps/rejected": -531.473193359375, "loss": 0.021, "rewards/chosen": 6.424537658691406, "rewards/margins": 18.928053283691405, "rewards/rejected": -12.503515625, "step": 2348 }, { "epoch": 0.6438262299575168, "grad_norm": 7.0625, "kl": 4.23336935043335, "learning_rate": 5e-06, "logits/chosen": 7816816.0, "logits/rejected": -32774717.866666667, "logps/chosen": -443.05211046006946, "logps/rejected": -536.3595703125, "loss": 0.0873, "rewards/chosen": 5.78870349460178, "rewards/margins": 15.840245649549697, "rewards/rejected": -10.051542154947917, "step": 2349 }, { "epoch": 0.6441003151980266, "grad_norm": 5.03125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -21924380.8, "logits/rejected": -16180092.57142857, "logps/chosen": -488.2400390625, "logps/rejected": -641.7616489955357, "loss": 0.0156, "rewards/chosen": 7.394422912597657, "rewards/margins": 19.201798139299665, "rewards/rejected": -11.807375226702009, "step": 2350 }, { "epoch": 0.6443744004385363, "grad_norm": 13.4375, "kl": 4.134966850280762, "learning_rate": 5e-06, "logits/chosen": -13911009.6, "logits/rejected": -30102571.42857143, "logps/chosen": -463.824462890625, "logps/rejected": -531.302734375, "loss": 0.0456, "rewards/chosen": 7.647559356689453, "rewards/margins": 18.627834102085657, "rewards/rejected": -10.980274745396205, "step": 2351 }, { "epoch": 0.6446484856790462, "grad_norm": 9.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -9465977.333333334, "logits/rejected": -13664372.0, "logps/chosen": -492.649169921875, "logps/rejected": -701.13232421875, "loss": 0.025, "rewards/chosen": 6.622893651326497, "rewards/margins": 20.400360743204754, "rewards/rejected": -13.777467091878256, "step": 2352 }, { "epoch": 0.644922570919556, "grad_norm": 1.0546875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -25064622.222222224, "logits/rejected": -23515569.066666666, "logps/chosen": -460.23687065972223, "logps/rejected": -651.0613932291667, "loss": 0.0033, "rewards/chosen": 7.374298095703125, "rewards/margins": 22.99669392903646, "rewards/rejected": -15.622395833333334, "step": 2353 }, { "epoch": 0.6451966561600658, "grad_norm": 10.5, "kl": 2.155231475830078, "learning_rate": 5e-06, "logits/chosen": -13254733.714285715, "logits/rejected": -25953347.2, "logps/chosen": -387.84312220982144, "logps/rejected": -531.51884765625, "loss": 0.0367, "rewards/chosen": 6.082830156598773, "rewards/margins": 19.67585231236049, "rewards/rejected": -13.59302215576172, "step": 2354 }, { "epoch": 0.6454707414005756, "grad_norm": 2.421875, "kl": 5.181491851806641, "learning_rate": 5e-06, "logits/chosen": 105593708.3076923, "logits/rejected": -31465506.90909091, "logps/chosen": -528.9339693509615, "logps/rejected": -591.6859019886364, "loss": 0.0046, "rewards/chosen": 9.117188673753004, "rewards/margins": 21.01971414205911, "rewards/rejected": -11.902525468306107, "step": 2355 }, { "epoch": 0.6457448266410853, "grad_norm": 6.90625, "kl": 4.034279823303223, "learning_rate": 5e-06, "logits/chosen": -23985317.333333332, "logits/rejected": 26595333.333333332, "logps/chosen": -418.4737955729167, "logps/rejected": -409.1017252604167, "loss": 0.0199, "rewards/chosen": 6.720266342163086, "rewards/margins": 17.179765701293945, "rewards/rejected": -10.45949935913086, "step": 2356 }, { "epoch": 0.6460189118815952, "grad_norm": 14.0, "kl": 2.889268398284912, "learning_rate": 5e-06, "logits/chosen": -13208564.57142857, "logits/rejected": -21171721.6, "logps/chosen": -421.3032924107143, "logps/rejected": -573.799658203125, "loss": 0.0686, "rewards/chosen": 5.283702850341797, "rewards/margins": 14.694955444335937, "rewards/rejected": -9.41125259399414, "step": 2357 }, { "epoch": 0.646292997122105, "grad_norm": 3.953125, "kl": 8.157577514648438, "learning_rate": 5e-06, "logits/chosen": -23945902.769230768, "logits/rejected": 13293589.818181818, "logps/chosen": -450.8552809495192, "logps/rejected": -797.7583451704545, "loss": 0.0079, "rewards/chosen": 8.58715585561899, "rewards/margins": 23.505053860324246, "rewards/rejected": -14.917898004705256, "step": 2358 }, { "epoch": 0.6465670823626147, "grad_norm": 4.53125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -28768258.46153846, "logits/rejected": -41018801.45454545, "logps/chosen": -458.1608323317308, "logps/rejected": -779.330078125, "loss": 0.0087, "rewards/chosen": 6.51407975416917, "rewards/margins": 23.026547571995874, "rewards/rejected": -16.512467817826703, "step": 2359 }, { "epoch": 0.6468411676031246, "grad_norm": 8.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -29731560.0, "logits/rejected": -15944374.666666666, "logps/chosen": -396.8806966145833, "logps/rejected": -730.0804850260416, "loss": 0.0249, "rewards/chosen": 5.815690358479817, "rewards/margins": 21.746487935384113, "rewards/rejected": -15.930797576904297, "step": 2360 }, { "epoch": 0.6471152528436344, "grad_norm": 9.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -32752887.466666665, "logits/rejected": -15490561.777777778, "logps/chosen": -397.6343098958333, "logps/rejected": -488.8595920138889, "loss": 0.0195, "rewards/chosen": 6.761081949869792, "rewards/margins": 17.094765387641058, "rewards/rejected": -10.333683437771267, "step": 2361 }, { "epoch": 0.6473893380841441, "grad_norm": 5.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -39789122.666666664, "logits/rejected": -32285781.333333332, "logps/chosen": -553.6888834635416, "logps/rejected": -458.9791666666667, "loss": 0.014, "rewards/chosen": 7.516263961791992, "rewards/margins": 17.75932947794596, "rewards/rejected": -10.24306551615397, "step": 2362 }, { "epoch": 0.647663423324654, "grad_norm": 6.15625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -18326115.692307692, "logits/rejected": -30175403.636363637, "logps/chosen": -356.01724008413464, "logps/rejected": -479.7454723011364, "loss": 0.04, "rewards/chosen": 5.8964397723858175, "rewards/margins": 17.24023138726508, "rewards/rejected": -11.343791614879262, "step": 2363 }, { "epoch": 0.6479375085651637, "grad_norm": 0.72265625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -24024907.42857143, "logits/rejected": -15302847.05882353, "logps/chosen": -698.2177734375, "logps/rejected": -581.2410386029412, "loss": 0.0021, "rewards/chosen": 9.396852765764509, "rewards/margins": 22.64561500869879, "rewards/rejected": -13.248762242934284, "step": 2364 }, { "epoch": 0.6482115938056736, "grad_norm": 5.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -1761070.4615384615, "logits/rejected": -25238597.818181816, "logps/chosen": -409.23989633413464, "logps/rejected": -593.0818536931819, "loss": 0.0223, "rewards/chosen": 6.754403334397536, "rewards/margins": 18.655319694038873, "rewards/rejected": -11.900916359641336, "step": 2365 }, { "epoch": 0.6484856790461834, "grad_norm": 8.625, "kl": 5.327315330505371, "learning_rate": 5e-06, "logits/chosen": -20136120.0, "logits/rejected": -24128426.666666668, "logps/chosen": -402.1129964192708, "logps/rejected": -531.659912109375, "loss": 0.0299, "rewards/chosen": 5.934655507405599, "rewards/margins": 16.89653778076172, "rewards/rejected": -10.96188227335612, "step": 2366 }, { "epoch": 0.6487597642866931, "grad_norm": 3.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -17566180.8, "logits/rejected": -23193666.285714287, "logps/chosen": -429.942919921875, "logps/rejected": -498.27197265625, "loss": 0.0123, "rewards/chosen": 6.3926342010498045, "rewards/margins": 16.23071665082659, "rewards/rejected": -9.838082449776786, "step": 2367 }, { "epoch": 0.649033849527203, "grad_norm": 0.87109375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -27534781.333333332, "logits/rejected": -33993546.666666664, "logps/chosen": -432.480712890625, "logps/rejected": -390.7391764322917, "loss": 0.0243, "rewards/chosen": 7.254103342692058, "rewards/margins": 16.214122772216797, "rewards/rejected": -8.96001942952474, "step": 2368 }, { "epoch": 0.6493079347677128, "grad_norm": 6.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -30589177.6, "logits/rejected": -12942620.57142857, "logps/chosen": -341.2389404296875, "logps/rejected": -569.0523158482143, "loss": 0.0375, "rewards/chosen": 5.284690856933594, "rewards/margins": 16.175351824079243, "rewards/rejected": -10.890660967145648, "step": 2369 }, { "epoch": 0.6495820200082225, "grad_norm": 7.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -33577688.0, "logits/rejected": -39207992.0, "logps/chosen": -279.7102355957031, "logps/rejected": -620.41162109375, "loss": 0.0242, "rewards/chosen": 6.151074409484863, "rewards/margins": 16.85827350616455, "rewards/rejected": -10.707199096679688, "step": 2370 }, { "epoch": 0.6498561052487324, "grad_norm": 5.5, "kl": 5.667426109313965, "learning_rate": 5e-06, "logits/chosen": -29888366.769230768, "logits/rejected": -9928642.181818182, "logps/chosen": -438.35437950721155, "logps/rejected": -565.68408203125, "loss": 0.0178, "rewards/chosen": 7.300264211801382, "rewards/margins": 17.42004154445408, "rewards/rejected": -10.1197773326527, "step": 2371 }, { "epoch": 0.6501301904892421, "grad_norm": 8.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -4937357.818181818, "logits/rejected": 17582385.230769232, "logps/chosen": -454.974609375, "logps/rejected": -511.13882211538464, "loss": 0.0297, "rewards/chosen": 7.677060213955966, "rewards/margins": 17.98216087501366, "rewards/rejected": -10.305100661057692, "step": 2372 }, { "epoch": 0.6504042757297519, "grad_norm": 10.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -28550678.153846152, "logits/rejected": -7688848.0, "logps/chosen": -529.1025390625, "logps/rejected": -731.4957386363636, "loss": 0.0245, "rewards/chosen": 6.783702556903545, "rewards/margins": 18.665741020149284, "rewards/rejected": -11.882038463245738, "step": 2373 }, { "epoch": 0.6506783609702618, "grad_norm": 5.84375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -12875878.666666666, "logits/rejected": -14778764.0, "logps/chosen": -540.1568603515625, "logps/rejected": -422.6492513020833, "loss": 0.0131, "rewards/chosen": 7.062317530314128, "rewards/margins": 17.698633193969727, "rewards/rejected": -10.6363156636556, "step": 2374 }, { "epoch": 0.6509524462107715, "grad_norm": 2.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -31383603.2, "logits/rejected": 9424926.285714285, "logps/chosen": -388.638525390625, "logps/rejected": -504.67640904017856, "loss": 0.0114, "rewards/chosen": 7.081339263916016, "rewards/margins": 18.945257023402625, "rewards/rejected": -11.863917759486608, "step": 2375 }, { "epoch": 0.6512265314512814, "grad_norm": 5.3125, "kl": 1.1310060024261475, "learning_rate": 5e-06, "logits/chosen": -20371012.0, "logits/rejected": -30331644.0, "logps/chosen": -417.0988464355469, "logps/rejected": -439.4571838378906, "loss": 0.0157, "rewards/chosen": 7.391225814819336, "rewards/margins": 16.17526149749756, "rewards/rejected": -8.784035682678223, "step": 2376 }, { "epoch": 0.6515006166917912, "grad_norm": 6.65625, "kl": 4.513184547424316, "learning_rate": 5e-06, "logits/chosen": -15327075.2, "logits/rejected": -15445168.0, "logps/chosen": -356.13154296875, "logps/rejected": -476.2216099330357, "loss": 0.0294, "rewards/chosen": 6.579014587402344, "rewards/margins": 16.143630109514508, "rewards/rejected": -9.564615522112165, "step": 2377 }, { "epoch": 0.6517747019323009, "grad_norm": 5.875, "kl": 8.134163856506348, "learning_rate": 5e-06, "logits/chosen": -33753467.428571425, "logits/rejected": -41146387.2, "logps/chosen": -438.63462611607144, "logps/rejected": -379.715478515625, "loss": 0.0179, "rewards/chosen": 7.405079432896206, "rewards/margins": 14.913815089634486, "rewards/rejected": -7.508735656738281, "step": 2378 }, { "epoch": 0.6520487871728108, "grad_norm": 1.390625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -13849171.692307692, "logits/rejected": 2136717.8181818184, "logps/chosen": -375.2252854567308, "logps/rejected": -592.2590997869319, "loss": 0.0034, "rewards/chosen": 7.080760075495793, "rewards/margins": 19.69147646177065, "rewards/rejected": -12.610716386274857, "step": 2379 }, { "epoch": 0.6523228724133205, "grad_norm": 6.59375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -42510741.333333336, "logits/rejected": 18425.777777777777, "logps/chosen": -439.92138671875, "logps/rejected": -381.84019639756946, "loss": 0.0485, "rewards/chosen": 7.237536112467448, "rewards/margins": 15.51375240749783, "rewards/rejected": -8.276216295030382, "step": 2380 }, { "epoch": 0.6525969576538303, "grad_norm": 9.3125, "kl": 5.333033561706543, "learning_rate": 5e-06, "logits/chosen": -33383278.545454547, "logits/rejected": -29971401.846153848, "logps/chosen": -529.0334250710227, "logps/rejected": -475.2678786057692, "loss": 0.0136, "rewards/chosen": 8.305741743607955, "rewards/margins": 19.79770857804305, "rewards/rejected": -11.491966834435097, "step": 2381 }, { "epoch": 0.6528710428943402, "grad_norm": 9.6875, "kl": 3.6118011474609375, "learning_rate": 5e-06, "logits/chosen": -36270010.666666664, "logits/rejected": -22816098.666666668, "logps/chosen": -447.77490234375, "logps/rejected": -484.1936442057292, "loss": 0.029, "rewards/chosen": 6.04641850789388, "rewards/margins": 16.085221608479817, "rewards/rejected": -10.038803100585938, "step": 2382 }, { "epoch": 0.6531451281348499, "grad_norm": 17.375, "kl": 7.914766311645508, "learning_rate": 5e-06, "logits/chosen": -22342656.0, "logits/rejected": -34157972.0, "logps/chosen": -386.4498291015625, "logps/rejected": -456.197265625, "loss": 0.0636, "rewards/chosen": 5.904258728027344, "rewards/margins": 15.940570831298828, "rewards/rejected": -10.036312103271484, "step": 2383 }, { "epoch": 0.6534192133753597, "grad_norm": 4.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -27401958.4, "logits/rejected": -51448872.421052635, "logps/chosen": -400.2741943359375, "logps/rejected": -496.0094058388158, "loss": 0.0155, "rewards/chosen": 6.2424766540527346, "rewards/margins": 16.13041289480109, "rewards/rejected": -9.887936240748354, "step": 2384 }, { "epoch": 0.6536932986158696, "grad_norm": 10.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -36740945.777777776, "logits/rejected": -14035246.933333334, "logps/chosen": -362.35907660590277, "logps/rejected": -387.929296875, "loss": 0.0385, "rewards/chosen": 7.046055263943142, "rewards/margins": 15.856329515245225, "rewards/rejected": -8.810274251302083, "step": 2385 }, { "epoch": 0.6539673838563793, "grad_norm": 2.203125, "kl": 5.006761074066162, "learning_rate": 5e-06, "logits/chosen": -25786560.0, "logits/rejected": -33272020.363636363, "logps/chosen": -543.6484375, "logps/rejected": -524.2081409801136, "loss": 0.0064, "rewards/chosen": 9.660604623647837, "rewards/margins": 21.894971300671983, "rewards/rejected": -12.234366677024148, "step": 2386 }, { "epoch": 0.6542414690968892, "grad_norm": 11.6875, "kl": 7.444933891296387, "learning_rate": 5e-06, "logits/chosen": -57329527.46666667, "logits/rejected": -23499516.444444444, "logps/chosen": -412.46897786458334, "logps/rejected": -672.9491102430555, "loss": 0.0736, "rewards/chosen": 6.31374766031901, "rewards/margins": 17.425733608669706, "rewards/rejected": -11.111985948350695, "step": 2387 }, { "epoch": 0.6545155543373989, "grad_norm": 10.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -35475936.0, "logits/rejected": -29407063.466666665, "logps/chosen": -442.09711371527777, "logps/rejected": -630.287109375, "loss": 0.0301, "rewards/chosen": 4.94252183702257, "rewards/margins": 18.126990424262154, "rewards/rejected": -13.184468587239584, "step": 2388 }, { "epoch": 0.6547896395779087, "grad_norm": 14.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -7205937.0, "logits/rejected": -22157312.0, "logps/chosen": -449.02728271484375, "logps/rejected": -451.1173095703125, "loss": 0.0315, "rewards/chosen": 7.400700569152832, "rewards/margins": 17.390746116638184, "rewards/rejected": -9.990045547485352, "step": 2389 }, { "epoch": 0.6550637248184186, "grad_norm": 3.59375, "kl": 8.980210304260254, "learning_rate": 5e-06, "logits/chosen": -23252118.85714286, "logits/rejected": 12459897.6, "logps/chosen": -494.0917271205357, "logps/rejected": -490.17294921875, "loss": 0.0097, "rewards/chosen": 6.742616925920759, "rewards/margins": 15.973977552141461, "rewards/rejected": -9.231360626220702, "step": 2390 }, { "epoch": 0.6553378100589283, "grad_norm": 2.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -11900219.076923076, "logits/rejected": -60312855.27272727, "logps/chosen": -448.23929537259613, "logps/rejected": -532.3605291193181, "loss": 0.0292, "rewards/chosen": 6.800325833834135, "rewards/margins": 19.100111101057145, "rewards/rejected": -12.299785267223012, "step": 2391 }, { "epoch": 0.6556118952994381, "grad_norm": 4.25, "kl": 6.45037841796875, "learning_rate": 5e-06, "logits/chosen": -14901505.142857144, "logits/rejected": -24192339.2, "logps/chosen": -511.79788643973217, "logps/rejected": -525.985205078125, "loss": 0.0095, "rewards/chosen": 6.607219151088169, "rewards/margins": 18.422755650111608, "rewards/rejected": -11.815536499023438, "step": 2392 }, { "epoch": 0.655885980539948, "grad_norm": 6.40625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -17877380.923076924, "logits/rejected": -34483726.54545455, "logps/chosen": -484.39261568509613, "logps/rejected": -502.771484375, "loss": 0.0287, "rewards/chosen": 7.803254934457632, "rewards/margins": 16.704276718459763, "rewards/rejected": -8.90102178400213, "step": 2393 }, { "epoch": 0.6561600657804577, "grad_norm": 8.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -10191058.909090908, "logits/rejected": -15609942.153846154, "logps/chosen": -420.00830078125, "logps/rejected": -484.2668269230769, "loss": 0.071, "rewards/chosen": 6.601380781693892, "rewards/margins": 18.397549182384996, "rewards/rejected": -11.796168400691105, "step": 2394 }, { "epoch": 0.6564341510209675, "grad_norm": 2.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -27188806.4, "logits/rejected": -22595318.85714286, "logps/chosen": -444.5001953125, "logps/rejected": -581.2555454799107, "loss": 0.0073, "rewards/chosen": 6.686713409423828, "rewards/margins": 21.449082837785994, "rewards/rejected": -14.762369428362165, "step": 2395 }, { "epoch": 0.6567082362614773, "grad_norm": 48.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -32880802.666666668, "logits/rejected": -8163137.333333333, "logps/chosen": -463.6217854817708, "logps/rejected": -580.4785970052084, "loss": 0.0303, "rewards/chosen": 8.827757517496744, "rewards/margins": 20.060492197672524, "rewards/rejected": -11.232734680175781, "step": 2396 }, { "epoch": 0.6569823215019871, "grad_norm": 4.15625, "kl": 2.8998496532440186, "learning_rate": 5e-06, "logits/chosen": -16017709.333333334, "logits/rejected": -29467674.666666668, "logps/chosen": -475.038818359375, "logps/rejected": -513.74609375, "loss": 0.0341, "rewards/chosen": 6.62507438659668, "rewards/margins": 20.606045405069985, "rewards/rejected": -13.980971018473307, "step": 2397 }, { "epoch": 0.657256406742497, "grad_norm": 4.4375, "kl": 0.6854079961776733, "learning_rate": 5e-06, "logits/chosen": -11332352.0, "logits/rejected": -31084032.0, "logps/chosen": -375.257568359375, "logps/rejected": -445.96181640625, "loss": 0.0193, "rewards/chosen": 7.713701520647321, "rewards/margins": 18.086128888811384, "rewards/rejected": -10.372427368164063, "step": 2398 }, { "epoch": 0.6575304919830067, "grad_norm": 3.6875, "kl": 15.477231979370117, "learning_rate": 5e-06, "logits/chosen": -15458796.307692308, "logits/rejected": -40096866.90909091, "logps/chosen": -432.18201622596155, "logps/rejected": -635.4456676136364, "loss": 0.0478, "rewards/chosen": 6.991032527043269, "rewards/margins": 19.803033148492133, "rewards/rejected": -12.812000621448863, "step": 2399 }, { "epoch": 0.6578045772235165, "grad_norm": 1.0625, "kl": 1.1016604900360107, "learning_rate": 5e-06, "logits/chosen": -30279901.333333332, "logits/rejected": -26960880.0, "logps/chosen": -498.2027180989583, "logps/rejected": -525.9344075520834, "loss": 0.0027, "rewards/chosen": 8.305161158243815, "rewards/margins": 18.97705014546712, "rewards/rejected": -10.671888987223307, "step": 2400 }, { "epoch": 0.6580786624640264, "grad_norm": 7.59375, "kl": 0.7276992797851562, "learning_rate": 5e-06, "logits/chosen": -12555019.076923076, "logits/rejected": -45627531.63636363, "logps/chosen": -389.86429537259613, "logps/rejected": -661.2076083096591, "loss": 0.0546, "rewards/chosen": 6.453988295335036, "rewards/margins": 20.616835640860604, "rewards/rejected": -14.162847345525568, "step": 2401 }, { "epoch": 0.6583527477045361, "grad_norm": 2.96875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -33483202.46153846, "logits/rejected": -27063924.363636363, "logps/chosen": -372.24673227163464, "logps/rejected": -778.8721590909091, "loss": 0.0097, "rewards/chosen": 5.363064105694111, "rewards/margins": 17.68224190665292, "rewards/rejected": -12.319177800958807, "step": 2402 }, { "epoch": 0.6586268329450459, "grad_norm": 5.03125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -24763034.666666668, "logits/rejected": -4209767.333333333, "logps/chosen": -484.9635823567708, "logps/rejected": -460.5018717447917, "loss": 0.0174, "rewards/chosen": 6.251057942708333, "rewards/margins": 16.118555704752605, "rewards/rejected": -9.867497762044271, "step": 2403 }, { "epoch": 0.6589009181855557, "grad_norm": 4.78125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -36848508.8, "logits/rejected": -41782665.14285714, "logps/chosen": -351.828564453125, "logps/rejected": -503.7963169642857, "loss": 0.017, "rewards/chosen": 6.394371414184571, "rewards/margins": 18.9405216217041, "rewards/rejected": -12.546150207519531, "step": 2404 }, { "epoch": 0.6591750034260655, "grad_norm": 3.203125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -53193314.90909091, "logits/rejected": -37988913.23076923, "logps/chosen": -412.06196732954544, "logps/rejected": -457.06971153846155, "loss": 0.0082, "rewards/chosen": 5.686680880459872, "rewards/margins": 15.292544011469488, "rewards/rejected": -9.605863131009615, "step": 2405 }, { "epoch": 0.6594490886665753, "grad_norm": 2.09375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -4052252.6666666665, "logits/rejected": -26612074.666666668, "logps/chosen": -491.6318766276042, "logps/rejected": -723.4607747395834, "loss": 0.0051, "rewards/chosen": 6.503957748413086, "rewards/margins": 22.661558151245117, "rewards/rejected": -16.15760040283203, "step": 2406 }, { "epoch": 0.6597231739070851, "grad_norm": 8.4375, "kl": 9.3551025390625, "learning_rate": 5e-06, "logits/chosen": -43835799.27272727, "logits/rejected": -27036580.923076924, "logps/chosen": -414.65749289772725, "logps/rejected": -509.48670372596155, "loss": 0.0906, "rewards/chosen": 6.626642400568182, "rewards/margins": 17.130858521361453, "rewards/rejected": -10.50421612079327, "step": 2407 }, { "epoch": 0.6599972591475949, "grad_norm": 5.40625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -30879546.666666668, "logits/rejected": -54589546.666666664, "logps/chosen": -317.24855550130206, "logps/rejected": -634.1912841796875, "loss": 0.0187, "rewards/chosen": 4.749409993489583, "rewards/margins": 21.851236979166664, "rewards/rejected": -17.101826985677082, "step": 2408 }, { "epoch": 0.6602713443881048, "grad_norm": 8.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -21216740.266666666, "logits/rejected": -29947260.444444444, "logps/chosen": -439.45625, "logps/rejected": -511.1832682291667, "loss": 0.0646, "rewards/chosen": 7.129347229003907, "rewards/margins": 21.19706743028429, "rewards/rejected": -14.067720201280382, "step": 2409 }, { "epoch": 0.6605454296286145, "grad_norm": 5.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -41000064.0, "logits/rejected": -46000439.46666667, "logps/chosen": -433.16924370659723, "logps/rejected": -567.2109375, "loss": 0.0131, "rewards/chosen": 7.1161693996853295, "rewards/margins": 20.550688849555122, "rewards/rejected": -13.434519449869791, "step": 2410 }, { "epoch": 0.6608195148691243, "grad_norm": 9.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -40523477.333333336, "logits/rejected": -25257578.666666668, "logps/chosen": -405.4505859375, "logps/rejected": -373.28754340277777, "loss": 0.0466, "rewards/chosen": 6.730853271484375, "rewards/margins": 14.458789910210504, "rewards/rejected": -7.727936638726129, "step": 2411 }, { "epoch": 0.6610936001096341, "grad_norm": 4.34375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -31989713.454545453, "logits/rejected": -13843849.846153846, "logps/chosen": -465.60986328125, "logps/rejected": -536.1025390625, "loss": 0.0115, "rewards/chosen": 7.989095514470881, "rewards/margins": 19.767149385038792, "rewards/rejected": -11.77805387056791, "step": 2412 }, { "epoch": 0.6613676853501439, "grad_norm": 5.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -35921400.0, "logits/rejected": -23393309.333333332, "logps/chosen": -504.9838053385417, "logps/rejected": -468.97607421875, "loss": 0.011, "rewards/chosen": 6.9152374267578125, "rewards/margins": 17.002888361612953, "rewards/rejected": -10.087650934855143, "step": 2413 }, { "epoch": 0.6616417705906537, "grad_norm": 9.5625, "kl": 1.3121846914291382, "learning_rate": 5e-06, "logits/chosen": -28607941.333333332, "logits/rejected": -22750205.333333332, "logps/chosen": -453.4391276041667, "logps/rejected": -512.5880533854166, "loss": 0.0298, "rewards/chosen": 7.134044011433919, "rewards/margins": 18.698517481486004, "rewards/rejected": -11.564473470052084, "step": 2414 }, { "epoch": 0.6619158558311635, "grad_norm": 3.421875, "kl": 2.3284378051757812, "learning_rate": 5e-06, "logits/chosen": -28608093.333333332, "logits/rejected": -27065194.666666668, "logps/chosen": -475.6542561848958, "logps/rejected": -525.0315755208334, "loss": 0.0124, "rewards/chosen": 8.211662928263346, "rewards/margins": 18.007545471191406, "rewards/rejected": -9.79588254292806, "step": 2415 }, { "epoch": 0.6621899410716733, "grad_norm": 2.46875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -24480729.6, "logits/rejected": -6326390.285714285, "logps/chosen": -441.609912109375, "logps/rejected": -530.7398856026786, "loss": 0.0091, "rewards/chosen": 5.628167724609375, "rewards/margins": 18.07066933768136, "rewards/rejected": -12.442501613071986, "step": 2416 }, { "epoch": 0.662464026312183, "grad_norm": 9.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -27671616.0, "logits/rejected": -30386965.333333332, "logps/chosen": -366.8681640625, "logps/rejected": -617.51953125, "loss": 0.0509, "rewards/chosen": 5.819865926106771, "rewards/margins": 16.955013020833334, "rewards/rejected": -11.135147094726562, "step": 2417 }, { "epoch": 0.6627381115526929, "grad_norm": 4.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -38743923.2, "logits/rejected": -28263888.0, "logps/chosen": -526.79248046875, "logps/rejected": -470.38364955357144, "loss": 0.039, "rewards/chosen": 8.05074462890625, "rewards/margins": 19.817538016183036, "rewards/rejected": -11.766793387276786, "step": 2418 }, { "epoch": 0.6630121967932027, "grad_norm": 0.60546875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -25941980.444444444, "logits/rejected": 34755541.333333336, "logps/chosen": -557.8088650173611, "logps/rejected": -457.39716796875, "loss": 0.0016, "rewards/chosen": 9.447589450412327, "rewards/margins": 24.955736626519098, "rewards/rejected": -15.508147176106771, "step": 2419 }, { "epoch": 0.6632862820337125, "grad_norm": 6.3125, "kl": 12.99555492401123, "learning_rate": 5e-06, "logits/chosen": -14303483.076923076, "logits/rejected": 66952768.0, "logps/chosen": -529.2439903846154, "logps/rejected": -575.1284623579545, "loss": 0.0215, "rewards/chosen": 7.952676626352163, "rewards/margins": 19.38336565777972, "rewards/rejected": -11.430689031427557, "step": 2420 }, { "epoch": 0.6635603672742223, "grad_norm": 11.125, "kl": 0.5585874319076538, "learning_rate": 5e-06, "logits/chosen": -10150430.933333334, "logits/rejected": -27074192.0, "logps/chosen": -455.14537760416664, "logps/rejected": -623.2037217881945, "loss": 0.1063, "rewards/chosen": 6.732235717773437, "rewards/margins": 15.764168124728734, "rewards/rejected": -9.031932406955296, "step": 2421 }, { "epoch": 0.663834452514732, "grad_norm": 6.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -34213769.14285714, "logits/rejected": -37607158.4, "logps/chosen": -418.547607421875, "logps/rejected": -525.48095703125, "loss": 0.0185, "rewards/chosen": 6.464845929827009, "rewards/margins": 18.168671308244978, "rewards/rejected": -11.703825378417969, "step": 2422 }, { "epoch": 0.6641085377552419, "grad_norm": 7.78125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -36531002.18181818, "logits/rejected": -27939062.153846152, "logps/chosen": -345.28664328835225, "logps/rejected": -512.1439302884615, "loss": 0.0282, "rewards/chosen": 5.8157525496049365, "rewards/margins": 16.88191583273294, "rewards/rejected": -11.066163283128004, "step": 2423 }, { "epoch": 0.6643826229957517, "grad_norm": 3.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -6713208.0, "logits/rejected": -27485705.14285714, "logps/chosen": -402.9030029296875, "logps/rejected": -506.90980747767856, "loss": 0.0514, "rewards/chosen": 6.399425506591797, "rewards/margins": 19.274695805140905, "rewards/rejected": -12.875270298549108, "step": 2424 }, { "epoch": 0.6646567082362614, "grad_norm": 2.484375, "kl": 9.67886734008789, "learning_rate": 5e-06, "logits/chosen": -24139744.0, "logits/rejected": -36974150.4, "logps/chosen": -396.26639229910717, "logps/rejected": -553.278466796875, "loss": 0.0116, "rewards/chosen": 7.726963588169643, "rewards/margins": 16.75570504324777, "rewards/rejected": -9.028741455078125, "step": 2425 }, { "epoch": 0.6649307934767713, "grad_norm": 9.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -39336352.0, "logits/rejected": -23593234.0, "logps/chosen": -320.43487548828125, "logps/rejected": -461.9276123046875, "loss": 0.0701, "rewards/chosen": 5.0041351318359375, "rewards/margins": 12.810271263122559, "rewards/rejected": -7.806136131286621, "step": 2426 }, { "epoch": 0.6652048787172811, "grad_norm": 8.125, "kl": 3.3424792289733887, "learning_rate": 5e-06, "logits/chosen": -28365740.0, "logits/rejected": -8470219.0, "logps/chosen": -307.9923095703125, "logps/rejected": -649.0552978515625, "loss": 0.0763, "rewards/chosen": 5.968907833099365, "rewards/margins": 18.238824367523193, "rewards/rejected": -12.269916534423828, "step": 2427 }, { "epoch": 0.6654789639577908, "grad_norm": 5.15625, "kl": 6.906120300292969, "learning_rate": 5e-06, "logits/chosen": -20216939.2, "logits/rejected": -7835445.714285715, "logps/chosen": -446.1888671875, "logps/rejected": -691.05126953125, "loss": 0.0222, "rewards/chosen": 8.150788116455079, "rewards/margins": 20.483016640799384, "rewards/rejected": -12.332228524344307, "step": 2428 }, { "epoch": 0.6657530491983007, "grad_norm": 7.96875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -10966867.555555556, "logits/rejected": -2719804.2666666666, "logps/chosen": -353.6735568576389, "logps/rejected": -548.8755859375, "loss": 0.024, "rewards/chosen": 6.201808929443359, "rewards/margins": 17.404427337646485, "rewards/rejected": -11.202618408203126, "step": 2429 }, { "epoch": 0.6660271344388105, "grad_norm": 1.421875, "kl": 5.284012317657471, "learning_rate": 5e-06, "logits/chosen": -32661897.846153848, "logits/rejected": -28934941.09090909, "logps/chosen": -527.1168870192307, "logps/rejected": -519.2955433238636, "loss": 0.0034, "rewards/chosen": 9.362674419696514, "rewards/margins": 19.103663411173788, "rewards/rejected": -9.740988991477273, "step": 2430 }, { "epoch": 0.6663012196793203, "grad_norm": 1.3984375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -13292042.0, "logits/rejected": -27315458.0, "logps/chosen": -498.2758483886719, "logps/rejected": -437.8033142089844, "loss": 0.0036, "rewards/chosen": 8.763221740722656, "rewards/margins": 18.745359420776367, "rewards/rejected": -9.982137680053711, "step": 2431 }, { "epoch": 0.6665753049198301, "grad_norm": 4.65625, "kl": 2.552082061767578, "learning_rate": 5e-06, "logits/chosen": -29711896.888888888, "logits/rejected": -38811264.0, "logps/chosen": -374.81035698784723, "logps/rejected": -548.357421875, "loss": 0.0141, "rewards/chosen": 6.820089128282335, "rewards/margins": 17.19281455145942, "rewards/rejected": -10.372725423177084, "step": 2432 }, { "epoch": 0.6668493901603398, "grad_norm": 3.671875, "kl": 5.1908674240112305, "learning_rate": 5e-06, "logits/chosen": -45172277.333333336, "logits/rejected": -39003944.0, "logps/chosen": -541.5013020833334, "logps/rejected": -427.4015299479167, "loss": 0.0132, "rewards/chosen": 8.41897964477539, "rewards/margins": 16.02618408203125, "rewards/rejected": -7.607204437255859, "step": 2433 }, { "epoch": 0.6671234754008497, "grad_norm": 8.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -10349610.0, "logits/rejected": -32807157.333333332, "logps/chosen": -455.23193359375, "logps/rejected": -619.2906494140625, "loss": 0.0169, "rewards/chosen": 7.6279551188151045, "rewards/margins": 17.516155242919922, "rewards/rejected": -9.888200124104818, "step": 2434 }, { "epoch": 0.6673975606413595, "grad_norm": 1.828125, "kl": 2.373732328414917, "learning_rate": 5e-06, "logits/chosen": -18888888.0, "logits/rejected": -13402053.333333334, "logps/chosen": -451.1107584635417, "logps/rejected": -445.8369140625, "loss": 0.0056, "rewards/chosen": 8.347919464111328, "rewards/margins": 17.562217712402344, "rewards/rejected": -9.214298248291016, "step": 2435 }, { "epoch": 0.6676716458818692, "grad_norm": 3.5, "kl": 11.64498519897461, "learning_rate": 5e-06, "logits/chosen": -26763374.933333334, "logits/rejected": -19788156.444444444, "logps/chosen": -422.1841796875, "logps/rejected": -553.8721788194445, "loss": 0.012, "rewards/chosen": 7.300200398763021, "rewards/margins": 19.150803290473092, "rewards/rejected": -11.85060289171007, "step": 2436 }, { "epoch": 0.6679457311223791, "grad_norm": 2.8125, "kl": 4.028592586517334, "learning_rate": 5e-06, "logits/chosen": -30991374.222222224, "logits/rejected": -27913273.6, "logps/chosen": -450.3276638454861, "logps/rejected": -568.1482421875, "loss": 0.0084, "rewards/chosen": 7.003792656792535, "rewards/margins": 18.624678887261286, "rewards/rejected": -11.62088623046875, "step": 2437 }, { "epoch": 0.6682198163628889, "grad_norm": 6.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -25030472.0, "logits/rejected": -7144409.333333333, "logps/chosen": -391.3216959635417, "logps/rejected": -581.8849690755209, "loss": 0.0261, "rewards/chosen": 6.685868581136067, "rewards/margins": 19.627613067626953, "rewards/rejected": -12.941744486490885, "step": 2438 }, { "epoch": 0.6684939016033986, "grad_norm": 2.53125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -30735264.0, "logits/rejected": -35872182.85714286, "logps/chosen": -395.3958984375, "logps/rejected": -537.9934430803571, "loss": 0.0085, "rewards/chosen": 6.887783050537109, "rewards/margins": 18.066051483154297, "rewards/rejected": -11.178268432617188, "step": 2439 }, { "epoch": 0.6687679868439085, "grad_norm": 11.4375, "kl": 0.3797881007194519, "learning_rate": 5e-06, "logits/chosen": -42553677.71428572, "logits/rejected": -24851838.4, "logps/chosen": -417.0825892857143, "logps/rejected": -562.3650390625, "loss": 0.0405, "rewards/chosen": 6.313555036272321, "rewards/margins": 18.391664777483257, "rewards/rejected": -12.078109741210938, "step": 2440 }, { "epoch": 0.6690420720844182, "grad_norm": 5.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -16453192.888888888, "logits/rejected": -10203362.133333333, "logps/chosen": -389.6321614583333, "logps/rejected": -394.8339518229167, "loss": 0.0157, "rewards/chosen": 8.127564324273003, "rewards/margins": 16.736971876356336, "rewards/rejected": -8.609407552083333, "step": 2441 }, { "epoch": 0.6693161573249281, "grad_norm": 15.125, "kl": 6.482230186462402, "learning_rate": 5e-06, "logits/chosen": -29383805.53846154, "logits/rejected": -14894237.090909092, "logps/chosen": -360.9056865985577, "logps/rejected": -534.3692294034091, "loss": 0.1041, "rewards/chosen": 5.593744718111479, "rewards/margins": 16.33415387560438, "rewards/rejected": -10.740409157492898, "step": 2442 }, { "epoch": 0.6695902425654379, "grad_norm": 14.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -18012421.333333332, "logits/rejected": -23367109.333333332, "logps/chosen": -442.97705078125, "logps/rejected": -516.8039143880209, "loss": 0.0455, "rewards/chosen": 5.911112467447917, "rewards/margins": 16.10337766011556, "rewards/rejected": -10.192265192667643, "step": 2443 }, { "epoch": 0.6698643278059476, "grad_norm": 12.875, "kl": 9.08342456817627, "learning_rate": 5e-06, "logits/chosen": -19802940.23529412, "logits/rejected": -30278546.285714287, "logps/chosen": -488.04176240808823, "logps/rejected": -896.6991489955357, "loss": 0.08, "rewards/chosen": 7.1038822847254135, "rewards/margins": 26.30641655160599, "rewards/rejected": -19.20253426688058, "step": 2444 }, { "epoch": 0.6701384130464575, "grad_norm": 9.0, "kl": 9.147181510925293, "learning_rate": 5e-06, "logits/chosen": -27962536.533333335, "logits/rejected": -22520519.111111112, "logps/chosen": -370.36858723958335, "logps/rejected": -500.2429470486111, "loss": 0.0903, "rewards/chosen": 6.742433675130209, "rewards/margins": 17.562698703342015, "rewards/rejected": -10.820265028211805, "step": 2445 }, { "epoch": 0.6704124982869673, "grad_norm": 7.1875, "kl": 3.324281692504883, "learning_rate": 5e-06, "logits/chosen": -20208519.384615384, "logits/rejected": -26620640.0, "logps/chosen": -390.6031024639423, "logps/rejected": -503.6510564630682, "loss": 0.0336, "rewards/chosen": 5.936251126802885, "rewards/margins": 16.499714511257785, "rewards/rejected": -10.5634633844549, "step": 2446 }, { "epoch": 0.670686583527477, "grad_norm": 6.40625, "kl": 7.816205024719238, "learning_rate": 5e-06, "logits/chosen": -11129080.615384616, "logits/rejected": -16647149.090909092, "logps/chosen": -328.71176382211536, "logps/rejected": -634.1387606534091, "loss": 0.0176, "rewards/chosen": 6.650658827561599, "rewards/margins": 19.331700198300236, "rewards/rejected": -12.681041370738637, "step": 2447 }, { "epoch": 0.6709606687679869, "grad_norm": 6.875, "kl": 5.784738063812256, "learning_rate": 5e-06, "logits/chosen": -19481364.0, "logits/rejected": -50560277.333333336, "logps/chosen": -474.8658040364583, "logps/rejected": -679.3194173177084, "loss": 0.0277, "rewards/chosen": 7.642716725667317, "rewards/margins": 20.029149373372395, "rewards/rejected": -12.386432647705078, "step": 2448 }, { "epoch": 0.6712347540084966, "grad_norm": 8.4375, "kl": 2.1080451011657715, "learning_rate": 5e-06, "logits/chosen": -34310058.666666664, "logits/rejected": -28920833.777777776, "logps/chosen": -437.22063802083335, "logps/rejected": -406.52978515625, "loss": 0.0281, "rewards/chosen": 6.4876246134440105, "rewards/margins": 16.332397800021702, "rewards/rejected": -9.84477318657769, "step": 2449 }, { "epoch": 0.6715088392490064, "grad_norm": 3.078125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -38942919.11111111, "logits/rejected": -20484309.333333332, "logps/chosen": -452.68638780381946, "logps/rejected": -620.1588541666666, "loss": 0.0087, "rewards/chosen": 5.733968946668837, "rewards/margins": 18.068001132541234, "rewards/rejected": -12.334032185872395, "step": 2450 }, { "epoch": 0.6717829244895163, "grad_norm": 2.59375, "kl": 5.378354549407959, "learning_rate": 5e-06, "logits/chosen": -17871867.42857143, "logits/rejected": -29336947.2, "logps/chosen": -373.27755301339283, "logps/rejected": -496.44013671875, "loss": 0.0475, "rewards/chosen": 7.431656973702567, "rewards/margins": 17.649576895577567, "rewards/rejected": -10.217919921875, "step": 2451 }, { "epoch": 0.672057009730026, "grad_norm": 7.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -36734914.90909091, "logits/rejected": -26823751.384615384, "logps/chosen": -398.36110617897725, "logps/rejected": -697.8608774038462, "loss": 0.028, "rewards/chosen": 5.32981352372603, "rewards/margins": 21.401919518317378, "rewards/rejected": -16.072105994591347, "step": 2452 }, { "epoch": 0.6723310949705359, "grad_norm": 13.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -17210340.363636363, "logits/rejected": -21789302.153846152, "logps/chosen": -294.05934836647725, "logps/rejected": -607.7851186899038, "loss": 0.0714, "rewards/chosen": 5.056086453524503, "rewards/margins": 15.535690147559961, "rewards/rejected": -10.479603694035458, "step": 2453 }, { "epoch": 0.6726051802110457, "grad_norm": 7.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -39410083.55555555, "logits/rejected": -22392507.733333334, "logps/chosen": -402.25792100694446, "logps/rejected": -493.43382161458334, "loss": 0.0229, "rewards/chosen": 5.670057084825304, "rewards/margins": 16.716011471218533, "rewards/rejected": -11.045954386393229, "step": 2454 }, { "epoch": 0.6728792654515554, "grad_norm": 4.65625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -27483476.363636363, "logits/rejected": -7371003.076923077, "logps/chosen": -384.60280539772725, "logps/rejected": -514.1988055889423, "loss": 0.022, "rewards/chosen": 6.814059170809659, "rewards/margins": 19.16994342937336, "rewards/rejected": -12.355884258563702, "step": 2455 }, { "epoch": 0.6731533506920653, "grad_norm": 5.8125, "kl": 7.227742671966553, "learning_rate": 5e-06, "logits/chosen": -24956541.53846154, "logits/rejected": -28446568.727272727, "logps/chosen": -518.7442908653846, "logps/rejected": -442.21071555397725, "loss": 0.0252, "rewards/chosen": 8.881983243502104, "rewards/margins": 16.987677407431434, "rewards/rejected": -8.105694163929332, "step": 2456 }, { "epoch": 0.673427435932575, "grad_norm": 4.34375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 10165420.0, "logits/rejected": -18593204.0, "logps/chosen": -456.1442565917969, "logps/rejected": -549.6728515625, "loss": 0.0076, "rewards/chosen": 8.052903175354004, "rewards/margins": 20.137064933776855, "rewards/rejected": -12.084161758422852, "step": 2457 }, { "epoch": 0.6737015211730848, "grad_norm": 8.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -51422261.333333336, "logits/rejected": -30190963.2, "logps/chosen": -367.61073133680554, "logps/rejected": -717.11328125, "loss": 0.018, "rewards/chosen": 5.210396660698785, "rewards/margins": 16.625537448459202, "rewards/rejected": -11.415140787760416, "step": 2458 }, { "epoch": 0.6739756064135947, "grad_norm": 4.90625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -33817291.428571425, "logits/rejected": -9266230.588235294, "logps/chosen": -542.5466657366071, "logps/rejected": -635.5035615808823, "loss": 0.0165, "rewards/chosen": 7.278719765799386, "rewards/margins": 21.50588453917944, "rewards/rejected": -14.227164773380055, "step": 2459 }, { "epoch": 0.6742496916541044, "grad_norm": 3.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -35667786.666666664, "logits/rejected": -25526617.6, "logps/chosen": -549.2916666666666, "logps/rejected": -613.251171875, "loss": 0.007, "rewards/chosen": 8.584058973524305, "rewards/margins": 21.2261467827691, "rewards/rejected": -12.642087809244792, "step": 2460 }, { "epoch": 0.6745237768946142, "grad_norm": 6.65625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -22235209.846153848, "logits/rejected": -32117934.545454547, "logps/chosen": -411.96420522836536, "logps/rejected": -619.0930841619319, "loss": 0.0282, "rewards/chosen": 5.317495492788462, "rewards/margins": 16.19140032788257, "rewards/rejected": -10.873904835094105, "step": 2461 }, { "epoch": 0.674797862135124, "grad_norm": 7.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -35945085.09090909, "logits/rejected": -23873176.615384616, "logps/chosen": -466.52592329545456, "logps/rejected": -447.92247596153845, "loss": 0.0148, "rewards/chosen": 7.2735137939453125, "rewards/margins": 19.72777029184195, "rewards/rejected": -12.454256497896635, "step": 2462 }, { "epoch": 0.6750719473756338, "grad_norm": 3.65625, "kl": 3.3184618949890137, "learning_rate": 5e-06, "logits/chosen": -834090.8571428572, "logits/rejected": -8583200.0, "logps/chosen": -433.14990234375, "logps/rejected": -595.12685546875, "loss": 0.0382, "rewards/chosen": 7.247400556291852, "rewards/margins": 18.715421186174666, "rewards/rejected": -11.468020629882812, "step": 2463 }, { "epoch": 0.6753460326161436, "grad_norm": 11.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -10677200.0, "logits/rejected": -20804859.076923076, "logps/chosen": -371.2626287286932, "logps/rejected": -519.8669621394231, "loss": 0.0435, "rewards/chosen": 5.25619853626598, "rewards/margins": 15.97183574329723, "rewards/rejected": -10.71563720703125, "step": 2464 }, { "epoch": 0.6756201178566534, "grad_norm": 6.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -11472317.333333334, "logits/rejected": -29607866.666666668, "logps/chosen": -360.7281087239583, "logps/rejected": -679.7373453776041, "loss": 0.0831, "rewards/chosen": 4.529615084330241, "rewards/margins": 18.846517244974773, "rewards/rejected": -14.316902160644531, "step": 2465 }, { "epoch": 0.6758942030971632, "grad_norm": 3.9375, "kl": 9.044803619384766, "learning_rate": 5e-06, "logits/chosen": -45921875.692307696, "logits/rejected": -38804826.18181818, "logps/chosen": -451.25668569711536, "logps/rejected": -397.15114524147725, "loss": 0.0613, "rewards/chosen": 6.9332134540264425, "rewards/margins": 18.222357796622322, "rewards/rejected": -11.28914434259588, "step": 2466 }, { "epoch": 0.6761682883376731, "grad_norm": 2.203125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 5022655.0, "logits/rejected": -15707877.0, "logps/chosen": -488.4986572265625, "logps/rejected": -576.91064453125, "loss": 0.0036, "rewards/chosen": 9.046950340270996, "rewards/margins": 21.57048511505127, "rewards/rejected": -12.523534774780273, "step": 2467 }, { "epoch": 0.6764423735781828, "grad_norm": 3.859375, "kl": 2.8339128494262695, "learning_rate": 5e-06, "logits/chosen": -32254746.666666668, "logits/rejected": -29964752.0, "logps/chosen": -455.1371663411458, "logps/rejected": -652.0533854166666, "loss": 0.0265, "rewards/chosen": 7.496644337972005, "rewards/margins": 20.528764088948567, "rewards/rejected": -13.032119750976562, "step": 2468 }, { "epoch": 0.6767164588186926, "grad_norm": 9.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -26274360.0, "logits/rejected": 16454369.333333334, "logps/chosen": -444.1380208333333, "logps/rejected": -805.1922200520834, "loss": 0.035, "rewards/chosen": 4.983654975891113, "rewards/margins": 22.299216906229656, "rewards/rejected": -17.315561930338543, "step": 2469 }, { "epoch": 0.6769905440592024, "grad_norm": 4.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -35645046.4, "logits/rejected": -12345344.0, "logps/chosen": -394.7698974609375, "logps/rejected": -542.2950265066964, "loss": 0.0079, "rewards/chosen": 7.107428741455078, "rewards/margins": 17.625367627825057, "rewards/rejected": -10.517938886369977, "step": 2470 }, { "epoch": 0.6772646292997122, "grad_norm": 9.6875, "kl": 4.847840785980225, "learning_rate": 5e-06, "logits/chosen": -32144896.0, "logits/rejected": 5853186.222222222, "logps/chosen": -462.6931640625, "logps/rejected": -602.3211263020834, "loss": 0.0313, "rewards/chosen": 7.398907979329427, "rewards/margins": 15.989189147949219, "rewards/rejected": -8.590281168619791, "step": 2471 }, { "epoch": 0.677538714540222, "grad_norm": 8.0625, "kl": 3.2624309062957764, "learning_rate": 5e-06, "logits/chosen": -17962261.333333332, "logits/rejected": -39420789.333333336, "logps/chosen": -354.3209635416667, "logps/rejected": -514.4615885416666, "loss": 0.0334, "rewards/chosen": 6.125372568766276, "rewards/margins": 16.653663635253906, "rewards/rejected": -10.52829106648763, "step": 2472 }, { "epoch": 0.6778127997807318, "grad_norm": 6.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -12580117.333333334, "logits/rejected": -41597232.0, "logps/chosen": -547.4672444661459, "logps/rejected": -482.82861328125, "loss": 0.011, "rewards/chosen": 8.604929606119791, "rewards/margins": 20.18818918863932, "rewards/rejected": -11.583259582519531, "step": 2473 }, { "epoch": 0.6780868850212416, "grad_norm": 2.40625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -36799200.0, "logits/rejected": -40748740.571428575, "logps/chosen": -351.108837890625, "logps/rejected": -414.77029854910717, "loss": 0.0377, "rewards/chosen": 7.19937744140625, "rewards/margins": 18.75988071986607, "rewards/rejected": -11.560503278459821, "step": 2474 }, { "epoch": 0.6783609702617514, "grad_norm": 2.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -13853993.333333334, "logits/rejected": -16096434.666666666, "logps/chosen": -560.8734130859375, "logps/rejected": -444.9517415364583, "loss": 0.0062, "rewards/chosen": 7.559764862060547, "rewards/margins": 17.73833338419596, "rewards/rejected": -10.178568522135416, "step": 2475 }, { "epoch": 0.6786350555022612, "grad_norm": 9.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -23963560.0, "logits/rejected": 13473074.666666666, "logps/chosen": -457.7668050130208, "logps/rejected": -584.3710123697916, "loss": 0.0158, "rewards/chosen": 8.232007344563803, "rewards/margins": 21.124104817708336, "rewards/rejected": -12.892097473144531, "step": 2476 }, { "epoch": 0.678909140742771, "grad_norm": 7.09375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -35286374.4, "logits/rejected": -33950349.71428572, "logps/chosen": -423.826171875, "logps/rejected": -431.23207310267856, "loss": 0.0709, "rewards/chosen": 5.163102722167968, "rewards/margins": 13.891837419782366, "rewards/rejected": -8.728734697614398, "step": 2477 }, { "epoch": 0.6791832259832808, "grad_norm": 2.734375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -14430392.0, "logits/rejected": -19045152.0, "logps/chosen": -490.307177734375, "logps/rejected": -551.1880580357143, "loss": 0.0069, "rewards/chosen": 8.271860504150391, "rewards/margins": 20.81591044834682, "rewards/rejected": -12.544049944196429, "step": 2478 }, { "epoch": 0.6794573112237906, "grad_norm": 4.875, "kl": 8.037883758544922, "learning_rate": 5e-06, "logits/chosen": -23595176.0, "logits/rejected": -9860824.0, "logps/chosen": -407.82623291015625, "logps/rejected": -424.17822265625, "loss": 0.0185, "rewards/chosen": 6.95111608505249, "rewards/margins": 16.03332281112671, "rewards/rejected": -9.082206726074219, "step": 2479 }, { "epoch": 0.6797313964643004, "grad_norm": 7.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -50930205.09090909, "logits/rejected": -16309505.23076923, "logps/chosen": -369.40793678977275, "logps/rejected": -564.1311974158654, "loss": 0.0523, "rewards/chosen": 5.441558144309304, "rewards/margins": 19.512330755487188, "rewards/rejected": -14.070772611177885, "step": 2480 }, { "epoch": 0.6800054817048102, "grad_norm": 5.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -17220451.2, "logits/rejected": -24456438.85714286, "logps/chosen": -427.144287109375, "logps/rejected": -517.3304268973214, "loss": 0.012, "rewards/chosen": 8.291046905517579, "rewards/margins": 20.022240447998048, "rewards/rejected": -11.731193542480469, "step": 2481 }, { "epoch": 0.68027956694532, "grad_norm": 4.84375, "kl": 0.39296597242355347, "learning_rate": 5e-06, "logits/chosen": -13926734.4, "logits/rejected": -23753101.714285713, "logps/chosen": -454.4521484375, "logps/rejected": -590.7276785714286, "loss": 0.009, "rewards/chosen": 7.731578826904297, "rewards/margins": 20.936432429722377, "rewards/rejected": -13.20485360281808, "step": 2482 }, { "epoch": 0.6805536521858297, "grad_norm": 2.390625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -40112923.428571425, "logits/rejected": -20160334.4, "logps/chosen": -344.54652622767856, "logps/rejected": -564.509619140625, "loss": 0.0084, "rewards/chosen": 6.047070639474051, "rewards/margins": 20.638038962227956, "rewards/rejected": -14.590968322753906, "step": 2483 }, { "epoch": 0.6808277374263396, "grad_norm": 13.9375, "kl": 0.09941991418600082, "learning_rate": 5e-06, "logits/chosen": -34471719.384615384, "logits/rejected": -17569745.454545453, "logps/chosen": -463.3909254807692, "logps/rejected": -698.4275124289773, "loss": 0.0619, "rewards/chosen": 5.8241436298076925, "rewards/margins": 18.47656089942772, "rewards/rejected": -12.65241726962003, "step": 2484 }, { "epoch": 0.6811018226668494, "grad_norm": 13.375, "kl": 0.14623260498046875, "learning_rate": 5e-06, "logits/chosen": -31177452.307692308, "logits/rejected": -3040746.5454545454, "logps/chosen": -385.49947415865387, "logps/rejected": -740.3477894176136, "loss": 0.0355, "rewards/chosen": 5.465830876277043, "rewards/margins": 24.444703002076047, "rewards/rejected": -18.978872125799004, "step": 2485 }, { "epoch": 0.6813759079073591, "grad_norm": 8.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -36745210.666666664, "logits/rejected": -20886600.0, "logps/chosen": -421.7549641927083, "logps/rejected": -675.4735921223959, "loss": 0.0598, "rewards/chosen": 5.400363922119141, "rewards/margins": 17.97822697957357, "rewards/rejected": -12.577863057454428, "step": 2486 }, { "epoch": 0.681649993147869, "grad_norm": 4.1875, "kl": 4.125528335571289, "learning_rate": 5e-06, "logits/chosen": -30808214.85714286, "logits/rejected": 1344288.4, "logps/chosen": -368.78257533482144, "logps/rejected": -621.300830078125, "loss": 0.057, "rewards/chosen": 5.0691359383719305, "rewards/margins": 17.897938428606306, "rewards/rejected": -12.828802490234375, "step": 2487 }, { "epoch": 0.6819240783883788, "grad_norm": 2.03125, "kl": 0.24519602954387665, "learning_rate": 5e-06, "logits/chosen": -33372544.0, "logits/rejected": -4160393.4545454546, "logps/chosen": -438.48159555288464, "logps/rejected": -514.9480646306819, "loss": 0.0052, "rewards/chosen": 7.659139779897837, "rewards/margins": 19.209113701240167, "rewards/rejected": -11.54997392134233, "step": 2488 }, { "epoch": 0.6821981636288886, "grad_norm": 9.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 6604129.714285715, "logits/rejected": -29971237.647058822, "logps/chosen": -548.2587890625, "logps/rejected": -506.4970128676471, "loss": 0.0186, "rewards/chosen": 6.886116027832031, "rewards/margins": 18.29087246165556, "rewards/rejected": -11.404756433823529, "step": 2489 }, { "epoch": 0.6824722488693984, "grad_norm": 6.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -36814960.0, "logits/rejected": -28480266.0, "logps/chosen": -354.0772705078125, "logps/rejected": -647.1279296875, "loss": 0.0342, "rewards/chosen": 4.62580680847168, "rewards/margins": 16.723648071289062, "rewards/rejected": -12.097841262817383, "step": 2490 }, { "epoch": 0.6827463341099081, "grad_norm": 3.703125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -52669603.55555555, "logits/rejected": -38107524.266666666, "logps/chosen": -558.7917209201389, "logps/rejected": -512.5982421875, "loss": 0.0116, "rewards/chosen": 7.454132080078125, "rewards/margins": 20.460265096028646, "rewards/rejected": -13.006133015950521, "step": 2491 }, { "epoch": 0.683020419350418, "grad_norm": 10.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -8663520.0, "logits/rejected": -23556809.846153848, "logps/chosen": -304.88332297585225, "logps/rejected": -418.64438100961536, "loss": 0.0759, "rewards/chosen": 6.889386263760653, "rewards/margins": 15.089329806241121, "rewards/rejected": -8.199943542480469, "step": 2492 }, { "epoch": 0.6832945045909278, "grad_norm": 4.59375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -26180232.727272727, "logits/rejected": -20105223.384615384, "logps/chosen": -450.73530717329544, "logps/rejected": -545.9845252403846, "loss": 0.0101, "rewards/chosen": 7.458898370916193, "rewards/margins": 18.416603355140953, "rewards/rejected": -10.95770498422476, "step": 2493 }, { "epoch": 0.6835685898314375, "grad_norm": 5.71875, "kl": 1.6687934398651123, "learning_rate": 5e-06, "logits/chosen": -33482048.0, "logits/rejected": -9562004.0, "logps/chosen": -536.3672572544643, "logps/rejected": -556.75517578125, "loss": 0.0294, "rewards/chosen": 6.811129978724888, "rewards/margins": 16.904367283412387, "rewards/rejected": -10.0932373046875, "step": 2494 }, { "epoch": 0.6838426750719474, "grad_norm": 9.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -23889952.0, "logits/rejected": 33479603.2, "logps/chosen": -466.69430106026783, "logps/rejected": -643.322314453125, "loss": 0.0377, "rewards/chosen": 7.913416181291852, "rewards/margins": 23.552437482561384, "rewards/rejected": -15.639021301269532, "step": 2495 }, { "epoch": 0.6841167603124572, "grad_norm": 7.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -23475649.454545453, "logits/rejected": -28990237.53846154, "logps/chosen": -406.29683061079544, "logps/rejected": -334.25150240384613, "loss": 0.0247, "rewards/chosen": 6.894130359996449, "rewards/margins": 14.061025712873553, "rewards/rejected": -7.166895352877104, "step": 2496 }, { "epoch": 0.6843908455529669, "grad_norm": 1.4140625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -14107898.666666666, "logits/rejected": -34578120.53333333, "logps/chosen": -424.4782986111111, "logps/rejected": -722.7399739583333, "loss": 0.0042, "rewards/chosen": 6.519984351264106, "rewards/margins": 21.694852108425565, "rewards/rejected": -15.174867757161458, "step": 2497 }, { "epoch": 0.6846649307934768, "grad_norm": 5.3125, "kl": 2.8025996685028076, "learning_rate": 5e-06, "logits/chosen": -8867438.0, "logits/rejected": -31839844.0, "logps/chosen": -506.87371826171875, "logps/rejected": -548.94189453125, "loss": 0.0151, "rewards/chosen": 6.6704607009887695, "rewards/margins": 21.38427448272705, "rewards/rejected": -14.713813781738281, "step": 2498 }, { "epoch": 0.6849390160339865, "grad_norm": 5.71875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -47362128.0, "logits/rejected": -16634456.0, "logps/chosen": -326.8629455566406, "logps/rejected": -763.2113037109375, "loss": 0.0432, "rewards/chosen": 6.255960464477539, "rewards/margins": 17.56969165802002, "rewards/rejected": -11.31373119354248, "step": 2499 }, { "epoch": 0.6852131012744964, "grad_norm": 7.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -10599468.444444444, "logits/rejected": -9246064.0, "logps/chosen": -381.2661946614583, "logps/rejected": -531.5733723958333, "loss": 0.0258, "rewards/chosen": 6.063198513454861, "rewards/margins": 19.341500515407986, "rewards/rejected": -13.278302001953126, "step": 2500 }, { "epoch": 0.6854871865150062, "grad_norm": 4.59375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -17892296.0, "logits/rejected": -26184064.0, "logps/chosen": -445.427734375, "logps/rejected": -493.80394071691177, "loss": 0.0111, "rewards/chosen": 6.351378304617746, "rewards/margins": 14.759025830180704, "rewards/rejected": -8.40764752556296, "step": 2501 }, { "epoch": 0.6857612717555159, "grad_norm": 4.78125, "kl": 0.28839111328125, "learning_rate": 5e-06, "logits/chosen": -10739261.714285715, "logits/rejected": -24790473.6, "logps/chosen": -374.33677455357144, "logps/rejected": -598.583349609375, "loss": 0.0131, "rewards/chosen": 6.56275394984654, "rewards/margins": 18.9727774483817, "rewards/rejected": -12.410023498535157, "step": 2502 }, { "epoch": 0.6860353569960258, "grad_norm": 4.9375, "kl": 1.6072425842285156, "learning_rate": 5e-06, "logits/chosen": -25308096.0, "logits/rejected": -25067342.769230768, "logps/chosen": -376.6180308948864, "logps/rejected": -526.5353064903846, "loss": 0.0356, "rewards/chosen": 7.1690826416015625, "rewards/margins": 16.571421109713043, "rewards/rejected": -9.402338468111479, "step": 2503 }, { "epoch": 0.6863094422365356, "grad_norm": 2.921875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -25223234.666666668, "logits/rejected": -23494850.666666668, "logps/chosen": -386.262939453125, "logps/rejected": -564.6585693359375, "loss": 0.0163, "rewards/chosen": 6.639484405517578, "rewards/margins": 18.28446706136068, "rewards/rejected": -11.6449826558431, "step": 2504 }, { "epoch": 0.6865835274770453, "grad_norm": 3.453125, "kl": 1.8358548879623413, "learning_rate": 5e-06, "logits/chosen": -23100398.222222224, "logits/rejected": -38384008.53333333, "logps/chosen": -420.80235460069446, "logps/rejected": -561.630078125, "loss": 0.0381, "rewards/chosen": 7.057308197021484, "rewards/margins": 20.02913131713867, "rewards/rejected": -12.971823120117188, "step": 2505 }, { "epoch": 0.6868576127175552, "grad_norm": 8.3125, "kl": 6.665740013122559, "learning_rate": 5e-06, "logits/chosen": -17840727.272727273, "logits/rejected": -25744354.46153846, "logps/chosen": -472.85009765625, "logps/rejected": -591.8600135216346, "loss": 0.0556, "rewards/chosen": 7.104264692826704, "rewards/margins": 17.336234672919854, "rewards/rejected": -10.23196998009315, "step": 2506 }, { "epoch": 0.687131697958065, "grad_norm": 1.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -28590515.2, "logits/rejected": -32414091.42857143, "logps/chosen": -554.539306640625, "logps/rejected": -618.4231305803571, "loss": 0.0019, "rewards/chosen": 9.12335433959961, "rewards/margins": 24.50795429774693, "rewards/rejected": -15.384599958147321, "step": 2507 }, { "epoch": 0.6874057831985747, "grad_norm": 6.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -32716317.333333332, "logits/rejected": -25022240.0, "logps/chosen": -395.9141438802083, "logps/rejected": -587.3765462239584, "loss": 0.0187, "rewards/chosen": 7.308839162190755, "rewards/margins": 18.387866973876953, "rewards/rejected": -11.079027811686197, "step": 2508 }, { "epoch": 0.6876798684390846, "grad_norm": 2.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -35469344.0, "logits/rejected": -22406405.647058822, "logps/chosen": -389.849365234375, "logps/rejected": -492.68054917279414, "loss": 0.0049, "rewards/chosen": 6.848002842494419, "rewards/margins": 19.936482277237065, "rewards/rejected": -13.088479434742647, "step": 2509 }, { "epoch": 0.6879539536795943, "grad_norm": 6.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -10381769.6, "logits/rejected": 7782130.285714285, "logps/chosen": -482.987548828125, "logps/rejected": -426.55723353794644, "loss": 0.0369, "rewards/chosen": 7.647939300537109, "rewards/margins": 18.861959075927736, "rewards/rejected": -11.214019775390625, "step": 2510 }, { "epoch": 0.6882280389201042, "grad_norm": 11.25, "kl": 6.548795700073242, "learning_rate": 5e-06, "logits/chosen": -19953992.533333335, "logits/rejected": -13643460.444444444, "logps/chosen": -390.27076822916666, "logps/rejected": -320.6177029079861, "loss": 0.1013, "rewards/chosen": 6.360001627604166, "rewards/margins": 15.101978895399306, "rewards/rejected": -8.74197726779514, "step": 2511 }, { "epoch": 0.688502124160614, "grad_norm": 6.96875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -13326376.727272727, "logits/rejected": -34370449.23076923, "logps/chosen": -472.2008167613636, "logps/rejected": -448.94249549278845, "loss": 0.0342, "rewards/chosen": 6.9110024192116475, "rewards/margins": 17.40926995977655, "rewards/rejected": -10.498267540564903, "step": 2512 }, { "epoch": 0.6887762094011237, "grad_norm": 2.453125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -23638408.0, "logits/rejected": 27114054.85714286, "logps/chosen": -385.5360595703125, "logps/rejected": -594.1925223214286, "loss": 0.0071, "rewards/chosen": 6.931755065917969, "rewards/margins": 18.031915937151226, "rewards/rejected": -11.100160871233259, "step": 2513 }, { "epoch": 0.6890502946416336, "grad_norm": 4.5, "kl": 2.0835227966308594, "learning_rate": 5e-06, "logits/chosen": -21409842.666666668, "logits/rejected": -33288890.666666668, "logps/chosen": -447.9469807942708, "logps/rejected": -692.6000162760416, "loss": 0.0269, "rewards/chosen": 6.409478505452474, "rewards/margins": 19.740230560302734, "rewards/rejected": -13.33075205485026, "step": 2514 }, { "epoch": 0.6893243798821433, "grad_norm": 9.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -27524605.714285713, "logits/rejected": -32460448.0, "logps/chosen": -437.6532505580357, "logps/rejected": -582.6294921875, "loss": 0.0188, "rewards/chosen": 6.387917654854911, "rewards/margins": 21.173080008370537, "rewards/rejected": -14.785162353515625, "step": 2515 }, { "epoch": 0.6895984651226531, "grad_norm": 8.375, "kl": 2.970468044281006, "learning_rate": 5e-06, "logits/chosen": -34877197.473684214, "logits/rejected": -17184126.4, "logps/chosen": -325.95985814144734, "logps/rejected": -883.4859375, "loss": 0.0328, "rewards/chosen": 5.26274952135588, "rewards/margins": 29.994692270379318, "rewards/rejected": -24.73194274902344, "step": 2516 }, { "epoch": 0.689872550363163, "grad_norm": 4.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -3053413.2, "logits/rejected": -25627286.85714286, "logps/chosen": -449.8345703125, "logps/rejected": -420.35728236607144, "loss": 0.0118, "rewards/chosen": 7.24633560180664, "rewards/margins": 15.712119402204241, "rewards/rejected": -8.4657838003976, "step": 2517 }, { "epoch": 0.6901466356036727, "grad_norm": 4.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -19997950.4, "logits/rejected": -28120740.57142857, "logps/chosen": -329.17802734375, "logps/rejected": -589.5248325892857, "loss": 0.0167, "rewards/chosen": 7.561723327636718, "rewards/margins": 21.9649172101702, "rewards/rejected": -14.403193882533483, "step": 2518 }, { "epoch": 0.6904207208441825, "grad_norm": 2.1875, "kl": 0.3436877131462097, "learning_rate": 5e-06, "logits/chosen": -4199341.6, "logits/rejected": -33363803.42857143, "logps/chosen": -441.74287109375, "logps/rejected": -750.1511579241071, "loss": 0.0041, "rewards/chosen": 8.71245880126953, "rewards/margins": 26.248016139439173, "rewards/rejected": -17.535557338169642, "step": 2519 }, { "epoch": 0.6906948060846924, "grad_norm": 1.546875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -42367922.666666664, "logits/rejected": -31842893.333333332, "logps/chosen": -475.9017333984375, "logps/rejected": -759.91259765625, "loss": 0.0038, "rewards/chosen": 8.815165837605795, "rewards/margins": 22.79889233907064, "rewards/rejected": -13.983726501464844, "step": 2520 }, { "epoch": 0.6909688913252021, "grad_norm": 11.1875, "kl": 0.9417744278907776, "learning_rate": 5e-06, "logits/chosen": -29828132.923076924, "logits/rejected": -11452868.363636363, "logps/chosen": -395.56678185096155, "logps/rejected": -528.6600674715909, "loss": 0.0196, "rewards/chosen": 8.192826491135817, "rewards/margins": 20.402749975244483, "rewards/rejected": -12.209923484108664, "step": 2521 }, { "epoch": 0.691242976565712, "grad_norm": 4.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -18845188.0, "logits/rejected": -15986913.333333334, "logps/chosen": -421.89453125, "logps/rejected": -496.3765055338542, "loss": 0.0222, "rewards/chosen": 7.098734537760417, "rewards/margins": 18.72145716349284, "rewards/rejected": -11.622722625732422, "step": 2522 }, { "epoch": 0.6915170618062217, "grad_norm": 7.09375, "kl": 8.168704986572266, "learning_rate": 5e-06, "logits/chosen": -16165904.842105264, "logits/rejected": 6905940.8, "logps/chosen": -400.96016652960526, "logps/rejected": -469.96708984375, "loss": 0.0344, "rewards/chosen": 6.486950121427837, "rewards/margins": 14.41396046688682, "rewards/rejected": -7.927010345458984, "step": 2523 }, { "epoch": 0.6917911470467315, "grad_norm": 3.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -19850893.333333332, "logits/rejected": -13361664.0, "logps/chosen": -497.033935546875, "logps/rejected": -558.137939453125, "loss": 0.0084, "rewards/chosen": 8.42928695678711, "rewards/margins": 20.575188954671226, "rewards/rejected": -12.145901997884115, "step": 2524 }, { "epoch": 0.6920652322872414, "grad_norm": 5.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -48724874.666666664, "logits/rejected": 18008616.0, "logps/chosen": -372.0679117838542, "logps/rejected": -430.7972412109375, "loss": 0.0369, "rewards/chosen": 5.547722498575847, "rewards/margins": 19.23978106180827, "rewards/rejected": -13.692058563232422, "step": 2525 }, { "epoch": 0.6923393175277511, "grad_norm": 6.28125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -26209170.666666668, "logits/rejected": 1273750.3333333333, "logps/chosen": -372.5884602864583, "logps/rejected": -562.7591145833334, "loss": 0.0173, "rewards/chosen": 6.138435363769531, "rewards/margins": 17.84232838948568, "rewards/rejected": -11.703893025716146, "step": 2526 }, { "epoch": 0.6926134027682609, "grad_norm": 6.6875, "kl": 1.4955190420150757, "learning_rate": 5e-06, "logits/chosen": -45520311.27272727, "logits/rejected": -37660972.307692304, "logps/chosen": -412.0654296875, "logps/rejected": -531.2750525841346, "loss": 0.0208, "rewards/chosen": 6.545030073686079, "rewards/margins": 18.555901374016607, "rewards/rejected": -12.010871300330528, "step": 2527 }, { "epoch": 0.6928874880087708, "grad_norm": 1.5, "kl": 3.026763916015625, "learning_rate": 5e-06, "logits/chosen": -47624836.266666666, "logits/rejected": -33231075.555555556, "logps/chosen": -501.41106770833335, "logps/rejected": -470.5208333333333, "loss": 0.0027, "rewards/chosen": 8.343433634440105, "rewards/margins": 20.90027855767144, "rewards/rejected": -12.556844923231337, "step": 2528 }, { "epoch": 0.6931615732492805, "grad_norm": 8.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -38980907.63636363, "logits/rejected": -34706530.461538464, "logps/chosen": -369.7692205255682, "logps/rejected": -449.54439603365387, "loss": 0.0261, "rewards/chosen": 5.76108828457919, "rewards/margins": 17.63270904967835, "rewards/rejected": -11.87162076509916, "step": 2529 }, { "epoch": 0.6934356584897903, "grad_norm": 4.46875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -19767321.846153848, "logits/rejected": -21849163.636363637, "logps/chosen": -444.71469350961536, "logps/rejected": -481.2141779119318, "loss": 0.015, "rewards/chosen": 6.34532224214994, "rewards/margins": 16.707896039202495, "rewards/rejected": -10.362573797052557, "step": 2530 }, { "epoch": 0.6937097437303001, "grad_norm": 3.015625, "kl": 1.6602122783660889, "learning_rate": 5e-06, "logits/chosen": -38515060.36363637, "logits/rejected": -19123971.692307692, "logps/chosen": -352.9549005681818, "logps/rejected": -493.92176231971155, "loss": 0.0345, "rewards/chosen": 7.350385492498225, "rewards/margins": 16.843840645743416, "rewards/rejected": -9.493455153245192, "step": 2531 }, { "epoch": 0.6939838289708099, "grad_norm": 4.34375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -31016163.2, "logits/rejected": -47976297.14285714, "logps/chosen": -331.87587890625, "logps/rejected": -526.2371651785714, "loss": 0.0295, "rewards/chosen": 5.273063659667969, "rewards/margins": 17.063983372279576, "rewards/rejected": -11.790919712611608, "step": 2532 }, { "epoch": 0.6942579142113198, "grad_norm": 2.765625, "kl": 0.026284536346793175, "learning_rate": 5e-06, "logits/chosen": -37537205.333333336, "logits/rejected": -33883018.666666664, "logps/chosen": -421.6787923177083, "logps/rejected": -676.2635091145834, "loss": 0.011, "rewards/chosen": 7.198394139607747, "rewards/margins": 20.661943435668945, "rewards/rejected": -13.463549296061197, "step": 2533 }, { "epoch": 0.6945319994518295, "grad_norm": 4.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -25074500.57142857, "logits/rejected": -20180992.0, "logps/chosen": -465.10508510044644, "logps/rejected": -486.14499080882354, "loss": 0.0085, "rewards/chosen": 6.369529179164341, "rewards/margins": 17.836600712367467, "rewards/rejected": -11.467071533203125, "step": 2534 }, { "epoch": 0.6948060846923393, "grad_norm": 0.494140625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -12427992.0, "logits/rejected": -41073378.666666664, "logps/chosen": -360.5379231770833, "logps/rejected": -661.1294759114584, "loss": 0.0017, "rewards/chosen": 7.14276123046875, "rewards/margins": 19.28319549560547, "rewards/rejected": -12.140434265136719, "step": 2535 }, { "epoch": 0.6950801699328492, "grad_norm": 0.89453125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -45657961.14285714, "logits/rejected": -28577225.411764707, "logps/chosen": -407.06727818080356, "logps/rejected": -500.3169806985294, "loss": 0.0033, "rewards/chosen": 7.071560450962612, "rewards/margins": 19.344597632143678, "rewards/rejected": -12.273037181181067, "step": 2536 }, { "epoch": 0.6953542551733589, "grad_norm": 6.21875, "kl": 0.5941569209098816, "learning_rate": 5e-06, "logits/chosen": 9462163.2, "logits/rejected": -16738916.57142857, "logps/chosen": -530.477783203125, "logps/rejected": -526.4522530691964, "loss": 0.0294, "rewards/chosen": 6.74328842163086, "rewards/margins": 17.201512037004743, "rewards/rejected": -10.458223615373884, "step": 2537 }, { "epoch": 0.6956283404138687, "grad_norm": 5.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -24949506.90909091, "logits/rejected": -34814503.384615384, "logps/chosen": -381.58620383522725, "logps/rejected": -310.31734525240387, "loss": 0.0226, "rewards/chosen": 7.167523470791903, "rewards/margins": 16.311204923616422, "rewards/rejected": -9.14368145282452, "step": 2538 }, { "epoch": 0.6959024256543785, "grad_norm": 9.9375, "kl": 5.172549247741699, "learning_rate": 5e-06, "logits/chosen": 10974947.692307692, "logits/rejected": -44336215.27272727, "logps/chosen": -495.84322415865387, "logps/rejected": -547.0343128551136, "loss": 0.0298, "rewards/chosen": 6.941631610576923, "rewards/margins": 17.258004568673513, "rewards/rejected": -10.316372958096592, "step": 2539 }, { "epoch": 0.6961765108948883, "grad_norm": 3.734375, "kl": 4.151793956756592, "learning_rate": 5e-06, "logits/chosen": -25392418.46153846, "logits/rejected": -32460328.727272727, "logps/chosen": -377.4986102764423, "logps/rejected": -515.8240411931819, "loss": 0.0133, "rewards/chosen": 6.916736896221455, "rewards/margins": 18.713961461207248, "rewards/rejected": -11.797224564985795, "step": 2540 }, { "epoch": 0.6964505961353981, "grad_norm": 8.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -17491803.42857143, "logits/rejected": -39895955.2, "logps/chosen": -390.0341099330357, "logps/rejected": -362.8431884765625, "loss": 0.0225, "rewards/chosen": 6.793362753731864, "rewards/margins": 14.88486589704241, "rewards/rejected": -8.091503143310547, "step": 2541 }, { "epoch": 0.6967246813759079, "grad_norm": 23.0, "kl": 7.156922817230225, "learning_rate": 5e-06, "logits/chosen": -15395671.384615384, "logits/rejected": -631162.5454545454, "logps/chosen": -356.38198617788464, "logps/rejected": -539.9274680397727, "loss": 0.0911, "rewards/chosen": 5.3677203838641825, "rewards/margins": 18.288320581396142, "rewards/rejected": -12.92060019753196, "step": 2542 }, { "epoch": 0.6969987666164177, "grad_norm": 13.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -11972444.0, "logits/rejected": -21884030.0, "logps/chosen": -368.98675537109375, "logps/rejected": -434.82745361328125, "loss": 0.0417, "rewards/chosen": 5.281680583953857, "rewards/margins": 15.735555171966553, "rewards/rejected": -10.453874588012695, "step": 2543 }, { "epoch": 0.6972728518569276, "grad_norm": 5.34375, "kl": 0.09688949584960938, "learning_rate": 5e-06, "logits/chosen": -24267435.2, "logits/rejected": -32549590.85714286, "logps/chosen": -453.059375, "logps/rejected": -507.98514229910717, "loss": 0.016, "rewards/chosen": 7.231008911132813, "rewards/margins": 17.709175763811384, "rewards/rejected": -10.478166852678571, "step": 2544 }, { "epoch": 0.6975469370974373, "grad_norm": 2.3125, "kl": 0.22769546508789062, "learning_rate": 5e-06, "logits/chosen": -21165828.57142857, "logits/rejected": -14532382.11764706, "logps/chosen": -445.7377232142857, "logps/rejected": -442.6117302389706, "loss": 0.0048, "rewards/chosen": 8.745485578264509, "rewards/margins": 17.9635094394203, "rewards/rejected": -9.21802386115579, "step": 2545 }, { "epoch": 0.6978210223379471, "grad_norm": 2.40625, "kl": 6.704052925109863, "learning_rate": 5e-06, "logits/chosen": -38134972.0, "logits/rejected": -30442152.0, "logps/chosen": -397.1434631347656, "logps/rejected": -421.7760009765625, "loss": 0.0387, "rewards/chosen": 6.856868743896484, "rewards/margins": 18.4556941986084, "rewards/rejected": -11.598825454711914, "step": 2546 }, { "epoch": 0.6980951075784569, "grad_norm": 8.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -38317541.81818182, "logits/rejected": -35316145.23076923, "logps/chosen": -393.0470525568182, "logps/rejected": -579.5593073918269, "loss": 0.0253, "rewards/chosen": 7.240823225541548, "rewards/margins": 19.312023082813184, "rewards/rejected": -12.071199857271635, "step": 2547 }, { "epoch": 0.6983691928189667, "grad_norm": 9.4375, "kl": 7.008827209472656, "learning_rate": 5e-06, "logits/chosen": -1620494.0, "logits/rejected": -40838085.333333336, "logps/chosen": -542.5815836588541, "logps/rejected": -595.7293701171875, "loss": 0.0345, "rewards/chosen": 7.221179962158203, "rewards/margins": 18.248140970865883, "rewards/rejected": -11.026961008707682, "step": 2548 }, { "epoch": 0.6986432780594765, "grad_norm": 9.1875, "kl": 1.1218808889389038, "learning_rate": 5e-06, "logits/chosen": -30805290.666666668, "logits/rejected": -21006576.0, "logps/chosen": -413.9704182942708, "logps/rejected": -498.892822265625, "loss": 0.0441, "rewards/chosen": 7.485059102376302, "rewards/margins": 16.693897883097332, "rewards/rejected": -9.20883878072103, "step": 2549 }, { "epoch": 0.6989173632999863, "grad_norm": 4.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -35858981.333333336, "logits/rejected": 16953861.333333332, "logps/chosen": -462.0019938151042, "logps/rejected": -630.7605794270834, "loss": 0.0518, "rewards/chosen": 6.827716827392578, "rewards/margins": 20.80065027872721, "rewards/rejected": -13.972933451334635, "step": 2550 }, { "epoch": 0.6991914485404961, "grad_norm": 2.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -24008962.0, "logits/rejected": -15209712.0, "logps/chosen": -487.773193359375, "logps/rejected": -371.56146240234375, "loss": 0.0063, "rewards/chosen": 6.907665252685547, "rewards/margins": 15.659092903137207, "rewards/rejected": -8.75142765045166, "step": 2551 }, { "epoch": 0.6994655337810058, "grad_norm": 4.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -24923511.272727273, "logits/rejected": -16537095.384615384, "logps/chosen": -462.5411931818182, "logps/rejected": -539.9687875600962, "loss": 0.008, "rewards/chosen": 7.763155850497159, "rewards/margins": 17.940961824430453, "rewards/rejected": -10.177805973933292, "step": 2552 }, { "epoch": 0.6997396190215157, "grad_norm": 9.25, "kl": 11.20605754852295, "learning_rate": 5e-06, "logits/chosen": -34151131.428571425, "logits/rejected": 7702809.6, "logps/chosen": -513.1828962053571, "logps/rejected": -473.02373046875, "loss": 0.0343, "rewards/chosen": 7.734226226806641, "rewards/margins": 18.501465606689454, "rewards/rejected": -10.767239379882813, "step": 2553 }, { "epoch": 0.7000137042620255, "grad_norm": 8.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -7358801.454545454, "logits/rejected": -12167619.692307692, "logps/chosen": -437.27641157670456, "logps/rejected": -556.7617938701923, "loss": 0.0488, "rewards/chosen": 5.7527531710538, "rewards/margins": 17.489473329557406, "rewards/rejected": -11.736720158503605, "step": 2554 }, { "epoch": 0.7002877895025353, "grad_norm": 2.734375, "kl": 0.4363740384578705, "learning_rate": 5e-06, "logits/chosen": -28986042.181818184, "logits/rejected": -18481757.53846154, "logps/chosen": -434.99027876420456, "logps/rejected": -590.6644381009615, "loss": 0.0088, "rewards/chosen": 7.684065385298296, "rewards/margins": 22.074104682548896, "rewards/rejected": -14.3900392972506, "step": 2555 }, { "epoch": 0.7005618747430451, "grad_norm": 11.3125, "kl": 3.0295512676239014, "learning_rate": 5e-06, "logits/chosen": -17009031.384615384, "logits/rejected": -19147524.363636363, "logps/chosen": -389.32534555288464, "logps/rejected": -470.0579279119318, "loss": 0.0517, "rewards/chosen": 6.603324890136719, "rewards/margins": 14.430173006924715, "rewards/rejected": -7.826848116787997, "step": 2556 }, { "epoch": 0.7008359599835549, "grad_norm": 12.5, "kl": 1.5613536834716797, "learning_rate": 5e-06, "logits/chosen": 16023635.2, "logits/rejected": -35963026.28571428, "logps/chosen": -397.81376953125, "logps/rejected": -654.0105329241071, "loss": 0.0872, "rewards/chosen": 6.254082107543946, "rewards/margins": 16.583687210083006, "rewards/rejected": -10.329605102539062, "step": 2557 }, { "epoch": 0.7011100452240647, "grad_norm": 7.59375, "kl": 3.7924716472625732, "learning_rate": 5e-06, "logits/chosen": -16004132.363636363, "logits/rejected": -26359042.46153846, "logps/chosen": -465.34641335227275, "logps/rejected": -457.67052283653845, "loss": 0.0186, "rewards/chosen": 6.409982854669744, "rewards/margins": 17.88978726046902, "rewards/rejected": -11.479804405799278, "step": 2558 }, { "epoch": 0.7013841304645745, "grad_norm": 6.21875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -19022423.272727273, "logits/rejected": 3969114.4615384615, "logps/chosen": -461.1404474431818, "logps/rejected": -511.1476862980769, "loss": 0.0298, "rewards/chosen": 7.2952728271484375, "rewards/margins": 18.375692514272835, "rewards/rejected": -11.0804196871244, "step": 2559 }, { "epoch": 0.7016582157050842, "grad_norm": 1.953125, "kl": 3.213219404220581, "learning_rate": 5e-06, "logits/chosen": 4355231.333333333, "logits/rejected": -14337210.666666666, "logps/chosen": -477.5693359375, "logps/rejected": -507.6549479166667, "loss": 0.0041, "rewards/chosen": 7.499610265096028, "rewards/margins": 17.543074289957683, "rewards/rejected": -10.043464024861654, "step": 2560 }, { "epoch": 0.7019323009455941, "grad_norm": 1.9453125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -15750023.384615384, "logits/rejected": -21456142.545454547, "logps/chosen": -475.75939002403845, "logps/rejected": -585.7506214488636, "loss": 0.0043, "rewards/chosen": 7.759370657113882, "rewards/margins": 21.5609245567055, "rewards/rejected": -13.80155389959162, "step": 2561 }, { "epoch": 0.7022063861861039, "grad_norm": 5.46875, "kl": 2.0250449180603027, "learning_rate": 5e-06, "logits/chosen": 425981.71428571426, "logits/rejected": -12248508.8, "logps/chosen": -402.50980050223217, "logps/rejected": -771.20517578125, "loss": 0.0399, "rewards/chosen": 5.850729261125837, "rewards/margins": 16.546983991350444, "rewards/rejected": -10.69625473022461, "step": 2562 }, { "epoch": 0.7024804714266136, "grad_norm": 11.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -11121663.272727273, "logits/rejected": -24565248.0, "logps/chosen": -318.09195223721593, "logps/rejected": -524.9541766826923, "loss": 0.0842, "rewards/chosen": 5.332279552112926, "rewards/margins": 15.58359650298432, "rewards/rejected": -10.251316950871395, "step": 2563 }, { "epoch": 0.7027545566671235, "grad_norm": 13.875, "kl": 9.700579643249512, "learning_rate": 5e-06, "logits/chosen": -18147294.11764706, "logits/rejected": -15339315.42857143, "logps/chosen": -389.07068589154414, "logps/rejected": -615.5822405133929, "loss": 0.0786, "rewards/chosen": 6.646266712861903, "rewards/margins": 14.885929973185565, "rewards/rejected": -8.239663260323661, "step": 2564 }, { "epoch": 0.7030286419076333, "grad_norm": 1.703125, "kl": 0.49871063232421875, "learning_rate": 5e-06, "logits/chosen": -21469790.4, "logits/rejected": -14758282.285714285, "logps/chosen": -493.1912109375, "logps/rejected": -531.1240931919643, "loss": 0.0048, "rewards/chosen": 8.223622131347657, "rewards/margins": 19.020735822405136, "rewards/rejected": -10.797113691057477, "step": 2565 }, { "epoch": 0.7033027271481431, "grad_norm": 6.15625, "kl": 3.0068259239196777, "learning_rate": 5e-06, "logits/chosen": -28395788.0, "logits/rejected": -27667970.0, "logps/chosen": -367.69439697265625, "logps/rejected": -566.0055541992188, "loss": 0.0298, "rewards/chosen": 6.568473815917969, "rewards/margins": 25.061853408813477, "rewards/rejected": -18.493379592895508, "step": 2566 }, { "epoch": 0.7035768123886529, "grad_norm": 9.0, "kl": 18.605844497680664, "learning_rate": 5e-06, "logits/chosen": -21075303.529411763, "logits/rejected": -17768106.285714287, "logps/chosen": -403.11497587316177, "logps/rejected": -806.7307477678571, "loss": 0.1018, "rewards/chosen": 8.096989351160387, "rewards/margins": 21.390773356461725, "rewards/rejected": -13.293784005301339, "step": 2567 }, { "epoch": 0.7038508976291626, "grad_norm": 8.8125, "kl": 7.4085516929626465, "learning_rate": 5e-06, "logits/chosen": -34741805.71428572, "logits/rejected": -22249833.6, "logps/chosen": -485.3038853236607, "logps/rejected": -615.750927734375, "loss": 0.0282, "rewards/chosen": 7.925285884312221, "rewards/margins": 16.963919612339566, "rewards/rejected": -9.038633728027344, "step": 2568 }, { "epoch": 0.7041249828696725, "grad_norm": 5.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -19841426.90909091, "logits/rejected": -38950872.615384616, "logps/chosen": -263.4992009943182, "logps/rejected": -446.44054236778845, "loss": 0.0283, "rewards/chosen": 4.991239374334162, "rewards/margins": 13.880703532612408, "rewards/rejected": -8.889464158278246, "step": 2569 }, { "epoch": 0.7043990681101823, "grad_norm": 6.78125, "kl": 0.18077406287193298, "learning_rate": 5e-06, "logits/chosen": -18507879.111111112, "logits/rejected": -26135750.4, "logps/chosen": -485.07318793402777, "logps/rejected": -545.53623046875, "loss": 0.0371, "rewards/chosen": 8.15152316623264, "rewards/margins": 20.107029554578993, "rewards/rejected": -11.955506388346354, "step": 2570 }, { "epoch": 0.704673153350692, "grad_norm": 2.40625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -23835596.8, "logits/rejected": -9088324.0, "logps/chosen": -431.15009765625, "logps/rejected": -509.21561104910717, "loss": 0.0075, "rewards/chosen": 7.351829528808594, "rewards/margins": 17.3063714163644, "rewards/rejected": -9.954541887555804, "step": 2571 }, { "epoch": 0.7049472385912019, "grad_norm": 4.53125, "kl": 4.3454742431640625, "learning_rate": 5e-06, "logits/chosen": -20093801.14285714, "logits/rejected": -9923308.8, "logps/chosen": -479.45706612723217, "logps/rejected": -469.652490234375, "loss": 0.0087, "rewards/chosen": 7.710748944963727, "rewards/margins": 17.53282230922154, "rewards/rejected": -9.822073364257813, "step": 2572 }, { "epoch": 0.7052213238317117, "grad_norm": 1.5859375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -22856123.076923076, "logits/rejected": 57620497.45454545, "logps/chosen": -480.4000901442308, "logps/rejected": -609.5100763494319, "loss": 0.0046, "rewards/chosen": 6.9215569129356975, "rewards/margins": 24.58494109040374, "rewards/rejected": -17.66338417746804, "step": 2573 }, { "epoch": 0.7054954090722214, "grad_norm": 2.515625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -13169316.923076924, "logits/rejected": -22255281.454545453, "logps/chosen": -530.6268404447115, "logps/rejected": -678.4753639914773, "loss": 0.0053, "rewards/chosen": 8.63152606670673, "rewards/margins": 21.58871129176, "rewards/rejected": -12.957185225053268, "step": 2574 }, { "epoch": 0.7057694943127313, "grad_norm": 3.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -25867068.444444444, "logits/rejected": -27581873.066666666, "logps/chosen": -328.9490017361111, "logps/rejected": -508.50234375, "loss": 0.009, "rewards/chosen": 5.364981757269965, "rewards/margins": 16.75825466579861, "rewards/rejected": -11.393272908528646, "step": 2575 }, { "epoch": 0.706043579553241, "grad_norm": 7.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -16573667.555555556, "logits/rejected": -181582.93333333332, "logps/chosen": -354.03187391493054, "logps/rejected": -638.1151041666667, "loss": 0.0303, "rewards/chosen": 4.656467861599392, "rewards/margins": 15.734569464789496, "rewards/rejected": -11.078101603190104, "step": 2576 }, { "epoch": 0.7063176647937509, "grad_norm": 5.28125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -19851664.0, "logits/rejected": 31725368.0, "logps/chosen": -352.4154459635417, "logps/rejected": -588.2793782552084, "loss": 0.019, "rewards/chosen": 4.683314959208171, "rewards/margins": 18.599155108133953, "rewards/rejected": -13.915840148925781, "step": 2577 }, { "epoch": 0.7065917500342607, "grad_norm": 3.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -27187904.0, "logits/rejected": -14916937.846153846, "logps/chosen": -427.53005149147725, "logps/rejected": -716.4674729567307, "loss": 0.0142, "rewards/chosen": 6.967977350408381, "rewards/margins": 23.17678331495165, "rewards/rejected": -16.20880596454327, "step": 2578 }, { "epoch": 0.7068658352747704, "grad_norm": 2.90625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 1444647.3846153845, "logits/rejected": -36232046.54545455, "logps/chosen": -464.96304086538464, "logps/rejected": -546.2842240767045, "loss": 0.0075, "rewards/chosen": 7.278271014873798, "rewards/margins": 20.210818844241697, "rewards/rejected": -12.932547829367898, "step": 2579 }, { "epoch": 0.7071399205152803, "grad_norm": 4.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -21383045.333333332, "logits/rejected": -39777298.666666664, "logps/chosen": -499.1304931640625, "logps/rejected": -547.0313313802084, "loss": 0.0187, "rewards/chosen": 7.712627410888672, "rewards/margins": 19.723331451416016, "rewards/rejected": -12.010704040527344, "step": 2580 }, { "epoch": 0.70741400575579, "grad_norm": 6.46875, "kl": 5.634732246398926, "learning_rate": 5e-06, "logits/chosen": -16959122.285714287, "logits/rejected": -17480844.8, "logps/chosen": -340.51747349330356, "logps/rejected": -455.12138671875, "loss": 0.0724, "rewards/chosen": 6.283867972237723, "rewards/margins": 17.699092429024834, "rewards/rejected": -11.41522445678711, "step": 2581 }, { "epoch": 0.7076880909962998, "grad_norm": 6.03125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -23541194.666666668, "logits/rejected": -19703219.555555556, "logps/chosen": -410.1221028645833, "logps/rejected": -623.6486002604166, "loss": 0.0247, "rewards/chosen": 7.047461446126302, "rewards/margins": 21.87421145968967, "rewards/rejected": -14.826750013563368, "step": 2582 }, { "epoch": 0.7079621762368097, "grad_norm": 7.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -23162521.6, "logits/rejected": -14001882.285714285, "logps/chosen": -295.8267333984375, "logps/rejected": -610.966796875, "loss": 0.09, "rewards/chosen": 4.186277389526367, "rewards/margins": 15.551957539149694, "rewards/rejected": -11.365680149623326, "step": 2583 }, { "epoch": 0.7082362614773194, "grad_norm": 5.53125, "kl": 3.1148831844329834, "learning_rate": 5e-06, "logits/chosen": -4642834.857142857, "logits/rejected": -26427070.4, "logps/chosen": -441.4994419642857, "logps/rejected": -579.576708984375, "loss": 0.0275, "rewards/chosen": 8.0073607308524, "rewards/margins": 20.011008344377792, "rewards/rejected": -12.003647613525391, "step": 2584 }, { "epoch": 0.7085103467178292, "grad_norm": 7.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -35057450.666666664, "logits/rejected": -8985006.933333334, "logps/chosen": -335.1253255208333, "logps/rejected": -501.90208333333334, "loss": 0.053, "rewards/chosen": 5.659107208251953, "rewards/margins": 16.715365346272787, "rewards/rejected": -11.056258138020834, "step": 2585 }, { "epoch": 0.7087844319583391, "grad_norm": 0.2197265625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -31229501.09090909, "logits/rejected": -33997154.461538464, "logps/chosen": -552.5278764204545, "logps/rejected": -460.48091947115387, "loss": 0.0005, "rewards/chosen": 9.16111200506037, "rewards/margins": 20.25561203323044, "rewards/rejected": -11.094500028170073, "step": 2586 }, { "epoch": 0.7090585171988488, "grad_norm": 8.5, "kl": 2.3261656761169434, "learning_rate": 5e-06, "logits/chosen": -18243700.0, "logits/rejected": -21082152.0, "logps/chosen": -363.67626953125, "logps/rejected": -506.8909505208333, "loss": 0.0401, "rewards/chosen": 5.944129943847656, "rewards/margins": 16.687053680419922, "rewards/rejected": -10.742923736572266, "step": 2587 }, { "epoch": 0.7093326024393587, "grad_norm": 5.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -30821142.4, "logits/rejected": -23807746.285714287, "logps/chosen": -396.2338623046875, "logps/rejected": -631.4633091517857, "loss": 0.024, "rewards/chosen": 7.406610107421875, "rewards/margins": 22.09112047467913, "rewards/rejected": -14.684510367257255, "step": 2588 }, { "epoch": 0.7096066876798685, "grad_norm": 8.125, "kl": 12.182056427001953, "learning_rate": 5e-06, "logits/chosen": -13097200.0, "logits/rejected": -3678794.285714286, "logps/chosen": -379.318359375, "logps/rejected": -665.8915318080357, "loss": 0.033, "rewards/chosen": 7.3063803280101105, "rewards/margins": 17.64942515397272, "rewards/rejected": -10.343044825962611, "step": 2589 }, { "epoch": 0.7098807729203782, "grad_norm": 6.9375, "kl": 5.118979454040527, "learning_rate": 5e-06, "logits/chosen": -17778023.272727273, "logits/rejected": -16918043.076923076, "logps/chosen": -303.0595703125, "logps/rejected": -486.376953125, "loss": 0.0325, "rewards/chosen": 6.581602616743608, "rewards/margins": 17.546300821371013, "rewards/rejected": -10.964698204627403, "step": 2590 }, { "epoch": 0.7101548581608881, "grad_norm": 2.234375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -23438464.0, "logits/rejected": -27717589.333333332, "logps/chosen": -342.207275390625, "logps/rejected": -711.31298828125, "loss": 0.0115, "rewards/chosen": 6.510457992553711, "rewards/margins": 21.17068417867025, "rewards/rejected": -14.660226186116537, "step": 2591 }, { "epoch": 0.7104289434013978, "grad_norm": 13.5625, "kl": 17.702728271484375, "learning_rate": 5e-06, "logits/chosen": -18733348.57142857, "logits/rejected": -19188950.4, "logps/chosen": -506.04286411830356, "logps/rejected": -471.602197265625, "loss": 0.0438, "rewards/chosen": 8.199815477643694, "rewards/margins": 15.845720781598772, "rewards/rejected": -7.645905303955078, "step": 2592 }, { "epoch": 0.7107030286419076, "grad_norm": 9.5, "kl": 3.7467055320739746, "learning_rate": 5e-06, "logits/chosen": -11485690.666666666, "logits/rejected": 12054789.333333334, "logps/chosen": -430.0075276692708, "logps/rejected": -511.8765869140625, "loss": 0.0333, "rewards/chosen": 7.155325571695964, "rewards/margins": 16.623390197753906, "rewards/rejected": -9.468064626057943, "step": 2593 }, { "epoch": 0.7109771138824175, "grad_norm": 9.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -26552436.363636363, "logits/rejected": -4679364.923076923, "logps/chosen": -266.6949351917614, "logps/rejected": -388.6796875, "loss": 0.0552, "rewards/chosen": 6.694380326704546, "rewards/margins": 13.794872950840663, "rewards/rejected": -7.100492624136118, "step": 2594 }, { "epoch": 0.7112511991229272, "grad_norm": 10.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -18011544.0, "logits/rejected": -36521590.85714286, "logps/chosen": -387.8021728515625, "logps/rejected": -562.537109375, "loss": 0.0428, "rewards/chosen": 5.821242904663086, "rewards/margins": 18.675351442609514, "rewards/rejected": -12.854108537946429, "step": 2595 }, { "epoch": 0.711525284363437, "grad_norm": 6.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -19694851.555555556, "logits/rejected": -7175283.2, "logps/chosen": -401.6317545572917, "logps/rejected": -511.2251302083333, "loss": 0.039, "rewards/chosen": 5.835266537136501, "rewards/margins": 15.471897803412544, "rewards/rejected": -9.636631266276042, "step": 2596 }, { "epoch": 0.7117993696039469, "grad_norm": 2.265625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -24696548.57142857, "logits/rejected": -11857106.823529411, "logps/chosen": -469.31480189732144, "logps/rejected": -668.8839613970588, "loss": 0.0038, "rewards/chosen": 7.046894618443081, "rewards/margins": 19.803861602013853, "rewards/rejected": -12.756966983570772, "step": 2597 }, { "epoch": 0.7120734548444566, "grad_norm": 5.6875, "kl": 5.368436336517334, "learning_rate": 5e-06, "logits/chosen": -24979872.0, "logits/rejected": -12482886.666666666, "logps/chosen": -487.506591796875, "logps/rejected": -703.9689127604166, "loss": 0.0127, "rewards/chosen": 7.370084762573242, "rewards/margins": 19.19927406311035, "rewards/rejected": -11.82918930053711, "step": 2598 }, { "epoch": 0.7123475400849665, "grad_norm": 9.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -1884881.6, "logits/rejected": -11666035.42857143, "logps/chosen": -581.86220703125, "logps/rejected": -597.8922293526786, "loss": 0.0225, "rewards/chosen": 8.544448852539062, "rewards/margins": 19.377467564174108, "rewards/rejected": -10.833018711635045, "step": 2599 }, { "epoch": 0.7126216253254762, "grad_norm": 2.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -1041317.8181818182, "logits/rejected": -16772930.461538462, "logps/chosen": -490.7529296875, "logps/rejected": -582.9695012019231, "loss": 0.0099, "rewards/chosen": 7.139339100230824, "rewards/margins": 19.16311698860222, "rewards/rejected": -12.023777888371395, "step": 2600 }, { "epoch": 0.712895710565986, "grad_norm": 3.84375, "kl": 0.24924597144126892, "learning_rate": 5e-06, "logits/chosen": -17407413.714285713, "logits/rejected": -17262435.2, "logps/chosen": -452.21634347098217, "logps/rejected": -656.78037109375, "loss": 0.0149, "rewards/chosen": 7.5985919407435825, "rewards/margins": 19.896324484688893, "rewards/rejected": -12.297732543945312, "step": 2601 }, { "epoch": 0.7131697958064959, "grad_norm": 8.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -23668497.066666666, "logits/rejected": -22868821.333333332, "logps/chosen": -391.18720703125, "logps/rejected": -629.4321831597222, "loss": 0.0689, "rewards/chosen": 5.778464762369792, "rewards/margins": 19.676724243164063, "rewards/rejected": -13.898259480794271, "step": 2602 }, { "epoch": 0.7134438810470056, "grad_norm": 3.078125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -11475128.8, "logits/rejected": -18912928.0, "logps/chosen": -427.32880859375, "logps/rejected": -566.7313058035714, "loss": 0.0083, "rewards/chosen": 7.2669921875, "rewards/margins": 21.902714756556918, "rewards/rejected": -14.63572256905692, "step": 2603 }, { "epoch": 0.7137179662875154, "grad_norm": 7.65625, "kl": 5.387345790863037, "learning_rate": 5e-06, "logits/chosen": -12772664.533333333, "logits/rejected": -14305945.777777778, "logps/chosen": -388.43883463541664, "logps/rejected": -489.2320963541667, "loss": 0.0316, "rewards/chosen": 6.957390340169271, "rewards/margins": 20.93263634575738, "rewards/rejected": -13.975246005588108, "step": 2604 }, { "epoch": 0.7139920515280253, "grad_norm": 9.875, "kl": 6.356036186218262, "learning_rate": 5e-06, "logits/chosen": -24569220.0, "logits/rejected": -7228568.0, "logps/chosen": -424.05169677734375, "logps/rejected": -559.0640869140625, "loss": 0.0578, "rewards/chosen": 7.443178176879883, "rewards/margins": 18.857048988342285, "rewards/rejected": -11.413870811462402, "step": 2605 }, { "epoch": 0.714266136768535, "grad_norm": 0.76953125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -9560300.0, "logits/rejected": -20515349.333333332, "logps/chosen": -402.4003499348958, "logps/rejected": -562.216796875, "loss": 0.0029, "rewards/chosen": 7.326997756958008, "rewards/margins": 21.064994176228844, "rewards/rejected": -13.737996419270834, "step": 2606 }, { "epoch": 0.7145402220090448, "grad_norm": 10.1875, "kl": 0.17282883822917938, "learning_rate": 5e-06, "logits/chosen": -43103344.0, "logits/rejected": -22243078.0, "logps/chosen": -438.2596740722656, "logps/rejected": -445.7430419921875, "loss": 0.0496, "rewards/chosen": 6.877728462219238, "rewards/margins": 14.77550220489502, "rewards/rejected": -7.897773742675781, "step": 2607 }, { "epoch": 0.7148143072495546, "grad_norm": 4.125, "kl": 1.6081836223602295, "learning_rate": 5e-06, "logits/chosen": -33744759.46666667, "logits/rejected": -82061653.33333333, "logps/chosen": -384.42565104166664, "logps/rejected": -594.2899848090278, "loss": 0.0403, "rewards/chosen": 6.090129597981771, "rewards/margins": 20.30707312689887, "rewards/rejected": -14.216943528917101, "step": 2608 }, { "epoch": 0.7150883924900644, "grad_norm": 5.46875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -17126773.714285713, "logits/rejected": -21184783.05882353, "logps/chosen": -435.0693359375, "logps/rejected": -587.8583984375, "loss": 0.0228, "rewards/chosen": 6.591334751674107, "rewards/margins": 19.939990901145613, "rewards/rejected": -13.348656149471507, "step": 2609 }, { "epoch": 0.7153624777305742, "grad_norm": 5.0625, "kl": 3.7785301208496094, "learning_rate": 5e-06, "logits/chosen": -21725004.0, "logits/rejected": -41058376.0, "logps/chosen": -510.3192138671875, "logps/rejected": -669.19921875, "loss": 0.0108, "rewards/chosen": 7.788875102996826, "rewards/margins": 22.07545804977417, "rewards/rejected": -14.286582946777344, "step": 2610 }, { "epoch": 0.715636562971084, "grad_norm": 8.25, "kl": 5.578543663024902, "learning_rate": 5e-06, "logits/chosen": -22943620.57142857, "logits/rejected": -29064896.0, "logps/chosen": -440.3882533482143, "logps/rejected": -603.17880859375, "loss": 0.0113, "rewards/chosen": 7.379636492047991, "rewards/margins": 19.430806841169087, "rewards/rejected": -12.051170349121094, "step": 2611 }, { "epoch": 0.7159106482115938, "grad_norm": 6.875, "kl": 4.422034740447998, "learning_rate": 5e-06, "logits/chosen": -41410385.45454545, "logits/rejected": -35858439.384615384, "logps/chosen": -433.8800159801136, "logps/rejected": -482.0560772235577, "loss": 0.0548, "rewards/chosen": 6.203491904518821, "rewards/margins": 15.751679160378195, "rewards/rejected": -9.548187255859375, "step": 2612 }, { "epoch": 0.7161847334521037, "grad_norm": 4.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -8903846.153846154, "logits/rejected": -27268459.636363637, "logps/chosen": -436.8256084735577, "logps/rejected": -434.84126420454544, "loss": 0.0136, "rewards/chosen": 6.774804335374099, "rewards/margins": 18.247280947811955, "rewards/rejected": -11.472476612437855, "step": 2613 }, { "epoch": 0.7164588186926134, "grad_norm": 7.15625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -55081440.0, "logits/rejected": -16648913.333333334, "logps/chosen": -540.3990478515625, "logps/rejected": -537.2507731119791, "loss": 0.019, "rewards/chosen": 6.929258982340495, "rewards/margins": 20.69614028930664, "rewards/rejected": -13.766881306966146, "step": 2614 }, { "epoch": 0.7167329039331232, "grad_norm": 4.15625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -37252768.0, "logits/rejected": -31668322.666666668, "logps/chosen": -445.1585286458333, "logps/rejected": -664.8103434244791, "loss": 0.0112, "rewards/chosen": 7.742584864298503, "rewards/margins": 21.102001825968426, "rewards/rejected": -13.359416961669922, "step": 2615 }, { "epoch": 0.717006989173633, "grad_norm": 4.90625, "kl": 0.08795039355754852, "learning_rate": 5e-06, "logits/chosen": -16743741.090909092, "logits/rejected": -17268966.153846152, "logps/chosen": -415.18625710227275, "logps/rejected": -589.3641075721154, "loss": 0.0135, "rewards/chosen": 7.322934237393466, "rewards/margins": 20.777966586026277, "rewards/rejected": -13.455032348632812, "step": 2616 }, { "epoch": 0.7172810744141428, "grad_norm": 8.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -36734280.0, "logits/rejected": -5946881.333333333, "logps/chosen": -411.3572591145833, "logps/rejected": -533.7097981770834, "loss": 0.0241, "rewards/chosen": 7.283722559611003, "rewards/margins": 17.73341178894043, "rewards/rejected": -10.449689229329428, "step": 2617 }, { "epoch": 0.7175551596546526, "grad_norm": 2.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 9141596.666666666, "logits/rejected": -18615553.333333332, "logps/chosen": -360.5467122395833, "logps/rejected": -479.8835042317708, "loss": 0.0096, "rewards/chosen": 5.957888285319011, "rewards/margins": 16.21747080485026, "rewards/rejected": -10.25958251953125, "step": 2618 }, { "epoch": 0.7178292448951624, "grad_norm": 1.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -2129838.769230769, "logits/rejected": -24572136.727272727, "logps/chosen": -357.47765174278845, "logps/rejected": -725.2999822443181, "loss": 0.0056, "rewards/chosen": 6.53419201190655, "rewards/margins": 18.262070929253852, "rewards/rejected": -11.7278789173473, "step": 2619 }, { "epoch": 0.7181033301356722, "grad_norm": 8.8125, "kl": 13.055513381958008, "learning_rate": 5e-06, "logits/chosen": -28203749.647058822, "logits/rejected": -14678465.142857144, "logps/chosen": -427.32286879595586, "logps/rejected": -489.75551060267856, "loss": 0.0208, "rewards/chosen": 7.491857640883502, "rewards/margins": 22.24682546663685, "rewards/rejected": -14.754967825753349, "step": 2620 }, { "epoch": 0.7183774153761819, "grad_norm": 6.46875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -33049552.0, "logits/rejected": 48069270.4, "logps/chosen": -430.85996791294644, "logps/rejected": -710.462451171875, "loss": 0.0118, "rewards/chosen": 7.970747811453683, "rewards/margins": 21.836842782156808, "rewards/rejected": -13.866094970703125, "step": 2621 }, { "epoch": 0.7186515006166918, "grad_norm": 5.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 40117672.0, "logits/rejected": -26303004.0, "logps/chosen": -363.1537170410156, "logps/rejected": -628.488525390625, "loss": 0.021, "rewards/chosen": 5.653667449951172, "rewards/margins": 17.930513381958008, "rewards/rejected": -12.276845932006836, "step": 2622 }, { "epoch": 0.7189255858572016, "grad_norm": 5.09375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -42207694.222222224, "logits/rejected": -21217305.6, "logps/chosen": -398.2723795572917, "logps/rejected": -507.60016276041665, "loss": 0.0295, "rewards/chosen": 7.790707906087239, "rewards/margins": 21.107848612467446, "rewards/rejected": -13.317140706380208, "step": 2623 }, { "epoch": 0.7191996710977114, "grad_norm": 7.09375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -18114131.692307692, "logits/rejected": -11058281.454545455, "logps/chosen": -447.2825270432692, "logps/rejected": -558.1444424715909, "loss": 0.0213, "rewards/chosen": 8.043950594388521, "rewards/margins": 20.287110495400597, "rewards/rejected": -12.243159901012074, "step": 2624 }, { "epoch": 0.7194737563382212, "grad_norm": 1.046875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -38716721.45454545, "logits/rejected": -37603763.692307696, "logps/chosen": -420.10440340909093, "logps/rejected": -644.1859975961538, "loss": 0.0024, "rewards/chosen": 6.889371004971591, "rewards/margins": 20.190524334674116, "rewards/rejected": -13.301153329702524, "step": 2625 }, { "epoch": 0.719747841578731, "grad_norm": 9.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -23059124.57142857, "logits/rejected": -8516277.6, "logps/chosen": -434.07205636160717, "logps/rejected": -412.666845703125, "loss": 0.0396, "rewards/chosen": 5.047840118408203, "rewards/margins": 15.06540298461914, "rewards/rejected": -10.017562866210938, "step": 2626 }, { "epoch": 0.7200219268192408, "grad_norm": 8.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -15920224.0, "logits/rejected": -28541712.0, "logps/chosen": -399.9828776041667, "logps/rejected": -512.5474175347222, "loss": 0.0714, "rewards/chosen": 5.4299875895182295, "rewards/margins": 19.247387356228298, "rewards/rejected": -13.81739976671007, "step": 2627 }, { "epoch": 0.7202960120597506, "grad_norm": 5.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -15234440.727272727, "logits/rejected": -24123827.692307692, "logps/chosen": -409.4060724431818, "logps/rejected": -737.2739633413462, "loss": 0.0318, "rewards/chosen": 6.320637096058238, "rewards/margins": 25.159839976917613, "rewards/rejected": -18.839202880859375, "step": 2628 }, { "epoch": 0.7205700973002603, "grad_norm": 8.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -11890407.384615384, "logits/rejected": -26820925.09090909, "logps/chosen": -473.0417668269231, "logps/rejected": -461.57705965909093, "loss": 0.0359, "rewards/chosen": 6.177974994365986, "rewards/margins": 17.94707761110959, "rewards/rejected": -11.769102616743607, "step": 2629 }, { "epoch": 0.7208441825407702, "grad_norm": 4.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -13238995.555555556, "logits/rejected": -29117013.333333332, "logps/chosen": -502.30620659722223, "logps/rejected": -565.0411458333333, "loss": 0.0267, "rewards/chosen": 8.835350884331596, "rewards/margins": 22.946319919162327, "rewards/rejected": -14.11096903483073, "step": 2630 }, { "epoch": 0.72111826778128, "grad_norm": 5.75, "kl": 6.59589147567749, "learning_rate": 5e-06, "logits/chosen": -16324375.466666667, "logits/rejected": -26408284.444444444, "logps/chosen": -386.98492838541665, "logps/rejected": -582.5594618055555, "loss": 0.0123, "rewards/chosen": 7.666431172688802, "rewards/margins": 18.87448476155599, "rewards/rejected": -11.208053588867188, "step": 2631 }, { "epoch": 0.7213923530217897, "grad_norm": 8.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -18346444.307692308, "logits/rejected": -40178769.45454545, "logps/chosen": -341.68783804086536, "logps/rejected": -598.6545188210227, "loss": 0.06, "rewards/chosen": 4.641294626089243, "rewards/margins": 18.357324400148194, "rewards/rejected": -13.71602977405895, "step": 2632 }, { "epoch": 0.7216664382622996, "grad_norm": 1.8671875, "kl": 2.5956740379333496, "learning_rate": 5e-06, "logits/chosen": -39821383.384615384, "logits/rejected": -33585364.36363637, "logps/chosen": -502.35606971153845, "logps/rejected": -469.83163174715907, "loss": 0.005, "rewards/chosen": 7.126857464130108, "rewards/margins": 19.90161970445326, "rewards/rejected": -12.774762240323154, "step": 2633 }, { "epoch": 0.7219405235028094, "grad_norm": 15.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -11604795.555555556, "logits/rejected": -12734685.866666667, "logps/chosen": -294.1544596354167, "logps/rejected": -517.0826822916666, "loss": 0.064, "rewards/chosen": 3.6587740580240884, "rewards/margins": 15.644377899169921, "rewards/rejected": -11.985603841145833, "step": 2634 }, { "epoch": 0.7222146087433192, "grad_norm": 6.90625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -28300726.153846152, "logits/rejected": -14398190.545454545, "logps/chosen": -381.1809645432692, "logps/rejected": -525.0435458096591, "loss": 0.0404, "rewards/chosen": 7.053570087139423, "rewards/margins": 16.655742938701923, "rewards/rejected": -9.6021728515625, "step": 2635 }, { "epoch": 0.722488693983829, "grad_norm": 1.140625, "kl": 0.38903936743736267, "learning_rate": 5e-06, "logits/chosen": -23953445.333333332, "logits/rejected": -30151645.333333332, "logps/chosen": -413.8717854817708, "logps/rejected": -549.096923828125, "loss": 0.004, "rewards/chosen": 6.656779607137044, "rewards/margins": 20.757815678914387, "rewards/rejected": -14.101036071777344, "step": 2636 }, { "epoch": 0.7227627792243387, "grad_norm": 7.25, "kl": 11.249488830566406, "learning_rate": 5e-06, "logits/chosen": -19357534.769230768, "logits/rejected": -23883226.181818184, "logps/chosen": -531.1221829927885, "logps/rejected": -593.3679421164773, "loss": 0.0274, "rewards/chosen": 8.647554250863882, "rewards/margins": 19.95525744244769, "rewards/rejected": -11.307703191583807, "step": 2637 }, { "epoch": 0.7230368644648486, "grad_norm": 2.46875, "kl": 9.155160903930664, "learning_rate": 5e-06, "logits/chosen": -20671814.4, "logits/rejected": -46603064.88888889, "logps/chosen": -442.0892578125, "logps/rejected": -563.8114691840278, "loss": 0.0079, "rewards/chosen": 6.885252888997396, "rewards/margins": 19.45135735405816, "rewards/rejected": -12.566104465060764, "step": 2638 }, { "epoch": 0.7233109497053584, "grad_norm": 1.140625, "kl": 1.1110255718231201, "learning_rate": 5e-06, "logits/chosen": -8066301.714285715, "logits/rejected": -23563267.2, "logps/chosen": -526.4709123883929, "logps/rejected": -490.35283203125, "loss": 0.0033, "rewards/chosen": 9.421636308942523, "rewards/margins": 19.882070268903462, "rewards/rejected": -10.460433959960938, "step": 2639 }, { "epoch": 0.7235850349458681, "grad_norm": 5.5625, "kl": 0.7222824096679688, "learning_rate": 5e-06, "logits/chosen": 10624040.615384616, "logits/rejected": -17292737.454545453, "logps/chosen": -440.3251953125, "logps/rejected": -726.3124112215909, "loss": 0.0103, "rewards/chosen": 7.533931438739483, "rewards/margins": 19.464338876150705, "rewards/rejected": -11.93040743741122, "step": 2640 }, { "epoch": 0.723859120186378, "grad_norm": 6.96875, "kl": 7.835216522216797, "learning_rate": 5e-06, "logits/chosen": -8518411.333333334, "logits/rejected": -17052945.333333332, "logps/chosen": -415.7190348307292, "logps/rejected": -678.9264322916666, "loss": 0.0669, "rewards/chosen": 7.65081787109375, "rewards/margins": 23.718453725179035, "rewards/rejected": -16.067635854085285, "step": 2641 }, { "epoch": 0.7241332054268877, "grad_norm": 2.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -29463648.0, "logits/rejected": -38654500.571428575, "logps/chosen": -463.882373046875, "logps/rejected": -634.5029296875, "loss": 0.0048, "rewards/chosen": 8.336727905273438, "rewards/margins": 23.07566419328962, "rewards/rejected": -14.738936288016182, "step": 2642 }, { "epoch": 0.7244072906673975, "grad_norm": 3.28125, "kl": 2.9897868633270264, "learning_rate": 5e-06, "logits/chosen": -37685826.666666664, "logits/rejected": -6739244.666666667, "logps/chosen": -398.4989013671875, "logps/rejected": -541.1667073567709, "loss": 0.0105, "rewards/chosen": 7.515331268310547, "rewards/margins": 16.12211481730143, "rewards/rejected": -8.606783548990885, "step": 2643 }, { "epoch": 0.7246813759079074, "grad_norm": 1.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -41832029.538461536, "logits/rejected": -22689472.0, "logps/chosen": -535.3582857572115, "logps/rejected": -635.9125532670455, "loss": 0.0043, "rewards/chosen": 8.62957763671875, "rewards/margins": 21.522001786665484, "rewards/rejected": -12.892424149946732, "step": 2644 }, { "epoch": 0.7249554611484171, "grad_norm": 6.21875, "kl": 8.101367950439453, "learning_rate": 5e-06, "logits/chosen": -18781586.666666668, "logits/rejected": -16085928.0, "logps/chosen": -390.6508382161458, "logps/rejected": -574.9107259114584, "loss": 0.0396, "rewards/chosen": 7.932188669840495, "rewards/margins": 19.172074635823567, "rewards/rejected": -11.239885965983072, "step": 2645 }, { "epoch": 0.725229546388927, "grad_norm": 4.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -12266331.2, "logits/rejected": -38845184.0, "logps/chosen": -410.74853515625, "logps/rejected": -681.3715122767857, "loss": 0.0163, "rewards/chosen": 6.155493927001953, "rewards/margins": 20.071784210205077, "rewards/rejected": -13.916290283203125, "step": 2646 }, { "epoch": 0.7255036316294368, "grad_norm": 2.640625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 11509166.222222222, "logits/rejected": -9818698.666666666, "logps/chosen": -432.76337348090277, "logps/rejected": -486.0810546875, "loss": 0.0069, "rewards/chosen": 7.5594329833984375, "rewards/margins": 18.919894409179687, "rewards/rejected": -11.36046142578125, "step": 2647 }, { "epoch": 0.7257777168699465, "grad_norm": 7.59375, "kl": 0.8262647390365601, "learning_rate": 5e-06, "logits/chosen": -27047241.846153848, "logits/rejected": -13721460.363636363, "logps/chosen": -326.0248272235577, "logps/rejected": -566.8916015625, "loss": 0.0324, "rewards/chosen": 6.810566828801082, "rewards/margins": 17.545033381535458, "rewards/rejected": -10.734466552734375, "step": 2648 }, { "epoch": 0.7260518021104564, "grad_norm": 4.21875, "kl": 0.4468180537223816, "learning_rate": 5e-06, "logits/chosen": -28499811.2, "logits/rejected": -46959341.71428572, "logps/chosen": -367.27158203125, "logps/rejected": -558.4281529017857, "loss": 0.013, "rewards/chosen": 6.025337982177734, "rewards/margins": 16.5455689566476, "rewards/rejected": -10.520230974469866, "step": 2649 }, { "epoch": 0.7263258873509661, "grad_norm": 3.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -14424458.666666666, "logits/rejected": -23843648.0, "logps/chosen": -422.3870442708333, "logps/rejected": -566.1597086588541, "loss": 0.01, "rewards/chosen": 6.761892954508464, "rewards/margins": 20.775811513264973, "rewards/rejected": -14.01391855875651, "step": 2650 }, { "epoch": 0.7265999725914759, "grad_norm": 2.03125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -21368925.714285713, "logits/rejected": -22719265.88235294, "logps/chosen": -408.48880440848217, "logps/rejected": -480.9989659926471, "loss": 0.0045, "rewards/chosen": 6.818302699497768, "rewards/margins": 16.672876566398045, "rewards/rejected": -9.854573866900276, "step": 2651 }, { "epoch": 0.7268740578319858, "grad_norm": 13.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 1877067.5, "logits/rejected": -24088514.666666668, "logps/chosen": -382.2664388020833, "logps/rejected": -508.0021158854167, "loss": 0.0354, "rewards/chosen": 6.698601404825847, "rewards/margins": 16.32672627766927, "rewards/rejected": -9.628124872843424, "step": 2652 }, { "epoch": 0.7271481430724955, "grad_norm": 9.25, "kl": 2.2567317485809326, "learning_rate": 5e-06, "logits/chosen": -22453642.666666668, "logits/rejected": -12969736.0, "logps/chosen": -416.3486735026042, "logps/rejected": -635.8216552734375, "loss": 0.0802, "rewards/chosen": 6.615076065063477, "rewards/margins": 21.004299799601235, "rewards/rejected": -14.38922373453776, "step": 2653 }, { "epoch": 0.7274222283130053, "grad_norm": 6.21875, "kl": 4.242058753967285, "learning_rate": 5e-06, "logits/chosen": -20473276.0, "logits/rejected": -5119358.0, "logps/chosen": -390.36260986328125, "logps/rejected": -399.891845703125, "loss": 0.0501, "rewards/chosen": 6.4406280517578125, "rewards/margins": 14.36224889755249, "rewards/rejected": -7.921620845794678, "step": 2654 }, { "epoch": 0.7276963135535152, "grad_norm": 11.75, "kl": 14.508136749267578, "learning_rate": 5e-06, "logits/chosen": -4456695.142857143, "logits/rejected": -21699168.0, "logps/chosen": -480.56131417410717, "logps/rejected": -528.299755859375, "loss": 0.1056, "rewards/chosen": 5.676660810198102, "rewards/margins": 15.570133863176618, "rewards/rejected": -9.893473052978516, "step": 2655 }, { "epoch": 0.7279703987940249, "grad_norm": 8.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -590743.0, "logits/rejected": -23004092.0, "logps/chosen": -406.42864990234375, "logps/rejected": -466.5083923339844, "loss": 0.0332, "rewards/chosen": 5.842583656311035, "rewards/margins": 15.703547477722168, "rewards/rejected": -9.860963821411133, "step": 2656 }, { "epoch": 0.7282444840345348, "grad_norm": 4.59375, "kl": 4.000467300415039, "learning_rate": 5e-06, "logits/chosen": -23862807.272727273, "logits/rejected": -10308596.307692308, "logps/chosen": -396.99365234375, "logps/rejected": -487.23550180288464, "loss": 0.0709, "rewards/chosen": 5.815146012739702, "rewards/margins": 16.69126342560028, "rewards/rejected": -10.876117412860577, "step": 2657 }, { "epoch": 0.7285185692750445, "grad_norm": 7.125, "kl": 6.423708915710449, "learning_rate": 5e-06, "logits/chosen": -16287708.57142857, "logits/rejected": -14522548.8, "logps/chosen": -407.4361049107143, "logps/rejected": -330.4301025390625, "loss": 0.0378, "rewards/chosen": 6.792657034737723, "rewards/margins": 12.731528799874443, "rewards/rejected": -5.938871765136719, "step": 2658 }, { "epoch": 0.7287926545155543, "grad_norm": 3.578125, "kl": 11.26325511932373, "learning_rate": 5e-06, "logits/chosen": -10232261.333333334, "logits/rejected": -33049280.0, "logps/chosen": -424.73072916666666, "logps/rejected": -502.05653211805554, "loss": 0.0166, "rewards/chosen": 7.164010111490885, "rewards/margins": 17.092669338650175, "rewards/rejected": -9.928659227159288, "step": 2659 }, { "epoch": 0.7290667397560642, "grad_norm": 11.125, "kl": 2.549776792526245, "learning_rate": 5e-06, "logits/chosen": -17107891.692307692, "logits/rejected": -13918266.181818182, "logps/chosen": -495.7174729567308, "logps/rejected": -480.87215909090907, "loss": 0.0404, "rewards/chosen": 8.247179471529448, "rewards/margins": 17.78483400144777, "rewards/rejected": -9.537654529918324, "step": 2660 }, { "epoch": 0.7293408249965739, "grad_norm": 1.9765625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -14465568.888888888, "logits/rejected": -22791622.4, "logps/chosen": -483.24609375, "logps/rejected": -595.3781901041667, "loss": 0.0053, "rewards/chosen": 7.443887498643663, "rewards/margins": 20.73209550645616, "rewards/rejected": -13.2882080078125, "step": 2661 }, { "epoch": 0.7296149102370837, "grad_norm": 7.90625, "kl": 1.6464920043945312, "learning_rate": 5e-06, "logits/chosen": 436384.6153846154, "logits/rejected": -20496110.545454547, "logps/chosen": -447.39246544471155, "logps/rejected": -449.4689275568182, "loss": 0.0333, "rewards/chosen": 6.689586932842548, "rewards/margins": 17.04253451474063, "rewards/rejected": -10.352947581898082, "step": 2662 }, { "epoch": 0.7298889954775936, "grad_norm": 4.96875, "kl": 4.639494895935059, "learning_rate": 5e-06, "logits/chosen": -14362264.615384616, "logits/rejected": -6753346.909090909, "logps/chosen": -359.55337289663464, "logps/rejected": -364.4074041193182, "loss": 0.0267, "rewards/chosen": 6.950752258300781, "rewards/margins": 13.3333608453924, "rewards/rejected": -6.382608587091619, "step": 2663 }, { "epoch": 0.7301630807181033, "grad_norm": 1.5546875, "kl": 2.6637420654296875, "learning_rate": 5e-06, "logits/chosen": -9587412.923076924, "logits/rejected": -43200637.09090909, "logps/chosen": -337.8242938701923, "logps/rejected": -519.5831409801136, "loss": 0.0068, "rewards/chosen": 7.033336345966045, "rewards/margins": 19.36573103257826, "rewards/rejected": -12.332394686612217, "step": 2664 }, { "epoch": 0.7304371659586131, "grad_norm": 3.34375, "kl": 7.239335536956787, "learning_rate": 5e-06, "logits/chosen": -5790277.333333333, "logits/rejected": -29709900.8, "logps/chosen": -441.68994140625, "logps/rejected": -591.6975260416667, "loss": 0.0146, "rewards/chosen": 8.48467763264974, "rewards/margins": 19.307146708170574, "rewards/rejected": -10.822469075520834, "step": 2665 }, { "epoch": 0.730711251199123, "grad_norm": 6.09375, "kl": 3.5676207542419434, "learning_rate": 5e-06, "logits/chosen": 12558645.333333334, "logits/rejected": -26655235.555555556, "logps/chosen": -437.54401041666665, "logps/rejected": -390.089599609375, "loss": 0.0249, "rewards/chosen": 7.190079752604166, "rewards/margins": 16.055739678276908, "rewards/rejected": -8.865659925672743, "step": 2666 }, { "epoch": 0.7309853364396327, "grad_norm": 5.96875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 1859396.0, "logits/rejected": -3254543.6, "logps/chosen": -323.53585379464283, "logps/rejected": -539.581689453125, "loss": 0.0535, "rewards/chosen": 6.243833814348493, "rewards/margins": 16.00331028529576, "rewards/rejected": -9.759476470947266, "step": 2667 }, { "epoch": 0.7312594216801426, "grad_norm": 13.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -13114403.42857143, "logits/rejected": -18984442.352941178, "logps/chosen": -406.21435546875, "logps/rejected": -439.49333639705884, "loss": 0.0532, "rewards/chosen": 5.770454406738281, "rewards/margins": 14.412145726821002, "rewards/rejected": -8.641691320082721, "step": 2668 }, { "epoch": 0.7315335069206523, "grad_norm": 2.390625, "kl": 9.330307006835938, "learning_rate": 5e-06, "logits/chosen": -32738601.846153848, "logits/rejected": -21724002.90909091, "logps/chosen": -547.0467998798077, "logps/rejected": -484.0672052556818, "loss": 0.0043, "rewards/chosen": 9.09197998046875, "rewards/margins": 17.660683371803977, "rewards/rejected": -8.568703391335227, "step": 2669 }, { "epoch": 0.7318075921611621, "grad_norm": 1.5078125, "kl": 3.192103147506714, "learning_rate": 5e-06, "logits/chosen": -24739868.0, "logits/rejected": -22145558.0, "logps/chosen": -347.2518310546875, "logps/rejected": -583.3622436523438, "loss": 0.0047, "rewards/chosen": 7.662715911865234, "rewards/margins": 19.51686954498291, "rewards/rejected": -11.854153633117676, "step": 2670 }, { "epoch": 0.732081677401672, "grad_norm": 0.69921875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -30079112.727272727, "logits/rejected": 12061996.307692308, "logps/chosen": -404.9815784801136, "logps/rejected": -557.63525390625, "loss": 0.0017, "rewards/chosen": 8.522439436479049, "rewards/margins": 20.939773186103448, "rewards/rejected": -12.4173337496244, "step": 2671 }, { "epoch": 0.7323557626421817, "grad_norm": 14.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -41582450.666666664, "logits/rejected": -2060945.0, "logps/chosen": -374.5331217447917, "logps/rejected": -551.2835693359375, "loss": 0.0485, "rewards/chosen": 6.556082407633464, "rewards/margins": 17.076361338297527, "rewards/rejected": -10.520278930664062, "step": 2672 }, { "epoch": 0.7326298478826915, "grad_norm": 4.875, "kl": 7.356524467468262, "learning_rate": 5e-06, "logits/chosen": 9911488.0, "logits/rejected": -3925164.4444444445, "logps/chosen": -578.3005859375, "logps/rejected": -476.63628472222223, "loss": 0.0114, "rewards/chosen": 7.394559733072916, "rewards/margins": 17.06181131998698, "rewards/rejected": -9.667251586914062, "step": 2673 }, { "epoch": 0.7329039331232013, "grad_norm": 7.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -26565868.307692308, "logits/rejected": -29944183.272727273, "logps/chosen": -329.37582632211536, "logps/rejected": -642.3413529829545, "loss": 0.0441, "rewards/chosen": 6.750729487492488, "rewards/margins": 21.983664612670044, "rewards/rejected": -15.232935125177557, "step": 2674 }, { "epoch": 0.7331780183637111, "grad_norm": 7.125, "kl": 2.4564433097839355, "learning_rate": 5e-06, "logits/chosen": -23621222.4, "logits/rejected": -28712146.285714287, "logps/chosen": -380.5232666015625, "logps/rejected": -580.865234375, "loss": 0.0307, "rewards/chosen": 6.953469085693359, "rewards/margins": 19.727621568952287, "rewards/rejected": -12.774152483258929, "step": 2675 }, { "epoch": 0.7334521036042209, "grad_norm": 5.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -17571066.666666668, "logits/rejected": -11420470.666666666, "logps/chosen": -394.8638916015625, "logps/rejected": -550.5489095052084, "loss": 0.0243, "rewards/chosen": 7.3489030202229815, "rewards/margins": 18.798343022664387, "rewards/rejected": -11.449440002441406, "step": 2676 }, { "epoch": 0.7337261888447307, "grad_norm": 6.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -8887977.454545455, "logits/rejected": -14861508.923076924, "logps/chosen": -500.0654296875, "logps/rejected": -374.0075871394231, "loss": 0.0569, "rewards/chosen": 7.048998746004972, "rewards/margins": 17.236712369051848, "rewards/rejected": -10.187713623046875, "step": 2677 }, { "epoch": 0.7340002740852405, "grad_norm": 1.53125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -33299214.222222224, "logits/rejected": -15722738.133333333, "logps/chosen": -468.1647135416667, "logps/rejected": -627.5692057291667, "loss": 0.0052, "rewards/chosen": 8.221702575683594, "rewards/margins": 21.97362314860026, "rewards/rejected": -13.751920572916667, "step": 2678 }, { "epoch": 0.7342743593257504, "grad_norm": 8.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -20097604.0, "logits/rejected": -347573.0, "logps/chosen": -466.0044250488281, "logps/rejected": -604.3165893554688, "loss": 0.036, "rewards/chosen": 7.389938831329346, "rewards/margins": 19.85542917251587, "rewards/rejected": -12.465490341186523, "step": 2679 }, { "epoch": 0.7345484445662601, "grad_norm": 2.84375, "kl": 0.4533348083496094, "learning_rate": 5e-06, "logits/chosen": -22709916.444444444, "logits/rejected": -9480025.6, "logps/chosen": -565.3901909722222, "logps/rejected": -538.9626953125, "loss": 0.0068, "rewards/chosen": 8.878650241427952, "rewards/margins": 18.555783420138887, "rewards/rejected": -9.677133178710937, "step": 2680 }, { "epoch": 0.7348225298067699, "grad_norm": 3.90625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -31114008.888888888, "logits/rejected": -14157173.333333334, "logps/chosen": -433.6559244791667, "logps/rejected": -550.4983723958334, "loss": 0.0131, "rewards/chosen": 6.20028813680013, "rewards/margins": 19.885439300537108, "rewards/rejected": -13.685151163736979, "step": 2681 }, { "epoch": 0.7350966150472797, "grad_norm": 12.8125, "kl": 3.033669948577881, "learning_rate": 5e-06, "logits/chosen": -20267140.363636363, "logits/rejected": -22273947.076923076, "logps/chosen": -353.97099165482956, "logps/rejected": -421.50157752403845, "loss": 0.0645, "rewards/chosen": 5.686582391912287, "rewards/margins": 14.62991829185219, "rewards/rejected": -8.943335899939903, "step": 2682 }, { "epoch": 0.7353707002877895, "grad_norm": 4.09375, "kl": 0.7302157282829285, "learning_rate": 5e-06, "logits/chosen": -22874760.0, "logits/rejected": -9424528.0, "logps/chosen": -386.0685221354167, "logps/rejected": -426.5681966145833, "loss": 0.0153, "rewards/chosen": 8.218021392822266, "rewards/margins": 19.67214330037435, "rewards/rejected": -11.454121907552084, "step": 2683 }, { "epoch": 0.7356447855282993, "grad_norm": 6.84375, "kl": 0.6203429102897644, "learning_rate": 5e-06, "logits/chosen": -19367044.8, "logits/rejected": -17361378.285714287, "logps/chosen": -382.373828125, "logps/rejected": -372.33412388392856, "loss": 0.0217, "rewards/chosen": 7.104678344726563, "rewards/margins": 16.702584184919086, "rewards/rejected": -9.597905840192523, "step": 2684 }, { "epoch": 0.7359188707688091, "grad_norm": 4.78125, "kl": 0.47612762451171875, "learning_rate": 5e-06, "logits/chosen": -33877414.4, "logits/rejected": -12835200.888888888, "logps/chosen": -350.3816731770833, "logps/rejected": -486.6467013888889, "loss": 0.0167, "rewards/chosen": 6.806805928548177, "rewards/margins": 19.7803471883138, "rewards/rejected": -12.973541259765625, "step": 2685 }, { "epoch": 0.7361929560093189, "grad_norm": 10.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -14760059.076923076, "logits/rejected": -29002356.363636363, "logps/chosen": -304.20179161658655, "logps/rejected": -457.8099254261364, "loss": 0.0604, "rewards/chosen": 6.053218548114483, "rewards/margins": 15.882213299091045, "rewards/rejected": -9.828994750976562, "step": 2686 }, { "epoch": 0.7364670412498286, "grad_norm": 6.78125, "kl": 2.9663150310516357, "learning_rate": 5e-06, "logits/chosen": -32558716.8, "logits/rejected": 9230676.0, "logps/chosen": -419.816015625, "logps/rejected": -715.5191127232143, "loss": 0.0178, "rewards/chosen": 6.926384735107422, "rewards/margins": 21.50123563494001, "rewards/rejected": -14.574850899832589, "step": 2687 }, { "epoch": 0.7367411264903385, "grad_norm": 12.875, "kl": 4.707769393920898, "learning_rate": 5e-06, "logits/chosen": -28731330.666666668, "logits/rejected": -18602218.666666668, "logps/chosen": -361.1649576822917, "logps/rejected": -552.7494303385416, "loss": 0.059, "rewards/chosen": 5.571547190348308, "rewards/margins": 17.42926534016927, "rewards/rejected": -11.857718149820963, "step": 2688 }, { "epoch": 0.7370152117308483, "grad_norm": 8.625, "kl": 1.8038686513900757, "learning_rate": 5e-06, "logits/chosen": -7621378.666666667, "logits/rejected": -33234514.666666668, "logps/chosen": -410.6732584635417, "logps/rejected": -705.8971354166666, "loss": 0.0331, "rewards/chosen": 8.475722630818685, "rewards/margins": 19.64297421773275, "rewards/rejected": -11.167251586914062, "step": 2689 }, { "epoch": 0.7372892969713581, "grad_norm": 1.34375, "kl": 0.9315058588981628, "learning_rate": 5e-06, "logits/chosen": -13841827.692307692, "logits/rejected": -49173288.72727273, "logps/chosen": -506.80528846153845, "logps/rejected": -502.6536754261364, "loss": 0.0029, "rewards/chosen": 9.53683589054988, "rewards/margins": 19.33441002052147, "rewards/rejected": -9.797574129971592, "step": 2690 }, { "epoch": 0.7375633822118679, "grad_norm": 12.4375, "kl": 6.031213283538818, "learning_rate": 5e-06, "logits/chosen": -21074141.866666667, "logits/rejected": -28819873.777777776, "logps/chosen": -449.9322265625, "logps/rejected": -563.4959852430555, "loss": 0.0928, "rewards/chosen": 6.860644022623698, "rewards/margins": 18.51048533121745, "rewards/rejected": -11.64984130859375, "step": 2691 }, { "epoch": 0.7378374674523777, "grad_norm": 3.671875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -42777989.333333336, "logits/rejected": -25452802.666666668, "logps/chosen": -252.07084147135416, "logps/rejected": -614.5467122395834, "loss": 0.0643, "rewards/chosen": 4.269676844278972, "rewards/margins": 16.808427810668945, "rewards/rejected": -12.538750966389975, "step": 2692 }, { "epoch": 0.7381115526928875, "grad_norm": 7.59375, "kl": 2.275054454803467, "learning_rate": 5e-06, "logits/chosen": -19313840.0, "logits/rejected": -3923576.8, "logps/chosen": -403.49183872767856, "logps/rejected": -606.36015625, "loss": 0.074, "rewards/chosen": 5.261073521205357, "rewards/margins": 16.232603672572544, "rewards/rejected": -10.971530151367187, "step": 2693 }, { "epoch": 0.7383856379333973, "grad_norm": 4.59375, "kl": 2.4275588989257812, "learning_rate": 5e-06, "logits/chosen": -26998626.46153846, "logits/rejected": -20452834.90909091, "logps/chosen": -502.2555588942308, "logps/rejected": -563.9434481534091, "loss": 0.0231, "rewards/chosen": 6.957728459284856, "rewards/margins": 20.12239576219679, "rewards/rejected": -13.164667302911932, "step": 2694 }, { "epoch": 0.738659723173907, "grad_norm": 8.5, "kl": 6.591874599456787, "learning_rate": 5e-06, "logits/chosen": -13354048.0, "logits/rejected": -17558184.0, "logps/chosen": -327.16033935546875, "logps/rejected": -491.4618225097656, "loss": 0.081, "rewards/chosen": 5.672626495361328, "rewards/margins": 15.484601974487305, "rewards/rejected": -9.811975479125977, "step": 2695 }, { "epoch": 0.7389338084144169, "grad_norm": 12.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -23326512.0, "logits/rejected": -13556891.0, "logps/chosen": -434.98822021484375, "logps/rejected": -535.5385131835938, "loss": 0.0301, "rewards/chosen": 7.904122829437256, "rewards/margins": 19.703553676605225, "rewards/rejected": -11.799430847167969, "step": 2696 }, { "epoch": 0.7392078936549267, "grad_norm": 7.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -20074132.923076924, "logits/rejected": -41803296.0, "logps/chosen": -356.7965745192308, "logps/rejected": -663.5252574573864, "loss": 0.0303, "rewards/chosen": 4.801141885610727, "rewards/margins": 20.729324314144108, "rewards/rejected": -15.92818242853338, "step": 2697 }, { "epoch": 0.7394819788954364, "grad_norm": 7.375, "kl": 1.4415035247802734, "learning_rate": 5e-06, "logits/chosen": -30114603.42857143, "logits/rejected": -13243134.4, "logps/chosen": -477.9063197544643, "logps/rejected": -501.009423828125, "loss": 0.015, "rewards/chosen": 7.140796661376953, "rewards/margins": 18.808121490478516, "rewards/rejected": -11.667324829101563, "step": 2698 }, { "epoch": 0.7397560641359463, "grad_norm": 1.375, "kl": 3.1477432250976562, "learning_rate": 5e-06, "logits/chosen": -23600864.0, "logits/rejected": -20144937.333333332, "logps/chosen": -457.1627604166667, "logps/rejected": -529.58251953125, "loss": 0.0038, "rewards/chosen": 7.380817413330078, "rewards/margins": 19.186308542887367, "rewards/rejected": -11.805491129557291, "step": 2699 }, { "epoch": 0.7400301493764561, "grad_norm": 12.9375, "kl": 7.984150409698486, "learning_rate": 5e-06, "logits/chosen": -19824832.0, "logits/rejected": -33133641.6, "logps/chosen": -344.44789341517856, "logps/rejected": -502.24052734375, "loss": 0.0682, "rewards/chosen": 6.361656188964844, "rewards/margins": 15.731674194335938, "rewards/rejected": -9.370018005371094, "step": 2700 }, { "epoch": 0.7403042346169659, "grad_norm": 11.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 3246681.1428571427, "logits/rejected": -20444492.8, "logps/chosen": -437.06138392857144, "logps/rejected": -488.16240234375, "loss": 0.0689, "rewards/chosen": 6.747227260044643, "rewards/margins": 15.07874276297433, "rewards/rejected": -8.331515502929687, "step": 2701 }, { "epoch": 0.7405783198574757, "grad_norm": 4.90625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -17645602.666666668, "logits/rejected": -13632040.0, "logps/chosen": -361.2789713541667, "logps/rejected": -738.81396484375, "loss": 0.025, "rewards/chosen": 6.29255739847819, "rewards/margins": 22.557297388712566, "rewards/rejected": -16.264739990234375, "step": 2702 }, { "epoch": 0.7408524050979854, "grad_norm": 5.6875, "kl": 8.14381217956543, "learning_rate": 5e-06, "logits/chosen": -20145149.714285713, "logits/rejected": 7958145.882352941, "logps/chosen": -403.0532924107143, "logps/rejected": -537.9503676470588, "loss": 0.0164, "rewards/chosen": 6.654112134660993, "rewards/margins": 19.87265479464491, "rewards/rejected": -13.218542659983916, "step": 2703 }, { "epoch": 0.7411264903384953, "grad_norm": 5.5625, "kl": 10.222280502319336, "learning_rate": 5e-06, "logits/chosen": 3853473.230769231, "logits/rejected": -43322225.45454545, "logps/chosen": -507.53617037259613, "logps/rejected": -542.2218128551136, "loss": 0.0523, "rewards/chosen": 8.617762052095854, "rewards/margins": 21.3530359334879, "rewards/rejected": -12.735273881392045, "step": 2704 }, { "epoch": 0.7414005755790051, "grad_norm": 9.0, "kl": 5.414065837860107, "learning_rate": 5e-06, "logits/chosen": -7149730.0, "logits/rejected": -18844080.0, "logps/chosen": -369.1717529296875, "logps/rejected": -374.7088317871094, "loss": 0.0777, "rewards/chosen": 6.8678388595581055, "rewards/margins": 15.225717544555664, "rewards/rejected": -8.357878684997559, "step": 2705 }, { "epoch": 0.7416746608195148, "grad_norm": 3.0625, "kl": 4.72934627532959, "learning_rate": 5e-06, "logits/chosen": -11517253.0, "logits/rejected": -9744127.0, "logps/chosen": -340.8044738769531, "logps/rejected": -492.0197448730469, "loss": 0.0483, "rewards/chosen": 5.96434211730957, "rewards/margins": 14.80648136138916, "rewards/rejected": -8.84213924407959, "step": 2706 }, { "epoch": 0.7419487460600247, "grad_norm": 3.234375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -23483281.777777776, "logits/rejected": -36414685.86666667, "logps/chosen": -403.9538302951389, "logps/rejected": -600.3591145833333, "loss": 0.0072, "rewards/chosen": 7.825984530978733, "rewards/margins": 19.032791985405815, "rewards/rejected": -11.206807454427084, "step": 2707 }, { "epoch": 0.7422228313005345, "grad_norm": 10.875, "kl": 9.122020721435547, "learning_rate": 5e-06, "logits/chosen": -9592632.0, "logits/rejected": -21673325.333333332, "logps/chosen": -499.9569091796875, "logps/rejected": -605.9954427083334, "loss": 0.0474, "rewards/chosen": 7.771870930989583, "rewards/margins": 18.419994354248047, "rewards/rejected": -10.648123423258463, "step": 2708 }, { "epoch": 0.7424969165410442, "grad_norm": 3.828125, "kl": 1.7049955129623413, "learning_rate": 5e-06, "logits/chosen": -29805609.411764707, "logits/rejected": -19518891.42857143, "logps/chosen": -373.91647518382354, "logps/rejected": -355.66904994419644, "loss": 0.0134, "rewards/chosen": 6.804925357594209, "rewards/margins": 16.155485137170103, "rewards/rejected": -9.350559779575892, "step": 2709 }, { "epoch": 0.7427710017815541, "grad_norm": 4.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -28922334.222222224, "logits/rejected": 10980142.933333334, "logps/chosen": -364.59619140625, "logps/rejected": -733.05, "loss": 0.0056, "rewards/chosen": 6.160168117947048, "rewards/margins": 23.552662828233508, "rewards/rejected": -17.39249471028646, "step": 2710 }, { "epoch": 0.7430450870220638, "grad_norm": 5.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 3961905.777777778, "logits/rejected": -19583360.0, "logps/chosen": -378.3442111545139, "logps/rejected": -583.7723958333333, "loss": 0.0301, "rewards/chosen": 7.803306579589844, "rewards/margins": 19.08994801839193, "rewards/rejected": -11.286641438802084, "step": 2711 }, { "epoch": 0.7433191722625737, "grad_norm": 7.96875, "kl": 1.345346450805664, "learning_rate": 5e-06, "logits/chosen": -25207694.769230768, "logits/rejected": 20514605.09090909, "logps/chosen": -384.21548227163464, "logps/rejected": -692.5055042613636, "loss": 0.0638, "rewards/chosen": 6.101988572340745, "rewards/margins": 15.49294995928144, "rewards/rejected": -9.390961386940695, "step": 2712 }, { "epoch": 0.7435932575030835, "grad_norm": 1.609375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -7485404.363636363, "logits/rejected": -6565165.538461538, "logps/chosen": -549.1693004261364, "logps/rejected": -453.6696213942308, "loss": 0.0052, "rewards/chosen": 8.236290671608664, "rewards/margins": 18.123953119024527, "rewards/rejected": -9.887662447415865, "step": 2713 }, { "epoch": 0.7438673427435932, "grad_norm": 4.03125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -3801290.1818181816, "logits/rejected": -20813449.846153848, "logps/chosen": -416.0980113636364, "logps/rejected": -610.6669170673077, "loss": 0.0147, "rewards/chosen": 5.925612016157671, "rewards/margins": 17.842815479198535, "rewards/rejected": -11.917203463040865, "step": 2714 }, { "epoch": 0.7441414279841031, "grad_norm": 7.90625, "kl": 3.0875232219696045, "learning_rate": 5e-06, "logits/chosen": -14519560.727272727, "logits/rejected": 5525094.769230769, "logps/chosen": -545.3326526988636, "logps/rejected": -685.5635516826923, "loss": 0.0218, "rewards/chosen": 7.926984613591975, "rewards/margins": 23.105278868775265, "rewards/rejected": -15.178294255183292, "step": 2715 }, { "epoch": 0.7444155132246129, "grad_norm": 1.21875, "kl": 5.225282669067383, "learning_rate": 5e-06, "logits/chosen": -69410156.3076923, "logits/rejected": -37153605.81818182, "logps/chosen": -594.1336763822115, "logps/rejected": -809.6459517045455, "loss": 0.0028, "rewards/chosen": 9.220966045673077, "rewards/margins": 25.564661946330038, "rewards/rejected": -16.34369590065696, "step": 2716 }, { "epoch": 0.7446895984651226, "grad_norm": 5.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -21696764.444444444, "logits/rejected": -3079723.7333333334, "logps/chosen": -392.79248046875, "logps/rejected": -651.6735026041666, "loss": 0.0139, "rewards/chosen": 7.262157864040798, "rewards/margins": 18.309708319769967, "rewards/rejected": -11.047550455729167, "step": 2717 }, { "epoch": 0.7449636837056325, "grad_norm": 9.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -28339538.666666668, "logits/rejected": -9436904.0, "logps/chosen": -441.1502685546875, "logps/rejected": -560.123779296875, "loss": 0.0529, "rewards/chosen": 6.932758967081706, "rewards/margins": 17.18712552388509, "rewards/rejected": -10.254366556803385, "step": 2718 }, { "epoch": 0.7452377689461422, "grad_norm": 5.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -20570290.666666668, "logits/rejected": -395989.3333333333, "logps/chosen": -519.0508626302084, "logps/rejected": -596.6267496744791, "loss": 0.0081, "rewards/chosen": 8.867019653320312, "rewards/margins": 21.301756540934242, "rewards/rejected": -12.434736887613932, "step": 2719 }, { "epoch": 0.745511854186652, "grad_norm": 5.59375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -52358848.0, "logits/rejected": -22695570.82352941, "logps/chosen": -430.2479771205357, "logps/rejected": -473.55870863970586, "loss": 0.0255, "rewards/chosen": 7.260049547467913, "rewards/margins": 17.936766071479862, "rewards/rejected": -10.67671652401195, "step": 2720 }, { "epoch": 0.7457859394271619, "grad_norm": 3.125, "kl": 0.11250432580709457, "learning_rate": 5e-06, "logits/chosen": -17661312.0, "logits/rejected": -31243225.6, "logps/chosen": -371.06821986607144, "logps/rejected": -560.464453125, "loss": 0.0066, "rewards/chosen": 7.718727111816406, "rewards/margins": 20.592347717285158, "rewards/rejected": -12.87362060546875, "step": 2721 }, { "epoch": 0.7460600246676716, "grad_norm": 11.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -7465828.0, "logits/rejected": -19918285.714285713, "logps/chosen": -364.5410400390625, "logps/rejected": -424.9153529575893, "loss": 0.0406, "rewards/chosen": 6.161744689941406, "rewards/margins": 15.062951115199498, "rewards/rejected": -8.901206425258092, "step": 2722 }, { "epoch": 0.7463341099081815, "grad_norm": 5.46875, "kl": 5.474950313568115, "learning_rate": 5e-06, "logits/chosen": -8948506.181818182, "logits/rejected": -10779616.0, "logps/chosen": -369.56394264914775, "logps/rejected": -691.7393329326923, "loss": 0.0227, "rewards/chosen": 6.408914739435369, "rewards/margins": 18.63317316228693, "rewards/rejected": -12.224258422851562, "step": 2723 }, { "epoch": 0.7466081951486913, "grad_norm": 6.84375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -97616.3, "logits/rejected": -17057113.14285714, "logps/chosen": -384.51748046875, "logps/rejected": -439.1149204799107, "loss": 0.0341, "rewards/chosen": 7.227957916259766, "rewards/margins": 17.108730098179407, "rewards/rejected": -9.880772181919642, "step": 2724 }, { "epoch": 0.746882280389201, "grad_norm": 6.90625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -10509120.0, "logits/rejected": -17512349.09090909, "logps/chosen": -398.5988581730769, "logps/rejected": -462.60764382102275, "loss": 0.0298, "rewards/chosen": 7.187856820913462, "rewards/margins": 17.57561733005764, "rewards/rejected": -10.387760509144176, "step": 2725 }, { "epoch": 0.7471563656297109, "grad_norm": 14.125, "kl": 4.8497419357299805, "learning_rate": 5e-06, "logits/chosen": -22252985.333333332, "logits/rejected": -10599402.0, "logps/chosen": -349.8673909505208, "logps/rejected": -407.8448079427083, "loss": 0.0534, "rewards/chosen": 7.15899658203125, "rewards/margins": 15.66872787475586, "rewards/rejected": -8.50973129272461, "step": 2726 }, { "epoch": 0.7474304508702206, "grad_norm": 2.34375, "kl": 1.6240921020507812, "learning_rate": 5e-06, "logits/chosen": 9771.09090909091, "logits/rejected": -24846429.53846154, "logps/chosen": -528.6538529829545, "logps/rejected": -853.9887319711538, "loss": 0.0063, "rewards/chosen": 7.323668740012429, "rewards/margins": 25.157546303488992, "rewards/rejected": -17.833877563476562, "step": 2727 }, { "epoch": 0.7477045361107304, "grad_norm": 1.8984375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -16770327.0, "logits/rejected": -21987540.0, "logps/chosen": -402.4157409667969, "logps/rejected": -533.7528686523438, "loss": 0.0037, "rewards/chosen": 6.8133039474487305, "rewards/margins": 21.22376251220703, "rewards/rejected": -14.4104585647583, "step": 2728 }, { "epoch": 0.7479786213512403, "grad_norm": 2.59375, "kl": 0.7968101501464844, "learning_rate": 5e-06, "logits/chosen": -6740416.0, "logits/rejected": -15835468.307692308, "logps/chosen": -379.58345170454544, "logps/rejected": -460.7274639423077, "loss": 0.0087, "rewards/chosen": 7.763413862748579, "rewards/margins": 18.87491100818127, "rewards/rejected": -11.111497145432692, "step": 2729 }, { "epoch": 0.74825270659175, "grad_norm": 1.3515625, "kl": 9.00290584564209, "learning_rate": 5e-06, "logits/chosen": 1671782.5, "logits/rejected": 33299524.0, "logps/chosen": -374.1004943847656, "logps/rejected": -704.9652099609375, "loss": 0.0037, "rewards/chosen": 8.541259765625, "rewards/margins": 25.27133560180664, "rewards/rejected": -16.73007583618164, "step": 2730 }, { "epoch": 0.7485267918322598, "grad_norm": 0.578125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -26151072.0, "logits/rejected": -10408676.0, "logps/chosen": -439.2056579589844, "logps/rejected": -412.05889892578125, "loss": 0.0017, "rewards/chosen": 7.123013973236084, "rewards/margins": 17.488196849822998, "rewards/rejected": -10.365182876586914, "step": 2731 }, { "epoch": 0.7488008770727697, "grad_norm": 2.71875, "kl": 0.2917823791503906, "learning_rate": 5e-06, "logits/chosen": -4629557.230769231, "logits/rejected": -39143831.27272727, "logps/chosen": -499.44249549278845, "logps/rejected": -611.1733842329545, "loss": 0.0063, "rewards/chosen": 7.607479388897236, "rewards/margins": 19.917750218531467, "rewards/rejected": -12.310270829634232, "step": 2732 }, { "epoch": 0.7490749623132794, "grad_norm": 1.515625, "kl": 6.904669761657715, "learning_rate": 5e-06, "logits/chosen": -10589012.0, "logits/rejected": 45044188.0, "logps/chosen": -434.1133117675781, "logps/rejected": -550.5843505859375, "loss": 0.0395, "rewards/chosen": 9.77753734588623, "rewards/margins": 26.50007152557373, "rewards/rejected": -16.7225341796875, "step": 2733 }, { "epoch": 0.7493490475537893, "grad_norm": 1.671875, "kl": 0.37591552734375, "learning_rate": 5e-06, "logits/chosen": -49024413.09090909, "logits/rejected": -18218036.923076924, "logps/chosen": -479.75803444602275, "logps/rejected": -786.2782451923077, "loss": 0.0039, "rewards/chosen": 8.57069951837713, "rewards/margins": 22.057331645405377, "rewards/rejected": -13.486632127028246, "step": 2734 }, { "epoch": 0.749623132794299, "grad_norm": 13.3125, "kl": 0.5666402578353882, "learning_rate": 5e-06, "logits/chosen": -14675450.181818182, "logits/rejected": -44741129.84615385, "logps/chosen": -324.36032936789775, "logps/rejected": -650.3517127403846, "loss": 0.1001, "rewards/chosen": 5.128230701793324, "rewards/margins": 18.383547802905102, "rewards/rejected": -13.255317101111778, "step": 2735 }, { "epoch": 0.7498972180348088, "grad_norm": 7.34375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -5641752.615384615, "logits/rejected": -43062461.09090909, "logps/chosen": -429.32312950721155, "logps/rejected": -492.89457563920456, "loss": 0.0318, "rewards/chosen": 7.55483891413762, "rewards/margins": 22.183104294996994, "rewards/rejected": -14.628265380859375, "step": 2736 }, { "epoch": 0.7501713032753187, "grad_norm": 2.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -15204491.636363637, "logits/rejected": -30660952.615384616, "logps/chosen": -344.34736772017044, "logps/rejected": -603.7743389423077, "loss": 0.0184, "rewards/chosen": 5.831935535777699, "rewards/margins": 17.332247754076977, "rewards/rejected": -11.500312218299278, "step": 2737 }, { "epoch": 0.7504453885158284, "grad_norm": 5.9375, "kl": 1.8681621551513672, "learning_rate": 5e-06, "logits/chosen": -22276523.2, "logits/rejected": -33226018.285714287, "logps/chosen": -439.54560546875, "logps/rejected": -520.0145089285714, "loss": 0.0106, "rewards/chosen": 6.629454803466797, "rewards/margins": 17.98721455165318, "rewards/rejected": -11.357759748186384, "step": 2738 }, { "epoch": 0.7507194737563382, "grad_norm": 5.625, "kl": 5.744556427001953, "learning_rate": 5e-06, "logits/chosen": -33910948.571428575, "logits/rejected": -14960761.6, "logps/chosen": -419.879150390625, "logps/rejected": -612.1427734375, "loss": 0.0201, "rewards/chosen": 6.7450441632952005, "rewards/margins": 17.69531042916434, "rewards/rejected": -10.950266265869141, "step": 2739 }, { "epoch": 0.750993558996848, "grad_norm": 2.0625, "kl": 10.068503379821777, "learning_rate": 5e-06, "logits/chosen": -26118510.545454547, "logits/rejected": -12309021.538461538, "logps/chosen": -450.9464222301136, "logps/rejected": -553.5127704326923, "loss": 0.0056, "rewards/chosen": 8.2376840764826, "rewards/margins": 21.945818654307118, "rewards/rejected": -13.70813457782452, "step": 2740 }, { "epoch": 0.7512676442373578, "grad_norm": 5.25, "kl": 2.1954150199890137, "learning_rate": 5e-06, "logits/chosen": -30929367.272727273, "logits/rejected": -68295995.07692307, "logps/chosen": -385.61811967329544, "logps/rejected": -472.44193209134613, "loss": 0.0235, "rewards/chosen": 5.688493902033025, "rewards/margins": 18.529289565719925, "rewards/rejected": -12.8407956636869, "step": 2741 }, { "epoch": 0.7515417294778676, "grad_norm": 12.4375, "kl": 5.383383750915527, "learning_rate": 5e-06, "logits/chosen": -39008448.0, "logits/rejected": -27296901.818181816, "logps/chosen": -427.80382361778845, "logps/rejected": -701.7187056107955, "loss": 0.0548, "rewards/chosen": 6.6383220966045675, "rewards/margins": 21.48512385441707, "rewards/rejected": -14.8468017578125, "step": 2742 }, { "epoch": 0.7518158147183774, "grad_norm": 5.71875, "kl": 0.46112823486328125, "learning_rate": 5e-06, "logits/chosen": -27835113.6, "logits/rejected": -25422358.85714286, "logps/chosen": -469.121630859375, "logps/rejected": -590.0643136160714, "loss": 0.0156, "rewards/chosen": 8.385595703125, "rewards/margins": 20.444896806989398, "rewards/rejected": -12.059301103864398, "step": 2743 }, { "epoch": 0.7520898999588872, "grad_norm": 4.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -30213900.0, "logits/rejected": -28964984.0, "logps/chosen": -377.7645263671875, "logps/rejected": -563.6650390625, "loss": 0.02, "rewards/chosen": 6.788107872009277, "rewards/margins": 20.746228218078613, "rewards/rejected": -13.958120346069336, "step": 2744 }, { "epoch": 0.7523639851993971, "grad_norm": 0.59375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -27290707.2, "logits/rejected": -34218598.85714286, "logps/chosen": -534.023388671875, "logps/rejected": -397.85714285714283, "loss": 0.0013, "rewards/chosen": 10.040146636962891, "rewards/margins": 21.959920283726284, "rewards/rejected": -11.919773646763392, "step": 2745 }, { "epoch": 0.7526380704399068, "grad_norm": 1.09375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -6738304.5, "logits/rejected": -39812080.0, "logps/chosen": -559.9094848632812, "logps/rejected": -654.3690795898438, "loss": 0.0032, "rewards/chosen": 8.14369010925293, "rewards/margins": 20.12713050842285, "rewards/rejected": -11.983440399169922, "step": 2746 }, { "epoch": 0.7529121556804166, "grad_norm": 5.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -19883777.333333332, "logits/rejected": -24596608.0, "logps/chosen": -514.8895670572916, "logps/rejected": -610.1061197916666, "loss": 0.0501, "rewards/chosen": 6.6147816975911455, "rewards/margins": 19.1687494913737, "rewards/rejected": -12.553967793782553, "step": 2747 }, { "epoch": 0.7531862409209265, "grad_norm": 2.578125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -20126242.90909091, "logits/rejected": -16345344.0, "logps/chosen": -498.96786221590907, "logps/rejected": -364.33360877403845, "loss": 0.0034, "rewards/chosen": 7.822874589399858, "rewards/margins": 18.663453108780867, "rewards/rejected": -10.84057851938101, "step": 2748 }, { "epoch": 0.7534603261614362, "grad_norm": 9.3125, "kl": 2.3401541709899902, "learning_rate": 5e-06, "logits/chosen": -17043862.4, "logits/rejected": -28987584.0, "logps/chosen": -432.306640625, "logps/rejected": -652.8836805555555, "loss": 0.038, "rewards/chosen": 6.648186747233073, "rewards/margins": 21.163343811035155, "rewards/rejected": -14.515157063802084, "step": 2749 }, { "epoch": 0.753734411401946, "grad_norm": 7.84375, "kl": 22.168609619140625, "learning_rate": 5e-06, "logits/chosen": -26040752.94117647, "logits/rejected": -33782582.85714286, "logps/chosen": -456.5464728860294, "logps/rejected": -505.61802455357144, "loss": 0.0483, "rewards/chosen": 7.978160184972427, "rewards/margins": 22.75079550863314, "rewards/rejected": -14.772635323660714, "step": 2750 }, { "epoch": 0.7540084966424558, "grad_norm": 10.375, "kl": 8.380030632019043, "learning_rate": 5e-06, "logits/chosen": -14371001.0, "logits/rejected": -27816052.0, "logps/chosen": -409.1214294433594, "logps/rejected": -423.74658203125, "loss": 0.0453, "rewards/chosen": 5.918787479400635, "rewards/margins": 18.271969318389893, "rewards/rejected": -12.353181838989258, "step": 2751 }, { "epoch": 0.7542825818829656, "grad_norm": 1.765625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -33878440.0, "logits/rejected": -35669781.333333336, "logps/chosen": -322.2492268880208, "logps/rejected": -469.897216796875, "loss": 0.0074, "rewards/chosen": 7.585936228434245, "rewards/margins": 21.22555414835612, "rewards/rejected": -13.639617919921875, "step": 2752 }, { "epoch": 0.7545566671234754, "grad_norm": 12.0625, "kl": 8.464765548706055, "learning_rate": 5e-06, "logits/chosen": -4653962.285714285, "logits/rejected": -32812800.0, "logps/chosen": -425.22872488839283, "logps/rejected": -504.896435546875, "loss": 0.0791, "rewards/chosen": 7.126728602818081, "rewards/margins": 16.585006495884485, "rewards/rejected": -9.458277893066406, "step": 2753 }, { "epoch": 0.7548307523639852, "grad_norm": 4.03125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -5264890.181818182, "logits/rejected": -32070449.230769232, "logps/chosen": -510.7052112926136, "logps/rejected": -539.6261268028846, "loss": 0.0155, "rewards/chosen": 8.375927318226207, "rewards/margins": 22.690070572432937, "rewards/rejected": -14.31414325420673, "step": 2754 }, { "epoch": 0.755104837604495, "grad_norm": 4.34375, "kl": 1.6548932790756226, "learning_rate": 5e-06, "logits/chosen": -27793870.769230768, "logits/rejected": -46987066.18181818, "logps/chosen": -326.2920673076923, "logps/rejected": -682.5851828835227, "loss": 0.0266, "rewards/chosen": 5.363355783315805, "rewards/margins": 22.850682665418077, "rewards/rejected": -17.487326882102273, "step": 2755 }, { "epoch": 0.7553789228450047, "grad_norm": 13.1875, "kl": 6.2933349609375, "learning_rate": 5e-06, "logits/chosen": 1085426.3529411764, "logits/rejected": -33965492.571428575, "logps/chosen": -470.99060776654414, "logps/rejected": -506.5685337611607, "loss": 0.0543, "rewards/chosen": 6.3771523868336395, "rewards/margins": 18.189860560312994, "rewards/rejected": -11.812708173479352, "step": 2756 }, { "epoch": 0.7556530080855146, "grad_norm": 8.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -45183284.36363637, "logits/rejected": -30228406.153846152, "logps/chosen": -414.92555930397725, "logps/rejected": -497.68734975961536, "loss": 0.0539, "rewards/chosen": 6.167322332208807, "rewards/margins": 17.516559627506282, "rewards/rejected": -11.349237295297476, "step": 2757 }, { "epoch": 0.7559270933260244, "grad_norm": 6.53125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -39877617.777777776, "logits/rejected": 36475225.6, "logps/chosen": -500.0118815104167, "logps/rejected": -604.329296875, "loss": 0.0319, "rewards/chosen": 6.8258717854817705, "rewards/margins": 20.620510864257813, "rewards/rejected": -13.794639078776042, "step": 2758 }, { "epoch": 0.7562011785665342, "grad_norm": 11.25, "kl": 1.819867491722107, "learning_rate": 5e-06, "logits/chosen": -28009268.363636363, "logits/rejected": -34833220.92307692, "logps/chosen": -373.8841441761364, "logps/rejected": -520.8594876802885, "loss": 0.0364, "rewards/chosen": 8.830890308726918, "rewards/margins": 21.651241542576077, "rewards/rejected": -12.82035123384916, "step": 2759 }, { "epoch": 0.756475263807044, "grad_norm": 1.0390625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -18306666.666666668, "logits/rejected": -4572035.2, "logps/chosen": -324.7732747395833, "logps/rejected": -536.0904296875, "loss": 0.0032, "rewards/chosen": 5.601757473415798, "rewards/margins": 18.917286851671008, "rewards/rejected": -13.315529378255208, "step": 2760 }, { "epoch": 0.7567493490475538, "grad_norm": 7.5625, "kl": 0.9712969660758972, "learning_rate": 5e-06, "logits/chosen": -7352921.0, "logits/rejected": -21148762.0, "logps/chosen": -362.6809387207031, "logps/rejected": -546.5841674804688, "loss": 0.0358, "rewards/chosen": 6.126959800720215, "rewards/margins": 22.56828212738037, "rewards/rejected": -16.441322326660156, "step": 2761 }, { "epoch": 0.7570234342880636, "grad_norm": 3.921875, "kl": 3.365769863128662, "learning_rate": 5e-06, "logits/chosen": -33767571.2, "logits/rejected": -24094855.111111112, "logps/chosen": -455.7794921875, "logps/rejected": -504.2360026041667, "loss": 0.009, "rewards/chosen": 8.870934041341146, "rewards/margins": 22.885657925075954, "rewards/rejected": -14.01472388373481, "step": 2762 }, { "epoch": 0.7572975195285734, "grad_norm": 8.5625, "kl": 0.948267936706543, "learning_rate": 5e-06, "logits/chosen": -15870901.333333334, "logits/rejected": -10099600.0, "logps/chosen": -401.43549262152777, "logps/rejected": -619.2928385416667, "loss": 0.0306, "rewards/chosen": 5.308479309082031, "rewards/margins": 21.777992248535156, "rewards/rejected": -16.469512939453125, "step": 2763 }, { "epoch": 0.7575716047690831, "grad_norm": 9.875, "kl": 2.8153579235076904, "learning_rate": 5e-06, "logits/chosen": -36245689.14285714, "logits/rejected": -15960742.4, "logps/chosen": -427.4033203125, "logps/rejected": -663.59462890625, "loss": 0.0283, "rewards/chosen": 7.290718078613281, "rewards/margins": 19.553060913085936, "rewards/rejected": -12.262342834472657, "step": 2764 }, { "epoch": 0.757845690009593, "grad_norm": 2.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -22745989.333333332, "logits/rejected": 23967293.333333332, "logps/chosen": -381.1148274739583, "logps/rejected": -573.3662923177084, "loss": 0.0093, "rewards/chosen": 7.484049479166667, "rewards/margins": 27.16640853881836, "rewards/rejected": -19.68235905965169, "step": 2765 }, { "epoch": 0.7581197752501028, "grad_norm": 3.96875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -21825736.727272727, "logits/rejected": -36042651.07692308, "logps/chosen": -380.2742808948864, "logps/rejected": -554.7297926682693, "loss": 0.0421, "rewards/chosen": 6.8695595481178975, "rewards/margins": 20.68705269340035, "rewards/rejected": -13.817493145282452, "step": 2766 }, { "epoch": 0.7583938604906125, "grad_norm": 3.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -9446620.363636363, "logits/rejected": -42183030.15384615, "logps/chosen": -285.73486328125, "logps/rejected": -577.3092322716346, "loss": 0.0323, "rewards/chosen": 6.386251969770952, "rewards/margins": 21.962156229085856, "rewards/rejected": -15.575904259314903, "step": 2767 }, { "epoch": 0.7586679457311224, "grad_norm": 5.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -11040245.333333334, "logits/rejected": -24943891.2, "logps/chosen": -364.21503363715277, "logps/rejected": -518.6682291666667, "loss": 0.0239, "rewards/chosen": 6.433459811740452, "rewards/margins": 19.39150627983941, "rewards/rejected": -12.958046468098958, "step": 2768 }, { "epoch": 0.7589420309716322, "grad_norm": 9.625, "kl": 3.3568739891052246, "learning_rate": 5e-06, "logits/chosen": 2932208.0, "logits/rejected": -32703424.0, "logps/chosen": -403.65875244140625, "logps/rejected": -448.2911071777344, "loss": 0.0555, "rewards/chosen": 6.451761245727539, "rewards/margins": 15.779071807861328, "rewards/rejected": -9.327310562133789, "step": 2769 }, { "epoch": 0.759216116212142, "grad_norm": 6.625, "kl": 10.46141242980957, "learning_rate": 5e-06, "logits/chosen": -4652798.933333334, "logits/rejected": -21647473.777777776, "logps/chosen": -415.65341796875, "logps/rejected": -566.7164713541666, "loss": 0.0606, "rewards/chosen": 6.5907038370768225, "rewards/margins": 18.30269758436415, "rewards/rejected": -11.711993747287327, "step": 2770 }, { "epoch": 0.7594902014526518, "grad_norm": 4.15625, "kl": 4.638254165649414, "learning_rate": 5e-06, "logits/chosen": -16916924.0, "logits/rejected": -14543397.0, "logps/chosen": -375.1133117675781, "logps/rejected": -470.99066162109375, "loss": 0.0385, "rewards/chosen": 7.458029747009277, "rewards/margins": 18.15080738067627, "rewards/rejected": -10.692777633666992, "step": 2771 }, { "epoch": 0.7597642866931615, "grad_norm": 7.6875, "kl": 0.0429433211684227, "learning_rate": 5e-06, "logits/chosen": -23627853.09090909, "logits/rejected": 19833094.153846152, "logps/chosen": -357.65793678977275, "logps/rejected": -709.8342848557693, "loss": 0.0247, "rewards/chosen": 6.201148293235085, "rewards/margins": 24.122542054503114, "rewards/rejected": -17.92139376126803, "step": 2772 }, { "epoch": 0.7600383719336714, "grad_norm": 7.3125, "kl": 12.601542472839355, "learning_rate": 5e-06, "logits/chosen": -32012768.0, "logits/rejected": -36909309.71428572, "logps/chosen": -517.175830078125, "logps/rejected": -567.7024623325893, "loss": 0.0233, "rewards/chosen": 9.082037353515625, "rewards/margins": 20.431782967703683, "rewards/rejected": -11.349745614188057, "step": 2773 }, { "epoch": 0.7603124571741812, "grad_norm": 13.4375, "kl": 10.489957809448242, "learning_rate": 5e-06, "logits/chosen": -11317462.153846154, "logits/rejected": -12490869.818181818, "logps/chosen": -378.5908954326923, "logps/rejected": -560.5603693181819, "loss": 0.0762, "rewards/chosen": 6.56368901179387, "rewards/margins": 16.516286196408572, "rewards/rejected": -9.952597184614701, "step": 2774 }, { "epoch": 0.7605865424146909, "grad_norm": 3.203125, "kl": 3.2603378295898438, "learning_rate": 5e-06, "logits/chosen": -11482518.545454545, "logits/rejected": -4079618.153846154, "logps/chosen": -481.03315873579544, "logps/rejected": -485.361328125, "loss": 0.0088, "rewards/chosen": 6.446437488902699, "rewards/margins": 15.781927202131365, "rewards/rejected": -9.335489713228666, "step": 2775 }, { "epoch": 0.7608606276552008, "grad_norm": 7.25, "kl": 5.048122406005859, "learning_rate": 5e-06, "logits/chosen": -30170673.230769232, "logits/rejected": -21854478.545454547, "logps/chosen": -468.8615910456731, "logps/rejected": -365.1594904119318, "loss": 0.0265, "rewards/chosen": 7.086051354041467, "rewards/margins": 15.6043036400855, "rewards/rejected": -8.518252286044033, "step": 2776 }, { "epoch": 0.7611347128957106, "grad_norm": 12.5625, "kl": 8.620306015014648, "learning_rate": 5e-06, "logits/chosen": -7132297.230769231, "logits/rejected": -7290752.7272727275, "logps/chosen": -487.20541616586536, "logps/rejected": -611.6945578835227, "loss": 0.072, "rewards/chosen": 6.250758244441106, "rewards/margins": 15.176277320701761, "rewards/rejected": -8.925519076260654, "step": 2777 }, { "epoch": 0.7614087981362203, "grad_norm": 3.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 27794440.0, "logits/rejected": -27030894.0, "logps/chosen": -520.4371337890625, "logps/rejected": -433.82806396484375, "loss": 0.0055, "rewards/chosen": 8.080987930297852, "rewards/margins": 20.127267837524414, "rewards/rejected": -12.046279907226562, "step": 2778 }, { "epoch": 0.7616828833767302, "grad_norm": 7.09375, "kl": 3.4103317260742188, "learning_rate": 5e-06, "logits/chosen": -20120382.666666668, "logits/rejected": -23402672.0, "logps/chosen": -278.4970703125, "logps/rejected": -476.1376546223958, "loss": 0.0409, "rewards/chosen": 5.894421895345052, "rewards/margins": 15.181224822998047, "rewards/rejected": -9.286802927652994, "step": 2779 }, { "epoch": 0.7619569686172399, "grad_norm": 7.71875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -11418368.727272727, "logits/rejected": 88285006.76923077, "logps/chosen": -497.0296519886364, "logps/rejected": -657.0009014423077, "loss": 0.0214, "rewards/chosen": 8.048785816539418, "rewards/margins": 23.285444939886773, "rewards/rejected": -15.236659123347355, "step": 2780 }, { "epoch": 0.7622310538577498, "grad_norm": 8.0, "kl": 1.2204806804656982, "learning_rate": 5e-06, "logits/chosen": -8877542.0, "logits/rejected": -38662420.0, "logps/chosen": -379.4473571777344, "logps/rejected": -642.2404174804688, "loss": 0.0234, "rewards/chosen": 6.5233964920043945, "rewards/margins": 20.88601016998291, "rewards/rejected": -14.362613677978516, "step": 2781 }, { "epoch": 0.7625051390982596, "grad_norm": 4.15625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -25631069.09090909, "logits/rejected": -31366857.846153848, "logps/chosen": -357.8538263494318, "logps/rejected": -487.42660757211536, "loss": 0.0136, "rewards/chosen": 6.778081026944247, "rewards/margins": 21.566406410057226, "rewards/rejected": -14.78832538311298, "step": 2782 }, { "epoch": 0.7627792243387693, "grad_norm": 14.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -2971445.6, "logits/rejected": -21118742.85714286, "logps/chosen": -396.394140625, "logps/rejected": -573.5926688058036, "loss": 0.0641, "rewards/chosen": 5.307440567016601, "rewards/margins": 12.822988183157785, "rewards/rejected": -7.515547616141183, "step": 2783 }, { "epoch": 0.7630533095792792, "grad_norm": 4.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -11155323.076923076, "logits/rejected": -24446801.454545453, "logps/chosen": -354.4502704326923, "logps/rejected": -528.2449396306819, "loss": 0.0382, "rewards/chosen": 6.098743145282452, "rewards/margins": 17.129416779204682, "rewards/rejected": -11.03067363392223, "step": 2784 }, { "epoch": 0.763327394819789, "grad_norm": 1.8359375, "kl": 7.237586975097656, "learning_rate": 5e-06, "logits/chosen": -20393757.714285713, "logits/rejected": -36194105.6, "logps/chosen": -398.58837890625, "logps/rejected": -444.62587890625, "loss": 0.004, "rewards/chosen": 8.090175083705358, "rewards/margins": 19.485150364467074, "rewards/rejected": -11.394975280761718, "step": 2785 }, { "epoch": 0.7636014800602987, "grad_norm": 2.609375, "kl": 1.2128448486328125, "learning_rate": 5e-06, "logits/chosen": -24912144.0, "logits/rejected": -20315877.333333332, "logps/chosen": -389.2418619791667, "logps/rejected": -537.0008138020834, "loss": 0.0103, "rewards/chosen": 7.044209162394206, "rewards/margins": 18.151730219523113, "rewards/rejected": -11.107521057128906, "step": 2786 }, { "epoch": 0.7638755653008086, "grad_norm": 4.90625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -9541568.0, "logits/rejected": -11481917.714285715, "logps/chosen": -365.0228515625, "logps/rejected": -517.5274483816964, "loss": 0.0113, "rewards/chosen": 6.764102172851563, "rewards/margins": 16.086162894112725, "rewards/rejected": -9.322060721261161, "step": 2787 }, { "epoch": 0.7641496505413183, "grad_norm": 5.09375, "kl": 6.521088600158691, "learning_rate": 5e-06, "logits/chosen": -27421030.4, "logits/rejected": -21560013.714285713, "logps/chosen": -513.34091796875, "logps/rejected": -619.1610630580357, "loss": 0.0524, "rewards/chosen": 6.687256622314453, "rewards/margins": 18.37834461757115, "rewards/rejected": -11.691087995256696, "step": 2788 }, { "epoch": 0.7644237357818281, "grad_norm": 6.6875, "kl": 0.7760137319564819, "learning_rate": 5e-06, "logits/chosen": -22276337.066666666, "logits/rejected": -29321342.222222224, "logps/chosen": -435.3965169270833, "logps/rejected": -341.10096571180554, "loss": 0.041, "rewards/chosen": 5.757536315917969, "rewards/margins": 14.152857632107205, "rewards/rejected": -8.395321316189236, "step": 2789 }, { "epoch": 0.764697821022338, "grad_norm": 3.921875, "kl": 3.9700002670288086, "learning_rate": 5e-06, "logits/chosen": -9861978.0, "logits/rejected": -4631718.5, "logps/chosen": -405.92230224609375, "logps/rejected": -506.7681884765625, "loss": 0.0594, "rewards/chosen": 6.673892021179199, "rewards/margins": 18.026334762573242, "rewards/rejected": -11.352442741394043, "step": 2790 }, { "epoch": 0.7649719062628477, "grad_norm": 3.078125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -23093628.8, "logits/rejected": -37716566.85714286, "logps/chosen": -372.560107421875, "logps/rejected": -705.7066127232143, "loss": 0.0113, "rewards/chosen": 6.293927001953125, "rewards/margins": 20.072068568638393, "rewards/rejected": -13.778141566685267, "step": 2791 }, { "epoch": 0.7652459915033576, "grad_norm": 5.53125, "kl": 3.795163631439209, "learning_rate": 5e-06, "logits/chosen": -13291613.333333334, "logits/rejected": -32752218.666666668, "logps/chosen": -440.1899007161458, "logps/rejected": -513.69189453125, "loss": 0.0476, "rewards/chosen": 8.991508483886719, "rewards/margins": 19.819487253824867, "rewards/rejected": -10.82797876993815, "step": 2792 }, { "epoch": 0.7655200767438674, "grad_norm": 0.291015625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -15665459.2, "logits/rejected": -41156448.0, "logps/chosen": -409.542236328125, "logps/rejected": -630.5796595982143, "loss": 0.0009, "rewards/chosen": 8.324118041992188, "rewards/margins": 22.33738228934152, "rewards/rejected": -14.01326424734933, "step": 2793 }, { "epoch": 0.7657941619843771, "grad_norm": 1.3359375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -3131858.1818181816, "logits/rejected": -15882695.384615384, "logps/chosen": -495.89377663352275, "logps/rejected": -524.9518479567307, "loss": 0.0034, "rewards/chosen": 8.753265380859375, "rewards/margins": 19.419004000150238, "rewards/rejected": -10.665738619290865, "step": 2794 }, { "epoch": 0.766068247224887, "grad_norm": 8.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -13885409.6, "logits/rejected": -27715172.57142857, "logps/chosen": -367.67373046875, "logps/rejected": -516.8992396763393, "loss": 0.0303, "rewards/chosen": 6.462983703613281, "rewards/margins": 16.4281010219029, "rewards/rejected": -9.96511731828962, "step": 2795 }, { "epoch": 0.7663423324653967, "grad_norm": 4.96875, "kl": 5.90837287902832, "learning_rate": 5e-06, "logits/chosen": -6164307.428571428, "logits/rejected": -15107537.6, "logps/chosen": -436.0736607142857, "logps/rejected": -590.968017578125, "loss": 0.0298, "rewards/chosen": 7.567193167550223, "rewards/margins": 20.280417960030693, "rewards/rejected": -12.713224792480469, "step": 2796 }, { "epoch": 0.7666164177059065, "grad_norm": 4.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -41429021.333333336, "logits/rejected": -30401749.333333332, "logps/chosen": -423.6914469401042, "logps/rejected": -540.2763671875, "loss": 0.0379, "rewards/chosen": 6.636752446492513, "rewards/margins": 17.299569447835285, "rewards/rejected": -10.662817001342773, "step": 2797 }, { "epoch": 0.7668905029464164, "grad_norm": 13.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -18635230.769230768, "logits/rejected": -23447570.90909091, "logps/chosen": -383.5866511418269, "logps/rejected": -538.7832919034091, "loss": 0.0334, "rewards/chosen": 6.635807917668269, "rewards/margins": 16.61599091216401, "rewards/rejected": -9.980182994495738, "step": 2798 }, { "epoch": 0.7671645881869261, "grad_norm": 3.546875, "kl": 6.321282386779785, "learning_rate": 5e-06, "logits/chosen": -34771248.0, "logits/rejected": -13755553.333333334, "logps/chosen": -492.0751139322917, "logps/rejected": -492.2294514973958, "loss": 0.0125, "rewards/chosen": 7.508309682210286, "rewards/margins": 17.384496053059895, "rewards/rejected": -9.87618637084961, "step": 2799 }, { "epoch": 0.7674386734274359, "grad_norm": 2.484375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -10484704.0, "logits/rejected": -18974986.666666668, "logps/chosen": -441.7309163411458, "logps/rejected": -584.9000244140625, "loss": 0.0316, "rewards/chosen": 7.919079462687175, "rewards/margins": 19.034894943237305, "rewards/rejected": -11.11581548055013, "step": 2800 }, { "epoch": 0.7677127586679457, "grad_norm": 6.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -14610322.133333333, "logits/rejected": -28312161.777777776, "logps/chosen": -458.4232421875, "logps/rejected": -556.4165581597222, "loss": 0.0164, "rewards/chosen": 7.640055338541667, "rewards/margins": 18.28455098470052, "rewards/rejected": -10.644495646158854, "step": 2801 }, { "epoch": 0.7679868439084555, "grad_norm": 2.90625, "kl": 4.4722490310668945, "learning_rate": 5e-06, "logits/chosen": -54667392.0, "logits/rejected": -25545288.0, "logps/chosen": -503.2737630208333, "logps/rejected": -529.4990641276041, "loss": 0.0062, "rewards/chosen": 7.746613184611003, "rewards/margins": 19.436713536580402, "rewards/rejected": -11.6901003519694, "step": 2802 }, { "epoch": 0.7682609291489654, "grad_norm": 4.0, "kl": 2.389575958251953, "learning_rate": 5e-06, "logits/chosen": -16544644.0, "logits/rejected": -35055706.666666664, "logps/chosen": -417.4725748697917, "logps/rejected": -531.7978515625, "loss": 0.0111, "rewards/chosen": 7.228902816772461, "rewards/margins": 19.174506505330406, "rewards/rejected": -11.945603688557943, "step": 2803 }, { "epoch": 0.7685350143894751, "grad_norm": 11.6875, "kl": 0.8067407608032227, "learning_rate": 5e-06, "logits/chosen": -13661553.23076923, "logits/rejected": -22936864.0, "logps/chosen": -403.8019831730769, "logps/rejected": -494.30859375, "loss": 0.0443, "rewards/chosen": 5.471716073843149, "rewards/margins": 19.047695133235905, "rewards/rejected": -13.575979059392756, "step": 2804 }, { "epoch": 0.7688090996299849, "grad_norm": 7.15625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 16012308.923076924, "logits/rejected": -34141457.45454545, "logps/chosen": -390.3220402644231, "logps/rejected": -677.2750355113636, "loss": 0.0379, "rewards/chosen": 5.717508756197416, "rewards/margins": 19.783405917507785, "rewards/rejected": -14.06589716131037, "step": 2805 }, { "epoch": 0.7690831848704948, "grad_norm": 8.4375, "kl": 4.226692199707031, "learning_rate": 5e-06, "logits/chosen": -23445642.666666668, "logits/rejected": -4729795.555555556, "logps/chosen": -384.3920572916667, "logps/rejected": -507.30040147569446, "loss": 0.0426, "rewards/chosen": 6.511273701985677, "rewards/margins": 14.940277947319878, "rewards/rejected": -8.429004245334202, "step": 2806 }, { "epoch": 0.7693572701110045, "grad_norm": 5.84375, "kl": 12.123927116394043, "learning_rate": 5e-06, "logits/chosen": -22689018.666666668, "logits/rejected": -36444232.0, "logps/chosen": -425.8274739583333, "logps/rejected": -516.2394205729166, "loss": 0.0469, "rewards/chosen": 6.634878158569336, "rewards/margins": 16.66643842061361, "rewards/rejected": -10.031560262044271, "step": 2807 }, { "epoch": 0.7696313553515143, "grad_norm": 6.21875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -15727796.0, "logits/rejected": -37734944.0, "logps/chosen": -412.4119873046875, "logps/rejected": -512.010009765625, "loss": 0.0465, "rewards/chosen": 5.960187276204427, "rewards/margins": 18.46774419148763, "rewards/rejected": -12.507556915283203, "step": 2808 }, { "epoch": 0.7699054405920241, "grad_norm": 4.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -24367693.333333332, "logits/rejected": -29983299.555555556, "logps/chosen": -388.0152994791667, "logps/rejected": -384.1111111111111, "loss": 0.0094, "rewards/chosen": 5.367031097412109, "rewards/margins": 16.742972903781467, "rewards/rejected": -11.375941806369358, "step": 2809 }, { "epoch": 0.7701795258325339, "grad_norm": 5.25, "kl": 1.9305700063705444, "learning_rate": 5e-06, "logits/chosen": -16649452.0, "logits/rejected": -41201797.333333336, "logps/chosen": -395.3647867838542, "logps/rejected": -618.1743977864584, "loss": 0.0162, "rewards/chosen": 6.8584645589192705, "rewards/margins": 21.74362055460612, "rewards/rejected": -14.88515599568685, "step": 2810 }, { "epoch": 0.7704536110730437, "grad_norm": 6.34375, "kl": 7.296306610107422, "learning_rate": 5e-06, "logits/chosen": -16285592.727272727, "logits/rejected": 3758679.3846153845, "logps/chosen": -431.2667791193182, "logps/rejected": -490.60633263221155, "loss": 0.0252, "rewards/chosen": 8.001141634854404, "rewards/margins": 17.242596126102903, "rewards/rejected": -9.241454491248497, "step": 2811 }, { "epoch": 0.7707276963135535, "grad_norm": 9.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -29894137.6, "logits/rejected": -30754189.714285713, "logps/chosen": -434.13642578125, "logps/rejected": -619.95166015625, "loss": 0.0379, "rewards/chosen": 8.404371643066407, "rewards/margins": 24.367204502650672, "rewards/rejected": -15.962832859584264, "step": 2812 }, { "epoch": 0.7710017815540633, "grad_norm": 13.625, "kl": 1.6725845336914062, "learning_rate": 5e-06, "logits/chosen": -21456325.818181816, "logits/rejected": -22783227.076923076, "logps/chosen": -566.2841352982955, "logps/rejected": -682.7322716346154, "loss": 0.0363, "rewards/chosen": 7.469745982776988, "rewards/margins": 25.630942177939247, "rewards/rejected": -18.16119619516226, "step": 2813 }, { "epoch": 0.7712758667945732, "grad_norm": 12.375, "kl": 13.2116060256958, "learning_rate": 5e-06, "logits/chosen": -25363914.666666668, "logits/rejected": -53611790.222222224, "logps/chosen": -459.03740234375, "logps/rejected": -581.2180989583334, "loss": 0.0672, "rewards/chosen": 8.381198628743489, "rewards/margins": 19.57114512125651, "rewards/rejected": -11.189946492513021, "step": 2814 }, { "epoch": 0.7715499520350829, "grad_norm": 3.453125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -6353616.0, "logits/rejected": 2633504.0, "logps/chosen": -395.72216796875, "logps/rejected": -681.0286959134615, "loss": 0.0072, "rewards/chosen": 7.6667938232421875, "rewards/margins": 25.72256587101863, "rewards/rejected": -18.055772047776443, "step": 2815 }, { "epoch": 0.7718240372755927, "grad_norm": 12.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -10336154.909090908, "logits/rejected": -31536546.46153846, "logps/chosen": -331.8724476207386, "logps/rejected": -581.6519681490385, "loss": 0.0443, "rewards/chosen": 7.583138899369673, "rewards/margins": 20.938151059450803, "rewards/rejected": -13.35501216008113, "step": 2816 }, { "epoch": 0.7720981225161025, "grad_norm": 2.46875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -34743431.384615384, "logits/rejected": -31628578.90909091, "logps/chosen": -408.64637169471155, "logps/rejected": -369.9503284801136, "loss": 0.0096, "rewards/chosen": 7.187347998985877, "rewards/margins": 18.07269207080761, "rewards/rejected": -10.885344071821732, "step": 2817 }, { "epoch": 0.7723722077566123, "grad_norm": 5.96875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -17819303.272727273, "logits/rejected": -18659105.230769232, "logps/chosen": -396.6134144176136, "logps/rejected": -537.9655949519231, "loss": 0.0162, "rewards/chosen": 7.8607177734375, "rewards/margins": 20.037774892953728, "rewards/rejected": -12.177057119516226, "step": 2818 }, { "epoch": 0.7726462929971221, "grad_norm": 6.15625, "kl": 8.23203182220459, "learning_rate": 5e-06, "logits/chosen": -26212890.666666668, "logits/rejected": -34453029.333333336, "logps/chosen": -530.7855631510416, "logps/rejected": -618.326416015625, "loss": 0.0535, "rewards/chosen": 7.660545984903972, "rewards/margins": 20.66975466410319, "rewards/rejected": -13.009208679199219, "step": 2819 }, { "epoch": 0.7729203782376319, "grad_norm": 6.96875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -16619962.181818182, "logits/rejected": -18142032.0, "logps/chosen": -344.142333984375, "logps/rejected": -393.17236328125, "loss": 0.0652, "rewards/chosen": 5.317474018443715, "rewards/margins": 16.21760407027665, "rewards/rejected": -10.900130051832933, "step": 2820 }, { "epoch": 0.7731944634781417, "grad_norm": 8.125, "kl": 13.588447570800781, "learning_rate": 5e-06, "logits/chosen": -21738710.153846152, "logits/rejected": -11335099.636363637, "logps/chosen": -845.2741887019231, "logps/rejected": -627.4592507102273, "loss": 0.1087, "rewards/chosen": 10.26156264085036, "rewards/margins": 26.98656458287806, "rewards/rejected": -16.7250019420277, "step": 2821 }, { "epoch": 0.7734685487186514, "grad_norm": 4.0625, "kl": 0.3636443018913269, "learning_rate": 5e-06, "logits/chosen": -27517676.8, "logits/rejected": 11331805.714285715, "logps/chosen": -344.9762939453125, "logps/rejected": -442.00558035714283, "loss": 0.0163, "rewards/chosen": 6.104502868652344, "rewards/margins": 16.801250784737725, "rewards/rejected": -10.69674791608538, "step": 2822 }, { "epoch": 0.7737426339591613, "grad_norm": 3.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -36776460.0, "logits/rejected": -23501068.0, "logps/chosen": -476.2791748046875, "logps/rejected": -720.8363647460938, "loss": 0.0077, "rewards/chosen": 7.336752414703369, "rewards/margins": 21.732472896575928, "rewards/rejected": -14.395720481872559, "step": 2823 }, { "epoch": 0.7740167191996711, "grad_norm": 10.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -21757174.4, "logits/rejected": -32642096.0, "logps/chosen": -443.52978515625, "logps/rejected": -526.3249162946429, "loss": 0.0683, "rewards/chosen": 6.818023681640625, "rewards/margins": 17.906465802873882, "rewards/rejected": -11.088442121233259, "step": 2824 }, { "epoch": 0.774290804440181, "grad_norm": 3.453125, "kl": 7.509147644042969, "learning_rate": 5e-06, "logits/chosen": -6948418.909090909, "logits/rejected": -36704866.461538464, "logps/chosen": -452.0035511363636, "logps/rejected": -631.5872896634615, "loss": 0.0444, "rewards/chosen": 6.4690163352272725, "rewards/margins": 20.880254785497705, "rewards/rejected": -14.411238450270433, "step": 2825 }, { "epoch": 0.7745648896806907, "grad_norm": 6.1875, "kl": 17.531883239746094, "learning_rate": 5e-06, "logits/chosen": -20945472.0, "logits/rejected": -3295581.714285714, "logps/chosen": -465.3341854319853, "logps/rejected": -578.8700823102679, "loss": 0.0746, "rewards/chosen": 7.721064848058364, "rewards/margins": 19.44348785656841, "rewards/rejected": -11.722423008510045, "step": 2826 }, { "epoch": 0.7748389749212005, "grad_norm": 1.3515625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -4953390.4, "logits/rejected": -57331570.28571428, "logps/chosen": -505.707177734375, "logps/rejected": -682.60986328125, "loss": 0.0047, "rewards/chosen": 8.36856689453125, "rewards/margins": 25.16057434082031, "rewards/rejected": -16.792007446289062, "step": 2827 }, { "epoch": 0.7751130601617103, "grad_norm": 2.34375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -26688128.0, "logits/rejected": -32031776.0, "logps/chosen": -343.0675706129808, "logps/rejected": -521.2676225142045, "loss": 0.0064, "rewards/chosen": 6.176868145282452, "rewards/margins": 17.022586529071514, "rewards/rejected": -10.845718383789062, "step": 2828 }, { "epoch": 0.7753871454022201, "grad_norm": 4.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -25890856.533333335, "logits/rejected": -27513018.666666668, "logps/chosen": -480.71936848958336, "logps/rejected": -560.9308810763889, "loss": 0.02, "rewards/chosen": 7.081168619791667, "rewards/margins": 20.25359293619792, "rewards/rejected": -13.17242431640625, "step": 2829 }, { "epoch": 0.7756612306427298, "grad_norm": 5.84375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -35953459.692307696, "logits/rejected": -19667351.272727273, "logps/chosen": -427.00232872596155, "logps/rejected": -492.2200816761364, "loss": 0.0712, "rewards/chosen": 6.018797067495493, "rewards/margins": 18.162917717353448, "rewards/rejected": -12.144120649857955, "step": 2830 }, { "epoch": 0.7759353158832397, "grad_norm": 5.46875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -46668468.36363637, "logits/rejected": -25658889.846153848, "logps/chosen": -317.45822975852275, "logps/rejected": -613.8309795673077, "loss": 0.0728, "rewards/chosen": 5.653013749556108, "rewards/margins": 16.72398824625082, "rewards/rejected": -11.070974496694712, "step": 2831 }, { "epoch": 0.7762094011237495, "grad_norm": 7.34375, "kl": 1.3504054546356201, "learning_rate": 5e-06, "logits/chosen": -21966354.666666668, "logits/rejected": -15801930.666666666, "logps/chosen": -360.0289713541667, "logps/rejected": -581.0619710286459, "loss": 0.0375, "rewards/chosen": 5.548119227091472, "rewards/margins": 17.41499392191569, "rewards/rejected": -11.866874694824219, "step": 2832 }, { "epoch": 0.7764834863642592, "grad_norm": 11.9375, "kl": 3.5843138694763184, "learning_rate": 5e-06, "logits/chosen": -30068140.8, "logits/rejected": -10186405.333333334, "logps/chosen": -428.53733723958334, "logps/rejected": -507.2298177083333, "loss": 0.0633, "rewards/chosen": 7.788785807291666, "rewards/margins": 15.838615078396266, "rewards/rejected": -8.049829271104601, "step": 2833 }, { "epoch": 0.7767575716047691, "grad_norm": 4.90625, "kl": 0.17704519629478455, "learning_rate": 5e-06, "logits/chosen": -11908376.0, "logits/rejected": -31322616.0, "logps/chosen": -375.1988118489583, "logps/rejected": -786.3131510416666, "loss": 0.0095, "rewards/chosen": 7.477780659993489, "rewards/margins": 23.652735392252602, "rewards/rejected": -16.174954732259113, "step": 2834 }, { "epoch": 0.7770316568452789, "grad_norm": 7.40625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -18293016.888888888, "logits/rejected": -24618257.066666666, "logps/chosen": -457.75916883680554, "logps/rejected": -500.7682291666667, "loss": 0.0664, "rewards/chosen": 5.5537914699978295, "rewards/margins": 18.537989468044707, "rewards/rejected": -12.984197998046875, "step": 2835 }, { "epoch": 0.7773057420857887, "grad_norm": 8.625, "kl": 2.535532236099243, "learning_rate": 5e-06, "logits/chosen": -15465584.0, "logits/rejected": -15865892.923076924, "logps/chosen": -487.21977095170456, "logps/rejected": -451.74305138221155, "loss": 0.0393, "rewards/chosen": 6.672699668190696, "rewards/margins": 17.031675592169062, "rewards/rejected": -10.358975923978365, "step": 2836 }, { "epoch": 0.7775798273262985, "grad_norm": 8.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -8824113.6, "logits/rejected": -26392724.57142857, "logps/chosen": -282.6585693359375, "logps/rejected": -530.49267578125, "loss": 0.0407, "rewards/chosen": 5.5301166534423825, "rewards/margins": 18.85423513139997, "rewards/rejected": -13.324118477957589, "step": 2837 }, { "epoch": 0.7778539125668082, "grad_norm": 3.5, "kl": 4.491074085235596, "learning_rate": 5e-06, "logits/chosen": -1290689.0666666667, "logits/rejected": -31918784.0, "logps/chosen": -458.6204427083333, "logps/rejected": -406.8623318142361, "loss": 0.0234, "rewards/chosen": 6.838956705729166, "rewards/margins": 19.36610633002387, "rewards/rejected": -12.527149624294704, "step": 2838 }, { "epoch": 0.7781279978073181, "grad_norm": 4.71875, "kl": 0.12153689563274384, "learning_rate": 5e-06, "logits/chosen": -11447749.714285715, "logits/rejected": -41613740.8, "logps/chosen": -342.35916573660717, "logps/rejected": -600.81318359375, "loss": 0.0204, "rewards/chosen": 7.625658852713449, "rewards/margins": 20.93053250994001, "rewards/rejected": -13.304873657226562, "step": 2839 }, { "epoch": 0.7784020830478279, "grad_norm": 1.9140625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -19834236.8, "logits/rejected": -28283990.85714286, "logps/chosen": -488.04697265625, "logps/rejected": -583.4552873883929, "loss": 0.0141, "rewards/chosen": 6.7544715881347654, "rewards/margins": 19.231424277169364, "rewards/rejected": -12.476952689034599, "step": 2840 }, { "epoch": 0.7786761682883376, "grad_norm": 1.78125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -19304696.615384616, "logits/rejected": -10792196.363636363, "logps/chosen": -463.5205829326923, "logps/rejected": -528.7774325284091, "loss": 0.0092, "rewards/chosen": 6.716212346003606, "rewards/margins": 24.01371989216838, "rewards/rejected": -17.297507546164773, "step": 2841 }, { "epoch": 0.7789502535288475, "grad_norm": 8.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -9091724.8, "logits/rejected": -12825409.142857144, "logps/chosen": -302.3318115234375, "logps/rejected": -519.4906180245536, "loss": 0.0382, "rewards/chosen": 5.4953960418701175, "rewards/margins": 16.41942253112793, "rewards/rejected": -10.924026489257812, "step": 2842 }, { "epoch": 0.7792243387693573, "grad_norm": 3.890625, "kl": 4.063755035400391, "learning_rate": 5e-06, "logits/chosen": -20631689.14285714, "logits/rejected": 26925836.8, "logps/chosen": -334.15830775669644, "logps/rejected": -576.071484375, "loss": 0.055, "rewards/chosen": 6.603640420096261, "rewards/margins": 22.401046425955638, "rewards/rejected": -15.797406005859376, "step": 2843 }, { "epoch": 0.779498424009867, "grad_norm": 5.90625, "kl": 8.215481758117676, "learning_rate": 5e-06, "logits/chosen": -21502877.333333332, "logits/rejected": -8471850.0, "logps/chosen": -570.9708251953125, "logps/rejected": -410.0921223958333, "loss": 0.0293, "rewards/chosen": 9.11336580912272, "rewards/margins": 20.35080909729004, "rewards/rejected": -11.237443288167318, "step": 2844 }, { "epoch": 0.7797725092503769, "grad_norm": 5.03125, "kl": 4.9956488609313965, "learning_rate": 5e-06, "logits/chosen": -28586791.384615384, "logits/rejected": -3449506.1818181816, "logps/chosen": -430.7175105168269, "logps/rejected": -662.9293323863636, "loss": 0.1217, "rewards/chosen": 6.403877258300781, "rewards/margins": 18.55200958251953, "rewards/rejected": -12.14813232421875, "step": 2845 }, { "epoch": 0.7800465944908866, "grad_norm": 8.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -14659390.76923077, "logits/rejected": -27705917.09090909, "logps/chosen": -424.0090519831731, "logps/rejected": -532.5851384943181, "loss": 0.0326, "rewards/chosen": 5.9791118915264425, "rewards/margins": 18.005774544669197, "rewards/rejected": -12.026662653142756, "step": 2846 }, { "epoch": 0.7803206797313965, "grad_norm": 0.9609375, "kl": 5.343292236328125, "learning_rate": 5e-06, "logits/chosen": -27801770.666666668, "logits/rejected": -31206170.666666668, "logps/chosen": -438.7115071614583, "logps/rejected": -477.4562174479167, "loss": 0.0027, "rewards/chosen": 7.862211227416992, "rewards/margins": 22.06529426574707, "rewards/rejected": -14.203083038330078, "step": 2847 }, { "epoch": 0.7805947649719063, "grad_norm": 5.59375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -24674023.111111112, "logits/rejected": -17360068.266666666, "logps/chosen": -413.8174641927083, "logps/rejected": -400.26689453125, "loss": 0.0524, "rewards/chosen": 6.364210340711805, "rewards/margins": 16.068409559461806, "rewards/rejected": -9.70419921875, "step": 2848 }, { "epoch": 0.780868850212416, "grad_norm": 7.71875, "kl": 5.377932548522949, "learning_rate": 5e-06, "logits/chosen": -31052138.666666668, "logits/rejected": -26898802.666666668, "logps/chosen": -392.446044921875, "logps/rejected": -405.1601155598958, "loss": 0.0296, "rewards/chosen": 6.793966929117839, "rewards/margins": 17.054391225179035, "rewards/rejected": -10.260424296061197, "step": 2849 }, { "epoch": 0.7811429354529259, "grad_norm": 1.5390625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -16789190.666666668, "logits/rejected": -9655892.0, "logps/chosen": -388.3678385416667, "logps/rejected": -753.4952799479166, "loss": 0.0036, "rewards/chosen": 6.1981252034505205, "rewards/margins": 20.40496063232422, "rewards/rejected": -14.206835428873697, "step": 2850 }, { "epoch": 0.7814170206934357, "grad_norm": 1.7890625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -35234560.0, "logits/rejected": -31665527.466666665, "logps/chosen": -356.4167751736111, "logps/rejected": -442.56653645833336, "loss": 0.007, "rewards/chosen": 6.933572133382161, "rewards/margins": 19.598063913981118, "rewards/rejected": -12.664491780598958, "step": 2851 }, { "epoch": 0.7816911059339454, "grad_norm": 9.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -19769068.0, "logits/rejected": -22614728.0, "logps/chosen": -458.62310791015625, "logps/rejected": -534.7234497070312, "loss": 0.0427, "rewards/chosen": 6.474660873413086, "rewards/margins": 18.89506721496582, "rewards/rejected": -12.420406341552734, "step": 2852 }, { "epoch": 0.7819651911744553, "grad_norm": 5.28125, "kl": 5.103212356567383, "learning_rate": 5e-06, "logits/chosen": -29839925.333333332, "logits/rejected": -5319226.133333334, "logps/chosen": -447.44769965277777, "logps/rejected": -441.4768880208333, "loss": 0.0216, "rewards/chosen": 6.8475290934244795, "rewards/margins": 15.184462483723959, "rewards/rejected": -8.33693339029948, "step": 2853 }, { "epoch": 0.782239276414965, "grad_norm": 4.78125, "kl": 4.375783443450928, "learning_rate": 5e-06, "logits/chosen": -20335769.14285714, "logits/rejected": 9536307.2, "logps/chosen": -465.608642578125, "logps/rejected": -449.622802734375, "loss": 0.0203, "rewards/chosen": 9.867186410086495, "rewards/margins": 19.759018380301338, "rewards/rejected": -9.891831970214843, "step": 2854 }, { "epoch": 0.7825133616554748, "grad_norm": 8.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -13326866.666666666, "logits/rejected": -28659671.466666665, "logps/chosen": -426.2947591145833, "logps/rejected": -538.48154296875, "loss": 0.0373, "rewards/chosen": 6.317175971137153, "rewards/margins": 17.52715827094184, "rewards/rejected": -11.209982299804688, "step": 2855 }, { "epoch": 0.7827874468959847, "grad_norm": 12.5, "kl": 1.0265891551971436, "learning_rate": 5e-06, "logits/chosen": -28788005.333333332, "logits/rejected": -30228178.666666668, "logps/chosen": -335.186767578125, "logps/rejected": -525.13232421875, "loss": 0.0658, "rewards/chosen": 6.126027425130208, "rewards/margins": 16.358471552530926, "rewards/rejected": -10.232444127400717, "step": 2856 }, { "epoch": 0.7830615321364944, "grad_norm": 5.40625, "kl": 5.9848432540893555, "learning_rate": 5e-06, "logits/chosen": -12971976.615384616, "logits/rejected": -15433570.909090908, "logps/chosen": -478.2451171875, "logps/rejected": -452.41495028409093, "loss": 0.0299, "rewards/chosen": 6.192129868727464, "rewards/margins": 16.201324676300263, "rewards/rejected": -10.009194807572799, "step": 2857 }, { "epoch": 0.7833356173770043, "grad_norm": 4.21875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -25584290.90909091, "logits/rejected": -28913831.384615384, "logps/chosen": -356.57057883522725, "logps/rejected": -505.5505183293269, "loss": 0.0452, "rewards/chosen": 5.617275931618431, "rewards/margins": 18.199541932219393, "rewards/rejected": -12.582266000600962, "step": 2858 }, { "epoch": 0.7836097026175141, "grad_norm": 4.40625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -9113705.333333334, "logits/rejected": -8856267.333333334, "logps/chosen": -310.4967041015625, "logps/rejected": -688.5184733072916, "loss": 0.0194, "rewards/chosen": 7.7338714599609375, "rewards/margins": 22.62132136027018, "rewards/rejected": -14.887449900309244, "step": 2859 }, { "epoch": 0.7838837878580238, "grad_norm": 3.359375, "kl": 2.8115553855895996, "learning_rate": 5e-06, "logits/chosen": -8661968.727272727, "logits/rejected": -15310368.0, "logps/chosen": -440.7731267755682, "logps/rejected": -595.4918870192307, "loss": 0.0177, "rewards/chosen": 8.668790643865412, "rewards/margins": 22.42867012290688, "rewards/rejected": -13.759879479041466, "step": 2860 }, { "epoch": 0.7841578730985337, "grad_norm": 8.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -42019232.0, "logits/rejected": -27922618.666666668, "logps/chosen": -424.7982584635417, "logps/rejected": -551.9915771484375, "loss": 0.0746, "rewards/chosen": 6.5746199289957685, "rewards/margins": 19.955677668253582, "rewards/rejected": -13.381057739257812, "step": 2861 }, { "epoch": 0.7844319583390434, "grad_norm": 12.25, "kl": 1.3162015676498413, "learning_rate": 5e-06, "logits/chosen": -11556357.818181818, "logits/rejected": 90228440.61538461, "logps/chosen": -349.3308771306818, "logps/rejected": -592.6906550480769, "loss": 0.0207, "rewards/chosen": 6.890751925381747, "rewards/margins": 17.82707347736492, "rewards/rejected": -10.936321551983173, "step": 2862 }, { "epoch": 0.7847060435795532, "grad_norm": 3.53125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -10178219.2, "logits/rejected": -19719058.285714287, "logps/chosen": -398.419091796875, "logps/rejected": -380.77242606026783, "loss": 0.0355, "rewards/chosen": 7.167796325683594, "rewards/margins": 15.012430245535715, "rewards/rejected": -7.844633919852121, "step": 2863 }, { "epoch": 0.7849801288200631, "grad_norm": 4.21875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -22404544.0, "logits/rejected": -16034029.333333334, "logps/chosen": -396.91845703125, "logps/rejected": -530.3962809244791, "loss": 0.033, "rewards/chosen": 6.5205332438151045, "rewards/margins": 16.369771321614582, "rewards/rejected": -9.849238077799479, "step": 2864 }, { "epoch": 0.7852542140605728, "grad_norm": 2.421875, "kl": 6.2767462730407715, "learning_rate": 5e-06, "logits/chosen": -39047312.0, "logits/rejected": -38903776.0, "logps/chosen": -509.1624348958333, "logps/rejected": -531.8264973958334, "loss": 0.0082, "rewards/chosen": 7.685483296712239, "rewards/margins": 19.82374318440755, "rewards/rejected": -12.138259887695312, "step": 2865 }, { "epoch": 0.7855282993010826, "grad_norm": 3.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 7771589.333333333, "logits/rejected": -21300646.4, "logps/chosen": -431.11170789930554, "logps/rejected": -738.2708333333334, "loss": 0.0043, "rewards/chosen": 7.121292961968316, "rewards/margins": 22.778525627983942, "rewards/rejected": -15.657232666015625, "step": 2866 }, { "epoch": 0.7858023845415925, "grad_norm": 3.71875, "kl": 4.18095588684082, "learning_rate": 5e-06, "logits/chosen": 13093624.0, "logits/rejected": -22027161.6, "logps/chosen": -466.77015904017856, "logps/rejected": -638.530419921875, "loss": 0.0076, "rewards/chosen": 8.000876290457589, "rewards/margins": 23.31075940813337, "rewards/rejected": -15.309883117675781, "step": 2867 }, { "epoch": 0.7860764697821022, "grad_norm": 4.34375, "kl": 2.1111011505126953, "learning_rate": 5e-06, "logits/chosen": -14957288.0, "logits/rejected": -15918430.666666666, "logps/chosen": -441.4148356119792, "logps/rejected": -596.9871419270834, "loss": 0.0145, "rewards/chosen": 7.300533294677734, "rewards/margins": 20.965379079182945, "rewards/rejected": -13.664845784505209, "step": 2868 }, { "epoch": 0.7863505550226121, "grad_norm": 6.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -20336262.666666668, "logits/rejected": -19021948.0, "logps/chosen": -382.2298583984375, "logps/rejected": -600.064697265625, "loss": 0.023, "rewards/chosen": 6.630672454833984, "rewards/margins": 22.36634953816732, "rewards/rejected": -15.735677083333334, "step": 2869 }, { "epoch": 0.7866246402631218, "grad_norm": 0.921875, "kl": 2.074662685394287, "learning_rate": 5e-06, "logits/chosen": -12611770.666666666, "logits/rejected": -20516246.666666668, "logps/chosen": -466.3364664713542, "logps/rejected": -471.2091471354167, "loss": 0.0042, "rewards/chosen": 7.684284845987956, "rewards/margins": 18.33131726582845, "rewards/rejected": -10.647032419840494, "step": 2870 }, { "epoch": 0.7868987255036316, "grad_norm": 3.890625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -34808376.88888889, "logits/rejected": -17971669.333333332, "logps/chosen": -328.66620551215277, "logps/rejected": -481.8052734375, "loss": 0.0438, "rewards/chosen": 6.848253038194445, "rewards/margins": 16.424500189887155, "rewards/rejected": -9.576247151692709, "step": 2871 }, { "epoch": 0.7871728107441415, "grad_norm": 0.53515625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -21299901.866666667, "logits/rejected": -23063893.333333332, "logps/chosen": -470.25462239583334, "logps/rejected": -415.0628255208333, "loss": 0.0019, "rewards/chosen": 6.8460337320963545, "rewards/margins": 18.366856045193142, "rewards/rejected": -11.520822313096788, "step": 2872 }, { "epoch": 0.7874468959846512, "grad_norm": 4.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -31872128.0, "logits/rejected": -24697722.181818184, "logps/chosen": -419.67371544471155, "logps/rejected": -509.63671875, "loss": 0.013, "rewards/chosen": 5.963643587552584, "rewards/margins": 17.464256713440367, "rewards/rejected": -11.500613125887783, "step": 2873 }, { "epoch": 0.787720981225161, "grad_norm": 11.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -834722.1538461539, "logits/rejected": -26349684.363636363, "logps/chosen": -401.50454477163464, "logps/rejected": -430.22745028409093, "loss": 0.0397, "rewards/chosen": 6.2303936298076925, "rewards/margins": 16.455942807497678, "rewards/rejected": -10.225549177689986, "step": 2874 }, { "epoch": 0.7879950664656709, "grad_norm": 6.84375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -31814579.2, "logits/rejected": -11767389.714285715, "logps/chosen": -438.11552734375, "logps/rejected": -544.4513113839286, "loss": 0.0203, "rewards/chosen": 8.005134582519531, "rewards/margins": 19.388978140694753, "rewards/rejected": -11.383843558175224, "step": 2875 }, { "epoch": 0.7882691517061806, "grad_norm": 8.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -4961688.888888889, "logits/rejected": -9619556.266666668, "logps/chosen": -322.08672417534723, "logps/rejected": -622.7278645833334, "loss": 0.0394, "rewards/chosen": 5.551285637749566, "rewards/margins": 17.093802727593317, "rewards/rejected": -11.54251708984375, "step": 2876 }, { "epoch": 0.7885432369466904, "grad_norm": 9.8125, "kl": 0.6504402160644531, "learning_rate": 5e-06, "logits/chosen": -49204499.2, "logits/rejected": -22251997.714285713, "logps/chosen": -580.521923828125, "logps/rejected": -489.08803013392856, "loss": 0.0508, "rewards/chosen": 8.442635345458985, "rewards/margins": 20.4764771597726, "rewards/rejected": -12.033841814313616, "step": 2877 }, { "epoch": 0.7888173221872002, "grad_norm": 17.75, "kl": 0.9189020991325378, "learning_rate": 5e-06, "logits/chosen": -16600676.57142857, "logits/rejected": -23158888.0, "logps/chosen": -336.88668387276783, "logps/rejected": -470.71669921875, "loss": 0.0275, "rewards/chosen": 5.709363664899554, "rewards/margins": 15.122947038922991, "rewards/rejected": -9.413583374023437, "step": 2878 }, { "epoch": 0.78909140742771, "grad_norm": 5.28125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -9987388.444444444, "logits/rejected": -29638144.0, "logps/chosen": -320.92005750868054, "logps/rejected": -550.6598958333333, "loss": 0.0265, "rewards/chosen": 6.531728956434462, "rewards/margins": 20.47297854953342, "rewards/rejected": -13.941249593098958, "step": 2879 }, { "epoch": 0.7893654926682199, "grad_norm": 1.03125, "kl": 6.72705078125, "learning_rate": 5e-06, "logits/chosen": -54648384.0, "logits/rejected": -8418682.666666666, "logps/chosen": -549.7294108072916, "logps/rejected": -630.442138671875, "loss": 0.0038, "rewards/chosen": 8.140851338704428, "rewards/margins": 27.688891092936196, "rewards/rejected": -19.54803975423177, "step": 2880 }, { "epoch": 0.7896395779087296, "grad_norm": 2.046875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -36502860.0, "logits/rejected": -17450833.6, "logps/chosen": -394.6905212402344, "logps/rejected": -675.59833984375, "loss": 0.0037, "rewards/chosen": 8.988911628723145, "rewards/margins": 24.835777473449706, "rewards/rejected": -15.846865844726562, "step": 2881 }, { "epoch": 0.7899136631492394, "grad_norm": 6.875, "kl": 10.597582817077637, "learning_rate": 5e-06, "logits/chosen": -13710525.090909092, "logits/rejected": -32264273.230769232, "logps/chosen": -405.5367542613636, "logps/rejected": -588.4933894230769, "loss": 0.0244, "rewards/chosen": 7.577973799272017, "rewards/margins": 17.7246248471987, "rewards/rejected": -10.146651047926683, "step": 2882 }, { "epoch": 0.7901877483897493, "grad_norm": 6.15625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -21034080.0, "logits/rejected": -32457837.17647059, "logps/chosen": -383.83984375, "logps/rejected": -541.2319623161765, "loss": 0.0597, "rewards/chosen": 8.676801409040179, "rewards/margins": 20.329437768759846, "rewards/rejected": -11.652636359719668, "step": 2883 }, { "epoch": 0.790461833630259, "grad_norm": 8.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -30796656.0, "logits/rejected": -32119274.666666668, "logps/chosen": -426.6282145182292, "logps/rejected": -407.5849609375, "loss": 0.0417, "rewards/chosen": 6.6275984446207685, "rewards/margins": 15.61178970336914, "rewards/rejected": -8.984191258748373, "step": 2884 }, { "epoch": 0.7907359188707688, "grad_norm": 1.7109375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -46714816.0, "logits/rejected": -18665670.4, "logps/chosen": -529.1270616319445, "logps/rejected": -601.8186848958334, "loss": 0.0025, "rewards/chosen": 8.958267211914062, "rewards/margins": 21.06194051106771, "rewards/rejected": -12.103673299153646, "step": 2885 }, { "epoch": 0.7910100041112786, "grad_norm": 3.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -39365980.8, "logits/rejected": -31230880.0, "logps/chosen": -437.1517578125, "logps/rejected": -471.59305245535717, "loss": 0.0103, "rewards/chosen": 6.51204833984375, "rewards/margins": 17.640037972586494, "rewards/rejected": -11.127989632742745, "step": 2886 }, { "epoch": 0.7912840893517884, "grad_norm": 1.6796875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -24787692.0, "logits/rejected": -21873690.0, "logps/chosen": -355.5706787109375, "logps/rejected": -521.5496826171875, "loss": 0.005, "rewards/chosen": 7.442408561706543, "rewards/margins": 20.24868392944336, "rewards/rejected": -12.806275367736816, "step": 2887 }, { "epoch": 0.7915581745922982, "grad_norm": 5.0625, "kl": 9.244460105895996, "learning_rate": 5e-06, "logits/chosen": -15398766.11764706, "logits/rejected": 14528640.0, "logps/chosen": -401.5880916819853, "logps/rejected": -672.4836774553571, "loss": 0.0132, "rewards/chosen": 7.580014397116268, "rewards/margins": 21.034410236262474, "rewards/rejected": -13.454395839146205, "step": 2888 }, { "epoch": 0.791832259832808, "grad_norm": 7.6875, "kl": 6.053360939025879, "learning_rate": 5e-06, "logits/chosen": -16268353.142857144, "logits/rejected": -15902632.0, "logps/chosen": -436.6389857700893, "logps/rejected": -443.05556640625, "loss": 0.0231, "rewards/chosen": 8.306722913469587, "rewards/margins": 20.53006275721959, "rewards/rejected": -12.22333984375, "step": 2889 }, { "epoch": 0.7921063450733178, "grad_norm": 10.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -16703242.666666666, "logits/rejected": -11735410.666666666, "logps/chosen": -324.52972412109375, "logps/rejected": -626.8025309244791, "loss": 0.098, "rewards/chosen": 5.208025932312012, "rewards/margins": 18.463962872823082, "rewards/rejected": -13.255936940511068, "step": 2890 }, { "epoch": 0.7923804303138277, "grad_norm": 8.4375, "kl": 7.759315490722656, "learning_rate": 5e-06, "logits/chosen": -25304028.0, "logits/rejected": -720036.0, "logps/chosen": -487.8620300292969, "logps/rejected": -486.28424072265625, "loss": 0.0237, "rewards/chosen": 7.267566204071045, "rewards/margins": 20.198258876800537, "rewards/rejected": -12.930692672729492, "step": 2891 }, { "epoch": 0.7926545155543374, "grad_norm": 6.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -10964103.384615384, "logits/rejected": -21816029.09090909, "logps/chosen": -449.1129807692308, "logps/rejected": -684.9600941051136, "loss": 0.0137, "rewards/chosen": 6.656198354867788, "rewards/margins": 24.08246233746722, "rewards/rejected": -17.426263982599433, "step": 2892 }, { "epoch": 0.7929286007948472, "grad_norm": 4.75, "kl": 0.2536824643611908, "learning_rate": 5e-06, "logits/chosen": -20625568.0, "logits/rejected": -15743584.0, "logps/chosen": -444.02298677884613, "logps/rejected": -463.20015092329544, "loss": 0.023, "rewards/chosen": 6.933024479792668, "rewards/margins": 18.60682624870247, "rewards/rejected": -11.6738017689098, "step": 2893 }, { "epoch": 0.793202686035357, "grad_norm": 10.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -11678922.4, "logits/rejected": -8576378.857142856, "logps/chosen": -428.172509765625, "logps/rejected": -299.19520786830356, "loss": 0.0668, "rewards/chosen": 6.088691711425781, "rewards/margins": 13.152403695242747, "rewards/rejected": -7.063711983816964, "step": 2894 }, { "epoch": 0.7934767712758668, "grad_norm": 6.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -12626055.272727273, "logits/rejected": 12925249.23076923, "logps/chosen": -390.03053977272725, "logps/rejected": -578.7553335336538, "loss": 0.0516, "rewards/chosen": 5.467315673828125, "rewards/margins": 16.677061814528244, "rewards/rejected": -11.20974614070012, "step": 2895 }, { "epoch": 0.7937508565163766, "grad_norm": 9.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -19494267.42857143, "logits/rejected": -11602771.2, "logps/chosen": -361.89760044642856, "logps/rejected": -491.291796875, "loss": 0.0486, "rewards/chosen": 7.579740251813616, "rewards/margins": 19.408275713239398, "rewards/rejected": -11.828535461425782, "step": 2896 }, { "epoch": 0.7940249417568864, "grad_norm": 8.1875, "kl": 8.821514129638672, "learning_rate": 5e-06, "logits/chosen": 68716881.06666666, "logits/rejected": -22201664.0, "logps/chosen": -424.53880208333334, "logps/rejected": -560.7071940104166, "loss": 0.0626, "rewards/chosen": 6.13821055094401, "rewards/margins": 20.066551038953993, "rewards/rejected": -13.928340488009983, "step": 2897 }, { "epoch": 0.7942990269973962, "grad_norm": 2.890625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -27398947.555555556, "logits/rejected": -19910999.466666665, "logps/chosen": -378.99365234375, "logps/rejected": -671.9454427083333, "loss": 0.0072, "rewards/chosen": 6.841688368055555, "rewards/margins": 22.621451144748264, "rewards/rejected": -15.779762776692708, "step": 2898 }, { "epoch": 0.7945731122379059, "grad_norm": 5.90625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -30264134.85714286, "logits/rejected": 21734768.0, "logps/chosen": -433.31253487723217, "logps/rejected": -503.4908203125, "loss": 0.0291, "rewards/chosen": 7.086239406040737, "rewards/margins": 18.28629128592355, "rewards/rejected": -11.200051879882812, "step": 2899 }, { "epoch": 0.7948471974784158, "grad_norm": 9.75, "kl": 8.900300979614258, "learning_rate": 5e-06, "logits/chosen": -30532176.0, "logits/rejected": -26363442.666666668, "logps/chosen": -469.5016682942708, "logps/rejected": -629.7933756510416, "loss": 0.0278, "rewards/chosen": 7.093247095743815, "rewards/margins": 22.69905153910319, "rewards/rejected": -15.605804443359375, "step": 2900 }, { "epoch": 0.7951212827189256, "grad_norm": 5.96875, "kl": 2.1122500896453857, "learning_rate": 5e-06, "logits/chosen": -26857133.714285713, "logits/rejected": -26539388.8, "logps/chosen": -446.85682896205356, "logps/rejected": -495.41787109375, "loss": 0.0194, "rewards/chosen": 6.864289964948382, "rewards/margins": 19.29488307407924, "rewards/rejected": -12.430593109130859, "step": 2901 }, { "epoch": 0.7953953679594354, "grad_norm": 7.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -24264482.46153846, "logits/rejected": -21188992.0, "logps/chosen": -412.39058743990387, "logps/rejected": -435.68172940340907, "loss": 0.0456, "rewards/chosen": 6.673393836388221, "rewards/margins": 17.450759780990495, "rewards/rejected": -10.777365944602273, "step": 2902 }, { "epoch": 0.7956694531999452, "grad_norm": 13.8125, "kl": 2.1490478515625, "learning_rate": 5e-06, "logits/chosen": -14424086.857142856, "logits/rejected": -17462331.2, "logps/chosen": -482.8662806919643, "logps/rejected": -679.816552734375, "loss": 0.0655, "rewards/chosen": 8.037155151367188, "rewards/margins": 22.388572692871094, "rewards/rejected": -14.351417541503906, "step": 2903 }, { "epoch": 0.795943538440455, "grad_norm": 6.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -14001992.0, "logits/rejected": -11817012.57142857, "logps/chosen": -303.076171875, "logps/rejected": -467.4990931919643, "loss": 0.0548, "rewards/chosen": 5.619272613525391, "rewards/margins": 15.220277077811105, "rewards/rejected": -9.601004464285714, "step": 2904 }, { "epoch": 0.7962176236809648, "grad_norm": 3.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 18227356.307692308, "logits/rejected": -13326234.181818182, "logps/chosen": -435.85141225961536, "logps/rejected": -752.2523082386364, "loss": 0.0184, "rewards/chosen": 7.383201012244592, "rewards/margins": 24.47797335111178, "rewards/rejected": -17.094772338867188, "step": 2905 }, { "epoch": 0.7964917089214746, "grad_norm": 8.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -30231220.57142857, "logits/rejected": 114851878.4, "logps/chosen": -326.76077706473217, "logps/rejected": -662.0904296875, "loss": 0.0523, "rewards/chosen": 5.2315826416015625, "rewards/margins": 28.596249389648438, "rewards/rejected": -23.364666748046876, "step": 2906 }, { "epoch": 0.7967657941619843, "grad_norm": 5.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -10084274.461538462, "logits/rejected": 59548869.81818182, "logps/chosen": -385.9163161057692, "logps/rejected": -517.1178977272727, "loss": 0.0372, "rewards/chosen": 6.7994842529296875, "rewards/margins": 21.335506092418324, "rewards/rejected": -14.536021839488637, "step": 2907 }, { "epoch": 0.7970398794024942, "grad_norm": 8.1875, "kl": 0.6095403432846069, "learning_rate": 5e-06, "logits/chosen": -33393338.666666668, "logits/rejected": -32332792.0, "logps/chosen": -481.4646402994792, "logps/rejected": -673.7671712239584, "loss": 0.0186, "rewards/chosen": 5.76461919148763, "rewards/margins": 19.372236887613933, "rewards/rejected": -13.607617696126303, "step": 2908 }, { "epoch": 0.797313964643004, "grad_norm": 8.0625, "kl": 7.593690395355225, "learning_rate": 5e-06, "logits/chosen": 11696692.266666668, "logits/rejected": -26617948.444444444, "logps/chosen": -422.1266276041667, "logps/rejected": -474.13275824652777, "loss": 0.0268, "rewards/chosen": 7.013691202799479, "rewards/margins": 19.5864256117079, "rewards/rejected": -12.57273440890842, "step": 2909 }, { "epoch": 0.7975880498835137, "grad_norm": 10.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -23127508.8, "logits/rejected": -18805614.85714286, "logps/chosen": -424.8681640625, "logps/rejected": -437.6982421875, "loss": 0.0433, "rewards/chosen": 8.758076477050782, "rewards/margins": 18.743498883928574, "rewards/rejected": -9.98542240687779, "step": 2910 }, { "epoch": 0.7978621351240236, "grad_norm": 8.3125, "kl": 2.8216285705566406, "learning_rate": 5e-06, "logits/chosen": 7064714.181818182, "logits/rejected": -49301499.07692308, "logps/chosen": -402.18217329545456, "logps/rejected": -473.02110877403845, "loss": 0.023, "rewards/chosen": 7.488153631036932, "rewards/margins": 18.455851948344623, "rewards/rejected": -10.967698317307692, "step": 2911 }, { "epoch": 0.7981362203645334, "grad_norm": 7.65625, "kl": 3.8770651817321777, "learning_rate": 5e-06, "logits/chosen": -14832581.333333334, "logits/rejected": -11673750.666666666, "logps/chosen": -392.42431640625, "logps/rejected": -645.83154296875, "loss": 0.0396, "rewards/chosen": 7.017647425333659, "rewards/margins": 24.852898279825848, "rewards/rejected": -17.835250854492188, "step": 2912 }, { "epoch": 0.7984103056050431, "grad_norm": 3.453125, "kl": 6.0404486656188965, "learning_rate": 5e-06, "logits/chosen": -44098656.0, "logits/rejected": -14168952.615384616, "logps/chosen": -430.8587535511364, "logps/rejected": -592.8323317307693, "loss": 0.0196, "rewards/chosen": 7.751980868252841, "rewards/margins": 18.031734333171713, "rewards/rejected": -10.27975346491887, "step": 2913 }, { "epoch": 0.798684390845553, "grad_norm": 10.8125, "kl": 0.6271114349365234, "learning_rate": 5e-06, "logits/chosen": -20363786.666666668, "logits/rejected": -7085421.866666666, "logps/chosen": -441.3694118923611, "logps/rejected": -447.6322916666667, "loss": 0.047, "rewards/chosen": 7.426598442925347, "rewards/margins": 17.53453606499566, "rewards/rejected": -10.107937622070313, "step": 2914 }, { "epoch": 0.7989584760860627, "grad_norm": 4.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 369002.4, "logits/rejected": -8037789.714285715, "logps/chosen": -425.56220703125, "logps/rejected": -487.7893763950893, "loss": 0.0354, "rewards/chosen": 8.020793151855468, "rewards/margins": 20.027528817313055, "rewards/rejected": -12.006735665457589, "step": 2915 }, { "epoch": 0.7992325613265726, "grad_norm": 7.78125, "kl": 0.33002471923828125, "learning_rate": 5e-06, "logits/chosen": -19381149.53846154, "logits/rejected": -24186141.09090909, "logps/chosen": -357.67518028846155, "logps/rejected": -601.8814808238636, "loss": 0.0563, "rewards/chosen": 6.958979679987981, "rewards/margins": 19.730858009178323, "rewards/rejected": -12.771878329190342, "step": 2916 }, { "epoch": 0.7995066465670824, "grad_norm": 1.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -20600320.0, "logits/rejected": -11648048.0, "logps/chosen": -348.82598876953125, "logps/rejected": -778.0691528320312, "loss": 0.0021, "rewards/chosen": 7.63640022277832, "rewards/margins": 26.894121170043945, "rewards/rejected": -19.257720947265625, "step": 2917 }, { "epoch": 0.7997807318075921, "grad_norm": 0.83984375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -19067906.666666668, "logits/rejected": -24057472.0, "logps/chosen": -457.9903971354167, "logps/rejected": -498.9789225260417, "loss": 0.005, "rewards/chosen": 8.65463383992513, "rewards/margins": 21.039928436279297, "rewards/rejected": -12.385294596354166, "step": 2918 }, { "epoch": 0.800054817048102, "grad_norm": 7.09375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -9981502.545454545, "logits/rejected": -28236930.46153846, "logps/chosen": -326.73928000710225, "logps/rejected": -512.1220703125, "loss": 0.0222, "rewards/chosen": 5.557456970214844, "rewards/margins": 20.742880601149338, "rewards/rejected": -15.185423630934496, "step": 2919 }, { "epoch": 0.8003289022886118, "grad_norm": 3.078125, "kl": 3.574104070663452, "learning_rate": 5e-06, "logits/chosen": -26542029.714285713, "logits/rejected": -32193590.4, "logps/chosen": -463.90980747767856, "logps/rejected": -563.253173828125, "loss": 0.0442, "rewards/chosen": 6.649022783551898, "rewards/margins": 21.921167864118303, "rewards/rejected": -15.272145080566407, "step": 2920 }, { "epoch": 0.8006029875291215, "grad_norm": 3.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -34219527.384615384, "logits/rejected": -22968778.181818184, "logps/chosen": -334.7506760817308, "logps/rejected": -515.4572088068181, "loss": 0.0099, "rewards/chosen": 6.385186415452224, "rewards/margins": 18.125788388552365, "rewards/rejected": -11.740601973100143, "step": 2921 }, { "epoch": 0.8008770727696314, "grad_norm": 3.46875, "kl": 4.330951690673828, "learning_rate": 5e-06, "logits/chosen": -26629418.666666668, "logits/rejected": -9926856.666666666, "logps/chosen": -422.2461751302083, "logps/rejected": -519.916748046875, "loss": 0.0448, "rewards/chosen": 6.756628672281901, "rewards/margins": 15.638542811075848, "rewards/rejected": -8.881914138793945, "step": 2922 }, { "epoch": 0.8011511580101411, "grad_norm": 5.96875, "kl": 2.7307441234588623, "learning_rate": 5e-06, "logits/chosen": -28224670.11764706, "logits/rejected": -22201968.0, "logps/chosen": -425.4627470128676, "logps/rejected": -418.35693359375, "loss": 0.0287, "rewards/chosen": 6.145176607019761, "rewards/margins": 15.300758490041524, "rewards/rejected": -9.155581883021764, "step": 2923 }, { "epoch": 0.8014252432506509, "grad_norm": 2.859375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -36894061.71428572, "logits/rejected": -23973260.8, "logps/chosen": -356.30106026785717, "logps/rejected": -624.915771484375, "loss": 0.0487, "rewards/chosen": 7.2563983372279575, "rewards/margins": 19.090651266915458, "rewards/rejected": -11.8342529296875, "step": 2924 }, { "epoch": 0.8016993284911608, "grad_norm": 7.96875, "kl": 8.103200912475586, "learning_rate": 5e-06, "logits/chosen": -23452261.333333332, "logits/rejected": -21548526.666666668, "logps/chosen": -374.26904296875, "logps/rejected": -523.2708740234375, "loss": 0.0255, "rewards/chosen": 7.594289779663086, "rewards/margins": 20.57869784037272, "rewards/rejected": -12.984408060709635, "step": 2925 }, { "epoch": 0.8019734137316705, "grad_norm": 4.53125, "kl": 2.862699508666992, "learning_rate": 5e-06, "logits/chosen": -47913767.384615384, "logits/rejected": 42054231.27272727, "logps/chosen": -457.60452974759613, "logps/rejected": -796.3243075284091, "loss": 0.018, "rewards/chosen": 7.250506474421575, "rewards/margins": 24.35631091778095, "rewards/rejected": -17.105804443359375, "step": 2926 }, { "epoch": 0.8022474989721804, "grad_norm": 0.416015625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -18354730.666666668, "logits/rejected": -27435514.666666668, "logps/chosen": -436.6797281901042, "logps/rejected": -588.5630696614584, "loss": 0.0013, "rewards/chosen": 8.129596710205078, "rewards/margins": 22.51305262247721, "rewards/rejected": -14.383455912272135, "step": 2927 }, { "epoch": 0.8025215842126902, "grad_norm": 8.75, "kl": 10.480623245239258, "learning_rate": 5e-06, "logits/chosen": -36062011.733333334, "logits/rejected": -25285424.0, "logps/chosen": -400.3919270833333, "logps/rejected": -569.7599826388889, "loss": 0.0443, "rewards/chosen": 7.3742116292317705, "rewards/margins": 22.087604098849827, "rewards/rejected": -14.713392469618055, "step": 2928 }, { "epoch": 0.8027956694531999, "grad_norm": 4.15625, "kl": 1.027836561203003, "learning_rate": 5e-06, "logits/chosen": -35880872.0, "logits/rejected": -26169610.666666668, "logps/chosen": -443.463134765625, "logps/rejected": -492.2818196614583, "loss": 0.018, "rewards/chosen": 8.01871109008789, "rewards/margins": 19.676308949788414, "rewards/rejected": -11.657597859700521, "step": 2929 }, { "epoch": 0.8030697546937098, "grad_norm": 7.71875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -17171837.53846154, "logits/rejected": -30906885.818181816, "logps/chosen": -410.69478665865387, "logps/rejected": -766.5926846590909, "loss": 0.0231, "rewards/chosen": 7.331060556265024, "rewards/margins": 27.818093359887186, "rewards/rejected": -20.48703280362216, "step": 2930 }, { "epoch": 0.8033438399342195, "grad_norm": 7.65625, "kl": 4.8543524742126465, "learning_rate": 5e-06, "logits/chosen": -21033646.933333334, "logits/rejected": -18897820.444444444, "logps/chosen": -422.30872395833336, "logps/rejected": -326.8385416666667, "loss": 0.0361, "rewards/chosen": 7.132804870605469, "rewards/margins": 15.937447441948784, "rewards/rejected": -8.804642571343315, "step": 2931 }, { "epoch": 0.8036179251747293, "grad_norm": 11.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -27674080.0, "logits/rejected": -26528112.0, "logps/chosen": -281.83481852213544, "logps/rejected": -749.6354166666666, "loss": 0.0572, "rewards/chosen": 4.551554997762044, "rewards/margins": 18.24353090922038, "rewards/rejected": -13.691975911458334, "step": 2932 }, { "epoch": 0.8038920104152392, "grad_norm": 2.515625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -5200148.666666667, "logits/rejected": -3603324.0, "logps/chosen": -406.7721354166667, "logps/rejected": -664.7139078776041, "loss": 0.0087, "rewards/chosen": 7.498743057250977, "rewards/margins": 20.823628107706703, "rewards/rejected": -13.324885050455729, "step": 2933 }, { "epoch": 0.8041660956557489, "grad_norm": 9.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -30086699.42857143, "logits/rejected": -29628275.2, "logps/chosen": -369.98304966517856, "logps/rejected": -560.1966796875, "loss": 0.042, "rewards/chosen": 5.6320005144391745, "rewards/margins": 19.840145656040736, "rewards/rejected": -14.208145141601562, "step": 2934 }, { "epoch": 0.8044401808962587, "grad_norm": 5.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -19608496.0, "logits/rejected": -29066429.333333332, "logps/chosen": -418.1909586588542, "logps/rejected": -608.1735432942709, "loss": 0.0325, "rewards/chosen": 6.99560546875, "rewards/margins": 20.725781758626304, "rewards/rejected": -13.730176289876303, "step": 2935 }, { "epoch": 0.8047142661367686, "grad_norm": 4.59375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -22481371.636363637, "logits/rejected": -33360049.230769232, "logps/chosen": -515.2134232954545, "logps/rejected": -554.0128455528846, "loss": 0.0158, "rewards/chosen": 8.098770141601562, "rewards/margins": 22.151452871469353, "rewards/rejected": -14.052682729867788, "step": 2936 }, { "epoch": 0.8049883513772783, "grad_norm": 6.09375, "kl": 7.791644096374512, "learning_rate": 5e-06, "logits/chosen": -11964154.285714285, "logits/rejected": -29385641.6, "logps/chosen": -430.4310825892857, "logps/rejected": -488.467529296875, "loss": 0.0569, "rewards/chosen": 6.533732822963169, "rewards/margins": 19.30753348214286, "rewards/rejected": -12.773800659179688, "step": 2937 }, { "epoch": 0.8052624366177882, "grad_norm": 5.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -16395808.0, "logits/rejected": -28277008.0, "logps/chosen": -396.94998604910717, "logps/rejected": -543.46337890625, "loss": 0.0119, "rewards/chosen": 5.666681562151227, "rewards/margins": 19.31331089564732, "rewards/rejected": -13.646629333496094, "step": 2938 }, { "epoch": 0.8055365218582979, "grad_norm": 6.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -17269660.0, "logits/rejected": -27539412.0, "logps/chosen": -482.7665710449219, "logps/rejected": -592.7437744140625, "loss": 0.0226, "rewards/chosen": 6.707770347595215, "rewards/margins": 19.889079093933105, "rewards/rejected": -13.18130874633789, "step": 2939 }, { "epoch": 0.8058106070988077, "grad_norm": 1.5546875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -27833422.222222224, "logits/rejected": -37424128.0, "logps/chosen": -494.61756727430554, "logps/rejected": -776.6604166666667, "loss": 0.0037, "rewards/chosen": 6.700508541531033, "rewards/margins": 25.3893675910102, "rewards/rejected": -18.688859049479166, "step": 2940 }, { "epoch": 0.8060846923393176, "grad_norm": 5.09375, "kl": 7.036757946014404, "learning_rate": 5e-06, "logits/chosen": -20827485.866666667, "logits/rejected": -22027027.555555556, "logps/chosen": -420.9982421875, "logps/rejected": -520.2938368055555, "loss": 0.031, "rewards/chosen": 6.356710815429688, "rewards/margins": 20.84026353624132, "rewards/rejected": -14.483552720811632, "step": 2941 }, { "epoch": 0.8063587775798273, "grad_norm": 9.0625, "kl": 2.078566312789917, "learning_rate": 5e-06, "logits/chosen": -26891378.285714287, "logits/rejected": -56886233.6, "logps/chosen": -499.06236049107144, "logps/rejected": -480.240869140625, "loss": 0.0458, "rewards/chosen": 7.3634507315499445, "rewards/margins": 21.17252665928432, "rewards/rejected": -13.809075927734375, "step": 2942 }, { "epoch": 0.8066328628203371, "grad_norm": 12.4375, "kl": 4.668548583984375, "learning_rate": 5e-06, "logits/chosen": -6060640.615384615, "logits/rejected": 808270.5454545454, "logps/chosen": -502.58984375, "logps/rejected": -407.40602805397725, "loss": 0.0467, "rewards/chosen": 7.11350602370042, "rewards/margins": 16.37787078644012, "rewards/rejected": -9.264364762739701, "step": 2943 }, { "epoch": 0.806906948060847, "grad_norm": 18.125, "kl": 8.545214653015137, "learning_rate": 5e-06, "logits/chosen": -28295619.76470588, "logits/rejected": -15443017.142857144, "logps/chosen": -438.84978170955884, "logps/rejected": -639.5335518973214, "loss": 0.0951, "rewards/chosen": 6.649835923138787, "rewards/margins": 21.07633106648421, "rewards/rejected": -14.426495143345424, "step": 2944 }, { "epoch": 0.8071810333013567, "grad_norm": 5.84375, "kl": 3.3766937255859375, "learning_rate": 5e-06, "logits/chosen": -5277508.923076923, "logits/rejected": 901636.9090909091, "logps/chosen": -494.52261117788464, "logps/rejected": -563.0690252130681, "loss": 0.029, "rewards/chosen": 5.573218712439904, "rewards/margins": 16.930917326386993, "rewards/rejected": -11.357698613947088, "step": 2945 }, { "epoch": 0.8074551185418665, "grad_norm": 6.28125, "kl": 1.8419723510742188, "learning_rate": 5e-06, "logits/chosen": -20941074.90909091, "logits/rejected": -42443416.615384616, "logps/chosen": -426.8729137073864, "logps/rejected": -441.58556189903845, "loss": 0.0624, "rewards/chosen": 8.047550548206676, "rewards/margins": 18.112287721433837, "rewards/rejected": -10.064737173227163, "step": 2946 }, { "epoch": 0.8077292037823763, "grad_norm": 6.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -7289088.0, "logits/rejected": -42775419.07692308, "logps/chosen": -302.33320756392044, "logps/rejected": -544.9800555889423, "loss": 0.014, "rewards/chosen": 6.763268904252485, "rewards/margins": 23.723673520388303, "rewards/rejected": -16.96040461613582, "step": 2947 }, { "epoch": 0.8080032890228861, "grad_norm": 9.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 32293326.769230768, "logits/rejected": -884780.3636363636, "logps/chosen": -395.24906099759613, "logps/rejected": -566.7301136363636, "loss": 0.0437, "rewards/chosen": 6.610713078425481, "rewards/margins": 21.693059214345226, "rewards/rejected": -15.082346135919744, "step": 2948 }, { "epoch": 0.808277374263396, "grad_norm": 5.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -58663590.4, "logits/rejected": 1601457.7142857143, "logps/chosen": -420.77421875, "logps/rejected": -648.6959402901786, "loss": 0.0139, "rewards/chosen": 6.930445861816406, "rewards/margins": 20.57920597621373, "rewards/rejected": -13.648760114397321, "step": 2949 }, { "epoch": 0.8085514595039057, "grad_norm": 3.609375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -24479936.0, "logits/rejected": -40808372.36363637, "logps/chosen": -474.42578125, "logps/rejected": -508.34774502840907, "loss": 0.0064, "rewards/chosen": 7.334375234750601, "rewards/margins": 21.449523605666794, "rewards/rejected": -14.115148370916193, "step": 2950 }, { "epoch": 0.8088255447444155, "grad_norm": 5.34375, "kl": 3.905163049697876, "learning_rate": 5e-06, "logits/chosen": -19050910.222222224, "logits/rejected": -9231565.333333334, "logps/chosen": -384.64911566840277, "logps/rejected": -608.7244466145834, "loss": 0.0502, "rewards/chosen": 6.142951117621528, "rewards/margins": 16.662147945827908, "rewards/rejected": -10.51919682820638, "step": 2951 }, { "epoch": 0.8090996299849254, "grad_norm": 0.275390625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -26381357.333333332, "logits/rejected": -26193498.666666668, "logps/chosen": -365.5855305989583, "logps/rejected": -580.7435980902778, "loss": 0.0008, "rewards/chosen": 7.046719233194987, "rewards/margins": 19.308076858520508, "rewards/rejected": -12.261357625325521, "step": 2952 }, { "epoch": 0.8093737152254351, "grad_norm": 4.03125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -30873353.6, "logits/rejected": -24274498.285714287, "logps/chosen": -375.7708984375, "logps/rejected": -558.0650111607143, "loss": 0.0187, "rewards/chosen": 7.397855377197265, "rewards/margins": 18.860369982038225, "rewards/rejected": -11.46251460484096, "step": 2953 }, { "epoch": 0.8096478004659449, "grad_norm": 8.9375, "kl": 3.4640414714813232, "learning_rate": 5e-06, "logits/chosen": -38053991.384615384, "logits/rejected": -26326909.09090909, "logps/chosen": -314.2931565504808, "logps/rejected": -478.51167436079544, "loss": 0.0595, "rewards/chosen": 5.784198467548077, "rewards/margins": 16.148962594412424, "rewards/rejected": -10.364764126864346, "step": 2954 }, { "epoch": 0.8099218857064547, "grad_norm": 6.4375, "kl": 2.7148895263671875, "learning_rate": 5e-06, "logits/chosen": -10946038.153846154, "logits/rejected": -20422052.363636363, "logps/chosen": -414.9625901442308, "logps/rejected": -558.9720791903409, "loss": 0.0307, "rewards/chosen": 6.870703477125901, "rewards/margins": 18.9010726288482, "rewards/rejected": -12.0303691517223, "step": 2955 }, { "epoch": 0.8101959709469645, "grad_norm": 4.59375, "kl": 3.007747173309326, "learning_rate": 5e-06, "logits/chosen": -22422232.615384616, "logits/rejected": -25954251.636363637, "logps/chosen": -428.65054086538464, "logps/rejected": -506.9098455255682, "loss": 0.0162, "rewards/chosen": 8.473077627328726, "rewards/margins": 21.928491445688103, "rewards/rejected": -13.455413818359375, "step": 2956 }, { "epoch": 0.8104700561874743, "grad_norm": 11.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -11421923.692307692, "logits/rejected": 71512250.18181819, "logps/chosen": -458.71131310096155, "logps/rejected": -561.29443359375, "loss": 0.0529, "rewards/chosen": 6.770052396334135, "rewards/margins": 22.32816896238527, "rewards/rejected": -15.558116566051137, "step": 2957 }, { "epoch": 0.8107441414279841, "grad_norm": 4.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -16567880.0, "logits/rejected": -2729865.4, "logps/chosen": -503.70455496651783, "logps/rejected": -656.27470703125, "loss": 0.0078, "rewards/chosen": 7.906960623604911, "rewards/margins": 24.267351858956474, "rewards/rejected": -16.360391235351564, "step": 2958 }, { "epoch": 0.8110182266684939, "grad_norm": 6.0625, "kl": 14.26395320892334, "learning_rate": 5e-06, "logits/chosen": -22053882.0, "logits/rejected": -4269037.5, "logps/chosen": -369.7419738769531, "logps/rejected": -526.8917236328125, "loss": 0.0919, "rewards/chosen": 6.275437355041504, "rewards/margins": 19.92252540588379, "rewards/rejected": -13.647088050842285, "step": 2959 }, { "epoch": 0.8112923119090037, "grad_norm": 4.25, "kl": 2.05066180229187, "learning_rate": 5e-06, "logits/chosen": 18513959.384615384, "logits/rejected": -35290926.54545455, "logps/chosen": -457.1486253004808, "logps/rejected": -539.8262606534091, "loss": 0.0168, "rewards/chosen": 8.144960256723257, "rewards/margins": 21.722227990210474, "rewards/rejected": -13.577267733487217, "step": 2960 }, { "epoch": 0.8115663971495135, "grad_norm": 7.4375, "kl": 5.439858913421631, "learning_rate": 5e-06, "logits/chosen": -15087093.333333334, "logits/rejected": -47844010.666666664, "logps/chosen": -389.4254150390625, "logps/rejected": -633.0452880859375, "loss": 0.0607, "rewards/chosen": 5.270134290059407, "rewards/margins": 18.22128454844157, "rewards/rejected": -12.951150258382162, "step": 2961 }, { "epoch": 0.8118404823900233, "grad_norm": 3.203125, "kl": 1.9766597747802734, "learning_rate": 5e-06, "logits/chosen": -26106938.181818184, "logits/rejected": -33657127.384615384, "logps/chosen": -376.1188299005682, "logps/rejected": -605.2761418269231, "loss": 0.0202, "rewards/chosen": 6.974944374778054, "rewards/margins": 19.945694449898244, "rewards/rejected": -12.970750075120192, "step": 2962 }, { "epoch": 0.8121145676305331, "grad_norm": 1.171875, "kl": 3.767390012741089, "learning_rate": 5e-06, "logits/chosen": -28422100.363636363, "logits/rejected": -16638359.384615384, "logps/chosen": -451.35031960227275, "logps/rejected": -491.3792067307692, "loss": 0.0439, "rewards/chosen": 7.148022738370028, "rewards/margins": 17.19126497282015, "rewards/rejected": -10.04324223445012, "step": 2963 }, { "epoch": 0.8123886528710429, "grad_norm": 5.34375, "kl": 1.576348066329956, "learning_rate": 5e-06, "logits/chosen": -14208157.538461538, "logits/rejected": -20022884.363636363, "logps/chosen": -378.2539813701923, "logps/rejected": -628.4304421164773, "loss": 0.0448, "rewards/chosen": 6.2001471886268025, "rewards/margins": 20.101741870800097, "rewards/rejected": -13.901594682173295, "step": 2964 }, { "epoch": 0.8126627381115527, "grad_norm": 6.3125, "kl": 8.471686363220215, "learning_rate": 5e-06, "logits/chosen": -16724442.666666666, "logits/rejected": -13405389.333333334, "logps/chosen": -442.5685221354167, "logps/rejected": -662.3993326822916, "loss": 0.0728, "rewards/chosen": 7.8514353434244795, "rewards/margins": 21.22769546508789, "rewards/rejected": -13.376260121663412, "step": 2965 }, { "epoch": 0.8129368233520625, "grad_norm": 20.0, "kl": 3.14251708984375, "learning_rate": 5e-06, "logits/chosen": -11972701.714285715, "logits/rejected": -28084662.4, "logps/chosen": -391.0844029017857, "logps/rejected": -736.84375, "loss": 0.0493, "rewards/chosen": 5.244922637939453, "rewards/margins": 23.518909454345703, "rewards/rejected": -18.27398681640625, "step": 2966 }, { "epoch": 0.8132109085925723, "grad_norm": 10.375, "kl": 0.1015116423368454, "learning_rate": 5e-06, "logits/chosen": -7633789.6, "logits/rejected": -18587333.714285713, "logps/chosen": -513.6568359375, "logps/rejected": -534.4659598214286, "loss": 0.0288, "rewards/chosen": 6.311717224121094, "rewards/margins": 17.713475690569197, "rewards/rejected": -11.401758466448102, "step": 2967 }, { "epoch": 0.813484993833082, "grad_norm": 1.328125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -17486032.0, "logits/rejected": -27331108.0, "logps/chosen": -468.76312255859375, "logps/rejected": -573.9779052734375, "loss": 0.0046, "rewards/chosen": 7.12348747253418, "rewards/margins": 19.67691707611084, "rewards/rejected": -12.55342960357666, "step": 2968 }, { "epoch": 0.8137590790735919, "grad_norm": 2.71875, "kl": 5.020530700683594, "learning_rate": 5e-06, "logits/chosen": 23095788.307692308, "logits/rejected": 780779.2727272727, "logps/chosen": -489.98839393028845, "logps/rejected": -718.5211736505681, "loss": 0.005, "rewards/chosen": 7.797354478102464, "rewards/margins": 23.59219867199451, "rewards/rejected": -15.794844193892045, "step": 2969 }, { "epoch": 0.8140331643141017, "grad_norm": 7.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -16318068.0, "logits/rejected": 38590844.0, "logps/chosen": -421.98602294921875, "logps/rejected": -448.3902893066406, "loss": 0.0262, "rewards/chosen": 7.053119659423828, "rewards/margins": 16.789642333984375, "rewards/rejected": -9.736522674560547, "step": 2970 }, { "epoch": 0.8143072495546115, "grad_norm": 5.8125, "kl": 2.75313138961792, "learning_rate": 5e-06, "logits/chosen": -20057803.42857143, "logits/rejected": -20299928.0, "logps/chosen": -430.57861328125, "logps/rejected": -664.78720703125, "loss": 0.0146, "rewards/chosen": 8.344140189034599, "rewards/margins": 22.850966971261162, "rewards/rejected": -14.506826782226563, "step": 2971 }, { "epoch": 0.8145813347951213, "grad_norm": 2.734375, "kl": 0.10400199890136719, "learning_rate": 5e-06, "logits/chosen": -14997332.266666668, "logits/rejected": -30720679.111111112, "logps/chosen": -369.26438802083334, "logps/rejected": -573.2050238715278, "loss": 0.0233, "rewards/chosen": 6.856388854980469, "rewards/margins": 20.97302025689019, "rewards/rejected": -14.116631401909721, "step": 2972 }, { "epoch": 0.814855420035631, "grad_norm": 8.375, "kl": 19.15093231201172, "learning_rate": 5e-06, "logits/chosen": -19712149.818181816, "logits/rejected": 850596.3076923077, "logps/chosen": -545.1421342329545, "logps/rejected": -534.6522686298077, "loss": 0.0618, "rewards/chosen": 7.29572226784446, "rewards/margins": 17.60126660753797, "rewards/rejected": -10.30554433969351, "step": 2973 }, { "epoch": 0.8151295052761409, "grad_norm": 2.78125, "kl": 1.4147758483886719, "learning_rate": 5e-06, "logits/chosen": -17074224.0, "logits/rejected": -22739499.636363637, "logps/chosen": -496.0793644831731, "logps/rejected": -548.9930308948864, "loss": 0.0074, "rewards/chosen": 8.744147667518028, "rewards/margins": 19.73067970542641, "rewards/rejected": -10.98653203790838, "step": 2974 }, { "epoch": 0.8154035905166507, "grad_norm": 5.71875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -24467303.111111112, "logits/rejected": -28135136.0, "logps/chosen": -554.4782443576389, "logps/rejected": -570.5274739583333, "loss": 0.0331, "rewards/chosen": 6.034270392523871, "rewards/margins": 20.841996426052518, "rewards/rejected": -14.807726033528645, "step": 2975 }, { "epoch": 0.8156776757571604, "grad_norm": 10.4375, "kl": 7.5776286125183105, "learning_rate": 5e-06, "logits/chosen": -6791384.5, "logits/rejected": -25988052.0, "logps/chosen": -428.77789306640625, "logps/rejected": -496.07861328125, "loss": 0.0662, "rewards/chosen": 6.112661838531494, "rewards/margins": 20.003032207489014, "rewards/rejected": -13.89037036895752, "step": 2976 }, { "epoch": 0.8159517609976703, "grad_norm": 4.78125, "kl": 2.277632474899292, "learning_rate": 5e-06, "logits/chosen": 44741225.14285714, "logits/rejected": -17970971.2, "logps/chosen": -519.955810546875, "logps/rejected": -503.06640625, "loss": 0.0216, "rewards/chosen": 7.192353929792132, "rewards/margins": 20.492331041608537, "rewards/rejected": -13.299977111816407, "step": 2977 }, { "epoch": 0.8162258462381801, "grad_norm": 2.09375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -24008953.14285714, "logits/rejected": -26808568.470588237, "logps/chosen": -541.2309919084821, "logps/rejected": -439.03466796875, "loss": 0.0242, "rewards/chosen": 8.752831050327845, "rewards/margins": 19.973377997133912, "rewards/rejected": -11.220546946806067, "step": 2978 }, { "epoch": 0.8164999314786898, "grad_norm": 8.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -11756656.0, "logits/rejected": -31497262.769230768, "logps/chosen": -453.7935901988636, "logps/rejected": -653.7888371394231, "loss": 0.0462, "rewards/chosen": 7.102746443314985, "rewards/margins": 18.796562408233857, "rewards/rejected": -11.69381596491887, "step": 2979 }, { "epoch": 0.8167740167191997, "grad_norm": 9.0625, "kl": 10.110336303710938, "learning_rate": 5e-06, "logits/chosen": -15654452.0, "logits/rejected": -5168475.0, "logps/chosen": -431.0262145996094, "logps/rejected": -664.0042114257812, "loss": 0.1107, "rewards/chosen": 7.65463924407959, "rewards/margins": 23.007784843444824, "rewards/rejected": -15.353145599365234, "step": 2980 }, { "epoch": 0.8170481019597094, "grad_norm": 10.875, "kl": 4.859006404876709, "learning_rate": 5e-06, "logits/chosen": -7422132.363636363, "logits/rejected": -20626980.923076924, "logps/chosen": -387.1651722301136, "logps/rejected": -608.6954627403846, "loss": 0.0571, "rewards/chosen": 5.116176258433949, "rewards/margins": 17.466799756030102, "rewards/rejected": -12.350623497596153, "step": 2981 }, { "epoch": 0.8173221872002193, "grad_norm": 7.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -15055160.727272727, "logits/rejected": -13387702.153846154, "logps/chosen": -377.80411044034093, "logps/rejected": -437.5157001201923, "loss": 0.0244, "rewards/chosen": 7.635220614346591, "rewards/margins": 16.822064486416902, "rewards/rejected": -9.186843872070312, "step": 2982 }, { "epoch": 0.8175962724407291, "grad_norm": 3.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -27670130.666666668, "logits/rejected": -38964477.333333336, "logps/chosen": -429.8306477864583, "logps/rejected": -542.7388509114584, "loss": 0.0124, "rewards/chosen": 6.6639862060546875, "rewards/margins": 18.833267211914062, "rewards/rejected": -12.169281005859375, "step": 2983 }, { "epoch": 0.8178703576812388, "grad_norm": 1.9140625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -17262540.0, "logits/rejected": -2897438.0, "logps/chosen": -434.28564453125, "logps/rejected": -653.8324381510416, "loss": 0.006, "rewards/chosen": 7.121072769165039, "rewards/margins": 19.877484003702797, "rewards/rejected": -12.75641123453776, "step": 2984 }, { "epoch": 0.8181444429217487, "grad_norm": 4.15625, "kl": 7.859609603881836, "learning_rate": 5e-06, "logits/chosen": -10109862.153846154, "logits/rejected": -2652080.0, "logps/chosen": -307.12300931490387, "logps/rejected": -439.23885830965907, "loss": 0.0476, "rewards/chosen": 6.414438687838041, "rewards/margins": 16.3916926884151, "rewards/rejected": -9.97725400057706, "step": 2985 }, { "epoch": 0.8184185281622585, "grad_norm": 0.91015625, "kl": 1.9727885723114014, "learning_rate": 5e-06, "logits/chosen": -26265410.46153846, "logits/rejected": -14141396.363636363, "logps/chosen": -357.83743990384613, "logps/rejected": -534.8548029119319, "loss": 0.0026, "rewards/chosen": 7.879537729116587, "rewards/margins": 19.417453739192936, "rewards/rejected": -11.53791601007635, "step": 2986 }, { "epoch": 0.8186926134027682, "grad_norm": 4.875, "kl": 3.3638153076171875, "learning_rate": 5e-06, "logits/chosen": -31935040.0, "logits/rejected": -11557444.0, "logps/chosen": -514.8031005859375, "logps/rejected": -506.956787109375, "loss": 0.0148, "rewards/chosen": 8.161571502685547, "rewards/margins": 20.69153722127279, "rewards/rejected": -12.52996571858724, "step": 2987 }, { "epoch": 0.8189666986432781, "grad_norm": 6.96875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -26517702.4, "logits/rejected": -31571753.14285714, "logps/chosen": -354.1538330078125, "logps/rejected": -660.8715122767857, "loss": 0.0366, "rewards/chosen": 5.70169677734375, "rewards/margins": 20.402923148018974, "rewards/rejected": -14.701226370675224, "step": 2988 }, { "epoch": 0.8192407838837878, "grad_norm": 1.015625, "kl": 2.6749885082244873, "learning_rate": 5e-06, "logits/chosen": -4922218.4, "logits/rejected": 38612086.85714286, "logps/chosen": -384.48974609375, "logps/rejected": -654.6392996651786, "loss": 0.0019, "rewards/chosen": 8.687699890136718, "rewards/margins": 25.615123639787946, "rewards/rejected": -16.92742374965123, "step": 2989 }, { "epoch": 0.8195148691242976, "grad_norm": 10.125, "kl": 2.5866122245788574, "learning_rate": 5e-06, "logits/chosen": -10418299.636363637, "logits/rejected": -26673147.076923076, "logps/chosen": -418.6550958806818, "logps/rejected": -492.8913386418269, "loss": 0.0596, "rewards/chosen": 5.944646661931818, "rewards/margins": 17.797466251399968, "rewards/rejected": -11.85281958946815, "step": 2990 }, { "epoch": 0.8197889543648075, "grad_norm": 8.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 3995639.3333333335, "logits/rejected": -57402890.666666664, "logps/chosen": -452.4641520182292, "logps/rejected": -508.5266927083333, "loss": 0.0477, "rewards/chosen": 5.7850290934244795, "rewards/margins": 17.724191029866535, "rewards/rejected": -11.939161936442057, "step": 2991 }, { "epoch": 0.8200630396053172, "grad_norm": 7.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -42541933.333333336, "logits/rejected": -32869034.666666668, "logps/chosen": -343.7786051432292, "logps/rejected": -452.11317274305554, "loss": 0.052, "rewards/chosen": 6.888851801554362, "rewards/margins": 17.752580642700195, "rewards/rejected": -10.863728841145834, "step": 2992 }, { "epoch": 0.8203371248458271, "grad_norm": 3.828125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -18313526.0, "logits/rejected": -23776142.0, "logps/chosen": -462.21185302734375, "logps/rejected": -491.2329406738281, "loss": 0.0081, "rewards/chosen": 10.152057647705078, "rewards/margins": 23.600571632385254, "rewards/rejected": -13.448513984680176, "step": 2993 }, { "epoch": 0.8206112100863369, "grad_norm": 5.90625, "kl": 1.3726876974105835, "learning_rate": 5e-06, "logits/chosen": -16806562.90909091, "logits/rejected": -35680305.23076923, "logps/chosen": -400.30215731534093, "logps/rejected": -555.7129657451923, "loss": 0.0354, "rewards/chosen": 7.981561834161932, "rewards/margins": 24.826017259717823, "rewards/rejected": -16.84445542555589, "step": 2994 }, { "epoch": 0.8208852953268466, "grad_norm": 8.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -7523565.090909091, "logits/rejected": -24992110.769230768, "logps/chosen": -497.6539417613636, "logps/rejected": -513.3008563701923, "loss": 0.058, "rewards/chosen": 7.097227616743608, "rewards/margins": 19.831370213648658, "rewards/rejected": -12.734142596905048, "step": 2995 }, { "epoch": 0.8211593805673565, "grad_norm": 8.0625, "kl": 3.773115873336792, "learning_rate": 5e-06, "logits/chosen": -11801846.153846154, "logits/rejected": -40994144.0, "logps/chosen": -379.5383864182692, "logps/rejected": -623.8870738636364, "loss": 0.0451, "rewards/chosen": 5.752745995154748, "rewards/margins": 23.725475764774774, "rewards/rejected": -17.972729769620027, "step": 2996 }, { "epoch": 0.8214334658078662, "grad_norm": 0.796875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -28500652.307692308, "logits/rejected": -34337890.90909091, "logps/chosen": -404.8415715144231, "logps/rejected": -483.14990234375, "loss": 0.003, "rewards/chosen": 7.616997938889724, "rewards/margins": 21.894153034770405, "rewards/rejected": -14.277155095880682, "step": 2997 }, { "epoch": 0.821707551048376, "grad_norm": 2.5625, "kl": 2.7491111755371094, "learning_rate": 5e-06, "logits/chosen": -33156652.307692308, "logits/rejected": -17913588.363636363, "logps/chosen": -595.7551832932693, "logps/rejected": -584.4755415482955, "loss": 0.01, "rewards/chosen": 7.499713604266827, "rewards/margins": 19.729780077100635, "rewards/rejected": -12.230066472833807, "step": 2998 }, { "epoch": 0.8219816362888859, "grad_norm": 0.80859375, "kl": 3.7713470458984375, "learning_rate": 5e-06, "logits/chosen": -14506625.777777778, "logits/rejected": -20931946.666666668, "logps/chosen": -509.29448784722223, "logps/rejected": -545.9958984375, "loss": 0.0026, "rewards/chosen": 8.472345140245226, "rewards/margins": 21.816449144151477, "rewards/rejected": -13.34410400390625, "step": 2999 }, { "epoch": 0.8222557215293956, "grad_norm": 5.96875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -13271334.153846154, "logits/rejected": -21166800.0, "logps/chosen": -454.89637169471155, "logps/rejected": -572.3176935369319, "loss": 0.0228, "rewards/chosen": 6.069163395808293, "rewards/margins": 19.80192235133031, "rewards/rejected": -13.732758955522018, "step": 3000 }, { "epoch": 0.8225298067699054, "grad_norm": 11.0625, "kl": 3.4555444717407227, "learning_rate": 5e-06, "logits/chosen": -9246415.333333334, "logits/rejected": -27238733.333333332, "logps/chosen": -342.3016764322917, "logps/rejected": -799.7574055989584, "loss": 0.0712, "rewards/chosen": 4.938561121622722, "rewards/margins": 31.755221684773765, "rewards/rejected": -26.816660563151043, "step": 3001 }, { "epoch": 0.8228038920104153, "grad_norm": 9.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -32676265.14285714, "logits/rejected": -28839788.8, "logps/chosen": -377.77218191964283, "logps/rejected": -451.23876953125, "loss": 0.0246, "rewards/chosen": 6.521275111607143, "rewards/margins": 19.121326991489955, "rewards/rejected": -12.600051879882812, "step": 3002 }, { "epoch": 0.823077977250925, "grad_norm": 6.5625, "kl": 2.6847548484802246, "learning_rate": 5e-06, "logits/chosen": -13933280.0, "logits/rejected": -19975076.363636363, "logps/chosen": -417.23715444711536, "logps/rejected": -714.4341264204545, "loss": 0.0162, "rewards/chosen": 5.869915301983173, "rewards/margins": 20.264780858179904, "rewards/rejected": -14.394865556196732, "step": 3003 }, { "epoch": 0.8233520624914349, "grad_norm": 11.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -36323122.28571428, "logits/rejected": -23382881.88235294, "logps/chosen": -444.0166713169643, "logps/rejected": -601.1120749080883, "loss": 0.0536, "rewards/chosen": 5.901943751743862, "rewards/margins": 20.738505964519597, "rewards/rejected": -14.836562212775736, "step": 3004 }, { "epoch": 0.8236261477319446, "grad_norm": 8.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -19234588.307692308, "logits/rejected": 13941218.909090908, "logps/chosen": -371.7841796875, "logps/rejected": -729.6188299005681, "loss": 0.0466, "rewards/chosen": 5.383524968073918, "rewards/margins": 21.704433948009996, "rewards/rejected": -16.32090897993608, "step": 3005 }, { "epoch": 0.8239002329724544, "grad_norm": 8.9375, "kl": 22.187721252441406, "learning_rate": 5e-06, "logits/chosen": -34458960.0, "logits/rejected": -93212512.0, "logps/chosen": -416.24029541015625, "logps/rejected": -459.9315185546875, "loss": 0.0741, "rewards/chosen": 7.69056510925293, "rewards/margins": 21.524577140808105, "rewards/rejected": -13.834012031555176, "step": 3006 }, { "epoch": 0.8241743182129643, "grad_norm": 6.25, "kl": 5.665071964263916, "learning_rate": 5e-06, "logits/chosen": -39412770.13333333, "logits/rejected": -52291712.0, "logps/chosen": -563.7427734375, "logps/rejected": -495.03559027777777, "loss": 0.037, "rewards/chosen": 7.526112874348958, "rewards/margins": 19.877830505371094, "rewards/rejected": -12.351717631022135, "step": 3007 }, { "epoch": 0.824448403453474, "grad_norm": 13.4375, "kl": 1.7380092144012451, "learning_rate": 5e-06, "logits/chosen": -13707976.888888888, "logits/rejected": -34175042.666666664, "logps/chosen": -344.17811414930554, "logps/rejected": -337.23158772786456, "loss": 0.0785, "rewards/chosen": 6.57991706000434, "rewards/margins": 14.1486267513699, "rewards/rejected": -7.56870969136556, "step": 3008 }, { "epoch": 0.8247224886939838, "grad_norm": 1.828125, "kl": 3.364922523498535, "learning_rate": 5e-06, "logits/chosen": -38685673.84615385, "logits/rejected": -32130615.272727273, "logps/chosen": -495.51915564903845, "logps/rejected": -526.1516779119319, "loss": 0.0129, "rewards/chosen": 7.4232342059795675, "rewards/margins": 20.032634174907123, "rewards/rejected": -12.609399968927557, "step": 3009 }, { "epoch": 0.8249965739344937, "grad_norm": 6.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -41562493.333333336, "logits/rejected": -23333546.666666668, "logps/chosen": -474.4164225260417, "logps/rejected": -473.8439534505208, "loss": 0.0434, "rewards/chosen": 6.725715637207031, "rewards/margins": 18.683497111002602, "rewards/rejected": -11.957781473795572, "step": 3010 }, { "epoch": 0.8252706591750034, "grad_norm": 6.5, "kl": 10.892541885375977, "learning_rate": 5e-06, "logits/chosen": -24624758.153846152, "logits/rejected": -47294330.18181818, "logps/chosen": -334.49755859375, "logps/rejected": -592.3669211647727, "loss": 0.0211, "rewards/chosen": 7.3174297626201925, "rewards/margins": 22.280776764129424, "rewards/rejected": -14.963347001509232, "step": 3011 }, { "epoch": 0.8255447444155132, "grad_norm": 6.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -12417512.0, "logits/rejected": -12087810.666666666, "logps/chosen": -456.8170166015625, "logps/rejected": -549.5262451171875, "loss": 0.034, "rewards/chosen": 5.349393844604492, "rewards/margins": 15.826118469238281, "rewards/rejected": -10.476724624633789, "step": 3012 }, { "epoch": 0.825818829656023, "grad_norm": 7.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -709019.1428571428, "logits/rejected": -27893536.0, "logps/chosen": -385.78787667410717, "logps/rejected": -687.84482421875, "loss": 0.041, "rewards/chosen": 7.548550197056362, "rewards/margins": 19.257097407749722, "rewards/rejected": -11.708547210693359, "step": 3013 }, { "epoch": 0.8260929148965328, "grad_norm": 5.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -34967947.63636363, "logits/rejected": -22642870.153846152, "logps/chosen": -464.92622514204544, "logps/rejected": -526.8941556490385, "loss": 0.0187, "rewards/chosen": 8.208550886674361, "rewards/margins": 18.916415501307775, "rewards/rejected": -10.707864614633413, "step": 3014 }, { "epoch": 0.8263670001370427, "grad_norm": 9.25, "kl": 18.249536514282227, "learning_rate": 5e-06, "logits/chosen": -21478725.333333332, "logits/rejected": 3685354.6666666665, "logps/chosen": -430.40814887152777, "logps/rejected": -613.0563151041666, "loss": 0.1902, "rewards/chosen": 7.325974358452691, "rewards/margins": 20.398167504204643, "rewards/rejected": -13.072193145751953, "step": 3015 }, { "epoch": 0.8266410853775524, "grad_norm": 8.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -26915093.333333332, "logits/rejected": -38758842.666666664, "logps/chosen": -406.01025390625, "logps/rejected": -503.5823160807292, "loss": 0.0071, "rewards/chosen": 7.594118118286133, "rewards/margins": 17.072376251220703, "rewards/rejected": -9.47825813293457, "step": 3016 }, { "epoch": 0.8269151706180622, "grad_norm": 4.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -37154885.333333336, "logits/rejected": -29133109.333333332, "logps/chosen": -400.1703287760417, "logps/rejected": -558.37060546875, "loss": 0.026, "rewards/chosen": 8.294000625610352, "rewards/margins": 24.016097386678062, "rewards/rejected": -15.722096761067709, "step": 3017 }, { "epoch": 0.8271892558585721, "grad_norm": 2.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -19131637.333333332, "logits/rejected": -54270749.86666667, "logps/chosen": -460.15516493055554, "logps/rejected": -633.2216145833333, "loss": 0.0185, "rewards/chosen": 8.27043236626519, "rewards/margins": 24.44322289360894, "rewards/rejected": -16.17279052734375, "step": 3018 }, { "epoch": 0.8274633410990818, "grad_norm": 6.28125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -6286634.0, "logits/rejected": -12865810.0, "logps/chosen": -367.2996520996094, "logps/rejected": -427.8921203613281, "loss": 0.0417, "rewards/chosen": 6.118628978729248, "rewards/margins": 15.544469356536865, "rewards/rejected": -9.425840377807617, "step": 3019 }, { "epoch": 0.8277374263395916, "grad_norm": 10.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -17719185.454545453, "logits/rejected": -34162205.538461536, "logps/chosen": -416.25661399147725, "logps/rejected": -486.6545222355769, "loss": 0.0163, "rewards/chosen": 8.10207089510831, "rewards/margins": 19.112615892103502, "rewards/rejected": -11.010544996995192, "step": 3020 }, { "epoch": 0.8280115115801014, "grad_norm": 5.8125, "kl": 2.1949920654296875, "learning_rate": 5e-06, "logits/chosen": -7071946.133333334, "logits/rejected": -21650741.333333332, "logps/chosen": -403.3005859375, "logps/rejected": -589.8742404513889, "loss": 0.0436, "rewards/chosen": 6.872549438476563, "rewards/margins": 19.78074917263455, "rewards/rejected": -12.908199734157986, "step": 3021 }, { "epoch": 0.8282855968206112, "grad_norm": 3.796875, "kl": 0.720550537109375, "learning_rate": 5e-06, "logits/chosen": -22439385.14285714, "logits/rejected": -5794932.0, "logps/chosen": -465.42201450892856, "logps/rejected": -418.179833984375, "loss": 0.0125, "rewards/chosen": 8.110331399100167, "rewards/margins": 18.446474892752512, "rewards/rejected": -10.336143493652344, "step": 3022 }, { "epoch": 0.828559682061121, "grad_norm": 0.56640625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -7941271.5, "logits/rejected": -24546040.0, "logps/chosen": -441.7156982421875, "logps/rejected": -537.6048583984375, "loss": 0.0019, "rewards/chosen": 7.043392181396484, "rewards/margins": 16.71914291381836, "rewards/rejected": -9.675750732421875, "step": 3023 }, { "epoch": 0.8288337673016308, "grad_norm": 5.09375, "kl": 4.225397109985352, "learning_rate": 5e-06, "logits/chosen": -19710145.333333332, "logits/rejected": -22405608.0, "logps/chosen": -436.4909261067708, "logps/rejected": -432.8754069010417, "loss": 0.0622, "rewards/chosen": 6.837445576985677, "rewards/margins": 18.992212931315105, "rewards/rejected": -12.154767354329428, "step": 3024 }, { "epoch": 0.8291078525421406, "grad_norm": 3.703125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -15411497.846153846, "logits/rejected": -28873733.818181816, "logps/chosen": -352.4958683894231, "logps/rejected": -622.0582830255681, "loss": 0.0204, "rewards/chosen": 6.417608407827524, "rewards/margins": 22.74407436130764, "rewards/rejected": -16.326465953480113, "step": 3025 }, { "epoch": 0.8293819377826505, "grad_norm": 5.59375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -31078944.0, "logits/rejected": -10767376.888888888, "logps/chosen": -432.7640380859375, "logps/rejected": -520.0346137152778, "loss": 0.0308, "rewards/chosen": 8.68283208211263, "rewards/margins": 21.25079811943902, "rewards/rejected": -12.56796603732639, "step": 3026 }, { "epoch": 0.8296560230231602, "grad_norm": 0.75390625, "kl": 0.516302764415741, "learning_rate": 5e-06, "logits/chosen": -47501682.28571428, "logits/rejected": 6507841.6, "logps/chosen": -497.288330078125, "logps/rejected": -592.030029296875, "loss": 0.0026, "rewards/chosen": 7.8393723624093195, "rewards/margins": 21.473411669049945, "rewards/rejected": -13.634039306640625, "step": 3027 }, { "epoch": 0.82993010826367, "grad_norm": 2.890625, "kl": 2.3726038932800293, "learning_rate": 5e-06, "logits/chosen": -17046208.0, "logits/rejected": -29430978.90909091, "logps/chosen": -481.1198542668269, "logps/rejected": -459.96803977272725, "loss": 0.0095, "rewards/chosen": 8.42447486290565, "rewards/margins": 19.62772332038079, "rewards/rejected": -11.203248457475143, "step": 3028 }, { "epoch": 0.8302041935041798, "grad_norm": 3.09375, "kl": 2.9452414512634277, "learning_rate": 5e-06, "logits/chosen": -1516517.6666666667, "logits/rejected": -17427253.333333332, "logps/chosen": -418.7130533854167, "logps/rejected": -623.8433430989584, "loss": 0.0129, "rewards/chosen": 7.567324956258138, "rewards/margins": 22.01209831237793, "rewards/rejected": -14.444773356119791, "step": 3029 }, { "epoch": 0.8304782787446896, "grad_norm": 6.0625, "kl": 15.233776092529297, "learning_rate": 5e-06, "logits/chosen": -22980700.0, "logits/rejected": -31984944.0, "logps/chosen": -500.8255615234375, "logps/rejected": -640.25830078125, "loss": 0.0253, "rewards/chosen": 7.803154468536377, "rewards/margins": 23.059139728546143, "rewards/rejected": -15.255985260009766, "step": 3030 }, { "epoch": 0.8307523639851994, "grad_norm": 6.71875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -31734574.222222224, "logits/rejected": -38560904.53333333, "logps/chosen": -364.3068033854167, "logps/rejected": -520.7668619791667, "loss": 0.0293, "rewards/chosen": 6.086385515001085, "rewards/margins": 18.74494162665473, "rewards/rejected": -12.658556111653645, "step": 3031 }, { "epoch": 0.8310264492257092, "grad_norm": 4.75, "kl": 0.5097700953483582, "learning_rate": 5e-06, "logits/chosen": 21592496.0, "logits/rejected": -3650364.0, "logps/chosen": -422.1197916666667, "logps/rejected": -612.87646484375, "loss": 0.034, "rewards/chosen": 6.170874913533528, "rewards/margins": 19.015696843465168, "rewards/rejected": -12.84482192993164, "step": 3032 }, { "epoch": 0.831300534466219, "grad_norm": 11.0, "kl": 22.01634979248047, "learning_rate": 5e-06, "logits/chosen": -26520746.666666668, "logits/rejected": -109384533.33333333, "logps/chosen": -464.4757486979167, "logps/rejected": -576.4449055989584, "loss": 0.0449, "rewards/chosen": 8.26057603624132, "rewards/margins": 25.27828386094835, "rewards/rejected": -17.01770782470703, "step": 3033 }, { "epoch": 0.8315746197067287, "grad_norm": 5.90625, "kl": 7.900546073913574, "learning_rate": 5e-06, "logits/chosen": -17142154.0, "logits/rejected": -20218472.0, "logps/chosen": -453.17047119140625, "logps/rejected": -449.22503662109375, "loss": 0.0229, "rewards/chosen": 7.226002216339111, "rewards/margins": 19.34696054458618, "rewards/rejected": -12.12095832824707, "step": 3034 }, { "epoch": 0.8318487049472386, "grad_norm": 3.140625, "kl": 2.0312747955322266, "learning_rate": 5e-06, "logits/chosen": -19047562.181818184, "logits/rejected": -5740502.153846154, "logps/chosen": -571.6716086647727, "logps/rejected": -551.9929387019231, "loss": 0.0082, "rewards/chosen": 9.561365300958807, "rewards/margins": 22.819424289089817, "rewards/rejected": -13.25805898813101, "step": 3035 }, { "epoch": 0.8321227901877484, "grad_norm": 11.5, "kl": 5.546632289886475, "learning_rate": 5e-06, "logits/chosen": -18124913.333333332, "logits/rejected": 12932556.0, "logps/chosen": -473.492919921875, "logps/rejected": -596.8677978515625, "loss": 0.0572, "rewards/chosen": 7.352972666422526, "rewards/margins": 19.879491170247395, "rewards/rejected": -12.52651850382487, "step": 3036 }, { "epoch": 0.8323968754282582, "grad_norm": 3.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -18365442.46153846, "logits/rejected": -20020494.545454547, "logps/chosen": -265.99173677884613, "logps/rejected": -560.1779119318181, "loss": 0.024, "rewards/chosen": 5.462162898137019, "rewards/margins": 22.86968770060506, "rewards/rejected": -17.40752480246804, "step": 3037 }, { "epoch": 0.832670960668768, "grad_norm": 5.875, "kl": 1.8313071727752686, "learning_rate": 5e-06, "logits/chosen": -12845533.0, "logits/rejected": -9584748.0, "logps/chosen": -301.6040954589844, "logps/rejected": -399.22509765625, "loss": 0.0254, "rewards/chosen": 8.449505805969238, "rewards/margins": 17.41270923614502, "rewards/rejected": -8.963203430175781, "step": 3038 }, { "epoch": 0.8329450459092778, "grad_norm": 7.1875, "kl": 6.436845302581787, "learning_rate": 5e-06, "logits/chosen": -16256980.8, "logits/rejected": -29680500.57142857, "logps/chosen": -439.496044921875, "logps/rejected": -636.1413225446429, "loss": 0.0512, "rewards/chosen": 8.186920928955079, "rewards/margins": 22.471704428536555, "rewards/rejected": -14.284783499581474, "step": 3039 }, { "epoch": 0.8332191311497876, "grad_norm": 4.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -24463184.0, "logits/rejected": -22757821.333333332, "logps/chosen": -244.298583984375, "logps/rejected": -500.6197102864583, "loss": 0.0335, "rewards/chosen": 7.02651850382487, "rewards/margins": 18.286375681559246, "rewards/rejected": -11.259857177734375, "step": 3040 }, { "epoch": 0.8334932163902974, "grad_norm": 6.59375, "kl": 1.3487262725830078, "learning_rate": 5e-06, "logits/chosen": -2405308.3076923075, "logits/rejected": -1242392.1818181819, "logps/chosen": -376.45511568509613, "logps/rejected": -584.9209872159091, "loss": 0.0321, "rewards/chosen": 5.943089998685396, "rewards/margins": 18.79791787954477, "rewards/rejected": -12.854827880859375, "step": 3041 }, { "epoch": 0.8337673016308071, "grad_norm": 6.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 42279325.09090909, "logits/rejected": -22103008.0, "logps/chosen": -497.8772638494318, "logps/rejected": -527.8517878605769, "loss": 0.0327, "rewards/chosen": 5.84081407026811, "rewards/margins": 19.94207608949888, "rewards/rejected": -14.10126201923077, "step": 3042 }, { "epoch": 0.834041386871317, "grad_norm": 3.6875, "kl": 6.508357048034668, "learning_rate": 5e-06, "logits/chosen": -11366932.57142857, "logits/rejected": -15752358.4, "logps/chosen": -401.26157924107144, "logps/rejected": -519.65234375, "loss": 0.0407, "rewards/chosen": 7.058075496128628, "rewards/margins": 19.984448787144252, "rewards/rejected": -12.926373291015626, "step": 3043 }, { "epoch": 0.8343154721118268, "grad_norm": 14.0, "kl": 20.66639518737793, "learning_rate": 5e-06, "logits/chosen": -33352711.529411763, "logits/rejected": -16931529.14285714, "logps/chosen": -399.59880514705884, "logps/rejected": -435.5059291294643, "loss": 0.1235, "rewards/chosen": 6.529987110811121, "rewards/margins": 15.805530323701745, "rewards/rejected": -9.275543212890625, "step": 3044 }, { "epoch": 0.8345895573523365, "grad_norm": 0.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -16888092.8, "logits/rejected": -26430404.57142857, "logps/chosen": -443.18916015625, "logps/rejected": -714.5517578125, "loss": 0.0016, "rewards/chosen": 7.515119934082032, "rewards/margins": 25.505955941336495, "rewards/rejected": -17.990836007254465, "step": 3045 }, { "epoch": 0.8348636425928464, "grad_norm": 10.625, "kl": 1.4111175537109375, "learning_rate": 5e-06, "logits/chosen": -21507371.42857143, "logits/rejected": -35483654.4, "logps/chosen": -390.0736607142857, "logps/rejected": -466.40791015625, "loss": 0.0437, "rewards/chosen": 7.238246372767857, "rewards/margins": 18.959382084437777, "rewards/rejected": -11.721135711669922, "step": 3046 }, { "epoch": 0.8351377278333562, "grad_norm": 2.203125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -30678581.333333332, "logits/rejected": -23669696.0, "logps/chosen": -494.0610758463542, "logps/rejected": -434.3761393229167, "loss": 0.0052, "rewards/chosen": 8.71744155883789, "rewards/margins": 17.849876403808594, "rewards/rejected": -9.132434844970703, "step": 3047 }, { "epoch": 0.835411813073866, "grad_norm": 8.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -6896668.8, "logits/rejected": -16334980.57142857, "logps/chosen": -520.831640625, "logps/rejected": -471.01925223214283, "loss": 0.0227, "rewards/chosen": 7.882855224609375, "rewards/margins": 19.141199602399553, "rewards/rejected": -11.258344377790179, "step": 3048 }, { "epoch": 0.8356858983143758, "grad_norm": 2.71875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -12919424.0, "logits/rejected": 1396492.923076923, "logps/chosen": -420.57492897727275, "logps/rejected": -452.78763521634613, "loss": 0.0077, "rewards/chosen": 8.435423417524857, "rewards/margins": 20.586762434952742, "rewards/rejected": -12.151339017427885, "step": 3049 }, { "epoch": 0.8359599835548855, "grad_norm": 6.84375, "kl": 3.011685848236084, "learning_rate": 5e-06, "logits/chosen": -19498292.0, "logits/rejected": -13226466.666666666, "logps/chosen": -496.8500162760417, "logps/rejected": -461.4683837890625, "loss": 0.0228, "rewards/chosen": 6.569044748942058, "rewards/margins": 16.165627161661785, "rewards/rejected": -9.596582412719727, "step": 3050 }, { "epoch": 0.8362340687953954, "grad_norm": 8.5, "kl": 18.57172966003418, "learning_rate": 5e-06, "logits/chosen": -13390211.0, "logits/rejected": -27236180.0, "logps/chosen": -392.1568603515625, "logps/rejected": -676.2089233398438, "loss": 0.1019, "rewards/chosen": 6.96631383895874, "rewards/margins": 19.327109813690186, "rewards/rejected": -12.360795974731445, "step": 3051 }, { "epoch": 0.8365081540359052, "grad_norm": 9.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -21327148.444444444, "logits/rejected": -6490836.266666667, "logps/chosen": -489.9283854166667, "logps/rejected": -467.34609375, "loss": 0.0339, "rewards/chosen": 8.391920301649305, "rewards/margins": 16.810299004448787, "rewards/rejected": -8.41837870279948, "step": 3052 }, { "epoch": 0.8367822392764149, "grad_norm": 9.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -18816593.6, "logits/rejected": -19818409.14285714, "logps/chosen": -531.538232421875, "logps/rejected": -384.82742745535717, "loss": 0.0604, "rewards/chosen": 8.202149963378906, "rewards/margins": 19.898033578055244, "rewards/rejected": -11.695883614676339, "step": 3053 }, { "epoch": 0.8370563245169248, "grad_norm": 11.6875, "kl": 8.74416732788086, "learning_rate": 5e-06, "logits/chosen": -19952474.352941178, "logits/rejected": -9695203.42857143, "logps/chosen": -413.48779296875, "logps/rejected": -406.3557826450893, "loss": 0.0173, "rewards/chosen": 6.6592725865981155, "rewards/margins": 16.61778175931017, "rewards/rejected": -9.958509172712054, "step": 3054 }, { "epoch": 0.8373304097574346, "grad_norm": 3.703125, "kl": 2.7974658012390137, "learning_rate": 5e-06, "logits/chosen": -7206860.266666667, "logits/rejected": -7775173.333333333, "logps/chosen": -429.19567057291664, "logps/rejected": -840.0022786458334, "loss": 0.0127, "rewards/chosen": 7.719891357421875, "rewards/margins": 24.743105061848958, "rewards/rejected": -17.023213704427082, "step": 3055 }, { "epoch": 0.8376044949979443, "grad_norm": 6.15625, "kl": 1.8314399719238281, "learning_rate": 5e-06, "logits/chosen": -14122026.666666666, "logits/rejected": -18526740.0, "logps/chosen": -453.152099609375, "logps/rejected": -472.3063557942708, "loss": 0.0297, "rewards/chosen": 6.6902116139729815, "rewards/margins": 15.467811584472656, "rewards/rejected": -8.777599970499674, "step": 3056 }, { "epoch": 0.8378785802384542, "grad_norm": 1.3515625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -4429016.0, "logits/rejected": -32427625.14285714, "logps/chosen": -435.32255859375, "logps/rejected": -561.9446149553571, "loss": 0.0031, "rewards/chosen": 8.250381469726562, "rewards/margins": 22.660261971609934, "rewards/rejected": -14.40988050188337, "step": 3057 }, { "epoch": 0.8381526654789639, "grad_norm": 4.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -29103638.4, "logits/rejected": 6434453.714285715, "logps/chosen": -243.453662109375, "logps/rejected": -589.9854910714286, "loss": 0.0147, "rewards/chosen": 5.918161010742187, "rewards/margins": 17.81623774937221, "rewards/rejected": -11.898076738630023, "step": 3058 }, { "epoch": 0.8384267507194737, "grad_norm": 11.9375, "kl": 4.4419755935668945, "learning_rate": 5e-06, "logits/chosen": -26512069.818181816, "logits/rejected": -33338990.769230768, "logps/chosen": -348.7415216619318, "logps/rejected": -654.7435396634615, "loss": 0.0834, "rewards/chosen": 6.118644714355469, "rewards/margins": 17.93725057748648, "rewards/rejected": -11.81860586313101, "step": 3059 }, { "epoch": 0.8387008359599836, "grad_norm": 3.875, "kl": 4.617099285125732, "learning_rate": 5e-06, "logits/chosen": 1795476.3333333333, "logits/rejected": -29624162.666666668, "logps/chosen": -505.0414225260417, "logps/rejected": -549.8516438802084, "loss": 0.0253, "rewards/chosen": 7.70277214050293, "rewards/margins": 18.01624870300293, "rewards/rejected": -10.3134765625, "step": 3060 }, { "epoch": 0.8389749212004933, "grad_norm": 0.71484375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -9191349.333333334, "logits/rejected": -9474812.0, "logps/chosen": -465.8145345052083, "logps/rejected": -460.6668294270833, "loss": 0.0019, "rewards/chosen": 7.733978271484375, "rewards/margins": 20.298812866210938, "rewards/rejected": -12.564834594726562, "step": 3061 }, { "epoch": 0.8392490064410032, "grad_norm": 7.28125, "kl": 22.34404182434082, "learning_rate": 5e-06, "logits/chosen": -14909174.222222222, "logits/rejected": 23827794.666666668, "logps/chosen": -479.63878038194446, "logps/rejected": -405.5511067708333, "loss": 0.026, "rewards/chosen": 9.158312479654947, "rewards/margins": 17.65397771199544, "rewards/rejected": -8.495665232340494, "step": 3062 }, { "epoch": 0.839523091681513, "grad_norm": 0.9453125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 366574.0, "logits/rejected": -26590064.0, "logps/chosen": -308.0875244140625, "logps/rejected": -484.8311767578125, "loss": 0.0031, "rewards/chosen": 6.467896938323975, "rewards/margins": 19.98842477798462, "rewards/rejected": -13.520527839660645, "step": 3063 }, { "epoch": 0.8397971769220227, "grad_norm": 3.15625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -13692866.0, "logits/rejected": -28267856.0, "logps/chosen": -246.69561767578125, "logps/rejected": -529.53369140625, "loss": 0.0422, "rewards/chosen": 4.373522758483887, "rewards/margins": 15.495678520202636, "rewards/rejected": -11.12215576171875, "step": 3064 }, { "epoch": 0.8400712621625326, "grad_norm": 1.640625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -34292913.45454545, "logits/rejected": -27002345.846153848, "logps/chosen": -398.77596768465907, "logps/rejected": -854.076171875, "loss": 0.0057, "rewards/chosen": 6.775479403409091, "rewards/margins": 28.783381108637457, "rewards/rejected": -22.007901705228367, "step": 3065 }, { "epoch": 0.8403453474030423, "grad_norm": 3.484375, "kl": 1.7490642070770264, "learning_rate": 5e-06, "logits/chosen": -9889444.57142857, "logits/rejected": -39453494.4, "logps/chosen": -426.81919642857144, "logps/rejected": -494.97197265625, "loss": 0.0106, "rewards/chosen": 6.740482330322266, "rewards/margins": 17.697501373291015, "rewards/rejected": -10.95701904296875, "step": 3066 }, { "epoch": 0.8406194326435521, "grad_norm": 8.1875, "kl": 11.288844108581543, "learning_rate": 5e-06, "logits/chosen": -30668856.0, "logits/rejected": -12298583.0, "logps/chosen": -368.17974853515625, "logps/rejected": -406.4898986816406, "loss": 0.04, "rewards/chosen": 7.550025939941406, "rewards/margins": 15.645768165588379, "rewards/rejected": -8.095742225646973, "step": 3067 }, { "epoch": 0.840893517884062, "grad_norm": 5.5625, "kl": 5.142234802246094, "learning_rate": 5e-06, "logits/chosen": -19833974.85714286, "logits/rejected": -41369296.0, "logps/chosen": -396.5244838169643, "logps/rejected": -566.86943359375, "loss": 0.0153, "rewards/chosen": 7.580681392124721, "rewards/margins": 18.8343015398298, "rewards/rejected": -11.253620147705078, "step": 3068 }, { "epoch": 0.8411676031245717, "grad_norm": 12.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -11441974.857142856, "logits/rejected": -24155367.529411763, "logps/chosen": -371.08279854910717, "logps/rejected": -602.0854204963235, "loss": 0.0221, "rewards/chosen": 6.125100816999163, "rewards/margins": 16.54176628689806, "rewards/rejected": -10.416665469898897, "step": 3069 }, { "epoch": 0.8414416883650815, "grad_norm": 2.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -14246555.636363637, "logits/rejected": -828224.0, "logps/chosen": -420.15505149147725, "logps/rejected": -587.7573617788462, "loss": 0.0216, "rewards/chosen": 7.307892539284446, "rewards/margins": 19.32488613528805, "rewards/rejected": -12.016993596003605, "step": 3070 }, { "epoch": 0.8417157736055914, "grad_norm": 13.625, "kl": 2.390592098236084, "learning_rate": 5e-06, "logits/chosen": -12867076.363636363, "logits/rejected": -16918872.615384616, "logps/chosen": -386.31942471590907, "logps/rejected": -512.8753756009615, "loss": 0.0289, "rewards/chosen": 7.828771417791193, "rewards/margins": 17.764977941979893, "rewards/rejected": -9.936206524188702, "step": 3071 }, { "epoch": 0.8419898588461011, "grad_norm": 8.5625, "kl": 3.0827701091766357, "learning_rate": 5e-06, "logits/chosen": -29878467.76470588, "logits/rejected": -62485869.71428572, "logps/chosen": -457.69528377757354, "logps/rejected": -699.1671316964286, "loss": 0.0224, "rewards/chosen": 6.525408576516544, "rewards/margins": 22.30379646365382, "rewards/rejected": -15.778387887137276, "step": 3072 }, { "epoch": 0.842263944086611, "grad_norm": 11.5, "kl": 0.9078814387321472, "learning_rate": 5e-06, "logits/chosen": 28891313.454545453, "logits/rejected": -23467564.307692308, "logps/chosen": -579.8463689630681, "logps/rejected": -542.5525841346154, "loss": 0.0609, "rewards/chosen": 7.431067033247515, "rewards/margins": 18.34307077047708, "rewards/rejected": -10.912003737229567, "step": 3073 }, { "epoch": 0.8425380293271207, "grad_norm": 9.625, "kl": 0.6521136164665222, "learning_rate": 5e-06, "logits/chosen": -21882285.09090909, "logits/rejected": -26641368.615384616, "logps/chosen": -419.8014026988636, "logps/rejected": -489.61512169471155, "loss": 0.0516, "rewards/chosen": 6.044830322265625, "rewards/margins": 13.249610314002403, "rewards/rejected": -7.204779991736779, "step": 3074 }, { "epoch": 0.8428121145676305, "grad_norm": 3.734375, "kl": 4.637363433837891, "learning_rate": 5e-06, "logits/chosen": -15307318.857142856, "logits/rejected": -21432958.4, "logps/chosen": -388.19876534598217, "logps/rejected": -507.987255859375, "loss": 0.0082, "rewards/chosen": 8.144717625209264, "rewards/margins": 20.267010716029574, "rewards/rejected": -12.122293090820312, "step": 3075 }, { "epoch": 0.8430861998081404, "grad_norm": 14.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -10638329.333333334, "logits/rejected": -25476293.333333332, "logps/chosen": -363.1772867838542, "logps/rejected": -733.0882975260416, "loss": 0.0274, "rewards/chosen": 6.638240814208984, "rewards/margins": 21.056870778401695, "rewards/rejected": -14.418629964192709, "step": 3076 }, { "epoch": 0.8433602850486501, "grad_norm": 2.546875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -19574722.90909091, "logits/rejected": -27584241.230769232, "logps/chosen": -335.4595392400568, "logps/rejected": -635.2331730769231, "loss": 0.0073, "rewards/chosen": 7.1116111061789775, "rewards/margins": 20.460705203609866, "rewards/rejected": -13.34909409743089, "step": 3077 }, { "epoch": 0.8436343702891599, "grad_norm": 5.1875, "kl": 1.217511534690857, "learning_rate": 5e-06, "logits/chosen": -39182759.384615384, "logits/rejected": -23247138.90909091, "logps/chosen": -418.0406024639423, "logps/rejected": -407.8876953125, "loss": 0.0261, "rewards/chosen": 6.910495464618389, "rewards/margins": 19.0824037565218, "rewards/rejected": -12.171908291903408, "step": 3078 }, { "epoch": 0.8439084555296698, "grad_norm": 10.25, "kl": 0.9199041128158569, "learning_rate": 5e-06, "logits/chosen": 11473785.333333334, "logits/rejected": -14288677.333333334, "logps/chosen": -354.9694010416667, "logps/rejected": -450.8922526041667, "loss": 0.0381, "rewards/chosen": 8.13098398844401, "rewards/margins": 17.73423131306966, "rewards/rejected": -9.60324732462565, "step": 3079 }, { "epoch": 0.8441825407701795, "grad_norm": 3.421875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -30986025.14285714, "logits/rejected": -19516153.6, "logps/chosen": -438.04603794642856, "logps/rejected": -589.60009765625, "loss": 0.0204, "rewards/chosen": 7.249068123953683, "rewards/margins": 22.8684086390904, "rewards/rejected": -15.619340515136718, "step": 3080 }, { "epoch": 0.8444566260106893, "grad_norm": 7.875, "kl": 2.502382278442383, "learning_rate": 5e-06, "logits/chosen": -30657810.285714287, "logits/rejected": -3767464.8, "logps/chosen": -434.96641322544644, "logps/rejected": -495.822705078125, "loss": 0.0195, "rewards/chosen": 6.673041752406529, "rewards/margins": 16.822977665492466, "rewards/rejected": -10.149935913085937, "step": 3081 }, { "epoch": 0.8447307112511991, "grad_norm": 0.1904296875, "kl": 0.11779403686523438, "learning_rate": 5e-06, "logits/chosen": 6673351.428571428, "logits/rejected": -22640809.411764707, "logps/chosen": -528.6954868861607, "logps/rejected": -580.1098920036765, "loss": 0.0004, "rewards/chosen": 9.494199480329241, "rewards/margins": 23.340504878709297, "rewards/rejected": -13.846305398380055, "step": 3082 }, { "epoch": 0.8450047964917089, "grad_norm": 4.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -38363575.27272727, "logits/rejected": -28535645.53846154, "logps/chosen": -414.1638849431818, "logps/rejected": -637.3389423076923, "loss": 0.0315, "rewards/chosen": 6.199623107910156, "rewards/margins": 20.514318026029144, "rewards/rejected": -14.31469491811899, "step": 3083 }, { "epoch": 0.8452788817322188, "grad_norm": 12.3125, "kl": 6.549201965332031, "learning_rate": 5e-06, "logits/chosen": -11166612.0, "logits/rejected": -24568248.0, "logps/chosen": -460.3223876953125, "logps/rejected": -419.9205017089844, "loss": 0.0561, "rewards/chosen": 7.37081241607666, "rewards/margins": 20.564130783081055, "rewards/rejected": -13.193318367004395, "step": 3084 }, { "epoch": 0.8455529669727285, "grad_norm": 14.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -7561457.454545454, "logits/rejected": -26027694.769230768, "logps/chosen": -397.4021661931818, "logps/rejected": -554.9938401442307, "loss": 0.0325, "rewards/chosen": 6.769062389026988, "rewards/margins": 18.391673801662204, "rewards/rejected": -11.622611412635216, "step": 3085 }, { "epoch": 0.8458270522132383, "grad_norm": 12.25, "kl": 5.2133684158325195, "learning_rate": 5e-06, "logits/chosen": -6008286.666666667, "logits/rejected": -23430442.666666668, "logps/chosen": -413.4817301432292, "logps/rejected": -551.6761067708334, "loss": 0.0538, "rewards/chosen": 6.985700607299805, "rewards/margins": 21.098557154337563, "rewards/rejected": -14.11285654703776, "step": 3086 }, { "epoch": 0.8461011374537482, "grad_norm": 0.8359375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -31946699.636363637, "logits/rejected": -31609016.615384616, "logps/chosen": -386.931640625, "logps/rejected": -600.5295973557693, "loss": 0.0031, "rewards/chosen": 8.313458529385654, "rewards/margins": 21.294178462528684, "rewards/rejected": -12.980719933143028, "step": 3087 }, { "epoch": 0.8463752226942579, "grad_norm": 11.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -10515778.666666666, "logits/rejected": -32725318.4, "logps/chosen": -534.7169053819445, "logps/rejected": -499.6363932291667, "loss": 0.0494, "rewards/chosen": 6.4605907864040795, "rewards/margins": 16.03490922715929, "rewards/rejected": -9.574318440755208, "step": 3088 }, { "epoch": 0.8466493079347677, "grad_norm": 2.15625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -35996930.90909091, "logits/rejected": -21101435.076923076, "logps/chosen": -371.55599698153407, "logps/rejected": -696.1105769230769, "loss": 0.0087, "rewards/chosen": 6.59302451393821, "rewards/margins": 22.514812656215856, "rewards/rejected": -15.921788142277645, "step": 3089 }, { "epoch": 0.8469233931752775, "grad_norm": 8.3125, "kl": 4.387738227844238, "learning_rate": 5e-06, "logits/chosen": -4853394.0, "logits/rejected": -1191819.25, "logps/chosen": -389.4585266113281, "logps/rejected": -627.125244140625, "loss": 0.0329, "rewards/chosen": 6.736742973327637, "rewards/margins": 20.03480625152588, "rewards/rejected": -13.298063278198242, "step": 3090 }, { "epoch": 0.8471974784157873, "grad_norm": 0.99609375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -9862889.142857144, "logits/rejected": -33099241.411764707, "logps/chosen": -317.41312081473217, "logps/rejected": -531.2087545955883, "loss": 0.0028, "rewards/chosen": 6.845756530761719, "rewards/margins": 21.04601557114545, "rewards/rejected": -14.200259040383731, "step": 3091 }, { "epoch": 0.8474715636562971, "grad_norm": 6.28125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -32487371.636363637, "logits/rejected": 4699779.692307692, "logps/chosen": -394.5779918323864, "logps/rejected": -506.43160306490387, "loss": 0.0171, "rewards/chosen": 7.384977860884233, "rewards/margins": 19.953520341352984, "rewards/rejected": -12.56854248046875, "step": 3092 }, { "epoch": 0.8477456488968069, "grad_norm": 5.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -20050619.2, "logits/rejected": -29577273.14285714, "logps/chosen": -317.384375, "logps/rejected": -640.1764787946429, "loss": 0.052, "rewards/chosen": 5.771800994873047, "rewards/margins": 19.79521015712193, "rewards/rejected": -14.023409162248884, "step": 3093 }, { "epoch": 0.8480197341373167, "grad_norm": 3.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -23031889.454545453, "logits/rejected": -12870902.153846154, "logps/chosen": -403.13285688920456, "logps/rejected": -669.9945913461538, "loss": 0.0195, "rewards/chosen": 7.429039001464844, "rewards/margins": 23.723463205190804, "rewards/rejected": -16.29442420372596, "step": 3094 }, { "epoch": 0.8482938193778266, "grad_norm": 13.1875, "kl": 0.3967437744140625, "learning_rate": 5e-06, "logits/chosen": -43288812.307692304, "logits/rejected": -17007645.09090909, "logps/chosen": -399.0477764423077, "logps/rejected": -649.7324662642045, "loss": 0.0449, "rewards/chosen": 7.99859853891226, "rewards/margins": 20.930115706437117, "rewards/rejected": -12.931517167524857, "step": 3095 }, { "epoch": 0.8485679046183363, "grad_norm": 2.703125, "kl": 1.8036375045776367, "learning_rate": 5e-06, "logits/chosen": -15689918.76923077, "logits/rejected": 98055.27272727272, "logps/chosen": -365.0935246394231, "logps/rejected": -513.6486150568181, "loss": 0.0151, "rewards/chosen": 7.8542327880859375, "rewards/margins": 20.195589932528407, "rewards/rejected": -12.34135714444247, "step": 3096 }, { "epoch": 0.8488419898588461, "grad_norm": 1.4921875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -10879709.090909092, "logits/rejected": -13912866.461538462, "logps/chosen": -426.9705699573864, "logps/rejected": -659.5117938701923, "loss": 0.0051, "rewards/chosen": 8.907901417125355, "rewards/margins": 21.671243467531006, "rewards/rejected": -12.76334205040565, "step": 3097 }, { "epoch": 0.8491160750993559, "grad_norm": 4.03125, "kl": 11.636991500854492, "learning_rate": 5e-06, "logits/chosen": -14825554.0, "logits/rejected": -24071158.0, "logps/chosen": -443.99652099609375, "logps/rejected": -468.216064453125, "loss": 0.0163, "rewards/chosen": 7.367401599884033, "rewards/margins": 23.78453493118286, "rewards/rejected": -16.417133331298828, "step": 3098 }, { "epoch": 0.8493901603398657, "grad_norm": 3.265625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -20810731.2, "logits/rejected": -20403241.14285714, "logps/chosen": -378.8668701171875, "logps/rejected": -654.5641043526786, "loss": 0.0151, "rewards/chosen": 5.586615371704101, "rewards/margins": 20.715541240147182, "rewards/rejected": -15.12892586844308, "step": 3099 }, { "epoch": 0.8496642455803755, "grad_norm": 2.796875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -26648993.6, "logits/rejected": -43092086.85714286, "logps/chosen": -471.05146484375, "logps/rejected": -594.1985909598214, "loss": 0.0074, "rewards/chosen": 7.9186454772949215, "rewards/margins": 23.15072293962751, "rewards/rejected": -15.232077462332589, "step": 3100 }, { "epoch": 0.8499383308208853, "grad_norm": 0.76953125, "kl": 2.6660983562469482, "learning_rate": 5e-06, "logits/chosen": -24183502.545454547, "logits/rejected": -30941809.230769232, "logps/chosen": -461.19948508522725, "logps/rejected": -573.4504206730769, "loss": 0.0021, "rewards/chosen": 7.690655795010653, "rewards/margins": 22.269757357510652, "rewards/rejected": -14.5791015625, "step": 3101 }, { "epoch": 0.8502124160613951, "grad_norm": 4.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -20809545.846153848, "logits/rejected": -29488189.09090909, "logps/chosen": -339.45755709134613, "logps/rejected": -572.6257990056819, "loss": 0.0503, "rewards/chosen": 5.027592585637019, "rewards/margins": 17.117981490555344, "rewards/rejected": -12.090388904918324, "step": 3102 }, { "epoch": 0.8504865013019048, "grad_norm": 2.484375, "kl": 2.0258681774139404, "learning_rate": 5e-06, "logits/chosen": -50655227.07692308, "logits/rejected": -14660709.818181818, "logps/chosen": -443.65613731971155, "logps/rejected": -512.1215376420455, "loss": 0.0078, "rewards/chosen": 9.191454373873198, "rewards/margins": 19.952615484491098, "rewards/rejected": -10.761161110617898, "step": 3103 }, { "epoch": 0.8507605865424147, "grad_norm": 6.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -33246212.923076924, "logits/rejected": -4936373.818181818, "logps/chosen": -421.37635216346155, "logps/rejected": -431.25532670454544, "loss": 0.0106, "rewards/chosen": 6.987131558931791, "rewards/margins": 18.578139138388465, "rewards/rejected": -11.591007579456676, "step": 3104 }, { "epoch": 0.8510346717829245, "grad_norm": 2.390625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -27775315.555555556, "logits/rejected": -33531061.333333332, "logps/chosen": -352.8411458333333, "logps/rejected": -497.7388671875, "loss": 0.0105, "rewards/chosen": 6.689057244194879, "rewards/margins": 21.21391075981988, "rewards/rejected": -14.524853515625, "step": 3105 }, { "epoch": 0.8513087570234343, "grad_norm": 1.4609375, "kl": 1.1898086071014404, "learning_rate": 5e-06, "logits/chosen": -32236674.666666668, "logits/rejected": -42364634.666666664, "logps/chosen": -481.753173828125, "logps/rejected": -533.3761393229166, "loss": 0.0039, "rewards/chosen": 7.307053883870442, "rewards/margins": 21.185949961344402, "rewards/rejected": -13.878896077473959, "step": 3106 }, { "epoch": 0.8515828422639441, "grad_norm": 6.34375, "kl": 3.7332167625427246, "learning_rate": 5e-06, "logits/chosen": -52207282.28571428, "logits/rejected": -39070556.8, "logps/chosen": -455.34165736607144, "logps/rejected": -525.227099609375, "loss": 0.0175, "rewards/chosen": 7.6496462140764505, "rewards/margins": 20.85549795968192, "rewards/rejected": -13.205851745605468, "step": 3107 }, { "epoch": 0.8518569275044539, "grad_norm": 3.53125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -10813788.307692308, "logits/rejected": -29397329.454545453, "logps/chosen": -424.50390625, "logps/rejected": -649.2403231534091, "loss": 0.0116, "rewards/chosen": 7.314260629507212, "rewards/margins": 21.91667484736943, "rewards/rejected": -14.602414217862217, "step": 3108 }, { "epoch": 0.8521310127449637, "grad_norm": 1.1015625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -22024404.57142857, "logits/rejected": 10977829.6, "logps/chosen": -428.00411551339283, "logps/rejected": -651.43603515625, "loss": 0.0023, "rewards/chosen": 7.9311948503766745, "rewards/margins": 26.417135402134488, "rewards/rejected": -18.485940551757814, "step": 3109 }, { "epoch": 0.8524050979854735, "grad_norm": 14.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -10810633.333333334, "logits/rejected": -21717540.0, "logps/chosen": -420.2162679036458, "logps/rejected": -614.1562093098959, "loss": 0.0212, "rewards/chosen": 6.838919321695964, "rewards/margins": 19.995733896891277, "rewards/rejected": -13.156814575195312, "step": 3110 }, { "epoch": 0.8526791832259832, "grad_norm": 3.90625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -27217936.0, "logits/rejected": -29254381.333333332, "logps/chosen": -428.1899820963542, "logps/rejected": -590.825439453125, "loss": 0.0134, "rewards/chosen": 7.5669294993082685, "rewards/margins": 21.324534734090168, "rewards/rejected": -13.7576052347819, "step": 3111 }, { "epoch": 0.8529532684664931, "grad_norm": 6.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -20409262.4, "logits/rejected": -25153888.0, "logps/chosen": -483.16396484375, "logps/rejected": -507.36635044642856, "loss": 0.0126, "rewards/chosen": 6.837347412109375, "rewards/margins": 18.857914079938617, "rewards/rejected": -12.020566667829241, "step": 3112 }, { "epoch": 0.8532273537070029, "grad_norm": 2.640625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -19209690.666666668, "logits/rejected": -26658605.333333332, "logps/chosen": -392.3487548828125, "logps/rejected": -489.9335123697917, "loss": 0.0077, "rewards/chosen": 7.715457916259766, "rewards/margins": 19.263033548990883, "rewards/rejected": -11.54757563273112, "step": 3113 }, { "epoch": 0.8535014389475126, "grad_norm": 10.9375, "kl": 11.654260635375977, "learning_rate": 5e-06, "logits/chosen": -1582786.3529411764, "logits/rejected": -34971282.28571428, "logps/chosen": -356.0228630514706, "logps/rejected": -766.1545758928571, "loss": 0.0609, "rewards/chosen": 6.07913522159352, "rewards/margins": 25.94732544201763, "rewards/rejected": -19.868190220424108, "step": 3114 }, { "epoch": 0.8537755241880225, "grad_norm": 1.734375, "kl": 4.872922420501709, "learning_rate": 5e-06, "logits/chosen": -28412172.8, "logits/rejected": -47193112.88888889, "logps/chosen": -415.07490234375, "logps/rejected": -636.1629774305555, "loss": 0.0044, "rewards/chosen": 8.083002726236979, "rewards/margins": 24.17129347059462, "rewards/rejected": -16.08829074435764, "step": 3115 }, { "epoch": 0.8540496094285323, "grad_norm": 3.171875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -22321316.57142857, "logits/rejected": 60744953.6, "logps/chosen": -531.13720703125, "logps/rejected": -517.55654296875, "loss": 0.0205, "rewards/chosen": 7.566799708775112, "rewards/margins": 23.553506251743862, "rewards/rejected": -15.98670654296875, "step": 3116 }, { "epoch": 0.8543236946690421, "grad_norm": 4.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -29575621.333333332, "logits/rejected": -10930774.666666666, "logps/chosen": -403.9887288411458, "logps/rejected": -495.2403157552083, "loss": 0.0278, "rewards/chosen": 8.31807772318522, "rewards/margins": 19.368268966674805, "rewards/rejected": -11.050191243489584, "step": 3117 }, { "epoch": 0.8545977799095519, "grad_norm": 4.8125, "kl": 2.9580702781677246, "learning_rate": 5e-06, "logits/chosen": -25514569.14285714, "logits/rejected": -29190674.82352941, "logps/chosen": -288.1796875, "logps/rejected": -533.0337775735294, "loss": 0.0333, "rewards/chosen": 5.717225211007254, "rewards/margins": 17.148133414132253, "rewards/rejected": -11.430908203125, "step": 3118 }, { "epoch": 0.8548718651500616, "grad_norm": 8.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -20059153.230769232, "logits/rejected": -26059514.181818184, "logps/chosen": -442.05419921875, "logps/rejected": -617.9793590198864, "loss": 0.0263, "rewards/chosen": 7.17840810922476, "rewards/margins": 19.882402860201324, "rewards/rejected": -12.703994750976562, "step": 3119 }, { "epoch": 0.8551459503905715, "grad_norm": 8.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -20584816.0, "logits/rejected": -37428244.571428575, "logps/chosen": -313.864892578125, "logps/rejected": -590.0470145089286, "loss": 0.0474, "rewards/chosen": 5.549734497070313, "rewards/margins": 17.89878147670201, "rewards/rejected": -12.349046979631696, "step": 3120 }, { "epoch": 0.8554200356310813, "grad_norm": 5.96875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -33279595.636363637, "logits/rejected": -23139037.53846154, "logps/chosen": -417.96981534090907, "logps/rejected": -510.640625, "loss": 0.037, "rewards/chosen": 7.879395224831321, "rewards/margins": 18.773553221375792, "rewards/rejected": -10.894157996544472, "step": 3121 }, { "epoch": 0.855694120871591, "grad_norm": 4.375, "kl": 1.5423177480697632, "learning_rate": 5e-06, "logits/chosen": -13259709.090909092, "logits/rejected": -11503670.153846154, "logps/chosen": -337.0555974786932, "logps/rejected": -472.82861328125, "loss": 0.0211, "rewards/chosen": 8.492847095836293, "rewards/margins": 20.74386473969146, "rewards/rejected": -12.251017643855167, "step": 3122 }, { "epoch": 0.8559682061121009, "grad_norm": 0.283203125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -18793049.333333332, "logits/rejected": -23410131.555555556, "logps/chosen": -410.7428792317708, "logps/rejected": -563.9857855902778, "loss": 0.001, "rewards/chosen": 7.731421788533528, "rewards/margins": 21.493186314900715, "rewards/rejected": -13.761764526367188, "step": 3123 }, { "epoch": 0.8562422913526107, "grad_norm": 11.3125, "kl": 4.457822322845459, "learning_rate": 5e-06, "logits/chosen": -6833095.0, "logits/rejected": -16939434.0, "logps/chosen": -447.315185546875, "logps/rejected": -516.1588134765625, "loss": 0.0336, "rewards/chosen": 6.6086530685424805, "rewards/margins": 18.456658363342285, "rewards/rejected": -11.848005294799805, "step": 3124 }, { "epoch": 0.8565163765931204, "grad_norm": 1.1796875, "kl": 1.7146899700164795, "learning_rate": 5e-06, "logits/chosen": -3254795.7333333334, "logits/rejected": -22223992.888888888, "logps/chosen": -519.4892578125, "logps/rejected": -684.8978949652778, "loss": 0.0046, "rewards/chosen": 8.007538859049479, "rewards/margins": 20.07860327826606, "rewards/rejected": -12.07106441921658, "step": 3125 }, { "epoch": 0.8567904618336303, "grad_norm": 1.5390625, "kl": 2.8508338928222656, "learning_rate": 5e-06, "logits/chosen": -16884103.384615384, "logits/rejected": 34106082.90909091, "logps/chosen": -438.88423978365387, "logps/rejected": -727.5217507102273, "loss": 0.0031, "rewards/chosen": 7.608528724083533, "rewards/margins": 25.41145175320285, "rewards/rejected": -17.802923029119317, "step": 3126 }, { "epoch": 0.85706454707414, "grad_norm": 3.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 1946219.7142857143, "logits/rejected": -19757805.17647059, "logps/chosen": -471.7340611049107, "logps/rejected": -707.8141659007352, "loss": 0.0041, "rewards/chosen": 8.758477347237724, "rewards/margins": 24.817720942136624, "rewards/rejected": -16.0592435948989, "step": 3127 }, { "epoch": 0.8573386323146499, "grad_norm": 3.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -26570193.066666666, "logits/rejected": 57251669.333333336, "logps/chosen": -320.12916666666666, "logps/rejected": -653.3949110243055, "loss": 0.0359, "rewards/chosen": 4.796486409505208, "rewards/margins": 21.293765258789065, "rewards/rejected": -16.497278849283855, "step": 3128 }, { "epoch": 0.8576127175551597, "grad_norm": 5.6875, "kl": 2.9260001182556152, "learning_rate": 5e-06, "logits/chosen": 4089326.153846154, "logits/rejected": -58176384.0, "logps/chosen": -399.4011042668269, "logps/rejected": -687.2548828125, "loss": 0.024, "rewards/chosen": 8.142434927133413, "rewards/margins": 25.684923718859267, "rewards/rejected": -17.54248879172585, "step": 3129 }, { "epoch": 0.8578868027956694, "grad_norm": 8.375, "kl": 9.691754341125488, "learning_rate": 5e-06, "logits/chosen": -6998535.05882353, "logits/rejected": -32603332.57142857, "logps/chosen": -395.7469267003676, "logps/rejected": -570.6534598214286, "loss": 0.0577, "rewards/chosen": 7.351587632123162, "rewards/margins": 21.267582548766576, "rewards/rejected": -13.915994916643415, "step": 3130 }, { "epoch": 0.8581608880361793, "grad_norm": 2.515625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -17344876.8, "logits/rejected": -27046236.444444444, "logps/chosen": -395.25481770833335, "logps/rejected": -496.8429361979167, "loss": 0.0211, "rewards/chosen": 7.439136759440104, "rewards/margins": 17.800093587239584, "rewards/rejected": -10.360956827799479, "step": 3131 }, { "epoch": 0.858434973276689, "grad_norm": 11.8125, "kl": 5.991198539733887, "learning_rate": 5e-06, "logits/chosen": -5358748.266666667, "logits/rejected": -18914471.111111112, "logps/chosen": -427.2026692708333, "logps/rejected": -479.37217881944446, "loss": 0.0607, "rewards/chosen": 7.498921712239583, "rewards/margins": 21.06257544623481, "rewards/rejected": -13.563653733995226, "step": 3132 }, { "epoch": 0.8587090585171988, "grad_norm": 2.59375, "kl": 4.364678859710693, "learning_rate": 5e-06, "logits/chosen": -40329600.0, "logits/rejected": -21991856.0, "logps/chosen": -365.4130533854167, "logps/rejected": -441.19737413194446, "loss": 0.0108, "rewards/chosen": 7.031657409667969, "rewards/margins": 16.92845204671224, "rewards/rejected": -9.896794637044271, "step": 3133 }, { "epoch": 0.8589831437577087, "grad_norm": 1.8515625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -20843342.545454547, "logits/rejected": -23208066.46153846, "logps/chosen": -380.87198153409093, "logps/rejected": -650.9710787259615, "loss": 0.0274, "rewards/chosen": 5.7303855202414775, "rewards/margins": 23.047289441515517, "rewards/rejected": -17.31690392127404, "step": 3134 }, { "epoch": 0.8592572289982184, "grad_norm": 4.5, "kl": 0.40050697326660156, "learning_rate": 5e-06, "logits/chosen": -11710947.636363637, "logits/rejected": -26385179.076923076, "logps/chosen": -401.8724254261364, "logps/rejected": -613.0374098557693, "loss": 0.0178, "rewards/chosen": 7.047265486283735, "rewards/margins": 19.373419754988664, "rewards/rejected": -12.326154268704927, "step": 3135 }, { "epoch": 0.8595313142387282, "grad_norm": 6.5625, "kl": 4.676017761230469, "learning_rate": 5e-06, "logits/chosen": -53479909.333333336, "logits/rejected": -32006666.666666668, "logps/chosen": -368.783203125, "logps/rejected": -477.302001953125, "loss": 0.0476, "rewards/chosen": 5.708573659261067, "rewards/margins": 15.540316263834637, "rewards/rejected": -9.831742604573568, "step": 3136 }, { "epoch": 0.8598053994792381, "grad_norm": 3.65625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -3285187.0, "logits/rejected": -4861928.0, "logps/chosen": -310.980224609375, "logps/rejected": -534.5877685546875, "loss": 0.0195, "rewards/chosen": 4.834011554718018, "rewards/margins": 19.681233882904053, "rewards/rejected": -14.847222328186035, "step": 3137 }, { "epoch": 0.8600794847197478, "grad_norm": 5.21875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -34349520.0, "logits/rejected": -9773989.714285715, "logps/chosen": -432.69306640625, "logps/rejected": -590.80224609375, "loss": 0.033, "rewards/chosen": 7.052513122558594, "rewards/margins": 19.983027866908483, "rewards/rejected": -12.930514744349889, "step": 3138 }, { "epoch": 0.8603535699602577, "grad_norm": 12.25, "kl": 3.0050430297851562, "learning_rate": 5e-06, "logits/chosen": -3328078.8, "logits/rejected": -21417389.714285713, "logps/chosen": -382.9885986328125, "logps/rejected": -567.7673688616071, "loss": 0.0672, "rewards/chosen": 5.710154342651367, "rewards/margins": 17.920481491088868, "rewards/rejected": -12.2103271484375, "step": 3139 }, { "epoch": 0.8606276552007674, "grad_norm": 4.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -11128416.8, "logits/rejected": -18663104.0, "logps/chosen": -393.219921875, "logps/rejected": -507.1406947544643, "loss": 0.0125, "rewards/chosen": 6.861688232421875, "rewards/margins": 17.339510672433036, "rewards/rejected": -10.477822440011161, "step": 3140 }, { "epoch": 0.8609017404412772, "grad_norm": 2.09375, "kl": 1.8469715118408203, "learning_rate": 5e-06, "logits/chosen": -21285130.666666668, "logits/rejected": -27386090.666666668, "logps/chosen": -413.5290120442708, "logps/rejected": -550.4463297526041, "loss": 0.0049, "rewards/chosen": 8.179845174153646, "rewards/margins": 18.75683911641439, "rewards/rejected": -10.576993942260742, "step": 3141 }, { "epoch": 0.8611758256817871, "grad_norm": 264.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 9353497.454545455, "logits/rejected": 40545563.07692308, "logps/chosen": -455.34641335227275, "logps/rejected": -523.5731670673077, "loss": 0.069, "rewards/chosen": 5.82231972434304, "rewards/margins": 15.326999477573207, "rewards/rejected": -9.504679753230167, "step": 3142 }, { "epoch": 0.8614499109222968, "grad_norm": 4.90625, "kl": 0.9593290090560913, "learning_rate": 5e-06, "logits/chosen": -29047720.0, "logits/rejected": -35620933.333333336, "logps/chosen": -371.440673828125, "logps/rejected": -696.8387044270834, "loss": 0.0378, "rewards/chosen": 6.4290110270182295, "rewards/margins": 22.137985229492188, "rewards/rejected": -15.708974202473959, "step": 3143 }, { "epoch": 0.8617239961628066, "grad_norm": 9.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -22543485.09090909, "logits/rejected": -15338166.153846154, "logps/chosen": -404.56471946022725, "logps/rejected": -385.48715444711536, "loss": 0.0697, "rewards/chosen": 7.255329478870738, "rewards/margins": 17.02615009654652, "rewards/rejected": -9.770820617675781, "step": 3144 }, { "epoch": 0.8619980814033165, "grad_norm": 7.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -29479480.0, "logits/rejected": -48133024.0, "logps/chosen": -336.5453694661458, "logps/rejected": -482.4503987630208, "loss": 0.0364, "rewards/chosen": 7.0514882405598955, "rewards/margins": 19.139918009440105, "rewards/rejected": -12.088429768880209, "step": 3145 }, { "epoch": 0.8622721666438262, "grad_norm": 13.8125, "kl": 13.376441955566406, "learning_rate": 5e-06, "logits/chosen": -24181102.222222224, "logits/rejected": -29495477.333333332, "logps/chosen": -410.2206759982639, "logps/rejected": -654.2296549479166, "loss": 0.0888, "rewards/chosen": 7.27203369140625, "rewards/margins": 21.371376037597656, "rewards/rejected": -14.099342346191406, "step": 3146 }, { "epoch": 0.862546251884336, "grad_norm": 1.859375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -27039099.076923076, "logits/rejected": -27239700.363636363, "logps/chosen": -354.43502103365387, "logps/rejected": -500.36221590909093, "loss": 0.0073, "rewards/chosen": 8.188859205979567, "rewards/margins": 20.60155396361451, "rewards/rejected": -12.412694757634943, "step": 3147 }, { "epoch": 0.8628203371248458, "grad_norm": 1.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -11601456.0, "logits/rejected": -45011276.8, "logps/chosen": -423.14620535714283, "logps/rejected": -445.6072265625, "loss": 0.0037, "rewards/chosen": 7.1498516627720425, "rewards/margins": 21.467057473318917, "rewards/rejected": -14.317205810546875, "step": 3148 }, { "epoch": 0.8630944223653556, "grad_norm": 4.03125, "kl": 8.011747360229492, "learning_rate": 5e-06, "logits/chosen": -18842678.666666668, "logits/rejected": -22327632.0, "logps/chosen": -486.8097737630208, "logps/rejected": -612.5039876302084, "loss": 0.0124, "rewards/chosen": 7.342199325561523, "rewards/margins": 20.260427474975586, "rewards/rejected": -12.918228149414062, "step": 3149 }, { "epoch": 0.8633685076058655, "grad_norm": 11.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -2526579.1666666665, "logits/rejected": -31273565.333333332, "logps/chosen": -294.6822916666667, "logps/rejected": -377.3094889322917, "loss": 0.0311, "rewards/chosen": 6.320415496826172, "rewards/margins": 17.749192555745445, "rewards/rejected": -11.428777058919271, "step": 3150 }, { "epoch": 0.8636425928463752, "grad_norm": 7.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 13724873.6, "logits/rejected": -26219917.714285713, "logps/chosen": -534.261181640625, "logps/rejected": -588.6323939732143, "loss": 0.0257, "rewards/chosen": 7.842802429199219, "rewards/margins": 20.896086120605467, "rewards/rejected": -13.05328369140625, "step": 3151 }, { "epoch": 0.863916678086885, "grad_norm": 5.46875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -5978164.0, "logits/rejected": 1035520.8571428572, "logps/chosen": -451.163232421875, "logps/rejected": -642.4524274553571, "loss": 0.0269, "rewards/chosen": 6.187412261962891, "rewards/margins": 19.571300506591797, "rewards/rejected": -13.383888244628906, "step": 3152 }, { "epoch": 0.8641907633273949, "grad_norm": 1.234375, "kl": 2.1598548889160156, "learning_rate": 5e-06, "logits/chosen": -19121344.0, "logits/rejected": -14950080.0, "logps/chosen": -528.1487630208334, "logps/rejected": -569.073828125, "loss": 0.0014, "rewards/chosen": 9.10966067843967, "rewards/margins": 20.011901685926652, "rewards/rejected": -10.90224100748698, "step": 3153 }, { "epoch": 0.8644648485679046, "grad_norm": 0.98046875, "kl": 2.2044882774353027, "learning_rate": 5e-06, "logits/chosen": -2017847.6363636365, "logits/rejected": -9556891.692307692, "logps/chosen": -588.2023703835227, "logps/rejected": -497.62338491586536, "loss": 0.0021, "rewards/chosen": 9.562713623046875, "rewards/margins": 20.792724609375, "rewards/rejected": -11.230010986328125, "step": 3154 }, { "epoch": 0.8647389338084144, "grad_norm": 8.0, "kl": 8.13802433013916, "learning_rate": 5e-06, "logits/chosen": -31911470.933333334, "logits/rejected": -12321317.333333334, "logps/chosen": -455.1175130208333, "logps/rejected": -451.46356879340277, "loss": 0.0882, "rewards/chosen": 7.306487019856771, "rewards/margins": 17.146664767795137, "rewards/rejected": -9.840177747938368, "step": 3155 }, { "epoch": 0.8650130190489242, "grad_norm": 4.15625, "kl": 5.618684768676758, "learning_rate": 5e-06, "logits/chosen": -28233737.14285714, "logits/rejected": -35252032.0, "logps/chosen": -479.644287109375, "logps/rejected": -717.5913947610294, "loss": 0.0046, "rewards/chosen": 6.794695172991071, "rewards/margins": 22.61132761209953, "rewards/rejected": -15.816632439108457, "step": 3156 }, { "epoch": 0.865287104289434, "grad_norm": 7.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -32626595.555555556, "logits/rejected": -31450329.6, "logps/chosen": -502.06260850694446, "logps/rejected": -565.3595052083333, "loss": 0.0156, "rewards/chosen": 6.787715488009983, "rewards/margins": 21.041843499077693, "rewards/rejected": -14.254128011067708, "step": 3157 }, { "epoch": 0.8655611895299438, "grad_norm": 1.28125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -2610914.6666666665, "logits/rejected": -17239912.0, "logps/chosen": -490.1195475260417, "logps/rejected": -411.420166015625, "loss": 0.0066, "rewards/chosen": 7.619864781697591, "rewards/margins": 17.44199816385905, "rewards/rejected": -9.822133382161459, "step": 3158 }, { "epoch": 0.8658352747704536, "grad_norm": 2.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -12350104.0, "logits/rejected": -21414886.4, "logps/chosen": -374.94398716517856, "logps/rejected": -489.4158203125, "loss": 0.0075, "rewards/chosen": 6.6721698216029575, "rewards/margins": 17.02626528058733, "rewards/rejected": -10.354095458984375, "step": 3159 }, { "epoch": 0.8661093600109634, "grad_norm": 0.365234375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -24800971.636363637, "logits/rejected": 50321196.307692304, "logps/chosen": -525.8690962357955, "logps/rejected": -552.6642503004807, "loss": 0.0009, "rewards/chosen": 8.719866665926846, "rewards/margins": 27.479118187110743, "rewards/rejected": -18.759251521183895, "step": 3160 }, { "epoch": 0.8663834452514733, "grad_norm": 6.59375, "kl": 2.8736283779144287, "learning_rate": 5e-06, "logits/chosen": -20234342.85714286, "logits/rejected": -42306089.6, "logps/chosen": -381.1083984375, "logps/rejected": -459.518359375, "loss": 0.0447, "rewards/chosen": 6.819292340959821, "rewards/margins": 16.97599857875279, "rewards/rejected": -10.156706237792969, "step": 3161 }, { "epoch": 0.866657530491983, "grad_norm": 0.12890625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -30308956.444444444, "logits/rejected": -26554692.266666666, "logps/chosen": -478.20209418402777, "logps/rejected": -646.739453125, "loss": 0.0004, "rewards/chosen": 8.489171346028646, "rewards/margins": 25.27686258951823, "rewards/rejected": -16.787691243489583, "step": 3162 }, { "epoch": 0.8669316157324928, "grad_norm": 1.296875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 2543782.0, "logits/rejected": -24856786.666666668, "logps/chosen": -398.6488850911458, "logps/rejected": -547.6736246744791, "loss": 0.0032, "rewards/chosen": 8.70384152730306, "rewards/margins": 22.223095575968426, "rewards/rejected": -13.519254048665365, "step": 3163 }, { "epoch": 0.8672057009730026, "grad_norm": 7.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -40067842.666666664, "logits/rejected": -12917016.0, "logps/chosen": -463.72119140625, "logps/rejected": -512.9991048177084, "loss": 0.0367, "rewards/chosen": 7.2934926350911455, "rewards/margins": 18.32274881998698, "rewards/rejected": -11.029256184895834, "step": 3164 }, { "epoch": 0.8674797862135124, "grad_norm": 0.1884765625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -2742445.4545454546, "logits/rejected": -21949577.846153848, "logps/chosen": -512.3514737215909, "logps/rejected": -537.9208233173077, "loss": 0.0005, "rewards/chosen": 9.0091552734375, "rewards/margins": 22.31741450383113, "rewards/rejected": -13.30825923039363, "step": 3165 }, { "epoch": 0.8677538714540222, "grad_norm": 9.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -40158514.666666664, "logits/rejected": -22200618.666666668, "logps/chosen": -466.6464029947917, "logps/rejected": -568.6923014322916, "loss": 0.0423, "rewards/chosen": 6.0076249440511065, "rewards/margins": 18.656140009562176, "rewards/rejected": -12.648515065511068, "step": 3166 }, { "epoch": 0.868027956694532, "grad_norm": 1.609375, "kl": 3.186666488647461, "learning_rate": 5e-06, "logits/chosen": -12971429.818181818, "logits/rejected": -30648691.692307692, "logps/chosen": -467.3536931818182, "logps/rejected": -561.1949368990385, "loss": 0.0234, "rewards/chosen": 7.492822126908735, "rewards/margins": 21.850535092653928, "rewards/rejected": -14.357712965745192, "step": 3167 }, { "epoch": 0.8683020419350418, "grad_norm": 2.5625, "kl": 13.889026641845703, "learning_rate": 5e-06, "logits/chosen": -25562002.285714287, "logits/rejected": -34079545.6, "logps/chosen": -423.90523856026783, "logps/rejected": -458.02958984375, "loss": 0.0496, "rewards/chosen": 8.474485124860491, "rewards/margins": 17.180428423200333, "rewards/rejected": -8.705943298339843, "step": 3168 }, { "epoch": 0.8685761271755515, "grad_norm": 1.9296875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -41866474.666666664, "logits/rejected": -21591404.8, "logps/chosen": -429.4970703125, "logps/rejected": -664.66328125, "loss": 0.0028, "rewards/chosen": 6.967369503445095, "rewards/margins": 21.711725785997178, "rewards/rejected": -14.744356282552083, "step": 3169 }, { "epoch": 0.8688502124160614, "grad_norm": 16.0, "kl": 4.597938537597656, "learning_rate": 5e-06, "logits/chosen": -19482749.714285713, "logits/rejected": -2251122.4, "logps/chosen": -375.26492745535717, "logps/rejected": -405.34345703125, "loss": 0.052, "rewards/chosen": 7.396936144147601, "rewards/margins": 16.04223872593471, "rewards/rejected": -8.64530258178711, "step": 3170 }, { "epoch": 0.8691242976565712, "grad_norm": 4.46875, "kl": 2.957514762878418, "learning_rate": 5e-06, "logits/chosen": -10901085.090909092, "logits/rejected": -18062921.846153848, "logps/chosen": -547.2792080965909, "logps/rejected": -545.4225135216346, "loss": 0.0113, "rewards/chosen": 8.1883413141424, "rewards/margins": 22.424939402333507, "rewards/rejected": -14.236598088191105, "step": 3171 }, { "epoch": 0.869398382897081, "grad_norm": 10.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -11413944.727272727, "logits/rejected": -36984196.92307692, "logps/chosen": -365.8895818536932, "logps/rejected": -691.6820162259615, "loss": 0.0363, "rewards/chosen": 5.012128656560725, "rewards/margins": 21.562402845262646, "rewards/rejected": -16.550274188701923, "step": 3172 }, { "epoch": 0.8696724681375908, "grad_norm": 7.375, "kl": 1.524082899093628, "learning_rate": 5e-06, "logits/chosen": -21939219.692307692, "logits/rejected": 74324340.36363636, "logps/chosen": -453.94643930288464, "logps/rejected": -539.7320223721591, "loss": 0.0118, "rewards/chosen": 6.656046940730168, "rewards/margins": 22.00577422455474, "rewards/rejected": -15.349727283824574, "step": 3173 }, { "epoch": 0.8699465533781006, "grad_norm": 6.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -28307996.444444444, "logits/rejected": -15961083.733333332, "logps/chosen": -320.28314887152777, "logps/rejected": -350.16432291666666, "loss": 0.0409, "rewards/chosen": 6.581891377766927, "rewards/margins": 15.767271423339842, "rewards/rejected": -9.185380045572916, "step": 3174 }, { "epoch": 0.8702206386186104, "grad_norm": 2.234375, "kl": 2.7071433067321777, "learning_rate": 5e-06, "logits/chosen": -2268608.8, "logits/rejected": -22806496.0, "logps/chosen": -500.1794921875, "logps/rejected": -609.8454938616071, "loss": 0.006, "rewards/chosen": 8.26336441040039, "rewards/margins": 20.2001341683524, "rewards/rejected": -11.936769757952009, "step": 3175 }, { "epoch": 0.8704947238591202, "grad_norm": 3.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -35068666.18181818, "logits/rejected": 3151965.5384615385, "logps/chosen": -411.20729758522725, "logps/rejected": -946.6681189903846, "loss": 0.0084, "rewards/chosen": 6.540205522017046, "rewards/margins": 35.30138669314084, "rewards/rejected": -28.7611811711238, "step": 3176 }, { "epoch": 0.87076880909963, "grad_norm": 6.34375, "kl": 10.862166404724121, "learning_rate": 5e-06, "logits/chosen": -32126172.8, "logits/rejected": -29852694.85714286, "logps/chosen": -441.67060546875, "logps/rejected": -556.6501116071429, "loss": 0.0224, "rewards/chosen": 8.136669158935547, "rewards/margins": 19.313468388148717, "rewards/rejected": -11.17679922921317, "step": 3177 }, { "epoch": 0.8710428943401398, "grad_norm": 5.25, "kl": 1.9175708293914795, "learning_rate": 5e-06, "logits/chosen": -6142199.2727272725, "logits/rejected": -12446422.153846154, "logps/chosen": -448.5359552556818, "logps/rejected": -595.0501802884615, "loss": 0.0127, "rewards/chosen": 8.561208551580256, "rewards/margins": 22.400331617235302, "rewards/rejected": -13.839123065655048, "step": 3178 }, { "epoch": 0.8713169795806496, "grad_norm": 1.390625, "kl": 3.4038748741149902, "learning_rate": 5e-06, "logits/chosen": -22435481.14285714, "logits/rejected": -26169715.2, "logps/chosen": -377.26834542410717, "logps/rejected": -438.667333984375, "loss": 0.0036, "rewards/chosen": 7.159152439662388, "rewards/margins": 18.22049124581473, "rewards/rejected": -11.061338806152344, "step": 3179 }, { "epoch": 0.8715910648211593, "grad_norm": 3.703125, "kl": 3.242008924484253, "learning_rate": 5e-06, "logits/chosen": -12516497.454545455, "logits/rejected": -61540637.538461536, "logps/chosen": -434.6834161931818, "logps/rejected": -462.1008864182692, "loss": 0.0453, "rewards/chosen": 7.40624375776811, "rewards/margins": 18.00394343662929, "rewards/rejected": -10.597699678861177, "step": 3180 }, { "epoch": 0.8718651500616692, "grad_norm": 4.5, "kl": 8.882064819335938, "learning_rate": 5e-06, "logits/chosen": -13067894.4, "logits/rejected": -25783921.777777776, "logps/chosen": -455.0823567708333, "logps/rejected": -250.95046657986111, "loss": 0.0377, "rewards/chosen": 7.6158192952473955, "rewards/margins": 15.685543484157986, "rewards/rejected": -8.069724188910591, "step": 3181 }, { "epoch": 0.872139235302179, "grad_norm": 3.890625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -41419724.8, "logits/rejected": 13013285.333333334, "logps/chosen": -486.154296875, "logps/rejected": -601.7275390625, "loss": 0.0071, "rewards/chosen": 8.19795633951823, "rewards/margins": 22.7966552734375, "rewards/rejected": -14.598698933919271, "step": 3182 }, { "epoch": 0.8724133205426888, "grad_norm": 5.96875, "kl": 0.6556282043457031, "learning_rate": 5e-06, "logits/chosen": 20031069.714285713, "logits/rejected": -29130156.8, "logps/chosen": -560.0069056919643, "logps/rejected": -577.076708984375, "loss": 0.0317, "rewards/chosen": 7.380458286830357, "rewards/margins": 20.29951651436942, "rewards/rejected": -12.919058227539063, "step": 3183 }, { "epoch": 0.8726874057831986, "grad_norm": 4.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 9670863.333333334, "logits/rejected": -28612272.0, "logps/chosen": -528.5943196614584, "logps/rejected": -542.1112060546875, "loss": 0.008, "rewards/chosen": 8.631248474121094, "rewards/margins": 21.382328033447266, "rewards/rejected": -12.751079559326172, "step": 3184 }, { "epoch": 0.8729614910237083, "grad_norm": 9.9375, "kl": 6.341155529022217, "learning_rate": 5e-06, "logits/chosen": -9361827.692307692, "logits/rejected": -18944023.272727273, "logps/chosen": -482.5969426081731, "logps/rejected": -533.03125, "loss": 0.0446, "rewards/chosen": 8.968907282902645, "rewards/margins": 21.83179180438702, "rewards/rejected": -12.862884521484375, "step": 3185 }, { "epoch": 0.8732355762642182, "grad_norm": 4.625, "kl": 4.06303596496582, "learning_rate": 5e-06, "logits/chosen": -6157614.285714285, "logits/rejected": -46830313.6, "logps/chosen": -453.58517020089283, "logps/rejected": -619.509423828125, "loss": 0.0135, "rewards/chosen": 7.035718645368304, "rewards/margins": 18.25872355869838, "rewards/rejected": -11.223004913330078, "step": 3186 }, { "epoch": 0.873509661504728, "grad_norm": 7.15625, "kl": 6.628115653991699, "learning_rate": 5e-06, "logits/chosen": -23120403.692307692, "logits/rejected": -44661445.81818182, "logps/chosen": -400.7258488581731, "logps/rejected": -638.9381214488636, "loss": 0.0432, "rewards/chosen": 7.011467566856971, "rewards/margins": 20.86899764054305, "rewards/rejected": -13.85753007368608, "step": 3187 }, { "epoch": 0.8737837467452377, "grad_norm": 1.125, "kl": 8.186946868896484, "learning_rate": 5e-06, "logits/chosen": -8122930.666666667, "logits/rejected": -13871014.222222222, "logps/chosen": -527.01591796875, "logps/rejected": -518.6095920138889, "loss": 0.0023, "rewards/chosen": 9.176224772135416, "rewards/margins": 21.53785383436415, "rewards/rejected": -12.361629062228733, "step": 3188 }, { "epoch": 0.8740578319857476, "grad_norm": 4.0, "kl": 3.9878592491149902, "learning_rate": 5e-06, "logits/chosen": -13739684.923076924, "logits/rejected": 119918801.45454545, "logps/chosen": -428.24891075721155, "logps/rejected": -491.4608043323864, "loss": 0.0091, "rewards/chosen": 8.010841369628906, "rewards/margins": 19.44976529208097, "rewards/rejected": -11.43892392245206, "step": 3189 }, { "epoch": 0.8743319172262574, "grad_norm": 5.4375, "kl": 7.476861476898193, "learning_rate": 5e-06, "logits/chosen": -6346473.846153846, "logits/rejected": -31474807.272727273, "logps/chosen": -389.1432542067308, "logps/rejected": -508.3499200994318, "loss": 0.0213, "rewards/chosen": 6.219122666579026, "rewards/margins": 18.622756477836127, "rewards/rejected": -12.403633811257102, "step": 3190 }, { "epoch": 0.8746060024667671, "grad_norm": 0.71484375, "kl": 1.5904897451400757, "learning_rate": 5e-06, "logits/chosen": -23340829.09090909, "logits/rejected": -22821267.692307692, "logps/chosen": -484.4563654119318, "logps/rejected": -495.09900841346155, "loss": 0.0023, "rewards/chosen": 7.607752713290128, "rewards/margins": 19.705249332881476, "rewards/rejected": -12.097496619591347, "step": 3191 }, { "epoch": 0.874880087707277, "grad_norm": 8.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -9243687.2, "logits/rejected": -17096585.14285714, "logps/chosen": -362.3077392578125, "logps/rejected": -518.8822544642857, "loss": 0.0325, "rewards/chosen": 5.513520431518555, "rewards/margins": 16.945949063982283, "rewards/rejected": -11.432428632463727, "step": 3192 }, { "epoch": 0.8751541729477867, "grad_norm": 11.375, "kl": 7.28342342376709, "learning_rate": 5e-06, "logits/chosen": -26351126.85714286, "logits/rejected": -12848106.4, "logps/chosen": -390.53700474330356, "logps/rejected": -574.09052734375, "loss": 0.0798, "rewards/chosen": 5.827968052455357, "rewards/margins": 18.62753622872489, "rewards/rejected": -12.799568176269531, "step": 3193 }, { "epoch": 0.8754282581882966, "grad_norm": 6.0, "kl": 2.7055869102478027, "learning_rate": 5e-06, "logits/chosen": -52271882.666666664, "logits/rejected": -30079477.333333332, "logps/chosen": -568.8559163411459, "logps/rejected": -612.9208984375, "loss": 0.0092, "rewards/chosen": 8.5652707417806, "rewards/margins": 22.476057688395183, "rewards/rejected": -13.910786946614584, "step": 3194 }, { "epoch": 0.8757023434288064, "grad_norm": 3.078125, "kl": 8.265227317810059, "learning_rate": 5e-06, "logits/chosen": -26491910.85714286, "logits/rejected": -20166566.4, "logps/chosen": -584.2923409598214, "logps/rejected": -357.1662353515625, "loss": 0.0068, "rewards/chosen": 9.743141174316406, "rewards/margins": 19.483753967285157, "rewards/rejected": -9.74061279296875, "step": 3195 }, { "epoch": 0.8759764286693161, "grad_norm": 1.453125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -16087057.777777778, "logits/rejected": -27972661.333333332, "logps/chosen": -437.3299153645833, "logps/rejected": -471.80882161458334, "loss": 0.0048, "rewards/chosen": 6.970879448784722, "rewards/margins": 17.703941175672743, "rewards/rejected": -10.733061726888021, "step": 3196 }, { "epoch": 0.876250513909826, "grad_norm": 1.2578125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -29828538.666666668, "logits/rejected": -13397709.333333334, "logps/chosen": -440.6895751953125, "logps/rejected": -400.9492594401042, "loss": 0.0036, "rewards/chosen": 9.086783727010092, "rewards/margins": 17.91275405883789, "rewards/rejected": -8.825970331827799, "step": 3197 }, { "epoch": 0.8765245991503358, "grad_norm": 4.65625, "kl": 8.31484603881836, "learning_rate": 5e-06, "logits/chosen": -12443510.588235294, "logits/rejected": -20745645.714285713, "logps/chosen": -407.15412454044116, "logps/rejected": -543.5262974330357, "loss": 0.0208, "rewards/chosen": 7.1947008020737595, "rewards/margins": 20.58871979272666, "rewards/rejected": -13.394018990652901, "step": 3198 }, { "epoch": 0.8767986843908455, "grad_norm": 2.34375, "kl": 4.569572925567627, "learning_rate": 5e-06, "logits/chosen": -30346573.714285713, "logits/rejected": -23451796.8, "logps/chosen": -439.2001953125, "logps/rejected": -646.142578125, "loss": 0.0297, "rewards/chosen": 6.5768230983189175, "rewards/margins": 22.789808327811105, "rewards/rejected": -16.212985229492187, "step": 3199 }, { "epoch": 0.8770727696313554, "grad_norm": 2.03125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -26654263.272727273, "logits/rejected": -24711170.46153846, "logps/chosen": -403.2916370738636, "logps/rejected": -620.2149188701923, "loss": 0.0142, "rewards/chosen": 8.153409784490412, "rewards/margins": 21.578315414748825, "rewards/rejected": -13.424905630258413, "step": 3200 }, { "epoch": 0.8773468548718651, "grad_norm": 1.1328125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -28721797.333333332, "logits/rejected": -14689474.666666666, "logps/chosen": -345.1105143229167, "logps/rejected": -459.375, "loss": 0.0035, "rewards/chosen": 7.337057749430339, "rewards/margins": 18.85095469156901, "rewards/rejected": -11.513896942138672, "step": 3201 }, { "epoch": 0.8776209401123749, "grad_norm": 1.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -50239336.72727273, "logits/rejected": 36964792.615384616, "logps/chosen": -460.20747514204544, "logps/rejected": -899.0356820913462, "loss": 0.0048, "rewards/chosen": 6.700413097034801, "rewards/margins": 48.349512593729514, "rewards/rejected": -41.64909949669471, "step": 3202 }, { "epoch": 0.8778950253528848, "grad_norm": 2.984375, "kl": 3.262734889984131, "learning_rate": 5e-06, "logits/chosen": -15846592.0, "logits/rejected": -17097664.0, "logps/chosen": -426.89122817095586, "logps/rejected": -599.8334263392857, "loss": 0.0227, "rewards/chosen": 6.816902609432445, "rewards/margins": 19.739344364454766, "rewards/rejected": -12.922441755022321, "step": 3203 }, { "epoch": 0.8781691105933945, "grad_norm": 12.0625, "kl": 0.13463720679283142, "learning_rate": 5e-06, "logits/chosen": -28322197.333333332, "logits/rejected": -23185122.666666668, "logps/chosen": -378.1623942057292, "logps/rejected": -587.2187906901041, "loss": 0.0629, "rewards/chosen": 5.779300053914388, "rewards/margins": 18.047971725463867, "rewards/rejected": -12.268671671549479, "step": 3204 }, { "epoch": 0.8784431958339043, "grad_norm": 3.984375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -802321.8571428572, "logits/rejected": -38076953.6, "logps/chosen": -547.8897530691964, "logps/rejected": -682.506494140625, "loss": 0.0129, "rewards/chosen": 8.273824964250837, "rewards/margins": 25.25192555018834, "rewards/rejected": -16.9781005859375, "step": 3205 }, { "epoch": 0.8787172810744142, "grad_norm": 1.4609375, "kl": 4.436384201049805, "learning_rate": 5e-06, "logits/chosen": -15672320.0, "logits/rejected": -30298444.8, "logps/chosen": -434.1386021205357, "logps/rejected": -604.421630859375, "loss": 0.0054, "rewards/chosen": 7.252002716064453, "rewards/margins": 22.34420394897461, "rewards/rejected": -15.092201232910156, "step": 3206 }, { "epoch": 0.8789913663149239, "grad_norm": 6.71875, "kl": 4.989443778991699, "learning_rate": 5e-06, "logits/chosen": -18852660.923076924, "logits/rejected": -15739104.0, "logps/chosen": -433.7877854567308, "logps/rejected": -456.8053089488636, "loss": 0.0308, "rewards/chosen": 6.97680898813101, "rewards/margins": 18.98552591817362, "rewards/rejected": -12.008716930042613, "step": 3207 }, { "epoch": 0.8792654515554338, "grad_norm": 5.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -1226917.5555555555, "logits/rejected": -7297043.2, "logps/chosen": -488.3860677083333, "logps/rejected": -448.04388020833335, "loss": 0.0039, "rewards/chosen": 8.799327426486546, "rewards/margins": 21.262978956434463, "rewards/rejected": -12.463651529947917, "step": 3208 }, { "epoch": 0.8795395367959435, "grad_norm": 9.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -30506557.09090909, "logits/rejected": -15961010.461538462, "logps/chosen": -366.50297407670456, "logps/rejected": -551.4542893629807, "loss": 0.0574, "rewards/chosen": 5.440392927689985, "rewards/margins": 18.570702986283735, "rewards/rejected": -13.13031005859375, "step": 3209 }, { "epoch": 0.8798136220364533, "grad_norm": 14.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -14442862.76923077, "logits/rejected": -48436104.72727273, "logps/chosen": -360.1254131610577, "logps/rejected": -495.4343927556818, "loss": 0.0271, "rewards/chosen": 7.128231928898738, "rewards/margins": 16.713983549104704, "rewards/rejected": -9.585751620205967, "step": 3210 }, { "epoch": 0.8800877072769632, "grad_norm": 1.3828125, "kl": 6.178009033203125, "learning_rate": 5e-06, "logits/chosen": -18093730.0, "logits/rejected": -16623450.0, "logps/chosen": -447.10833740234375, "logps/rejected": -620.852783203125, "loss": 0.0024, "rewards/chosen": 8.873225212097168, "rewards/margins": 25.081332206726074, "rewards/rejected": -16.208106994628906, "step": 3211 }, { "epoch": 0.8803617925174729, "grad_norm": 2.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -27699138.46153846, "logits/rejected": -17430839.272727273, "logps/chosen": -345.91162109375, "logps/rejected": -768.4699928977273, "loss": 0.0078, "rewards/chosen": 6.175666222205529, "rewards/margins": 26.15525273676519, "rewards/rejected": -19.97958651455966, "step": 3212 }, { "epoch": 0.8806358777579827, "grad_norm": 1.8125, "kl": 8.027090072631836, "learning_rate": 5e-06, "logits/chosen": -5221369.066666666, "logits/rejected": -27682414.222222224, "logps/chosen": -522.93095703125, "logps/rejected": -467.26453993055554, "loss": 0.0437, "rewards/chosen": 8.538765462239583, "rewards/margins": 21.218963283962673, "rewards/rejected": -12.680197821723091, "step": 3213 }, { "epoch": 0.8809099629984926, "grad_norm": 4.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -21917790.0, "logits/rejected": -26878952.0, "logps/chosen": -356.3764343261719, "logps/rejected": -660.5711669921875, "loss": 0.0447, "rewards/chosen": 6.173887729644775, "rewards/margins": 21.972740650177002, "rewards/rejected": -15.798852920532227, "step": 3214 }, { "epoch": 0.8811840482390023, "grad_norm": 13.1875, "kl": 16.149646759033203, "learning_rate": 5e-06, "logits/chosen": -33576756.705882356, "logits/rejected": -15480754.285714285, "logps/chosen": -417.81985294117646, "logps/rejected": -638.1234654017857, "loss": 0.1324, "rewards/chosen": 6.479208553538603, "rewards/margins": 20.503474387801994, "rewards/rejected": -14.024265834263392, "step": 3215 }, { "epoch": 0.8814581334795121, "grad_norm": 3.609375, "kl": 0.6089484095573425, "learning_rate": 5e-06, "logits/chosen": -27170872.0, "logits/rejected": -27791882.666666668, "logps/chosen": -350.38427734375, "logps/rejected": -671.4813639322916, "loss": 0.0695, "rewards/chosen": 5.564538319905599, "rewards/margins": 18.314829508463543, "rewards/rejected": -12.750291188557943, "step": 3216 }, { "epoch": 0.8817322187200219, "grad_norm": 2.359375, "kl": 3.077131986618042, "learning_rate": 5e-06, "logits/chosen": -18529837.09090909, "logits/rejected": -8565084.307692308, "logps/chosen": -408.54585404829544, "logps/rejected": -503.41000600961536, "loss": 0.0385, "rewards/chosen": 10.05973261052912, "rewards/margins": 21.268681772938976, "rewards/rejected": -11.208949162409855, "step": 3217 }, { "epoch": 0.8820063039605317, "grad_norm": 7.3125, "kl": 6.260440826416016, "learning_rate": 5e-06, "logits/chosen": -16344406.153846154, "logits/rejected": -28025332.363636363, "logps/chosen": -400.97513521634613, "logps/rejected": -714.2665127840909, "loss": 0.0634, "rewards/chosen": 7.417241610013521, "rewards/margins": 23.719856689026308, "rewards/rejected": -16.302615079012785, "step": 3218 }, { "epoch": 0.8822803892010416, "grad_norm": 0.63671875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -36898973.538461536, "logits/rejected": -27348221.09090909, "logps/chosen": -470.0754582331731, "logps/rejected": -513.9871271306819, "loss": 0.0015, "rewards/chosen": 9.114378122182993, "rewards/margins": 20.84045650242092, "rewards/rejected": -11.726078380237926, "step": 3219 }, { "epoch": 0.8825544744415513, "grad_norm": 2.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -36171960.0, "logits/rejected": -26659076.0, "logps/chosen": -399.140625, "logps/rejected": -500.1515197753906, "loss": 0.0053, "rewards/chosen": 7.320361614227295, "rewards/margins": 18.53947687149048, "rewards/rejected": -11.219115257263184, "step": 3220 }, { "epoch": 0.8828285596820611, "grad_norm": 7.46875, "kl": 1.345106840133667, "learning_rate": 5e-06, "logits/chosen": 1249959.5, "logits/rejected": -17193858.0, "logps/chosen": -359.1897888183594, "logps/rejected": -520.11767578125, "loss": 0.055, "rewards/chosen": 5.781913757324219, "rewards/margins": 12.256040096282959, "rewards/rejected": -6.47412633895874, "step": 3221 }, { "epoch": 0.883102644922571, "grad_norm": 6.4375, "kl": 2.6050708293914795, "learning_rate": 5e-06, "logits/chosen": -33832888.615384616, "logits/rejected": -17101722.181818184, "logps/chosen": -347.9285231370192, "logps/rejected": -550.2399236505681, "loss": 0.0244, "rewards/chosen": 6.440057020920974, "rewards/margins": 18.275156674685178, "rewards/rejected": -11.835099653764205, "step": 3222 }, { "epoch": 0.8833767301630807, "grad_norm": 4.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -20163361.230769232, "logits/rejected": -42422289.45454545, "logps/chosen": -352.62391075721155, "logps/rejected": -617.9169921875, "loss": 0.0542, "rewards/chosen": 6.9231438269981975, "rewards/margins": 21.949707991593367, "rewards/rejected": -15.02656416459517, "step": 3223 }, { "epoch": 0.8836508154035905, "grad_norm": 2.34375, "kl": 11.116971969604492, "learning_rate": 5e-06, "logits/chosen": -36479862.4, "logits/rejected": -26194596.57142857, "logps/chosen": -522.82529296875, "logps/rejected": -624.6535993303571, "loss": 0.0555, "rewards/chosen": 8.883546447753906, "rewards/margins": 23.872751944405692, "rewards/rejected": -14.989205496651786, "step": 3224 }, { "epoch": 0.8839249006441003, "grad_norm": 13.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -69733632.0, "logits/rejected": -38340016.0, "logps/chosen": -488.1712890625, "logps/rejected": -525.1149553571429, "loss": 0.0709, "rewards/chosen": 7.881427764892578, "rewards/margins": 18.273502349853516, "rewards/rejected": -10.392074584960938, "step": 3225 }, { "epoch": 0.8841989858846101, "grad_norm": 11.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -18728761.14285714, "logits/rejected": -23744718.4, "logps/chosen": -347.81065150669644, "logps/rejected": -345.52724609375, "loss": 0.0579, "rewards/chosen": 5.644641876220703, "rewards/margins": 17.03212661743164, "rewards/rejected": -11.387484741210937, "step": 3226 }, { "epoch": 0.8844730711251199, "grad_norm": 3.15625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -29630821.818181816, "logits/rejected": -24755950.769230768, "logps/chosen": -342.84676846590907, "logps/rejected": -598.2684420072115, "loss": 0.0096, "rewards/chosen": 6.870990406383168, "rewards/margins": 18.125445972789418, "rewards/rejected": -11.25445556640625, "step": 3227 }, { "epoch": 0.8847471563656297, "grad_norm": 10.1875, "kl": 8.211261749267578, "learning_rate": 5e-06, "logits/chosen": -21798974.545454547, "logits/rejected": -24631296.0, "logps/chosen": -463.20481178977275, "logps/rejected": -521.91796875, "loss": 0.0397, "rewards/chosen": 8.655975341796875, "rewards/margins": 20.75291325495793, "rewards/rejected": -12.096937913161058, "step": 3228 }, { "epoch": 0.8850212416061395, "grad_norm": 7.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -41726968.88888889, "logits/rejected": -18974336.0, "logps/chosen": -413.1697048611111, "logps/rejected": -524.1396809895833, "loss": 0.0296, "rewards/chosen": 7.8819224039713545, "rewards/margins": 17.489337158203124, "rewards/rejected": -9.60741475423177, "step": 3229 }, { "epoch": 0.8852953268466494, "grad_norm": 5.875, "kl": 3.3855419158935547, "learning_rate": 5e-06, "logits/chosen": -41981248.0, "logits/rejected": -9268004.363636363, "logps/chosen": -401.7417743389423, "logps/rejected": -364.7477361505682, "loss": 0.0271, "rewards/chosen": 8.618561377892128, "rewards/margins": 18.96642223438183, "rewards/rejected": -10.347860856489701, "step": 3230 }, { "epoch": 0.8855694120871591, "grad_norm": 9.1875, "kl": 0.9567846059799194, "learning_rate": 5e-06, "logits/chosen": 1782868.8235294118, "logits/rejected": -47970006.85714286, "logps/chosen": -365.58191636029414, "logps/rejected": -603.8791155133929, "loss": 0.0687, "rewards/chosen": 6.316866257611443, "rewards/margins": 17.90764188365776, "rewards/rejected": -11.590775626046318, "step": 3231 }, { "epoch": 0.8858434973276689, "grad_norm": 4.40625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -20957425.454545453, "logits/rejected": -31004236.307692308, "logps/chosen": -400.94140625, "logps/rejected": -550.6796499399038, "loss": 0.0259, "rewards/chosen": 7.200594815340909, "rewards/margins": 20.607779122732737, "rewards/rejected": -13.407184307391827, "step": 3232 }, { "epoch": 0.8861175825681787, "grad_norm": 6.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -11620022.857142856, "logits/rejected": -18695305.411764707, "logps/chosen": -408.00394112723217, "logps/rejected": -482.07892922794116, "loss": 0.0351, "rewards/chosen": 7.047061375209263, "rewards/margins": 16.544236704081047, "rewards/rejected": -9.497175328871784, "step": 3233 }, { "epoch": 0.8863916678086885, "grad_norm": 2.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -35477460.0, "logits/rejected": -30092682.0, "logps/chosen": -442.61346435546875, "logps/rejected": -574.8838500976562, "loss": 0.0072, "rewards/chosen": 7.878837585449219, "rewards/margins": 18.752997398376465, "rewards/rejected": -10.874159812927246, "step": 3234 }, { "epoch": 0.8866657530491983, "grad_norm": 1.9296875, "kl": 3.6798102855682373, "learning_rate": 5e-06, "logits/chosen": -16043466.181818182, "logits/rejected": -22780214.153846152, "logps/chosen": -384.73659446022725, "logps/rejected": -549.3113356370193, "loss": 0.0036, "rewards/chosen": 8.59074679287997, "rewards/margins": 21.5423646940218, "rewards/rejected": -12.951617901141827, "step": 3235 }, { "epoch": 0.8869398382897081, "grad_norm": 10.625, "kl": 4.579585075378418, "learning_rate": 5e-06, "logits/chosen": -7307372.8, "logits/rejected": -40877660.44444445, "logps/chosen": -378.65205078125, "logps/rejected": -405.95350477430554, "loss": 0.1039, "rewards/chosen": 6.167076619466146, "rewards/margins": 15.764069112141927, "rewards/rejected": -9.596992492675781, "step": 3236 }, { "epoch": 0.8872139235302179, "grad_norm": 2.859375, "kl": 2.0041847229003906, "learning_rate": 5e-06, "logits/chosen": -34567738.666666664, "logits/rejected": -27282781.333333332, "logps/chosen": -377.6537679036458, "logps/rejected": -562.3733723958334, "loss": 0.0152, "rewards/chosen": 7.13112195332845, "rewards/margins": 19.26497968037923, "rewards/rejected": -12.133857727050781, "step": 3237 }, { "epoch": 0.8874880087707276, "grad_norm": 7.90625, "kl": 8.515624046325684, "learning_rate": 5e-06, "logits/chosen": -20098509.53846154, "logits/rejected": -39245637.81818182, "logps/chosen": -463.47055288461536, "logps/rejected": -641.7962979403409, "loss": 0.0603, "rewards/chosen": 7.36254648061899, "rewards/margins": 22.704405777937883, "rewards/rejected": -15.341859297318893, "step": 3238 }, { "epoch": 0.8877620940112375, "grad_norm": 2.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -26134370.46153846, "logits/rejected": -22497213.09090909, "logps/chosen": -355.5878155048077, "logps/rejected": -574.5882457386364, "loss": 0.0108, "rewards/chosen": 6.702585073617788, "rewards/margins": 18.84906251280458, "rewards/rejected": -12.14647743918679, "step": 3239 }, { "epoch": 0.8880361792517473, "grad_norm": 2.578125, "kl": 2.637073278427124, "learning_rate": 5e-06, "logits/chosen": -35110082.461538464, "logits/rejected": -24004939.636363637, "logps/chosen": -528.8465294471154, "logps/rejected": -447.39688387784093, "loss": 0.0063, "rewards/chosen": 9.799873938927284, "rewards/margins": 23.168158684577143, "rewards/rejected": -13.368284745649857, "step": 3240 }, { "epoch": 0.8883102644922571, "grad_norm": 15.4375, "kl": 6.596280574798584, "learning_rate": 5e-06, "logits/chosen": -9986984.666666666, "logits/rejected": 27031077.333333332, "logps/chosen": -392.3957112630208, "logps/rejected": -610.51611328125, "loss": 0.0307, "rewards/chosen": 7.412587483723958, "rewards/margins": 19.362889607747395, "rewards/rejected": -11.950302124023438, "step": 3241 }, { "epoch": 0.8885843497327669, "grad_norm": 3.03125, "kl": 3.1758735179901123, "learning_rate": 5e-06, "logits/chosen": -29956166.0, "logits/rejected": 19353276.0, "logps/chosen": -415.4676513671875, "logps/rejected": -559.314697265625, "loss": 0.0081, "rewards/chosen": 8.19990348815918, "rewards/margins": 23.30184555053711, "rewards/rejected": -15.10194206237793, "step": 3242 }, { "epoch": 0.8888584349732767, "grad_norm": 8.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 6949664.0, "logits/rejected": -21293011.692307692, "logps/chosen": -564.7020152698864, "logps/rejected": -666.8571965144231, "loss": 0.0241, "rewards/chosen": 6.512542031028054, "rewards/margins": 16.254537035535265, "rewards/rejected": -9.741995004507212, "step": 3243 }, { "epoch": 0.8891325202137865, "grad_norm": 0.9765625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -31071170.666666668, "logits/rejected": 11758368.0, "logps/chosen": -461.9267578125, "logps/rejected": -570.314697265625, "loss": 0.0023, "rewards/chosen": 7.589200337727864, "rewards/margins": 21.634119669596355, "rewards/rejected": -14.04491933186849, "step": 3244 }, { "epoch": 0.8894066054542963, "grad_norm": 4.71875, "kl": 0.6440035700798035, "learning_rate": 5e-06, "logits/chosen": -13650864.0, "logits/rejected": -10280398.545454545, "logps/chosen": -352.9581956129808, "logps/rejected": -395.66623757102275, "loss": 0.024, "rewards/chosen": 4.956295893742488, "rewards/margins": 15.518719599797176, "rewards/rejected": -10.562423706054688, "step": 3245 }, { "epoch": 0.889680690694806, "grad_norm": 5.09375, "kl": 7.820162773132324, "learning_rate": 5e-06, "logits/chosen": -24887162.666666668, "logits/rejected": -18934580.0, "logps/chosen": -412.9095052083333, "logps/rejected": -670.3108723958334, "loss": 0.0158, "rewards/chosen": 7.128684997558594, "rewards/margins": 19.75416056315104, "rewards/rejected": -12.625475565592447, "step": 3246 }, { "epoch": 0.8899547759353159, "grad_norm": 13.5, "kl": 7.234199523925781, "learning_rate": 5e-06, "logits/chosen": -27825954.285714287, "logits/rejected": -26593708.8, "logps/chosen": -419.59737723214283, "logps/rejected": -523.5751953125, "loss": 0.0565, "rewards/chosen": 7.654515947614398, "rewards/margins": 19.98647700718471, "rewards/rejected": -12.331961059570313, "step": 3247 }, { "epoch": 0.8902288611758257, "grad_norm": 4.78125, "kl": 4.804170608520508, "learning_rate": 5e-06, "logits/chosen": -17914388.0, "logits/rejected": -38657592.0, "logps/chosen": -377.8473205566406, "logps/rejected": -432.611572265625, "loss": 0.0221, "rewards/chosen": 7.371123313903809, "rewards/margins": 18.325772285461426, "rewards/rejected": -10.954648971557617, "step": 3248 }, { "epoch": 0.8905029464163354, "grad_norm": 3.359375, "kl": 3.8903567790985107, "learning_rate": 5e-06, "logits/chosen": 9733331.333333334, "logits/rejected": -5229635.0, "logps/chosen": -425.03125, "logps/rejected": -435.4539388020833, "loss": 0.0296, "rewards/chosen": 7.814868291219075, "rewards/margins": 17.332443873087566, "rewards/rejected": -9.51757558186849, "step": 3249 }, { "epoch": 0.8907770316568453, "grad_norm": 4.75, "kl": 0.3783671259880066, "learning_rate": 5e-06, "logits/chosen": -19628456.533333335, "logits/rejected": -26085703.111111112, "logps/chosen": -371.69781901041665, "logps/rejected": -464.9138454861111, "loss": 0.0197, "rewards/chosen": 7.9822230021158855, "rewards/margins": 20.478937276204427, "rewards/rejected": -12.496714274088541, "step": 3250 }, { "epoch": 0.8910511168973551, "grad_norm": 4.6875, "kl": 9.29456901550293, "learning_rate": 5e-06, "logits/chosen": -14207396.363636363, "logits/rejected": -15413304.615384616, "logps/chosen": -267.20015092329544, "logps/rejected": -502.5461989182692, "loss": 0.0352, "rewards/chosen": 7.559844970703125, "rewards/margins": 18.99899174616887, "rewards/rejected": -11.439146775465746, "step": 3251 }, { "epoch": 0.8913252021378649, "grad_norm": 4.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 8623325.090909092, "logits/rejected": -30061097.846153848, "logps/chosen": -455.12895063920456, "logps/rejected": -630.4149639423077, "loss": 0.0542, "rewards/chosen": 6.37502011385831, "rewards/margins": 22.74652467740999, "rewards/rejected": -16.37150456355168, "step": 3252 }, { "epoch": 0.8915992873783747, "grad_norm": 0.5546875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -30682077.09090909, "logits/rejected": -42229883.07692308, "logps/chosen": -443.88676313920456, "logps/rejected": -582.9907977764423, "loss": 0.0017, "rewards/chosen": 9.172198208895596, "rewards/margins": 25.53758517178622, "rewards/rejected": -16.365386962890625, "step": 3253 }, { "epoch": 0.8918733726188844, "grad_norm": 5.25, "kl": 6.2825727462768555, "learning_rate": 5e-06, "logits/chosen": -38280713.14285714, "logits/rejected": -22864782.4, "logps/chosen": -358.9469517299107, "logps/rejected": -428.0935546875, "loss": 0.0698, "rewards/chosen": 5.893016270228794, "rewards/margins": 14.550309208461215, "rewards/rejected": -8.657292938232422, "step": 3254 }, { "epoch": 0.8921474578593943, "grad_norm": 0.302734375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -20675216.0, "logits/rejected": -15089314.0, "logps/chosen": -462.48016357421875, "logps/rejected": -560.0872802734375, "loss": 0.0014, "rewards/chosen": 7.614752769470215, "rewards/margins": 23.237372398376465, "rewards/rejected": -15.62261962890625, "step": 3255 }, { "epoch": 0.8924215430999041, "grad_norm": 5.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 7496715.2, "logits/rejected": -553685.7142857143, "logps/chosen": -477.6796875, "logps/rejected": -586.4926060267857, "loss": 0.0397, "rewards/chosen": 7.13824234008789, "rewards/margins": 20.09374553135463, "rewards/rejected": -12.955503191266741, "step": 3256 }, { "epoch": 0.8926956283404138, "grad_norm": 2.46875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -24596992.0, "logits/rejected": -37172072.0, "logps/chosen": -372.56756591796875, "logps/rejected": -535.0221557617188, "loss": 0.0186, "rewards/chosen": 8.130743026733398, "rewards/margins": 21.714662551879883, "rewards/rejected": -13.583919525146484, "step": 3257 }, { "epoch": 0.8929697135809237, "grad_norm": 2.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -10477274.285714285, "logits/rejected": -21416798.11764706, "logps/chosen": -498.22422572544644, "logps/rejected": -618.2403492647059, "loss": 0.0182, "rewards/chosen": 8.580824715750557, "rewards/margins": 23.519140612177488, "rewards/rejected": -14.93831589642693, "step": 3258 }, { "epoch": 0.8932437988214335, "grad_norm": 1.4765625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -8472916.444444444, "logits/rejected": -13560296.533333333, "logps/chosen": -380.42298719618054, "logps/rejected": -617.4619791666667, "loss": 0.0036, "rewards/chosen": 7.398857964409722, "rewards/margins": 21.708610365125868, "rewards/rejected": -14.309752400716146, "step": 3259 }, { "epoch": 0.8935178840619432, "grad_norm": 5.65625, "kl": 6.186586380004883, "learning_rate": 5e-06, "logits/chosen": -34002786.666666664, "logits/rejected": -24032426.666666668, "logps/chosen": -398.94873046875, "logps/rejected": -482.4241536458333, "loss": 0.0221, "rewards/chosen": 8.141326904296875, "rewards/margins": 20.968124389648438, "rewards/rejected": -12.826797485351562, "step": 3260 }, { "epoch": 0.8937919693024531, "grad_norm": 7.84375, "kl": 4.848623752593994, "learning_rate": 5e-06, "logits/chosen": -9765989.714285715, "logits/rejected": -29061248.0, "logps/chosen": -406.53250558035717, "logps/rejected": -408.15927734375, "loss": 0.0255, "rewards/chosen": 7.07335444859096, "rewards/margins": 18.36395961216518, "rewards/rejected": -11.290605163574218, "step": 3261 }, { "epoch": 0.8940660545429628, "grad_norm": 4.0625, "kl": 1.4335403442382812, "learning_rate": 5e-06, "logits/chosen": -28030698.666666668, "logits/rejected": -17926590.666666668, "logps/chosen": -517.23681640625, "logps/rejected": -499.6307373046875, "loss": 0.0085, "rewards/chosen": 7.804893493652344, "rewards/margins": 20.441182454427086, "rewards/rejected": -12.63628896077474, "step": 3262 }, { "epoch": 0.8943401397834727, "grad_norm": 2.3125, "kl": 2.4388110637664795, "learning_rate": 5e-06, "logits/chosen": -25153641.846153848, "logits/rejected": -8974000.0, "logps/chosen": -423.4836989182692, "logps/rejected": -361.3936656605114, "loss": 0.0085, "rewards/chosen": 8.547644981971153, "rewards/margins": 17.26070067932556, "rewards/rejected": -8.713055697354404, "step": 3263 }, { "epoch": 0.8946142250239825, "grad_norm": 9.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -47929358.222222224, "logits/rejected": -35883669.333333336, "logps/chosen": -407.88226996527777, "logps/rejected": -581.3569661458333, "loss": 0.033, "rewards/chosen": 7.557284884982639, "rewards/margins": 20.04415724012587, "rewards/rejected": -12.486872355143229, "step": 3264 }, { "epoch": 0.8948883102644922, "grad_norm": 4.75, "kl": 6.9351911544799805, "learning_rate": 5e-06, "logits/chosen": -25928552.0, "logits/rejected": -36372784.0, "logps/chosen": -528.3594563802084, "logps/rejected": -504.8844807942708, "loss": 0.0492, "rewards/chosen": 8.79972775777181, "rewards/margins": 17.525281270345054, "rewards/rejected": -8.725553512573242, "step": 3265 }, { "epoch": 0.8951623955050021, "grad_norm": 1.8203125, "kl": 1.251556396484375, "learning_rate": 5e-06, "logits/chosen": -25024546.133333333, "logits/rejected": -57798720.0, "logps/chosen": -561.7895182291667, "logps/rejected": -420.4456380208333, "loss": 0.0042, "rewards/chosen": 8.66520487467448, "rewards/margins": 21.299366082085506, "rewards/rejected": -12.634161207411024, "step": 3266 }, { "epoch": 0.8954364807455119, "grad_norm": 5.5, "kl": 4.426759243011475, "learning_rate": 5e-06, "logits/chosen": -2732600.727272727, "logits/rejected": -9844343.384615384, "logps/chosen": -399.86820845170456, "logps/rejected": -543.3680513822115, "loss": 0.0143, "rewards/chosen": 9.014307195490057, "rewards/margins": 22.17362186458561, "rewards/rejected": -13.159314669095552, "step": 3267 }, { "epoch": 0.8957105659860216, "grad_norm": 6.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 1513986.0, "logits/rejected": -24948548.0, "logps/chosen": -296.0464782714844, "logps/rejected": -579.311767578125, "loss": 0.0166, "rewards/chosen": 6.027951240539551, "rewards/margins": 20.820116996765137, "rewards/rejected": -14.792165756225586, "step": 3268 }, { "epoch": 0.8959846512265315, "grad_norm": 4.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -42215744.0, "logits/rejected": -24506328.615384616, "logps/chosen": -357.41537198153407, "logps/rejected": -581.6946364182693, "loss": 0.0134, "rewards/chosen": 7.015277515758168, "rewards/margins": 19.535771243222108, "rewards/rejected": -12.520493727463942, "step": 3269 }, { "epoch": 0.8962587364670412, "grad_norm": 12.875, "kl": 9.649497985839844, "learning_rate": 5e-06, "logits/chosen": 73620999.52941176, "logits/rejected": -37814742.85714286, "logps/chosen": -336.42580997242646, "logps/rejected": -404.19022042410717, "loss": 0.0571, "rewards/chosen": 5.41325782327091, "rewards/margins": 13.470160572468732, "rewards/rejected": -8.056902749197823, "step": 3270 }, { "epoch": 0.896532821707551, "grad_norm": 1.96875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -20637604.923076924, "logits/rejected": -32685352.727272727, "logps/chosen": -506.9035456730769, "logps/rejected": -557.9881480823864, "loss": 0.0054, "rewards/chosen": 7.201697129469651, "rewards/margins": 18.66140699053144, "rewards/rejected": -11.45970986106179, "step": 3271 }, { "epoch": 0.8968069069480609, "grad_norm": 5.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 29931418.666666668, "logits/rejected": -39794386.666666664, "logps/chosen": -514.921142578125, "logps/rejected": -506.8558756510417, "loss": 0.0497, "rewards/chosen": 5.817975997924805, "rewards/margins": 19.32566261291504, "rewards/rejected": -13.507686614990234, "step": 3272 }, { "epoch": 0.8970809921885706, "grad_norm": 3.71875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 5020721.846153846, "logits/rejected": -55526318.54545455, "logps/chosen": -544.6286057692307, "logps/rejected": -501.5062144886364, "loss": 0.0111, "rewards/chosen": 8.043867844801683, "rewards/margins": 19.868229472553814, "rewards/rejected": -11.82436162775213, "step": 3273 }, { "epoch": 0.8973550774290805, "grad_norm": 8.8125, "kl": 2.472839832305908, "learning_rate": 5e-06, "logits/chosen": -9333807.111111112, "logits/rejected": -21346265.6, "logps/chosen": -399.6330295138889, "logps/rejected": -503.27623697916664, "loss": 0.0215, "rewards/chosen": 7.2946582370334205, "rewards/margins": 18.252613152398006, "rewards/rejected": -10.957954915364583, "step": 3274 }, { "epoch": 0.8976291626695903, "grad_norm": 3.953125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -16452522.666666666, "logits/rejected": -19393169.777777776, "logps/chosen": -379.53753255208335, "logps/rejected": -445.21978081597223, "loss": 0.0395, "rewards/chosen": 5.465037027994792, "rewards/margins": 16.932533264160156, "rewards/rejected": -11.467496236165365, "step": 3275 }, { "epoch": 0.8979032479101, "grad_norm": 3.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -24017180.0, "logits/rejected": -27939586.0, "logps/chosen": -454.31378173828125, "logps/rejected": -641.167236328125, "loss": 0.0059, "rewards/chosen": 6.601651668548584, "rewards/margins": 19.51970624923706, "rewards/rejected": -12.918054580688477, "step": 3276 }, { "epoch": 0.8981773331506099, "grad_norm": 11.25, "kl": 5.83911657333374, "learning_rate": 5e-06, "logits/chosen": -18926806.85714286, "logits/rejected": -40329222.4, "logps/chosen": -503.3309849330357, "logps/rejected": -465.16923828125, "loss": 0.037, "rewards/chosen": 8.052865709577288, "rewards/margins": 19.475077165876115, "rewards/rejected": -11.422211456298829, "step": 3277 }, { "epoch": 0.8984514183911196, "grad_norm": 8.75, "kl": 5.002000331878662, "learning_rate": 5e-06, "logits/chosen": -41034776.88888889, "logits/rejected": -50876168.53333333, "logps/chosen": -412.00678168402777, "logps/rejected": -657.5606770833333, "loss": 0.0155, "rewards/chosen": 8.184183756510416, "rewards/margins": 23.990234375, "rewards/rejected": -15.806050618489584, "step": 3278 }, { "epoch": 0.8987255036316294, "grad_norm": 0.62109375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -15535637.333333334, "logits/rejected": -18724253.333333332, "logps/chosen": -499.831298828125, "logps/rejected": -517.4775390625, "loss": 0.0021, "rewards/chosen": 7.399769465128581, "rewards/margins": 24.578421910603844, "rewards/rejected": -17.17865244547526, "step": 3279 }, { "epoch": 0.8989995888721393, "grad_norm": 5.9375, "kl": 2.679523468017578, "learning_rate": 5e-06, "logits/chosen": -20396353.333333332, "logits/rejected": -35951306.666666664, "logps/chosen": -402.4847005208333, "logps/rejected": -604.6202392578125, "loss": 0.0259, "rewards/chosen": 7.597684224446614, "rewards/margins": 25.267824808756508, "rewards/rejected": -17.670140584309895, "step": 3280 }, { "epoch": 0.899273674112649, "grad_norm": 12.125, "kl": 8.645370483398438, "learning_rate": 5e-06, "logits/chosen": -24609770.666666668, "logits/rejected": -29221781.333333332, "logps/chosen": -388.2260416666667, "logps/rejected": -514.3255750868055, "loss": 0.1185, "rewards/chosen": 6.993094380696615, "rewards/margins": 19.641464063856336, "rewards/rejected": -12.648369683159721, "step": 3281 }, { "epoch": 0.8995477593531588, "grad_norm": 13.6875, "kl": 3.4817237854003906, "learning_rate": 5e-06, "logits/chosen": -27715874.90909091, "logits/rejected": -13040940.307692308, "logps/chosen": -401.4315074573864, "logps/rejected": -592.7224308894231, "loss": 0.0731, "rewards/chosen": 7.350491610440341, "rewards/margins": 19.860050654911493, "rewards/rejected": -12.509559044471153, "step": 3282 }, { "epoch": 0.8998218445936687, "grad_norm": 1.0546875, "kl": 1.5985807180404663, "learning_rate": 5e-06, "logits/chosen": -12234441.142857144, "logits/rejected": -32354873.6, "logps/chosen": -404.85756138392856, "logps/rejected": -532.83427734375, "loss": 0.003, "rewards/chosen": 7.789879935128348, "rewards/margins": 22.405726187569755, "rewards/rejected": -14.615846252441406, "step": 3283 }, { "epoch": 0.9000959298341784, "grad_norm": 3.484375, "kl": 7.507717132568359, "learning_rate": 5e-06, "logits/chosen": -9595502.666666666, "logits/rejected": -20425009.333333332, "logps/chosen": -573.1148274739584, "logps/rejected": -497.5091145833333, "loss": 0.0103, "rewards/chosen": 8.022148768107096, "rewards/margins": 22.559995651245117, "rewards/rejected": -14.537846883138021, "step": 3284 }, { "epoch": 0.9003700150746883, "grad_norm": 4.0, "kl": 0.04975064843893051, "learning_rate": 5e-06, "logits/chosen": -31902880.0, "logits/rejected": -32576755.2, "logps/chosen": -521.7803780691964, "logps/rejected": -597.17646484375, "loss": 0.0089, "rewards/chosen": 8.060741969517299, "rewards/margins": 23.30248849051339, "rewards/rejected": -15.241746520996093, "step": 3285 }, { "epoch": 0.900644100315198, "grad_norm": 3.375, "kl": 2.3222084045410156, "learning_rate": 5e-06, "logits/chosen": -28077840.0, "logits/rejected": -45163539.2, "logps/chosen": -400.31895228794644, "logps/rejected": -638.448388671875, "loss": 0.0068, "rewards/chosen": 7.968287876674107, "rewards/margins": 25.165590122767856, "rewards/rejected": -17.19730224609375, "step": 3286 }, { "epoch": 0.9009181855557078, "grad_norm": 1.0859375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -17165484.0, "logits/rejected": -7780477.5, "logps/chosen": -423.764892578125, "logps/rejected": -645.2080078125, "loss": 0.003, "rewards/chosen": 8.617183685302734, "rewards/margins": 23.477739334106445, "rewards/rejected": -14.860555648803711, "step": 3287 }, { "epoch": 0.9011922707962177, "grad_norm": 0.65625, "kl": 1.9979280233383179, "learning_rate": 5e-06, "logits/chosen": -41892352.0, "logits/rejected": -43194363.733333334, "logps/chosen": -470.9806857638889, "logps/rejected": -441.7190755208333, "loss": 0.0016, "rewards/chosen": 7.864627414279514, "rewards/margins": 18.13599582248264, "rewards/rejected": -10.271368408203125, "step": 3288 }, { "epoch": 0.9014663560367274, "grad_norm": 15.0, "kl": 23.70978355407715, "learning_rate": 5e-06, "logits/chosen": -17219866.94736842, "logits/rejected": -36470704.0, "logps/chosen": -467.84765625, "logps/rejected": -448.21650390625, "loss": 0.1184, "rewards/chosen": 7.271706028988487, "rewards/margins": 17.964610692074423, "rewards/rejected": -10.692904663085937, "step": 3289 }, { "epoch": 0.9017404412772372, "grad_norm": 7.65625, "kl": 11.809857368469238, "learning_rate": 5e-06, "logits/chosen": -16688362.0, "logits/rejected": 1417108.5, "logps/chosen": -383.31585693359375, "logps/rejected": -605.9210205078125, "loss": 0.0697, "rewards/chosen": 6.945493698120117, "rewards/margins": 15.371232986450195, "rewards/rejected": -8.425739288330078, "step": 3290 }, { "epoch": 0.902014526517747, "grad_norm": 9.5, "kl": 1.9428876638412476, "learning_rate": 5e-06, "logits/chosen": -19896504.0, "logits/rejected": -38969388.0, "logps/chosen": -322.2588195800781, "logps/rejected": -583.6301879882812, "loss": 0.0605, "rewards/chosen": 5.710681438446045, "rewards/margins": 17.808331966400146, "rewards/rejected": -12.097650527954102, "step": 3291 }, { "epoch": 0.9022886117582568, "grad_norm": 2.09375, "kl": 1.7962188720703125, "learning_rate": 5e-06, "logits/chosen": -4399932.923076923, "logits/rejected": -49989137.45454545, "logps/chosen": -558.8298527644231, "logps/rejected": -584.8534268465909, "loss": 0.0048, "rewards/chosen": 8.287368774414062, "rewards/margins": 25.591967496004973, "rewards/rejected": -17.30459872159091, "step": 3292 }, { "epoch": 0.9025626969987666, "grad_norm": 5.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -24190373.818181816, "logits/rejected": -2121428.4615384615, "logps/chosen": -359.8349609375, "logps/rejected": -500.31043419471155, "loss": 0.0131, "rewards/chosen": 7.415772871537642, "rewards/margins": 20.138087559413243, "rewards/rejected": -12.7223146878756, "step": 3293 }, { "epoch": 0.9028367822392764, "grad_norm": 5.09375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -31601645.714285713, "logits/rejected": -3818264.9411764704, "logps/chosen": -480.02284458705356, "logps/rejected": -606.3738511029412, "loss": 0.105, "rewards/chosen": 6.168605804443359, "rewards/margins": 19.0066660712747, "rewards/rejected": -12.838060266831341, "step": 3294 }, { "epoch": 0.9031108674797862, "grad_norm": 9.125, "kl": 1.5473588705062866, "learning_rate": 5e-06, "logits/chosen": -34153308.8, "logits/rejected": -34619963.428571425, "logps/chosen": -493.9662109375, "logps/rejected": -492.79439871651783, "loss": 0.0247, "rewards/chosen": 7.0137184143066404, "rewards/margins": 21.138778359549384, "rewards/rejected": -14.125059945242745, "step": 3295 }, { "epoch": 0.9033849527202961, "grad_norm": 7.71875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -3449764.0, "logits/rejected": -6625284.0, "logps/chosen": -345.2143249511719, "logps/rejected": -496.99749755859375, "loss": 0.0138, "rewards/chosen": 6.764804840087891, "rewards/margins": 19.25751495361328, "rewards/rejected": -12.49271011352539, "step": 3296 }, { "epoch": 0.9036590379608058, "grad_norm": 17.0, "kl": 6.4465227127075195, "learning_rate": 5e-06, "logits/chosen": -10127926.857142856, "logits/rejected": -31868121.6, "logps/chosen": -406.71337890625, "logps/rejected": -432.22109375, "loss": 0.075, "rewards/chosen": 6.733001708984375, "rewards/margins": 20.215536499023436, "rewards/rejected": -13.482534790039063, "step": 3297 }, { "epoch": 0.9039331232013156, "grad_norm": 14.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -19434801.777777776, "logits/rejected": -10889654.4, "logps/chosen": -341.4130045572917, "logps/rejected": -749.2998046875, "loss": 0.0679, "rewards/chosen": 6.726075490315755, "rewards/margins": 22.735958099365234, "rewards/rejected": -16.00988260904948, "step": 3298 }, { "epoch": 0.9042072084418254, "grad_norm": 10.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -25710056.888888888, "logits/rejected": -31325843.2, "logps/chosen": -385.25884331597223, "logps/rejected": -589.96875, "loss": 0.0423, "rewards/chosen": 5.708502875434028, "rewards/margins": 18.298617214626734, "rewards/rejected": -12.590114339192708, "step": 3299 }, { "epoch": 0.9044812936823352, "grad_norm": 10.875, "kl": 1.775200605392456, "learning_rate": 5e-06, "logits/chosen": -24216354.46153846, "logits/rejected": -54533608.72727273, "logps/chosen": -402.7688176081731, "logps/rejected": -464.7224786931818, "loss": 0.0836, "rewards/chosen": 5.796341529259315, "rewards/margins": 19.626916258485167, "rewards/rejected": -13.830574729225852, "step": 3300 }, { "epoch": 0.904755378922845, "grad_norm": 1.8203125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -13424325.818181818, "logits/rejected": -9618467.076923076, "logps/chosen": -457.52077414772725, "logps/rejected": -594.8576096754807, "loss": 0.0047, "rewards/chosen": 8.088642467151988, "rewards/margins": 21.246160120397178, "rewards/rejected": -13.157517653245192, "step": 3301 }, { "epoch": 0.9050294641633548, "grad_norm": 13.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 20555970.90909091, "logits/rejected": -22158340.923076924, "logps/chosen": -378.83695845170456, "logps/rejected": -463.9070012019231, "loss": 0.0354, "rewards/chosen": 6.946484652432528, "rewards/margins": 19.613466276155485, "rewards/rejected": -12.666981623722958, "step": 3302 }, { "epoch": 0.9053035494038646, "grad_norm": 6.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -27979022.545454547, "logits/rejected": -14154664.615384616, "logps/chosen": -328.6751154119318, "logps/rejected": -554.8251953125, "loss": 0.0184, "rewards/chosen": 6.936605973677202, "rewards/margins": 21.32911249974391, "rewards/rejected": -14.392506526066708, "step": 3303 }, { "epoch": 0.9055776346443744, "grad_norm": 8.375, "kl": 4.2201056480407715, "learning_rate": 5e-06, "logits/chosen": -20936901.818181816, "logits/rejected": -18860345.846153848, "logps/chosen": -411.34499289772725, "logps/rejected": -597.7161207932693, "loss": 0.0122, "rewards/chosen": 6.73729775168679, "rewards/margins": 18.493430611136912, "rewards/rejected": -11.75613285945012, "step": 3304 }, { "epoch": 0.9058517198848842, "grad_norm": 2.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -41422557.09090909, "logits/rejected": -37803426.461538464, "logps/chosen": -390.5486505681818, "logps/rejected": -487.20620492788464, "loss": 0.0101, "rewards/chosen": 7.379303672096946, "rewards/margins": 18.997115608695506, "rewards/rejected": -11.617811936598558, "step": 3305 }, { "epoch": 0.906125805125394, "grad_norm": 2.671875, "kl": 3.5476317405700684, "learning_rate": 5e-06, "logits/chosen": -34732945.06666667, "logits/rejected": -38179360.0, "logps/chosen": -389.464453125, "logps/rejected": -506.6282552083333, "loss": 0.008, "rewards/chosen": 6.89185791015625, "rewards/margins": 22.378406439887154, "rewards/rejected": -15.486548529730904, "step": 3306 }, { "epoch": 0.9063998903659038, "grad_norm": 7.84375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -3932558.8571428573, "logits/rejected": -30794208.0, "logps/chosen": -394.23496791294644, "logps/rejected": -610.3265739889706, "loss": 0.0137, "rewards/chosen": 7.777460370744977, "rewards/margins": 20.734716912277605, "rewards/rejected": -12.95725654153263, "step": 3307 }, { "epoch": 0.9066739756064136, "grad_norm": 1.28125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -4495482.666666667, "logits/rejected": -32790775.466666665, "logps/chosen": -432.0858561197917, "logps/rejected": -500.7975260416667, "loss": 0.0032, "rewards/chosen": 8.912160237630209, "rewards/margins": 22.178865559895833, "rewards/rejected": -13.266705322265626, "step": 3308 }, { "epoch": 0.9069480608469234, "grad_norm": 2.546875, "kl": 0.0378367118537426, "learning_rate": 5e-06, "logits/chosen": -28232520.727272727, "logits/rejected": 25076256.0, "logps/chosen": -438.41015625, "logps/rejected": -633.1521935096154, "loss": 0.011, "rewards/chosen": 5.796332966197621, "rewards/margins": 19.1668597934963, "rewards/rejected": -13.370526827298677, "step": 3309 }, { "epoch": 0.9072221460874332, "grad_norm": 3.890625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -4815945.777777778, "logits/rejected": 51088093.86666667, "logps/chosen": -350.8306477864583, "logps/rejected": -584.8909505208334, "loss": 0.0382, "rewards/chosen": 6.03243891398112, "rewards/margins": 25.651264190673828, "rewards/rejected": -19.618825276692707, "step": 3310 }, { "epoch": 0.907496231327943, "grad_norm": 2.171875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 18692823.272727273, "logits/rejected": -22085021.53846154, "logps/chosen": -410.48197798295456, "logps/rejected": -676.4742337740385, "loss": 0.0104, "rewards/chosen": 6.540905345569957, "rewards/margins": 21.567096470119235, "rewards/rejected": -15.026191124549278, "step": 3311 }, { "epoch": 0.9077703165684528, "grad_norm": 9.125, "kl": 5.7908477783203125, "learning_rate": 5e-06, "logits/chosen": 1945320.7272727273, "logits/rejected": -48074515.692307696, "logps/chosen": -469.8546697443182, "logps/rejected": -462.57335486778845, "loss": 0.0238, "rewards/chosen": 9.134721235795455, "rewards/margins": 18.53656144575639, "rewards/rejected": -9.401840209960938, "step": 3312 }, { "epoch": 0.9080444018089626, "grad_norm": 2.765625, "kl": 7.032042503356934, "learning_rate": 5e-06, "logits/chosen": -9830882.285714285, "logits/rejected": -13648312.0, "logps/chosen": -466.3935546875, "logps/rejected": -474.48330078125, "loss": 0.0107, "rewards/chosen": 7.508291516985212, "rewards/margins": 23.014451490129744, "rewards/rejected": -15.50615997314453, "step": 3313 }, { "epoch": 0.9083184870494724, "grad_norm": 9.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -11875931.076923076, "logits/rejected": -36352558.54545455, "logps/chosen": -540.8062650240385, "logps/rejected": -554.5087002840909, "loss": 0.0661, "rewards/chosen": 7.28453357403095, "rewards/margins": 22.081801141058648, "rewards/rejected": -14.7972675670277, "step": 3314 }, { "epoch": 0.9085925722899821, "grad_norm": 3.796875, "kl": 7.250267028808594, "learning_rate": 5e-06, "logits/chosen": -10800596.0, "logits/rejected": 763860.6666666666, "logps/chosen": -437.3125, "logps/rejected": -757.8673502604166, "loss": 0.0448, "rewards/chosen": 8.914283752441406, "rewards/margins": 21.32230885823568, "rewards/rejected": -12.408025105794271, "step": 3315 }, { "epoch": 0.908866657530492, "grad_norm": 0.68359375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -18196176.0, "logits/rejected": -31139337.14285714, "logps/chosen": -516.197314453125, "logps/rejected": -455.26046316964283, "loss": 0.0028, "rewards/chosen": 6.789205932617188, "rewards/margins": 20.5324702671596, "rewards/rejected": -13.743264334542411, "step": 3316 }, { "epoch": 0.9091407427710018, "grad_norm": 11.0, "kl": 3.4795475006103516, "learning_rate": 5e-06, "logits/chosen": -13274249.846153846, "logits/rejected": -23091538.90909091, "logps/chosen": -378.0248272235577, "logps/rejected": -443.63991477272725, "loss": 0.042, "rewards/chosen": 7.051564730130709, "rewards/margins": 16.875454029003222, "rewards/rejected": -9.823889298872514, "step": 3317 }, { "epoch": 0.9094148280115116, "grad_norm": 12.1875, "kl": 6.969302177429199, "learning_rate": 5e-06, "logits/chosen": -31188893.53846154, "logits/rejected": -33680360.72727273, "logps/chosen": -520.9982346754807, "logps/rejected": -593.1823064630681, "loss": 0.0202, "rewards/chosen": 9.191443223219652, "rewards/margins": 20.8227430223585, "rewards/rejected": -11.63129979913885, "step": 3318 }, { "epoch": 0.9096889132520214, "grad_norm": 5.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -1085160.3333333333, "logits/rejected": -19498680.0, "logps/chosen": -311.38816324869794, "logps/rejected": -502.2137044270833, "loss": 0.0324, "rewards/chosen": 5.814308802286784, "rewards/margins": 16.301945368448894, "rewards/rejected": -10.48763656616211, "step": 3319 }, { "epoch": 0.9099629984925311, "grad_norm": 9.5625, "kl": 1.4459731578826904, "learning_rate": 5e-06, "logits/chosen": -5980937.714285715, "logits/rejected": -2280762.4, "logps/chosen": -427.83558872767856, "logps/rejected": -282.2814453125, "loss": 0.0479, "rewards/chosen": 6.707512991768973, "rewards/margins": 13.047360937935967, "rewards/rejected": -6.3398479461669925, "step": 3320 }, { "epoch": 0.910237083733041, "grad_norm": 7.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -1831506.1818181819, "logits/rejected": 13196342.153846154, "logps/chosen": -446.53520063920456, "logps/rejected": -612.6285306490385, "loss": 0.0331, "rewards/chosen": 6.385941938920454, "rewards/margins": 23.39297976193728, "rewards/rejected": -17.007037823016827, "step": 3321 }, { "epoch": 0.9105111689735508, "grad_norm": 6.40625, "kl": 11.881876945495605, "learning_rate": 5e-06, "logits/chosen": -17743579.733333334, "logits/rejected": -1220169.888888889, "logps/chosen": -514.1330078125, "logps/rejected": -496.5764973958333, "loss": 0.0247, "rewards/chosen": 8.094116719563802, "rewards/margins": 18.461661953396266, "rewards/rejected": -10.367545233832466, "step": 3322 }, { "epoch": 0.9107852542140605, "grad_norm": 5.90625, "kl": 0.20232391357421875, "learning_rate": 5e-06, "logits/chosen": 24814053.818181816, "logits/rejected": -45069582.76923077, "logps/chosen": -438.43039772727275, "logps/rejected": -564.9410682091346, "loss": 0.0219, "rewards/chosen": 6.607441295276988, "rewards/margins": 21.51169474141581, "rewards/rejected": -14.904253446138823, "step": 3323 }, { "epoch": 0.9110593394545704, "grad_norm": 5.40625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -14095203.2, "logits/rejected": -17238633.14285714, "logps/chosen": -408.8573974609375, "logps/rejected": -479.0831821986607, "loss": 0.01, "rewards/chosen": 8.587049865722657, "rewards/margins": 19.091767011369978, "rewards/rejected": -10.504717145647321, "step": 3324 }, { "epoch": 0.9113334246950802, "grad_norm": 3.984375, "kl": 6.286547660827637, "learning_rate": 5e-06, "logits/chosen": -4521013.818181818, "logits/rejected": -28297031.384615384, "logps/chosen": -334.03080610795456, "logps/rejected": -435.3109600360577, "loss": 0.0128, "rewards/chosen": 6.454290216619318, "rewards/margins": 18.66423706908326, "rewards/rejected": -12.209946852463942, "step": 3325 }, { "epoch": 0.9116075099355899, "grad_norm": 5.28125, "kl": 0.21935781836509705, "learning_rate": 5e-06, "logits/chosen": -22919533.714285713, "logits/rejected": -41415939.2, "logps/chosen": -388.0287388392857, "logps/rejected": -601.068505859375, "loss": 0.0346, "rewards/chosen": 6.055495125906808, "rewards/margins": 18.69427533830915, "rewards/rejected": -12.638780212402343, "step": 3326 }, { "epoch": 0.9118815951760998, "grad_norm": 2.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -17457587.2, "logits/rejected": -21540884.57142857, "logps/chosen": -351.455029296875, "logps/rejected": -588.5281808035714, "loss": 0.003, "rewards/chosen": 8.06529541015625, "rewards/margins": 21.472942679268975, "rewards/rejected": -13.407647269112724, "step": 3327 }, { "epoch": 0.9121556804166095, "grad_norm": 10.5625, "kl": 0.8077990412712097, "learning_rate": 5e-06, "logits/chosen": -29333545.6, "logits/rejected": -32522457.14285714, "logps/chosen": -368.6446533203125, "logps/rejected": -545.244384765625, "loss": 0.0227, "rewards/chosen": 8.349209594726563, "rewards/margins": 20.90127694266183, "rewards/rejected": -12.552067347935267, "step": 3328 }, { "epoch": 0.9124297656571194, "grad_norm": 3.046875, "kl": 10.729201316833496, "learning_rate": 5e-06, "logits/chosen": -39929856.0, "logits/rejected": -27090435.2, "logps/chosen": -523.8369489397321, "logps/rejected": -642.53251953125, "loss": 0.0139, "rewards/chosen": 9.105447496686663, "rewards/margins": 24.920718492780413, "rewards/rejected": -15.81527099609375, "step": 3329 }, { "epoch": 0.9127038508976292, "grad_norm": 1.328125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -32867744.0, "logits/rejected": -17678092.8, "logps/chosen": -448.0390896267361, "logps/rejected": -493.96546223958336, "loss": 0.005, "rewards/chosen": 8.367798699273003, "rewards/margins": 21.106566196017795, "rewards/rejected": -12.738767496744792, "step": 3330 }, { "epoch": 0.9129779361381389, "grad_norm": 5.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -20395031.272727273, "logits/rejected": -31491766.153846152, "logps/chosen": -442.13423295454544, "logps/rejected": -601.3223407451923, "loss": 0.032, "rewards/chosen": 6.586248224431818, "rewards/margins": 19.68912180653819, "rewards/rejected": -13.10287358210637, "step": 3331 }, { "epoch": 0.9132520213786488, "grad_norm": 5.90625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -16453632.0, "logits/rejected": -7729715.428571428, "logps/chosen": -467.888916015625, "logps/rejected": -484.02103097098217, "loss": 0.0167, "rewards/chosen": 8.435335540771485, "rewards/margins": 18.788296399797712, "rewards/rejected": -10.352960859026227, "step": 3332 }, { "epoch": 0.9135261066191586, "grad_norm": 9.4375, "kl": 12.308806419372559, "learning_rate": 5e-06, "logits/chosen": -14411598.11764706, "logits/rejected": -15040121.142857144, "logps/chosen": -406.1475183823529, "logps/rejected": -414.3553989955357, "loss": 0.0543, "rewards/chosen": 7.638027415556066, "rewards/margins": 15.243520688609916, "rewards/rejected": -7.605493273053851, "step": 3333 }, { "epoch": 0.9138001918596683, "grad_norm": 7.78125, "kl": 2.6674141883850098, "learning_rate": 5e-06, "logits/chosen": -10213242.666666666, "logits/rejected": -16308610.666666666, "logps/chosen": -326.64406331380206, "logps/rejected": -433.8812255859375, "loss": 0.0824, "rewards/chosen": 4.804329554239909, "rewards/margins": 17.169084548950195, "rewards/rejected": -12.364754994710287, "step": 3334 }, { "epoch": 0.9140742771001782, "grad_norm": 2.890625, "kl": 10.128726959228516, "learning_rate": 5e-06, "logits/chosen": -30104874.0, "logits/rejected": -47511092.0, "logps/chosen": -431.91748046875, "logps/rejected": -484.55621337890625, "loss": 0.0448, "rewards/chosen": 8.155357360839844, "rewards/margins": 21.755064964294434, "rewards/rejected": -13.59970760345459, "step": 3335 }, { "epoch": 0.914348362340688, "grad_norm": 15.25, "kl": 7.72272253036499, "learning_rate": 5e-06, "logits/chosen": -12058270.857142856, "logits/rejected": -7061145.6, "logps/chosen": -440.4955357142857, "logps/rejected": -636.43876953125, "loss": 0.06, "rewards/chosen": 7.683462960379464, "rewards/margins": 20.607466561453684, "rewards/rejected": -12.924003601074219, "step": 3336 }, { "epoch": 0.9146224475811977, "grad_norm": 10.5625, "kl": 12.809988021850586, "learning_rate": 5e-06, "logits/chosen": -13071043.0, "logits/rejected": 15791000.0, "logps/chosen": -411.1241455078125, "logps/rejected": -388.14642333984375, "loss": 0.0713, "rewards/chosen": 8.219348907470703, "rewards/margins": 18.48321533203125, "rewards/rejected": -10.263866424560547, "step": 3337 }, { "epoch": 0.9148965328217076, "grad_norm": 3.84375, "kl": 3.723644256591797, "learning_rate": 5e-06, "logits/chosen": -11229681.0, "logits/rejected": -46033828.0, "logps/chosen": -320.697265625, "logps/rejected": -603.2988891601562, "loss": 0.0402, "rewards/chosen": 6.5072340965271, "rewards/margins": 20.72993803024292, "rewards/rejected": -14.22270393371582, "step": 3338 }, { "epoch": 0.9151706180622173, "grad_norm": 7.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -36436725.333333336, "logits/rejected": -26084914.666666668, "logps/chosen": -405.906982421875, "logps/rejected": -503.2132568359375, "loss": 0.0171, "rewards/chosen": 5.912756601969401, "rewards/margins": 17.22365951538086, "rewards/rejected": -11.310902913411459, "step": 3339 }, { "epoch": 0.9154447033027272, "grad_norm": 23.0, "kl": 7.9164533615112305, "learning_rate": 5e-06, "logits/chosen": -32049652.57142857, "logits/rejected": -15002644.8, "logps/chosen": -441.8412388392857, "logps/rejected": -619.527978515625, "loss": 0.0576, "rewards/chosen": 8.029076167515345, "rewards/margins": 18.041194697788782, "rewards/rejected": -10.012118530273437, "step": 3340 }, { "epoch": 0.915718788543237, "grad_norm": 10.4375, "kl": 10.484478950500488, "learning_rate": 5e-06, "logits/chosen": 4128715.2, "logits/rejected": 12848218.666666666, "logps/chosen": -450.5734375, "logps/rejected": -711.0851779513889, "loss": 0.0396, "rewards/chosen": 7.244621785481771, "rewards/margins": 22.652460394965278, "rewards/rejected": -15.407838609483507, "step": 3341 }, { "epoch": 0.9159928737837467, "grad_norm": 1.0234375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -27029120.0, "logits/rejected": -27617276.0, "logps/chosen": -345.2381591796875, "logps/rejected": -615.7932739257812, "loss": 0.0023, "rewards/chosen": 6.331422805786133, "rewards/margins": 20.670973777770996, "rewards/rejected": -14.339550971984863, "step": 3342 }, { "epoch": 0.9162669590242566, "grad_norm": 3.203125, "kl": 0.9199193716049194, "learning_rate": 5e-06, "logits/chosen": -19519228.0, "logits/rejected": -28133781.333333332, "logps/chosen": -517.896484375, "logps/rejected": -548.3609212239584, "loss": 0.011, "rewards/chosen": 6.795858383178711, "rewards/margins": 20.40238126118978, "rewards/rejected": -13.606522878011068, "step": 3343 }, { "epoch": 0.9165410442647663, "grad_norm": 5.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -16578236.444444444, "logits/rejected": -8345756.8, "logps/chosen": -343.8055013020833, "logps/rejected": -568.5280598958333, "loss": 0.0179, "rewards/chosen": 6.073999616834852, "rewards/margins": 19.677099185519747, "rewards/rejected": -13.603099568684895, "step": 3344 }, { "epoch": 0.9168151295052761, "grad_norm": 1.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -955450.2666666667, "logits/rejected": -25980302.222222224, "logps/chosen": -334.21640625, "logps/rejected": -549.1081271701389, "loss": 0.0056, "rewards/chosen": 7.664180501302083, "rewards/margins": 19.21521521674262, "rewards/rejected": -11.551034715440538, "step": 3345 }, { "epoch": 0.917089214745786, "grad_norm": 6.34375, "kl": 3.6408298015594482, "learning_rate": 5e-06, "logits/chosen": -36823800.0, "logits/rejected": -27353968.0, "logps/chosen": -489.185302734375, "logps/rejected": -471.0218912760417, "loss": 0.014, "rewards/chosen": 8.043793360392252, "rewards/margins": 19.722386042277016, "rewards/rejected": -11.678592681884766, "step": 3346 }, { "epoch": 0.9173632999862957, "grad_norm": 5.0625, "kl": 5.772083759307861, "learning_rate": 5e-06, "logits/chosen": -8294191.111111111, "logits/rejected": -35144085.333333336, "logps/chosen": -291.99565972222223, "logps/rejected": -610.5045572916666, "loss": 0.0155, "rewards/chosen": 5.559413062201606, "rewards/margins": 18.120762549506292, "rewards/rejected": -12.561349487304687, "step": 3347 }, { "epoch": 0.9176373852268055, "grad_norm": 3.40625, "kl": 5.43698787689209, "learning_rate": 5e-06, "logits/chosen": -28829243.42857143, "logits/rejected": -33776768.0, "logps/chosen": -418.4281529017857, "logps/rejected": -606.9734375, "loss": 0.0075, "rewards/chosen": 7.974837166922433, "rewards/margins": 19.612604195731027, "rewards/rejected": -11.637767028808593, "step": 3348 }, { "epoch": 0.9179114704673154, "grad_norm": 0.57421875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -33789760.0, "logits/rejected": -29408128.0, "logps/chosen": -448.42703683035717, "logps/rejected": -756.9375, "loss": 0.0019, "rewards/chosen": 6.513699667794364, "rewards/margins": 24.7712428950462, "rewards/rejected": -18.257543227251837, "step": 3349 }, { "epoch": 0.9181855557078251, "grad_norm": 8.0, "kl": 8.651805877685547, "learning_rate": 5e-06, "logits/chosen": -13385966.0, "logits/rejected": -23468596.0, "logps/chosen": -438.2493591308594, "logps/rejected": -654.1311645507812, "loss": 0.0971, "rewards/chosen": 7.524724006652832, "rewards/margins": 21.26654815673828, "rewards/rejected": -13.74182415008545, "step": 3350 }, { "epoch": 0.9184596409483349, "grad_norm": 1.28125, "kl": 4.378342628479004, "learning_rate": 5e-06, "logits/chosen": -20906614.153846152, "logits/rejected": -21334728.727272727, "logps/chosen": -445.24429086538464, "logps/rejected": -577.6187855113636, "loss": 0.0034, "rewards/chosen": 9.146744948167067, "rewards/margins": 20.396038882382264, "rewards/rejected": -11.2492939342152, "step": 3351 }, { "epoch": 0.9187337261888447, "grad_norm": 4.5625, "kl": 5.513841152191162, "learning_rate": 5e-06, "logits/chosen": -10949084.307692308, "logits/rejected": -28297911.272727273, "logps/chosen": -473.40685096153845, "logps/rejected": -498.48659446022725, "loss": 0.0087, "rewards/chosen": 8.235797588641827, "rewards/margins": 20.340141349739127, "rewards/rejected": -12.1043437610973, "step": 3352 }, { "epoch": 0.9190078114293545, "grad_norm": 10.0, "kl": 15.014749526977539, "learning_rate": 5e-06, "logits/chosen": -29736750.0, "logits/rejected": -7058952.5, "logps/chosen": -475.98114013671875, "logps/rejected": -511.1832275390625, "loss": 0.0329, "rewards/chosen": 7.744750499725342, "rewards/margins": 18.585761547088623, "rewards/rejected": -10.841011047363281, "step": 3353 }, { "epoch": 0.9192818966698644, "grad_norm": 3.6875, "kl": 1.1266670227050781, "learning_rate": 5e-06, "logits/chosen": -12611994.666666666, "logits/rejected": -23486538.666666668, "logps/chosen": -373.43994140625, "logps/rejected": -370.2648111979167, "loss": 0.0353, "rewards/chosen": 7.173103332519531, "rewards/margins": 17.758376439412437, "rewards/rejected": -10.585273106892904, "step": 3354 }, { "epoch": 0.9195559819103741, "grad_norm": 7.3125, "kl": 5.344974517822266, "learning_rate": 5e-06, "logits/chosen": -23385284.57142857, "logits/rejected": -31110784.0, "logps/chosen": -398.56005859375, "logps/rejected": -591.41904296875, "loss": 0.0451, "rewards/chosen": 7.308277675083706, "rewards/margins": 22.130013820103237, "rewards/rejected": -14.821736145019532, "step": 3355 }, { "epoch": 0.9198300671508839, "grad_norm": 9.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -10707987.2, "logits/rejected": -36296549.05263158, "logps/chosen": -488.419384765625, "logps/rejected": -640.4945518092105, "loss": 0.0384, "rewards/chosen": 8.317501068115234, "rewards/margins": 22.023589485570007, "rewards/rejected": -13.70608841745477, "step": 3356 }, { "epoch": 0.9201041523913938, "grad_norm": 2.390625, "kl": 0.7407932281494141, "learning_rate": 5e-06, "logits/chosen": -7770287.384615385, "logits/rejected": -30951985.454545453, "logps/chosen": -505.97220552884613, "logps/rejected": -604.0204634232955, "loss": 0.008, "rewards/chosen": 8.06890399639423, "rewards/margins": 22.32479250180971, "rewards/rejected": -14.255888505415482, "step": 3357 }, { "epoch": 0.9203782376319035, "grad_norm": 0.9140625, "kl": 5.944798946380615, "learning_rate": 5e-06, "logits/chosen": -15717223.384615384, "logits/rejected": -9284888.727272727, "logps/chosen": -461.3288010817308, "logps/rejected": -627.7811168323864, "loss": 0.0024, "rewards/chosen": 9.22305415226863, "rewards/margins": 20.63383393187623, "rewards/rejected": -11.4107797796076, "step": 3358 }, { "epoch": 0.9206523228724133, "grad_norm": 3.046875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -34517419.63636363, "logits/rejected": -13559933.538461538, "logps/chosen": -445.67569247159093, "logps/rejected": -477.15283203125, "loss": 0.0092, "rewards/chosen": 6.999617143110796, "rewards/margins": 17.121683934351783, "rewards/rejected": -10.122066791240986, "step": 3359 }, { "epoch": 0.9209264081129231, "grad_norm": 1.734375, "kl": 6.277764320373535, "learning_rate": 5e-06, "logits/chosen": -19074528.0, "logits/rejected": -34452902.4, "logps/chosen": -386.4130161830357, "logps/rejected": -540.15810546875, "loss": 0.0051, "rewards/chosen": 8.330276489257812, "rewards/margins": 21.721803283691408, "rewards/rejected": -13.391526794433593, "step": 3360 }, { "epoch": 0.9212004933534329, "grad_norm": 4.375, "kl": 1.5443992614746094, "learning_rate": 5e-06, "logits/chosen": -8690977.333333334, "logits/rejected": -20239154.666666668, "logps/chosen": -384.4140625, "logps/rejected": -559.2417805989584, "loss": 0.0111, "rewards/chosen": 6.130195617675781, "rewards/margins": 19.726567586263023, "rewards/rejected": -13.59637196858724, "step": 3361 }, { "epoch": 0.9214745785939427, "grad_norm": 8.0625, "kl": 10.062572479248047, "learning_rate": 5e-06, "logits/chosen": -14332957.866666667, "logits/rejected": -30902627.555555556, "logps/chosen": -515.60283203125, "logps/rejected": -833.7744140625, "loss": 0.029, "rewards/chosen": 8.325358072916666, "rewards/margins": 28.76229010687934, "rewards/rejected": -20.436932033962673, "step": 3362 }, { "epoch": 0.9217486638344525, "grad_norm": 6.84375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -20292208.0, "logits/rejected": -34671781.64705882, "logps/chosen": -444.50003487723217, "logps/rejected": -600.6672794117648, "loss": 0.0108, "rewards/chosen": 7.2511187962123325, "rewards/margins": 24.78714928506803, "rewards/rejected": -17.536030488855697, "step": 3363 }, { "epoch": 0.9220227490749623, "grad_norm": 4.53125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -7888766.545454546, "logits/rejected": -35803145.84615385, "logps/chosen": -450.3074396306818, "logps/rejected": -524.9089543269231, "loss": 0.0422, "rewards/chosen": 6.674138849431818, "rewards/margins": 18.65461464194985, "rewards/rejected": -11.980475792518028, "step": 3364 }, { "epoch": 0.9222968343154722, "grad_norm": 11.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -28384774.4, "logits/rejected": -15906018.285714285, "logps/chosen": -436.725, "logps/rejected": -567.8333565848214, "loss": 0.0229, "rewards/chosen": 6.30821533203125, "rewards/margins": 20.23296116420201, "rewards/rejected": -13.924745832170759, "step": 3365 }, { "epoch": 0.9225709195559819, "grad_norm": 1.890625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -11418992.8, "logits/rejected": -3850138.285714286, "logps/chosen": -411.0880859375, "logps/rejected": -402.52553013392856, "loss": 0.0079, "rewards/chosen": 8.303677368164063, "rewards/margins": 19.4287602015904, "rewards/rejected": -11.125082833426339, "step": 3366 }, { "epoch": 0.9228450047964917, "grad_norm": 9.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -5556144.615384615, "logits/rejected": 55080145.45454545, "logps/chosen": -354.8401442307692, "logps/rejected": -735.0070578835227, "loss": 0.0697, "rewards/chosen": 6.082790081317608, "rewards/margins": 38.23622968980483, "rewards/rejected": -32.15343960848722, "step": 3367 }, { "epoch": 0.9231190900370015, "grad_norm": 1.65625, "kl": 2.299424648284912, "learning_rate": 5e-06, "logits/chosen": -23961895.384615384, "logits/rejected": -24282909.09090909, "logps/chosen": -462.81092247596155, "logps/rejected": -499.3963068181818, "loss": 0.0031, "rewards/chosen": 8.403408930851864, "rewards/margins": 19.80479276430357, "rewards/rejected": -11.401383833451705, "step": 3368 }, { "epoch": 0.9233931752775113, "grad_norm": 0.275390625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -4879679.466666667, "logits/rejected": -39442965.333333336, "logps/chosen": -426.60436197916664, "logps/rejected": -661.9214409722222, "loss": 0.001, "rewards/chosen": 9.388763427734375, "rewards/margins": 25.323189629448784, "rewards/rejected": -15.934426201714409, "step": 3369 }, { "epoch": 0.9236672605180211, "grad_norm": 13.625, "kl": 12.514715194702148, "learning_rate": 5e-06, "logits/chosen": -46109138.28571428, "logits/rejected": -61838937.6, "logps/chosen": -354.2506626674107, "logps/rejected": -502.616552734375, "loss": 0.0551, "rewards/chosen": 9.021724700927734, "rewards/margins": 21.094026947021483, "rewards/rejected": -12.07230224609375, "step": 3370 }, { "epoch": 0.9239413457585309, "grad_norm": 9.5625, "kl": 9.11406135559082, "learning_rate": 5e-06, "logits/chosen": -14676361.142857144, "logits/rejected": -26907334.4, "logps/chosen": -315.12203543526783, "logps/rejected": -604.768359375, "loss": 0.0696, "rewards/chosen": 5.230806623186384, "rewards/margins": 20.080393110002788, "rewards/rejected": -14.849586486816406, "step": 3371 }, { "epoch": 0.9242154309990407, "grad_norm": 10.25, "kl": 12.492095947265625, "learning_rate": 5e-06, "logits/chosen": -19511568.0, "logits/rejected": -33651626.666666664, "logps/chosen": -405.1120198567708, "logps/rejected": -623.6302897135416, "loss": 0.0275, "rewards/chosen": 8.808015823364258, "rewards/margins": 23.734952926635742, "rewards/rejected": -14.926937103271484, "step": 3372 }, { "epoch": 0.9244895162395504, "grad_norm": 1.96875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -41232491.63636363, "logits/rejected": -32417590.153846152, "logps/chosen": -495.13876065340907, "logps/rejected": -570.6780724158654, "loss": 0.0062, "rewards/chosen": 8.317302357066762, "rewards/margins": 21.268419065675538, "rewards/rejected": -12.951116708608774, "step": 3373 }, { "epoch": 0.9247636014800603, "grad_norm": 1.921875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -22849996.0, "logits/rejected": -19651524.0, "logps/chosen": -388.0376281738281, "logps/rejected": -536.0582275390625, "loss": 0.0047, "rewards/chosen": 7.743842601776123, "rewards/margins": 21.751519680023193, "rewards/rejected": -14.00767707824707, "step": 3374 }, { "epoch": 0.9250376867205701, "grad_norm": 0.68359375, "kl": 4.536065101623535, "learning_rate": 5e-06, "logits/chosen": -41152828.8, "logits/rejected": -33572557.71428572, "logps/chosen": -566.35458984375, "logps/rejected": -652.0510602678571, "loss": 0.0013, "rewards/chosen": 10.64853744506836, "rewards/margins": 26.42458964756557, "rewards/rejected": -15.77605220249721, "step": 3375 }, { "epoch": 0.9253117719610799, "grad_norm": 4.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -39555157.333333336, "logits/rejected": -29198007.466666665, "logps/chosen": -592.5280490451389, "logps/rejected": -579.7295572916667, "loss": 0.0477, "rewards/chosen": 9.079884847005209, "rewards/margins": 22.83074951171875, "rewards/rejected": -13.750864664713541, "step": 3376 }, { "epoch": 0.9255858572015897, "grad_norm": 4.8125, "kl": 4.550167083740234, "learning_rate": 5e-06, "logits/chosen": -24229558.153846152, "logits/rejected": -39547191.27272727, "logps/chosen": -497.39644681490387, "logps/rejected": -561.3672318892045, "loss": 0.0094, "rewards/chosen": 8.360382080078125, "rewards/margins": 22.411386663263492, "rewards/rejected": -14.05100458318537, "step": 3377 }, { "epoch": 0.9258599424420995, "grad_norm": 7.4375, "kl": 3.998944044113159, "learning_rate": 5e-06, "logits/chosen": -17644409.333333332, "logits/rejected": -46704282.666666664, "logps/chosen": -374.7574462890625, "logps/rejected": -584.3626708984375, "loss": 0.0255, "rewards/chosen": 6.4333241780598955, "rewards/margins": 19.696797688802082, "rewards/rejected": -13.263473510742188, "step": 3378 }, { "epoch": 0.9261340276826093, "grad_norm": 4.78125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -55220078.222222224, "logits/rejected": -10087877.333333334, "logps/chosen": -409.44493272569446, "logps/rejected": -773.777734375, "loss": 0.0087, "rewards/chosen": 7.137084113226996, "rewards/margins": 27.018334113227, "rewards/rejected": -19.88125, "step": 3379 }, { "epoch": 0.9264081129231191, "grad_norm": 5.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -20480470.153846152, "logits/rejected": -12551063.272727273, "logps/chosen": -432.39652193509613, "logps/rejected": -517.19140625, "loss": 0.0246, "rewards/chosen": 7.009070763221154, "rewards/margins": 19.219357683942036, "rewards/rejected": -12.21028692072088, "step": 3380 }, { "epoch": 0.9266821981636288, "grad_norm": 22.25, "kl": 6.884671688079834, "learning_rate": 5e-06, "logits/chosen": -16373760.0, "logits/rejected": -27549061.333333332, "logps/chosen": -447.00927734375, "logps/rejected": -536.000732421875, "loss": 0.0693, "rewards/chosen": 7.90220324198405, "rewards/margins": 21.59245491027832, "rewards/rejected": -13.690251668294271, "step": 3381 }, { "epoch": 0.9269562834041387, "grad_norm": 6.9375, "kl": 1.2552413940429688, "learning_rate": 5e-06, "logits/chosen": -26291272.0, "logits/rejected": -24422390.85714286, "logps/chosen": -430.88505859375, "logps/rejected": -582.3169294084821, "loss": 0.0145, "rewards/chosen": 6.675546264648437, "rewards/margins": 21.33895045689174, "rewards/rejected": -14.663404192243304, "step": 3382 }, { "epoch": 0.9272303686446485, "grad_norm": 4.90625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -22154002.285714287, "logits/rejected": -14545045.647058824, "logps/chosen": -554.223388671875, "logps/rejected": -653.0343520220588, "loss": 0.0172, "rewards/chosen": 7.613816397530692, "rewards/margins": 21.106267784823892, "rewards/rejected": -13.4924513872932, "step": 3383 }, { "epoch": 0.9275044538851582, "grad_norm": 0.7734375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -12331727.2, "logits/rejected": -29784681.14285714, "logps/chosen": -535.16083984375, "logps/rejected": -534.1060267857143, "loss": 0.0016, "rewards/chosen": 9.143683624267577, "rewards/margins": 24.53513368879046, "rewards/rejected": -15.39145006452288, "step": 3384 }, { "epoch": 0.9277785391256681, "grad_norm": 3.140625, "kl": 4.454029083251953, "learning_rate": 5e-06, "logits/chosen": -44855992.0, "logits/rejected": -29226278.0, "logps/chosen": -375.2306213378906, "logps/rejected": -503.8863525390625, "loss": 0.0406, "rewards/chosen": 6.460363388061523, "rewards/margins": 19.38943862915039, "rewards/rejected": -12.929075241088867, "step": 3385 }, { "epoch": 0.9280526243661779, "grad_norm": 3.59375, "kl": 3.7664923667907715, "learning_rate": 5e-06, "logits/chosen": -33465832.727272727, "logits/rejected": -29878951.384615384, "logps/chosen": -327.8447265625, "logps/rejected": -589.3822866586538, "loss": 0.0277, "rewards/chosen": 6.072291981090199, "rewards/margins": 23.080738307712796, "rewards/rejected": -17.008446326622597, "step": 3386 }, { "epoch": 0.9283267096066877, "grad_norm": 0.84765625, "kl": 2.8233847618103027, "learning_rate": 5e-06, "logits/chosen": -19742025.333333332, "logits/rejected": -41035061.333333336, "logps/chosen": -426.9639892578125, "logps/rejected": -550.7426350911459, "loss": 0.0016, "rewards/chosen": 10.505862553914389, "rewards/margins": 25.228439966837566, "rewards/rejected": -14.722577412923178, "step": 3387 }, { "epoch": 0.9286007948471975, "grad_norm": 7.96875, "kl": 5.106563568115234, "learning_rate": 5e-06, "logits/chosen": -25551273.846153848, "logits/rejected": -29957536.0, "logps/chosen": -430.0541240985577, "logps/rejected": -568.3672762784091, "loss": 0.0347, "rewards/chosen": 8.806387094350962, "rewards/margins": 21.536214975210335, "rewards/rejected": -12.729827880859375, "step": 3388 }, { "epoch": 0.9288748800877072, "grad_norm": 3.359375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -25863656.727272727, "logits/rejected": -33376910.769230768, "logps/chosen": -382.80082563920456, "logps/rejected": -652.0518329326923, "loss": 0.013, "rewards/chosen": 6.578089627352628, "rewards/margins": 24.305753107671137, "rewards/rejected": -17.72766348031851, "step": 3389 }, { "epoch": 0.9291489653282171, "grad_norm": 8.6875, "kl": 12.896116256713867, "learning_rate": 5e-06, "logits/chosen": -33494820.0, "logits/rejected": -31246086.0, "logps/chosen": -382.0301513671875, "logps/rejected": -591.2089233398438, "loss": 0.0431, "rewards/chosen": 7.164003372192383, "rewards/margins": 19.848447799682617, "rewards/rejected": -12.684444427490234, "step": 3390 }, { "epoch": 0.9294230505687269, "grad_norm": 2.90625, "kl": 6.915323734283447, "learning_rate": 5e-06, "logits/chosen": -26546039.272727273, "logits/rejected": -21579542.153846152, "logps/chosen": -409.69442471590907, "logps/rejected": -616.9637545072115, "loss": 0.0467, "rewards/chosen": 7.125471635298296, "rewards/margins": 20.218381228146853, "rewards/rejected": -13.092909592848558, "step": 3391 }, { "epoch": 0.9296971358092366, "grad_norm": 8.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -23879357.53846154, "logits/rejected": -31503778.90909091, "logps/chosen": -338.31287560096155, "logps/rejected": -614.3560014204545, "loss": 0.0209, "rewards/chosen": 7.1596832275390625, "rewards/margins": 23.95425831187855, "rewards/rejected": -16.79457508433949, "step": 3392 }, { "epoch": 0.9299712210497465, "grad_norm": 12.9375, "kl": 6.368080139160156, "learning_rate": 5e-06, "logits/chosen": -27618414.769230768, "logits/rejected": -19056430.545454547, "logps/chosen": -351.9773137019231, "logps/rejected": -471.79434481534093, "loss": 0.1006, "rewards/chosen": 6.149282602163462, "rewards/margins": 19.14517937506829, "rewards/rejected": -12.99589677290483, "step": 3393 }, { "epoch": 0.9302453062902563, "grad_norm": 3.03125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -25137235.2, "logits/rejected": -58476178.28571428, "logps/chosen": -494.1876953125, "logps/rejected": -641.2269810267857, "loss": 0.0111, "rewards/chosen": 8.253870391845703, "rewards/margins": 23.80929685320173, "rewards/rejected": -15.555426461356026, "step": 3394 }, { "epoch": 0.930519391530766, "grad_norm": 4.34375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -32201332.0, "logits/rejected": -8029650.5, "logps/chosen": -445.8554992675781, "logps/rejected": -347.0872802734375, "loss": 0.013, "rewards/chosen": 7.633358955383301, "rewards/margins": 18.426295280456543, "rewards/rejected": -10.792936325073242, "step": 3395 }, { "epoch": 0.9307934767712759, "grad_norm": 6.9375, "kl": 7.748190879821777, "learning_rate": 5e-06, "logits/chosen": -30043665.454545453, "logits/rejected": -16778427.076923076, "logps/chosen": -396.1749378551136, "logps/rejected": -613.2966496394231, "loss": 0.0777, "rewards/chosen": 6.388296647505327, "rewards/margins": 17.970391280167586, "rewards/rejected": -11.58209463266226, "step": 3396 }, { "epoch": 0.9310675620117856, "grad_norm": 7.0, "kl": 0.7843354940414429, "learning_rate": 5e-06, "logits/chosen": -28296568.615384616, "logits/rejected": -11164696.727272727, "logps/chosen": -382.6115910456731, "logps/rejected": -642.9716352982955, "loss": 0.0865, "rewards/chosen": 7.251071636493389, "rewards/margins": 19.118281331095663, "rewards/rejected": -11.867209694602273, "step": 3397 }, { "epoch": 0.9313416472522955, "grad_norm": 0.8359375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -22038394.181818184, "logits/rejected": -6843496.615384615, "logps/chosen": -401.4529474431818, "logps/rejected": -555.4577448918269, "loss": 0.0074, "rewards/chosen": 8.32950800115412, "rewards/margins": 21.07126121254234, "rewards/rejected": -12.741753211388222, "step": 3398 }, { "epoch": 0.9316157324928053, "grad_norm": 10.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -35207674.666666664, "logits/rejected": -17608216.0, "logps/chosen": -306.26991780598956, "logps/rejected": -626.5785725911459, "loss": 0.0436, "rewards/chosen": 5.894677480061849, "rewards/margins": 19.916205088297527, "rewards/rejected": -14.021527608235678, "step": 3399 }, { "epoch": 0.931889817733315, "grad_norm": 4.15625, "kl": 1.9527359008789062, "learning_rate": 5e-06, "logits/chosen": -51903360.0, "logits/rejected": -29211242.666666668, "logps/chosen": -376.51953125, "logps/rejected": -580.7463785807291, "loss": 0.0105, "rewards/chosen": 7.436511357625325, "rewards/margins": 17.612452189127605, "rewards/rejected": -10.17594083150228, "step": 3400 }, { "epoch": 0.9321639029738249, "grad_norm": 7.8125, "kl": 1.7211673259735107, "learning_rate": 5e-06, "logits/chosen": -9733131.076923076, "logits/rejected": -27034466.90909091, "logps/chosen": -421.95849609375, "logps/rejected": -441.73237748579544, "loss": 0.0206, "rewards/chosen": 7.815953181340144, "rewards/margins": 19.368303312288297, "rewards/rejected": -11.552350130948154, "step": 3401 }, { "epoch": 0.9324379882143347, "grad_norm": 5.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -12447077.333333334, "logits/rejected": -26344445.866666667, "logps/chosen": -404.4858127170139, "logps/rejected": -663.2051432291667, "loss": 0.0101, "rewards/chosen": 7.2527211507161455, "rewards/margins": 22.2493896484375, "rewards/rejected": -14.996668497721354, "step": 3402 }, { "epoch": 0.9327120734548444, "grad_norm": 12.9375, "kl": 2.0380046367645264, "learning_rate": 5e-06, "logits/chosen": -6556893.333333333, "logits/rejected": -24484264.0, "logps/chosen": -335.3662923177083, "logps/rejected": -496.7454427083333, "loss": 0.0302, "rewards/chosen": 6.803058624267578, "rewards/margins": 18.088197072347008, "rewards/rejected": -11.285138448079428, "step": 3403 }, { "epoch": 0.9329861586953543, "grad_norm": 3.953125, "kl": 0.6109498739242554, "learning_rate": 5e-06, "logits/chosen": -30195917.333333332, "logits/rejected": -16371033.333333334, "logps/chosen": -462.5885823567708, "logps/rejected": -618.2353515625, "loss": 0.0184, "rewards/chosen": 6.177724202473958, "rewards/margins": 20.800453186035156, "rewards/rejected": -14.622728983561197, "step": 3404 }, { "epoch": 0.933260243935864, "grad_norm": 7.21875, "kl": 5.7952880859375, "learning_rate": 5e-06, "logits/chosen": -26290666.666666668, "logits/rejected": 83041226.66666667, "logps/chosen": -448.1223551432292, "logps/rejected": -513.0697021484375, "loss": 0.025, "rewards/chosen": 7.332375844319661, "rewards/margins": 16.97630246480306, "rewards/rejected": -9.643926620483398, "step": 3405 }, { "epoch": 0.9335343291763738, "grad_norm": 1.5703125, "kl": 6.417951583862305, "learning_rate": 5e-06, "logits/chosen": -23586386.285714287, "logits/rejected": -32818598.4, "logps/chosen": -414.41524832589283, "logps/rejected": -432.6564453125, "loss": 0.0043, "rewards/chosen": 8.478536878313337, "rewards/margins": 18.392880140032087, "rewards/rejected": -9.91434326171875, "step": 3406 }, { "epoch": 0.9338084144168837, "grad_norm": 0.828125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -25187817.846153848, "logits/rejected": -10567297.454545455, "logps/chosen": -388.67074819711536, "logps/rejected": -446.62801846590907, "loss": 0.003, "rewards/chosen": 8.978814932016226, "rewards/margins": 19.28485214126694, "rewards/rejected": -10.30603720925071, "step": 3407 }, { "epoch": 0.9340824996573934, "grad_norm": 3.859375, "kl": 2.0786032676696777, "learning_rate": 5e-06, "logits/chosen": -30406893.714285713, "logits/rejected": -26210744.0, "logps/chosen": -355.72701590401783, "logps/rejected": -390.92255859375, "loss": 0.0496, "rewards/chosen": 7.181155613490513, "rewards/margins": 15.699711826869418, "rewards/rejected": -8.518556213378906, "step": 3408 }, { "epoch": 0.9343565848979033, "grad_norm": 31.75, "kl": 0.7564811706542969, "learning_rate": 5e-06, "logits/chosen": -21325510.85714286, "logits/rejected": -14687382.4, "logps/chosen": -452.6568080357143, "logps/rejected": -566.33095703125, "loss": 0.0477, "rewards/chosen": 7.537989480154855, "rewards/margins": 18.167980630057198, "rewards/rejected": -10.629991149902343, "step": 3409 }, { "epoch": 0.9346306701384131, "grad_norm": 4.78125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -11131103.111111112, "logits/rejected": -28209642.666666668, "logps/chosen": -352.3041178385417, "logps/rejected": -427.713671875, "loss": 0.0409, "rewards/chosen": 7.162240770128038, "rewards/margins": 17.411534796820746, "rewards/rejected": -10.249294026692708, "step": 3410 }, { "epoch": 0.9349047553789228, "grad_norm": 2.96875, "kl": 10.877851486206055, "learning_rate": 5e-06, "logits/chosen": -21545878.666666668, "logits/rejected": -50456058.666666664, "logps/chosen": -393.5066731770833, "logps/rejected": -522.91845703125, "loss": 0.0159, "rewards/chosen": 8.00325075785319, "rewards/margins": 19.656574885050453, "rewards/rejected": -11.653324127197266, "step": 3411 }, { "epoch": 0.9351788406194327, "grad_norm": 15.375, "kl": 9.074888229370117, "learning_rate": 5e-06, "logits/chosen": -41998678.85714286, "logits/rejected": -19706931.2, "logps/chosen": -388.68593052455356, "logps/rejected": -549.3001953125, "loss": 0.0724, "rewards/chosen": 6.77184077671596, "rewards/margins": 22.166285051618303, "rewards/rejected": -15.394444274902344, "step": 3412 }, { "epoch": 0.9354529258599424, "grad_norm": 1.734375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -38255788.0, "logits/rejected": -34627769.6, "logps/chosen": -391.75726318359375, "logps/rejected": -602.18896484375, "loss": 0.0038, "rewards/chosen": 10.006354331970215, "rewards/margins": 22.58885250091553, "rewards/rejected": -12.582498168945312, "step": 3413 }, { "epoch": 0.9357270111004522, "grad_norm": 0.283203125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -37992323.55555555, "logits/rejected": -35442545.06666667, "logps/chosen": -547.7611219618055, "logps/rejected": -673.9326822916667, "loss": 0.0007, "rewards/chosen": 8.502001444498697, "rewards/margins": 25.411095682779944, "rewards/rejected": -16.90909423828125, "step": 3414 }, { "epoch": 0.9360010963409621, "grad_norm": 3.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -24727570.0, "logits/rejected": -22491330.0, "logps/chosen": -332.92791748046875, "logps/rejected": -741.558349609375, "loss": 0.0083, "rewards/chosen": 6.6514105796813965, "rewards/margins": 21.515642642974854, "rewards/rejected": -14.864232063293457, "step": 3415 }, { "epoch": 0.9362751815814718, "grad_norm": 3.15625, "kl": 1.2437922954559326, "learning_rate": 5e-06, "logits/chosen": -33059601.454545453, "logits/rejected": 11000503.384615384, "logps/chosen": -508.92489346590907, "logps/rejected": -694.7184495192307, "loss": 0.0092, "rewards/chosen": 7.779735218394887, "rewards/margins": 24.919624275260873, "rewards/rejected": -17.139889056865986, "step": 3416 }, { "epoch": 0.9365492668219816, "grad_norm": 4.1875, "kl": 6.605035305023193, "learning_rate": 5e-06, "logits/chosen": -22875991.466666665, "logits/rejected": -39466112.0, "logps/chosen": -389.67962239583335, "logps/rejected": -1010.5775824652778, "loss": 0.0123, "rewards/chosen": 8.483728535970052, "rewards/margins": 32.29070315890842, "rewards/rejected": -23.80697462293837, "step": 3417 }, { "epoch": 0.9368233520624915, "grad_norm": 10.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -16403845.333333334, "logits/rejected": -45628970.666666664, "logps/chosen": -404.5725911458333, "logps/rejected": -511.01458333333335, "loss": 0.047, "rewards/chosen": 5.974240620930989, "rewards/margins": 17.78717803955078, "rewards/rejected": -11.812937418619791, "step": 3418 }, { "epoch": 0.9370974373030012, "grad_norm": 5.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -39007543.27272727, "logits/rejected": -45141612.307692304, "logps/chosen": -420.3955078125, "logps/rejected": -714.2254356971154, "loss": 0.0158, "rewards/chosen": 7.382972717285156, "rewards/margins": 25.94177070030799, "rewards/rejected": -18.558797983022835, "step": 3419 }, { "epoch": 0.9373715225435111, "grad_norm": 4.40625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 8382080.0, "logits/rejected": -34803259.428571425, "logps/chosen": -449.36396484375, "logps/rejected": -758.9401506696429, "loss": 0.039, "rewards/chosen": 7.739715576171875, "rewards/margins": 27.233880179268972, "rewards/rejected": -19.494164603097097, "step": 3420 }, { "epoch": 0.9376456077840208, "grad_norm": 5.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -20670208.0, "logits/rejected": -14764347.076923076, "logps/chosen": -420.86692116477275, "logps/rejected": -782.1479867788462, "loss": 0.0157, "rewards/chosen": 6.49432373046875, "rewards/margins": 23.457321166992188, "rewards/rejected": -16.962997436523438, "step": 3421 }, { "epoch": 0.9379196930245306, "grad_norm": 1.046875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -25063323.076923076, "logits/rejected": 4549752.363636363, "logps/chosen": -372.09314903846155, "logps/rejected": -677.8528497869319, "loss": 0.0031, "rewards/chosen": 8.50589810884916, "rewards/margins": 24.313085275930128, "rewards/rejected": -15.807187167080967, "step": 3422 }, { "epoch": 0.9381937782650405, "grad_norm": 1.109375, "kl": 0.92041015625, "learning_rate": 5e-06, "logits/chosen": -25153361.777777776, "logits/rejected": -16838790.4, "logps/chosen": -383.43096245659723, "logps/rejected": -488.1752604166667, "loss": 0.003, "rewards/chosen": 7.861071268717448, "rewards/margins": 20.252455139160155, "rewards/rejected": -12.391383870442708, "step": 3423 }, { "epoch": 0.9384678635055502, "grad_norm": 8.375, "kl": 0.36981043219566345, "learning_rate": 5e-06, "logits/chosen": -9536131.42857143, "logits/rejected": -37938771.2, "logps/chosen": -367.900390625, "logps/rejected": -734.348291015625, "loss": 0.0272, "rewards/chosen": 6.119538443429129, "rewards/margins": 18.83657488141741, "rewards/rejected": -12.71703643798828, "step": 3424 }, { "epoch": 0.93874194874606, "grad_norm": 3.015625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -40636213.333333336, "logits/rejected": -34734498.13333333, "logps/chosen": -382.9345703125, "logps/rejected": -471.59583333333336, "loss": 0.0082, "rewards/chosen": 8.556411743164062, "rewards/margins": 21.766018676757813, "rewards/rejected": -13.20960693359375, "step": 3425 }, { "epoch": 0.9390160339865699, "grad_norm": 2.96875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -21199052.8, "logits/rejected": -28963734.85714286, "logps/chosen": -426.973681640625, "logps/rejected": -532.5344587053571, "loss": 0.0107, "rewards/chosen": 6.214456176757812, "rewards/margins": 18.56299525669643, "rewards/rejected": -12.348539079938616, "step": 3426 }, { "epoch": 0.9392901192270796, "grad_norm": 2.40625, "kl": 4.914234161376953, "learning_rate": 5e-06, "logits/chosen": -43976196.266666666, "logits/rejected": -27545690.666666668, "logps/chosen": -506.83483072916664, "logps/rejected": -574.876953125, "loss": 0.0072, "rewards/chosen": 8.91383768717448, "rewards/margins": 23.247340562608507, "rewards/rejected": -14.333502875434029, "step": 3427 }, { "epoch": 0.9395642044675894, "grad_norm": 0.98828125, "kl": 1.8802541494369507, "learning_rate": 5e-06, "logits/chosen": -16024062.666666666, "logits/rejected": -25475082.666666668, "logps/chosen": -448.5851236979167, "logps/rejected": -599.4901529947916, "loss": 0.0034, "rewards/chosen": 8.490619659423828, "rewards/margins": 22.654665629069008, "rewards/rejected": -14.164045969645182, "step": 3428 }, { "epoch": 0.9398382897080992, "grad_norm": 7.6875, "kl": 14.381593704223633, "learning_rate": 5e-06, "logits/chosen": -18672338.666666668, "logits/rejected": -26306680.0, "logps/chosen": -519.50341796875, "logps/rejected": -627.8221028645834, "loss": 0.0238, "rewards/chosen": 8.85653305053711, "rewards/margins": 26.795093536376953, "rewards/rejected": -17.938560485839844, "step": 3429 }, { "epoch": 0.940112374948609, "grad_norm": 11.1875, "kl": 8.933425903320312, "learning_rate": 5e-06, "logits/chosen": -17360958.85714286, "logits/rejected": -16556995.2, "logps/chosen": -401.58206612723217, "logps/rejected": -558.95693359375, "loss": 0.0251, "rewards/chosen": 7.475626264299665, "rewards/margins": 20.959378705705916, "rewards/rejected": -13.48375244140625, "step": 3430 }, { "epoch": 0.9403864601891189, "grad_norm": 7.34375, "kl": 4.501676082611084, "learning_rate": 5e-06, "logits/chosen": -8432894.76923077, "logits/rejected": -457272.7272727273, "logps/chosen": -414.9242412860577, "logps/rejected": -405.3203125, "loss": 0.0366, "rewards/chosen": 7.280244680551382, "rewards/margins": 17.40515590214229, "rewards/rejected": -10.124911221590908, "step": 3431 }, { "epoch": 0.9406605454296286, "grad_norm": 18.125, "kl": 2.327228307723999, "learning_rate": 5e-06, "logits/chosen": -3380752.5714285714, "logits/rejected": -36437504.0, "logps/chosen": -357.13706752232144, "logps/rejected": -774.196142578125, "loss": 0.0571, "rewards/chosen": 6.471590314592634, "rewards/margins": 23.51155003138951, "rewards/rejected": -17.039959716796876, "step": 3432 }, { "epoch": 0.9409346306701384, "grad_norm": 5.34375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -29438094.222222224, "logits/rejected": -18993152.0, "logps/chosen": -373.2575954861111, "logps/rejected": -490.10559895833336, "loss": 0.0209, "rewards/chosen": 6.2977417839898004, "rewards/margins": 18.199458906385633, "rewards/rejected": -11.901717122395834, "step": 3433 }, { "epoch": 0.9412087159106483, "grad_norm": 10.4375, "kl": 11.453731536865234, "learning_rate": 5e-06, "logits/chosen": -6946972.266666667, "logits/rejected": -30796199.111111112, "logps/chosen": -397.1162109375, "logps/rejected": -634.8573133680555, "loss": 0.0413, "rewards/chosen": 8.064152018229167, "rewards/margins": 21.160228135850694, "rewards/rejected": -13.096076117621529, "step": 3434 }, { "epoch": 0.941482801151158, "grad_norm": 9.3125, "kl": 8.138809204101562, "learning_rate": 5e-06, "logits/chosen": -15229150.76923077, "logits/rejected": -31982362.181818184, "logps/chosen": -356.16466346153845, "logps/rejected": -415.64901455965907, "loss": 0.0178, "rewards/chosen": 8.952622633713942, "rewards/margins": 22.914067835240928, "rewards/rejected": -13.961445201526988, "step": 3435 }, { "epoch": 0.9417568863916678, "grad_norm": 1.046875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -24617596.8, "logits/rejected": -63909577.14285714, "logps/chosen": -381.643359375, "logps/rejected": -424.15011160714283, "loss": 0.0045, "rewards/chosen": 7.022040557861328, "rewards/margins": 17.698555319649834, "rewards/rejected": -10.676514761788505, "step": 3436 }, { "epoch": 0.9420309716321776, "grad_norm": 7.78125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -46875699.2, "logits/rejected": -39640444.631578945, "logps/chosen": -547.476171875, "logps/rejected": -564.4800061677631, "loss": 0.0084, "rewards/chosen": 9.35692901611328, "rewards/margins": 22.90625827186986, "rewards/rejected": -13.549329255756579, "step": 3437 }, { "epoch": 0.9423050568726874, "grad_norm": 11.0625, "kl": 18.91948699951172, "learning_rate": 5e-06, "logits/chosen": -38662212.0, "logits/rejected": -25670372.0, "logps/chosen": -375.06134033203125, "logps/rejected": -448.0779724121094, "loss": 0.1268, "rewards/chosen": 7.563299179077148, "rewards/margins": 16.58810043334961, "rewards/rejected": -9.024801254272461, "step": 3438 }, { "epoch": 0.9425791421131972, "grad_norm": 5.59375, "kl": 0.6133651733398438, "learning_rate": 5e-06, "logits/chosen": -46449157.333333336, "logits/rejected": -15783918.666666666, "logps/chosen": -388.2109375, "logps/rejected": -491.2742513020833, "loss": 0.0364, "rewards/chosen": 6.878045399983724, "rewards/margins": 19.372852325439453, "rewards/rejected": -12.494806925455729, "step": 3439 }, { "epoch": 0.942853227353707, "grad_norm": 4.15625, "kl": 7.840646266937256, "learning_rate": 5e-06, "logits/chosen": -27514848.0, "logits/rejected": -23225673.6, "logps/chosen": -439.37935965401783, "logps/rejected": -538.62744140625, "loss": 0.0077, "rewards/chosen": 8.824000222342354, "rewards/margins": 22.314059121268137, "rewards/rejected": -13.490058898925781, "step": 3440 }, { "epoch": 0.9431273125942168, "grad_norm": 3.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -33358611.692307692, "logits/rejected": 7304728.7272727275, "logps/chosen": -431.8722956730769, "logps/rejected": -626.3705166903409, "loss": 0.0074, "rewards/chosen": 7.430299025315505, "rewards/margins": 21.55717446920755, "rewards/rejected": -14.126875443892045, "step": 3441 }, { "epoch": 0.9434013978347267, "grad_norm": 0.38671875, "kl": 1.4422569274902344, "learning_rate": 5e-06, "logits/chosen": 38597752.0, "logits/rejected": -10437261.0, "logps/chosen": -527.1065673828125, "logps/rejected": -636.275146484375, "loss": 0.0015, "rewards/chosen": 9.009142875671387, "rewards/margins": 25.588765144348145, "rewards/rejected": -16.579622268676758, "step": 3442 }, { "epoch": 0.9436754830752364, "grad_norm": 3.421875, "kl": 0.09761810302734375, "learning_rate": 5e-06, "logits/chosen": -18834360.0, "logits/rejected": -40831228.0, "logps/chosen": -429.3080749511719, "logps/rejected": -527.181640625, "loss": 0.0518, "rewards/chosen": 6.292591094970703, "rewards/margins": 19.132850646972656, "rewards/rejected": -12.840259552001953, "step": 3443 }, { "epoch": 0.9439495683157462, "grad_norm": 1.4921875, "kl": 3.4750709533691406, "learning_rate": 5e-06, "logits/chosen": -43972676.92307692, "logits/rejected": -39257018.18181818, "logps/chosen": -537.5822190504807, "logps/rejected": -615.7786310369319, "loss": 0.0043, "rewards/chosen": 8.606405404897837, "rewards/margins": 25.1239460765065, "rewards/rejected": -16.517540671608664, "step": 3444 }, { "epoch": 0.944223653556256, "grad_norm": 6.0, "kl": 0.8819955587387085, "learning_rate": 5e-06, "logits/chosen": -45304960.0, "logits/rejected": -19918068.57142857, "logps/chosen": -458.906982421875, "logps/rejected": -393.17208426339283, "loss": 0.0307, "rewards/chosen": 8.231139373779296, "rewards/margins": 19.788348933628626, "rewards/rejected": -11.55720955984933, "step": 3445 }, { "epoch": 0.9444977387967658, "grad_norm": 1.1796875, "kl": 2.5557315349578857, "learning_rate": 5e-06, "logits/chosen": -27064951.466666665, "logits/rejected": -22395555.555555556, "logps/chosen": -468.63798828125, "logps/rejected": -459.98621961805554, "loss": 0.0032, "rewards/chosen": 8.345637003580729, "rewards/margins": 20.32199944390191, "rewards/rejected": -11.97636244032118, "step": 3446 }, { "epoch": 0.9447718240372756, "grad_norm": 5.53125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -23301696.0, "logits/rejected": -28729000.0, "logps/chosen": -386.01806640625, "logps/rejected": -610.7843017578125, "loss": 0.0492, "rewards/chosen": 6.003445625305176, "rewards/margins": 23.86717128753662, "rewards/rejected": -17.863725662231445, "step": 3447 }, { "epoch": 0.9450459092777854, "grad_norm": 9.1875, "kl": 9.868945121765137, "learning_rate": 5e-06, "logits/chosen": -27462400.0, "logits/rejected": -39071845.81818182, "logps/chosen": -415.8051006610577, "logps/rejected": -596.3027787642045, "loss": 0.0262, "rewards/chosen": 7.323006850022536, "rewards/margins": 28.441541284947963, "rewards/rejected": -21.118534434925426, "step": 3448 }, { "epoch": 0.9453199945182952, "grad_norm": 4.9375, "kl": 2.3947906494140625, "learning_rate": 5e-06, "logits/chosen": -31458594.90909091, "logits/rejected": -30983891.692307692, "logps/chosen": -367.6719415838068, "logps/rejected": -471.71078725961536, "loss": 0.0231, "rewards/chosen": 8.104101701216264, "rewards/margins": 21.184678671243304, "rewards/rejected": -13.080576970027042, "step": 3449 }, { "epoch": 0.9455940797588049, "grad_norm": 1.90625, "kl": 0.28998440504074097, "learning_rate": 5e-06, "logits/chosen": -32756302.769230768, "logits/rejected": -39716887.27272727, "logps/chosen": -388.06002103365387, "logps/rejected": -586.2642933238636, "loss": 0.0053, "rewards/chosen": 6.956165020282452, "rewards/margins": 22.65045656857791, "rewards/rejected": -15.694291548295455, "step": 3450 }, { "epoch": 0.9458681649993148, "grad_norm": 3.515625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -10312564.0, "logits/rejected": -29113778.285714287, "logps/chosen": -462.761328125, "logps/rejected": -502.25589425223217, "loss": 0.0117, "rewards/chosen": 7.119194030761719, "rewards/margins": 20.794194902692524, "rewards/rejected": -13.675000871930804, "step": 3451 }, { "epoch": 0.9461422502398246, "grad_norm": 7.96875, "kl": 5.165338039398193, "learning_rate": 5e-06, "logits/chosen": -36090993.23076923, "logits/rejected": -31595371.636363637, "logps/chosen": -324.7233323317308, "logps/rejected": -752.6908735795455, "loss": 0.099, "rewards/chosen": 5.56381342961238, "rewards/margins": 21.254152951540647, "rewards/rejected": -15.690339521928268, "step": 3452 }, { "epoch": 0.9464163354803344, "grad_norm": 3.734375, "kl": 4.704298973083496, "learning_rate": 5e-06, "logits/chosen": -5243384.0, "logits/rejected": -56727074.90909091, "logps/chosen": -416.66687950721155, "logps/rejected": -530.6193625710227, "loss": 0.0135, "rewards/chosen": 7.407478919396033, "rewards/margins": 21.987304580795183, "rewards/rejected": -14.579825661399148, "step": 3453 }, { "epoch": 0.9466904207208442, "grad_norm": 4.78125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -24758129.230769232, "logits/rejected": -37629483.63636363, "logps/chosen": -480.9168043870192, "logps/rejected": -502.32839133522725, "loss": 0.033, "rewards/chosen": 8.013508723332333, "rewards/margins": 19.69030537638631, "rewards/rejected": -11.676796653053977, "step": 3454 }, { "epoch": 0.946964505961354, "grad_norm": 9.5625, "kl": 12.257509231567383, "learning_rate": 5e-06, "logits/chosen": -34427720.0, "logits/rejected": -45474424.0, "logps/chosen": -367.7168273925781, "logps/rejected": -626.6493530273438, "loss": 0.0418, "rewards/chosen": 7.897889137268066, "rewards/margins": 25.41500186920166, "rewards/rejected": -17.517112731933594, "step": 3455 }, { "epoch": 0.9472385912018638, "grad_norm": 6.65625, "kl": 11.490344047546387, "learning_rate": 5e-06, "logits/chosen": -22234021.647058822, "logits/rejected": -191558.85714285713, "logps/chosen": -477.4399988511029, "logps/rejected": -546.8710239955357, "loss": 0.0466, "rewards/chosen": 8.673949297736673, "rewards/margins": 23.761183794806986, "rewards/rejected": -15.087234497070312, "step": 3456 }, { "epoch": 0.9475126764423736, "grad_norm": 3.078125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -54478169.6, "logits/rejected": -20921600.0, "logps/chosen": -336.578857421875, "logps/rejected": -466.0127650669643, "loss": 0.032, "rewards/chosen": 5.660713195800781, "rewards/margins": 18.49222128731864, "rewards/rejected": -12.831508091517858, "step": 3457 }, { "epoch": 0.9477867616828833, "grad_norm": 8.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 3835091.6, "logits/rejected": -44102870.85714286, "logps/chosen": -396.1656494140625, "logps/rejected": -480.27633231026783, "loss": 0.0274, "rewards/chosen": 4.689364242553711, "rewards/margins": 17.612799998692104, "rewards/rejected": -12.923435756138392, "step": 3458 }, { "epoch": 0.9480608469233932, "grad_norm": 5.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -29664697.6, "logits/rejected": -23642080.0, "logps/chosen": -482.930419921875, "logps/rejected": -661.1321847098214, "loss": 0.0079, "rewards/chosen": 9.448400115966797, "rewards/margins": 24.664682551792687, "rewards/rejected": -15.216282435825892, "step": 3459 }, { "epoch": 0.948334932163903, "grad_norm": 4.21875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -17854686.666666668, "logits/rejected": -22332408.0, "logps/chosen": -345.2437337239583, "logps/rejected": -468.1377766927083, "loss": 0.0119, "rewards/chosen": 6.772082010904948, "rewards/margins": 22.06480662027995, "rewards/rejected": -15.292724609375, "step": 3460 }, { "epoch": 0.9486090174044127, "grad_norm": 1.28125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -16682672.0, "logits/rejected": -34806646.15384615, "logps/chosen": -346.61936257102275, "logps/rejected": -476.06685697115387, "loss": 0.0044, "rewards/chosen": 6.979795976118608, "rewards/margins": 20.42682215550563, "rewards/rejected": -13.44702617938702, "step": 3461 }, { "epoch": 0.9488831026449226, "grad_norm": 5.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -18801760.0, "logits/rejected": -39741115.428571425, "logps/chosen": -296.828857421875, "logps/rejected": -497.7069614955357, "loss": 0.0306, "rewards/chosen": 4.927056884765625, "rewards/margins": 15.827473667689732, "rewards/rejected": -10.900416782924108, "step": 3462 }, { "epoch": 0.9491571878854324, "grad_norm": 2.8125, "kl": 12.43424129486084, "learning_rate": 5e-06, "logits/chosen": -37935025.45454545, "logits/rejected": -17652823.384615384, "logps/chosen": -415.9469549005682, "logps/rejected": -371.5778996394231, "loss": 0.0481, "rewards/chosen": 7.984311884099787, "rewards/margins": 17.339686733859402, "rewards/rejected": -9.355374849759615, "step": 3463 }, { "epoch": 0.9494312731259422, "grad_norm": 7.21875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -24169610.666666668, "logits/rejected": -12840993.333333334, "logps/chosen": -366.4215494791667, "logps/rejected": -571.8579508463541, "loss": 0.0602, "rewards/chosen": 4.894045511881511, "rewards/margins": 20.10240427652995, "rewards/rejected": -15.208358764648438, "step": 3464 }, { "epoch": 0.949705358366452, "grad_norm": 1.734375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -35145669.333333336, "logits/rejected": -53219786.666666664, "logps/chosen": -363.1541748046875, "logps/rejected": -636.3192545572916, "loss": 0.0064, "rewards/chosen": 7.2267195383707685, "rewards/margins": 22.098026911417644, "rewards/rejected": -14.871307373046875, "step": 3465 }, { "epoch": 0.9499794436069617, "grad_norm": 6.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -29548634.0, "logits/rejected": -8778664.0, "logps/chosen": -426.51165771484375, "logps/rejected": -460.453125, "loss": 0.0244, "rewards/chosen": 6.680514335632324, "rewards/margins": 18.326653480529785, "rewards/rejected": -11.646139144897461, "step": 3466 }, { "epoch": 0.9502535288474716, "grad_norm": 11.5625, "kl": 3.5223708152770996, "learning_rate": 5e-06, "logits/chosen": -33779168.0, "logits/rejected": -36726020.571428575, "logps/chosen": -509.672509765625, "logps/rejected": -600.9432198660714, "loss": 0.0213, "rewards/chosen": 8.48681182861328, "rewards/margins": 25.062510681152343, "rewards/rejected": -16.575698852539062, "step": 3467 }, { "epoch": 0.9505276140879814, "grad_norm": 9.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -41725814.15384615, "logits/rejected": -26794309.818181816, "logps/chosen": -404.2289287860577, "logps/rejected": -468.01455965909093, "loss": 0.0466, "rewards/chosen": 5.320456871619592, "rewards/margins": 17.8433173119605, "rewards/rejected": -12.522860440340908, "step": 3468 }, { "epoch": 0.9508016993284911, "grad_norm": 10.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -13212747.42857143, "logits/rejected": -28581246.11764706, "logps/chosen": -463.0198451450893, "logps/rejected": -553.1954848345588, "loss": 0.0267, "rewards/chosen": 8.269490923200335, "rewards/margins": 21.31960258163324, "rewards/rejected": -13.050111658432904, "step": 3469 }, { "epoch": 0.951075784569001, "grad_norm": 18.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -18283498.666666668, "logits/rejected": -24052981.333333332, "logps/chosen": -407.9171142578125, "logps/rejected": -629.7786458333334, "loss": 0.0177, "rewards/chosen": 8.657002766927084, "rewards/margins": 25.186361948649086, "rewards/rejected": -16.529359181722004, "step": 3470 }, { "epoch": 0.9513498698095108, "grad_norm": 2.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -46758109.538461536, "logits/rejected": -17778350.545454547, "logps/chosen": -518.5924353966346, "logps/rejected": -436.8753551136364, "loss": 0.0055, "rewards/chosen": 8.809502234825722, "rewards/margins": 22.964986521047315, "rewards/rejected": -14.155484286221592, "step": 3471 }, { "epoch": 0.9516239550500205, "grad_norm": 8.5625, "kl": 0.8235740661621094, "learning_rate": 5e-06, "logits/chosen": -43716942.76923077, "logits/rejected": -23894901.818181816, "logps/chosen": -427.9021559495192, "logps/rejected": -622.1334339488636, "loss": 0.0393, "rewards/chosen": 7.32743424635667, "rewards/margins": 22.575558802464627, "rewards/rejected": -15.248124556107955, "step": 3472 }, { "epoch": 0.9518980402905304, "grad_norm": 8.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -21954925.09090909, "logits/rejected": -38413430.15384615, "logps/chosen": -398.8992365056818, "logps/rejected": -569.9347205528846, "loss": 0.012, "rewards/chosen": 8.761471835049717, "rewards/margins": 23.22026958332195, "rewards/rejected": -14.458797748272236, "step": 3473 }, { "epoch": 0.9521721255310401, "grad_norm": 14.75, "kl": 8.449524879455566, "learning_rate": 5e-06, "logits/chosen": 6997565.6, "logits/rejected": -25004521.14285714, "logps/chosen": -518.8626953125, "logps/rejected": -346.46083286830356, "loss": 0.0669, "rewards/chosen": 7.573148345947265, "rewards/margins": 17.636869049072267, "rewards/rejected": -10.063720703125, "step": 3474 }, { "epoch": 0.95244621077155, "grad_norm": 13.5625, "kl": 14.540665626525879, "learning_rate": 5e-06, "logits/chosen": -17204813.714285713, "logits/rejected": -32813264.0, "logps/chosen": -410.22830636160717, "logps/rejected": -556.719873046875, "loss": 0.0801, "rewards/chosen": 6.325401306152344, "rewards/margins": 18.923741149902344, "rewards/rejected": -12.59833984375, "step": 3475 }, { "epoch": 0.9527202960120598, "grad_norm": 21.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -26570277.818181816, "logits/rejected": -33866382.76923077, "logps/chosen": -500.62508877840907, "logps/rejected": -559.7603290264423, "loss": 0.0347, "rewards/chosen": 8.910521073774857, "rewards/margins": 20.948008397242404, "rewards/rejected": -12.037487323467548, "step": 3476 }, { "epoch": 0.9529943812525695, "grad_norm": 15.5, "kl": 18.598604202270508, "learning_rate": 5e-06, "logits/chosen": -22894240.0, "logits/rejected": -28737993.6, "logps/chosen": -426.96641138980266, "logps/rejected": -700.35341796875, "loss": 0.1537, "rewards/chosen": 6.623533951608758, "rewards/margins": 21.2856295936986, "rewards/rejected": -14.662095642089843, "step": 3477 }, { "epoch": 0.9532684664930794, "grad_norm": 10.625, "kl": 0.2079060971736908, "learning_rate": 5e-06, "logits/chosen": -12316218.181818182, "logits/rejected": -35803438.76923077, "logps/chosen": -394.8917347301136, "logps/rejected": -563.8008939302885, "loss": 0.0425, "rewards/chosen": 6.9788596413352275, "rewards/margins": 20.867592551491477, "rewards/rejected": -13.88873291015625, "step": 3478 }, { "epoch": 0.9535425517335891, "grad_norm": 2.96875, "kl": 1.6159350872039795, "learning_rate": 5e-06, "logits/chosen": -25492441.6, "logits/rejected": -54832636.44444445, "logps/chosen": -398.4798828125, "logps/rejected": -604.7567274305555, "loss": 0.0114, "rewards/chosen": 8.175813802083333, "rewards/margins": 21.314149136013455, "rewards/rejected": -13.138335333930122, "step": 3479 }, { "epoch": 0.9538166369740989, "grad_norm": 4.21875, "kl": 3.924204111099243, "learning_rate": 5e-06, "logits/chosen": -9679000.0, "logits/rejected": -27139306.666666668, "logps/chosen": -432.2177327473958, "logps/rejected": -447.7799072265625, "loss": 0.0073, "rewards/chosen": 7.0713653564453125, "rewards/margins": 19.100387573242188, "rewards/rejected": -12.029022216796875, "step": 3480 }, { "epoch": 0.9540907222146088, "grad_norm": 10.5, "kl": 3.368633270263672, "learning_rate": 5e-06, "logits/chosen": -26641842.0, "logits/rejected": -32009124.0, "logps/chosen": -407.6898193359375, "logps/rejected": -433.2930603027344, "loss": 0.0212, "rewards/chosen": 8.742722511291504, "rewards/margins": 20.760096549987793, "rewards/rejected": -12.017374038696289, "step": 3481 }, { "epoch": 0.9543648074551185, "grad_norm": 6.71875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -6034719.333333333, "logits/rejected": -6357474.0, "logps/chosen": -432.1560465494792, "logps/rejected": -680.2853190104166, "loss": 0.0339, "rewards/chosen": 7.314579010009766, "rewards/margins": 22.622859954833984, "rewards/rejected": -15.308280944824219, "step": 3482 }, { "epoch": 0.9546388926956283, "grad_norm": 4.15625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -35322437.81818182, "logits/rejected": -9065558.76923077, "logps/chosen": -452.3836115056818, "logps/rejected": -608.0279447115385, "loss": 0.0473, "rewards/chosen": 7.620414733886719, "rewards/margins": 24.151925307053787, "rewards/rejected": -16.53151057316707, "step": 3483 }, { "epoch": 0.9549129779361382, "grad_norm": 2.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -35973195.63636363, "logits/rejected": 19646912.0, "logps/chosen": -348.34017666903407, "logps/rejected": -608.9723557692307, "loss": 0.012, "rewards/chosen": 6.279554887251421, "rewards/margins": 21.333413717630027, "rewards/rejected": -15.053858830378605, "step": 3484 }, { "epoch": 0.9551870631766479, "grad_norm": 3.609375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -48245916.44444445, "logits/rejected": -22967189.333333332, "logps/chosen": -683.0985243055555, "logps/rejected": -546.7223958333333, "loss": 0.0038, "rewards/chosen": 11.564806620279947, "rewards/margins": 22.367437235514323, "rewards/rejected": -10.802630615234374, "step": 3485 }, { "epoch": 0.9554611484171578, "grad_norm": 3.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -25711316.8, "logits/rejected": -27258736.0, "logps/chosen": -330.24873046875, "logps/rejected": -426.2710658482143, "loss": 0.0102, "rewards/chosen": 7.407294464111328, "rewards/margins": 18.68063932146345, "rewards/rejected": -11.27334485735212, "step": 3486 }, { "epoch": 0.9557352336576675, "grad_norm": 1.1953125, "kl": 3.266040802001953, "learning_rate": 5e-06, "logits/chosen": -6324883.2, "logits/rejected": -8978522.666666666, "logps/chosen": -447.7475911458333, "logps/rejected": -680.1243489583334, "loss": 0.0049, "rewards/chosen": 8.301929219563801, "rewards/margins": 22.338709682888457, "rewards/rejected": -14.036780463324654, "step": 3487 }, { "epoch": 0.9560093188981773, "grad_norm": 1.109375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -16500189.333333334, "logits/rejected": -26359482.666666668, "logps/chosen": -359.9419352213542, "logps/rejected": -479.7579752604167, "loss": 0.0031, "rewards/chosen": 8.331645329793295, "rewards/margins": 20.10059928894043, "rewards/rejected": -11.768953959147135, "step": 3488 }, { "epoch": 0.9562834041386872, "grad_norm": 4.4375, "kl": 1.9096959829330444, "learning_rate": 5e-06, "logits/chosen": -48586250.666666664, "logits/rejected": -48505525.333333336, "logps/chosen": -433.1398111979167, "logps/rejected": -691.03662109375, "loss": 0.0174, "rewards/chosen": 8.212013880411783, "rewards/margins": 24.508743921915688, "rewards/rejected": -16.296730041503906, "step": 3489 }, { "epoch": 0.9565574893791969, "grad_norm": 7.0625, "kl": 3.7513670921325684, "learning_rate": 5e-06, "logits/chosen": -21216567.466666665, "logits/rejected": -29031678.222222224, "logps/chosen": -421.03525390625, "logps/rejected": -498.0491536458333, "loss": 0.046, "rewards/chosen": 6.184070841471354, "rewards/margins": 19.871333482530382, "rewards/rejected": -13.687262641059029, "step": 3490 }, { "epoch": 0.9568315746197067, "grad_norm": 2.96875, "kl": 4.086931228637695, "learning_rate": 5e-06, "logits/chosen": -33011460.923076924, "logits/rejected": -26140736.0, "logps/chosen": -389.3934795673077, "logps/rejected": -527.4821111505681, "loss": 0.0171, "rewards/chosen": 8.489391033466045, "rewards/margins": 20.27322809179346, "rewards/rejected": -11.783837058327414, "step": 3491 }, { "epoch": 0.9571056598602166, "grad_norm": 1.640625, "kl": 3.1298789978027344, "learning_rate": 5e-06, "logits/chosen": -18036441.14285714, "logits/rejected": -18213920.0, "logps/chosen": -376.648681640625, "logps/rejected": -474.631640625, "loss": 0.0041, "rewards/chosen": 8.750745500837054, "rewards/margins": 21.11329585484096, "rewards/rejected": -12.362550354003906, "step": 3492 }, { "epoch": 0.9573797451007263, "grad_norm": 0.390625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -37677465.14285714, "logits/rejected": -44111820.8, "logps/chosen": -441.11879185267856, "logps/rejected": -516.188916015625, "loss": 0.001, "rewards/chosen": 8.530114310128349, "rewards/margins": 23.947776358468193, "rewards/rejected": -15.417662048339844, "step": 3493 }, { "epoch": 0.9576538303412361, "grad_norm": 1.53125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -14849543.111111112, "logits/rejected": -31809237.333333332, "logps/chosen": -387.4744466145833, "logps/rejected": -577.7677083333333, "loss": 0.0038, "rewards/chosen": 7.500464545355903, "rewards/margins": 21.466610378689236, "rewards/rejected": -13.966145833333334, "step": 3494 }, { "epoch": 0.957927915581746, "grad_norm": 7.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 1591399.6363636365, "logits/rejected": -19471160.615384616, "logps/chosen": -308.3612171519886, "logps/rejected": -519.8533653846154, "loss": 0.0462, "rewards/chosen": 6.401912342418324, "rewards/margins": 18.607736974329384, "rewards/rejected": -12.205824631911058, "step": 3495 }, { "epoch": 0.9582020008222557, "grad_norm": 6.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -14359949.090909092, "logits/rejected": -35400157.538461536, "logps/chosen": -373.06764914772725, "logps/rejected": -568.9601862980769, "loss": 0.0285, "rewards/chosen": 7.310435208407315, "rewards/margins": 22.041706805462603, "rewards/rejected": -14.731271597055288, "step": 3496 }, { "epoch": 0.9584760860627656, "grad_norm": 7.21875, "kl": 0.2313130795955658, "learning_rate": 5e-06, "logits/chosen": -36604468.571428575, "logits/rejected": -45710416.0, "logps/chosen": -380.09266880580356, "logps/rejected": -636.10654296875, "loss": 0.023, "rewards/chosen": 7.544419424874442, "rewards/margins": 20.887691824776784, "rewards/rejected": -13.343272399902343, "step": 3497 }, { "epoch": 0.9587501713032753, "grad_norm": 4.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -25055939.555555556, "logits/rejected": -31960635.733333334, "logps/chosen": -378.94292534722223, "logps/rejected": -514.8358072916667, "loss": 0.0098, "rewards/chosen": 7.498731825086805, "rewards/margins": 21.077817111545137, "rewards/rejected": -13.579085286458334, "step": 3498 }, { "epoch": 0.9590242565437851, "grad_norm": 2.53125, "kl": 6.67417049407959, "learning_rate": 5e-06, "logits/chosen": -20282589.53846154, "logits/rejected": -31810984.727272727, "logps/chosen": -437.3205754206731, "logps/rejected": -442.8494318181818, "loss": 0.0102, "rewards/chosen": 8.824201143704927, "rewards/margins": 18.358231017639586, "rewards/rejected": -9.534029873934658, "step": 3499 }, { "epoch": 0.959298341784295, "grad_norm": 3.90625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -37148026.18181818, "logits/rejected": -29252160.0, "logps/chosen": -325.25545987215907, "logps/rejected": -533.2506009615385, "loss": 0.0374, "rewards/chosen": 5.65296103737571, "rewards/margins": 19.965081915155157, "rewards/rejected": -14.312120877779448, "step": 3500 }, { "epoch": 0.9595724270248047, "grad_norm": 0.97265625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -36522327.27272727, "logits/rejected": -32567891.692307692, "logps/chosen": -385.9554332386364, "logps/rejected": -595.0011268028846, "loss": 0.0023, "rewards/chosen": 7.379728837446733, "rewards/margins": 22.190464686680507, "rewards/rejected": -14.810735849233774, "step": 3501 }, { "epoch": 0.9598465122653145, "grad_norm": 9.5, "kl": 5.2213568687438965, "learning_rate": 5e-06, "logits/chosen": -29564989.09090909, "logits/rejected": -21262852.923076924, "logps/chosen": -388.2203480113636, "logps/rejected": -628.578125, "loss": 0.0587, "rewards/chosen": 6.585444363680753, "rewards/margins": 21.00986758145419, "rewards/rejected": -14.424423217773438, "step": 3502 }, { "epoch": 0.9601205975058243, "grad_norm": 9.4375, "kl": 13.004454612731934, "learning_rate": 5e-06, "logits/chosen": -34373376.0, "logits/rejected": -23664796.0, "logps/chosen": -373.25823974609375, "logps/rejected": -570.1533203125, "loss": 0.0482, "rewards/chosen": 6.747780799865723, "rewards/margins": 16.90368938446045, "rewards/rejected": -10.155908584594727, "step": 3503 }, { "epoch": 0.9603946827463341, "grad_norm": 1.0078125, "kl": 4.032529354095459, "learning_rate": 5e-06, "logits/chosen": -24450089.14285714, "logits/rejected": -49769424.0, "logps/chosen": -380.454345703125, "logps/rejected": -676.3556640625, "loss": 0.0037, "rewards/chosen": 7.5347121102469305, "rewards/margins": 24.846824537004743, "rewards/rejected": -17.312112426757814, "step": 3504 }, { "epoch": 0.9606687679868439, "grad_norm": 1.734375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -6096718.857142857, "logits/rejected": -45373900.8, "logps/chosen": -393.62845284598217, "logps/rejected": -608.2669921875, "loss": 0.0074, "rewards/chosen": 8.05252456665039, "rewards/margins": 22.40707321166992, "rewards/rejected": -14.35454864501953, "step": 3505 }, { "epoch": 0.9609428532273537, "grad_norm": 1.7734375, "kl": 0.9396858215332031, "learning_rate": 5e-06, "logits/chosen": -30875516.8, "logits/rejected": -32746688.0, "logps/chosen": -340.6291015625, "logps/rejected": -531.6363002232143, "loss": 0.0066, "rewards/chosen": 6.528956604003906, "rewards/margins": 21.01976318359375, "rewards/rejected": -14.490806579589844, "step": 3506 }, { "epoch": 0.9612169384678635, "grad_norm": 0.345703125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -6918222.0, "logits/rejected": -43475141.333333336, "logps/chosen": -459.3160807291667, "logps/rejected": -564.9418131510416, "loss": 0.0011, "rewards/chosen": 7.3059641520182295, "rewards/margins": 22.187389373779297, "rewards/rejected": -14.881425221761068, "step": 3507 }, { "epoch": 0.9614910237083732, "grad_norm": 11.6875, "kl": 5.058400630950928, "learning_rate": 5e-06, "logits/chosen": -20357387.076923076, "logits/rejected": -18352482.90909091, "logps/chosen": -418.8249323918269, "logps/rejected": -558.6079989346591, "loss": 0.0333, "rewards/chosen": 7.462120056152344, "rewards/margins": 20.090037259188563, "rewards/rejected": -12.62791720303622, "step": 3508 }, { "epoch": 0.9617651089488831, "grad_norm": 2.953125, "kl": 12.650940895080566, "learning_rate": 5e-06, "logits/chosen": -23218429.866666667, "logits/rejected": -22417696.0, "logps/chosen": -468.9122721354167, "logps/rejected": -480.83935546875, "loss": 0.0122, "rewards/chosen": 7.89049072265625, "rewards/margins": 18.662231106228298, "rewards/rejected": -10.771740383572048, "step": 3509 }, { "epoch": 0.9620391941893929, "grad_norm": 14.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -31416570.181818184, "logits/rejected": -32852669.53846154, "logps/chosen": -345.48140092329544, "logps/rejected": -552.5873647836538, "loss": 0.0525, "rewards/chosen": 5.997440684925426, "rewards/margins": 18.584530810376148, "rewards/rejected": -12.587090125450722, "step": 3510 }, { "epoch": 0.9623132794299027, "grad_norm": 4.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -10009757.090909092, "logits/rejected": -35553602.461538464, "logps/chosen": -303.85251686789775, "logps/rejected": -474.28549429086536, "loss": 0.0581, "rewards/chosen": 6.069184736772017, "rewards/margins": 18.912712897454107, "rewards/rejected": -12.84352816068209, "step": 3511 }, { "epoch": 0.9625873646704125, "grad_norm": 2.34375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -15394114.666666666, "logits/rejected": -31959824.0, "logps/chosen": -496.8338216145833, "logps/rejected": -602.5423990885416, "loss": 0.0229, "rewards/chosen": 5.367678324381511, "rewards/margins": 20.410293579101562, "rewards/rejected": -15.042615254720053, "step": 3512 }, { "epoch": 0.9628614499109223, "grad_norm": 2.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -34825973.333333336, "logits/rejected": -20428940.0, "logps/chosen": -523.9337565104166, "logps/rejected": -384.2376708984375, "loss": 0.0055, "rewards/chosen": 8.139310201009115, "rewards/margins": 19.56264368693034, "rewards/rejected": -11.423333485921225, "step": 3513 }, { "epoch": 0.9631355351514321, "grad_norm": 5.46875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -41259036.8, "logits/rejected": -14191626.285714285, "logps/chosen": -405.6914306640625, "logps/rejected": -483.0431431361607, "loss": 0.0142, "rewards/chosen": 7.420861053466797, "rewards/margins": 17.21574914114816, "rewards/rejected": -9.794888087681361, "step": 3514 }, { "epoch": 0.9634096203919419, "grad_norm": 12.625, "kl": 9.15355110168457, "learning_rate": 5e-06, "logits/chosen": -9596122.461538462, "logits/rejected": -26546411.636363637, "logps/chosen": -563.6759690504807, "logps/rejected": -467.3494318181818, "loss": 0.0382, "rewards/chosen": 7.991337702824519, "rewards/margins": 18.58230644172722, "rewards/rejected": -10.5909687389027, "step": 3515 }, { "epoch": 0.9636837056324516, "grad_norm": 0.90625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -35865989.81818182, "logits/rejected": -23187798.153846152, "logps/chosen": -543.0529563210227, "logps/rejected": -686.3159555288462, "loss": 0.0013, "rewards/chosen": 8.76632274280895, "rewards/margins": 22.70537561803431, "rewards/rejected": -13.93905287522536, "step": 3516 }, { "epoch": 0.9639577908729615, "grad_norm": 4.75, "kl": 8.702896118164062, "learning_rate": 5e-06, "logits/chosen": -24419194.181818184, "logits/rejected": -32644093.53846154, "logps/chosen": -446.74311967329544, "logps/rejected": -507.0386493389423, "loss": 0.0123, "rewards/chosen": 7.718525279651988, "rewards/margins": 22.06513219446569, "rewards/rejected": -14.346606914813702, "step": 3517 }, { "epoch": 0.9642318761134713, "grad_norm": 7.3125, "kl": 2.5528018474578857, "learning_rate": 5e-06, "logits/chosen": -29763121.230769232, "logits/rejected": -37302702.54545455, "logps/chosen": -459.0987079326923, "logps/rejected": -599.9386541193181, "loss": 0.0338, "rewards/chosen": 6.612849895770733, "rewards/margins": 20.085587988366612, "rewards/rejected": -13.47273809259588, "step": 3518 }, { "epoch": 0.964505961353981, "grad_norm": 3.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -5577232.4, "logits/rejected": -35686768.0, "logps/chosen": -374.5649169921875, "logps/rejected": -623.2008928571429, "loss": 0.004, "rewards/chosen": 7.658928680419922, "rewards/margins": 20.18013185773577, "rewards/rejected": -12.521203177315849, "step": 3519 }, { "epoch": 0.9647800465944909, "grad_norm": 12.75, "kl": 2.9217491149902344, "learning_rate": 5e-06, "logits/chosen": -22179133.333333332, "logits/rejected": -19768901.333333332, "logps/chosen": -313.74228922526044, "logps/rejected": -601.8114420572916, "loss": 0.07, "rewards/chosen": 6.424011866251628, "rewards/margins": 19.618195215861004, "rewards/rejected": -13.194183349609375, "step": 3520 }, { "epoch": 0.9650541318350007, "grad_norm": 6.125, "kl": 8.11544418334961, "learning_rate": 5e-06, "logits/chosen": -19469456.0, "logits/rejected": -35846936.0, "logps/chosen": -437.1904296875, "logps/rejected": -532.3946533203125, "loss": 0.0234, "rewards/chosen": 8.486283302307129, "rewards/margins": 20.731740951538086, "rewards/rejected": -12.245457649230957, "step": 3521 }, { "epoch": 0.9653282170755105, "grad_norm": 9.1875, "kl": 8.450611114501953, "learning_rate": 5e-06, "logits/chosen": -18814320.0, "logits/rejected": -28001541.818181816, "logps/chosen": -452.2600285456731, "logps/rejected": -492.3132990056818, "loss": 0.1393, "rewards/chosen": 6.734277578500601, "rewards/margins": 18.33988851267141, "rewards/rejected": -11.60561093417081, "step": 3522 }, { "epoch": 0.9656023023160203, "grad_norm": 2.03125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -11627623.466666667, "logits/rejected": -36688814.222222224, "logps/chosen": -459.7584635416667, "logps/rejected": -668.8950737847222, "loss": 0.0056, "rewards/chosen": 8.472954813639323, "rewards/margins": 21.57437744140625, "rewards/rejected": -13.101422627766928, "step": 3523 }, { "epoch": 0.96587638755653, "grad_norm": 9.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -4744120.4, "logits/rejected": -28162850.285714287, "logps/chosen": -373.006787109375, "logps/rejected": -475.87339564732144, "loss": 0.0341, "rewards/chosen": 7.355895233154297, "rewards/margins": 20.959552546909876, "rewards/rejected": -13.60365731375558, "step": 3524 }, { "epoch": 0.9661504727970399, "grad_norm": 3.828125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -13520666.666666666, "logits/rejected": 8028576.0, "logps/chosen": -432.2862955729167, "logps/rejected": -602.3522135416666, "loss": 0.0035, "rewards/chosen": 8.28987948099772, "rewards/margins": 20.38061968485514, "rewards/rejected": -12.090740203857422, "step": 3525 }, { "epoch": 0.9664245580375497, "grad_norm": 6.84375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -36512384.0, "logits/rejected": -40497344.0, "logps/chosen": -285.8279371995192, "logps/rejected": -864.5326704545455, "loss": 0.0538, "rewards/chosen": 5.810982924241286, "rewards/margins": 23.744543382337877, "rewards/rejected": -17.93356045809659, "step": 3526 }, { "epoch": 0.9666986432780594, "grad_norm": 13.625, "kl": 4.096163749694824, "learning_rate": 5e-06, "logits/chosen": -29435202.285714287, "logits/rejected": -28109446.4, "logps/chosen": -522.8622349330357, "logps/rejected": -484.4392578125, "loss": 0.0141, "rewards/chosen": 8.341444287981306, "rewards/margins": 18.655095563616072, "rewards/rejected": -10.313651275634765, "step": 3527 }, { "epoch": 0.9669727285185693, "grad_norm": 9.25, "kl": 7.0249786376953125, "learning_rate": 5e-06, "logits/chosen": -28365508.923076924, "logits/rejected": -30817137.454545453, "logps/chosen": -369.4426832932692, "logps/rejected": -607.1237571022727, "loss": 0.0276, "rewards/chosen": 6.833125187800481, "rewards/margins": 22.60900964270105, "rewards/rejected": -15.775884454900568, "step": 3528 }, { "epoch": 0.9672468137590791, "grad_norm": 4.96875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -32500512.0, "logits/rejected": -22571735.466666665, "logps/chosen": -334.09092881944446, "logps/rejected": -622.0912760416667, "loss": 0.0334, "rewards/chosen": 5.333130730523004, "rewards/margins": 19.426300896538628, "rewards/rejected": -14.093170166015625, "step": 3529 }, { "epoch": 0.9675208989995888, "grad_norm": 9.8125, "kl": 4.386848449707031, "learning_rate": 5e-06, "logits/chosen": -18929560.888888888, "logits/rejected": -22163876.266666666, "logps/chosen": -423.7014431423611, "logps/rejected": -598.5565104166667, "loss": 0.0551, "rewards/chosen": 9.678970336914062, "rewards/margins": 20.69458923339844, "rewards/rejected": -11.015618896484375, "step": 3530 }, { "epoch": 0.9677949842400987, "grad_norm": 7.625, "kl": 1.0646095275878906, "learning_rate": 5e-06, "logits/chosen": -31517230.933333334, "logits/rejected": -45882424.88888889, "logps/chosen": -417.12688802083335, "logps/rejected": -745.3572048611111, "loss": 0.0445, "rewards/chosen": 6.990958658854167, "rewards/margins": 20.490803527832032, "rewards/rejected": -13.499844868977865, "step": 3531 }, { "epoch": 0.9680690694806084, "grad_norm": 2.59375, "kl": 2.0914320945739746, "learning_rate": 5e-06, "logits/chosen": -8867417.846153846, "logits/rejected": -30041888.0, "logps/chosen": -378.41128305288464, "logps/rejected": -693.70703125, "loss": 0.0072, "rewards/chosen": 8.692338209885817, "rewards/margins": 23.602223963170616, "rewards/rejected": -14.9098857532848, "step": 3532 }, { "epoch": 0.9683431547211183, "grad_norm": 6.28125, "kl": 19.230701446533203, "learning_rate": 5e-06, "logits/chosen": -34255917.176470585, "logits/rejected": -31369010.285714287, "logps/chosen": -505.3411075367647, "logps/rejected": -382.40513392857144, "loss": 0.0502, "rewards/chosen": 8.158661786247702, "rewards/margins": 17.81826654001444, "rewards/rejected": -9.659604753766741, "step": 3533 }, { "epoch": 0.9686172399616281, "grad_norm": 1.3984375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -20141402.666666668, "logits/rejected": -55440021.333333336, "logps/chosen": -435.3724365234375, "logps/rejected": -482.6632486979167, "loss": 0.0226, "rewards/chosen": 7.966393788655599, "rewards/margins": 21.012100219726562, "rewards/rejected": -13.045706431070963, "step": 3534 }, { "epoch": 0.9688913252021378, "grad_norm": 1.453125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -20242313.6, "logits/rejected": -27677824.0, "logps/chosen": -428.943408203125, "logps/rejected": -565.0154854910714, "loss": 0.0033, "rewards/chosen": 6.148410034179688, "rewards/margins": 21.792451913016183, "rewards/rejected": -15.644041878836495, "step": 3535 }, { "epoch": 0.9691654104426477, "grad_norm": 6.6875, "kl": 0.5893707275390625, "learning_rate": 5e-06, "logits/chosen": -30163342.769230768, "logits/rejected": -24106168.727272727, "logps/chosen": -397.81385216346155, "logps/rejected": -672.3748224431819, "loss": 0.0242, "rewards/chosen": 6.893027672400842, "rewards/margins": 22.65574101801519, "rewards/rejected": -15.762713345614346, "step": 3536 }, { "epoch": 0.9694394956831575, "grad_norm": 9.625, "kl": 2.756671905517578, "learning_rate": 5e-06, "logits/chosen": -10150798.4, "logits/rejected": -42725261.71428572, "logps/chosen": -400.3873779296875, "logps/rejected": -561.9821428571429, "loss": 0.038, "rewards/chosen": 7.778680419921875, "rewards/margins": 20.845246887207033, "rewards/rejected": -13.066566467285156, "step": 3537 }, { "epoch": 0.9697135809236672, "grad_norm": 7.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -32340596.0, "logits/rejected": -30701948.0, "logps/chosen": -440.1888427734375, "logps/rejected": -691.2135009765625, "loss": 0.0174, "rewards/chosen": 7.200658798217773, "rewards/margins": 21.612574577331543, "rewards/rejected": -14.41191577911377, "step": 3538 }, { "epoch": 0.9699876661641771, "grad_norm": 2.734375, "kl": 16.632286071777344, "learning_rate": 5e-06, "logits/chosen": -14451886.933333334, "logits/rejected": -57495285.333333336, "logps/chosen": -433.2023111979167, "logps/rejected": -698.9497612847222, "loss": 0.1253, "rewards/chosen": 7.028330485026042, "rewards/margins": 25.84561496310764, "rewards/rejected": -18.817284478081596, "step": 3539 }, { "epoch": 0.9702617514046868, "grad_norm": 18.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -18196907.636363637, "logits/rejected": -31824406.153846152, "logps/chosen": -467.13334517045456, "logps/rejected": -596.5655048076923, "loss": 0.0311, "rewards/chosen": 7.411164023659446, "rewards/margins": 21.050835936219542, "rewards/rejected": -13.639671912560097, "step": 3540 }, { "epoch": 0.9705358366451966, "grad_norm": 5.96875, "kl": 0.0365346297621727, "learning_rate": 5e-06, "logits/chosen": -17014333.333333332, "logits/rejected": -33203058.666666668, "logps/chosen": -407.9897054036458, "logps/rejected": -464.337158203125, "loss": 0.0614, "rewards/chosen": 8.49478022257487, "rewards/margins": 21.593100229899086, "rewards/rejected": -13.098320007324219, "step": 3541 }, { "epoch": 0.9708099218857065, "grad_norm": 3.15625, "kl": 5.360725402832031, "learning_rate": 5e-06, "logits/chosen": -46031168.0, "logits/rejected": -27990180.0, "logps/chosen": -462.08880615234375, "logps/rejected": -467.1872863769531, "loss": 0.0053, "rewards/chosen": 7.711245536804199, "rewards/margins": 20.058691024780273, "rewards/rejected": -12.347445487976074, "step": 3542 }, { "epoch": 0.9710840071262162, "grad_norm": 7.21875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -19706833.6, "logits/rejected": -39945152.0, "logps/chosen": -338.6061279296875, "logps/rejected": -699.7013113839286, "loss": 0.0128, "rewards/chosen": 6.2372089385986325, "rewards/margins": 23.40181857517787, "rewards/rejected": -17.16460963657924, "step": 3543 }, { "epoch": 0.9713580923667261, "grad_norm": 7.03125, "kl": 3.8296303749084473, "learning_rate": 5e-06, "logits/chosen": -6216679.2727272725, "logits/rejected": -36582119.384615384, "logps/chosen": -332.7618963068182, "logps/rejected": -517.0686598557693, "loss": 0.0624, "rewards/chosen": 6.093304720791903, "rewards/margins": 18.37221543105332, "rewards/rejected": -12.278910710261417, "step": 3544 }, { "epoch": 0.9716321776072359, "grad_norm": 4.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -28441760.0, "logits/rejected": -53562688.0, "logps/chosen": -401.3419189453125, "logps/rejected": -547.8071695963541, "loss": 0.0215, "rewards/chosen": 7.01066780090332, "rewards/margins": 18.367910385131836, "rewards/rejected": -11.357242584228516, "step": 3545 }, { "epoch": 0.9719062628477456, "grad_norm": 7.5625, "kl": 5.924106597900391, "learning_rate": 5e-06, "logits/chosen": -16846377.14285714, "logits/rejected": -35491254.4, "logps/chosen": -335.62437220982144, "logps/rejected": -565.35556640625, "loss": 0.0583, "rewards/chosen": 8.661420549665179, "rewards/margins": 20.88053981236049, "rewards/rejected": -12.219119262695312, "step": 3546 }, { "epoch": 0.9721803480882555, "grad_norm": 10.5625, "kl": 10.228775024414062, "learning_rate": 5e-06, "logits/chosen": -25690953.6, "logits/rejected": -26300905.14285714, "logps/chosen": -486.68544921875, "logps/rejected": -706.5576869419643, "loss": 0.0727, "rewards/chosen": 7.493565368652344, "rewards/margins": 21.80130680629185, "rewards/rejected": -14.307741437639509, "step": 3547 }, { "epoch": 0.9724544333287652, "grad_norm": 3.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -31726546.285714287, "logits/rejected": -33160455.529411763, "logps/chosen": -373.26639229910717, "logps/rejected": -571.5479090073529, "loss": 0.0624, "rewards/chosen": 6.1331024169921875, "rewards/margins": 20.737932093003216, "rewards/rejected": -14.604829676011029, "step": 3548 }, { "epoch": 0.972728518569275, "grad_norm": 6.78125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -20874488.615384616, "logits/rejected": -22946196.363636363, "logps/chosen": -373.1749924879808, "logps/rejected": -712.7975408380681, "loss": 0.0308, "rewards/chosen": 7.144061748798077, "rewards/margins": 25.120912778627623, "rewards/rejected": -17.976851029829547, "step": 3549 }, { "epoch": 0.9730026038097849, "grad_norm": 9.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -21064997.818181816, "logits/rejected": -48141115.07692308, "logps/chosen": -454.6990855823864, "logps/rejected": -630.8203125, "loss": 0.0404, "rewards/chosen": 6.148569280450994, "rewards/margins": 24.0846302592671, "rewards/rejected": -17.936060978816105, "step": 3550 }, { "epoch": 0.9732766890502946, "grad_norm": 6.84375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -21390976.0, "logits/rejected": -20120994.90909091, "logps/chosen": -429.208984375, "logps/rejected": -609.3746448863636, "loss": 0.0148, "rewards/chosen": 7.120702303372896, "rewards/margins": 22.43823631660088, "rewards/rejected": -15.317534013227982, "step": 3551 }, { "epoch": 0.9735507742908044, "grad_norm": 16.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -37976608.0, "logits/rejected": -29787545.14285714, "logps/chosen": -425.34619140625, "logps/rejected": -554.0260532924107, "loss": 0.0443, "rewards/chosen": 6.994672393798828, "rewards/margins": 19.628318677629743, "rewards/rejected": -12.633646283830915, "step": 3552 }, { "epoch": 0.9738248595313143, "grad_norm": 5.46875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -42790179.55555555, "logits/rejected": -50908450.13333333, "logps/chosen": -426.5231662326389, "logps/rejected": -655.3393880208333, "loss": 0.0153, "rewards/chosen": 7.2991943359375, "rewards/margins": 22.9747314453125, "rewards/rejected": -15.675537109375, "step": 3553 }, { "epoch": 0.974098944771824, "grad_norm": 5.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -31576640.0, "logits/rejected": -36929571.55555555, "logps/chosen": -384.1858723958333, "logps/rejected": -794.5413953993055, "loss": 0.0401, "rewards/chosen": 7.378455607096354, "rewards/margins": 27.819778781467015, "rewards/rejected": -20.44132317437066, "step": 3554 }, { "epoch": 0.9743730300123339, "grad_norm": 6.71875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -19929049.6, "logits/rejected": -41443881.14285714, "logps/chosen": -311.769775390625, "logps/rejected": -489.34814453125, "loss": 0.0166, "rewards/chosen": 7.801390075683594, "rewards/margins": 21.91660919189453, "rewards/rejected": -14.115219116210938, "step": 3555 }, { "epoch": 0.9746471152528436, "grad_norm": 3.15625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 11654760.0, "logits/rejected": -31952774.4, "logps/chosen": -281.54056222098217, "logps/rejected": -558.431494140625, "loss": 0.0448, "rewards/chosen": 5.4049546377999445, "rewards/margins": 20.070965685163223, "rewards/rejected": -14.66601104736328, "step": 3556 }, { "epoch": 0.9749212004933534, "grad_norm": 8.25, "kl": 7.270308494567871, "learning_rate": 5e-06, "logits/chosen": -28575072.0, "logits/rejected": -38444266.666666664, "logps/chosen": -461.3680419921875, "logps/rejected": -584.7613118489584, "loss": 0.0263, "rewards/chosen": 9.213942209879557, "rewards/margins": 26.046078999837242, "rewards/rejected": -16.832136789957683, "step": 3557 }, { "epoch": 0.9751952857338633, "grad_norm": 3.15625, "kl": 12.323177337646484, "learning_rate": 5e-06, "logits/chosen": -12951035.733333332, "logits/rejected": -63571889.777777776, "logps/chosen": -446.98346354166665, "logps/rejected": -560.0725911458334, "loss": 0.0485, "rewards/chosen": 8.075748697916667, "rewards/margins": 20.634434848361543, "rewards/rejected": -12.558686150444878, "step": 3558 }, { "epoch": 0.975469370974373, "grad_norm": 4.25, "kl": 7.554084777832031, "learning_rate": 5e-06, "logits/chosen": -803484.6666666666, "logits/rejected": -50948224.0, "logps/chosen": -492.7294514973958, "logps/rejected": -644.7333170572916, "loss": 0.0118, "rewards/chosen": 8.226719538370768, "rewards/margins": 24.408611933390297, "rewards/rejected": -16.18189239501953, "step": 3559 }, { "epoch": 0.9757434562148828, "grad_norm": 4.21875, "kl": 4.749965667724609, "learning_rate": 5e-06, "logits/chosen": -20147332.8, "logits/rejected": -45693805.71428572, "logps/chosen": -372.9798095703125, "logps/rejected": -468.00558035714283, "loss": 0.0212, "rewards/chosen": 7.61644287109375, "rewards/margins": 23.431234959193638, "rewards/rejected": -15.814792088099889, "step": 3560 }, { "epoch": 0.9760175414553927, "grad_norm": 7.6875, "kl": 0.17806372046470642, "learning_rate": 5e-06, "logits/chosen": -19771641.333333332, "logits/rejected": -3220664.0, "logps/chosen": -426.3463541666667, "logps/rejected": -439.4940999348958, "loss": 0.022, "rewards/chosen": 6.625211079915364, "rewards/margins": 17.53482437133789, "rewards/rejected": -10.909613291422525, "step": 3561 }, { "epoch": 0.9762916266959024, "grad_norm": 1.9765625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 4671830.181818182, "logits/rejected": -47325902.76923077, "logps/chosen": -493.09157492897725, "logps/rejected": -577.9942157451923, "loss": 0.006, "rewards/chosen": 7.950892361727628, "rewards/margins": 24.17608274446501, "rewards/rejected": -16.22519038273738, "step": 3562 }, { "epoch": 0.9765657119364122, "grad_norm": 3.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -29825770.666666668, "logits/rejected": -42790960.0, "logps/chosen": -356.5733235677083, "logps/rejected": -480.21484375, "loss": 0.0111, "rewards/chosen": 7.634641011555989, "rewards/margins": 20.710782368977863, "rewards/rejected": -13.076141357421875, "step": 3563 }, { "epoch": 0.976839797176922, "grad_norm": 2.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -47612401.45454545, "logits/rejected": -26137538.46153846, "logps/chosen": -533.5088778409091, "logps/rejected": -578.6849459134615, "loss": 0.0042, "rewards/chosen": 6.839703646573153, "rewards/margins": 20.588363220641664, "rewards/rejected": -13.74865957406851, "step": 3564 }, { "epoch": 0.9771138824174318, "grad_norm": 8.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -10142395.0, "logits/rejected": -43458012.0, "logps/chosen": -349.23052978515625, "logps/rejected": -623.4749755859375, "loss": 0.0264, "rewards/chosen": 6.526202201843262, "rewards/margins": 24.275324821472168, "rewards/rejected": -17.749122619628906, "step": 3565 }, { "epoch": 0.9773879676579417, "grad_norm": 3.75, "kl": 1.7272777557373047, "learning_rate": 5e-06, "logits/chosen": -29711433.14285714, "logits/rejected": -33113068.8, "logps/chosen": -401.8916713169643, "logps/rejected": -554.208203125, "loss": 0.0262, "rewards/chosen": 8.604849679129464, "rewards/margins": 23.421622140066965, "rewards/rejected": -14.8167724609375, "step": 3566 }, { "epoch": 0.9776620528984514, "grad_norm": 1.7890625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -19315890.46153846, "logits/rejected": -30928965.818181816, "logps/chosen": -421.1165114182692, "logps/rejected": -625.8044211647727, "loss": 0.007, "rewards/chosen": 8.601359440730167, "rewards/margins": 25.62239576219679, "rewards/rejected": -17.02103632146662, "step": 3567 }, { "epoch": 0.9779361381389612, "grad_norm": 8.4375, "kl": 1.8298118114471436, "learning_rate": 5e-06, "logits/chosen": -41215906.13333333, "logits/rejected": -13121091.555555556, "logps/chosen": -434.79567057291666, "logps/rejected": -694.4776475694445, "loss": 0.0605, "rewards/chosen": 6.846635945638021, "rewards/margins": 27.611358981662327, "rewards/rejected": -20.764723036024307, "step": 3568 }, { "epoch": 0.9782102233794711, "grad_norm": 3.65625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -26681618.285714287, "logits/rejected": -45477420.8, "logps/chosen": -380.21505301339283, "logps/rejected": -504.183447265625, "loss": 0.0123, "rewards/chosen": 7.41624995640346, "rewards/margins": 21.12763148716518, "rewards/rejected": -13.711381530761718, "step": 3569 }, { "epoch": 0.9784843086199808, "grad_norm": 2.984375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -23363227.636363637, "logits/rejected": -26477289.846153848, "logps/chosen": -444.9064275568182, "logps/rejected": -602.5181790865385, "loss": 0.003, "rewards/chosen": 9.86792685768821, "rewards/margins": 24.906353183559606, "rewards/rejected": -15.038426325871395, "step": 3570 }, { "epoch": 0.9787583938604906, "grad_norm": 4.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -33025300.0, "logits/rejected": -29366226.0, "logps/chosen": -545.0746459960938, "logps/rejected": -425.994873046875, "loss": 0.0129, "rewards/chosen": 8.32065200805664, "rewards/margins": 20.93206024169922, "rewards/rejected": -12.611408233642578, "step": 3571 }, { "epoch": 0.9790324791010004, "grad_norm": 1.0703125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -2262151.111111111, "logits/rejected": -42201190.4, "logps/chosen": -403.5636393229167, "logps/rejected": -557.7044270833334, "loss": 0.0025, "rewards/chosen": 8.02232191297743, "rewards/margins": 25.982874891493054, "rewards/rejected": -17.960552978515626, "step": 3572 }, { "epoch": 0.9793065643415102, "grad_norm": 2.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -20981019.636363637, "logits/rejected": -28277979.076923076, "logps/chosen": -393.35311612215907, "logps/rejected": -638.8125751201923, "loss": 0.0051, "rewards/chosen": 5.747853365811435, "rewards/margins": 26.626461482548212, "rewards/rejected": -20.87860811673678, "step": 3573 }, { "epoch": 0.97958064958202, "grad_norm": 6.40625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -13489372.444444444, "logits/rejected": -45479210.666666664, "logps/chosen": -457.3591579861111, "logps/rejected": -511.42164713541666, "loss": 0.0126, "rewards/chosen": 6.241010030110677, "rewards/margins": 21.816020202636718, "rewards/rejected": -15.575010172526042, "step": 3574 }, { "epoch": 0.9798547348225298, "grad_norm": 1.3203125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -28255977.6, "logits/rejected": -17245497.14285714, "logps/chosen": -445.735595703125, "logps/rejected": -749.1872209821429, "loss": 0.0029, "rewards/chosen": 7.326282501220703, "rewards/margins": 29.7900755746024, "rewards/rejected": -22.463793073381698, "step": 3575 }, { "epoch": 0.9801288200630396, "grad_norm": 3.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -33777657.6, "logits/rejected": -55349397.333333336, "logps/chosen": -454.29986979166665, "logps/rejected": -548.2758246527778, "loss": 0.0074, "rewards/chosen": 8.045513916015626, "rewards/margins": 21.519791836208768, "rewards/rejected": -13.474277920193142, "step": 3576 }, { "epoch": 0.9804029053035495, "grad_norm": 4.5, "kl": 7.562534332275391, "learning_rate": 5e-06, "logits/chosen": -21244462.666666668, "logits/rejected": -16437718.666666666, "logps/chosen": -403.9708658854167, "logps/rejected": -403.95263671875, "loss": 0.0099, "rewards/chosen": 7.656859079996745, "rewards/margins": 21.32149378458659, "rewards/rejected": -13.664634704589844, "step": 3577 }, { "epoch": 0.9806769905440592, "grad_norm": 14.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -21529305.333333332, "logits/rejected": -34902208.0, "logps/chosen": -325.4227701822917, "logps/rejected": -512.9514973958334, "loss": 0.052, "rewards/chosen": 6.004405975341797, "rewards/margins": 19.087980906168617, "rewards/rejected": -13.083574930826822, "step": 3578 }, { "epoch": 0.980951075784569, "grad_norm": 6.90625, "kl": 6.691150665283203, "learning_rate": 5e-06, "logits/chosen": -25185536.0, "logits/rejected": -15297749.0, "logps/chosen": -338.0955810546875, "logps/rejected": -696.1741943359375, "loss": 0.0321, "rewards/chosen": 6.479888916015625, "rewards/margins": 24.32341766357422, "rewards/rejected": -17.843528747558594, "step": 3579 }, { "epoch": 0.9812251610250788, "grad_norm": 2.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -33253076.57142857, "logits/rejected": -41786566.4, "logps/chosen": -380.68411690848217, "logps/rejected": -566.6609375, "loss": 0.0272, "rewards/chosen": 6.353349958147321, "rewards/margins": 22.806371198381697, "rewards/rejected": -16.453021240234374, "step": 3580 }, { "epoch": 0.9814992462655886, "grad_norm": 7.78125, "kl": 2.59900164604187, "learning_rate": 5e-06, "logits/chosen": -12699725.333333334, "logits/rejected": -24377613.333333332, "logps/chosen": -453.1611328125, "logps/rejected": -571.9591064453125, "loss": 0.0191, "rewards/chosen": 8.562185287475586, "rewards/margins": 24.027722040812172, "rewards/rejected": -15.465536753336588, "step": 3581 }, { "epoch": 0.9817733315060984, "grad_norm": 2.84375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -18593574.153846152, "logits/rejected": -20827402.181818184, "logps/chosen": -416.86283052884613, "logps/rejected": -584.1973100142045, "loss": 0.0214, "rewards/chosen": 8.310040987454927, "rewards/margins": 23.119561362099816, "rewards/rejected": -14.809520374644887, "step": 3582 }, { "epoch": 0.9820474167466082, "grad_norm": 3.40625, "kl": 2.366853713989258, "learning_rate": 5e-06, "logits/chosen": -23398462.222222224, "logits/rejected": -28483466.666666668, "logps/chosen": -398.54454210069446, "logps/rejected": -454.4078776041667, "loss": 0.0164, "rewards/chosen": 6.78425767686632, "rewards/margins": 19.078533596462673, "rewards/rejected": -12.294275919596354, "step": 3583 }, { "epoch": 0.982321501987118, "grad_norm": 6.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -10224628.363636363, "logits/rejected": -26062638.769230768, "logps/chosen": -372.18033114346593, "logps/rejected": -544.9340444711538, "loss": 0.0293, "rewards/chosen": 4.622932087291371, "rewards/margins": 18.50672349729738, "rewards/rejected": -13.88379141000601, "step": 3584 }, { "epoch": 0.9825955872276277, "grad_norm": 6.40625, "kl": 7.0762939453125, "learning_rate": 5e-06, "logits/chosen": -4714595.076923077, "logits/rejected": -18805693.09090909, "logps/chosen": -517.4765249399038, "logps/rejected": -461.85107421875, "loss": 0.0118, "rewards/chosen": 9.473702650803785, "rewards/margins": 21.12185716962481, "rewards/rejected": -11.648154518821023, "step": 3585 }, { "epoch": 0.9828696724681376, "grad_norm": 3.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -20401136.0, "logits/rejected": -32663542.153846152, "logps/chosen": -407.80033735795456, "logps/rejected": -438.6862229567308, "loss": 0.0101, "rewards/chosen": 7.235531893643466, "rewards/margins": 18.586984060861013, "rewards/rejected": -11.351452167217548, "step": 3586 }, { "epoch": 0.9831437577086474, "grad_norm": 5.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -19624273.230769232, "logits/rejected": -35466717.09090909, "logps/chosen": -317.6305588942308, "logps/rejected": -594.3020241477273, "loss": 0.0209, "rewards/chosen": 6.7949969951923075, "rewards/margins": 22.98270229526333, "rewards/rejected": -16.187705300071023, "step": 3587 }, { "epoch": 0.9834178429491572, "grad_norm": 0.546875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -26914626.666666668, "logits/rejected": 8277124.0, "logps/chosen": -379.9068196614583, "logps/rejected": -546.18896484375, "loss": 0.0013, "rewards/chosen": 8.490914026896158, "rewards/margins": 23.36557960510254, "rewards/rejected": -14.87466557820638, "step": 3588 }, { "epoch": 0.983691928189667, "grad_norm": 2.328125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -14091065.6, "logits/rejected": -18231353.14285714, "logps/chosen": -348.579296875, "logps/rejected": -568.8995535714286, "loss": 0.0093, "rewards/chosen": 6.235654067993164, "rewards/margins": 19.99219790867397, "rewards/rejected": -13.756543840680804, "step": 3589 }, { "epoch": 0.9839660134301768, "grad_norm": 5.15625, "kl": 8.106104850769043, "learning_rate": 5e-06, "logits/chosen": -18533200.0, "logits/rejected": -12710817.0, "logps/chosen": -375.63812255859375, "logps/rejected": -560.1165771484375, "loss": 0.0449, "rewards/chosen": 7.7163310050964355, "rewards/margins": 18.883193492889404, "rewards/rejected": -11.166862487792969, "step": 3590 }, { "epoch": 0.9842400986706866, "grad_norm": 2.171875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -19933764.0, "logits/rejected": -19412858.666666668, "logps/chosen": -442.373046875, "logps/rejected": -741.8025716145834, "loss": 0.0051, "rewards/chosen": 6.737379709879558, "rewards/margins": 22.204922993977863, "rewards/rejected": -15.467543284098307, "step": 3591 }, { "epoch": 0.9845141839111964, "grad_norm": 5.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -13954090.666666666, "logits/rejected": -39073066.666666664, "logps/chosen": -449.1911892361111, "logps/rejected": -500.4018880208333, "loss": 0.0214, "rewards/chosen": 6.823911878797743, "rewards/margins": 19.76974826388889, "rewards/rejected": -12.945836385091146, "step": 3592 }, { "epoch": 0.9847882691517061, "grad_norm": 11.3125, "kl": 0.5545228123664856, "learning_rate": 5e-06, "logits/chosen": -15167001.142857144, "logits/rejected": 7124444.8, "logps/chosen": -391.43233816964283, "logps/rejected": -331.9753173828125, "loss": 0.0967, "rewards/chosen": 5.097693307059152, "rewards/margins": 11.083023507254463, "rewards/rejected": -5.985330200195312, "step": 3593 }, { "epoch": 0.985062354392216, "grad_norm": 2.015625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -25569808.0, "logits/rejected": -29410994.666666668, "logps/chosen": -548.8646647135416, "logps/rejected": -539.273681640625, "loss": 0.0042, "rewards/chosen": 7.89846928914388, "rewards/margins": 21.153775533040363, "rewards/rejected": -13.255306243896484, "step": 3594 }, { "epoch": 0.9853364396327258, "grad_norm": 1.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -3088798.8571428573, "logits/rejected": -26242192.94117647, "logps/chosen": -373.85232979910717, "logps/rejected": -576.7179457720588, "loss": 0.0037, "rewards/chosen": 8.309620448521205, "rewards/margins": 23.055111027565324, "rewards/rejected": -14.745490579044118, "step": 3595 }, { "epoch": 0.9856105248732355, "grad_norm": 13.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 11563294.4, "logits/rejected": -16052094.857142856, "logps/chosen": -431.00927734375, "logps/rejected": -695.2404436383929, "loss": 0.0502, "rewards/chosen": 6.115864562988281, "rewards/margins": 18.441027178083147, "rewards/rejected": -12.325162615094866, "step": 3596 }, { "epoch": 0.9858846101137454, "grad_norm": 10.375, "kl": 1.4532884359359741, "learning_rate": 5e-06, "logits/chosen": -7343268.0, "logits/rejected": -15752978.666666666, "logps/chosen": -431.1674397786458, "logps/rejected": -450.476318359375, "loss": 0.0301, "rewards/chosen": 6.135073343912761, "rewards/margins": 17.577181498209637, "rewards/rejected": -11.442108154296875, "step": 3597 }, { "epoch": 0.9861586953542552, "grad_norm": 17.375, "kl": 2.8201255798339844, "learning_rate": 5e-06, "logits/chosen": -31849397.333333332, "logits/rejected": -31720765.333333332, "logps/chosen": -375.086669921875, "logps/rejected": -410.3670247395833, "loss": 0.0516, "rewards/chosen": 5.944102605183919, "rewards/margins": 17.56100018819173, "rewards/rejected": -11.616897583007812, "step": 3598 }, { "epoch": 0.986432780594765, "grad_norm": 10.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -15939437.538461538, "logits/rejected": -6289474.181818182, "logps/chosen": -389.18637319711536, "logps/rejected": -616.1795987215909, "loss": 0.0495, "rewards/chosen": 7.133665818434495, "rewards/margins": 24.90414311335637, "rewards/rejected": -17.770477294921875, "step": 3599 }, { "epoch": 0.9867068658352748, "grad_norm": 13.625, "kl": 7.709907054901123, "learning_rate": 5e-06, "logits/chosen": -8534684.57142857, "logits/rejected": -34700944.0, "logps/chosen": -521.1034109933036, "logps/rejected": -471.765478515625, "loss": 0.0424, "rewards/chosen": 6.6756777082170755, "rewards/margins": 19.19303152901786, "rewards/rejected": -12.517353820800782, "step": 3600 }, { "epoch": 0.9869809510757845, "grad_norm": 12.1875, "kl": 11.694598197937012, "learning_rate": 5e-06, "logits/chosen": -34516238.76923077, "logits/rejected": -38942039.27272727, "logps/chosen": -444.71927584134613, "logps/rejected": -684.5463423295455, "loss": 0.0935, "rewards/chosen": 8.046837439903847, "rewards/margins": 21.08607194807146, "rewards/rejected": -13.039234508167613, "step": 3601 }, { "epoch": 0.9872550363162944, "grad_norm": 8.5, "kl": 11.121437072753906, "learning_rate": 5e-06, "logits/chosen": -13000433.066666666, "logits/rejected": 22790062.222222224, "logps/chosen": -431.57024739583335, "logps/rejected": -662.0048285590278, "loss": 0.0972, "rewards/chosen": 6.9398351033528645, "rewards/margins": 20.622517564561633, "rewards/rejected": -13.682682461208767, "step": 3602 }, { "epoch": 0.9875291215568042, "grad_norm": 13.875, "kl": 10.904380798339844, "learning_rate": 5e-06, "logits/chosen": -15855664.94117647, "logits/rejected": 1508933.7142857143, "logps/chosen": -549.8374310661765, "logps/rejected": -601.9651227678571, "loss": 0.059, "rewards/chosen": 7.43915916891659, "rewards/margins": 15.88980570560744, "rewards/rejected": -8.450646536690849, "step": 3603 }, { "epoch": 0.9878032067973139, "grad_norm": 2.390625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 6374894.0, "logits/rejected": -25099017.14285714, "logps/chosen": -417.210693359375, "logps/rejected": -712.4098772321429, "loss": 0.0088, "rewards/chosen": 8.587100982666016, "rewards/margins": 26.74705014910017, "rewards/rejected": -18.159949166434153, "step": 3604 }, { "epoch": 0.9880772920378238, "grad_norm": 14.625, "kl": 6.155295372009277, "learning_rate": 5e-06, "logits/chosen": -1449421.0, "logits/rejected": -25564464.0, "logps/chosen": -359.56549072265625, "logps/rejected": -588.3779296875, "loss": 0.0486, "rewards/chosen": 6.328761577606201, "rewards/margins": 21.56860113143921, "rewards/rejected": -15.239839553833008, "step": 3605 }, { "epoch": 0.9883513772783336, "grad_norm": 4.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -36522240.0, "logits/rejected": -18185310.0, "logps/chosen": -400.3812561035156, "logps/rejected": -499.67608642578125, "loss": 0.014, "rewards/chosen": 8.53310775756836, "rewards/margins": 19.019734382629395, "rewards/rejected": -10.486626625061035, "step": 3606 }, { "epoch": 0.9886254625188433, "grad_norm": 3.796875, "kl": 0.69879150390625, "learning_rate": 5e-06, "logits/chosen": 8454894.4, "logits/rejected": -16460821.714285715, "logps/chosen": -505.635791015625, "logps/rejected": -506.14589146205356, "loss": 0.0065, "rewards/chosen": 9.190132141113281, "rewards/margins": 20.70035879952567, "rewards/rejected": -11.510226658412389, "step": 3607 }, { "epoch": 0.9888995477593532, "grad_norm": 5.625, "kl": 4.656252861022949, "learning_rate": 5e-06, "logits/chosen": -17040304.0, "logits/rejected": -25927200.0, "logps/chosen": -397.1709716796875, "logps/rejected": -659.7024274553571, "loss": 0.0186, "rewards/chosen": 8.258100128173828, "rewards/margins": 25.204297637939455, "rewards/rejected": -16.946197509765625, "step": 3608 }, { "epoch": 0.9891736329998629, "grad_norm": 19.0, "kl": 9.428572654724121, "learning_rate": 5e-06, "logits/chosen": -16596646.4, "logits/rejected": -43390101.333333336, "logps/chosen": -368.1940104166667, "logps/rejected": -715.2808159722222, "loss": 0.1227, "rewards/chosen": 6.807832336425781, "rewards/margins": 19.697230699327257, "rewards/rejected": -12.889398362901476, "step": 3609 }, { "epoch": 0.9894477182403728, "grad_norm": 3.796875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -32886820.923076924, "logits/rejected": -37858231.27272727, "logps/chosen": -437.9258563701923, "logps/rejected": -367.72927024147725, "loss": 0.0416, "rewards/chosen": 8.289321899414062, "rewards/margins": 19.85065390846946, "rewards/rejected": -11.561332009055398, "step": 3610 }, { "epoch": 0.9897218034808826, "grad_norm": 12.6875, "kl": 6.3727006912231445, "learning_rate": 5e-06, "logits/chosen": -26979074.0, "logits/rejected": -13591914.0, "logps/chosen": -385.44793701171875, "logps/rejected": -500.2922668457031, "loss": 0.0236, "rewards/chosen": 6.456875801086426, "rewards/margins": 17.942729949951172, "rewards/rejected": -11.485854148864746, "step": 3611 }, { "epoch": 0.9899958887213923, "grad_norm": 11.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -5431220.8, "logits/rejected": -32390281.14285714, "logps/chosen": -411.50869140625, "logps/rejected": -464.0733119419643, "loss": 0.0611, "rewards/chosen": 6.037025451660156, "rewards/margins": 16.83579363141741, "rewards/rejected": -10.798768179757255, "step": 3612 }, { "epoch": 0.9902699739619022, "grad_norm": 4.84375, "kl": 7.307845592498779, "learning_rate": 5e-06, "logits/chosen": 35700514.13333333, "logits/rejected": 2933070.222222222, "logps/chosen": -467.45872395833334, "logps/rejected": -423.4021267361111, "loss": 0.0347, "rewards/chosen": 7.462611389160156, "rewards/margins": 21.520067511664497, "rewards/rejected": -14.057456122504341, "step": 3613 }, { "epoch": 0.990544059202412, "grad_norm": 11.5, "kl": 9.125476837158203, "learning_rate": 5e-06, "logits/chosen": -21045892.57142857, "logits/rejected": -23383641.6, "logps/chosen": -359.2505580357143, "logps/rejected": -551.03427734375, "loss": 0.1011, "rewards/chosen": 7.769123077392578, "rewards/margins": 21.24990463256836, "rewards/rejected": -13.480781555175781, "step": 3614 }, { "epoch": 0.9908181444429217, "grad_norm": 7.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -28798394.666666668, "logits/rejected": -19365922.133333333, "logps/chosen": -417.18785264756946, "logps/rejected": -559.1694010416667, "loss": 0.0165, "rewards/chosen": 8.172709147135416, "rewards/margins": 19.19697062174479, "rewards/rejected": -11.024261474609375, "step": 3615 }, { "epoch": 0.9910922296834316, "grad_norm": 10.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -3446301.5384615385, "logits/rejected": -22773121.454545453, "logps/chosen": -486.57677283653845, "logps/rejected": -531.6511896306819, "loss": 0.033, "rewards/chosen": 7.933398907001202, "rewards/margins": 22.013239827189413, "rewards/rejected": -14.07984092018821, "step": 3616 }, { "epoch": 0.9913663149239413, "grad_norm": 0.451171875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -32462218.666666668, "logits/rejected": -17440388.266666666, "logps/chosen": -406.71454535590277, "logps/rejected": -558.8741536458333, "loss": 0.0021, "rewards/chosen": 8.085383945041233, "rewards/margins": 22.801381259494356, "rewards/rejected": -14.715997314453125, "step": 3617 }, { "epoch": 0.9916404001644511, "grad_norm": 4.28125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 16288180.8, "logits/rejected": -30895462.85714286, "logps/chosen": -466.193359375, "logps/rejected": -539.976806640625, "loss": 0.0274, "rewards/chosen": 9.273147583007812, "rewards/margins": 24.06569344656808, "rewards/rejected": -14.792545863560267, "step": 3618 }, { "epoch": 0.991914485404961, "grad_norm": 7.78125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -15885328.0, "logits/rejected": -5100546.181818182, "logps/chosen": -378.44429837740387, "logps/rejected": -616.9588512073864, "loss": 0.0488, "rewards/chosen": 7.141073960524339, "rewards/margins": 24.926357829487404, "rewards/rejected": -17.785283868963067, "step": 3619 }, { "epoch": 0.9921885706454707, "grad_norm": 5.09375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -14042841.777777778, "logits/rejected": -2734603.7333333334, "logps/chosen": -452.85313585069446, "logps/rejected": -512.26474609375, "loss": 0.0111, "rewards/chosen": 9.645287407769096, "rewards/margins": 20.595812310112848, "rewards/rejected": -10.95052490234375, "step": 3620 }, { "epoch": 0.9924626558859806, "grad_norm": 10.625, "kl": 1.7067184448242188, "learning_rate": 5e-06, "logits/chosen": -11007397.333333334, "logits/rejected": 12827486.666666666, "logps/chosen": -406.185791015625, "logps/rejected": -594.7978515625, "loss": 0.0385, "rewards/chosen": 7.256401062011719, "rewards/margins": 18.767031351725258, "rewards/rejected": -11.510630289713541, "step": 3621 }, { "epoch": 0.9927367411264904, "grad_norm": 6.90625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -37492544.0, "logits/rejected": -40786870.85714286, "logps/chosen": -409.158203125, "logps/rejected": -658.7349330357143, "loss": 0.0188, "rewards/chosen": 7.309268188476563, "rewards/margins": 24.163422502790176, "rewards/rejected": -16.854154314313615, "step": 3622 }, { "epoch": 0.9930108263670001, "grad_norm": 7.03125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -32187392.0, "logits/rejected": -18585292.444444444, "logps/chosen": -442.66809895833336, "logps/rejected": -556.2862955729166, "loss": 0.0418, "rewards/chosen": 6.8881383260091145, "rewards/margins": 21.98871070014106, "rewards/rejected": -15.100572374131945, "step": 3623 }, { "epoch": 0.99328491160751, "grad_norm": 10.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -22836817.230769232, "logits/rejected": -32644366.545454547, "logps/chosen": -429.66811899038464, "logps/rejected": -618.388671875, "loss": 0.042, "rewards/chosen": 6.196348337026743, "rewards/margins": 22.082323567850608, "rewards/rejected": -15.885975230823863, "step": 3624 }, { "epoch": 0.9935589968480197, "grad_norm": 6.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -13634228.8, "logits/rejected": -28877737.14285714, "logps/chosen": -432.554296875, "logps/rejected": -509.4014369419643, "loss": 0.014, "rewards/chosen": 7.775106048583984, "rewards/margins": 17.25827669416155, "rewards/rejected": -9.483170645577568, "step": 3625 }, { "epoch": 0.9938330820885295, "grad_norm": 3.796875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -23605104.0, "logits/rejected": -30948925.333333332, "logps/chosen": -364.2631022135417, "logps/rejected": -419.494873046875, "loss": 0.012, "rewards/chosen": 7.01107915242513, "rewards/margins": 19.400211334228516, "rewards/rejected": -12.389132181803385, "step": 3626 }, { "epoch": 0.9941071673290394, "grad_norm": 11.0625, "kl": 9.597711563110352, "learning_rate": 5e-06, "logits/chosen": -27834006.0, "logits/rejected": -26910984.0, "logps/chosen": -509.1912841796875, "logps/rejected": -573.7071533203125, "loss": 0.0559, "rewards/chosen": 6.209501266479492, "rewards/margins": 19.819896697998047, "rewards/rejected": -13.610395431518555, "step": 3627 }, { "epoch": 0.9943812525695491, "grad_norm": 11.0625, "kl": 11.921829223632812, "learning_rate": 5e-06, "logits/chosen": -23882514.82352941, "logits/rejected": -27715995.42857143, "logps/chosen": -490.9742647058824, "logps/rejected": -466.463134765625, "loss": 0.0387, "rewards/chosen": 9.344805549172793, "rewards/margins": 18.454544035326535, "rewards/rejected": -9.10973848615374, "step": 3628 }, { "epoch": 0.9946553378100589, "grad_norm": 1.7109375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -29154506.666666668, "logits/rejected": -60413542.4, "logps/chosen": -367.59483506944446, "logps/rejected": -616.7327473958334, "loss": 0.0042, "rewards/chosen": 6.670477125379774, "rewards/margins": 21.515720452202693, "rewards/rejected": -14.845243326822917, "step": 3629 }, { "epoch": 0.9949294230505688, "grad_norm": 12.0, "kl": 6.268505096435547, "learning_rate": 5e-06, "logits/chosen": -27911061.333333332, "logits/rejected": -39923522.666666664, "logps/chosen": -366.7002360026042, "logps/rejected": -635.3284098307291, "loss": 0.0527, "rewards/chosen": 6.300426483154297, "rewards/margins": 20.565809885660805, "rewards/rejected": -14.26538340250651, "step": 3630 }, { "epoch": 0.9952035082910785, "grad_norm": 4.90625, "kl": 1.5191377401351929, "learning_rate": 5e-06, "logits/chosen": -27478421.333333332, "logits/rejected": -20490924.0, "logps/chosen": -340.99570719401044, "logps/rejected": -358.2278645833333, "loss": 0.0247, "rewards/chosen": 7.67038091023763, "rewards/margins": 17.053862889607746, "rewards/rejected": -9.383481979370117, "step": 3631 }, { "epoch": 0.9954775935315884, "grad_norm": 7.09375, "kl": 16.061477661132812, "learning_rate": 5e-06, "logits/chosen": -35261196.8, "logits/rejected": -31102858.666666668, "logps/chosen": -421.70908203125, "logps/rejected": -440.92955186631946, "loss": 0.0271, "rewards/chosen": 8.463796997070313, "rewards/margins": 20.695625474717882, "rewards/rejected": -12.23182847764757, "step": 3632 }, { "epoch": 0.9957516787720981, "grad_norm": 9.9375, "kl": 11.213827133178711, "learning_rate": 5e-06, "logits/chosen": -21529618.285714287, "logits/rejected": -20626009.6, "logps/chosen": -456.8916015625, "logps/rejected": -443.22490234375, "loss": 0.0662, "rewards/chosen": 8.337461744035993, "rewards/margins": 18.707200513567244, "rewards/rejected": -10.36973876953125, "step": 3633 }, { "epoch": 0.9960257640126079, "grad_norm": 4.09375, "kl": 0.4137744903564453, "learning_rate": 5e-06, "logits/chosen": -25475540.363636363, "logits/rejected": -32027628.307692308, "logps/chosen": -371.6344549005682, "logps/rejected": -575.2791466346154, "loss": 0.0079, "rewards/chosen": 7.322811473499645, "rewards/margins": 23.175566506552528, "rewards/rejected": -15.852755033052885, "step": 3634 }, { "epoch": 0.9962998492531178, "grad_norm": 5.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -12339288.0, "logits/rejected": -22482441.14285714, "logps/chosen": -470.05888671875, "logps/rejected": -566.4724469866071, "loss": 0.0558, "rewards/chosen": 8.121621704101562, "rewards/margins": 24.69756905691964, "rewards/rejected": -16.57594735281808, "step": 3635 }, { "epoch": 0.9965739344936275, "grad_norm": 2.203125, "kl": 3.384866237640381, "learning_rate": 5e-06, "logits/chosen": -27234216.0, "logits/rejected": -7600042.666666667, "logps/chosen": -504.1823323567708, "logps/rejected": -546.7577311197916, "loss": 0.0066, "rewards/chosen": 8.560478210449219, "rewards/margins": 21.382692972819008, "rewards/rejected": -12.822214762369791, "step": 3636 }, { "epoch": 0.9968480197341373, "grad_norm": 6.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -32782828.307692308, "logits/rejected": -21986786.90909091, "logps/chosen": -359.2460186298077, "logps/rejected": -427.43741122159093, "loss": 0.0315, "rewards/chosen": 7.304047217735877, "rewards/margins": 18.96967166287082, "rewards/rejected": -11.665624445134943, "step": 3637 }, { "epoch": 0.9971221049746471, "grad_norm": 8.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -31171770.181818184, "logits/rejected": -22201282.46153846, "logps/chosen": -365.6346324573864, "logps/rejected": -447.81539212740387, "loss": 0.0611, "rewards/chosen": 6.478625904430043, "rewards/margins": 19.78366435657848, "rewards/rejected": -13.305038452148438, "step": 3638 }, { "epoch": 0.9973961902151569, "grad_norm": 2.953125, "kl": 0.04098256677389145, "learning_rate": 5e-06, "logits/chosen": -18492234.181818184, "logits/rejected": -54803500.307692304, "logps/chosen": -465.29616477272725, "logps/rejected": -642.310546875, "loss": 0.0052, "rewards/chosen": 8.580132917924361, "rewards/margins": 25.611121171004292, "rewards/rejected": -17.03098825307993, "step": 3639 }, { "epoch": 0.9976702754556667, "grad_norm": 7.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -15378408.0, "logits/rejected": -10622384.0, "logps/chosen": -376.8633117675781, "logps/rejected": -628.2380981445312, "loss": 0.0401, "rewards/chosen": 7.506053447723389, "rewards/margins": 23.489363193511963, "rewards/rejected": -15.983309745788574, "step": 3640 }, { "epoch": 0.9979443606961765, "grad_norm": 7.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -32604757.333333332, "logits/rejected": -48542097.06666667, "logps/chosen": -420.656982421875, "logps/rejected": -480.6986979166667, "loss": 0.0133, "rewards/chosen": 7.186045328776042, "rewards/margins": 19.25531921386719, "rewards/rejected": -12.069273885091146, "step": 3641 }, { "epoch": 0.9982184459366863, "grad_norm": 7.71875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -33602056.0, "logits/rejected": -14034064.0, "logps/chosen": -392.259033203125, "logps/rejected": -701.5033569335938, "loss": 0.025, "rewards/chosen": 6.765414237976074, "rewards/margins": 24.37526035308838, "rewards/rejected": -17.609846115112305, "step": 3642 }, { "epoch": 0.9984925311771962, "grad_norm": 9.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -36125901.71428572, "logits/rejected": -44250041.6, "logps/chosen": -324.65164620535717, "logps/rejected": -741.03671875, "loss": 0.0302, "rewards/chosen": 5.386114937918527, "rewards/margins": 22.446805245535714, "rewards/rejected": -17.060690307617186, "step": 3643 }, { "epoch": 0.9987666164177059, "grad_norm": 6.21875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -28469436.8, "logits/rejected": -34244338.28571428, "logps/chosen": -424.0158203125, "logps/rejected": -689.2735770089286, "loss": 0.0129, "rewards/chosen": 6.912315368652344, "rewards/margins": 24.429286411830358, "rewards/rejected": -17.516971043178014, "step": 3644 }, { "epoch": 0.9990407016582157, "grad_norm": 2.984375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -10331165.333333334, "logits/rejected": -21615513.6, "logps/chosen": -384.4914822048611, "logps/rejected": -629.4067708333333, "loss": 0.0102, "rewards/chosen": 6.4847971598307295, "rewards/margins": 22.71676737467448, "rewards/rejected": -16.23197021484375, "step": 3645 }, { "epoch": 0.9993147868987255, "grad_norm": 2.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -23935066.181818184, "logits/rejected": -19116199.384615384, "logps/chosen": -405.27463600852275, "logps/rejected": -673.8553185096154, "loss": 0.009, "rewards/chosen": 6.945154363458807, "rewards/margins": 25.63474012254835, "rewards/rejected": -18.689585759089542, "step": 3646 }, { "epoch": 0.9995888721392353, "grad_norm": 8.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -19072386.666666668, "logits/rejected": -11728288.0, "logps/chosen": -464.8741861979167, "logps/rejected": -448.9371337890625, "loss": 0.0282, "rewards/chosen": 8.191619237263998, "rewards/margins": 21.319589614868164, "rewards/rejected": -13.127970377604166, "step": 3647 }, { "epoch": 0.9998629573797451, "grad_norm": 4.40625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -19389143.111111112, "logits/rejected": -21315622.4, "logps/chosen": -538.0897894965278, "logps/rejected": -820.299609375, "loss": 0.0249, "rewards/chosen": 7.493621826171875, "rewards/margins": 30.48381144205729, "rewards/rejected": -22.990189615885416, "step": 3648 }, { "epoch": 1.0, "grad_norm": 0.85546875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -58475392.0, "logits/rejected": -39694160.0, "logps/chosen": -330.7967529296875, "logps/rejected": -606.4405110677084, "loss": 0.0019, "rewards/chosen": 7.148828506469727, "rewards/margins": 22.359071731567383, "rewards/rejected": -15.210243225097656, "step": 3649 } ], "logging_steps": 1, "max_steps": 3649, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 1825, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 6, "trial_name": null, "trial_params": null }